1 /**********************************************************************
2 regparse.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regparse.h"
31 #include "st.h"
32
/* Buffer size constant; presumably sizes the warning message buffer
   (its use is outside this part of the file -- TODO confirm). */
#define WARN_BUFSIZE    256

/* Compile-time switch; going by its name it enables case folding inside
   negated character classes (consumer not visible here). */
#define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
36
37
38 OnigSyntaxType OnigSyntaxRuby = {
39 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
40 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
41 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
42 ONIG_SYN_OP_ESC_C_CONTROL )
43 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
44 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
45 ONIG_SYN_OP2_OPTION_RUBY |
46 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
47 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
48 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
49 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
50 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
51 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
52 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
53 ONIG_SYN_OP2_ESC_H_XDIGIT )
54 , ( SYN_GNU_REGEX_BV |
55 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
56 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
57 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
58 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
59 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
60 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
61 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
62 , ONIG_OPTION_NONE
63 ,
64 {
65 (OnigCodePoint )'\\' /* esc */
66 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
67 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
68 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
69 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
70 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
71 }
72 };
73
74 OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
75
/* Warning handler that ignores its message; used as the default sink. */
extern void
onig_null_warn(const char* s ARG_UNUSED)
{
}
77
78 #ifdef DEFAULT_WARN_FUNCTION
79 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
80 #else
81 static OnigWarnFunc onig_warn = onig_null_warn;
82 #endif
83
84 #ifdef DEFAULT_VERB_WARN_FUNCTION
85 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
86 #else
87 static OnigWarnFunc onig_verb_warn = onig_null_warn;
88 #endif
89
/* Replace the warning handler stored in onig_warn with f. */
extern void
onig_set_warn_func(OnigWarnFunc f)
{
  onig_warn = f;
}
94
/* Replace the verbose-warning handler stored in onig_verb_warn with f. */
extern void
onig_set_verb_warn_func(OnigWarnFunc f)
{
  onig_verb_warn = f;
}
99
100 static void
bbuf_free(BBuf * bbuf)101 bbuf_free(BBuf* bbuf)
102 {
103 if (IS_NOT_NULL(bbuf)) {
104 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
105 xfree(bbuf);
106 }
107 }
108
109 static int
bbuf_clone(BBuf ** rto,BBuf * from)110 bbuf_clone(BBuf** rto, BBuf* from)
111 {
112 int r;
113 BBuf *to;
114
115 *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
116 CHECK_NULL_RETURN_MEMERR(to);
117 r = BBUF_INIT(to, from->alloc);
118 if (r != 0) return r;
119 to->used = from->used;
120 xmemcpy(to->p, from->p, from->used);
121 return 0;
122 }
123
/* Convert a relative back reference number into an absolute group number,
   based on the number of groups recorded in env->num_mem. */
#define BACKREF_REL_TO_ABS(rel_no, env) \
  ((env)->num_mem + 1 + (rel_no))

/* Clear the bits f in v when negative is non-zero, otherwise set them.
   The whole expansion is parenthesized so that the macro is safe inside
   a larger expression. */
#define ONOFF(v,f,negative) \
  ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))

/* First code point of the multi-byte range: 0 when the minimum character
   length of enc is larger than one byte, 0x80 otherwise. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)

/* Add the range [MBCODE_START_POS(enc), max code point] to pbuf. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/* For non single-byte encodings, add the whole multi-byte range to mbuf.
   Uses the local variable `r` of the expanding function and returns from
   that function when the call fails. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r) return r;\
  }\
} while (0)
141
142
/* Set `empty` to 1 when every word of the bit set bs is zero, else to 0.
   The loop counter has a macro-specific name so that it cannot shadow a
   variable named `i` used in the bs or empty arguments. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int bitset_is_empty_idx_;\
  (empty) = 1;\
  for (bitset_is_empty_idx_ = 0;\
       bitset_is_empty_idx_ < (int )BITSET_SIZE;\
       bitset_is_empty_idx_++) {\
    if ((bs)[bitset_is_empty_idx_] != 0) {\
      (empty) = 0; break;\
    }\
  }\
} while (0)
152
153 static void
bitset_set_range(BitSetRef bs,int from,int to)154 bitset_set_range(BitSetRef bs, int from, int to)
155 {
156 int i;
157 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
158 BITSET_SET_BIT(bs, i);
159 }
160 }
161
#if 0
/* Turn on every bit of bs (currently compiled out). */
static void
bitset_set_all(BitSetRef bs)
{
  int idx;

  for (idx = 0; idx < BITSET_SIZE; idx++) {
    bs[idx] = ~((Bits )0);
  }
}
#endif
170
171 static void
bitset_invert(BitSetRef bs)172 bitset_invert(BitSetRef bs)
173 {
174 int i;
175 for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
176 }
177
178 static void
bitset_invert_to(BitSetRef from,BitSetRef to)179 bitset_invert_to(BitSetRef from, BitSetRef to)
180 {
181 int i;
182 for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
183 }
184
185 static void
bitset_and(BitSetRef dest,BitSetRef bs)186 bitset_and(BitSetRef dest, BitSetRef bs)
187 {
188 int i;
189 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
190 }
191
192 static void
bitset_or(BitSetRef dest,BitSetRef bs)193 bitset_or(BitSetRef dest, BitSetRef bs)
194 {
195 int i;
196 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
197 }
198
199 static void
bitset_copy(BitSetRef dest,BitSetRef bs)200 bitset_copy(BitSetRef dest, BitSetRef bs)
201 {
202 int i;
203 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
204 }
205
206 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)207 onig_strncmp(const UChar* s1, const UChar* s2, int n)
208 {
209 int x;
210
211 while (n-- > 0) {
212 x = *s2++ - *s1++;
213 if (x) return x;
214 }
215 return 0;
216 }
217
218 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)219 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
220 {
221 int len = end - src;
222 if (len > 0) {
223 xmemcpy(dest, src, len);
224 dest[len] = (UChar )0;
225 }
226 }
227
228 #ifdef USE_NAMED_GROUP
229 static UChar*
strdup_with_null(OnigEncoding enc,UChar * s,UChar * end)230 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
231 {
232 int slen, term_len, i;
233 UChar *r;
234
235 slen = end - s;
236 term_len = ONIGENC_MBC_MINLEN(enc);
237
238 r = (UChar* )xmalloc(slen + term_len);
239 CHECK_NULL_RETURN(r);
240 xmemcpy(r, s, slen);
241
242 for (i = 0; i < term_len; i++)
243 r[slen + i] = (UChar )0;
244
245 return r;
246 }
247 #endif
248
/* scan pattern methods */
/* These macros operate on variables of the expanding function:
   p (current position), end (pattern end), enc (encoding) and
   pfetch_prev (declared by PFETCH_READY). */
#define PEND_VALUE   0

#define PFETCH_READY  UChar* pfetch_prev
#define PEND         (p < end ?  0 : 1)
#define PUNFETCH     p = pfetch_prev

/* advance one character, remembering the old position for PUNFETCH */
#define PINC       do { \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* read the code point at p into c, then advance (PUNFETCH-able) */
#define PFETCH(c)  do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* variants that do not record the previous position */
#define PINC_S     do { \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
#define PFETCH_S(c) do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* code point at p without advancing; PEND_VALUE at the end of input */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
/* the argument is parenthesized so the cast covers a whole expression */
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )(c))
275
276 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)277 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
278 int capa)
279 {
280 UChar* r;
281
282 if (dest)
283 r = (UChar* )xrealloc(dest, capa + 1);
284 else
285 r = (UChar* )xmalloc(capa + 1);
286
287 CHECK_NULL_RETURN(r);
288 onig_strcpy(r + (dest_end - dest), src, src_end);
289 return r;
290 }
291
292 /* dest on static area */
293 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)294 strcat_capa_from_static(UChar* dest, UChar* dest_end,
295 const UChar* src, const UChar* src_end, int capa)
296 {
297 UChar* r;
298
299 r = (UChar* )xmalloc(capa + 1);
300 CHECK_NULL_RETURN(r);
301 onig_strcpy(r, dest, dest_end);
302 onig_strcpy(r + (dest_end - dest), src, src_end);
303 return r;
304 }
305
306
307 #ifdef USE_ST_LIBRARY
308
309 typedef struct {
310 UChar* s;
311 UChar* end;
312 } st_str_end_key;
313
314 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)315 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
316 {
317 UChar *p, *q;
318 int c;
319
320 if ((x->end - x->s) != (y->end - y->s))
321 return 1;
322
323 p = x->s;
324 q = y->s;
325 while (p < x->end) {
326 c = (int )*p - (int )*q;
327 if (c != 0) return c;
328
329 p++; q++;
330 }
331
332 return 0;
333 }
334
335 static int
str_end_hash(st_str_end_key * x)336 str_end_hash(st_str_end_key* x)
337 {
338 UChar *p;
339 int val = 0;
340
341 p = x->s;
342 while (p < x->end) {
343 val = val * 997 + (int )*p++;
344 }
345
346 return val + (val >> 5);
347 }
348
349 extern hash_table_type*
onig_st_init_strend_table_with_size(int size)350 onig_st_init_strend_table_with_size(int size)
351 {
352 static struct st_hash_type hashType = {
353 str_end_cmp,
354 str_end_hash,
355 };
356
357 return (hash_table_type* )
358 onig_st_init_table_with_size(&hashType, size);
359 }
360
361 extern int
onig_st_lookup_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type * value)362 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
363 const UChar* end_key, hash_data_type *value)
364 {
365 st_str_end_key key;
366
367 key.s = (UChar* )str_key;
368 key.end = (UChar* )end_key;
369
370 return onig_st_lookup(table, (st_data_t )(&key), value);
371 }
372
373 extern int
onig_st_insert_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type value)374 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
375 const UChar* end_key, hash_data_type value)
376 {
377 st_str_end_key* key;
378 int result;
379
380 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
381 key->s = (UChar* )str_key;
382 key->end = (UChar* )end_key;
383 result = onig_st_insert(table, (st_data_t )key, value);
384 if (result) {
385 xfree(key);
386 }
387 return result;
388 }
389
390 #endif /* USE_ST_LIBRARY */
391
392
393 #ifdef USE_NAMED_GROUP
394
395 #define INIT_NAME_BACKREFS_ALLOC_NUM 8
396
397 typedef struct {
398 UChar* name;
399 int name_len; /* byte length */
400 int back_num; /* number of backrefs */
401 int back_alloc;
402 int back_ref1;
403 int* back_refs;
404 } NameEntry;
405
406 #ifdef USE_ST_LIBRARY
407
408 typedef st_table NameTable;
409 typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */
410
411 #define NAMEBUF_SIZE 24
412 #define NAMEBUF_SIZE_1 25
413
414 #ifdef ONIG_DEBUG
415 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)416 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
417 {
418 int i;
419 FILE* fp = (FILE* )arg;
420
421 fprintf(fp, "%s: ", e->name);
422 if (e->back_num == 0)
423 fputs("-", fp);
424 else if (e->back_num == 1)
425 fprintf(fp, "%d", e->back_ref1);
426 else {
427 for (i = 0; i < e->back_num; i++) {
428 if (i > 0) fprintf(fp, ", ");
429 fprintf(fp, "%d", e->back_refs[i]);
430 }
431 }
432 fputs("\n", fp);
433 return ST_CONTINUE;
434 }
435
436 extern int
onig_print_names(FILE * fp,regex_t * reg)437 onig_print_names(FILE* fp, regex_t* reg)
438 {
439 NameTable* t = (NameTable* )reg->name_table;
440
441 if (IS_NOT_NULL(t)) {
442 fprintf(fp, "name table\n");
443 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
444 fputs("\n", fp);
445 }
446 return 0;
447 }
448 #endif /* ONIG_DEBUG */
449
450 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)451 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
452 {
453 xfree(e->name);
454 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
455 xfree(key);
456 xfree(e);
457 return ST_DELETE;
458 }
459
460 static int
names_clear(regex_t * reg)461 names_clear(regex_t* reg)
462 {
463 NameTable* t = (NameTable* )reg->name_table;
464
465 if (IS_NOT_NULL(t)) {
466 onig_st_foreach(t, i_free_name_entry, 0);
467 }
468 return 0;
469 }
470
471 extern int
onig_names_free(regex_t * reg)472 onig_names_free(regex_t* reg)
473 {
474 int r;
475 NameTable* t;
476
477 r = names_clear(reg);
478 if (r) return r;
479
480 t = (NameTable* )reg->name_table;
481 if (IS_NOT_NULL(t)) onig_st_free_table(t);
482 reg->name_table = (void* )NULL;
483 return 0;
484 }
485
486 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)487 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
488 {
489 NameEntry* e;
490 NameTable* t = (NameTable* )reg->name_table;
491
492 e = (NameEntry* )NULL;
493 if (IS_NOT_NULL(t)) {
494 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
495 }
496 return e;
497 }
498
499 typedef struct {
500 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
501 regex_t* reg;
502 void* arg;
503 int ret;
504 OnigEncoding enc;
505 } INamesArg;
506
507 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)508 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
509 {
510 int r = (*(arg->func))(e->name,
511 e->name + e->name_len,
512 e->back_num,
513 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
514 arg->reg, arg->arg);
515 if (r != 0) {
516 arg->ret = r;
517 return ST_STOP;
518 }
519 return ST_CONTINUE;
520 }
521
522 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)523 onig_foreach_name(regex_t* reg,
524 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
525 {
526 INamesArg narg;
527 NameTable* t = (NameTable* )reg->name_table;
528
529 narg.ret = 0;
530 if (IS_NOT_NULL(t)) {
531 narg.func = func;
532 narg.reg = reg;
533 narg.arg = arg;
534 narg.enc = reg->enc; /* should be pattern encoding. */
535 onig_st_foreach(t, i_names, (HashDataType )&narg);
536 }
537 return narg.ret;
538 }
539
540 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumRemap * map)541 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
542 {
543 int i;
544
545 if (e->back_num > 1) {
546 for (i = 0; i < e->back_num; i++) {
547 e->back_refs[i] = map[e->back_refs[i]].new_val;
548 }
549 }
550 else if (e->back_num == 1) {
551 e->back_ref1 = map[e->back_ref1].new_val;
552 }
553
554 return ST_CONTINUE;
555 }
556
557 extern int
onig_renumber_name_table(regex_t * reg,GroupNumRemap * map)558 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
559 {
560 NameTable* t = (NameTable* )reg->name_table;
561
562 if (IS_NOT_NULL(t)) {
563 onig_st_foreach(t, i_renumber_name, (HashDataType )map);
564 }
565 return 0;
566 }
567
568
569 extern int
onig_number_of_names(regex_t * reg)570 onig_number_of_names(regex_t* reg)
571 {
572 NameTable* t = (NameTable* )reg->name_table;
573
574 if (IS_NOT_NULL(t))
575 return t->num_entries;
576 else
577 return 0;
578 }
579
580 #else /* USE_ST_LIBRARY */
581
582 #define INIT_NAMES_ALLOC_NUM 8
583
584 typedef struct {
585 NameEntry* e;
586 int num;
587 int alloc;
588 } NameTable;
589
590 #ifdef ONIG_DEBUG
591 extern int
onig_print_names(FILE * fp,regex_t * reg)592 onig_print_names(FILE* fp, regex_t* reg)
593 {
594 int i, j;
595 NameEntry* e;
596 NameTable* t = (NameTable* )reg->name_table;
597
598 if (IS_NOT_NULL(t) && t->num > 0) {
599 fprintf(fp, "name table\n");
600 for (i = 0; i < t->num; i++) {
601 e = &(t->e[i]);
602 fprintf(fp, "%s: ", e->name);
603 if (e->back_num == 0) {
604 fputs("-", fp);
605 }
606 else if (e->back_num == 1) {
607 fprintf(fp, "%d", e->back_ref1);
608 }
609 else {
610 for (j = 0; j < e->back_num; j++) {
611 if (j > 0) fprintf(fp, ", ");
612 fprintf(fp, "%d", e->back_refs[j]);
613 }
614 }
615 fputs("\n", fp);
616 }
617 fputs("\n", fp);
618 }
619 return 0;
620 }
621 #endif
622
623 static int
names_clear(regex_t * reg)624 names_clear(regex_t* reg)
625 {
626 int i;
627 NameEntry* e;
628 NameTable* t = (NameTable* )reg->name_table;
629
630 if (IS_NOT_NULL(t)) {
631 for (i = 0; i < t->num; i++) {
632 e = &(t->e[i]);
633 if (IS_NOT_NULL(e->name)) {
634 xfree(e->name);
635 e->name = NULL;
636 e->name_len = 0;
637 e->back_num = 0;
638 e->back_alloc = 0;
639 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
640 e->back_refs = (int* )NULL;
641 }
642 }
643 if (IS_NOT_NULL(t->e)) {
644 xfree(t->e);
645 t->e = NULL;
646 }
647 t->num = 0;
648 }
649 return 0;
650 }
651
652 extern int
onig_names_free(regex_t * reg)653 onig_names_free(regex_t* reg)
654 {
655 int r;
656 NameTable* t;
657
658 r = names_clear(reg);
659 if (r) return r;
660
661 t = (NameTable* )reg->name_table;
662 if (IS_NOT_NULL(t)) xfree(t);
663 reg->name_table = NULL;
664 return 0;
665 }
666
667 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)668 name_find(regex_t* reg, UChar* name, UChar* name_end)
669 {
670 int i, len;
671 NameEntry* e;
672 NameTable* t = (NameTable* )reg->name_table;
673
674 if (IS_NOT_NULL(t)) {
675 len = name_end - name;
676 for (i = 0; i < t->num; i++) {
677 e = &(t->e[i]);
678 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
679 return e;
680 }
681 }
682 return (NameEntry* )NULL;
683 }
684
685 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)686 onig_foreach_name(regex_t* reg,
687 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
688 {
689 int i, r;
690 NameEntry* e;
691 NameTable* t = (NameTable* )reg->name_table;
692
693 if (IS_NOT_NULL(t)) {
694 for (i = 0; i < t->num; i++) {
695 e = &(t->e[i]);
696 r = (*func)(e->name, e->name + e->name_len, e->back_num,
697 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
698 reg, arg);
699 if (r != 0) return r;
700 }
701 }
702 return 0;
703 }
704
705 extern int
onig_number_of_names(regex_t * reg)706 onig_number_of_names(regex_t* reg)
707 {
708 NameTable* t = (NameTable* )reg->name_table;
709
710 if (IS_NOT_NULL(t))
711 return t->num;
712 else
713 return 0;
714 }
715
716 #endif /* else USE_ST_LIBRARY */
717
718 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)719 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
720 {
721 int alloc;
722 NameEntry* e;
723 NameTable* t = (NameTable* )reg->name_table;
724
725 if (name_end - name <= 0)
726 return ONIGERR_EMPTY_GROUP_NAME;
727
728 e = name_find(reg, name, name_end);
729 if (IS_NULL(e)) {
730 #ifdef USE_ST_LIBRARY
731 if (IS_NULL(t)) {
732 t = onig_st_init_strend_table_with_size(5);
733 reg->name_table = (void* )t;
734 }
735 e = (NameEntry* )xmalloc(sizeof(NameEntry));
736 CHECK_NULL_RETURN_MEMERR(e);
737
738 e->name = strdup_with_null(reg->enc, name, name_end);
739 if (IS_NULL(e->name)) {
740 xfree(e); return ONIGERR_MEMORY;
741 }
742 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
743 (HashDataType )e);
744
745 e->name_len = name_end - name;
746 e->back_num = 0;
747 e->back_alloc = 0;
748 e->back_refs = (int* )NULL;
749
750 #else
751
752 if (IS_NULL(t)) {
753 alloc = INIT_NAMES_ALLOC_NUM;
754 t = (NameTable* )xmalloc(sizeof(NameTable));
755 CHECK_NULL_RETURN_MEMERR(t);
756 t->e = NULL;
757 t->alloc = 0;
758 t->num = 0;
759
760 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
761 if (IS_NULL(t->e)) {
762 xfree(t);
763 return ONIGERR_MEMORY;
764 }
765 t->alloc = alloc;
766 reg->name_table = t;
767 goto clear;
768 }
769 else if (t->num == t->alloc) {
770 int i;
771
772 alloc = t->alloc * 2;
773 t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
774 CHECK_NULL_RETURN_MEMERR(t->e);
775 t->alloc = alloc;
776
777 clear:
778 for (i = t->num; i < t->alloc; i++) {
779 t->e[i].name = NULL;
780 t->e[i].name_len = 0;
781 t->e[i].back_num = 0;
782 t->e[i].back_alloc = 0;
783 t->e[i].back_refs = (int* )NULL;
784 }
785 }
786 e = &(t->e[t->num]);
787 t->num++;
788 e->name = strdup_with_null(reg->enc, name, name_end);
789 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
790 e->name_len = name_end - name;
791 #endif
792 }
793
794 if (e->back_num >= 1 &&
795 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
796 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
797 name, name_end);
798 return ONIGERR_MULTIPLEX_DEFINED_NAME;
799 }
800
801 e->back_num++;
802 if (e->back_num == 1) {
803 e->back_ref1 = backref;
804 }
805 else {
806 if (e->back_num == 2) {
807 alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
808 e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
809 CHECK_NULL_RETURN_MEMERR(e->back_refs);
810 e->back_alloc = alloc;
811 e->back_refs[0] = e->back_ref1;
812 e->back_refs[1] = backref;
813 }
814 else {
815 if (e->back_num > e->back_alloc) {
816 alloc = e->back_alloc * 2;
817 e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
818 CHECK_NULL_RETURN_MEMERR(e->back_refs);
819 e->back_alloc = alloc;
820 }
821 e->back_refs[e->back_num - 1] = backref;
822 }
823 }
824
825 return 0;
826 }
827
828 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)829 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
830 const UChar* name_end, int** nums)
831 {
832 NameEntry* e = name_find(reg, name, name_end);
833
834 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
835
836 switch (e->back_num) {
837 case 0:
838 break;
839 case 1:
840 *nums = &(e->back_ref1);
841 break;
842 default:
843 *nums = e->back_refs;
844 break;
845 }
846 return e->back_num;
847 }
848
849 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)850 onig_name_to_backref_number(regex_t* reg, const UChar* name,
851 const UChar* name_end, OnigRegion *region)
852 {
853 int i, n, *nums;
854
855 n = onig_name_to_group_numbers(reg, name, name_end, &nums);
856 if (n < 0)
857 return n;
858 else if (n == 0)
859 return ONIGERR_PARSER_BUG;
860 else if (n == 1)
861 return nums[0];
862 else {
863 if (IS_NOT_NULL(region)) {
864 for (i = n - 1; i >= 0; i--) {
865 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
866 return nums[i];
867 }
868 }
869 return nums[n - 1];
870 }
871 }
872
873 #else /* USE_NAMED_GROUP */
874
875 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)876 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
877 const UChar* name_end, int** nums)
878 {
879 return ONIG_NO_SUPPORT_CONFIG;
880 }
881
882 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)883 onig_name_to_backref_number(regex_t* reg, const UChar* name,
884 const UChar* name_end, OnigRegion* region)
885 {
886 return ONIG_NO_SUPPORT_CONFIG;
887 }
888
889 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)890 onig_foreach_name(regex_t* reg,
891 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
892 {
893 return ONIG_NO_SUPPORT_CONFIG;
894 }
895
896 extern int
onig_number_of_names(regex_t * reg)897 onig_number_of_names(regex_t* reg)
898 {
899 return 0;
900 }
901 #endif /* else USE_NAMED_GROUP */
902
903 extern int
onig_noname_group_capture_is_active(regex_t * reg)904 onig_noname_group_capture_is_active(regex_t* reg)
905 {
906 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
907 return 0;
908
909 #ifdef USE_NAMED_GROUP
910 if (onig_number_of_names(reg) > 0 &&
911 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
912 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
913 return 0;
914 }
915 #endif
916
917 return 1;
918 }
919
920
/* Capacity of the first heap allocated group node table of a ScanEnv. */
#define INIT_SCANENV_MEMNODES_ALLOC_SIZE   16
922
923 static void
scan_env_clear(ScanEnv * env)924 scan_env_clear(ScanEnv* env)
925 {
926 int i;
927
928 BIT_STATUS_CLEAR(env->capture_history);
929 BIT_STATUS_CLEAR(env->bt_mem_start);
930 BIT_STATUS_CLEAR(env->bt_mem_end);
931 BIT_STATUS_CLEAR(env->backrefed_mem);
932 env->error = (UChar* )NULL;
933 env->error_end = (UChar* )NULL;
934 env->num_call = 0;
935 env->num_mem = 0;
936 #ifdef USE_NAMED_GROUP
937 env->num_named = 0;
938 #endif
939 env->mem_alloc = 0;
940 env->mem_nodes_dynamic = (Node** )NULL;
941
942 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
943 env->mem_nodes_static[i] = NULL_NODE;
944
945 #ifdef USE_COMBINATION_EXPLOSION_CHECK
946 env->num_comb_exp_check = 0;
947 env->comb_exp_max_regnum = 0;
948 env->curr_max_regnum = 0;
949 env->has_recursion = 0;
950 #endif
951 }
952
953 static int
scan_env_add_mem_entry(ScanEnv * env)954 scan_env_add_mem_entry(ScanEnv* env)
955 {
956 int i, need, alloc;
957 Node** p;
958
959 need = env->num_mem + 1;
960 if (need >= SCANENV_MEMNODES_SIZE) {
961 if (env->mem_alloc <= need) {
962 if (IS_NULL(env->mem_nodes_dynamic)) {
963 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
964 p = (Node** )xmalloc(sizeof(Node*) * alloc);
965 xmemcpy(p, env->mem_nodes_static,
966 sizeof(Node*) * SCANENV_MEMNODES_SIZE);
967 }
968 else {
969 alloc = env->mem_alloc * 2;
970 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
971 }
972 CHECK_NULL_RETURN_MEMERR(p);
973
974 for (i = env->num_mem + 1; i < alloc; i++)
975 p[i] = NULL_NODE;
976
977 env->mem_nodes_dynamic = p;
978 env->mem_alloc = alloc;
979 }
980 }
981
982 env->num_mem++;
983 return env->num_mem;
984 }
985
986 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)987 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
988 {
989 if (env->num_mem >= num)
990 SCANENV_MEM_NODES(env)[num] = node;
991 else
992 return ONIGERR_PARSER_BUG;
993 return 0;
994 }
995
996
997 #ifdef USE_PARSE_TREE_NODE_RECYCLE
998 typedef struct _FreeNode {
999 struct _FreeNode* next;
1000 } FreeNode;
1001
1002 static FreeNode* FreeNodeList = (FreeNode* )NULL;
1003 #endif
1004
1005 extern void
onig_node_free(Node * node)1006 onig_node_free(Node* node)
1007 {
1008 start:
1009 if (IS_NULL(node)) return ;
1010
1011 switch (NTYPE(node)) {
1012 case NT_STR:
1013 if (NSTR(node)->capa != 0 &&
1014 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1015 xfree(NSTR(node)->s);
1016 }
1017 break;
1018
1019 case NT_LIST:
1020 case NT_ALT:
1021 onig_node_free(NCAR(node));
1022 {
1023 Node* next_node = NCDR(node);
1024
1025 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1026 {
1027 FreeNode* n = (FreeNode* )node;
1028
1029 THREAD_ATOMIC_START;
1030 n->next = FreeNodeList;
1031 FreeNodeList = n;
1032 THREAD_ATOMIC_END;
1033 }
1034 #else
1035 xfree(node);
1036 #endif
1037 node = next_node;
1038 goto start;
1039 }
1040 break;
1041
1042 case NT_CCLASS:
1043 {
1044 CClassNode* cc = NCCLASS(node);
1045
1046 if (IS_NCCLASS_SHARE(cc)) return ;
1047 if (cc->mbuf)
1048 bbuf_free(cc->mbuf);
1049 }
1050 break;
1051
1052 case NT_QTFR:
1053 if (NQTFR(node)->target)
1054 onig_node_free(NQTFR(node)->target);
1055 break;
1056
1057 case NT_ENCLOSE:
1058 if (NENCLOSE(node)->target)
1059 onig_node_free(NENCLOSE(node)->target);
1060 break;
1061
1062 case NT_BREF:
1063 if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1064 xfree(NBREF(node)->back_dynamic);
1065 break;
1066
1067 case NT_ANCHOR:
1068 if (NANCHOR(node)->target)
1069 onig_node_free(NANCHOR(node)->target);
1070 break;
1071 }
1072
1073 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1074 {
1075 FreeNode* n = (FreeNode* )node;
1076
1077 THREAD_ATOMIC_START;
1078 n->next = FreeNodeList;
1079 FreeNodeList = n;
1080 THREAD_ATOMIC_END;
1081 }
1082 #else
1083 xfree(node);
1084 #endif
1085 }
1086
1087 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1088 extern int
onig_free_node_list(void)1089 onig_free_node_list(void)
1090 {
1091 FreeNode* n;
1092
1093 /* THREAD_ATOMIC_START; */
1094 while (IS_NOT_NULL(FreeNodeList)) {
1095 n = FreeNodeList;
1096 FreeNodeList = FreeNodeList->next;
1097 xfree(n);
1098 }
1099 /* THREAD_ATOMIC_END; */
1100 return 0;
1101 }
1102 #endif
1103
1104 static Node*
node_new(void)1105 node_new(void)
1106 {
1107 Node* node;
1108
1109 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1110 THREAD_ATOMIC_START;
1111 if (IS_NOT_NULL(FreeNodeList)) {
1112 node = (Node* )FreeNodeList;
1113 FreeNodeList = FreeNodeList->next;
1114 THREAD_ATOMIC_END;
1115 return node;
1116 }
1117 THREAD_ATOMIC_END;
1118 #endif
1119
1120 node = (Node* )xmalloc(sizeof(Node));
1121 /* xmemset(node, 0, sizeof(Node)); */
1122 return node;
1123 }
1124
1125
1126 static void
initialize_cclass(CClassNode * cc)1127 initialize_cclass(CClassNode* cc)
1128 {
1129 BITSET_CLEAR(cc->bs);
1130 /* cc->base.flags = 0; */
1131 cc->flags = 0;
1132 cc->mbuf = NULL;
1133 }
1134
1135 static Node*
node_new_cclass(void)1136 node_new_cclass(void)
1137 {
1138 Node* node = node_new();
1139 CHECK_NULL_RETURN(node);
1140
1141 SET_NTYPE(node, NT_CCLASS);
1142 initialize_cclass(NCCLASS(node));
1143 return node;
1144 }
1145
1146 static Node*
node_new_cclass_by_codepoint_range(int not,OnigCodePoint sb_out,const OnigCodePoint ranges[])1147 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
1148 const OnigCodePoint ranges[])
1149 {
1150 int n, i;
1151 CClassNode* cc;
1152 OnigCodePoint j;
1153
1154 Node* node = node_new_cclass();
1155 CHECK_NULL_RETURN(node);
1156
1157 cc = NCCLASS(node);
1158 if (not != 0) NCCLASS_SET_NOT(cc);
1159
1160 BITSET_CLEAR(cc->bs);
1161 if (sb_out > 0 && IS_NOT_NULL(ranges)) {
1162 n = ONIGENC_CODE_RANGE_NUM(ranges);
1163 for (i = 0; i < n; i++) {
1164 for (j = ONIGENC_CODE_RANGE_FROM(ranges, i);
1165 j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
1166 if (j >= sb_out) goto sb_end;
1167
1168 BITSET_SET_BIT(cc->bs, j);
1169 }
1170 }
1171 }
1172
1173 sb_end:
1174 if (IS_NULL(ranges)) {
1175 is_null:
1176 cc->mbuf = NULL;
1177 }
1178 else {
1179 BBuf* bbuf;
1180
1181 n = ONIGENC_CODE_RANGE_NUM(ranges);
1182 if (n == 0) goto is_null;
1183
1184 bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1185 CHECK_NULL_RETURN(bbuf);
1186 bbuf->alloc = n + 1;
1187 bbuf->used = n + 1;
1188 bbuf->p = (UChar* )((void* )ranges);
1189
1190 cc->mbuf = bbuf;
1191 }
1192
1193 return node;
1194 }
1195
1196 static Node*
node_new_ctype(int type,int not)1197 node_new_ctype(int type, int not)
1198 {
1199 Node* node = node_new();
1200 CHECK_NULL_RETURN(node);
1201
1202 SET_NTYPE(node, NT_CTYPE);
1203 NCTYPE(node)->ctype = type;
1204 NCTYPE(node)->not = not;
1205 return node;
1206 }
1207
1208 static Node*
node_new_anychar(void)1209 node_new_anychar(void)
1210 {
1211 Node* node = node_new();
1212 CHECK_NULL_RETURN(node);
1213
1214 SET_NTYPE(node, NT_CANY);
1215 return node;
1216 }
1217
1218 static Node*
node_new_list(Node * left,Node * right)1219 node_new_list(Node* left, Node* right)
1220 {
1221 Node* node = node_new();
1222 CHECK_NULL_RETURN(node);
1223
1224 SET_NTYPE(node, NT_LIST);
1225 NCAR(node) = left;
1226 NCDR(node) = right;
1227 return node;
1228 }
1229
1230 extern Node*
onig_node_new_list(Node * left,Node * right)1231 onig_node_new_list(Node* left, Node* right)
1232 {
1233 return node_new_list(left, right);
1234 }
1235
1236 extern Node*
onig_node_list_add(Node * list,Node * x)1237 onig_node_list_add(Node* list, Node* x)
1238 {
1239 Node *n;
1240
1241 n = onig_node_new_list(x, NULL);
1242 if (IS_NULL(n)) return NULL_NODE;
1243
1244 if (IS_NOT_NULL(list)) {
1245 while (IS_NOT_NULL(NCDR(list)))
1246 list = NCDR(list);
1247
1248 NCDR(list) = n;
1249 }
1250
1251 return n;
1252 }
1253
1254 extern Node*
onig_node_new_alt(Node * left,Node * right)1255 onig_node_new_alt(Node* left, Node* right)
1256 {
1257 Node* node = node_new();
1258 CHECK_NULL_RETURN(node);
1259
1260 SET_NTYPE(node, NT_ALT);
1261 NCAR(node) = left;
1262 NCDR(node) = right;
1263 return node;
1264 }
1265
1266 extern Node*
onig_node_new_anchor(int type)1267 onig_node_new_anchor(int type)
1268 {
1269 Node* node = node_new();
1270 CHECK_NULL_RETURN(node);
1271
1272 SET_NTYPE(node, NT_ANCHOR);
1273 NANCHOR(node)->type = type;
1274 NANCHOR(node)->target = NULL;
1275 NANCHOR(node)->char_len = -1;
1276 return node;
1277 }
1278
1279 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)1280 node_new_backref(int back_num, int* backrefs, int by_name,
1281 #ifdef USE_BACKREF_WITH_LEVEL
1282 int exist_level, int nest_level,
1283 #endif
1284 ScanEnv* env)
1285 {
1286 int i;
1287 Node* node = node_new();
1288
1289 CHECK_NULL_RETURN(node);
1290
1291 SET_NTYPE(node, NT_BREF);
1292 NBREF(node)->state = 0;
1293 NBREF(node)->back_num = back_num;
1294 NBREF(node)->back_dynamic = (int* )NULL;
1295 if (by_name != 0)
1296 NBREF(node)->state |= NST_NAME_REF;
1297
1298 #ifdef USE_BACKREF_WITH_LEVEL
1299 if (exist_level != 0) {
1300 NBREF(node)->state |= NST_NEST_LEVEL;
1301 NBREF(node)->nest_level = nest_level;
1302 }
1303 #endif
1304
1305 for (i = 0; i < back_num; i++) {
1306 if (backrefs[i] <= env->num_mem &&
1307 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1308 NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */
1309 break;
1310 }
1311 }
1312
1313 if (back_num <= NODE_BACKREFS_SIZE) {
1314 for (i = 0; i < back_num; i++)
1315 NBREF(node)->back_static[i] = backrefs[i];
1316 }
1317 else {
1318 int* p = (int* )xmalloc(sizeof(int) * back_num);
1319 if (IS_NULL(p)) {
1320 onig_node_free(node);
1321 return NULL;
1322 }
1323 NBREF(node)->back_dynamic = p;
1324 for (i = 0; i < back_num; i++)
1325 p[i] = backrefs[i];
1326 }
1327 return node;
1328 }
1329
#ifdef USE_SUBEXP_CALL
/*
 * Allocate an NT_CALL node.  The name span [name, name_end) is stored
 * by pointer (not copied); `gnum` is stored as the group number.
 * Returns NULL when the node cannot be allocated.
 */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
  Node* call;

  call = node_new();
  CHECK_NULL_RETURN(call);

  SET_NTYPE(call, NT_CALL);
  NCALL(call)->group_num = gnum;  /* call by number if gnum != 0 */
  NCALL(call)->name      = name;
  NCALL(call)->name_end  = name_end;
  NCALL(call)->target    = NULL_NODE;
  NCALL(call)->state     = 0;
  return call;
}
#endif
1346
1347 static Node*
node_new_quantifier(int lower,int upper,int by_number)1348 node_new_quantifier(int lower, int upper, int by_number)
1349 {
1350 Node* node = node_new();
1351 CHECK_NULL_RETURN(node);
1352
1353 SET_NTYPE(node, NT_QTFR);
1354 NQTFR(node)->state = 0;
1355 NQTFR(node)->target = NULL;
1356 NQTFR(node)->lower = lower;
1357 NQTFR(node)->upper = upper;
1358 NQTFR(node)->greedy = 1;
1359 NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1360 NQTFR(node)->head_exact = NULL_NODE;
1361 NQTFR(node)->next_head_exact = NULL_NODE;
1362 NQTFR(node)->is_refered = 0;
1363 if (by_number != 0)
1364 NQTFR(node)->state |= NST_BY_NUMBER;
1365
1366 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1367 NQTFR(node)->comb_exp_check_num = 0;
1368 #endif
1369
1370 return node;
1371 }
1372
1373 static Node*
node_new_enclose(int type)1374 node_new_enclose(int type)
1375 {
1376 Node* node = node_new();
1377 CHECK_NULL_RETURN(node);
1378
1379 SET_NTYPE(node, NT_ENCLOSE);
1380 NENCLOSE(node)->type = type;
1381 NENCLOSE(node)->state = 0;
1382 NENCLOSE(node)->regnum = 0;
1383 NENCLOSE(node)->option = 0;
1384 NENCLOSE(node)->target = NULL;
1385 NENCLOSE(node)->call_addr = -1;
1386 NENCLOSE(node)->opt_count = 0;
1387 return node;
1388 }
1389
1390 extern Node*
onig_node_new_enclose(int type)1391 onig_node_new_enclose(int type)
1392 {
1393 return node_new_enclose(type);
1394 }
1395
1396 static Node*
node_new_enclose_memory(OnigOptionType option,int is_named)1397 node_new_enclose_memory(OnigOptionType option, int is_named)
1398 {
1399 Node* node = node_new_enclose(ENCLOSE_MEMORY);
1400 CHECK_NULL_RETURN(node);
1401 if (is_named != 0)
1402 SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1403
1404 #ifdef USE_SUBEXP_CALL
1405 NENCLOSE(node)->option = option;
1406 #endif
1407 return node;
1408 }
1409
1410 static Node*
node_new_option(OnigOptionType option)1411 node_new_option(OnigOptionType option)
1412 {
1413 Node* node = node_new_enclose(ENCLOSE_OPTION);
1414 CHECK_NULL_RETURN(node);
1415 NENCLOSE(node)->option = option;
1416 return node;
1417 }
1418
1419 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)1420 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1421 {
1422 int addlen = end - s;
1423
1424 if (addlen > 0) {
1425 int len = NSTR(node)->end - NSTR(node)->s;
1426
1427 if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1428 UChar* p;
1429 int capa = len + addlen + NODE_STR_MARGIN;
1430
1431 if (capa <= NSTR(node)->capa) {
1432 onig_strcpy(NSTR(node)->s + len, s, end);
1433 }
1434 else {
1435 if (NSTR(node)->s == NSTR(node)->buf)
1436 p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1437 s, end, capa);
1438 else
1439 p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
1440
1441 CHECK_NULL_RETURN_MEMERR(p);
1442 NSTR(node)->s = p;
1443 NSTR(node)->capa = capa;
1444 }
1445 }
1446 else {
1447 onig_strcpy(NSTR(node)->s + len, s, end);
1448 }
1449 NSTR(node)->end = NSTR(node)->s + len + addlen;
1450 }
1451
1452 return 0;
1453 }
1454
1455 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end)1456 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1457 {
1458 onig_node_str_clear(node);
1459 return onig_node_str_cat(node, s, end);
1460 }
1461
1462 static int
node_str_cat_char(Node * node,UChar c)1463 node_str_cat_char(Node* node, UChar c)
1464 {
1465 UChar s[1];
1466
1467 s[0] = c;
1468 return onig_node_str_cat(node, s, s + 1);
1469 }
1470
1471 extern void
onig_node_conv_to_str_node(Node * node,int flag)1472 onig_node_conv_to_str_node(Node* node, int flag)
1473 {
1474 SET_NTYPE(node, NT_STR);
1475 NSTR(node)->flag = flag;
1476 NSTR(node)->capa = 0;
1477 NSTR(node)->s = NSTR(node)->buf;
1478 NSTR(node)->end = NSTR(node)->buf;
1479 }
1480
1481 extern void
onig_node_str_clear(Node * node)1482 onig_node_str_clear(Node* node)
1483 {
1484 if (NSTR(node)->capa != 0 &&
1485 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1486 xfree(NSTR(node)->s);
1487 }
1488
1489 NSTR(node)->capa = 0;
1490 NSTR(node)->flag = 0;
1491 NSTR(node)->s = NSTR(node)->buf;
1492 NSTR(node)->end = NSTR(node)->buf;
1493 }
1494
1495 static Node*
node_new_str(const UChar * s,const UChar * end)1496 node_new_str(const UChar* s, const UChar* end)
1497 {
1498 Node* node = node_new();
1499 CHECK_NULL_RETURN(node);
1500
1501 SET_NTYPE(node, NT_STR);
1502 NSTR(node)->capa = 0;
1503 NSTR(node)->flag = 0;
1504 NSTR(node)->s = NSTR(node)->buf;
1505 NSTR(node)->end = NSTR(node)->buf;
1506 if (onig_node_str_cat(node, s, end)) {
1507 onig_node_free(node);
1508 return NULL;
1509 }
1510 return node;
1511 }
1512
1513 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)1514 onig_node_new_str(const UChar* s, const UChar* end)
1515 {
1516 return node_new_str(s, end);
1517 }
1518
1519 static Node*
node_new_str_raw(UChar * s,UChar * end)1520 node_new_str_raw(UChar* s, UChar* end)
1521 {
1522 Node* node = node_new_str(s, end);
1523 NSTRING_SET_RAW(node);
1524 return node;
1525 }
1526
1527 static Node*
node_new_empty(void)1528 node_new_empty(void)
1529 {
1530 return node_new_str(NULL, NULL);
1531 }
1532
1533 static Node*
node_new_str_raw_char(UChar c)1534 node_new_str_raw_char(UChar c)
1535 {
1536 UChar p[1];
1537
1538 p[0] = c;
1539 return node_new_str_raw(p, p + 1);
1540 }
1541
1542 static Node*
str_node_split_last_char(StrNode * sn,OnigEncoding enc)1543 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1544 {
1545 const UChar *p;
1546 Node* n = NULL_NODE;
1547
1548 if (sn->end > sn->s) {
1549 p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1550 if (p && p > sn->s) { /* can be splitted. */
1551 n = node_new_str(p, sn->end);
1552 if ((sn->flag & NSTR_RAW) != 0)
1553 NSTRING_SET_RAW(n);
1554 sn->end = (UChar* )p;
1555 }
1556 }
1557 return n;
1558 }
1559
1560 static int
str_node_can_be_split(StrNode * sn,OnigEncoding enc)1561 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1562 {
1563 if (sn->end > sn->s) {
1564 return ((enclen(enc, sn->s) < sn->end - sn->s) ? 1 : 0);
1565 }
1566 return 0;
1567 }
1568
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/*
 * Insert `num` copies of the byte `val` in front of the string `sn`:
 * the current content is shifted right by `num` bytes (via a temporary
 * copy) and the freed head is filled with `val`.
 *
 * Always returns 0.
 *
 * NOTE(review): no bounds are checked here; this assumes the current
 * content fits in NODE_STR_BUF_SIZE bytes and that the storage behind
 * sn->s has room for `num` more bytes -- confirm against the callers.
 */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i, len;

  len = sn->end - sn->s;
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }

  /* the function is declared int: give callers a defined value */
  return 0;
}
#endif
1586
1587 extern int
onig_scan_unsigned_number(UChar ** src,const UChar * end,OnigEncoding enc)1588 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1589 {
1590 unsigned int num, val;
1591 OnigCodePoint c;
1592 UChar* p = *src;
1593 PFETCH_READY;
1594
1595 num = 0;
1596 while (!PEND) {
1597 PFETCH(c);
1598 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1599 val = (unsigned int )DIGITVAL(c);
1600 if ((INT_MAX_LIMIT - val) / 10UL < num)
1601 return -1; /* overflow */
1602
1603 num = num * 10 + val;
1604 }
1605 else {
1606 PUNFETCH;
1607 break;
1608 }
1609 }
1610 *src = p;
1611 return num;
1612 }
1613
1614 static int
scan_unsigned_hexadecimal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1615 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1616 OnigEncoding enc)
1617 {
1618 OnigCodePoint c;
1619 unsigned int num, val;
1620 UChar* p = *src;
1621 PFETCH_READY;
1622
1623 num = 0;
1624 while (!PEND && maxlen-- != 0) {
1625 PFETCH(c);
1626 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1627 val = (unsigned int )XDIGITVAL(enc,c);
1628 if ((INT_MAX_LIMIT - val) / 16UL < num)
1629 return -1; /* overflow */
1630
1631 num = (num << 4) + XDIGITVAL(enc,c);
1632 }
1633 else {
1634 PUNFETCH;
1635 break;
1636 }
1637 }
1638 *src = p;
1639 return num;
1640 }
1641
1642 static int
scan_unsigned_octal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1643 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1644 OnigEncoding enc)
1645 {
1646 OnigCodePoint c;
1647 unsigned int num, val;
1648 UChar* p = *src;
1649 PFETCH_READY;
1650
1651 num = 0;
1652 while (!PEND && maxlen-- != 0) {
1653 PFETCH(c);
1654 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1655 val = ODIGITVAL(c);
1656 if ((INT_MAX_LIMIT - val) / 8UL < num)
1657 return -1; /* overflow */
1658
1659 num = (num << 3) + val;
1660 }
1661 else {
1662 PUNFETCH;
1663 break;
1664 }
1665 }
1666 *src = p;
1667 return num;
1668 }
1669
1670
/* Write one code point at byte offset `pos` of `bbuf`.
   `code` must be an lvalue because its address is taken. */
#define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
    BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1673
1674 /* data format:
1675 [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1676 (all data size is OnigCodePoint)
1677 */
1678 static int
new_code_range(BBuf ** pbuf)1679 new_code_range(BBuf** pbuf)
1680 {
1681 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
1682 int r;
1683 OnigCodePoint n;
1684 BBuf* bbuf;
1685
1686 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1687 CHECK_NULL_RETURN_MEMERR(*pbuf);
1688 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1689 if (r) return r;
1690
1691 n = 0;
1692 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1693 return 0;
1694 }
1695
1696 static int
add_code_range_to_buf(BBuf ** pbuf,OnigCodePoint from,OnigCodePoint to)1697 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
1698 {
1699 int r, inc_n, pos;
1700 int low, high, bound, x;
1701 OnigCodePoint n, *data;
1702 BBuf* bbuf;
1703
1704 if (from > to) {
1705 n = from; from = to; to = n;
1706 }
1707
1708 if (IS_NULL(*pbuf)) {
1709 r = new_code_range(pbuf);
1710 if (r) return r;
1711 bbuf = *pbuf;
1712 n = 0;
1713 }
1714 else {
1715 bbuf = *pbuf;
1716 GET_CODE_POINT(n, bbuf->p);
1717 }
1718 data = (OnigCodePoint* )(bbuf->p);
1719 data++;
1720
1721 for (low = 0, bound = n; low < bound; ) {
1722 x = (low + bound) >> 1;
1723 if (from > data[x*2 + 1])
1724 low = x + 1;
1725 else
1726 bound = x;
1727 }
1728
1729 for (high = low, bound = n; high < bound; ) {
1730 x = (high + bound) >> 1;
1731 if (to >= data[x*2] - 1)
1732 high = x + 1;
1733 else
1734 bound = x;
1735 }
1736
1737 inc_n = low + 1 - high;
1738 if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1739 return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1740
1741 if (inc_n != 1) {
1742 if (from > data[low*2])
1743 from = data[low*2];
1744 if (to < data[(high - 1)*2 + 1])
1745 to = data[(high - 1)*2 + 1];
1746 }
1747
1748 if (inc_n != 0 && (OnigCodePoint )high < n) {
1749 int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1750 int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1751 int size = (n - high) * 2 * SIZE_CODE_POINT;
1752
1753 if (inc_n > 0) {
1754 BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1755 }
1756 else {
1757 BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1758 }
1759 }
1760
1761 pos = SIZE_CODE_POINT * (1 + low * 2);
1762 BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1763 BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1764 BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1765 n += inc_n;
1766 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1767
1768 return 0;
1769 }
1770
1771 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)1772 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1773 {
1774 if (from > to) {
1775 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1776 return 0;
1777 else
1778 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1779 }
1780
1781 return add_code_range_to_buf(pbuf, from, to);
1782 }
1783
1784 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)1785 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1786 {
1787 int r, i, n;
1788 OnigCodePoint pre, from, *data, to = 0;
1789
1790 *pbuf = (BBuf* )NULL;
1791 if (IS_NULL(bbuf)) {
1792 set_all:
1793 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1794 }
1795
1796 data = (OnigCodePoint* )(bbuf->p);
1797 GET_CODE_POINT(n, data);
1798 data++;
1799 if (n <= 0) goto set_all;
1800
1801 r = 0;
1802 pre = MBCODE_START_POS(enc);
1803 for (i = 0; i < n; i++) {
1804 from = data[i*2];
1805 to = data[i*2+1];
1806 if (pre <= from - 1) {
1807 r = add_code_range_to_buf(pbuf, pre, from - 1);
1808 if (r != 0) return r;
1809 }
1810 if (to == ~((OnigCodePoint )0)) break;
1811 pre = to + 1;
1812 }
1813 if (to < ~((OnigCodePoint )0)) {
1814 r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1815 }
1816 return r;
1817 }
1818
/* Exchange two (range buffer, negation flag) operand pairs in place. */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *tbuf; \
  int  tnot; \
  tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
  tnot = not1;  not1  = not2;  not2  = tnot; \
} while (0)
1825
1826 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1827 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1828 BBuf* bbuf2, int not2, BBuf** pbuf)
1829 {
1830 int r;
1831 OnigCodePoint i, n1, *data1;
1832 OnigCodePoint from, to;
1833
1834 *pbuf = (BBuf* )NULL;
1835 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1836 if (not1 != 0 || not2 != 0)
1837 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1838 return 0;
1839 }
1840
1841 r = 0;
1842 if (IS_NULL(bbuf2))
1843 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1844
1845 if (IS_NULL(bbuf1)) {
1846 if (not1 != 0) {
1847 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1848 }
1849 else {
1850 if (not2 == 0) {
1851 return bbuf_clone(pbuf, bbuf2);
1852 }
1853 else {
1854 return not_code_range_buf(enc, bbuf2, pbuf);
1855 }
1856 }
1857 }
1858
1859 if (not1 != 0)
1860 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1861
1862 data1 = (OnigCodePoint* )(bbuf1->p);
1863 GET_CODE_POINT(n1, data1);
1864 data1++;
1865
1866 if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1867 r = bbuf_clone(pbuf, bbuf2);
1868 }
1869 else if (not1 == 0) { /* 1 OR (not 2) */
1870 r = not_code_range_buf(enc, bbuf2, pbuf);
1871 }
1872 if (r != 0) return r;
1873
1874 for (i = 0; i < n1; i++) {
1875 from = data1[i*2];
1876 to = data1[i*2+1];
1877 r = add_code_range_to_buf(pbuf, from, to);
1878 if (r != 0) return r;
1879 }
1880 return 0;
1881 }
1882
1883 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)1884 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1885 OnigCodePoint* data, int n)
1886 {
1887 int i, r;
1888 OnigCodePoint from2, to2;
1889
1890 for (i = 0; i < n; i++) {
1891 from2 = data[i*2];
1892 to2 = data[i*2+1];
1893 if (from2 < from1) {
1894 if (to2 < from1) continue;
1895 else {
1896 from1 = to2 + 1;
1897 }
1898 }
1899 else if (from2 <= to1) {
1900 if (to2 < to1) {
1901 if (from1 <= from2 - 1) {
1902 r = add_code_range_to_buf(pbuf, from1, from2-1);
1903 if (r != 0) return r;
1904 }
1905 from1 = to2 + 1;
1906 }
1907 else {
1908 to1 = from2 - 1;
1909 }
1910 }
1911 else {
1912 from1 = from2;
1913 }
1914 if (from1 > to1) break;
1915 }
1916 if (from1 <= to1) {
1917 r = add_code_range_to_buf(pbuf, from1, to1);
1918 if (r != 0) return r;
1919 }
1920 return 0;
1921 }
1922
1923 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1924 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1925 {
1926 int r;
1927 OnigCodePoint i, j, n1, n2, *data1, *data2;
1928 OnigCodePoint from, to, from1, to1, from2, to2;
1929
1930 *pbuf = (BBuf* )NULL;
1931 if (IS_NULL(bbuf1)) {
1932 if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1933 return bbuf_clone(pbuf, bbuf2);
1934 return 0;
1935 }
1936 else if (IS_NULL(bbuf2)) {
1937 if (not2 != 0)
1938 return bbuf_clone(pbuf, bbuf1);
1939 return 0;
1940 }
1941
1942 if (not1 != 0)
1943 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1944
1945 data1 = (OnigCodePoint* )(bbuf1->p);
1946 data2 = (OnigCodePoint* )(bbuf2->p);
1947 GET_CODE_POINT(n1, data1);
1948 GET_CODE_POINT(n2, data2);
1949 data1++;
1950 data2++;
1951
1952 if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1953 for (i = 0; i < n1; i++) {
1954 from1 = data1[i*2];
1955 to1 = data1[i*2+1];
1956 for (j = 0; j < n2; j++) {
1957 from2 = data2[j*2];
1958 to2 = data2[j*2+1];
1959 if (from2 > to1) break;
1960 if (to2 < from1) continue;
1961 from = MAX(from1, from2);
1962 to = MIN(to1, to2);
1963 r = add_code_range_to_buf(pbuf, from, to);
1964 if (r != 0) return r;
1965 }
1966 }
1967 }
1968 else if (not1 == 0) { /* 1 AND (not 2) */
1969 for (i = 0; i < n1; i++) {
1970 from1 = data1[i*2];
1971 to1 = data1[i*2+1];
1972 r = and_code_range1(pbuf, from1, to1, data2, n2);
1973 if (r != 0) return r;
1974 }
1975 }
1976
1977 return 0;
1978 }
1979
1980 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1981 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1982 {
1983 int r, not1, not2;
1984 BBuf *buf1, *buf2, *pbuf;
1985 BitSetRef bsr1, bsr2;
1986 BitSet bs1, bs2;
1987
1988 not1 = IS_NCCLASS_NOT(dest);
1989 bsr1 = dest->bs;
1990 buf1 = dest->mbuf;
1991 not2 = IS_NCCLASS_NOT(cc);
1992 bsr2 = cc->bs;
1993 buf2 = cc->mbuf;
1994
1995 if (not1 != 0) {
1996 bitset_invert_to(bsr1, bs1);
1997 bsr1 = bs1;
1998 }
1999 if (not2 != 0) {
2000 bitset_invert_to(bsr2, bs2);
2001 bsr2 = bs2;
2002 }
2003 bitset_and(bsr1, bsr2);
2004 if (bsr1 != dest->bs) {
2005 bitset_copy(dest->bs, bsr1);
2006 bsr1 = dest->bs;
2007 }
2008 if (not1 != 0) {
2009 bitset_invert(dest->bs);
2010 }
2011
2012 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2013 if (not1 != 0 && not2 != 0) {
2014 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
2015 }
2016 else {
2017 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
2018 if (r == 0 && not1 != 0) {
2019 BBuf *tbuf;
2020 r = not_code_range_buf(enc, pbuf, &tbuf);
2021 if (r != 0) {
2022 bbuf_free(pbuf);
2023 return r;
2024 }
2025 bbuf_free(pbuf);
2026 pbuf = tbuf;
2027 }
2028 }
2029 if (r != 0) return r;
2030
2031 dest->mbuf = pbuf;
2032 bbuf_free(buf1);
2033 return r;
2034 }
2035 return 0;
2036 }
2037
2038 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)2039 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2040 {
2041 int r, not1, not2;
2042 BBuf *buf1, *buf2, *pbuf;
2043 BitSetRef bsr1, bsr2;
2044 BitSet bs1, bs2;
2045
2046 not1 = IS_NCCLASS_NOT(dest);
2047 bsr1 = dest->bs;
2048 buf1 = dest->mbuf;
2049 not2 = IS_NCCLASS_NOT(cc);
2050 bsr2 = cc->bs;
2051 buf2 = cc->mbuf;
2052
2053 if (not1 != 0) {
2054 bitset_invert_to(bsr1, bs1);
2055 bsr1 = bs1;
2056 }
2057 if (not2 != 0) {
2058 bitset_invert_to(bsr2, bs2);
2059 bsr2 = bs2;
2060 }
2061 bitset_or(bsr1, bsr2);
2062 if (bsr1 != dest->bs) {
2063 bitset_copy(dest->bs, bsr1);
2064 bsr1 = dest->bs;
2065 }
2066 if (not1 != 0) {
2067 bitset_invert(dest->bs);
2068 }
2069
2070 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2071 if (not1 != 0 && not2 != 0) {
2072 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2073 }
2074 else {
2075 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2076 if (r == 0 && not1 != 0) {
2077 BBuf *tbuf;
2078 r = not_code_range_buf(enc, pbuf, &tbuf);
2079 if (r != 0) {
2080 bbuf_free(pbuf);
2081 return r;
2082 }
2083 bbuf_free(pbuf);
2084 pbuf = tbuf;
2085 }
2086 }
2087 if (r != 0) return r;
2088
2089 dest->mbuf = pbuf;
2090 bbuf_free(buf1);
2091 return r;
2092 }
2093 else
2094 return 0;
2095 }
2096
2097 static int
conv_backslash_value(int c,ScanEnv * env)2098 conv_backslash_value(int c, ScanEnv* env)
2099 {
2100 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2101 switch (c) {
2102 case 'n': return '\n';
2103 case 't': return '\t';
2104 case 'r': return '\r';
2105 case 'f': return '\f';
2106 case 'a': return '\007';
2107 case 'b': return '\010';
2108 case 'e': return '\033';
2109 case 'v':
2110 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2111 return '\v';
2112 break;
2113
2114 default:
2115 break;
2116 }
2117 }
2118 return c;
2119 }
2120
2121 static int
is_invalid_quantifier_target(Node * node)2122 is_invalid_quantifier_target(Node* node)
2123 {
2124 switch (NTYPE(node)) {
2125 case NT_ANCHOR:
2126 return 1;
2127 break;
2128
2129 case NT_ENCLOSE:
2130 /* allow enclosed elements */
2131 /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
2132 break;
2133
2134 case NT_LIST:
2135 do {
2136 if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2137 } while (IS_NOT_NULL(node = NCDR(node)));
2138 return 0;
2139 break;
2140
2141 case NT_ALT:
2142 do {
2143 if (is_invalid_quantifier_target(NCAR(node))) return 1;
2144 } while (IS_NOT_NULL(node = NCDR(node)));
2145 break;
2146
2147 default:
2148 break;
2149 }
2150 return 0;
2151 }
2152
2153 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2154 static int
popular_quantifier_num(QtfrNode * q)2155 popular_quantifier_num(QtfrNode* q)
2156 {
2157 if (q->greedy) {
2158 if (q->lower == 0) {
2159 if (q->upper == 1) return 0;
2160 else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2161 }
2162 else if (q->lower == 1) {
2163 if (IS_REPEAT_INFINITE(q->upper)) return 2;
2164 }
2165 }
2166 else {
2167 if (q->lower == 0) {
2168 if (q->upper == 1) return 3;
2169 else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2170 }
2171 else if (q->lower == 1) {
2172 if (IS_REPEAT_INFINITE(q->upper)) return 5;
2173 }
2174 }
2175 return -1;
2176 }
2177
2178
/* Rewrite applied to a quantifier directly nested in another one. */
enum ReduceType {
  RQ_ASIS = 0,  /* as is */
  RQ_DEL  = 1,  /* delete parent */
  RQ_A    = 2,  /* to '*'    */
  RQ_AQ   = 3,  /* to '*?'   */
  RQ_QQ   = 4,  /* to '??'   */
  RQ_P_QQ = 5,  /* to '+)??' */
  RQ_PQ_Q = 6   /* to '+?)?' */
};
2188
2189 static enum ReduceType ReduceTypeTable[6][6] = {
2190 {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */
2191 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */
2192 {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */
2193 {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */
2194 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */
2195 {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */
2196 };
2197
2198 extern void
onig_reduce_nested_quantifier(Node * pnode,Node * cnode)2199 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2200 {
2201 int pnum, cnum;
2202 QtfrNode *p, *c;
2203
2204 p = NQTFR(pnode);
2205 c = NQTFR(cnode);
2206 pnum = popular_quantifier_num(p);
2207 cnum = popular_quantifier_num(c);
2208 if (pnum < 0 || cnum < 0) return ;
2209
2210 switch(ReduceTypeTable[cnum][pnum]) {
2211 case RQ_DEL:
2212 *pnode = *cnode;
2213 break;
2214 case RQ_A:
2215 p->target = c->target;
2216 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
2217 break;
2218 case RQ_AQ:
2219 p->target = c->target;
2220 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
2221 break;
2222 case RQ_QQ:
2223 p->target = c->target;
2224 p->lower = 0; p->upper = 1; p->greedy = 0;
2225 break;
2226 case RQ_P_QQ:
2227 p->target = cnode;
2228 p->lower = 0; p->upper = 1; p->greedy = 0;
2229 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
2230 return ;
2231 break;
2232 case RQ_PQ_Q:
2233 p->target = cnode;
2234 p->lower = 0; p->upper = 1; p->greedy = 1;
2235 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
2236 return ;
2237 break;
2238 case RQ_ASIS:
2239 p->target = cnode;
2240 return ;
2241 break;
2242 }
2243
2244 c->target = NULL_NODE;
2245 onig_node_free(cnode);
2246 }
2247
2248
/* Token kinds stored in OnigToken.type. */
enum TokenSyms {
  TK_EOT                = 0,   /* end of token */
  TK_RAW_BYTE           = 1,
  TK_CHAR               = 2,
  TK_STRING             = 3,
  TK_CODE_POINT         = 4,
  TK_ANYCHAR            = 5,
  TK_CHAR_TYPE          = 6,
  TK_BACKREF            = 7,
  TK_CALL               = 8,
  TK_ANCHOR             = 9,
  TK_OP_REPEAT          = 10,
  TK_INTERVAL           = 11,
  TK_ANYCHAR_ANYTIME    = 12,  /* SQL '%' == .* */
  TK_ALT                = 13,
  TK_SUBEXP_OPEN        = 14,
  TK_SUBEXP_CLOSE       = 15,
  TK_CC_OPEN            = 16,
  TK_QUOTE_OPEN         = 17,
  TK_CHAR_PROPERTY      = 18,  /* \p{...}, \P{...} */
  /* in cc */
  TK_CC_CLOSE           = 19,
  TK_CC_RANGE           = 20,
  TK_POSIX_BRACKET_OPEN = 21,
  TK_CC_AND             = 22,  /* && */
  TK_CC_CC_OPEN         = 23   /* [ */
};
2276
2277 typedef struct {
2278 enum TokenSyms type;
2279 int escaped;
2280 int base; /* is number: 8, 16 (used in [....]) */
2281 UChar* backp;
2282 union {
2283 UChar* s;
2284 int c;
2285 OnigCodePoint code;
2286 int anchor;
2287 int subtype;
2288 struct {
2289 int lower;
2290 int upper;
2291 int greedy;
2292 int possessive;
2293 } repeat;
2294 struct {
2295 int num;
2296 int ref1;
2297 int* refs;
2298 int by_name;
2299 #ifdef USE_BACKREF_WITH_LEVEL
2300 int exist_level;
2301 int level; /* \k<name+n> */
2302 #endif
2303 } backref;
2304 struct {
2305 UChar* name;
2306 UChar* name_end;
2307 int gnum;
2308 } call;
2309 struct {
2310 int ctype;
2311 int not;
2312 } prop;
2313 } u;
2314 } OnigToken;
2315
2316
2317 static int
fetch_range_quantifier(UChar ** src,UChar * end,OnigToken * tok,ScanEnv * env)2318 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2319 {
2320 int low, up, syn_allow, non_low = 0;
2321 int r = 0;
2322 OnigCodePoint c;
2323 OnigEncoding enc = env->enc;
2324 UChar* p = *src;
2325 PFETCH_READY;
2326
2327 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2328
2329 if (PEND) {
2330 if (syn_allow)
2331 return 1; /* "....{" : OK! */
2332 else
2333 return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */
2334 }
2335
2336 if (! syn_allow) {
2337 c = PPEEK;
2338 if (c == ')' || c == '(' || c == '|') {
2339 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2340 }
2341 }
2342
2343 low = onig_scan_unsigned_number(&p, end, env->enc);
2344 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2345 if (low > ONIG_MAX_REPEAT_NUM)
2346 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2347
2348 if (p == *src) { /* can't read low */
2349 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2350 /* allow {,n} as {0,n} */
2351 low = 0;
2352 non_low = 1;
2353 }
2354 else
2355 goto invalid;
2356 }
2357
2358 if (PEND) goto invalid;
2359 PFETCH(c);
2360 if (c == ',') {
2361 UChar* prev = p;
2362 up = onig_scan_unsigned_number(&p, end, env->enc);
2363 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2364 if (up > ONIG_MAX_REPEAT_NUM)
2365 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2366
2367 if (p == prev) {
2368 if (non_low != 0)
2369 goto invalid;
2370 up = REPEAT_INFINITE; /* {n,} : {n,infinite} */
2371 }
2372 }
2373 else {
2374 if (non_low != 0)
2375 goto invalid;
2376
2377 PUNFETCH;
2378 up = low; /* {n} : exact n times */
2379 r = 2; /* fixed */
2380 }
2381
2382 if (PEND) goto invalid;
2383 PFETCH(c);
2384 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2385 if (c != MC_ESC(env->syntax)) goto invalid;
2386 PFETCH(c);
2387 }
2388 if (c != '}') goto invalid;
2389
2390 if (!IS_REPEAT_INFINITE(up) && low > up) {
2391 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2392 }
2393
2394 tok->type = TK_INTERVAL;
2395 tok->u.repeat.lower = low;
2396 tok->u.repeat.upper = up;
2397 *src = p;
2398 return r; /* 0: normal {n,m}, 2: fixed {n} */
2399
2400 invalid:
2401 if (syn_allow)
2402 return 1; /* OK */
2403 else
2404 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2405 }
2406
2407 /* \M-, \C-, \c, or \... */
2408 static int
fetch_escaped_value(UChar ** src,UChar * end,ScanEnv * env)2409 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
2410 {
2411 int v;
2412 OnigCodePoint c;
2413 OnigEncoding enc = env->enc;
2414 UChar* p = *src;
2415
2416 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2417
2418 PFETCH_S(c);
2419 switch (c) {
2420 case 'M':
2421 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2422 if (PEND) return ONIGERR_END_PATTERN_AT_META;
2423 PFETCH_S(c);
2424 if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2425 if (PEND) return ONIGERR_END_PATTERN_AT_META;
2426 PFETCH_S(c);
2427 if (c == MC_ESC(env->syntax)) {
2428 v = fetch_escaped_value(&p, end, env);
2429 if (v < 0) return v;
2430 c = (OnigCodePoint )v;
2431 }
2432 c = ((c & 0xff) | 0x80);
2433 }
2434 else
2435 goto backslash;
2436 break;
2437
2438 case 'C':
2439 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2440 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2441 PFETCH_S(c);
2442 if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2443 goto control;
2444 }
2445 else
2446 goto backslash;
2447
2448 case 'c':
2449 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2450 control:
2451 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2452 PFETCH_S(c);
2453 if (c == '?') {
2454 c = 0177;
2455 }
2456 else {
2457 if (c == MC_ESC(env->syntax)) {
2458 v = fetch_escaped_value(&p, end, env);
2459 if (v < 0) return v;
2460 c = (OnigCodePoint )v;
2461 }
2462 c &= 0x9f;
2463 }
2464 break;
2465 }
2466 /* fall through */
2467
2468 default:
2469 {
2470 backslash:
2471 c = conv_backslash_value(c, env);
2472 }
2473 break;
2474 }
2475
2476 *src = p;
2477 return c;
2478 }
2479
2480 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2481
2482 static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)2483 get_name_end_code_point(OnigCodePoint start)
2484 {
2485 switch (start) {
2486 case '<': return (OnigCodePoint )'>'; break;
2487 case '\'': return (OnigCodePoint )'\''; break;
2488 default:
2489 break;
2490 }
2491
2492 return (OnigCodePoint )0;
2493 }
2494
2495 #ifdef USE_NAMED_GROUP
2496 #ifdef USE_BACKREF_WITH_LEVEL
2497 /*
2498 \k<name+n>, \k<name-n>
2499 \k<num+n>, \k<num-n>
2500 \k<-num+n>, \k<-num-n>
2501 */
2502 static int
fetch_name_with_level(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int * rlevel)2503 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
2504 UChar** rname_end, ScanEnv* env,
2505 int* rback_num, int* rlevel)
2506 {
2507 int r, sign, is_num, exist_level;
2508 OnigCodePoint end_code;
2509 OnigCodePoint c = 0;
2510 OnigEncoding enc = env->enc;
2511 UChar *name_end;
2512 UChar *pnum_head;
2513 UChar *p = *src;
2514 PFETCH_READY;
2515
2516 *rback_num = 0;
2517 is_num = exist_level = 0;
2518 sign = 1;
2519 pnum_head = *src;
2520
2521 end_code = get_name_end_code_point(start_code);
2522
2523 name_end = end;
2524 r = 0;
2525 if (PEND) {
2526 return ONIGERR_EMPTY_GROUP_NAME;
2527 }
2528 else {
2529 PFETCH(c);
2530 if (c == end_code)
2531 return ONIGERR_EMPTY_GROUP_NAME;
2532
2533 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2534 is_num = 1;
2535 }
2536 else if (c == '-') {
2537 is_num = 2;
2538 sign = -1;
2539 pnum_head = p;
2540 }
2541 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2542 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2543 }
2544 }
2545
2546 while (!PEND) {
2547 name_end = p;
2548 PFETCH(c);
2549 if (c == end_code || c == ')' || c == '+' || c == '-') {
2550 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
2551 break;
2552 }
2553
2554 if (is_num != 0) {
2555 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2556 is_num = 1;
2557 }
2558 else {
2559 r = ONIGERR_INVALID_GROUP_NAME;
2560 is_num = 0;
2561 }
2562 }
2563 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2564 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2565 }
2566 }
2567
2568 if (r == 0 && c != end_code) {
2569 if (c == '+' || c == '-') {
2570 int level;
2571 int flag = (c == '-' ? -1 : 1);
2572
2573 PFETCH(c);
2574 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2575 PUNFETCH;
2576 level = onig_scan_unsigned_number(&p, end, enc);
2577 if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
2578 *rlevel = (level * flag);
2579 exist_level = 1;
2580
2581 PFETCH(c);
2582 if (c == end_code)
2583 goto end;
2584 }
2585
2586 err:
2587 r = ONIGERR_INVALID_GROUP_NAME;
2588 name_end = end;
2589 }
2590
2591 end:
2592 if (r == 0) {
2593 if (is_num != 0) {
2594 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2595 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2596 else if (*rback_num == 0) goto err;
2597
2598 *rback_num *= sign;
2599 }
2600
2601 *rname_end = name_end;
2602 *src = p;
2603 return (exist_level ? 1 : 0);
2604 }
2605 else {
2606 onig_scan_env_set_error_string(env, r, *src, name_end);
2607 return r;
2608 }
2609 }
2610 #endif /* USE_BACKREF_WITH_LEVEL */
2611
2612 /*
2613 def: 0 -> define name (don't allow number name)
2614 1 -> reference name (allow number name)
2615 */
2616 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2617 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2618 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2619 {
2620 int r, is_num, sign;
2621 OnigCodePoint end_code;
2622 OnigCodePoint c = 0;
2623 OnigEncoding enc = env->enc;
2624 UChar *name_end;
2625 UChar *pnum_head;
2626 UChar *p = *src;
2627
2628 *rback_num = 0;
2629
2630 end_code = get_name_end_code_point(start_code);
2631
2632 name_end = end;
2633 pnum_head = *src;
2634 r = 0;
2635 is_num = 0;
2636 sign = 1;
2637 if (PEND) {
2638 return ONIGERR_EMPTY_GROUP_NAME;
2639 }
2640 else {
2641 PFETCH_S(c);
2642 if (c == end_code)
2643 return ONIGERR_EMPTY_GROUP_NAME;
2644
2645 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2646 if (ref == 1)
2647 is_num = 1;
2648 else {
2649 r = ONIGERR_INVALID_GROUP_NAME;
2650 is_num = 0;
2651 }
2652 }
2653 else if (c == '-') {
2654 if (ref == 1) {
2655 is_num = 2;
2656 sign = -1;
2657 pnum_head = p;
2658 }
2659 else {
2660 r = ONIGERR_INVALID_GROUP_NAME;
2661 is_num = 0;
2662 }
2663 }
2664 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2665 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2666 }
2667 }
2668
2669 if (r == 0) {
2670 while (!PEND) {
2671 name_end = p;
2672 PFETCH_S(c);
2673 if (c == end_code || c == ')') {
2674 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
2675 break;
2676 }
2677
2678 if (is_num != 0) {
2679 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2680 is_num = 1;
2681 }
2682 else {
2683 if (!ONIGENC_IS_CODE_WORD(enc, c))
2684 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2685 else
2686 r = ONIGERR_INVALID_GROUP_NAME;
2687 is_num = 0;
2688 }
2689 }
2690 else {
2691 if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2692 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2693 }
2694 }
2695 }
2696
2697 if (c != end_code) {
2698 r = ONIGERR_INVALID_GROUP_NAME;
2699 name_end = end;
2700 }
2701
2702 if (is_num != 0) {
2703 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2704 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2705 else if (*rback_num == 0) {
2706 r = ONIGERR_INVALID_GROUP_NAME;
2707 goto err;
2708 }
2709
2710 *rback_num *= sign;
2711 }
2712
2713 *rname_end = name_end;
2714 *src = p;
2715 return 0;
2716 }
2717 else {
2718 while (!PEND) {
2719 name_end = p;
2720 PFETCH_S(c);
2721 if (c == end_code || c == ')')
2722 break;
2723 }
2724 if (PEND)
2725 name_end = end;
2726
2727 err:
2728 onig_scan_env_set_error_string(env, r, *src, name_end);
2729 return r;
2730 }
2731 }
2732 #else
2733 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2734 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2735 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2736 {
2737 int r, is_num, sign;
2738 OnigCodePoint end_code;
2739 OnigCodePoint c = 0;
2740 UChar *name_end;
2741 OnigEncoding enc = env->enc;
2742 UChar *pnum_head;
2743 UChar *p = *src;
2744 PFETCH_READY;
2745
2746 *rback_num = 0;
2747
2748 end_code = get_name_end_code_point(start_code);
2749
2750 *rname_end = name_end = end;
2751 r = 0;
2752 pnum_head = *src;
2753 is_num = 0;
2754 sign = 1;
2755
2756 if (PEND) {
2757 return ONIGERR_EMPTY_GROUP_NAME;
2758 }
2759 else {
2760 PFETCH(c);
2761 if (c == end_code)
2762 return ONIGERR_EMPTY_GROUP_NAME;
2763
2764 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2765 is_num = 1;
2766 }
2767 else if (c == '-') {
2768 is_num = 2;
2769 sign = -1;
2770 pnum_head = p;
2771 }
2772 else {
2773 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2774 }
2775 }
2776
2777 while (!PEND) {
2778 name_end = p;
2779
2780 PFETCH(c);
2781 if (c == end_code || c == ')') break;
2782 if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2783 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2784 }
2785 if (r == 0 && c != end_code) {
2786 r = ONIGERR_INVALID_GROUP_NAME;
2787 name_end = end;
2788 }
2789
2790 if (r == 0) {
2791 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2792 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2793 else if (*rback_num == 0) {
2794 r = ONIGERR_INVALID_GROUP_NAME;
2795 goto err;
2796 }
2797 *rback_num *= sign;
2798
2799 *rname_end = name_end;
2800 *src = p;
2801 return 0;
2802 }
2803 else {
2804 err:
2805 onig_scan_env_set_error_string(env, r, *src, name_end);
2806 return r;
2807 }
2808 }
2809 #endif /* USE_NAMED_GROUP */
2810
2811 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)2812 CC_ESC_WARN(ScanEnv* env, UChar *c)
2813 {
2814 if (onig_warn == onig_null_warn) return ;
2815
2816 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2817 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2818 UChar buf[WARN_BUFSIZE];
2819 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2820 env->pattern, env->pattern_end,
2821 (UChar* )"character class has '%s' without escape", c);
2822 (*onig_warn)((char* )buf);
2823 }
2824 }
2825
2826 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)2827 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2828 {
2829 if (onig_warn == onig_null_warn) return ;
2830
2831 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2832 UChar buf[WARN_BUFSIZE];
2833 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2834 (env)->pattern, (env)->pattern_end,
2835 (UChar* )"regular expression has '%s' without escape", c);
2836 (*onig_warn)((char* )buf);
2837 }
2838 }
2839
2840 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)2841 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2842 UChar **next, OnigEncoding enc)
2843 {
2844 int i;
2845 OnigCodePoint x;
2846 UChar *q;
2847 UChar *p = from;
2848
2849 while (p < to) {
2850 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2851 q = p + enclen(enc, p);
2852 if (x == s[0]) {
2853 for (i = 1; i < n && q < to; i++) {
2854 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2855 if (x != s[i]) break;
2856 q += enclen(enc, q);
2857 }
2858 if (i >= n) {
2859 if (IS_NOT_NULL(next))
2860 *next = q;
2861 return p;
2862 }
2863 }
2864 p = q;
2865 }
2866 return NULL_UCHARP;
2867 }
2868
2869 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)2870 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2871 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
2872 {
2873 int i, in_esc;
2874 OnigCodePoint x;
2875 UChar *q;
2876 UChar *p = from;
2877
2878 in_esc = 0;
2879 while (p < to) {
2880 if (in_esc) {
2881 in_esc = 0;
2882 p += enclen(enc, p);
2883 }
2884 else {
2885 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2886 q = p + enclen(enc, p);
2887 if (x == s[0]) {
2888 for (i = 1; i < n && q < to; i++) {
2889 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2890 if (x != s[i]) break;
2891 q += enclen(enc, q);
2892 }
2893 if (i >= n) return 1;
2894 p += enclen(enc, p);
2895 }
2896 else {
2897 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2898 if (x == bad) return 0;
2899 else if (x == MC_ESC(syn)) in_esc = 1;
2900 p = q;
2901 }
2902 }
2903 }
2904 return 0;
2905 }
2906
2907 static int
fetch_token_in_cc(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2908 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2909 {
2910 int num;
2911 OnigCodePoint c, c2;
2912 OnigSyntaxType* syn = env->syntax;
2913 OnigEncoding enc = env->enc;
2914 UChar* prev;
2915 UChar* p = *src;
2916 PFETCH_READY;
2917
2918 if (PEND) {
2919 tok->type = TK_EOT;
2920 return tok->type;
2921 }
2922
2923 PFETCH(c);
2924 tok->type = TK_CHAR;
2925 tok->base = 0;
2926 tok->u.c = c;
2927 tok->escaped = 0;
2928
2929 if (c == ']') {
2930 tok->type = TK_CC_CLOSE;
2931 }
2932 else if (c == '-') {
2933 tok->type = TK_CC_RANGE;
2934 }
2935 else if (c == MC_ESC(syn)) {
2936 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2937 goto end;
2938
2939 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2940
2941 PFETCH(c);
2942 tok->escaped = 1;
2943 tok->u.c = c;
2944 switch (c) {
2945 case 'w':
2946 tok->type = TK_CHAR_TYPE;
2947 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2948 tok->u.prop.not = 0;
2949 break;
2950 case 'W':
2951 tok->type = TK_CHAR_TYPE;
2952 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2953 tok->u.prop.not = 1;
2954 break;
2955 case 'd':
2956 tok->type = TK_CHAR_TYPE;
2957 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2958 tok->u.prop.not = 0;
2959 break;
2960 case 'D':
2961 tok->type = TK_CHAR_TYPE;
2962 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2963 tok->u.prop.not = 1;
2964 break;
2965 case 's':
2966 tok->type = TK_CHAR_TYPE;
2967 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2968 tok->u.prop.not = 0;
2969 break;
2970 case 'S':
2971 tok->type = TK_CHAR_TYPE;
2972 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2973 tok->u.prop.not = 1;
2974 break;
2975 case 'h':
2976 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2977 tok->type = TK_CHAR_TYPE;
2978 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2979 tok->u.prop.not = 0;
2980 break;
2981 case 'H':
2982 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2983 tok->type = TK_CHAR_TYPE;
2984 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2985 tok->u.prop.not = 1;
2986 break;
2987
2988 case 'p':
2989 case 'P':
2990 c2 = PPEEK;
2991 if (c2 == '{' &&
2992 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
2993 PINC;
2994 tok->type = TK_CHAR_PROPERTY;
2995 tok->u.prop.not = (c == 'P' ? 1 : 0);
2996
2997 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
2998 PFETCH(c2);
2999 if (c2 == '^') {
3000 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3001 }
3002 else
3003 PUNFETCH;
3004 }
3005 }
3006 break;
3007
3008 case 'x':
3009 if (PEND) break;
3010
3011 prev = p;
3012 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3013 PINC;
3014 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3015 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3016 if (!PEND) {
3017 c2 = PPEEK;
3018 if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3019 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3020 }
3021
3022 if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
3023 PINC;
3024 tok->type = TK_CODE_POINT;
3025 tok->base = 16;
3026 tok->u.code = (OnigCodePoint )num;
3027 }
3028 else {
3029 /* can't read nothing or invalid format */
3030 p = prev;
3031 }
3032 }
3033 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3034 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3035 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3036 if (p == prev) { /* can't read nothing. */
3037 num = 0; /* but, it's not error */
3038 }
3039 tok->type = TK_RAW_BYTE;
3040 tok->base = 16;
3041 tok->u.c = num;
3042 }
3043 break;
3044
3045 case 'u':
3046 if (PEND) break;
3047
3048 prev = p;
3049 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3050 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3051 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3052 if (p == prev) { /* can't read nothing. */
3053 num = 0; /* but, it's not error */
3054 }
3055 tok->type = TK_CODE_POINT;
3056 tok->base = 16;
3057 tok->u.code = (OnigCodePoint )num;
3058 }
3059 break;
3060
3061 case '0':
3062 case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3063 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3064 PUNFETCH;
3065 prev = p;
3066 num = scan_unsigned_octal_number(&p, end, 3, enc);
3067 if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER;
3068 if (p == prev) { /* can't read nothing. */
3069 num = 0; /* but, it's not error */
3070 }
3071 tok->type = TK_RAW_BYTE;
3072 tok->base = 8;
3073 tok->u.c = num;
3074 }
3075 break;
3076
3077 default:
3078 PUNFETCH;
3079 num = fetch_escaped_value(&p, end, env);
3080 if (num < 0) return num;
3081 if (tok->u.c != num) {
3082 tok->u.code = (OnigCodePoint )num;
3083 tok->type = TK_CODE_POINT;
3084 }
3085 break;
3086 }
3087 }
3088 else if (c == '[') {
3089 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3090 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3091 tok->backp = p; /* point at '[' is readed */
3092 PINC;
3093 if (str_exist_check_with_esc(send, 2, p, end,
3094 (OnigCodePoint )']', enc, syn)) {
3095 tok->type = TK_POSIX_BRACKET_OPEN;
3096 }
3097 else {
3098 PUNFETCH;
3099 goto cc_in_cc;
3100 }
3101 }
3102 else {
3103 cc_in_cc:
3104 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3105 tok->type = TK_CC_CC_OPEN;
3106 }
3107 else {
3108 CC_ESC_WARN(env, (UChar* )"[");
3109 }
3110 }
3111 }
3112 else if (c == '&') {
3113 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3114 !PEND && (PPEEK_IS('&'))) {
3115 PINC;
3116 tok->type = TK_CC_AND;
3117 }
3118 }
3119
3120 end:
3121 *src = p;
3122 return tok->type;
3123 }
3124
3125 static int
fetch_token(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)3126 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3127 {
3128 int r, num;
3129 OnigCodePoint c;
3130 OnigEncoding enc = env->enc;
3131 OnigSyntaxType* syn = env->syntax;
3132 UChar* prev;
3133 UChar* p = *src;
3134 PFETCH_READY;
3135
3136 start:
3137 if (PEND) {
3138 tok->type = TK_EOT;
3139 return tok->type;
3140 }
3141
3142 tok->type = TK_STRING;
3143 tok->base = 0;
3144 tok->backp = p;
3145
3146 PFETCH(c);
3147 if (IS_MC_ESC_CODE(c, syn)) {
3148 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3149
3150 tok->backp = p;
3151 PFETCH(c);
3152
3153 tok->u.c = c;
3154 tok->escaped = 1;
3155 switch (c) {
3156 case '*':
3157 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3158 tok->type = TK_OP_REPEAT;
3159 tok->u.repeat.lower = 0;
3160 tok->u.repeat.upper = REPEAT_INFINITE;
3161 goto greedy_check;
3162 break;
3163
3164 case '+':
3165 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3166 tok->type = TK_OP_REPEAT;
3167 tok->u.repeat.lower = 1;
3168 tok->u.repeat.upper = REPEAT_INFINITE;
3169 goto greedy_check;
3170 break;
3171
3172 case '?':
3173 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3174 tok->type = TK_OP_REPEAT;
3175 tok->u.repeat.lower = 0;
3176 tok->u.repeat.upper = 1;
3177 greedy_check:
3178 if (!PEND && PPEEK_IS('?') &&
3179 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3180 PFETCH(c);
3181 tok->u.repeat.greedy = 0;
3182 tok->u.repeat.possessive = 0;
3183 }
3184 else {
3185 possessive_check:
3186 if (!PEND && PPEEK_IS('+') &&
3187 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3188 tok->type != TK_INTERVAL) ||
3189 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3190 tok->type == TK_INTERVAL))) {
3191 PFETCH(c);
3192 tok->u.repeat.greedy = 1;
3193 tok->u.repeat.possessive = 1;
3194 }
3195 else {
3196 tok->u.repeat.greedy = 1;
3197 tok->u.repeat.possessive = 0;
3198 }
3199 }
3200 break;
3201
3202 case '{':
3203 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3204 r = fetch_range_quantifier(&p, end, tok, env);
3205 if (r < 0) return r; /* error */
3206 if (r == 0) goto greedy_check;
3207 else if (r == 2) { /* {n} */
3208 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3209 goto possessive_check;
3210
3211 goto greedy_check;
3212 }
3213 /* r == 1 : normal char */
3214 break;
3215
3216 case '|':
3217 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3218 tok->type = TK_ALT;
3219 break;
3220
3221 case '(':
3222 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3223 tok->type = TK_SUBEXP_OPEN;
3224 break;
3225
3226 case ')':
3227 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3228 tok->type = TK_SUBEXP_CLOSE;
3229 break;
3230
3231 case 'w':
3232 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3233 tok->type = TK_CHAR_TYPE;
3234 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3235 tok->u.prop.not = 0;
3236 break;
3237
3238 case 'W':
3239 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3240 tok->type = TK_CHAR_TYPE;
3241 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3242 tok->u.prop.not = 1;
3243 break;
3244
3245 case 'b':
3246 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3247 tok->type = TK_ANCHOR;
3248 tok->u.anchor = ANCHOR_WORD_BOUND;
3249 break;
3250
3251 case 'B':
3252 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3253 tok->type = TK_ANCHOR;
3254 tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3255 break;
3256
3257 #ifdef USE_WORD_BEGIN_END
3258 case '<':
3259 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3260 tok->type = TK_ANCHOR;
3261 tok->u.anchor = ANCHOR_WORD_BEGIN;
3262 break;
3263
3264 case '>':
3265 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3266 tok->type = TK_ANCHOR;
3267 tok->u.anchor = ANCHOR_WORD_END;
3268 break;
3269 #endif
3270
3271 case 's':
3272 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3273 tok->type = TK_CHAR_TYPE;
3274 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3275 tok->u.prop.not = 0;
3276 break;
3277
3278 case 'S':
3279 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3280 tok->type = TK_CHAR_TYPE;
3281 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3282 tok->u.prop.not = 1;
3283 break;
3284
3285 case 'd':
3286 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3287 tok->type = TK_CHAR_TYPE;
3288 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3289 tok->u.prop.not = 0;
3290 break;
3291
3292 case 'D':
3293 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3294 tok->type = TK_CHAR_TYPE;
3295 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3296 tok->u.prop.not = 1;
3297 break;
3298
3299 case 'h':
3300 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3301 tok->type = TK_CHAR_TYPE;
3302 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3303 tok->u.prop.not = 0;
3304 break;
3305
3306 case 'H':
3307 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3308 tok->type = TK_CHAR_TYPE;
3309 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3310 tok->u.prop.not = 1;
3311 break;
3312
3313 case 'A':
3314 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3315 begin_buf:
3316 tok->type = TK_ANCHOR;
3317 tok->u.subtype = ANCHOR_BEGIN_BUF;
3318 break;
3319
3320 case 'Z':
3321 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3322 tok->type = TK_ANCHOR;
3323 tok->u.subtype = ANCHOR_SEMI_END_BUF;
3324 break;
3325
3326 case 'z':
3327 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3328 end_buf:
3329 tok->type = TK_ANCHOR;
3330 tok->u.subtype = ANCHOR_END_BUF;
3331 break;
3332
3333 case 'G':
3334 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3335 tok->type = TK_ANCHOR;
3336 tok->u.subtype = ANCHOR_BEGIN_POSITION;
3337 break;
3338
3339 case '`':
3340 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3341 goto begin_buf;
3342 break;
3343
3344 case '\'':
3345 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3346 goto end_buf;
3347 break;
3348
3349 case 'x':
3350 if (PEND) break;
3351
3352 prev = p;
3353 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3354 PINC;
3355 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3356 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3357 if (!PEND) {
3358 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3359 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3360 }
3361
3362 if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
3363 PINC;
3364 tok->type = TK_CODE_POINT;
3365 tok->u.code = (OnigCodePoint )num;
3366 }
3367 else {
3368 /* can't read nothing or invalid format */
3369 p = prev;
3370 }
3371 }
3372 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3373 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3374 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3375 if (p == prev) { /* can't read nothing. */
3376 num = 0; /* but, it's not error */
3377 }
3378 tok->type = TK_RAW_BYTE;
3379 tok->base = 16;
3380 tok->u.c = num;
3381 }
3382 break;
3383
3384 case 'u':
3385 if (PEND) break;
3386
3387 prev = p;
3388 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3389 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3390 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3391 if (p == prev) { /* can't read nothing. */
3392 num = 0; /* but, it's not error */
3393 }
3394 tok->type = TK_CODE_POINT;
3395 tok->base = 16;
3396 tok->u.code = (OnigCodePoint )num;
3397 }
3398 break;
3399
3400 case '1': case '2': case '3': case '4':
3401 case '5': case '6': case '7': case '8': case '9':
3402 PUNFETCH;
3403 prev = p;
3404 num = onig_scan_unsigned_number(&p, end, enc);
3405 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3406 goto skip_backref;
3407 }
3408
3409 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3410 (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3411 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3412 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3413 return ONIGERR_INVALID_BACKREF;
3414 }
3415
3416 tok->type = TK_BACKREF;
3417 tok->u.backref.num = 1;
3418 tok->u.backref.ref1 = num;
3419 tok->u.backref.by_name = 0;
3420 #ifdef USE_BACKREF_WITH_LEVEL
3421 tok->u.backref.exist_level = 0;
3422 #endif
3423 break;
3424 }
3425
3426 skip_backref:
3427 if (c == '8' || c == '9') {
3428 /* normal char */
3429 p = prev; PINC;
3430 break;
3431 }
3432
3433 p = prev;
3434 /* fall through */
3435 case '0':
3436 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3437 prev = p;
3438 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3439 if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER;
3440 if (p == prev) { /* can't read nothing. */
3441 num = 0; /* but, it's not error */
3442 }
3443 tok->type = TK_RAW_BYTE;
3444 tok->base = 8;
3445 tok->u.c = num;
3446 }
3447 else if (c != '0') {
3448 PINC;
3449 }
3450 break;
3451
3452 #ifdef USE_NAMED_GROUP
3453 case 'k':
3454 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3455 PFETCH(c);
3456 if (c == '<' || c == '\'') {
3457 UChar* name_end;
3458 int* backs;
3459 int back_num;
3460
3461 prev = p;
3462
3463 #ifdef USE_BACKREF_WITH_LEVEL
3464 name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3465 r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
3466 env, &back_num, &tok->u.backref.level);
3467 if (r == 1) tok->u.backref.exist_level = 1;
3468 else tok->u.backref.exist_level = 0;
3469 #else
3470 r = fetch_name(&p, end, &name_end, env, &back_num, 1);
3471 #endif
3472 if (r < 0) return r;
3473
3474 if (back_num != 0) {
3475 if (back_num < 0) {
3476 back_num = BACKREF_REL_TO_ABS(back_num, env);
3477 if (back_num <= 0)
3478 return ONIGERR_INVALID_BACKREF;
3479 }
3480
3481 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3482 if (back_num > env->num_mem ||
3483 IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
3484 return ONIGERR_INVALID_BACKREF;
3485 }
3486 tok->type = TK_BACKREF;
3487 tok->u.backref.by_name = 0;
3488 tok->u.backref.num = 1;
3489 tok->u.backref.ref1 = back_num;
3490 }
3491 else {
3492 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3493 if (num <= 0) {
3494 onig_scan_env_set_error_string(env,
3495 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3496 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3497 }
3498 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3499 int i;
3500 for (i = 0; i < num; i++) {
3501 if (backs[i] > env->num_mem ||
3502 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3503 return ONIGERR_INVALID_BACKREF;
3504 }
3505 }
3506
3507 tok->type = TK_BACKREF;
3508 tok->u.backref.by_name = 1;
3509 if (num == 1) {
3510 tok->u.backref.num = 1;
3511 tok->u.backref.ref1 = backs[0];
3512 }
3513 else {
3514 tok->u.backref.num = num;
3515 tok->u.backref.refs = backs;
3516 }
3517 }
3518 }
3519 else
3520 PUNFETCH;
3521 }
3522 break;
3523 #endif
3524
3525 #ifdef USE_SUBEXP_CALL
3526 case 'g':
3527 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3528 PFETCH(c);
3529 if (c == '<' || c == '\'') {
3530 int gnum;
3531 UChar* name_end;
3532
3533 prev = p;
3534 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
3535 if (r < 0) return r;
3536
3537 tok->type = TK_CALL;
3538 tok->u.call.name = prev;
3539 tok->u.call.name_end = name_end;
3540 tok->u.call.gnum = gnum;
3541 }
3542 else
3543 PUNFETCH;
3544 }
3545 break;
3546 #endif
3547
3548 case 'Q':
3549 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3550 tok->type = TK_QUOTE_OPEN;
3551 }
3552 break;
3553
3554 case 'p':
3555 case 'P':
3556 if (PPEEK_IS('{') &&
3557 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3558 PINC;
3559 tok->type = TK_CHAR_PROPERTY;
3560 tok->u.prop.not = (c == 'P' ? 1 : 0);
3561
3562 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3563 PFETCH(c);
3564 if (c == '^') {
3565 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3566 }
3567 else
3568 PUNFETCH;
3569 }
3570 }
3571 break;
3572
3573 default:
3574 PUNFETCH;
3575 num = fetch_escaped_value(&p, end, env);
3576 if (num < 0) return num;
3577 /* set_raw: */
3578 if (tok->u.c != num) {
3579 tok->type = TK_CODE_POINT;
3580 tok->u.code = (OnigCodePoint )num;
3581 }
3582 else { /* string */
3583 p = tok->backp + enclen(enc, tok->backp);
3584 }
3585 break;
3586 }
3587 }
3588 else {
3589 tok->u.c = c;
3590 tok->escaped = 0;
3591
3592 #ifdef USE_VARIABLE_META_CHARS
3593 if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3594 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3595 if (c == MC_ANYCHAR(syn))
3596 goto any_char;
3597 else if (c == MC_ANYTIME(syn))
3598 goto anytime;
3599 else if (c == MC_ZERO_OR_ONE_TIME(syn))
3600 goto zero_or_one_time;
3601 else if (c == MC_ONE_OR_MORE_TIME(syn))
3602 goto one_or_more_time;
3603 else if (c == MC_ANYCHAR_ANYTIME(syn)) {
3604 tok->type = TK_ANYCHAR_ANYTIME;
3605 goto out;
3606 }
3607 }
3608 #endif
3609
3610 switch (c) {
3611 case '.':
3612 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3613 #ifdef USE_VARIABLE_META_CHARS
3614 any_char:
3615 #endif
3616 tok->type = TK_ANYCHAR;
3617 break;
3618
3619 case '*':
3620 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3621 #ifdef USE_VARIABLE_META_CHARS
3622 anytime:
3623 #endif
3624 tok->type = TK_OP_REPEAT;
3625 tok->u.repeat.lower = 0;
3626 tok->u.repeat.upper = REPEAT_INFINITE;
3627 goto greedy_check;
3628 break;
3629
3630 case '+':
3631 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3632 #ifdef USE_VARIABLE_META_CHARS
3633 one_or_more_time:
3634 #endif
3635 tok->type = TK_OP_REPEAT;
3636 tok->u.repeat.lower = 1;
3637 tok->u.repeat.upper = REPEAT_INFINITE;
3638 goto greedy_check;
3639 break;
3640
3641 case '?':
3642 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3643 #ifdef USE_VARIABLE_META_CHARS
3644 zero_or_one_time:
3645 #endif
3646 tok->type = TK_OP_REPEAT;
3647 tok->u.repeat.lower = 0;
3648 tok->u.repeat.upper = 1;
3649 goto greedy_check;
3650 break;
3651
3652 case '{':
3653 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3654 r = fetch_range_quantifier(&p, end, tok, env);
3655 if (r < 0) return r; /* error */
3656 if (r == 0) goto greedy_check;
3657 else if (r == 2) { /* {n} */
3658 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3659 goto possessive_check;
3660
3661 goto greedy_check;
3662 }
3663 /* r == 1 : normal char */
3664 break;
3665
3666 case '|':
3667 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3668 tok->type = TK_ALT;
3669 break;
3670
3671 case '(':
3672 if (PPEEK_IS('?') &&
3673 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3674 PINC;
3675 if (PPEEK_IS('#')) {
3676 PFETCH(c);
3677 while (1) {
3678 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3679 PFETCH(c);
3680 if (c == MC_ESC(syn)) {
3681 if (!PEND) PFETCH(c);
3682 }
3683 else {
3684 if (c == ')') break;
3685 }
3686 }
3687 goto start;
3688 }
3689 PUNFETCH;
3690 }
3691
3692 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3693 tok->type = TK_SUBEXP_OPEN;
3694 break;
3695
3696 case ')':
3697 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3698 tok->type = TK_SUBEXP_CLOSE;
3699 break;
3700
3701 case '^':
3702 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3703 tok->type = TK_ANCHOR;
3704 tok->u.subtype = (IS_SINGLELINE(env->option)
3705 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3706 break;
3707
3708 case '$':
3709 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3710 tok->type = TK_ANCHOR;
3711 tok->u.subtype = (IS_SINGLELINE(env->option)
3712 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3713 break;
3714
3715 case '[':
3716 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3717 tok->type = TK_CC_OPEN;
3718 break;
3719
3720 case ']':
3721 if (*src > env->pattern) /* /].../ is allowed. */
3722 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
3723 break;
3724
3725 case '#':
3726 if (IS_EXTEND(env->option)) {
3727 while (!PEND) {
3728 PFETCH(c);
3729 if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3730 break;
3731 }
3732 goto start;
3733 break;
3734 }
3735 break;
3736
3737 case ' ': case '\t': case '\n': case '\r': case '\f':
3738 if (IS_EXTEND(env->option))
3739 goto start;
3740 break;
3741
3742 default:
3743 /* string */
3744 break;
3745 }
3746 }
3747
3748 #ifdef USE_VARIABLE_META_CHARS
3749 out:
3750 #endif
3751 *src = p;
3752 return tok->type;
3753 }
3754
3755 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[])3756 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
3757 OnigEncoding enc ARG_UNUSED,
3758 OnigCodePoint sb_out, const OnigCodePoint mbr[])
3759 {
3760 int i, r;
3761 OnigCodePoint j;
3762
3763 int n = ONIGENC_CODE_RANGE_NUM(mbr);
3764
3765 if (not == 0) {
3766 for (i = 0; i < n; i++) {
3767 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
3768 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
3769 if (j >= sb_out) {
3770 if (j == ONIGENC_CODE_RANGE_TO(mbr, i)) i++;
3771 else if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3772 r = add_code_range_to_buf(&(cc->mbuf), j,
3773 ONIGENC_CODE_RANGE_TO(mbr, i));
3774 if (r != 0) return r;
3775 i++;
3776 }
3777
3778 goto sb_end;
3779 }
3780 BITSET_SET_BIT(cc->bs, j);
3781 }
3782 }
3783
3784 sb_end:
3785 for ( ; i < n; i++) {
3786 r = add_code_range_to_buf(&(cc->mbuf),
3787 ONIGENC_CODE_RANGE_FROM(mbr, i),
3788 ONIGENC_CODE_RANGE_TO(mbr, i));
3789 if (r != 0) return r;
3790 }
3791 }
3792 else {
3793 OnigCodePoint prev = 0;
3794
3795 for (i = 0; i < n; i++) {
3796 for (j = prev;
3797 j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
3798 if (j >= sb_out) {
3799 goto sb_end2;
3800 }
3801 BITSET_SET_BIT(cc->bs, j);
3802 }
3803 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3804 }
3805 for (j = prev; j < sb_out; j++) {
3806 BITSET_SET_BIT(cc->bs, j);
3807 }
3808
3809 sb_end2:
3810 prev = sb_out;
3811
3812 for (i = 0; i < n; i++) {
3813 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3814 r = add_code_range_to_buf(&(cc->mbuf), prev,
3815 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3816 if (r != 0) return r;
3817 }
3818 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3819 }
3820 if (prev < 0x7fffffff) {
3821 r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3822 if (r != 0) return r;
3823 }
3824 }
3825
3826 return 0;
3827 }
3828
3829 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)3830 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3831 {
3832 int c, r;
3833 const OnigCodePoint *ranges;
3834 OnigCodePoint sb_out;
3835 OnigEncoding enc = env->enc;
3836
3837 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
3838 if (r == 0) {
3839 return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
3840 }
3841 else if (r != ONIG_NO_SUPPORT_CONFIG) {
3842 return r;
3843 }
3844
3845 r = 0;
3846 switch (ctype) {
3847 case ONIGENC_CTYPE_ALPHA:
3848 case ONIGENC_CTYPE_BLANK:
3849 case ONIGENC_CTYPE_CNTRL:
3850 case ONIGENC_CTYPE_DIGIT:
3851 case ONIGENC_CTYPE_LOWER:
3852 case ONIGENC_CTYPE_PUNCT:
3853 case ONIGENC_CTYPE_SPACE:
3854 case ONIGENC_CTYPE_UPPER:
3855 case ONIGENC_CTYPE_XDIGIT:
3856 case ONIGENC_CTYPE_ASCII:
3857 case ONIGENC_CTYPE_ALNUM:
3858 if (not != 0) {
3859 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3860 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3861 BITSET_SET_BIT(cc->bs, c);
3862 }
3863 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3864 }
3865 else {
3866 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3867 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3868 BITSET_SET_BIT(cc->bs, c);
3869 }
3870 }
3871 break;
3872
3873 case ONIGENC_CTYPE_GRAPH:
3874 case ONIGENC_CTYPE_PRINT:
3875 if (not != 0) {
3876 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3877 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3878 BITSET_SET_BIT(cc->bs, c);
3879 }
3880 }
3881 else {
3882 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3883 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3884 BITSET_SET_BIT(cc->bs, c);
3885 }
3886 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3887 }
3888 break;
3889
3890 case ONIGENC_CTYPE_WORD:
3891 if (not == 0) {
3892 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3893 if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3894 }
3895 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3896 }
3897 else {
3898 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3899 if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
3900 && ! ONIGENC_IS_CODE_WORD(enc, c))
3901 BITSET_SET_BIT(cc->bs, c);
3902 }
3903 }
3904 break;
3905
3906 default:
3907 return ONIGERR_PARSER_BUG;
3908 break;
3909 }
3910
3911 return r;
3912 }
3913
3914 static int
parse_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)3915 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3916 {
3917 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
3918 #define POSIX_BRACKET_NAME_MIN_LEN 4
3919
3920 static PosixBracketEntryType PBS[] = {
3921 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
3922 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
3923 { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
3924 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
3925 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
3926 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
3927 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
3928 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
3929 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
3930 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
3931 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
3932 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3933 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
3934 { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 },
3935 { (UChar* )NULL, -1, 0 }
3936 };
3937
3938 PosixBracketEntryType *pb;
3939 int not, i, r;
3940 OnigCodePoint c;
3941 OnigEncoding enc = env->enc;
3942 UChar *p = *src;
3943
3944 if (PPEEK_IS('^')) {
3945 PINC_S;
3946 not = 1;
3947 }
3948 else
3949 not = 0;
3950
3951 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
3952 goto not_posix_bracket;
3953
3954 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3955 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3956 p = (UChar* )onigenc_step(enc, p, end, pb->len);
3957 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3958 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3959
3960 r = add_ctype_to_cc(cc, pb->ctype, not, env);
3961 if (r != 0) return r;
3962
3963 PINC_S; PINC_S;
3964 *src = p;
3965 return 0;
3966 }
3967 }
3968
3969 not_posix_bracket:
3970 c = 0;
3971 i = 0;
3972 while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3973 PINC_S;
3974 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3975 }
3976 if (c == ':' && ! PEND) {
3977 PINC_S;
3978 if (! PEND) {
3979 PFETCH_S(c);
3980 if (c == ']')
3981 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3982 }
3983 }
3984
3985 return 1; /* 1: is not POSIX bracket, but no error. */
3986 }
3987
3988 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)3989 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
3990 {
3991 int r;
3992 OnigCodePoint c;
3993 OnigEncoding enc = env->enc;
3994 UChar *prev, *start, *p = *src;
3995
3996 r = 0;
3997 start = prev = p;
3998
3999 while (!PEND) {
4000 prev = p;
4001 PFETCH_S(c);
4002 if (c == '}') {
4003 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
4004 if (r < 0) break;
4005
4006 *src = p;
4007 return r;
4008 }
4009 else if (c == '(' || c == ')' || c == '{' || c == '|') {
4010 r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
4011 break;
4012 }
4013 }
4014
4015 onig_scan_env_set_error_string(env, r, *src, prev);
4016 return r;
4017 }
4018
4019 static int
parse_char_property(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4020 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
4021 ScanEnv* env)
4022 {
4023 int r, ctype;
4024 CClassNode* cc;
4025
4026 ctype = fetch_char_property_to_ctype(src, end, env);
4027 if (ctype < 0) return ctype;
4028
4029 *np = node_new_cclass();
4030 CHECK_NULL_RETURN_MEMERR(*np);
4031 cc = NCCLASS(*np);
4032 r = add_ctype_to_cc(cc, ctype, 0, env);
4033 if (r != 0) return r;
4034 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
4035
4036 return 0;
4037 }
4038
4039
/* State of the character-class parser (see parse_char_class()). */
enum CCSTATE {
  CCS_VALUE    = 0,  /* a value is held pending and not yet stored       */
  CCS_RANGE    = 1,  /* a range operator followed the pending value      */
  CCS_COMPLETE = 2,  /* a range has just been completed                  */
  CCS_START    = 3   /* nothing seen yet (class start, or just after &&) */
};
4046
/* Kind of the value currently held by the character-class parser. */
enum CCVALTYPE {
  CCV_SB         = 0,  /* stored as a bit in the class bitset            */
  CCV_CODE_POINT = 1,  /* stored as a code range in the class mbuf       */
  CCV_CLASS      = 2   /* a class/ctype item; nothing is left to store   */
};
4052
4053 static int
next_state_class(CClassNode * cc,OnigCodePoint * vs,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4054 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
4055 enum CCSTATE* state, ScanEnv* env)
4056 {
4057 int r;
4058
4059 if (*state == CCS_RANGE)
4060 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
4061
4062 if (*state == CCS_VALUE && *type != CCV_CLASS) {
4063 if (*type == CCV_SB)
4064 BITSET_SET_BIT(cc->bs, (int )(*vs));
4065 else if (*type == CCV_CODE_POINT) {
4066 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4067 if (r < 0) return r;
4068 }
4069 }
4070
4071 if (*state != CCS_START)
4072 *state = CCS_VALUE;
4073
4074 *type = CCV_CLASS;
4075 return 0;
4076 }
4077
4078 static int
next_state_val(CClassNode * cc,OnigCodePoint * vs,OnigCodePoint v,int * vs_israw,int v_israw,enum CCVALTYPE intype,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4079 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
4080 int* vs_israw, int v_israw,
4081 enum CCVALTYPE intype, enum CCVALTYPE* type,
4082 enum CCSTATE* state, ScanEnv* env)
4083 {
4084 int r;
4085
4086 switch (*state) {
4087 case CCS_VALUE:
4088 if (*type == CCV_SB)
4089 {
4090 if (*vs > 0xff)
4091 return ONIGERR_INVALID_CODE_POINT_VALUE;
4092 BITSET_SET_BIT(cc->bs, (int )(*vs));
4093 }
4094 else if (*type == CCV_CODE_POINT) {
4095 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4096 if (r < 0) return r;
4097 }
4098 break;
4099
4100 case CCS_RANGE:
4101 if (intype == *type) {
4102 if (intype == CCV_SB) {
4103 if (*vs > 0xff || v > 0xff)
4104 return ONIGERR_INVALID_CODE_POINT_VALUE;
4105
4106 if (*vs > v) {
4107 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4108 goto ccs_range_end;
4109 else
4110 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4111 }
4112 bitset_set_range(cc->bs, (int )*vs, (int )v);
4113 }
4114 else {
4115 r = add_code_range(&(cc->mbuf), env, *vs, v);
4116 if (r < 0) return r;
4117 }
4118 }
4119 else {
4120 #if 0
4121 if (intype == CCV_CODE_POINT && *type == CCV_SB) {
4122 #endif
4123 if (*vs > v) {
4124 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4125 goto ccs_range_end;
4126 else
4127 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4128 }
4129 bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4130 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
4131 if (r < 0) return r;
4132 #if 0
4133 }
4134 else
4135 return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
4136 #endif
4137 }
4138 ccs_range_end:
4139 *state = CCS_COMPLETE;
4140 break;
4141
4142 case CCS_COMPLETE:
4143 case CCS_START:
4144 *state = CCS_VALUE;
4145 break;
4146
4147 default:
4148 break;
4149 }
4150
4151 *vs_israw = v_israw;
4152 *vs = v;
4153 *type = intype;
4154 return 0;
4155 }
4156
4157 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,ScanEnv * env)4158 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4159 ScanEnv* env)
4160 {
4161 int in_esc;
4162 OnigCodePoint code;
4163 OnigEncoding enc = env->enc;
4164 UChar* p = from;
4165
4166 in_esc = 0;
4167 while (! PEND) {
4168 if (ignore_escaped && in_esc) {
4169 in_esc = 0;
4170 }
4171 else {
4172 PFETCH_S(code);
4173 if (code == c) return 1;
4174 if (code == MC_ESC(env->syntax)) in_esc = 1;
4175 }
4176 }
4177 return 0;
4178 }
4179
4180 static int
parse_char_class(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4181 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4182 ScanEnv* env)
4183 {
4184 int r, neg, len, fetched, and_start;
4185 OnigCodePoint v, vs;
4186 UChar *p;
4187 Node* node;
4188 CClassNode *cc, *prev_cc;
4189 CClassNode work_cc;
4190
4191 enum CCSTATE state;
4192 enum CCVALTYPE val_type, in_type;
4193 int val_israw, in_israw;
4194
4195 prev_cc = (CClassNode* )NULL;
4196 *np = NULL_NODE;
4197 r = fetch_token_in_cc(tok, src, end, env);
4198 if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4199 neg = 1;
4200 r = fetch_token_in_cc(tok, src, end, env);
4201 }
4202 else {
4203 neg = 0;
4204 }
4205
4206 if (r < 0) return r;
4207 if (r == TK_CC_CLOSE) {
4208 if (! code_exist_check((OnigCodePoint )']',
4209 *src, env->pattern_end, 1, env))
4210 return ONIGERR_EMPTY_CHAR_CLASS;
4211
4212 CC_ESC_WARN(env, (UChar* )"]");
4213 r = tok->type = TK_CHAR; /* allow []...] */
4214 }
4215
4216 *np = node = node_new_cclass();
4217 CHECK_NULL_RETURN_MEMERR(node);
4218 cc = NCCLASS(node);
4219
4220 and_start = 0;
4221 state = CCS_START;
4222 p = *src;
4223 while (r != TK_CC_CLOSE) {
4224 fetched = 0;
4225 switch (r) {
4226 case TK_CHAR:
4227 len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4228 if (len > 1) {
4229 in_type = CCV_CODE_POINT;
4230 }
4231 else if (len < 0) {
4232 r = len;
4233 goto err;
4234 }
4235 else {
4236 sb_char:
4237 in_type = CCV_SB;
4238 }
4239 v = (OnigCodePoint )tok->u.c;
4240 in_israw = 0;
4241 goto val_entry2;
4242 break;
4243
4244 case TK_RAW_BYTE:
4245 /* tok->base != 0 : octal or hexadec. */
4246 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4247 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4248 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4249 UChar* psave = p;
4250 int i, base = tok->base;
4251
4252 buf[0] = tok->u.c;
4253 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4254 r = fetch_token_in_cc(tok, &p, end, env);
4255 if (r < 0) goto err;
4256 if (r != TK_RAW_BYTE || tok->base != base) {
4257 fetched = 1;
4258 break;
4259 }
4260 buf[i] = tok->u.c;
4261 }
4262
4263 if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4264 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4265 goto err;
4266 }
4267
4268 len = enclen(env->enc, buf);
4269 if (i < len) {
4270 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4271 goto err;
4272 }
4273 else if (i > len) { /* fetch back */
4274 p = psave;
4275 for (i = 1; i < len; i++) {
4276 r = fetch_token_in_cc(tok, &p, end, env);
4277 }
4278 fetched = 0;
4279 }
4280
4281 if (i == 1) {
4282 v = (OnigCodePoint )buf[0];
4283 goto raw_single;
4284 }
4285 else {
4286 v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4287 in_type = CCV_CODE_POINT;
4288 }
4289 }
4290 else {
4291 v = (OnigCodePoint )tok->u.c;
4292 raw_single:
4293 in_type = CCV_SB;
4294 }
4295 in_israw = 1;
4296 goto val_entry2;
4297 break;
4298
4299 case TK_CODE_POINT:
4300 v = tok->u.code;
4301 in_israw = 1;
4302 val_entry:
4303 len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4304 if (len < 0) {
4305 r = len;
4306 goto err;
4307 }
4308 in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4309 val_entry2:
4310 r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4311 &state, env);
4312 if (r != 0) goto err;
4313 break;
4314
4315 case TK_POSIX_BRACKET_OPEN:
4316 r = parse_posix_bracket(cc, &p, end, env);
4317 if (r < 0) goto err;
4318 if (r == 1) { /* is not POSIX bracket */
4319 CC_ESC_WARN(env, (UChar* )"[");
4320 p = tok->backp;
4321 v = (OnigCodePoint )tok->u.c;
4322 in_israw = 0;
4323 goto val_entry;
4324 }
4325 goto next_class;
4326 break;
4327
4328 case TK_CHAR_TYPE:
4329 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
4330 if (r != 0) return r;
4331
4332 next_class:
4333 r = next_state_class(cc, &vs, &val_type, &state, env);
4334 if (r != 0) goto err;
4335 break;
4336
4337 case TK_CHAR_PROPERTY:
4338 {
4339 int ctype;
4340
4341 ctype = fetch_char_property_to_ctype(&p, end, env);
4342 if (ctype < 0) return ctype;
4343 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4344 if (r != 0) return r;
4345 goto next_class;
4346 }
4347 break;
4348
4349 case TK_CC_RANGE:
4350 if (state == CCS_VALUE) {
4351 r = fetch_token_in_cc(tok, &p, end, env);
4352 if (r < 0) goto err;
4353 fetched = 1;
4354 if (r == TK_CC_CLOSE) { /* allow [x-] */
4355 range_end_val:
4356 v = (OnigCodePoint )'-';
4357 in_israw = 0;
4358 goto val_entry;
4359 }
4360 else if (r == TK_CC_AND) {
4361 CC_ESC_WARN(env, (UChar* )"-");
4362 goto range_end_val;
4363 }
4364 state = CCS_RANGE;
4365 }
4366 else if (state == CCS_START) {
4367 /* [-xa] is allowed */
4368 v = (OnigCodePoint )tok->u.c;
4369 in_israw = 0;
4370
4371 r = fetch_token_in_cc(tok, &p, end, env);
4372 if (r < 0) goto err;
4373 fetched = 1;
4374 /* [--x] or [a&&-x] is warned. */
4375 if (r == TK_CC_RANGE || and_start != 0)
4376 CC_ESC_WARN(env, (UChar* )"-");
4377
4378 goto val_entry;
4379 }
4380 else if (state == CCS_RANGE) {
4381 CC_ESC_WARN(env, (UChar* )"-");
4382 goto sb_char; /* [!--x] is allowed */
4383 }
4384 else { /* CCS_COMPLETE */
4385 r = fetch_token_in_cc(tok, &p, end, env);
4386 if (r < 0) goto err;
4387 fetched = 1;
4388 if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4389 else if (r == TK_CC_AND) {
4390 CC_ESC_WARN(env, (UChar* )"-");
4391 goto range_end_val;
4392 }
4393
4394 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4395 CC_ESC_WARN(env, (UChar* )"-");
4396 goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */
4397 }
4398 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4399 goto err;
4400 }
4401 break;
4402
4403 case TK_CC_CC_OPEN: /* [ */
4404 {
4405 Node *anode;
4406 CClassNode* acc;
4407
4408 r = parse_char_class(&anode, tok, &p, end, env);
4409 if (r != 0) goto cc_open_err;
4410 acc = NCCLASS(anode);
4411 r = or_cclass(cc, acc, env->enc);
4412
4413 onig_node_free(anode);
4414 cc_open_err:
4415 if (r != 0) goto err;
4416 }
4417 break;
4418
4419 case TK_CC_AND: /* && */
4420 {
4421 if (state == CCS_VALUE) {
4422 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4423 &val_type, &state, env);
4424 if (r != 0) goto err;
4425 }
4426 /* initialize local variables */
4427 and_start = 1;
4428 state = CCS_START;
4429
4430 if (IS_NOT_NULL(prev_cc)) {
4431 r = and_cclass(prev_cc, cc, env->enc);
4432 if (r != 0) goto err;
4433 bbuf_free(cc->mbuf);
4434 }
4435 else {
4436 prev_cc = cc;
4437 cc = &work_cc;
4438 }
4439 initialize_cclass(cc);
4440 }
4441 break;
4442
4443 case TK_EOT:
4444 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4445 goto err;
4446 break;
4447 default:
4448 r = ONIGERR_PARSER_BUG;
4449 goto err;
4450 break;
4451 }
4452
4453 if (fetched)
4454 r = tok->type;
4455 else {
4456 r = fetch_token_in_cc(tok, &p, end, env);
4457 if (r < 0) goto err;
4458 }
4459 }
4460
4461 if (state == CCS_VALUE) {
4462 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4463 &val_type, &state, env);
4464 if (r != 0) goto err;
4465 }
4466
4467 if (IS_NOT_NULL(prev_cc)) {
4468 r = and_cclass(prev_cc, cc, env->enc);
4469 if (r != 0) goto err;
4470 bbuf_free(cc->mbuf);
4471 cc = prev_cc;
4472 }
4473
4474 if (neg != 0)
4475 NCCLASS_SET_NOT(cc);
4476 else
4477 NCCLASS_CLEAR_NOT(cc);
4478 if (IS_NCCLASS_NOT(cc) &&
4479 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4480 int is_empty;
4481
4482 is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4483 if (is_empty != 0)
4484 BITSET_IS_EMPTY(cc->bs, is_empty);
4485
4486 if (is_empty == 0) {
4487 #define NEWLINE_CODE 0x0a
4488
4489 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4490 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4491 BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4492 else
4493 add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4494 }
4495 }
4496 }
4497 *src = p;
4498 return 0;
4499
4500 err:
4501 if (cc != NCCLASS(*np))
4502 bbuf_free(cc->mbuf);
4503 onig_node_free(*np);
4504 return r;
4505 }
4506
4507 static int parse_subexp(Node** top, OnigToken* tok, int term,
4508 UChar** src, UChar* end, ScanEnv* env);
4509
4510 static int
parse_enclose(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4511 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4512 ScanEnv* env)
4513 {
4514 int r, num;
4515 Node *target;
4516 OnigOptionType option;
4517 OnigCodePoint c;
4518 OnigEncoding enc = env->enc;
4519
4520 #ifdef USE_NAMED_GROUP
4521 int list_capture;
4522 #endif
4523
4524 UChar* p = *src;
4525 PFETCH_READY;
4526
4527 *np = NULL;
4528 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4529
4530 option = env->option;
4531 if (PPEEK_IS('?') &&
4532 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4533 PINC;
4534 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4535
4536 PFETCH(c);
4537 switch (c) {
4538 case ':': /* (?:...) grouping only */
4539 group:
4540 r = fetch_token(tok, &p, end, env);
4541 if (r < 0) return r;
4542 r = parse_subexp(np, tok, term, &p, end, env);
4543 if (r < 0) return r;
4544 *src = p;
4545 return 1; /* group */
4546 break;
4547
4548 case '=':
4549 *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4550 break;
4551 case '!': /* preceding read */
4552 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4553 break;
4554 case '>': /* (?>...) stop backtrack */
4555 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
4556 break;
4557
4558 #ifdef USE_NAMED_GROUP
4559 case '\'':
4560 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4561 goto named_group1;
4562 }
4563 else
4564 return ONIGERR_UNDEFINED_GROUP_OPTION;
4565 break;
4566 #endif
4567
4568 case '<': /* look behind (?<=...), (?<!...) */
4569 PFETCH(c);
4570 if (c == '=')
4571 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4572 else if (c == '!')
4573 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4574 #ifdef USE_NAMED_GROUP
4575 else {
4576 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4577 UChar *name;
4578 UChar *name_end;
4579
4580 PUNFETCH;
4581 c = '<';
4582
4583 named_group1:
4584 list_capture = 0;
4585
4586 named_group2:
4587 name = p;
4588 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
4589 if (r < 0) return r;
4590
4591 num = scan_env_add_mem_entry(env);
4592 if (num < 0) return num;
4593 if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
4594 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4595
4596 r = name_add(env->reg, name, name_end, num, env);
4597 if (r != 0) return r;
4598 *np = node_new_enclose_memory(env->option, 1);
4599 CHECK_NULL_RETURN_MEMERR(*np);
4600 NENCLOSE(*np)->regnum = num;
4601 if (list_capture != 0)
4602 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4603 env->num_named++;
4604 }
4605 else {
4606 return ONIGERR_UNDEFINED_GROUP_OPTION;
4607 }
4608 }
4609 #else
4610 else {
4611 return ONIGERR_UNDEFINED_GROUP_OPTION;
4612 }
4613 #endif
4614 break;
4615
4616 case '@':
4617 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4618 #ifdef USE_NAMED_GROUP
4619 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4620 PFETCH(c);
4621 if (c == '<' || c == '\'') {
4622 list_capture = 1;
4623 goto named_group2; /* (?@<name>...) */
4624 }
4625 PUNFETCH;
4626 }
4627 #endif
4628 *np = node_new_enclose_memory(env->option, 0);
4629 CHECK_NULL_RETURN_MEMERR(*np);
4630 num = scan_env_add_mem_entry(env);
4631 if (num < 0) {
4632 onig_node_free(*np);
4633 return num;
4634 }
4635 else if (num >= (int )BIT_STATUS_BITS_NUM) {
4636 onig_node_free(*np);
4637 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4638 }
4639 NENCLOSE(*np)->regnum = num;
4640 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4641 }
4642 else {
4643 return ONIGERR_UNDEFINED_GROUP_OPTION;
4644 }
4645 break;
4646
4647 #ifdef USE_POSIXLINE_OPTION
4648 case 'p':
4649 #endif
4650 case '-': case 'i': case 'm': case 's': case 'x':
4651 {
4652 int neg = 0;
4653
4654 while (1) {
4655 switch (c) {
4656 case ':':
4657 case ')':
4658 break;
4659
4660 case '-': neg = 1; break;
4661 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
4662 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4663 case 's':
4664 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4665 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4666 }
4667 else
4668 return ONIGERR_UNDEFINED_GROUP_OPTION;
4669 break;
4670
4671 case 'm':
4672 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4673 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4674 }
4675 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4676 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4677 }
4678 else
4679 return ONIGERR_UNDEFINED_GROUP_OPTION;
4680 break;
4681 #ifdef USE_POSIXLINE_OPTION
4682 case 'p':
4683 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4684 break;
4685 #endif
4686 default:
4687 return ONIGERR_UNDEFINED_GROUP_OPTION;
4688 }
4689
4690 if (c == ')') {
4691 *np = node_new_option(option);
4692 CHECK_NULL_RETURN_MEMERR(*np);
4693 *src = p;
4694 return 2; /* option only */
4695 }
4696 else if (c == ':') {
4697 OnigOptionType prev = env->option;
4698
4699 env->option = option;
4700 r = fetch_token(tok, &p, end, env);
4701 if (r < 0) return r;
4702 r = parse_subexp(&target, tok, term, &p, end, env);
4703 env->option = prev;
4704 if (r < 0) return r;
4705 *np = node_new_option(option);
4706 CHECK_NULL_RETURN_MEMERR(*np);
4707 NENCLOSE(*np)->target = target;
4708 *src = p;
4709 return 0;
4710 }
4711
4712 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4713 PFETCH(c);
4714 }
4715 }
4716 break;
4717
4718 default:
4719 return ONIGERR_UNDEFINED_GROUP_OPTION;
4720 }
4721 }
4722 else {
4723 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4724 goto group;
4725
4726 *np = node_new_enclose_memory(env->option, 0);
4727 CHECK_NULL_RETURN_MEMERR(*np);
4728 num = scan_env_add_mem_entry(env);
4729 if (num < 0) return num;
4730 NENCLOSE(*np)->regnum = num;
4731 }
4732
4733 CHECK_NULL_RETURN_MEMERR(*np);
4734 r = fetch_token(tok, &p, end, env);
4735 if (r < 0) return r;
4736 r = parse_subexp(&target, tok, term, &p, end, env);
4737 if (r < 0) return r;
4738
4739 if (NTYPE(*np) == NT_ANCHOR)
4740 NANCHOR(*np)->target = target;
4741 else {
4742 NENCLOSE(*np)->target = target;
4743 if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
4744 /* Don't move this to previous of parse_subexp() */
4745 r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
4746 if (r != 0) return r;
4747 }
4748 }
4749
4750 *src = p;
4751 return 0;
4752 }
4753
/* Text of the quantifiers used in nested-repeat warnings; set_quantifier()
   indexes this table with the result of popular_quantifier_num(). */
static const char* PopularQStr[] = {
  "?",
  "*",
  "+",
  "??",
  "*?",
  "+?"
};

/* Replacement text used in nested-repeat warnings; set_quantifier()
   indexes this table with an entry of ReduceTypeTable. */
static const char* ReduceQStr[] = {
  "",
  "",
  "*",
  "*?",
  "??",
  "+ and ??",
  "+? and ?"
};
4761
4762 static int
set_quantifier(Node * qnode,Node * target,int group,ScanEnv * env)4763 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4764 {
4765 QtfrNode* qn;
4766
4767 qn = NQTFR(qnode);
4768 if (qn->lower == 1 && qn->upper == 1) {
4769 return 1;
4770 }
4771
4772 switch (NTYPE(target)) {
4773 case NT_STR:
4774 if (! group) {
4775 StrNode* sn = NSTR(target);
4776 if (str_node_can_be_split(sn, env->enc)) {
4777 Node* n = str_node_split_last_char(sn, env->enc);
4778 if (IS_NOT_NULL(n)) {
4779 qn->target = n;
4780 return 2;
4781 }
4782 }
4783 }
4784 break;
4785
4786 case NT_QTFR:
4787 { /* check redundant double repeat. */
4788 /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
4789 QtfrNode* qnt = NQTFR(target);
4790 int nestq_num = popular_quantifier_num(qn);
4791 int targetq_num = popular_quantifier_num(qnt);
4792
4793 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
4794 if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4795 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4796 UChar buf[WARN_BUFSIZE];
4797
4798 switch(ReduceTypeTable[targetq_num][nestq_num]) {
4799 case RQ_ASIS:
4800 break;
4801
4802 case RQ_DEL:
4803 if (onig_verb_warn != onig_null_warn) {
4804 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4805 env->pattern, env->pattern_end,
4806 (UChar* )"redundant nested repeat operator");
4807 (*onig_verb_warn)((char* )buf);
4808 }
4809 goto warn_exit;
4810 break;
4811
4812 default:
4813 if (onig_verb_warn != onig_null_warn) {
4814 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4815 env->pattern, env->pattern_end,
4816 (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4817 PopularQStr[targetq_num], PopularQStr[nestq_num],
4818 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4819 (*onig_verb_warn)((char* )buf);
4820 }
4821 goto warn_exit;
4822 break;
4823 }
4824 }
4825
4826 warn_exit:
4827 #endif
4828 if (targetq_num >= 0) {
4829 if (nestq_num >= 0) {
4830 onig_reduce_nested_quantifier(qnode, target);
4831 goto q_exit;
4832 }
4833 else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
4834 /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
4835 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4836 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4837 }
4838 }
4839 }
4840 }
4841 break;
4842
4843 default:
4844 break;
4845 }
4846
4847 qn->target = target;
4848 q_exit:
4849 return 0;
4850 }
4851
4852
4853 #ifdef USE_SHARED_CCLASS_TABLE
4854
4855 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8
4856
4857 /* for ctype node hash table */
4858
4859 typedef struct {
4860 OnigEncoding enc;
4861 int not;
4862 int type;
4863 } type_cclass_key;
4864
/* Key comparison for the ctype node hash table: returns 0 when type, enc
   and not are all equal in both keys, 1 otherwise. */
static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
{
  int same = (x->type == y->type &&
              x->enc  == y->enc  &&
              x->not  == y->not);

  return same ? 0 : 1;
}
4872
/*
 * Hash function for the ctype node hash table.
 *
 * Mixes the object bytes of key->enc and key->type with the multiplier
 * 997, adds key->not, and folds in the value shifted right by 5.
 *
 * The accumulation is done in unsigned arithmetic: a signed accumulator
 * overflows after a few multiplications, which is undefined behaviour
 * in C.  Equal keys still hash to equal values.
 * NOTE(review): the values can differ from those produced with a signed
 * accumulator; this assumes hash values are only used in-process and are
 * not persisted -- confirm.
 */
static int type_cclass_hash(type_cclass_key* key)
{
  int i;
  unsigned int val;
  const UChar *p;

  val = 0;

  p = (const UChar* )&(key->enc);
  for (i = 0; i < (int )sizeof(key->enc); i++) {
    val = val * 997u + (unsigned int )*p++;
  }

  p = (const UChar* )(&key->type);
  for (i = 0; i < (int )sizeof(key->type); i++) {
    val = val * 997u + (unsigned int )*p++;
  }

  val += (unsigned int )key->not;
  return (int )(val + (val >> 5));
}
4893
4894 static struct st_hash_type type_type_cclass_hash = {
4895 type_cclass_cmp,
4896 type_cclass_hash,
4897 };
4898
4899 static st_table* OnigTypeCClassTable;
4900
4901
4902 static int
i_free_shared_class(type_cclass_key * key,Node * node,void * arg ARG_UNUSED)4903 i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
4904 {
4905 if (IS_NOT_NULL(node)) {
4906 CClassNode* cc = NCCLASS(node);
4907 if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
4908 xfree(node);
4909 }
4910
4911 if (IS_NOT_NULL(key)) xfree(key);
4912 return ST_DELETE;
4913 }
4914
4915 extern int
onig_free_shared_cclass_table(void)4916 onig_free_shared_cclass_table(void)
4917 {
4918 if (IS_NOT_NULL(OnigTypeCClassTable)) {
4919 onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
4920 onig_st_free_table(OnigTypeCClassTable);
4921 OnigTypeCClassTable = NULL;
4922 }
4923
4924 return 0;
4925 }
4926
4927 #endif /* USE_SHARED_CCLASS_TABLE */
4928
4929
4930 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4931 static int
clear_not_flag_cclass(CClassNode * cc,OnigEncoding enc)4932 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
4933 {
4934 BBuf *tbuf;
4935 int r;
4936
4937 if (IS_NCCLASS_NOT(cc)) {
4938 bitset_invert(cc->bs);
4939
4940 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4941 r = not_code_range_buf(enc, cc->mbuf, &tbuf);
4942 if (r != 0) return r;
4943
4944 bbuf_free(cc->mbuf);
4945 cc->mbuf = tbuf;
4946 }
4947
4948 NCCLASS_CLEAR_NOT(cc);
4949 }
4950
4951 return 0;
4952 }
4953 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
4954
4955 typedef struct {
4956 ScanEnv* env;
4957 CClassNode* cc;
4958 Node* alt_root;
4959 Node** ptail;
4960 } IApplyCaseFoldArg;
4961
4962 static int
i_apply_case_fold(OnigCodePoint from,OnigCodePoint to[],int to_len,void * arg)4963 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
4964 int to_len, void* arg)
4965 {
4966 IApplyCaseFoldArg* iarg;
4967 ScanEnv* env;
4968 CClassNode* cc;
4969 BitSetRef bs;
4970
4971 iarg = (IApplyCaseFoldArg* )arg;
4972 env = iarg->env;
4973 cc = iarg->cc;
4974 bs = cc->bs;
4975
4976 if (to_len == 1) {
4977 int is_in = onig_is_code_in_cc(env->enc, from, cc);
4978 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4979 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
4980 (is_in == 0 && IS_NCCLASS_NOT(cc))) {
4981 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4982 add_code_range(&(cc->mbuf), env, *to, *to);
4983 }
4984 else {
4985 BITSET_SET_BIT(bs, *to);
4986 }
4987 }
4988 #else
4989 if (is_in != 0) {
4990 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4991 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
4992 add_code_range(&(cc->mbuf), env, *to, *to);
4993 }
4994 else {
4995 if (IS_NCCLASS_NOT(cc)) {
4996 BITSET_CLEAR_BIT(bs, *to);
4997 }
4998 else
4999 BITSET_SET_BIT(bs, *to);
5000 }
5001 }
5002 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
5003 }
5004 else {
5005 int r, i, len;
5006 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5007 Node *snode = NULL_NODE;
5008
5009 if (onig_is_code_in_cc(env->enc, from, cc)
5010 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5011 && !IS_NCCLASS_NOT(cc)
5012 #endif
5013 ) {
5014 for (i = 0; i < to_len; i++) {
5015 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
5016 if (i == 0) {
5017 snode = onig_node_new_str(buf, buf + len);
5018 CHECK_NULL_RETURN_MEMERR(snode);
5019
5020 /* char-class expanded multi-char only
5021 compare with string folded at match time. */
5022 NSTRING_SET_AMBIG(snode);
5023 }
5024 else {
5025 r = onig_node_str_cat(snode, buf, buf + len);
5026 if (r < 0) {
5027 onig_node_free(snode);
5028 return r;
5029 }
5030 }
5031 }
5032
5033 *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
5034 CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
5035 iarg->ptail = &(NCDR((*(iarg->ptail))));
5036 }
5037 }
5038
5039 return 0;
5040 }
5041
5042 static int
parse_exp(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5043 parse_exp(Node** np, OnigToken* tok, int term,
5044 UChar** src, UChar* end, ScanEnv* env)
5045 {
5046 int r, len, group = 0;
5047 Node* qn;
5048 Node** targetp;
5049
5050 *np = NULL;
5051 if (tok->type == (enum TokenSyms )term)
5052 goto end_of_token;
5053
5054 switch (tok->type) {
5055 case TK_ALT:
5056 case TK_EOT:
5057 end_of_token:
5058 *np = node_new_empty();
5059 return tok->type;
5060 break;
5061
5062 case TK_SUBEXP_OPEN:
5063 r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
5064 if (r < 0) return r;
5065 if (r == 1) group = 1;
5066 else if (r == 2) { /* option only */
5067 Node* target;
5068 OnigOptionType prev = env->option;
5069
5070 env->option = NENCLOSE(*np)->option;
5071 r = fetch_token(tok, src, end, env);
5072 if (r < 0) return r;
5073 r = parse_subexp(&target, tok, term, src, end, env);
5074 env->option = prev;
5075 if (r < 0) return r;
5076 NENCLOSE(*np)->target = target;
5077 return tok->type;
5078 }
5079 break;
5080
5081 case TK_SUBEXP_CLOSE:
5082 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
5083 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
5084
5085 if (tok->escaped) goto tk_raw_byte;
5086 else goto tk_byte;
5087 break;
5088
5089 case TK_STRING:
5090 tk_byte:
5091 {
5092 *np = node_new_str(tok->backp, *src);
5093 CHECK_NULL_RETURN_MEMERR(*np);
5094
5095 while (1) {
5096 r = fetch_token(tok, src, end, env);
5097 if (r < 0) return r;
5098 if (r != TK_STRING) break;
5099
5100 r = onig_node_str_cat(*np, tok->backp, *src);
5101 if (r < 0) return r;
5102 }
5103
5104 string_end:
5105 targetp = np;
5106 goto repeat;
5107 }
5108 break;
5109
5110 case TK_RAW_BYTE:
5111 tk_raw_byte:
5112 {
5113 *np = node_new_str_raw_char((UChar )tok->u.c);
5114 CHECK_NULL_RETURN_MEMERR(*np);
5115 len = 1;
5116 while (1) {
5117 if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
5118 if (len == enclen(env->enc, NSTR(*np)->s)) {
5119 r = fetch_token(tok, src, end, env);
5120 NSTRING_CLEAR_RAW(*np);
5121 goto string_end;
5122 }
5123 }
5124
5125 r = fetch_token(tok, src, end, env);
5126 if (r < 0) return r;
5127 if (r != TK_RAW_BYTE) {
5128 /* Don't use this, it is wrong for little endian encodings. */
5129 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
5130 int rem;
5131 if (len < ONIGENC_MBC_MINLEN(env->enc)) {
5132 rem = ONIGENC_MBC_MINLEN(env->enc) - len;
5133 (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
5134 if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
5135 NSTRING_CLEAR_RAW(*np);
5136 goto string_end;
5137 }
5138 }
5139 #endif
5140 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
5141 }
5142
5143 r = node_str_cat_char(*np, (UChar )tok->u.c);
5144 if (r < 0) return r;
5145
5146 len++;
5147 }
5148 }
5149 break;
5150
5151 case TK_CODE_POINT:
5152 {
5153 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5154 int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
5155 if (num < 0) return num;
5156 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
5157 *np = node_new_str_raw(buf, buf + num);
5158 #else
5159 *np = node_new_str(buf, buf + num);
5160 #endif
5161 CHECK_NULL_RETURN_MEMERR(*np);
5162 }
5163 break;
5164
5165 case TK_QUOTE_OPEN:
5166 {
5167 OnigCodePoint end_op[2];
5168 UChar *qstart, *qend, *nextp;
5169
5170 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
5171 end_op[1] = (OnigCodePoint )'E';
5172 qstart = *src;
5173 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
5174 if (IS_NULL(qend)) {
5175 nextp = qend = end;
5176 }
5177 *np = node_new_str(qstart, qend);
5178 CHECK_NULL_RETURN_MEMERR(*np);
5179 *src = nextp;
5180 }
5181 break;
5182
5183 case TK_CHAR_TYPE:
5184 {
5185 switch (tok->u.prop.ctype) {
5186 case ONIGENC_CTYPE_WORD:
5187 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
5188 CHECK_NULL_RETURN_MEMERR(*np);
5189 break;
5190
5191 case ONIGENC_CTYPE_SPACE:
5192 case ONIGENC_CTYPE_DIGIT:
5193 case ONIGENC_CTYPE_XDIGIT:
5194 {
5195 CClassNode* cc;
5196
5197 #ifdef USE_SHARED_CCLASS_TABLE
5198 const OnigCodePoint *mbr;
5199 OnigCodePoint sb_out;
5200
5201 r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
5202 &sb_out, &mbr);
5203 if (r == 0 &&
5204 ONIGENC_CODE_RANGE_NUM(mbr)
5205 >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
5206 type_cclass_key key;
5207 type_cclass_key* new_key;
5208
5209 key.enc = env->enc;
5210 key.not = tok->u.prop.not;
5211 key.type = tok->u.prop.ctype;
5212
5213 THREAD_ATOMIC_START;
5214
5215 if (IS_NULL(OnigTypeCClassTable)) {
5216 OnigTypeCClassTable
5217 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
5218 if (IS_NULL(OnigTypeCClassTable)) {
5219 THREAD_ATOMIC_END;
5220 return ONIGERR_MEMORY;
5221 }
5222 }
5223 else {
5224 if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
5225 (st_data_t* )np)) {
5226 THREAD_ATOMIC_END;
5227 break;
5228 }
5229 }
5230
5231 *np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
5232 sb_out, mbr);
5233 if (IS_NULL(*np)) {
5234 THREAD_ATOMIC_END;
5235 return ONIGERR_MEMORY;
5236 }
5237
5238 cc = NCCLASS(*np);
5239 NCCLASS_SET_SHARE(cc);
5240 new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
5241 xmemcpy(new_key, &key, sizeof(type_cclass_key));
5242 onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
5243 (st_data_t )*np);
5244
5245 THREAD_ATOMIC_END;
5246 }
5247 else {
5248 #endif
5249 *np = node_new_cclass();
5250 CHECK_NULL_RETURN_MEMERR(*np);
5251 cc = NCCLASS(*np);
5252 add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
5253 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
5254 #ifdef USE_SHARED_CCLASS_TABLE
5255 }
5256 #endif
5257 }
5258 break;
5259
5260 default:
5261 return ONIGERR_PARSER_BUG;
5262 break;
5263 }
5264 }
5265 break;
5266
5267 case TK_CHAR_PROPERTY:
5268 r = parse_char_property(np, tok, src, end, env);
5269 if (r != 0) return r;
5270 break;
5271
5272 case TK_CC_OPEN:
5273 {
5274 CClassNode* cc;
5275
5276 r = parse_char_class(np, tok, src, end, env);
5277 if (r != 0) return r;
5278
5279 cc = NCCLASS(*np);
5280 if (IS_IGNORECASE(env->option)) {
5281 IApplyCaseFoldArg iarg;
5282
5283 iarg.env = env;
5284 iarg.cc = cc;
5285 iarg.alt_root = NULL_NODE;
5286 iarg.ptail = &(iarg.alt_root);
5287
5288 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
5289 i_apply_case_fold, &iarg);
5290 if (r != 0) {
5291 onig_node_free(iarg.alt_root);
5292 return r;
5293 }
5294 if (IS_NOT_NULL(iarg.alt_root)) {
5295 Node* work = onig_node_new_alt(*np, iarg.alt_root);
5296 if (IS_NULL(work)) {
5297 onig_node_free(iarg.alt_root);
5298 return ONIGERR_MEMORY;
5299 }
5300 *np = work;
5301 }
5302 }
5303 }
5304 break;
5305
5306 case TK_ANYCHAR:
5307 *np = node_new_anychar();
5308 CHECK_NULL_RETURN_MEMERR(*np);
5309 break;
5310
5311 case TK_ANYCHAR_ANYTIME:
5312 *np = node_new_anychar();
5313 CHECK_NULL_RETURN_MEMERR(*np);
5314 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5315 CHECK_NULL_RETURN_MEMERR(qn);
5316 NQTFR(qn)->target = *np;
5317 *np = qn;
5318 break;
5319
5320 case TK_BACKREF:
5321 len = tok->u.backref.num;
5322 *np = node_new_backref(len,
5323 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5324 tok->u.backref.by_name,
5325 #ifdef USE_BACKREF_WITH_LEVEL
5326 tok->u.backref.exist_level,
5327 tok->u.backref.level,
5328 #endif
5329 env);
5330 CHECK_NULL_RETURN_MEMERR(*np);
5331 break;
5332
5333 #ifdef USE_SUBEXP_CALL
5334 case TK_CALL:
5335 {
5336 int gnum = tok->u.call.gnum;
5337
5338 if (gnum < 0) {
5339 gnum = BACKREF_REL_TO_ABS(gnum, env);
5340 if (gnum <= 0)
5341 return ONIGERR_INVALID_BACKREF;
5342 }
5343 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
5344 CHECK_NULL_RETURN_MEMERR(*np);
5345 env->num_call++;
5346 }
5347 break;
5348 #endif
5349
5350 case TK_ANCHOR:
5351 *np = onig_node_new_anchor(tok->u.anchor);
5352 break;
5353
5354 case TK_OP_REPEAT:
5355 case TK_INTERVAL:
5356 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5357 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5358 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5359 else
5360 *np = node_new_empty();
5361 }
5362 else {
5363 goto tk_byte;
5364 }
5365 break;
5366
5367 default:
5368 return ONIGERR_PARSER_BUG;
5369 break;
5370 }
5371
5372 {
5373 targetp = np;
5374
5375 re_entry:
5376 r = fetch_token(tok, src, end, env);
5377 if (r < 0) return r;
5378
5379 repeat:
5380 if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5381 if (is_invalid_quantifier_target(*targetp))
5382 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5383
5384 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5385 (r == TK_INTERVAL ? 1 : 0));
5386 CHECK_NULL_RETURN_MEMERR(qn);
5387 NQTFR(qn)->greedy = tok->u.repeat.greedy;
5388 r = set_quantifier(qn, *targetp, group, env);
5389 if (r < 0) {
5390 onig_node_free(qn);
5391 return r;
5392 }
5393
5394 if (tok->u.repeat.possessive != 0) {
5395 Node* en;
5396 en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5397 if (IS_NULL(en)) {
5398 onig_node_free(qn);
5399 return ONIGERR_MEMORY;
5400 }
5401 NENCLOSE(en)->target = qn;
5402 qn = en;
5403 }
5404
5405 if (r == 0) {
5406 *targetp = qn;
5407 }
5408 else if (r == 1) {
5409 onig_node_free(qn);
5410 }
5411 else if (r == 2) { /* split case: /abc+/ */
5412 Node *tmp;
5413
5414 *targetp = node_new_list(*targetp, NULL);
5415 if (IS_NULL(*targetp)) {
5416 onig_node_free(qn);
5417 return ONIGERR_MEMORY;
5418 }
5419 tmp = NCDR(*targetp) = node_new_list(qn, NULL);
5420 if (IS_NULL(tmp)) {
5421 onig_node_free(qn);
5422 return ONIGERR_MEMORY;
5423 }
5424 targetp = &(NCAR(tmp));
5425 }
5426 goto re_entry;
5427 }
5428 }
5429
5430 return r;
5431 }
5432
5433 static int
parse_branch(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5434 parse_branch(Node** top, OnigToken* tok, int term,
5435 UChar** src, UChar* end, ScanEnv* env)
5436 {
5437 int r;
5438 Node *node, **headp;
5439
5440 *top = NULL;
5441 r = parse_exp(&node, tok, term, src, end, env);
5442 if (r < 0) return r;
5443
5444 if (r == TK_EOT || r == term || r == TK_ALT) {
5445 *top = node;
5446 }
5447 else {
5448 *top = node_new_list(node, NULL);
5449 headp = &(NCDR(*top));
5450 while (r != TK_EOT && r != term && r != TK_ALT) {
5451 r = parse_exp(&node, tok, term, src, end, env);
5452 if (r < 0) return r;
5453
5454 if (NTYPE(node) == NT_LIST) {
5455 *headp = node;
5456 while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
5457 headp = &(NCDR(node));
5458 }
5459 else {
5460 *headp = node_new_list(node, NULL);
5461 headp = &(NCDR(*headp));
5462 }
5463 }
5464 }
5465
5466 return r;
5467 }
5468
5469 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
5470 static int
parse_subexp(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5471 parse_subexp(Node** top, OnigToken* tok, int term,
5472 UChar** src, UChar* end, ScanEnv* env)
5473 {
5474 int r;
5475 Node *node, **headp;
5476
5477 *top = NULL;
5478 r = parse_branch(&node, tok, term, src, end, env);
5479 if (r < 0) {
5480 onig_node_free(node);
5481 return r;
5482 }
5483
5484 if (r == term) {
5485 *top = node;
5486 }
5487 else if (r == TK_ALT) {
5488 *top = onig_node_new_alt(node, NULL);
5489 headp = &(NCDR(*top));
5490 while (r == TK_ALT) {
5491 r = fetch_token(tok, src, end, env);
5492 if (r < 0) return r;
5493 r = parse_branch(&node, tok, term, src, end, env);
5494 if (r < 0) return r;
5495
5496 *headp = onig_node_new_alt(node, NULL);
5497 headp = &(NCDR(*headp));
5498 }
5499
5500 if (tok->type != (enum TokenSyms )term)
5501 goto err;
5502 }
5503 else {
5504 err:
5505 if (term == TK_SUBEXP_CLOSE)
5506 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5507 else
5508 return ONIGERR_PARSER_BUG;
5509 }
5510
5511 return r;
5512 }
5513
5514 static int
parse_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)5515 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5516 {
5517 int r;
5518 OnigToken tok;
5519
5520 r = fetch_token(&tok, src, end, env);
5521 if (r < 0) return r;
5522 r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5523 if (r < 0) return r;
5524 return 0;
5525 }
5526
5527 extern int
onig_parse_make_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)5528 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
5529 regex_t* reg, ScanEnv* env)
5530 {
5531 int r;
5532 UChar* p;
5533
5534 #ifdef USE_NAMED_GROUP
5535 names_clear(reg);
5536 #endif
5537
5538 scan_env_clear(env);
5539 env->option = reg->options;
5540 env->case_fold_flag = reg->case_fold_flag;
5541 env->enc = reg->enc;
5542 env->syntax = reg->syntax;
5543 env->pattern = (UChar* )pattern;
5544 env->pattern_end = (UChar* )end;
5545 env->reg = reg;
5546
5547 *root = NULL;
5548 p = (UChar* )pattern;
5549 r = parse_regexp(root, &p, (UChar* )end, env);
5550 reg->num_mem = env->num_mem;
5551 return r;
5552 }
5553
5554 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode ARG_UNUSED,UChar * arg,UChar * arg_end)5555 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
5556 UChar* arg, UChar* arg_end)
5557 {
5558 env->error = arg;
5559 env->error_end = arg_end;
5560 }
5561