/**********************************************************************
  regparse.c -  Oniguruma (regular expression library)
**********************************************************************/
/*-
 * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
29
30 #include "regparse.h"
31
32 #define WARN_BUFSIZE 256
33
34 OnigSyntaxType OnigSyntaxRuby = {
35 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
36 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
37 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
38 ONIG_SYN_OP_ESC_C_CONTROL )
39 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
40 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
41 ONIG_SYN_OP2_OPTION_RUBY |
42 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
43 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
44 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
45 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
46 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
47 ONIG_SYN_OP2_ESC_H_XDIGIT )
48 , ( SYN_GNU_REGEX_BV |
49 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
50 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
51 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
52 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
53 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
54 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
55 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
56 , ONIG_OPTION_NONE
57 };
58
59 OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
60
/* Warning handler that discards the message; the default when no handler is configured. */
extern void
onig_null_warn(const char* s)
{
  (void )s;  /* deliberately ignored */
}
62
63 #ifdef RUBY_PLATFORM
/*
 * Warning handler for Ruby builds: hands the message to rb_warn().
 * The text is passed as a "%s" argument, never as the format string.
 */
extern void
onig_rb_warn(const char* s)
{
  rb_warn("%s", s);
}
69
/*
 * Warning handler for Ruby builds: hands the message to rb_warning().
 * The text is passed as a "%s" argument, never as the format string.
 */
extern void
onig_rb_warning(const char* s)
{
  rb_warning("%s", s);
}
75 #endif
76
77 #ifdef DEFAULT_WARN_FUNCTION
78 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
79 #else
80 static OnigWarnFunc onig_warn = onig_null_warn;
81 #endif
82
83 #ifdef DEFAULT_VERB_WARN_FUNCTION
84 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
85 #else
86 static OnigWarnFunc onig_verb_warn = onig_null_warn;
87 #endif
88
/* Install `f` as the handler for ordinary warnings (stored as-is, not validated). */
extern void
onig_set_warn_func(OnigWarnFunc f)
{
  onig_warn = f;
}
93
/* Install `f` as the handler for verbose warnings (stored as-is, not validated). */
extern void
onig_set_verb_warn_func(OnigWarnFunc f)
{
  onig_verb_warn = f;
}
98
99 static void
bbuf_free(BBuf * bbuf)100 bbuf_free(BBuf* bbuf)
101 {
102 if (IS_NOT_NULL(bbuf)) {
103 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
104 xfree(bbuf);
105 }
106 }
107
108 static int
bbuf_clone(BBuf ** rto,BBuf * from)109 bbuf_clone(BBuf** rto, BBuf* from)
110 {
111 int r;
112 BBuf *to;
113
114 *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
115 CHECK_NULL_RETURN_VAL(to, ONIGERR_MEMORY);
116 r = BBUF_INIT(to, from->alloc);
117 if (r != 0) return r;
118 to->used = from->used;
119 xmemcpy(to->p, from->p, from->used);
120 return 0;
121 }
122
/*
 * Clear the bits `f` in `v` when `negative` is non-zero, set them otherwise.
 * The whole expansion is parenthesized so the macro also behaves when it
 * is embedded in a larger expression.
 */
#define ONOFF(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))

/*
 * First code point of the multi-byte range: 0 when the encoding's minimum
 * character length is greater than one byte, 0x80 otherwise.
 */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)

/* Add the range [MBCODE_START_POS(enc), maximum OnigCodePoint] to *pbuf. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/*
 * For encodings that are not single-byte, add the full multi-byte range
 * to `mbuf`.  Uses the caller's variable `r` and returns from the
 * enclosing function when the add fails.
 */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r) return r;\
  }\
} while (0)
137
138
/*
 * Store 1 in `empty` when all BITSET_SIZE words of `bs` are zero,
 * otherwise store 0.  Scanning stops at the first non-zero word.
 */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i;\
  empty = 1;\
  i = 0;\
  while (i < BITSET_SIZE) {\
    if ((bs)[i] != 0) {\
      empty = 0;\
      break;\
    }\
    i++;\
  }\
} while (0)
148
149 static void
bitset_set_range(BitSetRef bs,int from,int to)150 bitset_set_range(BitSetRef bs, int from, int to)
151 {
152 int i;
153 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
154 BITSET_SET_BIT(bs, i);
155 }
156 }
157
#if 0
/* Disabled: fill every word of `bs` with all-one bits. */
static void
bitset_set_all(BitSetRef bs)
{
  int idx = 0;

  while (idx < BITSET_SIZE) {
    bs[idx] = ~((Bits )0);
    idx++;
  }
}
#endif
168
169 static void
bitset_invert(BitSetRef bs)170 bitset_invert(BitSetRef bs)
171 {
172 int i;
173 for (i = 0; i < BITSET_SIZE; i++) {
174 bs[i] = ~(bs[i]);
175 }
176 }
177
178 static void
bitset_invert_to(BitSetRef from,BitSetRef to)179 bitset_invert_to(BitSetRef from, BitSetRef to)
180 {
181 int i;
182 for (i = 0; i < BITSET_SIZE; i++) {
183 to[i] = ~(from[i]);
184 }
185 }
186
187 static void
bitset_and(BitSetRef dest,BitSetRef bs)188 bitset_and(BitSetRef dest, BitSetRef bs)
189 {
190 int i;
191 for (i = 0; i < BITSET_SIZE; i++) {
192 dest[i] &= bs[i];
193 }
194 }
195
196 static void
bitset_or(BitSetRef dest,BitSetRef bs)197 bitset_or(BitSetRef dest, BitSetRef bs)
198 {
199 int i;
200 for (i = 0; i < BITSET_SIZE; i++) {
201 dest[i] |= bs[i];
202 }
203 }
204
205 static void
bitset_copy(BitSetRef dest,BitSetRef bs)206 bitset_copy(BitSetRef dest, BitSetRef bs)
207 {
208 int i;
209 for (i = 0; i < BITSET_SIZE; i++) {
210 dest[i] = bs[i];
211 }
212 }
213
214 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)215 onig_strncmp(const UChar* s1, const UChar* s2, int n)
216 {
217 int x;
218
219 while (n-- > 0) {
220 x = *s2++ - *s1++;
221 if (x) return x;
222 }
223 return 0;
224 }
225
226 static void
k_strcpy(UChar * dest,const UChar * src,const UChar * end)227 k_strcpy(UChar* dest, const UChar* src, const UChar* end)
228 {
229 int len = end - src;
230 if (len > 0) {
231 xmemcpy(dest, src, len);
232 dest[len] = (UChar )0;
233 }
234 }
235
236 static UChar*
strdup_with_null(OnigEncoding enc,UChar * s,UChar * end)237 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
238 {
239 int slen, term_len, i;
240 UChar *r;
241
242 slen = end - s;
243 term_len = ONIGENC_MBC_MINLEN(enc);
244
245 r = (UChar* )xmalloc(slen + term_len);
246 CHECK_NULL_RETURN(r);
247 xmemcpy(r, s, slen);
248
249 for (i = 0; i < term_len; i++)
250 r[slen + i] = (UChar )0;
251
252 return r;
253 }
254
255
/*
 * Pattern scanning macros.
 *
 * They operate on variables of the enclosing function: `p` (current
 * position), `end` (end of pattern), `enc` (encoding) and `pfetch_prev`,
 * which PFETCH_READY declares.
 */

/* Value PPEEK yields when the end of the pattern has been reached. */
#define PEND_VALUE   0

/* Declares the variable that remembers the position before the last fetch. */
#define PFETCH_READY  UChar* pfetch_prev
/* 1 when `p` is at or past `end`, else 0. */
#define PEND         (p < end ?  0 : 1)
/* Step back to where the last PFETCH / PINC started. */
#define PUNFETCH     p = pfetch_prev
/* Advance `p` by one character, remembering the old position. */
#define PINC       do { \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
/* Read the code point at `p` into `c`, then advance as PINC does. */
#define PFETCH(c)  do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* Code point at `p` without advancing; PEND_VALUE at the end of the pattern. */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
/* True when the next code point equals `c`.  `c` is parenthesized so that
   an expression argument is converted and compared as a whole. */
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )(c))
274
275 static UChar*
k_strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)276 k_strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
277 int capa)
278 {
279 UChar* r;
280
281 if (dest)
282 r = (UChar* )xrealloc(dest, capa + 1);
283 else
284 r = (UChar* )xmalloc(capa + 1);
285
286 CHECK_NULL_RETURN(r);
287 k_strcpy(r + (dest_end - dest), src, src_end);
288 return r;
289 }
290
291 /* dest on static area */
292 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)293 strcat_capa_from_static(UChar* dest, UChar* dest_end,
294 const UChar* src, const UChar* src_end, int capa)
295 {
296 UChar* r;
297
298 r = (UChar* )xmalloc(capa + 1);
299 CHECK_NULL_RETURN(r);
300 k_strcpy(r, dest, dest_end);
301 k_strcpy(r + (dest_end - dest), src, src_end);
302 return r;
303 }
304
305 #ifdef USE_NAMED_GROUP
306
307 #define INIT_NAME_BACKREFS_ALLOC_NUM 8
308
309 typedef struct {
310 UChar* name;
311 int name_len; /* byte length */
312 int back_num; /* number of backrefs */
313 int back_alloc;
314 int back_ref1;
315 int* back_refs;
316 } NameEntry;
317
318 #ifdef USE_ST_HASH_TABLE
319
320 #include "st.h"
321
322 typedef struct {
323 unsigned char* s;
324 unsigned char* end;
325 } st_strend_key;
326
327 static int strend_cmp(st_strend_key*, st_strend_key*);
328 static int strend_hash(st_strend_key*);
329
330 static struct st_hash_type type_strend_hash = {
331 strend_cmp,
332 strend_hash,
333 };
334
335 static st_table*
onig_st_init_strend_table_with_size(int size)336 onig_st_init_strend_table_with_size(int size)
337 {
338 return onig_st_init_table_with_size(&type_strend_hash, size);
339 }
340
341 static int
onig_st_lookup_strend(st_table * table,const UChar * str_key,const UChar * end_key,st_data_t * value)342 onig_st_lookup_strend(st_table *table, const UChar* str_key, const UChar* end_key, st_data_t *value)
343 {
344 st_strend_key key;
345
346 key.s = (unsigned char* )str_key;
347 key.end = (unsigned char* )end_key;
348
349 return onig_st_lookup(table, (st_data_t )(&key), value);
350 }
351
352 static int
onig_st_insert_strend(st_table * table,const UChar * str_key,const UChar * end_key,st_data_t value)353 onig_st_insert_strend(st_table *table, const UChar* str_key, const UChar* end_key, st_data_t value)
354 {
355 st_strend_key* key;
356 int result;
357
358 key = (st_strend_key* )xmalloc(sizeof(st_strend_key));
359 key->s = (unsigned char* )str_key;
360 key->end = (unsigned char* )end_key;
361 result = onig_st_insert(table, (st_data_t )key, value);
362 if (result) {
363 xfree(key);
364 }
365 return result;
366 }
367
368 static int
strend_cmp(st_strend_key * x,st_strend_key * y)369 strend_cmp(st_strend_key* x, st_strend_key* y)
370 {
371 unsigned char *p, *q;
372 int c;
373
374 if ((x->end - x->s) != (y->end - y->s))
375 return 1;
376
377 p = x->s;
378 q = y->s;
379 while (p < x->end) {
380 c = (int )*p - (int )*q;
381 if (c != 0) return c;
382
383 p++; q++;
384 }
385
386 return 0;
387 }
388
389 static int
strend_hash(st_strend_key * x)390 strend_hash(st_strend_key* x)
391 {
392 int val;
393 unsigned char *p;
394
395 val = 0;
396 p = x->s;
397 while (p < x->end) {
398 val = val * 997 + (int )*p++;
399 }
400
401 return val + (val >> 5);
402 }
403
404 typedef st_table NameTable;
405 typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */
406
407 #define NAMEBUF_SIZE 24
408 #define NAMEBUF_SIZE_1 25
409
410 #ifdef ONIG_DEBUG
411 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)412 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
413 {
414 int i;
415 FILE* fp = (FILE* )arg;
416
417 fprintf(fp, "%s: ", e->name);
418 if (e->back_num == 0)
419 fputs("-", fp);
420 else if (e->back_num == 1)
421 fprintf(fp, "%d", e->back_ref1);
422 else {
423 for (i = 0; i < e->back_num; i++) {
424 if (i > 0) fprintf(fp, ", ");
425 fprintf(fp, "%d", e->back_refs[i]);
426 }
427 }
428 fputs("\n", fp);
429 return ST_CONTINUE;
430 }
431
432 extern int
onig_print_names(FILE * fp,regex_t * reg)433 onig_print_names(FILE* fp, regex_t* reg)
434 {
435 NameTable* t = (NameTable* )reg->name_table;
436
437 if (IS_NOT_NULL(t)) {
438 fprintf(fp, "name table\n");
439 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
440 fputs("\n", fp);
441 }
442 return 0;
443 }
444 #endif
445
446 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg)447 i_free_name_entry(UChar* key, NameEntry* e, void* arg)
448 {
449 xfree(e->name);
450 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
451 xfree(key);
452 xfree(e);
453 return ST_DELETE;
454 }
455
456 static int
names_clear(regex_t * reg)457 names_clear(regex_t* reg)
458 {
459 NameTable* t = (NameTable* )reg->name_table;
460
461 if (IS_NOT_NULL(t)) {
462 onig_st_foreach(t, i_free_name_entry, 0);
463 }
464 return 0;
465 }
466
467 extern int
onig_names_free(regex_t * reg)468 onig_names_free(regex_t* reg)
469 {
470 int r;
471 NameTable* t;
472
473 r = names_clear(reg);
474 if (r) return r;
475
476 t = (NameTable* )reg->name_table;
477 if (IS_NOT_NULL(t)) onig_st_free_table(t);
478 reg->name_table = (void* )NULL;
479 return 0;
480 }
481
482 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)483 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
484 {
485 NameEntry* e;
486 NameTable* t = (NameTable* )reg->name_table;
487
488 e = (NameEntry* )NULL;
489 if (IS_NOT_NULL(t)) {
490 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
491 }
492 return e;
493 }
494
495 typedef struct {
496 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
497 regex_t* reg;
498 void* arg;
499 int ret;
500 OnigEncoding enc;
501 } INamesArg;
502
503 static int
i_names(UChar * key,NameEntry * e,INamesArg * arg)504 i_names(UChar* key, NameEntry* e, INamesArg* arg)
505 {
506 int r = (*(arg->func))(e->name,
507 /*e->name + onigenc_str_bytelen_null(arg->enc, e->name), */
508 e->name + e->name_len,
509 e->back_num,
510 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
511 arg->reg, arg->arg);
512 if (r != 0) {
513 arg->ret = r;
514 return ST_STOP;
515 }
516 return ST_CONTINUE;
517 }
518
519 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)520 onig_foreach_name(regex_t* reg,
521 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*),
522 void* arg)
523 {
524 INamesArg narg;
525 NameTable* t = (NameTable* )reg->name_table;
526
527 narg.ret = 0;
528 if (IS_NOT_NULL(t)) {
529 narg.func = func;
530 narg.reg = reg;
531 narg.arg = arg;
532 narg.enc = reg->enc; /* should be pattern encoding. */
533 onig_st_foreach(t, i_names, (HashDataType )&narg);
534 }
535 return narg.ret;
536 }
537
538 static int
i_renumber_name(UChar * key,NameEntry * e,GroupNumRemap * map)539 i_renumber_name(UChar* key, NameEntry* e, GroupNumRemap* map)
540 {
541 int i;
542
543 if (e->back_num > 1) {
544 for (i = 0; i < e->back_num; i++) {
545 e->back_refs[i] = map[e->back_refs[i]].new_val;
546 }
547 }
548 else if (e->back_num == 1) {
549 e->back_ref1 = map[e->back_ref1].new_val;
550 }
551
552 return ST_CONTINUE;
553 }
554
555 extern int
onig_renumber_name_table(regex_t * reg,GroupNumRemap * map)556 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
557 {
558 NameTable* t = (NameTable* )reg->name_table;
559
560 if (IS_NOT_NULL(t)) {
561 onig_st_foreach(t, i_renumber_name, (HashDataType )map);
562 }
563 return 0;
564 }
565
566
567 extern int
onig_number_of_names(regex_t * reg)568 onig_number_of_names(regex_t* reg)
569 {
570 NameTable* t = (NameTable* )reg->name_table;
571
572 if (IS_NOT_NULL(t))
573 return t->num_entries;
574 else
575 return 0;
576 }
577
578 #else /* USE_ST_HASH_TABLE */
579
580 #define INIT_NAMES_ALLOC_NUM 8
581
582 typedef struct {
583 NameEntry* e;
584 int num;
585 int alloc;
586 } NameTable;
587
588
589 #ifdef ONIG_DEBUG
590 extern int
onig_print_names(FILE * fp,regex_t * reg)591 onig_print_names(FILE* fp, regex_t* reg)
592 {
593 int i, j;
594 NameEntry* e;
595 NameTable* t = (NameTable* )reg->name_table;
596
597 if (IS_NOT_NULL(t) && t->num > 0) {
598 fprintf(fp, "name table\n");
599 for (i = 0; i < t->num; i++) {
600 e = &(t->e[i]);
601 fprintf(fp, "%s: ", e->name);
602 if (e->back_num == 0) {
603 fputs("-", fp);
604 }
605 else if (e->back_num == 1) {
606 fprintf(fp, "%d", e->back_ref1);
607 }
608 else {
609 for (j = 0; j < e->back_num; j++) {
610 if (j > 0) fprintf(fp, ", ");
611 fprintf(fp, "%d", e->back_refs[j]);
612 }
613 }
614 fputs("\n", fp);
615 }
616 fputs("\n", fp);
617 }
618 return 0;
619 }
620 #endif
621
622 static int
names_clear(regex_t * reg)623 names_clear(regex_t* reg)
624 {
625 int i;
626 NameEntry* e;
627 NameTable* t = (NameTable* )reg->name_table;
628
629 if (IS_NOT_NULL(t)) {
630 for (i = 0; i < t->num; i++) {
631 e = &(t->e[i]);
632 if (IS_NOT_NULL(e->name)) {
633 xfree(e->name);
634 e->name = NULL;
635 e->name_len = 0;
636 e->back_num = 0;
637 e->back_alloc = 0;
638 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
639 e->back_refs = (int* )NULL;
640 }
641 }
642 if (IS_NOT_NULL(t->e)) {
643 xfree(t->e);
644 t->e = NULL;
645 }
646 t->num = 0;
647 }
648 return 0;
649 }
650
651 extern int
onig_names_free(regex_t * reg)652 onig_names_free(regex_t* reg)
653 {
654 int r;
655 NameTable* t;
656
657 r = names_clear(reg);
658 if (r) return r;
659
660 t = (NameTable* )reg->name_table;
661 if (IS_NOT_NULL(t)) xfree(t);
662 reg->name_table = NULL;
663 return 0;
664 }
665
666 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)667 name_find(regex_t* reg, UChar* name, UChar* name_end)
668 {
669 int i, len;
670 NameEntry* e;
671 NameTable* t = (NameTable* )reg->name_table;
672
673 if (IS_NOT_NULL(t)) {
674 len = name_end - name;
675 for (i = 0; i < t->num; i++) {
676 e = &(t->e[i]);
677 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
678 return e;
679 }
680 }
681 return (NameEntry* )NULL;
682 }
683
684 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)685 onig_foreach_name(regex_t* reg,
686 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*),
687 void* arg)
688 {
689 int i, r;
690 NameEntry* e;
691 NameTable* t = (NameTable* )reg->name_table;
692
693 if (IS_NOT_NULL(t)) {
694 for (i = 0; i < t->num; i++) {
695 e = &(t->e[i]);
696 r = (*func)(e->name, e->name + e->name_len, e->back_num,
697 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
698 reg, arg);
699 if (r != 0) return r;
700 }
701 }
702 return 0;
703 }
704
705 extern int
onig_number_of_names(regex_t * reg)706 onig_number_of_names(regex_t* reg)
707 {
708 NameTable* t = (NameTable* )reg->name_table;
709
710 if (IS_NOT_NULL(t))
711 return t->num;
712 else
713 return 0;
714 }
715
716 #endif /* else USE_ST_HASH_TABLE */
717
718 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)719 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
720 {
721 int alloc;
722 NameEntry* e;
723 NameTable* t = (NameTable* )reg->name_table;
724
725 if (name_end - name <= 0)
726 return ONIGERR_EMPTY_GROUP_NAME;
727
728 e = name_find(reg, name, name_end);
729 if (IS_NULL(e)) {
730 #ifdef USE_ST_HASH_TABLE
731 if (IS_NULL(t)) {
732 t = onig_st_init_strend_table_with_size(5);
733 reg->name_table = (void* )t;
734 }
735 e = (NameEntry* )xmalloc(sizeof(NameEntry));
736 CHECK_NULL_RETURN_VAL(e, ONIGERR_MEMORY);
737
738 e->name = strdup_with_null(reg->enc, name, name_end);
739 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
740 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
741 (HashDataType )e);
742
743 e->name_len = name_end - name;
744 e->back_num = 0;
745 e->back_alloc = 0;
746 e->back_refs = (int* )NULL;
747
748 #else
749
750 if (IS_NULL(t)) {
751 alloc = INIT_NAMES_ALLOC_NUM;
752 t = (NameTable* )xmalloc(sizeof(NameTable));
753 CHECK_NULL_RETURN_VAL(t, ONIGERR_MEMORY);
754 t->e = NULL;
755 t->alloc = 0;
756 t->num = 0;
757
758 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
759 if (IS_NULL(t->e)) {
760 xfree(t);
761 return ONIGERR_MEMORY;
762 }
763 t->alloc = alloc;
764 reg->name_table = t;
765 goto clear;
766 }
767 else if (t->num == t->alloc) {
768 int i;
769
770 alloc = t->alloc * 2;
771 t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
772 CHECK_NULL_RETURN_VAL(t->e, ONIGERR_MEMORY);
773 t->alloc = alloc;
774
775 clear:
776 for (i = t->num; i < t->alloc; i++) {
777 t->e[i].name = NULL;
778 t->e[i].name_len = 0;
779 t->e[i].back_num = 0;
780 t->e[i].back_alloc = 0;
781 t->e[i].back_refs = (int* )NULL;
782 }
783 }
784 e = &(t->e[t->num]);
785 t->num++;
786 e->name = strdup_with_null(reg->enc, name, name_end);
787 e->name_len = name_end - name;
788 #endif
789 }
790
791 if (e->back_num >= 1 &&
792 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
793 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
794 name, name_end);
795 return ONIGERR_MULTIPLEX_DEFINED_NAME;
796 }
797
798 e->back_num++;
799 if (e->back_num == 1) {
800 e->back_ref1 = backref;
801 }
802 else {
803 if (e->back_num == 2) {
804 alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
805 e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
806 CHECK_NULL_RETURN_VAL(e->back_refs, ONIGERR_MEMORY);
807 e->back_alloc = alloc;
808 e->back_refs[0] = e->back_ref1;
809 e->back_refs[1] = backref;
810 }
811 else {
812 if (e->back_num > e->back_alloc) {
813 alloc = e->back_alloc * 2;
814 e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
815 CHECK_NULL_RETURN_VAL(e->back_refs, ONIGERR_MEMORY);
816 e->back_alloc = alloc;
817 }
818 e->back_refs[e->back_num - 1] = backref;
819 }
820 }
821
822 return 0;
823 }
824
825 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)826 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
827 const UChar* name_end, int** nums)
828 {
829 NameEntry* e;
830
831 e = name_find(reg, name, name_end);
832 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
833
834 switch (e->back_num) {
835 case 0:
836 break;
837 case 1:
838 *nums = &(e->back_ref1);
839 break;
840 default:
841 *nums = e->back_refs;
842 break;
843 }
844 return e->back_num;
845 }
846
847 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)848 onig_name_to_backref_number(regex_t* reg, const UChar* name,
849 const UChar* name_end, OnigRegion *region)
850 {
851 int i, n, *nums;
852
853 n = onig_name_to_group_numbers(reg, name, name_end, &nums);
854 if (n < 0)
855 return n;
856 else if (n == 0)
857 return ONIGERR_PARSER_BUG;
858 else if (n == 1)
859 return nums[0];
860 else {
861 if (IS_NOT_NULL(region)) {
862 for (i = n - 1; i >= 0; i--) {
863 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
864 return nums[i];
865 }
866 }
867 return nums[n - 1];
868 }
869 }
870
871 #else /* USE_NAMED_GROUP */
872
873 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)874 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
875 const UChar* name_end, int** nums)
876 {
877 return ONIG_NO_SUPPORT_CONFIG;
878 }
879
880 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)881 onig_name_to_backref_number(regex_t* reg, const UChar* name,
882 const UChar* name_end, OnigRegion* region)
883 {
884 return ONIG_NO_SUPPORT_CONFIG;
885 }
886
887 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)888 onig_foreach_name(regex_t* reg,
889 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*),
890 void* arg)
891 {
892 return ONIG_NO_SUPPORT_CONFIG;
893 }
894
895 extern int
onig_number_of_names(regex_t * reg)896 onig_number_of_names(regex_t* reg)
897 {
898 return 0;
899 }
900 #endif /* else USE_NAMED_GROUP */
901
902 extern int
onig_noname_group_capture_is_active(regex_t * reg)903 onig_noname_group_capture_is_active(regex_t* reg)
904 {
905 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
906 return 0;
907
908 #ifdef USE_NAMED_GROUP
909 if (onig_number_of_names(reg) > 0 &&
910 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
911 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
912 return 0;
913 }
914 #endif
915
916 return 1;
917 }
918
919
920 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16
921
922 static void
scan_env_clear(ScanEnv * env)923 scan_env_clear(ScanEnv* env)
924 {
925 int i;
926
927 BIT_STATUS_CLEAR(env->capture_history);
928 BIT_STATUS_CLEAR(env->bt_mem_start);
929 BIT_STATUS_CLEAR(env->bt_mem_end);
930 BIT_STATUS_CLEAR(env->backrefed_mem);
931 env->error = (UChar* )NULL;
932 env->error_end = (UChar* )NULL;
933 env->num_call = 0;
934 env->num_mem = 0;
935 #ifdef USE_NAMED_GROUP
936 env->num_named = 0;
937 #endif
938 env->mem_alloc = 0;
939 env->mem_nodes_dynamic = (Node** )NULL;
940
941 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
942 env->mem_nodes_static[i] = NULL_NODE;
943
944 #ifdef USE_COMBINATION_EXPLOSION_CHECK
945 env->num_comb_exp_check = 0;
946 env->comb_exp_max_regnum = 0;
947 env->curr_max_regnum = 0;
948 env->has_recursion = 0;
949 #endif
950 }
951
952 static int
scan_env_add_mem_entry(ScanEnv * env)953 scan_env_add_mem_entry(ScanEnv* env)
954 {
955 int i, need, alloc;
956 Node** p;
957
958 need = env->num_mem + 1;
959 if (need >= SCANENV_MEMNODES_SIZE) {
960 if (env->mem_alloc <= need) {
961 if (IS_NULL(env->mem_nodes_dynamic)) {
962 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
963 p = (Node** )xmalloc(sizeof(Node*) * alloc);
964 xmemcpy(p, env->mem_nodes_static,
965 sizeof(Node*) * SCANENV_MEMNODES_SIZE);
966 }
967 else {
968 alloc = env->mem_alloc * 2;
969 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
970 }
971 CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
972
973 for (i = env->num_mem + 1; i < alloc; i++)
974 p[i] = NULL_NODE;
975
976 env->mem_nodes_dynamic = p;
977 env->mem_alloc = alloc;
978 }
979 }
980
981 env->num_mem++;
982 return env->num_mem;
983 }
984
985 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)986 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
987 {
988 if (env->num_mem >= num)
989 SCANENV_MEM_NODES(env)[num] = node;
990 else
991 return ONIGERR_PARSER_BUG;
992 return 0;
993 }
994
995
#ifdef USE_RECYCLE_NODE
/* Link cell laid over a freed node while it waits on the recycle list. */
typedef struct _FreeNode {
  struct _FreeNode* next;
} FreeNode;

/* Head of the list of nodes kept for reuse by node_new(). */
static FreeNode* FreeNodeList = (FreeNode* )NULL;
#endif
1003
1004 extern void
onig_node_free(Node * node)1005 onig_node_free(Node* node)
1006 {
1007 start:
1008 if (IS_NULL(node)) return ;
1009
1010 switch (NTYPE(node)) {
1011 case N_STRING:
1012 if (IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) {
1013 xfree(NSTRING(node).s);
1014 }
1015 break;
1016
1017 case N_LIST:
1018 case N_ALT:
1019 onig_node_free(NCONS(node).left);
1020 /* onig_node_free(NCONS(node).right); */
1021 {
1022 Node* next_node = NCONS(node).right;
1023
1024 #ifdef USE_RECYCLE_NODE
1025 {
1026 FreeNode* n = (FreeNode* )node;
1027
1028 THREAD_ATOMIC_START;
1029 n->next = FreeNodeList;
1030 FreeNodeList = n;
1031 THREAD_ATOMIC_END;
1032 }
1033 #else
1034 xfree(node);
1035 #endif
1036
1037 node = next_node;
1038 goto start;
1039 }
1040 break;
1041
1042 case N_CCLASS:
1043 {
1044 CClassNode* cc = &(NCCLASS(node));
1045
1046 if (IS_CCLASS_SHARE(cc))
1047 return ;
1048
1049 if (cc->mbuf)
1050 bbuf_free(cc->mbuf);
1051 }
1052 break;
1053
1054 case N_QUANTIFIER:
1055 if (NQUANTIFIER(node).target)
1056 onig_node_free(NQUANTIFIER(node).target);
1057 break;
1058
1059 case N_EFFECT:
1060 if (NEFFECT(node).target)
1061 onig_node_free(NEFFECT(node).target);
1062 break;
1063
1064 case N_BACKREF:
1065 if (IS_NOT_NULL(NBACKREF(node).back_dynamic))
1066 xfree(NBACKREF(node).back_dynamic);
1067 break;
1068
1069 case N_ANCHOR:
1070 if (NANCHOR(node).target)
1071 onig_node_free(NANCHOR(node).target);
1072 break;
1073 }
1074
1075 #ifdef USE_RECYCLE_NODE
1076 {
1077 FreeNode* n = (FreeNode* )node;
1078
1079 THREAD_ATOMIC_START;
1080 n->next = FreeNodeList;
1081 FreeNodeList = n;
1082 THREAD_ATOMIC_END;
1083 }
1084 #else
1085 xfree(node);
1086 #endif
1087 }
1088
1089 #ifdef USE_RECYCLE_NODE
1090 extern int
onig_free_node_list(void)1091 onig_free_node_list(void)
1092 {
1093 FreeNode* n;
1094
1095 /* THREAD_ATOMIC_START; */
1096 while (IS_NOT_NULL(FreeNodeList)) {
1097 n = FreeNodeList;
1098 FreeNodeList = FreeNodeList->next;
1099 xfree(n);
1100 }
1101 /* THREAD_ATOMIC_END; */
1102 return 0;
1103 }
1104 #endif
1105
1106 static Node*
node_new(void)1107 node_new(void)
1108 {
1109 Node* node;
1110
1111 #ifdef USE_RECYCLE_NODE
1112 THREAD_ATOMIC_START;
1113 if (IS_NOT_NULL(FreeNodeList)) {
1114 node = (Node* )FreeNodeList;
1115 FreeNodeList = FreeNodeList->next;
1116 THREAD_ATOMIC_END;
1117 return node;
1118 }
1119 THREAD_ATOMIC_END;
1120 #endif
1121
1122 node = (Node* )xmalloc(sizeof(Node));
1123 return node;
1124 }
1125
1126
1127 static void
initialize_cclass(CClassNode * cc)1128 initialize_cclass(CClassNode* cc)
1129 {
1130 BITSET_CLEAR(cc->bs);
1131 cc->flags = 0;
1132 cc->mbuf = NULL;
1133 }
1134
1135 static Node*
node_new_cclass(void)1136 node_new_cclass(void)
1137 {
1138 Node* node = node_new();
1139 CHECK_NULL_RETURN(node);
1140 node->type = N_CCLASS;
1141
1142 initialize_cclass(&(NCCLASS(node)));
1143 return node;
1144 }
1145
1146 static Node*
node_new_cclass_by_codepoint_range(int not,const OnigCodePoint sbr[],const OnigCodePoint mbr[])1147 node_new_cclass_by_codepoint_range(int not,
1148 const OnigCodePoint sbr[], const OnigCodePoint mbr[])
1149 {
1150 CClassNode* cc;
1151 int n, i, j;
1152
1153 Node* node = node_new();
1154 CHECK_NULL_RETURN(node);
1155 node->type = N_CCLASS;
1156
1157 cc = &(NCCLASS(node));
1158 cc->flags = 0;
1159 if (not != 0) CCLASS_SET_NOT(cc);
1160
1161 BITSET_CLEAR(cc->bs);
1162 if (IS_NOT_NULL(sbr)) {
1163 n = ONIGENC_CODE_RANGE_NUM(sbr);
1164 for (i = 0; i < n; i++) {
1165 for (j = ONIGENC_CODE_RANGE_FROM(sbr, i);
1166 j <= (int )ONIGENC_CODE_RANGE_TO(sbr, i); j++) {
1167 BITSET_SET_BIT(cc->bs, j);
1168 }
1169 }
1170 }
1171
1172 if (IS_NULL(mbr)) {
1173 is_null:
1174 cc->mbuf = NULL;
1175 }
1176 else {
1177 BBuf* bbuf;
1178
1179 n = ONIGENC_CODE_RANGE_NUM(mbr);
1180 if (n == 0) goto is_null;
1181
1182 bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1183 CHECK_NULL_RETURN_VAL(bbuf, NULL);
1184 bbuf->alloc = n + 1;
1185 bbuf->used = n + 1;
1186 bbuf->p = (UChar* )((void* )mbr);
1187
1188 cc->mbuf = bbuf;
1189 }
1190
1191 return node;
1192 }
1193
1194 static Node*
node_new_ctype(int type)1195 node_new_ctype(int type)
1196 {
1197 Node* node = node_new();
1198 CHECK_NULL_RETURN(node);
1199 node->type = N_CTYPE;
1200 NCTYPE(node).type = type;
1201 return node;
1202 }
1203
1204 static Node*
node_new_anychar(void)1205 node_new_anychar(void)
1206 {
1207 Node* node = node_new();
1208 CHECK_NULL_RETURN(node);
1209 node->type = N_ANYCHAR;
1210 return node;
1211 }
1212
1213 static Node*
node_new_list(Node * left,Node * right)1214 node_new_list(Node* left, Node* right)
1215 {
1216 Node* node = node_new();
1217 CHECK_NULL_RETURN(node);
1218 node->type = N_LIST;
1219 NCONS(node).left = left;
1220 NCONS(node).right = right;
1221 return node;
1222 }
1223
1224 extern Node*
onig_node_new_list(Node * left,Node * right)1225 onig_node_new_list(Node* left, Node* right)
1226 {
1227 return node_new_list(left, right);
1228 }
1229
1230 static Node*
node_new_alt(Node * left,Node * right)1231 node_new_alt(Node* left, Node* right)
1232 {
1233 Node* node = node_new();
1234 CHECK_NULL_RETURN(node);
1235 node->type = N_ALT;
1236 NCONS(node).left = left;
1237 NCONS(node).right = right;
1238 return node;
1239 }
1240
1241 extern Node*
onig_node_new_anchor(int type)1242 onig_node_new_anchor(int type)
1243 {
1244 Node* node = node_new();
1245 CHECK_NULL_RETURN(node);
1246 node->type = N_ANCHOR;
1247 NANCHOR(node).type = type;
1248 NANCHOR(node).target = NULL;
1249 NANCHOR(node).char_len = -1;
1250 return node;
1251 }
1252
1253 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)1254 node_new_backref(int back_num, int* backrefs, int by_name,
1255 #ifdef USE_BACKREF_AT_LEVEL
1256 int exist_level, int nest_level,
1257 #endif
1258 ScanEnv* env)
1259 {
1260 int i;
1261 Node* node = node_new();
1262
1263 CHECK_NULL_RETURN(node);
1264 node->type = N_BACKREF;
1265 NBACKREF(node).state = 0;
1266 NBACKREF(node).back_num = back_num;
1267 NBACKREF(node).back_dynamic = (int* )NULL;
1268 if (by_name != 0)
1269 NBACKREF(node).state |= NST_NAME_REF;
1270
1271 #ifdef USE_BACKREF_AT_LEVEL
1272 if (exist_level != 0) {
1273 NBACKREF(node).state |= NST_NEST_LEVEL;
1274 NBACKREF(node).nest_level = nest_level;
1275 }
1276 #endif
1277
1278 for (i = 0; i < back_num; i++) {
1279 if (backrefs[i] <= env->num_mem &&
1280 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1281 NBACKREF(node).state |= NST_RECURSION; /* /...(\1).../ */
1282 break;
1283 }
1284 }
1285
1286 if (back_num <= NODE_BACKREFS_SIZE) {
1287 for (i = 0; i < back_num; i++)
1288 NBACKREF(node).back_static[i] = backrefs[i];
1289 }
1290 else {
1291 int* p = (int* )xmalloc(sizeof(int) * back_num);
1292 if (IS_NULL(p)) {
1293 onig_node_free(node);
1294 return NULL;
1295 }
1296 NBACKREF(node).back_dynamic = p;
1297 for (i = 0; i < back_num; i++)
1298 p[i] = backrefs[i];
1299 }
1300 return node;
1301 }
1302
#ifdef USE_SUBEXP_CALL
/* Build a call node (N_CALL) for the name spanning [name, name_end).
   The name pointers are stored, not copied.  The node starts unresolved:
   ref_num is CALLNODE_REFNUM_UNDEF and target is NULL_NODE. */
static Node*
node_new_call(UChar* name, UChar* name_end)
{
  Node* call;

  call = node_new();
  CHECK_NULL_RETURN(call);

  call->type = N_CALL;
  NCALL(call).name     = name;
  NCALL(call).name_end = name_end;
  NCALL(call).target   = NULL_NODE;
  NCALL(call).ref_num  = CALLNODE_REFNUM_UNDEF;
  NCALL(call).state    = 0;
  return call;
}
#endif
1319
1320 static Node*
node_new_quantifier(int lower,int upper,int by_number)1321 node_new_quantifier(int lower, int upper, int by_number)
1322 {
1323 Node* node = node_new();
1324 CHECK_NULL_RETURN(node);
1325 node->type = N_QUANTIFIER;
1326 NQUANTIFIER(node).state = 0;
1327 NQUANTIFIER(node).target = NULL;
1328 NQUANTIFIER(node).lower = lower;
1329 NQUANTIFIER(node).upper = upper;
1330 NQUANTIFIER(node).greedy = 1;
1331 NQUANTIFIER(node).target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1332 NQUANTIFIER(node).head_exact = NULL_NODE;
1333 NQUANTIFIER(node).next_head_exact = NULL_NODE;
1334 NQUANTIFIER(node).is_refered = 0;
1335 if (by_number != 0)
1336 NQUANTIFIER(node).state |= NST_BY_NUMBER;
1337
1338 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1339 NQUANTIFIER(node).comb_exp_check_num = 0;
1340 #endif
1341
1342 return node;
1343 }
1344
1345 static Node*
node_new_effect(int type)1346 node_new_effect(int type)
1347 {
1348 Node* node = node_new();
1349 CHECK_NULL_RETURN(node);
1350 node->type = N_EFFECT;
1351 NEFFECT(node).type = type;
1352 NEFFECT(node).state = 0;
1353 NEFFECT(node).regnum = 0;
1354 NEFFECT(node).option = 0;
1355 NEFFECT(node).target = NULL;
1356 NEFFECT(node).call_addr = -1;
1357 NEFFECT(node).opt_count = 0;
1358 return node;
1359 }
1360
1361 extern Node*
onig_node_new_effect(int type)1362 onig_node_new_effect(int type)
1363 {
1364 return node_new_effect(type);
1365 }
1366
1367 static Node*
node_new_effect_memory(OnigOptionType option,int is_named)1368 node_new_effect_memory(OnigOptionType option, int is_named)
1369 {
1370 Node* node = node_new_effect(EFFECT_MEMORY);
1371 CHECK_NULL_RETURN(node);
1372 if (is_named != 0)
1373 SET_EFFECT_STATUS(node, NST_NAMED_GROUP);
1374
1375 #ifdef USE_SUBEXP_CALL
1376 NEFFECT(node).option = option;
1377 #endif
1378 return node;
1379 }
1380
1381 static Node*
node_new_option(OnigOptionType option)1382 node_new_option(OnigOptionType option)
1383 {
1384 Node* node = node_new_effect(EFFECT_OPTION);
1385 CHECK_NULL_RETURN(node);
1386 NEFFECT(node).option = option;
1387 return node;
1388 }
1389
1390 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)1391 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1392 {
1393 int addlen = end - s;
1394
1395 if (addlen > 0) {
1396 int len = NSTRING(node).end - NSTRING(node).s;
1397
1398 if (NSTRING(node).capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1399 UChar* p;
1400 int capa = len + addlen + NODE_STR_MARGIN;
1401
1402 if (capa <= NSTRING(node).capa) {
1403 k_strcpy(NSTRING(node).s + len, s, end);
1404 }
1405 else {
1406 if (NSTRING(node).s == NSTRING(node).buf)
1407 p = strcat_capa_from_static(NSTRING(node).s, NSTRING(node).end,
1408 s, end, capa);
1409 else
1410 p = k_strcat_capa(NSTRING(node).s, NSTRING(node).end, s, end, capa);
1411
1412 CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
1413 NSTRING(node).s = p;
1414 NSTRING(node).capa = capa;
1415 }
1416 }
1417 else {
1418 k_strcpy(NSTRING(node).s + len, s, end);
1419 }
1420 NSTRING(node).end = NSTRING(node).s + len + addlen;
1421 }
1422
1423 return 0;
1424 }
1425
1426 static int
node_str_cat_char(Node * node,UChar c)1427 node_str_cat_char(Node* node, UChar c)
1428 {
1429 UChar s[1];
1430
1431 s[0] = c;
1432 return onig_node_str_cat(node, s, s + 1);
1433 }
1434
1435 extern void
onig_node_conv_to_str_node(Node * node,int flag)1436 onig_node_conv_to_str_node(Node* node, int flag)
1437 {
1438 node->type = N_STRING;
1439
1440 NSTRING(node).flag = flag;
1441 NSTRING(node).capa = 0;
1442 NSTRING(node).s = NSTRING(node).buf;
1443 NSTRING(node).end = NSTRING(node).buf;
1444 }
1445
1446 extern void
onig_node_str_clear(Node * node)1447 onig_node_str_clear(Node* node)
1448 {
1449 if (NSTRING(node).capa != 0 &&
1450 IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) {
1451 xfree(NSTRING(node).s);
1452 }
1453
1454 NSTRING(node).capa = 0;
1455 NSTRING(node).flag = 0;
1456 NSTRING(node).s = NSTRING(node).buf;
1457 NSTRING(node).end = NSTRING(node).buf;
1458 }
1459
1460 static Node*
node_new_str(const UChar * s,const UChar * end)1461 node_new_str(const UChar* s, const UChar* end)
1462 {
1463 Node* node = node_new();
1464 CHECK_NULL_RETURN(node);
1465
1466 node->type = N_STRING;
1467 NSTRING(node).capa = 0;
1468 NSTRING(node).flag = 0;
1469 NSTRING(node).s = NSTRING(node).buf;
1470 NSTRING(node).end = NSTRING(node).buf;
1471 if (onig_node_str_cat(node, s, end)) {
1472 onig_node_free(node);
1473 return NULL;
1474 }
1475 return node;
1476 }
1477
1478 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)1479 onig_node_new_str(const UChar* s, const UChar* end)
1480 {
1481 return node_new_str(s, end);
1482 }
1483
#ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
/* Build a string node from [s, end) and mark it raw (NSTRING_SET_RAW).
   Returns NULL when node_new_str() fails; the raw flag is only applied to
   a node that actually exists. */
static Node*
node_new_str_raw(UChar* s, UChar* end)
{
  Node* node = node_new_str(s, end);

  /* node_new_str() returns NULL on failure; do not touch the flag then. */
  CHECK_NULL_RETURN(node);
  NSTRING_SET_RAW(node);
  return node;
}
#endif
1493
1494 static Node*
node_new_empty(void)1495 node_new_empty(void)
1496 {
1497 return node_new_str(NULL, NULL);
1498 }
1499
1500 static Node*
node_new_str_char(UChar c)1501 node_new_str_char(UChar c)
1502 {
1503 UChar p[1];
1504
1505 p[0] = c;
1506 return node_new_str(p, p + 1);
1507 }
1508
1509 static Node*
str_node_split_last_char(StrNode * sn,OnigEncoding enc)1510 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1511 {
1512 const UChar *p;
1513 Node* n = NULL_NODE;
1514
1515 if (sn->end > sn->s) {
1516 p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1517 if (p && p > sn->s) { /* can be splitted. */
1518 n = node_new_str(p, sn->end);
1519 if ((sn->flag & NSTR_RAW) != 0)
1520 NSTRING_SET_RAW(n);
1521 sn->end = (UChar* )p;
1522 }
1523 }
1524 return n;
1525 }
1526
1527 static int
str_node_can_be_split(StrNode * sn,OnigEncoding enc)1528 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1529 {
1530 if (sn->end > sn->s) {
1531 return ((enc_len(enc, sn->s) < sn->end - sn->s) ? 1 : 0);
1532 }
1533 return 0;
1534 }
1535
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/* Insert `num` bytes of value `val` in front of the string `sn`.
   The current text is moved `num` bytes to the right through a temporary
   buffer and `sn->end` is advanced by `num`.

   Always returns 0.  (The function is declared int; previously it fell off
   the end without returning a value.)

   NOTE(review): neither the temporary buffer (NODE_STR_BUF_SIZE bytes) nor
   the destination is bounds-checked here -- presumably callers only pad
   short strings; confirm. */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i, len;

  len = sn->end - sn->s;
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }
  return 0;
}
#endif
1553
1554 extern int
onig_scan_unsigned_number(UChar ** src,const UChar * end,OnigEncoding enc)1555 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1556 {
1557 unsigned int num, val;
1558 OnigCodePoint c;
1559 UChar* p = *src;
1560 PFETCH_READY;
1561
1562 num = 0;
1563 while (!PEND) {
1564 PFETCH(c);
1565 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1566 val = (unsigned int )DIGITVAL(c);
1567 if ((INT_MAX_LIMIT - val) / 10UL < num)
1568 return -1; /* overflow */
1569
1570 num = num * 10 + val;
1571 }
1572 else {
1573 PUNFETCH;
1574 break;
1575 }
1576 }
1577 *src = p;
1578 return num;
1579 }
1580
1581 static int
scan_unsigned_hexadecimal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1582 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1583 OnigEncoding enc)
1584 {
1585 OnigCodePoint c;
1586 unsigned int num, val;
1587 UChar* p = *src;
1588 PFETCH_READY;
1589
1590 num = 0;
1591 while (!PEND && maxlen-- != 0) {
1592 PFETCH(c);
1593 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1594 val = (unsigned int )XDIGITVAL(enc,c);
1595 if ((INT_MAX_LIMIT - val) / 16UL < num)
1596 return -1; /* overflow */
1597
1598 num = (num << 4) + XDIGITVAL(enc,c);
1599 }
1600 else {
1601 PUNFETCH;
1602 break;
1603 }
1604 }
1605 *src = p;
1606 return num;
1607 }
1608
1609 static int
scan_unsigned_octal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1610 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1611 OnigEncoding enc)
1612 {
1613 OnigCodePoint c;
1614 unsigned int num, val;
1615 UChar* p = *src;
1616 PFETCH_READY;
1617
1618 num = 0;
1619 while (!PEND && maxlen-- != 0) {
1620 PFETCH(c);
1621 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1622 val = ODIGITVAL(c);
1623 if ((INT_MAX_LIMIT - val) / 8UL < num)
1624 return -1; /* overflow */
1625
1626 num = (num << 3) + val;
1627 }
1628 else {
1629 PUNFETCH;
1630 break;
1631 }
1632 }
1633 *src = p;
1634 return num;
1635 }
1636
1637
/* Store one code point (SIZE_CODE_POINT bytes) at byte offset `pos` of
   `bbuf`.  `code` must be an lvalue because its address is taken. */
#define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
    BBUF_WRITE((bbuf), (pos), &(code), SIZE_CODE_POINT)
1640
1641 /* data format:
1642 [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1643 (all data size is OnigCodePoint)
1644 */
1645 static int
new_code_range(BBuf ** pbuf)1646 new_code_range(BBuf** pbuf)
1647 {
1648 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
1649 int r;
1650 OnigCodePoint n;
1651 BBuf* bbuf;
1652
1653 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1654 CHECK_NULL_RETURN_VAL(*pbuf, ONIGERR_MEMORY);
1655 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1656 if (r) return r;
1657
1658 n = 0;
1659 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1660 return 0;
1661 }
1662
/* Insert the code range [from, to] into the range buffer *pbuf, merging it
   with the stored ranges it reaches.

   *pbuf may be NULL, in which case a new buffer is created first.  A
   reversed pair (from > to) is swapped rather than rejected.

   Returns 0 on success, the error from new_code_range(), or
   ONIGERR_TOO_MANY_MULTI_BYTE_RANGES when the resulting number of ranges
   would exceed ONIG_MAX_MULTI_BYTE_RANGES_NUM. */
static int
add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
{
  int r, inc_n, pos;
  int low, high, bound, x;
  OnigCodePoint n, *data;
  BBuf* bbuf;

  if (from > to) {
    n = from; from = to; to = n;
  }

  if (IS_NULL(*pbuf)) {
    r = new_code_range(pbuf);
    if (r) return r;
    bbuf = *pbuf;
    n = 0;
  }
  else {
    bbuf = *pbuf;
    GET_CODE_POINT(n, bbuf->p);   /* n: number of ranges currently stored */
  }
  data = (OnigCodePoint* )(bbuf->p);
  data++;                         /* step over the count; data[] is now the pairs */

  /* low: index of the first stored range whose upper bound is >= from */
  for (low = 0, bound = n; low < bound; ) {
    x = (low + bound) >> 1;
    if (from > data[x*2 + 1])
      low = x + 1;
    else
      bound = x;
  }

  /* high: one past the last stored range whose lower bound minus one is
     <= to, i.e. ranges that overlap [from, to] or touch it on the right.
     NOTE(review): data[x*2] - 1 is unsigned arithmetic, so a stored lower
     bound of 0 wraps around -- confirm that case cannot occur. */
  for (high = low, bound = n; high < bound; ) {
    x = (high + bound) >> 1;
    if (to >= data[x*2] - 1)
      high = x + 1;
    else
      bound = x;
  }

  /* Ranges low .. high-1 are replaced by the single new range, so the
     count changes by 1 - (high - low). */
  inc_n = low + 1 - high;
  if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
    return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;

  if (inc_n != 1) {
    /* at least one stored range is absorbed: widen to cover them */
    if (from > data[low*2])
      from = data[low*2];
    if (to < data[(high - 1)*2 + 1])
      to = data[(high - 1)*2 + 1];
  }

  if (inc_n != 0 && (OnigCodePoint )high < n) {
    /* move the ranges that follow the merged span to their new position */
    int from_pos = SIZE_CODE_POINT * (1 + high * 2);
    int to_pos   = SIZE_CODE_POINT * (1 + (low + 1) * 2);
    int size = (n - high) * 2 * SIZE_CODE_POINT;

    if (inc_n > 0) {
      BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
    }
    else {
      BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
    }
  }

  /* write the (possibly widened) range into slot `low`, then the new count */
  pos = SIZE_CODE_POINT * (1 + low * 2);
  BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
  BBUF_WRITE_CODE_POINT(bbuf, pos, from);
  BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
  n += inc_n;
  BBUF_WRITE_CODE_POINT(bbuf, 0, n);

  return 0;
}
1737
1738 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)1739 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1740 {
1741 if (from > to) {
1742 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1743 return 0;
1744 else
1745 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1746 }
1747
1748 return add_code_range_to_buf(pbuf, from, to);
1749 }
1750
1751 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)1752 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1753 {
1754 int r, i, n;
1755 OnigCodePoint pre, from, *data, to = 0;
1756
1757 *pbuf = (BBuf* )NULL;
1758 if (IS_NULL(bbuf)) {
1759 set_all:
1760 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1761 }
1762
1763 data = (OnigCodePoint* )(bbuf->p);
1764 GET_CODE_POINT(n, data);
1765 data++;
1766 if (n <= 0) goto set_all;
1767
1768 r = 0;
1769 pre = MBCODE_START_POS(enc);
1770 for (i = 0; i < n; i++) {
1771 from = data[i*2];
1772 to = data[i*2+1];
1773 if (pre <= from - 1) {
1774 r = add_code_range_to_buf(pbuf, pre, from - 1);
1775 if (r != 0) return r;
1776 }
1777 if (to == ~((OnigCodePoint )0)) break;
1778 pre = to + 1;
1779 }
1780 if (to < ~((OnigCodePoint )0)) {
1781 r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1782 }
1783 return r;
1784 }
1785
/* Exchange two (buffer, not-flag) pairs in place.  All four arguments must
   be assignable lvalues. */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *swap_buf_ = bbuf1; \
  int   swap_not_ = not1;  \
  bbuf1 = bbuf2; bbuf2 = swap_buf_; \
  not1  = not2;  not2  = swap_not_; \
} while (0)
1792
1793 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1794 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1795 BBuf* bbuf2, int not2, BBuf** pbuf)
1796 {
1797 int r;
1798 OnigCodePoint i, n1, *data1;
1799 OnigCodePoint from, to;
1800
1801 *pbuf = (BBuf* )NULL;
1802 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1803 if (not1 != 0 || not2 != 0)
1804 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1805 return 0;
1806 }
1807
1808 r = 0;
1809 if (IS_NULL(bbuf2))
1810 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1811
1812 if (IS_NULL(bbuf1)) {
1813 if (not1 != 0) {
1814 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1815 }
1816 else {
1817 if (not2 == 0) {
1818 return bbuf_clone(pbuf, bbuf2);
1819 }
1820 else {
1821 return not_code_range_buf(enc, bbuf2, pbuf);
1822 }
1823 }
1824 }
1825
1826 if (not1 != 0)
1827 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1828
1829 data1 = (OnigCodePoint* )(bbuf1->p);
1830 GET_CODE_POINT(n1, data1);
1831 data1++;
1832
1833 if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1834 r = bbuf_clone(pbuf, bbuf2);
1835 }
1836 else if (not1 == 0) { /* 1 OR (not 2) */
1837 r = not_code_range_buf(enc, bbuf2, pbuf);
1838 }
1839 if (r != 0) return r;
1840
1841 for (i = 0; i < n1; i++) {
1842 from = data1[i*2];
1843 to = data1[i*2+1];
1844 r = add_code_range_to_buf(pbuf, from, to);
1845 if (r != 0) return r;
1846 }
1847 return 0;
1848 }
1849
1850 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)1851 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1852 OnigCodePoint* data, int n)
1853 {
1854 int i, r;
1855 OnigCodePoint from2, to2;
1856
1857 for (i = 0; i < n; i++) {
1858 from2 = data[i*2];
1859 to2 = data[i*2+1];
1860 if (from2 < from1) {
1861 if (to2 < from1) continue;
1862 else {
1863 from1 = to2 + 1;
1864 }
1865 }
1866 else if (from2 <= to1) {
1867 if (to2 < to1) {
1868 if (from1 <= from2 - 1) {
1869 r = add_code_range_to_buf(pbuf, from1, from2-1);
1870 if (r != 0) return r;
1871 }
1872 from1 = to2 + 1;
1873 }
1874 else {
1875 to1 = from2 - 1;
1876 }
1877 }
1878 else {
1879 from1 = from2;
1880 }
1881 if (from1 > to1) break;
1882 }
1883 if (from1 <= to1) {
1884 r = add_code_range_to_buf(pbuf, from1, to1);
1885 if (r != 0) return r;
1886 }
1887 return 0;
1888 }
1889
1890 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1891 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1892 {
1893 int r;
1894 OnigCodePoint i, j, n1, n2, *data1, *data2;
1895 OnigCodePoint from, to, from1, to1, from2, to2;
1896
1897 *pbuf = (BBuf* )NULL;
1898 if (IS_NULL(bbuf1)) {
1899 if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1900 return bbuf_clone(pbuf, bbuf2);
1901 return 0;
1902 }
1903 else if (IS_NULL(bbuf2)) {
1904 if (not2 != 0)
1905 return bbuf_clone(pbuf, bbuf1);
1906 return 0;
1907 }
1908
1909 if (not1 != 0)
1910 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1911
1912 data1 = (OnigCodePoint* )(bbuf1->p);
1913 data2 = (OnigCodePoint* )(bbuf2->p);
1914 GET_CODE_POINT(n1, data1);
1915 GET_CODE_POINT(n2, data2);
1916 data1++;
1917 data2++;
1918
1919 if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1920 for (i = 0; i < n1; i++) {
1921 from1 = data1[i*2];
1922 to1 = data1[i*2+1];
1923 for (j = 0; j < n2; j++) {
1924 from2 = data2[j*2];
1925 to2 = data2[j*2+1];
1926 if (from2 > to1) break;
1927 if (to2 < from1) continue;
1928 from = MAX(from1, from2);
1929 to = MIN(to1, to2);
1930 r = add_code_range_to_buf(pbuf, from, to);
1931 if (r != 0) return r;
1932 }
1933 }
1934 }
1935 else if (not1 == 0) { /* 1 AND (not 2) */
1936 for (i = 0; i < n1; i++) {
1937 from1 = data1[i*2];
1938 to1 = data1[i*2+1];
1939 r = and_code_range1(pbuf, from1, to1, data2, n2);
1940 if (r != 0) return r;
1941 }
1942 }
1943
1944 return 0;
1945 }
1946
1947 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1948 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1949 {
1950 int r, not1, not2;
1951 BBuf *buf1, *buf2, *pbuf;
1952 BitSetRef bsr1, bsr2;
1953 BitSet bs1, bs2;
1954
1955 not1 = IS_CCLASS_NOT(dest);
1956 bsr1 = dest->bs;
1957 buf1 = dest->mbuf;
1958 not2 = IS_CCLASS_NOT(cc);
1959 bsr2 = cc->bs;
1960 buf2 = cc->mbuf;
1961
1962 if (not1 != 0) {
1963 bitset_invert_to(bsr1, bs1);
1964 bsr1 = bs1;
1965 }
1966 if (not2 != 0) {
1967 bitset_invert_to(bsr2, bs2);
1968 bsr2 = bs2;
1969 }
1970 bitset_and(bsr1, bsr2);
1971 if (bsr1 != dest->bs) {
1972 bitset_copy(dest->bs, bsr1);
1973 bsr1 = dest->bs;
1974 }
1975 if (not1 != 0) {
1976 bitset_invert(dest->bs);
1977 }
1978
1979 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
1980 if (not1 != 0 && not2 != 0) {
1981 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
1982 }
1983 else {
1984 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
1985 if (r == 0 && not1 != 0) {
1986 BBuf *tbuf;
1987 r = not_code_range_buf(enc, pbuf, &tbuf);
1988 if (r != 0) {
1989 bbuf_free(pbuf);
1990 return r;
1991 }
1992 bbuf_free(pbuf);
1993 pbuf = tbuf;
1994 }
1995 }
1996 if (r != 0) return r;
1997
1998 dest->mbuf = pbuf;
1999 bbuf_free(buf1);
2000 return r;
2001 }
2002 return 0;
2003 }
2004
2005 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)2006 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2007 {
2008 int r, not1, not2;
2009 BBuf *buf1, *buf2, *pbuf;
2010 BitSetRef bsr1, bsr2;
2011 BitSet bs1, bs2;
2012
2013 not1 = IS_CCLASS_NOT(dest);
2014 bsr1 = dest->bs;
2015 buf1 = dest->mbuf;
2016 not2 = IS_CCLASS_NOT(cc);
2017 bsr2 = cc->bs;
2018 buf2 = cc->mbuf;
2019
2020 if (not1 != 0) {
2021 bitset_invert_to(bsr1, bs1);
2022 bsr1 = bs1;
2023 }
2024 if (not2 != 0) {
2025 bitset_invert_to(bsr2, bs2);
2026 bsr2 = bs2;
2027 }
2028 bitset_or(bsr1, bsr2);
2029 if (bsr1 != dest->bs) {
2030 bitset_copy(dest->bs, bsr1);
2031 bsr1 = dest->bs;
2032 }
2033 if (not1 != 0) {
2034 bitset_invert(dest->bs);
2035 }
2036
2037 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2038 if (not1 != 0 && not2 != 0) {
2039 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2040 }
2041 else {
2042 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2043 if (r == 0 && not1 != 0) {
2044 BBuf *tbuf;
2045 r = not_code_range_buf(enc, pbuf, &tbuf);
2046 if (r != 0) {
2047 bbuf_free(pbuf);
2048 return r;
2049 }
2050 bbuf_free(pbuf);
2051 pbuf = tbuf;
2052 }
2053 }
2054 if (r != 0) return r;
2055
2056 dest->mbuf = pbuf;
2057 bbuf_free(buf1);
2058 return r;
2059 }
2060 else
2061 return 0;
2062 }
2063
2064 static int
conv_backslash_value(int c,ScanEnv * env)2065 conv_backslash_value(int c, ScanEnv* env)
2066 {
2067 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2068 switch (c) {
2069 case 'n': return '\n';
2070 case 't': return '\t';
2071 case 'r': return '\r';
2072 case 'f': return '\f';
2073 case 'a': return '\007';
2074 case 'b': return '\010';
2075 case 'e': return '\033';
2076 case 'v':
2077 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2078 return '\v';
2079 break;
2080
2081 default:
2082 break;
2083 }
2084 }
2085 return c;
2086 }
2087
2088 static int
is_invalid_quantifier_target(Node * node)2089 is_invalid_quantifier_target(Node* node)
2090 {
2091 switch (NTYPE(node)) {
2092 case N_ANCHOR:
2093 return 1;
2094 break;
2095
2096 case N_EFFECT:
2097 if (NEFFECT(node).type == EFFECT_OPTION)
2098 return is_invalid_quantifier_target(NEFFECT(node).target);
2099 break;
2100
2101 case N_LIST: /* ex. (?:\G\A)* */
2102 do {
2103 if (! is_invalid_quantifier_target(NCONS(node).left)) return 0;
2104 } while (IS_NOT_NULL(node = NCONS(node).right));
2105 return 0;
2106 break;
2107
2108 case N_ALT: /* ex. (?:abc|\A)* */
2109 do {
2110 if (is_invalid_quantifier_target(NCONS(node).left)) return 1;
2111 } while (IS_NOT_NULL(node = NCONS(node).right));
2112 break;
2113
2114 default:
2115 break;
2116 }
2117 return 0;
2118 }
2119
2120 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2121 static int
popular_quantifier_num(QuantifierNode * qf)2122 popular_quantifier_num(QuantifierNode* qf)
2123 {
2124 if (qf->greedy) {
2125 if (qf->lower == 0) {
2126 if (qf->upper == 1) return 0;
2127 else if (IS_REPEAT_INFINITE(qf->upper)) return 1;
2128 }
2129 else if (qf->lower == 1) {
2130 if (IS_REPEAT_INFINITE(qf->upper)) return 2;
2131 }
2132 }
2133 else {
2134 if (qf->lower == 0) {
2135 if (qf->upper == 1) return 3;
2136 else if (IS_REPEAT_INFINITE(qf->upper)) return 4;
2137 }
2138 else if (qf->lower == 1) {
2139 if (IS_REPEAT_INFINITE(qf->upper)) return 5;
2140 }
2141 }
2142 return -1;
2143 }
2144
2145
/* Action chosen when two nested quantifiers are collapsed into one. */
enum ReduceType {
  RQ_ASIS = 0,  /* as is */
  RQ_DEL  = 1,  /* delete parent */
  RQ_A    = 2,  /* to '*'    */
  RQ_AQ   = 3,  /* to '*?'   */
  RQ_QQ   = 4,  /* to '??'   */
  RQ_P_QQ = 5,  /* to '+)??' */
  RQ_PQ_Q = 6   /* to '+?)?' */
};
2155
/* Reduction to apply to a nested quantifier pair.  Both indices are the
   values returned by popular_quantifier_num() (0..5); the table is read as
   ReduceTypeTable[child][parent].  The trailing comment on each row names
   the child quantifier that the row stands for. */
static enum ReduceType ReduceTypeTable[6][6] = {
  {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
  {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
  {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
  {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
};
2164
2165 extern void
onig_reduce_nested_quantifier(Node * pnode,Node * cnode)2166 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2167 {
2168 int pnum, cnum;
2169 QuantifierNode *p, *c;
2170
2171 p = &(NQUANTIFIER(pnode));
2172 c = &(NQUANTIFIER(cnode));
2173 pnum = popular_quantifier_num(p);
2174 cnum = popular_quantifier_num(c);
2175
2176 switch(ReduceTypeTable[cnum][pnum]) {
2177 case RQ_DEL:
2178 *p = *c;
2179 break;
2180 case RQ_A:
2181 p->target = c->target;
2182 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
2183 break;
2184 case RQ_AQ:
2185 p->target = c->target;
2186 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
2187 break;
2188 case RQ_QQ:
2189 p->target = c->target;
2190 p->lower = 0; p->upper = 1; p->greedy = 0;
2191 break;
2192 case RQ_P_QQ:
2193 p->target = cnode;
2194 p->lower = 0; p->upper = 1; p->greedy = 0;
2195 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
2196 return ;
2197 break;
2198 case RQ_PQ_Q:
2199 p->target = cnode;
2200 p->lower = 0; p->upper = 1; p->greedy = 1;
2201 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
2202 return ;
2203 break;
2204 case RQ_ASIS:
2205 p->target = cnode;
2206 return ;
2207 break;
2208 }
2209
2210 c->target = NULL_NODE;
2211 onig_node_free(cnode);
2212 }
2213
2214
/* Token kinds stored in OnigToken.type. */
enum TokenSyms {
  TK_EOT               = 0,   /* end of token */
  TK_RAW_BYTE          = 1,
  TK_CHAR              = 2,
  TK_STRING            = 3,
  TK_CODE_POINT        = 4,
  TK_ANYCHAR           = 5,
  TK_CHAR_TYPE         = 6,
  TK_BACKREF           = 7,
  TK_CALL              = 8,
  TK_ANCHOR            = 9,
  TK_OP_REPEAT         = 10,
  TK_INTERVAL          = 11,
  TK_ANYCHAR_ANYTIME   = 12,  /* SQL '%' == .* */
  TK_ALT               = 13,
  TK_SUBEXP_OPEN       = 14,
  TK_SUBEXP_CLOSE      = 15,
  TK_CC_OPEN           = 16,
  TK_QUOTE_OPEN        = 17,
  TK_CHAR_PROPERTY     = 18,  /* \p{...}, \P{...} */
  /* in cc */
  TK_CC_CLOSE          = 19,
  TK_CC_RANGE          = 20,
  TK_POSIX_BRACKET_OPEN = 21,
  TK_CC_AND            = 22,  /* && */
  TK_CC_CC_OPEN        = 23   /* [ */
};
2242
/* One token of the pattern.  `type` says which kind of token this is; the
   member of `u` that carries the payload depends on that kind. */
typedef struct {
  enum TokenSyms type;   /* token kind */
  int escaped;           /* presumably non-zero when written with an escape -- TODO confirm */
  int base;   /* is number: 8, 16 (used in [....]) */
  UChar* backp;          /* NOTE(review): looks like a saved pattern position -- confirm */
  union {
    UChar* s;
    int   c;
    OnigCodePoint code;
    int   anchor;
    int   subtype;
    /* bounds of a repeat; lower/upper are filled in for TK_INTERVAL by
       fetch_range_quantifier() */
    struct {
      int lower;
      int upper;
      int greedy;
      int possessive;
    } repeat;
    /* back reference payload */
    struct {
      int  num;
      int  ref1;
      int* refs;
      int  by_name;
#ifdef USE_BACKREF_AT_LEVEL
      int  exist_level;
      int  level;   /* \k<name+n> */
#endif
    } backref;
    /* subexpression call payload: name spans [name, name_end) */
    struct {
      UChar* name;
      UChar* name_end;
    } call;
    /* character property payload */
    struct {
      int not;
    } prop;
  } u;
} OnigToken;
2279
2280
2281 static int
fetch_range_quantifier(UChar ** src,UChar * end,OnigToken * tok,ScanEnv * env)2282 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2283 {
2284 int low, up, syn_allow, non_low = 0;
2285 int r = 0;
2286 OnigCodePoint c;
2287 OnigEncoding enc = env->enc;
2288 UChar* p = *src;
2289 PFETCH_READY;
2290
2291 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2292
2293 if (PEND) {
2294 if (syn_allow)
2295 return 1; /* "....{" : OK! */
2296 else
2297 return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */
2298 }
2299
2300 if (! syn_allow) {
2301 c = PPEEK;
2302 if (c == ')' || c == '(' || c == '|') {
2303 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2304 }
2305 }
2306
2307 low = onig_scan_unsigned_number(&p, end, env->enc);
2308 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2309 if (low > ONIG_MAX_REPEAT_NUM)
2310 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2311
2312 if (p == *src) { /* can't read low */
2313 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2314 /* allow {,n} as {0,n} */
2315 low = 0;
2316 non_low = 1;
2317 }
2318 else
2319 goto invalid;
2320 }
2321
2322 if (PEND) goto invalid;
2323 PFETCH(c);
2324 if (c == ',') {
2325 UChar* prev = p;
2326 up = onig_scan_unsigned_number(&p, end, env->enc);
2327 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2328 if (up > ONIG_MAX_REPEAT_NUM)
2329 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2330
2331 if (p == prev) {
2332 if (non_low != 0)
2333 goto invalid;
2334 up = REPEAT_INFINITE; /* {n,} : {n,infinite} */
2335 }
2336 }
2337 else {
2338 if (non_low != 0)
2339 goto invalid;
2340
2341 PUNFETCH;
2342 up = low; /* {n} : exact n times */
2343 r = 2; /* fixed */
2344 }
2345
2346 if (PEND) goto invalid;
2347 PFETCH(c);
2348 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2349 if (c != MC_ESC(enc)) goto invalid;
2350 PFETCH(c);
2351 }
2352 if (c != '}') goto invalid;
2353
2354 if (!IS_REPEAT_INFINITE(up) && low > up) {
2355 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2356 }
2357
2358 tok->type = TK_INTERVAL;
2359 tok->u.repeat.lower = low;
2360 tok->u.repeat.upper = up;
2361 *src = p;
2362 return r; /* 0: normal {n,m}, 2: fixed {n} */
2363
2364 invalid:
2365 if (syn_allow)
2366 return 1; /* OK */
2367 else
2368 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2369 }
2370
/* \M-, \C-, \c, or \... */
/* Decode the character(s) that follow a backslash; *src points just behind
   the backslash.

   \M-x  (needs ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META): low 8 bits of x with
         bit 0x80 set.
   \C-x  (needs ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL) and
   \cx   (needs ONIG_SYN_OP_ESC_C_CONTROL): x masked with 0x9f, or 0177
         when x is '?'.
   In both forms x may itself be an escape, which is decoded recursively.
   Anything else goes through conv_backslash_value().

   On success *src is advanced and the value is returned.  On failure an
   ONIGERR_* code is returned and *src is not updated. */
static int
fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
{
  int v;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;
  PFETCH_READY;

  if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;

  PFETCH(c);
  switch (c) {
  case 'M':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH(c);
      if (c != '-') return ONIGERR_META_CODE_SYNTAX;
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH(c);
      if (c == MC_ESC(enc)) {
        /* nested escape: decode it first */
        v = fetch_escaped_value(&p, end, env);
        if (v < 0) return v;
        c = (OnigCodePoint )v;
      }
      c = ((c & 0xff) | 0x80);
    }
    else
      goto backslash;
    break;

  case 'C':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH(c);
      if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
      goto control;   /* the rest is shared with \c */
    }
    else
      goto backslash;

  case 'c':
    if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
    control:
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH(c);
      if (c == '?') {
        c = 0177;
      }
      else {
        if (c == MC_ESC(enc)) {
          /* nested escape: decode it first */
          v = fetch_escaped_value(&p, end, env);
          if (v < 0) return v;
          c = (OnigCodePoint )v;
        }
        c &= 0x9f;
      }
      break;
    }
    /* fall through */

  default:
    {
    backslash:
      c = conv_backslash_value(c, env);
    }
    break;
  }

  *src = p;
  return c;
}
2444
/* Forward declaration: lets functions placed ahead of the definition call
   fetch_token(). */
static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2446
2447 #ifdef USE_NAMED_GROUP
#ifdef USE_BACKREF_AT_LEVEL
/*
   \k<name+n>, \k<name-n>
*/
/* Read a group name with an optional nest level; *src points just behind
   the opening '<'.

   rname_end : receives the end of the name (the position of the '+', '-'
               or '>' that terminates it).
   level     : receives the signed level when a "+n" / "-n" suffix exists.

   Returns 1 when a level was read, 0 when the name has no level, or an
   ONIGERR_* code.  On success *src is advanced past the closing '>'.  For
   errors detected after the first character, the offending text is
   recorded with onig_scan_env_set_error_string().

   The pattern is never read beyond `end`: a name that stops right after
   the sign or right after the level digits is reported as
   ONIGERR_INVALID_GROUP_NAME. */
static int
fetch_name_with_level(UChar** src, UChar* end, UChar** rname_end
                      , ScanEnv* env, int* level)
{
  int r, exist_level = 0;
  OnigCodePoint c = 0;
  OnigCodePoint first_code;
  OnigEncoding enc = env->enc;
  UChar *name_end;
  UChar *p = *src;
  PFETCH_READY;

  name_end = end;
  r = 0;
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    PFETCH(c);
    first_code = c;
    if (c == '>')
      return ONIGERR_EMPTY_GROUP_NAME;

    if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  while (!PEND) {
    name_end = p;
    PFETCH(c);
    if (c == '>' || c == ')' || c == '+' || c == '-') break;

    if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (c != '>') {
    if (c == '+' || c == '-') {
      int num;
      int flag = (c == '-' ? -1 : 1);

      if (PEND) goto err;   /* nothing follows the sign */
      PFETCH(c);
      if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
      PUNFETCH;
      num = onig_scan_unsigned_number(&p, end, enc);
      if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
      *level = (num * flag);
      exist_level = 1;

      if (!PEND) {          /* the closing '>' may be missing */
        PFETCH(c);
        if (c == '>')
          goto first_check;
      }
    }

  err:
    r = ONIGERR_INVALID_GROUP_NAME;
    name_end = end;
  }
  else {
  first_check:
    if (ONIGENC_IS_CODE_ASCII(first_code) &&
        ONIGENC_IS_CODE_UPPER(enc, first_code))
      r = ONIGERR_INVALID_GROUP_NAME;
  }

  if (r == 0) {
    *rname_end = name_end;
    *src = p;
    return (exist_level ? 1 : 0);
  }
  else {
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
#endif /* USE_BACKREF_AT_LEVEL */
2530
2531 /*
2532 def: 0 -> define name (don't allow number name)
2533 1 -> reference name (allow number name)
2534 */
2535 static int
fetch_name(UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int ref)2536 fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref)
2537 {
2538 int r, is_num;
2539 OnigCodePoint c = 0;
2540 OnigCodePoint first_code;
2541 OnigEncoding enc = env->enc;
2542 UChar *name_end;
2543 UChar *p = *src;
2544 PFETCH_READY;
2545
2546 name_end = end;
2547 r = 0;
2548 is_num = 0;
2549 if (PEND) {
2550 return ONIGERR_EMPTY_GROUP_NAME;
2551 }
2552 else {
2553 PFETCH(c);
2554 first_code = c;
2555 if (c == '>')
2556 return ONIGERR_EMPTY_GROUP_NAME;
2557
2558 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2559 if (ref == 1)
2560 is_num = 1;
2561 else {
2562 r = ONIGERR_INVALID_GROUP_NAME;
2563 }
2564 }
2565 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2566 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2567 }
2568 }
2569
2570 while (!PEND) {
2571 name_end = p;
2572 PFETCH(c);
2573 if (c == '>' || c == ')') break;
2574
2575 if (is_num == 1) {
2576 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) {
2577 if (!ONIGENC_IS_CODE_WORD(enc, c))
2578 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2579 else
2580 r = ONIGERR_INVALID_GROUP_NAME;
2581 }
2582 }
2583 else {
2584 if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2585 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2586 }
2587 }
2588 }
2589
2590 if (c != '>') {
2591 r = ONIGERR_INVALID_GROUP_NAME;
2592 name_end = end;
2593 }
2594 else {
2595 if (ONIGENC_IS_CODE_ASCII(first_code) &&
2596 ONIGENC_IS_CODE_UPPER(enc, first_code))
2597 r = ONIGERR_INVALID_GROUP_NAME;
2598 }
2599
2600 if (r == 0) {
2601 *rname_end = name_end;
2602 *src = p;
2603 return 0;
2604 }
2605 else {
2606 onig_scan_env_set_error_string(env, r, *src, name_end);
2607 return r;
2608 }
2609 }
2610 #else
2611 static int
fetch_name(UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int ref)2612 fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref)
2613 {
2614 int r, len;
2615 OnigCodePoint c = 0;
2616 UChar *name_end;
2617 OnigEncoding enc = env->enc;
2618 UChar *p = *src;
2619 PFETCH_READY;
2620
2621 r = 0;
2622 while (!PEND) {
2623 name_end = p;
2624 if (enc_len(enc, p) > 1)
2625 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2626
2627 PFETCH(c);
2628 if (c == '>' || c == ')') break;
2629 if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2630 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2631 }
2632 if (c != '>') {
2633 r = ONIGERR_INVALID_GROUP_NAME;
2634 name_end = end;
2635 }
2636
2637 if (r == 0) {
2638 *rname_end = name_end;
2639 *src = p;
2640 return 0;
2641 }
2642 else {
2643 err:
2644 onig_scan_env_set_error_string(env, r, *src, name_end);
2645 return r;
2646 }
2647 }
2648 #endif
2649
2650 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)2651 CC_ESC_WARN(ScanEnv* env, UChar *c)
2652 {
2653 if (onig_warn == onig_null_warn) return ;
2654
2655 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2656 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2657 UChar buf[WARN_BUFSIZE];
2658 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2659 env->pattern, env->pattern_end,
2660 (UChar* )"character class has '%s' without escape", c);
2661 (*onig_warn)((char* )buf);
2662 }
2663 }
2664
2665 static void
CCEND_ESC_WARN(ScanEnv * env,UChar * c)2666 CCEND_ESC_WARN(ScanEnv* env, UChar* c)
2667 {
2668 if (onig_warn == onig_null_warn) return ;
2669
2670 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2671 UChar buf[WARN_BUFSIZE];
2672 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2673 (env)->pattern, (env)->pattern_end,
2674 (UChar* )"regular expression has '%s' without escape", c);
2675 (*onig_warn)((char* )buf);
2676 }
2677 }
2678
2679 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)2680 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2681 UChar **next, OnigEncoding enc)
2682 {
2683 int i;
2684 OnigCodePoint x;
2685 UChar *q;
2686 UChar *p = from;
2687
2688 while (p < to) {
2689 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2690 q = p + enc_len(enc, p);
2691 if (x == s[0]) {
2692 for (i = 1; i < n && q < to; i++) {
2693 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2694 if (x != s[i]) break;
2695 q += enc_len(enc, q);
2696 }
2697 if (i >= n) {
2698 if (IS_NOT_NULL(next))
2699 *next = q;
2700 return p;
2701 }
2702 }
2703 p = q;
2704 }
2705 return NULL_UCHARP;
2706 }
2707
2708 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc)2709 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2710 OnigCodePoint bad, OnigEncoding enc)
2711 {
2712 int i, in_esc;
2713 OnigCodePoint x;
2714 UChar *q;
2715 UChar *p = from;
2716
2717 in_esc = 0;
2718 while (p < to) {
2719 if (in_esc) {
2720 in_esc = 0;
2721 p += enc_len(enc, p);
2722 }
2723 else {
2724 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2725 q = p + enc_len(enc, p);
2726 if (x == s[0]) {
2727 for (i = 1; i < n && q < to; i++) {
2728 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2729 if (x != s[i]) break;
2730 q += enc_len(enc, q);
2731 }
2732 if (i >= n) return 1;
2733 p += enc_len(enc, p);
2734 }
2735 else {
2736 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2737 if (x == bad) return 0;
2738 else if (x == MC_ESC(enc)) in_esc = 1;
2739 p = q;
2740 }
2741 }
2742 }
2743 return 0;
2744 }
2745
2746 static int
fetch_token_in_cc(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2747 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2748 {
2749 int num;
2750 OnigCodePoint c, c2;
2751 OnigSyntaxType* syn = env->syntax;
2752 OnigEncoding enc = env->enc;
2753 UChar* prev;
2754 UChar* p = *src;
2755 PFETCH_READY;
2756
2757 if (PEND) {
2758 tok->type = TK_EOT;
2759 return tok->type;
2760 }
2761
2762 PFETCH(c);
2763 tok->type = TK_CHAR;
2764 tok->base = 0;
2765 tok->u.c = c;
2766 tok->escaped = 0;
2767
2768 if (c == ']') {
2769 tok->type = TK_CC_CLOSE;
2770 }
2771 else if (c == '-') {
2772 tok->type = TK_CC_RANGE;
2773 }
2774 else if (c == MC_ESC(enc)) {
2775 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2776 goto end;
2777
2778 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2779
2780 PFETCH(c);
2781 tok->escaped = 1;
2782 tok->u.c = c;
2783 switch (c) {
2784 case 'w':
2785 tok->type = TK_CHAR_TYPE;
2786 tok->u.subtype = CTYPE_WORD;
2787 break;
2788 case 'W':
2789 tok->type = TK_CHAR_TYPE;
2790 tok->u.subtype = CTYPE_NOT_WORD;
2791 break;
2792 case 'd':
2793 tok->type = TK_CHAR_TYPE;
2794 tok->u.subtype = CTYPE_DIGIT;
2795 break;
2796 case 'D':
2797 tok->type = TK_CHAR_TYPE;
2798 tok->u.subtype = CTYPE_NOT_DIGIT;
2799 break;
2800 case 's':
2801 tok->type = TK_CHAR_TYPE;
2802 tok->u.subtype = CTYPE_WHITE_SPACE;
2803 break;
2804 case 'S':
2805 tok->type = TK_CHAR_TYPE;
2806 tok->u.subtype = CTYPE_NOT_WHITE_SPACE;
2807 break;
2808 case 'h':
2809 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2810 tok->type = TK_CHAR_TYPE;
2811 tok->u.subtype = CTYPE_XDIGIT;
2812 break;
2813 case 'H':
2814 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2815 tok->type = TK_CHAR_TYPE;
2816 tok->u.subtype = CTYPE_NOT_XDIGIT;
2817 break;
2818
2819 case 'p':
2820 case 'P':
2821 c2 = PPEEK;
2822 if (c2 == '{' &&
2823 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
2824 PINC;
2825 tok->type = TK_CHAR_PROPERTY;
2826 tok->u.prop.not = (c == 'P' ? 1 : 0);
2827
2828 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
2829 PFETCH(c2);
2830 if (c2 == '^') {
2831 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
2832 }
2833 else
2834 PUNFETCH;
2835 }
2836 }
2837 break;
2838
2839 case 'x':
2840 if (PEND) break;
2841
2842 prev = p;
2843 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
2844 PINC;
2845 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
2846 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
2847 if (!PEND) {
2848 c2 = PPEEK;
2849 if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
2850 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
2851 }
2852
2853 if (p > prev + enc_len(enc, prev) && !PEND && (PPEEK_IS('}'))) {
2854 PINC;
2855 tok->type = TK_CODE_POINT;
2856 tok->base = 16;
2857 tok->u.code = (OnigCodePoint )num;
2858 }
2859 else {
2860 /* can't read nothing or invalid format */
2861 p = prev;
2862 }
2863 }
2864 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
2865 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
2866 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2867 if (p == prev) { /* can't read nothing. */
2868 num = 0; /* but, it's not error */
2869 }
2870 tok->type = TK_RAW_BYTE;
2871 tok->base = 16;
2872 tok->u.c = num;
2873 }
2874 break;
2875
2876 case 'u':
2877 if (PEND) break;
2878
2879 prev = p;
2880 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
2881 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
2882 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2883 if (p == prev) { /* can't read nothing. */
2884 num = 0; /* but, it's not error */
2885 }
2886 tok->type = TK_CODE_POINT;
2887 tok->base = 16;
2888 tok->u.code = (OnigCodePoint )num;
2889 }
2890 break;
2891
2892 case '0':
2893 case '1': case '2': case '3': case '4': case '5': case '6': case '7':
2894 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
2895 PUNFETCH;
2896 prev = p;
2897 num = scan_unsigned_octal_number(&p, end, 3, enc);
2898 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2899 if (p == prev) { /* can't read nothing. */
2900 num = 0; /* but, it's not error */
2901 }
2902 tok->type = TK_RAW_BYTE;
2903 tok->base = 8;
2904 tok->u.c = num;
2905 }
2906 break;
2907
2908 default:
2909 PUNFETCH;
2910 num = fetch_escaped_value(&p, end, env);
2911 if (num < 0) return num;
2912 if (tok->u.c != num) {
2913 tok->u.code = (OnigCodePoint )num;
2914 tok->type = TK_CODE_POINT;
2915 }
2916 break;
2917 }
2918 }
2919 else if (c == '[') {
2920 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
2921 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
2922 tok->backp = p; /* point at '[' is readed */
2923 PINC;
2924 if (str_exist_check_with_esc(send, 2, p, end,
2925 (OnigCodePoint )']', enc)) {
2926 tok->type = TK_POSIX_BRACKET_OPEN;
2927 }
2928 else {
2929 PUNFETCH;
2930 goto cc_in_cc;
2931 }
2932 }
2933 else {
2934 cc_in_cc:
2935 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
2936 tok->type = TK_CC_CC_OPEN;
2937 }
2938 else {
2939 CC_ESC_WARN(env, (UChar* )"[");
2940 }
2941 }
2942 }
2943 else if (c == '&') {
2944 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
2945 !PEND && (PPEEK_IS('&'))) {
2946 PINC;
2947 tok->type = TK_CC_AND;
2948 }
2949 }
2950
2951 end:
2952 *src = p;
2953 return tok->type;
2954 }
2955
2956 static int
fetch_token(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2957 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2958 {
2959 int r, num;
2960 OnigCodePoint c;
2961 OnigEncoding enc = env->enc;
2962 OnigSyntaxType* syn = env->syntax;
2963 UChar* prev;
2964 UChar* p = *src;
2965 PFETCH_READY;
2966
2967 start:
2968 if (PEND) {
2969 tok->type = TK_EOT;
2970 return tok->type;
2971 }
2972
2973 tok->type = TK_STRING;
2974 tok->base = 0;
2975 tok->backp = p;
2976
2977 PFETCH(c);
2978 if (IS_MC_ESC_CODE(c, enc, syn)) {
2979 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2980
2981 tok->backp = p;
2982 PFETCH(c);
2983
2984 tok->u.c = c;
2985 tok->escaped = 1;
2986 switch (c) {
2987 case '*':
2988 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
2989 tok->type = TK_OP_REPEAT;
2990 tok->u.repeat.lower = 0;
2991 tok->u.repeat.upper = REPEAT_INFINITE;
2992 goto greedy_check;
2993 break;
2994
2995 case '+':
2996 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
2997 tok->type = TK_OP_REPEAT;
2998 tok->u.repeat.lower = 1;
2999 tok->u.repeat.upper = REPEAT_INFINITE;
3000 goto greedy_check;
3001 break;
3002
3003 case '?':
3004 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3005 tok->type = TK_OP_REPEAT;
3006 tok->u.repeat.lower = 0;
3007 tok->u.repeat.upper = 1;
3008 greedy_check:
3009 if (!PEND && PPEEK_IS('?') &&
3010 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3011 PFETCH(c);
3012 tok->u.repeat.greedy = 0;
3013 tok->u.repeat.possessive = 0;
3014 }
3015 else {
3016 possessive_check:
3017 if (!PEND && PPEEK_IS('+') &&
3018 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3019 tok->type != TK_INTERVAL) ||
3020 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3021 tok->type == TK_INTERVAL))) {
3022 PFETCH(c);
3023 tok->u.repeat.greedy = 1;
3024 tok->u.repeat.possessive = 1;
3025 }
3026 else {
3027 tok->u.repeat.greedy = 1;
3028 tok->u.repeat.possessive = 0;
3029 }
3030 }
3031 break;
3032
3033 case '{':
3034 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3035 r = fetch_range_quantifier(&p, end, tok, env);
3036 if (r < 0) return r; /* error */
3037 if (r == 0) goto greedy_check;
3038 else if (r == 2) { /* {n} */
3039 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3040 goto possessive_check;
3041
3042 goto greedy_check;
3043 }
3044 /* r == 1 : normal char */
3045 break;
3046
3047 case '|':
3048 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3049 tok->type = TK_ALT;
3050 break;
3051
3052 case '(':
3053 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3054 tok->type = TK_SUBEXP_OPEN;
3055 break;
3056
3057 case ')':
3058 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3059 tok->type = TK_SUBEXP_CLOSE;
3060 break;
3061
3062 case 'w':
3063 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3064 tok->type = TK_CHAR_TYPE;
3065 tok->u.subtype = CTYPE_WORD;
3066 break;
3067
3068 case 'W':
3069 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3070 tok->type = TK_CHAR_TYPE;
3071 tok->u.subtype = CTYPE_NOT_WORD;
3072 break;
3073
3074 case 'b':
3075 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3076 tok->type = TK_ANCHOR;
3077 tok->u.anchor = ANCHOR_WORD_BOUND;
3078 break;
3079
3080 case 'B':
3081 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3082 tok->type = TK_ANCHOR;
3083 tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3084 break;
3085
3086 #ifdef USE_WORD_BEGIN_END
3087 case '<':
3088 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3089 tok->type = TK_ANCHOR;
3090 tok->u.anchor = ANCHOR_WORD_BEGIN;
3091 break;
3092
3093 case '>':
3094 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3095 tok->type = TK_ANCHOR;
3096 tok->u.anchor = ANCHOR_WORD_END;
3097 break;
3098 #endif
3099
3100 case 's':
3101 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3102 tok->type = TK_CHAR_TYPE;
3103 tok->u.subtype = CTYPE_WHITE_SPACE;
3104 break;
3105
3106 case 'S':
3107 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3108 tok->type = TK_CHAR_TYPE;
3109 tok->u.subtype = CTYPE_NOT_WHITE_SPACE;
3110 break;
3111
3112 case 'd':
3113 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3114 tok->type = TK_CHAR_TYPE;
3115 tok->u.subtype = CTYPE_DIGIT;
3116 break;
3117
3118 case 'D':
3119 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3120 tok->type = TK_CHAR_TYPE;
3121 tok->u.subtype = CTYPE_NOT_DIGIT;
3122 break;
3123
3124 case 'h':
3125 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3126 tok->type = TK_CHAR_TYPE;
3127 tok->u.subtype = CTYPE_XDIGIT;
3128 break;
3129
3130 case 'H':
3131 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3132 tok->type = TK_CHAR_TYPE;
3133 tok->u.subtype = CTYPE_NOT_XDIGIT;
3134 break;
3135
3136 case 'A':
3137 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3138 begin_buf:
3139 tok->type = TK_ANCHOR;
3140 tok->u.subtype = ANCHOR_BEGIN_BUF;
3141 break;
3142
3143 case 'Z':
3144 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3145 tok->type = TK_ANCHOR;
3146 tok->u.subtype = ANCHOR_SEMI_END_BUF;
3147 break;
3148
3149 case 'z':
3150 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3151 end_buf:
3152 tok->type = TK_ANCHOR;
3153 tok->u.subtype = ANCHOR_END_BUF;
3154 break;
3155
3156 case 'G':
3157 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3158 tok->type = TK_ANCHOR;
3159 tok->u.subtype = ANCHOR_BEGIN_POSITION;
3160 break;
3161
3162 case '`':
3163 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3164 goto begin_buf;
3165 break;
3166
3167 case '\'':
3168 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3169 goto end_buf;
3170 break;
3171
3172 case 'x':
3173 if (PEND) break;
3174
3175 prev = p;
3176 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3177 PINC;
3178 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3179 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3180 if (!PEND) {
3181 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3182 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3183 }
3184
3185 if ((p > prev + enc_len(enc, prev)) && !PEND && PPEEK_IS('}')) {
3186 PINC;
3187 tok->type = TK_CODE_POINT;
3188 tok->u.code = (OnigCodePoint )num;
3189 }
3190 else {
3191 /* can't read nothing or invalid format */
3192 p = prev;
3193 }
3194 }
3195 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3196 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3197 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3198 if (p == prev) { /* can't read nothing. */
3199 num = 0; /* but, it's not error */
3200 }
3201 tok->type = TK_RAW_BYTE;
3202 tok->base = 16;
3203 tok->u.c = num;
3204 }
3205 break;
3206
3207 case 'u':
3208 if (PEND) break;
3209
3210 prev = p;
3211 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3212 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3213 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3214 if (p == prev) { /* can't read nothing. */
3215 num = 0; /* but, it's not error */
3216 }
3217 tok->type = TK_CODE_POINT;
3218 tok->base = 16;
3219 tok->u.code = (OnigCodePoint )num;
3220 }
3221 break;
3222
3223 case '1': case '2': case '3': case '4':
3224 case '5': case '6': case '7': case '8': case '9':
3225 PUNFETCH;
3226 prev = p;
3227 num = onig_scan_unsigned_number(&p, end, enc);
3228 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3229 goto skip_backref;
3230 }
3231
3232 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3233 (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3234 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3235 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3236 return ONIGERR_INVALID_BACKREF;
3237 }
3238
3239 tok->type = TK_BACKREF;
3240 tok->u.backref.num = 1;
3241 tok->u.backref.ref1 = num;
3242 tok->u.backref.by_name = 0;
3243 #ifdef USE_BACKREF_AT_LEVEL
3244 tok->u.backref.exist_level = 0;
3245 #endif
3246 break;
3247 }
3248
3249 skip_backref:
3250 if (c == '8' || c == '9') {
3251 /* normal char */
3252 p = prev; PINC;
3253 break;
3254 }
3255
3256 p = prev;
3257 /* fall through */
3258 case '0':
3259 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3260 prev = p;
3261 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3262 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3263 if (p == prev) { /* can't read nothing. */
3264 num = 0; /* but, it's not error */
3265 }
3266 tok->type = TK_RAW_BYTE;
3267 tok->base = 8;
3268 tok->u.c = num;
3269 }
3270 else if (c != '0') {
3271 PINC;
3272 }
3273 break;
3274
3275 #ifdef USE_NAMED_GROUP
3276 case 'k':
3277 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3278 PFETCH(c);
3279 if (c == '<') {
3280 UChar* name_end;
3281 int* backs;
3282
3283 prev = p;
3284
3285 #ifdef USE_BACKREF_AT_LEVEL
3286 name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3287 r = fetch_name_with_level(&p, end, &name_end, env, &tok->u.backref.level);
3288 if (r == 1) tok->u.backref.exist_level = 1;
3289 else tok->u.backref.exist_level = 0;
3290 #else
3291 r = fetch_name(&p, end, &name_end, env, 1);
3292 #endif
3293 if (r < 0) return r;
3294
3295 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3296 if (num <= 0) {
3297 onig_scan_env_set_error_string(env,
3298 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3299 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3300 }
3301 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3302 int i;
3303 for (i = 0; i < num; i++) {
3304 if (backs[i] > env->num_mem ||
3305 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3306 return ONIGERR_INVALID_BACKREF;
3307 }
3308 }
3309
3310 tok->type = TK_BACKREF;
3311 tok->u.backref.by_name = 1;
3312 if (num == 1) {
3313 tok->u.backref.num = 1;
3314 tok->u.backref.ref1 = backs[0];
3315 }
3316 else {
3317 tok->u.backref.num = num;
3318 tok->u.backref.refs = backs;
3319 }
3320 }
3321 else
3322 PUNFETCH;
3323 }
3324 break;
3325 #endif
3326
3327 #ifdef USE_SUBEXP_CALL
3328 case 'g':
3329 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3330 PFETCH(c);
3331 if (c == '<') {
3332 UChar* name_end;
3333
3334 prev = p;
3335 r = fetch_name(&p, end, &name_end, env, 1);
3336 if (r < 0) return r;
3337
3338 tok->type = TK_CALL;
3339 tok->u.call.name = prev;
3340 tok->u.call.name_end = name_end;
3341 }
3342 else
3343 PUNFETCH;
3344 }
3345 break;
3346 #endif
3347
3348 case 'Q':
3349 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3350 tok->type = TK_QUOTE_OPEN;
3351 }
3352 break;
3353
3354 case 'p':
3355 case 'P':
3356 if (PPEEK_IS('{') &&
3357 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3358 PINC;
3359 tok->type = TK_CHAR_PROPERTY;
3360 tok->u.prop.not = (c == 'P' ? 1 : 0);
3361
3362 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3363 PFETCH(c);
3364 if (c == '^') {
3365 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3366 }
3367 else
3368 PUNFETCH;
3369 }
3370 }
3371 break;
3372
3373 default:
3374 PUNFETCH;
3375 num = fetch_escaped_value(&p, end, env);
3376 if (num < 0) return num;
3377 /* set_raw: */
3378 if (tok->u.c != num) {
3379 tok->type = TK_CODE_POINT;
3380 tok->u.code = (OnigCodePoint )num;
3381 }
3382 else { /* string */
3383 p = tok->backp + enc_len(enc, tok->backp);
3384 }
3385 break;
3386 }
3387 }
3388 else {
3389 tok->u.c = c;
3390 tok->escaped = 0;
3391
3392 #ifdef USE_VARIABLE_META_CHARS
3393 if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3394 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3395 if (c == MC_ANYCHAR(enc))
3396 goto any_char;
3397 else if (c == MC_ANYTIME(enc))
3398 goto anytime;
3399 else if (c == MC_ZERO_OR_ONE_TIME(enc))
3400 goto zero_or_one_time;
3401 else if (c == MC_ONE_OR_MORE_TIME(enc))
3402 goto one_or_more_time;
3403 else if (c == MC_ANYCHAR_ANYTIME(enc)) {
3404 tok->type = TK_ANYCHAR_ANYTIME;
3405 goto out;
3406 }
3407 }
3408 #endif
3409
3410 switch (c) {
3411 case '.':
3412 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3413 #ifdef USE_VARIABLE_META_CHARS
3414 any_char:
3415 #endif
3416 tok->type = TK_ANYCHAR;
3417 break;
3418
3419 case '*':
3420 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3421 #ifdef USE_VARIABLE_META_CHARS
3422 anytime:
3423 #endif
3424 tok->type = TK_OP_REPEAT;
3425 tok->u.repeat.lower = 0;
3426 tok->u.repeat.upper = REPEAT_INFINITE;
3427 goto greedy_check;
3428 break;
3429
3430 case '+':
3431 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3432 #ifdef USE_VARIABLE_META_CHARS
3433 one_or_more_time:
3434 #endif
3435 tok->type = TK_OP_REPEAT;
3436 tok->u.repeat.lower = 1;
3437 tok->u.repeat.upper = REPEAT_INFINITE;
3438 goto greedy_check;
3439 break;
3440
3441 case '?':
3442 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3443 #ifdef USE_VARIABLE_META_CHARS
3444 zero_or_one_time:
3445 #endif
3446 tok->type = TK_OP_REPEAT;
3447 tok->u.repeat.lower = 0;
3448 tok->u.repeat.upper = 1;
3449 goto greedy_check;
3450 break;
3451
3452 case '{':
3453 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3454 r = fetch_range_quantifier(&p, end, tok, env);
3455 if (r < 0) return r; /* error */
3456 if (r == 0) goto greedy_check;
3457 else if (r == 2) { /* {n} */
3458 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3459 goto possessive_check;
3460
3461 goto greedy_check;
3462 }
3463 /* r == 1 : normal char */
3464 break;
3465
3466 case '|':
3467 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3468 tok->type = TK_ALT;
3469 break;
3470
3471 case '(':
3472 if (PPEEK_IS('?') &&
3473 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3474 PINC;
3475 if (PPEEK_IS('#')) {
3476 PFETCH(c);
3477 while (1) {
3478 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3479 PFETCH(c);
3480 if (c == MC_ESC(enc)) {
3481 if (!PEND) PFETCH(c);
3482 }
3483 else {
3484 if (c == ')') break;
3485 }
3486 }
3487 goto start;
3488 }
3489 PUNFETCH;
3490 }
3491
3492 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3493 tok->type = TK_SUBEXP_OPEN;
3494 break;
3495
3496 case ')':
3497 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3498 tok->type = TK_SUBEXP_CLOSE;
3499 break;
3500
3501 case '^':
3502 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3503 tok->type = TK_ANCHOR;
3504 tok->u.subtype = (IS_SINGLELINE(env->option)
3505 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3506 break;
3507
3508 case '$':
3509 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3510 tok->type = TK_ANCHOR;
3511 tok->u.subtype = (IS_SINGLELINE(env->option)
3512 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3513 break;
3514
3515 case '[':
3516 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3517 tok->type = TK_CC_OPEN;
3518 break;
3519
3520 case ']':
3521 if (*src > env->pattern) /* /].../ is allowed. */
3522 CCEND_ESC_WARN(env, (UChar* )"]");
3523 break;
3524
3525 case '#':
3526 if (IS_EXTEND(env->option)) {
3527 while (!PEND) {
3528 PFETCH(c);
3529 if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3530 break;
3531 }
3532 goto start;
3533 break;
3534 }
3535 break;
3536
3537 case ' ': case '\t': case '\n': case '\r': case '\f':
3538 if (IS_EXTEND(env->option))
3539 goto start;
3540 break;
3541
3542 default:
3543 /* string */
3544 break;
3545 }
3546 }
3547
3548 #ifdef USE_VARIABLE_META_CHARS
3549 out:
3550 #endif
3551 *src = p;
3552 return tok->type;
3553 }
3554
3555 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype,int not,OnigEncoding enc,const OnigCodePoint sbr[],const OnigCodePoint mbr[])3556 add_ctype_to_cc_by_range(CClassNode* cc, int ctype, int not, OnigEncoding enc,
3557 const OnigCodePoint sbr[], const OnigCodePoint mbr[])
3558 {
3559 int i, r;
3560 OnigCodePoint j;
3561
3562 int nsb = ONIGENC_CODE_RANGE_NUM(sbr);
3563 int nmb = ONIGENC_CODE_RANGE_NUM(mbr);
3564
3565 if (not == 0) {
3566 for (i = 0; i < nsb; i++) {
3567 for (j = ONIGENC_CODE_RANGE_FROM(sbr, i);
3568 j <= ONIGENC_CODE_RANGE_TO(sbr, i); j++) {
3569 BITSET_SET_BIT(cc->bs, j);
3570 }
3571 }
3572
3573 for (i = 0; i < nmb; i++) {
3574 r = add_code_range_to_buf(&(cc->mbuf),
3575 ONIGENC_CODE_RANGE_FROM(mbr, i),
3576 ONIGENC_CODE_RANGE_TO(mbr, i));
3577 if (r != 0) return r;
3578 }
3579 }
3580 else {
3581 OnigCodePoint prev = 0;
3582
3583 if (ONIGENC_MBC_MINLEN(enc) == 1) {
3584 for (i = 0; i < nsb; i++) {
3585 for (j = prev;
3586 j < ONIGENC_CODE_RANGE_FROM(sbr, i); j++) {
3587 BITSET_SET_BIT(cc->bs, j);
3588 }
3589 prev = ONIGENC_CODE_RANGE_TO(sbr, i) + 1;
3590 }
3591 if (prev < 0x7f) {
3592 for (j = prev; j < 0x7f; j++) {
3593 BITSET_SET_BIT(cc->bs, j);
3594 }
3595 }
3596
3597 prev = 0x80;
3598 }
3599
3600 for (i = 0; i < nmb; i++) {
3601 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3602 r = add_code_range_to_buf(&(cc->mbuf), prev,
3603 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3604 if (r != 0) return r;
3605 }
3606 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3607 }
3608 if (prev < 0x7fffffff) {
3609 r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3610 if (r != 0) return r;
3611 }
3612 }
3613
3614 return 0;
3615 }
3616
3617 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)3618 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3619 {
3620 int c, r;
3621 const OnigCodePoint *sbr, *mbr;
3622 OnigEncoding enc = env->enc;
3623
3624 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sbr, &mbr);
3625 if (r == 0) {
3626 return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sbr, mbr);
3627 }
3628 else if (r != ONIG_NO_SUPPORT_CONFIG) {
3629 return r;
3630 }
3631
3632 r = 0;
3633 switch (ctype) {
3634 case ONIGENC_CTYPE_ALPHA:
3635 case ONIGENC_CTYPE_BLANK:
3636 case ONIGENC_CTYPE_CNTRL:
3637 case ONIGENC_CTYPE_DIGIT:
3638 case ONIGENC_CTYPE_LOWER:
3639 case ONIGENC_CTYPE_PUNCT:
3640 case ONIGENC_CTYPE_SPACE:
3641 case ONIGENC_CTYPE_UPPER:
3642 case ONIGENC_CTYPE_XDIGIT:
3643 case ONIGENC_CTYPE_ASCII:
3644 case ONIGENC_CTYPE_ALNUM:
3645 if (not != 0) {
3646 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3647 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3648 BITSET_SET_BIT(cc->bs, c);
3649 }
3650 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3651 }
3652 else {
3653 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3654 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3655 BITSET_SET_BIT(cc->bs, c);
3656 }
3657 }
3658 break;
3659
3660 case ONIGENC_CTYPE_GRAPH:
3661 case ONIGENC_CTYPE_PRINT:
3662 if (not != 0) {
3663 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3664 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3665 BITSET_SET_BIT(cc->bs, c);
3666 }
3667 }
3668 else {
3669 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3670 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3671 BITSET_SET_BIT(cc->bs, c);
3672 }
3673 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3674 }
3675 break;
3676
3677 case ONIGENC_CTYPE_WORD:
3678 if (not == 0) {
3679 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3680 if (ONIGENC_IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3681 }
3682 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3683 }
3684 else {
3685 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3686 if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* 0: invalid code point */
3687 && ! ONIGENC_IS_CODE_WORD(enc, c))
3688 BITSET_SET_BIT(cc->bs, c);
3689 }
3690 }
3691 break;
3692
3693 default:
3694 return ONIGERR_PARSER_BUG;
3695 break;
3696 }
3697
3698 return r;
3699 }
3700
3701 static int
parse_ctype_to_enc_ctype(int pctype,int * not)3702 parse_ctype_to_enc_ctype(int pctype, int* not)
3703 {
3704 int ctype;
3705
3706 switch (pctype) {
3707 case CTYPE_WORD:
3708 ctype = ONIGENC_CTYPE_WORD;
3709 *not = 0;
3710 break;
3711 case CTYPE_NOT_WORD:
3712 ctype = ONIGENC_CTYPE_WORD;
3713 *not = 1;
3714 break;
3715 case CTYPE_WHITE_SPACE:
3716 ctype = ONIGENC_CTYPE_SPACE;
3717 *not = 0;
3718 break;
3719 case CTYPE_NOT_WHITE_SPACE:
3720 ctype = ONIGENC_CTYPE_SPACE;
3721 *not = 1;
3722 break;
3723 case CTYPE_DIGIT:
3724 ctype = ONIGENC_CTYPE_DIGIT;
3725 *not = 0;
3726 break;
3727 case CTYPE_NOT_DIGIT:
3728 ctype = ONIGENC_CTYPE_DIGIT;
3729 *not = 1;
3730 break;
3731 case CTYPE_XDIGIT:
3732 ctype = ONIGENC_CTYPE_XDIGIT;
3733 *not = 0;
3734 break;
3735 case CTYPE_NOT_XDIGIT:
3736 ctype = ONIGENC_CTYPE_XDIGIT;
3737 *not = 1;
3738 break;
3739 default:
3740 return ONIGERR_PARSER_BUG;
3741 break;
3742 }
3743 return ctype;
3744 }
3745
3746 typedef struct {
3747 UChar *name;
3748 int ctype;
3749 short int len;
3750 } PosixBracketEntryType;
3751
3752 static int
parse_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)3753 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3754 {
3755 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
3756 #define POSIX_BRACKET_NAME_MAX_LEN 6
3757
3758 static PosixBracketEntryType PBS[] = {
3759 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
3760 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
3761 { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
3762 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
3763 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
3764 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
3765 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
3766 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
3767 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
3768 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
3769 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
3770 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3771 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
3772 { (UChar* )NULL, -1, 0 }
3773 };
3774
3775 PosixBracketEntryType *pb;
3776 int not, i, r;
3777 OnigCodePoint c;
3778 OnigEncoding enc = env->enc;
3779 UChar *p = *src;
3780 PFETCH_READY;
3781
3782 if (PPEEK_IS('^')) {
3783 PINC;
3784 not = 1;
3785 }
3786 else
3787 not = 0;
3788
3789 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MAX_LEN + 2)
3790 goto not_posix_bracket;
3791
3792 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3793 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3794 p = (UChar* )onigenc_step(enc, p, end, pb->len);
3795 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3796 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3797
3798 r = add_ctype_to_cc(cc, pb->ctype, not, env);
3799 if (r != 0) return r;
3800
3801 PINC; PINC;
3802 *src = p;
3803 return 0;
3804 }
3805 }
3806
3807 not_posix_bracket:
3808 c = 0;
3809 i = 0;
3810 while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3811 PINC;
3812 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3813 }
3814 if (c == ':' && ! PEND) {
3815 PINC;
3816 if (! PEND) {
3817 PFETCH(c);
3818 if (c == ']')
3819 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3820 }
3821 }
3822
3823 return 1; /* 1: is not POSIX bracket, but no error. */
3824 }
3825
3826 static int
property_name_to_ctype(UChar * p,UChar * end,OnigEncoding enc)3827 property_name_to_ctype(UChar* p, UChar* end, OnigEncoding enc)
3828 {
3829 static PosixBracketEntryType PBS[] = {
3830 { (UChar* )"Alnum", ONIGENC_CTYPE_ALNUM, 5 },
3831 { (UChar* )"Alpha", ONIGENC_CTYPE_ALPHA, 5 },
3832 { (UChar* )"Blank", ONIGENC_CTYPE_BLANK, 5 },
3833 { (UChar* )"Cntrl", ONIGENC_CTYPE_CNTRL, 5 },
3834 { (UChar* )"Digit", ONIGENC_CTYPE_DIGIT, 5 },
3835 { (UChar* )"Graph", ONIGENC_CTYPE_GRAPH, 5 },
3836 { (UChar* )"Lower", ONIGENC_CTYPE_LOWER, 5 },
3837 { (UChar* )"Print", ONIGENC_CTYPE_PRINT, 5 },
3838 { (UChar* )"Punct", ONIGENC_CTYPE_PUNCT, 5 },
3839 { (UChar* )"Space", ONIGENC_CTYPE_SPACE, 5 },
3840 { (UChar* )"Upper", ONIGENC_CTYPE_UPPER, 5 },
3841 { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
3842 { (UChar* )"ASCII", ONIGENC_CTYPE_ASCII, 5 },
3843 { (UChar* )NULL, -1, 0 }
3844 };
3845
3846 PosixBracketEntryType *pb;
3847 int len;
3848
3849 len = onigenc_strlen(enc, p, end);
3850 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3851 if (len == pb->len &&
3852 onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
3853 return pb->ctype;
3854 }
3855
3856 return -1;
3857 }
3858
3859 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)3860 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
3861 {
3862 int ctype;
3863 OnigCodePoint c;
3864 OnigEncoding enc = env->enc;
3865 UChar *prev, *start, *p = *src;
3866 PFETCH_READY;
3867
3868 /* 'IsXXXX' => 'XXXX' */
3869 if (!PEND &&
3870 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS)) {
3871 c = PPEEK;
3872 if (c == 'I') {
3873 PINC;
3874 if (! PEND) {
3875 c = PPEEK;
3876 if (c == 's')
3877 PINC;
3878 else
3879 PUNFETCH;
3880 }
3881 }
3882 }
3883
3884 start = prev = p;
3885
3886 while (!PEND) {
3887 prev = p;
3888 PFETCH(c);
3889 if (c == '}') {
3890 ctype = property_name_to_ctype(start, prev, enc);
3891 if (ctype < 0) break;
3892
3893 *src = p;
3894 return ctype;
3895 }
3896 else if (c == '(' || c == ')' || c == '{' || c == '|')
3897 break;
3898 }
3899
3900 onig_scan_env_set_error_string(env, ONIGERR_INVALID_CHAR_PROPERTY_NAME,
3901 *src, prev);
3902 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
3903 }
3904
3905 static int
parse_char_property(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)3906 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
3907 ScanEnv* env)
3908 {
3909 int r, ctype;
3910 CClassNode* cc;
3911
3912 ctype = fetch_char_property_to_ctype(src, end, env);
3913 if (ctype < 0) return ctype;
3914
3915 *np = node_new_cclass();
3916 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
3917 cc = &(NCCLASS(*np));
3918 r = add_ctype_to_cc(cc, ctype, 0, env);
3919 if (r != 0) return r;
3920 if (tok->u.prop.not != 0) CCLASS_SET_NOT(cc);
3921
3922 return 0;
3923 }
3924
3925
/* Progress of the character-class parser between two tokens. */
enum CCSTATE {
  CCS_VALUE    = 0,  /* an element has been read and is still pending */
  CCS_RANGE    = 1,  /* a range operator followed the pending value */
  CCS_COMPLETE = 2,  /* a range has just been completed */
  CCS_START    = 3   /* nothing pending (initial state, also set after &&) */
};

/* Kind of the pending element tracked alongside CCSTATE. */
enum CCVALTYPE {
  CCV_SB         = 0,  /* value recorded in the class bit set */
  CCV_CODE_POINT = 1,  /* value recorded as a code range in mbuf */
  CCV_CLASS      = 2   /* a whole class; no single value is pending */
};
3938
3939 static int
next_state_class(CClassNode * cc,OnigCodePoint * vs,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)3940 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
3941 enum CCSTATE* state, ScanEnv* env)
3942 {
3943 int r;
3944
3945 if (*state == CCS_RANGE)
3946 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
3947
3948 if (*state == CCS_VALUE && *type != CCV_CLASS) {
3949 if (*type == CCV_SB)
3950 BITSET_SET_BIT(cc->bs, (int )(*vs));
3951 else if (*type == CCV_CODE_POINT) {
3952 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
3953 if (r < 0) return r;
3954 }
3955 }
3956
3957 *state = CCS_VALUE;
3958 *type = CCV_CLASS;
3959 return 0;
3960 }
3961
3962 static int
next_state_val(CClassNode * cc,OnigCodePoint * vs,OnigCodePoint v,int * vs_israw,int v_israw,enum CCVALTYPE intype,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)3963 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
3964 int* vs_israw, int v_israw,
3965 enum CCVALTYPE intype, enum CCVALTYPE* type,
3966 enum CCSTATE* state, ScanEnv* env)
3967 {
3968 int r;
3969
3970 switch (*state) {
3971 case CCS_VALUE:
3972 if (*type == CCV_SB)
3973 BITSET_SET_BIT(cc->bs, (int )(*vs));
3974 else if (*type == CCV_CODE_POINT) {
3975 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
3976 if (r < 0) return r;
3977 }
3978 break;
3979
3980 case CCS_RANGE:
3981 if (intype == *type) {
3982 if (intype == CCV_SB) {
3983 if (*vs > 0xff || v > 0xff)
3984 return ONIGERR_INVALID_WIDE_CHAR_VALUE;
3985
3986 if (*vs > v) {
3987 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
3988 goto ccs_range_end;
3989 else
3990 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
3991 }
3992 bitset_set_range(cc->bs, (int )*vs, (int )v);
3993 }
3994 else {
3995 r = add_code_range(&(cc->mbuf), env, *vs, v);
3996 if (r < 0) return r;
3997 }
3998 }
3999 else {
4000 #if 0
4001 if (intype == CCV_CODE_POINT && *type == CCV_SB) {
4002 #endif
4003 if (*vs > v) {
4004 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4005 goto ccs_range_end;
4006 else
4007 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4008 }
4009 bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4010 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
4011 if (r < 0) return r;
4012 #if 0
4013 }
4014 else
4015 return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
4016 #endif
4017 }
4018 ccs_range_end:
4019 *state = CCS_COMPLETE;
4020 break;
4021
4022 case CCS_COMPLETE:
4023 case CCS_START:
4024 *state = CCS_VALUE;
4025 break;
4026
4027 default:
4028 break;
4029 }
4030
4031 *vs_israw = v_israw;
4032 *vs = v;
4033 *type = intype;
4034 return 0;
4035 }
4036
4037 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,OnigEncoding enc)4038 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4039 OnigEncoding enc)
4040 {
4041 int in_esc;
4042 OnigCodePoint code;
4043 UChar* p = from;
4044 PFETCH_READY;
4045
4046 in_esc = 0;
4047 while (! PEND) {
4048 if (ignore_escaped && in_esc) {
4049 in_esc = 0;
4050 }
4051 else {
4052 PFETCH(code);
4053 if (code == c) return 1;
4054 if (code == MC_ESC(enc)) in_esc = 1;
4055 }
4056 }
4057 return 0;
4058 }
4059
4060 static int
parse_char_class(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4061 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4062 ScanEnv* env)
4063 {
4064 int r, neg, len, fetched, and_start;
4065 OnigCodePoint v, vs;
4066 UChar *p;
4067 Node* node;
4068 CClassNode *cc, *prev_cc;
4069 CClassNode work_cc;
4070
4071 enum CCSTATE state;
4072 enum CCVALTYPE val_type, in_type;
4073 int val_israw, in_israw;
4074
4075 prev_cc = (CClassNode* )NULL;
4076 *np = NULL_NODE;
4077 r = fetch_token_in_cc(tok, src, end, env);
4078 if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4079 neg = 1;
4080 r = fetch_token_in_cc(tok, src, end, env);
4081 }
4082 else {
4083 neg = 0;
4084 }
4085
4086 if (r < 0) return r;
4087 if (r == TK_CC_CLOSE) {
4088 if (! code_exist_check((OnigCodePoint )']',
4089 *src, env->pattern_end, 1, env->enc))
4090 return ONIGERR_EMPTY_CHAR_CLASS;
4091
4092 CC_ESC_WARN(env, (UChar* )"]");
4093 r = tok->type = TK_CHAR; /* allow []...] */
4094 }
4095
4096 *np = node = node_new_cclass();
4097 CHECK_NULL_RETURN_VAL(node, ONIGERR_MEMORY);
4098 cc = &(NCCLASS(node));
4099
4100 and_start = 0;
4101 state = CCS_START;
4102 p = *src;
4103 while (r != TK_CC_CLOSE) {
4104 fetched = 0;
4105 switch (r) {
4106 case TK_CHAR:
4107 len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4108 if (len > 1) {
4109 in_type = CCV_CODE_POINT;
4110 }
4111 else {
4112 sb_char:
4113 in_type = CCV_SB;
4114 }
4115 v = (OnigCodePoint )tok->u.c;
4116 in_israw = 0;
4117 goto val_entry2;
4118 break;
4119
4120 case TK_RAW_BYTE:
4121 /* tok->base != 0 : octal or hexadec. */
4122 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4123 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4124 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4125 UChar* psave = p;
4126 int i, base = tok->base;
4127
4128 buf[0] = tok->u.c;
4129 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4130 r = fetch_token_in_cc(tok, &p, end, env);
4131 if (r < 0) goto err;
4132 if (r != TK_RAW_BYTE || tok->base != base) {
4133 fetched = 1;
4134 break;
4135 }
4136 buf[i] = tok->u.c;
4137 }
4138
4139 if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4140 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4141 goto err;
4142 }
4143
4144 len = enc_len(env->enc, buf);
4145 if (i < len) {
4146 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4147 goto err;
4148 }
4149 else if (i > len) { /* fetch back */
4150 p = psave;
4151 for (i = 1; i < len; i++) {
4152 r = fetch_token_in_cc(tok, &p, end, env);
4153 }
4154 fetched = 0;
4155 }
4156
4157 if (i == 1) {
4158 v = (OnigCodePoint )buf[0];
4159 goto raw_single;
4160 }
4161 else {
4162 v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4163 in_type = CCV_CODE_POINT;
4164 }
4165 }
4166 else {
4167 v = (OnigCodePoint )tok->u.c;
4168 raw_single:
4169 in_type = CCV_SB;
4170 }
4171 in_israw = 1;
4172 goto val_entry2;
4173 break;
4174
4175 case TK_CODE_POINT:
4176 v = tok->u.code;
4177 in_israw = 1;
4178 val_entry:
4179 len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4180 if (len < 0) {
4181 r = len;
4182 goto err;
4183 }
4184 in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4185 val_entry2:
4186 r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4187 &state, env);
4188 if (r != 0) goto err;
4189 break;
4190
4191 case TK_POSIX_BRACKET_OPEN:
4192 r = parse_posix_bracket(cc, &p, end, env);
4193 if (r < 0) goto err;
4194 if (r == 1) { /* is not POSIX bracket */
4195 CC_ESC_WARN(env, (UChar* )"[");
4196 p = tok->backp;
4197 v = (OnigCodePoint )tok->u.c;
4198 in_israw = 0;
4199 goto val_entry;
4200 }
4201 goto next_class;
4202 break;
4203
4204 case TK_CHAR_TYPE:
4205 {
4206 int ctype, not;
4207 ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬);
4208 r = add_ctype_to_cc(cc, ctype, not, env);
4209 if (r != 0) return r;
4210 }
4211
4212 next_class:
4213 r = next_state_class(cc, &vs, &val_type, &state, env);
4214 if (r != 0) goto err;
4215 break;
4216
4217 case TK_CHAR_PROPERTY:
4218 {
4219 int ctype;
4220
4221 ctype = fetch_char_property_to_ctype(&p, end, env);
4222 if (ctype < 0) return ctype;
4223 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4224 if (r != 0) return r;
4225 goto next_class;
4226 }
4227 break;
4228
4229 case TK_CC_RANGE:
4230 if (state == CCS_VALUE) {
4231 r = fetch_token_in_cc(tok, &p, end, env);
4232 if (r < 0) goto err;
4233 fetched = 1;
4234 if (r == TK_CC_CLOSE) { /* allow [x-] */
4235 range_end_val:
4236 v = (OnigCodePoint )'-';
4237 in_israw = 0;
4238 goto val_entry;
4239 }
4240 else if (r == TK_CC_AND) {
4241 CC_ESC_WARN(env, (UChar* )"-");
4242 goto range_end_val;
4243 }
4244 state = CCS_RANGE;
4245 }
4246 else if (state == CCS_START) {
4247 /* [-xa] is allowed */
4248 v = (OnigCodePoint )tok->u.c;
4249 in_israw = 0;
4250
4251 r = fetch_token_in_cc(tok, &p, end, env);
4252 if (r < 0) goto err;
4253 fetched = 1;
4254 /* [--x] or [a&&-x] is warned. */
4255 if (r == TK_CC_RANGE || and_start != 0)
4256 CC_ESC_WARN(env, (UChar* )"-");
4257
4258 goto val_entry;
4259 }
4260 else if (state == CCS_RANGE) {
4261 CC_ESC_WARN(env, (UChar* )"-");
4262 goto sb_char; /* [!--x] is allowed */
4263 }
4264 else { /* CCS_COMPLETE */
4265 r = fetch_token_in_cc(tok, &p, end, env);
4266 if (r < 0) goto err;
4267 fetched = 1;
4268 if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4269 else if (r == TK_CC_AND) {
4270 CC_ESC_WARN(env, (UChar* )"-");
4271 goto range_end_val;
4272 }
4273
4274 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4275 CC_ESC_WARN(env, (UChar* )"-");
4276 goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */
4277 }
4278 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4279 goto err;
4280 }
4281 break;
4282
4283 case TK_CC_CC_OPEN: /* [ */
4284 {
4285 Node *anode;
4286 CClassNode* acc;
4287
4288 r = parse_char_class(&anode, tok, &p, end, env);
4289 if (r != 0) goto cc_open_err;
4290 acc = &(NCCLASS(anode));
4291 r = or_cclass(cc, acc, env->enc);
4292
4293 onig_node_free(anode);
4294 cc_open_err:
4295 if (r != 0) goto err;
4296 }
4297 break;
4298
4299 case TK_CC_AND: /* && */
4300 {
4301 if (state == CCS_VALUE) {
4302 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4303 &val_type, &state, env);
4304 if (r != 0) goto err;
4305 }
4306 /* initialize local variables */
4307 and_start = 1;
4308 state = CCS_START;
4309
4310 if (IS_NOT_NULL(prev_cc)) {
4311 r = and_cclass(prev_cc, cc, env->enc);
4312 if (r != 0) goto err;
4313 bbuf_free(cc->mbuf);
4314 }
4315 else {
4316 prev_cc = cc;
4317 cc = &work_cc;
4318 }
4319 initialize_cclass(cc);
4320 }
4321 break;
4322
4323 case TK_EOT:
4324 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4325 goto err;
4326 break;
4327 default:
4328 r = ONIGERR_PARSER_BUG;
4329 goto err;
4330 break;
4331 }
4332
4333 if (fetched)
4334 r = tok->type;
4335 else {
4336 r = fetch_token_in_cc(tok, &p, end, env);
4337 if (r < 0) goto err;
4338 }
4339 }
4340
4341 if (state == CCS_VALUE) {
4342 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4343 &val_type, &state, env);
4344 if (r != 0) goto err;
4345 }
4346
4347 if (IS_NOT_NULL(prev_cc)) {
4348 r = and_cclass(prev_cc, cc, env->enc);
4349 if (r != 0) goto err;
4350 bbuf_free(cc->mbuf);
4351 cc = prev_cc;
4352 }
4353
4354 if (neg != 0)
4355 CCLASS_SET_NOT(cc);
4356 else
4357 CCLASS_CLEAR_NOT(cc);
4358 if (IS_CCLASS_NOT(cc) &&
4359 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4360 int is_empty;
4361
4362 is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4363 if (is_empty != 0)
4364 BITSET_IS_EMPTY(cc->bs, is_empty);
4365
4366 if (is_empty == 0) {
4367 #define NEWLINE_CODE 0x0a
4368
4369 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4370 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4371 BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4372 else
4373 add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4374 }
4375 }
4376 }
4377 *src = p;
4378 return 0;
4379
4380 err:
4381 if (cc != &(NCCLASS(*np)))
4382 bbuf_free(cc->mbuf);
4383 onig_node_free(*np);
4384 return r;
4385 }
4386
4387 static int parse_subexp(Node** top, OnigToken* tok, int term,
4388 UChar** src, UChar* end, ScanEnv* env);
4389
4390 static int
parse_effect(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4391 parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4392 ScanEnv* env)
4393 {
4394 int r, num;
4395 int list_capture;
4396 Node *target;
4397 OnigOptionType option;
4398 OnigEncoding enc = env->enc;
4399 OnigCodePoint c;
4400 UChar* p = *src;
4401 PFETCH_READY;
4402
4403 *np = NULL;
4404 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4405
4406 option = env->option;
4407 if (PPEEK_IS('?') &&
4408 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4409 PINC;
4410 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4411
4412 PFETCH(c);
4413 switch (c) {
4414 case ':': /* (?:...) grouping only */
4415 group:
4416 r = fetch_token(tok, &p, end, env);
4417 if (r < 0) return r;
4418 r = parse_subexp(np, tok, term, &p, end, env);
4419 if (r < 0) return r;
4420 *src = p;
4421 return 1; /* group */
4422 break;
4423
4424 case '=':
4425 *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4426 break;
4427 case '!': /* preceding read */
4428 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4429 break;
4430 case '>': /* (?>...) stop backtrack */
4431 *np = node_new_effect(EFFECT_STOP_BACKTRACK);
4432 break;
4433
4434 case '<': /* look behind (?<=...), (?<!...) */
4435 PFETCH(c);
4436 if (c == '=')
4437 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4438 else if (c == '!')
4439 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4440 #ifdef USE_NAMED_GROUP
4441 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4442 UChar *name;
4443 UChar *name_end;
4444
4445 PUNFETCH;
4446 list_capture = 0;
4447
4448 named_group:
4449 name = p;
4450 r = fetch_name(&p, end, &name_end, env, 0);
4451 if (r < 0) return r;
4452
4453 num = scan_env_add_mem_entry(env);
4454 if (num < 0) return num;
4455 if (list_capture != 0 && num >= BIT_STATUS_BITS_NUM)
4456 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4457
4458 r = name_add(env->reg, name, name_end, num, env);
4459 if (r != 0) return r;
4460 *np = node_new_effect_memory(env->option, 1);
4461 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4462 NEFFECT(*np).regnum = num;
4463 if (list_capture != 0)
4464 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4465 env->num_named++;
4466 }
4467 #endif
4468 else
4469 return ONIGERR_UNDEFINED_GROUP_OPTION;
4470 break;
4471
4472 case '@':
4473 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4474 #ifdef USE_NAMED_GROUP
4475 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4476 PFETCH(c);
4477 if (c == '<') {
4478 list_capture = 1;
4479 goto named_group; /* (?@<name>...) */
4480 }
4481 PUNFETCH;
4482 }
4483 #endif
4484 *np = node_new_effect_memory(env->option, 0);
4485 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4486 num = scan_env_add_mem_entry(env);
4487 if (num < 0) {
4488 onig_node_free(*np);
4489 return num;
4490 }
4491 else if (num >= BIT_STATUS_BITS_NUM) {
4492 onig_node_free(*np);
4493 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4494 }
4495 NEFFECT(*np).regnum = num;
4496 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4497 }
4498 else {
4499 return ONIGERR_UNDEFINED_GROUP_OPTION;
4500 }
4501 break;
4502
4503 #ifdef USE_POSIXLINE_OPTION
4504 case 'p':
4505 #endif
4506 case '-': case 'i': case 'm': case 's': case 'x':
4507 {
4508 int neg = 0;
4509
4510 while (1) {
4511 switch (c) {
4512 case ':':
4513 case ')':
4514 break;
4515
4516 case '-': neg = 1; break;
4517 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
4518 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4519 case 's':
4520 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4521 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4522 }
4523 else
4524 return ONIGERR_UNDEFINED_GROUP_OPTION;
4525 break;
4526
4527 case 'm':
4528 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4529 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4530 }
4531 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4532 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4533 }
4534 else
4535 return ONIGERR_UNDEFINED_GROUP_OPTION;
4536 break;
4537 #ifdef USE_POSIXLINE_OPTION
4538 case 'p':
4539 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4540 break;
4541 #endif
4542 default:
4543 return ONIGERR_UNDEFINED_GROUP_OPTION;
4544 }
4545
4546 if (c == ')') {
4547 *np = node_new_option(option);
4548 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4549 *src = p;
4550 return 2; /* option only */
4551 }
4552 else if (c == ':') {
4553 OnigOptionType prev = env->option;
4554
4555 env->option = option;
4556 r = fetch_token(tok, &p, end, env);
4557 if (r < 0) return r;
4558 r = parse_subexp(&target, tok, term, &p, end, env);
4559 env->option = prev;
4560 if (r < 0) return r;
4561 *np = node_new_option(option);
4562 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4563 NEFFECT(*np).target = target;
4564 *src = p;
4565 return 0;
4566 }
4567
4568 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4569 PFETCH(c);
4570 }
4571 }
4572 break;
4573
4574 default:
4575 return ONIGERR_UNDEFINED_GROUP_OPTION;
4576 }
4577 }
4578 else {
4579 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4580 goto group;
4581
4582 *np = node_new_effect_memory(env->option, 0);
4583 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4584 num = scan_env_add_mem_entry(env);
4585 if (num < 0) return num;
4586 NEFFECT(*np).regnum = num;
4587 }
4588
4589 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4590 r = fetch_token(tok, &p, end, env);
4591 if (r < 0) return r;
4592 r = parse_subexp(&target, tok, term, &p, end, env);
4593 if (r < 0) return r;
4594
4595 if (NTYPE(*np) == N_ANCHOR)
4596 NANCHOR(*np).target = target;
4597 else {
4598 NEFFECT(*np).target = target;
4599 if (NEFFECT(*np).type == EFFECT_MEMORY) {
4600 /* Don't move this to previous of parse_subexp() */
4601 r = scan_env_set_mem_node(env, NEFFECT(*np).regnum, *np);
4602 if (r != 0) return r;
4603 }
4604 }
4605
4606 *src = p;
4607 return 0;
4608 }
4609
/* Text of each quantifier form, indexed by the value that
   popular_quantifier_num() returns; used in warning messages. */
static const char* PopularQStr[] = {
  "?",
  "*",
  "+",
  "??",
  "*?",
  "+?"
};

/* Replacement text reported in the nested-repeat warning, indexed by a
   ReduceTypeTable entry. */
static const char* ReduceQStr[] = {
  "",
  "",
  "*",
  "*?",
  "??",
  "+ and ??",
  "+? and ?"
};
4617
4618 static int
set_quantifier(Node * qnode,Node * target,int group,ScanEnv * env)4619 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4620 {
4621 QuantifierNode* qn;
4622
4623 qn = &(NQUANTIFIER(qnode));
4624 if (qn->lower == 1 && qn->upper == 1) {
4625 return 1;
4626 }
4627
4628 switch (NTYPE(target)) {
4629 case N_STRING:
4630 if (! group) {
4631 StrNode* sn = &(NSTRING(target));
4632 if (str_node_can_be_split(sn, env->enc)) {
4633 Node* n = str_node_split_last_char(sn, env->enc);
4634 if (IS_NOT_NULL(n)) {
4635 qn->target = n;
4636 return 2;
4637 }
4638 }
4639 }
4640 break;
4641
4642 case N_QUANTIFIER:
4643 { /* check redundant double repeat. */
4644 /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
4645 QuantifierNode* qnt = &(NQUANTIFIER(target));
4646 int nestq_num = popular_quantifier_num(qn);
4647 int targetq_num = popular_quantifier_num(qnt);
4648
4649 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
4650 if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4651 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4652 UChar buf[WARN_BUFSIZE];
4653
4654 switch(ReduceTypeTable[targetq_num][nestq_num]) {
4655 case RQ_ASIS:
4656 break;
4657
4658 case RQ_DEL:
4659 if (onig_verb_warn != onig_null_warn) {
4660 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4661 env->pattern, env->pattern_end,
4662 (UChar* )"redundant nested repeat operator");
4663 (*onig_verb_warn)((char* )buf);
4664 }
4665 goto warn_exit;
4666 break;
4667
4668 default:
4669 if (onig_verb_warn != onig_null_warn) {
4670 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4671 env->pattern, env->pattern_end,
4672 (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4673 PopularQStr[targetq_num], PopularQStr[nestq_num],
4674 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4675 (*onig_verb_warn)((char* )buf);
4676 }
4677 goto warn_exit;
4678 break;
4679 }
4680 }
4681
4682 warn_exit:
4683 #endif
4684 if (targetq_num >= 0) {
4685 if (nestq_num >= 0) {
4686 onig_reduce_nested_quantifier(qnode, target);
4687 goto q_exit;
4688 }
4689 else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
4690 /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
4691 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4692 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4693 }
4694 }
4695 }
4696 }
4697 break;
4698
4699 default:
4700 break;
4701 }
4702
4703 qn->target = target;
4704 q_exit:
4705 return 0;
4706 }
4707
4708 #ifdef USE_SHARED_CCLASS_TABLE
4709
4710 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8
4711
4712 /* for ctype node hash table */
4713
4714 typedef struct {
4715 OnigEncoding enc;
4716 int not;
4717 int type;
4718 } type_cclass_key;
4719
/* Key comparison for the shared class table: returns 0 when the two keys
   agree in type, encoding and negation flag, 1 otherwise. */
static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
{
  int same = (x->type == y->type) &&
             (x->enc  == y->enc)  &&
             (x->not  == y->not);

  return same ? 0 : 1;
}
4727
/*
 * Hash function for the shared class table.
 *
 * Mixes the bytes of key->enc and key->type (multiplier 997), adds
 * key->not, and folds in the value shifted right by 5.
 *
 * The arithmetic is carried out in unsigned int: the previous signed
 * version could overflow `int` (undefined behaviour) and right-shifted a
 * possibly negative value (implementation-defined).  The result is masked
 * to the non-negative int range so the final conversion is well defined.
 * Hash values therefore differ from the old ones, but they are only used
 * inside the in-process table.
 */
static int type_cclass_hash(type_cclass_key* key)
{
  unsigned int i, val;
  const unsigned char *p;

  val = 0;

  p = (const unsigned char* )&(key->enc);
  for (i = 0; i < sizeof(key->enc); i++) {
    val = val * 997u + (unsigned int )*p++;
  }

  p = (const unsigned char* )(&key->type);
  for (i = 0; i < sizeof(key->type); i++) {
    val = val * 997u + (unsigned int )*p++;
  }

  val += (unsigned int )key->not;
  val += (val >> 5);

  /* ~0u >> 1 keeps every bit except the top one. */
  return (int )(val & (~0u >> 1));
}
4748
4749 static struct st_hash_type type_type_cclass_hash = {
4750 type_cclass_cmp,
4751 type_cclass_hash,
4752 };
4753
4754 static st_table* OnigTypeCClassTable;
4755
4756
4757 static int
i_free_shared_class(type_cclass_key * key,Node * node,void * arg)4758 i_free_shared_class(type_cclass_key* key, Node* node, void* arg)
4759 {
4760 if (IS_NOT_NULL(node)) {
4761 CClassNode* cc = &(NCCLASS(node));
4762 if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
4763 xfree(node);
4764 }
4765
4766 if (IS_NOT_NULL(key)) xfree(key);
4767 return ST_DELETE;
4768 }
4769
4770 extern int
onig_free_shared_cclass_table(void)4771 onig_free_shared_cclass_table(void)
4772 {
4773 if (IS_NOT_NULL(OnigTypeCClassTable)) {
4774 onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
4775 onig_st_free_table(OnigTypeCClassTable);
4776 OnigTypeCClassTable = NULL;
4777 }
4778
4779 return 0;
4780 }
4781
4782 #endif /* USE_SHARED_CCLASS_TABLE */
4783
4784
4785 static int
parse_exp(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4786 parse_exp(Node** np, OnigToken* tok, int term,
4787 UChar** src, UChar* end, ScanEnv* env)
4788 {
4789 int r, len, group = 0;
4790 Node* qn;
4791 Node** targetp;
4792
4793 *np = NULL;
4794 if (tok->type == term)
4795 goto end_of_token;
4796
4797 switch (tok->type) {
4798 case TK_ALT:
4799 case TK_EOT:
4800 end_of_token:
4801 *np = node_new_empty();
4802 return tok->type;
4803 break;
4804
4805 case TK_SUBEXP_OPEN:
4806 r = parse_effect(np, tok, TK_SUBEXP_CLOSE, src, end, env);
4807 if (r < 0) return r;
4808 if (r == 1) group = 1;
4809 else if (r == 2) { /* option only */
4810 Node* target;
4811 OnigOptionType prev = env->option;
4812
4813 env->option = NEFFECT(*np).option;
4814 r = fetch_token(tok, src, end, env);
4815 if (r < 0) return r;
4816 r = parse_subexp(&target, tok, term, src, end, env);
4817 env->option = prev;
4818 if (r < 0) return r;
4819 NEFFECT(*np).target = target;
4820 return tok->type;
4821 }
4822 break;
4823
4824 case TK_SUBEXP_CLOSE:
4825 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
4826 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
4827
4828 if (tok->escaped) goto tk_raw_byte;
4829 else goto tk_byte;
4830 break;
4831
4832 case TK_STRING:
4833 tk_byte:
4834 {
4835 *np = node_new_str(tok->backp, *src);
4836 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4837
4838 while (1) {
4839 r = fetch_token(tok, src, end, env);
4840 if (r < 0) return r;
4841 if (r != TK_STRING) break;
4842
4843 r = onig_node_str_cat(*np, tok->backp, *src);
4844 if (r < 0) return r;
4845 }
4846
4847 string_end:
4848 targetp = np;
4849 goto repeat;
4850 }
4851 break;
4852
4853 case TK_RAW_BYTE:
4854 tk_raw_byte:
4855 {
4856 *np = node_new_str_char((UChar )tok->u.c);
4857 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4858 len = 1;
4859 while (1) {
4860 if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
4861 if (len == enc_len(env->enc, NSTRING(*np).s)) {
4862 r = fetch_token(tok, src, end, env);
4863 goto string_end;
4864 }
4865 }
4866
4867 r = fetch_token(tok, src, end, env);
4868 if (r < 0) return r;
4869 if (r != TK_RAW_BYTE) {
4870 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
4871 int rem;
4872 if (len < ONIGENC_MBC_MINLEN(env->enc)) {
4873 rem = ONIGENC_MBC_MINLEN(env->enc) - len;
4874 (void )node_str_head_pad(&NSTRING(*np), rem, (UChar )0);
4875 if (len + rem == enc_len(env->enc, NSTRING(*np).s)) {
4876 goto string_end;
4877 }
4878 }
4879 #endif
4880 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4881 }
4882
4883 r = node_str_cat_char(*np, (UChar )tok->u.c);
4884 if (r < 0) return r;
4885
4886 len++;
4887 }
4888 }
4889 break;
4890
4891 case TK_CODE_POINT:
4892 {
4893 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4894 int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
4895 if (num < 0) return num;
4896 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
4897 *np = node_new_str_raw(buf, buf + num);
4898 #else
4899 *np = node_new_str(buf, buf + num);
4900 #endif
4901 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4902 }
4903 break;
4904
4905 case TK_QUOTE_OPEN:
4906 {
4907 OnigCodePoint end_op[2];
4908 UChar *qstart, *qend, *nextp;
4909
4910 end_op[0] = (OnigCodePoint )MC_ESC(env->enc);
4911 end_op[1] = (OnigCodePoint )'E';
4912 qstart = *src;
4913 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
4914 if (IS_NULL(qend)) {
4915 nextp = qend = end;
4916 }
4917 *np = node_new_str(qstart, qend);
4918 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4919 *src = nextp;
4920 }
4921 break;
4922
4923 case TK_CHAR_TYPE:
4924 {
4925 switch (tok->u.subtype) {
4926 case CTYPE_WORD:
4927 case CTYPE_NOT_WORD:
4928 *np = node_new_ctype(tok->u.subtype);
4929 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4930 break;
4931
4932 case CTYPE_WHITE_SPACE:
4933 case CTYPE_NOT_WHITE_SPACE:
4934 case CTYPE_DIGIT:
4935 case CTYPE_NOT_DIGIT:
4936 case CTYPE_XDIGIT:
4937 case CTYPE_NOT_XDIGIT:
4938 {
4939 CClassNode* cc;
4940 int ctype, not;
4941
4942 #ifdef USE_SHARED_CCLASS_TABLE
4943 const OnigCodePoint *sbr, *mbr;
4944
4945 ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬);
4946 r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, ctype, &sbr, &mbr);
4947 if (r == 0 &&
4948 ONIGENC_CODE_RANGE_NUM(mbr)
4949 >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
4950 type_cclass_key key;
4951 type_cclass_key* new_key;
4952
4953 key.enc = env->enc;
4954 key.not = not;
4955 key.type = ctype;
4956
4957 THREAD_ATOMIC_START;
4958
4959 if (IS_NULL(OnigTypeCClassTable)) {
4960 OnigTypeCClassTable
4961 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
4962 if (IS_NULL(OnigTypeCClassTable)) {
4963 THREAD_ATOMIC_END;
4964 return ONIGERR_MEMORY;
4965 }
4966 }
4967 else {
4968 if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
4969 (st_data_t* )np)) {
4970 THREAD_ATOMIC_END;
4971 break;
4972 }
4973 }
4974
4975 *np = node_new_cclass_by_codepoint_range(not, sbr, mbr);
4976 if (IS_NULL(*np)) {
4977 THREAD_ATOMIC_END;
4978 return ONIGERR_MEMORY;
4979 }
4980
4981 CCLASS_SET_SHARE(&(NCCLASS(*np)));
4982 new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
4983 xmemcpy(new_key, &key, sizeof(type_cclass_key));
4984 onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
4985 (st_data_t )*np);
4986
4987 THREAD_ATOMIC_END;
4988 }
4989 else {
4990 #endif
4991 ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬);
4992 *np = node_new_cclass();
4993 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4994 cc = &(NCCLASS(*np));
4995 add_ctype_to_cc(cc, ctype, 0, env);
4996 if (not != 0) CCLASS_SET_NOT(cc);
4997 #ifdef USE_SHARED_CCLASS_TABLE
4998 }
4999 #endif
5000 }
5001 break;
5002
5003 default:
5004 return ONIGERR_PARSER_BUG;
5005 break;
5006 }
5007 }
5008 break;
5009
5010 case TK_CHAR_PROPERTY:
5011 r = parse_char_property(np, tok, src, end, env);
5012 if (r != 0) return r;
5013 break;
5014
5015 case TK_CC_OPEN:
5016 {
5017 CClassNode* cc;
5018
5019 r = parse_char_class(np, tok, src, end, env);
5020 if (r != 0) return r;
5021
5022 cc = &(NCCLASS(*np));
5023
5024 if (IS_IGNORECASE(env->option)) {
5025 int i, n, in_cc;
5026 const OnigPairAmbigCodes* ccs;
5027 BitSetRef bs = cc->bs;
5028 OnigAmbigType amb;
5029
5030 for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) {
5031 if ((amb & env->ambig_flag) == 0) continue;
5032
5033 n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(env->enc, amb, &ccs);
5034 for (i = 0; i < n; i++) {
5035 in_cc = onig_is_code_in_cc(env->enc, ccs[i].from, cc);
5036
5037 if ((in_cc != 0 && !IS_CCLASS_NOT(cc)) ||
5038 (in_cc == 0 && IS_CCLASS_NOT(cc))) {
5039 if (ONIGENC_MBC_MINLEN(env->enc) > 1 ||
5040 ccs[i].from >= SINGLE_BYTE_SIZE) {
5041 /* if (cc->not) clear_not_flag_cclass(cc, env->enc); */
5042 add_code_range(&(cc->mbuf), env, ccs[i].to, ccs[i].to);
5043 }
5044 else {
5045 if (BITSET_AT(bs, ccs[i].from)) {
5046 /* /(?i:[^A-C])/.match("a") ==> fail. */
5047 BITSET_SET_BIT(bs, ccs[i].to);
5048 }
5049 if (BITSET_AT(bs, ccs[i].to)) {
5050 BITSET_SET_BIT(bs, ccs[i].from);
5051 }
5052 }
5053 }
5054 }
5055 }
5056 }
5057 }
5058 break;
5059
5060 case TK_ANYCHAR:
5061 *np = node_new_anychar();
5062 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5063 break;
5064
5065 case TK_ANYCHAR_ANYTIME:
5066 *np = node_new_anychar();
5067 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5068 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5069 CHECK_NULL_RETURN_VAL(qn, ONIGERR_MEMORY);
5070 NQUANTIFIER(qn).target = *np;
5071 *np = qn;
5072 break;
5073
5074 case TK_BACKREF:
5075 len = tok->u.backref.num;
5076 *np = node_new_backref(len,
5077 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5078 tok->u.backref.by_name,
5079 #ifdef USE_BACKREF_AT_LEVEL
5080 tok->u.backref.exist_level,
5081 tok->u.backref.level,
5082 #endif
5083 env);
5084 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5085 break;
5086
5087 #ifdef USE_SUBEXP_CALL
5088 case TK_CALL:
5089 *np = node_new_call(tok->u.call.name, tok->u.call.name_end);
5090 CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5091 env->num_call++;
5092 break;
5093 #endif
5094
5095 case TK_ANCHOR:
5096 *np = onig_node_new_anchor(tok->u.anchor);
5097 break;
5098
5099 case TK_OP_REPEAT:
5100 case TK_INTERVAL:
5101 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5102 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5103 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5104 else
5105 *np = node_new_empty();
5106 }
5107 else {
5108 goto tk_byte;
5109 }
5110 break;
5111
5112 default:
5113 return ONIGERR_PARSER_BUG;
5114 break;
5115 }
5116
5117 {
5118 targetp = np;
5119
5120 re_entry:
5121 r = fetch_token(tok, src, end, env);
5122 if (r < 0) return r;
5123
5124 repeat:
5125 if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5126 if (is_invalid_quantifier_target(*targetp))
5127 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5128
5129 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5130 (r == TK_INTERVAL ? 1 : 0));
5131 CHECK_NULL_RETURN_VAL(qn, ONIGERR_MEMORY);
5132 NQUANTIFIER(qn).greedy = tok->u.repeat.greedy;
5133 r = set_quantifier(qn, *targetp, group, env);
5134 if (r < 0) return r;
5135
5136 if (tok->u.repeat.possessive != 0) {
5137 Node* en;
5138 en = node_new_effect(EFFECT_STOP_BACKTRACK);
5139 CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY);
5140 NEFFECT(en).target = qn;
5141 qn = en;
5142 }
5143
5144 if (r == 0) {
5145 *targetp = qn;
5146 }
5147 else if (r == 2) { /* split case: /abc+/ */
5148 Node *tmp;
5149
5150 *targetp = node_new_list(*targetp, NULL);
5151 CHECK_NULL_RETURN_VAL(*targetp, ONIGERR_MEMORY);
5152 tmp = NCONS(*targetp).right = node_new_list(qn, NULL);
5153 CHECK_NULL_RETURN_VAL(tmp, ONIGERR_MEMORY);
5154 targetp = &(NCONS(tmp).left);
5155 }
5156 goto re_entry;
5157 }
5158 }
5159
5160 return r;
5161 }
5162
5163 static int
parse_branch(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5164 parse_branch(Node** top, OnigToken* tok, int term,
5165 UChar** src, UChar* end, ScanEnv* env)
5166 {
5167 int r;
5168 Node *node, **headp;
5169
5170 *top = NULL;
5171 r = parse_exp(&node, tok, term, src, end, env);
5172 if (r < 0) return r;
5173
5174 if (r == TK_EOT || r == term || r == TK_ALT) {
5175 *top = node;
5176 }
5177 else {
5178 *top = node_new_list(node, NULL);
5179 headp = &(NCONS(*top).right);
5180 while (r != TK_EOT && r != term && r != TK_ALT) {
5181 r = parse_exp(&node, tok, term, src, end, env);
5182 if (r < 0) return r;
5183
5184 if (NTYPE(node) == N_LIST) {
5185 *headp = node;
5186 while (IS_NOT_NULL(NCONS(node).right)) node = NCONS(node).right;
5187 headp = &(NCONS(node).right);
5188 }
5189 else {
5190 *headp = node_new_list(node, NULL);
5191 headp = &(NCONS(*headp).right);
5192 }
5193 }
5194 }
5195
5196 return r;
5197 }
5198
5199 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
5200 static int
parse_subexp(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5201 parse_subexp(Node** top, OnigToken* tok, int term,
5202 UChar** src, UChar* end, ScanEnv* env)
5203 {
5204 int r;
5205 Node *node, **headp;
5206
5207 *top = NULL;
5208 r = parse_branch(&node, tok, term, src, end, env);
5209 if (r < 0) {
5210 onig_node_free(node);
5211 return r;
5212 }
5213
5214 if (r == term) {
5215 *top = node;
5216 }
5217 else if (r == TK_ALT) {
5218 *top = node_new_alt(node, NULL);
5219 headp = &(NCONS(*top).right);
5220 while (r == TK_ALT) {
5221 r = fetch_token(tok, src, end, env);
5222 if (r < 0) return r;
5223 r = parse_branch(&node, tok, term, src, end, env);
5224 if (r < 0) return r;
5225
5226 *headp = node_new_alt(node, NULL);
5227 headp = &(NCONS(*headp).right);
5228 }
5229
5230 if (tok->type != term)
5231 goto err;
5232 }
5233 else {
5234 err:
5235 if (term == TK_SUBEXP_CLOSE)
5236 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5237 else
5238 return ONIGERR_PARSER_BUG;
5239 }
5240
5241 return r;
5242 }
5243
5244 static int
parse_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)5245 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5246 {
5247 int r;
5248 OnigToken tok;
5249
5250 r = fetch_token(&tok, src, end, env);
5251 if (r < 0) return r;
5252 r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5253 if (r < 0) return r;
5254 return 0;
5255 }
5256
5257 extern int
onig_parse_make_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)5258 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, regex_t* reg,
5259 ScanEnv* env)
5260 {
5261 int r;
5262 UChar* p;
5263
5264 #ifdef USE_NAMED_GROUP
5265 names_clear(reg);
5266 #endif
5267
5268 scan_env_clear(env);
5269 env->option = reg->options;
5270 env->ambig_flag = reg->ambig_flag;
5271 env->enc = reg->enc;
5272 env->syntax = reg->syntax;
5273 env->pattern = (UChar* )pattern;
5274 env->pattern_end = (UChar* )end;
5275 env->reg = reg;
5276
5277 *root = NULL;
5278 p = (UChar* )pattern;
5279 r = parse_regexp(root, &p, (UChar* )end, env);
5280 reg->num_mem = env->num_mem;
5281 return r;
5282 }
5283
5284 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode,UChar * arg,UChar * arg_end)5285 onig_scan_env_set_error_string(ScanEnv* env, int ecode,
5286 UChar* arg, UChar* arg_end)
5287 {
5288 env->error = arg;
5289 env->error_end = arg_end;
5290 }
5291