1 /**********************************************************************
2 regparse.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regparse.h"
31 #include "st.h"
32
33 #define WARN_BUFSIZE 256
34
35 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
36
37
38 OnigSyntaxType OnigSyntaxRuby = {
39 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
40 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
41 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
42 ONIG_SYN_OP_ESC_C_CONTROL )
43 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
44 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
45 ONIG_SYN_OP2_OPTION_RUBY |
46 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
47 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
48 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
49 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
50 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
51 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
52 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
53 ONIG_SYN_OP2_ESC_H_XDIGIT )
54 , ( SYN_GNU_REGEX_BV |
55 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
56 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
57 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
58 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
59 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
60 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
61 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
62 , ONIG_OPTION_NONE
63 ,
64 {
65 (OnigCodePoint )'\\' /* esc */
66 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
67 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
68 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
69 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
70 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
71 }
72 };
73
74 OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
75
/* Warning callback that ignores its message and does nothing. */
extern void
onig_null_warn(const char* s ARG_UNUSED)
{
}
77
78 #ifdef DEFAULT_WARN_FUNCTION
79 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
80 #else
81 static OnigWarnFunc onig_warn = onig_null_warn;
82 #endif
83
84 #ifdef DEFAULT_VERB_WARN_FUNCTION
85 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
86 #else
87 static OnigWarnFunc onig_verb_warn = onig_null_warn;
88 #endif
89
/* Install f as the warning callback stored in onig_warn. */
extern void
onig_set_warn_func(OnigWarnFunc f)
{
  onig_warn = f;
}
94
/* Install f as the verbose-warning callback stored in onig_verb_warn. */
extern void
onig_set_verb_warn_func(OnigWarnFunc f)
{
  onig_verb_warn = f;
}
99
100 static void
bbuf_free(BBuf * bbuf)101 bbuf_free(BBuf* bbuf)
102 {
103 if (IS_NOT_NULL(bbuf)) {
104 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
105 xfree(bbuf);
106 }
107 }
108
109 static int
bbuf_clone(BBuf ** rto,BBuf * from)110 bbuf_clone(BBuf** rto, BBuf* from)
111 {
112 int r;
113 BBuf *to;
114
115 *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
116 CHECK_NULL_RETURN_MEMERR(to);
117 r = BBUF_INIT(to, from->alloc);
118 if (r != 0) return r;
119 to->used = from->used;
120 xmemcpy(to->p, from->p, from->used);
121 return 0;
122 }
123
/* Map a relative back-reference number rel_no to an absolute number,
   computed from the number of groups recorded in env->num_mem. */
#define BACKREF_REL_TO_ABS(rel_no, env) \
  ((env)->num_mem + 1 + (rel_no))

/* Clear the bits f in v when negative is non-zero, otherwise set them.
   The expansion is fully parenthesized so that it can be embedded in a
   larger expression without the conditional operator capturing its
   neighbours. */
#define ONOFF(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))

/* Start of the multi-byte code range: 0 when the minimum character length
   of the encoding exceeds one byte, 0x80 otherwise. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)

/* Add the range [MBCODE_START_POS(enc), ~0] to the code range buffer pbuf. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/* For encodings that are not single-byte, add the whole multi-byte range to
   mbuf.  Relies on a local variable `r` in the caller and returns from the
   enclosing function when r is non-zero. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r) return r;\
  }\
} while (0)


/* Store 1 in `empty` when every word of the bitset bs is zero, else 0. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i;\
  empty = 1;\
  for (i = 0; i < (int )BITSET_SIZE; i++) {\
    if ((bs)[i] != 0) {\
      empty = 0; break;\
    }\
  }\
} while (0)
152
153 static void
bitset_set_range(BitSetRef bs,int from,int to)154 bitset_set_range(BitSetRef bs, int from, int to)
155 {
156 int i;
157 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
158 BITSET_SET_BIT(bs, i);
159 }
160 }
161
162 #if 0
163 static void
164 bitset_set_all(BitSetRef bs)
165 {
166 int i;
167 for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); }
168 }
169 #endif
170
171 static void
bitset_invert(BitSetRef bs)172 bitset_invert(BitSetRef bs)
173 {
174 int i;
175 for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
176 }
177
178 static void
bitset_invert_to(BitSetRef from,BitSetRef to)179 bitset_invert_to(BitSetRef from, BitSetRef to)
180 {
181 int i;
182 for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
183 }
184
185 static void
bitset_and(BitSetRef dest,BitSetRef bs)186 bitset_and(BitSetRef dest, BitSetRef bs)
187 {
188 int i;
189 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
190 }
191
192 static void
bitset_or(BitSetRef dest,BitSetRef bs)193 bitset_or(BitSetRef dest, BitSetRef bs)
194 {
195 int i;
196 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
197 }
198
199 static void
bitset_copy(BitSetRef dest,BitSetRef bs)200 bitset_copy(BitSetRef dest, BitSetRef bs)
201 {
202 int i;
203 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
204 }
205
206 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)207 onig_strncmp(const UChar* s1, const UChar* s2, int n)
208 {
209 int x;
210
211 while (n-- > 0) {
212 x = *s2++ - *s1++;
213 if (x) return x;
214 }
215 return 0;
216 }
217
218 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)219 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
220 {
221 int len = end - src;
222 if (len > 0) {
223 xmemcpy(dest, src, len);
224 dest[len] = (UChar )0;
225 }
226 }
227
228 #ifdef USE_NAMED_GROUP
229 static UChar*
strdup_with_null(OnigEncoding enc,UChar * s,UChar * end)230 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
231 {
232 int slen, term_len, i;
233 UChar *r;
234
235 slen = end - s;
236 term_len = ONIGENC_MBC_MINLEN(enc);
237
238 r = (UChar* )xmalloc(slen + term_len);
239 CHECK_NULL_RETURN(r);
240 xmemcpy(r, s, slen);
241
242 for (i = 0; i < term_len; i++)
243 r[slen + i] = (UChar )0;
244
245 return r;
246 }
247 #endif
248
/* scan pattern methods */
/* These macros operate on local variables of the function that uses them:
   p (current position), end (end of the pattern), enc (encoding) and
   pfetch_prev (declared by PFETCH_READY). */
#define PEND_VALUE   0

/* Declare the variable that remembers the position before the last fetch. */
#define PFETCH_READY  UChar* pfetch_prev
/* 1 when p has reached end, otherwise 0. */
#define PEND         (p < end ?  0 : 1)
/* Step back to the position saved by the last PINC/PFETCH. */
#define PUNFETCH     p = pfetch_prev
/* Skip one character. */
#define PINC       do { \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
/* Read one character into c and advance past it. */
#define PFETCH(c)  do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* Code point at p without advancing, or PEND_VALUE at the end. */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
/* The argument is parenthesized so that the cast applies to the whole
   expression passed in, not only to its first operand. */
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )(c))
267
268 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)269 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
270 int capa)
271 {
272 UChar* r;
273
274 if (dest)
275 r = (UChar* )xrealloc(dest, capa + 1);
276 else
277 r = (UChar* )xmalloc(capa + 1);
278
279 CHECK_NULL_RETURN(r);
280 onig_strcpy(r + (dest_end - dest), src, src_end);
281 return r;
282 }
283
284 /* dest on static area */
285 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)286 strcat_capa_from_static(UChar* dest, UChar* dest_end,
287 const UChar* src, const UChar* src_end, int capa)
288 {
289 UChar* r;
290
291 r = (UChar* )xmalloc(capa + 1);
292 CHECK_NULL_RETURN(r);
293 onig_strcpy(r, dest, dest_end);
294 onig_strcpy(r + (dest_end - dest), src, src_end);
295 return r;
296 }
297
298
299 #ifdef USE_ST_LIBRARY
300
301 typedef struct {
302 UChar* s;
303 UChar* end;
304 } st_str_end_key;
305
306 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)307 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
308 {
309 UChar *p, *q;
310 int c;
311
312 if ((x->end - x->s) != (y->end - y->s))
313 return 1;
314
315 p = x->s;
316 q = y->s;
317 while (p < x->end) {
318 c = (int )*p - (int )*q;
319 if (c != 0) return c;
320
321 p++; q++;
322 }
323
324 return 0;
325 }
326
327 static int
str_end_hash(st_str_end_key * x)328 str_end_hash(st_str_end_key* x)
329 {
330 UChar *p;
331 int val = 0;
332
333 p = x->s;
334 while (p < x->end) {
335 val = val * 997 + (int )*p++;
336 }
337
338 return val + (val >> 5);
339 }
340
341 extern hash_table_type*
onig_st_init_strend_table_with_size(int size)342 onig_st_init_strend_table_with_size(int size)
343 {
344 static struct st_hash_type hashType = {
345 str_end_cmp,
346 str_end_hash,
347 };
348
349 return (hash_table_type* )
350 onig_st_init_table_with_size(&hashType, size);
351 }
352
353 extern int
onig_st_lookup_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type * value)354 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
355 const UChar* end_key, hash_data_type *value)
356 {
357 st_str_end_key key;
358
359 key.s = (UChar* )str_key;
360 key.end = (UChar* )end_key;
361
362 return onig_st_lookup(table, (st_data_t )(&key), value);
363 }
364
365 extern int
onig_st_insert_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type value)366 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
367 const UChar* end_key, hash_data_type value)
368 {
369 st_str_end_key* key;
370 int result;
371
372 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
373 key->s = (UChar* )str_key;
374 key->end = (UChar* )end_key;
375 result = onig_st_insert(table, (st_data_t )key, value);
376 if (result) {
377 xfree(key);
378 }
379 return result;
380 }
381
382 #endif /* USE_ST_LIBRARY */
383
384
385 #ifdef USE_NAMED_GROUP
386
387 #define INIT_NAME_BACKREFS_ALLOC_NUM 8
388
389 typedef struct {
390 UChar* name;
391 int name_len; /* byte length */
392 int back_num; /* number of backrefs */
393 int back_alloc;
394 int back_ref1;
395 int* back_refs;
396 } NameEntry;
397
398 #ifdef USE_ST_LIBRARY
399
400 typedef st_table NameTable;
401 typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */
402
403 #define NAMEBUF_SIZE 24
404 #define NAMEBUF_SIZE_1 25
405
406 #ifdef ONIG_DEBUG
407 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)408 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
409 {
410 int i;
411 FILE* fp = (FILE* )arg;
412
413 fprintf(fp, "%s: ", e->name);
414 if (e->back_num == 0)
415 fputs("-", fp);
416 else if (e->back_num == 1)
417 fprintf(fp, "%d", e->back_ref1);
418 else {
419 for (i = 0; i < e->back_num; i++) {
420 if (i > 0) fprintf(fp, ", ");
421 fprintf(fp, "%d", e->back_refs[i]);
422 }
423 }
424 fputs("\n", fp);
425 return ST_CONTINUE;
426 }
427
428 extern int
onig_print_names(FILE * fp,regex_t * reg)429 onig_print_names(FILE* fp, regex_t* reg)
430 {
431 NameTable* t = (NameTable* )reg->name_table;
432
433 if (IS_NOT_NULL(t)) {
434 fprintf(fp, "name table\n");
435 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
436 fputs("\n", fp);
437 }
438 return 0;
439 }
440 #endif /* ONIG_DEBUG */
441
442 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)443 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
444 {
445 xfree(e->name);
446 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
447 xfree(key);
448 xfree(e);
449 return ST_DELETE;
450 }
451
452 static int
names_clear(regex_t * reg)453 names_clear(regex_t* reg)
454 {
455 NameTable* t = (NameTable* )reg->name_table;
456
457 if (IS_NOT_NULL(t)) {
458 onig_st_foreach(t, i_free_name_entry, 0);
459 }
460 return 0;
461 }
462
463 extern int
onig_names_free(regex_t * reg)464 onig_names_free(regex_t* reg)
465 {
466 int r;
467 NameTable* t;
468
469 r = names_clear(reg);
470 if (r) return r;
471
472 t = (NameTable* )reg->name_table;
473 if (IS_NOT_NULL(t)) onig_st_free_table(t);
474 reg->name_table = (void* )NULL;
475 return 0;
476 }
477
478 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)479 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
480 {
481 NameEntry* e;
482 NameTable* t = (NameTable* )reg->name_table;
483
484 e = (NameEntry* )NULL;
485 if (IS_NOT_NULL(t)) {
486 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
487 }
488 return e;
489 }
490
491 typedef struct {
492 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
493 regex_t* reg;
494 void* arg;
495 int ret;
496 OnigEncoding enc;
497 } INamesArg;
498
499 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)500 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
501 {
502 int r = (*(arg->func))(e->name,
503 e->name + e->name_len,
504 e->back_num,
505 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
506 arg->reg, arg->arg);
507 if (r != 0) {
508 arg->ret = r;
509 return ST_STOP;
510 }
511 return ST_CONTINUE;
512 }
513
514 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)515 onig_foreach_name(regex_t* reg,
516 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
517 {
518 INamesArg narg;
519 NameTable* t = (NameTable* )reg->name_table;
520
521 narg.ret = 0;
522 if (IS_NOT_NULL(t)) {
523 narg.func = func;
524 narg.reg = reg;
525 narg.arg = arg;
526 narg.enc = reg->enc; /* should be pattern encoding. */
527 onig_st_foreach(t, i_names, (HashDataType )&narg);
528 }
529 return narg.ret;
530 }
531
532 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumRemap * map)533 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
534 {
535 int i;
536
537 if (e->back_num > 1) {
538 for (i = 0; i < e->back_num; i++) {
539 e->back_refs[i] = map[e->back_refs[i]].new_val;
540 }
541 }
542 else if (e->back_num == 1) {
543 e->back_ref1 = map[e->back_ref1].new_val;
544 }
545
546 return ST_CONTINUE;
547 }
548
549 extern int
onig_renumber_name_table(regex_t * reg,GroupNumRemap * map)550 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
551 {
552 NameTable* t = (NameTable* )reg->name_table;
553
554 if (IS_NOT_NULL(t)) {
555 onig_st_foreach(t, i_renumber_name, (HashDataType )map);
556 }
557 return 0;
558 }
559
560
561 extern int
onig_number_of_names(regex_t * reg)562 onig_number_of_names(regex_t* reg)
563 {
564 NameTable* t = (NameTable* )reg->name_table;
565
566 if (IS_NOT_NULL(t))
567 return t->num_entries;
568 else
569 return 0;
570 }
571
572 #else /* USE_ST_LIBRARY */
573
574 #define INIT_NAMES_ALLOC_NUM 8
575
576 typedef struct {
577 NameEntry* e;
578 int num;
579 int alloc;
580 } NameTable;
581
582 #ifdef ONIG_DEBUG
583 extern int
onig_print_names(FILE * fp,regex_t * reg)584 onig_print_names(FILE* fp, regex_t* reg)
585 {
586 int i, j;
587 NameEntry* e;
588 NameTable* t = (NameTable* )reg->name_table;
589
590 if (IS_NOT_NULL(t) && t->num > 0) {
591 fprintf(fp, "name table\n");
592 for (i = 0; i < t->num; i++) {
593 e = &(t->e[i]);
594 fprintf(fp, "%s: ", e->name);
595 if (e->back_num == 0) {
596 fputs("-", fp);
597 }
598 else if (e->back_num == 1) {
599 fprintf(fp, "%d", e->back_ref1);
600 }
601 else {
602 for (j = 0; j < e->back_num; j++) {
603 if (j > 0) fprintf(fp, ", ");
604 fprintf(fp, "%d", e->back_refs[j]);
605 }
606 }
607 fputs("\n", fp);
608 }
609 fputs("\n", fp);
610 }
611 return 0;
612 }
613 #endif
614
615 static int
names_clear(regex_t * reg)616 names_clear(regex_t* reg)
617 {
618 int i;
619 NameEntry* e;
620 NameTable* t = (NameTable* )reg->name_table;
621
622 if (IS_NOT_NULL(t)) {
623 for (i = 0; i < t->num; i++) {
624 e = &(t->e[i]);
625 if (IS_NOT_NULL(e->name)) {
626 xfree(e->name);
627 e->name = NULL;
628 e->name_len = 0;
629 e->back_num = 0;
630 e->back_alloc = 0;
631 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
632 e->back_refs = (int* )NULL;
633 }
634 }
635 if (IS_NOT_NULL(t->e)) {
636 xfree(t->e);
637 t->e = NULL;
638 }
639 t->num = 0;
640 }
641 return 0;
642 }
643
644 extern int
onig_names_free(regex_t * reg)645 onig_names_free(regex_t* reg)
646 {
647 int r;
648 NameTable* t;
649
650 r = names_clear(reg);
651 if (r) return r;
652
653 t = (NameTable* )reg->name_table;
654 if (IS_NOT_NULL(t)) xfree(t);
655 reg->name_table = NULL;
656 return 0;
657 }
658
659 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)660 name_find(regex_t* reg, UChar* name, UChar* name_end)
661 {
662 int i, len;
663 NameEntry* e;
664 NameTable* t = (NameTable* )reg->name_table;
665
666 if (IS_NOT_NULL(t)) {
667 len = name_end - name;
668 for (i = 0; i < t->num; i++) {
669 e = &(t->e[i]);
670 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
671 return e;
672 }
673 }
674 return (NameEntry* )NULL;
675 }
676
677 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)678 onig_foreach_name(regex_t* reg,
679 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
680 {
681 int i, r;
682 NameEntry* e;
683 NameTable* t = (NameTable* )reg->name_table;
684
685 if (IS_NOT_NULL(t)) {
686 for (i = 0; i < t->num; i++) {
687 e = &(t->e[i]);
688 r = (*func)(e->name, e->name + e->name_len, e->back_num,
689 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
690 reg, arg);
691 if (r != 0) return r;
692 }
693 }
694 return 0;
695 }
696
697 extern int
onig_number_of_names(regex_t * reg)698 onig_number_of_names(regex_t* reg)
699 {
700 NameTable* t = (NameTable* )reg->name_table;
701
702 if (IS_NOT_NULL(t))
703 return t->num;
704 else
705 return 0;
706 }
707
708 #endif /* else USE_ST_LIBRARY */
709
710 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)711 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
712 {
713 int alloc;
714 NameEntry* e;
715 NameTable* t = (NameTable* )reg->name_table;
716
717 if (name_end - name <= 0)
718 return ONIGERR_EMPTY_GROUP_NAME;
719
720 e = name_find(reg, name, name_end);
721 if (IS_NULL(e)) {
722 #ifdef USE_ST_LIBRARY
723 if (IS_NULL(t)) {
724 t = onig_st_init_strend_table_with_size(5);
725 reg->name_table = (void* )t;
726 }
727 e = (NameEntry* )xmalloc(sizeof(NameEntry));
728 CHECK_NULL_RETURN_MEMERR(e);
729
730 e->name = strdup_with_null(reg->enc, name, name_end);
731 if (IS_NULL(e->name)) {
732 xfree(e); return ONIGERR_MEMORY;
733 }
734 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
735 (HashDataType )e);
736
737 e->name_len = name_end - name;
738 e->back_num = 0;
739 e->back_alloc = 0;
740 e->back_refs = (int* )NULL;
741
742 #else
743
744 if (IS_NULL(t)) {
745 alloc = INIT_NAMES_ALLOC_NUM;
746 t = (NameTable* )xmalloc(sizeof(NameTable));
747 CHECK_NULL_RETURN_MEMERR(t);
748 t->e = NULL;
749 t->alloc = 0;
750 t->num = 0;
751
752 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
753 if (IS_NULL(t->e)) {
754 xfree(t);
755 return ONIGERR_MEMORY;
756 }
757 t->alloc = alloc;
758 reg->name_table = t;
759 goto clear;
760 }
761 else if (t->num == t->alloc) {
762 int i;
763
764 alloc = t->alloc * 2;
765 t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
766 CHECK_NULL_RETURN_MEMERR(t->e);
767 t->alloc = alloc;
768
769 clear:
770 for (i = t->num; i < t->alloc; i++) {
771 t->e[i].name = NULL;
772 t->e[i].name_len = 0;
773 t->e[i].back_num = 0;
774 t->e[i].back_alloc = 0;
775 t->e[i].back_refs = (int* )NULL;
776 }
777 }
778 e = &(t->e[t->num]);
779 t->num++;
780 e->name = strdup_with_null(reg->enc, name, name_end);
781 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
782 e->name_len = name_end - name;
783 #endif
784 }
785
786 if (e->back_num >= 1 &&
787 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
788 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
789 name, name_end);
790 return ONIGERR_MULTIPLEX_DEFINED_NAME;
791 }
792
793 e->back_num++;
794 if (e->back_num == 1) {
795 e->back_ref1 = backref;
796 }
797 else {
798 if (e->back_num == 2) {
799 alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
800 e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
801 CHECK_NULL_RETURN_MEMERR(e->back_refs);
802 e->back_alloc = alloc;
803 e->back_refs[0] = e->back_ref1;
804 e->back_refs[1] = backref;
805 }
806 else {
807 if (e->back_num > e->back_alloc) {
808 alloc = e->back_alloc * 2;
809 e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
810 CHECK_NULL_RETURN_MEMERR(e->back_refs);
811 e->back_alloc = alloc;
812 }
813 e->back_refs[e->back_num - 1] = backref;
814 }
815 }
816
817 return 0;
818 }
819
820 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)821 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
822 const UChar* name_end, int** nums)
823 {
824 NameEntry* e = name_find(reg, name, name_end);
825
826 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
827
828 switch (e->back_num) {
829 case 0:
830 break;
831 case 1:
832 *nums = &(e->back_ref1);
833 break;
834 default:
835 *nums = e->back_refs;
836 break;
837 }
838 return e->back_num;
839 }
840
841 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)842 onig_name_to_backref_number(regex_t* reg, const UChar* name,
843 const UChar* name_end, OnigRegion *region)
844 {
845 int i, n, *nums;
846
847 n = onig_name_to_group_numbers(reg, name, name_end, &nums);
848 if (n < 0)
849 return n;
850 else if (n == 0)
851 return ONIGERR_PARSER_BUG;
852 else if (n == 1)
853 return nums[0];
854 else {
855 if (IS_NOT_NULL(region)) {
856 for (i = n - 1; i >= 0; i--) {
857 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
858 return nums[i];
859 }
860 }
861 return nums[n - 1];
862 }
863 }
864
865 #else /* USE_NAMED_GROUP */
866
867 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)868 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
869 const UChar* name_end, int** nums)
870 {
871 return ONIG_NO_SUPPORT_CONFIG;
872 }
873
874 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)875 onig_name_to_backref_number(regex_t* reg, const UChar* name,
876 const UChar* name_end, OnigRegion* region)
877 {
878 return ONIG_NO_SUPPORT_CONFIG;
879 }
880
881 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)882 onig_foreach_name(regex_t* reg,
883 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
884 {
885 return ONIG_NO_SUPPORT_CONFIG;
886 }
887
888 extern int
onig_number_of_names(regex_t * reg)889 onig_number_of_names(regex_t* reg)
890 {
891 return 0;
892 }
893 #endif /* else USE_NAMED_GROUP */
894
895 extern int
onig_noname_group_capture_is_active(regex_t * reg)896 onig_noname_group_capture_is_active(regex_t* reg)
897 {
898 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
899 return 0;
900
901 #ifdef USE_NAMED_GROUP
902 if (onig_number_of_names(reg) > 0 &&
903 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
904 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
905 return 0;
906 }
907 #endif
908
909 return 1;
910 }
911
912
913 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16
914
915 static void
scan_env_clear(ScanEnv * env)916 scan_env_clear(ScanEnv* env)
917 {
918 int i;
919
920 BIT_STATUS_CLEAR(env->capture_history);
921 BIT_STATUS_CLEAR(env->bt_mem_start);
922 BIT_STATUS_CLEAR(env->bt_mem_end);
923 BIT_STATUS_CLEAR(env->backrefed_mem);
924 env->error = (UChar* )NULL;
925 env->error_end = (UChar* )NULL;
926 env->num_call = 0;
927 env->num_mem = 0;
928 #ifdef USE_NAMED_GROUP
929 env->num_named = 0;
930 #endif
931 env->mem_alloc = 0;
932 env->mem_nodes_dynamic = (Node** )NULL;
933
934 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
935 env->mem_nodes_static[i] = NULL_NODE;
936
937 #ifdef USE_COMBINATION_EXPLOSION_CHECK
938 env->num_comb_exp_check = 0;
939 env->comb_exp_max_regnum = 0;
940 env->curr_max_regnum = 0;
941 env->has_recursion = 0;
942 #endif
943 }
944
945 static int
scan_env_add_mem_entry(ScanEnv * env)946 scan_env_add_mem_entry(ScanEnv* env)
947 {
948 int i, need, alloc;
949 Node** p;
950
951 need = env->num_mem + 1;
952 if (need >= SCANENV_MEMNODES_SIZE) {
953 if (env->mem_alloc <= need) {
954 if (IS_NULL(env->mem_nodes_dynamic)) {
955 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
956 p = (Node** )xmalloc(sizeof(Node*) * alloc);
957 xmemcpy(p, env->mem_nodes_static,
958 sizeof(Node*) * SCANENV_MEMNODES_SIZE);
959 }
960 else {
961 alloc = env->mem_alloc * 2;
962 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
963 }
964 CHECK_NULL_RETURN_MEMERR(p);
965
966 for (i = env->num_mem + 1; i < alloc; i++)
967 p[i] = NULL_NODE;
968
969 env->mem_nodes_dynamic = p;
970 env->mem_alloc = alloc;
971 }
972 }
973
974 env->num_mem++;
975 return env->num_mem;
976 }
977
978 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)979 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
980 {
981 if (env->num_mem >= num)
982 SCANENV_MEM_NODES(env)[num] = node;
983 else
984 return ONIGERR_PARSER_BUG;
985 return 0;
986 }
987
988
989 #ifdef USE_PARSE_TREE_NODE_RECYCLE
990 typedef struct _FreeNode {
991 struct _FreeNode* next;
992 } FreeNode;
993
994 static FreeNode* FreeNodeList = (FreeNode* )NULL;
995 #endif
996
997 extern void
onig_node_free(Node * node)998 onig_node_free(Node* node)
999 {
1000 start:
1001 if (IS_NULL(node)) return ;
1002
1003 switch (NTYPE(node)) {
1004 case NT_STR:
1005 if (NSTR(node)->capa != 0 &&
1006 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1007 xfree(NSTR(node)->s);
1008 }
1009 break;
1010
1011 case NT_LIST:
1012 case NT_ALT:
1013 onig_node_free(NCAR(node));
1014 {
1015 Node* next_node = NCDR(node);
1016
1017 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1018 {
1019 FreeNode* n = (FreeNode* )node;
1020
1021 THREAD_ATOMIC_START;
1022 n->next = FreeNodeList;
1023 FreeNodeList = n;
1024 THREAD_ATOMIC_END;
1025 }
1026 #else
1027 xfree(node);
1028 #endif
1029 node = next_node;
1030 goto start;
1031 }
1032 break;
1033
1034 case NT_CCLASS:
1035 {
1036 CClassNode* cc = NCCLASS(node);
1037
1038 if (IS_NCCLASS_SHARE(cc)) return ;
1039 if (cc->mbuf)
1040 bbuf_free(cc->mbuf);
1041 }
1042 break;
1043
1044 case NT_QTFR:
1045 if (NQTFR(node)->target)
1046 onig_node_free(NQTFR(node)->target);
1047 break;
1048
1049 case NT_ENCLOSE:
1050 if (NENCLOSE(node)->target)
1051 onig_node_free(NENCLOSE(node)->target);
1052 break;
1053
1054 case NT_BREF:
1055 if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1056 xfree(NBREF(node)->back_dynamic);
1057 break;
1058
1059 case NT_ANCHOR:
1060 if (NANCHOR(node)->target)
1061 onig_node_free(NANCHOR(node)->target);
1062 break;
1063 }
1064
1065 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1066 {
1067 FreeNode* n = (FreeNode* )node;
1068
1069 THREAD_ATOMIC_START;
1070 n->next = FreeNodeList;
1071 FreeNodeList = n;
1072 THREAD_ATOMIC_END;
1073 }
1074 #else
1075 xfree(node);
1076 #endif
1077 }
1078
1079 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1080 extern int
onig_free_node_list(void)1081 onig_free_node_list(void)
1082 {
1083 FreeNode* n;
1084
1085 /* THREAD_ATOMIC_START; */
1086 while (IS_NOT_NULL(FreeNodeList)) {
1087 n = FreeNodeList;
1088 FreeNodeList = FreeNodeList->next;
1089 xfree(n);
1090 }
1091 /* THREAD_ATOMIC_END; */
1092 return 0;
1093 }
1094 #endif
1095
1096 static Node*
node_new(void)1097 node_new(void)
1098 {
1099 Node* node;
1100
1101 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1102 THREAD_ATOMIC_START;
1103 if (IS_NOT_NULL(FreeNodeList)) {
1104 node = (Node* )FreeNodeList;
1105 FreeNodeList = FreeNodeList->next;
1106 THREAD_ATOMIC_END;
1107 return node;
1108 }
1109 THREAD_ATOMIC_END;
1110 #endif
1111
1112 node = (Node* )xmalloc(sizeof(Node));
1113 /* xmemset(node, 0, sizeof(Node)); */
1114 return node;
1115 }
1116
1117
1118 static void
initialize_cclass(CClassNode * cc)1119 initialize_cclass(CClassNode* cc)
1120 {
1121 BITSET_CLEAR(cc->bs);
1122 /* cc->base.flags = 0; */
1123 cc->flags = 0;
1124 cc->mbuf = NULL;
1125 }
1126
1127 static Node*
node_new_cclass(void)1128 node_new_cclass(void)
1129 {
1130 Node* node = node_new();
1131 CHECK_NULL_RETURN(node);
1132
1133 SET_NTYPE(node, NT_CCLASS);
1134 initialize_cclass(NCCLASS(node));
1135 return node;
1136 }
1137
1138 static Node*
node_new_cclass_by_codepoint_range(int not,OnigCodePoint sb_out,const OnigCodePoint ranges[])1139 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
1140 const OnigCodePoint ranges[])
1141 {
1142 int n, i;
1143 CClassNode* cc;
1144 OnigCodePoint j;
1145
1146 Node* node = node_new_cclass();
1147 CHECK_NULL_RETURN(node);
1148
1149 cc = NCCLASS(node);
1150 if (not != 0) NCCLASS_SET_NOT(cc);
1151
1152 BITSET_CLEAR(cc->bs);
1153 if (sb_out > 0 && IS_NOT_NULL(ranges)) {
1154 n = ONIGENC_CODE_RANGE_NUM(ranges);
1155 for (i = 0; i < n; i++) {
1156 for (j = ONIGENC_CODE_RANGE_FROM(ranges, i);
1157 j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
1158 if (j >= sb_out) goto sb_end;
1159
1160 BITSET_SET_BIT(cc->bs, j);
1161 }
1162 }
1163 }
1164
1165 sb_end:
1166 if (IS_NULL(ranges)) {
1167 is_null:
1168 cc->mbuf = NULL;
1169 }
1170 else {
1171 BBuf* bbuf;
1172
1173 n = ONIGENC_CODE_RANGE_NUM(ranges);
1174 if (n == 0) goto is_null;
1175
1176 bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1177 CHECK_NULL_RETURN(bbuf);
1178 bbuf->alloc = n + 1;
1179 bbuf->used = n + 1;
1180 bbuf->p = (UChar* )((void* )ranges);
1181
1182 cc->mbuf = bbuf;
1183 }
1184
1185 return node;
1186 }
1187
1188 static Node*
node_new_ctype(int type,int not)1189 node_new_ctype(int type, int not)
1190 {
1191 Node* node = node_new();
1192 CHECK_NULL_RETURN(node);
1193
1194 SET_NTYPE(node, NT_CTYPE);
1195 NCTYPE(node)->ctype = type;
1196 NCTYPE(node)->not = not;
1197 return node;
1198 }
1199
1200 static Node*
node_new_anychar(void)1201 node_new_anychar(void)
1202 {
1203 Node* node = node_new();
1204 CHECK_NULL_RETURN(node);
1205
1206 SET_NTYPE(node, NT_CANY);
1207 return node;
1208 }
1209
1210 static Node*
node_new_list(Node * left,Node * right)1211 node_new_list(Node* left, Node* right)
1212 {
1213 Node* node = node_new();
1214 CHECK_NULL_RETURN(node);
1215
1216 SET_NTYPE(node, NT_LIST);
1217 NCAR(node) = left;
1218 NCDR(node) = right;
1219 return node;
1220 }
1221
1222 extern Node*
onig_node_new_list(Node * left,Node * right)1223 onig_node_new_list(Node* left, Node* right)
1224 {
1225 return node_new_list(left, right);
1226 }
1227
1228 extern Node*
onig_node_list_add(Node * list,Node * x)1229 onig_node_list_add(Node* list, Node* x)
1230 {
1231 Node *n;
1232
1233 n = onig_node_new_list(x, NULL);
1234 if (IS_NULL(n)) return NULL_NODE;
1235
1236 if (IS_NOT_NULL(list)) {
1237 while (IS_NOT_NULL(NCDR(list)))
1238 list = NCDR(list);
1239
1240 NCDR(list) = n;
1241 }
1242
1243 return n;
1244 }
1245
1246 extern Node*
onig_node_new_alt(Node * left,Node * right)1247 onig_node_new_alt(Node* left, Node* right)
1248 {
1249 Node* node = node_new();
1250 CHECK_NULL_RETURN(node);
1251
1252 SET_NTYPE(node, NT_ALT);
1253 NCAR(node) = left;
1254 NCDR(node) = right;
1255 return node;
1256 }
1257
1258 extern Node*
onig_node_new_anchor(int type)1259 onig_node_new_anchor(int type)
1260 {
1261 Node* node = node_new();
1262 CHECK_NULL_RETURN(node);
1263
1264 SET_NTYPE(node, NT_ANCHOR);
1265 NANCHOR(node)->type = type;
1266 NANCHOR(node)->target = NULL;
1267 NANCHOR(node)->char_len = -1;
1268 return node;
1269 }
1270
1271 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)1272 node_new_backref(int back_num, int* backrefs, int by_name,
1273 #ifdef USE_BACKREF_WITH_LEVEL
1274 int exist_level, int nest_level,
1275 #endif
1276 ScanEnv* env)
1277 {
1278 int i;
1279 Node* node = node_new();
1280
1281 CHECK_NULL_RETURN(node);
1282
1283 SET_NTYPE(node, NT_BREF);
1284 NBREF(node)->state = 0;
1285 NBREF(node)->back_num = back_num;
1286 NBREF(node)->back_dynamic = (int* )NULL;
1287 if (by_name != 0)
1288 NBREF(node)->state |= NST_NAME_REF;
1289
1290 #ifdef USE_BACKREF_WITH_LEVEL
1291 if (exist_level != 0) {
1292 NBREF(node)->state |= NST_NEST_LEVEL;
1293 NBREF(node)->nest_level = nest_level;
1294 }
1295 #endif
1296
1297 for (i = 0; i < back_num; i++) {
1298 if (backrefs[i] <= env->num_mem &&
1299 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1300 NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */
1301 break;
1302 }
1303 }
1304
1305 if (back_num <= NODE_BACKREFS_SIZE) {
1306 for (i = 0; i < back_num; i++)
1307 NBREF(node)->back_static[i] = backrefs[i];
1308 }
1309 else {
1310 int* p = (int* )xmalloc(sizeof(int) * back_num);
1311 if (IS_NULL(p)) {
1312 onig_node_free(node);
1313 return NULL;
1314 }
1315 NBREF(node)->back_dynamic = p;
1316 for (i = 0; i < back_num; i++)
1317 p[i] = backrefs[i];
1318 }
1319 return node;
1320 }
1321
#ifdef USE_SUBEXP_CALL
/* Build a subexpression-call node (NT_CALL).
 *   name, name_end: stored as given (the pointers are not copied).
 *   gnum:           group number; the call is by number when gnum != 0.
 * The call target starts unresolved (NULL_NODE).
 * Returns the new node; returns early when node_new() yields NULL.
 */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
  Node* call;

  call = node_new();
  CHECK_NULL_RETURN(call);

  SET_NTYPE(call, NT_CALL);
  NCALL(call)->group_num = gnum;     /* call by number if gnum != 0 */
  NCALL(call)->name_end  = name_end;
  NCALL(call)->name      = name;
  NCALL(call)->target    = NULL_NODE;
  NCALL(call)->state     = 0;
  return call;
}
#endif
1338
1339 static Node*
node_new_quantifier(int lower,int upper,int by_number)1340 node_new_quantifier(int lower, int upper, int by_number)
1341 {
1342 Node* node = node_new();
1343 CHECK_NULL_RETURN(node);
1344
1345 SET_NTYPE(node, NT_QTFR);
1346 NQTFR(node)->state = 0;
1347 NQTFR(node)->target = NULL;
1348 NQTFR(node)->lower = lower;
1349 NQTFR(node)->upper = upper;
1350 NQTFR(node)->greedy = 1;
1351 NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1352 NQTFR(node)->head_exact = NULL_NODE;
1353 NQTFR(node)->next_head_exact = NULL_NODE;
1354 NQTFR(node)->is_refered = 0;
1355 if (by_number != 0)
1356 NQTFR(node)->state |= NST_BY_NUMBER;
1357
1358 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1359 NQTFR(node)->comb_exp_check_num = 0;
1360 #endif
1361
1362 return node;
1363 }
1364
1365 static Node*
node_new_enclose(int type)1366 node_new_enclose(int type)
1367 {
1368 Node* node = node_new();
1369 CHECK_NULL_RETURN(node);
1370
1371 SET_NTYPE(node, NT_ENCLOSE);
1372 NENCLOSE(node)->type = type;
1373 NENCLOSE(node)->state = 0;
1374 NENCLOSE(node)->regnum = 0;
1375 NENCLOSE(node)->option = 0;
1376 NENCLOSE(node)->target = NULL;
1377 NENCLOSE(node)->call_addr = -1;
1378 NENCLOSE(node)->opt_count = 0;
1379 return node;
1380 }
1381
1382 extern Node*
onig_node_new_enclose(int type)1383 onig_node_new_enclose(int type)
1384 {
1385 return node_new_enclose(type);
1386 }
1387
1388 static Node*
node_new_enclose_memory(OnigOptionType option,int is_named)1389 node_new_enclose_memory(OnigOptionType option, int is_named)
1390 {
1391 Node* node = node_new_enclose(ENCLOSE_MEMORY);
1392 CHECK_NULL_RETURN(node);
1393 if (is_named != 0)
1394 SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1395
1396 #ifdef USE_SUBEXP_CALL
1397 NENCLOSE(node)->option = option;
1398 #endif
1399 return node;
1400 }
1401
1402 static Node*
node_new_option(OnigOptionType option)1403 node_new_option(OnigOptionType option)
1404 {
1405 Node* node = node_new_enclose(ENCLOSE_OPTION);
1406 CHECK_NULL_RETURN(node);
1407 NENCLOSE(node)->option = option;
1408 return node;
1409 }
1410
1411 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)1412 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1413 {
1414 int addlen = end - s;
1415
1416 if (addlen > 0) {
1417 int len = NSTR(node)->end - NSTR(node)->s;
1418
1419 if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1420 UChar* p;
1421 int capa = len + addlen + NODE_STR_MARGIN;
1422
1423 if (capa <= NSTR(node)->capa) {
1424 onig_strcpy(NSTR(node)->s + len, s, end);
1425 }
1426 else {
1427 if (NSTR(node)->s == NSTR(node)->buf)
1428 p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1429 s, end, capa);
1430 else
1431 p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
1432
1433 CHECK_NULL_RETURN_MEMERR(p);
1434 NSTR(node)->s = p;
1435 NSTR(node)->capa = capa;
1436 }
1437 }
1438 else {
1439 onig_strcpy(NSTR(node)->s + len, s, end);
1440 }
1441 NSTR(node)->end = NSTR(node)->s + len + addlen;
1442 }
1443
1444 return 0;
1445 }
1446
1447 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end)1448 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1449 {
1450 onig_node_str_clear(node);
1451 return onig_node_str_cat(node, s, end);
1452 }
1453
1454 static int
node_str_cat_char(Node * node,UChar c)1455 node_str_cat_char(Node* node, UChar c)
1456 {
1457 UChar s[1];
1458
1459 s[0] = c;
1460 return onig_node_str_cat(node, s, s + 1);
1461 }
1462
1463 extern void
onig_node_conv_to_str_node(Node * node,int flag)1464 onig_node_conv_to_str_node(Node* node, int flag)
1465 {
1466 SET_NTYPE(node, NT_STR);
1467 NSTR(node)->flag = flag;
1468 NSTR(node)->capa = 0;
1469 NSTR(node)->s = NSTR(node)->buf;
1470 NSTR(node)->end = NSTR(node)->buf;
1471 }
1472
1473 extern void
onig_node_str_clear(Node * node)1474 onig_node_str_clear(Node* node)
1475 {
1476 if (NSTR(node)->capa != 0 &&
1477 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1478 xfree(NSTR(node)->s);
1479 }
1480
1481 NSTR(node)->capa = 0;
1482 NSTR(node)->flag = 0;
1483 NSTR(node)->s = NSTR(node)->buf;
1484 NSTR(node)->end = NSTR(node)->buf;
1485 }
1486
1487 static Node*
node_new_str(const UChar * s,const UChar * end)1488 node_new_str(const UChar* s, const UChar* end)
1489 {
1490 Node* node = node_new();
1491 CHECK_NULL_RETURN(node);
1492
1493 SET_NTYPE(node, NT_STR);
1494 NSTR(node)->capa = 0;
1495 NSTR(node)->flag = 0;
1496 NSTR(node)->s = NSTR(node)->buf;
1497 NSTR(node)->end = NSTR(node)->buf;
1498 if (onig_node_str_cat(node, s, end)) {
1499 onig_node_free(node);
1500 return NULL;
1501 }
1502 return node;
1503 }
1504
1505 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)1506 onig_node_new_str(const UChar* s, const UChar* end)
1507 {
1508 return node_new_str(s, end);
1509 }
1510
1511 static Node*
node_new_str_raw(UChar * s,UChar * end)1512 node_new_str_raw(UChar* s, UChar* end)
1513 {
1514 Node* node = node_new_str(s, end);
1515 NSTRING_SET_RAW(node);
1516 return node;
1517 }
1518
1519 static Node*
node_new_empty(void)1520 node_new_empty(void)
1521 {
1522 return node_new_str(NULL, NULL);
1523 }
1524
1525 static Node*
node_new_str_raw_char(UChar c)1526 node_new_str_raw_char(UChar c)
1527 {
1528 UChar p[1];
1529
1530 p[0] = c;
1531 return node_new_str_raw(p, p + 1);
1532 }
1533
1534 static Node*
str_node_split_last_char(StrNode * sn,OnigEncoding enc)1535 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1536 {
1537 const UChar *p;
1538 Node* n = NULL_NODE;
1539
1540 if (sn->end > sn->s) {
1541 p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1542 if (p && p > sn->s) { /* can be splitted. */
1543 n = node_new_str(p, sn->end);
1544 if ((sn->flag & NSTR_RAW) != 0)
1545 NSTRING_SET_RAW(n);
1546 sn->end = (UChar* )p;
1547 }
1548 }
1549 return n;
1550 }
1551
1552 static int
str_node_can_be_split(StrNode * sn,OnigEncoding enc)1553 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1554 {
1555 if (sn->end > sn->s) {
1556 return ((enclen(enc, sn->s) < sn->end - sn->s) ? 1 : 0);
1557 }
1558 return 0;
1559 }
1560
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/* Insert `num` bytes of value `val` in front of the string `sn`.
 * The current contents are saved in a stack buffer, copied back `num`
 * bytes further on, and the freed head is filled with `val`.
 *
 * Returns 0. The function is declared int but previously fell off its end
 * without returning a value.
 *
 * NOTE(review): the temporary buffer is NODE_STR_BUF_SIZE bytes and the
 * destination is not grown here, so this presumably relies on the caller
 * passing a short string with room for `num` more bytes -- confirm at the
 * call sites.
 */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i, len;

  len = sn->end - sn->s;
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }
  return 0;
}
#endif
1578
1579 extern int
onig_scan_unsigned_number(UChar ** src,const UChar * end,OnigEncoding enc)1580 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1581 {
1582 unsigned int num, val;
1583 OnigCodePoint c;
1584 UChar* p = *src;
1585 PFETCH_READY;
1586
1587 num = 0;
1588 while (!PEND) {
1589 PFETCH(c);
1590 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1591 val = (unsigned int )DIGITVAL(c);
1592 if ((INT_MAX_LIMIT - val) / 10UL < num)
1593 return -1; /* overflow */
1594
1595 num = num * 10 + val;
1596 }
1597 else {
1598 PUNFETCH;
1599 break;
1600 }
1601 }
1602 *src = p;
1603 return num;
1604 }
1605
1606 static int
scan_unsigned_hexadecimal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1607 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1608 OnigEncoding enc)
1609 {
1610 OnigCodePoint c;
1611 unsigned int num, val;
1612 UChar* p = *src;
1613 PFETCH_READY;
1614
1615 num = 0;
1616 while (!PEND && maxlen-- != 0) {
1617 PFETCH(c);
1618 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1619 val = (unsigned int )XDIGITVAL(enc,c);
1620 if ((INT_MAX_LIMIT - val) / 16UL < num)
1621 return -1; /* overflow */
1622
1623 num = (num << 4) + XDIGITVAL(enc,c);
1624 }
1625 else {
1626 PUNFETCH;
1627 break;
1628 }
1629 }
1630 *src = p;
1631 return num;
1632 }
1633
1634 static int
scan_unsigned_octal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1635 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1636 OnigEncoding enc)
1637 {
1638 OnigCodePoint c;
1639 unsigned int num, val;
1640 UChar* p = *src;
1641 PFETCH_READY;
1642
1643 num = 0;
1644 while (!PEND && maxlen-- != 0) {
1645 PFETCH(c);
1646 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1647 val = ODIGITVAL(c);
1648 if ((INT_MAX_LIMIT - val) / 8UL < num)
1649 return -1; /* overflow */
1650
1651 num = (num << 3) + val;
1652 }
1653 else {
1654 PUNFETCH;
1655 break;
1656 }
1657 }
1658 *src = p;
1659 return num;
1660 }
1661
1662
1663 #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
1664 BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1665
1666 /* data format:
1667 [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1668 (all data size is OnigCodePoint)
1669 */
1670 static int
new_code_range(BBuf ** pbuf)1671 new_code_range(BBuf** pbuf)
1672 {
1673 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
1674 int r;
1675 OnigCodePoint n;
1676 BBuf* bbuf;
1677
1678 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1679 CHECK_NULL_RETURN_MEMERR(*pbuf);
1680 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1681 if (r) return r;
1682
1683 n = 0;
1684 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1685 return 0;
1686 }
1687
1688 static int
add_code_range_to_buf(BBuf ** pbuf,OnigCodePoint from,OnigCodePoint to)1689 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
1690 {
1691 int r, inc_n, pos;
1692 int low, high, bound, x;
1693 OnigCodePoint n, *data;
1694 BBuf* bbuf;
1695
1696 if (from > to) {
1697 n = from; from = to; to = n;
1698 }
1699
1700 if (IS_NULL(*pbuf)) {
1701 r = new_code_range(pbuf);
1702 if (r) return r;
1703 bbuf = *pbuf;
1704 n = 0;
1705 }
1706 else {
1707 bbuf = *pbuf;
1708 GET_CODE_POINT(n, bbuf->p);
1709 }
1710 data = (OnigCodePoint* )(bbuf->p);
1711 data++;
1712
1713 for (low = 0, bound = n; low < bound; ) {
1714 x = (low + bound) >> 1;
1715 if (from > data[x*2 + 1])
1716 low = x + 1;
1717 else
1718 bound = x;
1719 }
1720
1721 for (high = low, bound = n; high < bound; ) {
1722 x = (high + bound) >> 1;
1723 if (to >= data[x*2] - 1)
1724 high = x + 1;
1725 else
1726 bound = x;
1727 }
1728
1729 inc_n = low + 1 - high;
1730 if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1731 return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1732
1733 if (inc_n != 1) {
1734 if (from > data[low*2])
1735 from = data[low*2];
1736 if (to < data[(high - 1)*2 + 1])
1737 to = data[(high - 1)*2 + 1];
1738 }
1739
1740 if (inc_n != 0 && (OnigCodePoint )high < n) {
1741 int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1742 int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1743 int size = (n - high) * 2 * SIZE_CODE_POINT;
1744
1745 if (inc_n > 0) {
1746 BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1747 }
1748 else {
1749 BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1750 }
1751 }
1752
1753 pos = SIZE_CODE_POINT * (1 + low * 2);
1754 BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1755 BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1756 BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1757 n += inc_n;
1758 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1759
1760 return 0;
1761 }
1762
1763 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)1764 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1765 {
1766 if (from > to) {
1767 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1768 return 0;
1769 else
1770 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1771 }
1772
1773 return add_code_range_to_buf(pbuf, from, to);
1774 }
1775
1776 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)1777 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1778 {
1779 int r, i, n;
1780 OnigCodePoint pre, from, *data, to = 0;
1781
1782 *pbuf = (BBuf* )NULL;
1783 if (IS_NULL(bbuf)) {
1784 set_all:
1785 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1786 }
1787
1788 data = (OnigCodePoint* )(bbuf->p);
1789 GET_CODE_POINT(n, data);
1790 data++;
1791 if (n <= 0) goto set_all;
1792
1793 r = 0;
1794 pre = MBCODE_START_POS(enc);
1795 for (i = 0; i < n; i++) {
1796 from = data[i*2];
1797 to = data[i*2+1];
1798 if (pre <= from - 1) {
1799 r = add_code_range_to_buf(pbuf, pre, from - 1);
1800 if (r != 0) return r;
1801 }
1802 if (to == ~((OnigCodePoint )0)) break;
1803 pre = to + 1;
1804 }
1805 if (to < ~((OnigCodePoint )0)) {
1806 r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1807 }
1808 return r;
1809 }
1810
/* Exchange two (buffer, negation flag) pairs in place.
   All four arguments must be assignable variables. */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *swap_buf_ = bbuf1; \
  int   swap_not_ = not1; \
  bbuf1 = bbuf2; bbuf2 = swap_buf_; \
  not1  = not2;  not2  = swap_not_; \
} while (0)
1817
1818 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1819 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1820 BBuf* bbuf2, int not2, BBuf** pbuf)
1821 {
1822 int r;
1823 OnigCodePoint i, n1, *data1;
1824 OnigCodePoint from, to;
1825
1826 *pbuf = (BBuf* )NULL;
1827 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1828 if (not1 != 0 || not2 != 0)
1829 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1830 return 0;
1831 }
1832
1833 r = 0;
1834 if (IS_NULL(bbuf2))
1835 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1836
1837 if (IS_NULL(bbuf1)) {
1838 if (not1 != 0) {
1839 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1840 }
1841 else {
1842 if (not2 == 0) {
1843 return bbuf_clone(pbuf, bbuf2);
1844 }
1845 else {
1846 return not_code_range_buf(enc, bbuf2, pbuf);
1847 }
1848 }
1849 }
1850
1851 if (not1 != 0)
1852 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1853
1854 data1 = (OnigCodePoint* )(bbuf1->p);
1855 GET_CODE_POINT(n1, data1);
1856 data1++;
1857
1858 if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1859 r = bbuf_clone(pbuf, bbuf2);
1860 }
1861 else if (not1 == 0) { /* 1 OR (not 2) */
1862 r = not_code_range_buf(enc, bbuf2, pbuf);
1863 }
1864 if (r != 0) return r;
1865
1866 for (i = 0; i < n1; i++) {
1867 from = data1[i*2];
1868 to = data1[i*2+1];
1869 r = add_code_range_to_buf(pbuf, from, to);
1870 if (r != 0) return r;
1871 }
1872 return 0;
1873 }
1874
1875 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)1876 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1877 OnigCodePoint* data, int n)
1878 {
1879 int i, r;
1880 OnigCodePoint from2, to2;
1881
1882 for (i = 0; i < n; i++) {
1883 from2 = data[i*2];
1884 to2 = data[i*2+1];
1885 if (from2 < from1) {
1886 if (to2 < from1) continue;
1887 else {
1888 from1 = to2 + 1;
1889 }
1890 }
1891 else if (from2 <= to1) {
1892 if (to2 < to1) {
1893 if (from1 <= from2 - 1) {
1894 r = add_code_range_to_buf(pbuf, from1, from2-1);
1895 if (r != 0) return r;
1896 }
1897 from1 = to2 + 1;
1898 }
1899 else {
1900 to1 = from2 - 1;
1901 }
1902 }
1903 else {
1904 from1 = from2;
1905 }
1906 if (from1 > to1) break;
1907 }
1908 if (from1 <= to1) {
1909 r = add_code_range_to_buf(pbuf, from1, to1);
1910 if (r != 0) return r;
1911 }
1912 return 0;
1913 }
1914
1915 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1916 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1917 {
1918 int r;
1919 OnigCodePoint i, j, n1, n2, *data1, *data2;
1920 OnigCodePoint from, to, from1, to1, from2, to2;
1921
1922 *pbuf = (BBuf* )NULL;
1923 if (IS_NULL(bbuf1)) {
1924 if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1925 return bbuf_clone(pbuf, bbuf2);
1926 return 0;
1927 }
1928 else if (IS_NULL(bbuf2)) {
1929 if (not2 != 0)
1930 return bbuf_clone(pbuf, bbuf1);
1931 return 0;
1932 }
1933
1934 if (not1 != 0)
1935 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1936
1937 data1 = (OnigCodePoint* )(bbuf1->p);
1938 data2 = (OnigCodePoint* )(bbuf2->p);
1939 GET_CODE_POINT(n1, data1);
1940 GET_CODE_POINT(n2, data2);
1941 data1++;
1942 data2++;
1943
1944 if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1945 for (i = 0; i < n1; i++) {
1946 from1 = data1[i*2];
1947 to1 = data1[i*2+1];
1948 for (j = 0; j < n2; j++) {
1949 from2 = data2[j*2];
1950 to2 = data2[j*2+1];
1951 if (from2 > to1) break;
1952 if (to2 < from1) continue;
1953 from = MAX(from1, from2);
1954 to = MIN(to1, to2);
1955 r = add_code_range_to_buf(pbuf, from, to);
1956 if (r != 0) return r;
1957 }
1958 }
1959 }
1960 else if (not1 == 0) { /* 1 AND (not 2) */
1961 for (i = 0; i < n1; i++) {
1962 from1 = data1[i*2];
1963 to1 = data1[i*2+1];
1964 r = and_code_range1(pbuf, from1, to1, data2, n2);
1965 if (r != 0) return r;
1966 }
1967 }
1968
1969 return 0;
1970 }
1971
1972 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1973 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1974 {
1975 int r, not1, not2;
1976 BBuf *buf1, *buf2, *pbuf;
1977 BitSetRef bsr1, bsr2;
1978 BitSet bs1, bs2;
1979
1980 not1 = IS_NCCLASS_NOT(dest);
1981 bsr1 = dest->bs;
1982 buf1 = dest->mbuf;
1983 not2 = IS_NCCLASS_NOT(cc);
1984 bsr2 = cc->bs;
1985 buf2 = cc->mbuf;
1986
1987 if (not1 != 0) {
1988 bitset_invert_to(bsr1, bs1);
1989 bsr1 = bs1;
1990 }
1991 if (not2 != 0) {
1992 bitset_invert_to(bsr2, bs2);
1993 bsr2 = bs2;
1994 }
1995 bitset_and(bsr1, bsr2);
1996 if (bsr1 != dest->bs) {
1997 bitset_copy(dest->bs, bsr1);
1998 bsr1 = dest->bs;
1999 }
2000 if (not1 != 0) {
2001 bitset_invert(dest->bs);
2002 }
2003
2004 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2005 if (not1 != 0 && not2 != 0) {
2006 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
2007 }
2008 else {
2009 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
2010 if (r == 0 && not1 != 0) {
2011 BBuf *tbuf;
2012 r = not_code_range_buf(enc, pbuf, &tbuf);
2013 if (r != 0) {
2014 bbuf_free(pbuf);
2015 return r;
2016 }
2017 bbuf_free(pbuf);
2018 pbuf = tbuf;
2019 }
2020 }
2021 if (r != 0) return r;
2022
2023 dest->mbuf = pbuf;
2024 bbuf_free(buf1);
2025 return r;
2026 }
2027 return 0;
2028 }
2029
2030 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)2031 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2032 {
2033 int r, not1, not2;
2034 BBuf *buf1, *buf2, *pbuf;
2035 BitSetRef bsr1, bsr2;
2036 BitSet bs1, bs2;
2037
2038 not1 = IS_NCCLASS_NOT(dest);
2039 bsr1 = dest->bs;
2040 buf1 = dest->mbuf;
2041 not2 = IS_NCCLASS_NOT(cc);
2042 bsr2 = cc->bs;
2043 buf2 = cc->mbuf;
2044
2045 if (not1 != 0) {
2046 bitset_invert_to(bsr1, bs1);
2047 bsr1 = bs1;
2048 }
2049 if (not2 != 0) {
2050 bitset_invert_to(bsr2, bs2);
2051 bsr2 = bs2;
2052 }
2053 bitset_or(bsr1, bsr2);
2054 if (bsr1 != dest->bs) {
2055 bitset_copy(dest->bs, bsr1);
2056 bsr1 = dest->bs;
2057 }
2058 if (not1 != 0) {
2059 bitset_invert(dest->bs);
2060 }
2061
2062 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2063 if (not1 != 0 && not2 != 0) {
2064 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2065 }
2066 else {
2067 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2068 if (r == 0 && not1 != 0) {
2069 BBuf *tbuf;
2070 r = not_code_range_buf(enc, pbuf, &tbuf);
2071 if (r != 0) {
2072 bbuf_free(pbuf);
2073 return r;
2074 }
2075 bbuf_free(pbuf);
2076 pbuf = tbuf;
2077 }
2078 }
2079 if (r != 0) return r;
2080
2081 dest->mbuf = pbuf;
2082 bbuf_free(buf1);
2083 return r;
2084 }
2085 else
2086 return 0;
2087 }
2088
2089 static int
conv_backslash_value(int c,ScanEnv * env)2090 conv_backslash_value(int c, ScanEnv* env)
2091 {
2092 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2093 switch (c) {
2094 case 'n': return '\n';
2095 case 't': return '\t';
2096 case 'r': return '\r';
2097 case 'f': return '\f';
2098 case 'a': return '\007';
2099 case 'b': return '\010';
2100 case 'e': return '\033';
2101 case 'v':
2102 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2103 return '\v';
2104 break;
2105
2106 default:
2107 break;
2108 }
2109 }
2110 return c;
2111 }
2112
2113 static int
is_invalid_quantifier_target(Node * node)2114 is_invalid_quantifier_target(Node* node)
2115 {
2116 switch (NTYPE(node)) {
2117 case NT_ANCHOR:
2118 return 1;
2119 break;
2120
2121 case NT_ENCLOSE:
2122 /* allow enclosed elements */
2123 /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
2124 break;
2125
2126 case NT_LIST:
2127 do {
2128 if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2129 } while (IS_NOT_NULL(node = NCDR(node)));
2130 return 0;
2131 break;
2132
2133 case NT_ALT:
2134 do {
2135 if (is_invalid_quantifier_target(NCAR(node))) return 1;
2136 } while (IS_NOT_NULL(node = NCDR(node)));
2137 break;
2138
2139 default:
2140 break;
2141 }
2142 return 0;
2143 }
2144
2145 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2146 static int
popular_quantifier_num(QtfrNode * q)2147 popular_quantifier_num(QtfrNode* q)
2148 {
2149 if (q->greedy) {
2150 if (q->lower == 0) {
2151 if (q->upper == 1) return 0;
2152 else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2153 }
2154 else if (q->lower == 1) {
2155 if (IS_REPEAT_INFINITE(q->upper)) return 2;
2156 }
2157 }
2158 else {
2159 if (q->lower == 0) {
2160 if (q->upper == 1) return 3;
2161 else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2162 }
2163 else if (q->lower == 1) {
2164 if (IS_REPEAT_INFINITE(q->upper)) return 5;
2165 }
2166 }
2167 return -1;
2168 }
2169
2170
/* How a quantifier applied directly to another quantifier is rewritten. */
enum ReduceType {
  RQ_ASIS = 0, /* as is */
  RQ_DEL  = 1, /* delete parent */
  RQ_A    = 2, /* to '*'    */
  RQ_AQ   = 3, /* to '*?'   */
  RQ_QQ   = 4, /* to '??'   */
  RQ_P_QQ = 5, /* to '+)??' */
  RQ_PQ_Q = 6  /* to '+?)?' */
};

/* Indexed as [child][parent] with the numbers from popular_quantifier_num():
   0:'?'  1:'*'  2:'+'  3:'??'  4:'*?'  5:'+?' */
static enum ReduceType ReduceTypeTable[6][6] = {
  /* child '?'  */ { RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS },
  /* child '*'  */ { RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL  },
  /* child '+'  */ { RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL  },
  /* child '??' */ { RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ   },
  /* child '*?' */ { RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL  },
  /* child '+?' */ { RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL  }
};
2189
2190 extern void
onig_reduce_nested_quantifier(Node * pnode,Node * cnode)2191 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2192 {
2193 int pnum, cnum;
2194 QtfrNode *p, *c;
2195
2196 p = NQTFR(pnode);
2197 c = NQTFR(cnode);
2198 pnum = popular_quantifier_num(p);
2199 cnum = popular_quantifier_num(c);
2200 if (pnum < 0 || cnum < 0) return ;
2201
2202 switch(ReduceTypeTable[cnum][pnum]) {
2203 case RQ_DEL:
2204 *pnode = *cnode;
2205 break;
2206 case RQ_A:
2207 p->target = c->target;
2208 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
2209 break;
2210 case RQ_AQ:
2211 p->target = c->target;
2212 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
2213 break;
2214 case RQ_QQ:
2215 p->target = c->target;
2216 p->lower = 0; p->upper = 1; p->greedy = 0;
2217 break;
2218 case RQ_P_QQ:
2219 p->target = cnode;
2220 p->lower = 0; p->upper = 1; p->greedy = 0;
2221 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
2222 return ;
2223 break;
2224 case RQ_PQ_Q:
2225 p->target = cnode;
2226 p->lower = 0; p->upper = 1; p->greedy = 1;
2227 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
2228 return ;
2229 break;
2230 case RQ_ASIS:
2231 p->target = cnode;
2232 return ;
2233 break;
2234 }
2235
2236 c->target = NULL_NODE;
2237 onig_node_free(cnode);
2238 }
2239
2240
/* Token kinds stored in OnigToken.type. */
enum TokenSyms {
  TK_EOT              = 0,  /* end of token */
  TK_RAW_BYTE         = 1,
  TK_CHAR             = 2,
  TK_STRING           = 3,
  TK_CODE_POINT       = 4,
  TK_ANYCHAR          = 5,
  TK_CHAR_TYPE        = 6,
  TK_BACKREF          = 7,
  TK_CALL             = 8,
  TK_ANCHOR           = 9,
  TK_OP_REPEAT        = 10,
  TK_INTERVAL         = 11,
  TK_ANYCHAR_ANYTIME  = 12, /* SQL '%' == .* */
  TK_ALT              = 13,
  TK_SUBEXP_OPEN      = 14,
  TK_SUBEXP_CLOSE     = 15,
  TK_CC_OPEN          = 16,
  TK_QUOTE_OPEN       = 17,
  TK_CHAR_PROPERTY    = 18, /* \p{...}, \P{...} */
  /* in cc */
  TK_CC_CLOSE           = 19,
  TK_CC_RANGE           = 20,
  TK_POSIX_BRACKET_OPEN = 21,
  TK_CC_AND             = 22, /* && */
  TK_CC_CC_OPEN         = 23  /* [ */
};
2268
2269 typedef struct {
2270 enum TokenSyms type;
2271 int escaped;
2272 int base; /* is number: 8, 16 (used in [....]) */
2273 UChar* backp;
2274 union {
2275 UChar* s;
2276 int c;
2277 OnigCodePoint code;
2278 int anchor;
2279 int subtype;
2280 struct {
2281 int lower;
2282 int upper;
2283 int greedy;
2284 int possessive;
2285 } repeat;
2286 struct {
2287 int num;
2288 int ref1;
2289 int* refs;
2290 int by_name;
2291 #ifdef USE_BACKREF_WITH_LEVEL
2292 int exist_level;
2293 int level; /* \k<name+n> */
2294 #endif
2295 } backref;
2296 struct {
2297 UChar* name;
2298 UChar* name_end;
2299 int gnum;
2300 } call;
2301 struct {
2302 int ctype;
2303 int not;
2304 } prop;
2305 } u;
2306 } OnigToken;
2307
2308
2309 static int
fetch_range_quantifier(UChar ** src,UChar * end,OnigToken * tok,ScanEnv * env)2310 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2311 {
2312 int low, up, syn_allow, non_low = 0;
2313 int r = 0;
2314 OnigCodePoint c;
2315 OnigEncoding enc = env->enc;
2316 UChar* p = *src;
2317 PFETCH_READY;
2318
2319 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2320
2321 if (PEND) {
2322 if (syn_allow)
2323 return 1; /* "....{" : OK! */
2324 else
2325 return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */
2326 }
2327
2328 if (! syn_allow) {
2329 c = PPEEK;
2330 if (c == ')' || c == '(' || c == '|') {
2331 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2332 }
2333 }
2334
2335 low = onig_scan_unsigned_number(&p, end, env->enc);
2336 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2337 if (low > ONIG_MAX_REPEAT_NUM)
2338 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2339
2340 if (p == *src) { /* can't read low */
2341 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2342 /* allow {,n} as {0,n} */
2343 low = 0;
2344 non_low = 1;
2345 }
2346 else
2347 goto invalid;
2348 }
2349
2350 if (PEND) goto invalid;
2351 PFETCH(c);
2352 if (c == ',') {
2353 UChar* prev = p;
2354 up = onig_scan_unsigned_number(&p, end, env->enc);
2355 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2356 if (up > ONIG_MAX_REPEAT_NUM)
2357 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2358
2359 if (p == prev) {
2360 if (non_low != 0)
2361 goto invalid;
2362 up = REPEAT_INFINITE; /* {n,} : {n,infinite} */
2363 }
2364 }
2365 else {
2366 if (non_low != 0)
2367 goto invalid;
2368
2369 PUNFETCH;
2370 up = low; /* {n} : exact n times */
2371 r = 2; /* fixed */
2372 }
2373
2374 if (PEND) goto invalid;
2375 PFETCH(c);
2376 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2377 if (c != MC_ESC(env->syntax)) goto invalid;
2378 PFETCH(c);
2379 }
2380 if (c != '}') goto invalid;
2381
2382 if (!IS_REPEAT_INFINITE(up) && low > up) {
2383 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2384 }
2385
2386 tok->type = TK_INTERVAL;
2387 tok->u.repeat.lower = low;
2388 tok->u.repeat.upper = up;
2389 *src = p;
2390 return r; /* 0: normal {n,m}, 2: fixed {n} */
2391
2392 invalid:
2393 if (syn_allow)
2394 return 1; /* OK */
2395 else
2396 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2397 }
2398
2399 /* \M-, \C-, \c, or \... */
2400 static int
fetch_escaped_value(UChar ** src,UChar * end,ScanEnv * env)2401 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
2402 {
2403 int v;
2404 OnigCodePoint c;
2405 OnigEncoding enc = env->enc;
2406 UChar* p = *src;
2407 PFETCH_READY;
2408
2409 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2410
2411 PFETCH(c);
2412 switch (c) {
2413 case 'M':
2414 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2415 if (PEND) return ONIGERR_END_PATTERN_AT_META;
2416 PFETCH(c);
2417 if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2418 if (PEND) return ONIGERR_END_PATTERN_AT_META;
2419 PFETCH(c);
2420 if (c == MC_ESC(env->syntax)) {
2421 v = fetch_escaped_value(&p, end, env);
2422 if (v < 0) return v;
2423 c = (OnigCodePoint )v;
2424 }
2425 c = ((c & 0xff) | 0x80);
2426 }
2427 else
2428 goto backslash;
2429 break;
2430
2431 case 'C':
2432 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2433 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2434 PFETCH(c);
2435 if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2436 goto control;
2437 }
2438 else
2439 goto backslash;
2440
2441 case 'c':
2442 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2443 control:
2444 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2445 PFETCH(c);
2446 if (c == '?') {
2447 c = 0177;
2448 }
2449 else {
2450 if (c == MC_ESC(env->syntax)) {
2451 v = fetch_escaped_value(&p, end, env);
2452 if (v < 0) return v;
2453 c = (OnigCodePoint )v;
2454 }
2455 c &= 0x9f;
2456 }
2457 break;
2458 }
2459 /* fall through */
2460
2461 default:
2462 {
2463 backslash:
2464 c = conv_backslash_value(c, env);
2465 }
2466 break;
2467 }
2468
2469 *src = p;
2470 return c;
2471 }
2472
2473 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2474
2475 static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)2476 get_name_end_code_point(OnigCodePoint start)
2477 {
2478 switch (start) {
2479 case '<': return (OnigCodePoint )'>'; break;
2480 case '\'': return (OnigCodePoint )'\''; break;
2481 default:
2482 break;
2483 }
2484
2485 return (OnigCodePoint )0;
2486 }
2487
2488 #ifdef USE_NAMED_GROUP
2489 #ifdef USE_BACKREF_WITH_LEVEL
2490 /*
2491 \k<name+n>, \k<name-n>
2492 \k<num+n>, \k<num-n>
2493 \k<-num+n>, \k<-num-n>
2494 */
2495 static int
fetch_name_with_level(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int * rlevel)2496 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
2497 UChar** rname_end, ScanEnv* env,
2498 int* rback_num, int* rlevel)
2499 {
2500 int r, sign, is_num, exist_level;
2501 OnigCodePoint end_code;
2502 OnigCodePoint c = 0;
2503 OnigEncoding enc = env->enc;
2504 UChar *name_end;
2505 UChar *pnum_head;
2506 UChar *p = *src;
2507 PFETCH_READY;
2508
2509 *rback_num = 0;
2510 is_num = exist_level = 0;
2511 sign = 1;
2512 pnum_head = *src;
2513
2514 end_code = get_name_end_code_point(start_code);
2515
2516 name_end = end;
2517 r = 0;
2518 if (PEND) {
2519 return ONIGERR_EMPTY_GROUP_NAME;
2520 }
2521 else {
2522 PFETCH(c);
2523 if (c == end_code)
2524 return ONIGERR_EMPTY_GROUP_NAME;
2525
2526 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2527 is_num = 1;
2528 }
2529 else if (c == '-') {
2530 is_num = 2;
2531 sign = -1;
2532 pnum_head = p;
2533 }
2534 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2535 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2536 }
2537 }
2538
2539 while (!PEND) {
2540 name_end = p;
2541 PFETCH(c);
2542 if (c == end_code || c == ')' || c == '+' || c == '-') {
2543 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
2544 break;
2545 }
2546
2547 if (is_num != 0) {
2548 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2549 is_num = 1;
2550 }
2551 else {
2552 r = ONIGERR_INVALID_GROUP_NAME;
2553 is_num = 0;
2554 }
2555 }
2556 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2557 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2558 }
2559 }
2560
2561 if (r == 0 && c != end_code) {
2562 if (c == '+' || c == '-') {
2563 int level;
2564 int flag = (c == '-' ? -1 : 1);
2565
2566 PFETCH(c);
2567 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2568 PUNFETCH;
2569 level = onig_scan_unsigned_number(&p, end, enc);
2570 if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
2571 *rlevel = (level * flag);
2572 exist_level = 1;
2573
2574 PFETCH(c);
2575 if (c == end_code)
2576 goto end;
2577 }
2578
2579 err:
2580 r = ONIGERR_INVALID_GROUP_NAME;
2581 name_end = end;
2582 }
2583
2584 end:
2585 if (r == 0) {
2586 if (is_num != 0) {
2587 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2588 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2589 else if (*rback_num == 0) goto err;
2590
2591 *rback_num *= sign;
2592 }
2593
2594 *rname_end = name_end;
2595 *src = p;
2596 return (exist_level ? 1 : 0);
2597 }
2598 else {
2599 onig_scan_env_set_error_string(env, r, *src, name_end);
2600 return r;
2601 }
2602 }
2603 #endif /* USE_BACKREF_WITH_LEVEL */
2604
2605 /*
2606 def: 0 -> define name (don't allow number name)
2607 1 -> reference name (allow number name)
2608 */
2609 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2610 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2611 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2612 {
2613 int r, is_num, sign;
2614 OnigCodePoint end_code;
2615 OnigCodePoint c = 0;
2616 OnigEncoding enc = env->enc;
2617 UChar *name_end;
2618 UChar *pnum_head;
2619 UChar *p = *src;
2620 PFETCH_READY;
2621
2622 *rback_num = 0;
2623
2624 end_code = get_name_end_code_point(start_code);
2625
2626 name_end = end;
2627 pnum_head = *src;
2628 r = 0;
2629 is_num = 0;
2630 sign = 1;
2631 if (PEND) {
2632 return ONIGERR_EMPTY_GROUP_NAME;
2633 }
2634 else {
2635 PFETCH(c);
2636 if (c == end_code)
2637 return ONIGERR_EMPTY_GROUP_NAME;
2638
2639 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2640 if (ref == 1)
2641 is_num = 1;
2642 else {
2643 r = ONIGERR_INVALID_GROUP_NAME;
2644 is_num = 0;
2645 }
2646 }
2647 else if (c == '-') {
2648 if (ref == 1) {
2649 is_num = 2;
2650 sign = -1;
2651 pnum_head = p;
2652 }
2653 else {
2654 r = ONIGERR_INVALID_GROUP_NAME;
2655 is_num = 0;
2656 }
2657 }
2658 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2659 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2660 }
2661 }
2662
2663 if (r == 0) {
2664 while (!PEND) {
2665 name_end = p;
2666 PFETCH(c);
2667 if (c == end_code || c == ')') {
2668 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
2669 break;
2670 }
2671
2672 if (is_num != 0) {
2673 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2674 is_num = 1;
2675 }
2676 else {
2677 if (!ONIGENC_IS_CODE_WORD(enc, c))
2678 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2679 else
2680 r = ONIGERR_INVALID_GROUP_NAME;
2681
2682 is_num = 0;
2683 }
2684 }
2685 else {
2686 if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2687 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2688 }
2689 }
2690 }
2691
2692 if (c != end_code) {
2693 r = ONIGERR_INVALID_GROUP_NAME;
2694 name_end = end;
2695 }
2696
2697 if (is_num != 0) {
2698 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2699 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2700 else if (*rback_num == 0) {
2701 r = ONIGERR_INVALID_GROUP_NAME;
2702 goto err;
2703 }
2704
2705 *rback_num *= sign;
2706 }
2707
2708 *rname_end = name_end;
2709 *src = p;
2710 return 0;
2711 }
2712 else {
2713 while (!PEND) {
2714 name_end = p;
2715 PFETCH(c);
2716 if (c == end_code || c == ')')
2717 break;
2718 }
2719 if (PEND)
2720 name_end = end;
2721
2722 err:
2723 onig_scan_env_set_error_string(env, r, *src, name_end);
2724 return r;
2725 }
2726 }
2727 #else
2728 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2729 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2730 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2731 {
2732 int r, is_num, sign;
2733 OnigCodePoint end_code;
2734 OnigCodePoint c = 0;
2735 UChar *name_end;
2736 OnigEncoding enc = env->enc;
2737 UChar *pnum_head;
2738 UChar *p = *src;
2739 PFETCH_READY;
2740
2741 *rback_num = 0;
2742
2743 end_code = get_name_end_code_point(start_code);
2744
2745 *rname_end = name_end = end;
2746 r = 0;
2747 pnum_head = *src;
2748 is_num = 0;
2749 sign = 1;
2750
2751 if (PEND) {
2752 return ONIGERR_EMPTY_GROUP_NAME;
2753 }
2754 else {
2755 PFETCH(c);
2756 if (c == end_code)
2757 return ONIGERR_EMPTY_GROUP_NAME;
2758
2759 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2760 is_num = 1;
2761 }
2762 else if (c == '-') {
2763 is_num = 2;
2764 sign = -1;
2765 pnum_head = p;
2766 }
2767 else {
2768 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2769 }
2770 }
2771
2772 while (!PEND) {
2773 name_end = p;
2774
2775 PFETCH(c);
2776 if (c == end_code || c == ')') break;
2777 if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2778 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2779 }
2780 if (r == 0 && c != end_code) {
2781 r = ONIGERR_INVALID_GROUP_NAME;
2782 name_end = end;
2783 }
2784
2785 if (r == 0) {
2786 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2787 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2788 else if (*rback_num == 0) {
2789 r = ONIGERR_INVALID_GROUP_NAME;
2790 goto err;
2791 }
2792 *rback_num *= sign;
2793
2794 *rname_end = name_end;
2795 *src = p;
2796 return 0;
2797 }
2798 else {
2799 err:
2800 onig_scan_env_set_error_string(env, r, *src, name_end);
2801 return r;
2802 }
2803 }
2804 #endif /* USE_NAMED_GROUP */
2805
2806 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)2807 CC_ESC_WARN(ScanEnv* env, UChar *c)
2808 {
2809 if (onig_warn == onig_null_warn) return ;
2810
2811 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2812 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2813 UChar buf[WARN_BUFSIZE];
2814 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2815 env->pattern, env->pattern_end,
2816 (UChar* )"character class has '%s' without escape", c);
2817 (*onig_warn)((char* )buf);
2818 }
2819 }
2820
2821 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)2822 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2823 {
2824 if (onig_warn == onig_null_warn) return ;
2825
2826 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2827 UChar buf[WARN_BUFSIZE];
2828 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2829 (env)->pattern, (env)->pattern_end,
2830 (UChar* )"regular expression has '%s' without escape", c);
2831 (*onig_warn)((char* )buf);
2832 }
2833 }
2834
2835 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)2836 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2837 UChar **next, OnigEncoding enc)
2838 {
2839 int i;
2840 OnigCodePoint x;
2841 UChar *q;
2842 UChar *p = from;
2843
2844 while (p < to) {
2845 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2846 q = p + enclen(enc, p);
2847 if (x == s[0]) {
2848 for (i = 1; i < n && q < to; i++) {
2849 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2850 if (x != s[i]) break;
2851 q += enclen(enc, q);
2852 }
2853 if (i >= n) {
2854 if (IS_NOT_NULL(next))
2855 *next = q;
2856 return p;
2857 }
2858 }
2859 p = q;
2860 }
2861 return NULL_UCHARP;
2862 }
2863
2864 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)2865 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2866 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
2867 {
2868 int i, in_esc;
2869 OnigCodePoint x;
2870 UChar *q;
2871 UChar *p = from;
2872
2873 in_esc = 0;
2874 while (p < to) {
2875 if (in_esc) {
2876 in_esc = 0;
2877 p += enclen(enc, p);
2878 }
2879 else {
2880 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2881 q = p + enclen(enc, p);
2882 if (x == s[0]) {
2883 for (i = 1; i < n && q < to; i++) {
2884 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2885 if (x != s[i]) break;
2886 q += enclen(enc, q);
2887 }
2888 if (i >= n) return 1;
2889 p += enclen(enc, p);
2890 }
2891 else {
2892 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2893 if (x == bad) return 0;
2894 else if (x == MC_ESC(syn)) in_esc = 1;
2895 p = q;
2896 }
2897 }
2898 }
2899 return 0;
2900 }
2901
2902 static int
fetch_token_in_cc(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2903 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2904 {
2905 int num;
2906 OnigCodePoint c, c2;
2907 OnigSyntaxType* syn = env->syntax;
2908 OnigEncoding enc = env->enc;
2909 UChar* prev;
2910 UChar* p = *src;
2911 PFETCH_READY;
2912
2913 if (PEND) {
2914 tok->type = TK_EOT;
2915 return tok->type;
2916 }
2917
2918 PFETCH(c);
2919 tok->type = TK_CHAR;
2920 tok->base = 0;
2921 tok->u.c = c;
2922 tok->escaped = 0;
2923
2924 if (c == ']') {
2925 tok->type = TK_CC_CLOSE;
2926 }
2927 else if (c == '-') {
2928 tok->type = TK_CC_RANGE;
2929 }
2930 else if (c == MC_ESC(syn)) {
2931 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2932 goto end;
2933
2934 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2935
2936 PFETCH(c);
2937 tok->escaped = 1;
2938 tok->u.c = c;
2939 switch (c) {
2940 case 'w':
2941 tok->type = TK_CHAR_TYPE;
2942 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2943 tok->u.prop.not = 0;
2944 break;
2945 case 'W':
2946 tok->type = TK_CHAR_TYPE;
2947 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2948 tok->u.prop.not = 1;
2949 break;
2950 case 'd':
2951 tok->type = TK_CHAR_TYPE;
2952 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2953 tok->u.prop.not = 0;
2954 break;
2955 case 'D':
2956 tok->type = TK_CHAR_TYPE;
2957 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2958 tok->u.prop.not = 1;
2959 break;
2960 case 's':
2961 tok->type = TK_CHAR_TYPE;
2962 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2963 tok->u.prop.not = 0;
2964 break;
2965 case 'S':
2966 tok->type = TK_CHAR_TYPE;
2967 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2968 tok->u.prop.not = 1;
2969 break;
2970 case 'h':
2971 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2972 tok->type = TK_CHAR_TYPE;
2973 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2974 tok->u.prop.not = 0;
2975 break;
2976 case 'H':
2977 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2978 tok->type = TK_CHAR_TYPE;
2979 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2980 tok->u.prop.not = 1;
2981 break;
2982
2983 case 'p':
2984 case 'P':
2985 c2 = PPEEK;
2986 if (c2 == '{' &&
2987 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
2988 PINC;
2989 tok->type = TK_CHAR_PROPERTY;
2990 tok->u.prop.not = (c == 'P' ? 1 : 0);
2991
2992 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
2993 PFETCH(c2);
2994 if (c2 == '^') {
2995 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
2996 }
2997 else
2998 PUNFETCH;
2999 }
3000 }
3001 break;
3002
3003 case 'x':
3004 if (PEND) break;
3005
3006 prev = p;
3007 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3008 PINC;
3009 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3010 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3011 if (!PEND) {
3012 c2 = PPEEK;
3013 if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3014 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3015 }
3016
3017 if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
3018 PINC;
3019 tok->type = TK_CODE_POINT;
3020 tok->base = 16;
3021 tok->u.code = (OnigCodePoint )num;
3022 }
3023 else {
3024 /* can't read nothing or invalid format */
3025 p = prev;
3026 }
3027 }
3028 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3029 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3030 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3031 if (p == prev) { /* can't read nothing. */
3032 num = 0; /* but, it's not error */
3033 }
3034 tok->type = TK_RAW_BYTE;
3035 tok->base = 16;
3036 tok->u.c = num;
3037 }
3038 break;
3039
3040 case 'u':
3041 if (PEND) break;
3042
3043 prev = p;
3044 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3045 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3046 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3047 if (p == prev) { /* can't read nothing. */
3048 num = 0; /* but, it's not error */
3049 }
3050 tok->type = TK_CODE_POINT;
3051 tok->base = 16;
3052 tok->u.code = (OnigCodePoint )num;
3053 }
3054 break;
3055
3056 case '0':
3057 case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3058 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3059 PUNFETCH;
3060 prev = p;
3061 num = scan_unsigned_octal_number(&p, end, 3, enc);
3062 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3063 if (p == prev) { /* can't read nothing. */
3064 num = 0; /* but, it's not error */
3065 }
3066 tok->type = TK_RAW_BYTE;
3067 tok->base = 8;
3068 tok->u.c = num;
3069 }
3070 break;
3071
3072 default:
3073 PUNFETCH;
3074 num = fetch_escaped_value(&p, end, env);
3075 if (num < 0) return num;
3076 if (tok->u.c != num) {
3077 tok->u.code = (OnigCodePoint )num;
3078 tok->type = TK_CODE_POINT;
3079 }
3080 break;
3081 }
3082 }
3083 else if (c == '[') {
3084 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3085 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3086 tok->backp = p; /* point at '[' is readed */
3087 PINC;
3088 if (str_exist_check_with_esc(send, 2, p, end,
3089 (OnigCodePoint )']', enc, syn)) {
3090 tok->type = TK_POSIX_BRACKET_OPEN;
3091 }
3092 else {
3093 PUNFETCH;
3094 goto cc_in_cc;
3095 }
3096 }
3097 else {
3098 cc_in_cc:
3099 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3100 tok->type = TK_CC_CC_OPEN;
3101 }
3102 else {
3103 CC_ESC_WARN(env, (UChar* )"[");
3104 }
3105 }
3106 }
3107 else if (c == '&') {
3108 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3109 !PEND && (PPEEK_IS('&'))) {
3110 PINC;
3111 tok->type = TK_CC_AND;
3112 }
3113 }
3114
3115 end:
3116 *src = p;
3117 return tok->type;
3118 }
3119
3120 static int
fetch_token(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)3121 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3122 {
3123 int r, num;
3124 OnigCodePoint c;
3125 OnigEncoding enc = env->enc;
3126 OnigSyntaxType* syn = env->syntax;
3127 UChar* prev;
3128 UChar* p = *src;
3129 PFETCH_READY;
3130
3131 start:
3132 if (PEND) {
3133 tok->type = TK_EOT;
3134 return tok->type;
3135 }
3136
3137 tok->type = TK_STRING;
3138 tok->base = 0;
3139 tok->backp = p;
3140
3141 PFETCH(c);
3142 if (IS_MC_ESC_CODE(c, syn)) {
3143 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3144
3145 tok->backp = p;
3146 PFETCH(c);
3147
3148 tok->u.c = c;
3149 tok->escaped = 1;
3150 switch (c) {
3151 case '*':
3152 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3153 tok->type = TK_OP_REPEAT;
3154 tok->u.repeat.lower = 0;
3155 tok->u.repeat.upper = REPEAT_INFINITE;
3156 goto greedy_check;
3157 break;
3158
3159 case '+':
3160 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3161 tok->type = TK_OP_REPEAT;
3162 tok->u.repeat.lower = 1;
3163 tok->u.repeat.upper = REPEAT_INFINITE;
3164 goto greedy_check;
3165 break;
3166
3167 case '?':
3168 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3169 tok->type = TK_OP_REPEAT;
3170 tok->u.repeat.lower = 0;
3171 tok->u.repeat.upper = 1;
3172 greedy_check:
3173 if (!PEND && PPEEK_IS('?') &&
3174 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3175 PFETCH(c);
3176 tok->u.repeat.greedy = 0;
3177 tok->u.repeat.possessive = 0;
3178 }
3179 else {
3180 possessive_check:
3181 if (!PEND && PPEEK_IS('+') &&
3182 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3183 tok->type != TK_INTERVAL) ||
3184 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3185 tok->type == TK_INTERVAL))) {
3186 PFETCH(c);
3187 tok->u.repeat.greedy = 1;
3188 tok->u.repeat.possessive = 1;
3189 }
3190 else {
3191 tok->u.repeat.greedy = 1;
3192 tok->u.repeat.possessive = 0;
3193 }
3194 }
3195 break;
3196
3197 case '{':
3198 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3199 r = fetch_range_quantifier(&p, end, tok, env);
3200 if (r < 0) return r; /* error */
3201 if (r == 0) goto greedy_check;
3202 else if (r == 2) { /* {n} */
3203 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3204 goto possessive_check;
3205
3206 goto greedy_check;
3207 }
3208 /* r == 1 : normal char */
3209 break;
3210
3211 case '|':
3212 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3213 tok->type = TK_ALT;
3214 break;
3215
3216 case '(':
3217 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3218 tok->type = TK_SUBEXP_OPEN;
3219 break;
3220
3221 case ')':
3222 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3223 tok->type = TK_SUBEXP_CLOSE;
3224 break;
3225
3226 case 'w':
3227 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3228 tok->type = TK_CHAR_TYPE;
3229 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3230 tok->u.prop.not = 0;
3231 break;
3232
3233 case 'W':
3234 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3235 tok->type = TK_CHAR_TYPE;
3236 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3237 tok->u.prop.not = 1;
3238 break;
3239
3240 case 'b':
3241 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3242 tok->type = TK_ANCHOR;
3243 tok->u.anchor = ANCHOR_WORD_BOUND;
3244 break;
3245
3246 case 'B':
3247 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3248 tok->type = TK_ANCHOR;
3249 tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3250 break;
3251
3252 #ifdef USE_WORD_BEGIN_END
3253 case '<':
3254 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3255 tok->type = TK_ANCHOR;
3256 tok->u.anchor = ANCHOR_WORD_BEGIN;
3257 break;
3258
3259 case '>':
3260 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3261 tok->type = TK_ANCHOR;
3262 tok->u.anchor = ANCHOR_WORD_END;
3263 break;
3264 #endif
3265
3266 case 's':
3267 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3268 tok->type = TK_CHAR_TYPE;
3269 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3270 tok->u.prop.not = 0;
3271 break;
3272
3273 case 'S':
3274 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3275 tok->type = TK_CHAR_TYPE;
3276 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3277 tok->u.prop.not = 1;
3278 break;
3279
3280 case 'd':
3281 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3282 tok->type = TK_CHAR_TYPE;
3283 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3284 tok->u.prop.not = 0;
3285 break;
3286
3287 case 'D':
3288 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3289 tok->type = TK_CHAR_TYPE;
3290 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3291 tok->u.prop.not = 1;
3292 break;
3293
3294 case 'h':
3295 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3296 tok->type = TK_CHAR_TYPE;
3297 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3298 tok->u.prop.not = 0;
3299 break;
3300
3301 case 'H':
3302 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3303 tok->type = TK_CHAR_TYPE;
3304 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3305 tok->u.prop.not = 1;
3306 break;
3307
3308 case 'A':
3309 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3310 begin_buf:
3311 tok->type = TK_ANCHOR;
3312 tok->u.subtype = ANCHOR_BEGIN_BUF;
3313 break;
3314
3315 case 'Z':
3316 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3317 tok->type = TK_ANCHOR;
3318 tok->u.subtype = ANCHOR_SEMI_END_BUF;
3319 break;
3320
3321 case 'z':
3322 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3323 end_buf:
3324 tok->type = TK_ANCHOR;
3325 tok->u.subtype = ANCHOR_END_BUF;
3326 break;
3327
3328 case 'G':
3329 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3330 tok->type = TK_ANCHOR;
3331 tok->u.subtype = ANCHOR_BEGIN_POSITION;
3332 break;
3333
3334 case '`':
3335 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3336 goto begin_buf;
3337 break;
3338
3339 case '\'':
3340 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3341 goto end_buf;
3342 break;
3343
3344 case 'x':
3345 if (PEND) break;
3346
3347 prev = p;
3348 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3349 PINC;
3350 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3351 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3352 if (!PEND) {
3353 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3354 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3355 }
3356
3357 if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
3358 PINC;
3359 tok->type = TK_CODE_POINT;
3360 tok->u.code = (OnigCodePoint )num;
3361 }
3362 else {
3363 /* can't read nothing or invalid format */
3364 p = prev;
3365 }
3366 }
3367 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3368 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3369 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3370 if (p == prev) { /* can't read nothing. */
3371 num = 0; /* but, it's not error */
3372 }
3373 tok->type = TK_RAW_BYTE;
3374 tok->base = 16;
3375 tok->u.c = num;
3376 }
3377 break;
3378
3379 case 'u':
3380 if (PEND) break;
3381
3382 prev = p;
3383 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3384 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3385 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3386 if (p == prev) { /* can't read nothing. */
3387 num = 0; /* but, it's not error */
3388 }
3389 tok->type = TK_CODE_POINT;
3390 tok->base = 16;
3391 tok->u.code = (OnigCodePoint )num;
3392 }
3393 break;
3394
3395 case '1': case '2': case '3': case '4':
3396 case '5': case '6': case '7': case '8': case '9':
3397 PUNFETCH;
3398 prev = p;
3399 num = onig_scan_unsigned_number(&p, end, enc);
3400 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3401 goto skip_backref;
3402 }
3403
3404 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3405 (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3406 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3407 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3408 return ONIGERR_INVALID_BACKREF;
3409 }
3410
3411 tok->type = TK_BACKREF;
3412 tok->u.backref.num = 1;
3413 tok->u.backref.ref1 = num;
3414 tok->u.backref.by_name = 0;
3415 #ifdef USE_BACKREF_WITH_LEVEL
3416 tok->u.backref.exist_level = 0;
3417 #endif
3418 break;
3419 }
3420
3421 skip_backref:
3422 if (c == '8' || c == '9') {
3423 /* normal char */
3424 p = prev; PINC;
3425 break;
3426 }
3427
3428 p = prev;
3429 /* fall through */
3430 case '0':
3431 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3432 prev = p;
3433 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3434 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3435 if (p == prev) { /* can't read nothing. */
3436 num = 0; /* but, it's not error */
3437 }
3438 tok->type = TK_RAW_BYTE;
3439 tok->base = 8;
3440 tok->u.c = num;
3441 }
3442 else if (c != '0') {
3443 PINC;
3444 }
3445 break;
3446
3447 #ifdef USE_NAMED_GROUP
3448 case 'k':
3449 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3450 PFETCH(c);
3451 if (c == '<' || c == '\'') {
3452 UChar* name_end;
3453 int* backs;
3454 int back_num;
3455
3456 prev = p;
3457
3458 #ifdef USE_BACKREF_WITH_LEVEL
3459 name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3460 r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
3461 env, &back_num, &tok->u.backref.level);
3462 if (r == 1) tok->u.backref.exist_level = 1;
3463 else tok->u.backref.exist_level = 0;
3464 #else
3465 r = fetch_name(&p, end, &name_end, env, &back_num, 1);
3466 #endif
3467 if (r < 0) return r;
3468
3469 if (back_num != 0) {
3470 if (back_num < 0) {
3471 back_num = BACKREF_REL_TO_ABS(back_num, env);
3472 if (back_num <= 0)
3473 return ONIGERR_INVALID_BACKREF;
3474 }
3475
3476 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3477 if (back_num > env->num_mem ||
3478 IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
3479 return ONIGERR_INVALID_BACKREF;
3480 }
3481 tok->type = TK_BACKREF;
3482 tok->u.backref.by_name = 0;
3483 tok->u.backref.num = 1;
3484 tok->u.backref.ref1 = back_num;
3485 }
3486 else {
3487 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3488 if (num <= 0) {
3489 onig_scan_env_set_error_string(env,
3490 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3491 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3492 }
3493 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3494 int i;
3495 for (i = 0; i < num; i++) {
3496 if (backs[i] > env->num_mem ||
3497 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3498 return ONIGERR_INVALID_BACKREF;
3499 }
3500 }
3501
3502 tok->type = TK_BACKREF;
3503 tok->u.backref.by_name = 1;
3504 if (num == 1) {
3505 tok->u.backref.num = 1;
3506 tok->u.backref.ref1 = backs[0];
3507 }
3508 else {
3509 tok->u.backref.num = num;
3510 tok->u.backref.refs = backs;
3511 }
3512 }
3513 }
3514 else
3515 PUNFETCH;
3516 }
3517 break;
3518 #endif
3519
3520 #ifdef USE_SUBEXP_CALL
3521 case 'g':
3522 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3523 PFETCH(c);
3524 if (c == '<' || c == '\'') {
3525 int gnum;
3526 UChar* name_end;
3527
3528 prev = p;
3529 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
3530 if (r < 0) return r;
3531
3532 tok->type = TK_CALL;
3533 tok->u.call.name = prev;
3534 tok->u.call.name_end = name_end;
3535 tok->u.call.gnum = gnum;
3536 }
3537 else
3538 PUNFETCH;
3539 }
3540 break;
3541 #endif
3542
3543 case 'Q':
3544 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3545 tok->type = TK_QUOTE_OPEN;
3546 }
3547 break;
3548
3549 case 'p':
3550 case 'P':
3551 if (PPEEK_IS('{') &&
3552 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3553 PINC;
3554 tok->type = TK_CHAR_PROPERTY;
3555 tok->u.prop.not = (c == 'P' ? 1 : 0);
3556
3557 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3558 PFETCH(c);
3559 if (c == '^') {
3560 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3561 }
3562 else
3563 PUNFETCH;
3564 }
3565 }
3566 break;
3567
3568 default:
3569 PUNFETCH;
3570 num = fetch_escaped_value(&p, end, env);
3571 if (num < 0) return num;
3572 /* set_raw: */
3573 if (tok->u.c != num) {
3574 tok->type = TK_CODE_POINT;
3575 tok->u.code = (OnigCodePoint )num;
3576 }
3577 else { /* string */
3578 p = tok->backp + enclen(enc, tok->backp);
3579 }
3580 break;
3581 }
3582 }
3583 else {
3584 tok->u.c = c;
3585 tok->escaped = 0;
3586
3587 #ifdef USE_VARIABLE_META_CHARS
3588 if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3589 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3590 if (c == MC_ANYCHAR(syn))
3591 goto any_char;
3592 else if (c == MC_ANYTIME(syn))
3593 goto anytime;
3594 else if (c == MC_ZERO_OR_ONE_TIME(syn))
3595 goto zero_or_one_time;
3596 else if (c == MC_ONE_OR_MORE_TIME(syn))
3597 goto one_or_more_time;
3598 else if (c == MC_ANYCHAR_ANYTIME(syn)) {
3599 tok->type = TK_ANYCHAR_ANYTIME;
3600 goto out;
3601 }
3602 }
3603 #endif
3604
3605 switch (c) {
3606 case '.':
3607 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3608 #ifdef USE_VARIABLE_META_CHARS
3609 any_char:
3610 #endif
3611 tok->type = TK_ANYCHAR;
3612 break;
3613
3614 case '*':
3615 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3616 #ifdef USE_VARIABLE_META_CHARS
3617 anytime:
3618 #endif
3619 tok->type = TK_OP_REPEAT;
3620 tok->u.repeat.lower = 0;
3621 tok->u.repeat.upper = REPEAT_INFINITE;
3622 goto greedy_check;
3623 break;
3624
3625 case '+':
3626 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3627 #ifdef USE_VARIABLE_META_CHARS
3628 one_or_more_time:
3629 #endif
3630 tok->type = TK_OP_REPEAT;
3631 tok->u.repeat.lower = 1;
3632 tok->u.repeat.upper = REPEAT_INFINITE;
3633 goto greedy_check;
3634 break;
3635
3636 case '?':
3637 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3638 #ifdef USE_VARIABLE_META_CHARS
3639 zero_or_one_time:
3640 #endif
3641 tok->type = TK_OP_REPEAT;
3642 tok->u.repeat.lower = 0;
3643 tok->u.repeat.upper = 1;
3644 goto greedy_check;
3645 break;
3646
3647 case '{':
3648 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3649 r = fetch_range_quantifier(&p, end, tok, env);
3650 if (r < 0) return r; /* error */
3651 if (r == 0) goto greedy_check;
3652 else if (r == 2) { /* {n} */
3653 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3654 goto possessive_check;
3655
3656 goto greedy_check;
3657 }
3658 /* r == 1 : normal char */
3659 break;
3660
3661 case '|':
3662 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3663 tok->type = TK_ALT;
3664 break;
3665
3666 case '(':
3667 if (PPEEK_IS('?') &&
3668 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3669 PINC;
3670 if (PPEEK_IS('#')) {
3671 PFETCH(c);
3672 while (1) {
3673 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3674 PFETCH(c);
3675 if (c == MC_ESC(syn)) {
3676 if (!PEND) PFETCH(c);
3677 }
3678 else {
3679 if (c == ')') break;
3680 }
3681 }
3682 goto start;
3683 }
3684 PUNFETCH;
3685 }
3686
3687 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3688 tok->type = TK_SUBEXP_OPEN;
3689 break;
3690
3691 case ')':
3692 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3693 tok->type = TK_SUBEXP_CLOSE;
3694 break;
3695
3696 case '^':
3697 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3698 tok->type = TK_ANCHOR;
3699 tok->u.subtype = (IS_SINGLELINE(env->option)
3700 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3701 break;
3702
3703 case '$':
3704 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3705 tok->type = TK_ANCHOR;
3706 tok->u.subtype = (IS_SINGLELINE(env->option)
3707 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3708 break;
3709
3710 case '[':
3711 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3712 tok->type = TK_CC_OPEN;
3713 break;
3714
3715 case ']':
3716 if (*src > env->pattern) /* /].../ is allowed. */
3717 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
3718 break;
3719
3720 case '#':
3721 if (IS_EXTEND(env->option)) {
3722 while (!PEND) {
3723 PFETCH(c);
3724 if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3725 break;
3726 }
3727 goto start;
3728 break;
3729 }
3730 break;
3731
3732 case ' ': case '\t': case '\n': case '\r': case '\f':
3733 if (IS_EXTEND(env->option))
3734 goto start;
3735 break;
3736
3737 default:
3738 /* string */
3739 break;
3740 }
3741 }
3742
3743 #ifdef USE_VARIABLE_META_CHARS
3744 out:
3745 #endif
3746 *src = p;
3747 return tok->type;
3748 }
3749
3750 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[])3751 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
3752 OnigEncoding enc ARG_UNUSED,
3753 OnigCodePoint sb_out, const OnigCodePoint mbr[])
3754 {
3755 int i, r;
3756 OnigCodePoint j;
3757
3758 int n = ONIGENC_CODE_RANGE_NUM(mbr);
3759
3760 if (not == 0) {
3761 for (i = 0; i < n; i++) {
3762 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
3763 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
3764 if (j >= sb_out) {
3765 if (j == ONIGENC_CODE_RANGE_TO(mbr, i)) i++;
3766 else if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3767 r = add_code_range_to_buf(&(cc->mbuf), j,
3768 ONIGENC_CODE_RANGE_TO(mbr, i));
3769 if (r != 0) return r;
3770 i++;
3771 }
3772
3773 goto sb_end;
3774 }
3775 BITSET_SET_BIT(cc->bs, j);
3776 }
3777 }
3778
3779 sb_end:
3780 for ( ; i < n; i++) {
3781 r = add_code_range_to_buf(&(cc->mbuf),
3782 ONIGENC_CODE_RANGE_FROM(mbr, i),
3783 ONIGENC_CODE_RANGE_TO(mbr, i));
3784 if (r != 0) return r;
3785 }
3786 }
3787 else {
3788 OnigCodePoint prev = 0;
3789
3790 for (i = 0; i < n; i++) {
3791 for (j = prev;
3792 j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
3793 if (j >= sb_out) {
3794 goto sb_end2;
3795 }
3796 BITSET_SET_BIT(cc->bs, j);
3797 }
3798 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3799 }
3800 for (j = prev; j < sb_out; j++) {
3801 BITSET_SET_BIT(cc->bs, j);
3802 }
3803
3804 sb_end2:
3805 prev = sb_out;
3806
3807 for (i = 0; i < n; i++) {
3808 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3809 r = add_code_range_to_buf(&(cc->mbuf), prev,
3810 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3811 if (r != 0) return r;
3812 }
3813 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3814 }
3815 if (prev < 0x7fffffff) {
3816 r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3817 if (r != 0) return r;
3818 }
3819 }
3820
3821 return 0;
3822 }
3823
3824 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)3825 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3826 {
3827 int c, r;
3828 const OnigCodePoint *ranges;
3829 OnigCodePoint sb_out;
3830 OnigEncoding enc = env->enc;
3831
3832 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
3833 if (r == 0) {
3834 return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
3835 }
3836 else if (r != ONIG_NO_SUPPORT_CONFIG) {
3837 return r;
3838 }
3839
3840 r = 0;
3841 switch (ctype) {
3842 case ONIGENC_CTYPE_ALPHA:
3843 case ONIGENC_CTYPE_BLANK:
3844 case ONIGENC_CTYPE_CNTRL:
3845 case ONIGENC_CTYPE_DIGIT:
3846 case ONIGENC_CTYPE_LOWER:
3847 case ONIGENC_CTYPE_PUNCT:
3848 case ONIGENC_CTYPE_SPACE:
3849 case ONIGENC_CTYPE_UPPER:
3850 case ONIGENC_CTYPE_XDIGIT:
3851 case ONIGENC_CTYPE_ASCII:
3852 case ONIGENC_CTYPE_ALNUM:
3853 if (not != 0) {
3854 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3855 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3856 BITSET_SET_BIT(cc->bs, c);
3857 }
3858 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3859 }
3860 else {
3861 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3862 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3863 BITSET_SET_BIT(cc->bs, c);
3864 }
3865 }
3866 break;
3867
3868 case ONIGENC_CTYPE_GRAPH:
3869 case ONIGENC_CTYPE_PRINT:
3870 if (not != 0) {
3871 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3872 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3873 BITSET_SET_BIT(cc->bs, c);
3874 }
3875 }
3876 else {
3877 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3878 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3879 BITSET_SET_BIT(cc->bs, c);
3880 }
3881 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3882 }
3883 break;
3884
3885 case ONIGENC_CTYPE_WORD:
3886 if (not == 0) {
3887 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3888 if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3889 }
3890 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3891 }
3892 else {
3893 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3894 if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
3895 && ! ONIGENC_IS_CODE_WORD(enc, c))
3896 BITSET_SET_BIT(cc->bs, c);
3897 }
3898 }
3899 break;
3900
3901 default:
3902 return ONIGERR_PARSER_BUG;
3903 break;
3904 }
3905
3906 return r;
3907 }
3908
3909 static int
parse_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)3910 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3911 {
3912 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
3913 #define POSIX_BRACKET_NAME_MIN_LEN 4
3914
3915 static PosixBracketEntryType PBS[] = {
3916 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
3917 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
3918 { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
3919 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
3920 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
3921 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
3922 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
3923 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
3924 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
3925 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
3926 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
3927 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3928 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
3929 { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 },
3930 { (UChar* )NULL, -1, 0 }
3931 };
3932
3933 PosixBracketEntryType *pb;
3934 int not, i, r;
3935 OnigCodePoint c;
3936 OnigEncoding enc = env->enc;
3937 UChar *p = *src;
3938 PFETCH_READY;
3939
3940 if (PPEEK_IS('^')) {
3941 PINC;
3942 not = 1;
3943 }
3944 else
3945 not = 0;
3946
3947 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
3948 goto not_posix_bracket;
3949
3950 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3951 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3952 p = (UChar* )onigenc_step(enc, p, end, pb->len);
3953 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3954 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3955
3956 r = add_ctype_to_cc(cc, pb->ctype, not, env);
3957 if (r != 0) return r;
3958
3959 PINC; PINC;
3960 *src = p;
3961 return 0;
3962 }
3963 }
3964
3965 not_posix_bracket:
3966 c = 0;
3967 i = 0;
3968 while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3969 PINC;
3970 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3971 }
3972 if (c == ':' && ! PEND) {
3973 PINC;
3974 if (! PEND) {
3975 PFETCH(c);
3976 if (c == ']')
3977 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3978 }
3979 }
3980
3981 return 1; /* 1: is not POSIX bracket, but no error. */
3982 }
3983
3984 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)3985 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
3986 {
3987 int r;
3988 OnigCodePoint c;
3989 OnigEncoding enc = env->enc;
3990 UChar *prev, *start, *p = *src;
3991 PFETCH_READY;
3992
3993 r = 0;
3994 start = prev = p;
3995
3996 while (!PEND) {
3997 prev = p;
3998 PFETCH(c);
3999 if (c == '}') {
4000 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
4001 if (r < 0) break;
4002
4003 *src = p;
4004 return r;
4005 }
4006 else if (c == '(' || c == ')' || c == '{' || c == '|') {
4007 r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
4008 break;
4009 }
4010 }
4011
4012 onig_scan_env_set_error_string(env, r, *src, prev);
4013 return r;
4014 }
4015
4016 static int
parse_char_property(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4017 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
4018 ScanEnv* env)
4019 {
4020 int r, ctype;
4021 CClassNode* cc;
4022
4023 ctype = fetch_char_property_to_ctype(src, end, env);
4024 if (ctype < 0) return ctype;
4025
4026 *np = node_new_cclass();
4027 CHECK_NULL_RETURN_MEMERR(*np);
4028 cc = NCCLASS(*np);
4029 r = add_ctype_to_cc(cc, ctype, 0, env);
4030 if (r != 0) return r;
4031 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
4032
4033 return 0;
4034 }
4035
4036
/* Parser state while reading the items of one bracket expression
   (used by parse_char_class(), next_state_val() and next_state_class()). */
4037 enum CCSTATE {
4038 CCS_VALUE, /* an item has been read; a literal is held pending in "vs" until the next item flushes it */
4039 CCS_RANGE, /* a range operator '-' followed a value; the next literal closes the range */
4040 CCS_COMPLETE, /* a range has just been closed by next_state_val() */
4041 CCS_START /* nothing read yet (initial state, and again after "&&") */
4042 };
4043
/* Kind of the most recent item read inside a bracket expression. */
4044 enum CCVALTYPE {
4045 CCV_SB, /* literal recorded in the class bitset (cc->bs) */
4046 CCV_CODE_POINT, /* literal recorded in the code range buffer (cc->mbuf) via add_code_range() */
4047 CCV_CLASS /* a character type / POSIX bracket / property; set by next_state_class() */
4048 };
4049
4050 static int
next_state_class(CClassNode * cc,OnigCodePoint * vs,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4051 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
4052 enum CCSTATE* state, ScanEnv* env)
4053 {
4054 int r;
4055
4056 if (*state == CCS_RANGE)
4057 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
4058
4059 if (*state == CCS_VALUE && *type != CCV_CLASS) {
4060 if (*type == CCV_SB)
4061 BITSET_SET_BIT(cc->bs, (int )(*vs));
4062 else if (*type == CCV_CODE_POINT) {
4063 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4064 if (r < 0) return r;
4065 }
4066 }
4067
4068 *state = CCS_VALUE;
4069 *type = CCV_CLASS;
4070 return 0;
4071 }
4072
4073 static int
next_state_val(CClassNode * cc,OnigCodePoint * vs,OnigCodePoint v,int * vs_israw,int v_israw,enum CCVALTYPE intype,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4074 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
4075 int* vs_israw, int v_israw,
4076 enum CCVALTYPE intype, enum CCVALTYPE* type,
4077 enum CCSTATE* state, ScanEnv* env)
4078 {
4079 int r;
4080
4081 switch (*state) {
4082 case CCS_VALUE:
4083 if (*type == CCV_SB)
4084 BITSET_SET_BIT(cc->bs, (int )(*vs));
4085 else if (*type == CCV_CODE_POINT) {
4086 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4087 if (r < 0) return r;
4088 }
4089 break;
4090
4091 case CCS_RANGE:
4092 if (intype == *type) {
4093 if (intype == CCV_SB) {
4094 if (*vs > 0xff || v > 0xff)
4095 return ONIGERR_INVALID_CODE_POINT_VALUE;
4096
4097 if (*vs > v) {
4098 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4099 goto ccs_range_end;
4100 else
4101 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4102 }
4103 bitset_set_range(cc->bs, (int )*vs, (int )v);
4104 }
4105 else {
4106 r = add_code_range(&(cc->mbuf), env, *vs, v);
4107 if (r < 0) return r;
4108 }
4109 }
4110 else {
4111 #if 0
4112 if (intype == CCV_CODE_POINT && *type == CCV_SB) {
4113 #endif
4114 if (*vs > v) {
4115 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4116 goto ccs_range_end;
4117 else
4118 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4119 }
4120 bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4121 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
4122 if (r < 0) return r;
4123 #if 0
4124 }
4125 else
4126 return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
4127 #endif
4128 }
4129 ccs_range_end:
4130 *state = CCS_COMPLETE;
4131 break;
4132
4133 case CCS_COMPLETE:
4134 case CCS_START:
4135 *state = CCS_VALUE;
4136 break;
4137
4138 default:
4139 break;
4140 }
4141
4142 *vs_israw = v_israw;
4143 *vs = v;
4144 *type = intype;
4145 return 0;
4146 }
4147
4148 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,ScanEnv * env)4149 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4150 ScanEnv* env)
4151 {
4152 int in_esc;
4153 OnigCodePoint code;
4154 OnigEncoding enc = env->enc;
4155 UChar* p = from;
4156 PFETCH_READY;
4157
4158 in_esc = 0;
4159 while (! PEND) {
4160 if (ignore_escaped && in_esc) {
4161 in_esc = 0;
4162 }
4163 else {
4164 PFETCH(code);
4165 if (code == c) return 1;
4166 if (code == MC_ESC(env->syntax)) in_esc = 1;
4167 }
4168 }
4169 return 0;
4170 }
4171
4172 static int
parse_char_class(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4173 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4174 ScanEnv* env)
4175 {
4176 int r, neg, len, fetched, and_start;
4177 OnigCodePoint v, vs;
4178 UChar *p;
4179 Node* node;
4180 CClassNode *cc, *prev_cc;
4181 CClassNode work_cc;
4182
4183 enum CCSTATE state;
4184 enum CCVALTYPE val_type, in_type;
4185 int val_israw, in_israw;
4186
4187 prev_cc = (CClassNode* )NULL;
4188 *np = NULL_NODE;
4189 r = fetch_token_in_cc(tok, src, end, env);
4190 if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4191 neg = 1;
4192 r = fetch_token_in_cc(tok, src, end, env);
4193 }
4194 else {
4195 neg = 0;
4196 }
4197
4198 if (r < 0) return r;
4199 if (r == TK_CC_CLOSE) {
4200 if (! code_exist_check((OnigCodePoint )']',
4201 *src, env->pattern_end, 1, env))
4202 return ONIGERR_EMPTY_CHAR_CLASS;
4203
4204 CC_ESC_WARN(env, (UChar* )"]");
4205 r = tok->type = TK_CHAR; /* allow []...] */
4206 }
4207
4208 *np = node = node_new_cclass();
4209 CHECK_NULL_RETURN_MEMERR(node);
4210 cc = NCCLASS(node);
4211
4212 and_start = 0;
4213 state = CCS_START;
4214 p = *src;
4215 while (r != TK_CC_CLOSE) {
4216 fetched = 0;
4217 switch (r) {
4218 case TK_CHAR:
4219 len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4220 if (len > 1) {
4221 in_type = CCV_CODE_POINT;
4222 }
4223 else if (len < 0) {
4224 r = len;
4225 goto err;
4226 }
4227 else {
4228 sb_char:
4229 in_type = CCV_SB;
4230 }
4231 v = (OnigCodePoint )tok->u.c;
4232 in_israw = 0;
4233 goto val_entry2;
4234 break;
4235
4236 case TK_RAW_BYTE:
4237 /* tok->base != 0 : octal or hexadec. */
4238 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4239 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4240 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4241 UChar* psave = p;
4242 int i, base = tok->base;
4243
4244 buf[0] = tok->u.c;
4245 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4246 r = fetch_token_in_cc(tok, &p, end, env);
4247 if (r < 0) goto err;
4248 if (r != TK_RAW_BYTE || tok->base != base) {
4249 fetched = 1;
4250 break;
4251 }
4252 buf[i] = tok->u.c;
4253 }
4254
4255 if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4256 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4257 goto err;
4258 }
4259
4260 len = enclen(env->enc, buf);
4261 if (i < len) {
4262 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4263 goto err;
4264 }
4265 else if (i > len) { /* fetch back */
4266 p = psave;
4267 for (i = 1; i < len; i++) {
4268 r = fetch_token_in_cc(tok, &p, end, env);
4269 }
4270 fetched = 0;
4271 }
4272
4273 if (i == 1) {
4274 v = (OnigCodePoint )buf[0];
4275 goto raw_single;
4276 }
4277 else {
4278 v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4279 in_type = CCV_CODE_POINT;
4280 }
4281 }
4282 else {
4283 v = (OnigCodePoint )tok->u.c;
4284 raw_single:
4285 in_type = CCV_SB;
4286 }
4287 in_israw = 1;
4288 goto val_entry2;
4289 break;
4290
4291 case TK_CODE_POINT:
4292 v = tok->u.code;
4293 in_israw = 1;
4294 val_entry:
4295 len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4296 if (len < 0) {
4297 r = len;
4298 goto err;
4299 }
4300 in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4301 val_entry2:
4302 r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4303 &state, env);
4304 if (r != 0) goto err;
4305 break;
4306
4307 case TK_POSIX_BRACKET_OPEN:
4308 r = parse_posix_bracket(cc, &p, end, env);
4309 if (r < 0) goto err;
4310 if (r == 1) { /* is not POSIX bracket */
4311 CC_ESC_WARN(env, (UChar* )"[");
4312 p = tok->backp;
4313 v = (OnigCodePoint )tok->u.c;
4314 in_israw = 0;
4315 goto val_entry;
4316 }
4317 goto next_class;
4318 break;
4319
4320 case TK_CHAR_TYPE:
4321 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
4322 if (r != 0) return r;
4323
4324 next_class:
4325 r = next_state_class(cc, &vs, &val_type, &state, env);
4326 if (r != 0) goto err;
4327 break;
4328
4329 case TK_CHAR_PROPERTY:
4330 {
4331 int ctype;
4332
4333 ctype = fetch_char_property_to_ctype(&p, end, env);
4334 if (ctype < 0) return ctype;
4335 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4336 if (r != 0) return r;
4337 goto next_class;
4338 }
4339 break;
4340
4341 case TK_CC_RANGE:
4342 if (state == CCS_VALUE) {
4343 r = fetch_token_in_cc(tok, &p, end, env);
4344 if (r < 0) goto err;
4345 fetched = 1;
4346 if (r == TK_CC_CLOSE) { /* allow [x-] */
4347 range_end_val:
4348 v = (OnigCodePoint )'-';
4349 in_israw = 0;
4350 goto val_entry;
4351 }
4352 else if (r == TK_CC_AND) {
4353 CC_ESC_WARN(env, (UChar* )"-");
4354 goto range_end_val;
4355 }
4356 state = CCS_RANGE;
4357 }
4358 else if (state == CCS_START) {
4359 /* [-xa] is allowed */
4360 v = (OnigCodePoint )tok->u.c;
4361 in_israw = 0;
4362
4363 r = fetch_token_in_cc(tok, &p, end, env);
4364 if (r < 0) goto err;
4365 fetched = 1;
4366 /* [--x] or [a&&-x] is warned. */
4367 if (r == TK_CC_RANGE || and_start != 0)
4368 CC_ESC_WARN(env, (UChar* )"-");
4369
4370 goto val_entry;
4371 }
4372 else if (state == CCS_RANGE) {
4373 CC_ESC_WARN(env, (UChar* )"-");
4374 goto sb_char; /* [!--x] is allowed */
4375 }
4376 else { /* CCS_COMPLETE */
4377 r = fetch_token_in_cc(tok, &p, end, env);
4378 if (r < 0) goto err;
4379 fetched = 1;
4380 if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4381 else if (r == TK_CC_AND) {
4382 CC_ESC_WARN(env, (UChar* )"-");
4383 goto range_end_val;
4384 }
4385
4386 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4387 CC_ESC_WARN(env, (UChar* )"-");
4388 goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */
4389 }
4390 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4391 goto err;
4392 }
4393 break;
4394
4395 case TK_CC_CC_OPEN: /* [ */
4396 {
4397 Node *anode;
4398 CClassNode* acc;
4399
4400 r = parse_char_class(&anode, tok, &p, end, env);
4401 if (r != 0) goto cc_open_err;
4402 acc = NCCLASS(anode);
4403 r = or_cclass(cc, acc, env->enc);
4404
4405 onig_node_free(anode);
4406 cc_open_err:
4407 if (r != 0) goto err;
4408 }
4409 break;
4410
4411 case TK_CC_AND: /* && */
4412 {
4413 if (state == CCS_VALUE) {
4414 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4415 &val_type, &state, env);
4416 if (r != 0) goto err;
4417 }
4418 /* initialize local variables */
4419 and_start = 1;
4420 state = CCS_START;
4421
4422 if (IS_NOT_NULL(prev_cc)) {
4423 r = and_cclass(prev_cc, cc, env->enc);
4424 if (r != 0) goto err;
4425 bbuf_free(cc->mbuf);
4426 }
4427 else {
4428 prev_cc = cc;
4429 cc = &work_cc;
4430 }
4431 initialize_cclass(cc);
4432 }
4433 break;
4434
4435 case TK_EOT:
4436 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4437 goto err;
4438 break;
4439 default:
4440 r = ONIGERR_PARSER_BUG;
4441 goto err;
4442 break;
4443 }
4444
4445 if (fetched)
4446 r = tok->type;
4447 else {
4448 r = fetch_token_in_cc(tok, &p, end, env);
4449 if (r < 0) goto err;
4450 }
4451 }
4452
4453 if (state == CCS_VALUE) {
4454 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4455 &val_type, &state, env);
4456 if (r != 0) goto err;
4457 }
4458
4459 if (IS_NOT_NULL(prev_cc)) {
4460 r = and_cclass(prev_cc, cc, env->enc);
4461 if (r != 0) goto err;
4462 bbuf_free(cc->mbuf);
4463 cc = prev_cc;
4464 }
4465
4466 if (neg != 0)
4467 NCCLASS_SET_NOT(cc);
4468 else
4469 NCCLASS_CLEAR_NOT(cc);
4470 if (IS_NCCLASS_NOT(cc) &&
4471 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4472 int is_empty;
4473
4474 is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4475 if (is_empty != 0)
4476 BITSET_IS_EMPTY(cc->bs, is_empty);
4477
4478 if (is_empty == 0) {
4479 #define NEWLINE_CODE 0x0a
4480
4481 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4482 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4483 BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4484 else
4485 add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4486 }
4487 }
4488 }
4489 *src = p;
4490 return 0;
4491
4492 err:
4493 if (cc != NCCLASS(*np))
4494 bbuf_free(cc->mbuf);
4495 onig_node_free(*np);
4496 return r;
4497 }
4498
4499 static int parse_subexp(Node** top, OnigToken* tok, int term,
4500 UChar** src, UChar* end, ScanEnv* env);
4501
4502 static int
parse_enclose(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4503 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4504 ScanEnv* env)
4505 {
4506 int r, num;
4507 Node *target;
4508 OnigOptionType option;
4509 OnigCodePoint c;
4510 OnigEncoding enc = env->enc;
4511
4512 #ifdef USE_NAMED_GROUP
4513 int list_capture;
4514 #endif
4515
4516 UChar* p = *src;
4517 PFETCH_READY;
4518
4519 *np = NULL;
4520 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4521
4522 option = env->option;
4523 if (PPEEK_IS('?') &&
4524 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4525 PINC;
4526 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4527
4528 PFETCH(c);
4529 switch (c) {
4530 case ':': /* (?:...) grouping only */
4531 group:
4532 r = fetch_token(tok, &p, end, env);
4533 if (r < 0) return r;
4534 r = parse_subexp(np, tok, term, &p, end, env);
4535 if (r < 0) return r;
4536 *src = p;
4537 return 1; /* group */
4538 break;
4539
4540 case '=':
4541 *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4542 break;
4543 case '!': /* preceding read */
4544 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4545 break;
4546 case '>': /* (?>...) stop backtrack */
4547 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
4548 break;
4549
4550 #ifdef USE_NAMED_GROUP
4551 case '\'':
4552 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4553 goto named_group1;
4554 }
4555 else
4556 return ONIGERR_UNDEFINED_GROUP_OPTION;
4557 break;
4558 #endif
4559
4560 case '<': /* look behind (?<=...), (?<!...) */
4561 PFETCH(c);
4562 if (c == '=')
4563 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4564 else if (c == '!')
4565 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4566 #ifdef USE_NAMED_GROUP
4567 else {
4568 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4569 UChar *name;
4570 UChar *name_end;
4571
4572 PUNFETCH;
4573 c = '<';
4574
4575 named_group1:
4576 list_capture = 0;
4577
4578 named_group2:
4579 name = p;
4580 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
4581 if (r < 0) return r;
4582
4583 num = scan_env_add_mem_entry(env);
4584 if (num < 0) return num;
4585 if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
4586 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4587
4588 r = name_add(env->reg, name, name_end, num, env);
4589 if (r != 0) return r;
4590 *np = node_new_enclose_memory(env->option, 1);
4591 CHECK_NULL_RETURN_MEMERR(*np);
4592 NENCLOSE(*np)->regnum = num;
4593 if (list_capture != 0)
4594 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4595 env->num_named++;
4596 }
4597 else {
4598 return ONIGERR_UNDEFINED_GROUP_OPTION;
4599 }
4600 }
4601 #else
4602 else {
4603 return ONIGERR_UNDEFINED_GROUP_OPTION;
4604 }
4605 #endif
4606 break;
4607
4608 case '@':
4609 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4610 #ifdef USE_NAMED_GROUP
4611 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4612 PFETCH(c);
4613 if (c == '<' || c == '\'') {
4614 list_capture = 1;
4615 goto named_group2; /* (?@<name>...) */
4616 }
4617 PUNFETCH;
4618 }
4619 #endif
4620 *np = node_new_enclose_memory(env->option, 0);
4621 CHECK_NULL_RETURN_MEMERR(*np);
4622 num = scan_env_add_mem_entry(env);
4623 if (num < 0) {
4624 onig_node_free(*np);
4625 return num;
4626 }
4627 else if (num >= (int )BIT_STATUS_BITS_NUM) {
4628 onig_node_free(*np);
4629 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4630 }
4631 NENCLOSE(*np)->regnum = num;
4632 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4633 }
4634 else {
4635 return ONIGERR_UNDEFINED_GROUP_OPTION;
4636 }
4637 break;
4638
4639 #ifdef USE_POSIXLINE_OPTION
4640 case 'p':
4641 #endif
4642 case '-': case 'i': case 'm': case 's': case 'x':
4643 {
4644 int neg = 0;
4645
4646 while (1) {
4647 switch (c) {
4648 case ':':
4649 case ')':
4650 break;
4651
4652 case '-': neg = 1; break;
4653 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
4654 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4655 case 's':
4656 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4657 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4658 }
4659 else
4660 return ONIGERR_UNDEFINED_GROUP_OPTION;
4661 break;
4662
4663 case 'm':
4664 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4665 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4666 }
4667 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4668 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4669 }
4670 else
4671 return ONIGERR_UNDEFINED_GROUP_OPTION;
4672 break;
4673 #ifdef USE_POSIXLINE_OPTION
4674 case 'p':
4675 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4676 break;
4677 #endif
4678 default:
4679 return ONIGERR_UNDEFINED_GROUP_OPTION;
4680 }
4681
4682 if (c == ')') {
4683 *np = node_new_option(option);
4684 CHECK_NULL_RETURN_MEMERR(*np);
4685 *src = p;
4686 return 2; /* option only */
4687 }
4688 else if (c == ':') {
4689 OnigOptionType prev = env->option;
4690
4691 env->option = option;
4692 r = fetch_token(tok, &p, end, env);
4693 if (r < 0) return r;
4694 r = parse_subexp(&target, tok, term, &p, end, env);
4695 env->option = prev;
4696 if (r < 0) return r;
4697 *np = node_new_option(option);
4698 CHECK_NULL_RETURN_MEMERR(*np);
4699 NENCLOSE(*np)->target = target;
4700 *src = p;
4701 return 0;
4702 }
4703
4704 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4705 PFETCH(c);
4706 }
4707 }
4708 break;
4709
4710 default:
4711 return ONIGERR_UNDEFINED_GROUP_OPTION;
4712 }
4713 }
4714 else {
4715 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4716 goto group;
4717
4718 *np = node_new_enclose_memory(env->option, 0);
4719 CHECK_NULL_RETURN_MEMERR(*np);
4720 num = scan_env_add_mem_entry(env);
4721 if (num < 0) return num;
4722 NENCLOSE(*np)->regnum = num;
4723 }
4724
4725 CHECK_NULL_RETURN_MEMERR(*np);
4726 r = fetch_token(tok, &p, end, env);
4727 if (r < 0) return r;
4728 r = parse_subexp(&target, tok, term, &p, end, env);
4729 if (r < 0) return r;
4730
4731 if (NTYPE(*np) == NT_ANCHOR)
4732 NANCHOR(*np)->target = target;
4733 else {
4734 NENCLOSE(*np)->target = target;
4735 if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
4736 /* Don't move this to previous of parse_subexp() */
4737 r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
4738 if (r != 0) return r;
4739 }
4740 }
4741
4742 *src = p;
4743 return 0;
4744 }
4745
/* Spelling of each "popular" quantifier; set_quantifier() indexes this with
   the value returned by popular_quantifier_num() when it builds the
   nested-repeat warning text. */
4746 static const char* PopularQStr[] = {
4747 "?", "*", "+", "??", "*?", "+?"
4748 };
4749
/* Replacement text shown in the nested-repeat warning; set_quantifier()
   indexes this with an entry of ReduceTypeTable. */
4750 static const char* ReduceQStr[] = {
4751 "", "", "*", "*?", "??", "+ and ??", "+? and ?"
4752 };
4753
4754 static int
set_quantifier(Node * qnode,Node * target,int group,ScanEnv * env)4755 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4756 {
4757 QtfrNode* qn;
4758
4759 qn = NQTFR(qnode);
4760 if (qn->lower == 1 && qn->upper == 1) {
4761 return 1;
4762 }
4763
4764 switch (NTYPE(target)) {
4765 case NT_STR:
4766 if (! group) {
4767 StrNode* sn = NSTR(target);
4768 if (str_node_can_be_split(sn, env->enc)) {
4769 Node* n = str_node_split_last_char(sn, env->enc);
4770 if (IS_NOT_NULL(n)) {
4771 qn->target = n;
4772 return 2;
4773 }
4774 }
4775 }
4776 break;
4777
4778 case NT_QTFR:
4779 { /* check redundant double repeat. */
4780 /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
4781 QtfrNode* qnt = NQTFR(target);
4782 int nestq_num = popular_quantifier_num(qn);
4783 int targetq_num = popular_quantifier_num(qnt);
4784
4785 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
4786 if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4787 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4788 UChar buf[WARN_BUFSIZE];
4789
4790 switch(ReduceTypeTable[targetq_num][nestq_num]) {
4791 case RQ_ASIS:
4792 break;
4793
4794 case RQ_DEL:
4795 if (onig_verb_warn != onig_null_warn) {
4796 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4797 env->pattern, env->pattern_end,
4798 (UChar* )"redundant nested repeat operator");
4799 (*onig_verb_warn)((char* )buf);
4800 }
4801 goto warn_exit;
4802 break;
4803
4804 default:
4805 if (onig_verb_warn != onig_null_warn) {
4806 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4807 env->pattern, env->pattern_end,
4808 (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4809 PopularQStr[targetq_num], PopularQStr[nestq_num],
4810 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4811 (*onig_verb_warn)((char* )buf);
4812 }
4813 goto warn_exit;
4814 break;
4815 }
4816 }
4817
4818 warn_exit:
4819 #endif
4820 if (targetq_num >= 0) {
4821 if (nestq_num >= 0) {
4822 onig_reduce_nested_quantifier(qnode, target);
4823 goto q_exit;
4824 }
4825 else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
4826 /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
4827 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4828 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4829 }
4830 }
4831 }
4832 }
4833 break;
4834
4835 default:
4836 break;
4837 }
4838
4839 qn->target = target;
4840 q_exit:
4841 return 0;
4842 }
4843
4844
4845 #ifdef USE_SHARED_CCLASS_TABLE
4846
4847 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8
4848
4849 /* for ctype node hash table */
4850
/* Key of the shared character-class table.  All three fields take part in
   both the comparison (type_cclass_cmp) and the hash (type_cclass_hash). */
4851 typedef struct {
4852 OnigEncoding enc;
4853 int not;
4854 int type;
4855 } type_cclass_key;
4856
/* Key comparison for the shared character-class table: returns 0 when the
   type, encoding and negation flag of both keys are equal, 1 otherwise. */
static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
{
  int differ = (x->type != y->type) ||
               (x->enc  != y->enc)  ||
               (x->not  != y->not);

  return differ ? 1 : 0;
}
4864
/*
 * Hash function for the shared character-class table.
 *
 * Mixes the bytes of key->enc and key->type with a multiply-by-997 step,
 * adds key->not and folds in the value shifted right by five bits.
 *
 * The accumulation is done in unsigned arithmetic: the products exceed the
 * range of int almost immediately, and overflow of a signed int is
 * undefined behaviour, whereas unsigned arithmetic wraps.
 */
static int type_cclass_hash(type_cclass_key* key)
{
  int i;
  unsigned int val;
  const UChar *p;

  val = 0;

  p = (const UChar* )&(key->enc);
  for (i = 0; i < (int )sizeof(key->enc); i++) {
    val = val * 997u + (unsigned int )*p++;
  }

  p = (const UChar* )(&key->type);
  for (i = 0; i < (int )sizeof(key->type); i++) {
    val = val * 997u + (unsigned int )*p++;
  }

  val += (unsigned int )key->not;
  return (int )(val + (val >> 5));
}
4885
/* Comparison / hash callbacks for tables keyed by type_cclass_key. */
4886 static struct st_hash_type type_type_cclass_hash = {
4887 type_cclass_cmp,
4888 type_cclass_hash,
4889 };
4890
/* Table of shared character-class nodes; its entries and the table itself
   are released by onig_free_shared_cclass_table(). */
4891 static st_table* OnigTypeCClassTable;
4892
4893
4894 static int
i_free_shared_class(type_cclass_key * key,Node * node,void * arg ARG_UNUSED)4895 i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
4896 {
4897 if (IS_NOT_NULL(node)) {
4898 CClassNode* cc = NCCLASS(node);
4899 if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
4900 xfree(node);
4901 }
4902
4903 if (IS_NOT_NULL(key)) xfree(key);
4904 return ST_DELETE;
4905 }
4906
4907 extern int
onig_free_shared_cclass_table(void)4908 onig_free_shared_cclass_table(void)
4909 {
4910 if (IS_NOT_NULL(OnigTypeCClassTable)) {
4911 onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
4912 onig_st_free_table(OnigTypeCClassTable);
4913 OnigTypeCClassTable = NULL;
4914 }
4915
4916 return 0;
4917 }
4918
4919 #endif /* USE_SHARED_CCLASS_TABLE */
4920
4921
4922 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4923 static int
clear_not_flag_cclass(CClassNode * cc,OnigEncoding enc)4924 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
4925 {
4926 BBuf *tbuf;
4927 int r;
4928
4929 if (IS_NCCLASS_NOT(cc)) {
4930 bitset_invert(cc->bs);
4931
4932 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4933 r = not_code_range_buf(enc, cc->mbuf, &tbuf);
4934 if (r != 0) return r;
4935
4936 bbuf_free(cc->mbuf);
4937 cc->mbuf = tbuf;
4938 }
4939
4940 NCCLASS_CLEAR_NOT(cc);
4941 }
4942
4943 return 0;
4944 }
4945 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
4946
/* Context handed to i_apply_case_fold() through its void* argument. */
4947 typedef struct {
4948 ScanEnv* env; /* parse environment (encoding, syntax) */
4949 CClassNode* cc; /* class being extended with case-fold variants */
4950 Node* alt_root; /* not touched by i_apply_case_fold(); presumably the head of the alternative list - confirm against the caller */
4951 Node** ptail; /* slot that receives the next alternative node for a multi-character fold */
4952 } IApplyCaseFoldArg;
4953
4954 static int
i_apply_case_fold(OnigCodePoint from,OnigCodePoint to[],int to_len,void * arg)4955 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
4956 int to_len, void* arg)
4957 {
4958 IApplyCaseFoldArg* iarg;
4959 ScanEnv* env;
4960 CClassNode* cc;
4961 BitSetRef bs;
4962
4963 iarg = (IApplyCaseFoldArg* )arg;
4964 env = iarg->env;
4965 cc = iarg->cc;
4966 bs = cc->bs;
4967
4968 if (to_len == 1) {
4969 int is_in = onig_is_code_in_cc(env->enc, from, cc);
4970 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4971 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
4972 (is_in == 0 && IS_NCCLASS_NOT(cc))) {
4973 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4974 add_code_range(&(cc->mbuf), env, *to, *to);
4975 }
4976 else {
4977 BITSET_SET_BIT(bs, *to);
4978 }
4979 }
4980 #else
4981 if (is_in != 0) {
4982 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4983 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
4984 add_code_range(&(cc->mbuf), env, *to, *to);
4985 }
4986 else {
4987 if (IS_NCCLASS_NOT(cc)) {
4988 BITSET_CLEAR_BIT(bs, *to);
4989 }
4990 else
4991 BITSET_SET_BIT(bs, *to);
4992 }
4993 }
4994 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
4995 }
4996 else {
4997 int r, i, len;
4998 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4999 Node *snode = NULL_NODE;
5000
5001 if (onig_is_code_in_cc(env->enc, from, cc)
5002 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5003 && !IS_NCCLASS_NOT(cc)
5004 #endif
5005 ) {
5006 for (i = 0; i < to_len; i++) {
5007 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
5008 if (i == 0) {
5009 snode = onig_node_new_str(buf, buf + len);
5010 CHECK_NULL_RETURN_MEMERR(snode);
5011
5012 /* char-class expanded multi-char only
5013 compare with string folded at match time. */
5014 NSTRING_SET_AMBIG(snode);
5015 }
5016 else {
5017 r = onig_node_str_cat(snode, buf, buf + len);
5018 if (r < 0) {
5019 onig_node_free(snode);
5020 return r;
5021 }
5022 }
5023 }
5024
5025 *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
5026 CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
5027 iarg->ptail = &(NCDR((*(iarg->ptail))));
5028 }
5029 }
5030
5031 return 0;
5032 }
5033
5034 static int
parse_exp(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5035 parse_exp(Node** np, OnigToken* tok, int term,
5036 UChar** src, UChar* end, ScanEnv* env)
5037 {
5038 int r, len, group = 0;
5039 Node* qn;
5040 Node** targetp;
5041
5042 *np = NULL;
5043 if (tok->type == (enum TokenSyms )term)
5044 goto end_of_token;
5045
5046 switch (tok->type) {
5047 case TK_ALT:
5048 case TK_EOT:
5049 end_of_token:
5050 *np = node_new_empty();
5051 return tok->type;
5052 break;
5053
5054 case TK_SUBEXP_OPEN:
5055 r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
5056 if (r < 0) return r;
5057 if (r == 1) group = 1;
5058 else if (r == 2) { /* option only */
5059 Node* target;
5060 OnigOptionType prev = env->option;
5061
5062 env->option = NENCLOSE(*np)->option;
5063 r = fetch_token(tok, src, end, env);
5064 if (r < 0) return r;
5065 r = parse_subexp(&target, tok, term, src, end, env);
5066 env->option = prev;
5067 if (r < 0) return r;
5068 NENCLOSE(*np)->target = target;
5069 return tok->type;
5070 }
5071 break;
5072
5073 case TK_SUBEXP_CLOSE:
5074 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
5075 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
5076
5077 if (tok->escaped) goto tk_raw_byte;
5078 else goto tk_byte;
5079 break;
5080
5081 case TK_STRING:
5082 tk_byte:
5083 {
5084 *np = node_new_str(tok->backp, *src);
5085 CHECK_NULL_RETURN_MEMERR(*np);
5086
5087 while (1) {
5088 r = fetch_token(tok, src, end, env);
5089 if (r < 0) return r;
5090 if (r != TK_STRING) break;
5091
5092 r = onig_node_str_cat(*np, tok->backp, *src);
5093 if (r < 0) return r;
5094 }
5095
5096 string_end:
5097 targetp = np;
5098 goto repeat;
5099 }
5100 break;
5101
5102 case TK_RAW_BYTE:
5103 tk_raw_byte:
5104 {
5105 *np = node_new_str_raw_char((UChar )tok->u.c);
5106 CHECK_NULL_RETURN_MEMERR(*np);
5107 len = 1;
5108 while (1) {
5109 if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
5110 if (len == enclen(env->enc, NSTR(*np)->s)) {
5111 r = fetch_token(tok, src, end, env);
5112 NSTRING_CLEAR_RAW(*np);
5113 goto string_end;
5114 }
5115 }
5116
5117 r = fetch_token(tok, src, end, env);
5118 if (r < 0) return r;
5119 if (r != TK_RAW_BYTE) {
5120 /* Don't use this, it is wrong for little endian encodings. */
5121 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
5122 int rem;
5123 if (len < ONIGENC_MBC_MINLEN(env->enc)) {
5124 rem = ONIGENC_MBC_MINLEN(env->enc) - len;
5125 (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
5126 if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
5127 NSTRING_CLEAR_RAW(*np);
5128 goto string_end;
5129 }
5130 }
5131 #endif
5132 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
5133 }
5134
5135 r = node_str_cat_char(*np, (UChar )tok->u.c);
5136 if (r < 0) return r;
5137
5138 len++;
5139 }
5140 }
5141 break;
5142
5143 case TK_CODE_POINT:
5144 {
5145 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5146 int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
5147 if (num < 0) return num;
5148 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
5149 *np = node_new_str_raw(buf, buf + num);
5150 #else
5151 *np = node_new_str(buf, buf + num);
5152 #endif
5153 CHECK_NULL_RETURN_MEMERR(*np);
5154 }
5155 break;
5156
5157 case TK_QUOTE_OPEN:
5158 {
5159 OnigCodePoint end_op[2];
5160 UChar *qstart, *qend, *nextp;
5161
5162 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
5163 end_op[1] = (OnigCodePoint )'E';
5164 qstart = *src;
5165 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
5166 if (IS_NULL(qend)) {
5167 nextp = qend = end;
5168 }
5169 *np = node_new_str(qstart, qend);
5170 CHECK_NULL_RETURN_MEMERR(*np);
5171 *src = nextp;
5172 }
5173 break;
5174
5175 case TK_CHAR_TYPE:
5176 {
5177 switch (tok->u.prop.ctype) {
5178 case ONIGENC_CTYPE_WORD:
5179 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
5180 CHECK_NULL_RETURN_MEMERR(*np);
5181 break;
5182
5183 case ONIGENC_CTYPE_SPACE:
5184 case ONIGENC_CTYPE_DIGIT:
5185 case ONIGENC_CTYPE_XDIGIT:
5186 {
5187 CClassNode* cc;
5188
5189 #ifdef USE_SHARED_CCLASS_TABLE
5190 const OnigCodePoint *mbr;
5191 OnigCodePoint sb_out;
5192
5193 r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
5194 &sb_out, &mbr);
5195 if (r == 0 &&
5196 ONIGENC_CODE_RANGE_NUM(mbr)
5197 >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
5198 type_cclass_key key;
5199 type_cclass_key* new_key;
5200
5201 key.enc = env->enc;
5202 key.not = tok->u.prop.not;
5203 key.type = tok->u.prop.ctype;
5204
5205 THREAD_ATOMIC_START;
5206
5207 if (IS_NULL(OnigTypeCClassTable)) {
5208 OnigTypeCClassTable
5209 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
5210 if (IS_NULL(OnigTypeCClassTable)) {
5211 THREAD_ATOMIC_END;
5212 return ONIGERR_MEMORY;
5213 }
5214 }
5215 else {
5216 if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
5217 (st_data_t* )np)) {
5218 THREAD_ATOMIC_END;
5219 break;
5220 }
5221 }
5222
5223 *np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
5224 sb_out, mbr);
5225 if (IS_NULL(*np)) {
5226 THREAD_ATOMIC_END;
5227 return ONIGERR_MEMORY;
5228 }
5229
5230 cc = NCCLASS(*np);
5231 NCCLASS_SET_SHARE(cc);
5232 new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
5233 xmemcpy(new_key, &key, sizeof(type_cclass_key));
5234 onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
5235 (st_data_t )*np);
5236
5237 THREAD_ATOMIC_END;
5238 }
5239 else {
5240 #endif
5241 *np = node_new_cclass();
5242 CHECK_NULL_RETURN_MEMERR(*np);
5243 cc = NCCLASS(*np);
5244 add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
5245 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
5246 #ifdef USE_SHARED_CCLASS_TABLE
5247 }
5248 #endif
5249 }
5250 break;
5251
5252 default:
5253 return ONIGERR_PARSER_BUG;
5254 break;
5255 }
5256 }
5257 break;
5258
5259 case TK_CHAR_PROPERTY:
5260 r = parse_char_property(np, tok, src, end, env);
5261 if (r != 0) return r;
5262 break;
5263
5264 case TK_CC_OPEN:
5265 {
5266 CClassNode* cc;
5267
5268 r = parse_char_class(np, tok, src, end, env);
5269 if (r != 0) return r;
5270
5271 cc = NCCLASS(*np);
5272 if (IS_IGNORECASE(env->option)) {
5273 IApplyCaseFoldArg iarg;
5274
5275 iarg.env = env;
5276 iarg.cc = cc;
5277 iarg.alt_root = NULL_NODE;
5278 iarg.ptail = &(iarg.alt_root);
5279
5280 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
5281 i_apply_case_fold, &iarg);
5282 if (r != 0) {
5283 onig_node_free(iarg.alt_root);
5284 return r;
5285 }
5286 if (IS_NOT_NULL(iarg.alt_root)) {
5287 Node* work = onig_node_new_alt(*np, iarg.alt_root);
5288 if (IS_NULL(work)) {
5289 onig_node_free(iarg.alt_root);
5290 return ONIGERR_MEMORY;
5291 }
5292 *np = work;
5293 }
5294 }
5295 }
5296 break;
5297
5298 case TK_ANYCHAR:
5299 *np = node_new_anychar();
5300 CHECK_NULL_RETURN_MEMERR(*np);
5301 break;
5302
5303 case TK_ANYCHAR_ANYTIME:
5304 *np = node_new_anychar();
5305 CHECK_NULL_RETURN_MEMERR(*np);
5306 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5307 CHECK_NULL_RETURN_MEMERR(qn);
5308 NQTFR(qn)->target = *np;
5309 *np = qn;
5310 break;
5311
5312 case TK_BACKREF:
5313 len = tok->u.backref.num;
5314 *np = node_new_backref(len,
5315 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5316 tok->u.backref.by_name,
5317 #ifdef USE_BACKREF_WITH_LEVEL
5318 tok->u.backref.exist_level,
5319 tok->u.backref.level,
5320 #endif
5321 env);
5322 CHECK_NULL_RETURN_MEMERR(*np);
5323 break;
5324
5325 #ifdef USE_SUBEXP_CALL
5326 case TK_CALL:
5327 {
5328 int gnum = tok->u.call.gnum;
5329
5330 if (gnum < 0) {
5331 gnum = BACKREF_REL_TO_ABS(gnum, env);
5332 if (gnum <= 0)
5333 return ONIGERR_INVALID_BACKREF;
5334 }
5335 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
5336 CHECK_NULL_RETURN_MEMERR(*np);
5337 env->num_call++;
5338 }
5339 break;
5340 #endif
5341
5342 case TK_ANCHOR:
5343 *np = onig_node_new_anchor(tok->u.anchor);
5344 break;
5345
5346 case TK_OP_REPEAT:
5347 case TK_INTERVAL:
5348 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5349 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5350 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5351 else
5352 *np = node_new_empty();
5353 }
5354 else {
5355 goto tk_byte;
5356 }
5357 break;
5358
5359 default:
5360 return ONIGERR_PARSER_BUG;
5361 break;
5362 }
5363
5364 {
5365 targetp = np;
5366
5367 re_entry:
5368 r = fetch_token(tok, src, end, env);
5369 if (r < 0) return r;
5370
5371 repeat:
5372 if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5373 if (is_invalid_quantifier_target(*targetp))
5374 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5375
5376 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5377 (r == TK_INTERVAL ? 1 : 0));
5378 CHECK_NULL_RETURN_MEMERR(qn);
5379 NQTFR(qn)->greedy = tok->u.repeat.greedy;
5380 r = set_quantifier(qn, *targetp, group, env);
5381 if (r < 0) {
5382 onig_node_free(qn);
5383 return r;
5384 }
5385
5386 if (tok->u.repeat.possessive != 0) {
5387 Node* en;
5388 en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5389 if (IS_NULL(en)) {
5390 onig_node_free(qn);
5391 return ONIGERR_MEMORY;
5392 }
5393 NENCLOSE(en)->target = qn;
5394 qn = en;
5395 }
5396
5397 if (r == 0) {
5398 *targetp = qn;
5399 }
5400 else if (r == 1) {
5401 onig_node_free(qn);
5402 }
5403 else if (r == 2) { /* split case: /abc+/ */
5404 Node *tmp;
5405
5406 *targetp = node_new_list(*targetp, NULL);
5407 if (IS_NULL(*targetp)) {
5408 onig_node_free(qn);
5409 return ONIGERR_MEMORY;
5410 }
5411 tmp = NCDR(*targetp) = node_new_list(qn, NULL);
5412 if (IS_NULL(tmp)) {
5413 onig_node_free(qn);
5414 return ONIGERR_MEMORY;
5415 }
5416 targetp = &(NCAR(tmp));
5417 }
5418 goto re_entry;
5419 }
5420 }
5421
5422 return r;
5423 }
5424
5425 static int
parse_branch(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5426 parse_branch(Node** top, OnigToken* tok, int term,
5427 UChar** src, UChar* end, ScanEnv* env)
5428 {
5429 int r;
5430 Node *node, **headp;
5431
5432 *top = NULL;
5433 r = parse_exp(&node, tok, term, src, end, env);
5434 if (r < 0) return r;
5435
5436 if (r == TK_EOT || r == term || r == TK_ALT) {
5437 *top = node;
5438 }
5439 else {
5440 *top = node_new_list(node, NULL);
5441 headp = &(NCDR(*top));
5442 while (r != TK_EOT && r != term && r != TK_ALT) {
5443 r = parse_exp(&node, tok, term, src, end, env);
5444 if (r < 0) return r;
5445
5446 if (NTYPE(node) == NT_LIST) {
5447 *headp = node;
5448 while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
5449 headp = &(NCDR(node));
5450 }
5451 else {
5452 *headp = node_new_list(node, NULL);
5453 headp = &(NCDR(*headp));
5454 }
5455 }
5456 }
5457
5458 return r;
5459 }
5460
5461 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
5462 static int
parse_subexp(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5463 parse_subexp(Node** top, OnigToken* tok, int term,
5464 UChar** src, UChar* end, ScanEnv* env)
5465 {
5466 int r;
5467 Node *node, **headp;
5468
5469 *top = NULL;
5470 r = parse_branch(&node, tok, term, src, end, env);
5471 if (r < 0) {
5472 onig_node_free(node);
5473 return r;
5474 }
5475
5476 if (r == term) {
5477 *top = node;
5478 }
5479 else if (r == TK_ALT) {
5480 *top = onig_node_new_alt(node, NULL);
5481 headp = &(NCDR(*top));
5482 while (r == TK_ALT) {
5483 r = fetch_token(tok, src, end, env);
5484 if (r < 0) return r;
5485 r = parse_branch(&node, tok, term, src, end, env);
5486 if (r < 0) return r;
5487
5488 *headp = onig_node_new_alt(node, NULL);
5489 headp = &(NCDR(*headp));
5490 }
5491
5492 if (tok->type != (enum TokenSyms )term)
5493 goto err;
5494 }
5495 else {
5496 err:
5497 if (term == TK_SUBEXP_CLOSE)
5498 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5499 else
5500 return ONIGERR_PARSER_BUG;
5501 }
5502
5503 return r;
5504 }
5505
5506 static int
parse_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)5507 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5508 {
5509 int r;
5510 OnigToken tok;
5511
5512 r = fetch_token(&tok, src, end, env);
5513 if (r < 0) return r;
5514 r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5515 if (r < 0) return r;
5516 return 0;
5517 }
5518
5519 extern int
onig_parse_make_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)5520 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
5521 regex_t* reg, ScanEnv* env)
5522 {
5523 int r;
5524 UChar* p;
5525
5526 #ifdef USE_NAMED_GROUP
5527 names_clear(reg);
5528 #endif
5529
5530 scan_env_clear(env);
5531 env->option = reg->options;
5532 env->case_fold_flag = reg->case_fold_flag;
5533 env->enc = reg->enc;
5534 env->syntax = reg->syntax;
5535 env->pattern = (UChar* )pattern;
5536 env->pattern_end = (UChar* )end;
5537 env->reg = reg;
5538
5539 *root = NULL;
5540 p = (UChar* )pattern;
5541 r = parse_regexp(root, &p, (UChar* )end, env);
5542 reg->num_mem = env->num_mem;
5543 return r;
5544 }
5545
5546 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode ARG_UNUSED,UChar * arg,UChar * arg_end)5547 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
5548 UChar* arg, UChar* arg_end)
5549 {
5550 env->error = arg;
5551 env->error_end = arg_end;
5552 }
5553