1 /**********************************************************************
2 regparse.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2017 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29 #include "regparse.h"
30 #include "st.h"
31
32 #ifdef DEBUG_NODE_FREE
33 #include <stdio.h>
34 #endif
35
36 #define WARN_BUFSIZE 256
37
38 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
39
40
/* Syntax descriptor for Ruby-style patterns.
 * Judging by the flag prefixes, the initializer supplies in order: the
 * ONIG_SYN_OP_* operator set, the ONIG_SYN_OP2_* operator set, the syntax
 * behavior set, the default compile options, and the meta character table.
 * NOTE(review): field names live in the header, not in this file -- confirm. */
OnigSyntaxType OnigSyntaxRuby = {
  (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
     ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
     ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_O_BRACE_OCTAL |
     ONIG_SYN_OP_ESC_CONTROL_CHARS |
     ONIG_SYN_OP_ESC_C_CONTROL )
   /* the \< \> word begin/end escapes are explicitly masked out */
   & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
  , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
      ONIG_SYN_OP2_OPTION_RUBY |
      ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
      ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
      ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  |
      ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
      ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
      ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
      ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
      ONIG_SYN_OP2_ESC_H_XDIGIT )
  , ( SYN_GNU_REGEX_BV |
      ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
      ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
      ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
      ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
      ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
      ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
      ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
  , ONIG_OPTION_NONE
  ,
  {
      /* only the escape character is active; every other meta character
       * slot is marked ineffective */
      (OnigCodePoint )'\\'                       /* esc */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
  }
};
77
/* Process-wide default syntax; initialized to the Ruby syntax. */
OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
79
/* Warning sink that discards its message.  It also serves as the sentinel
 * value meaning "no warning callback installed". */
extern void
onig_null_warn(const char* s ARG_UNUSED)
{
}
81
/* Current warning callback.  Starts as DEFAULT_WARN_FUNCTION when the build
 * defines it, otherwise as the do-nothing onig_null_warn. */
#ifdef DEFAULT_WARN_FUNCTION
static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
#else
static OnigWarnFunc onig_warn = onig_null_warn;
#endif

/* Current verbose-warning callback, selected the same way from
 * DEFAULT_VERB_WARN_FUNCTION. */
#ifdef DEFAULT_VERB_WARN_FUNCTION
static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
#else
static OnigWarnFunc onig_verb_warn = onig_null_warn;
#endif
93
/* Install `f` as the callback invoked by onig_warning(). */
extern void
onig_set_warn_func(OnigWarnFunc f)
{
  onig_warn = f;
}
98
/* Install `f` as the verbose-warning callback. */
extern void
onig_set_verb_warn_func(OnigWarnFunc f)
{
  onig_verb_warn = f;
}
103
104 extern void
onig_warning(const char * s)105 onig_warning(const char* s)
106 {
107 if (onig_warn == onig_null_warn) return ;
108
109 (*onig_warn)(s);
110 }
111
/* Upper bound on capture groups per pattern; changed through
 * onig_set_capture_num_limit().  scan_env_add_mem_entry() treats 0 as
 * "no limit". */
#define DEFAULT_MAX_CAPTURE_NUM   32767

static int MaxCaptureNum = DEFAULT_MAX_CAPTURE_NUM;
115
116 extern int
onig_set_capture_num_limit(int num)117 onig_set_capture_num_limit(int num)
118 {
119 if (num < 0) return -1;
120
121 MaxCaptureNum = num;
122 return 0;
123 }
124
/* Current parse depth limit; read and written through the
 * onig_get/set_parse_depth_limit() accessors. */
static unsigned int ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
126
127 extern unsigned int
onig_get_parse_depth_limit(void)128 onig_get_parse_depth_limit(void)
129 {
130 return ParseDepthLimit;
131 }
132
133 extern int
onig_set_parse_depth_limit(unsigned int depth)134 onig_set_parse_depth_limit(unsigned int depth)
135 {
136 if (depth == 0)
137 ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
138 else
139 ParseDepthLimit = depth;
140 return 0;
141 }
142
143
144 static void
bbuf_free(BBuf * bbuf)145 bbuf_free(BBuf* bbuf)
146 {
147 if (IS_NOT_NULL(bbuf)) {
148 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
149 xfree(bbuf);
150 }
151 }
152
153 static int
bbuf_clone(BBuf ** rto,BBuf * from)154 bbuf_clone(BBuf** rto, BBuf* from)
155 {
156 int r;
157 BBuf *to;
158
159 *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
160 CHECK_NULL_RETURN_MEMERR(to);
161 r = BBUF_INIT(to, from->alloc);
162 if (r != 0) return r;
163 to->used = from->used;
164 xmemcpy(to->p, from->p, from->used);
165 return 0;
166 }
167
/* Convert a relative back reference number into an absolute group number,
 * counted from the groups seen so far in `env`: num_mem + 1 + rel_no. */
#define BACKREF_REL_TO_ABS(rel_no, env) \
  ((env)->num_mem + 1 + (rel_no))
170
/* Clear the bits `f` in `v` when `negative` is non-zero, otherwise set them.
 * The whole expansion is parenthesized so that it stays a single operand
 * when used inside a larger expression. */
#define ONOFF(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
172
/* First code point of the multi-byte range: 0 when the encoding's minimum
 * character length exceeds one byte, otherwise 0x80. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)

/* Add the range from MBCODE_START_POS(enc) up to the largest OnigCodePoint
 * to `pbuf`; evaluates to add_code_range_to_buf()'s result. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/* For encodings that are not single-byte, add the full multi-byte range to
 * `mbuf`.  Requires an `int r` in the calling scope and RETURNS from the
 * enclosing function when the add fails. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r) return r;\
  }\
} while (0)
185
186
/* Store 1 in `empty` when every word of bit set `bs` is zero, else 0.
 * The macro declares its own loop counter `i`, so `bs` and `empty` must
 * not be expressions that refer to a caller variable named `i`. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i;\
  empty = 1;\
  for (i = 0; i < (int )BITSET_SIZE; i++) {\
    if ((bs)[i] != 0) {\
      empty = 0; break;\
    }\
  }\
} while (0)
196
197 static void
bitset_set_range(BitSetRef bs,int from,int to)198 bitset_set_range(BitSetRef bs, int from, int to)
199 {
200 int i;
201 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
202 BITSET_SET_BIT(bs, i);
203 }
204 }
205
/* Compiled out: would set every bit of `bs`. */
#if 0
static void
bitset_set_all(BitSetRef bs)
{
  int i;
  for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); }
}
#endif
214
215 static void
bitset_invert(BitSetRef bs)216 bitset_invert(BitSetRef bs)
217 {
218 int i;
219 for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
220 }
221
222 static void
bitset_invert_to(BitSetRef from,BitSetRef to)223 bitset_invert_to(BitSetRef from, BitSetRef to)
224 {
225 int i;
226 for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
227 }
228
229 static void
bitset_and(BitSetRef dest,BitSetRef bs)230 bitset_and(BitSetRef dest, BitSetRef bs)
231 {
232 int i;
233 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
234 }
235
236 static void
bitset_or(BitSetRef dest,BitSetRef bs)237 bitset_or(BitSetRef dest, BitSetRef bs)
238 {
239 int i;
240 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
241 }
242
243 static void
bitset_copy(BitSetRef dest,BitSetRef bs)244 bitset_copy(BitSetRef dest, BitSetRef bs)
245 {
246 int i;
247 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
248 }
249
250 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)251 onig_strncmp(const UChar* s1, const UChar* s2, int n)
252 {
253 int x;
254
255 while (n-- > 0) {
256 x = *s2++ - *s1++;
257 if (x) return x;
258 }
259 return 0;
260 }
261
262 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)263 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
264 {
265 int len = end - src;
266 if (len > 0) {
267 xmemcpy(dest, src, len);
268 dest[len] = (UChar )0;
269 }
270 }
271
272 #ifdef USE_NAMED_GROUP
273 static UChar*
strdup_with_null(OnigEncoding enc,UChar * s,UChar * end)274 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
275 {
276 int slen, term_len, i;
277 UChar *r;
278
279 slen = end - s;
280 term_len = ONIGENC_MBC_MINLEN(enc);
281
282 r = (UChar* )xmalloc(slen + term_len);
283 CHECK_NULL_RETURN(r);
284 xmemcpy(r, s, slen);
285
286 for (i = 0; i < term_len; i++)
287 r[slen + i] = (UChar )0;
288
289 return r;
290 }
291 #endif
292
293 /* scan pattern methods */
/* The macros below expect locals named `p` (cursor), `end` (pattern end)
 * and `enc` (encoding) in the calling scope; those that mention
 * pfetch_prev also need PFETCH_READY to have been declared. */

/* Value PPEEK yields at end of pattern. */
#define PEND_VALUE   0

/* Declares the variable that remembers the position before the last fetch. */
#define PFETCH_READY  UChar* pfetch_prev
/* 1 when the cursor has reached `end`, else 0. */
#define PEND         (p < end ?  0 : 1)
/* Step back to the position saved by the last PFETCH/PINC. */
#define PUNFETCH     p = pfetch_prev
/* Skip one character, saving the old position.  Unlike PFETCH, the cursor
 * is not clamped to `end`. */
#define PINC       do { \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
/* Read the code point at the cursor into `c`, save the old position and
 * advance; the cursor is clamped so it never passes `end`. */
#define PFETCH(c)  do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
  if(UNEXPECTED(p > end)) p = end; \
} while (0)

/* Variants of PINC/PFETCH that do not record pfetch_prev; both clamp the
 * cursor to `end`. */
#define PINC_S     do { \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
  if(UNEXPECTED(p > end)) p = end; \
} while (0)
#define PFETCH_S(c) do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
  if(UNEXPECTED(p > end)) p = end; \
} while (0)

/* Code point at the cursor without advancing, or PEND_VALUE at the end. */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
/* True when the next code point equals `c`. */
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )c)
322
323 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)324 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
325 int capa)
326 {
327 UChar* r;
328
329 if (dest)
330 r = (UChar* )xrealloc(dest, capa + 1);
331 else
332 r = (UChar* )xmalloc(capa + 1);
333
334 CHECK_NULL_RETURN(r);
335 onig_strcpy(r + (dest_end - dest), src, src_end);
336 return r;
337 }
338
339 /* dest on static area */
340 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)341 strcat_capa_from_static(UChar* dest, UChar* dest_end,
342 const UChar* src, const UChar* src_end, int capa)
343 {
344 UChar* r;
345
346 r = (UChar* )xmalloc(capa + 1);
347 CHECK_NULL_RETURN(r);
348 onig_strcpy(r, dest, dest_end);
349 onig_strcpy(r + (dest_end - dest), src, src_end);
350 return r;
351 }
352
353
354 #ifdef USE_ST_LIBRARY
355
/* Hash table key: the byte range [s, end). */
typedef struct {
  UChar* s;    /* first byte of the key */
  UChar* end;  /* one past the last byte of the key */
} st_str_end_key;
360
361 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)362 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
363 {
364 UChar *p, *q;
365 int c;
366
367 if ((x->end - x->s) != (y->end - y->s))
368 return 1;
369
370 p = x->s;
371 q = y->s;
372 while (p < x->end) {
373 c = (int )*p - (int )*q;
374 if (c != 0) return c;
375
376 p++; q++;
377 }
378
379 return 0;
380 }
381
382 static int
str_end_hash(st_str_end_key * x)383 str_end_hash(st_str_end_key* x)
384 {
385 UChar *p;
386 int val = 0;
387
388 p = x->s;
389 while (p < x->end) {
390 val = val * 997 + (int )*p++;
391 }
392
393 return val + (val >> 5);
394 }
395
/* Create a hash table whose keys are st_str_end_key byte ranges, compared
 * with str_end_cmp and hashed with str_end_hash.  `size` is handed to
 * onig_st_init_table_with_size unchanged. */
extern hash_table_type*
onig_st_init_strend_table_with_size(int size)
{
  /* shared by every table created here */
  static struct st_hash_type hashType = {
    str_end_cmp,
    str_end_hash,
  };

  return (hash_table_type* )
           onig_st_init_table_with_size(&hashType, size);
}
407
408 extern int
onig_st_lookup_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type * value)409 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
410 const UChar* end_key, hash_data_type *value)
411 {
412 st_str_end_key key;
413
414 key.s = (UChar* )str_key;
415 key.end = (UChar* )end_key;
416
417 return onig_st_lookup(table, (st_data_t )(&key), value);
418 }
419
420 extern int
onig_st_insert_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type value)421 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
422 const UChar* end_key, hash_data_type value)
423 {
424 st_str_end_key* key;
425 int result;
426
427 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
428 key->s = (UChar* )str_key;
429 key->end = (UChar* )end_key;
430 result = onig_st_insert(table, (st_data_t )key, value);
431 if (result) {
432 xfree(key);
433 }
434 return result;
435 }
436
437 #endif /* USE_ST_LIBRARY */
438
439
440 #ifdef USE_NAMED_GROUP
441
/* Initial capacity of NameEntry.back_refs once a name has two groups. */
#define INIT_NAME_BACKREFS_ALLOC_NUM   8

/* One group name and the group numbers defined under it. */
typedef struct {
  UChar* name;       /* owned copy of the name */
  int    name_len;   /* byte length */
  int    back_num;   /* number of backrefs */
  int    back_alloc; /* capacity of back_refs */
  int    back_ref1;  /* the group number while back_num == 1 */
  int*   back_refs;  /* all group numbers once back_num > 1 */
} NameEntry;
452
453 #ifdef USE_ST_LIBRARY
454
/* With the st library the name table is an st hash table. */
typedef st_table  NameTable;
typedef st_data_t HashDataType;   /* 1.6 st.h doesn't define st_data_t type */

/* NOTE(review): not referenced in the part of the file visible here. */
#define NAMEBUF_SIZE    24
#define NAMEBUF_SIZE_1  25
460
461 #ifdef ONIG_DEBUG
462 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)463 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
464 {
465 int i;
466 FILE* fp = (FILE* )arg;
467
468 fprintf(fp, "%s: ", e->name);
469 if (e->back_num == 0)
470 fputs("-", fp);
471 else if (e->back_num == 1)
472 fprintf(fp, "%d", e->back_ref1);
473 else {
474 for (i = 0; i < e->back_num; i++) {
475 if (i > 0) fprintf(fp, ", ");
476 fprintf(fp, "%d", e->back_refs[i]);
477 }
478 }
479 fputs("\n", fp);
480 return ST_CONTINUE;
481 }
482
483 extern int
onig_print_names(FILE * fp,regex_t * reg)484 onig_print_names(FILE* fp, regex_t* reg)
485 {
486 NameTable* t = (NameTable* )reg->name_table;
487
488 if (IS_NOT_NULL(t)) {
489 fprintf(fp, "name table\n");
490 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
491 fputs("\n", fp);
492 }
493 return 0;
494 }
495 #endif /* ONIG_DEBUG */
496
497 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)498 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
499 {
500 xfree(e->name);
501 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
502 xfree(key);
503 xfree(e);
504 return ST_DELETE;
505 }
506
507 static int
names_clear(regex_t * reg)508 names_clear(regex_t* reg)
509 {
510 NameTable* t = (NameTable* )reg->name_table;
511
512 if (IS_NOT_NULL(t)) {
513 onig_st_foreach(t, i_free_name_entry, 0);
514 }
515 return 0;
516 }
517
518 extern int
onig_names_free(regex_t * reg)519 onig_names_free(regex_t* reg)
520 {
521 int r;
522 NameTable* t;
523
524 r = names_clear(reg);
525 if (r) return r;
526
527 t = (NameTable* )reg->name_table;
528 if (IS_NOT_NULL(t)) onig_st_free_table(t);
529 reg->name_table = (void* )NULL;
530 return 0;
531 }
532
533 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)534 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
535 {
536 NameEntry* e;
537 NameTable* t = (NameTable* )reg->name_table;
538
539 e = (NameEntry* )NULL;
540 if (IS_NOT_NULL(t)) {
541 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
542 }
543 return e;
544 }
545
/* Context handed to i_names while onig_foreach_name walks the table. */
typedef struct {
  int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*); /* user callback */
  regex_t* reg;       /* regex being enumerated, passed on to func */
  void* arg;          /* opaque user argument, passed on to func */
  int ret;            /* first non-zero result of func, else 0 */
  OnigEncoding enc;   /* set by onig_foreach_name; not read by i_names */
} INamesArg;
553
554 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)555 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
556 {
557 int r = (*(arg->func))(e->name,
558 e->name + e->name_len,
559 e->back_num,
560 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
561 arg->reg, arg->arg);
562 if (r != 0) {
563 arg->ret = r;
564 return ST_STOP;
565 }
566 return ST_CONTINUE;
567 }
568
569 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)570 onig_foreach_name(regex_t* reg,
571 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
572 {
573 INamesArg narg;
574 NameTable* t = (NameTable* )reg->name_table;
575
576 narg.ret = 0;
577 if (IS_NOT_NULL(t)) {
578 narg.func = func;
579 narg.reg = reg;
580 narg.arg = arg;
581 narg.enc = reg->enc; /* should be pattern encoding. */
582 onig_st_foreach(t, i_names, (HashDataType )&narg);
583 }
584 return narg.ret;
585 }
586
587 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumRemap * map)588 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
589 {
590 int i;
591
592 if (e->back_num > 1) {
593 for (i = 0; i < e->back_num; i++) {
594 e->back_refs[i] = map[e->back_refs[i]].new_val;
595 }
596 }
597 else if (e->back_num == 1) {
598 e->back_ref1 = map[e->back_ref1].new_val;
599 }
600
601 return ST_CONTINUE;
602 }
603
604 extern int
onig_renumber_name_table(regex_t * reg,GroupNumRemap * map)605 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
606 {
607 NameTable* t = (NameTable* )reg->name_table;
608
609 if (IS_NOT_NULL(t)) {
610 onig_st_foreach(t, i_renumber_name, (HashDataType )map);
611 }
612 return 0;
613 }
614
615
616 extern int
onig_number_of_names(regex_t * reg)617 onig_number_of_names(regex_t* reg)
618 {
619 NameTable* t = (NameTable* )reg->name_table;
620
621 if (IS_NOT_NULL(t))
622 return t->num_entries;
623 else
624 return 0;
625 }
626
627 #else /* USE_ST_LIBRARY */
628
/* Initial number of slots of the array-based name table. */
#define INIT_NAMES_ALLOC_NUM    8

/* Array-based name table used when the st library is not available. */
typedef struct {
  NameEntry* e;     /* entry array */
  int        num;   /* entries in use */
  int        alloc; /* allocated slots in e */
} NameTable;
636
637 #ifdef ONIG_DEBUG
638 extern int
onig_print_names(FILE * fp,regex_t * reg)639 onig_print_names(FILE* fp, regex_t* reg)
640 {
641 int i, j;
642 NameEntry* e;
643 NameTable* t = (NameTable* )reg->name_table;
644
645 if (IS_NOT_NULL(t) && t->num > 0) {
646 fprintf(fp, "name table\n");
647 for (i = 0; i < t->num; i++) {
648 e = &(t->e[i]);
649 fprintf(fp, "%s: ", e->name);
650 if (e->back_num == 0) {
651 fputs("-", fp);
652 }
653 else if (e->back_num == 1) {
654 fprintf(fp, "%d", e->back_ref1);
655 }
656 else {
657 for (j = 0; j < e->back_num; j++) {
658 if (j > 0) fprintf(fp, ", ");
659 fprintf(fp, "%d", e->back_refs[j]);
660 }
661 }
662 fputs("\n", fp);
663 }
664 fputs("\n", fp);
665 }
666 return 0;
667 }
668 #endif
669
670 static int
names_clear(regex_t * reg)671 names_clear(regex_t* reg)
672 {
673 int i;
674 NameEntry* e;
675 NameTable* t = (NameTable* )reg->name_table;
676
677 if (IS_NOT_NULL(t)) {
678 for (i = 0; i < t->num; i++) {
679 e = &(t->e[i]);
680 if (IS_NOT_NULL(e->name)) {
681 xfree(e->name);
682 e->name = NULL;
683 e->name_len = 0;
684 e->back_num = 0;
685 e->back_alloc = 0;
686 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
687 e->back_refs = (int* )NULL;
688 }
689 }
690 if (IS_NOT_NULL(t->e)) {
691 xfree(t->e);
692 t->e = NULL;
693 }
694 t->num = 0;
695 }
696 return 0;
697 }
698
699 extern int
onig_names_free(regex_t * reg)700 onig_names_free(regex_t* reg)
701 {
702 int r;
703 NameTable* t;
704
705 r = names_clear(reg);
706 if (r) return r;
707
708 t = (NameTable* )reg->name_table;
709 if (IS_NOT_NULL(t)) xfree(t);
710 reg->name_table = NULL;
711 return 0;
712 }
713
714 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)715 name_find(regex_t* reg, UChar* name, UChar* name_end)
716 {
717 int i, len;
718 NameEntry* e;
719 NameTable* t = (NameTable* )reg->name_table;
720
721 if (IS_NOT_NULL(t)) {
722 len = name_end - name;
723 for (i = 0; i < t->num; i++) {
724 e = &(t->e[i]);
725 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
726 return e;
727 }
728 }
729 return (NameEntry* )NULL;
730 }
731
732 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)733 onig_foreach_name(regex_t* reg,
734 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
735 {
736 int i, r;
737 NameEntry* e;
738 NameTable* t = (NameTable* )reg->name_table;
739
740 if (IS_NOT_NULL(t)) {
741 for (i = 0; i < t->num; i++) {
742 e = &(t->e[i]);
743 r = (*func)(e->name, e->name + e->name_len, e->back_num,
744 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
745 reg, arg);
746 if (r != 0) return r;
747 }
748 }
749 return 0;
750 }
751
752 extern int
onig_number_of_names(regex_t * reg)753 onig_number_of_names(regex_t* reg)
754 {
755 NameTable* t = (NameTable* )reg->name_table;
756
757 if (IS_NOT_NULL(t))
758 return t->num;
759 else
760 return 0;
761 }
762
763 #endif /* else USE_ST_LIBRARY */
764
765 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)766 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
767 {
768 int alloc;
769 NameEntry* e;
770 NameTable* t = (NameTable* )reg->name_table;
771
772 if (name_end - name <= 0)
773 return ONIGERR_EMPTY_GROUP_NAME;
774
775 e = name_find(reg, name, name_end);
776 if (IS_NULL(e)) {
777 #ifdef USE_ST_LIBRARY
778 if (IS_NULL(t)) {
779 t = onig_st_init_strend_table_with_size(5);
780 reg->name_table = (void* )t;
781 }
782 e = (NameEntry* )xmalloc(sizeof(NameEntry));
783 CHECK_NULL_RETURN_MEMERR(e);
784
785 e->name = strdup_with_null(reg->enc, name, name_end);
786 if (IS_NULL(e->name)) {
787 xfree(e); return ONIGERR_MEMORY;
788 }
789 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
790 (HashDataType )e);
791
792 e->name_len = name_end - name;
793 e->back_num = 0;
794 e->back_alloc = 0;
795 e->back_refs = (int* )NULL;
796
797 #else
798
799 if (IS_NULL(t)) {
800 alloc = INIT_NAMES_ALLOC_NUM;
801 t = (NameTable* )xmalloc(sizeof(NameTable));
802 CHECK_NULL_RETURN_MEMERR(t);
803 t->e = NULL;
804 t->alloc = 0;
805 t->num = 0;
806
807 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
808 if (IS_NULL(t->e)) {
809 xfree(t);
810 return ONIGERR_MEMORY;
811 }
812 t->alloc = alloc;
813 reg->name_table = t;
814 goto clear;
815 }
816 else if (t->num == t->alloc) {
817 int i;
818
819 alloc = t->alloc * 2;
820 t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
821 CHECK_NULL_RETURN_MEMERR(t->e);
822 t->alloc = alloc;
823
824 clear:
825 for (i = t->num; i < t->alloc; i++) {
826 t->e[i].name = NULL;
827 t->e[i].name_len = 0;
828 t->e[i].back_num = 0;
829 t->e[i].back_alloc = 0;
830 t->e[i].back_refs = (int* )NULL;
831 }
832 }
833 e = &(t->e[t->num]);
834 t->num++;
835 e->name = strdup_with_null(reg->enc, name, name_end);
836 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
837 e->name_len = name_end - name;
838 #endif
839 }
840
841 if (e->back_num >= 1 &&
842 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
843 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
844 name, name_end);
845 return ONIGERR_MULTIPLEX_DEFINED_NAME;
846 }
847
848 e->back_num++;
849 if (e->back_num == 1) {
850 e->back_ref1 = backref;
851 }
852 else {
853 if (e->back_num == 2) {
854 alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
855 e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
856 CHECK_NULL_RETURN_MEMERR(e->back_refs);
857 e->back_alloc = alloc;
858 e->back_refs[0] = e->back_ref1;
859 e->back_refs[1] = backref;
860 }
861 else {
862 if (e->back_num > e->back_alloc) {
863 alloc = e->back_alloc * 2;
864 e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
865 CHECK_NULL_RETURN_MEMERR(e->back_refs);
866 e->back_alloc = alloc;
867 }
868 e->back_refs[e->back_num - 1] = backref;
869 }
870 }
871
872 return 0;
873 }
874
875 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)876 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
877 const UChar* name_end, int** nums)
878 {
879 NameEntry* e = name_find(reg, name, name_end);
880
881 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
882
883 switch (e->back_num) {
884 case 0:
885 break;
886 case 1:
887 *nums = &(e->back_ref1);
888 break;
889 default:
890 *nums = e->back_refs;
891 break;
892 }
893 return e->back_num;
894 }
895
896 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)897 onig_name_to_backref_number(regex_t* reg, const UChar* name,
898 const UChar* name_end, OnigRegion *region)
899 {
900 int i, n, *nums;
901
902 n = onig_name_to_group_numbers(reg, name, name_end, &nums);
903 if (n < 0)
904 return n;
905 else if (n == 0)
906 return ONIGERR_PARSER_BUG;
907 else if (n == 1)
908 return nums[0];
909 else {
910 if (IS_NOT_NULL(region)) {
911 for (i = n - 1; i >= 0; i--) {
912 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
913 return nums[i];
914 }
915 }
916 return nums[n - 1];
917 }
918 }
919
920 #else /* USE_NAMED_GROUP */
921
922 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)923 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
924 const UChar* name_end, int** nums)
925 {
926 return ONIG_NO_SUPPORT_CONFIG;
927 }
928
929 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)930 onig_name_to_backref_number(regex_t* reg, const UChar* name,
931 const UChar* name_end, OnigRegion* region)
932 {
933 return ONIG_NO_SUPPORT_CONFIG;
934 }
935
936 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)937 onig_foreach_name(regex_t* reg,
938 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
939 {
940 return ONIG_NO_SUPPORT_CONFIG;
941 }
942
943 extern int
onig_number_of_names(regex_t * reg)944 onig_number_of_names(regex_t* reg)
945 {
946 return 0;
947 }
948 #endif /* else USE_NAMED_GROUP */
949
950 extern int
onig_noname_group_capture_is_active(regex_t * reg)951 onig_noname_group_capture_is_active(regex_t* reg)
952 {
953 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
954 return 0;
955
956 #ifdef USE_NAMED_GROUP
957 if (onig_number_of_names(reg) > 0 &&
958 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
959 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
960 return 0;
961 }
962 #endif
963
964 return 1;
965 }
966
967
/* Size of the first heap-allocated group node table of a ScanEnv. */
#define INIT_SCANENV_MEMNODES_ALLOC_SIZE   16
969
970 static void
scan_env_clear(ScanEnv * env)971 scan_env_clear(ScanEnv* env)
972 {
973 int i;
974
975 BIT_STATUS_CLEAR(env->capture_history);
976 BIT_STATUS_CLEAR(env->bt_mem_start);
977 BIT_STATUS_CLEAR(env->bt_mem_end);
978 BIT_STATUS_CLEAR(env->backrefed_mem);
979 env->error = (UChar* )NULL;
980 env->error_end = (UChar* )NULL;
981 env->num_call = 0;
982 env->num_mem = 0;
983 #ifdef USE_NAMED_GROUP
984 env->num_named = 0;
985 #endif
986 env->mem_alloc = 0;
987 env->mem_nodes_dynamic = (Node** )NULL;
988
989 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
990 env->mem_nodes_static[i] = NULL_NODE;
991
992 #ifdef USE_COMBINATION_EXPLOSION_CHECK
993 env->num_comb_exp_check = 0;
994 env->comb_exp_max_regnum = 0;
995 env->curr_max_regnum = 0;
996 env->has_recursion = 0;
997 #endif
998 env->parse_depth = 0;
999 }
1000
1001 static int
scan_env_add_mem_entry(ScanEnv * env)1002 scan_env_add_mem_entry(ScanEnv* env)
1003 {
1004 int i, need, alloc;
1005 Node** p;
1006
1007 need = env->num_mem + 1;
1008 if (need > MaxCaptureNum && MaxCaptureNum != 0)
1009 return ONIGERR_TOO_MANY_CAPTURES;
1010
1011 if (need >= SCANENV_MEMNODES_SIZE) {
1012 if (env->mem_alloc <= need) {
1013 if (IS_NULL(env->mem_nodes_dynamic)) {
1014 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
1015 p = (Node** )xmalloc(sizeof(Node*) * alloc);
1016 xmemcpy(p, env->mem_nodes_static,
1017 sizeof(Node*) * SCANENV_MEMNODES_SIZE);
1018 }
1019 else {
1020 alloc = env->mem_alloc * 2;
1021 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
1022 }
1023 CHECK_NULL_RETURN_MEMERR(p);
1024
1025 for (i = env->num_mem + 1; i < alloc; i++)
1026 p[i] = NULL_NODE;
1027
1028 env->mem_nodes_dynamic = p;
1029 env->mem_alloc = alloc;
1030 }
1031 }
1032
1033 env->num_mem++;
1034 return env->num_mem;
1035 }
1036
1037 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)1038 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
1039 {
1040 if (env->num_mem >= num)
1041 SCANENV_MEM_NODES(env)[num] = node;
1042 else
1043 return ONIGERR_PARSER_BUG;
1044 return 0;
1045 }
1046
1047 extern void
onig_node_free(Node * node)1048 onig_node_free(Node* node)
1049 {
1050 start:
1051 if (IS_NULL(node)) return ;
1052
1053 #ifdef DEBUG_NODE_FREE
1054 fprintf(stderr, "onig_node_free: %p\n", node);
1055 #endif
1056
1057 switch (NTYPE(node)) {
1058 case NT_STR:
1059 if (NSTR(node)->capa != 0 &&
1060 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1061 xfree(NSTR(node)->s);
1062 }
1063 break;
1064
1065 case NT_LIST:
1066 case NT_ALT:
1067 onig_node_free(NCAR(node));
1068 {
1069 Node* next_node = NCDR(node);
1070
1071 xfree(node);
1072 node = next_node;
1073 goto start;
1074 }
1075 break;
1076
1077 case NT_CCLASS:
1078 {
1079 CClassNode* cc = NCCLASS(node);
1080
1081 if (IS_NCCLASS_SHARE(cc)) return ;
1082 if (cc->mbuf)
1083 bbuf_free(cc->mbuf);
1084 }
1085 break;
1086
1087 case NT_QTFR:
1088 if (NQTFR(node)->target)
1089 onig_node_free(NQTFR(node)->target);
1090 break;
1091
1092 case NT_ENCLOSE:
1093 if (NENCLOSE(node)->target)
1094 onig_node_free(NENCLOSE(node)->target);
1095 break;
1096
1097 case NT_BREF:
1098 if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1099 xfree(NBREF(node)->back_dynamic);
1100 break;
1101
1102 case NT_ANCHOR:
1103 if (NANCHOR(node)->target)
1104 onig_node_free(NANCHOR(node)->target);
1105 break;
1106 }
1107
1108 xfree(node);
1109 }
1110
1111 static Node*
node_new(void)1112 node_new(void)
1113 {
1114 Node* node;
1115
1116 node = (Node* )xmalloc(sizeof(Node));
1117 /* xmemset(node, 0, sizeof(Node)); */
1118 #ifdef DEBUG_NODE_FREE
1119 fprintf(stderr, "node_new: %p\n", node);
1120 #endif
1121 return node;
1122 }
1123
1124
1125 static void
initialize_cclass(CClassNode * cc)1126 initialize_cclass(CClassNode* cc)
1127 {
1128 BITSET_CLEAR(cc->bs);
1129 /* cc->base.flags = 0; */
1130 cc->flags = 0;
1131 cc->mbuf = NULL;
1132 }
1133
1134 static Node*
node_new_cclass(void)1135 node_new_cclass(void)
1136 {
1137 Node* node = node_new();
1138 CHECK_NULL_RETURN(node);
1139
1140 SET_NTYPE(node, NT_CCLASS);
1141 initialize_cclass(NCCLASS(node));
1142 return node;
1143 }
1144
1145 static Node*
node_new_ctype(int type,int not)1146 node_new_ctype(int type, int not)
1147 {
1148 Node* node = node_new();
1149 CHECK_NULL_RETURN(node);
1150
1151 SET_NTYPE(node, NT_CTYPE);
1152 NCTYPE(node)->ctype = type;
1153 NCTYPE(node)->not = not;
1154 return node;
1155 }
1156
1157 static Node*
node_new_anychar(void)1158 node_new_anychar(void)
1159 {
1160 Node* node = node_new();
1161 CHECK_NULL_RETURN(node);
1162
1163 SET_NTYPE(node, NT_CANY);
1164 return node;
1165 }
1166
1167 static Node*
node_new_list(Node * left,Node * right)1168 node_new_list(Node* left, Node* right)
1169 {
1170 Node* node = node_new();
1171 CHECK_NULL_RETURN(node);
1172
1173 SET_NTYPE(node, NT_LIST);
1174 NCAR(node) = left;
1175 NCDR(node) = right;
1176 return node;
1177 }
1178
1179 extern Node*
onig_node_new_list(Node * left,Node * right)1180 onig_node_new_list(Node* left, Node* right)
1181 {
1182 return node_new_list(left, right);
1183 }
1184
1185 extern Node*
onig_node_list_add(Node * list,Node * x)1186 onig_node_list_add(Node* list, Node* x)
1187 {
1188 Node *n;
1189
1190 n = onig_node_new_list(x, NULL);
1191 if (IS_NULL(n)) return NULL_NODE;
1192
1193 if (IS_NOT_NULL(list)) {
1194 while (IS_NOT_NULL(NCDR(list)))
1195 list = NCDR(list);
1196
1197 NCDR(list) = n;
1198 }
1199
1200 return n;
1201 }
1202
1203 extern Node*
onig_node_new_alt(Node * left,Node * right)1204 onig_node_new_alt(Node* left, Node* right)
1205 {
1206 Node* node = node_new();
1207 CHECK_NULL_RETURN(node);
1208
1209 SET_NTYPE(node, NT_ALT);
1210 NCAR(node) = left;
1211 NCDR(node) = right;
1212 return node;
1213 }
1214
1215 extern Node*
onig_node_new_anchor(int type)1216 onig_node_new_anchor(int type)
1217 {
1218 Node* node = node_new();
1219 CHECK_NULL_RETURN(node);
1220
1221 SET_NTYPE(node, NT_ANCHOR);
1222 NANCHOR(node)->type = type;
1223 NANCHOR(node)->target = NULL;
1224 NANCHOR(node)->char_len = -1;
1225 return node;
1226 }
1227
1228 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)1229 node_new_backref(int back_num, int* backrefs, int by_name,
1230 #ifdef USE_BACKREF_WITH_LEVEL
1231 int exist_level, int nest_level,
1232 #endif
1233 ScanEnv* env)
1234 {
1235 int i;
1236 Node* node = node_new();
1237
1238 CHECK_NULL_RETURN(node);
1239
1240 SET_NTYPE(node, NT_BREF);
1241 NBREF(node)->state = 0;
1242 NBREF(node)->back_num = back_num;
1243 NBREF(node)->back_dynamic = (int* )NULL;
1244 if (by_name != 0)
1245 NBREF(node)->state |= NST_NAME_REF;
1246
1247 #ifdef USE_BACKREF_WITH_LEVEL
1248 if (exist_level != 0) {
1249 NBREF(node)->state |= NST_NEST_LEVEL;
1250 NBREF(node)->nest_level = nest_level;
1251 }
1252 #endif
1253
1254 for (i = 0; i < back_num; i++) {
1255 if (backrefs[i] <= env->num_mem &&
1256 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1257 NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */
1258 break;
1259 }
1260 }
1261
1262 if (back_num <= NODE_BACKREFS_SIZE) {
1263 for (i = 0; i < back_num; i++)
1264 NBREF(node)->back_static[i] = backrefs[i];
1265 }
1266 else {
1267 int* p = (int* )xmalloc(sizeof(int) * back_num);
1268 if (IS_NULL(p)) {
1269 onig_node_free(node);
1270 return NULL;
1271 }
1272 NBREF(node)->back_dynamic = p;
1273 for (i = 0; i < back_num; i++)
1274 p[i] = backrefs[i];
1275 }
1276 return node;
1277 }
1278
#ifdef USE_SUBEXP_CALL
/* Allocate an NT_CALL node.  The name span [name, name_end) is stored
   by pointer (not copied); `gnum` is the group number, where a
   non-zero value means the call is by number.  The target starts
   unresolved (NULL_NODE). */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
  Node* call;

  call = node_new();
  CHECK_NULL_RETURN(call);

  SET_NTYPE(call, NT_CALL);
  NCALL(call)->name      = name;
  NCALL(call)->name_end  = name_end;
  NCALL(call)->group_num = gnum;  /* call by number if gnum != 0 */
  NCALL(call)->target    = NULL_NODE;
  NCALL(call)->state     = 0;
  return call;
}
#endif
1295
1296 static Node*
node_new_quantifier(int lower,int upper,int by_number)1297 node_new_quantifier(int lower, int upper, int by_number)
1298 {
1299 Node* node = node_new();
1300 CHECK_NULL_RETURN(node);
1301
1302 SET_NTYPE(node, NT_QTFR);
1303 NQTFR(node)->state = 0;
1304 NQTFR(node)->target = NULL;
1305 NQTFR(node)->lower = lower;
1306 NQTFR(node)->upper = upper;
1307 NQTFR(node)->greedy = 1;
1308 NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1309 NQTFR(node)->head_exact = NULL_NODE;
1310 NQTFR(node)->next_head_exact = NULL_NODE;
1311 NQTFR(node)->is_refered = 0;
1312 if (by_number != 0)
1313 NQTFR(node)->state |= NST_BY_NUMBER;
1314
1315 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1316 NQTFR(node)->comb_exp_check_num = 0;
1317 #endif
1318
1319 return node;
1320 }
1321
1322 static Node*
node_new_enclose(int type)1323 node_new_enclose(int type)
1324 {
1325 Node* node = node_new();
1326 CHECK_NULL_RETURN(node);
1327
1328 SET_NTYPE(node, NT_ENCLOSE);
1329 NENCLOSE(node)->type = type;
1330 NENCLOSE(node)->state = 0;
1331 NENCLOSE(node)->regnum = 0;
1332 NENCLOSE(node)->option = 0;
1333 NENCLOSE(node)->target = NULL;
1334 NENCLOSE(node)->call_addr = -1;
1335 NENCLOSE(node)->opt_count = 0;
1336 return node;
1337 }
1338
1339 extern Node*
onig_node_new_enclose(int type)1340 onig_node_new_enclose(int type)
1341 {
1342 return node_new_enclose(type);
1343 }
1344
1345 static Node*
node_new_enclose_memory(OnigOptionType option,int is_named)1346 node_new_enclose_memory(OnigOptionType option, int is_named)
1347 {
1348 Node* node = node_new_enclose(ENCLOSE_MEMORY);
1349 CHECK_NULL_RETURN(node);
1350 if (is_named != 0)
1351 SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1352
1353 #ifdef USE_SUBEXP_CALL
1354 NENCLOSE(node)->option = option;
1355 #endif
1356 return node;
1357 }
1358
1359 static Node*
node_new_option(OnigOptionType option)1360 node_new_option(OnigOptionType option)
1361 {
1362 Node* node = node_new_enclose(ENCLOSE_OPTION);
1363 CHECK_NULL_RETURN(node);
1364 NENCLOSE(node)->option = option;
1365 return node;
1366 }
1367
1368 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)1369 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1370 {
1371 int addlen = end - s;
1372
1373 if (addlen > 0) {
1374 int len = NSTR(node)->end - NSTR(node)->s;
1375
1376 if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1377 UChar* p;
1378 int capa = len + addlen + NODE_STR_MARGIN;
1379
1380 if (capa <= NSTR(node)->capa) {
1381 onig_strcpy(NSTR(node)->s + len, s, end);
1382 }
1383 else {
1384 if (NSTR(node)->s == NSTR(node)->buf)
1385 p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1386 s, end, capa);
1387 else
1388 p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
1389
1390 CHECK_NULL_RETURN_MEMERR(p);
1391 NSTR(node)->s = p;
1392 NSTR(node)->capa = capa;
1393 }
1394 }
1395 else {
1396 onig_strcpy(NSTR(node)->s + len, s, end);
1397 }
1398 NSTR(node)->end = NSTR(node)->s + len + addlen;
1399 }
1400
1401 return 0;
1402 }
1403
1404 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end)1405 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1406 {
1407 onig_node_str_clear(node);
1408 return onig_node_str_cat(node, s, end);
1409 }
1410
1411 static int
node_str_cat_char(Node * node,UChar c)1412 node_str_cat_char(Node* node, UChar c)
1413 {
1414 UChar s[1];
1415
1416 s[0] = c;
1417 return onig_node_str_cat(node, s, s + 1);
1418 }
1419
1420 extern void
onig_node_conv_to_str_node(Node * node,int flag)1421 onig_node_conv_to_str_node(Node* node, int flag)
1422 {
1423 SET_NTYPE(node, NT_STR);
1424 NSTR(node)->flag = flag;
1425 NSTR(node)->capa = 0;
1426 NSTR(node)->s = NSTR(node)->buf;
1427 NSTR(node)->end = NSTR(node)->buf;
1428 }
1429
1430 extern void
onig_node_str_clear(Node * node)1431 onig_node_str_clear(Node* node)
1432 {
1433 if (NSTR(node)->capa != 0 &&
1434 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1435 xfree(NSTR(node)->s);
1436 }
1437
1438 NSTR(node)->capa = 0;
1439 NSTR(node)->flag = 0;
1440 NSTR(node)->s = NSTR(node)->buf;
1441 NSTR(node)->end = NSTR(node)->buf;
1442 }
1443
1444 static Node*
node_new_str(const UChar * s,const UChar * end)1445 node_new_str(const UChar* s, const UChar* end)
1446 {
1447 Node* node = node_new();
1448 CHECK_NULL_RETURN(node);
1449
1450 SET_NTYPE(node, NT_STR);
1451 NSTR(node)->capa = 0;
1452 NSTR(node)->flag = 0;
1453 NSTR(node)->s = NSTR(node)->buf;
1454 NSTR(node)->end = NSTR(node)->buf;
1455 if (onig_node_str_cat(node, s, end)) {
1456 onig_node_free(node);
1457 return NULL;
1458 }
1459 return node;
1460 }
1461
1462 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)1463 onig_node_new_str(const UChar* s, const UChar* end)
1464 {
1465 return node_new_str(s, end);
1466 }
1467
1468 static Node*
node_new_str_raw(UChar * s,UChar * end)1469 node_new_str_raw(UChar* s, UChar* end)
1470 {
1471 Node* node = node_new_str(s, end);
1472 NSTRING_SET_RAW(node);
1473 return node;
1474 }
1475
1476 static Node*
node_new_empty(void)1477 node_new_empty(void)
1478 {
1479 return node_new_str(NULL, NULL);
1480 }
1481
1482 static Node*
node_new_str_raw_char(UChar c)1483 node_new_str_raw_char(UChar c)
1484 {
1485 UChar p[1];
1486
1487 p[0] = c;
1488 return node_new_str_raw(p, p + 1);
1489 }
1490
1491 static Node*
str_node_split_last_char(StrNode * sn,OnigEncoding enc)1492 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1493 {
1494 const UChar *p;
1495 Node* n = NULL_NODE;
1496
1497 if (sn->end > sn->s) {
1498 p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1499 if (p && p > sn->s) { /* can be split. */
1500 n = node_new_str(p, sn->end);
1501 if ((sn->flag & NSTR_RAW) != 0)
1502 NSTRING_SET_RAW(n);
1503
1504 sn->end = (UChar* )p;
1505 }
1506 }
1507 return n;
1508 }
1509
1510 static int
str_node_can_be_split(StrNode * sn,OnigEncoding enc)1511 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1512 {
1513 if (sn->end > sn->s) {
1514 return ((enclen(enc, sn->s) < sn->end - sn->s) ? 1 : 0);
1515 }
1516 return 0;
1517 }
1518
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/* Shift the content of `sn` right by `num` bytes and fill the freed
   head with `val`.

   Returns 0.  The original was declared `int` but fell off the end
   without a return statement, so any caller reading the result got an
   indeterminate value.

   NOTE(review): the content is staged in a NODE_STR_BUF_SIZE scratch
   buffer and written back at sn->s + num without any size check;
   presumably callers only pass short, embedded-buffer strings.
   Confirm against the call sites. */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i, len;

  len = sn->end - sn->s;
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }
  return 0;
}
#endif
1536
1537 extern int
onig_scan_unsigned_number(UChar ** src,const UChar * end,OnigEncoding enc)1538 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1539 {
1540 unsigned int num, val;
1541 OnigCodePoint c;
1542 UChar* p = *src;
1543 PFETCH_READY;
1544
1545 num = 0;
1546 while (!PEND) {
1547 PFETCH(c);
1548 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1549 val = (unsigned int )DIGITVAL(c);
1550 if ((INT_MAX_LIMIT - val) / 10UL < num)
1551 return -1; /* overflow */
1552
1553 num = num * 10 + val;
1554 }
1555 else {
1556 PUNFETCH;
1557 break;
1558 }
1559 }
1560 *src = p;
1561 return num;
1562 }
1563
1564 static int
scan_unsigned_hexadecimal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1565 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1566 OnigEncoding enc)
1567 {
1568 OnigCodePoint c;
1569 unsigned int num, val;
1570 UChar* p = *src;
1571 PFETCH_READY;
1572
1573 num = 0;
1574 while (! PEND && maxlen-- != 0) {
1575 PFETCH(c);
1576 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1577 val = (unsigned int )XDIGITVAL(enc,c);
1578 if ((INT_MAX_LIMIT - val) / 16UL < num)
1579 return -1; /* overflow */
1580
1581 num = (num << 4) + XDIGITVAL(enc,c);
1582 }
1583 else {
1584 PUNFETCH;
1585 break;
1586 }
1587 }
1588 *src = p;
1589 return num;
1590 }
1591
1592 static int
scan_unsigned_octal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1593 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1594 OnigEncoding enc)
1595 {
1596 OnigCodePoint c;
1597 unsigned int num, val;
1598 UChar* p = *src;
1599 PFETCH_READY;
1600
1601 num = 0;
1602 while (!PEND && maxlen-- != 0) {
1603 PFETCH(c);
1604 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1605 val = ODIGITVAL(c);
1606 if ((INT_MAX_LIMIT - val) / 8UL < num)
1607 return -1; /* overflow */
1608
1609 num = (num << 3) + val;
1610 }
1611 else {
1612 PUNFETCH;
1613 break;
1614 }
1615 }
1616 *src = p;
1617 return num;
1618 }
1619
1620
/* Write one code point of SIZE_CODE_POINT bytes into `bbuf` at offset
   `pos` through BBUF_WRITE.  `code` must be an lvalue because its
   address is taken. */
#define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
    BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1623
1624 /* data format:
1625 [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1626 (all data size is OnigCodePoint)
1627 */
1628 static int
new_code_range(BBuf ** pbuf)1629 new_code_range(BBuf** pbuf)
1630 {
1631 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
1632 int r;
1633 OnigCodePoint n;
1634 BBuf* bbuf;
1635
1636 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1637 CHECK_NULL_RETURN_MEMERR(*pbuf);
1638 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1639 if (r) return r;
1640
1641 n = 0;
1642 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1643 return 0;
1644 }
1645
/* Insert the code point range [from, to] into the range buffer *pbuf,
   creating the buffer when *pbuf is NULL.  The bounds are swapped when
   given in the wrong order.  Existing ranges that overlap or directly
   adjoin the new one are merged into it.

   Returns 0 on success, ONIGERR_TOO_MANY_MULTI_BYTE_RANGES when the
   range count would exceed ONIG_MAX_MULTI_BYTE_RANGES_NUM, or the error
   from new_code_range(). */
static int
add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
{
  int r, inc_n, pos;
  int low, high, bound, x;
  OnigCodePoint n, *data;
  BBuf* bbuf;

  if (from > to) {
    n = from; from = to; to = n;
  }

  if (IS_NULL(*pbuf)) {
    r = new_code_range(pbuf);
    if (r) return r;
    bbuf = *pbuf;
    n = 0;
  }
  else {
    bbuf = *pbuf;
    GET_CODE_POINT(n, bbuf->p);
  }
  /* data[] addresses the (from, to) pairs that follow the count */
  data = (OnigCodePoint* )(bbuf->p);
  data++;

  /* binary search: low = first range whose upper bound is >= from */
  for (low = 0, bound = n; low < bound; ) {
    x = (low + bound) >> 1;
    if (from > data[x*2 + 1])
      low = x + 1;
    else
      bound = x;
  }

  /* binary search: high = first range that starts beyond to + 1.
     When `to` is the maximum code point, to + 1 would wrap, so every
     remaining range is taken (high = n). */
  high = (to == ~((OnigCodePoint )0)) ? n : low;
  for (bound = n; high < bound; ) {
    x = (high + bound) >> 1;
    if (to + 1 >= data[x*2])
      high = x + 1;
    else
      bound = x;
  }

  /* ranges low .. high-1 are replaced by one range: net count change */
  inc_n = low + 1 - high;
  if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
    return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;

  if (inc_n != 1) {
    /* at least one existing range is absorbed: widen to cover them */
    if (from > data[low*2])
      from = data[low*2];
    if (to < data[(high - 1)*2 + 1])
      to = data[(high - 1)*2 + 1];
  }

  if (inc_n != 0 && (OnigCodePoint )high < n) {
    /* move the ranges behind the insertion point to their new place */
    int from_pos = SIZE_CODE_POINT * (1 + high * 2);
    int to_pos   = SIZE_CODE_POINT * (1 + (low + 1) * 2);
    int size = (n - high) * 2 * SIZE_CODE_POINT;

    if (inc_n > 0) {
      BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
    }
    else {
      BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
    }
  }

  /* store the (possibly widened) range in slot `low`, then the count */
  pos = SIZE_CODE_POINT * (1 + low * 2);
  BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
  BBUF_WRITE_CODE_POINT(bbuf, pos, from);
  BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
  n += inc_n;
  BBUF_WRITE_CODE_POINT(bbuf, 0, n);

  return 0;
}
1721
1722 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)1723 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1724 {
1725 if (from > to) {
1726 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1727 return 0;
1728 else
1729 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1730 }
1731
1732 return add_code_range_to_buf(pbuf, from, to);
1733 }
1734
1735 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)1736 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1737 {
1738 int r, i, n;
1739 OnigCodePoint pre, from, *data, to = 0;
1740
1741 *pbuf = (BBuf* )NULL;
1742 if (IS_NULL(bbuf)) {
1743 set_all:
1744 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1745 }
1746
1747 data = (OnigCodePoint* )(bbuf->p);
1748 GET_CODE_POINT(n, data);
1749 data++;
1750 if (n <= 0) goto set_all;
1751
1752 r = 0;
1753 pre = MBCODE_START_POS(enc);
1754 for (i = 0; i < n; i++) {
1755 from = data[i*2];
1756 to = data[i*2+1];
1757 if (pre <= from - 1) {
1758 r = add_code_range_to_buf(pbuf, pre, from - 1);
1759 if (r != 0) return r;
1760 }
1761 if (to == ~((OnigCodePoint )0)) break;
1762 pre = to + 1;
1763 }
1764 if (to < ~((OnigCodePoint )0)) {
1765 r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1766 }
1767 return r;
1768 }
1769
/* Exchange two (buffer, negation flag) operand pairs in place.
   All four arguments must be assignable lvalues. */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *tbuf; \
  int  tnot; \
  tnot = not1;  not1  = not2;  not2  = tnot; \
  tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
} while (0)
1776
1777 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1778 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1779 BBuf* bbuf2, int not2, BBuf** pbuf)
1780 {
1781 int r;
1782 OnigCodePoint i, n1, *data1;
1783 OnigCodePoint from, to;
1784
1785 *pbuf = (BBuf* )NULL;
1786 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1787 if (not1 != 0 || not2 != 0)
1788 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1789 return 0;
1790 }
1791
1792 r = 0;
1793 if (IS_NULL(bbuf2))
1794 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1795
1796 if (IS_NULL(bbuf1)) {
1797 if (not1 != 0) {
1798 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1799 }
1800 else {
1801 if (not2 == 0) {
1802 return bbuf_clone(pbuf, bbuf2);
1803 }
1804 else {
1805 return not_code_range_buf(enc, bbuf2, pbuf);
1806 }
1807 }
1808 }
1809
1810 if (not1 != 0)
1811 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1812
1813 data1 = (OnigCodePoint* )(bbuf1->p);
1814 GET_CODE_POINT(n1, data1);
1815 data1++;
1816
1817 if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1818 r = bbuf_clone(pbuf, bbuf2);
1819 }
1820 else if (not1 == 0) { /* 1 OR (not 2) */
1821 r = not_code_range_buf(enc, bbuf2, pbuf);
1822 }
1823 if (r != 0) return r;
1824
1825 for (i = 0; i < n1; i++) {
1826 from = data1[i*2];
1827 to = data1[i*2+1];
1828 r = add_code_range_to_buf(pbuf, from, to);
1829 if (r != 0) return r;
1830 }
1831 return 0;
1832 }
1833
1834 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)1835 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1836 OnigCodePoint* data, int n)
1837 {
1838 int i, r;
1839 OnigCodePoint from2, to2;
1840
1841 for (i = 0; i < n; i++) {
1842 from2 = data[i*2];
1843 to2 = data[i*2+1];
1844 if (from2 < from1) {
1845 if (to2 < from1) continue;
1846 else {
1847 from1 = to2 + 1;
1848 }
1849 }
1850 else if (from2 <= to1) {
1851 if (to2 < to1) {
1852 if (from1 <= from2 - 1) {
1853 r = add_code_range_to_buf(pbuf, from1, from2-1);
1854 if (r != 0) return r;
1855 }
1856 from1 = to2 + 1;
1857 }
1858 else {
1859 to1 = from2 - 1;
1860 }
1861 }
1862 else {
1863 from1 = from2;
1864 }
1865 if (from1 > to1) break;
1866 }
1867 if (from1 <= to1) {
1868 r = add_code_range_to_buf(pbuf, from1, to1);
1869 if (r != 0) return r;
1870 }
1871 return 0;
1872 }
1873
1874 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1875 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1876 {
1877 int r;
1878 OnigCodePoint i, j, n1, n2, *data1, *data2;
1879 OnigCodePoint from, to, from1, to1, from2, to2;
1880
1881 *pbuf = (BBuf* )NULL;
1882 if (IS_NULL(bbuf1)) {
1883 if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1884 return bbuf_clone(pbuf, bbuf2);
1885 return 0;
1886 }
1887 else if (IS_NULL(bbuf2)) {
1888 if (not2 != 0)
1889 return bbuf_clone(pbuf, bbuf1);
1890 return 0;
1891 }
1892
1893 if (not1 != 0)
1894 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1895
1896 data1 = (OnigCodePoint* )(bbuf1->p);
1897 data2 = (OnigCodePoint* )(bbuf2->p);
1898 GET_CODE_POINT(n1, data1);
1899 GET_CODE_POINT(n2, data2);
1900 data1++;
1901 data2++;
1902
1903 if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1904 for (i = 0; i < n1; i++) {
1905 from1 = data1[i*2];
1906 to1 = data1[i*2+1];
1907 for (j = 0; j < n2; j++) {
1908 from2 = data2[j*2];
1909 to2 = data2[j*2+1];
1910 if (from2 > to1) break;
1911 if (to2 < from1) continue;
1912 from = MAX(from1, from2);
1913 to = MIN(to1, to2);
1914 r = add_code_range_to_buf(pbuf, from, to);
1915 if (r != 0) return r;
1916 }
1917 }
1918 }
1919 else if (not1 == 0) { /* 1 AND (not 2) */
1920 for (i = 0; i < n1; i++) {
1921 from1 = data1[i*2];
1922 to1 = data1[i*2+1];
1923 r = and_code_range1(pbuf, from1, to1, data2, n2);
1924 if (r != 0) return r;
1925 }
1926 }
1927
1928 return 0;
1929 }
1930
1931 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1932 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1933 {
1934 int r, not1, not2;
1935 BBuf *buf1, *buf2, *pbuf;
1936 BitSetRef bsr1, bsr2;
1937 BitSet bs1, bs2;
1938
1939 not1 = IS_NCCLASS_NOT(dest);
1940 bsr1 = dest->bs;
1941 buf1 = dest->mbuf;
1942 not2 = IS_NCCLASS_NOT(cc);
1943 bsr2 = cc->bs;
1944 buf2 = cc->mbuf;
1945
1946 if (not1 != 0) {
1947 bitset_invert_to(bsr1, bs1);
1948 bsr1 = bs1;
1949 }
1950 if (not2 != 0) {
1951 bitset_invert_to(bsr2, bs2);
1952 bsr2 = bs2;
1953 }
1954 bitset_and(bsr1, bsr2);
1955 if (bsr1 != dest->bs) {
1956 bitset_copy(dest->bs, bsr1);
1957 bsr1 = dest->bs;
1958 }
1959 if (not1 != 0) {
1960 bitset_invert(dest->bs);
1961 }
1962
1963 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
1964 if (not1 != 0 && not2 != 0) {
1965 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
1966 }
1967 else {
1968 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
1969 if (r == 0 && not1 != 0) {
1970 BBuf *tbuf;
1971 r = not_code_range_buf(enc, pbuf, &tbuf);
1972 if (r != 0) {
1973 bbuf_free(pbuf);
1974 return r;
1975 }
1976 bbuf_free(pbuf);
1977 pbuf = tbuf;
1978 }
1979 }
1980 if (r != 0) return r;
1981
1982 dest->mbuf = pbuf;
1983 bbuf_free(buf1);
1984 return r;
1985 }
1986 return 0;
1987 }
1988
1989 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1990 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1991 {
1992 int r, not1, not2;
1993 BBuf *buf1, *buf2, *pbuf;
1994 BitSetRef bsr1, bsr2;
1995 BitSet bs1, bs2;
1996
1997 not1 = IS_NCCLASS_NOT(dest);
1998 bsr1 = dest->bs;
1999 buf1 = dest->mbuf;
2000 not2 = IS_NCCLASS_NOT(cc);
2001 bsr2 = cc->bs;
2002 buf2 = cc->mbuf;
2003
2004 if (not1 != 0) {
2005 bitset_invert_to(bsr1, bs1);
2006 bsr1 = bs1;
2007 }
2008 if (not2 != 0) {
2009 bitset_invert_to(bsr2, bs2);
2010 bsr2 = bs2;
2011 }
2012 bitset_or(bsr1, bsr2);
2013 if (bsr1 != dest->bs) {
2014 bitset_copy(dest->bs, bsr1);
2015 bsr1 = dest->bs;
2016 }
2017 if (not1 != 0) {
2018 bitset_invert(dest->bs);
2019 }
2020
2021 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2022 if (not1 != 0 && not2 != 0) {
2023 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2024 }
2025 else {
2026 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2027 if (r == 0 && not1 != 0) {
2028 BBuf *tbuf;
2029 r = not_code_range_buf(enc, pbuf, &tbuf);
2030 if (r != 0) {
2031 bbuf_free(pbuf);
2032 return r;
2033 }
2034 bbuf_free(pbuf);
2035 pbuf = tbuf;
2036 }
2037 }
2038 if (r != 0) return r;
2039
2040 dest->mbuf = pbuf;
2041 bbuf_free(buf1);
2042 return r;
2043 }
2044 else
2045 return 0;
2046 }
2047
2048 static OnigCodePoint
conv_backslash_value(OnigCodePoint c,ScanEnv * env)2049 conv_backslash_value(OnigCodePoint c, ScanEnv* env)
2050 {
2051 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2052 switch (c) {
2053 case 'n': return '\n';
2054 case 't': return '\t';
2055 case 'r': return '\r';
2056 case 'f': return '\f';
2057 case 'a': return '\007';
2058 case 'b': return '\010';
2059 case 'e': return '\033';
2060 case 'v':
2061 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2062 return '\v';
2063 break;
2064
2065 default:
2066 break;
2067 }
2068 }
2069 return c;
2070 }
2071
2072 static int
is_invalid_quantifier_target(Node * node)2073 is_invalid_quantifier_target(Node* node)
2074 {
2075 switch (NTYPE(node)) {
2076 case NT_ANCHOR:
2077 return 1;
2078 break;
2079
2080 case NT_ENCLOSE:
2081 /* allow enclosed elements */
2082 /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
2083 break;
2084
2085 case NT_LIST:
2086 do {
2087 if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2088 } while (IS_NOT_NULL(node = NCDR(node)));
2089 return 0;
2090 break;
2091
2092 case NT_ALT:
2093 do {
2094 if (is_invalid_quantifier_target(NCAR(node))) return 1;
2095 } while (IS_NOT_NULL(node = NCDR(node)));
2096 break;
2097
2098 default:
2099 break;
2100 }
2101 return 0;
2102 }
2103
2104 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2105 static int
popular_quantifier_num(QtfrNode * q)2106 popular_quantifier_num(QtfrNode* q)
2107 {
2108 if (q->greedy) {
2109 if (q->lower == 0) {
2110 if (q->upper == 1) return 0;
2111 else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2112 }
2113 else if (q->lower == 1) {
2114 if (IS_REPEAT_INFINITE(q->upper)) return 2;
2115 }
2116 }
2117 else {
2118 if (q->lower == 0) {
2119 if (q->upper == 1) return 3;
2120 else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2121 }
2122 else if (q->lower == 1) {
2123 if (IS_REPEAT_INFINITE(q->upper)) return 5;
2124 }
2125 }
2126 return -1;
2127 }
2128
2129
/* What to do with a quantifier applied directly to another quantifier. */
enum ReduceType {
  RQ_ASIS  = 0,  /* as is */
  RQ_DEL   = 1,  /* delete parent */
  RQ_A     = 2,  /* to '*'    */
  RQ_AQ    = 3,  /* to '*?'   */
  RQ_QQ    = 4,  /* to '??'   */
  RQ_P_QQ  = 5,  /* to '+)??' */
  RQ_PQ_Q  = 6   /* to '+?)?' */
};
2139
/* Reduction to apply for each pair of nested common quantifiers.
   Looked up as ReduceTypeTable[cnum][pnum]: the row is the form of the
   inner (child) quantifier, named in the comment at the end of each
   row, and the column is the form of the outer (parent) quantifier.
   Both indices use the numbering of popular_quantifier_num():
   ?:0, *:1, +:2, ??:3, *?:4, +?:5. */
static enum ReduceType ReduceTypeTable[6][6] = {
  {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
  {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
  {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
  {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
};
2148
2149 extern void
onig_reduce_nested_quantifier(Node * pnode,Node * cnode)2150 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2151 {
2152 int pnum, cnum;
2153 QtfrNode *p, *c;
2154
2155 p = NQTFR(pnode);
2156 c = NQTFR(cnode);
2157 pnum = popular_quantifier_num(p);
2158 cnum = popular_quantifier_num(c);
2159 if (pnum < 0 || cnum < 0) return ;
2160
2161 switch(ReduceTypeTable[cnum][pnum]) {
2162 case RQ_DEL:
2163 *pnode = *cnode;
2164 break;
2165 case RQ_A:
2166 p->target = c->target;
2167 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
2168 break;
2169 case RQ_AQ:
2170 p->target = c->target;
2171 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
2172 break;
2173 case RQ_QQ:
2174 p->target = c->target;
2175 p->lower = 0; p->upper = 1; p->greedy = 0;
2176 break;
2177 case RQ_P_QQ:
2178 p->target = cnode;
2179 p->lower = 0; p->upper = 1; p->greedy = 0;
2180 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
2181 return ;
2182 break;
2183 case RQ_PQ_Q:
2184 p->target = cnode;
2185 p->lower = 0; p->upper = 1; p->greedy = 1;
2186 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
2187 return ;
2188 break;
2189 case RQ_ASIS:
2190 p->target = cnode;
2191 return ;
2192 break;
2193 }
2194
2195 c->target = NULL_NODE;
2196 onig_node_free(cnode);
2197 }
2198
2199
/* Token kinds stored in OnigToken.type. */
enum TokenSyms {
  TK_EOT                = 0,   /* end of token */
  TK_RAW_BYTE           = 1,
  TK_CHAR               = 2,
  TK_STRING             = 3,
  TK_CODE_POINT         = 4,
  TK_ANYCHAR            = 5,
  TK_CHAR_TYPE          = 6,
  TK_BACKREF            = 7,
  TK_CALL               = 8,
  TK_ANCHOR             = 9,
  TK_OP_REPEAT          = 10,
  TK_INTERVAL           = 11,
  TK_ANYCHAR_ANYTIME    = 12,  /* SQL '%' == .* */
  TK_ALT                = 13,
  TK_SUBEXP_OPEN        = 14,
  TK_SUBEXP_CLOSE       = 15,
  TK_CC_OPEN            = 16,
  TK_QUOTE_OPEN         = 17,
  TK_CHAR_PROPERTY      = 18,  /* \p{...}, \P{...} */
  /* in cc */
  TK_CC_CLOSE           = 19,
  TK_CC_RANGE           = 20,
  TK_POSIX_BRACKET_OPEN = 21,
  TK_CC_AND             = 22,  /* && */
  TK_CC_CC_OPEN         = 23   /* [ */
};
2227
/* One token of the pattern: its kind in `type`, plus a payload in the
   union `u`.  Which union member is meaningful presumably depends on
   `type`; e.g. fetch_range_quantifier() sets type to TK_INTERVAL and
   fills u.repeat.lower / u.repeat.upper. */
typedef struct {
  enum TokenSyms type;
  int escaped;
  int base;   /* is number: 8, 16 (used in [....]) */
  UChar* backp;
  union {
    UChar* s;
    int   c;
    OnigCodePoint code;
    int   anchor;
    int   subtype;
    struct {
      int lower;       /* repeat range bounds */
      int upper;
      int greedy;
      int possessive;
    } repeat;
    struct {
      int  num;
      int  ref1;
      int* refs;
      int  by_name;
#ifdef USE_BACKREF_WITH_LEVEL
      int  exist_level;
      int  level;   /* \k<name+n> */
#endif
    } backref;
    struct {
      UChar* name;
      UChar* name_end;
      int    gnum;
    } call;
    struct {
      int ctype;
      int not;
    } prop;
  } u;
} OnigToken;
2266
2267
2268 static int
fetch_range_quantifier(UChar ** src,UChar * end,OnigToken * tok,ScanEnv * env)2269 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2270 {
2271 int low, up, syn_allow, non_low = 0;
2272 int r = 0;
2273 OnigCodePoint c;
2274 OnigEncoding enc = env->enc;
2275 UChar* p = *src;
2276 PFETCH_READY;
2277
2278 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2279
2280 if (PEND) {
2281 if (syn_allow)
2282 return 1; /* "....{" : OK! */
2283 else
2284 return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */
2285 }
2286
2287 if (! syn_allow) {
2288 c = PPEEK;
2289 if (c == ')' || c == '(' || c == '|') {
2290 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2291 }
2292 }
2293
2294 low = onig_scan_unsigned_number(&p, end, env->enc);
2295 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2296 if (low > ONIG_MAX_REPEAT_NUM)
2297 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2298
2299 if (p == *src) { /* can't read low */
2300 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2301 /* allow {,n} as {0,n} */
2302 low = 0;
2303 non_low = 1;
2304 }
2305 else
2306 goto invalid;
2307 }
2308
2309 if (PEND) goto invalid;
2310 PFETCH(c);
2311 if (c == ',') {
2312 UChar* prev = p;
2313 up = onig_scan_unsigned_number(&p, end, env->enc);
2314 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2315 if (up > ONIG_MAX_REPEAT_NUM)
2316 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2317
2318 if (p == prev) {
2319 if (non_low != 0)
2320 goto invalid;
2321 up = REPEAT_INFINITE; /* {n,} : {n,infinite} */
2322 }
2323 }
2324 else {
2325 if (non_low != 0)
2326 goto invalid;
2327
2328 PUNFETCH;
2329 up = low; /* {n} : exact n times */
2330 r = 2; /* fixed */
2331 }
2332
2333 if (PEND) goto invalid;
2334 PFETCH(c);
2335 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2336 if (c != MC_ESC(env->syntax)) goto invalid;
2337 PFETCH(c);
2338 }
2339 if (c != '}') goto invalid;
2340
2341 if (!IS_REPEAT_INFINITE(up) && low > up) {
2342 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2343 }
2344
2345 tok->type = TK_INTERVAL;
2346 tok->u.repeat.lower = low;
2347 tok->u.repeat.upper = up;
2348 *src = p;
2349 return r; /* 0: normal {n,m}, 2: fixed {n} */
2350
2351 invalid:
2352 if (syn_allow) {
2353 /* *src = p; */ /* !!! Don't do this line !!! */
2354 return 1; /* OK */
2355 }
2356 else
2357 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2358 }
2359
/* \M-, \C-, \c, or \... */
/* Decode the escape sequence whose body starts at *src (just behind the
   escape character) into a single code value.

   On success the value is stored in *val, *src is advanced behind the
   sequence and 0 is returned.  Otherwise an ONIGERR_* code is returned
   and neither *val nor *src is written.

   Meta and control prefixes may be applied to a character that is
   itself escaped; that case is handled by recursion. */
static int
fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
{
  int v;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;

  if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;

  PFETCH_S(c);
  switch (c) {
  case 'M':
    /* \M-x : low byte of x with bit 0x80 set */
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_META_CODE_SYNTAX;
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c == MC_ESC(env->syntax)) {
        v = fetch_escaped_value(&p, end, env, &c);
        if (v < 0) return v;
      }
      c = ((c & 0xff) | 0x80);
    }
    else
      goto backslash;
    break;

  case 'C':
    /* \C-x : same as \cx once the '-' has been consumed */
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
      goto control;
    }
    else
      goto backslash;

  case 'c':
    if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
    control:
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c == '?') {
        c = 0177;
      }
      else {
        if (c == MC_ESC(env->syntax)) {
          v = fetch_escaped_value(&p, end, env, &c);
          if (v < 0) return v;
        }
        c &= 0x9f;
      }
      break;
    }
    /* fall through */

  default:
    {
    backslash:
      /* plain escape: translate \n, \t, ... or keep the character */
      c = conv_backslash_value(c, env);
    }
    break;
  }

  *src = p;
  *val = c;
  return 0;
}
2431
/* Forward declaration; the definition is not part of this section. */
static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2433
2434 static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)2435 get_name_end_code_point(OnigCodePoint start)
2436 {
2437 switch (start) {
2438 case '<': return (OnigCodePoint )'>'; break;
2439 case '\'': return (OnigCodePoint )'\''; break;
2440 default:
2441 break;
2442 }
2443
2444 return (OnigCodePoint )0;
2445 }
2446
2447 #ifdef USE_NAMED_GROUP
#ifdef USE_BACKREF_WITH_LEVEL
/*
   \k<name+n>, \k<name-n>
   \k<num+n>,  \k<num-n>
   \k<-num+n>, \k<-num-n>
*/
/* Parse a group name or number with an optional "+n" / "-n" level
   suffix.  *src points just behind the opening delimiter `start_code`.

   On success:
     *rname_end  end of the name/number part (the name starts at the
                 original *src)
     *rback_num  the signed group number when the reference is numeric,
                 0 when it is a name
     *rlevel     the signed level, written only when a level is present
     *src        advanced behind the closing delimiter
   and the return value is 1 when a level was given, 0 otherwise.

   On failure an ONIGERR_* code is returned; for errors detected here
   (other than ONIGERR_TOO_BIG_NUMBER) the offending text is recorded
   with onig_scan_env_set_error_string(). */
static int
fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
                      UChar** rname_end, ScanEnv* env,
                      int* rback_num, int* rlevel)
{
  /* is_num: 0 = name, 1 = digits seen, 2 = only a leading '-' so far */
  int r, sign, is_num, exist_level;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end;
  UChar *pnum_head;
  UChar *p = *src;
  PFETCH_READY;

  *rback_num = 0;
  is_num = exist_level = 0;
  sign = 1;
  pnum_head = *src;

  end_code = get_name_end_code_point(start_code);

  name_end = end;
  r = 0;
  /* first character decides between name, number and negative number */
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    PFETCH(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
      is_num = 1;
    }
    else if (c == '-') {
      is_num = 2;
      sign = -1;
      pnum_head = p;  /* digits start behind the '-' */
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  /* consume the rest of the name up to a terminator or level sign */
  while (!PEND) {
    name_end = p;
    PFETCH(c);
    if (c == end_code || c == ')' || c == '+' || c == '-') {
      if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;  /* '-' without digits */
      break;
    }

    if (is_num != 0) {
      if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
        is_num = 1;
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
        is_num = 0;
      }
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (r == 0 && c != end_code) {
    if (c == '+' || c == '-') {
      /* level suffix: sign, digits, then the closing delimiter */
      int level;
      int flag = (c == '-' ? -1 : 1);

      if (PEND) {
        r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
        goto end;
      }
      PFETCH(c);
      if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
      PUNFETCH;
      level = onig_scan_unsigned_number(&p, end, enc);
      if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
      *rlevel = (level * flag);
      exist_level = 1;

      if (!PEND) {
        PFETCH(c);
        if (c == end_code)
          goto end;
      }
    }

    /* reached when the closing delimiter is missing or misplaced */
  err:
    r = ONIGERR_INVALID_GROUP_NAME;
    name_end = end;
  }

 end:
  if (r == 0) {
    if (is_num != 0) {
      *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
      if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
      else if (*rback_num == 0) goto err;  /* group number 0 is rejected */

      *rback_num *= sign;
    }

    *rname_end = name_end;
    *src = p;
    return (exist_level ? 1 : 0);
  }
  else {
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
#endif /* USE_BACKREF_WITH_LEVEL */
2569
2570 /*
2571 ref: 0 -> define name (don't allow number name)
2572 1 -> reference name (allow number name)
2573 */
2574 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2575 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2576 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2577 {
2578 int r, is_num, sign;
2579 OnigCodePoint end_code;
2580 OnigCodePoint c = 0;
2581 OnigEncoding enc = env->enc;
2582 UChar *name_end;
2583 UChar *pnum_head;
2584 UChar *p = *src;
2585
2586 *rback_num = 0;
2587
2588 end_code = get_name_end_code_point(start_code);
2589
2590 name_end = end;
2591 pnum_head = *src;
2592 r = 0;
2593 is_num = 0;
2594 sign = 1;
2595 if (PEND) {
2596 return ONIGERR_EMPTY_GROUP_NAME;
2597 }
2598 else {
2599 PFETCH_S(c);
2600 if (c == end_code)
2601 return ONIGERR_EMPTY_GROUP_NAME;
2602
2603 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2604 if (ref == 1)
2605 is_num = 1;
2606 else {
2607 r = ONIGERR_INVALID_GROUP_NAME;
2608 is_num = 0;
2609 }
2610 }
2611 else if (c == '-') {
2612 if (ref == 1) {
2613 is_num = 2;
2614 sign = -1;
2615 pnum_head = p;
2616 }
2617 else {
2618 r = ONIGERR_INVALID_GROUP_NAME;
2619 is_num = 0;
2620 }
2621 }
2622 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2623 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2624 }
2625 }
2626
2627 if (r == 0) {
2628 while (!PEND) {
2629 name_end = p;
2630 PFETCH_S(c);
2631 if (c == end_code || c == ')') {
2632 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
2633 break;
2634 }
2635
2636 if (is_num != 0) {
2637 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2638 is_num = 1;
2639 }
2640 else {
2641 if (!ONIGENC_IS_CODE_WORD(enc, c))
2642 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2643 else
2644 r = ONIGERR_INVALID_GROUP_NAME;
2645 is_num = 0;
2646 }
2647 }
2648 else {
2649 if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2650 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2651 }
2652 }
2653 }
2654
2655 if (c != end_code) {
2656 r = ONIGERR_INVALID_GROUP_NAME;
2657 name_end = end;
2658 }
2659
2660 if (is_num != 0) {
2661 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2662 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2663 else if (*rback_num == 0) {
2664 r = ONIGERR_INVALID_GROUP_NAME;
2665 goto err;
2666 }
2667
2668 *rback_num *= sign;
2669 }
2670
2671 *rname_end = name_end;
2672 *src = p;
2673 return 0;
2674 }
2675 else {
2676 while (!PEND) {
2677 name_end = p;
2678 PFETCH_S(c);
2679 if (c == end_code || c == ')')
2680 break;
2681 }
2682 if (PEND)
2683 name_end = end;
2684
2685 err:
2686 onig_scan_env_set_error_string(env, r, *src, name_end);
2687 return r;
2688 }
2689 }
2690 #else
2691 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2692 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2693 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2694 {
2695 int r, is_num, sign;
2696 OnigCodePoint end_code;
2697 OnigCodePoint c = 0;
2698 UChar *name_end;
2699 OnigEncoding enc = env->enc;
2700 UChar *pnum_head;
2701 UChar *p = *src;
2702 PFETCH_READY;
2703
2704 *rback_num = 0;
2705
2706 end_code = get_name_end_code_point(start_code);
2707
2708 *rname_end = name_end = end;
2709 r = 0;
2710 pnum_head = *src;
2711 is_num = 0;
2712 sign = 1;
2713
2714 if (PEND) {
2715 return ONIGERR_EMPTY_GROUP_NAME;
2716 }
2717 else {
2718 PFETCH(c);
2719 if (c == end_code)
2720 return ONIGERR_EMPTY_GROUP_NAME;
2721
2722 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2723 is_num = 1;
2724 }
2725 else if (c == '-') {
2726 is_num = 2;
2727 sign = -1;
2728 pnum_head = p;
2729 }
2730 else {
2731 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2732 }
2733 }
2734
2735 while (!PEND) {
2736 name_end = p;
2737
2738 PFETCH(c);
2739 if (c == end_code || c == ')') break;
2740 if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2741 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2742 }
2743 if (r == 0 && c != end_code) {
2744 r = ONIGERR_INVALID_GROUP_NAME;
2745 name_end = end;
2746 }
2747
2748 if (r == 0) {
2749 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2750 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2751 else if (*rback_num == 0) {
2752 r = ONIGERR_INVALID_GROUP_NAME;
2753 goto err;
2754 }
2755 *rback_num *= sign;
2756
2757 *rname_end = name_end;
2758 *src = p;
2759 return 0;
2760 }
2761 else {
2762 err:
2763 onig_scan_env_set_error_string(env, r, *src, name_end);
2764 return r;
2765 }
2766 }
2767 #endif /* USE_NAMED_GROUP */
2768
2769 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)2770 CC_ESC_WARN(ScanEnv* env, UChar *c)
2771 {
2772 if (onig_warn == onig_null_warn) return ;
2773
2774 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2775 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2776 UChar buf[WARN_BUFSIZE];
2777 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2778 env->pattern, env->pattern_end,
2779 (UChar* )"character class has '%s' without escape", c);
2780 (*onig_warn)((char* )buf);
2781 }
2782 }
2783
2784 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)2785 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2786 {
2787 if (onig_warn == onig_null_warn) return ;
2788
2789 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2790 UChar buf[WARN_BUFSIZE];
2791 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2792 (env)->pattern, (env)->pattern_end,
2793 (UChar* )"regular expression has '%s' without escape", c);
2794 (*onig_warn)((char* )buf);
2795 }
2796 }
2797
2798 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)2799 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2800 UChar **next, OnigEncoding enc)
2801 {
2802 int i;
2803 OnigCodePoint x;
2804 UChar *q;
2805 UChar *p = from;
2806
2807 while (p < to) {
2808 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2809 q = p + enclen(enc, p);
2810 if (x == s[0]) {
2811 for (i = 1; i < n && q < to; i++) {
2812 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2813 if (x != s[i]) break;
2814 q += enclen(enc, q);
2815 }
2816 if (i >= n) {
2817 if (IS_NOT_NULL(next))
2818 *next = q;
2819 return p;
2820 }
2821 }
2822 p = q;
2823 }
2824 return NULL_UCHARP;
2825 }
2826
2827 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)2828 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2829 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
2830 {
2831 int i, in_esc;
2832 OnigCodePoint x;
2833 UChar *q;
2834 UChar *p = from;
2835
2836 in_esc = 0;
2837 while (p < to) {
2838 if (in_esc) {
2839 in_esc = 0;
2840 p += enclen(enc, p);
2841 }
2842 else {
2843 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2844 q = p + enclen(enc, p);
2845 if (x == s[0]) {
2846 for (i = 1; i < n && q < to; i++) {
2847 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2848 if (x != s[i]) break;
2849 q += enclen(enc, q);
2850 }
2851 if (i >= n) return 1;
2852 p += enclen(enc, p);
2853 }
2854 else {
2855 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2856 if (x == bad) return 0;
2857 else if (x == MC_ESC(syn)) in_esc = 1;
2858 p = q;
2859 }
2860 }
2861 }
2862 return 0;
2863 }
2864
2865 static int
fetch_token_in_cc(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2866 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2867 {
2868 int num;
2869 OnigCodePoint c, c2;
2870 OnigSyntaxType* syn = env->syntax;
2871 OnigEncoding enc = env->enc;
2872 UChar* prev;
2873 UChar* p = *src;
2874 PFETCH_READY;
2875
2876 if (PEND) {
2877 tok->type = TK_EOT;
2878 return tok->type;
2879 }
2880
2881 PFETCH(c);
2882 tok->type = TK_CHAR;
2883 tok->base = 0;
2884 tok->u.c = c;
2885 tok->escaped = 0;
2886
2887 if (c == ']') {
2888 tok->type = TK_CC_CLOSE;
2889 }
2890 else if (c == '-') {
2891 tok->type = TK_CC_RANGE;
2892 }
2893 else if (c == MC_ESC(syn)) {
2894 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2895 goto end;
2896
2897 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2898
2899 PFETCH(c);
2900 tok->escaped = 1;
2901 tok->u.c = c;
2902 switch (c) {
2903 case 'w':
2904 tok->type = TK_CHAR_TYPE;
2905 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2906 tok->u.prop.not = 0;
2907 break;
2908 case 'W':
2909 tok->type = TK_CHAR_TYPE;
2910 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2911 tok->u.prop.not = 1;
2912 break;
2913 case 'd':
2914 tok->type = TK_CHAR_TYPE;
2915 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2916 tok->u.prop.not = 0;
2917 break;
2918 case 'D':
2919 tok->type = TK_CHAR_TYPE;
2920 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2921 tok->u.prop.not = 1;
2922 break;
2923 case 's':
2924 tok->type = TK_CHAR_TYPE;
2925 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2926 tok->u.prop.not = 0;
2927 break;
2928 case 'S':
2929 tok->type = TK_CHAR_TYPE;
2930 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2931 tok->u.prop.not = 1;
2932 break;
2933 case 'h':
2934 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2935 tok->type = TK_CHAR_TYPE;
2936 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2937 tok->u.prop.not = 0;
2938 break;
2939 case 'H':
2940 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2941 tok->type = TK_CHAR_TYPE;
2942 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2943 tok->u.prop.not = 1;
2944 break;
2945
2946 case 'p':
2947 case 'P':
2948 if (PEND) break;
2949
2950 c2 = PPEEK;
2951 if (c2 == '{' &&
2952 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
2953 PINC;
2954 tok->type = TK_CHAR_PROPERTY;
2955 tok->u.prop.not = (c == 'P' ? 1 : 0);
2956
2957 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
2958 PFETCH(c2);
2959 if (c2 == '^') {
2960 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
2961 }
2962 else
2963 PUNFETCH;
2964 }
2965 }
2966 break;
2967
2968 case 'o':
2969 if (PEND) break;
2970
2971 prev = p;
2972 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
2973 PINC;
2974 num = scan_unsigned_octal_number(&p, end, 11, enc);
2975 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
2976 if (!PEND) {
2977 c2 = PPEEK;
2978 if (ONIGENC_IS_CODE_DIGIT(enc, c2))
2979 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
2980 }
2981
2982 if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
2983 PINC;
2984 tok->type = TK_CODE_POINT;
2985 tok->base = 8;
2986 tok->u.code = (OnigCodePoint )num;
2987 }
2988 else {
2989 /* can't read nothing or invalid format */
2990 p = prev;
2991 }
2992 }
2993 break;
2994
2995 case 'x':
2996 if (PEND) break;
2997
2998 prev = p;
2999 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3000 PINC;
3001 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3002 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3003 if (!PEND) {
3004 c2 = PPEEK;
3005 if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3006 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3007 }
3008
3009 if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
3010 PINC;
3011 tok->type = TK_CODE_POINT;
3012 tok->base = 16;
3013 tok->u.code = (OnigCodePoint )num;
3014 }
3015 else {
3016 /* can't read nothing or invalid format */
3017 p = prev;
3018 }
3019 }
3020 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3021 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3022 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3023 if (p == prev) { /* can't read nothing. */
3024 num = 0; /* but, it's not error */
3025 }
3026 tok->type = TK_RAW_BYTE;
3027 tok->base = 16;
3028 tok->u.c = num;
3029 }
3030 break;
3031
3032 case 'u':
3033 if (PEND) break;
3034
3035 prev = p;
3036 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3037 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3038 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3039 if (p == prev) { /* can't read nothing. */
3040 num = 0; /* but, it's not error */
3041 }
3042 tok->type = TK_CODE_POINT;
3043 tok->base = 16;
3044 tok->u.code = (OnigCodePoint )num;
3045 }
3046 break;
3047
3048 case '0':
3049 case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3050 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3051 PUNFETCH;
3052 prev = p;
3053 num = scan_unsigned_octal_number(&p, end, 3, enc);
3054 if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER;
3055 if (p == prev) { /* can't read nothing. */
3056 num = 0; /* but, it's not error */
3057 }
3058 tok->type = TK_RAW_BYTE;
3059 tok->base = 8;
3060 tok->u.c = num;
3061 }
3062 break;
3063
3064 default:
3065 PUNFETCH;
3066 num = fetch_escaped_value(&p, end, env, &c2);
3067 if (num < 0) return num;
3068 if (tok->u.c != c2) {
3069 tok->u.code = c2;
3070 tok->type = TK_CODE_POINT;
3071 }
3072 break;
3073 }
3074 }
3075 else if (c == '[') {
3076 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3077 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3078 tok->backp = p; /* point at '[' is read */
3079 PINC;
3080 if (str_exist_check_with_esc(send, 2, p, end,
3081 (OnigCodePoint )']', enc, syn)) {
3082 tok->type = TK_POSIX_BRACKET_OPEN;
3083 }
3084 else {
3085 PUNFETCH;
3086 goto cc_in_cc;
3087 }
3088 }
3089 else {
3090 cc_in_cc:
3091 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3092 tok->type = TK_CC_CC_OPEN;
3093 }
3094 else {
3095 CC_ESC_WARN(env, (UChar* )"[");
3096 }
3097 }
3098 }
3099 else if (c == '&') {
3100 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3101 !PEND && (PPEEK_IS('&'))) {
3102 PINC;
3103 tok->type = TK_CC_AND;
3104 }
3105 }
3106
3107 end:
3108 *src = p;
3109 return tok->type;
3110 }
3111
3112 static int
fetch_token(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)3113 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3114 {
3115 int r, num;
3116 OnigCodePoint c;
3117 OnigEncoding enc = env->enc;
3118 OnigSyntaxType* syn = env->syntax;
3119 UChar* prev;
3120 UChar* p = *src;
3121 PFETCH_READY;
3122
3123 start:
3124 if (PEND) {
3125 tok->type = TK_EOT;
3126 return tok->type;
3127 }
3128
3129 tok->type = TK_STRING;
3130 tok->base = 0;
3131 tok->backp = p;
3132
3133 PFETCH(c);
3134 if (IS_MC_ESC_CODE(c, syn)) {
3135 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3136
3137 tok->backp = p;
3138 PFETCH(c);
3139
3140 tok->u.c = c;
3141 tok->escaped = 1;
3142 switch (c) {
3143 case '*':
3144 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3145 tok->type = TK_OP_REPEAT;
3146 tok->u.repeat.lower = 0;
3147 tok->u.repeat.upper = REPEAT_INFINITE;
3148 goto greedy_check;
3149 break;
3150
3151 case '+':
3152 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3153 tok->type = TK_OP_REPEAT;
3154 tok->u.repeat.lower = 1;
3155 tok->u.repeat.upper = REPEAT_INFINITE;
3156 goto greedy_check;
3157 break;
3158
3159 case '?':
3160 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3161 tok->type = TK_OP_REPEAT;
3162 tok->u.repeat.lower = 0;
3163 tok->u.repeat.upper = 1;
3164 greedy_check:
3165 if (!PEND && PPEEK_IS('?') &&
3166 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3167 PFETCH(c);
3168 tok->u.repeat.greedy = 0;
3169 tok->u.repeat.possessive = 0;
3170 }
3171 else {
3172 possessive_check:
3173 if (!PEND && PPEEK_IS('+') &&
3174 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3175 tok->type != TK_INTERVAL) ||
3176 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3177 tok->type == TK_INTERVAL))) {
3178 PFETCH(c);
3179 tok->u.repeat.greedy = 1;
3180 tok->u.repeat.possessive = 1;
3181 }
3182 else {
3183 tok->u.repeat.greedy = 1;
3184 tok->u.repeat.possessive = 0;
3185 }
3186 }
3187 break;
3188
3189 case '{':
3190 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3191 r = fetch_range_quantifier(&p, end, tok, env);
3192 if (r < 0) return r; /* error */
3193 if (r == 0) goto greedy_check;
3194 else if (r == 2) { /* {n} */
3195 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3196 goto possessive_check;
3197
3198 goto greedy_check;
3199 }
3200 /* r == 1 : normal char */
3201 break;
3202
3203 case '|':
3204 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3205 tok->type = TK_ALT;
3206 break;
3207
3208 case '(':
3209 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3210 tok->type = TK_SUBEXP_OPEN;
3211 break;
3212
3213 case ')':
3214 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3215 tok->type = TK_SUBEXP_CLOSE;
3216 break;
3217
3218 case 'w':
3219 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3220 tok->type = TK_CHAR_TYPE;
3221 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3222 tok->u.prop.not = 0;
3223 break;
3224
3225 case 'W':
3226 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3227 tok->type = TK_CHAR_TYPE;
3228 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3229 tok->u.prop.not = 1;
3230 break;
3231
3232 case 'b':
3233 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3234 tok->type = TK_ANCHOR;
3235 tok->u.anchor = ANCHOR_WORD_BOUND;
3236 break;
3237
3238 case 'B':
3239 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3240 tok->type = TK_ANCHOR;
3241 tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3242 break;
3243
3244 #ifdef USE_WORD_BEGIN_END
3245 case '<':
3246 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3247 tok->type = TK_ANCHOR;
3248 tok->u.anchor = ANCHOR_WORD_BEGIN;
3249 break;
3250
3251 case '>':
3252 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3253 tok->type = TK_ANCHOR;
3254 tok->u.anchor = ANCHOR_WORD_END;
3255 break;
3256 #endif
3257
3258 case 's':
3259 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3260 tok->type = TK_CHAR_TYPE;
3261 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3262 tok->u.prop.not = 0;
3263 break;
3264
3265 case 'S':
3266 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3267 tok->type = TK_CHAR_TYPE;
3268 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3269 tok->u.prop.not = 1;
3270 break;
3271
3272 case 'd':
3273 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3274 tok->type = TK_CHAR_TYPE;
3275 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3276 tok->u.prop.not = 0;
3277 break;
3278
3279 case 'D':
3280 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3281 tok->type = TK_CHAR_TYPE;
3282 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3283 tok->u.prop.not = 1;
3284 break;
3285
3286 case 'h':
3287 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3288 tok->type = TK_CHAR_TYPE;
3289 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3290 tok->u.prop.not = 0;
3291 break;
3292
3293 case 'H':
3294 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3295 tok->type = TK_CHAR_TYPE;
3296 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3297 tok->u.prop.not = 1;
3298 break;
3299
3300 case 'A':
3301 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3302 begin_buf:
3303 tok->type = TK_ANCHOR;
3304 tok->u.subtype = ANCHOR_BEGIN_BUF;
3305 break;
3306
3307 case 'Z':
3308 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3309 tok->type = TK_ANCHOR;
3310 tok->u.subtype = ANCHOR_SEMI_END_BUF;
3311 break;
3312
3313 case 'z':
3314 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3315 end_buf:
3316 tok->type = TK_ANCHOR;
3317 tok->u.subtype = ANCHOR_END_BUF;
3318 break;
3319
3320 case 'G':
3321 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3322 tok->type = TK_ANCHOR;
3323 tok->u.subtype = ANCHOR_BEGIN_POSITION;
3324 break;
3325
3326 case '`':
3327 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3328 goto begin_buf;
3329 break;
3330
3331 case '\'':
3332 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3333 goto end_buf;
3334 break;
3335
3336 case 'o':
3337 if (PEND) break;
3338
3339 prev = p;
3340 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
3341 PINC;
3342 num = scan_unsigned_octal_number(&p, end, 11, enc);
3343 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3344 if (!PEND) {
3345 if (ONIGENC_IS_CODE_DIGIT(enc, PPEEK))
3346 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3347 }
3348
3349 if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
3350 PINC;
3351 tok->type = TK_CODE_POINT;
3352 tok->u.code = (OnigCodePoint )num;
3353 }
3354 else {
3355 /* can't read nothing or invalid format */
3356 p = prev;
3357 }
3358 }
3359 break;
3360
3361 case 'x':
3362 if (PEND) break;
3363
3364 prev = p;
3365 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3366 PINC;
3367 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3368 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3369 if (!PEND) {
3370 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3371 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3372 }
3373
3374 if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
3375 PINC;
3376 tok->type = TK_CODE_POINT;
3377 tok->u.code = (OnigCodePoint )num;
3378 }
3379 else {
3380 /* can't read nothing or invalid format */
3381 p = prev;
3382 }
3383 }
3384 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3385 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3386 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3387 if (p == prev) { /* can't read nothing. */
3388 num = 0; /* but, it's not error */
3389 }
3390 tok->type = TK_RAW_BYTE;
3391 tok->base = 16;
3392 tok->u.c = num;
3393 }
3394 break;
3395
3396 case 'u':
3397 if (PEND) break;
3398
3399 prev = p;
3400 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3401 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3402 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3403 if (p == prev) { /* can't read nothing. */
3404 num = 0; /* but, it's not error */
3405 }
3406 tok->type = TK_CODE_POINT;
3407 tok->base = 16;
3408 tok->u.code = (OnigCodePoint )num;
3409 }
3410 break;
3411
3412 case '1': case '2': case '3': case '4':
3413 case '5': case '6': case '7': case '8': case '9':
3414 PUNFETCH;
3415 prev = p;
3416 num = onig_scan_unsigned_number(&p, end, enc);
3417 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3418 goto skip_backref;
3419 }
3420
3421 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3422 (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3423 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3424 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3425 return ONIGERR_INVALID_BACKREF;
3426 }
3427
3428 tok->type = TK_BACKREF;
3429 tok->u.backref.num = 1;
3430 tok->u.backref.ref1 = num;
3431 tok->u.backref.by_name = 0;
3432 #ifdef USE_BACKREF_WITH_LEVEL
3433 tok->u.backref.exist_level = 0;
3434 #endif
3435 break;
3436 }
3437
3438 skip_backref:
3439 if (c == '8' || c == '9') {
3440 /* normal char */
3441 p = prev; PINC;
3442 break;
3443 }
3444
3445 p = prev;
3446 /* fall through */
3447 case '0':
3448 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3449 prev = p;
3450 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3451 if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER;
3452 if (p == prev) { /* can't read nothing. */
3453 num = 0; /* but, it's not error */
3454 }
3455 tok->type = TK_RAW_BYTE;
3456 tok->base = 8;
3457 tok->u.c = num;
3458 }
3459 else if (c != '0') {
3460 PINC;
3461 }
3462 break;
3463
3464 #ifdef USE_NAMED_GROUP
3465 case 'k':
3466 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3467 PFETCH(c);
3468 if (c == '<' || c == '\'') {
3469 UChar* name_end;
3470 int* backs;
3471 int back_num;
3472
3473 prev = p;
3474
3475 #ifdef USE_BACKREF_WITH_LEVEL
3476 name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3477 r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
3478 env, &back_num, &tok->u.backref.level);
3479 if (r == 1) tok->u.backref.exist_level = 1;
3480 else tok->u.backref.exist_level = 0;
3481 #else
3482 r = fetch_name(&p, end, &name_end, env, &back_num, 1);
3483 #endif
3484 if (r < 0) return r;
3485
3486 if (back_num != 0) {
3487 if (back_num < 0) {
3488 back_num = BACKREF_REL_TO_ABS(back_num, env);
3489 if (back_num <= 0)
3490 return ONIGERR_INVALID_BACKREF;
3491 }
3492
3493 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3494 if (back_num > env->num_mem ||
3495 IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
3496 return ONIGERR_INVALID_BACKREF;
3497 }
3498 tok->type = TK_BACKREF;
3499 tok->u.backref.by_name = 0;
3500 tok->u.backref.num = 1;
3501 tok->u.backref.ref1 = back_num;
3502 }
3503 else {
3504 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3505 if (num <= 0) {
3506 onig_scan_env_set_error_string(env,
3507 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3508 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3509 }
3510 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3511 int i;
3512 for (i = 0; i < num; i++) {
3513 if (backs[i] > env->num_mem ||
3514 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3515 return ONIGERR_INVALID_BACKREF;
3516 }
3517 }
3518
3519 tok->type = TK_BACKREF;
3520 tok->u.backref.by_name = 1;
3521 if (num == 1) {
3522 tok->u.backref.num = 1;
3523 tok->u.backref.ref1 = backs[0];
3524 }
3525 else {
3526 tok->u.backref.num = num;
3527 tok->u.backref.refs = backs;
3528 }
3529 }
3530 }
3531 else
3532 PUNFETCH;
3533 }
3534 break;
3535 #endif
3536
3537 #ifdef USE_SUBEXP_CALL
3538 case 'g':
3539 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3540 PFETCH(c);
3541 if (c == '<' || c == '\'') {
3542 int gnum;
3543 UChar* name_end;
3544
3545 prev = p;
3546 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
3547 if (r < 0) return r;
3548
3549 tok->type = TK_CALL;
3550 tok->u.call.name = prev;
3551 tok->u.call.name_end = name_end;
3552 tok->u.call.gnum = gnum;
3553 }
3554 else
3555 PUNFETCH;
3556 }
3557 break;
3558 #endif
3559
3560 case 'Q':
3561 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3562 tok->type = TK_QUOTE_OPEN;
3563 }
3564 break;
3565
3566 case 'p':
3567 case 'P':
3568 if (!PEND && PPEEK_IS('{') &&
3569 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3570 PINC;
3571 tok->type = TK_CHAR_PROPERTY;
3572 tok->u.prop.not = (c == 'P' ? 1 : 0);
3573
3574 if (!PEND &&
3575 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3576 PFETCH(c);
3577 if (c == '^') {
3578 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3579 }
3580 else
3581 PUNFETCH;
3582 }
3583 }
3584 break;
3585
3586 default:
3587 {
3588 OnigCodePoint c2;
3589
3590 PUNFETCH;
3591 num = fetch_escaped_value(&p, end, env, &c2);
3592 if (num < 0) return num;
3593 /* set_raw: */
3594 if (tok->u.c != c2) {
3595 tok->type = TK_CODE_POINT;
3596 tok->u.code = c2;
3597 }
3598 else { /* string */
3599 int len;
3600 SAFE_ENC_LEN(enc, tok->backp, end, len);
3601 p = tok->backp + len;
3602 }
3603 }
3604 break;
3605 }
3606 }
3607 else {
3608 tok->u.c = c;
3609 tok->escaped = 0;
3610
3611 #ifdef USE_VARIABLE_META_CHARS
3612 if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3613 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3614 if (c == MC_ANYCHAR(syn))
3615 goto any_char;
3616 else if (c == MC_ANYTIME(syn))
3617 goto anytime;
3618 else if (c == MC_ZERO_OR_ONE_TIME(syn))
3619 goto zero_or_one_time;
3620 else if (c == MC_ONE_OR_MORE_TIME(syn))
3621 goto one_or_more_time;
3622 else if (c == MC_ANYCHAR_ANYTIME(syn)) {
3623 tok->type = TK_ANYCHAR_ANYTIME;
3624 goto out;
3625 }
3626 }
3627 #endif
3628
3629 switch (c) {
3630 case '.':
3631 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3632 #ifdef USE_VARIABLE_META_CHARS
3633 any_char:
3634 #endif
3635 tok->type = TK_ANYCHAR;
3636 break;
3637
3638 case '*':
3639 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3640 #ifdef USE_VARIABLE_META_CHARS
3641 anytime:
3642 #endif
3643 tok->type = TK_OP_REPEAT;
3644 tok->u.repeat.lower = 0;
3645 tok->u.repeat.upper = REPEAT_INFINITE;
3646 goto greedy_check;
3647 break;
3648
3649 case '+':
3650 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3651 #ifdef USE_VARIABLE_META_CHARS
3652 one_or_more_time:
3653 #endif
3654 tok->type = TK_OP_REPEAT;
3655 tok->u.repeat.lower = 1;
3656 tok->u.repeat.upper = REPEAT_INFINITE;
3657 goto greedy_check;
3658 break;
3659
3660 case '?':
3661 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3662 #ifdef USE_VARIABLE_META_CHARS
3663 zero_or_one_time:
3664 #endif
3665 tok->type = TK_OP_REPEAT;
3666 tok->u.repeat.lower = 0;
3667 tok->u.repeat.upper = 1;
3668 goto greedy_check;
3669 break;
3670
3671 case '{':
3672 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3673 r = fetch_range_quantifier(&p, end, tok, env);
3674 if (r < 0) return r; /* error */
3675 if (r == 0) goto greedy_check;
3676 else if (r == 2) { /* {n} */
3677 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3678 goto possessive_check;
3679
3680 goto greedy_check;
3681 }
3682 /* r == 1 : normal char */
3683 break;
3684
3685 case '|':
3686 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3687 tok->type = TK_ALT;
3688 break;
3689
3690 case '(':
3691 if (!PEND && PPEEK_IS('?') &&
3692 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3693 PINC;
3694 if (!PEND && PPEEK_IS('#')) {
3695 PFETCH(c);
3696 while (1) {
3697 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3698 PFETCH(c);
3699 if (c == MC_ESC(syn)) {
3700 if (!PEND) PFETCH(c);
3701 }
3702 else {
3703 if (c == ')') break;
3704 }
3705 }
3706 goto start;
3707 }
3708 PUNFETCH;
3709 }
3710
3711 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3712 tok->type = TK_SUBEXP_OPEN;
3713 break;
3714
3715 case ')':
3716 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3717 tok->type = TK_SUBEXP_CLOSE;
3718 break;
3719
3720 case '^':
3721 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3722 tok->type = TK_ANCHOR;
3723 tok->u.subtype = (IS_SINGLELINE(env->option)
3724 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3725 break;
3726
3727 case '$':
3728 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3729 tok->type = TK_ANCHOR;
3730 tok->u.subtype = (IS_SINGLELINE(env->option)
3731 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3732 break;
3733
3734 case '[':
3735 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3736 tok->type = TK_CC_OPEN;
3737 break;
3738
3739 case ']':
3740 if (*src > env->pattern) /* /].../ is allowed. */
3741 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
3742 break;
3743
3744 case '#':
3745 if (IS_EXTEND(env->option)) {
3746 while (!PEND) {
3747 PFETCH(c);
3748 if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3749 break;
3750 }
3751 goto start;
3752 break;
3753 }
3754 break;
3755
3756 case ' ': case '\t': case '\n': case '\r': case '\f':
3757 if (IS_EXTEND(env->option))
3758 goto start;
3759 break;
3760
3761 default:
3762 /* string */
3763 break;
3764 }
3765 }
3766
3767 #ifdef USE_VARIABLE_META_CHARS
3768 out:
3769 #endif
3770 *src = p;
3771 return tok->type;
3772 }
3773
3774 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[])3775 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
3776 OnigEncoding enc ARG_UNUSED,
3777 OnigCodePoint sb_out, const OnigCodePoint mbr[])
3778 {
3779 int i, r;
3780 OnigCodePoint j;
3781
3782 int n = ONIGENC_CODE_RANGE_NUM(mbr);
3783
3784 if (not == 0) {
3785 for (i = 0; i < n; i++) {
3786 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
3787 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
3788 if (j >= sb_out) {
3789 if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3790 r = add_code_range_to_buf(&(cc->mbuf), j,
3791 ONIGENC_CODE_RANGE_TO(mbr, i));
3792 if (r != 0) return r;
3793 i++;
3794 }
3795
3796 goto sb_end;
3797 }
3798 BITSET_SET_BIT(cc->bs, j);
3799 }
3800 }
3801
3802 sb_end:
3803 for ( ; i < n; i++) {
3804 r = add_code_range_to_buf(&(cc->mbuf),
3805 ONIGENC_CODE_RANGE_FROM(mbr, i),
3806 ONIGENC_CODE_RANGE_TO(mbr, i));
3807 if (r != 0) return r;
3808 }
3809 }
3810 else {
3811 OnigCodePoint prev = 0;
3812
3813 for (i = 0; i < n; i++) {
3814 for (j = prev; j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
3815 if (j >= sb_out) {
3816 goto sb_end2;
3817 }
3818 BITSET_SET_BIT(cc->bs, j);
3819 }
3820 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3821 }
3822 for (j = prev; j < sb_out; j++) {
3823 BITSET_SET_BIT(cc->bs, j);
3824 }
3825
3826 sb_end2:
3827 prev = sb_out;
3828
3829 for (i = 0; i < n; i++) {
3830 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3831 r = add_code_range_to_buf(&(cc->mbuf), prev,
3832 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3833 if (r != 0) return r;
3834 }
3835 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3836 }
3837 if (prev < 0x7fffffff) {
3838 r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3839 if (r != 0) return r;
3840 }
3841 }
3842
3843 return 0;
3844 }
3845
3846 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)3847 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3848 {
3849 int c, r;
3850 const OnigCodePoint *ranges;
3851 OnigCodePoint sb_out;
3852 OnigEncoding enc = env->enc;
3853
3854 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
3855 if (r == 0) {
3856 return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
3857 }
3858 else if (r != ONIG_NO_SUPPORT_CONFIG) {
3859 return r;
3860 }
3861
3862 r = 0;
3863 switch (ctype) {
3864 case ONIGENC_CTYPE_ALPHA:
3865 case ONIGENC_CTYPE_BLANK:
3866 case ONIGENC_CTYPE_CNTRL:
3867 case ONIGENC_CTYPE_DIGIT:
3868 case ONIGENC_CTYPE_LOWER:
3869 case ONIGENC_CTYPE_PUNCT:
3870 case ONIGENC_CTYPE_SPACE:
3871 case ONIGENC_CTYPE_UPPER:
3872 case ONIGENC_CTYPE_XDIGIT:
3873 case ONIGENC_CTYPE_ASCII:
3874 case ONIGENC_CTYPE_ALNUM:
3875 if (not != 0) {
3876 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3877 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3878 BITSET_SET_BIT(cc->bs, c);
3879 }
3880 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3881 }
3882 else {
3883 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3884 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3885 BITSET_SET_BIT(cc->bs, c);
3886 }
3887 }
3888 break;
3889
3890 case ONIGENC_CTYPE_GRAPH:
3891 case ONIGENC_CTYPE_PRINT:
3892 if (not != 0) {
3893 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3894 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3895 BITSET_SET_BIT(cc->bs, c);
3896 }
3897 }
3898 else {
3899 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3900 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3901 BITSET_SET_BIT(cc->bs, c);
3902 }
3903 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3904 }
3905 break;
3906
3907 case ONIGENC_CTYPE_WORD:
3908 if (not == 0) {
3909 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3910 if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3911 }
3912 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3913 }
3914 else {
3915 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3916 if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
3917 && ! ONIGENC_IS_CODE_WORD(enc, c))
3918 BITSET_SET_BIT(cc->bs, c);
3919 }
3920 }
3921 break;
3922
3923 default:
3924 return ONIGERR_PARSER_BUG;
3925 break;
3926 }
3927
3928 return r;
3929 }
3930
3931 static int
parse_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)3932 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3933 {
3934 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
3935 #define POSIX_BRACKET_NAME_MIN_LEN 4
3936
3937 static PosixBracketEntryType PBS[] = {
3938 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
3939 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
3940 { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
3941 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
3942 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
3943 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
3944 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
3945 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
3946 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
3947 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
3948 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
3949 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3950 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
3951 { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 },
3952 { (UChar* )NULL, -1, 0 }
3953 };
3954
3955 PosixBracketEntryType *pb;
3956 int not, i, r;
3957 OnigCodePoint c;
3958 OnigEncoding enc = env->enc;
3959 UChar *p = *src;
3960
3961 if (PPEEK_IS('^')) {
3962 PINC_S;
3963 not = 1;
3964 }
3965 else
3966 not = 0;
3967
3968 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
3969 goto not_posix_bracket;
3970
3971 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3972 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3973 p = (UChar* )onigenc_step(enc, p, end, pb->len);
3974 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3975 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3976
3977 r = add_ctype_to_cc(cc, pb->ctype, not, env);
3978 if (r != 0) return r;
3979
3980 PINC_S; PINC_S;
3981 *src = p;
3982 return 0;
3983 }
3984 }
3985
3986 not_posix_bracket:
3987 c = 0;
3988 i = 0;
3989 while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3990 PINC_S;
3991 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3992 }
3993 if (c == ':' && ! PEND) {
3994 PINC_S;
3995 if (! PEND) {
3996 PFETCH_S(c);
3997 if (c == ']')
3998 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3999 }
4000 }
4001
4002 return 1; /* 1: is not POSIX bracket, but no error. */
4003 }
4004
4005 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)4006 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
4007 {
4008 int r;
4009 OnigCodePoint c;
4010 OnigEncoding enc = env->enc;
4011 UChar *prev, *start, *p = *src;
4012
4013 r = 0;
4014 start = prev = p;
4015
4016 while (!PEND) {
4017 prev = p;
4018 PFETCH_S(c);
4019 if (c == '}') {
4020 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
4021 if (r < 0) break;
4022
4023 *src = p;
4024 return r;
4025 }
4026 else if (c == '(' || c == ')' || c == '{' || c == '|') {
4027 r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
4028 break;
4029 }
4030 }
4031
4032 onig_scan_env_set_error_string(env, r, *src, prev);
4033 return r;
4034 }
4035
4036 static int
parse_char_property(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4037 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
4038 ScanEnv* env)
4039 {
4040 int r, ctype;
4041 CClassNode* cc;
4042
4043 ctype = fetch_char_property_to_ctype(src, end, env);
4044 if (ctype < 0) return ctype;
4045
4046 *np = node_new_cclass();
4047 CHECK_NULL_RETURN_MEMERR(*np);
4048 cc = NCCLASS(*np);
4049 r = add_ctype_to_cc(cc, ctype, 0, env);
4050 if (r != 0) return r;
4051 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
4052
4053 return 0;
4054 }
4055
4056
/* Parser state while scanning the members of a character class
   (see next_state_val(), next_state_class() and parse_char_class()). */
enum CCSTATE {
  CCS_VALUE,    /* a value is pending and has not been stored in the class yet */
  CCS_RANGE,    /* a range operator was seen after a value; upper bound expected */
  CCS_COMPLETE, /* a range "from-to" has just been stored */
  CCS_START     /* initial state; also re-entered after "&&" */
};
4063
/* Kind of the value most recently seen inside a character class. */
enum CCVALTYPE {
  CCV_SB,         /* single-byte value; stored in the bitset */
  CCV_CODE_POINT, /* multi-byte code point; stored in the code-range buffer */
  CCV_CLASS       /* a whole class (set by next_state_class()) */
};
4069
4070 static int
next_state_class(CClassNode * cc,OnigCodePoint * vs,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4071 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
4072 enum CCSTATE* state, ScanEnv* env)
4073 {
4074 int r;
4075
4076 if (*state == CCS_RANGE)
4077 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
4078
4079 if (*state == CCS_VALUE && *type != CCV_CLASS) {
4080 if (*type == CCV_SB)
4081 BITSET_SET_BIT(cc->bs, (int )(*vs));
4082 else if (*type == CCV_CODE_POINT) {
4083 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4084 if (r < 0) return r;
4085 }
4086 }
4087
4088 if (*state != CCS_START)
4089 *state = CCS_VALUE;
4090
4091 *type = CCV_CLASS;
4092 return 0;
4093 }
4094
4095 static int
next_state_val(CClassNode * cc,OnigCodePoint * from,OnigCodePoint to,int * from_israw,int to_israw,enum CCVALTYPE intype,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4096 next_state_val(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to,
4097 int* from_israw, int to_israw,
4098 enum CCVALTYPE intype, enum CCVALTYPE* type,
4099 enum CCSTATE* state, ScanEnv* env)
4100 {
4101 int r;
4102
4103 switch (*state) {
4104 case CCS_VALUE:
4105 if (*type == CCV_SB) {
4106 if (*from > 0xff)
4107 return ONIGERR_INVALID_CODE_POINT_VALUE;
4108
4109 BITSET_SET_BIT(cc->bs, (int )(*from));
4110 }
4111 else if (*type == CCV_CODE_POINT) {
4112 r = add_code_range(&(cc->mbuf), env, *from, *from);
4113 if (r < 0) return r;
4114 }
4115 break;
4116
4117 case CCS_RANGE:
4118 if (intype == *type) {
4119 if (intype == CCV_SB) {
4120 if (*from > 0xff || to > 0xff)
4121 return ONIGERR_INVALID_CODE_POINT_VALUE;
4122
4123 if (*from > to) {
4124 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4125 goto ccs_range_end;
4126 else
4127 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4128 }
4129 bitset_set_range(cc->bs, (int )*from, (int )to);
4130 }
4131 else {
4132 r = add_code_range(&(cc->mbuf), env, *from, to);
4133 if (r < 0) return r;
4134 }
4135 }
4136 else {
4137 if (*from > to) {
4138 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4139 goto ccs_range_end;
4140 else
4141 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4142 }
4143 bitset_set_range(cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff));
4144 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*from, to);
4145 if (r < 0) return r;
4146 }
4147 ccs_range_end:
4148 *state = CCS_COMPLETE;
4149 break;
4150
4151 case CCS_COMPLETE:
4152 case CCS_START:
4153 *state = CCS_VALUE;
4154 break;
4155
4156 default:
4157 break;
4158 }
4159
4160 *from_israw = to_israw;
4161 *from = to;
4162 *type = intype;
4163 return 0;
4164 }
4165
4166 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,ScanEnv * env)4167 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4168 ScanEnv* env)
4169 {
4170 int in_esc;
4171 OnigCodePoint code;
4172 OnigEncoding enc = env->enc;
4173 UChar* p = from;
4174
4175 in_esc = 0;
4176 while (! PEND) {
4177 if (ignore_escaped && in_esc) {
4178 in_esc = 0;
4179 }
4180 else {
4181 PFETCH_S(code);
4182 if (code == c) return 1;
4183 if (code == MC_ESC(env->syntax)) in_esc = 1;
4184 }
4185 }
4186 return 0;
4187 }
4188
4189 static int
parse_char_class(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4190 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4191 ScanEnv* env)
4192 {
4193 int r, neg, len, fetched, and_start;
4194 OnigCodePoint v, vs;
4195 UChar *p;
4196 Node* node;
4197 CClassNode *cc, *prev_cc;
4198 CClassNode work_cc;
4199
4200 enum CCSTATE state;
4201 enum CCVALTYPE val_type, in_type;
4202 int val_israw, in_israw;
4203
4204 *np = NULL_NODE;
4205 env->parse_depth++;
4206 if (env->parse_depth > ParseDepthLimit)
4207 return ONIGERR_PARSE_DEPTH_LIMIT_OVER;
4208 prev_cc = (CClassNode* )NULL;
4209 r = fetch_token_in_cc(tok, src, end, env);
4210 if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4211 neg = 1;
4212 r = fetch_token_in_cc(tok, src, end, env);
4213 }
4214 else {
4215 neg = 0;
4216 }
4217
4218 if (r < 0) return r;
4219 if (r == TK_CC_CLOSE) {
4220 if (! code_exist_check((OnigCodePoint )']',
4221 *src, env->pattern_end, 1, env))
4222 return ONIGERR_EMPTY_CHAR_CLASS;
4223
4224 CC_ESC_WARN(env, (UChar* )"]");
4225 r = tok->type = TK_CHAR; /* allow []...] */
4226 }
4227
4228 *np = node = node_new_cclass();
4229 CHECK_NULL_RETURN_MEMERR(node);
4230 cc = NCCLASS(node);
4231
4232 and_start = 0;
4233 state = CCS_START;
4234 p = *src;
4235 while (r != TK_CC_CLOSE) {
4236 fetched = 0;
4237 switch (r) {
4238 case TK_CHAR:
4239 any_char_in:
4240 len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4241 if (len > 1) {
4242 in_type = CCV_CODE_POINT;
4243 }
4244 else if (len < 0) {
4245 r = len;
4246 goto err;
4247 }
4248 else {
4249 /* sb_char: */
4250 in_type = CCV_SB;
4251 }
4252 v = (OnigCodePoint )tok->u.c;
4253 in_israw = 0;
4254 goto val_entry2;
4255 break;
4256
4257 case TK_RAW_BYTE:
4258 /* tok->base != 0 : octal or hexadec. */
4259 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4260 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4261 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4262 UChar* psave = p;
4263 int i, base = tok->base;
4264
4265 buf[0] = tok->u.c;
4266 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4267 r = fetch_token_in_cc(tok, &p, end, env);
4268 if (r < 0) goto err;
4269 if (r != TK_RAW_BYTE || tok->base != base) {
4270 fetched = 1;
4271 break;
4272 }
4273 buf[i] = tok->u.c;
4274 }
4275
4276 if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4277 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4278 goto err;
4279 }
4280
4281 len = enclen(env->enc, buf);
4282 if (i < len) {
4283 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4284 goto err;
4285 }
4286 else if (i > len) { /* fetch back */
4287 p = psave;
4288 for (i = 1; i < len; i++) {
4289 r = fetch_token_in_cc(tok, &p, end, env);
4290 }
4291 fetched = 0;
4292 }
4293
4294 if (i == 1) {
4295 v = (OnigCodePoint )buf[0];
4296 goto raw_single;
4297 }
4298 else {
4299 v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4300 in_type = CCV_CODE_POINT;
4301 }
4302 }
4303 else {
4304 v = (OnigCodePoint )tok->u.c;
4305 raw_single:
4306 in_type = CCV_SB;
4307 }
4308 in_israw = 1;
4309 goto val_entry2;
4310 break;
4311
4312 case TK_CODE_POINT:
4313 v = tok->u.code;
4314 in_israw = 1;
4315 val_entry:
4316 len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4317 if (len < 0) {
4318 r = len;
4319 goto err;
4320 }
4321 in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4322 val_entry2:
4323 r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4324 &state, env);
4325 if (r != 0) goto err;
4326 break;
4327
4328 case TK_POSIX_BRACKET_OPEN:
4329 r = parse_posix_bracket(cc, &p, end, env);
4330 if (r < 0) goto err;
4331 if (r == 1) { /* is not POSIX bracket */
4332 CC_ESC_WARN(env, (UChar* )"[");
4333 p = tok->backp;
4334 v = (OnigCodePoint )tok->u.c;
4335 in_israw = 0;
4336 goto val_entry;
4337 }
4338 goto next_class;
4339 break;
4340
4341 case TK_CHAR_TYPE:
4342 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
4343 if (r != 0) return r;
4344
4345 next_class:
4346 r = next_state_class(cc, &vs, &val_type, &state, env);
4347 if (r != 0) goto err;
4348 break;
4349
4350 case TK_CHAR_PROPERTY:
4351 {
4352 int ctype;
4353
4354 ctype = fetch_char_property_to_ctype(&p, end, env);
4355 if (ctype < 0) return ctype;
4356 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4357 if (r != 0) return r;
4358 goto next_class;
4359 }
4360 break;
4361
4362 case TK_CC_RANGE:
4363 if (state == CCS_VALUE) {
4364 r = fetch_token_in_cc(tok, &p, end, env);
4365 if (r < 0) goto err;
4366 fetched = 1;
4367 if (r == TK_CC_CLOSE) { /* allow [x-] */
4368 range_end_val:
4369 v = (OnigCodePoint )'-';
4370 in_israw = 0;
4371 goto val_entry;
4372 }
4373 else if (r == TK_CC_AND) {
4374 CC_ESC_WARN(env, (UChar* )"-");
4375 goto range_end_val;
4376 }
4377 state = CCS_RANGE;
4378 }
4379 else if (state == CCS_START) {
4380 /* [-xa] is allowed */
4381 v = (OnigCodePoint )tok->u.c;
4382 in_israw = 0;
4383
4384 r = fetch_token_in_cc(tok, &p, end, env);
4385 if (r < 0) goto err;
4386 fetched = 1;
4387 /* [--x] or [a&&-x] is warned. */
4388 if (r == TK_CC_RANGE || and_start != 0)
4389 CC_ESC_WARN(env, (UChar* )"-");
4390
4391 goto val_entry;
4392 }
4393 else if (state == CCS_RANGE) {
4394 CC_ESC_WARN(env, (UChar* )"-");
4395 goto any_char_in; /* [!--x] is allowed */
4396 }
4397 else { /* CCS_COMPLETE */
4398 r = fetch_token_in_cc(tok, &p, end, env);
4399 if (r < 0) goto err;
4400 fetched = 1;
4401 if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4402 else if (r == TK_CC_AND) {
4403 CC_ESC_WARN(env, (UChar* )"-");
4404 goto range_end_val;
4405 }
4406
4407 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4408 CC_ESC_WARN(env, (UChar* )"-");
4409 goto range_end_val; /* [0-9-a] is allowed as [0-9\-a] */
4410 }
4411 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4412 goto err;
4413 }
4414 break;
4415
4416 case TK_CC_CC_OPEN: /* [ */
4417 {
4418 Node *anode;
4419 CClassNode* acc;
4420
4421 r = parse_char_class(&anode, tok, &p, end, env);
4422 if (r != 0) {
4423 onig_node_free(anode);
4424 goto cc_open_err;
4425 }
4426 acc = NCCLASS(anode);
4427 r = or_cclass(cc, acc, env->enc);
4428
4429 onig_node_free(anode);
4430 cc_open_err:
4431 if (r != 0) goto err;
4432 }
4433 break;
4434
4435 case TK_CC_AND: /* && */
4436 {
4437 if (state == CCS_VALUE) {
4438 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4439 &val_type, &state, env);
4440 if (r != 0) goto err;
4441 }
4442 /* initialize local variables */
4443 and_start = 1;
4444 state = CCS_START;
4445
4446 if (IS_NOT_NULL(prev_cc)) {
4447 r = and_cclass(prev_cc, cc, env->enc);
4448 if (r != 0) goto err;
4449 bbuf_free(cc->mbuf);
4450 }
4451 else {
4452 prev_cc = cc;
4453 cc = &work_cc;
4454 }
4455 initialize_cclass(cc);
4456 }
4457 break;
4458
4459 case TK_EOT:
4460 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4461 goto err;
4462 break;
4463 default:
4464 r = ONIGERR_PARSER_BUG;
4465 goto err;
4466 break;
4467 }
4468
4469 if (fetched)
4470 r = tok->type;
4471 else {
4472 r = fetch_token_in_cc(tok, &p, end, env);
4473 if (r < 0) goto err;
4474 }
4475 }
4476
4477 if (state == CCS_VALUE) {
4478 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4479 &val_type, &state, env);
4480 if (r != 0) goto err;
4481 }
4482
4483 if (IS_NOT_NULL(prev_cc)) {
4484 r = and_cclass(prev_cc, cc, env->enc);
4485 if (r != 0) goto err;
4486 bbuf_free(cc->mbuf);
4487 cc = prev_cc;
4488 }
4489
4490 if (neg != 0)
4491 NCCLASS_SET_NOT(cc);
4492 else
4493 NCCLASS_CLEAR_NOT(cc);
4494 if (IS_NCCLASS_NOT(cc) &&
4495 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4496 int is_empty;
4497
4498 is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4499 if (is_empty != 0)
4500 BITSET_IS_EMPTY(cc->bs, is_empty);
4501
4502 if (is_empty == 0) {
4503 #define NEWLINE_CODE 0x0a
4504
4505 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4506 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4507 BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4508 else
4509 add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4510 }
4511 }
4512 }
4513 *src = p;
4514 env->parse_depth--;
4515 return 0;
4516
4517 err:
4518 if (cc != NCCLASS(*np))
4519 bbuf_free(cc->mbuf);
4520 return r;
4521 }
4522
4523 static int parse_subexp(Node** top, OnigToken* tok, int term,
4524 UChar** src, UChar* end, ScanEnv* env);
4525
4526 static int
parse_enclose(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4527 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4528 ScanEnv* env)
4529 {
4530 int r, num;
4531 Node *target;
4532 OnigOptionType option;
4533 OnigCodePoint c;
4534 OnigEncoding enc = env->enc;
4535
4536 #ifdef USE_NAMED_GROUP
4537 int list_capture;
4538 #endif
4539
4540 UChar* p = *src;
4541 PFETCH_READY;
4542
4543 *np = NULL;
4544 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4545
4546 option = env->option;
4547 if (PPEEK_IS('?') &&
4548 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4549 PINC;
4550 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4551
4552 PFETCH(c);
4553 switch (c) {
4554 case ':': /* (?:...) grouping only */
4555 group:
4556 r = fetch_token(tok, &p, end, env);
4557 if (r < 0) return r;
4558 r = parse_subexp(np, tok, term, &p, end, env);
4559 if (r < 0) return r;
4560 *src = p;
4561 return 1; /* group */
4562 break;
4563
4564 case '=':
4565 *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4566 break;
4567 case '!': /* preceding read */
4568 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4569 break;
4570 case '>': /* (?>...) stop backtrack */
4571 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
4572 break;
4573
4574 #ifdef USE_NAMED_GROUP
4575 case '\'':
4576 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4577 goto named_group1;
4578 }
4579 else
4580 return ONIGERR_UNDEFINED_GROUP_OPTION;
4581 break;
4582 #endif
4583
4584 case '<': /* look behind (?<=...), (?<!...) */
4585 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4586 PFETCH(c);
4587 if (c == '=')
4588 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4589 else if (c == '!')
4590 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4591 #ifdef USE_NAMED_GROUP
4592 else {
4593 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4594 UChar *name;
4595 UChar *name_end;
4596
4597 PUNFETCH;
4598 c = '<';
4599
4600 named_group1:
4601 list_capture = 0;
4602
4603 named_group2:
4604 name = p;
4605 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
4606 if (r < 0) return r;
4607
4608 num = scan_env_add_mem_entry(env);
4609 if (num < 0) return num;
4610 if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
4611 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4612
4613 r = name_add(env->reg, name, name_end, num, env);
4614 if (r != 0) return r;
4615 *np = node_new_enclose_memory(env->option, 1);
4616 CHECK_NULL_RETURN_MEMERR(*np);
4617 NENCLOSE(*np)->regnum = num;
4618 if (list_capture != 0)
4619 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4620 env->num_named++;
4621 }
4622 else {
4623 return ONIGERR_UNDEFINED_GROUP_OPTION;
4624 }
4625 }
4626 #else
4627 else {
4628 return ONIGERR_UNDEFINED_GROUP_OPTION;
4629 }
4630 #endif
4631 break;
4632
4633 case '@':
4634 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4635 #ifdef USE_NAMED_GROUP
4636 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4637 PFETCH(c);
4638 if (c == '<' || c == '\'') {
4639 list_capture = 1;
4640 goto named_group2; /* (?@<name>...) */
4641 }
4642 PUNFETCH;
4643 }
4644 #endif
4645 *np = node_new_enclose_memory(env->option, 0);
4646 CHECK_NULL_RETURN_MEMERR(*np);
4647 num = scan_env_add_mem_entry(env);
4648 if (num < 0) {
4649 return num;
4650 }
4651 else if (num >= (int )BIT_STATUS_BITS_NUM) {
4652 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4653 }
4654 NENCLOSE(*np)->regnum = num;
4655 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4656 }
4657 else {
4658 return ONIGERR_UNDEFINED_GROUP_OPTION;
4659 }
4660 break;
4661
4662 #ifdef USE_POSIXLINE_OPTION
4663 case 'p':
4664 #endif
4665 case '-': case 'i': case 'm': case 's': case 'x':
4666 {
4667 int neg = 0;
4668
4669 while (1) {
4670 switch (c) {
4671 case ':':
4672 case ')':
4673 break;
4674
4675 case '-': neg = 1; break;
4676 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
4677 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4678 case 's':
4679 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4680 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4681 }
4682 else
4683 return ONIGERR_UNDEFINED_GROUP_OPTION;
4684 break;
4685
4686 case 'm':
4687 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4688 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4689 }
4690 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4691 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4692 }
4693 else
4694 return ONIGERR_UNDEFINED_GROUP_OPTION;
4695 break;
4696 #ifdef USE_POSIXLINE_OPTION
4697 case 'p':
4698 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4699 break;
4700 #endif
4701 default:
4702 return ONIGERR_UNDEFINED_GROUP_OPTION;
4703 }
4704
4705 if (c == ')') {
4706 *np = node_new_option(option);
4707 CHECK_NULL_RETURN_MEMERR(*np);
4708 *src = p;
4709 return 2; /* option only */
4710 }
4711 else if (c == ':') {
4712 OnigOptionType prev = env->option;
4713
4714 env->option = option;
4715 r = fetch_token(tok, &p, end, env);
4716 if (r < 0) return r;
4717 r = parse_subexp(&target, tok, term, &p, end, env);
4718 env->option = prev;
4719 if (r < 0) {
4720 onig_node_free(target);
4721 return r;
4722 }
4723 *np = node_new_option(option);
4724 CHECK_NULL_RETURN_MEMERR(*np);
4725 NENCLOSE(*np)->target = target;
4726 *src = p;
4727 return 0;
4728 }
4729
4730 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4731 PFETCH(c);
4732 }
4733 }
4734 break;
4735
4736 default:
4737 return ONIGERR_UNDEFINED_GROUP_OPTION;
4738 }
4739 }
4740 else {
4741 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4742 goto group;
4743
4744 *np = node_new_enclose_memory(env->option, 0);
4745 CHECK_NULL_RETURN_MEMERR(*np);
4746 num = scan_env_add_mem_entry(env);
4747 if (num < 0) return num;
4748 NENCLOSE(*np)->regnum = num;
4749 }
4750
4751 CHECK_NULL_RETURN_MEMERR(*np);
4752 r = fetch_token(tok, &p, end, env);
4753 if (r < 0) return r;
4754 r = parse_subexp(&target, tok, term, &p, end, env);
4755 if (r < 0) {
4756 onig_node_free(target);
4757 return r;
4758 }
4759
4760 if (NTYPE(*np) == NT_ANCHOR)
4761 NANCHOR(*np)->target = target;
4762 else {
4763 NENCLOSE(*np)->target = target;
4764 if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
4765 /* Don't move this to previous of parse_subexp() */
4766 r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
4767 if (r != 0) return r;
4768 }
4769 }
4770
4771 *src = p;
4772 return 0;
4773 }
4774
/* Spelling of each "popular" quantifier; set_quantifier() indexes this
   table with the value returned by popular_quantifier_num(). */
static const char* PopularQStr[] = {
  "?", "*", "+", "??", "*?", "+?"
};

/* Replacement text used in the nested-repeat warning; set_quantifier()
   indexes this table with an entry of ReduceTypeTable. */
static const char* ReduceQStr[] = {
  "", "", "*", "*?", "??", "+ and ??", "+? and ?"
};
4782
4783 static int
set_quantifier(Node * qnode,Node * target,int group,ScanEnv * env)4784 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4785 {
4786 QtfrNode* qn;
4787
4788 qn = NQTFR(qnode);
4789 if (qn->lower == 1 && qn->upper == 1) {
4790 return 1;
4791 }
4792
4793 switch (NTYPE(target)) {
4794 case NT_STR:
4795 if (! group) {
4796 StrNode* sn = NSTR(target);
4797 if (str_node_can_be_split(sn, env->enc)) {
4798 Node* n = str_node_split_last_char(sn, env->enc);
4799 if (IS_NOT_NULL(n)) {
4800 qn->target = n;
4801 return 2;
4802 }
4803 }
4804 }
4805 break;
4806
4807 case NT_QTFR:
4808 { /* check redundant double repeat. */
4809 /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
4810 QtfrNode* qnt = NQTFR(target);
4811 int nestq_num = popular_quantifier_num(qn);
4812 int targetq_num = popular_quantifier_num(qnt);
4813
4814 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
4815 if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4816 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4817 UChar buf[WARN_BUFSIZE];
4818
4819 switch(ReduceTypeTable[targetq_num][nestq_num]) {
4820 case RQ_ASIS:
4821 break;
4822
4823 case RQ_DEL:
4824 if (onig_verb_warn != onig_null_warn) {
4825 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4826 env->pattern, env->pattern_end,
4827 (UChar* )"redundant nested repeat operator");
4828 (*onig_verb_warn)((char* )buf);
4829 }
4830 goto warn_exit;
4831 break;
4832
4833 default:
4834 if (onig_verb_warn != onig_null_warn) {
4835 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4836 env->pattern, env->pattern_end,
4837 (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4838 PopularQStr[targetq_num], PopularQStr[nestq_num],
4839 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4840 (*onig_verb_warn)((char* )buf);
4841 }
4842 goto warn_exit;
4843 break;
4844 }
4845 }
4846
4847 warn_exit:
4848 #endif
4849 if (targetq_num >= 0) {
4850 if (nestq_num >= 0) {
4851 onig_reduce_nested_quantifier(qnode, target);
4852 goto q_exit;
4853 }
4854 else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
4855 /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
4856 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4857 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4858 }
4859 }
4860 }
4861 }
4862 break;
4863
4864 default:
4865 break;
4866 }
4867
4868 qn->target = target;
4869 q_exit:
4870 return 0;
4871 }
4872
4873
4874 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4875 static int
clear_not_flag_cclass(CClassNode * cc,OnigEncoding enc)4876 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
4877 {
4878 BBuf *tbuf;
4879 int r;
4880
4881 if (IS_NCCLASS_NOT(cc)) {
4882 bitset_invert(cc->bs);
4883
4884 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4885 r = not_code_range_buf(enc, cc->mbuf, &tbuf);
4886 if (r != 0) return r;
4887
4888 bbuf_free(cc->mbuf);
4889 cc->mbuf = tbuf;
4890 }
4891
4892 NCCLASS_CLEAR_NOT(cc);
4893 }
4894
4895 return 0;
4896 }
4897 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
4898
/* Argument block passed (as void*) to i_apply_case_fold(). */
typedef struct {
  ScanEnv*    env;      /* parse environment (encoding, syntax) */
  CClassNode* cc;       /* character class being expanded */
  Node*       alt_root; /* not touched by i_apply_case_fold(); presumably the
                           head of the alternative list -- confirm at caller */
  Node**      ptail;    /* slot that receives the next alternative node */
} IApplyCaseFoldArg;
4905
4906 static int
i_apply_case_fold(OnigCodePoint from,OnigCodePoint to[],int to_len,void * arg)4907 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
4908 int to_len, void* arg)
4909 {
4910 IApplyCaseFoldArg* iarg;
4911 ScanEnv* env;
4912 CClassNode* cc;
4913 BitSetRef bs;
4914
4915 iarg = (IApplyCaseFoldArg* )arg;
4916 env = iarg->env;
4917 cc = iarg->cc;
4918 bs = cc->bs;
4919
4920 if (to_len == 1) {
4921 int is_in = onig_is_code_in_cc(env->enc, from, cc);
4922 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4923 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
4924 (is_in == 0 && IS_NCCLASS_NOT(cc))) {
4925 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4926 add_code_range(&(cc->mbuf), env, *to, *to);
4927 }
4928 else {
4929 BITSET_SET_BIT(bs, *to);
4930 }
4931 }
4932 #else
4933 if (is_in != 0) {
4934 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4935 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
4936 add_code_range(&(cc->mbuf), env, *to, *to);
4937 }
4938 else {
4939 if (IS_NCCLASS_NOT(cc)) {
4940 BITSET_CLEAR_BIT(bs, *to);
4941 }
4942 else
4943 BITSET_SET_BIT(bs, *to);
4944 }
4945 }
4946 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
4947 }
4948 else {
4949 int r, i, len;
4950 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4951 Node *snode = NULL_NODE;
4952
4953 if (onig_is_code_in_cc(env->enc, from, cc)
4954 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4955 && !IS_NCCLASS_NOT(cc)
4956 #endif
4957 ) {
4958 for (i = 0; i < to_len; i++) {
4959 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
4960 if (i == 0) {
4961 snode = onig_node_new_str(buf, buf + len);
4962 CHECK_NULL_RETURN_MEMERR(snode);
4963
4964 /* char-class expanded multi-char only
4965 compare with string folded at match time. */
4966 NSTRING_SET_AMBIG(snode);
4967 }
4968 else {
4969 r = onig_node_str_cat(snode, buf, buf + len);
4970 if (r < 0) {
4971 onig_node_free(snode);
4972 return r;
4973 }
4974 }
4975 }
4976
4977 *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
4978 CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
4979 iarg->ptail = &(NCDR((*(iarg->ptail))));
4980 }
4981 }
4982
4983 return 0;
4984 }
4985
4986 static int
parse_exp(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4987 parse_exp(Node** np, OnigToken* tok, int term,
4988 UChar** src, UChar* end, ScanEnv* env)
4989 {
4990 int r, len, group = 0;
4991 Node* qn;
4992 Node** targetp;
4993
4994 *np = NULL;
4995 if (tok->type == (enum TokenSyms )term)
4996 goto end_of_token;
4997
4998 switch (tok->type) {
4999 case TK_ALT:
5000 case TK_EOT:
5001 end_of_token:
5002 *np = node_new_empty();
5003 return tok->type;
5004 break;
5005
5006 case TK_SUBEXP_OPEN:
5007 r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
5008 if (r < 0) return r;
5009 if (r == 1) group = 1;
5010 else if (r == 2) { /* option only */
5011 Node* target;
5012 OnigOptionType prev = env->option;
5013
5014 env->option = NENCLOSE(*np)->option;
5015 r = fetch_token(tok, src, end, env);
5016 if (r < 0) return r;
5017 r = parse_subexp(&target, tok, term, src, end, env);
5018 env->option = prev;
5019 if (r < 0) {
5020 onig_node_free(target);
5021 return r;
5022 }
5023 NENCLOSE(*np)->target = target;
5024 return tok->type;
5025 }
5026 break;
5027
5028 case TK_SUBEXP_CLOSE:
5029 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
5030 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
5031
5032 if (tok->escaped) goto tk_raw_byte;
5033 else goto tk_byte;
5034 break;
5035
5036 case TK_STRING:
5037 tk_byte:
5038 {
5039 *np = node_new_str(tok->backp, *src);
5040 CHECK_NULL_RETURN_MEMERR(*np);
5041
5042 while (1) {
5043 r = fetch_token(tok, src, end, env);
5044 if (r < 0) return r;
5045 if (r != TK_STRING) break;
5046
5047 r = onig_node_str_cat(*np, tok->backp, *src);
5048 if (r < 0) return r;
5049 }
5050
5051 string_end:
5052 targetp = np;
5053 goto repeat;
5054 }
5055 break;
5056
5057 case TK_RAW_BYTE:
5058 tk_raw_byte:
5059 {
5060 *np = node_new_str_raw_char((UChar )tok->u.c);
5061 CHECK_NULL_RETURN_MEMERR(*np);
5062 len = 1;
5063 while (1) {
5064 if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
5065 if (len == enclen(env->enc, NSTR(*np)->s)) {//should not enclen_end()
5066 r = fetch_token(tok, src, end, env);
5067 NSTRING_CLEAR_RAW(*np);
5068 goto string_end;
5069 }
5070 }
5071
5072 r = fetch_token(tok, src, end, env);
5073 if (r < 0) return r;
5074 if (r != TK_RAW_BYTE) {
5075 /* Don't use this, it is wrong for little endian encodings. */
5076 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
5077 int rem;
5078 if (len < ONIGENC_MBC_MINLEN(env->enc)) {
5079 rem = ONIGENC_MBC_MINLEN(env->enc) - len;
5080 (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
5081 if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
5082 NSTRING_CLEAR_RAW(*np);
5083 goto string_end;
5084 }
5085 }
5086 #endif
5087 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
5088 }
5089
5090 r = node_str_cat_char(*np, (UChar )tok->u.c);
5091 if (r < 0) return r;
5092
5093 len++;
5094 }
5095 }
5096 break;
5097
5098 case TK_CODE_POINT:
5099 {
5100 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5101 int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
5102 if (num < 0) return num;
5103 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
5104 *np = node_new_str_raw(buf, buf + num);
5105 #else
5106 *np = node_new_str(buf, buf + num);
5107 #endif
5108 CHECK_NULL_RETURN_MEMERR(*np);
5109 }
5110 break;
5111
5112 case TK_QUOTE_OPEN:
5113 {
5114 OnigCodePoint end_op[2];
5115 UChar *qstart, *qend, *nextp;
5116
5117 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
5118 end_op[1] = (OnigCodePoint )'E';
5119 qstart = *src;
5120 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
5121 if (IS_NULL(qend)) {
5122 nextp = qend = end;
5123 }
5124 *np = node_new_str(qstart, qend);
5125 CHECK_NULL_RETURN_MEMERR(*np);
5126 *src = nextp;
5127 }
5128 break;
5129
5130 case TK_CHAR_TYPE:
5131 {
5132 switch (tok->u.prop.ctype) {
5133 case ONIGENC_CTYPE_WORD:
5134 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
5135 CHECK_NULL_RETURN_MEMERR(*np);
5136 break;
5137
5138 case ONIGENC_CTYPE_SPACE:
5139 case ONIGENC_CTYPE_DIGIT:
5140 case ONIGENC_CTYPE_XDIGIT:
5141 {
5142 CClassNode* cc;
5143
5144 *np = node_new_cclass();
5145 CHECK_NULL_RETURN_MEMERR(*np);
5146 cc = NCCLASS(*np);
5147 add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
5148 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
5149 }
5150 break;
5151
5152 default:
5153 return ONIGERR_PARSER_BUG;
5154 break;
5155 }
5156 }
5157 break;
5158
5159 case TK_CHAR_PROPERTY:
5160 r = parse_char_property(np, tok, src, end, env);
5161 if (r != 0) return r;
5162 break;
5163
5164 case TK_CC_OPEN:
5165 {
5166 CClassNode* cc;
5167
5168 r = parse_char_class(np, tok, src, end, env);
5169 if (r != 0) return r;
5170
5171 cc = NCCLASS(*np);
5172 if (IS_IGNORECASE(env->option)) {
5173 IApplyCaseFoldArg iarg;
5174
5175 iarg.env = env;
5176 iarg.cc = cc;
5177 iarg.alt_root = NULL_NODE;
5178 iarg.ptail = &(iarg.alt_root);
5179
5180 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
5181 i_apply_case_fold, &iarg);
5182 if (r != 0) {
5183 onig_node_free(iarg.alt_root);
5184 return r;
5185 }
5186 if (IS_NOT_NULL(iarg.alt_root)) {
5187 Node* work = onig_node_new_alt(*np, iarg.alt_root);
5188 if (IS_NULL(work)) {
5189 onig_node_free(iarg.alt_root);
5190 return ONIGERR_MEMORY;
5191 }
5192 *np = work;
5193 }
5194 }
5195 }
5196 break;
5197
5198 case TK_ANYCHAR:
5199 *np = node_new_anychar();
5200 CHECK_NULL_RETURN_MEMERR(*np);
5201 break;
5202
5203 case TK_ANYCHAR_ANYTIME:
5204 *np = node_new_anychar();
5205 CHECK_NULL_RETURN_MEMERR(*np);
5206 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5207 CHECK_NULL_RETURN_MEMERR(qn);
5208 NQTFR(qn)->target = *np;
5209 *np = qn;
5210 break;
5211
5212 case TK_BACKREF:
5213 len = tok->u.backref.num;
5214 *np = node_new_backref(len,
5215 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5216 tok->u.backref.by_name,
5217 #ifdef USE_BACKREF_WITH_LEVEL
5218 tok->u.backref.exist_level,
5219 tok->u.backref.level,
5220 #endif
5221 env);
5222 CHECK_NULL_RETURN_MEMERR(*np);
5223 break;
5224
5225 #ifdef USE_SUBEXP_CALL
5226 case TK_CALL:
5227 {
5228 int gnum = tok->u.call.gnum;
5229
5230 if (gnum < 0) {
5231 gnum = BACKREF_REL_TO_ABS(gnum, env);
5232 if (gnum <= 0)
5233 return ONIGERR_INVALID_BACKREF;
5234 }
5235 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
5236 CHECK_NULL_RETURN_MEMERR(*np);
5237 env->num_call++;
5238 }
5239 break;
5240 #endif
5241
5242 case TK_ANCHOR:
5243 *np = onig_node_new_anchor(tok->u.anchor);
5244 break;
5245
5246 case TK_OP_REPEAT:
5247 case TK_INTERVAL:
5248 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5249 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5250 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5251 else
5252 *np = node_new_empty();
5253 }
5254 else {
5255 goto tk_byte;
5256 }
5257 break;
5258
5259 default:
5260 return ONIGERR_PARSER_BUG;
5261 break;
5262 }
5263
5264 {
5265 targetp = np;
5266
5267 re_entry:
5268 r = fetch_token(tok, src, end, env);
5269 if (r < 0) return r;
5270
5271 repeat:
5272 if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5273 if (is_invalid_quantifier_target(*targetp))
5274 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5275
5276 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5277 (r == TK_INTERVAL ? 1 : 0));
5278 CHECK_NULL_RETURN_MEMERR(qn);
5279 NQTFR(qn)->greedy = tok->u.repeat.greedy;
5280 r = set_quantifier(qn, *targetp, group, env);
5281 if (r < 0) {
5282 onig_node_free(qn);
5283 return r;
5284 }
5285
5286 if (tok->u.repeat.possessive != 0) {
5287 Node* en;
5288 en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5289 if (IS_NULL(en)) {
5290 onig_node_free(qn);
5291 return ONIGERR_MEMORY;
5292 }
5293 NENCLOSE(en)->target = qn;
5294 qn = en;
5295 }
5296
5297 if (r == 0) {
5298 *targetp = qn;
5299 }
5300 else if (r == 1) {
5301 onig_node_free(qn);
5302 }
5303 else if (r == 2) { /* split case: /abc+/ */
5304 Node *tmp;
5305
5306 *targetp = node_new_list(*targetp, NULL);
5307 if (IS_NULL(*targetp)) {
5308 onig_node_free(qn);
5309 return ONIGERR_MEMORY;
5310 }
5311 tmp = NCDR(*targetp) = node_new_list(qn, NULL);
5312 if (IS_NULL(tmp)) {
5313 onig_node_free(qn);
5314 return ONIGERR_MEMORY;
5315 }
5316 targetp = &(NCAR(tmp));
5317 }
5318 goto re_entry;
5319 }
5320 }
5321
5322 return r;
5323 }
5324
5325 static int
parse_branch(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5326 parse_branch(Node** top, OnigToken* tok, int term,
5327 UChar** src, UChar* end, ScanEnv* env)
5328 {
5329 int r;
5330 Node *node, **headp;
5331
5332 *top = NULL;
5333 r = parse_exp(&node, tok, term, src, end, env);
5334 if (r < 0) {
5335 onig_node_free(node);
5336 return r;
5337 }
5338
5339 if (r == TK_EOT || r == term || r == TK_ALT) {
5340 *top = node;
5341 }
5342 else {
5343 *top = node_new_list(node, NULL);
5344 headp = &(NCDR(*top));
5345 while (r != TK_EOT && r != term && r != TK_ALT) {
5346 r = parse_exp(&node, tok, term, src, end, env);
5347 if (r < 0) {
5348 onig_node_free(node);
5349 return r;
5350 }
5351
5352 if (NTYPE(node) == NT_LIST) {
5353 *headp = node;
5354 while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
5355 headp = &(NCDR(node));
5356 }
5357 else {
5358 *headp = node_new_list(node, NULL);
5359 headp = &(NCDR(*headp));
5360 }
5361 }
5362 }
5363
5364 return r;
5365 }
5366
5367 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
5368 static int
parse_subexp(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5369 parse_subexp(Node** top, OnigToken* tok, int term,
5370 UChar** src, UChar* end, ScanEnv* env)
5371 {
5372 int r;
5373 Node *node, **headp;
5374
5375 *top = NULL;
5376 env->parse_depth++;
5377 if (env->parse_depth > ParseDepthLimit)
5378 return ONIGERR_PARSE_DEPTH_LIMIT_OVER;
5379 r = parse_branch(&node, tok, term, src, end, env);
5380 if (r < 0) {
5381 onig_node_free(node);
5382 return r;
5383 }
5384
5385 if (r == term) {
5386 *top = node;
5387 }
5388 else if (r == TK_ALT) {
5389 *top = onig_node_new_alt(node, NULL);
5390 headp = &(NCDR(*top));
5391 while (r == TK_ALT) {
5392 r = fetch_token(tok, src, end, env);
5393 if (r < 0) return r;
5394 r = parse_branch(&node, tok, term, src, end, env);
5395 if (r < 0) {
5396 onig_node_free(node);
5397 return r;
5398 }
5399 *headp = onig_node_new_alt(node, NULL);
5400 headp = &(NCDR(*headp));
5401 }
5402
5403 if (tok->type != (enum TokenSyms )term)
5404 goto err;
5405 }
5406 else {
5407 onig_node_free(node);
5408 err:
5409 if (term == TK_SUBEXP_CLOSE)
5410 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5411 else
5412 return ONIGERR_PARSER_BUG;
5413 }
5414
5415 env->parse_depth--;
5416 return r;
5417 }
5418
5419 static int
parse_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)5420 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5421 {
5422 int r;
5423 OnigToken tok;
5424
5425 r = fetch_token(&tok, src, end, env);
5426 if (r < 0) return r;
5427 r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5428 if (r < 0) return r;
5429 return 0;
5430 }
5431
5432 extern int
onig_parse_make_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)5433 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
5434 regex_t* reg, ScanEnv* env)
5435 {
5436 int r;
5437 UChar* p;
5438
5439 #ifdef USE_NAMED_GROUP
5440 names_clear(reg);
5441 #endif
5442
5443 scan_env_clear(env);
5444 env->option = reg->options;
5445 env->case_fold_flag = reg->case_fold_flag;
5446 env->enc = reg->enc;
5447 env->syntax = reg->syntax;
5448 env->pattern = (UChar* )pattern;
5449 env->pattern_end = (UChar* )end;
5450 env->reg = reg;
5451
5452 *root = NULL;
5453
5454 if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, pattern, end))
5455 return ONIGERR_INVALID_WIDE_CHAR_VALUE;
5456
5457 p = (UChar* )pattern;
5458 r = parse_regexp(root, &p, (UChar* )end, env);
5459 reg->num_mem = env->num_mem;
5460 return r;
5461 }
5462
5463 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode ARG_UNUSED,UChar * arg,UChar * arg_end)5464 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
5465 UChar* arg, UChar* arg_end)
5466 {
5467 env->error = arg;
5468 env->error_end = arg_end;
5469 }
5470