1 /**********************************************************************
2 regparse.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regparse.h"
31 #include "st.h"
32
33 #define WARN_BUFSIZE 256
34
35 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
36
37
38 OnigSyntaxType OnigSyntaxRuby = {
39 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
40 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
41 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
42 ONIG_SYN_OP_ESC_C_CONTROL )
43 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
44 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
45 ONIG_SYN_OP2_OPTION_RUBY |
46 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
47 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
48 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
49 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
50 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
51 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
52 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
53 ONIG_SYN_OP2_ESC_H_XDIGIT )
54 , ( SYN_GNU_REGEX_BV |
55 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
56 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
57 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
58 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
59 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
60 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
61 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
62 , ONIG_OPTION_NONE
63 ,
64 {
65 (OnigCodePoint )'\\' /* esc */
66 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
67 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
68 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
69 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
70 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
71 }
72 };
73
74 OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
75
/* Warning handler that ignores its message; the default when none is configured. */
extern void
onig_null_warn(const char *s ARG_UNUSED)
{
}
77
78 #ifdef DEFAULT_WARN_FUNCTION
79 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
80 #else
81 static OnigWarnFunc onig_warn = onig_null_warn;
82 #endif
83
84 #ifdef DEFAULT_VERB_WARN_FUNCTION
85 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
86 #else
87 static OnigWarnFunc onig_verb_warn = onig_null_warn;
88 #endif
89
/* Install `f` as the handler stored in onig_warn. */
extern void
onig_set_warn_func(OnigWarnFunc f)
{
  onig_warn = f;
}
94
/* Install `f` as the handler stored in onig_verb_warn. */
extern void
onig_set_verb_warn_func(OnigWarnFunc f)
{
  onig_verb_warn = f;
}
99
100 static void
bbuf_free(BBuf * bbuf)101 bbuf_free(BBuf* bbuf)
102 {
103 if (IS_NOT_NULL(bbuf)) {
104 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
105 xfree(bbuf);
106 }
107 }
108
109 static int
bbuf_clone(BBuf ** rto,BBuf * from)110 bbuf_clone(BBuf** rto, BBuf* from)
111 {
112 int r;
113 BBuf *to;
114
115 *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
116 CHECK_NULL_RETURN_MEMERR(to);
117 r = BBUF_INIT(to, from->alloc);
118 if (r != 0) return r;
119 to->used = from->used;
120 xmemcpy(to->p, from->p, from->used);
121 return 0;
122 }
123
/* Turn the relative group number `rel_no` into an absolute one, counting
   from one past the number of groups currently recorded in `env`. */
#define BACKREF_REL_TO_ABS(rel_no, env) \
  ((rel_no) + ((env)->num_mem + 1))
126
/*
 * Clear the bits `f` in `v` when `negative` is non-zero, otherwise set them.
 * The whole expansion is parenthesized so that the conditional operator
 * cannot be captured by a neighbouring operator when the macro is used
 * inside a larger expression.
 */
#define ONOFF(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
128
/* First code point of the multi-byte range: 0x80 when the encoding's
   minimum character length is 1 (or less), otherwise 0. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )((ONIGENC_MBC_MINLEN(enc) <= 1) ? 0x80 : 0)

/* Add the range [MBCODE_START_POS(enc), max code point] to `pbuf`. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/* For a non single-byte encoding, add the whole multi-byte range to `mbuf`.
   Uses the caller's variable `r` and returns from the caller on error. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r != 0) return r;\
  }\
} while (0)
141
142
/*
 * Set `empty` to 1 when every word of the bit set `bs` is zero, else to 0.
 * The loop counter has a macro-private name so that an `i` appearing in the
 * caller's `bs` or `empty` expression is not captured by the expansion.
 */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int bitset_is_empty_idx_;\
  (empty) = 1;\
  for (bitset_is_empty_idx_ = 0; bitset_is_empty_idx_ < (int )BITSET_SIZE;\
       bitset_is_empty_idx_++) {\
    if ((bs)[bitset_is_empty_idx_] != 0) {\
      (empty) = 0; break;\
    }\
  }\
} while (0)
152
153 static void
bitset_set_range(BitSetRef bs,int from,int to)154 bitset_set_range(BitSetRef bs, int from, int to)
155 {
156 int i;
157 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
158 BITSET_SET_BIT(bs, i);
159 }
160 }
161
#if 0
/* Set every bit of `bs` (currently compiled out). */
static void
bitset_set_all(BitSetRef bs)
{
  int w = 0;

  while (w < BITSET_SIZE) {
    bs[w] = ~((Bits )0);
    w++;
  }
}
#endif
170
171 static void
bitset_invert(BitSetRef bs)172 bitset_invert(BitSetRef bs)
173 {
174 int i;
175 for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
176 }
177
178 static void
bitset_invert_to(BitSetRef from,BitSetRef to)179 bitset_invert_to(BitSetRef from, BitSetRef to)
180 {
181 int i;
182 for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
183 }
184
185 static void
bitset_and(BitSetRef dest,BitSetRef bs)186 bitset_and(BitSetRef dest, BitSetRef bs)
187 {
188 int i;
189 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
190 }
191
192 static void
bitset_or(BitSetRef dest,BitSetRef bs)193 bitset_or(BitSetRef dest, BitSetRef bs)
194 {
195 int i;
196 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
197 }
198
199 static void
bitset_copy(BitSetRef dest,BitSetRef bs)200 bitset_copy(BitSetRef dest, BitSetRef bs)
201 {
202 int i;
203 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
204 }
205
206 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)207 onig_strncmp(const UChar* s1, const UChar* s2, int n)
208 {
209 int x;
210
211 while (n-- > 0) {
212 x = *s2++ - *s1++;
213 if (x) return x;
214 }
215 return 0;
216 }
217
218 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)219 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
220 {
221 int len = end - src;
222 if (len > 0) {
223 xmemcpy(dest, src, len);
224 dest[len] = (UChar )0;
225 }
226 }
227
#ifdef USE_NAMED_GROUP
/*
 * Return a newly allocated copy of [s, end) followed by
 * ONIGENC_MBC_MINLEN(enc) zero bytes, or NULL when allocation fails.
 */
static UChar*
strdup_with_null(OnigEncoding enc, UChar *s, UChar *end)
{
  int body_len = end - s;
  int pad_len  = ONIGENC_MBC_MINLEN(enc);
  int k;
  UChar *copy;

  copy = (UChar* )xmalloc(body_len + pad_len);
  CHECK_NULL_RETURN(copy);
  xmemcpy(copy, s, body_len);

  /* zero terminator, as wide as the encoding's minimum character */
  for (k = body_len; k < body_len + pad_len; k++) {
    copy[k] = (UChar )0;
  }

  return copy;
}
#endif
248
/* scan pattern methods */
/* These macros operate on the caller's locals `p`, `end`, `enc`
   and (where used) `pfetch_prev`. */
#define PEND_VALUE   0

/* declare the variable that remembers the position before the last fetch */
#define PFETCH_READY  UChar* pfetch_prev
/* 1 when the scan position has reached the end, else 0 */
#define PEND          ((p >= end) ? 1 : 0)
/* step back to the position saved by the last PINC / PFETCH */
#define PUNFETCH      p = pfetch_prev
/* skip one character, saving the old position */
#define PINC          do { \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
/* read one character into c and advance, saving the old position;
   the position is clamped to end */
#define PFETCH(c)     do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
  if(UNEXPECTED(p > end)) p = end; \
} while (0)

/* variants that do not save the previous position */
#define PINC_S        do { \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
  if(UNEXPECTED(p > end)) p = end; \
} while (0)
#define PFETCH_S(c)   do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
  if(UNEXPECTED(p > end)) p = end; \
} while (0)

/* look at the next character without advancing (PEND_VALUE at the end) */
#define PPEEK         (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
#define PPEEK_IS(c)   (PPEEK == (OnigCodePoint )c)
278
279 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)280 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
281 int capa)
282 {
283 UChar* r;
284
285 if (dest)
286 r = (UChar* )xrealloc(dest, capa + 1);
287 else
288 r = (UChar* )xmalloc(capa + 1);
289
290 CHECK_NULL_RETURN(r);
291 onig_strcpy(r + (dest_end - dest), src, src_end);
292 return r;
293 }
294
295 /* dest on static area */
296 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)297 strcat_capa_from_static(UChar* dest, UChar* dest_end,
298 const UChar* src, const UChar* src_end, int capa)
299 {
300 UChar* r;
301
302 r = (UChar* )xmalloc(capa + 1);
303 CHECK_NULL_RETURN(r);
304 onig_strcpy(r, dest, dest_end);
305 onig_strcpy(r + (dest_end - dest), src, src_end);
306 return r;
307 }
308
309
310 #ifdef USE_ST_LIBRARY
311
312 typedef struct {
313 UChar* s;
314 UChar* end;
315 } st_str_end_key;
316
317 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)318 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
319 {
320 UChar *p, *q;
321 int c;
322
323 if ((x->end - x->s) != (y->end - y->s))
324 return 1;
325
326 p = x->s;
327 q = y->s;
328 while (p < x->end) {
329 c = (int )*p - (int )*q;
330 if (c != 0) return c;
331
332 p++; q++;
333 }
334
335 return 0;
336 }
337
338 static int
str_end_hash(st_str_end_key * x)339 str_end_hash(st_str_end_key* x)
340 {
341 UChar *p;
342 int val = 0;
343
344 p = x->s;
345 while (p < x->end) {
346 val = val * 997 + (int )*p++;
347 }
348
349 return val + (val >> 5);
350 }
351
352 extern hash_table_type*
onig_st_init_strend_table_with_size(int size)353 onig_st_init_strend_table_with_size(int size)
354 {
355 static struct st_hash_type hashType = {
356 str_end_cmp,
357 str_end_hash,
358 };
359
360 return (hash_table_type* )
361 onig_st_init_table_with_size(&hashType, size);
362 }
363
364 extern int
onig_st_lookup_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type * value)365 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
366 const UChar* end_key, hash_data_type *value)
367 {
368 st_str_end_key key;
369
370 key.s = (UChar* )str_key;
371 key.end = (UChar* )end_key;
372
373 return onig_st_lookup(table, (st_data_t )(&key), value);
374 }
375
376 extern int
onig_st_insert_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type value)377 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
378 const UChar* end_key, hash_data_type value)
379 {
380 st_str_end_key* key;
381 int result;
382
383 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
384 key->s = (UChar* )str_key;
385 key->end = (UChar* )end_key;
386 result = onig_st_insert(table, (st_data_t )key, value);
387 if (result) {
388 xfree(key);
389 }
390 return result;
391 }
392
393 #endif /* USE_ST_LIBRARY */
394
395
396 #ifdef USE_NAMED_GROUP
397
398 #define INIT_NAME_BACKREFS_ALLOC_NUM 8
399
400 typedef struct {
401 UChar* name;
402 int name_len; /* byte length */
403 int back_num; /* number of backrefs */
404 int back_alloc;
405 int back_ref1;
406 int* back_refs;
407 } NameEntry;
408
409 #ifdef USE_ST_LIBRARY
410
411 typedef st_table NameTable;
412 typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */
413
414 #define NAMEBUF_SIZE 24
415 #define NAMEBUF_SIZE_1 25
416
417 #ifdef ONIG_DEBUG
418 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)419 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
420 {
421 int i;
422 FILE* fp = (FILE* )arg;
423
424 fprintf(fp, "%s: ", e->name);
425 if (e->back_num == 0)
426 fputs("-", fp);
427 else if (e->back_num == 1)
428 fprintf(fp, "%d", e->back_ref1);
429 else {
430 for (i = 0; i < e->back_num; i++) {
431 if (i > 0) fprintf(fp, ", ");
432 fprintf(fp, "%d", e->back_refs[i]);
433 }
434 }
435 fputs("\n", fp);
436 return ST_CONTINUE;
437 }
438
439 extern int
onig_print_names(FILE * fp,regex_t * reg)440 onig_print_names(FILE* fp, regex_t* reg)
441 {
442 NameTable* t = (NameTable* )reg->name_table;
443
444 if (IS_NOT_NULL(t)) {
445 fprintf(fp, "name table\n");
446 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
447 fputs("\n", fp);
448 }
449 return 0;
450 }
451 #endif /* ONIG_DEBUG */
452
453 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)454 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
455 {
456 xfree(e->name);
457 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
458 xfree(key);
459 xfree(e);
460 return ST_DELETE;
461 }
462
463 static int
names_clear(regex_t * reg)464 names_clear(regex_t* reg)
465 {
466 NameTable* t = (NameTable* )reg->name_table;
467
468 if (IS_NOT_NULL(t)) {
469 onig_st_foreach(t, i_free_name_entry, 0);
470 }
471 return 0;
472 }
473
474 extern int
onig_names_free(regex_t * reg)475 onig_names_free(regex_t* reg)
476 {
477 int r;
478 NameTable* t;
479
480 r = names_clear(reg);
481 if (r) return r;
482
483 t = (NameTable* )reg->name_table;
484 if (IS_NOT_NULL(t)) onig_st_free_table(t);
485 reg->name_table = (void* )NULL;
486 return 0;
487 }
488
489 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)490 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
491 {
492 NameEntry* e;
493 NameTable* t = (NameTable* )reg->name_table;
494
495 e = (NameEntry* )NULL;
496 if (IS_NOT_NULL(t)) {
497 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
498 }
499 return e;
500 }
501
502 typedef struct {
503 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
504 regex_t* reg;
505 void* arg;
506 int ret;
507 OnigEncoding enc;
508 } INamesArg;
509
510 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)511 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
512 {
513 int r = (*(arg->func))(e->name,
514 e->name + e->name_len,
515 e->back_num,
516 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
517 arg->reg, arg->arg);
518 if (r != 0) {
519 arg->ret = r;
520 return ST_STOP;
521 }
522 return ST_CONTINUE;
523 }
524
525 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)526 onig_foreach_name(regex_t* reg,
527 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
528 {
529 INamesArg narg;
530 NameTable* t = (NameTable* )reg->name_table;
531
532 narg.ret = 0;
533 if (IS_NOT_NULL(t)) {
534 narg.func = func;
535 narg.reg = reg;
536 narg.arg = arg;
537 narg.enc = reg->enc; /* should be pattern encoding. */
538 onig_st_foreach(t, i_names, (HashDataType )&narg);
539 }
540 return narg.ret;
541 }
542
543 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumRemap * map)544 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
545 {
546 int i;
547
548 if (e->back_num > 1) {
549 for (i = 0; i < e->back_num; i++) {
550 e->back_refs[i] = map[e->back_refs[i]].new_val;
551 }
552 }
553 else if (e->back_num == 1) {
554 e->back_ref1 = map[e->back_ref1].new_val;
555 }
556
557 return ST_CONTINUE;
558 }
559
560 extern int
onig_renumber_name_table(regex_t * reg,GroupNumRemap * map)561 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
562 {
563 NameTable* t = (NameTable* )reg->name_table;
564
565 if (IS_NOT_NULL(t)) {
566 onig_st_foreach(t, i_renumber_name, (HashDataType )map);
567 }
568 return 0;
569 }
570
571
572 extern int
onig_number_of_names(regex_t * reg)573 onig_number_of_names(regex_t* reg)
574 {
575 NameTable* t = (NameTable* )reg->name_table;
576
577 if (IS_NOT_NULL(t))
578 return t->num_entries;
579 else
580 return 0;
581 }
582
583 #else /* USE_ST_LIBRARY */
584
585 #define INIT_NAMES_ALLOC_NUM 8
586
587 typedef struct {
588 NameEntry* e;
589 int num;
590 int alloc;
591 } NameTable;
592
#ifdef ONIG_DEBUG
/*
 * Print every entry of the name table as "name: n1, n2, ..." ("-" when it
 * has no group numbers) to `fp`.  Prints nothing for a missing or empty
 * table.  Always returns 0.
 */
extern int
onig_print_names(FILE *fp, regex_t *reg)
{
  int idx, k;
  NameEntry *e;
  NameTable *table = (NameTable* )reg->name_table;

  if (IS_NULL(table) || t_is_empty_guard_unused_) return 0;
}
#endif
625
626 static int
names_clear(regex_t * reg)627 names_clear(regex_t* reg)
628 {
629 int i;
630 NameEntry* e;
631 NameTable* t = (NameTable* )reg->name_table;
632
633 if (IS_NOT_NULL(t)) {
634 for (i = 0; i < t->num; i++) {
635 e = &(t->e[i]);
636 if (IS_NOT_NULL(e->name)) {
637 xfree(e->name);
638 e->name = NULL;
639 e->name_len = 0;
640 e->back_num = 0;
641 e->back_alloc = 0;
642 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
643 e->back_refs = (int* )NULL;
644 }
645 }
646 if (IS_NOT_NULL(t->e)) {
647 xfree(t->e);
648 t->e = NULL;
649 }
650 t->num = 0;
651 }
652 return 0;
653 }
654
655 extern int
onig_names_free(regex_t * reg)656 onig_names_free(regex_t* reg)
657 {
658 int r;
659 NameTable* t;
660
661 r = names_clear(reg);
662 if (r) return r;
663
664 t = (NameTable* )reg->name_table;
665 if (IS_NOT_NULL(t)) xfree(t);
666 reg->name_table = NULL;
667 return 0;
668 }
669
670 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)671 name_find(regex_t* reg, UChar* name, UChar* name_end)
672 {
673 int i, len;
674 NameEntry* e;
675 NameTable* t = (NameTable* )reg->name_table;
676
677 if (IS_NOT_NULL(t)) {
678 len = name_end - name;
679 for (i = 0; i < t->num; i++) {
680 e = &(t->e[i]);
681 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
682 return e;
683 }
684 }
685 return (NameEntry* )NULL;
686 }
687
688 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)689 onig_foreach_name(regex_t* reg,
690 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
691 {
692 int i, r;
693 NameEntry* e;
694 NameTable* t = (NameTable* )reg->name_table;
695
696 if (IS_NOT_NULL(t)) {
697 for (i = 0; i < t->num; i++) {
698 e = &(t->e[i]);
699 r = (*func)(e->name, e->name + e->name_len, e->back_num,
700 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
701 reg, arg);
702 if (r != 0) return r;
703 }
704 }
705 return 0;
706 }
707
708 extern int
onig_number_of_names(regex_t * reg)709 onig_number_of_names(regex_t* reg)
710 {
711 NameTable* t = (NameTable* )reg->name_table;
712
713 if (IS_NOT_NULL(t))
714 return t->num;
715 else
716 return 0;
717 }
718
719 #endif /* else USE_ST_LIBRARY */
720
721 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)722 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
723 {
724 int alloc;
725 NameEntry* e;
726 NameTable* t = (NameTable* )reg->name_table;
727
728 if (name_end - name <= 0)
729 return ONIGERR_EMPTY_GROUP_NAME;
730
731 e = name_find(reg, name, name_end);
732 if (IS_NULL(e)) {
733 #ifdef USE_ST_LIBRARY
734 if (IS_NULL(t)) {
735 t = onig_st_init_strend_table_with_size(5);
736 reg->name_table = (void* )t;
737 }
738 e = (NameEntry* )xmalloc(sizeof(NameEntry));
739 CHECK_NULL_RETURN_MEMERR(e);
740
741 e->name = strdup_with_null(reg->enc, name, name_end);
742 if (IS_NULL(e->name)) {
743 xfree(e); return ONIGERR_MEMORY;
744 }
745 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
746 (HashDataType )e);
747
748 e->name_len = name_end - name;
749 e->back_num = 0;
750 e->back_alloc = 0;
751 e->back_refs = (int* )NULL;
752
753 #else
754
755 if (IS_NULL(t)) {
756 alloc = INIT_NAMES_ALLOC_NUM;
757 t = (NameTable* )xmalloc(sizeof(NameTable));
758 CHECK_NULL_RETURN_MEMERR(t);
759 t->e = NULL;
760 t->alloc = 0;
761 t->num = 0;
762
763 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
764 if (IS_NULL(t->e)) {
765 xfree(t);
766 return ONIGERR_MEMORY;
767 }
768 t->alloc = alloc;
769 reg->name_table = t;
770 goto clear;
771 }
772 else if (t->num == t->alloc) {
773 int i;
774
775 alloc = t->alloc * 2;
776 t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
777 CHECK_NULL_RETURN_MEMERR(t->e);
778 t->alloc = alloc;
779
780 clear:
781 for (i = t->num; i < t->alloc; i++) {
782 t->e[i].name = NULL;
783 t->e[i].name_len = 0;
784 t->e[i].back_num = 0;
785 t->e[i].back_alloc = 0;
786 t->e[i].back_refs = (int* )NULL;
787 }
788 }
789 e = &(t->e[t->num]);
790 t->num++;
791 e->name = strdup_with_null(reg->enc, name, name_end);
792 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
793 e->name_len = name_end - name;
794 #endif
795 }
796
797 if (e->back_num >= 1 &&
798 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
799 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
800 name, name_end);
801 return ONIGERR_MULTIPLEX_DEFINED_NAME;
802 }
803
804 e->back_num++;
805 if (e->back_num == 1) {
806 e->back_ref1 = backref;
807 }
808 else {
809 if (e->back_num == 2) {
810 alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
811 e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
812 CHECK_NULL_RETURN_MEMERR(e->back_refs);
813 e->back_alloc = alloc;
814 e->back_refs[0] = e->back_ref1;
815 e->back_refs[1] = backref;
816 }
817 else {
818 if (e->back_num > e->back_alloc) {
819 alloc = e->back_alloc * 2;
820 e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
821 CHECK_NULL_RETURN_MEMERR(e->back_refs);
822 e->back_alloc = alloc;
823 }
824 e->back_refs[e->back_num - 1] = backref;
825 }
826 }
827
828 return 0;
829 }
830
831 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)832 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
833 const UChar* name_end, int** nums)
834 {
835 NameEntry* e = name_find(reg, name, name_end);
836
837 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
838
839 switch (e->back_num) {
840 case 0:
841 break;
842 case 1:
843 *nums = &(e->back_ref1);
844 break;
845 default:
846 *nums = e->back_refs;
847 break;
848 }
849 return e->back_num;
850 }
851
852 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)853 onig_name_to_backref_number(regex_t* reg, const UChar* name,
854 const UChar* name_end, OnigRegion *region)
855 {
856 int i, n, *nums;
857
858 n = onig_name_to_group_numbers(reg, name, name_end, &nums);
859 if (n < 0)
860 return n;
861 else if (n == 0)
862 return ONIGERR_PARSER_BUG;
863 else if (n == 1)
864 return nums[0];
865 else {
866 if (IS_NOT_NULL(region)) {
867 for (i = n - 1; i >= 0; i--) {
868 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
869 return nums[i];
870 }
871 }
872 return nums[n - 1];
873 }
874 }
875
876 #else /* USE_NAMED_GROUP */
877
878 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)879 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
880 const UChar* name_end, int** nums)
881 {
882 return ONIG_NO_SUPPORT_CONFIG;
883 }
884
885 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)886 onig_name_to_backref_number(regex_t* reg, const UChar* name,
887 const UChar* name_end, OnigRegion* region)
888 {
889 return ONIG_NO_SUPPORT_CONFIG;
890 }
891
892 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)893 onig_foreach_name(regex_t* reg,
894 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
895 {
896 return ONIG_NO_SUPPORT_CONFIG;
897 }
898
899 extern int
onig_number_of_names(regex_t * reg)900 onig_number_of_names(regex_t* reg)
901 {
902 return 0;
903 }
904 #endif /* else USE_NAMED_GROUP */
905
906 extern int
onig_noname_group_capture_is_active(regex_t * reg)907 onig_noname_group_capture_is_active(regex_t* reg)
908 {
909 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
910 return 0;
911
912 #ifdef USE_NAMED_GROUP
913 if (onig_number_of_names(reg) > 0 &&
914 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
915 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
916 return 0;
917 }
918 #endif
919
920 return 1;
921 }
922
923
924 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16
925
926 static void
scan_env_clear(ScanEnv * env)927 scan_env_clear(ScanEnv* env)
928 {
929 int i;
930
931 BIT_STATUS_CLEAR(env->capture_history);
932 BIT_STATUS_CLEAR(env->bt_mem_start);
933 BIT_STATUS_CLEAR(env->bt_mem_end);
934 BIT_STATUS_CLEAR(env->backrefed_mem);
935 env->error = (UChar* )NULL;
936 env->error_end = (UChar* )NULL;
937 env->num_call = 0;
938 env->num_mem = 0;
939 #ifdef USE_NAMED_GROUP
940 env->num_named = 0;
941 #endif
942 env->mem_alloc = 0;
943 env->mem_nodes_dynamic = (Node** )NULL;
944
945 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
946 env->mem_nodes_static[i] = NULL_NODE;
947
948 #ifdef USE_COMBINATION_EXPLOSION_CHECK
949 env->num_comb_exp_check = 0;
950 env->comb_exp_max_regnum = 0;
951 env->curr_max_regnum = 0;
952 env->has_recursion = 0;
953 #endif
954 }
955
956 static int
scan_env_add_mem_entry(ScanEnv * env)957 scan_env_add_mem_entry(ScanEnv* env)
958 {
959 int i, need, alloc;
960 Node** p;
961
962 need = env->num_mem + 1;
963 if (need >= SCANENV_MEMNODES_SIZE) {
964 if (env->mem_alloc <= need) {
965 if (IS_NULL(env->mem_nodes_dynamic)) {
966 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
967 p = (Node** )xmalloc(sizeof(Node*) * alloc);
968 xmemcpy(p, env->mem_nodes_static,
969 sizeof(Node*) * SCANENV_MEMNODES_SIZE);
970 }
971 else {
972 alloc = env->mem_alloc * 2;
973 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
974 }
975 CHECK_NULL_RETURN_MEMERR(p);
976
977 for (i = env->num_mem + 1; i < alloc; i++)
978 p[i] = NULL_NODE;
979
980 env->mem_nodes_dynamic = p;
981 env->mem_alloc = alloc;
982 }
983 }
984
985 env->num_mem++;
986 return env->num_mem;
987 }
988
989 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)990 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
991 {
992 if (env->num_mem >= num)
993 SCANENV_MEM_NODES(env)[num] = node;
994 else
995 return ONIGERR_PARSER_BUG;
996 return 0;
997 }
998
999
#ifdef USE_PARSE_TREE_NODE_RECYCLE
/* Link overlaid on a released node while it waits on the free list. */
typedef struct _FreeNode {
  struct _FreeNode *next;
} FreeNode;

/* Head of the list of released nodes available for reuse by node_new(). */
static FreeNode *FreeNodeList = (FreeNode* )NULL;
#endif
1007
1008 extern void
onig_node_free(Node * node)1009 onig_node_free(Node* node)
1010 {
1011 start:
1012 if (IS_NULL(node)) return ;
1013
1014 switch (NTYPE(node)) {
1015 case NT_STR:
1016 if (NSTR(node)->capa != 0 &&
1017 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1018 xfree(NSTR(node)->s);
1019 }
1020 break;
1021
1022 case NT_LIST:
1023 case NT_ALT:
1024 onig_node_free(NCAR(node));
1025 {
1026 Node* next_node = NCDR(node);
1027
1028 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1029 {
1030 FreeNode* n = (FreeNode* )node;
1031
1032 THREAD_ATOMIC_START;
1033 n->next = FreeNodeList;
1034 FreeNodeList = n;
1035 THREAD_ATOMIC_END;
1036 }
1037 #else
1038 xfree(node);
1039 #endif
1040 node = next_node;
1041 goto start;
1042 }
1043 break;
1044
1045 case NT_CCLASS:
1046 {
1047 CClassNode* cc = NCCLASS(node);
1048
1049 if (IS_NCCLASS_SHARE(cc)) return ;
1050 if (cc->mbuf)
1051 bbuf_free(cc->mbuf);
1052 }
1053 break;
1054
1055 case NT_QTFR:
1056 if (NQTFR(node)->target)
1057 onig_node_free(NQTFR(node)->target);
1058 break;
1059
1060 case NT_ENCLOSE:
1061 if (NENCLOSE(node)->target)
1062 onig_node_free(NENCLOSE(node)->target);
1063 break;
1064
1065 case NT_BREF:
1066 if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1067 xfree(NBREF(node)->back_dynamic);
1068 break;
1069
1070 case NT_ANCHOR:
1071 if (NANCHOR(node)->target)
1072 onig_node_free(NANCHOR(node)->target);
1073 break;
1074 }
1075
1076 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1077 {
1078 FreeNode* n = (FreeNode* )node;
1079
1080 THREAD_ATOMIC_START;
1081 n->next = FreeNodeList;
1082 FreeNodeList = n;
1083 THREAD_ATOMIC_END;
1084 }
1085 #else
1086 xfree(node);
1087 #endif
1088 }
1089
#ifdef USE_PARSE_TREE_NODE_RECYCLE
/* Return every node on FreeNodeList to the allocator.  Always returns 0. */
extern int
onig_free_node_list(void)
{
  FreeNode *head;

  /* THREAD_ATOMIC_START; */
  for (head = FreeNodeList; IS_NOT_NULL(head); head = FreeNodeList) {
    FreeNodeList = head->next;
    xfree(head);
  }
  /* THREAD_ATOMIC_END; */
  return 0;
}
#endif
1106
1107 static Node*
node_new(void)1108 node_new(void)
1109 {
1110 Node* node;
1111
1112 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1113 THREAD_ATOMIC_START;
1114 if (IS_NOT_NULL(FreeNodeList)) {
1115 node = (Node* )FreeNodeList;
1116 FreeNodeList = FreeNodeList->next;
1117 THREAD_ATOMIC_END;
1118 return node;
1119 }
1120 THREAD_ATOMIC_END;
1121 #endif
1122
1123 node = (Node* )xmalloc(sizeof(Node));
1124 /* xmemset(node, 0, sizeof(Node)); */
1125 return node;
1126 }
1127
1128
1129 static void
initialize_cclass(CClassNode * cc)1130 initialize_cclass(CClassNode* cc)
1131 {
1132 BITSET_CLEAR(cc->bs);
1133 /* cc->base.flags = 0; */
1134 cc->flags = 0;
1135 cc->mbuf = NULL;
1136 }
1137
1138 static Node*
node_new_cclass(void)1139 node_new_cclass(void)
1140 {
1141 Node* node = node_new();
1142 CHECK_NULL_RETURN(node);
1143
1144 SET_NTYPE(node, NT_CCLASS);
1145 initialize_cclass(NCCLASS(node));
1146 return node;
1147 }
1148
1149 static Node*
node_new_cclass_by_codepoint_range(int not,OnigCodePoint sb_out,const OnigCodePoint ranges[])1150 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
1151 const OnigCodePoint ranges[])
1152 {
1153 int n, i;
1154 CClassNode* cc;
1155 OnigCodePoint j;
1156
1157 Node* node = node_new_cclass();
1158 CHECK_NULL_RETURN(node);
1159
1160 cc = NCCLASS(node);
1161 if (not != 0) NCCLASS_SET_NOT(cc);
1162
1163 BITSET_CLEAR(cc->bs);
1164 if (sb_out > 0 && IS_NOT_NULL(ranges)) {
1165 n = ONIGENC_CODE_RANGE_NUM(ranges);
1166 for (i = 0; i < n; i++) {
1167 for (j = ONIGENC_CODE_RANGE_FROM(ranges, i);
1168 j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
1169 if (j >= sb_out) goto sb_end;
1170
1171 BITSET_SET_BIT(cc->bs, j);
1172 }
1173 }
1174 }
1175
1176 sb_end:
1177 if (IS_NULL(ranges)) {
1178 is_null:
1179 cc->mbuf = NULL;
1180 }
1181 else {
1182 BBuf* bbuf;
1183
1184 n = ONIGENC_CODE_RANGE_NUM(ranges);
1185 if (n == 0) goto is_null;
1186
1187 bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1188 CHECK_NULL_RETURN(bbuf);
1189 bbuf->alloc = n + 1;
1190 bbuf->used = n + 1;
1191 bbuf->p = (UChar* )((void* )ranges);
1192
1193 cc->mbuf = bbuf;
1194 }
1195
1196 return node;
1197 }
1198
1199 static Node*
node_new_ctype(int type,int not)1200 node_new_ctype(int type, int not)
1201 {
1202 Node* node = node_new();
1203 CHECK_NULL_RETURN(node);
1204
1205 SET_NTYPE(node, NT_CTYPE);
1206 NCTYPE(node)->ctype = type;
1207 NCTYPE(node)->not = not;
1208 return node;
1209 }
1210
1211 static Node*
node_new_anychar(void)1212 node_new_anychar(void)
1213 {
1214 Node* node = node_new();
1215 CHECK_NULL_RETURN(node);
1216
1217 SET_NTYPE(node, NT_CANY);
1218 return node;
1219 }
1220
1221 static Node*
node_new_list(Node * left,Node * right)1222 node_new_list(Node* left, Node* right)
1223 {
1224 Node* node = node_new();
1225 CHECK_NULL_RETURN(node);
1226
1227 SET_NTYPE(node, NT_LIST);
1228 NCAR(node) = left;
1229 NCDR(node) = right;
1230 return node;
1231 }
1232
1233 extern Node*
onig_node_new_list(Node * left,Node * right)1234 onig_node_new_list(Node* left, Node* right)
1235 {
1236 return node_new_list(left, right);
1237 }
1238
1239 extern Node*
onig_node_list_add(Node * list,Node * x)1240 onig_node_list_add(Node* list, Node* x)
1241 {
1242 Node *n;
1243
1244 n = onig_node_new_list(x, NULL);
1245 if (IS_NULL(n)) return NULL_NODE;
1246
1247 if (IS_NOT_NULL(list)) {
1248 while (IS_NOT_NULL(NCDR(list)))
1249 list = NCDR(list);
1250
1251 NCDR(list) = n;
1252 }
1253
1254 return n;
1255 }
1256
1257 extern Node*
onig_node_new_alt(Node * left,Node * right)1258 onig_node_new_alt(Node* left, Node* right)
1259 {
1260 Node* node = node_new();
1261 CHECK_NULL_RETURN(node);
1262
1263 SET_NTYPE(node, NT_ALT);
1264 NCAR(node) = left;
1265 NCDR(node) = right;
1266 return node;
1267 }
1268
1269 extern Node*
onig_node_new_anchor(int type)1270 onig_node_new_anchor(int type)
1271 {
1272 Node* node = node_new();
1273 CHECK_NULL_RETURN(node);
1274
1275 SET_NTYPE(node, NT_ANCHOR);
1276 NANCHOR(node)->type = type;
1277 NANCHOR(node)->target = NULL;
1278 NANCHOR(node)->char_len = -1;
1279 return node;
1280 }
1281
1282 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)1283 node_new_backref(int back_num, int* backrefs, int by_name,
1284 #ifdef USE_BACKREF_WITH_LEVEL
1285 int exist_level, int nest_level,
1286 #endif
1287 ScanEnv* env)
1288 {
1289 int i;
1290 Node* node = node_new();
1291
1292 CHECK_NULL_RETURN(node);
1293
1294 SET_NTYPE(node, NT_BREF);
1295 NBREF(node)->state = 0;
1296 NBREF(node)->back_num = back_num;
1297 NBREF(node)->back_dynamic = (int* )NULL;
1298 if (by_name != 0)
1299 NBREF(node)->state |= NST_NAME_REF;
1300
1301 #ifdef USE_BACKREF_WITH_LEVEL
1302 if (exist_level != 0) {
1303 NBREF(node)->state |= NST_NEST_LEVEL;
1304 NBREF(node)->nest_level = nest_level;
1305 }
1306 #endif
1307
1308 for (i = 0; i < back_num; i++) {
1309 if (backrefs[i] <= env->num_mem &&
1310 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1311 NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */
1312 break;
1313 }
1314 }
1315
1316 if (back_num <= NODE_BACKREFS_SIZE) {
1317 for (i = 0; i < back_num; i++)
1318 NBREF(node)->back_static[i] = backrefs[i];
1319 }
1320 else {
1321 int* p = (int* )xmalloc(sizeof(int) * back_num);
1322 if (IS_NULL(p)) {
1323 onig_node_free(node);
1324 return NULL;
1325 }
1326 NBREF(node)->back_dynamic = p;
1327 for (i = 0; i < back_num; i++)
1328 p[i] = backrefs[i];
1329 }
1330 return node;
1331 }
1332
#ifdef USE_SUBEXP_CALL
/*
 * Allocate an NT_CALL node.  The name span [name, name_end) is stored by
 * pointer (not copied); the target is left unresolved (NULL_NODE).
 * A non-zero `gnum` means the call is by group number.
 * Returns NULL when the node cannot be allocated.
 */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
  Node* call = node_new();

  if (IS_NULL(call)) return NULL_NODE;

  SET_NTYPE(call, NT_CALL);
  NCALL(call)->group_num = gnum;
  NCALL(call)->name      = name;
  NCALL(call)->name_end  = name_end;
  NCALL(call)->target    = NULL_NODE;
  NCALL(call)->state     = 0;
  return call;
}
#endif
1349
1350 static Node*
node_new_quantifier(int lower,int upper,int by_number)1351 node_new_quantifier(int lower, int upper, int by_number)
1352 {
1353 Node* node = node_new();
1354 CHECK_NULL_RETURN(node);
1355
1356 SET_NTYPE(node, NT_QTFR);
1357 NQTFR(node)->state = 0;
1358 NQTFR(node)->target = NULL;
1359 NQTFR(node)->lower = lower;
1360 NQTFR(node)->upper = upper;
1361 NQTFR(node)->greedy = 1;
1362 NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1363 NQTFR(node)->head_exact = NULL_NODE;
1364 NQTFR(node)->next_head_exact = NULL_NODE;
1365 NQTFR(node)->is_refered = 0;
1366 if (by_number != 0)
1367 NQTFR(node)->state |= NST_BY_NUMBER;
1368
1369 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1370 NQTFR(node)->comb_exp_check_num = 0;
1371 #endif
1372
1373 return node;
1374 }
1375
1376 static Node*
node_new_enclose(int type)1377 node_new_enclose(int type)
1378 {
1379 Node* node = node_new();
1380 CHECK_NULL_RETURN(node);
1381
1382 SET_NTYPE(node, NT_ENCLOSE);
1383 NENCLOSE(node)->type = type;
1384 NENCLOSE(node)->state = 0;
1385 NENCLOSE(node)->regnum = 0;
1386 NENCLOSE(node)->option = 0;
1387 NENCLOSE(node)->target = NULL;
1388 NENCLOSE(node)->call_addr = -1;
1389 NENCLOSE(node)->opt_count = 0;
1390 return node;
1391 }
1392
1393 extern Node*
onig_node_new_enclose(int type)1394 onig_node_new_enclose(int type)
1395 {
1396 return node_new_enclose(type);
1397 }
1398
1399 static Node*
node_new_enclose_memory(OnigOptionType option,int is_named)1400 node_new_enclose_memory(OnigOptionType option, int is_named)
1401 {
1402 Node* node = node_new_enclose(ENCLOSE_MEMORY);
1403 CHECK_NULL_RETURN(node);
1404 if (is_named != 0)
1405 SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1406
1407 #ifdef USE_SUBEXP_CALL
1408 NENCLOSE(node)->option = option;
1409 #endif
1410 return node;
1411 }
1412
1413 static Node*
node_new_option(OnigOptionType option)1414 node_new_option(OnigOptionType option)
1415 {
1416 Node* node = node_new_enclose(ENCLOSE_OPTION);
1417 CHECK_NULL_RETURN(node);
1418 NENCLOSE(node)->option = option;
1419 return node;
1420 }
1421
1422 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)1423 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1424 {
1425 int addlen = end - s;
1426
1427 if (addlen > 0) {
1428 int len = NSTR(node)->end - NSTR(node)->s;
1429
1430 if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1431 UChar* p;
1432 int capa = len + addlen + NODE_STR_MARGIN;
1433
1434 if (capa <= NSTR(node)->capa) {
1435 onig_strcpy(NSTR(node)->s + len, s, end);
1436 }
1437 else {
1438 if (NSTR(node)->s == NSTR(node)->buf)
1439 p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1440 s, end, capa);
1441 else
1442 p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
1443
1444 CHECK_NULL_RETURN_MEMERR(p);
1445 NSTR(node)->s = p;
1446 NSTR(node)->capa = capa;
1447 }
1448 }
1449 else {
1450 onig_strcpy(NSTR(node)->s + len, s, end);
1451 }
1452 NSTR(node)->end = NSTR(node)->s + len + addlen;
1453 }
1454
1455 return 0;
1456 }
1457
1458 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end)1459 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1460 {
1461 onig_node_str_clear(node);
1462 return onig_node_str_cat(node, s, end);
1463 }
1464
1465 static int
node_str_cat_char(Node * node,UChar c)1466 node_str_cat_char(Node* node, UChar c)
1467 {
1468 UChar s[1];
1469
1470 s[0] = c;
1471 return onig_node_str_cat(node, s, s + 1);
1472 }
1473
1474 extern void
onig_node_conv_to_str_node(Node * node,int flag)1475 onig_node_conv_to_str_node(Node* node, int flag)
1476 {
1477 SET_NTYPE(node, NT_STR);
1478 NSTR(node)->flag = flag;
1479 NSTR(node)->capa = 0;
1480 NSTR(node)->s = NSTR(node)->buf;
1481 NSTR(node)->end = NSTR(node)->buf;
1482 }
1483
1484 extern void
onig_node_str_clear(Node * node)1485 onig_node_str_clear(Node* node)
1486 {
1487 if (NSTR(node)->capa != 0 &&
1488 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1489 xfree(NSTR(node)->s);
1490 }
1491
1492 NSTR(node)->capa = 0;
1493 NSTR(node)->flag = 0;
1494 NSTR(node)->s = NSTR(node)->buf;
1495 NSTR(node)->end = NSTR(node)->buf;
1496 }
1497
1498 static Node*
node_new_str(const UChar * s,const UChar * end)1499 node_new_str(const UChar* s, const UChar* end)
1500 {
1501 Node* node = node_new();
1502 CHECK_NULL_RETURN(node);
1503
1504 SET_NTYPE(node, NT_STR);
1505 NSTR(node)->capa = 0;
1506 NSTR(node)->flag = 0;
1507 NSTR(node)->s = NSTR(node)->buf;
1508 NSTR(node)->end = NSTR(node)->buf;
1509 if (onig_node_str_cat(node, s, end)) {
1510 onig_node_free(node);
1511 return NULL;
1512 }
1513 return node;
1514 }
1515
1516 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)1517 onig_node_new_str(const UChar* s, const UChar* end)
1518 {
1519 return node_new_str(s, end);
1520 }
1521
1522 static Node*
node_new_str_raw(UChar * s,UChar * end)1523 node_new_str_raw(UChar* s, UChar* end)
1524 {
1525 Node* node = node_new_str(s, end);
1526 NSTRING_SET_RAW(node);
1527 return node;
1528 }
1529
1530 static Node*
node_new_empty(void)1531 node_new_empty(void)
1532 {
1533 return node_new_str(NULL, NULL);
1534 }
1535
1536 static Node*
node_new_str_raw_char(UChar c)1537 node_new_str_raw_char(UChar c)
1538 {
1539 UChar p[1];
1540
1541 p[0] = c;
1542 return node_new_str_raw(p, p + 1);
1543 }
1544
1545 static Node*
str_node_split_last_char(StrNode * sn,OnigEncoding enc)1546 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1547 {
1548 const UChar *p;
1549 Node* n = NULL_NODE;
1550
1551 if (sn->end > sn->s) {
1552 p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1553 if (p && p > sn->s) { /* can be splitted. */
1554 n = node_new_str(p, sn->end);
1555 if ((sn->flag & NSTR_RAW) != 0)
1556 NSTRING_SET_RAW(n);
1557 sn->end = (UChar* )p;
1558 }
1559 }
1560 return n;
1561 }
1562
1563 static int
str_node_can_be_split(StrNode * sn,OnigEncoding enc)1564 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1565 {
1566 if (sn->end > sn->s) {
1567 return ((enclen(enc, sn->s) < sn->end - sn->s) ? 1 : 0);
1568 }
1569 return 0;
1570 }
1571
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/*
 * Insert `num` copies of `val` in front of the string held by `sn`.
 * The current content is saved in a NODE_STR_BUF_SIZE byte scratch
 * buffer, copied back `num` bytes further on, and the gap is filled.
 *
 * NOTE(review): no bounds are checked here; this assumes the current
 * content fits the scratch buffer and that the storage behind sn->s has
 * room for `num` more bytes -- confirm against the caller.
 *
 * Always returns 0.  (The function is declared int; previously control
 * fell off the end without returning a value.)
 */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i, len;

  len = sn->end - sn->s;
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }
  return 0;
}
#endif
1589
1590 extern int
onig_scan_unsigned_number(UChar ** src,const UChar * end,OnigEncoding enc)1591 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1592 {
1593 unsigned int num, val;
1594 OnigCodePoint c;
1595 UChar* p = *src;
1596 PFETCH_READY;
1597
1598 num = 0;
1599 while (!PEND) {
1600 PFETCH(c);
1601 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1602 val = (unsigned int )DIGITVAL(c);
1603 if ((INT_MAX_LIMIT - val) / 10UL < num)
1604 return -1; /* overflow */
1605
1606 num = num * 10 + val;
1607 }
1608 else {
1609 PUNFETCH;
1610 break;
1611 }
1612 }
1613 *src = p;
1614 return num;
1615 }
1616
1617 static int
scan_unsigned_hexadecimal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1618 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1619 OnigEncoding enc)
1620 {
1621 OnigCodePoint c;
1622 unsigned int num, val;
1623 UChar* p = *src;
1624 PFETCH_READY;
1625
1626 num = 0;
1627 while (!PEND && maxlen-- != 0) {
1628 PFETCH(c);
1629 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1630 val = (unsigned int )XDIGITVAL(enc,c);
1631 if ((INT_MAX_LIMIT - val) / 16UL < num)
1632 return -1; /* overflow */
1633
1634 num = (num << 4) + XDIGITVAL(enc,c);
1635 }
1636 else {
1637 PUNFETCH;
1638 break;
1639 }
1640 }
1641 *src = p;
1642 return num;
1643 }
1644
1645 static int
scan_unsigned_octal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1646 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1647 OnigEncoding enc)
1648 {
1649 OnigCodePoint c;
1650 unsigned int num, val;
1651 UChar* p = *src;
1652 PFETCH_READY;
1653
1654 num = 0;
1655 while (!PEND && maxlen-- != 0) {
1656 PFETCH(c);
1657 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1658 val = ODIGITVAL(c);
1659 if ((INT_MAX_LIMIT - val) / 8UL < num)
1660 return -1; /* overflow */
1661
1662 num = (num << 3) + val;
1663 }
1664 else {
1665 PUNFETCH;
1666 break;
1667 }
1668 }
1669 *src = p;
1670 return num;
1671 }
1672
1673
1674 #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
1675 BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1676
1677 /* data format:
1678 [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1679 (all data size is OnigCodePoint)
1680 */
1681 static int
new_code_range(BBuf ** pbuf)1682 new_code_range(BBuf** pbuf)
1683 {
1684 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
1685 int r;
1686 OnigCodePoint n;
1687 BBuf* bbuf;
1688
1689 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1690 CHECK_NULL_RETURN_MEMERR(*pbuf);
1691 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1692 if (r) return r;
1693
1694 n = 0;
1695 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1696 return 0;
1697 }
1698
1699 static int
add_code_range_to_buf(BBuf ** pbuf,OnigCodePoint from,OnigCodePoint to)1700 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
1701 {
1702 int r, inc_n, pos;
1703 int low, high, bound, x;
1704 OnigCodePoint n, *data;
1705 BBuf* bbuf;
1706
1707 if (from > to) {
1708 n = from; from = to; to = n;
1709 }
1710
1711 if (IS_NULL(*pbuf)) {
1712 r = new_code_range(pbuf);
1713 if (r) return r;
1714 bbuf = *pbuf;
1715 n = 0;
1716 }
1717 else {
1718 bbuf = *pbuf;
1719 GET_CODE_POINT(n, bbuf->p);
1720 }
1721 data = (OnigCodePoint* )(bbuf->p);
1722 data++;
1723
1724 for (low = 0, bound = n; low < bound; ) {
1725 x = (low + bound) >> 1;
1726 if (from > data[x*2 + 1])
1727 low = x + 1;
1728 else
1729 bound = x;
1730 }
1731
1732 for (high = low, bound = n; high < bound; ) {
1733 x = (high + bound) >> 1;
1734 if (to >= data[x*2] - 1)
1735 high = x + 1;
1736 else
1737 bound = x;
1738 }
1739
1740 inc_n = low + 1 - high;
1741 if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1742 return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1743
1744 if (inc_n != 1) {
1745 if (from > data[low*2])
1746 from = data[low*2];
1747 if (to < data[(high - 1)*2 + 1])
1748 to = data[(high - 1)*2 + 1];
1749 }
1750
1751 if (inc_n != 0 && (OnigCodePoint )high < n) {
1752 int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1753 int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1754 int size = (n - high) * 2 * SIZE_CODE_POINT;
1755
1756 if (inc_n > 0) {
1757 BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1758 }
1759 else {
1760 BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1761 }
1762 }
1763
1764 pos = SIZE_CODE_POINT * (1 + low * 2);
1765 BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1766 BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1767 BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1768 n += inc_n;
1769 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1770
1771 return 0;
1772 }
1773
1774 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)1775 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1776 {
1777 if (from > to) {
1778 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1779 return 0;
1780 else
1781 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1782 }
1783
1784 return add_code_range_to_buf(pbuf, from, to);
1785 }
1786
1787 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)1788 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1789 {
1790 int r, i, n;
1791 OnigCodePoint pre, from, *data, to = 0;
1792
1793 *pbuf = (BBuf* )NULL;
1794 if (IS_NULL(bbuf)) {
1795 set_all:
1796 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1797 }
1798
1799 data = (OnigCodePoint* )(bbuf->p);
1800 GET_CODE_POINT(n, data);
1801 data++;
1802 if (n <= 0) goto set_all;
1803
1804 r = 0;
1805 pre = MBCODE_START_POS(enc);
1806 for (i = 0; i < n; i++) {
1807 from = data[i*2];
1808 to = data[i*2+1];
1809 if (pre <= from - 1) {
1810 r = add_code_range_to_buf(pbuf, pre, from - 1);
1811 if (r != 0) return r;
1812 }
1813 if (to == ~((OnigCodePoint )0)) break;
1814 pre = to + 1;
1815 }
1816 if (to < ~((OnigCodePoint )0)) {
1817 r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1818 }
1819 return r;
1820 }
1821
1822 #define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
1823 BBuf *tbuf; \
1824 int tnot; \
1825 tnot = not1; not1 = not2; not2 = tnot; \
1826 tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
1827 } while (0)
1828
1829 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1830 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1831 BBuf* bbuf2, int not2, BBuf** pbuf)
1832 {
1833 int r;
1834 OnigCodePoint i, n1, *data1;
1835 OnigCodePoint from, to;
1836
1837 *pbuf = (BBuf* )NULL;
1838 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1839 if (not1 != 0 || not2 != 0)
1840 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1841 return 0;
1842 }
1843
1844 r = 0;
1845 if (IS_NULL(bbuf2))
1846 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1847
1848 if (IS_NULL(bbuf1)) {
1849 if (not1 != 0) {
1850 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1851 }
1852 else {
1853 if (not2 == 0) {
1854 return bbuf_clone(pbuf, bbuf2);
1855 }
1856 else {
1857 return not_code_range_buf(enc, bbuf2, pbuf);
1858 }
1859 }
1860 }
1861
1862 if (not1 != 0)
1863 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1864
1865 data1 = (OnigCodePoint* )(bbuf1->p);
1866 GET_CODE_POINT(n1, data1);
1867 data1++;
1868
1869 if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1870 r = bbuf_clone(pbuf, bbuf2);
1871 }
1872 else if (not1 == 0) { /* 1 OR (not 2) */
1873 r = not_code_range_buf(enc, bbuf2, pbuf);
1874 }
1875 if (r != 0) return r;
1876
1877 for (i = 0; i < n1; i++) {
1878 from = data1[i*2];
1879 to = data1[i*2+1];
1880 r = add_code_range_to_buf(pbuf, from, to);
1881 if (r != 0) return r;
1882 }
1883 return 0;
1884 }
1885
1886 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)1887 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1888 OnigCodePoint* data, int n)
1889 {
1890 int i, r;
1891 OnigCodePoint from2, to2;
1892
1893 for (i = 0; i < n; i++) {
1894 from2 = data[i*2];
1895 to2 = data[i*2+1];
1896 if (from2 < from1) {
1897 if (to2 < from1) continue;
1898 else {
1899 from1 = to2 + 1;
1900 }
1901 }
1902 else if (from2 <= to1) {
1903 if (to2 < to1) {
1904 if (from1 <= from2 - 1) {
1905 r = add_code_range_to_buf(pbuf, from1, from2-1);
1906 if (r != 0) return r;
1907 }
1908 from1 = to2 + 1;
1909 }
1910 else {
1911 to1 = from2 - 1;
1912 }
1913 }
1914 else {
1915 from1 = from2;
1916 }
1917 if (from1 > to1) break;
1918 }
1919 if (from1 <= to1) {
1920 r = add_code_range_to_buf(pbuf, from1, to1);
1921 if (r != 0) return r;
1922 }
1923 return 0;
1924 }
1925
1926 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1927 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1928 {
1929 int r;
1930 OnigCodePoint i, j, n1, n2, *data1, *data2;
1931 OnigCodePoint from, to, from1, to1, from2, to2;
1932
1933 *pbuf = (BBuf* )NULL;
1934 if (IS_NULL(bbuf1)) {
1935 if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1936 return bbuf_clone(pbuf, bbuf2);
1937 return 0;
1938 }
1939 else if (IS_NULL(bbuf2)) {
1940 if (not2 != 0)
1941 return bbuf_clone(pbuf, bbuf1);
1942 return 0;
1943 }
1944
1945 if (not1 != 0)
1946 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1947
1948 data1 = (OnigCodePoint* )(bbuf1->p);
1949 data2 = (OnigCodePoint* )(bbuf2->p);
1950 GET_CODE_POINT(n1, data1);
1951 GET_CODE_POINT(n2, data2);
1952 data1++;
1953 data2++;
1954
1955 if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1956 for (i = 0; i < n1; i++) {
1957 from1 = data1[i*2];
1958 to1 = data1[i*2+1];
1959 for (j = 0; j < n2; j++) {
1960 from2 = data2[j*2];
1961 to2 = data2[j*2+1];
1962 if (from2 > to1) break;
1963 if (to2 < from1) continue;
1964 from = MAX(from1, from2);
1965 to = MIN(to1, to2);
1966 r = add_code_range_to_buf(pbuf, from, to);
1967 if (r != 0) return r;
1968 }
1969 }
1970 }
1971 else if (not1 == 0) { /* 1 AND (not 2) */
1972 for (i = 0; i < n1; i++) {
1973 from1 = data1[i*2];
1974 to1 = data1[i*2+1];
1975 r = and_code_range1(pbuf, from1, to1, data2, n2);
1976 if (r != 0) return r;
1977 }
1978 }
1979
1980 return 0;
1981 }
1982
1983 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1984 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1985 {
1986 int r, not1, not2;
1987 BBuf *buf1, *buf2, *pbuf;
1988 BitSetRef bsr1, bsr2;
1989 BitSet bs1, bs2;
1990
1991 not1 = IS_NCCLASS_NOT(dest);
1992 bsr1 = dest->bs;
1993 buf1 = dest->mbuf;
1994 not2 = IS_NCCLASS_NOT(cc);
1995 bsr2 = cc->bs;
1996 buf2 = cc->mbuf;
1997
1998 if (not1 != 0) {
1999 bitset_invert_to(bsr1, bs1);
2000 bsr1 = bs1;
2001 }
2002 if (not2 != 0) {
2003 bitset_invert_to(bsr2, bs2);
2004 bsr2 = bs2;
2005 }
2006 bitset_and(bsr1, bsr2);
2007 if (bsr1 != dest->bs) {
2008 bitset_copy(dest->bs, bsr1);
2009 bsr1 = dest->bs;
2010 }
2011 if (not1 != 0) {
2012 bitset_invert(dest->bs);
2013 }
2014
2015 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2016 if (not1 != 0 && not2 != 0) {
2017 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
2018 }
2019 else {
2020 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
2021 if (r == 0 && not1 != 0) {
2022 BBuf *tbuf;
2023 r = not_code_range_buf(enc, pbuf, &tbuf);
2024 if (r != 0) {
2025 bbuf_free(pbuf);
2026 return r;
2027 }
2028 bbuf_free(pbuf);
2029 pbuf = tbuf;
2030 }
2031 }
2032 if (r != 0) return r;
2033
2034 dest->mbuf = pbuf;
2035 bbuf_free(buf1);
2036 return r;
2037 }
2038 return 0;
2039 }
2040
2041 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)2042 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2043 {
2044 int r, not1, not2;
2045 BBuf *buf1, *buf2, *pbuf;
2046 BitSetRef bsr1, bsr2;
2047 BitSet bs1, bs2;
2048
2049 not1 = IS_NCCLASS_NOT(dest);
2050 bsr1 = dest->bs;
2051 buf1 = dest->mbuf;
2052 not2 = IS_NCCLASS_NOT(cc);
2053 bsr2 = cc->bs;
2054 buf2 = cc->mbuf;
2055
2056 if (not1 != 0) {
2057 bitset_invert_to(bsr1, bs1);
2058 bsr1 = bs1;
2059 }
2060 if (not2 != 0) {
2061 bitset_invert_to(bsr2, bs2);
2062 bsr2 = bs2;
2063 }
2064 bitset_or(bsr1, bsr2);
2065 if (bsr1 != dest->bs) {
2066 bitset_copy(dest->bs, bsr1);
2067 bsr1 = dest->bs;
2068 }
2069 if (not1 != 0) {
2070 bitset_invert(dest->bs);
2071 }
2072
2073 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2074 if (not1 != 0 && not2 != 0) {
2075 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2076 }
2077 else {
2078 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2079 if (r == 0 && not1 != 0) {
2080 BBuf *tbuf;
2081 r = not_code_range_buf(enc, pbuf, &tbuf);
2082 if (r != 0) {
2083 bbuf_free(pbuf);
2084 return r;
2085 }
2086 bbuf_free(pbuf);
2087 pbuf = tbuf;
2088 }
2089 }
2090 if (r != 0) return r;
2091
2092 dest->mbuf = pbuf;
2093 bbuf_free(buf1);
2094 return r;
2095 }
2096 else
2097 return 0;
2098 }
2099
2100 static int
conv_backslash_value(int c,ScanEnv * env)2101 conv_backslash_value(int c, ScanEnv* env)
2102 {
2103 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2104 switch (c) {
2105 case 'n': return '\n';
2106 case 't': return '\t';
2107 case 'r': return '\r';
2108 case 'f': return '\f';
2109 case 'a': return '\007';
2110 case 'b': return '\010';
2111 case 'e': return '\033';
2112 case 'v':
2113 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2114 return '\v';
2115 break;
2116
2117 default:
2118 break;
2119 }
2120 }
2121 return c;
2122 }
2123
2124 static int
is_invalid_quantifier_target(Node * node)2125 is_invalid_quantifier_target(Node* node)
2126 {
2127 switch (NTYPE(node)) {
2128 case NT_ANCHOR:
2129 return 1;
2130 break;
2131
2132 case NT_ENCLOSE:
2133 /* allow enclosed elements */
2134 /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
2135 break;
2136
2137 case NT_LIST:
2138 do {
2139 if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2140 } while (IS_NOT_NULL(node = NCDR(node)));
2141 return 0;
2142 break;
2143
2144 case NT_ALT:
2145 do {
2146 if (is_invalid_quantifier_target(NCAR(node))) return 1;
2147 } while (IS_NOT_NULL(node = NCDR(node)));
2148 break;
2149
2150 default:
2151 break;
2152 }
2153 return 0;
2154 }
2155
2156 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2157 static int
popular_quantifier_num(QtfrNode * q)2158 popular_quantifier_num(QtfrNode* q)
2159 {
2160 if (q->greedy) {
2161 if (q->lower == 0) {
2162 if (q->upper == 1) return 0;
2163 else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2164 }
2165 else if (q->lower == 1) {
2166 if (IS_REPEAT_INFINITE(q->upper)) return 2;
2167 }
2168 }
2169 else {
2170 if (q->lower == 0) {
2171 if (q->upper == 1) return 3;
2172 else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2173 }
2174 else if (q->lower == 1) {
2175 if (IS_REPEAT_INFINITE(q->upper)) return 5;
2176 }
2177 }
2178 return -1;
2179 }
2180
2181
2182 enum ReduceType {
2183 RQ_ASIS = 0, /* as is */
2184 RQ_DEL = 1, /* delete parent */
2185 RQ_A, /* to '*' */
2186 RQ_AQ, /* to '*?' */
2187 RQ_QQ, /* to '??' */
2188 RQ_P_QQ, /* to '+)??' */
2189 RQ_PQ_Q /* to '+?)?' */
2190 };
2191
2192 static enum ReduceType ReduceTypeTable[6][6] = {
2193 {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */
2194 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */
2195 {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */
2196 {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */
2197 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */
2198 {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */
2199 };
2200
2201 extern void
onig_reduce_nested_quantifier(Node * pnode,Node * cnode)2202 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2203 {
2204 int pnum, cnum;
2205 QtfrNode *p, *c;
2206
2207 p = NQTFR(pnode);
2208 c = NQTFR(cnode);
2209 pnum = popular_quantifier_num(p);
2210 cnum = popular_quantifier_num(c);
2211 if (pnum < 0 || cnum < 0) return ;
2212
2213 switch(ReduceTypeTable[cnum][pnum]) {
2214 case RQ_DEL:
2215 *pnode = *cnode;
2216 break;
2217 case RQ_A:
2218 p->target = c->target;
2219 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
2220 break;
2221 case RQ_AQ:
2222 p->target = c->target;
2223 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
2224 break;
2225 case RQ_QQ:
2226 p->target = c->target;
2227 p->lower = 0; p->upper = 1; p->greedy = 0;
2228 break;
2229 case RQ_P_QQ:
2230 p->target = cnode;
2231 p->lower = 0; p->upper = 1; p->greedy = 0;
2232 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
2233 return ;
2234 break;
2235 case RQ_PQ_Q:
2236 p->target = cnode;
2237 p->lower = 0; p->upper = 1; p->greedy = 1;
2238 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
2239 return ;
2240 break;
2241 case RQ_ASIS:
2242 p->target = cnode;
2243 return ;
2244 break;
2245 }
2246
2247 c->target = NULL_NODE;
2248 onig_node_free(cnode);
2249 }
2250
2251
/*
 * Token kinds stored in OnigToken.type.  The first group can appear
 * anywhere; the members after the "in cc" marker are specific to
 * character classes ([...]).
 */
enum TokenSyms {
  TK_EOT      = 0,   /* end of token */
  TK_RAW_BYTE = 1,
  TK_CHAR,
  TK_STRING,
  TK_CODE_POINT,
  TK_ANYCHAR,
  TK_CHAR_TYPE,
  TK_BACKREF,
  TK_CALL,
  TK_ANCHOR,
  TK_OP_REPEAT,
  TK_INTERVAL,
  TK_ANYCHAR_ANYTIME,  /* SQL '%' == .* */
  TK_ALT,
  TK_SUBEXP_OPEN,
  TK_SUBEXP_CLOSE,
  TK_CC_OPEN,
  TK_QUOTE_OPEN,
  TK_CHAR_PROPERTY,    /* \p{...}, \P{...} */
  /* in cc */
  TK_CC_CLOSE,
  TK_CC_RANGE,
  TK_POSIX_BRACKET_OPEN,
  TK_CC_AND,             /* && */
  TK_CC_CC_OPEN          /* [ */
};
2279
/*
 * One lexical token of a pattern.  `type` is an enum TokenSyms value;
 * `u` carries the token's payload.
 * NOTE(review): which union member is valid for which token type is set
 * by the tokenizer, which is outside this part of the file.
 */
typedef struct {
  enum TokenSyms type;
  int escaped;
  int base;   /* is number: 8, 16 (used in [....]) */
  UChar* backp;
  union {
    UChar* s;
    int   c;
    OnigCodePoint code;
    int   anchor;
    int   subtype;
    /* repeat bounds; lower/upper are filled by fetch_range_quantifier()
       for TK_INTERVAL */
    struct {
      int lower;
      int upper;
      int greedy;
      int possessive;
    } repeat;
    /* back reference data */
    struct {
      int  num;
      int  ref1;
      int* refs;
      int  by_name;
#ifdef USE_BACKREF_WITH_LEVEL
      int  exist_level;
      int  level;   /* \k<name+n> */
#endif
    } backref;
    /* subexpression call: name span and group number */
    struct {
      UChar* name;
      UChar* name_end;
      int    gnum;
    } call;
    /* character property: type and negation flag */
    struct {
      int ctype;
      int not;
    } prop;
  } u;
} OnigToken;
2318
2319
2320 static int
fetch_range_quantifier(UChar ** src,UChar * end,OnigToken * tok,ScanEnv * env)2321 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2322 {
2323 int low, up, syn_allow, non_low = 0;
2324 int r = 0;
2325 OnigCodePoint c;
2326 OnigEncoding enc = env->enc;
2327 UChar* p = *src;
2328 PFETCH_READY;
2329
2330 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2331
2332 if (PEND) {
2333 if (syn_allow)
2334 return 1; /* "....{" : OK! */
2335 else
2336 return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */
2337 }
2338
2339 if (! syn_allow) {
2340 c = PPEEK;
2341 if (c == ')' || c == '(' || c == '|') {
2342 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2343 }
2344 }
2345
2346 low = onig_scan_unsigned_number(&p, end, env->enc);
2347 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2348 if (low > ONIG_MAX_REPEAT_NUM)
2349 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2350
2351 if (p == *src) { /* can't read low */
2352 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2353 /* allow {,n} as {0,n} */
2354 low = 0;
2355 non_low = 1;
2356 }
2357 else
2358 goto invalid;
2359 }
2360
2361 if (PEND) goto invalid;
2362 PFETCH(c);
2363 if (c == ',') {
2364 UChar* prev = p;
2365 up = onig_scan_unsigned_number(&p, end, env->enc);
2366 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2367 if (up > ONIG_MAX_REPEAT_NUM)
2368 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2369
2370 if (p == prev) {
2371 if (non_low != 0)
2372 goto invalid;
2373 up = REPEAT_INFINITE; /* {n,} : {n,infinite} */
2374 }
2375 }
2376 else {
2377 if (non_low != 0)
2378 goto invalid;
2379
2380 PUNFETCH;
2381 up = low; /* {n} : exact n times */
2382 r = 2; /* fixed */
2383 }
2384
2385 if (PEND) goto invalid;
2386 PFETCH(c);
2387 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2388 if (c != MC_ESC(env->syntax)) goto invalid;
2389 PFETCH(c);
2390 }
2391 if (c != '}') goto invalid;
2392
2393 if (!IS_REPEAT_INFINITE(up) && low > up) {
2394 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2395 }
2396
2397 tok->type = TK_INTERVAL;
2398 tok->u.repeat.lower = low;
2399 tok->u.repeat.upper = up;
2400 *src = p;
2401 return r; /* 0: normal {n,m}, 2: fixed {n} */
2402
2403 invalid:
2404 if (syn_allow)
2405 return 1; /* OK */
2406 else
2407 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2408 }
2409
/* \M-, \C-, \c, or \... */
/*
 * Decode the value of an escape sequence; *src points at the character
 * that follows the escape character.
 *
 *   M-x        (ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)    (x & 0xff) | 0x80
 *   C-x / cx   (ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL /
 *               ONIG_SYN_OP_ESC_C_CONTROL)              x & 0x9f,
 *              or 0177 when x is '?'
 *   other      conv_backslash_value()
 * In the first two forms x may itself be an escape sequence, which is
 * decoded by a recursive call.  When the corresponding syntax option is
 * off, the letter is passed to conv_backslash_value() like any other.
 *
 * Returns the decoded value and advances *src past the sequence.
 * A negative result from the recursive call is passed through as an
 * error; ONIGERR_* codes are returned for a truncated or malformed
 * sequence.  *src is not updated on those paths.
 */
static int
fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
{
  int v;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;

  if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;

  PFETCH_S(c);
  switch (c) {
  case 'M':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_META_CODE_SYNTAX;
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c == MC_ESC(env->syntax)) {
        /* nested escape after "M-" */
        v = fetch_escaped_value(&p, end, env);
        if (v < 0) return v;
        c = (OnigCodePoint )v;
      }
      c = ((c & 0xff) | 0x80);
    }
    else
      goto backslash;
    break;

  case 'C':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
      /* shares the control-character handling of the 'c' case */
      goto control;
    }
    else
      goto backslash;

  case 'c':
    if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
    control:
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c == '?') {
        c = 0177;
      }
      else {
        if (c == MC_ESC(env->syntax)) {
          /* nested escape after the control prefix */
          v = fetch_escaped_value(&p, end, env);
          if (v < 0) return v;
          c = (OnigCodePoint )v;
        }
        c &= 0x9f;
      }
      break;
    }
    /* fall through */

  default:
    {
    backslash:
      c = conv_backslash_value(c, env);
    }
    break;
  }

  /* commit the consumed input only on success */
  *src = p;
  return c;
}
2482
2483 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2484
2485 static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)2486 get_name_end_code_point(OnigCodePoint start)
2487 {
2488 switch (start) {
2489 case '<': return (OnigCodePoint )'>'; break;
2490 case '\'': return (OnigCodePoint )'\''; break;
2491 default:
2492 break;
2493 }
2494
2495 return (OnigCodePoint )0;
2496 }
2497
2498 #ifdef USE_NAMED_GROUP
2499 #ifdef USE_BACKREF_WITH_LEVEL
2500 /*
2501 \k<name+n>, \k<name-n>
2502 \k<num+n>, \k<num-n>
2503 \k<-num+n>, \k<-num-n>
2504 */
2505 static int
fetch_name_with_level(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int * rlevel)2506 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
2507 UChar** rname_end, ScanEnv* env,
2508 int* rback_num, int* rlevel)
2509 {
2510 int r, sign, is_num, exist_level;
2511 OnigCodePoint end_code;
2512 OnigCodePoint c = 0;
2513 OnigEncoding enc = env->enc;
2514 UChar *name_end;
2515 UChar *pnum_head;
2516 UChar *p = *src;
2517 PFETCH_READY;
2518
2519 *rback_num = 0;
2520 is_num = exist_level = 0;
2521 sign = 1;
2522 pnum_head = *src;
2523
2524 end_code = get_name_end_code_point(start_code);
2525
2526 name_end = end;
2527 r = 0;
2528 if (PEND) {
2529 return ONIGERR_EMPTY_GROUP_NAME;
2530 }
2531 else {
2532 PFETCH(c);
2533 if (c == end_code)
2534 return ONIGERR_EMPTY_GROUP_NAME;
2535
2536 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2537 is_num = 1;
2538 }
2539 else if (c == '-') {
2540 is_num = 2;
2541 sign = -1;
2542 pnum_head = p;
2543 }
2544 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2545 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2546 }
2547 }
2548
2549 while (!PEND) {
2550 name_end = p;
2551 PFETCH(c);
2552 if (c == end_code || c == ')' || c == '+' || c == '-') {
2553 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
2554 break;
2555 }
2556
2557 if (is_num != 0) {
2558 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2559 is_num = 1;
2560 }
2561 else {
2562 r = ONIGERR_INVALID_GROUP_NAME;
2563 is_num = 0;
2564 }
2565 }
2566 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2567 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2568 }
2569 }
2570
2571 if (r == 0 && c != end_code) {
2572 if (c == '+' || c == '-') {
2573 int level;
2574 int flag = (c == '-' ? -1 : 1);
2575
2576 PFETCH(c);
2577 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2578 PUNFETCH;
2579 level = onig_scan_unsigned_number(&p, end, enc);
2580 if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
2581 *rlevel = (level * flag);
2582 exist_level = 1;
2583
2584 PFETCH(c);
2585 if (c == end_code)
2586 goto end;
2587 }
2588
2589 err:
2590 r = ONIGERR_INVALID_GROUP_NAME;
2591 name_end = end;
2592 }
2593
2594 end:
2595 if (r == 0) {
2596 if (is_num != 0) {
2597 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2598 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2599 else if (*rback_num == 0) goto err;
2600
2601 *rback_num *= sign;
2602 }
2603
2604 *rname_end = name_end;
2605 *src = p;
2606 return (exist_level ? 1 : 0);
2607 }
2608 else {
2609 onig_scan_env_set_error_string(env, r, *src, name_end);
2610 return r;
2611 }
2612 }
2613 #endif /* USE_BACKREF_WITH_LEVEL */
2614
2615 /*
2616 def: 0 -> define name (don't allow number name)
2617 1 -> reference name (allow number name)
2618 */
2619 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2620 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2621 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2622 {
2623 int r, is_num, sign;
2624 OnigCodePoint end_code;
2625 OnigCodePoint c = 0;
2626 OnigEncoding enc = env->enc;
2627 UChar *name_end;
2628 UChar *pnum_head;
2629 UChar *p = *src;
2630
2631 *rback_num = 0;
2632
2633 end_code = get_name_end_code_point(start_code);
2634
2635 name_end = end;
2636 pnum_head = *src;
2637 r = 0;
2638 is_num = 0;
2639 sign = 1;
2640 if (PEND) {
2641 return ONIGERR_EMPTY_GROUP_NAME;
2642 }
2643 else {
2644 PFETCH_S(c);
2645 if (c == end_code)
2646 return ONIGERR_EMPTY_GROUP_NAME;
2647
2648 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2649 if (ref == 1)
2650 is_num = 1;
2651 else {
2652 r = ONIGERR_INVALID_GROUP_NAME;
2653 is_num = 0;
2654 }
2655 }
2656 else if (c == '-') {
2657 if (ref == 1) {
2658 is_num = 2;
2659 sign = -1;
2660 pnum_head = p;
2661 }
2662 else {
2663 r = ONIGERR_INVALID_GROUP_NAME;
2664 is_num = 0;
2665 }
2666 }
2667 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2668 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2669 }
2670 }
2671
2672 if (r == 0) {
2673 while (!PEND) {
2674 name_end = p;
2675 PFETCH_S(c);
2676 if (c == end_code || c == ')') {
2677 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
2678 break;
2679 }
2680
2681 if (is_num != 0) {
2682 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2683 is_num = 1;
2684 }
2685 else {
2686 if (!ONIGENC_IS_CODE_WORD(enc, c))
2687 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2688 else
2689 r = ONIGERR_INVALID_GROUP_NAME;
2690 is_num = 0;
2691 }
2692 }
2693 else {
2694 if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2695 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2696 }
2697 }
2698 }
2699
2700 if (c != end_code) {
2701 r = ONIGERR_INVALID_GROUP_NAME;
2702 name_end = end;
2703 }
2704
2705 if (is_num != 0) {
2706 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2707 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2708 else if (*rback_num == 0) {
2709 r = ONIGERR_INVALID_GROUP_NAME;
2710 goto err;
2711 }
2712
2713 *rback_num *= sign;
2714 }
2715
2716 *rname_end = name_end;
2717 *src = p;
2718 return 0;
2719 }
2720 else {
2721 while (!PEND) {
2722 name_end = p;
2723 PFETCH_S(c);
2724 if (c == end_code || c == ')')
2725 break;
2726 }
2727 if (PEND)
2728 name_end = end;
2729
2730 err:
2731 onig_scan_env_set_error_string(env, r, *src, name_end);
2732 return r;
2733 }
2734 }
2735 #else
2736 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2737 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2738 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2739 {
2740 int r, is_num, sign;
2741 OnigCodePoint end_code;
2742 OnigCodePoint c = 0;
2743 UChar *name_end;
2744 OnigEncoding enc = env->enc;
2745 UChar *pnum_head;
2746 UChar *p = *src;
2747 PFETCH_READY;
2748
2749 *rback_num = 0;
2750
2751 end_code = get_name_end_code_point(start_code);
2752
2753 *rname_end = name_end = end;
2754 r = 0;
2755 pnum_head = *src;
2756 is_num = 0;
2757 sign = 1;
2758
2759 if (PEND) {
2760 return ONIGERR_EMPTY_GROUP_NAME;
2761 }
2762 else {
2763 PFETCH(c);
2764 if (c == end_code)
2765 return ONIGERR_EMPTY_GROUP_NAME;
2766
2767 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2768 is_num = 1;
2769 }
2770 else if (c == '-') {
2771 is_num = 2;
2772 sign = -1;
2773 pnum_head = p;
2774 }
2775 else {
2776 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2777 }
2778 }
2779
2780 while (!PEND) {
2781 name_end = p;
2782
2783 PFETCH(c);
2784 if (c == end_code || c == ')') break;
2785 if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2786 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2787 }
2788 if (r == 0 && c != end_code) {
2789 r = ONIGERR_INVALID_GROUP_NAME;
2790 name_end = end;
2791 }
2792
2793 if (r == 0) {
2794 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2795 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2796 else if (*rback_num == 0) {
2797 r = ONIGERR_INVALID_GROUP_NAME;
2798 goto err;
2799 }
2800 *rback_num *= sign;
2801
2802 *rname_end = name_end;
2803 *src = p;
2804 return 0;
2805 }
2806 else {
2807 err:
2808 onig_scan_env_set_error_string(env, r, *src, name_end);
2809 return r;
2810 }
2811 }
2812 #endif /* USE_NAMED_GROUP */
2813
2814 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)2815 CC_ESC_WARN(ScanEnv* env, UChar *c)
2816 {
2817 if (onig_warn == onig_null_warn) return ;
2818
2819 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2820 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2821 UChar buf[WARN_BUFSIZE];
2822 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2823 env->pattern, env->pattern_end,
2824 (UChar* )"character class has '%s' without escape", c);
2825 (*onig_warn)((char* )buf);
2826 }
2827 }
2828
2829 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)2830 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2831 {
2832 if (onig_warn == onig_null_warn) return ;
2833
2834 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2835 UChar buf[WARN_BUFSIZE];
2836 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2837 (env)->pattern, (env)->pattern_end,
2838 (UChar* )"regular expression has '%s' without escape", c);
2839 (*onig_warn)((char* )buf);
2840 }
2841 }
2842
2843 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)2844 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2845 UChar **next, OnigEncoding enc)
2846 {
2847 int i;
2848 OnigCodePoint x;
2849 UChar *q;
2850 UChar *p = from;
2851
2852 while (p < to) {
2853 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2854 q = p + enclen(enc, p);
2855 if (x == s[0]) {
2856 for (i = 1; i < n && q < to; i++) {
2857 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2858 if (x != s[i]) break;
2859 q += enclen(enc, q);
2860 }
2861 if (i >= n) {
2862 if (IS_NOT_NULL(next))
2863 *next = q;
2864 return p;
2865 }
2866 }
2867 p = q;
2868 }
2869 return NULL_UCHARP;
2870 }
2871
2872 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)2873 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2874 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
2875 {
2876 int i, in_esc;
2877 OnigCodePoint x;
2878 UChar *q;
2879 UChar *p = from;
2880
2881 in_esc = 0;
2882 while (p < to) {
2883 if (in_esc) {
2884 in_esc = 0;
2885 p += enclen(enc, p);
2886 }
2887 else {
2888 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2889 q = p + enclen(enc, p);
2890 if (x == s[0]) {
2891 for (i = 1; i < n && q < to; i++) {
2892 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2893 if (x != s[i]) break;
2894 q += enclen(enc, q);
2895 }
2896 if (i >= n) return 1;
2897 p += enclen(enc, p);
2898 }
2899 else {
2900 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2901 if (x == bad) return 0;
2902 else if (x == MC_ESC(syn)) in_esc = 1;
2903 p = q;
2904 }
2905 }
2906 }
2907 return 0;
2908 }
2909
2910 static int
fetch_token_in_cc(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2911 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2912 {
2913 int num;
2914 OnigCodePoint c, c2;
2915 OnigSyntaxType* syn = env->syntax;
2916 OnigEncoding enc = env->enc;
2917 UChar* prev;
2918 UChar* p = *src;
2919 PFETCH_READY;
2920
2921 if (PEND) {
2922 tok->type = TK_EOT;
2923 return tok->type;
2924 }
2925
2926 PFETCH(c);
2927 tok->type = TK_CHAR;
2928 tok->base = 0;
2929 tok->u.c = c;
2930 tok->escaped = 0;
2931
2932 if (c == ']') {
2933 tok->type = TK_CC_CLOSE;
2934 }
2935 else if (c == '-') {
2936 tok->type = TK_CC_RANGE;
2937 }
2938 else if (c == MC_ESC(syn)) {
2939 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2940 goto end;
2941
2942 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2943
2944 PFETCH(c);
2945 tok->escaped = 1;
2946 tok->u.c = c;
2947 switch (c) {
2948 case 'w':
2949 tok->type = TK_CHAR_TYPE;
2950 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2951 tok->u.prop.not = 0;
2952 break;
2953 case 'W':
2954 tok->type = TK_CHAR_TYPE;
2955 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2956 tok->u.prop.not = 1;
2957 break;
2958 case 'd':
2959 tok->type = TK_CHAR_TYPE;
2960 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2961 tok->u.prop.not = 0;
2962 break;
2963 case 'D':
2964 tok->type = TK_CHAR_TYPE;
2965 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2966 tok->u.prop.not = 1;
2967 break;
2968 case 's':
2969 tok->type = TK_CHAR_TYPE;
2970 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2971 tok->u.prop.not = 0;
2972 break;
2973 case 'S':
2974 tok->type = TK_CHAR_TYPE;
2975 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2976 tok->u.prop.not = 1;
2977 break;
2978 case 'h':
2979 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2980 tok->type = TK_CHAR_TYPE;
2981 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2982 tok->u.prop.not = 0;
2983 break;
2984 case 'H':
2985 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2986 tok->type = TK_CHAR_TYPE;
2987 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2988 tok->u.prop.not = 1;
2989 break;
2990
2991 case 'p':
2992 case 'P':
2993 c2 = PPEEK;
2994 if (c2 == '{' &&
2995 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
2996 PINC;
2997 tok->type = TK_CHAR_PROPERTY;
2998 tok->u.prop.not = (c == 'P' ? 1 : 0);
2999
3000 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3001 PFETCH(c2);
3002 if (c2 == '^') {
3003 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3004 }
3005 else
3006 PUNFETCH;
3007 }
3008 }
3009 break;
3010
3011 case 'x':
3012 if (PEND) break;
3013
3014 prev = p;
3015 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3016 PINC;
3017 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3018 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3019 if (!PEND) {
3020 c2 = PPEEK;
3021 if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3022 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3023 }
3024
3025 if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
3026 PINC;
3027 tok->type = TK_CODE_POINT;
3028 tok->base = 16;
3029 tok->u.code = (OnigCodePoint )num;
3030 }
3031 else {
3032 /* can't read nothing or invalid format */
3033 p = prev;
3034 }
3035 }
3036 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3037 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3038 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3039 if (p == prev) { /* can't read nothing. */
3040 num = 0; /* but, it's not error */
3041 }
3042 tok->type = TK_RAW_BYTE;
3043 tok->base = 16;
3044 tok->u.c = num;
3045 }
3046 break;
3047
3048 case 'u':
3049 if (PEND) break;
3050
3051 prev = p;
3052 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3053 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3054 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3055 if (p == prev) { /* can't read nothing. */
3056 num = 0; /* but, it's not error */
3057 }
3058 tok->type = TK_CODE_POINT;
3059 tok->base = 16;
3060 tok->u.code = (OnigCodePoint )num;
3061 }
3062 break;
3063
3064 case '0':
3065 case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3066 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3067 PUNFETCH;
3068 prev = p;
3069 num = scan_unsigned_octal_number(&p, end, 3, enc);
3070 if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER;
3071 if (p == prev) { /* can't read nothing. */
3072 num = 0; /* but, it's not error */
3073 }
3074 tok->type = TK_RAW_BYTE;
3075 tok->base = 8;
3076 tok->u.c = num;
3077 }
3078 break;
3079
3080 default:
3081 PUNFETCH;
3082 num = fetch_escaped_value(&p, end, env);
3083 if (num < 0) return num;
3084 if (tok->u.c != num) {
3085 tok->u.code = (OnigCodePoint )num;
3086 tok->type = TK_CODE_POINT;
3087 }
3088 break;
3089 }
3090 }
3091 else if (c == '[') {
3092 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3093 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3094 tok->backp = p; /* point at '[' is readed */
3095 PINC;
3096 if (str_exist_check_with_esc(send, 2, p, end,
3097 (OnigCodePoint )']', enc, syn)) {
3098 tok->type = TK_POSIX_BRACKET_OPEN;
3099 }
3100 else {
3101 PUNFETCH;
3102 goto cc_in_cc;
3103 }
3104 }
3105 else {
3106 cc_in_cc:
3107 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3108 tok->type = TK_CC_CC_OPEN;
3109 }
3110 else {
3111 CC_ESC_WARN(env, (UChar* )"[");
3112 }
3113 }
3114 }
3115 else if (c == '&') {
3116 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3117 !PEND && (PPEEK_IS('&'))) {
3118 PINC;
3119 tok->type = TK_CC_AND;
3120 }
3121 }
3122
3123 end:
3124 *src = p;
3125 return tok->type;
3126 }
3127
3128 static int
fetch_token(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)3129 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3130 {
3131 int r, num;
3132 OnigCodePoint c;
3133 OnigEncoding enc = env->enc;
3134 OnigSyntaxType* syn = env->syntax;
3135 UChar* prev;
3136 UChar* p = *src;
3137 PFETCH_READY;
3138
3139 start:
3140 if (PEND) {
3141 tok->type = TK_EOT;
3142 return tok->type;
3143 }
3144
3145 tok->type = TK_STRING;
3146 tok->base = 0;
3147 tok->backp = p;
3148
3149 PFETCH(c);
3150 if (IS_MC_ESC_CODE(c, syn)) {
3151 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3152
3153 tok->backp = p;
3154 PFETCH(c);
3155
3156 tok->u.c = c;
3157 tok->escaped = 1;
3158 switch (c) {
3159 case '*':
3160 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3161 tok->type = TK_OP_REPEAT;
3162 tok->u.repeat.lower = 0;
3163 tok->u.repeat.upper = REPEAT_INFINITE;
3164 goto greedy_check;
3165 break;
3166
3167 case '+':
3168 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3169 tok->type = TK_OP_REPEAT;
3170 tok->u.repeat.lower = 1;
3171 tok->u.repeat.upper = REPEAT_INFINITE;
3172 goto greedy_check;
3173 break;
3174
3175 case '?':
3176 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3177 tok->type = TK_OP_REPEAT;
3178 tok->u.repeat.lower = 0;
3179 tok->u.repeat.upper = 1;
3180 greedy_check:
3181 if (!PEND && PPEEK_IS('?') &&
3182 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3183 PFETCH(c);
3184 tok->u.repeat.greedy = 0;
3185 tok->u.repeat.possessive = 0;
3186 }
3187 else {
3188 possessive_check:
3189 if (!PEND && PPEEK_IS('+') &&
3190 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3191 tok->type != TK_INTERVAL) ||
3192 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3193 tok->type == TK_INTERVAL))) {
3194 PFETCH(c);
3195 tok->u.repeat.greedy = 1;
3196 tok->u.repeat.possessive = 1;
3197 }
3198 else {
3199 tok->u.repeat.greedy = 1;
3200 tok->u.repeat.possessive = 0;
3201 }
3202 }
3203 break;
3204
3205 case '{':
3206 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3207 r = fetch_range_quantifier(&p, end, tok, env);
3208 if (r < 0) return r; /* error */
3209 if (r == 0) goto greedy_check;
3210 else if (r == 2) { /* {n} */
3211 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3212 goto possessive_check;
3213
3214 goto greedy_check;
3215 }
3216 /* r == 1 : normal char */
3217 break;
3218
3219 case '|':
3220 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3221 tok->type = TK_ALT;
3222 break;
3223
3224 case '(':
3225 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3226 tok->type = TK_SUBEXP_OPEN;
3227 break;
3228
3229 case ')':
3230 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3231 tok->type = TK_SUBEXP_CLOSE;
3232 break;
3233
3234 case 'w':
3235 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3236 tok->type = TK_CHAR_TYPE;
3237 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3238 tok->u.prop.not = 0;
3239 break;
3240
3241 case 'W':
3242 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3243 tok->type = TK_CHAR_TYPE;
3244 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3245 tok->u.prop.not = 1;
3246 break;
3247
3248 case 'b':
3249 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3250 tok->type = TK_ANCHOR;
3251 tok->u.anchor = ANCHOR_WORD_BOUND;
3252 break;
3253
3254 case 'B':
3255 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3256 tok->type = TK_ANCHOR;
3257 tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3258 break;
3259
3260 #ifdef USE_WORD_BEGIN_END
3261 case '<':
3262 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3263 tok->type = TK_ANCHOR;
3264 tok->u.anchor = ANCHOR_WORD_BEGIN;
3265 break;
3266
3267 case '>':
3268 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3269 tok->type = TK_ANCHOR;
3270 tok->u.anchor = ANCHOR_WORD_END;
3271 break;
3272 #endif
3273
3274 case 's':
3275 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3276 tok->type = TK_CHAR_TYPE;
3277 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3278 tok->u.prop.not = 0;
3279 break;
3280
3281 case 'S':
3282 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3283 tok->type = TK_CHAR_TYPE;
3284 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3285 tok->u.prop.not = 1;
3286 break;
3287
3288 case 'd':
3289 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3290 tok->type = TK_CHAR_TYPE;
3291 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3292 tok->u.prop.not = 0;
3293 break;
3294
3295 case 'D':
3296 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3297 tok->type = TK_CHAR_TYPE;
3298 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3299 tok->u.prop.not = 1;
3300 break;
3301
3302 case 'h':
3303 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3304 tok->type = TK_CHAR_TYPE;
3305 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3306 tok->u.prop.not = 0;
3307 break;
3308
3309 case 'H':
3310 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3311 tok->type = TK_CHAR_TYPE;
3312 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3313 tok->u.prop.not = 1;
3314 break;
3315
3316 case 'A':
3317 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3318 begin_buf:
3319 tok->type = TK_ANCHOR;
3320 tok->u.subtype = ANCHOR_BEGIN_BUF;
3321 break;
3322
3323 case 'Z':
3324 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3325 tok->type = TK_ANCHOR;
3326 tok->u.subtype = ANCHOR_SEMI_END_BUF;
3327 break;
3328
3329 case 'z':
3330 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3331 end_buf:
3332 tok->type = TK_ANCHOR;
3333 tok->u.subtype = ANCHOR_END_BUF;
3334 break;
3335
3336 case 'G':
3337 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3338 tok->type = TK_ANCHOR;
3339 tok->u.subtype = ANCHOR_BEGIN_POSITION;
3340 break;
3341
3342 case '`':
3343 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3344 goto begin_buf;
3345 break;
3346
3347 case '\'':
3348 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3349 goto end_buf;
3350 break;
3351
3352 case 'x':
3353 if (PEND) break;
3354
3355 prev = p;
3356 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3357 PINC;
3358 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3359 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3360 if (!PEND) {
3361 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3362 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3363 }
3364
3365 if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
3366 PINC;
3367 tok->type = TK_CODE_POINT;
3368 tok->u.code = (OnigCodePoint )num;
3369 }
3370 else {
3371 /* can't read nothing or invalid format */
3372 p = prev;
3373 }
3374 }
3375 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3376 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3377 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3378 if (p == prev) { /* can't read nothing. */
3379 num = 0; /* but, it's not error */
3380 }
3381 tok->type = TK_RAW_BYTE;
3382 tok->base = 16;
3383 tok->u.c = num;
3384 }
3385 break;
3386
3387 case 'u':
3388 if (PEND) break;
3389
3390 prev = p;
3391 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3392 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3393 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3394 if (p == prev) { /* can't read nothing. */
3395 num = 0; /* but, it's not error */
3396 }
3397 tok->type = TK_CODE_POINT;
3398 tok->base = 16;
3399 tok->u.code = (OnigCodePoint )num;
3400 }
3401 break;
3402
3403 case '1': case '2': case '3': case '4':
3404 case '5': case '6': case '7': case '8': case '9':
3405 PUNFETCH;
3406 prev = p;
3407 num = onig_scan_unsigned_number(&p, end, enc);
3408 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3409 goto skip_backref;
3410 }
3411
3412 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3413 (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3414 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3415 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3416 return ONIGERR_INVALID_BACKREF;
3417 }
3418
3419 tok->type = TK_BACKREF;
3420 tok->u.backref.num = 1;
3421 tok->u.backref.ref1 = num;
3422 tok->u.backref.by_name = 0;
3423 #ifdef USE_BACKREF_WITH_LEVEL
3424 tok->u.backref.exist_level = 0;
3425 #endif
3426 break;
3427 }
3428
3429 skip_backref:
3430 if (c == '8' || c == '9') {
3431 /* normal char */
3432 p = prev; PINC;
3433 break;
3434 }
3435
3436 p = prev;
3437 /* fall through */
3438 case '0':
3439 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3440 prev = p;
3441 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3442 if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER;
3443 if (p == prev) { /* can't read nothing. */
3444 num = 0; /* but, it's not error */
3445 }
3446 tok->type = TK_RAW_BYTE;
3447 tok->base = 8;
3448 tok->u.c = num;
3449 }
3450 else if (c != '0') {
3451 PINC;
3452 }
3453 break;
3454
3455 #ifdef USE_NAMED_GROUP
3456 case 'k':
3457 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3458 PFETCH(c);
3459 if (c == '<' || c == '\'') {
3460 UChar* name_end;
3461 int* backs;
3462 int back_num;
3463
3464 prev = p;
3465
3466 #ifdef USE_BACKREF_WITH_LEVEL
3467 name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3468 r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
3469 env, &back_num, &tok->u.backref.level);
3470 if (r == 1) tok->u.backref.exist_level = 1;
3471 else tok->u.backref.exist_level = 0;
3472 #else
3473 r = fetch_name(&p, end, &name_end, env, &back_num, 1);
3474 #endif
3475 if (r < 0) return r;
3476
3477 if (back_num != 0) {
3478 if (back_num < 0) {
3479 back_num = BACKREF_REL_TO_ABS(back_num, env);
3480 if (back_num <= 0)
3481 return ONIGERR_INVALID_BACKREF;
3482 }
3483
3484 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3485 if (back_num > env->num_mem ||
3486 IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
3487 return ONIGERR_INVALID_BACKREF;
3488 }
3489 tok->type = TK_BACKREF;
3490 tok->u.backref.by_name = 0;
3491 tok->u.backref.num = 1;
3492 tok->u.backref.ref1 = back_num;
3493 }
3494 else {
3495 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3496 if (num <= 0) {
3497 onig_scan_env_set_error_string(env,
3498 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3499 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3500 }
3501 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3502 int i;
3503 for (i = 0; i < num; i++) {
3504 if (backs[i] > env->num_mem ||
3505 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3506 return ONIGERR_INVALID_BACKREF;
3507 }
3508 }
3509
3510 tok->type = TK_BACKREF;
3511 tok->u.backref.by_name = 1;
3512 if (num == 1) {
3513 tok->u.backref.num = 1;
3514 tok->u.backref.ref1 = backs[0];
3515 }
3516 else {
3517 tok->u.backref.num = num;
3518 tok->u.backref.refs = backs;
3519 }
3520 }
3521 }
3522 else
3523 PUNFETCH;
3524 }
3525 break;
3526 #endif
3527
3528 #ifdef USE_SUBEXP_CALL
3529 case 'g':
3530 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3531 PFETCH(c);
3532 if (c == '<' || c == '\'') {
3533 int gnum;
3534 UChar* name_end;
3535
3536 prev = p;
3537 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
3538 if (r < 0) return r;
3539
3540 tok->type = TK_CALL;
3541 tok->u.call.name = prev;
3542 tok->u.call.name_end = name_end;
3543 tok->u.call.gnum = gnum;
3544 }
3545 else
3546 PUNFETCH;
3547 }
3548 break;
3549 #endif
3550
3551 case 'Q':
3552 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3553 tok->type = TK_QUOTE_OPEN;
3554 }
3555 break;
3556
3557 case 'p':
3558 case 'P':
3559 if (PPEEK_IS('{') &&
3560 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3561 PINC;
3562 tok->type = TK_CHAR_PROPERTY;
3563 tok->u.prop.not = (c == 'P' ? 1 : 0);
3564
3565 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3566 PFETCH(c);
3567 if (c == '^') {
3568 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3569 }
3570 else
3571 PUNFETCH;
3572 }
3573 }
3574 break;
3575
3576 default:
3577 PUNFETCH;
3578 num = fetch_escaped_value(&p, end, env);
3579 if (num < 0) return num;
3580 /* set_raw: */
3581 if (tok->u.c != num) {
3582 tok->type = TK_CODE_POINT;
3583 tok->u.code = (OnigCodePoint )num;
3584 }
3585 else { /* string */
3586 int len;
3587 SAFE_ENC_LEN(enc, tok->backp, end, len);
3588 p = tok->backp + len;
3589 }
3590 break;
3591 }
3592 }
3593 else {
3594 tok->u.c = c;
3595 tok->escaped = 0;
3596
3597 #ifdef USE_VARIABLE_META_CHARS
3598 if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3599 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3600 if (c == MC_ANYCHAR(syn))
3601 goto any_char;
3602 else if (c == MC_ANYTIME(syn))
3603 goto anytime;
3604 else if (c == MC_ZERO_OR_ONE_TIME(syn))
3605 goto zero_or_one_time;
3606 else if (c == MC_ONE_OR_MORE_TIME(syn))
3607 goto one_or_more_time;
3608 else if (c == MC_ANYCHAR_ANYTIME(syn)) {
3609 tok->type = TK_ANYCHAR_ANYTIME;
3610 goto out;
3611 }
3612 }
3613 #endif
3614
3615 switch (c) {
3616 case '.':
3617 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3618 #ifdef USE_VARIABLE_META_CHARS
3619 any_char:
3620 #endif
3621 tok->type = TK_ANYCHAR;
3622 break;
3623
3624 case '*':
3625 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3626 #ifdef USE_VARIABLE_META_CHARS
3627 anytime:
3628 #endif
3629 tok->type = TK_OP_REPEAT;
3630 tok->u.repeat.lower = 0;
3631 tok->u.repeat.upper = REPEAT_INFINITE;
3632 goto greedy_check;
3633 break;
3634
3635 case '+':
3636 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3637 #ifdef USE_VARIABLE_META_CHARS
3638 one_or_more_time:
3639 #endif
3640 tok->type = TK_OP_REPEAT;
3641 tok->u.repeat.lower = 1;
3642 tok->u.repeat.upper = REPEAT_INFINITE;
3643 goto greedy_check;
3644 break;
3645
3646 case '?':
3647 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3648 #ifdef USE_VARIABLE_META_CHARS
3649 zero_or_one_time:
3650 #endif
3651 tok->type = TK_OP_REPEAT;
3652 tok->u.repeat.lower = 0;
3653 tok->u.repeat.upper = 1;
3654 goto greedy_check;
3655 break;
3656
3657 case '{':
3658 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3659 r = fetch_range_quantifier(&p, end, tok, env);
3660 if (r < 0) return r; /* error */
3661 if (r == 0) goto greedy_check;
3662 else if (r == 2) { /* {n} */
3663 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3664 goto possessive_check;
3665
3666 goto greedy_check;
3667 }
3668 /* r == 1 : normal char */
3669 break;
3670
3671 case '|':
3672 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3673 tok->type = TK_ALT;
3674 break;
3675
3676 case '(':
3677 if (PPEEK_IS('?') &&
3678 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3679 PINC;
3680 if (PPEEK_IS('#')) {
3681 PFETCH(c);
3682 while (1) {
3683 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3684 PFETCH(c);
3685 if (c == MC_ESC(syn)) {
3686 if (!PEND) PFETCH(c);
3687 }
3688 else {
3689 if (c == ')') break;
3690 }
3691 }
3692 goto start;
3693 }
3694 PUNFETCH;
3695 }
3696
3697 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3698 tok->type = TK_SUBEXP_OPEN;
3699 break;
3700
3701 case ')':
3702 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3703 tok->type = TK_SUBEXP_CLOSE;
3704 break;
3705
3706 case '^':
3707 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3708 tok->type = TK_ANCHOR;
3709 tok->u.subtype = (IS_SINGLELINE(env->option)
3710 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3711 break;
3712
3713 case '$':
3714 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3715 tok->type = TK_ANCHOR;
3716 tok->u.subtype = (IS_SINGLELINE(env->option)
3717 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3718 break;
3719
3720 case '[':
3721 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3722 tok->type = TK_CC_OPEN;
3723 break;
3724
3725 case ']':
3726 if (*src > env->pattern) /* /].../ is allowed. */
3727 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
3728 break;
3729
3730 case '#':
3731 if (IS_EXTEND(env->option)) {
3732 while (!PEND) {
3733 PFETCH(c);
3734 if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3735 break;
3736 }
3737 goto start;
3738 break;
3739 }
3740 break;
3741
3742 case ' ': case '\t': case '\n': case '\r': case '\f':
3743 if (IS_EXTEND(env->option))
3744 goto start;
3745 break;
3746
3747 default:
3748 /* string */
3749 break;
3750 }
3751 }
3752
3753 #ifdef USE_VARIABLE_META_CHARS
3754 out:
3755 #endif
3756 *src = p;
3757 return tok->type;
3758 }
3759
3760 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[])3761 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
3762 OnigEncoding enc ARG_UNUSED,
3763 OnigCodePoint sb_out, const OnigCodePoint mbr[])
3764 {
3765 int i, r;
3766 OnigCodePoint j;
3767
3768 int n = ONIGENC_CODE_RANGE_NUM(mbr);
3769
3770 if (not == 0) {
3771 for (i = 0; i < n; i++) {
3772 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
3773 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
3774 if (j >= sb_out) {
3775 if (j == ONIGENC_CODE_RANGE_TO(mbr, i)) i++;
3776 else if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3777 r = add_code_range_to_buf(&(cc->mbuf), j,
3778 ONIGENC_CODE_RANGE_TO(mbr, i));
3779 if (r != 0) return r;
3780 i++;
3781 }
3782
3783 goto sb_end;
3784 }
3785 BITSET_SET_BIT(cc->bs, j);
3786 }
3787 }
3788
3789 sb_end:
3790 for ( ; i < n; i++) {
3791 r = add_code_range_to_buf(&(cc->mbuf),
3792 ONIGENC_CODE_RANGE_FROM(mbr, i),
3793 ONIGENC_CODE_RANGE_TO(mbr, i));
3794 if (r != 0) return r;
3795 }
3796 }
3797 else {
3798 OnigCodePoint prev = 0;
3799
3800 for (i = 0; i < n; i++) {
3801 for (j = prev;
3802 j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
3803 if (j >= sb_out) {
3804 goto sb_end2;
3805 }
3806 BITSET_SET_BIT(cc->bs, j);
3807 }
3808 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3809 }
3810 for (j = prev; j < sb_out; j++) {
3811 BITSET_SET_BIT(cc->bs, j);
3812 }
3813
3814 sb_end2:
3815 prev = sb_out;
3816
3817 for (i = 0; i < n; i++) {
3818 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3819 r = add_code_range_to_buf(&(cc->mbuf), prev,
3820 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3821 if (r != 0) return r;
3822 }
3823 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3824 }
3825 if (prev < 0x7fffffff) {
3826 r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3827 if (r != 0) return r;
3828 }
3829 }
3830
3831 return 0;
3832 }
3833
3834 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)3835 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3836 {
3837 int c, r;
3838 const OnigCodePoint *ranges;
3839 OnigCodePoint sb_out;
3840 OnigEncoding enc = env->enc;
3841
3842 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
3843 if (r == 0) {
3844 return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
3845 }
3846 else if (r != ONIG_NO_SUPPORT_CONFIG) {
3847 return r;
3848 }
3849
3850 r = 0;
3851 switch (ctype) {
3852 case ONIGENC_CTYPE_ALPHA:
3853 case ONIGENC_CTYPE_BLANK:
3854 case ONIGENC_CTYPE_CNTRL:
3855 case ONIGENC_CTYPE_DIGIT:
3856 case ONIGENC_CTYPE_LOWER:
3857 case ONIGENC_CTYPE_PUNCT:
3858 case ONIGENC_CTYPE_SPACE:
3859 case ONIGENC_CTYPE_UPPER:
3860 case ONIGENC_CTYPE_XDIGIT:
3861 case ONIGENC_CTYPE_ASCII:
3862 case ONIGENC_CTYPE_ALNUM:
3863 if (not != 0) {
3864 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3865 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3866 BITSET_SET_BIT(cc->bs, c);
3867 }
3868 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3869 }
3870 else {
3871 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3872 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3873 BITSET_SET_BIT(cc->bs, c);
3874 }
3875 }
3876 break;
3877
3878 case ONIGENC_CTYPE_GRAPH:
3879 case ONIGENC_CTYPE_PRINT:
3880 if (not != 0) {
3881 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3882 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3883 BITSET_SET_BIT(cc->bs, c);
3884 }
3885 }
3886 else {
3887 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3888 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3889 BITSET_SET_BIT(cc->bs, c);
3890 }
3891 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3892 }
3893 break;
3894
3895 case ONIGENC_CTYPE_WORD:
3896 if (not == 0) {
3897 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3898 if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3899 }
3900 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3901 }
3902 else {
3903 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3904 if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
3905 && ! ONIGENC_IS_CODE_WORD(enc, c))
3906 BITSET_SET_BIT(cc->bs, c);
3907 }
3908 }
3909 break;
3910
3911 default:
3912 return ONIGERR_PARSER_BUG;
3913 break;
3914 }
3915
3916 return r;
3917 }
3918
3919 static int
parse_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)3920 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3921 {
3922 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
3923 #define POSIX_BRACKET_NAME_MIN_LEN 4
3924
3925 static PosixBracketEntryType PBS[] = {
3926 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
3927 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
3928 { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
3929 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
3930 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
3931 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
3932 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
3933 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
3934 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
3935 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
3936 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
3937 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3938 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
3939 { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 },
3940 { (UChar* )NULL, -1, 0 }
3941 };
3942
3943 PosixBracketEntryType *pb;
3944 int not, i, r;
3945 OnigCodePoint c;
3946 OnigEncoding enc = env->enc;
3947 UChar *p = *src;
3948
3949 if (PPEEK_IS('^')) {
3950 PINC_S;
3951 not = 1;
3952 }
3953 else
3954 not = 0;
3955
3956 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
3957 goto not_posix_bracket;
3958
3959 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3960 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3961 p = (UChar* )onigenc_step(enc, p, end, pb->len);
3962 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3963 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3964
3965 r = add_ctype_to_cc(cc, pb->ctype, not, env);
3966 if (r != 0) return r;
3967
3968 PINC_S; PINC_S;
3969 *src = p;
3970 return 0;
3971 }
3972 }
3973
3974 not_posix_bracket:
3975 c = 0;
3976 i = 0;
3977 while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3978 PINC_S;
3979 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3980 }
3981 if (c == ':' && ! PEND) {
3982 PINC_S;
3983 if (! PEND) {
3984 PFETCH_S(c);
3985 if (c == ']')
3986 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3987 }
3988 }
3989
3990 return 1; /* 1: is not POSIX bracket, but no error. */
3991 }
3992
3993 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)3994 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
3995 {
3996 int r;
3997 OnigCodePoint c;
3998 OnigEncoding enc = env->enc;
3999 UChar *prev, *start, *p = *src;
4000
4001 r = 0;
4002 start = prev = p;
4003
4004 while (!PEND) {
4005 prev = p;
4006 PFETCH_S(c);
4007 if (c == '}') {
4008 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
4009 if (r < 0) break;
4010
4011 *src = p;
4012 return r;
4013 }
4014 else if (c == '(' || c == ')' || c == '{' || c == '|') {
4015 r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
4016 break;
4017 }
4018 }
4019
4020 onig_scan_env_set_error_string(env, r, *src, prev);
4021 return r;
4022 }
4023
4024 static int
parse_char_property(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4025 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
4026 ScanEnv* env)
4027 {
4028 int r, ctype;
4029 CClassNode* cc;
4030
4031 ctype = fetch_char_property_to_ctype(src, end, env);
4032 if (ctype < 0) return ctype;
4033
4034 *np = node_new_cclass();
4035 CHECK_NULL_RETURN_MEMERR(*np);
4036 cc = NCCLASS(*np);
4037 r = add_ctype_to_cc(cc, ctype, 0, env);
4038 if (r != 0) return r;
4039 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
4040
4041 return 0;
4042 }
4043
4044
/* Parser state while reading the items of a character class. */
enum CCSTATE {
  CCS_VALUE    = 0,  /* a single value is pending in `vs` */
  CCS_RANGE    = 1,  /* a range operator was read; its end value is expected */
  CCS_COMPLETE = 2,  /* a range has just been completed */
  CCS_START    = 3   /* nothing read yet (start of class or after "&&") */
};
4051
/* Kind of the value most recently read inside a character class. */
enum CCVALTYPE {
  CCV_SB         = 0,  /* single-byte value, stored in the bitset */
  CCV_CODE_POINT = 1,  /* code point, stored in the range buffer */
  CCV_CLASS      = 2   /* a whole class/type was added, no single value */
};
4057
4058 static int
next_state_class(CClassNode * cc,OnigCodePoint * vs,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4059 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
4060 enum CCSTATE* state, ScanEnv* env)
4061 {
4062 int r;
4063
4064 if (*state == CCS_RANGE)
4065 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
4066
4067 if (*state == CCS_VALUE && *type != CCV_CLASS) {
4068 if (*type == CCV_SB)
4069 BITSET_SET_BIT(cc->bs, (int )(*vs));
4070 else if (*type == CCV_CODE_POINT) {
4071 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4072 if (r < 0) return r;
4073 }
4074 }
4075
4076 if (*state != CCS_START)
4077 *state = CCS_VALUE;
4078
4079 *type = CCV_CLASS;
4080 return 0;
4081 }
4082
4083 static int
next_state_val(CClassNode * cc,OnigCodePoint * vs,OnigCodePoint v,int * vs_israw,int v_israw,enum CCVALTYPE intype,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4084 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
4085 int* vs_israw, int v_israw,
4086 enum CCVALTYPE intype, enum CCVALTYPE* type,
4087 enum CCSTATE* state, ScanEnv* env)
4088 {
4089 int r;
4090
4091 switch (*state) {
4092 case CCS_VALUE:
4093 if (*type == CCV_SB)
4094 {
4095 if (*vs > 0xff)
4096 return ONIGERR_INVALID_CODE_POINT_VALUE;
4097 BITSET_SET_BIT(cc->bs, (int )(*vs));
4098 }
4099 else if (*type == CCV_CODE_POINT) {
4100 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4101 if (r < 0) return r;
4102 }
4103 break;
4104
4105 case CCS_RANGE:
4106 if (intype == *type) {
4107 if (intype == CCV_SB) {
4108 if (*vs > 0xff || v > 0xff)
4109 return ONIGERR_INVALID_CODE_POINT_VALUE;
4110
4111 if (*vs > v) {
4112 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4113 goto ccs_range_end;
4114 else
4115 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4116 }
4117 bitset_set_range(cc->bs, (int )*vs, (int )v);
4118 }
4119 else {
4120 r = add_code_range(&(cc->mbuf), env, *vs, v);
4121 if (r < 0) return r;
4122 }
4123 }
4124 else {
4125 #if 0
4126 if (intype == CCV_CODE_POINT && *type == CCV_SB) {
4127 #endif
4128 if (*vs > v) {
4129 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4130 goto ccs_range_end;
4131 else
4132 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4133 }
4134 bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4135 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
4136 if (r < 0) return r;
4137 #if 0
4138 }
4139 else
4140 return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
4141 #endif
4142 }
4143 ccs_range_end:
4144 *state = CCS_COMPLETE;
4145 break;
4146
4147 case CCS_COMPLETE:
4148 case CCS_START:
4149 *state = CCS_VALUE;
4150 break;
4151
4152 default:
4153 break;
4154 }
4155
4156 *vs_israw = v_israw;
4157 *vs = v;
4158 *type = intype;
4159 return 0;
4160 }
4161
4162 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,ScanEnv * env)4163 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4164 ScanEnv* env)
4165 {
4166 int in_esc;
4167 OnigCodePoint code;
4168 OnigEncoding enc = env->enc;
4169 UChar* p = from;
4170
4171 in_esc = 0;
4172 while (! PEND) {
4173 if (ignore_escaped && in_esc) {
4174 in_esc = 0;
4175 }
4176 else {
4177 PFETCH_S(code);
4178 if (code == c) return 1;
4179 if (code == MC_ESC(env->syntax)) in_esc = 1;
4180 }
4181 }
4182 return 0;
4183 }
4184
4185 static int
parse_char_class(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4186 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4187 ScanEnv* env)
4188 {
4189 int r, neg, len, fetched, and_start;
4190 OnigCodePoint v, vs;
4191 UChar *p;
4192 Node* node;
4193 CClassNode *cc, *prev_cc;
4194 CClassNode work_cc;
4195
4196 enum CCSTATE state;
4197 enum CCVALTYPE val_type, in_type;
4198 int val_israw, in_israw;
4199
4200 prev_cc = (CClassNode* )NULL;
4201 *np = NULL_NODE;
4202 r = fetch_token_in_cc(tok, src, end, env);
4203 if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4204 neg = 1;
4205 r = fetch_token_in_cc(tok, src, end, env);
4206 }
4207 else {
4208 neg = 0;
4209 }
4210
4211 if (r < 0) return r;
4212 if (r == TK_CC_CLOSE) {
4213 if (! code_exist_check((OnigCodePoint )']',
4214 *src, env->pattern_end, 1, env))
4215 return ONIGERR_EMPTY_CHAR_CLASS;
4216
4217 CC_ESC_WARN(env, (UChar* )"]");
4218 r = tok->type = TK_CHAR; /* allow []...] */
4219 }
4220
4221 *np = node = node_new_cclass();
4222 CHECK_NULL_RETURN_MEMERR(node);
4223 cc = NCCLASS(node);
4224
4225 and_start = 0;
4226 state = CCS_START;
4227 p = *src;
4228 while (r != TK_CC_CLOSE) {
4229 fetched = 0;
4230 switch (r) {
4231 case TK_CHAR:
4232 len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4233 if (len > 1) {
4234 in_type = CCV_CODE_POINT;
4235 }
4236 else if (len < 0) {
4237 r = len;
4238 goto err;
4239 }
4240 else {
4241 sb_char:
4242 in_type = CCV_SB;
4243 }
4244 v = (OnigCodePoint )tok->u.c;
4245 in_israw = 0;
4246 goto val_entry2;
4247 break;
4248
4249 case TK_RAW_BYTE:
4250 /* tok->base != 0 : octal or hexadec. */
4251 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4252 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4253 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4254 UChar* psave = p;
4255 int i, base = tok->base;
4256
4257 buf[0] = tok->u.c;
4258 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4259 r = fetch_token_in_cc(tok, &p, end, env);
4260 if (r < 0) goto err;
4261 if (r != TK_RAW_BYTE || tok->base != base) {
4262 fetched = 1;
4263 break;
4264 }
4265 buf[i] = tok->u.c;
4266 }
4267
4268 if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4269 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4270 goto err;
4271 }
4272
4273 len = enclen(env->enc, buf);
4274 if (i < len) {
4275 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4276 goto err;
4277 }
4278 else if (i > len) { /* fetch back */
4279 p = psave;
4280 for (i = 1; i < len; i++) {
4281 r = fetch_token_in_cc(tok, &p, end, env);
4282 }
4283 fetched = 0;
4284 }
4285
4286 if (i == 1) {
4287 v = (OnigCodePoint )buf[0];
4288 goto raw_single;
4289 }
4290 else {
4291 v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4292 in_type = CCV_CODE_POINT;
4293 }
4294 }
4295 else {
4296 v = (OnigCodePoint )tok->u.c;
4297 raw_single:
4298 in_type = CCV_SB;
4299 }
4300 in_israw = 1;
4301 goto val_entry2;
4302 break;
4303
4304 case TK_CODE_POINT:
4305 v = tok->u.code;
4306 in_israw = 1;
4307 val_entry:
4308 len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4309 if (len < 0) {
4310 r = len;
4311 goto err;
4312 }
4313 in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4314 val_entry2:
4315 r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4316 &state, env);
4317 if (r != 0) goto err;
4318 break;
4319
4320 case TK_POSIX_BRACKET_OPEN:
4321 r = parse_posix_bracket(cc, &p, end, env);
4322 if (r < 0) goto err;
4323 if (r == 1) { /* is not POSIX bracket */
4324 CC_ESC_WARN(env, (UChar* )"[");
4325 p = tok->backp;
4326 v = (OnigCodePoint )tok->u.c;
4327 in_israw = 0;
4328 goto val_entry;
4329 }
4330 goto next_class;
4331 break;
4332
4333 case TK_CHAR_TYPE:
4334 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
4335 if (r != 0) return r;
4336
4337 next_class:
4338 r = next_state_class(cc, &vs, &val_type, &state, env);
4339 if (r != 0) goto err;
4340 break;
4341
4342 case TK_CHAR_PROPERTY:
4343 {
4344 int ctype;
4345
4346 ctype = fetch_char_property_to_ctype(&p, end, env);
4347 if (ctype < 0) return ctype;
4348 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4349 if (r != 0) return r;
4350 goto next_class;
4351 }
4352 break;
4353
4354 case TK_CC_RANGE:
4355 if (state == CCS_VALUE) {
4356 r = fetch_token_in_cc(tok, &p, end, env);
4357 if (r < 0) goto err;
4358 fetched = 1;
4359 if (r == TK_CC_CLOSE) { /* allow [x-] */
4360 range_end_val:
4361 v = (OnigCodePoint )'-';
4362 in_israw = 0;
4363 goto val_entry;
4364 }
4365 else if (r == TK_CC_AND) {
4366 CC_ESC_WARN(env, (UChar* )"-");
4367 goto range_end_val;
4368 }
4369 state = CCS_RANGE;
4370 }
4371 else if (state == CCS_START) {
4372 /* [-xa] is allowed */
4373 v = (OnigCodePoint )tok->u.c;
4374 in_israw = 0;
4375
4376 r = fetch_token_in_cc(tok, &p, end, env);
4377 if (r < 0) goto err;
4378 fetched = 1;
4379 /* [--x] or [a&&-x] is warned. */
4380 if (r == TK_CC_RANGE || and_start != 0)
4381 CC_ESC_WARN(env, (UChar* )"-");
4382
4383 goto val_entry;
4384 }
4385 else if (state == CCS_RANGE) {
4386 CC_ESC_WARN(env, (UChar* )"-");
4387 goto sb_char; /* [!--x] is allowed */
4388 }
4389 else { /* CCS_COMPLETE */
4390 r = fetch_token_in_cc(tok, &p, end, env);
4391 if (r < 0) goto err;
4392 fetched = 1;
4393 if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4394 else if (r == TK_CC_AND) {
4395 CC_ESC_WARN(env, (UChar* )"-");
4396 goto range_end_val;
4397 }
4398
4399 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4400 CC_ESC_WARN(env, (UChar* )"-");
4401 goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */
4402 }
4403 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4404 goto err;
4405 }
4406 break;
4407
4408 case TK_CC_CC_OPEN: /* [ */
4409 {
4410 Node *anode;
4411 CClassNode* acc;
4412
4413 r = parse_char_class(&anode, tok, &p, end, env);
4414 if (r != 0) goto cc_open_err;
4415 acc = NCCLASS(anode);
4416 r = or_cclass(cc, acc, env->enc);
4417
4418 onig_node_free(anode);
4419 cc_open_err:
4420 if (r != 0) goto err;
4421 }
4422 break;
4423
4424 case TK_CC_AND: /* && */
4425 {
4426 if (state == CCS_VALUE) {
4427 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4428 &val_type, &state, env);
4429 if (r != 0) goto err;
4430 }
4431 /* initialize local variables */
4432 and_start = 1;
4433 state = CCS_START;
4434
4435 if (IS_NOT_NULL(prev_cc)) {
4436 r = and_cclass(prev_cc, cc, env->enc);
4437 if (r != 0) goto err;
4438 bbuf_free(cc->mbuf);
4439 }
4440 else {
4441 prev_cc = cc;
4442 cc = &work_cc;
4443 }
4444 initialize_cclass(cc);
4445 }
4446 break;
4447
4448 case TK_EOT:
4449 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4450 goto err;
4451 break;
4452 default:
4453 r = ONIGERR_PARSER_BUG;
4454 goto err;
4455 break;
4456 }
4457
4458 if (fetched)
4459 r = tok->type;
4460 else {
4461 r = fetch_token_in_cc(tok, &p, end, env);
4462 if (r < 0) goto err;
4463 }
4464 }
4465
4466 if (state == CCS_VALUE) {
4467 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4468 &val_type, &state, env);
4469 if (r != 0) goto err;
4470 }
4471
4472 if (IS_NOT_NULL(prev_cc)) {
4473 r = and_cclass(prev_cc, cc, env->enc);
4474 if (r != 0) goto err;
4475 bbuf_free(cc->mbuf);
4476 cc = prev_cc;
4477 }
4478
4479 if (neg != 0)
4480 NCCLASS_SET_NOT(cc);
4481 else
4482 NCCLASS_CLEAR_NOT(cc);
4483 if (IS_NCCLASS_NOT(cc) &&
4484 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4485 int is_empty;
4486
4487 is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4488 if (is_empty != 0)
4489 BITSET_IS_EMPTY(cc->bs, is_empty);
4490
4491 if (is_empty == 0) {
4492 #define NEWLINE_CODE 0x0a
4493
4494 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4495 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4496 BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4497 else
4498 add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4499 }
4500 }
4501 }
4502 *src = p;
4503 return 0;
4504
4505 err:
4506 if (cc != NCCLASS(*np))
4507 bbuf_free(cc->mbuf);
4508 onig_node_free(*np);
4509 return r;
4510 }
4511
4512 static int parse_subexp(Node** top, OnigToken* tok, int term,
4513 UChar** src, UChar* end, ScanEnv* env);
4514
4515 static int
parse_enclose(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4516 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4517 ScanEnv* env)
4518 {
4519 int r, num;
4520 Node *target;
4521 OnigOptionType option;
4522 OnigCodePoint c;
4523 OnigEncoding enc = env->enc;
4524
4525 #ifdef USE_NAMED_GROUP
4526 int list_capture;
4527 #endif
4528
4529 UChar* p = *src;
4530 PFETCH_READY;
4531
4532 *np = NULL;
4533 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4534
4535 option = env->option;
4536 if (PPEEK_IS('?') &&
4537 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4538 PINC;
4539 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4540
4541 PFETCH(c);
4542 switch (c) {
4543 case ':': /* (?:...) grouping only */
4544 group:
4545 r = fetch_token(tok, &p, end, env);
4546 if (r < 0) return r;
4547 r = parse_subexp(np, tok, term, &p, end, env);
4548 if (r < 0) return r;
4549 *src = p;
4550 return 1; /* group */
4551 break;
4552
4553 case '=':
4554 *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4555 break;
4556 case '!': /* preceding read */
4557 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4558 break;
4559 case '>': /* (?>...) stop backtrack */
4560 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
4561 break;
4562
4563 #ifdef USE_NAMED_GROUP
4564 case '\'':
4565 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4566 goto named_group1;
4567 }
4568 else
4569 return ONIGERR_UNDEFINED_GROUP_OPTION;
4570 break;
4571 #endif
4572
4573 case '<': /* look behind (?<=...), (?<!...) */
4574 PFETCH(c);
4575 if (c == '=')
4576 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4577 else if (c == '!')
4578 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4579 #ifdef USE_NAMED_GROUP
4580 else {
4581 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4582 UChar *name;
4583 UChar *name_end;
4584
4585 PUNFETCH;
4586 c = '<';
4587
4588 named_group1:
4589 list_capture = 0;
4590
4591 named_group2:
4592 name = p;
4593 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
4594 if (r < 0) return r;
4595
4596 num = scan_env_add_mem_entry(env);
4597 if (num < 0) return num;
4598 if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
4599 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4600
4601 r = name_add(env->reg, name, name_end, num, env);
4602 if (r != 0) return r;
4603 *np = node_new_enclose_memory(env->option, 1);
4604 CHECK_NULL_RETURN_MEMERR(*np);
4605 NENCLOSE(*np)->regnum = num;
4606 if (list_capture != 0)
4607 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4608 env->num_named++;
4609 }
4610 else {
4611 return ONIGERR_UNDEFINED_GROUP_OPTION;
4612 }
4613 }
4614 #else
4615 else {
4616 return ONIGERR_UNDEFINED_GROUP_OPTION;
4617 }
4618 #endif
4619 break;
4620
4621 case '@':
4622 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4623 #ifdef USE_NAMED_GROUP
4624 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4625 PFETCH(c);
4626 if (c == '<' || c == '\'') {
4627 list_capture = 1;
4628 goto named_group2; /* (?@<name>...) */
4629 }
4630 PUNFETCH;
4631 }
4632 #endif
4633 *np = node_new_enclose_memory(env->option, 0);
4634 CHECK_NULL_RETURN_MEMERR(*np);
4635 num = scan_env_add_mem_entry(env);
4636 if (num < 0) {
4637 onig_node_free(*np);
4638 return num;
4639 }
4640 else if (num >= (int )BIT_STATUS_BITS_NUM) {
4641 onig_node_free(*np);
4642 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4643 }
4644 NENCLOSE(*np)->regnum = num;
4645 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4646 }
4647 else {
4648 return ONIGERR_UNDEFINED_GROUP_OPTION;
4649 }
4650 break;
4651
4652 #ifdef USE_POSIXLINE_OPTION
4653 case 'p':
4654 #endif
4655 case '-': case 'i': case 'm': case 's': case 'x':
4656 {
4657 int neg = 0;
4658
4659 while (1) {
4660 switch (c) {
4661 case ':':
4662 case ')':
4663 break;
4664
4665 case '-': neg = 1; break;
4666 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
4667 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4668 case 's':
4669 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4670 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4671 }
4672 else
4673 return ONIGERR_UNDEFINED_GROUP_OPTION;
4674 break;
4675
4676 case 'm':
4677 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4678 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4679 }
4680 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4681 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4682 }
4683 else
4684 return ONIGERR_UNDEFINED_GROUP_OPTION;
4685 break;
4686 #ifdef USE_POSIXLINE_OPTION
4687 case 'p':
4688 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4689 break;
4690 #endif
4691 default:
4692 return ONIGERR_UNDEFINED_GROUP_OPTION;
4693 }
4694
4695 if (c == ')') {
4696 *np = node_new_option(option);
4697 CHECK_NULL_RETURN_MEMERR(*np);
4698 *src = p;
4699 return 2; /* option only */
4700 }
4701 else if (c == ':') {
4702 OnigOptionType prev = env->option;
4703
4704 env->option = option;
4705 r = fetch_token(tok, &p, end, env);
4706 if (r < 0) return r;
4707 r = parse_subexp(&target, tok, term, &p, end, env);
4708 env->option = prev;
4709 if (r < 0) return r;
4710 *np = node_new_option(option);
4711 CHECK_NULL_RETURN_MEMERR(*np);
4712 NENCLOSE(*np)->target = target;
4713 *src = p;
4714 return 0;
4715 }
4716
4717 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4718 PFETCH(c);
4719 }
4720 }
4721 break;
4722
4723 default:
4724 return ONIGERR_UNDEFINED_GROUP_OPTION;
4725 }
4726 }
4727 else {
4728 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4729 goto group;
4730
4731 *np = node_new_enclose_memory(env->option, 0);
4732 CHECK_NULL_RETURN_MEMERR(*np);
4733 num = scan_env_add_mem_entry(env);
4734 if (num < 0) return num;
4735 NENCLOSE(*np)->regnum = num;
4736 }
4737
4738 CHECK_NULL_RETURN_MEMERR(*np);
4739 r = fetch_token(tok, &p, end, env);
4740 if (r < 0) return r;
4741 r = parse_subexp(&target, tok, term, &p, end, env);
4742 if (r < 0) return r;
4743
4744 if (NTYPE(*np) == NT_ANCHOR)
4745 NANCHOR(*np)->target = target;
4746 else {
4747 NENCLOSE(*np)->target = target;
4748 if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
4749 /* Don't move this to previous of parse_subexp() */
4750 r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
4751 if (r != 0) return r;
4752 }
4753 }
4754
4755 *src = p;
4756 return 0;
4757 }
4758
/* Quantifier spellings used in warning messages; indexed by the value that
   set_quantifier() obtains from popular_quantifier_num(). */
static const char* PopularQStr[] = {
  "?",    /* 0 */
  "*",    /* 1 */
  "+",    /* 2 */
  "??",   /* 3 */
  "*?",   /* 4 */
  "+?"    /* 5 */
};
4762
/* Replacement texts used in warning messages; indexed by an entry of
   ReduceTypeTable (see set_quantifier()). */
static const char* ReduceQStr[] = {
  "",          /* 0 */
  "",          /* 1 */
  "*",         /* 2 */
  "*?",        /* 3 */
  "??",        /* 4 */
  "+ and ??",  /* 5 */
  "+? and ?"   /* 6 */
};
4766
4767 static int
set_quantifier(Node * qnode,Node * target,int group,ScanEnv * env)4768 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4769 {
4770 QtfrNode* qn;
4771
4772 qn = NQTFR(qnode);
4773 if (qn->lower == 1 && qn->upper == 1) {
4774 return 1;
4775 }
4776
4777 switch (NTYPE(target)) {
4778 case NT_STR:
4779 if (! group) {
4780 StrNode* sn = NSTR(target);
4781 if (str_node_can_be_split(sn, env->enc)) {
4782 Node* n = str_node_split_last_char(sn, env->enc);
4783 if (IS_NOT_NULL(n)) {
4784 qn->target = n;
4785 return 2;
4786 }
4787 }
4788 }
4789 break;
4790
4791 case NT_QTFR:
4792 { /* check redundant double repeat. */
4793 /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
4794 QtfrNode* qnt = NQTFR(target);
4795 int nestq_num = popular_quantifier_num(qn);
4796 int targetq_num = popular_quantifier_num(qnt);
4797
4798 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
4799 if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4800 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4801 UChar buf[WARN_BUFSIZE];
4802
4803 switch(ReduceTypeTable[targetq_num][nestq_num]) {
4804 case RQ_ASIS:
4805 break;
4806
4807 case RQ_DEL:
4808 if (onig_verb_warn != onig_null_warn) {
4809 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4810 env->pattern, env->pattern_end,
4811 (UChar* )"redundant nested repeat operator");
4812 (*onig_verb_warn)((char* )buf);
4813 }
4814 goto warn_exit;
4815 break;
4816
4817 default:
4818 if (onig_verb_warn != onig_null_warn) {
4819 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4820 env->pattern, env->pattern_end,
4821 (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4822 PopularQStr[targetq_num], PopularQStr[nestq_num],
4823 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4824 (*onig_verb_warn)((char* )buf);
4825 }
4826 goto warn_exit;
4827 break;
4828 }
4829 }
4830
4831 warn_exit:
4832 #endif
4833 if (targetq_num >= 0) {
4834 if (nestq_num >= 0) {
4835 onig_reduce_nested_quantifier(qnode, target);
4836 goto q_exit;
4837 }
4838 else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
4839 /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
4840 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4841 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4842 }
4843 }
4844 }
4845 }
4846 break;
4847
4848 default:
4849 break;
4850 }
4851
4852 qn->target = target;
4853 q_exit:
4854 return 0;
4855 }
4856
4857
4858 #ifdef USE_SHARED_CCLASS_TABLE
4859
4860 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8
4861
4862 /* for ctype node hash table */
4863
4864 typedef struct {
4865 OnigEncoding enc;
4866 int not;
4867 int type;
4868 } type_cclass_key;
4869
/* Key comparison: returns 0 when type, enc and not are all equal, 1 otherwise. */
static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
{
  int same = (x->type == y->type) &&
             (x->enc  == y->enc)  &&
             (x->not  == y->not);

  return same ? 0 : 1;
}
4877
/*
 * Hash a key: the bytes of `enc` and `type` are folded with multiplier 997,
 * `not` is added, and the upper bits are mixed in.
 *
 * The accumulation is done in unsigned arithmetic, where wrap-around is
 * well defined; with a signed accumulator the repeated multiplication
 * overflows, which is undefined behaviour.  Keys that compare equal still
 * hash equal.
 */
static int type_cclass_hash(type_cclass_key* key)
{
  int i;
  unsigned int val;
  const UChar *p;

  val = 0;

  p = (const UChar* )&(key->enc);
  for (i = 0; i < (int )sizeof(key->enc); i++) {
    val = val * 997u + (unsigned int )*p++;
  }

  p = (const UChar* )(&key->type);
  for (i = 0; i < (int )sizeof(key->type); i++) {
    val = val * 997u + (unsigned int )*p++;
  }

  val += (unsigned int )key->not;
  return (int )(val + (val >> 5));
}
4898
4899 static struct st_hash_type type_type_cclass_hash = {
4900 type_cclass_cmp,
4901 type_cclass_hash,
4902 };
4903
4904 static st_table* OnigTypeCClassTable;
4905
4906
4907 static int
i_free_shared_class(type_cclass_key * key,Node * node,void * arg ARG_UNUSED)4908 i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
4909 {
4910 if (IS_NOT_NULL(node)) {
4911 CClassNode* cc = NCCLASS(node);
4912 if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
4913 xfree(node);
4914 }
4915
4916 if (IS_NOT_NULL(key)) xfree(key);
4917 return ST_DELETE;
4918 }
4919
4920 extern int
onig_free_shared_cclass_table(void)4921 onig_free_shared_cclass_table(void)
4922 {
4923 if (IS_NOT_NULL(OnigTypeCClassTable)) {
4924 onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
4925 onig_st_free_table(OnigTypeCClassTable);
4926 OnigTypeCClassTable = NULL;
4927 }
4928
4929 return 0;
4930 }
4931
4932 #endif /* USE_SHARED_CCLASS_TABLE */
4933
4934
4935 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4936 static int
clear_not_flag_cclass(CClassNode * cc,OnigEncoding enc)4937 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
4938 {
4939 BBuf *tbuf;
4940 int r;
4941
4942 if (IS_NCCLASS_NOT(cc)) {
4943 bitset_invert(cc->bs);
4944
4945 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4946 r = not_code_range_buf(enc, cc->mbuf, &tbuf);
4947 if (r != 0) return r;
4948
4949 bbuf_free(cc->mbuf);
4950 cc->mbuf = tbuf;
4951 }
4952
4953 NCCLASS_CLEAR_NOT(cc);
4954 }
4955
4956 return 0;
4957 }
4958 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
4959
4960 typedef struct {
4961 ScanEnv* env;
4962 CClassNode* cc;
4963 Node* alt_root;
4964 Node** ptail;
4965 } IApplyCaseFoldArg;
4966
4967 static int
i_apply_case_fold(OnigCodePoint from,OnigCodePoint to[],int to_len,void * arg)4968 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
4969 int to_len, void* arg)
4970 {
4971 IApplyCaseFoldArg* iarg;
4972 ScanEnv* env;
4973 CClassNode* cc;
4974 BitSetRef bs;
4975
4976 iarg = (IApplyCaseFoldArg* )arg;
4977 env = iarg->env;
4978 cc = iarg->cc;
4979 bs = cc->bs;
4980
4981 if (to_len == 1) {
4982 int is_in = onig_is_code_in_cc(env->enc, from, cc);
4983 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4984 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
4985 (is_in == 0 && IS_NCCLASS_NOT(cc))) {
4986 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4987 add_code_range(&(cc->mbuf), env, *to, *to);
4988 }
4989 else {
4990 BITSET_SET_BIT(bs, *to);
4991 }
4992 }
4993 #else
4994 if (is_in != 0) {
4995 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4996 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
4997 add_code_range(&(cc->mbuf), env, *to, *to);
4998 }
4999 else {
5000 if (IS_NCCLASS_NOT(cc)) {
5001 BITSET_CLEAR_BIT(bs, *to);
5002 }
5003 else
5004 BITSET_SET_BIT(bs, *to);
5005 }
5006 }
5007 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
5008 }
5009 else {
5010 int r, i, len;
5011 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5012 Node *snode = NULL_NODE;
5013
5014 if (onig_is_code_in_cc(env->enc, from, cc)
5015 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5016 && !IS_NCCLASS_NOT(cc)
5017 #endif
5018 ) {
5019 for (i = 0; i < to_len; i++) {
5020 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
5021 if (i == 0) {
5022 snode = onig_node_new_str(buf, buf + len);
5023 CHECK_NULL_RETURN_MEMERR(snode);
5024
5025 /* char-class expanded multi-char only
5026 compare with string folded at match time. */
5027 NSTRING_SET_AMBIG(snode);
5028 }
5029 else {
5030 r = onig_node_str_cat(snode, buf, buf + len);
5031 if (r < 0) {
5032 onig_node_free(snode);
5033 return r;
5034 }
5035 }
5036 }
5037
5038 *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
5039 CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
5040 iarg->ptail = &(NCDR((*(iarg->ptail))));
5041 }
5042 }
5043
5044 return 0;
5045 }
5046
5047 static int
parse_exp(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5048 parse_exp(Node** np, OnigToken* tok, int term,
5049 UChar** src, UChar* end, ScanEnv* env)
5050 {
5051 int r, len, group = 0;
5052 Node* qn;
5053 Node** targetp;
5054
5055 *np = NULL;
5056 if (tok->type == (enum TokenSyms )term)
5057 goto end_of_token;
5058
5059 switch (tok->type) {
5060 case TK_ALT:
5061 case TK_EOT:
5062 end_of_token:
5063 *np = node_new_empty();
5064 return tok->type;
5065 break;
5066
5067 case TK_SUBEXP_OPEN:
5068 r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
5069 if (r < 0) return r;
5070 if (r == 1) group = 1;
5071 else if (r == 2) { /* option only */
5072 Node* target;
5073 OnigOptionType prev = env->option;
5074
5075 env->option = NENCLOSE(*np)->option;
5076 r = fetch_token(tok, src, end, env);
5077 if (r < 0) return r;
5078 r = parse_subexp(&target, tok, term, src, end, env);
5079 env->option = prev;
5080 if (r < 0) return r;
5081 NENCLOSE(*np)->target = target;
5082 return tok->type;
5083 }
5084 break;
5085
5086 case TK_SUBEXP_CLOSE:
5087 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
5088 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
5089
5090 if (tok->escaped) goto tk_raw_byte;
5091 else goto tk_byte;
5092 break;
5093
5094 case TK_STRING:
5095 tk_byte:
5096 {
5097 *np = node_new_str(tok->backp, *src);
5098 CHECK_NULL_RETURN_MEMERR(*np);
5099
5100 while (1) {
5101 r = fetch_token(tok, src, end, env);
5102 if (r < 0) return r;
5103 if (r != TK_STRING) break;
5104
5105 r = onig_node_str_cat(*np, tok->backp, *src);
5106 if (r < 0) return r;
5107 }
5108
5109 string_end:
5110 targetp = np;
5111 goto repeat;
5112 }
5113 break;
5114
5115 case TK_RAW_BYTE:
5116 tk_raw_byte:
5117 {
5118 *np = node_new_str_raw_char((UChar )tok->u.c);
5119 CHECK_NULL_RETURN_MEMERR(*np);
5120 len = 1;
5121 while (1) {
5122 if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
5123 if (len == enclen(env->enc, NSTR(*np)->s)) {
5124 r = fetch_token(tok, src, end, env);
5125 NSTRING_CLEAR_RAW(*np);
5126 goto string_end;
5127 }
5128 }
5129
5130 r = fetch_token(tok, src, end, env);
5131 if (r < 0) return r;
5132 if (r != TK_RAW_BYTE) {
5133 /* Don't use this, it is wrong for little endian encodings. */
5134 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
5135 int rem;
5136 if (len < ONIGENC_MBC_MINLEN(env->enc)) {
5137 rem = ONIGENC_MBC_MINLEN(env->enc) - len;
5138 (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
5139 if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
5140 NSTRING_CLEAR_RAW(*np);
5141 goto string_end;
5142 }
5143 }
5144 #endif
5145 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
5146 }
5147
5148 r = node_str_cat_char(*np, (UChar )tok->u.c);
5149 if (r < 0) return r;
5150
5151 len++;
5152 }
5153 }
5154 break;
5155
5156 case TK_CODE_POINT:
5157 {
5158 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5159 int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
5160 if (num < 0) return num;
5161 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
5162 *np = node_new_str_raw(buf, buf + num);
5163 #else
5164 *np = node_new_str(buf, buf + num);
5165 #endif
5166 CHECK_NULL_RETURN_MEMERR(*np);
5167 }
5168 break;
5169
5170 case TK_QUOTE_OPEN:
5171 {
5172 OnigCodePoint end_op[2];
5173 UChar *qstart, *qend, *nextp;
5174
5175 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
5176 end_op[1] = (OnigCodePoint )'E';
5177 qstart = *src;
5178 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
5179 if (IS_NULL(qend)) {
5180 nextp = qend = end;
5181 }
5182 *np = node_new_str(qstart, qend);
5183 CHECK_NULL_RETURN_MEMERR(*np);
5184 *src = nextp;
5185 }
5186 break;
5187
5188 case TK_CHAR_TYPE:
5189 {
5190 switch (tok->u.prop.ctype) {
5191 case ONIGENC_CTYPE_WORD:
5192 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
5193 CHECK_NULL_RETURN_MEMERR(*np);
5194 break;
5195
5196 case ONIGENC_CTYPE_SPACE:
5197 case ONIGENC_CTYPE_DIGIT:
5198 case ONIGENC_CTYPE_XDIGIT:
5199 {
5200 CClassNode* cc;
5201
5202 #ifdef USE_SHARED_CCLASS_TABLE
5203 const OnigCodePoint *mbr;
5204 OnigCodePoint sb_out;
5205
5206 r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
5207 &sb_out, &mbr);
5208 if (r == 0 &&
5209 ONIGENC_CODE_RANGE_NUM(mbr)
5210 >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
5211 type_cclass_key key;
5212 type_cclass_key* new_key;
5213
5214 key.enc = env->enc;
5215 key.not = tok->u.prop.not;
5216 key.type = tok->u.prop.ctype;
5217
5218 THREAD_ATOMIC_START;
5219
5220 if (IS_NULL(OnigTypeCClassTable)) {
5221 OnigTypeCClassTable
5222 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
5223 if (IS_NULL(OnigTypeCClassTable)) {
5224 THREAD_ATOMIC_END;
5225 return ONIGERR_MEMORY;
5226 }
5227 }
5228 else {
5229 if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
5230 (st_data_t* )np)) {
5231 THREAD_ATOMIC_END;
5232 break;
5233 }
5234 }
5235
5236 *np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
5237 sb_out, mbr);
5238 if (IS_NULL(*np)) {
5239 THREAD_ATOMIC_END;
5240 return ONIGERR_MEMORY;
5241 }
5242
5243 cc = NCCLASS(*np);
5244 NCCLASS_SET_SHARE(cc);
5245 new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
5246 xmemcpy(new_key, &key, sizeof(type_cclass_key));
5247 onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
5248 (st_data_t )*np);
5249
5250 THREAD_ATOMIC_END;
5251 }
5252 else {
5253 #endif
5254 *np = node_new_cclass();
5255 CHECK_NULL_RETURN_MEMERR(*np);
5256 cc = NCCLASS(*np);
5257 add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
5258 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
5259 #ifdef USE_SHARED_CCLASS_TABLE
5260 }
5261 #endif
5262 }
5263 break;
5264
5265 default:
5266 return ONIGERR_PARSER_BUG;
5267 break;
5268 }
5269 }
5270 break;
5271
5272 case TK_CHAR_PROPERTY:
5273 r = parse_char_property(np, tok, src, end, env);
5274 if (r != 0) return r;
5275 break;
5276
5277 case TK_CC_OPEN:
5278 {
5279 CClassNode* cc;
5280
5281 r = parse_char_class(np, tok, src, end, env);
5282 if (r != 0) return r;
5283
5284 cc = NCCLASS(*np);
5285 if (IS_IGNORECASE(env->option)) {
5286 IApplyCaseFoldArg iarg;
5287
5288 iarg.env = env;
5289 iarg.cc = cc;
5290 iarg.alt_root = NULL_NODE;
5291 iarg.ptail = &(iarg.alt_root);
5292
5293 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
5294 i_apply_case_fold, &iarg);
5295 if (r != 0) {
5296 onig_node_free(iarg.alt_root);
5297 return r;
5298 }
5299 if (IS_NOT_NULL(iarg.alt_root)) {
5300 Node* work = onig_node_new_alt(*np, iarg.alt_root);
5301 if (IS_NULL(work)) {
5302 onig_node_free(iarg.alt_root);
5303 return ONIGERR_MEMORY;
5304 }
5305 *np = work;
5306 }
5307 }
5308 }
5309 break;
5310
5311 case TK_ANYCHAR:
5312 *np = node_new_anychar();
5313 CHECK_NULL_RETURN_MEMERR(*np);
5314 break;
5315
5316 case TK_ANYCHAR_ANYTIME:
5317 *np = node_new_anychar();
5318 CHECK_NULL_RETURN_MEMERR(*np);
5319 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5320 CHECK_NULL_RETURN_MEMERR(qn);
5321 NQTFR(qn)->target = *np;
5322 *np = qn;
5323 break;
5324
5325 case TK_BACKREF:
5326 len = tok->u.backref.num;
5327 *np = node_new_backref(len,
5328 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5329 tok->u.backref.by_name,
5330 #ifdef USE_BACKREF_WITH_LEVEL
5331 tok->u.backref.exist_level,
5332 tok->u.backref.level,
5333 #endif
5334 env);
5335 CHECK_NULL_RETURN_MEMERR(*np);
5336 break;
5337
5338 #ifdef USE_SUBEXP_CALL
5339 case TK_CALL:
5340 {
5341 int gnum = tok->u.call.gnum;
5342
5343 if (gnum < 0) {
5344 gnum = BACKREF_REL_TO_ABS(gnum, env);
5345 if (gnum <= 0)
5346 return ONIGERR_INVALID_BACKREF;
5347 }
5348 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
5349 CHECK_NULL_RETURN_MEMERR(*np);
5350 env->num_call++;
5351 }
5352 break;
5353 #endif
5354
5355 case TK_ANCHOR:
5356 *np = onig_node_new_anchor(tok->u.anchor);
5357 break;
5358
5359 case TK_OP_REPEAT:
5360 case TK_INTERVAL:
5361 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5362 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5363 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5364 else
5365 *np = node_new_empty();
5366 }
5367 else {
5368 goto tk_byte;
5369 }
5370 break;
5371
5372 default:
5373 return ONIGERR_PARSER_BUG;
5374 break;
5375 }
5376
5377 {
5378 targetp = np;
5379
5380 re_entry:
5381 r = fetch_token(tok, src, end, env);
5382 if (r < 0) return r;
5383
5384 repeat:
5385 if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5386 if (is_invalid_quantifier_target(*targetp))
5387 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5388
5389 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5390 (r == TK_INTERVAL ? 1 : 0));
5391 CHECK_NULL_RETURN_MEMERR(qn);
5392 NQTFR(qn)->greedy = tok->u.repeat.greedy;
5393 r = set_quantifier(qn, *targetp, group, env);
5394 if (r < 0) {
5395 onig_node_free(qn);
5396 return r;
5397 }
5398
5399 if (tok->u.repeat.possessive != 0) {
5400 Node* en;
5401 en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5402 if (IS_NULL(en)) {
5403 onig_node_free(qn);
5404 return ONIGERR_MEMORY;
5405 }
5406 NENCLOSE(en)->target = qn;
5407 qn = en;
5408 }
5409
5410 if (r == 0) {
5411 *targetp = qn;
5412 }
5413 else if (r == 1) {
5414 onig_node_free(qn);
5415 }
5416 else if (r == 2) { /* split case: /abc+/ */
5417 Node *tmp;
5418
5419 *targetp = node_new_list(*targetp, NULL);
5420 if (IS_NULL(*targetp)) {
5421 onig_node_free(qn);
5422 return ONIGERR_MEMORY;
5423 }
5424 tmp = NCDR(*targetp) = node_new_list(qn, NULL);
5425 if (IS_NULL(tmp)) {
5426 onig_node_free(qn);
5427 return ONIGERR_MEMORY;
5428 }
5429 targetp = &(NCAR(tmp));
5430 }
5431 goto re_entry;
5432 }
5433 }
5434
5435 return r;
5436 }
5437
5438 static int
parse_branch(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5439 parse_branch(Node** top, OnigToken* tok, int term,
5440 UChar** src, UChar* end, ScanEnv* env)
5441 {
5442 int r;
5443 Node *node, **headp;
5444
5445 *top = NULL;
5446 r = parse_exp(&node, tok, term, src, end, env);
5447 if (r < 0) return r;
5448
5449 if (r == TK_EOT || r == term || r == TK_ALT) {
5450 *top = node;
5451 }
5452 else {
5453 *top = node_new_list(node, NULL);
5454 headp = &(NCDR(*top));
5455 while (r != TK_EOT && r != term && r != TK_ALT) {
5456 r = parse_exp(&node, tok, term, src, end, env);
5457 if (r < 0) return r;
5458
5459 if (NTYPE(node) == NT_LIST) {
5460 *headp = node;
5461 while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
5462 headp = &(NCDR(node));
5463 }
5464 else {
5465 *headp = node_new_list(node, NULL);
5466 headp = &(NCDR(*headp));
5467 }
5468 }
5469 }
5470
5471 return r;
5472 }
5473
5474 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
5475 static int
parse_subexp(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5476 parse_subexp(Node** top, OnigToken* tok, int term,
5477 UChar** src, UChar* end, ScanEnv* env)
5478 {
5479 int r;
5480 Node *node, **headp;
5481
5482 *top = NULL;
5483 r = parse_branch(&node, tok, term, src, end, env);
5484 if (r < 0) {
5485 onig_node_free(node);
5486 return r;
5487 }
5488
5489 if (r == term) {
5490 *top = node;
5491 }
5492 else if (r == TK_ALT) {
5493 *top = onig_node_new_alt(node, NULL);
5494 headp = &(NCDR(*top));
5495 while (r == TK_ALT) {
5496 r = fetch_token(tok, src, end, env);
5497 if (r < 0) return r;
5498 r = parse_branch(&node, tok, term, src, end, env);
5499 if (r < 0) return r;
5500
5501 *headp = onig_node_new_alt(node, NULL);
5502 headp = &(NCDR(*headp));
5503 }
5504
5505 if (tok->type != (enum TokenSyms )term)
5506 goto err;
5507 }
5508 else {
5509 err:
5510 if (term == TK_SUBEXP_CLOSE)
5511 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5512 else
5513 return ONIGERR_PARSER_BUG;
5514 }
5515
5516 return r;
5517 }
5518
5519 static int
parse_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)5520 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5521 {
5522 int r;
5523 OnigToken tok;
5524
5525 r = fetch_token(&tok, src, end, env);
5526 if (r < 0) return r;
5527 r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5528 if (r < 0) return r;
5529 return 0;
5530 }
5531
5532 extern int
onig_parse_make_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)5533 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
5534 regex_t* reg, ScanEnv* env)
5535 {
5536 int r;
5537 UChar* p;
5538
5539 #ifdef USE_NAMED_GROUP
5540 names_clear(reg);
5541 #endif
5542
5543 scan_env_clear(env);
5544 env->option = reg->options;
5545 env->case_fold_flag = reg->case_fold_flag;
5546 env->enc = reg->enc;
5547 env->syntax = reg->syntax;
5548 env->pattern = (UChar* )pattern;
5549 env->pattern_end = (UChar* )end;
5550 env->reg = reg;
5551
5552 *root = NULL;
5553 p = (UChar* )pattern;
5554 r = parse_regexp(root, &p, (UChar* )end, env);
5555 reg->num_mem = env->num_mem;
5556 return r;
5557 }
5558
5559 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode ARG_UNUSED,UChar * arg,UChar * arg_end)5560 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
5561 UChar* arg, UChar* arg_end)
5562 {
5563 env->error = arg;
5564 env->error_end = arg_end;
5565 }
5566