1 /**********************************************************************
2 regparse.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2019 K.Kosako
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regparse.h"
31 #include "st.h"
32
33 #ifdef DEBUG_NODE_FREE
34 #include <stdio.h>
35 #endif
36
#define INIT_TAG_NAMES_ALLOC_NUM   5

#define WARN_BUFSIZE    256

#define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS

/*
 * Character predicates for callout names and callout tag names:
 * true for ASCII letters, digits and '_'.
 * The argument is parenthesized at every use so that an expression argument
 * (e.g. a conditional expression) is grouped correctly.
 * NOTE: the argument is evaluated several times; do not pass an expression
 * with side effects.
 */
#define IS_ALLOWED_CODE_IN_CALLOUT_NAME(c) \
  (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || ((c) >= '0' && (c) <= '9') || (c) == '_' /* || (c) == '!' */)
#define IS_ALLOWED_CODE_IN_CALLOUT_TAG_NAME(c) \
  (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || ((c) >= '0' && (c) <= '9') || (c) == '_')
47
48
/*
 * Syntax definition for the native Oniguruma flavour.
 * The initializer lists, in order: the ONIG_SYN_OP_* operator flags,
 * the ONIG_SYN_OP2_* operator flags, the syntax behaviour flags,
 * the default option set, and the meta character table.
 */
OnigSyntaxType OnigSyntaxOniguruma = {
  /* ONIG_SYN_OP_* flags: the GNU regex set plus the listed escapes,
     with ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END masked out */
  (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
     ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
     ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_O_BRACE_OCTAL |
     ONIG_SYN_OP_ESC_CONTROL_CHARS |
     ONIG_SYN_OP_ESC_C_CONTROL )
   & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
  /* ONIG_SYN_OP2_* flags (includes the callout-related flags) */
  , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
      ONIG_SYN_OP2_OPTION_ONIGURUMA |
      ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
      ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE |
      ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP |
      ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS |
      ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME    |
      ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT |
      ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE |
      ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT |
      ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP |
      ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
      ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  |
      ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
      ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
      ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
      ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
      ONIG_SYN_OP2_ESC_H_XDIGIT | ONIG_SYN_OP2_ESC_U_HEX4 )
  /* syntax behaviour flags */
  , ( SYN_GNU_REGEX_BV |
      ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
      ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
      ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
      ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
      ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
      ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC |
      ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
      ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
  /* default options */
  , ONIG_OPTION_NONE
  ,
  /* meta character table: only the escape character is effective */
  {
      (OnigCodePoint )'\\'                       /* esc */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
  }
};
94
/*
 * Syntax definition for the Ruby flavour.
 * The initializer lists, in order: the ONIG_SYN_OP_* operator flags,
 * the ONIG_SYN_OP2_* operator flags, the syntax behaviour flags,
 * the default option set, and the meta character table.
 */
OnigSyntaxType OnigSyntaxRuby = {
  /* ONIG_SYN_OP_* flags: the GNU regex set plus the listed escapes,
     with ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END masked out */
  (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
     ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
     ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_O_BRACE_OCTAL |
     ONIG_SYN_OP_ESC_CONTROL_CHARS |
     ONIG_SYN_OP_ESC_C_CONTROL )
   & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
  /* ONIG_SYN_OP2_* flags (Ruby option syntax; no callout flags listed) */
  , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
      ONIG_SYN_OP2_OPTION_RUBY |
      ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
      ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE |
      ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP |
      ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT |
      ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE |
      ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP |
      ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
      ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  |
      ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
      ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
      ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
      ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
      ONIG_SYN_OP2_ESC_H_XDIGIT | ONIG_SYN_OP2_ESC_U_HEX4 )
  /* syntax behaviour flags */
  , ( SYN_GNU_REGEX_BV |
      ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
      ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
      ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
      ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
      ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
      ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
      ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
  /* default options */
  , ONIG_OPTION_NONE
  ,
  /* meta character table: only the escape character is effective */
  {
      (OnigCodePoint )'\\'                       /* esc */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
  }
};
136
/* Default syntax pointer; initialized to ONIG_SYNTAX_ONIGURUMA. */
OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_ONIGURUMA;
138
/* Warning handler that ignores its message; used as the "no handler" value. */
extern void
onig_null_warn(const char* s ARG_UNUSED)
{
}
140
/* Current handler for ordinary warnings: DEFAULT_WARN_FUNCTION when the
   build defines it, otherwise the do-nothing onig_null_warn. */
#ifdef DEFAULT_WARN_FUNCTION
static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
#else
static OnigWarnFunc onig_warn = onig_null_warn;
#endif

/* Current handler for verbose warnings: DEFAULT_VERB_WARN_FUNCTION when the
   build defines it, otherwise the do-nothing onig_null_warn. */
#ifdef DEFAULT_VERB_WARN_FUNCTION
static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
#else
static OnigWarnFunc onig_verb_warn = onig_null_warn;
#endif
152
/*
 * Install the handler for ordinary warnings.
 * A null handler is stored as onig_null_warn, because onig_warning() only
 * skips the call when the handler equals onig_null_warn and would otherwise
 * call through a null function pointer.
 */
extern void
onig_set_warn_func(OnigWarnFunc f)
{
  if (f == 0)
    f = onig_null_warn;

  onig_warn = f;
}
157
/*
 * Install the handler for verbose warnings.
 * A null handler is stored as onig_null_warn so that the stored pointer is
 * always callable.
 */
extern void
onig_set_verb_warn_func(OnigWarnFunc f)
{
  if (f == 0)
    f = onig_null_warn;

  onig_verb_warn = f;
}
162
163 extern void
onig_warning(const char * s)164 onig_warning(const char* s)
165 {
166 if (onig_warn == onig_null_warn) return ;
167
168 (*onig_warn)(s);
169 }
170
#define DEFAULT_MAX_CAPTURE_NUM   32767

/* Upper limit for the number of capture groups (process-wide setting). */
static int MaxCaptureNum = DEFAULT_MAX_CAPTURE_NUM;

/*
 * Set the capture-number limit.
 * Returns 0 on success; returns -1 and leaves the limit unchanged when
 * num is negative.
 */
extern int
onig_set_capture_num_limit(int num)
{
  if (num >= 0) {
    MaxCaptureNum = num;
    return 0;
  }

  return -1;
}
183
184 static unsigned int ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
185
186 extern unsigned int
onig_get_parse_depth_limit(void)187 onig_get_parse_depth_limit(void)
188 {
189 return ParseDepthLimit;
190 }
191
192 extern int
onig_set_parse_depth_limit(unsigned int depth)193 onig_set_parse_depth_limit(unsigned int depth)
194 {
195 if (depth == 0)
196 ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
197 else
198 ParseDepthLimit = depth;
199 return 0;
200 }
201
/*
 * INC_PARSE_DEPTH(d): increment the depth counter d and make the enclosing
 * function return ONIGERR_PARSE_DEPTH_LIMIT_OVER when d exceeds
 * ParseDepthLimit.  With ONIG_DEBUG_PARSE the maximum depth reached is also
 * recorded in env->max_parse_depth (an `env` must be in scope).
 * d is evaluated more than once and must be an lvalue without side effects.
 */
#ifdef ONIG_DEBUG_PARSE
#define INC_PARSE_DEPTH(d) do {\
  (d)++;\
  if (env->max_parse_depth < (d)) env->max_parse_depth = (d);\
  if ((d) > ParseDepthLimit) \
    return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\
} while (0)
#else
#define INC_PARSE_DEPTH(d) do {\
  (d)++;\
  if ((d) > ParseDepthLimit) \
    return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\
} while (0)
#endif

/* DEC_PARSE_DEPTH(d): decrement the depth counter d. */
#define DEC_PARSE_DEPTH(d)  ((d)--)
218
219
220 static int
bbuf_init(BBuf * buf,int size)221 bbuf_init(BBuf* buf, int size)
222 {
223 if (size <= 0) {
224 size = 0;
225 buf->p = NULL;
226 }
227 else {
228 buf->p = (UChar* )xmalloc(size);
229 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
230 }
231
232 buf->alloc = size;
233 buf->used = 0;
234 return 0;
235 }
236
237 static void
bbuf_free(BBuf * bbuf)238 bbuf_free(BBuf* bbuf)
239 {
240 if (IS_NOT_NULL(bbuf)) {
241 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
242 xfree(bbuf);
243 }
244 }
245
246 static int
bbuf_clone(BBuf ** rto,BBuf * from)247 bbuf_clone(BBuf** rto, BBuf* from)
248 {
249 int r;
250 BBuf *to;
251
252 *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
253 CHECK_NULL_RETURN_MEMERR(to);
254 r = BB_INIT(to, from->alloc);
255 if (r != 0) {
256 xfree(to->p);
257 *rto = 0;
258 return r;
259 }
260 to->used = from->used;
261 xmemcpy(to->p, from->p, from->used);
262 return 0;
263 }
264
265 static int
backref_rel_to_abs(int rel_no,ScanEnv * env)266 backref_rel_to_abs(int rel_no, ScanEnv* env)
267 {
268 if (rel_no > 0) {
269 return env->num_mem + rel_no;
270 }
271 else {
272 return env->num_mem + 1 + rel_no;
273 }
274 }
275
/* Set / clear the bits f in the option variable v. */
#define OPTION_ON(v,f)     ((v) |= (f))
#define OPTION_OFF(v,f)    ((v) &= ~(f))

/* Clear the bits f in v when `negative` is true, set them otherwise.
   The whole expansion is parenthesized so that it stays a single
   expression when combined with neighbouring operators. */
#define OPTION_NEGATE(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
280
/* First code point of the multi-byte range: 0 when the encoding's minimum
   character length is greater than one byte, 0x80 otherwise. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)

/* Add the range [MBCODE_START_POS(enc), ~0] to the code range buffer pbuf. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/* For a non-single-byte encoding, add the whole multi-byte range to mbuf.
   Uses a variable `r` from the enclosing scope and makes the enclosing
   function return r on failure. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r != 0) return r;\
  }\
} while (0)


/* Set `empty` to 1 when every word of the bit set bs is zero, else to 0.
   Declares its own loop variable `i` inside the do-block. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i;\
  empty = 1;\
  for (i = 0; i < (int )BITSET_SIZE; i++) {\
    if ((bs)[i] != 0) {\
      empty = 0; break;\
    }\
  }\
} while (0)
304
305 static void
bitset_set_range(BitSetRef bs,int from,int to)306 bitset_set_range(BitSetRef bs, int from, int to)
307 {
308 int i;
309 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
310 BITSET_SET_BIT(bs, i);
311 }
312 }
313
314 static void
bitset_invert(BitSetRef bs)315 bitset_invert(BitSetRef bs)
316 {
317 int i;
318 for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
319 }
320
321 static void
bitset_invert_to(BitSetRef from,BitSetRef to)322 bitset_invert_to(BitSetRef from, BitSetRef to)
323 {
324 int i;
325 for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
326 }
327
328 static void
bitset_and(BitSetRef dest,BitSetRef bs)329 bitset_and(BitSetRef dest, BitSetRef bs)
330 {
331 int i;
332 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
333 }
334
335 static void
bitset_or(BitSetRef dest,BitSetRef bs)336 bitset_or(BitSetRef dest, BitSetRef bs)
337 {
338 int i;
339 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
340 }
341
342 static void
bitset_copy(BitSetRef dest,BitSetRef bs)343 bitset_copy(BitSetRef dest, BitSetRef bs)
344 {
345 int i;
346 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
347 }
348
349 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)350 onig_strncmp(const UChar* s1, const UChar* s2, int n)
351 {
352 int x;
353
354 while (n-- > 0) {
355 x = *s2++ - *s1++;
356 if (x) return x;
357 }
358 return 0;
359 }
360
361 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)362 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
363 {
364 int len = (int )(end - src);
365 if (len > 0) {
366 xmemcpy(dest, src, len);
367 dest[len] = (UChar )0;
368 }
369 }
370
371 static int
save_entry(ScanEnv * env,enum SaveType type,int * id)372 save_entry(ScanEnv* env, enum SaveType type, int* id)
373 {
374 int nid = env->save_num;
375
376 env->save_num++;
377 *id = nid;
378 return 0;
379 }
380
/* scan pattern methods */
/*
 * These macros expect the following names in the enclosing scope:
 *   p   - current position in the pattern
 *   end - end of the pattern
 *   enc - encoding of the pattern
 * and, for the non-_S variants, the pfetch_prev variable declared by
 * PFETCH_READY.
 */

/* value produced by PPEEK at end of pattern */
#define PEND_VALUE   0

/* declares the variable that remembers the position before the last fetch */
#define PFETCH_READY  UChar* pfetch_prev
/* 1 at end of pattern, 0 otherwise */
#define PEND         (p < end ?  0 : 1)
/* step back to the position saved by the last PINC / PFETCH */
#define PUNFETCH     p = pfetch_prev
/* skip one character, remembering the previous position */
#define PINC       do { \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
/* read one character code into c and advance, remembering the previous
   position */
#define PFETCH(c)  do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* _S variants: same as above but do not update pfetch_prev */
#define PINC_S     do { \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
#define PFETCH_S(c) do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* character code at p without advancing; PEND_VALUE at end of pattern */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
/* true when the next character code equals c */
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )c)
407
408 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)409 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
410 int capa)
411 {
412 UChar* r;
413
414 if (dest)
415 r = (UChar* )xrealloc(dest, capa + 1);
416 else
417 r = (UChar* )xmalloc(capa + 1);
418
419 CHECK_NULL_RETURN(r);
420 onig_strcpy(r + (dest_end - dest), src, src_end);
421 return r;
422 }
423
424 /* dest on static area */
425 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)426 strcat_capa_from_static(UChar* dest, UChar* dest_end,
427 const UChar* src, const UChar* src_end, int capa)
428 {
429 UChar* r;
430
431 r = (UChar* )xmalloc(capa + 1);
432 CHECK_NULL_RETURN(r);
433 onig_strcpy(r, dest, dest_end);
434 onig_strcpy(r + (dest_end - dest), src, src_end);
435 return r;
436 }
437
438
439 #ifdef USE_ST_LIBRARY
440
/* Hash table key: a byte string given by its start and end pointers
   (the bytes compared and hashed are [s, end)). */
typedef struct {
  UChar* s;    /* first byte of the key */
  UChar* end;  /* one past the last byte of the key */
} st_str_end_key;
445
446 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)447 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
448 {
449 UChar *p, *q;
450 int c;
451
452 if ((x->end - x->s) != (y->end - y->s))
453 return 1;
454
455 p = x->s;
456 q = y->s;
457 while (p < x->end) {
458 c = (int )*p - (int )*q;
459 if (c != 0) return c;
460
461 p++; q++;
462 }
463
464 return 0;
465 }
466
467 static int
str_end_hash(st_str_end_key * x)468 str_end_hash(st_str_end_key* x)
469 {
470 UChar *p;
471 unsigned val = 0;
472
473 p = x->s;
474 while (p < x->end) {
475 val = val * 997 + (unsigned )*p++;
476 }
477
478 return (int) (val + (val >> 5));
479 }
480
481 extern hash_table_type*
onig_st_init_strend_table_with_size(int size)482 onig_st_init_strend_table_with_size(int size)
483 {
484 static struct st_hash_type hashType = {
485 str_end_cmp,
486 str_end_hash,
487 };
488
489 return (hash_table_type* )
490 onig_st_init_table_with_size(&hashType, size);
491 }
492
493 extern int
onig_st_lookup_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type * value)494 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
495 const UChar* end_key, hash_data_type *value)
496 {
497 st_str_end_key key;
498
499 key.s = (UChar* )str_key;
500 key.end = (UChar* )end_key;
501
502 return onig_st_lookup(table, (st_data_t )(&key), value);
503 }
504
505 extern int
onig_st_insert_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type value)506 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
507 const UChar* end_key, hash_data_type value)
508 {
509 st_str_end_key* key;
510 int result;
511
512 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
513 CHECK_NULL_RETURN_MEMERR(key);
514
515 key->s = (UChar* )str_key;
516 key->end = (UChar* )end_key;
517 result = onig_st_insert(table, (st_data_t )key, value);
518 if (result) {
519 xfree(key);
520 }
521 return result;
522 }
523
524
525 #ifdef USE_CALLOUT
526
/* Hash table key for callout names: the name bytes [s, end) together with
   the encoding and the callout type. */
typedef struct {
  OnigEncoding enc;
  int    type; /* callout type: single or not */
  UChar* s;    /* first byte of the name */
  UChar* end;  /* one past the last byte of the name */
} st_callout_name_key;
533
534 static int
callout_name_table_cmp(st_callout_name_key * x,st_callout_name_key * y)535 callout_name_table_cmp(st_callout_name_key* x, st_callout_name_key* y)
536 {
537 UChar *p, *q;
538 int c;
539
540 if (x->enc != y->enc) return 1;
541 if (x->type != y->type) return 1;
542 if ((x->end - x->s) != (y->end - y->s))
543 return 1;
544
545 p = x->s;
546 q = y->s;
547 while (p < x->end) {
548 c = (int )*p - (int )*q;
549 if (c != 0) return c;
550
551 p++; q++;
552 }
553
554 return 0;
555 }
556
557 static int
callout_name_table_hash(st_callout_name_key * x)558 callout_name_table_hash(st_callout_name_key* x)
559 {
560 UChar *p;
561 unsigned int val = 0;
562
563 p = x->s;
564 while (p < x->end) {
565 val = val * 997 + (unsigned int )*p++;
566 }
567
568 /* use intptr_t for escape warning in Windows */
569 return (int )(val + (val >> 5) + ((intptr_t )x->enc & 0xffff) + x->type);
570 }
571
572 extern hash_table_type*
onig_st_init_callout_name_table_with_size(int size)573 onig_st_init_callout_name_table_with_size(int size)
574 {
575 static struct st_hash_type hashType = {
576 callout_name_table_cmp,
577 callout_name_table_hash,
578 };
579
580 return (hash_table_type* )
581 onig_st_init_table_with_size(&hashType, size);
582 }
583
584 extern int
onig_st_lookup_callout_name_table(hash_table_type * table,OnigEncoding enc,int type,const UChar * str_key,const UChar * end_key,hash_data_type * value)585 onig_st_lookup_callout_name_table(hash_table_type* table,
586 OnigEncoding enc,
587 int type,
588 const UChar* str_key,
589 const UChar* end_key,
590 hash_data_type *value)
591 {
592 st_callout_name_key key;
593
594 key.enc = enc;
595 key.type = type;
596 key.s = (UChar* )str_key;
597 key.end = (UChar* )end_key;
598
599 return onig_st_lookup(table, (st_data_t )(&key), value);
600 }
601
602 static int
st_insert_callout_name_table(hash_table_type * table,OnigEncoding enc,int type,UChar * str_key,UChar * end_key,hash_data_type value)603 st_insert_callout_name_table(hash_table_type* table,
604 OnigEncoding enc, int type,
605 UChar* str_key, UChar* end_key,
606 hash_data_type value)
607 {
608 st_callout_name_key* key;
609 int result;
610
611 key = (st_callout_name_key* )xmalloc(sizeof(st_callout_name_key));
612 CHECK_NULL_RETURN_MEMERR(key);
613
614 /* key->s: don't duplicate, because str_key is duped in callout_name_entry() */
615 key->enc = enc;
616 key->type = type;
617 key->s = str_key;
618 key->end = end_key;
619 result = onig_st_insert(table, (st_data_t )key, value);
620 if (result) {
621 xfree(key);
622 }
623 return result;
624 }
625 #endif
626
627 #endif /* USE_ST_LIBRARY */
628
629
/* initial capacity of NameEntry.back_refs (allocated for the 2nd group) */
#define INIT_NAME_BACKREFS_ALLOC_NUM   8

/* One named group: the name and the group number(s) defined under it. */
typedef struct {
  UChar* name;
  int    name_len;   /* byte length */
  int    back_num;   /* number of backrefs */
  int    back_alloc; /* capacity of back_refs */
  int    back_ref1;  /* the group number when back_num == 1 */
  int*   back_refs;  /* the group numbers when back_num > 1 */
} NameEntry;
640
641 #ifdef USE_ST_LIBRARY
642
/* initial size passed to the name hash table constructor */
#define INIT_NAMES_ALLOC_NUM    5

/* with USE_ST_LIBRARY the name table is an st hash table */
typedef st_table  NameTable;
typedef st_data_t HashDataType;   /* 1.6 st.h doesn't define st_data_t type */

#define NAMEBUF_SIZE    24
#define NAMEBUF_SIZE_1  25
650
651 #ifdef ONIG_DEBUG
652 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)653 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
654 {
655 int i;
656 FILE* fp = (FILE* )arg;
657
658 fprintf(fp, "%s: ", e->name);
659 if (e->back_num == 0)
660 fputs("-", fp);
661 else if (e->back_num == 1)
662 fprintf(fp, "%d", e->back_ref1);
663 else {
664 for (i = 0; i < e->back_num; i++) {
665 if (i > 0) fprintf(fp, ", ");
666 fprintf(fp, "%d", e->back_refs[i]);
667 }
668 }
669 fputs("\n", fp);
670 return ST_CONTINUE;
671 }
672
673 extern int
onig_print_names(FILE * fp,regex_t * reg)674 onig_print_names(FILE* fp, regex_t* reg)
675 {
676 NameTable* t = (NameTable* )reg->name_table;
677
678 if (IS_NOT_NULL(t)) {
679 fprintf(fp, "name table\n");
680 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
681 fputs("\n", fp);
682 }
683 return 0;
684 }
685 #endif /* ONIG_DEBUG */
686
687 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)688 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
689 {
690 xfree(e->name);
691 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
692 xfree(key);
693 xfree(e);
694 return ST_DELETE;
695 }
696
697 static int
names_clear(regex_t * reg)698 names_clear(regex_t* reg)
699 {
700 NameTable* t = (NameTable* )reg->name_table;
701
702 if (IS_NOT_NULL(t)) {
703 onig_st_foreach(t, i_free_name_entry, 0);
704 }
705 return 0;
706 }
707
708 extern int
onig_names_free(regex_t * reg)709 onig_names_free(regex_t* reg)
710 {
711 int r;
712 NameTable* t;
713
714 r = names_clear(reg);
715 if (r != 0) return r;
716
717 t = (NameTable* )reg->name_table;
718 if (IS_NOT_NULL(t)) onig_st_free_table(t);
719 reg->name_table = (void* )NULL;
720 return 0;
721 }
722
723 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)724 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
725 {
726 NameEntry* e;
727 NameTable* t = (NameTable* )reg->name_table;
728
729 e = (NameEntry* )NULL;
730 if (IS_NOT_NULL(t)) {
731 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
732 }
733 return e;
734 }
735
/* Context handed to i_names() by onig_foreach_name(). */
typedef struct {
  int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*); /* user callback */
  regex_t* reg;      /* passed through to func */
  void* arg;         /* passed through to func */
  int ret;           /* first non-zero result of func, else 0 */
  OnigEncoding enc;
} INamesArg;
743
744 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)745 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
746 {
747 int r = (*(arg->func))(e->name,
748 e->name + e->name_len,
749 e->back_num,
750 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
751 arg->reg, arg->arg);
752 if (r != 0) {
753 arg->ret = r;
754 return ST_STOP;
755 }
756 return ST_CONTINUE;
757 }
758
759 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)760 onig_foreach_name(regex_t* reg,
761 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
762 {
763 INamesArg narg;
764 NameTable* t = (NameTable* )reg->name_table;
765
766 narg.ret = 0;
767 if (IS_NOT_NULL(t)) {
768 narg.func = func;
769 narg.reg = reg;
770 narg.arg = arg;
771 narg.enc = reg->enc; /* should be pattern encoding. */
772 onig_st_foreach(t, i_names, (HashDataType )&narg);
773 }
774 return narg.ret;
775 }
776
777 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumRemap * map)778 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
779 {
780 int i;
781
782 if (e->back_num > 1) {
783 for (i = 0; i < e->back_num; i++) {
784 e->back_refs[i] = map[e->back_refs[i]].new_val;
785 }
786 }
787 else if (e->back_num == 1) {
788 e->back_ref1 = map[e->back_ref1].new_val;
789 }
790
791 return ST_CONTINUE;
792 }
793
794 extern int
onig_renumber_name_table(regex_t * reg,GroupNumRemap * map)795 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
796 {
797 NameTable* t = (NameTable* )reg->name_table;
798
799 if (IS_NOT_NULL(t)) {
800 onig_st_foreach(t, i_renumber_name, (HashDataType )map);
801 }
802 return 0;
803 }
804
805
806 extern int
onig_number_of_names(regex_t * reg)807 onig_number_of_names(regex_t* reg)
808 {
809 NameTable* t = (NameTable* )reg->name_table;
810
811 if (IS_NOT_NULL(t))
812 return t->num_entries;
813 else
814 return 0;
815 }
816
817 #else /* USE_ST_LIBRARY */
818
/* initial capacity of the entry array */
#define INIT_NAMES_ALLOC_NUM    8

/* without USE_ST_LIBRARY the name table is a growable array of entries */
typedef struct {
  NameEntry* e;      /* entry array */
  int        num;    /* entries in use */
  int        alloc;  /* entries allocated */
} NameTable;
826
827 #ifdef ONIG_DEBUG
828 extern int
onig_print_names(FILE * fp,regex_t * reg)829 onig_print_names(FILE* fp, regex_t* reg)
830 {
831 int i, j;
832 NameEntry* e;
833 NameTable* t = (NameTable* )reg->name_table;
834
835 if (IS_NOT_NULL(t) && t->num > 0) {
836 fprintf(fp, "name table\n");
837 for (i = 0; i < t->num; i++) {
838 e = &(t->e[i]);
839 fprintf(fp, "%s: ", e->name);
840 if (e->back_num == 0) {
841 fputs("-", fp);
842 }
843 else if (e->back_num == 1) {
844 fprintf(fp, "%d", e->back_ref1);
845 }
846 else {
847 for (j = 0; j < e->back_num; j++) {
848 if (j > 0) fprintf(fp, ", ");
849 fprintf(fp, "%d", e->back_refs[j]);
850 }
851 }
852 fputs("\n", fp);
853 }
854 fputs("\n", fp);
855 }
856 return 0;
857 }
858 #endif
859
860 static int
names_clear(regex_t * reg)861 names_clear(regex_t* reg)
862 {
863 int i;
864 NameEntry* e;
865 NameTable* t = (NameTable* )reg->name_table;
866
867 if (IS_NOT_NULL(t)) {
868 for (i = 0; i < t->num; i++) {
869 e = &(t->e[i]);
870 if (IS_NOT_NULL(e->name)) {
871 xfree(e->name);
872 e->name = NULL;
873 e->name_len = 0;
874 e->back_num = 0;
875 e->back_alloc = 0;
876 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
877 e->back_refs = (int* )NULL;
878 }
879 }
880 if (IS_NOT_NULL(t->e)) {
881 xfree(t->e);
882 t->e = NULL;
883 }
884 t->num = 0;
885 }
886 return 0;
887 }
888
889 extern int
onig_names_free(regex_t * reg)890 onig_names_free(regex_t* reg)
891 {
892 int r;
893 NameTable* t;
894
895 r = names_clear(reg);
896 if (r != 0) return r;
897
898 t = (NameTable* )reg->name_table;
899 if (IS_NOT_NULL(t)) xfree(t);
900 reg->name_table = NULL;
901 return 0;
902 }
903
904 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)905 name_find(regex_t* reg, UChar* name, UChar* name_end)
906 {
907 int i, len;
908 NameEntry* e;
909 NameTable* t = (NameTable* )reg->name_table;
910
911 if (IS_NOT_NULL(t)) {
912 len = name_end - name;
913 for (i = 0; i < t->num; i++) {
914 e = &(t->e[i]);
915 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
916 return e;
917 }
918 }
919 return (NameEntry* )NULL;
920 }
921
922 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)923 onig_foreach_name(regex_t* reg,
924 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
925 {
926 int i, r;
927 NameEntry* e;
928 NameTable* t = (NameTable* )reg->name_table;
929
930 if (IS_NOT_NULL(t)) {
931 for (i = 0; i < t->num; i++) {
932 e = &(t->e[i]);
933 r = (*func)(e->name, e->name + e->name_len, e->back_num,
934 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
935 reg, arg);
936 if (r != 0) return r;
937 }
938 }
939 return 0;
940 }
941
942 extern int
onig_number_of_names(regex_t * reg)943 onig_number_of_names(regex_t* reg)
944 {
945 NameTable* t = (NameTable* )reg->name_table;
946
947 if (IS_NOT_NULL(t))
948 return t->num;
949 else
950 return 0;
951 }
952
953 #endif /* else USE_ST_LIBRARY */
954
955 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)956 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
957 {
958 int r;
959 int alloc;
960 NameEntry* e;
961 NameTable* t = (NameTable* )reg->name_table;
962
963 if (name_end - name <= 0)
964 return ONIGERR_EMPTY_GROUP_NAME;
965
966 e = name_find(reg, name, name_end);
967 if (IS_NULL(e)) {
968 #ifdef USE_ST_LIBRARY
969 if (IS_NULL(t)) {
970 t = onig_st_init_strend_table_with_size(INIT_NAMES_ALLOC_NUM);
971 CHECK_NULL_RETURN_MEMERR(t);
972 reg->name_table = (void* )t;
973 }
974 e = (NameEntry* )xmalloc(sizeof(NameEntry));
975 CHECK_NULL_RETURN_MEMERR(e);
976
977 e->name = onigenc_strdup(reg->enc, name, name_end);
978 if (IS_NULL(e->name)) {
979 xfree(e); return ONIGERR_MEMORY;
980 }
981 r = onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
982 (HashDataType )e);
983 if (r < 0) return r;
984
985 e->name_len = (int )(name_end - name);
986 e->back_num = 0;
987 e->back_alloc = 0;
988 e->back_refs = (int* )NULL;
989
990 #else
991
992 if (IS_NULL(t)) {
993 alloc = INIT_NAMES_ALLOC_NUM;
994 t = (NameTable* )xmalloc(sizeof(NameTable));
995 CHECK_NULL_RETURN_MEMERR(t);
996 t->e = NULL;
997 t->alloc = 0;
998 t->num = 0;
999
1000 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
1001 if (IS_NULL(t->e)) {
1002 xfree(t);
1003 return ONIGERR_MEMORY;
1004 }
1005 t->alloc = alloc;
1006 reg->name_table = t;
1007 goto clear;
1008 }
1009 else if (t->num == t->alloc) {
1010 int i;
1011
1012 alloc = t->alloc * 2;
1013 t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
1014 CHECK_NULL_RETURN_MEMERR(t->e);
1015 t->alloc = alloc;
1016
1017 clear:
1018 for (i = t->num; i < t->alloc; i++) {
1019 t->e[i].name = NULL;
1020 t->e[i].name_len = 0;
1021 t->e[i].back_num = 0;
1022 t->e[i].back_alloc = 0;
1023 t->e[i].back_refs = (int* )NULL;
1024 }
1025 }
1026 e = &(t->e[t->num]);
1027 t->num++;
1028 e->name = onigenc_strdup(reg->enc, name, name_end);
1029 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
1030 e->name_len = name_end - name;
1031 #endif
1032 }
1033
1034 if (e->back_num >= 1 &&
1035 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
1036 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
1037 name, name_end);
1038 return ONIGERR_MULTIPLEX_DEFINED_NAME;
1039 }
1040
1041 e->back_num++;
1042 if (e->back_num == 1) {
1043 e->back_ref1 = backref;
1044 }
1045 else {
1046 if (e->back_num == 2) {
1047 alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
1048 e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
1049 CHECK_NULL_RETURN_MEMERR(e->back_refs);
1050 e->back_alloc = alloc;
1051 e->back_refs[0] = e->back_ref1;
1052 e->back_refs[1] = backref;
1053 }
1054 else {
1055 if (e->back_num > e->back_alloc) {
1056 alloc = e->back_alloc * 2;
1057 e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
1058 CHECK_NULL_RETURN_MEMERR(e->back_refs);
1059 e->back_alloc = alloc;
1060 }
1061 e->back_refs[e->back_num - 1] = backref;
1062 }
1063 }
1064
1065 return 0;
1066 }
1067
1068 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)1069 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
1070 const UChar* name_end, int** nums)
1071 {
1072 NameEntry* e = name_find(reg, name, name_end);
1073
1074 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
1075
1076 switch (e->back_num) {
1077 case 0:
1078 break;
1079 case 1:
1080 *nums = &(e->back_ref1);
1081 break;
1082 default:
1083 *nums = e->back_refs;
1084 break;
1085 }
1086 return e->back_num;
1087 }
1088
1089 static int
name_to_group_numbers(ScanEnv * env,const UChar * name,const UChar * name_end,int ** nums)1090 name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* name_end,
1091 int** nums)
1092 {
1093 regex_t* reg;
1094 NameEntry* e;
1095
1096 reg = env->reg;
1097 e = name_find(reg, name, name_end);
1098
1099 if (IS_NULL(e)) {
1100 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
1101 (UChar* )name, (UChar* )name_end);
1102 return ONIGERR_UNDEFINED_NAME_REFERENCE;
1103 }
1104
1105 switch (e->back_num) {
1106 case 0:
1107 break;
1108 case 1:
1109 *nums = &(e->back_ref1);
1110 break;
1111 default:
1112 *nums = e->back_refs;
1113 break;
1114 }
1115 return e->back_num;
1116 }
1117
1118 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)1119 onig_name_to_backref_number(regex_t* reg, const UChar* name,
1120 const UChar* name_end, OnigRegion *region)
1121 {
1122 int i, n, *nums;
1123
1124 n = onig_name_to_group_numbers(reg, name, name_end, &nums);
1125 if (n < 0)
1126 return n;
1127 else if (n == 0)
1128 return ONIGERR_PARSER_BUG;
1129 else if (n == 1)
1130 return nums[0];
1131 else {
1132 if (IS_NOT_NULL(region)) {
1133 for (i = n - 1; i >= 0; i--) {
1134 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
1135 return nums[i];
1136 }
1137 }
1138 return nums[n - 1];
1139 }
1140 }
1141
1142 extern int
onig_noname_group_capture_is_active(regex_t * reg)1143 onig_noname_group_capture_is_active(regex_t* reg)
1144 {
1145 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
1146 return 0;
1147
1148 if (onig_number_of_names(reg) > 0 &&
1149 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
1150 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
1151 return 0;
1152 }
1153
1154 return 1;
1155 }
1156
1157 #ifdef USE_CALLOUT
1158
/* One registered callout-of-name: its functions and argument description. */
typedef struct {
  OnigCalloutType type;
  int             in;
  OnigCalloutFunc start_func;
  OnigCalloutFunc end_func;
  int             arg_num;      /* total number of arguments */
  int             opt_arg_num;  /* how many of the trailing arguments are optional */
  unsigned int    arg_types[ONIG_CALLOUT_MAX_ARGS_NUM];
  OnigValue       opt_defaults[ONIG_CALLOUT_MAX_ARGS_NUM]; /* string defaults are freed with the list */
  UChar*          name; /* reference to GlobalCalloutNameTable entry: e->name */
} CalloutNameListEntry;
1170
/* Growable array of CalloutNameListEntry (see callout_func_list_add()). */
1171 typedef struct {
1172 int n; /* number of entries in use */
1173 int alloc; /* number of entries allocated in v */
1174 CalloutNameListEntry* v;
1175 } CalloutNameListType;
1176
/* Created on first use by onig_set_callout_of_name(); released and reset to 0
   by onig_global_callout_names_free(). Indexed by callout name id. */
1177 static CalloutNameListType* GlobalCalloutNameList;
1178
1179 static int
make_callout_func_list(CalloutNameListType ** rs,int init_size)1180 make_callout_func_list(CalloutNameListType** rs, int init_size)
1181 {
1182 CalloutNameListType* s;
1183 CalloutNameListEntry* v;
1184
1185 *rs = 0;
1186
1187 s = xmalloc(sizeof(*s));
1188 if (IS_NULL(s)) return ONIGERR_MEMORY;
1189
1190 v = (CalloutNameListEntry* )xmalloc(sizeof(CalloutNameListEntry) * init_size);
1191 if (IS_NULL(v)) {
1192 xfree(s);
1193 return ONIGERR_MEMORY;
1194 }
1195
1196 s->n = 0;
1197 s->alloc = init_size;
1198 s->v = v;
1199
1200 *rs = s;
1201 return ONIG_NORMAL;
1202 }
1203
1204 static void
free_callout_func_list(CalloutNameListType * s)1205 free_callout_func_list(CalloutNameListType* s)
1206 {
1207 if (IS_NOT_NULL(s)) {
1208 if (IS_NOT_NULL(s->v)) {
1209 int i, j;
1210
1211 for (i = 0; i < s->n; i++) {
1212 CalloutNameListEntry* e = s->v + i;
1213 for (j = e->arg_num - e->opt_arg_num; j < e->arg_num; j++) {
1214 if (e->arg_types[j] == ONIG_TYPE_STRING) {
1215 UChar* p = e->opt_defaults[j].s.start;
1216 if (IS_NOT_NULL(p)) xfree(p);
1217 }
1218 }
1219 }
1220 xfree(s->v);
1221 }
1222 xfree(s);
1223 }
1224 }
1225
1226 static int
callout_func_list_add(CalloutNameListType * s,int * rid)1227 callout_func_list_add(CalloutNameListType* s, int* rid)
1228 {
1229 if (s->n >= s->alloc) {
1230 int new_size = s->alloc * 2;
1231 CalloutNameListEntry* nv = (CalloutNameListEntry* )
1232 xrealloc(s->v, sizeof(CalloutNameListEntry) * new_size);
1233 if (IS_NULL(nv)) return ONIGERR_MEMORY;
1234
1235 s->alloc = new_size;
1236 s->v = nv;
1237 }
1238
1239 *rid = s->n;
1240
1241 xmemset(&(s->v[s->n]), 0, sizeof(*(s->v)));
1242 s->n++;
1243 return ONIG_NORMAL;
1244 }
1245
1246
/* Name -> id record kept in GlobalCalloutNameTable (see callout_name_entry()). */
1247 typedef struct {
1248 UChar* name; /* copy made with onigenc_strdup() */
1249 int name_len; /* byte length */
1250 int id; /* value taken from CalloutNameIDCounter when the entry is created */
1251 } CalloutNameEntry;
1252
/* Callout name table: an st_table when USE_ST_LIBRARY is defined,
   otherwise a plain growable array of CalloutNameEntry. */
1253 #ifdef USE_ST_LIBRARY
1254 typedef st_table CalloutNameTable;
1255 #else
1256 typedef struct {
1257 CalloutNameEntry* e; /* entry array */
1258 int num; /* entries in use */
1259 int alloc; /* entries allocated */
1260 } CalloutNameTable;
1261 #endif
1262
/* Both are reset to 0 by global_callout_name_table_free().
   callout_name_entry() increments the counter before assigning an id. */
1263 static CalloutNameTable* GlobalCalloutNameTable;
1264 static int CalloutNameIDCounter;
1265
1266 #ifdef USE_ST_LIBRARY
1267
1268 static int
i_free_callout_name_entry(st_callout_name_key * key,CalloutNameEntry * e,void * arg ARG_UNUSED)1269 i_free_callout_name_entry(st_callout_name_key* key, CalloutNameEntry* e,
1270 void* arg ARG_UNUSED)
1271 {
1272 xfree(e->name);
1273 /*xfree(key->s); */ /* is same as e->name */
1274 xfree(key);
1275 xfree(e);
1276 return ST_DELETE;
1277 }
1278
1279 static int
callout_name_table_clear(CalloutNameTable * t)1280 callout_name_table_clear(CalloutNameTable* t)
1281 {
1282 if (IS_NOT_NULL(t)) {
1283 onig_st_foreach(t, i_free_callout_name_entry, 0);
1284 }
1285 return 0;
1286 }
1287
1288 static int
global_callout_name_table_free(void)1289 global_callout_name_table_free(void)
1290 {
1291 if (IS_NOT_NULL(GlobalCalloutNameTable)) {
1292 int r = callout_name_table_clear(GlobalCalloutNameTable);
1293 if (r != 0) return r;
1294
1295 onig_st_free_table(GlobalCalloutNameTable);
1296 GlobalCalloutNameTable = 0;
1297 CalloutNameIDCounter = 0;
1298 }
1299
1300 return 0;
1301 }
1302
1303 static CalloutNameEntry*
callout_name_find(OnigEncoding enc,int is_not_single,const UChar * name,const UChar * name_end)1304 callout_name_find(OnigEncoding enc, int is_not_single,
1305 const UChar* name, const UChar* name_end)
1306 {
1307 int r;
1308 CalloutNameEntry* e;
1309 CalloutNameTable* t = GlobalCalloutNameTable;
1310
1311 e = (CalloutNameEntry* )NULL;
1312 if (IS_NOT_NULL(t)) {
1313 r = onig_st_lookup_callout_name_table(t, enc, is_not_single, name, name_end,
1314 (HashDataType* )((void* )(&e)));
1315 if (r == 0) { /* not found */
1316 if (enc != ONIG_ENCODING_ASCII &&
1317 ONIGENC_IS_ASCII_COMPATIBLE_ENCODING(enc)) {
1318 enc = ONIG_ENCODING_ASCII;
1319 onig_st_lookup_callout_name_table(t, enc, is_not_single, name, name_end,
1320 (HashDataType* )((void* )(&e)));
1321 }
1322 }
1323 }
1324 return e;
1325 }
1326
1327 #else
1328
1329 static int
callout_name_table_clear(CalloutNameTable * t)1330 callout_name_table_clear(CalloutNameTable* t)
1331 {
1332 int i;
1333 CalloutNameEntry* e;
1334
1335 if (IS_NOT_NULL(t)) {
1336 for (i = 0; i < t->num; i++) {
1337 e = &(t->e[i]);
1338 if (IS_NOT_NULL(e->name)) {
1339 xfree(e->name);
1340 e->name = NULL;
1341 e->name_len = 0;
1342 e->id = 0;
1343 e->func = 0;
1344 }
1345 }
1346 if (IS_NOT_NULL(t->e)) {
1347 xfree(t->e);
1348 t->e = NULL;
1349 }
1350 t->num = 0;
1351 }
1352 return 0;
1353 }
1354
1355 static int
global_callout_name_table_free(void)1356 global_callout_name_table_free(void)
1357 {
1358 if (IS_NOT_NULL(GlobalCalloutNameTable)) {
1359 int r = callout_name_table_clear(GlobalCalloutNameTable);
1360 if (r != 0) return r;
1361
1362 xfree(GlobalCalloutNameTable);
1363 GlobalCalloutNameTable = 0;
1364 CalloutNameIDCounter = 0;
1365 }
1366 return 0;
1367 }
1368
1369 static CalloutNameEntry*
callout_name_find(UChar * name,UChar * name_end)1370 callout_name_find(UChar* name, UChar* name_end)
1371 {
1372 int i, len;
1373 CalloutNameEntry* e;
1374 CalloutNameTable* t = Calloutnames;
1375
1376 if (IS_NOT_NULL(t)) {
1377 len = name_end - name;
1378 for (i = 0; i < t->num; i++) {
1379 e = &(t->e[i]);
1380 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
1381 return e;
1382 }
1383 }
1384 return (CalloutNameEntry* )NULL;
1385 }
1386
1387 #endif
1388
1389 /* name string must be single byte char string. */
1390 static int
callout_name_entry(CalloutNameEntry ** rentry,OnigEncoding enc,int is_not_single,UChar * name,UChar * name_end)1391 callout_name_entry(CalloutNameEntry** rentry, OnigEncoding enc,
1392 int is_not_single, UChar* name, UChar* name_end)
1393 {
1394 int r;
1395 CalloutNameEntry* e;
1396 CalloutNameTable* t = GlobalCalloutNameTable;
1397
1398 *rentry = 0;
1399 if (name_end - name <= 0)
1400 return ONIGERR_INVALID_CALLOUT_NAME;
1401
1402 e = callout_name_find(enc, is_not_single, name, name_end);
1403 if (IS_NULL(e)) {
1404 #ifdef USE_ST_LIBRARY
1405 if (IS_NULL(t)) {
1406 t = onig_st_init_callout_name_table_with_size(INIT_NAMES_ALLOC_NUM);
1407 CHECK_NULL_RETURN_MEMERR(t);
1408 GlobalCalloutNameTable = t;
1409 }
1410 e = (CalloutNameEntry* )xmalloc(sizeof(CalloutNameEntry));
1411 CHECK_NULL_RETURN_MEMERR(e);
1412
1413 e->name = onigenc_strdup(enc, name, name_end);
1414 if (IS_NULL(e->name)) {
1415 xfree(e); return ONIGERR_MEMORY;
1416 }
1417
1418 r = st_insert_callout_name_table(t, enc, is_not_single,
1419 e->name, (e->name + (name_end - name)),
1420 (HashDataType )e);
1421 if (r < 0) return r;
1422
1423 #else
1424
1425 int alloc;
1426
1427 if (IS_NULL(t)) {
1428 alloc = INIT_NAMES_ALLOC_NUM;
1429 t = (CalloutNameTable* )xmalloc(sizeof(CalloutNameTable));
1430 CHECK_NULL_RETURN_MEMERR(t);
1431 t->e = NULL;
1432 t->alloc = 0;
1433 t->num = 0;
1434
1435 t->e = (CalloutNameEntry* )xmalloc(sizeof(CalloutNameEntry) * alloc);
1436 if (IS_NULL(t->e)) {
1437 xfree(t);
1438 return ONIGERR_MEMORY;
1439 }
1440 t->alloc = alloc;
1441 GlobalCalloutNameTable = t;
1442 goto clear;
1443 }
1444 else if (t->num == t->alloc) {
1445 int i;
1446
1447 alloc = t->alloc * 2;
1448 t->e = (CalloutNameEntry* )xrealloc(t->e, sizeof(CalloutNameEntry) * alloc);
1449 CHECK_NULL_RETURN_MEMERR(t->e);
1450 t->alloc = alloc;
1451
1452 clear:
1453 for (i = t->num; i < t->alloc; i++) {
1454 t->e[i].name = NULL;
1455 t->e[i].name_len = 0;
1456 t->e[i].id = 0;
1457 }
1458 }
1459 e = &(t->e[t->num]);
1460 t->num++;
1461 e->name = onigenc_strdup(enc, name, name_end);
1462 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
1463 #endif
1464
1465 CalloutNameIDCounter++;
1466 e->id = CalloutNameIDCounter;
1467 e->name_len = (int )(name_end - name);
1468 }
1469
1470 *rentry = e;
1471 return e->id;
1472 }
1473
1474 static int
is_allowed_callout_name(OnigEncoding enc,UChar * name,UChar * name_end)1475 is_allowed_callout_name(OnigEncoding enc, UChar* name, UChar* name_end)
1476 {
1477 UChar* p;
1478 OnigCodePoint c;
1479
1480 if (name >= name_end) return 0;
1481
1482 p = name;
1483 while (p < name_end) {
1484 c = ONIGENC_MBC_TO_CODE(enc, p, name_end);
1485 if (! IS_ALLOWED_CODE_IN_CALLOUT_NAME(c))
1486 return 0;
1487
1488 if (p == name) {
1489 if (c >= '0' && c <= '9') return 0;
1490 }
1491
1492 p += ONIGENC_MBC_ENC_LEN(enc, p);
1493 }
1494
1495 return 1;
1496 }
1497
1498 static int
is_allowed_callout_tag_name(OnigEncoding enc,UChar * name,UChar * name_end)1499 is_allowed_callout_tag_name(OnigEncoding enc, UChar* name, UChar* name_end)
1500 {
1501 UChar* p;
1502 OnigCodePoint c;
1503
1504 if (name >= name_end) return 0;
1505
1506 p = name;
1507 while (p < name_end) {
1508 c = ONIGENC_MBC_TO_CODE(enc, p, name_end);
1509 if (! IS_ALLOWED_CODE_IN_CALLOUT_TAG_NAME(c))
1510 return 0;
1511
1512 if (p == name) {
1513 if (c >= '0' && c <= '9') return 0;
1514 }
1515
1516 p += ONIGENC_MBC_ENC_LEN(enc, p);
1517 }
1518
1519 return 1;
1520 }
1521
1522 extern int
onig_set_callout_of_name(OnigEncoding enc,OnigCalloutType callout_type,UChar * name,UChar * name_end,int in,OnigCalloutFunc start_func,OnigCalloutFunc end_func,int arg_num,unsigned int arg_types[],int opt_arg_num,OnigValue opt_defaults[])1523 onig_set_callout_of_name(OnigEncoding enc, OnigCalloutType callout_type,
1524 UChar* name, UChar* name_end, int in,
1525 OnigCalloutFunc start_func,
1526 OnigCalloutFunc end_func,
1527 int arg_num, unsigned int arg_types[],
1528 int opt_arg_num, OnigValue opt_defaults[])
1529 {
1530 int r;
1531 int i;
1532 int j;
1533 int id;
1534 int is_not_single;
1535 CalloutNameEntry* e;
1536 CalloutNameListEntry* fe;
1537
1538 if (callout_type != ONIG_CALLOUT_TYPE_SINGLE)
1539 return ONIGERR_INVALID_ARGUMENT;
1540
1541 if (arg_num < 0 || arg_num > ONIG_CALLOUT_MAX_ARGS_NUM)
1542 return ONIGERR_INVALID_CALLOUT_ARG;
1543
1544 if (opt_arg_num < 0 || opt_arg_num > arg_num)
1545 return ONIGERR_INVALID_CALLOUT_ARG;
1546
1547 if (start_func == 0 && end_func == 0)
1548 return ONIGERR_INVALID_CALLOUT_ARG;
1549
1550 if ((in & ONIG_CALLOUT_IN_PROGRESS) == 0 && (in & ONIG_CALLOUT_IN_RETRACTION) == 0)
1551 return ONIGERR_INVALID_CALLOUT_ARG;
1552
1553 for (i = 0; i < arg_num; i++) {
1554 unsigned int t = arg_types[i];
1555 if (t == ONIG_TYPE_VOID)
1556 return ONIGERR_INVALID_CALLOUT_ARG;
1557 else {
1558 if (i >= arg_num - opt_arg_num) {
1559 if (t != ONIG_TYPE_LONG && t != ONIG_TYPE_CHAR && t != ONIG_TYPE_STRING &&
1560 t != ONIG_TYPE_TAG)
1561 return ONIGERR_INVALID_CALLOUT_ARG;
1562 }
1563 else {
1564 if (t != ONIG_TYPE_LONG) {
1565 t = t & ~ONIG_TYPE_LONG;
1566 if (t != ONIG_TYPE_CHAR && t != ONIG_TYPE_STRING && t != ONIG_TYPE_TAG)
1567 return ONIGERR_INVALID_CALLOUT_ARG;
1568 }
1569 }
1570 }
1571 }
1572
1573 if (! is_allowed_callout_name(enc, name, name_end)) {
1574 return ONIGERR_INVALID_CALLOUT_NAME;
1575 }
1576
1577 is_not_single = (callout_type != ONIG_CALLOUT_TYPE_SINGLE);
1578 id = callout_name_entry(&e, enc, is_not_single, name, name_end);
1579 if (id < 0) return id;
1580
1581 r = ONIG_NORMAL;
1582 if (IS_NULL(GlobalCalloutNameList)) {
1583 r = make_callout_func_list(&GlobalCalloutNameList, 10);
1584 if (r != ONIG_NORMAL) return r;
1585 }
1586
1587 while (id >= GlobalCalloutNameList->n) {
1588 int rid;
1589 r = callout_func_list_add(GlobalCalloutNameList, &rid);
1590 if (r != ONIG_NORMAL) return r;
1591 }
1592
1593 fe = GlobalCalloutNameList->v + id;
1594 fe->type = callout_type;
1595 fe->in = in;
1596 fe->start_func = start_func;
1597 fe->end_func = end_func;
1598 fe->arg_num = arg_num;
1599 fe->opt_arg_num = opt_arg_num;
1600 fe->name = e->name;
1601
1602 for (i = 0; i < arg_num; i++) {
1603 fe->arg_types[i] = arg_types[i];
1604 }
1605 for (i = arg_num - opt_arg_num, j = 0; i < arg_num; i++, j++) {
1606 if (fe->arg_types[i] == ONIG_TYPE_STRING) {
1607 OnigValue* val;
1608 UChar* ds;
1609
1610 if (IS_NULL(opt_defaults)) return ONIGERR_INVALID_ARGUMENT;
1611
1612 val = opt_defaults + j;
1613 ds = onigenc_strdup(enc, val->s.start, val->s.end);
1614 CHECK_NULL_RETURN_MEMERR(ds);
1615
1616 fe->opt_defaults[i].s.start = ds;
1617 fe->opt_defaults[i].s.end = ds + (val->s.end - val->s.start);
1618 }
1619 else {
1620 fe->opt_defaults[i] = opt_defaults[j];
1621 }
1622 }
1623
1624 r = id;
1625 return r;
1626 }
1627
1628 static int
get_callout_name_id_by_name(OnigEncoding enc,int is_not_single,UChar * name,UChar * name_end,int * rid)1629 get_callout_name_id_by_name(OnigEncoding enc, int is_not_single,
1630 UChar* name, UChar* name_end, int* rid)
1631 {
1632 int r;
1633 CalloutNameEntry* e;
1634
1635 if (! is_allowed_callout_name(enc, name, name_end)) {
1636 return ONIGERR_INVALID_CALLOUT_NAME;
1637 }
1638
1639 e = callout_name_find(enc, is_not_single, name, name_end);
1640 if (IS_NULL(e)) {
1641 return ONIGERR_UNDEFINED_CALLOUT_NAME;
1642 }
1643
1644 r = ONIG_NORMAL;
1645 *rid = e->id;
1646
1647 return r;
1648 }
1649
1650 extern OnigCalloutFunc
onig_get_callout_start_func(regex_t * reg,int callout_num)1651 onig_get_callout_start_func(regex_t* reg, int callout_num)
1652 {
1653 /* If used for callouts of contents, return 0. */
1654 CalloutListEntry* e;
1655
1656 e = onig_reg_callout_list_at(reg, callout_num);
1657 CHECK_NULL_RETURN(e);
1658 return e->start_func;
1659 }
1660
1661 extern const UChar*
onig_get_callout_tag_start(regex_t * reg,int callout_num)1662 onig_get_callout_tag_start(regex_t* reg, int callout_num)
1663 {
1664 CalloutListEntry* e = onig_reg_callout_list_at(reg, callout_num);
1665 CHECK_NULL_RETURN(e);
1666 return e->tag_start;
1667 }
1668
1669 extern const UChar*
onig_get_callout_tag_end(regex_t * reg,int callout_num)1670 onig_get_callout_tag_end(regex_t* reg, int callout_num)
1671 {
1672 CalloutListEntry* e = onig_reg_callout_list_at(reg, callout_num);
1673 CHECK_NULL_RETURN(e);
1674 return e->tag_end;
1675 }
1676
1677
1678 extern OnigCalloutType
onig_get_callout_type_by_name_id(int name_id)1679 onig_get_callout_type_by_name_id(int name_id)
1680 {
1681 if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1682 return 0;
1683
1684 return GlobalCalloutNameList->v[name_id].type;
1685 }
1686
1687 extern OnigCalloutFunc
onig_get_callout_start_func_by_name_id(int name_id)1688 onig_get_callout_start_func_by_name_id(int name_id)
1689 {
1690 if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1691 return 0;
1692
1693 return GlobalCalloutNameList->v[name_id].start_func;
1694 }
1695
1696 extern OnigCalloutFunc
onig_get_callout_end_func_by_name_id(int name_id)1697 onig_get_callout_end_func_by_name_id(int name_id)
1698 {
1699 if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1700 return 0;
1701
1702 return GlobalCalloutNameList->v[name_id].end_func;
1703 }
1704
1705 extern int
onig_get_callout_in_by_name_id(int name_id)1706 onig_get_callout_in_by_name_id(int name_id)
1707 {
1708 if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1709 return 0;
1710
1711 return GlobalCalloutNameList->v[name_id].in;
1712 }
1713
1714 static int
get_callout_arg_num_by_name_id(int name_id)1715 get_callout_arg_num_by_name_id(int name_id)
1716 {
1717 return GlobalCalloutNameList->v[name_id].arg_num;
1718 }
1719
1720 static int
get_callout_opt_arg_num_by_name_id(int name_id)1721 get_callout_opt_arg_num_by_name_id(int name_id)
1722 {
1723 return GlobalCalloutNameList->v[name_id].opt_arg_num;
1724 }
1725
1726 static unsigned int
get_callout_arg_type_by_name_id(int name_id,int index)1727 get_callout_arg_type_by_name_id(int name_id, int index)
1728 {
1729 return GlobalCalloutNameList->v[name_id].arg_types[index];
1730 }
1731
1732 static OnigValue
get_callout_opt_default_by_name_id(int name_id,int index)1733 get_callout_opt_default_by_name_id(int name_id, int index)
1734 {
1735 return GlobalCalloutNameList->v[name_id].opt_defaults[index];
1736 }
1737
1738 extern UChar*
onig_get_callout_name_by_name_id(int name_id)1739 onig_get_callout_name_by_name_id(int name_id)
1740 {
1741 if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1742 return 0;
1743
1744 return GlobalCalloutNameList->v[name_id].name;
1745 }
1746
1747 extern int
onig_global_callout_names_free(void)1748 onig_global_callout_names_free(void)
1749 {
1750 free_callout_func_list(GlobalCalloutNameList);
1751 GlobalCalloutNameList = 0;
1752
1753 global_callout_name_table_free();
1754 return ONIG_NORMAL;
1755 }
1756
1757
/* Tag name -> value table.  The stored value is what
   onig_get_callout_num_by_tag() returns; callout_tag_find() uses -1 for
   "not found". */
1758 typedef st_table CalloutTagTable;
1759 typedef intptr_t CalloutTagVal;
1760
/* Set in CalloutListEntry.flag by i_callout_callout_list_set(). */
1761 #define CALLOUT_TAG_LIST_FLAG_TAG_EXIST (1<<0)
1762
1763 static int
i_callout_callout_list_set(UChar * key,CalloutTagVal e,void * arg)1764 i_callout_callout_list_set(UChar* key, CalloutTagVal e, void* arg)
1765 {
1766 int num;
1767 RegexExt* ext = (RegexExt* )arg;
1768
1769 num = (int )e - 1;
1770 ext->callout_list[num].flag |= CALLOUT_TAG_LIST_FLAG_TAG_EXIST;
1771 return ST_CONTINUE;
1772 }
1773
1774 static int
setup_ext_callout_list_values(regex_t * reg)1775 setup_ext_callout_list_values(regex_t* reg)
1776 {
1777 int i, j;
1778 RegexExt* ext;
1779
1780 ext = reg->extp;
1781 if (IS_NOT_NULL(ext->tag_table)) {
1782 onig_st_foreach((CalloutTagTable *)ext->tag_table, i_callout_callout_list_set,
1783 (st_data_t )ext);
1784 }
1785
1786 for (i = 0; i < ext->callout_num; i++) {
1787 CalloutListEntry* e = ext->callout_list + i;
1788 if (e->of == ONIG_CALLOUT_OF_NAME) {
1789 for (j = 0; j < e->u.arg.num; j++) {
1790 if (e->u.arg.types[j] == ONIG_TYPE_TAG) {
1791 UChar* start;
1792 UChar* end;
1793 int num;
1794 start = e->u.arg.vals[j].s.start;
1795 end = e->u.arg.vals[j].s.end;
1796 num = onig_get_callout_num_by_tag(reg, start, end);
1797 if (num < 0) return num;
1798 e->u.arg.vals[j].tag = num;
1799 }
1800 }
1801 }
1802 }
1803
1804 return ONIG_NORMAL;
1805 }
1806
1807 extern int
onig_callout_tag_is_exist_at_callout_num(regex_t * reg,int callout_num)1808 onig_callout_tag_is_exist_at_callout_num(regex_t* reg, int callout_num)
1809 {
1810 RegexExt* ext = reg->extp;
1811
1812 if (IS_NULL(ext) || IS_NULL(ext->callout_list)) return 0;
1813 if (callout_num > ext->callout_num) return 0;
1814
1815 return (ext->callout_list[callout_num].flag &
1816 CALLOUT_TAG_LIST_FLAG_TAG_EXIST) != 0;
1817 }
1818
1819 static int
i_free_callout_tag_entry(UChar * key,CalloutTagVal e,void * arg ARG_UNUSED)1820 i_free_callout_tag_entry(UChar* key, CalloutTagVal e, void* arg ARG_UNUSED)
1821 {
1822 xfree(key);
1823 return ST_DELETE;
1824 }
1825
1826 static int
callout_tag_table_clear(CalloutTagTable * t)1827 callout_tag_table_clear(CalloutTagTable* t)
1828 {
1829 if (IS_NOT_NULL(t)) {
1830 onig_st_foreach(t, i_free_callout_tag_entry, 0);
1831 }
1832 return 0;
1833 }
1834
1835 extern int
onig_callout_tag_table_free(void * table)1836 onig_callout_tag_table_free(void* table)
1837 {
1838 CalloutTagTable* t = (CalloutTagTable* )table;
1839
1840 if (IS_NOT_NULL(t)) {
1841 int r = callout_tag_table_clear(t);
1842 if (r != 0) return r;
1843
1844 onig_st_free_table(t);
1845 }
1846
1847 return 0;
1848 }
1849
1850 extern int
onig_get_callout_num_by_tag(regex_t * reg,const UChar * tag,const UChar * tag_end)1851 onig_get_callout_num_by_tag(regex_t* reg,
1852 const UChar* tag, const UChar* tag_end)
1853 {
1854 int r;
1855 RegexExt* ext;
1856 CalloutTagVal e;
1857
1858 ext = reg->extp;
1859 if (IS_NULL(ext) || IS_NULL(ext->tag_table))
1860 return ONIGERR_INVALID_CALLOUT_TAG_NAME;
1861
1862 r = onig_st_lookup_strend(ext->tag_table, tag, tag_end,
1863 (HashDataType* )((void* )(&e)));
1864 if (r == 0) return ONIGERR_INVALID_CALLOUT_TAG_NAME;
1865 return (int )e;
1866 }
1867
1868 static CalloutTagVal
callout_tag_find(CalloutTagTable * t,const UChar * name,const UChar * name_end)1869 callout_tag_find(CalloutTagTable* t, const UChar* name, const UChar* name_end)
1870 {
1871 CalloutTagVal e;
1872
1873 e = -1;
1874 if (IS_NOT_NULL(t)) {
1875 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
1876 }
1877 return e;
1878 }
1879
1880 static int
callout_tag_table_new(CalloutTagTable ** rt)1881 callout_tag_table_new(CalloutTagTable** rt)
1882 {
1883 CalloutTagTable* t;
1884
1885 *rt = 0;
1886 t = onig_st_init_strend_table_with_size(INIT_TAG_NAMES_ALLOC_NUM);
1887 CHECK_NULL_RETURN_MEMERR(t);
1888
1889 *rt = t;
1890 return ONIG_NORMAL;
1891 }
1892
1893 static int
callout_tag_entry_raw(ScanEnv * env,CalloutTagTable * t,UChar * name,UChar * name_end,CalloutTagVal entry_val)1894 callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name,
1895 UChar* name_end, CalloutTagVal entry_val)
1896 {
1897 int r;
1898 CalloutTagVal val;
1899
1900 if (name_end - name <= 0)
1901 return ONIGERR_INVALID_CALLOUT_TAG_NAME;
1902
1903 val = callout_tag_find(t, name, name_end);
1904 if (val >= 0) {
1905 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
1906 name, name_end);
1907 return ONIGERR_MULTIPLEX_DEFINED_NAME;
1908 }
1909
1910 r = onig_st_insert_strend(t, name, name_end, (HashDataType )entry_val);
1911 if (r < 0) return r;
1912
1913 return ONIG_NORMAL;
1914 }
1915
1916 static int
ext_ensure_tag_table(regex_t * reg)1917 ext_ensure_tag_table(regex_t* reg)
1918 {
1919 int r;
1920 RegexExt* ext;
1921 CalloutTagTable* t;
1922
1923 ext = onig_get_regex_ext(reg);
1924 CHECK_NULL_RETURN_MEMERR(ext);
1925
1926 if (IS_NULL(ext->tag_table)) {
1927 r = callout_tag_table_new(&t);
1928 if (r != ONIG_NORMAL) return r;
1929
1930 ext->tag_table = t;
1931 }
1932
1933 return ONIG_NORMAL;
1934 }
1935
1936 static int
callout_tag_entry(ScanEnv * env,regex_t * reg,UChar * name,UChar * name_end,CalloutTagVal entry_val)1937 callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end,
1938 CalloutTagVal entry_val)
1939 {
1940 int r;
1941 RegexExt* ext;
1942 CalloutListEntry* e;
1943
1944 r = ext_ensure_tag_table(reg);
1945 if (r != ONIG_NORMAL) return r;
1946
1947 ext = onig_get_regex_ext(reg);
1948 CHECK_NULL_RETURN_MEMERR(ext);
1949 r = callout_tag_entry_raw(env, ext->tag_table, name, name_end, entry_val);
1950
1951 e = onig_reg_callout_list_at(reg, (int )entry_val);
1952 CHECK_NULL_RETURN_MEMERR(e);
1953 e->tag_start = name;
1954 e->tag_end = name_end;
1955
1956 return r;
1957 }
1958
1959 #endif /* USE_CALLOUT */
1960
1961
1962 #define INIT_SCANENV_MEMENV_ALLOC_SIZE 16
1963
1964 static void
scan_env_clear(ScanEnv * env)1965 scan_env_clear(ScanEnv* env)
1966 {
1967 MEM_STATUS_CLEAR(env->cap_history);
1968 MEM_STATUS_CLEAR(env->backtrack_mem);
1969 MEM_STATUS_CLEAR(env->backrefed_mem);
1970 env->error = (UChar* )NULL;
1971 env->error_end = (UChar* )NULL;
1972 env->num_call = 0;
1973
1974 #ifdef USE_CALL
1975 env->unset_addr_list = NULL;
1976 env->has_call_zero = 0;
1977 #endif
1978
1979 env->num_mem = 0;
1980 env->num_named = 0;
1981 env->mem_alloc = 0;
1982 env->mem_env_dynamic = (MemEnv* )NULL;
1983
1984 xmemset(env->mem_env_static, 0, sizeof(env->mem_env_static));
1985
1986 env->parse_depth = 0;
1987 #ifdef ONIG_DEBUG_PARSE
1988 env->max_parse_depth = 0;
1989 #endif
1990 env->backref_num = 0;
1991 env->keep_num = 0;
1992 env->save_num = 0;
1993 env->save_alloc_num = 0;
1994 env->saves = 0;
1995 }
1996
1997 static int
scan_env_add_mem_entry(ScanEnv * env)1998 scan_env_add_mem_entry(ScanEnv* env)
1999 {
2000 int i, need, alloc;
2001 MemEnv* p;
2002
2003 need = env->num_mem + 1;
2004 if (need > MaxCaptureNum && MaxCaptureNum != 0)
2005 return ONIGERR_TOO_MANY_CAPTURES;
2006
2007 if (need >= SCANENV_MEMENV_SIZE) {
2008 if (env->mem_alloc <= need) {
2009 if (IS_NULL(env->mem_env_dynamic)) {
2010 alloc = INIT_SCANENV_MEMENV_ALLOC_SIZE;
2011 p = (MemEnv* )xmalloc(sizeof(MemEnv) * alloc);
2012 CHECK_NULL_RETURN_MEMERR(p);
2013 xmemcpy(p, env->mem_env_static, sizeof(env->mem_env_static));
2014 }
2015 else {
2016 alloc = env->mem_alloc * 2;
2017 p = (MemEnv* )xrealloc(env->mem_env_dynamic, sizeof(MemEnv) * alloc);
2018 CHECK_NULL_RETURN_MEMERR(p);
2019 }
2020
2021 for (i = env->num_mem + 1; i < alloc; i++) {
2022 p[i].mem_node = NULL_NODE;
2023 p[i].empty_repeat_node = NULL_NODE;
2024 }
2025
2026 env->mem_env_dynamic = p;
2027 env->mem_alloc = alloc;
2028 }
2029 }
2030
2031 env->num_mem++;
2032 return env->num_mem;
2033 }
2034
2035 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)2036 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
2037 {
2038 if (env->num_mem >= num)
2039 SCANENV_MEMENV(env)[num].mem_node = node;
2040 else
2041 return ONIGERR_PARSER_BUG;
2042 return 0;
2043 }
2044
2045 extern void
onig_node_free(Node * node)2046 onig_node_free(Node* node)
2047 {
2048 start:
2049 if (IS_NULL(node)) return ;
2050
2051 #ifdef DEBUG_NODE_FREE
2052 fprintf(stderr, "onig_node_free: %p\n", node);
2053 #endif
2054
2055 switch (NODE_TYPE(node)) {
2056 case NODE_STRING:
2057 if (STR_(node)->capacity != 0 &&
2058 IS_NOT_NULL(STR_(node)->s) && STR_(node)->s != STR_(node)->buf) {
2059 xfree(STR_(node)->s);
2060 }
2061 break;
2062
2063 case NODE_LIST:
2064 case NODE_ALT:
2065 onig_node_free(NODE_CAR(node));
2066 {
2067 Node* next_node = NODE_CDR(node);
2068
2069 xfree(node);
2070 node = next_node;
2071 goto start;
2072 }
2073 break;
2074
2075 case NODE_CCLASS:
2076 {
2077 CClassNode* cc = CCLASS_(node);
2078
2079 if (cc->mbuf)
2080 bbuf_free(cc->mbuf);
2081 }
2082 break;
2083
2084 case NODE_BACKREF:
2085 if (IS_NOT_NULL(BACKREF_(node)->back_dynamic))
2086 xfree(BACKREF_(node)->back_dynamic);
2087 break;
2088
2089 case NODE_BAG:
2090 if (NODE_BODY(node))
2091 onig_node_free(NODE_BODY(node));
2092
2093 {
2094 BagNode* en = BAG_(node);
2095 if (en->type == BAG_IF_ELSE) {
2096 onig_node_free(en->te.Then);
2097 onig_node_free(en->te.Else);
2098 }
2099 }
2100 break;
2101
2102 case NODE_QUANT:
2103 case NODE_ANCHOR:
2104 if (NODE_BODY(node))
2105 onig_node_free(NODE_BODY(node));
2106 break;
2107
2108 case NODE_CTYPE:
2109 case NODE_CALL:
2110 case NODE_GIMMICK:
2111 break;
2112 }
2113
2114 xfree(node);
2115 }
2116
2117 static void
cons_node_free_alone(Node * node)2118 cons_node_free_alone(Node* node)
2119 {
2120 NODE_CAR(node) = 0;
2121 NODE_CDR(node) = 0;
2122 onig_node_free(node);
2123 }
2124
2125 static Node*
node_new(void)2126 node_new(void)
2127 {
2128 Node* node;
2129
2130 node = (Node* )xmalloc(sizeof(Node));
2131 CHECK_NULL_RETURN(node);
2132 xmemset(node, 0, sizeof(*node));
2133
2134 #ifdef DEBUG_NODE_FREE
2135 fprintf(stderr, "node_new: %p\n", node);
2136 #endif
2137 return node;
2138 }
2139
2140
2141 static void
initialize_cclass(CClassNode * cc)2142 initialize_cclass(CClassNode* cc)
2143 {
2144 BITSET_CLEAR(cc->bs);
2145 cc->flags = 0;
2146 cc->mbuf = NULL;
2147 }
2148
2149 static Node*
node_new_cclass(void)2150 node_new_cclass(void)
2151 {
2152 Node* node = node_new();
2153 CHECK_NULL_RETURN(node);
2154
2155 NODE_SET_TYPE(node, NODE_CCLASS);
2156 initialize_cclass(CCLASS_(node));
2157 return node;
2158 }
2159
2160 static Node*
node_new_ctype(int type,int not,OnigOptionType options)2161 node_new_ctype(int type, int not, OnigOptionType options)
2162 {
2163 Node* node = node_new();
2164 CHECK_NULL_RETURN(node);
2165
2166 NODE_SET_TYPE(node, NODE_CTYPE);
2167 CTYPE_(node)->ctype = type;
2168 CTYPE_(node)->not = not;
2169 CTYPE_(node)->options = options;
2170 CTYPE_(node)->ascii_mode = IS_ASCII_MODE_CTYPE_OPTION(type, options);
2171 return node;
2172 }
2173
2174 static Node*
node_new_anychar(void)2175 node_new_anychar(void)
2176 {
2177 Node* node = node_new_ctype(CTYPE_ANYCHAR, FALSE, ONIG_OPTION_NONE);
2178 return node;
2179 }
2180
2181 static Node*
node_new_anychar_with_fixed_option(OnigOptionType option)2182 node_new_anychar_with_fixed_option(OnigOptionType option)
2183 {
2184 CtypeNode* ct;
2185 Node* node;
2186
2187 node = node_new_anychar();
2188 CHECK_NULL_RETURN(node);
2189
2190 ct = CTYPE_(node);
2191 ct->options = option;
2192 NODE_STATUS_ADD(node, FIXED_OPTION);
2193 return node;
2194 }
2195
2196 static int
node_new_no_newline(Node ** node,ScanEnv * env)2197 node_new_no_newline(Node** node, ScanEnv* env)
2198 {
2199 Node* n;
2200
2201 n = node_new_anychar_with_fixed_option(ONIG_OPTION_NONE);
2202 CHECK_NULL_RETURN_MEMERR(n);
2203 *node = n;
2204 return 0;
2205 }
2206
2207 static int
node_new_true_anychar(Node ** node,ScanEnv * env)2208 node_new_true_anychar(Node** node, ScanEnv* env)
2209 {
2210 Node* n;
2211
2212 n = node_new_anychar_with_fixed_option(ONIG_OPTION_MULTILINE);
2213 CHECK_NULL_RETURN_MEMERR(n);
2214 *node = n;
2215 return 0;
2216 }
2217
2218 static Node*
node_new_list(Node * left,Node * right)2219 node_new_list(Node* left, Node* right)
2220 {
2221 Node* node = node_new();
2222 CHECK_NULL_RETURN(node);
2223
2224 NODE_SET_TYPE(node, NODE_LIST);
2225 NODE_CAR(node) = left;
2226 NODE_CDR(node) = right;
2227 return node;
2228 }
2229
2230 extern Node*
onig_node_new_list(Node * left,Node * right)2231 onig_node_new_list(Node* left, Node* right)
2232 {
2233 return node_new_list(left, right);
2234 }
2235
2236 extern Node*
onig_node_new_alt(Node * left,Node * right)2237 onig_node_new_alt(Node* left, Node* right)
2238 {
2239 Node* node = node_new();
2240 CHECK_NULL_RETURN(node);
2241
2242 NODE_SET_TYPE(node, NODE_ALT);
2243 NODE_CAR(node) = left;
2244 NODE_CDR(node) = right;
2245 return node;
2246 }
2247
2248 static Node*
make_list_or_alt(NodeType type,int n,Node * ns[])2249 make_list_or_alt(NodeType type, int n, Node* ns[])
2250 {
2251 Node* r;
2252
2253 if (n <= 0) return NULL_NODE;
2254
2255 if (n == 1) {
2256 r = node_new();
2257 CHECK_NULL_RETURN(r);
2258 NODE_SET_TYPE(r, type);
2259 NODE_CAR(r) = ns[0];
2260 NODE_CDR(r) = NULL_NODE;
2261 }
2262 else {
2263 Node* right;
2264
2265 r = node_new();
2266 CHECK_NULL_RETURN(r);
2267
2268 right = make_list_or_alt(type, n - 1, ns + 1);
2269 if (IS_NULL(right)) {
2270 onig_node_free(r);
2271 return NULL_NODE;
2272 }
2273
2274 NODE_SET_TYPE(r, type);
2275 NODE_CAR(r) = ns[0];
2276 NODE_CDR(r) = right;
2277 }
2278
2279 return r;
2280 }
2281
2282 static Node*
make_list(int n,Node * ns[])2283 make_list(int n, Node* ns[])
2284 {
2285 return make_list_or_alt(NODE_LIST, n, ns);
2286 }
2287
2288 static Node*
make_alt(int n,Node * ns[])2289 make_alt(int n, Node* ns[])
2290 {
2291 return make_list_or_alt(NODE_ALT, n, ns);
2292 }
2293
2294 extern Node*
onig_node_new_anchor(int type,int ascii_mode)2295 onig_node_new_anchor(int type, int ascii_mode)
2296 {
2297 Node* node = node_new();
2298 CHECK_NULL_RETURN(node);
2299
2300 NODE_SET_TYPE(node, NODE_ANCHOR);
2301 ANCHOR_(node)->type = type;
2302 ANCHOR_(node)->char_len = -1;
2303 ANCHOR_(node)->ascii_mode = ascii_mode;
2304 return node;
2305 }
2306
2307 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)2308 node_new_backref(int back_num, int* backrefs, int by_name,
2309 #ifdef USE_BACKREF_WITH_LEVEL
2310 int exist_level, int nest_level,
2311 #endif
2312 ScanEnv* env)
2313 {
2314 int i;
2315 Node* node = node_new();
2316
2317 CHECK_NULL_RETURN(node);
2318
2319 NODE_SET_TYPE(node, NODE_BACKREF);
2320 BACKREF_(node)->back_num = back_num;
2321 BACKREF_(node)->back_dynamic = (int* )NULL;
2322 if (by_name != 0)
2323 NODE_STATUS_ADD(node, BY_NAME);
2324
2325 #ifdef USE_BACKREF_WITH_LEVEL
2326 if (exist_level != 0) {
2327 NODE_STATUS_ADD(node, NEST_LEVEL);
2328 BACKREF_(node)->nest_level = nest_level;
2329 }
2330 #endif
2331
2332 for (i = 0; i < back_num; i++) {
2333 if (backrefs[i] <= env->num_mem &&
2334 IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].mem_node)) {
2335 NODE_STATUS_ADD(node, RECURSION); /* /...(\1).../ */
2336 break;
2337 }
2338 }
2339
2340 if (back_num <= NODE_BACKREFS_SIZE) {
2341 for (i = 0; i < back_num; i++)
2342 BACKREF_(node)->back_static[i] = backrefs[i];
2343 }
2344 else {
2345 int* p = (int* )xmalloc(sizeof(int) * back_num);
2346 if (IS_NULL(p)) {
2347 onig_node_free(node);
2348 return NULL;
2349 }
2350 BACKREF_(node)->back_dynamic = p;
2351 for (i = 0; i < back_num; i++)
2352 p[i] = backrefs[i];
2353 }
2354
2355 env->backref_num++;
2356 return node;
2357 }
2358
2359 static Node*
node_new_backref_checker(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)2360 node_new_backref_checker(int back_num, int* backrefs, int by_name,
2361 #ifdef USE_BACKREF_WITH_LEVEL
2362 int exist_level, int nest_level,
2363 #endif
2364 ScanEnv* env)
2365 {
2366 Node* node;
2367
2368 node = node_new_backref(back_num, backrefs, by_name,
2369 #ifdef USE_BACKREF_WITH_LEVEL
2370 exist_level, nest_level,
2371 #endif
2372 env);
2373 CHECK_NULL_RETURN(node);
2374
2375 NODE_STATUS_ADD(node, CHECKER);
2376 return node;
2377 }
2378
#ifdef USE_CALL
/* Allocate a NODE_CALL node.  The name range [name, name_end) is stored by
   pointer (not copied), together with the group number and the by_number
   flag; entry_count starts at 1. */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum, int by_number)
{
  Node* call;

  call = node_new();
  CHECK_NULL_RETURN(call);

  NODE_SET_TYPE(call, NODE_CALL);
  CALL_(call)->name        = name;
  CALL_(call)->name_end    = name_end;
  CALL_(call)->group_num   = gnum;
  CALL_(call)->by_number   = by_number;
  CALL_(call)->entry_count = 1;
  return call;
}
#endif
2395
2396 static Node*
node_new_quantifier(int lower,int upper,int by_number)2397 node_new_quantifier(int lower, int upper, int by_number)
2398 {
2399 Node* node = node_new();
2400 CHECK_NULL_RETURN(node);
2401
2402 NODE_SET_TYPE(node, NODE_QUANT);
2403 QUANT_(node)->lower = lower;
2404 QUANT_(node)->upper = upper;
2405 QUANT_(node)->greedy = 1;
2406 QUANT_(node)->emptiness = BODY_IS_NOT_EMPTY;
2407 QUANT_(node)->head_exact = NULL_NODE;
2408 QUANT_(node)->next_head_exact = NULL_NODE;
2409 QUANT_(node)->include_referred = 0;
2410 if (by_number != 0)
2411 NODE_STATUS_ADD(node, BY_NUMBER);
2412
2413 return node;
2414 }
2415
2416 static Node*
node_new_bag(enum BagType type)2417 node_new_bag(enum BagType type)
2418 {
2419 Node* node = node_new();
2420 CHECK_NULL_RETURN(node);
2421
2422 NODE_SET_TYPE(node, NODE_BAG);
2423 BAG_(node)->type = type;
2424
2425 switch (type) {
2426 case BAG_MEMORY:
2427 BAG_(node)->m.regnum = 0;
2428 BAG_(node)->m.called_addr = -1;
2429 BAG_(node)->m.entry_count = 1;
2430 BAG_(node)->m.called_state = 0;
2431 break;
2432
2433 case BAG_OPTION:
2434 BAG_(node)->o.options = 0;
2435 break;
2436
2437 case BAG_STOP_BACKTRACK:
2438 break;
2439
2440 case BAG_IF_ELSE:
2441 BAG_(node)->te.Then = 0;
2442 BAG_(node)->te.Else = 0;
2443 break;
2444 }
2445
2446 BAG_(node)->opt_count = 0;
2447 return node;
2448 }
2449
2450 extern Node*
onig_node_new_bag(enum BagType type)2451 onig_node_new_bag(enum BagType type)
2452 {
2453 return node_new_bag(type);
2454 }
2455
2456 static Node*
node_new_bag_if_else(Node * cond,Node * Then,Node * Else)2457 node_new_bag_if_else(Node* cond, Node* Then, Node* Else)
2458 {
2459 Node* n;
2460 n = node_new_bag(BAG_IF_ELSE);
2461 CHECK_NULL_RETURN(n);
2462
2463 NODE_BODY(n) = cond;
2464 BAG_(n)->te.Then = Then;
2465 BAG_(n)->te.Else = Else;
2466 return n;
2467 }
2468
2469 static Node*
node_new_memory(int is_named)2470 node_new_memory(int is_named)
2471 {
2472 Node* node = node_new_bag(BAG_MEMORY);
2473 CHECK_NULL_RETURN(node);
2474 if (is_named != 0)
2475 NODE_STATUS_ADD(node, NAMED_GROUP);
2476
2477 return node;
2478 }
2479
2480 static Node*
node_new_option(OnigOptionType option)2481 node_new_option(OnigOptionType option)
2482 {
2483 Node* node = node_new_bag(BAG_OPTION);
2484 CHECK_NULL_RETURN(node);
2485 BAG_(node)->o.options = option;
2486 return node;
2487 }
2488
2489 static Node*
node_new_group(Node * content)2490 node_new_group(Node* content)
2491 {
2492 Node* node;
2493
2494 node = node_new();
2495 CHECK_NULL_RETURN(node);
2496 NODE_SET_TYPE(node, NODE_LIST);
2497 NODE_CAR(node) = content;
2498 NODE_CDR(node) = NULL_NODE;
2499
2500 return node;
2501 }
2502
2503 static Node*
node_drop_group(Node * group)2504 node_drop_group(Node* group)
2505 {
2506 Node* content;
2507
2508 content = NODE_CAR(group);
2509 NODE_CAR(group) = NULL_NODE;
2510 onig_node_free(group);
2511 return content;
2512 }
2513
2514 static int
node_new_fail(Node ** node,ScanEnv * env)2515 node_new_fail(Node** node, ScanEnv* env)
2516 {
2517 *node = node_new();
2518 CHECK_NULL_RETURN_MEMERR(*node);
2519
2520 NODE_SET_TYPE(*node, NODE_GIMMICK);
2521 GIMMICK_(*node)->type = GIMMICK_FAIL;
2522 return ONIG_NORMAL;
2523 }
2524
2525 static int
node_new_save_gimmick(Node ** node,enum SaveType save_type,ScanEnv * env)2526 node_new_save_gimmick(Node** node, enum SaveType save_type, ScanEnv* env)
2527 {
2528 int id;
2529 int r;
2530
2531 r = save_entry(env, save_type, &id);
2532 if (r != ONIG_NORMAL) return r;
2533
2534 *node = node_new();
2535 CHECK_NULL_RETURN_MEMERR(*node);
2536
2537 NODE_SET_TYPE(*node, NODE_GIMMICK);
2538 GIMMICK_(*node)->id = id;
2539 GIMMICK_(*node)->type = GIMMICK_SAVE;
2540 GIMMICK_(*node)->detail_type = (int )save_type;
2541
2542 return ONIG_NORMAL;
2543 }
2544
2545 static int
node_new_update_var_gimmick(Node ** node,enum UpdateVarType update_var_type,int id,ScanEnv * env)2546 node_new_update_var_gimmick(Node** node, enum UpdateVarType update_var_type,
2547 int id, ScanEnv* env)
2548 {
2549 *node = node_new();
2550 CHECK_NULL_RETURN_MEMERR(*node);
2551
2552 NODE_SET_TYPE(*node, NODE_GIMMICK);
2553 GIMMICK_(*node)->id = id;
2554 GIMMICK_(*node)->type = GIMMICK_UPDATE_VAR;
2555 GIMMICK_(*node)->detail_type = (int )update_var_type;
2556
2557 return ONIG_NORMAL;
2558 }
2559
2560 static int
node_new_keep(Node ** node,ScanEnv * env)2561 node_new_keep(Node** node, ScanEnv* env)
2562 {
2563 int r;
2564
2565 r = node_new_save_gimmick(node, SAVE_KEEP, env);
2566 if (r != 0) return r;
2567
2568 env->keep_num++;
2569 return ONIG_NORMAL;
2570 }
2571
2572 #ifdef USE_CALLOUT
2573
2574 extern void
onig_free_reg_callout_list(int n,CalloutListEntry * list)2575 onig_free_reg_callout_list(int n, CalloutListEntry* list)
2576 {
2577 int i;
2578 int j;
2579
2580 if (IS_NULL(list)) return ;
2581
2582 for (i = 0; i < n; i++) {
2583 if (list[i].of == ONIG_CALLOUT_OF_NAME) {
2584 for (j = 0; j < list[i].u.arg.passed_num; j++) {
2585 if (list[i].u.arg.types[j] == ONIG_TYPE_STRING) {
2586 if (IS_NOT_NULL(list[i].u.arg.vals[j].s.start))
2587 xfree(list[i].u.arg.vals[j].s.start);
2588 }
2589 }
2590 }
2591 else { /* ONIG_CALLOUT_OF_CONTENTS */
2592 if (IS_NOT_NULL(list[i].u.content.start)) {
2593 xfree((void* )list[i].u.content.start);
2594 }
2595 }
2596 }
2597
2598 xfree(list);
2599 }
2600
2601 extern CalloutListEntry*
onig_reg_callout_list_at(regex_t * reg,int num)2602 onig_reg_callout_list_at(regex_t* reg, int num)
2603 {
2604 RegexExt* ext = reg->extp;
2605 CHECK_NULL_RETURN(ext);
2606
2607 if (num <= 0 || num > ext->callout_num)
2608 return 0;
2609
2610 num--;
2611 return ext->callout_list + num;
2612 }
2613
2614 static int
reg_callout_list_entry(ScanEnv * env,int * rnum)2615 reg_callout_list_entry(ScanEnv* env, int* rnum)
2616 {
2617 #define INIT_CALLOUT_LIST_NUM 3
2618
2619 int num;
2620 CalloutListEntry* list;
2621 CalloutListEntry* e;
2622 RegexExt* ext;
2623
2624 ext = onig_get_regex_ext(env->reg);
2625 CHECK_NULL_RETURN_MEMERR(ext);
2626
2627 if (IS_NULL(ext->callout_list)) {
2628 list = (CalloutListEntry* )xmalloc(sizeof(*list) * INIT_CALLOUT_LIST_NUM);
2629 CHECK_NULL_RETURN_MEMERR(list);
2630
2631 ext->callout_list = list;
2632 ext->callout_list_alloc = INIT_CALLOUT_LIST_NUM;
2633 ext->callout_num = 0;
2634 }
2635
2636 num = ext->callout_num + 1;
2637 if (num > ext->callout_list_alloc) {
2638 int alloc = ext->callout_list_alloc * 2;
2639 list = (CalloutListEntry* )xrealloc(ext->callout_list,
2640 sizeof(CalloutListEntry) * alloc);
2641 CHECK_NULL_RETURN_MEMERR(list);
2642
2643 ext->callout_list = list;
2644 ext->callout_list_alloc = alloc;
2645 }
2646
2647 e = ext->callout_list + (num - 1);
2648
2649 e->flag = 0;
2650 e->of = 0;
2651 e->in = ONIG_CALLOUT_OF_CONTENTS;
2652 e->type = 0;
2653 e->tag_start = 0;
2654 e->tag_end = 0;
2655 e->start_func = 0;
2656 e->end_func = 0;
2657 e->u.arg.num = 0;
2658 e->u.arg.passed_num = 0;
2659
2660 ext->callout_num = num;
2661 *rnum = num;
2662 return ONIG_NORMAL;
2663 }
2664
2665 static int
node_new_callout(Node ** node,OnigCalloutOf callout_of,int num,int id,ScanEnv * env)2666 node_new_callout(Node** node, OnigCalloutOf callout_of, int num, int id,
2667 ScanEnv* env)
2668 {
2669 *node = node_new();
2670 CHECK_NULL_RETURN_MEMERR(*node);
2671
2672 NODE_SET_TYPE(*node, NODE_GIMMICK);
2673 GIMMICK_(*node)->id = id;
2674 GIMMICK_(*node)->num = num;
2675 GIMMICK_(*node)->type = GIMMICK_CALLOUT;
2676 GIMMICK_(*node)->detail_type = (int )callout_of;
2677
2678 return ONIG_NORMAL;
2679 }
2680 #endif
2681
2682 static int
make_text_segment(Node ** node,ScanEnv * env)2683 make_text_segment(Node** node, ScanEnv* env)
2684 {
2685 int r;
2686 int i;
2687 Node* x;
2688 Node* ns[2];
2689
2690 /* \X == (?>\O(?:\Y\O)*) */
2691
2692 ns[1] = NULL_NODE;
2693
2694 r = ONIGERR_MEMORY;
2695 ns[0] = onig_node_new_anchor(ANCR_NO_TEXT_SEGMENT_BOUNDARY, FALSE);
2696 if (IS_NULL(ns[0])) goto err;
2697
2698 r = node_new_true_anychar(&ns[1], env);
2699 if (r != 0) goto err1;
2700
2701 x = make_list(2, ns);
2702 if (IS_NULL(x)) goto err;
2703 ns[0] = x;
2704 ns[1] = NULL_NODE;
2705
2706 x = node_new_quantifier(0, INFINITE_REPEAT, TRUE);
2707 if (IS_NULL(x)) goto err;
2708
2709 NODE_BODY(x) = ns[0];
2710 ns[0] = NULL_NODE;
2711 ns[1] = x;
2712
2713 r = node_new_true_anychar(&ns[0], env);
2714 if (r != 0) goto err1;
2715
2716 x = make_list(2, ns);
2717 if (IS_NULL(x)) goto err;
2718
2719 ns[0] = x;
2720 ns[1] = NULL_NODE;
2721
2722 x = node_new_bag(BAG_STOP_BACKTRACK);
2723 if (IS_NULL(x)) goto err;
2724
2725 NODE_BODY(x) = ns[0];
2726
2727 *node = x;
2728 return ONIG_NORMAL;
2729
2730 err:
2731 r = ONIGERR_MEMORY;
2732 err1:
2733 for (i = 0; i < 2; i++) onig_node_free(ns[i]);
2734 return r;
2735 }
2736
2737 static int
make_absent_engine(Node ** node,int pre_save_right_id,Node * absent,Node * step_one,int lower,int upper,int possessive,int is_range_cutter,ScanEnv * env)2738 make_absent_engine(Node** node, int pre_save_right_id, Node* absent,
2739 Node* step_one, int lower, int upper, int possessive,
2740 int is_range_cutter, ScanEnv* env)
2741 {
2742 int r;
2743 int i;
2744 int id;
2745 Node* x;
2746 Node* ns[4];
2747
2748 for (i = 0; i < 4; i++) ns[i] = NULL_NODE;
2749
2750 ns[1] = absent;
2751 ns[3] = step_one; /* for err */
2752 r = node_new_save_gimmick(&ns[0], SAVE_S, env);
2753 if (r != 0) goto err;
2754
2755 id = GIMMICK_(ns[0])->id;
2756 r = node_new_update_var_gimmick(&ns[2], UPDATE_VAR_RIGHT_RANGE_FROM_S_STACK,
2757 id, env);
2758 if (r != 0) goto err;
2759
2760 r = node_new_fail(&ns[3], env);
2761 if (r != 0) goto err;
2762
2763 x = make_list(4, ns);
2764 if (IS_NULL(x)) goto err0;
2765
2766 ns[0] = x;
2767 ns[1] = step_one;
2768 ns[2] = ns[3] = NULL_NODE;
2769
2770 x = make_alt(2, ns);
2771 if (IS_NULL(x)) goto err0;
2772
2773 ns[0] = x;
2774
2775 x = node_new_quantifier(lower, upper, FALSE);
2776 if (IS_NULL(x)) goto err0;
2777
2778 NODE_BODY(x) = ns[0];
2779 ns[0] = x;
2780
2781 if (possessive != 0) {
2782 x = node_new_bag(BAG_STOP_BACKTRACK);
2783 if (IS_NULL(x)) goto err0;
2784
2785 NODE_BODY(x) = ns[0];
2786 ns[0] = x;
2787 }
2788
2789 r = node_new_update_var_gimmick(&ns[1], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
2790 pre_save_right_id, env);
2791 if (r != 0) goto err;
2792
2793 r = node_new_fail(&ns[2], env);
2794 if (r != 0) goto err;
2795
2796 x = make_list(2, ns + 1);
2797 if (IS_NULL(x)) goto err0;
2798
2799 ns[1] = x; ns[2] = NULL_NODE;
2800
2801 x = make_alt(2, ns);
2802 if (IS_NULL(x)) goto err0;
2803
2804 if (is_range_cutter != FALSE)
2805 NODE_STATUS_ADD(x, SUPER);
2806
2807 *node = x;
2808 return ONIG_NORMAL;
2809
2810 err0:
2811 r = ONIGERR_MEMORY;
2812 err:
2813 for (i = 0; i < 4; i++) onig_node_free(ns[i]);
2814 return r;
2815 }
2816
2817 static int
make_absent_tail(Node ** node1,Node ** node2,int pre_save_right_id,ScanEnv * env)2818 make_absent_tail(Node** node1, Node** node2, int pre_save_right_id,
2819 ScanEnv* env)
2820 {
2821 int r;
2822 int id;
2823 Node* save;
2824 Node* x;
2825 Node* ns[2];
2826
2827 *node1 = *node2 = NULL_NODE;
2828 save = ns[0] = ns[1] = NULL_NODE;
2829
2830 r = node_new_save_gimmick(&save, SAVE_RIGHT_RANGE, env);
2831 if (r != 0) goto err;
2832
2833 id = GIMMICK_(save)->id;
2834 r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
2835 id, env);
2836 if (r != 0) goto err;
2837
2838 r = node_new_fail(&ns[1], env);
2839 if (r != 0) goto err;
2840
2841 x = make_list(2, ns);
2842 if (IS_NULL(x)) goto err0;
2843
2844 ns[0] = NULL_NODE; ns[1] = x;
2845
2846 r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
2847 pre_save_right_id, env);
2848 if (r != 0) goto err;
2849
2850 x = make_alt(2, ns);
2851 if (IS_NULL(x)) goto err0;
2852
2853 *node1 = save;
2854 *node2 = x;
2855 return ONIG_NORMAL;
2856
2857 err0:
2858 r = ONIGERR_MEMORY;
2859 err:
2860 onig_node_free(save);
2861 onig_node_free(ns[0]);
2862 onig_node_free(ns[1]);
2863 return r;
2864 }
2865
2866 static int
make_range_clear(Node ** node,ScanEnv * env)2867 make_range_clear(Node** node, ScanEnv* env)
2868 {
2869 int r;
2870 int id;
2871 Node* save;
2872 Node* x;
2873 Node* ns[2];
2874
2875 *node = NULL_NODE;
2876 save = ns[0] = ns[1] = NULL_NODE;
2877
2878 r = node_new_save_gimmick(&save, SAVE_RIGHT_RANGE, env);
2879 if (r != 0) goto err;
2880
2881 id = GIMMICK_(save)->id;
2882 r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
2883 id, env);
2884 if (r != 0) goto err;
2885
2886 r = node_new_fail(&ns[1], env);
2887 if (r != 0) goto err;
2888
2889 x = make_list(2, ns);
2890 if (IS_NULL(x)) goto err0;
2891
2892 ns[0] = NULL_NODE; ns[1] = x;
2893
2894 #define ID_NOT_USED_DONT_CARE_ME 0
2895
2896 r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT,
2897 ID_NOT_USED_DONT_CARE_ME, env);
2898 if (r != 0) goto err;
2899
2900 x = make_alt(2, ns);
2901 if (IS_NULL(x)) goto err0;
2902
2903 NODE_STATUS_ADD(x, SUPER);
2904
2905 ns[0] = save;
2906 ns[1] = x;
2907 save = NULL_NODE;
2908 x = make_list(2, ns);
2909 if (IS_NULL(x)) goto err0;
2910
2911 *node = x;
2912 return ONIG_NORMAL;
2913
2914 err0:
2915 r = ONIGERR_MEMORY;
2916 err:
2917 onig_node_free(save);
2918 onig_node_free(ns[0]);
2919 onig_node_free(ns[1]);
2920 return r;
2921 }
2922
2923 static int
is_simple_one_char_repeat(Node * node,Node ** rquant,Node ** rbody,int * is_possessive,ScanEnv * env)2924 is_simple_one_char_repeat(Node* node, Node** rquant, Node** rbody,
2925 int* is_possessive, ScanEnv* env)
2926 {
2927 Node* quant;
2928 Node* body;
2929
2930 *rquant = *rbody = 0;
2931 *is_possessive = 0;
2932
2933 if (NODE_TYPE(node) == NODE_QUANT) {
2934 quant = node;
2935 }
2936 else {
2937 if (NODE_TYPE(node) == NODE_BAG) {
2938 BagNode* en = BAG_(node);
2939 if (en->type == BAG_STOP_BACKTRACK) {
2940 *is_possessive = 1;
2941 quant = NODE_BAG_BODY(en);
2942 if (NODE_TYPE(quant) != NODE_QUANT)
2943 return 0;
2944 }
2945 else
2946 return 0;
2947 }
2948 else
2949 return 0;
2950 }
2951
2952 if (QUANT_(quant)->greedy == 0)
2953 return 0;
2954
2955 body = NODE_BODY(quant);
2956 switch (NODE_TYPE(body)) {
2957 case NODE_STRING:
2958 {
2959 int len;
2960 StrNode* sn = STR_(body);
2961 UChar *s = sn->s;
2962
2963 len = 0;
2964 while (s < sn->end) {
2965 s += enclen(env->enc, s);
2966 len++;
2967 }
2968 if (len != 1)
2969 return 0;
2970 }
2971
2972 case NODE_CCLASS:
2973 break;
2974
2975 default:
2976 return 0;
2977 break;
2978 }
2979
2980 if (node != quant) {
2981 NODE_BODY(node) = 0;
2982 onig_node_free(node);
2983 }
2984 NODE_BODY(quant) = NULL_NODE;
2985 *rquant = quant;
2986 *rbody = body;
2987 return 1;
2988 }
2989
2990 static int
make_absent_tree_for_simple_one_char_repeat(Node ** node,Node * absent,Node * quant,Node * body,int possessive,ScanEnv * env)2991 make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* quant,
2992 Node* body, int possessive, ScanEnv* env)
2993 {
2994 int r;
2995 int i;
2996 int id1;
2997 int lower, upper;
2998 Node* x;
2999 Node* ns[4];
3000
3001 *node = NULL_NODE;
3002 r = ONIGERR_MEMORY;
3003 ns[0] = ns[1] = NULL_NODE;
3004 ns[2] = body, ns[3] = absent;
3005
3006 lower = QUANT_(quant)->lower;
3007 upper = QUANT_(quant)->upper;
3008 onig_node_free(quant);
3009
3010 r = node_new_save_gimmick(&ns[0], SAVE_RIGHT_RANGE, env);
3011 if (r != 0) goto err;
3012
3013 id1 = GIMMICK_(ns[0])->id;
3014
3015 r = make_absent_engine(&ns[1], id1, absent, body, lower, upper, possessive,
3016 FALSE, env);
3017 if (r != 0) goto err;
3018
3019 ns[2] = ns[3] = NULL_NODE;
3020
3021 r = node_new_update_var_gimmick(&ns[2], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
3022 id1, env);
3023 if (r != 0) goto err;
3024
3025 x = make_list(3, ns);
3026 if (IS_NULL(x)) goto err0;
3027
3028 *node = x;
3029 return ONIG_NORMAL;
3030
3031 err0:
3032 r = ONIGERR_MEMORY;
3033 err:
3034 for (i = 0; i < 4; i++) onig_node_free(ns[i]);
3035 return r;
3036 }
3037
3038 static int
make_absent_tree(Node ** node,Node * absent,Node * expr,int is_range_cutter,ScanEnv * env)3039 make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter,
3040 ScanEnv* env)
3041 {
3042 int r;
3043 int i;
3044 int id1, id2;
3045 int possessive;
3046 Node* x;
3047 Node* ns[7];
3048
3049 r = ONIGERR_MEMORY;
3050 for (i = 0; i < 7; i++) ns[i] = NULL_NODE;
3051 ns[4] = expr; ns[5] = absent;
3052
3053 if (is_range_cutter == 0) {
3054 Node* quant;
3055 Node* body;
3056
3057 if (expr == NULL_NODE) {
3058 /* default expr \O* */
3059 quant = node_new_quantifier(0, INFINITE_REPEAT, FALSE);
3060 if (IS_NULL(quant)) goto err0;
3061
3062 r = node_new_true_anychar(&body, env);
3063 if (r != 0) {
3064 onig_node_free(quant);
3065 goto err;
3066 }
3067 possessive = 0;
3068 goto simple;
3069 }
3070 else {
3071 if (is_simple_one_char_repeat(expr, &quant, &body, &possessive, env)) {
3072 simple:
3073 r = make_absent_tree_for_simple_one_char_repeat(node, absent, quant,
3074 body, possessive, env);
3075 if (r != 0) {
3076 ns[4] = NULL_NODE;
3077 onig_node_free(quant);
3078 onig_node_free(body);
3079 goto err;
3080 }
3081
3082 return ONIG_NORMAL;
3083 }
3084 }
3085 }
3086
3087 r = node_new_save_gimmick(&ns[0], SAVE_RIGHT_RANGE, env);
3088 if (r != 0) goto err;
3089
3090 id1 = GIMMICK_(ns[0])->id;
3091
3092 r = node_new_save_gimmick(&ns[1], SAVE_S, env);
3093 if (r != 0) goto err;
3094
3095 id2 = GIMMICK_(ns[1])->id;
3096
3097 r = node_new_true_anychar(&ns[3], env);
3098 if (r != 0) goto err;
3099
3100 possessive = 1;
3101 r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, INFINITE_REPEAT,
3102 possessive, is_range_cutter, env);
3103 if (r != 0) goto err;
3104
3105 ns[3] = NULL_NODE;
3106 ns[5] = NULL_NODE;
3107
3108 r = node_new_update_var_gimmick(&ns[3], UPDATE_VAR_S_FROM_STACK, id2, env);
3109 if (r != 0) goto err;
3110
3111 if (is_range_cutter != 0) {
3112 x = make_list(4, ns);
3113 if (IS_NULL(x)) goto err0;
3114 }
3115 else {
3116 r = make_absent_tail(&ns[5], &ns[6], id1, env);
3117 if (r != 0) goto err;
3118
3119 x = make_list(7, ns);
3120 if (IS_NULL(x)) goto err0;
3121 }
3122
3123 *node = x;
3124 return ONIG_NORMAL;
3125
3126 err0:
3127 r = ONIGERR_MEMORY;
3128 err:
3129 for (i = 0; i < 7; i++) onig_node_free(ns[i]);
3130 return r;
3131 }
3132
3133 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)3134 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
3135 {
3136 int addlen = (int )(end - s);
3137
3138 if (addlen > 0) {
3139 int len = (int )(STR_(node)->end - STR_(node)->s);
3140
3141 if (STR_(node)->capacity > 0 || (len + addlen > NODE_STRING_BUF_SIZE - 1)) {
3142 UChar* p;
3143 int capa = len + addlen + NODE_STRING_MARGIN;
3144
3145 if (capa <= STR_(node)->capacity) {
3146 onig_strcpy(STR_(node)->s + len, s, end);
3147 }
3148 else {
3149 if (STR_(node)->s == STR_(node)->buf)
3150 p = strcat_capa_from_static(STR_(node)->s, STR_(node)->end,
3151 s, end, capa);
3152 else
3153 p = strcat_capa(STR_(node)->s, STR_(node)->end, s, end, capa);
3154
3155 CHECK_NULL_RETURN_MEMERR(p);
3156 STR_(node)->s = p;
3157 STR_(node)->capacity = capa;
3158 }
3159 }
3160 else {
3161 onig_strcpy(STR_(node)->s + len, s, end);
3162 }
3163 STR_(node)->end = STR_(node)->s + len + addlen;
3164 }
3165
3166 return 0;
3167 }
3168
3169 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end)3170 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
3171 {
3172 onig_node_str_clear(node);
3173 return onig_node_str_cat(node, s, end);
3174 }
3175
3176 static int
node_str_cat_char(Node * node,UChar c)3177 node_str_cat_char(Node* node, UChar c)
3178 {
3179 UChar s[1];
3180
3181 s[0] = c;
3182 return onig_node_str_cat(node, s, s + 1);
3183 }
3184
3185 extern void
onig_node_str_clear(Node * node)3186 onig_node_str_clear(Node* node)
3187 {
3188 if (STR_(node)->capacity != 0 &&
3189 IS_NOT_NULL(STR_(node)->s) && STR_(node)->s != STR_(node)->buf) {
3190 xfree(STR_(node)->s);
3191 }
3192
3193 STR_(node)->flag = 0;
3194 STR_(node)->s = STR_(node)->buf;
3195 STR_(node)->end = STR_(node)->buf;
3196 STR_(node)->capacity = 0;
3197 STR_(node)->case_min_len = 0;
3198 }
3199
3200 static Node*
node_new_str(const UChar * s,const UChar * end)3201 node_new_str(const UChar* s, const UChar* end)
3202 {
3203 Node* node = node_new();
3204 CHECK_NULL_RETURN(node);
3205
3206 NODE_SET_TYPE(node, NODE_STRING);
3207 STR_(node)->flag = 0;
3208 STR_(node)->s = STR_(node)->buf;
3209 STR_(node)->end = STR_(node)->buf;
3210 STR_(node)->capacity = 0;
3211 STR_(node)->case_min_len = 0;
3212
3213 if (onig_node_str_cat(node, s, end)) {
3214 onig_node_free(node);
3215 return NULL;
3216 }
3217 return node;
3218 }
3219
3220 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)3221 onig_node_new_str(const UChar* s, const UChar* end)
3222 {
3223 return node_new_str(s, end);
3224 }
3225
3226 static Node*
node_new_str_crude(UChar * s,UChar * end)3227 node_new_str_crude(UChar* s, UChar* end)
3228 {
3229 Node* node = node_new_str(s, end);
3230 CHECK_NULL_RETURN(node);
3231 NODE_STRING_SET_CRUDE(node);
3232 return node;
3233 }
3234
3235 static Node*
node_new_empty(void)3236 node_new_empty(void)
3237 {
3238 return node_new_str(NULL, NULL);
3239 }
3240
3241 static Node*
node_new_str_crude_char(UChar c)3242 node_new_str_crude_char(UChar c)
3243 {
3244 int i;
3245 UChar p[1];
3246 Node* node;
3247
3248 p[0] = c;
3249 node = node_new_str_crude(p, p + 1);
3250
3251 /* clear buf tail */
3252 for (i = 1; i < NODE_STRING_BUF_SIZE; i++)
3253 STR_(node)->buf[i] = '\0';
3254
3255 return node;
3256 }
3257
3258 static Node*
str_node_split_last_char(Node * node,OnigEncoding enc)3259 str_node_split_last_char(Node* node, OnigEncoding enc)
3260 {
3261 const UChar *p;
3262 Node* rn;
3263 StrNode* sn;
3264
3265 sn = STR_(node);
3266 rn = NULL_NODE;
3267 if (sn->end > sn->s) {
3268 p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
3269 if (p && p > sn->s) { /* can be split. */
3270 rn = node_new_str(p, sn->end);
3271 CHECK_NULL_RETURN(rn);
3272 if (NODE_STRING_IS_CRUDE(node))
3273 NODE_STRING_SET_CRUDE(rn);
3274
3275 sn->end = (UChar* )p;
3276 }
3277 }
3278 return rn;
3279 }
3280
3281 static int
str_node_can_be_split(Node * node,OnigEncoding enc)3282 str_node_can_be_split(Node* node, OnigEncoding enc)
3283 {
3284 StrNode* sn = STR_(node);
3285 if (sn->end > sn->s) {
3286 return ((enclen(enc, sn->s) < sn->end - sn->s) ? 1 : 0);
3287 }
3288 return 0;
3289 }
3290
3291 static int
scan_number(UChar ** src,const UChar * end,OnigEncoding enc)3292 scan_number(UChar** src, const UChar* end, OnigEncoding enc)
3293 {
3294 int num, val;
3295 OnigCodePoint c;
3296 UChar* p = *src;
3297 PFETCH_READY;
3298
3299 num = 0;
3300 while (! PEND) {
3301 PFETCH(c);
3302 if (IS_CODE_DIGIT_ASCII(enc, c)) {
3303 val = (int )DIGITVAL(c);
3304 if ((INT_MAX - val) / 10 < num)
3305 return -1; /* overflow */
3306
3307 num = num * 10 + val;
3308 }
3309 else {
3310 PUNFETCH;
3311 break;
3312 }
3313 }
3314 *src = p;
3315 return num;
3316 }
3317
3318 static int
scan_hexadecimal_number(UChar ** src,UChar * end,int minlen,int maxlen,OnigEncoding enc,OnigCodePoint * rcode)3319 scan_hexadecimal_number(UChar** src, UChar* end, int minlen, int maxlen,
3320 OnigEncoding enc, OnigCodePoint* rcode)
3321 {
3322 OnigCodePoint code;
3323 OnigCodePoint c;
3324 unsigned int val;
3325 int n;
3326 UChar* p = *src;
3327 PFETCH_READY;
3328
3329 code = 0;
3330 n = 0;
3331 while (! PEND && n < maxlen) {
3332 PFETCH(c);
3333 if (IS_CODE_XDIGIT_ASCII(enc, c)) {
3334 n++;
3335 val = (unsigned int )XDIGITVAL(enc, c);
3336 if ((UINT_MAX - val) / 16UL < code)
3337 return ONIGERR_TOO_BIG_NUMBER; /* overflow */
3338
3339 code = (code << 4) + val;
3340 }
3341 else {
3342 PUNFETCH;
3343 break;
3344 }
3345 }
3346
3347 if (n < minlen)
3348 return ONIGERR_INVALID_CODE_POINT_VALUE;
3349
3350 *rcode = code;
3351 *src = p;
3352 return ONIG_NORMAL;
3353 }
3354
3355 static int
scan_octal_number(UChar ** src,UChar * end,int minlen,int maxlen,OnigEncoding enc,OnigCodePoint * rcode)3356 scan_octal_number(UChar** src, UChar* end, int minlen, int maxlen,
3357 OnigEncoding enc, OnigCodePoint* rcode)
3358 {
3359 OnigCodePoint code;
3360 OnigCodePoint c;
3361 unsigned int val;
3362 int n;
3363 UChar* p = *src;
3364 PFETCH_READY;
3365
3366 code = 0;
3367 n = 0;
3368 while (! PEND && n < maxlen) {
3369 PFETCH(c);
3370 if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8') {
3371 n++;
3372 val = (unsigned int )ODIGITVAL(c);
3373 if ((UINT_MAX - val) / 8UL < code)
3374 return ONIGERR_TOO_BIG_NUMBER; /* overflow */
3375
3376 code = (code << 3) + val;
3377 }
3378 else {
3379 PUNFETCH;
3380 break;
3381 }
3382 }
3383
3384 if (n < minlen)
3385 return ONIGERR_INVALID_CODE_POINT_VALUE;
3386
3387 *rcode = code;
3388 *src = p;
3389 return ONIG_NORMAL;
3390 }
3391
3392
/* Write the SIZE_CODE_POINT bytes of the lvalue `code` into bbuf at byte
   offset pos (the address of `code` is taken, so it must be an lvalue). */
#define BB_WRITE_CODE_POINT(bbuf,pos,code) \
    BB_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
3395
3396 /* data format:
3397 [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
3398 (all data size is OnigCodePoint)
3399 */
3400 static int
new_code_range(BBuf ** pbuf)3401 new_code_range(BBuf** pbuf)
3402 {
3403 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
3404 int r;
3405 OnigCodePoint n;
3406 BBuf* bbuf;
3407
3408 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
3409 CHECK_NULL_RETURN_MEMERR(bbuf);
3410 r = BB_INIT(bbuf, INIT_MULTI_BYTE_RANGE_SIZE);
3411 if (r != 0) {
3412 xfree(bbuf);
3413 *pbuf = 0;
3414 return r;
3415 }
3416
3417 n = 0;
3418 BB_WRITE_CODE_POINT(bbuf, 0, n);
3419 return 0;
3420 }
3421
3422 static int
add_code_range_to_buf(BBuf ** pbuf,OnigCodePoint from,OnigCodePoint to)3423 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
3424 {
3425 int r, inc_n, pos;
3426 int low, high, bound, x;
3427 OnigCodePoint n, *data;
3428 BBuf* bbuf;
3429
3430 if (from > to) {
3431 n = from; from = to; to = n;
3432 }
3433
3434 if (IS_NULL(*pbuf)) {
3435 r = new_code_range(pbuf);
3436 if (r != 0) return r;
3437 bbuf = *pbuf;
3438 n = 0;
3439 }
3440 else {
3441 bbuf = *pbuf;
3442 GET_CODE_POINT(n, bbuf->p);
3443 }
3444 data = (OnigCodePoint* )(bbuf->p);
3445 data++;
3446
3447 for (low = 0, bound = n; low < bound; ) {
3448 x = (low + bound) >> 1;
3449 if (from > data[x*2 + 1])
3450 low = x + 1;
3451 else
3452 bound = x;
3453 }
3454
3455 high = (to == ~((OnigCodePoint )0)) ? n : low;
3456 for (bound = n; high < bound; ) {
3457 x = (high + bound) >> 1;
3458 if (to + 1 >= data[x*2])
3459 high = x + 1;
3460 else
3461 bound = x;
3462 }
3463
3464 inc_n = low + 1 - high;
3465 if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
3466 return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
3467
3468 if (inc_n != 1) {
3469 if (from > data[low*2])
3470 from = data[low*2];
3471 if (to < data[(high - 1)*2 + 1])
3472 to = data[(high - 1)*2 + 1];
3473 }
3474
3475 if (inc_n != 0 && (OnigCodePoint )high < n) {
3476 int from_pos = SIZE_CODE_POINT * (1 + high * 2);
3477 int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
3478 int size = (n - high) * 2 * SIZE_CODE_POINT;
3479
3480 if (inc_n > 0) {
3481 BB_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
3482 }
3483 else {
3484 BB_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
3485 }
3486 }
3487
3488 pos = SIZE_CODE_POINT * (1 + low * 2);
3489 BB_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
3490 BB_WRITE_CODE_POINT(bbuf, pos, from);
3491 BB_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
3492 n += inc_n;
3493 BB_WRITE_CODE_POINT(bbuf, 0, n);
3494
3495 return 0;
3496 }
3497
3498 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)3499 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
3500 {
3501 if (from > to) {
3502 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
3503 return 0;
3504 else
3505 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
3506 }
3507
3508 return add_code_range_to_buf(pbuf, from, to);
3509 }
3510
3511 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)3512 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
3513 {
3514 int r, i, n;
3515 OnigCodePoint pre, from, *data, to = 0;
3516
3517 *pbuf = (BBuf* )NULL;
3518 if (IS_NULL(bbuf)) {
3519 set_all:
3520 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
3521 }
3522
3523 data = (OnigCodePoint* )(bbuf->p);
3524 GET_CODE_POINT(n, data);
3525 data++;
3526 if (n <= 0) goto set_all;
3527
3528 r = 0;
3529 pre = MBCODE_START_POS(enc);
3530 for (i = 0; i < n; i++) {
3531 from = data[i*2];
3532 to = data[i*2+1];
3533 if (pre <= from - 1) {
3534 r = add_code_range_to_buf(pbuf, pre, from - 1);
3535 if (r != 0) return r;
3536 }
3537 if (to == ~((OnigCodePoint )0)) break;
3538 pre = to + 1;
3539 }
3540 if (to < ~((OnigCodePoint )0)) {
3541 r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
3542 }
3543 return r;
3544 }
3545
/* Exchange two (range buffer, negation flag) pairs in place.  All four
   arguments are assigned to, so they must be lvalues. */
#define SWAP_BB_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *swap_buf; \
  int   swap_not; \
  swap_buf = bbuf1; bbuf1 = bbuf2; bbuf2 = swap_buf; \
  swap_not = not1;  not1  = not2;  not2  = swap_not; \
} while (0)
3552
3553 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)3554 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
3555 BBuf* bbuf2, int not2, BBuf** pbuf)
3556 {
3557 int r;
3558 OnigCodePoint i, n1, *data1;
3559 OnigCodePoint from, to;
3560
3561 *pbuf = (BBuf* )NULL;
3562 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
3563 if (not1 != 0 || not2 != 0)
3564 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
3565 return 0;
3566 }
3567
3568 r = 0;
3569 if (IS_NULL(bbuf2))
3570 SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);
3571
3572 if (IS_NULL(bbuf1)) {
3573 if (not1 != 0) {
3574 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
3575 }
3576 else {
3577 if (not2 == 0) {
3578 return bbuf_clone(pbuf, bbuf2);
3579 }
3580 else {
3581 return not_code_range_buf(enc, bbuf2, pbuf);
3582 }
3583 }
3584 }
3585
3586 if (not1 != 0)
3587 SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);
3588
3589 data1 = (OnigCodePoint* )(bbuf1->p);
3590 GET_CODE_POINT(n1, data1);
3591 data1++;
3592
3593 if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
3594 r = bbuf_clone(pbuf, bbuf2);
3595 }
3596 else if (not1 == 0) { /* 1 OR (not 2) */
3597 r = not_code_range_buf(enc, bbuf2, pbuf);
3598 }
3599 if (r != 0) return r;
3600
3601 for (i = 0; i < n1; i++) {
3602 from = data1[i*2];
3603 to = data1[i*2+1];
3604 r = add_code_range_to_buf(pbuf, from, to);
3605 if (r != 0) return r;
3606 }
3607 return 0;
3608 }
3609
3610 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)3611 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
3612 OnigCodePoint* data, int n)
3613 {
3614 int i, r;
3615 OnigCodePoint from2, to2;
3616
3617 for (i = 0; i < n; i++) {
3618 from2 = data[i*2];
3619 to2 = data[i*2+1];
3620 if (from2 < from1) {
3621 if (to2 < from1) continue;
3622 else {
3623 from1 = to2 + 1;
3624 }
3625 }
3626 else if (from2 <= to1) {
3627 if (to2 < to1) {
3628 if (from1 <= from2 - 1) {
3629 r = add_code_range_to_buf(pbuf, from1, from2-1);
3630 if (r != 0) return r;
3631 }
3632 from1 = to2 + 1;
3633 }
3634 else {
3635 to1 = from2 - 1;
3636 }
3637 }
3638 else {
3639 from1 = from2;
3640 }
3641 if (from1 > to1) break;
3642 }
3643 if (from1 <= to1) {
3644 r = add_code_range_to_buf(pbuf, from1, to1);
3645 if (r != 0) return r;
3646 }
3647 return 0;
3648 }
3649
3650 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)3651 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
3652 {
3653 int r;
3654 OnigCodePoint i, j, n1, n2, *data1, *data2;
3655 OnigCodePoint from, to, from1, to1, from2, to2;
3656
3657 *pbuf = (BBuf* )NULL;
3658 if (IS_NULL(bbuf1)) {
3659 if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
3660 return bbuf_clone(pbuf, bbuf2);
3661 return 0;
3662 }
3663 else if (IS_NULL(bbuf2)) {
3664 if (not2 != 0)
3665 return bbuf_clone(pbuf, bbuf1);
3666 return 0;
3667 }
3668
3669 if (not1 != 0)
3670 SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);
3671
3672 data1 = (OnigCodePoint* )(bbuf1->p);
3673 data2 = (OnigCodePoint* )(bbuf2->p);
3674 GET_CODE_POINT(n1, data1);
3675 GET_CODE_POINT(n2, data2);
3676 data1++;
3677 data2++;
3678
3679 if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
3680 for (i = 0; i < n1; i++) {
3681 from1 = data1[i*2];
3682 to1 = data1[i*2+1];
3683 for (j = 0; j < n2; j++) {
3684 from2 = data2[j*2];
3685 to2 = data2[j*2+1];
3686 if (from2 > to1) break;
3687 if (to2 < from1) continue;
3688 from = MAX(from1, from2);
3689 to = MIN(to1, to2);
3690 r = add_code_range_to_buf(pbuf, from, to);
3691 if (r != 0) return r;
3692 }
3693 }
3694 }
3695 else if (not1 == 0) { /* 1 AND (not 2) */
3696 for (i = 0; i < n1; i++) {
3697 from1 = data1[i*2];
3698 to1 = data1[i*2+1];
3699 r = and_code_range1(pbuf, from1, to1, data2, n2);
3700 if (r != 0) return r;
3701 }
3702 }
3703
3704 return 0;
3705 }
3706
3707 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)3708 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
3709 {
3710 int r, not1, not2;
3711 BBuf *buf1, *buf2, *pbuf;
3712 BitSetRef bsr1, bsr2;
3713 BitSet bs1, bs2;
3714
3715 not1 = IS_NCCLASS_NOT(dest);
3716 bsr1 = dest->bs;
3717 buf1 = dest->mbuf;
3718 not2 = IS_NCCLASS_NOT(cc);
3719 bsr2 = cc->bs;
3720 buf2 = cc->mbuf;
3721
3722 if (not1 != 0) {
3723 bitset_invert_to(bsr1, bs1);
3724 bsr1 = bs1;
3725 }
3726 if (not2 != 0) {
3727 bitset_invert_to(bsr2, bs2);
3728 bsr2 = bs2;
3729 }
3730 bitset_and(bsr1, bsr2);
3731 if (bsr1 != dest->bs) {
3732 bitset_copy(dest->bs, bsr1);
3733 }
3734 if (not1 != 0) {
3735 bitset_invert(dest->bs);
3736 }
3737
3738 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
3739 if (not1 != 0 && not2 != 0) {
3740 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
3741 }
3742 else {
3743 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
3744 if (r == 0 && not1 != 0) {
3745 BBuf *tbuf;
3746 r = not_code_range_buf(enc, pbuf, &tbuf);
3747 if (r != 0) {
3748 bbuf_free(pbuf);
3749 return r;
3750 }
3751 bbuf_free(pbuf);
3752 pbuf = tbuf;
3753 }
3754 }
3755 if (r != 0) return r;
3756
3757 dest->mbuf = pbuf;
3758 bbuf_free(buf1);
3759 return r;
3760 }
3761 return 0;
3762 }
3763
3764 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)3765 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
3766 {
3767 int r, not1, not2;
3768 BBuf *buf1, *buf2, *pbuf;
3769 BitSetRef bsr1, bsr2;
3770 BitSet bs1, bs2;
3771
3772 not1 = IS_NCCLASS_NOT(dest);
3773 bsr1 = dest->bs;
3774 buf1 = dest->mbuf;
3775 not2 = IS_NCCLASS_NOT(cc);
3776 bsr2 = cc->bs;
3777 buf2 = cc->mbuf;
3778
3779 if (not1 != 0) {
3780 bitset_invert_to(bsr1, bs1);
3781 bsr1 = bs1;
3782 }
3783 if (not2 != 0) {
3784 bitset_invert_to(bsr2, bs2);
3785 bsr2 = bs2;
3786 }
3787 bitset_or(bsr1, bsr2);
3788 if (bsr1 != dest->bs) {
3789 bitset_copy(dest->bs, bsr1);
3790 }
3791 if (not1 != 0) {
3792 bitset_invert(dest->bs);
3793 }
3794
3795 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
3796 if (not1 != 0 && not2 != 0) {
3797 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
3798 }
3799 else {
3800 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
3801 if (r == 0 && not1 != 0) {
3802 BBuf *tbuf;
3803 r = not_code_range_buf(enc, pbuf, &tbuf);
3804 if (r != 0) {
3805 bbuf_free(pbuf);
3806 return r;
3807 }
3808 bbuf_free(pbuf);
3809 pbuf = tbuf;
3810 }
3811 }
3812 if (r != 0) return r;
3813
3814 dest->mbuf = pbuf;
3815 bbuf_free(buf1);
3816 return r;
3817 }
3818 else
3819 return 0;
3820 }
3821
3822 static OnigCodePoint
conv_backslash_value(OnigCodePoint c,ScanEnv * env)3823 conv_backslash_value(OnigCodePoint c, ScanEnv* env)
3824 {
3825 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
3826 switch (c) {
3827 case 'n': return '\n';
3828 case 't': return '\t';
3829 case 'r': return '\r';
3830 case 'f': return '\f';
3831 case 'a': return '\007';
3832 case 'b': return '\010';
3833 case 'e': return '\033';
3834 case 'v':
3835 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
3836 return '\v';
3837 break;
3838
3839 default:
3840 break;
3841 }
3842 }
3843 return c;
3844 }
3845
3846 static int
is_invalid_quantifier_target(Node * node)3847 is_invalid_quantifier_target(Node* node)
3848 {
3849 switch (NODE_TYPE(node)) {
3850 case NODE_ANCHOR:
3851 case NODE_GIMMICK:
3852 return 1;
3853 break;
3854
3855 case NODE_BAG:
3856 /* allow enclosed elements */
3857 /* return is_invalid_quantifier_target(NODE_BODY(node)); */
3858 break;
3859
3860 case NODE_LIST:
3861 do {
3862 if (! is_invalid_quantifier_target(NODE_CAR(node))) return 0;
3863 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3864 return 0;
3865 break;
3866
3867 case NODE_ALT:
3868 do {
3869 if (is_invalid_quantifier_target(NODE_CAR(node))) return 1;
3870 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3871 break;
3872
3873 default:
3874 break;
3875 }
3876 return 0;
3877 }
3878
3879 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
3880 static int
quantifier_type_num(QuantNode * q)3881 quantifier_type_num(QuantNode* q)
3882 {
3883 if (q->greedy) {
3884 if (q->lower == 0) {
3885 if (q->upper == 1) return 0;
3886 else if (IS_INFINITE_REPEAT(q->upper)) return 1;
3887 }
3888 else if (q->lower == 1) {
3889 if (IS_INFINITE_REPEAT(q->upper)) return 2;
3890 }
3891 }
3892 else {
3893 if (q->lower == 0) {
3894 if (q->upper == 1) return 3;
3895 else if (IS_INFINITE_REPEAT(q->upper)) return 4;
3896 }
3897 else if (q->lower == 1) {
3898 if (IS_INFINITE_REPEAT(q->upper)) return 5;
3899 }
3900 }
3901 return -1;
3902 }
3903
3904
/*
 * Actions stored in ReduceTypeTable and applied by
 * onig_reduce_nested_quantifier() to a quantifier whose body is itself a
 * quantifier.
 */
enum ReduceType {
  RQ_ASIS = 0, /* as is: leave both quantifiers unchanged */
  RQ_DEL  = 1, /* delete parent: the child quantifier replaces it */
  RQ_A,        /* to '*'    */
  RQ_AQ,       /* to '*?'   */
  RQ_QQ,       /* to '??'   */
  RQ_P_QQ,     /* to '+)??' : child becomes '+', parent becomes '??' */
  RQ_PQ_Q      /* to '+?)?' : child becomes '+?', parent becomes '?' */
};
3914
/*
 * Reduction to apply for a nested quantifier pair.
 * Indexed as ReduceTypeTable[child][parent], where both indexes are values
 * returned by quantifier_type_num() (?:0, *:1, +:2, ??:3, *?:4, +?:5).
 * The comment at the end of each row names the child quantifier.
 */
static enum ReduceType ReduceTypeTable[6][6] = {
  {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
  {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
  {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
  {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
};
3923
3924 extern int
onig_reduce_nested_quantifier(Node * pnode)3925 onig_reduce_nested_quantifier(Node* pnode)
3926 {
3927 int pnum, cnum;
3928 QuantNode *p, *c;
3929 Node* cnode;
3930
3931 cnode = NODE_BODY(pnode);
3932
3933 p = QUANT_(pnode);
3934 c = QUANT_(cnode);
3935 pnum = quantifier_type_num(p);
3936 cnum = quantifier_type_num(c);
3937 if (pnum < 0 || cnum < 0) {
3938 if (p->lower == p->upper && c->lower == c->upper) {
3939 int n = onig_positive_int_multiply(p->lower, c->lower);
3940 if (n < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
3941
3942 p->lower = p->upper = n;
3943 NODE_BODY(pnode) = NODE_BODY(cnode);
3944 goto remove_cnode;
3945 }
3946
3947 return 0;
3948 }
3949
3950 switch(ReduceTypeTable[cnum][pnum]) {
3951 case RQ_DEL:
3952 *pnode = *cnode;
3953 goto remove_cnode;
3954 break;
3955 case RQ_A:
3956 NODE_BODY(pnode) = NODE_BODY(cnode);
3957 p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 1;
3958 goto remove_cnode;
3959 break;
3960 case RQ_AQ:
3961 NODE_BODY(pnode) = NODE_BODY(cnode);
3962 p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 0;
3963 goto remove_cnode;
3964 break;
3965 case RQ_QQ:
3966 NODE_BODY(pnode) = NODE_BODY(cnode);
3967 p->lower = 0; p->upper = 1; p->greedy = 0;
3968 goto remove_cnode;
3969 break;
3970 case RQ_P_QQ:
3971 p->lower = 0; p->upper = 1; p->greedy = 0;
3972 c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 1;
3973 break;
3974 case RQ_PQ_Q:
3975 p->lower = 0; p->upper = 1; p->greedy = 1;
3976 c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 0;
3977 break;
3978 case RQ_ASIS:
3979 break;
3980 }
3981
3982 return 0;
3983
3984 remove_cnode:
3985 NODE_BODY(cnode) = NULL_NODE;
3986 onig_node_free(cnode);
3987 return 0;
3988 }
3989
3990 static int
node_new_general_newline(Node ** node,ScanEnv * env)3991 node_new_general_newline(Node** node, ScanEnv* env)
3992 {
3993 int r;
3994 int dlen, alen;
3995 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
3996 Node* crnl;
3997 Node* ncc;
3998 Node* x;
3999 CClassNode* cc;
4000
4001 dlen = ONIGENC_CODE_TO_MBC(env->enc, 0x0d, buf);
4002 if (dlen < 0) return dlen;
4003 alen = ONIGENC_CODE_TO_MBC(env->enc, 0x0a, buf + dlen);
4004 if (alen < 0) return alen;
4005
4006 crnl = node_new_str_crude(buf, buf + dlen + alen);
4007 CHECK_NULL_RETURN_MEMERR(crnl);
4008
4009 ncc = node_new_cclass();
4010 if (IS_NULL(ncc)) goto err2;
4011
4012 cc = CCLASS_(ncc);
4013 if (dlen == 1) {
4014 bitset_set_range(cc->bs, 0x0a, 0x0d);
4015 }
4016 else {
4017 r = add_code_range(&(cc->mbuf), env, 0x0a, 0x0d);
4018 if (r != 0) {
4019 err1:
4020 onig_node_free(ncc);
4021 err2:
4022 onig_node_free(crnl);
4023 return ONIGERR_MEMORY;
4024 }
4025 }
4026
4027 if (ONIGENC_IS_UNICODE_ENCODING(env->enc)) {
4028 r = add_code_range(&(cc->mbuf), env, 0x85, 0x85);
4029 if (r != 0) goto err1;
4030 r = add_code_range(&(cc->mbuf), env, 0x2028, 0x2029);
4031 if (r != 0) goto err1;
4032 }
4033
4034 x = node_new_bag_if_else(crnl, NULL_NODE, ncc);
4035 if (IS_NULL(x)) goto err1;
4036
4037 *node = x;
4038 return 0;
4039 }
4040
/*
 * Token kinds produced by the pattern tokenizers (fetch_token() and
 * fetch_token_in_cc()) and stored in PToken.type.
 */
enum TokenSyms {
  TK_EOT      = 0,   /* end of token */
  TK_CRUDE_BYTE = 1,
  TK_CHAR,
  TK_STRING,
  TK_CODE_POINT,
  TK_ANYCHAR,
  TK_CHAR_TYPE,
  TK_BACKREF,
  TK_CALL,
  TK_ANCHOR,
  TK_REPEAT,
  TK_INTERVAL,
  TK_ANYCHAR_ANYTIME,  /* SQL '%' == .* */
  TK_ALT,
  TK_SUBEXP_OPEN,
  TK_SUBEXP_CLOSE,
  TK_OPEN_CC,
  TK_QUOTE_OPEN,
  TK_CHAR_PROPERTY,    /* \p{...}, \P{...} */
  TK_KEEP,             /* \K */
  TK_GENERAL_NEWLINE,  /* \R */
  TK_NO_NEWLINE,       /* \N */
  TK_TRUE_ANYCHAR,     /* \O */
  TK_TEXT_SEGMENT,     /* \X */

  /* in cc (tokens only produced inside a character class) */
  TK_CC_CLOSE,
  TK_CC_RANGE,
  TK_CC_POSIX_BRACKET_OPEN,
  TK_CC_AND,           /* && */
  TK_CC_OPEN_CC        /* [ */
};
4074
/*
 * One token of the pattern.  'type' selects which member of the union 'u'
 * carries the payload.
 */
typedef struct {
  enum TokenSyms type;
  int escaped;     /* set to 1 by the tokenizers when the token came from an escape sequence */
  int base;        /* is number: 8, 16 (used in [....]) */
  UChar* backp;    /* position in the pattern saved by the tokenizers */
  union {
    UChar* s;
    UChar byte;          /* TK_CRUDE_BYTE */
    OnigCodePoint code;  /* TK_CHAR, TK_CODE_POINT */
    int   anchor;
    int   subtype;
    struct {             /* TK_REPEAT, TK_INTERVAL */
      int lower;
      int upper;
      int greedy;
      int possessive;
    } repeat;
    struct {
      int  num;
      int  ref1;
      int* refs;
      int  by_name;
#ifdef USE_BACKREF_WITH_LEVEL
      int  exist_level;
      int  level;   /* \k<name+n> */
#endif
    } backref;
    struct {
      UChar* name;
      UChar* name_end;
      int    gnum;
      int    by_number;
    } call;
    struct {             /* TK_CHAR_TYPE, TK_CHAR_PROPERTY */
      int ctype;
      int not;           /* non-zero for the negated form (\W, \D, \P{..}, ...) */
    } prop;
  } u;
} PToken;
4114
4115
4116 static int
fetch_interval(UChar ** src,UChar * end,PToken * tok,ScanEnv * env)4117 fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env)
4118 {
4119 int low, up, syn_allow, non_low = 0;
4120 int r = 0;
4121 OnigCodePoint c;
4122 OnigEncoding enc = env->enc;
4123 UChar* p = *src;
4124 PFETCH_READY;
4125
4126 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
4127
4128 if (PEND) {
4129 if (syn_allow)
4130 return 1; /* "....{" : OK! */
4131 else
4132 return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */
4133 }
4134
4135 if (! syn_allow) {
4136 c = PPEEK;
4137 if (c == ')' || c == '(' || c == '|') {
4138 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
4139 }
4140 }
4141
4142 low = scan_number(&p, end, env->enc);
4143 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
4144 if (low > ONIG_MAX_REPEAT_NUM)
4145 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
4146
4147 if (p == *src) { /* can't read low */
4148 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
4149 /* allow {,n} as {0,n} */
4150 low = 0;
4151 non_low = 1;
4152 }
4153 else
4154 goto invalid;
4155 }
4156
4157 if (PEND) goto invalid;
4158 PFETCH(c);
4159 if (c == ',') {
4160 UChar* prev = p;
4161 up = scan_number(&p, end, env->enc);
4162 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
4163 if (up > ONIG_MAX_REPEAT_NUM)
4164 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
4165
4166 if (p == prev) {
4167 if (non_low != 0)
4168 goto invalid;
4169 up = INFINITE_REPEAT; /* {n,} : {n,infinite} */
4170 }
4171 }
4172 else {
4173 if (non_low != 0)
4174 goto invalid;
4175
4176 PUNFETCH;
4177 up = low; /* {n} : exact n times */
4178 r = 2; /* fixed */
4179 }
4180
4181 if (PEND) goto invalid;
4182 PFETCH(c);
4183 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
4184 if (c != MC_ESC(env->syntax) || PEND) goto invalid;
4185 PFETCH(c);
4186 }
4187 if (c != '}') goto invalid;
4188
4189 if (!IS_INFINITE_REPEAT(up) && low > up) {
4190 /* {n,m}+ supported case */
4191 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL))
4192 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
4193
4194 tok->u.repeat.possessive = 1;
4195 {
4196 int tmp;
4197 tmp = low; low = up; up = tmp;
4198 }
4199 }
4200 else
4201 tok->u.repeat.possessive = 0;
4202
4203 tok->type = TK_INTERVAL;
4204 tok->u.repeat.lower = low;
4205 tok->u.repeat.upper = up;
4206 *src = p;
4207 return r; /* 0: normal {n,m}, 2: fixed {n} */
4208
4209 invalid:
4210 if (syn_allow) {
4211 /* *src = p; */ /* !!! Don't do this line !!! */
4212 return 1; /* OK */
4213 }
4214 else
4215 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
4216 }
4217
/*
 * Decode the value of an escape sequence: \M-x, \C-x, \cx, or \... .
 *
 * *src must point at the character that follows the escape character.
 * On success the decoded code point is stored in *val, *src is advanced past
 * the sequence and 0 is returned.  On failure an ONIGERR_* code is returned
 * and *src / *val are left untouched.
 *
 * The operand of \M-, \C- and \c may itself be an escape sequence; it is
 * decoded by calling this function recursively.
 */
static int
fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
{
  int v;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;

  if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;

  PFETCH_S(c);
  switch (c) {
  case 'M':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_META_CODE_SYNTAX;
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c == MC_ESC(env->syntax)) {
        /* nested escape as the operand of \M- */
        v = fetch_escaped_value(&p, end, env, &c);
        if (v < 0) return v;
      }
      /* keep the low byte and set its top bit */
      c = ((c & 0xff) | 0x80);
    }
    else
      goto backslash;
    break;

  case 'C':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
      /* \C-x shares the operand handling of \cx */
      goto control;
    }
    else
      goto backslash;

  case 'c':
    if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
    control:
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c == '?') {
        c = 0177;
      }
      else {
        if (c == MC_ESC(env->syntax)) {
          /* nested escape as the operand of the control sequence */
          v = fetch_escaped_value(&p, end, env, &c);
          if (v < 0) return v;
        }
        c &= 0x9f;
      }
      break;
    }
    /* fall through */

  default:
    {
    backslash:
      /* plain \x : translate \n, \t, ... (or keep x itself) */
      c = conv_backslash_value(c, env);
    }
    break;
  }

  *src = p;
  *val = c;
  return 0;
}
4289
/* Forward declaration: fetch_token() is defined further down in this file. */
static int fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env);
4291
4292 static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)4293 get_name_end_code_point(OnigCodePoint start)
4294 {
4295 switch (start) {
4296 case '<': return (OnigCodePoint )'>'; break;
4297 case '\'': return (OnigCodePoint )'\''; break;
4298 case '(': return (OnigCodePoint )')'; break;
4299 default:
4300 break;
4301 }
4302
4303 return (OnigCodePoint )0;
4304 }
4305
/*
 * How a group reference parsed by fetch_name() / fetch_name_with_level()
 * was written.
 */
enum REF_NUM {
  IS_NOT_NUM = 0,  /* a name, not a number */
  IS_ABS_NUM = 1,  /* starts with a digit: absolute group number */
  IS_REL_NUM = 2   /* starts with '+' or '-': relative group number */
};
4311
#ifdef USE_BACKREF_WITH_LEVEL
/*
   Parse a group reference that may carry a nest level:

   \k<name+n>, \k<name-n>
   \k<num+n>,  \k<num-n>
   \k<-num+n>, \k<-num-n>
   \k<+num+n>, \k<+num-n>

   *src must point just after the opening delimiter (start_code).

   On success:
     - *rname_end points at the end of the name / number part,
     - *num_type tells whether a name, an absolute or a relative number
       was read, and *rback_num holds the signed number (0 for a name),
     - *rlevel holds the signed level when one was given,
     - *src is advanced past the closing delimiter,
     - the return value is 1 if a level was given, 0 otherwise.
   On failure an ONIGERR_* code is returned; except for the early
   "empty name" and "too big number" returns, the offending text is also
   recorded with onig_scan_env_set_error_string().
*/
static int
fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
                      UChar** rname_end, ScanEnv* env,
                      int* rback_num, int* rlevel, enum REF_NUM* num_type)
{
  int r, sign, exist_level;
  int digit_count;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end;
  UChar *pnum_head;
  UChar *p = *src;
  PFETCH_READY;

  *rback_num = 0;
  exist_level = 0;
  *num_type = IS_NOT_NUM;
  sign = 1;
  pnum_head = *src;

  end_code = get_name_end_code_point(start_code);

  digit_count = 0;
  name_end = end;
  r = 0;
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    /* the first character decides between name, absolute and relative number */
    PFETCH(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    if (IS_CODE_DIGIT_ASCII(enc, c)) {
      *num_type = IS_ABS_NUM;
      digit_count++;
    }
    else if (c == '-') {
      *num_type = IS_REL_NUM;
      sign = -1;
      pnum_head = p;  /* digits start after the sign */
    }
    else if (c == '+') {
      *num_type = IS_REL_NUM;
      sign = 1;
      pnum_head = p;  /* digits start after the sign */
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  /* scan the rest of the name / number up to a terminator or a level sign */
  while (!PEND) {
    name_end = p;
    PFETCH(c);
    if (c == end_code || c == ')' || c == '+' || c == '-') {
      if (*num_type != IS_NOT_NUM && digit_count == 0)
        r = ONIGERR_INVALID_GROUP_NAME;
      break;
    }

    if (*num_type != IS_NOT_NUM) {
      if (IS_CODE_DIGIT_ASCII(enc, c)) {
        digit_count++;
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
        *num_type = IS_NOT_NUM;
      }
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (r == 0 && c != end_code) {
    if (c == '+' || c == '-') {
      /* optional level part: sign followed by digits */
      int level;
      int flag = (c == '-' ? -1 : 1);

      if (PEND) {
        r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
        goto end;
      }
      PFETCH(c);
      if (! IS_CODE_DIGIT_ASCII(enc, c)) goto err;
      PUNFETCH;
      level = scan_number(&p, end, enc);
      if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
      *rlevel = (level * flag);
      exist_level = 1;

      if (!PEND) {
        PFETCH(c);
        if (c == end_code)
          goto end;
      }
    }

    /* reached when the closing delimiter is missing or misplaced;
       err2 is also the target of a jump from the "end" section below */
  err:
    name_end = end;
  err2:
    r = ONIGERR_INVALID_GROUP_NAME;
  }

 end:
  if (r == 0) {
    if (*num_type != IS_NOT_NUM) {
      *rback_num = scan_number(&pnum_head, name_end, enc);
      if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
      else if (*rback_num == 0) {
        /* a relative reference of zero is rejected */
        if (*num_type == IS_REL_NUM)
          goto err2;
      }

      *rback_num *= sign;
    }

    *rname_end = name_end;
    *src = p;
    return (exist_level ? 1 : 0);
  }
  else {
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
#endif /* USE_BACKREF_WITH_LEVEL */
4448
/*
  Parse a group name or group number terminated by the delimiter that
  matches start_code.  *src must point just after the opening delimiter.

  is_ref: 0 -> define name    (don't allow number name)
          1 -> reference name (allow number name)

  On success returns 0, stores the end of the name in *rname_end, the kind
  of reference in *num_type, the signed number in *rback_num (0 for a name)
  and advances *src past the closing delimiter.
  On failure returns an ONIGERR_* code; except for the early "empty name"
  and "too big number" returns, the offending text is also recorded with
  onig_scan_env_set_error_string().
*/
static int
fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
           UChar** rname_end, ScanEnv* env, int* rback_num,
           enum REF_NUM* num_type, int is_ref)
{
  int r, sign;
  int digit_count;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end;
  UChar *pnum_head;
  UChar *p = *src;

  *rback_num = 0;

  end_code = get_name_end_code_point(start_code);

  digit_count = 0;
  name_end = end;
  pnum_head = *src;
  r = 0;
  *num_type = IS_NOT_NUM;
  sign = 1;
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    /* the first character decides between name, absolute and relative number */
    PFETCH_S(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    if (IS_CODE_DIGIT_ASCII(enc, c)) {
      if (is_ref == TRUE)
        *num_type = IS_ABS_NUM;
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
      }
      digit_count++;
    }
    else if (c == '-') {
      if (is_ref == TRUE) {
        *num_type = IS_REL_NUM;
        sign = -1;
        pnum_head = p;  /* digits start after the sign */
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
      }
    }
    else if (c == '+') {
      if (is_ref == TRUE) {
        *num_type = IS_REL_NUM;
        sign = 1;
        pnum_head = p;  /* digits start after the sign */
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
      }
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (r == 0) {
    /* scan the rest of the name / number up to the terminator */
    while (!PEND) {
      name_end = p;
      PFETCH_S(c);
      if (c == end_code || c == ')') {
        if (*num_type != IS_NOT_NUM && digit_count == 0)
          r = ONIGERR_INVALID_GROUP_NAME;
        break;
      }

      if (*num_type != IS_NOT_NUM) {
        if (IS_CODE_DIGIT_ASCII(enc, c)) {
          digit_count++;
        }
        else {
          if (!ONIGENC_IS_CODE_WORD(enc, c))
            r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
          else
            r = ONIGERR_INVALID_GROUP_NAME;

          *num_type = IS_NOT_NUM;
        }
      }
      else {
        if (!ONIGENC_IS_CODE_WORD(enc, c)) {
          r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
        }
      }
    }

    /* the last character read must be the closing delimiter; this check
       replaces any error code recorded inside the loop above */
    if (c != end_code) {
      r = ONIGERR_INVALID_GROUP_NAME;
      goto err;
    }

    if (*num_type != IS_NOT_NUM) {
      *rback_num = scan_number(&pnum_head, name_end, enc);
      if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
      else if (*rback_num == 0) {
        /* a relative reference of zero is rejected */
        if (*num_type == IS_REL_NUM) {
          r = ONIGERR_INVALID_GROUP_NAME;
          goto err;
        }
      }

      *rback_num *= sign;
    }

    *rname_end = name_end;
    *src = p;
    return 0;
  }
  else {
    /* the first character was already invalid: skip ahead to a terminator
       only to delimit the text reported in the error message */
    while (!PEND) {
      name_end = p;
      PFETCH_S(c);
      if (c == end_code || c == ')')
        break;
    }
    if (PEND)
      name_end = end;

  err:
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
4585
4586 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)4587 CC_ESC_WARN(ScanEnv* env, UChar *c)
4588 {
4589 if (onig_warn == onig_null_warn) return ;
4590
4591 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
4592 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
4593 UChar buf[WARN_BUFSIZE];
4594 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4595 env->pattern, env->pattern_end,
4596 (UChar* )"character class has '%s' without escape",
4597 c);
4598 (*onig_warn)((char* )buf);
4599 }
4600 }
4601
4602 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)4603 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
4604 {
4605 if (onig_warn == onig_null_warn) return ;
4606
4607 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
4608 UChar buf[WARN_BUFSIZE];
4609 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
4610 (env)->pattern, (env)->pattern_end,
4611 (UChar* )"regular expression has '%s' without escape", c);
4612 (*onig_warn)((char* )buf);
4613 }
4614 }
4615
4616 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)4617 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
4618 UChar **next, OnigEncoding enc)
4619 {
4620 int i;
4621 OnigCodePoint x;
4622 UChar *q;
4623 UChar *p = from;
4624
4625 while (p < to) {
4626 x = ONIGENC_MBC_TO_CODE(enc, p, to);
4627 q = p + enclen(enc, p);
4628 if (x == s[0]) {
4629 for (i = 1; i < n && q < to; i++) {
4630 x = ONIGENC_MBC_TO_CODE(enc, q, to);
4631 if (x != s[i]) break;
4632 q += enclen(enc, q);
4633 }
4634 if (i >= n) {
4635 if (IS_NOT_NULL(next))
4636 *next = q;
4637 return p;
4638 }
4639 }
4640 p = q;
4641 }
4642 return NULL_UCHARP;
4643 }
4644
4645 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)4646 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
4647 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
4648 {
4649 int i, in_esc;
4650 OnigCodePoint x;
4651 UChar *q;
4652 UChar *p = from;
4653
4654 in_esc = 0;
4655 while (p < to) {
4656 if (in_esc) {
4657 in_esc = 0;
4658 p += enclen(enc, p);
4659 }
4660 else {
4661 x = ONIGENC_MBC_TO_CODE(enc, p, to);
4662 q = p + enclen(enc, p);
4663 if (x == s[0]) {
4664 for (i = 1; i < n && q < to; i++) {
4665 x = ONIGENC_MBC_TO_CODE(enc, q, to);
4666 if (x != s[i]) break;
4667 q += enclen(enc, q);
4668 }
4669 if (i >= n) return 1;
4670 p += enclen(enc, p);
4671 }
4672 else {
4673 x = ONIGENC_MBC_TO_CODE(enc, p, to);
4674 if (x == bad) return 0;
4675 else if (x == MC_ESC(syn)) in_esc = 1;
4676 p = q;
4677 }
4678 }
4679 }
4680 return 0;
4681 }
4682
/*
 * Read the next token inside a character class ([...]).
 *
 * On success fills *tok, advances *src past the token and returns the token
 * type (tok->type); TK_EOT is returned at the end of the pattern (with *src
 * unchanged).  On failure an ONIGERR_* code is returned.
 */
static int
fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
{
  int r;
  OnigCodePoint code;
  OnigCodePoint c, c2;
  OnigSyntaxType* syn = env->syntax;
  OnigEncoding enc = env->enc;
  UChar* prev;
  UChar* p = *src;
  PFETCH_READY;

  if (PEND) {
    tok->type = TK_EOT;
    return tok->type;
  }

  /* default: a plain character token holding the code point just read */
  PFETCH(c);
  tok->type = TK_CHAR;
  tok->base = 0;
  tok->u.code = c;
  tok->escaped = 0;

  if (c == ']') {
    tok->type = TK_CC_CLOSE;
  }
  else if (c == '-') {
    tok->type = TK_CC_RANGE;
  }
  else if (c == MC_ESC(syn)) {
    /* without this syntax flag the escape character is an ordinary char */
    if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
      goto end;

    if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;

    PFETCH(c);
    tok->escaped = 1;
    tok->u.code = c;
    switch (c) {
    case 'w':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
      tok->u.prop.not = 0;
      break;
    case 'W':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
      tok->u.prop.not = 1;
      break;
    case 'd':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
      tok->u.prop.not = 0;
      break;
    case 'D':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
      tok->u.prop.not = 1;
      break;
    case 's':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
      tok->u.prop.not = 0;
      break;
    case 'S':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
      tok->u.prop.not = 1;
      break;
    case 'h':
      if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
      tok->u.prop.not = 0;
      break;
    case 'H':
      if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
      tok->u.prop.not = 1;
      break;

    case 'p':
    case 'P':
      if (PEND) break;

      c2 = PPEEK;
      if (c2 == '{' &&
          IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
        PINC;
        tok->type = TK_CHAR_PROPERTY;
        tok->u.prop.not = c == 'P';

        /* optional '^' right after the brace toggles the negation */
        if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
          PFETCH(c2);
          if (c2 == '^') {
            tok->u.prop.not = tok->u.prop.not == 0;
          }
          else
            PUNFETCH;
        }
      }
      break;

    case 'o':
      if (PEND) break;

      prev = p;
      if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
        /* \o{...} : octal code point, at most 11 digits */
        PINC;
        r = scan_octal_number(&p, end, 0, 11, enc, &code);
        if (r < 0) return r;
        if (!PEND) {
          c2 = PPEEK;
          if (IS_CODE_DIGIT_ASCII(enc, c2))
            return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
        }

        /* accept only if at least one digit was read and '}' follows */
        if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
          PINC;
          tok->type   = TK_CODE_POINT;
          tok->base   = 8;
          tok->u.code = code;
        }
        else {
          /* can't read nothing or invalid format */
          p = prev;
        }
      }
      break;

    case 'x':
      if (PEND) break;

      prev = p;
      if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
        /* \x{...} : hexadecimal code point, at most 8 digits */
        PINC;
        r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code);
        if (r < 0) return r;
        if (!PEND) {
          c2 = PPEEK;
          if (IS_CODE_XDIGIT_ASCII(enc, c2))
            return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
        }

        /* accept only if at least one digit was read and '}' follows */
        if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
          PINC;
          tok->type   = TK_CODE_POINT;
          tok->base   = 16;
          tok->u.code = code;
        }
        else {
          /* can't read nothing or invalid format */
          p = prev;
        }
      }
      else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
        /* \xHH : one crude byte, at most 2 hex digits */
        r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code);
        if (r < 0) return r;
        if (p == prev) {  /* can't read nothing. */
          code = 0; /* but, it's not error */
        }
        tok->type = TK_CRUDE_BYTE;
        tok->base = 16;
        tok->u.byte = (UChar )code;
      }
      break;

    case 'u':
      if (PEND) break;

      prev = p;
      if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
        /* \uHHHH : scanned with both numeric arguments set to 4 */
        r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code);
        if (r < 0) return r;
        if (p == prev) {  /* can't read nothing. */
          code = 0; /* but, it's not error */
        }
        tok->type   = TK_CODE_POINT;
        tok->base   = 16;
        tok->u.code = code;
      }
      break;

    case '0':
    case '1': case '2': case '3': case '4': case '5': case '6': case '7':
      if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
        /* \NNN : one crude byte, at most 3 octal digits; step back so the
           digit already fetched is scanned again */
        PUNFETCH;
        prev = p;
        r = scan_octal_number(&p, end, 0, 3, enc, &code);
        if (r < 0) return r;
        if (code >= 256) return ONIGERR_TOO_BIG_NUMBER;
        if (p == prev) {  /* can't read nothing. */
          code = 0; /* but, it's not error */
        }
        tok->type = TK_CRUDE_BYTE;
        tok->base = 8;
        tok->u.byte = (UChar )code;
      }
      break;

    default:
      /* any other escape: step back and let fetch_escaped_value() decode
         it; the token becomes a code point only if the value changed */
      PUNFETCH;
      r = fetch_escaped_value(&p, end, env, &c2);
      if (r < 0) return r;
      if (tok->u.code != c2) {
        tok->u.code = c2;
        tok->type   = TK_CODE_POINT;
      }
      break;
    }
  }
  else if (c == '[') {
    if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
      /* candidate POSIX bracket: look ahead for ":]" before the next ']' */
      OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
      tok->backp = p; /* point at '[' is read */
      PINC;
      if (str_exist_check_with_esc(send, 2, p, end,
                                   (OnigCodePoint )']', enc, syn)) {
        tok->type = TK_CC_POSIX_BRACKET_OPEN;
      }
      else {
        PUNFETCH;
        goto cc_in_cc;
      }
    }
    else {
    cc_in_cc:
      if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
        tok->type = TK_CC_OPEN_CC;
      }
      else {
        /* '[' stays a literal character here; warn that it is unescaped */
        CC_ESC_WARN(env, (UChar* )"[");
      }
    }
  }
  else if (c == '&') {
    if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
        !PEND && (PPEEK_IS('&'))) {
      PINC;
      tok->type = TK_CC_AND;
    }
  }

 end:
  *src = p;
  return tok->type;
}
4931
4932 static int
fetch_token(PToken * tok,UChar ** src,UChar * end,ScanEnv * env)4933 fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
4934 {
4935 int r;
4936 OnigCodePoint code;
4937 OnigCodePoint c;
4938 OnigEncoding enc = env->enc;
4939 OnigSyntaxType* syn = env->syntax;
4940 UChar* prev;
4941 UChar* p = *src;
4942 PFETCH_READY;
4943
4944 start:
4945 if (PEND) {
4946 tok->type = TK_EOT;
4947 return tok->type;
4948 }
4949
4950 tok->type = TK_STRING;
4951 tok->base = 0;
4952 tok->backp = p;
4953
4954 PFETCH(c);
4955 if (IS_MC_ESC_CODE(c, syn)) {
4956 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
4957
4958 tok->backp = p;
4959 PFETCH(c);
4960
4961 tok->u.code = c;
4962 tok->escaped = 1;
4963 switch (c) {
4964 case '*':
4965 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
4966 tok->type = TK_REPEAT;
4967 tok->u.repeat.lower = 0;
4968 tok->u.repeat.upper = INFINITE_REPEAT;
4969 goto greedy_check;
4970 break;
4971
4972 case '+':
4973 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
4974 tok->type = TK_REPEAT;
4975 tok->u.repeat.lower = 1;
4976 tok->u.repeat.upper = INFINITE_REPEAT;
4977 goto greedy_check;
4978 break;
4979
4980 case '?':
4981 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
4982 tok->type = TK_REPEAT;
4983 tok->u.repeat.lower = 0;
4984 tok->u.repeat.upper = 1;
4985 greedy_check:
4986 tok->u.repeat.possessive = 0;
4987 greedy_check2:
4988 if (!PEND && PPEEK_IS('?') &&
4989 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY) &&
4990 tok->u.repeat.possessive == 0) {
4991 PFETCH(c);
4992 tok->u.repeat.greedy = 0;
4993 tok->u.repeat.possessive = 0;
4994 }
4995 else {
4996 possessive_check:
4997 tok->u.repeat.greedy = 1;
4998 if (!PEND && PPEEK_IS('+') &&
4999 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
5000 tok->type != TK_INTERVAL) ||
5001 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
5002 tok->type == TK_INTERVAL)) &&
5003 tok->u.repeat.possessive == 0) {
5004 PFETCH(c);
5005 tok->u.repeat.possessive = 1;
5006 }
5007 }
5008 break;
5009
5010 case '{':
5011 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
5012 r = fetch_interval(&p, end, tok, env);
5013 if (r < 0) return r; /* error */
5014 if (r == 0) goto greedy_check2;
5015 else if (r == 2) { /* {n} */
5016 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
5017 goto possessive_check;
5018
5019 goto greedy_check2;
5020 }
5021 /* r == 1 : normal char */
5022 break;
5023
5024 case '|':
5025 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
5026 tok->type = TK_ALT;
5027 break;
5028
5029 case '(':
5030 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
5031 tok->type = TK_SUBEXP_OPEN;
5032 break;
5033
5034 case ')':
5035 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
5036 tok->type = TK_SUBEXP_CLOSE;
5037 break;
5038
5039 case 'w':
5040 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
5041 tok->type = TK_CHAR_TYPE;
5042 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
5043 tok->u.prop.not = 0;
5044 break;
5045
5046 case 'W':
5047 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
5048 tok->type = TK_CHAR_TYPE;
5049 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
5050 tok->u.prop.not = 1;
5051 break;
5052
5053 case 'b':
5054 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
5055 tok->type = TK_ANCHOR;
5056 tok->u.anchor = ANCR_WORD_BOUNDARY;
5057 break;
5058
5059 case 'B':
5060 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
5061 tok->type = TK_ANCHOR;
5062 tok->u.anchor = ANCR_NO_WORD_BOUNDARY;
5063 break;
5064
5065 case 'y':
5066 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break;
5067 tok->type = TK_ANCHOR;
5068 tok->u.anchor = ANCR_TEXT_SEGMENT_BOUNDARY;
5069 break;
5070
5071 case 'Y':
5072 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break;
5073 tok->type = TK_ANCHOR;
5074 tok->u.anchor = ANCR_NO_TEXT_SEGMENT_BOUNDARY;
5075 break;
5076
5077 #ifdef USE_WORD_BEGIN_END
5078 case '<':
5079 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
5080 tok->type = TK_ANCHOR;
5081 tok->u.anchor = ANCR_WORD_BEGIN;
5082 break;
5083
5084 case '>':
5085 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
5086 tok->type = TK_ANCHOR;
5087 tok->u.anchor = ANCR_WORD_END;
5088 break;
5089 #endif
5090
5091 case 's':
5092 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
5093 tok->type = TK_CHAR_TYPE;
5094 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
5095 tok->u.prop.not = 0;
5096 break;
5097
5098 case 'S':
5099 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
5100 tok->type = TK_CHAR_TYPE;
5101 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
5102 tok->u.prop.not = 1;
5103 break;
5104
5105 case 'd':
5106 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
5107 tok->type = TK_CHAR_TYPE;
5108 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
5109 tok->u.prop.not = 0;
5110 break;
5111
5112 case 'D':
5113 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
5114 tok->type = TK_CHAR_TYPE;
5115 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
5116 tok->u.prop.not = 1;
5117 break;
5118
5119 case 'h':
5120 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
5121 tok->type = TK_CHAR_TYPE;
5122 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
5123 tok->u.prop.not = 0;
5124 break;
5125
5126 case 'H':
5127 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
5128 tok->type = TK_CHAR_TYPE;
5129 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
5130 tok->u.prop.not = 1;
5131 break;
5132
5133 case 'K':
5134 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) break;
5135 tok->type = TK_KEEP;
5136 break;
5137
5138 case 'R':
5139 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE)) break;
5140 tok->type = TK_GENERAL_NEWLINE;
5141 break;
5142
5143 case 'N':
5144 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT)) break;
5145 tok->type = TK_NO_NEWLINE;
5146 break;
5147
5148 case 'O':
5149 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT)) break;
5150 tok->type = TK_TRUE_ANYCHAR;
5151 break;
5152
5153 case 'X':
5154 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break;
5155 tok->type = TK_TEXT_SEGMENT;
5156 break;
5157
5158 case 'A':
5159 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
5160 begin_buf:
5161 tok->type = TK_ANCHOR;
5162 tok->u.subtype = ANCR_BEGIN_BUF;
5163 break;
5164
5165 case 'Z':
5166 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
5167 tok->type = TK_ANCHOR;
5168 tok->u.subtype = ANCR_SEMI_END_BUF;
5169 break;
5170
5171 case 'z':
5172 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
5173 end_buf:
5174 tok->type = TK_ANCHOR;
5175 tok->u.subtype = ANCR_END_BUF;
5176 break;
5177
5178 case 'G':
5179 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
5180 tok->type = TK_ANCHOR;
5181 tok->u.subtype = ANCR_BEGIN_POSITION;
5182 break;
5183
5184 case '`':
5185 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
5186 goto begin_buf;
5187 break;
5188
5189 case '\'':
5190 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
5191 goto end_buf;
5192 break;
5193
5194 case 'o':
5195 if (PEND) break;
5196
5197 prev = p;
5198 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
5199 PINC;
5200 r = scan_octal_number(&p, end, 0, 11, enc, &code);
5201 if (r < 0) return r;
5202 if (!PEND) {
5203 if (IS_CODE_DIGIT_ASCII(enc, PPEEK))
5204 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
5205 }
5206
5207 if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
5208 PINC;
5209 tok->type = TK_CODE_POINT;
5210 tok->u.code = code;
5211 }
5212 else {
5213 /* can't read nothing or invalid format */
5214 p = prev;
5215 }
5216 }
5217 break;
5218
5219 case 'x':
5220 if (PEND) break;
5221
5222 prev = p;
5223 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
5224 PINC;
5225 r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code);
5226 if (r < 0) return r;
5227 if (!PEND) {
5228 if (IS_CODE_XDIGIT_ASCII(enc, PPEEK))
5229 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
5230 }
5231
5232 if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
5233 PINC;
5234 tok->type = TK_CODE_POINT;
5235 tok->u.code = code;
5236 }
5237 else {
5238 /* can't read nothing or invalid format */
5239 p = prev;
5240 }
5241 }
5242 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
5243 r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code);
5244 if (r < 0) return r;
5245 if (p == prev) { /* can't read nothing. */
5246 code = 0; /* but, it's not error */
5247 }
5248 tok->type = TK_CRUDE_BYTE;
5249 tok->base = 16;
5250 tok->u.byte = (UChar )code;
5251 }
5252 break;
5253
5254 case 'u':
5255 if (PEND) break;
5256
5257 prev = p;
5258 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
5259 r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code);
5260 if (r < 0) return r;
5261 if (p == prev) { /* can't read nothing. */
5262 code = 0; /* but, it's not error */
5263 }
5264 tok->type = TK_CODE_POINT;
5265 tok->base = 16;
5266 tok->u.code = code;
5267 }
5268 break;
5269
5270 case '1': case '2': case '3': case '4':
5271 case '5': case '6': case '7': case '8': case '9':
5272 PUNFETCH;
5273 prev = p;
5274 r = scan_number(&p, end, enc);
5275 if (r < 0 || r > ONIG_MAX_BACKREF_NUM) {
5276 goto skip_backref;
5277 }
5278
5279 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
5280 (r <= env->num_mem || r <= 9)) { /* This spec. from GNU regex */
5281 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5282 if (r > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[r].mem_node))
5283 return ONIGERR_INVALID_BACKREF;
5284 }
5285
5286 tok->type = TK_BACKREF;
5287 tok->u.backref.num = 1;
5288 tok->u.backref.ref1 = r;
5289 tok->u.backref.by_name = 0;
5290 #ifdef USE_BACKREF_WITH_LEVEL
5291 tok->u.backref.exist_level = 0;
5292 #endif
5293 break;
5294 }
5295
5296 skip_backref:
5297 if (c == '8' || c == '9') {
5298 /* normal char */
5299 p = prev; PINC;
5300 break;
5301 }
5302
5303 p = prev;
5304 /* fall through */
5305 case '0':
5306 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
5307 prev = p;
5308 r = scan_octal_number(&p, end, 0, (c == '0' ? 2:3), enc, &code);
5309 if (r < 0 || r >= 256) return ONIGERR_TOO_BIG_NUMBER;
5310 if (p == prev) { /* can't read nothing. */
5311 code = 0; /* but, it's not error */
5312 }
5313 tok->type = TK_CRUDE_BYTE;
5314 tok->base = 8;
5315 tok->u.byte = (UChar )code;
5316 }
5317 else if (c != '0') {
5318 PINC;
5319 }
5320 break;
5321
5322 case 'k':
5323 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
5324 PFETCH(c);
5325 if (c == '<' || c == '\'') {
5326 UChar* name_end;
5327 int* backs;
5328 int back_num;
5329 enum REF_NUM num_type;
5330
5331 prev = p;
5332
5333 #ifdef USE_BACKREF_WITH_LEVEL
5334 name_end = NULL_UCHARP; /* no need. escape gcc warning. */
5335 r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
5336 env, &back_num, &tok->u.backref.level, &num_type);
5337 if (r == 1) tok->u.backref.exist_level = 1;
5338 else tok->u.backref.exist_level = 0;
5339 #else
5340 r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, TRUE);
5341 #endif
5342 if (r < 0) return r;
5343
5344 if (num_type != IS_NOT_NUM) {
5345 if (num_type == IS_REL_NUM) {
5346 back_num = backref_rel_to_abs(back_num, env);
5347 }
5348 if (back_num <= 0)
5349 return ONIGERR_INVALID_BACKREF;
5350
5351 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5352 if (back_num > env->num_mem ||
5353 IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node))
5354 return ONIGERR_INVALID_BACKREF;
5355 }
5356 tok->type = TK_BACKREF;
5357 tok->u.backref.by_name = 0;
5358 tok->u.backref.num = 1;
5359 tok->u.backref.ref1 = back_num;
5360 }
5361 else {
5362 int num = name_to_group_numbers(env, prev, name_end, &backs);
5363 if (num <= 0) {
5364 return ONIGERR_UNDEFINED_NAME_REFERENCE;
5365 }
5366 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5367 int i;
5368 for (i = 0; i < num; i++) {
5369 if (backs[i] > env->num_mem ||
5370 IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node))
5371 return ONIGERR_INVALID_BACKREF;
5372 }
5373 }
5374
5375 tok->type = TK_BACKREF;
5376 tok->u.backref.by_name = 1;
5377 if (num == 1) {
5378 tok->u.backref.num = 1;
5379 tok->u.backref.ref1 = backs[0];
5380 }
5381 else {
5382 tok->u.backref.num = num;
5383 tok->u.backref.refs = backs;
5384 }
5385 }
5386 }
5387 else
5388 PUNFETCH;
5389 }
5390 break;
5391
5392 #ifdef USE_CALL
5393 case 'g':
5394 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
5395 PFETCH(c);
5396 if (c == '<' || c == '\'') {
5397 int gnum;
5398 UChar* name_end;
5399 enum REF_NUM num_type;
5400
5401 prev = p;
5402 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env,
5403 &gnum, &num_type, TRUE);
5404 if (r < 0) return r;
5405
5406 if (num_type != IS_NOT_NUM) {
5407 if (num_type == IS_REL_NUM) {
5408 gnum = backref_rel_to_abs(gnum, env);
5409 if (gnum < 0) {
5410 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
5411 prev, name_end);
5412 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
5413 }
5414 }
5415 tok->u.call.by_number = 1;
5416 tok->u.call.gnum = gnum;
5417 }
5418 else {
5419 tok->u.call.by_number = 0;
5420 tok->u.call.gnum = 0;
5421 }
5422
5423 tok->type = TK_CALL;
5424 tok->u.call.name = prev;
5425 tok->u.call.name_end = name_end;
5426 }
5427 else
5428 PUNFETCH;
5429 }
5430 break;
5431 #endif
5432
5433 case 'Q':
5434 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
5435 tok->type = TK_QUOTE_OPEN;
5436 }
5437 break;
5438
5439 case 'p':
5440 case 'P':
5441 if (!PEND && PPEEK_IS('{') &&
5442 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
5443 PINC;
5444 tok->type = TK_CHAR_PROPERTY;
5445 tok->u.prop.not = c == 'P';
5446
5447 if (!PEND &&
5448 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
5449 PFETCH(c);
5450 if (c == '^') {
5451 tok->u.prop.not = tok->u.prop.not == 0;
5452 }
5453 else
5454 PUNFETCH;
5455 }
5456 }
5457 break;
5458
5459 default:
5460 {
5461 OnigCodePoint c2;
5462
5463 PUNFETCH;
5464 r = fetch_escaped_value(&p, end, env, &c2);
5465 if (r < 0) return r;
5466 if (tok->u.code != c2) {
5467 tok->type = TK_CODE_POINT;
5468 tok->u.code = c2;
5469 }
5470 else { /* string */
5471 p = tok->backp + enclen(enc, tok->backp);
5472 }
5473 }
5474 break;
5475 }
5476 }
5477 else {
5478 tok->u.code = c;
5479 tok->escaped = 0;
5480
5481 #ifdef USE_VARIABLE_META_CHARS
5482 if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
5483 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
5484 if (c == MC_ANYCHAR(syn))
5485 goto any_char;
5486 else if (c == MC_ANYTIME(syn))
5487 goto anytime;
5488 else if (c == MC_ZERO_OR_ONE_TIME(syn))
5489 goto zero_or_one_time;
5490 else if (c == MC_ONE_OR_MORE_TIME(syn))
5491 goto one_or_more_time;
5492 else if (c == MC_ANYCHAR_ANYTIME(syn)) {
5493 tok->type = TK_ANYCHAR_ANYTIME;
5494 goto out;
5495 }
5496 }
5497 #endif
5498
5499 switch (c) {
5500 case '.':
5501 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
5502 #ifdef USE_VARIABLE_META_CHARS
5503 any_char:
5504 #endif
5505 tok->type = TK_ANYCHAR;
5506 break;
5507
5508 case '*':
5509 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
5510 #ifdef USE_VARIABLE_META_CHARS
5511 anytime:
5512 #endif
5513 tok->type = TK_REPEAT;
5514 tok->u.repeat.lower = 0;
5515 tok->u.repeat.upper = INFINITE_REPEAT;
5516 goto greedy_check;
5517 break;
5518
5519 case '+':
5520 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
5521 #ifdef USE_VARIABLE_META_CHARS
5522 one_or_more_time:
5523 #endif
5524 tok->type = TK_REPEAT;
5525 tok->u.repeat.lower = 1;
5526 tok->u.repeat.upper = INFINITE_REPEAT;
5527 goto greedy_check;
5528 break;
5529
5530 case '?':
5531 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
5532 #ifdef USE_VARIABLE_META_CHARS
5533 zero_or_one_time:
5534 #endif
5535 tok->type = TK_REPEAT;
5536 tok->u.repeat.lower = 0;
5537 tok->u.repeat.upper = 1;
5538 goto greedy_check;
5539 break;
5540
5541 case '{':
5542 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
5543 r = fetch_interval(&p, end, tok, env);
5544 if (r < 0) return r; /* error */
5545 if (r == 0) goto greedy_check2;
5546 else if (r == 2) { /* {n} */
5547 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
5548 goto possessive_check;
5549
5550 goto greedy_check2;
5551 }
5552 /* r == 1 : normal char */
5553 break;
5554
5555 case '|':
5556 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
5557 tok->type = TK_ALT;
5558 break;
5559
5560 case '(':
5561 if (!PEND && PPEEK_IS('?') &&
5562 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
5563 PINC;
5564 if (! PEND) {
5565 c = PPEEK;
5566 if (c == '#') {
5567 PFETCH(c);
5568 while (1) {
5569 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
5570 PFETCH(c);
5571 if (c == MC_ESC(syn)) {
5572 if (! PEND) PFETCH(c);
5573 }
5574 else {
5575 if (c == ')') break;
5576 }
5577 }
5578 goto start;
5579 }
5580 else if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_PERL_SUBEXP_CALL)) {
5581 int gnum;
5582 UChar* name;
5583 UChar* name_end;
5584 enum REF_NUM num_type;
5585
5586 switch (c) {
5587 case '&':
5588 {
5589 PINC;
5590 name = p;
5591 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env,
5592 &gnum, &num_type, FALSE);
5593 if (r < 0) return r;
5594
5595 tok->type = TK_CALL;
5596 tok->u.call.by_number = 0;
5597 tok->u.call.gnum = 0;
5598 tok->u.call.name = name;
5599 tok->u.call.name_end = name_end;
5600 }
5601 break;
5602
5603 case 'R':
5604 tok->type = TK_CALL;
5605 tok->u.call.by_number = 1;
5606 tok->u.call.gnum = 0;
5607 tok->u.call.name = p;
5608 PINC;
5609 if (! PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION;
5610 tok->u.call.name_end = p;
5611 break;
5612
5613 case '-':
5614 case '+':
5615 goto lparen_qmark_num;
5616 break;
5617 default:
5618 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto lparen_qmark_end;
5619
5620 lparen_qmark_num:
5621 {
5622 name = p;
5623 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env,
5624 &gnum, &num_type, TRUE);
5625 if (r < 0) return r;
5626
5627 if (num_type == IS_NOT_NUM) {
5628 return ONIGERR_INVALID_GROUP_NAME;
5629 }
5630 else {
5631 if (num_type == IS_REL_NUM) {
5632 gnum = backref_rel_to_abs(gnum, env);
5633 if (gnum < 0) {
5634 onig_scan_env_set_error_string(env,
5635 ONIGERR_UNDEFINED_NAME_REFERENCE, name, name_end);
5636 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
5637 }
5638 }
5639 tok->u.call.by_number = 1;
5640 tok->u.call.gnum = gnum;
5641 }
5642
5643 tok->type = TK_CALL;
5644 tok->u.call.name = name;
5645 tok->u.call.name_end = name_end;
5646 }
5647 break;
5648 }
5649 }
5650 }
5651 lparen_qmark_end:
5652 PUNFETCH;
5653 }
5654
5655 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
5656 tok->type = TK_SUBEXP_OPEN;
5657 break;
5658
5659 case ')':
5660 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
5661 tok->type = TK_SUBEXP_CLOSE;
5662 break;
5663
5664 case '^':
5665 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
5666 tok->type = TK_ANCHOR;
5667 tok->u.subtype = (IS_SINGLELINE(env->options)
5668 ? ANCR_BEGIN_BUF : ANCR_BEGIN_LINE);
5669 break;
5670
5671 case '$':
5672 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
5673 tok->type = TK_ANCHOR;
5674 tok->u.subtype = (IS_SINGLELINE(env->options)
5675 ? ANCR_SEMI_END_BUF : ANCR_END_LINE);
5676 break;
5677
5678 case '[':
5679 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
5680 tok->type = TK_OPEN_CC;
5681 break;
5682
5683 case ']':
5684 if (*src > env->pattern) /* /].../ is allowed. */
5685 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
5686 break;
5687
5688 case '#':
5689 if (IS_EXTEND(env->options)) {
5690 while (!PEND) {
5691 PFETCH(c);
5692 if (ONIGENC_IS_CODE_NEWLINE(enc, c))
5693 break;
5694 }
5695 goto start;
5696 break;
5697 }
5698 break;
5699
5700 case ' ': case '\t': case '\n': case '\r': case '\f':
5701 if (IS_EXTEND(env->options))
5702 goto start;
5703 break;
5704
5705 default:
5706 /* string */
5707 break;
5708 }
5709 }
5710
5711 #ifdef USE_VARIABLE_META_CHARS
5712 out:
5713 #endif
5714 *src = p;
5715 return tok->type;
5716 }
5717
5718 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[])5719 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
5720 OnigEncoding enc ARG_UNUSED, OnigCodePoint sb_out,
5721 const OnigCodePoint mbr[])
5722 {
5723 int i, r;
5724 OnigCodePoint j;
5725
5726 int n = ONIGENC_CODE_RANGE_NUM(mbr);
5727
5728 if (not == 0) {
5729 for (i = 0; i < n; i++) {
5730 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
5731 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
5732 if (j >= sb_out) {
5733 if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
5734 r = add_code_range_to_buf(&(cc->mbuf), j,
5735 ONIGENC_CODE_RANGE_TO(mbr, i));
5736 if (r != 0) return r;
5737 i++;
5738 }
5739
5740 goto sb_end;
5741 }
5742 BITSET_SET_BIT(cc->bs, j);
5743 }
5744 }
5745
5746 sb_end:
5747 for ( ; i < n; i++) {
5748 r = add_code_range_to_buf(&(cc->mbuf),
5749 ONIGENC_CODE_RANGE_FROM(mbr, i),
5750 ONIGENC_CODE_RANGE_TO(mbr, i));
5751 if (r != 0) return r;
5752 }
5753 }
5754 else {
5755 OnigCodePoint prev = 0;
5756
5757 for (i = 0; i < n; i++) {
5758 for (j = prev; j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
5759 if (j >= sb_out) {
5760 goto sb_end2;
5761 }
5762 BITSET_SET_BIT(cc->bs, j);
5763 }
5764 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
5765 }
5766 for (j = prev; j < sb_out; j++) {
5767 BITSET_SET_BIT(cc->bs, j);
5768 }
5769
5770 sb_end2:
5771 prev = sb_out;
5772
5773 for (i = 0; i < n; i++) {
5774 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
5775 r = add_code_range_to_buf(&(cc->mbuf), prev,
5776 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
5777 if (r != 0) return r;
5778 }
5779 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
5780 if (prev == 0) goto end;
5781 }
5782
5783 r = add_code_range_to_buf(&(cc->mbuf), prev, MAX_CODE_POINT);
5784 if (r != 0) return r;
5785 }
5786
5787 end:
5788 return 0;
5789 }
5790
5791 static int
add_ctype_to_cc_by_range_limit(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[],OnigCodePoint limit)5792 add_ctype_to_cc_by_range_limit(CClassNode* cc, int ctype ARG_UNUSED, int not,
5793 OnigEncoding enc ARG_UNUSED,
5794 OnigCodePoint sb_out,
5795 const OnigCodePoint mbr[], OnigCodePoint limit)
5796 {
5797 int i, r;
5798 OnigCodePoint j;
5799 OnigCodePoint from;
5800 OnigCodePoint to;
5801
5802 int n = ONIGENC_CODE_RANGE_NUM(mbr);
5803
5804 if (not == 0) {
5805 for (i = 0; i < n; i++) {
5806 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
5807 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
5808 if (j > limit) goto end;
5809 if (j >= sb_out) {
5810 if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
5811 to = ONIGENC_CODE_RANGE_TO(mbr, i);
5812 if (to > limit) to = limit;
5813 r = add_code_range_to_buf(&(cc->mbuf), j, to);
5814 if (r != 0) return r;
5815 i++;
5816 }
5817
5818 goto sb_end;
5819 }
5820 BITSET_SET_BIT(cc->bs, j);
5821 }
5822 }
5823
5824 sb_end:
5825 for ( ; i < n; i++) {
5826 from = ONIGENC_CODE_RANGE_FROM(mbr, i);
5827 to = ONIGENC_CODE_RANGE_TO(mbr, i);
5828 if (from > limit) break;
5829 if (to > limit) to = limit;
5830 r = add_code_range_to_buf(&(cc->mbuf), from, to);
5831 if (r != 0) return r;
5832 }
5833 }
5834 else {
5835 OnigCodePoint prev = 0;
5836
5837 for (i = 0; i < n; i++) {
5838 from = ONIGENC_CODE_RANGE_FROM(mbr, i);
5839 if (from > limit) {
5840 for (j = prev; j < sb_out; j++) {
5841 BITSET_SET_BIT(cc->bs, j);
5842 }
5843 goto sb_end2;
5844 }
5845 for (j = prev; j < from; j++) {
5846 if (j >= sb_out) goto sb_end2;
5847 BITSET_SET_BIT(cc->bs, j);
5848 }
5849 prev = ONIGENC_CODE_RANGE_TO(mbr, i);
5850 if (prev > limit) prev = limit;
5851 prev++;
5852 if (prev == 0) goto end;
5853 }
5854 for (j = prev; j < sb_out; j++) {
5855 BITSET_SET_BIT(cc->bs, j);
5856 }
5857
5858 sb_end2:
5859 prev = sb_out;
5860
5861 for (i = 0; i < n; i++) {
5862 from = ONIGENC_CODE_RANGE_FROM(mbr, i);
5863 if (from > limit) goto last;
5864
5865 if (prev < from) {
5866 r = add_code_range_to_buf(&(cc->mbuf), prev, from - 1);
5867 if (r != 0) return r;
5868 }
5869 prev = ONIGENC_CODE_RANGE_TO(mbr, i);
5870 if (prev > limit) prev = limit;
5871 prev++;
5872 if (prev == 0) goto end;
5873 }
5874
5875 last:
5876 r = add_code_range_to_buf(&(cc->mbuf), prev, MAX_CODE_POINT);
5877 if (r != 0) return r;
5878 }
5879
5880 end:
5881 return 0;
5882 }
5883
5884 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)5885 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
5886 {
5887 #define ASCII_LIMIT 127
5888
5889 int c, r;
5890 int ascii_mode;
5891 int is_single;
5892 const OnigCodePoint *ranges;
5893 OnigCodePoint limit;
5894 OnigCodePoint sb_out;
5895 OnigEncoding enc = env->enc;
5896
5897 ascii_mode = IS_ASCII_MODE_CTYPE_OPTION(ctype, env->options);
5898
5899 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
5900 if (r == 0) {
5901 if (ascii_mode == 0)
5902 r = add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
5903 else
5904 r = add_ctype_to_cc_by_range_limit(cc, ctype, not, env->enc, sb_out,
5905 ranges, ASCII_LIMIT);
5906 return r;
5907 }
5908 else if (r != ONIG_NO_SUPPORT_CONFIG) {
5909 return r;
5910 }
5911
5912 r = 0;
5913 is_single = ONIGENC_IS_SINGLEBYTE(enc);
5914 limit = ascii_mode ? ASCII_LIMIT : SINGLE_BYTE_SIZE;
5915
5916 switch (ctype) {
5917 case ONIGENC_CTYPE_ALPHA:
5918 case ONIGENC_CTYPE_BLANK:
5919 case ONIGENC_CTYPE_CNTRL:
5920 case ONIGENC_CTYPE_DIGIT:
5921 case ONIGENC_CTYPE_LOWER:
5922 case ONIGENC_CTYPE_PUNCT:
5923 case ONIGENC_CTYPE_SPACE:
5924 case ONIGENC_CTYPE_UPPER:
5925 case ONIGENC_CTYPE_XDIGIT:
5926 case ONIGENC_CTYPE_ASCII:
5927 case ONIGENC_CTYPE_ALNUM:
5928 if (not != 0) {
5929 for (c = 0; c < (int )limit; c++) {
5930 if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) {
5931 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
5932 BITSET_SET_BIT(cc->bs, c);
5933 }
5934 }
5935 for (c = limit; c < SINGLE_BYTE_SIZE; c++) {
5936 if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
5937 BITSET_SET_BIT(cc->bs, c);
5938 }
5939
5940 if (is_single == 0)
5941 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
5942 }
5943 else {
5944 for (c = 0; c < (int )limit; c++) {
5945 if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) {
5946 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
5947 BITSET_SET_BIT(cc->bs, c);
5948 }
5949 }
5950 }
5951 break;
5952
5953 case ONIGENC_CTYPE_GRAPH:
5954 case ONIGENC_CTYPE_PRINT:
5955 case ONIGENC_CTYPE_WORD:
5956 if (not != 0) {
5957 for (c = 0; c < (int )limit; c++) {
5958 /* check invalid code point */
5959 if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
5960 && ! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
5961 BITSET_SET_BIT(cc->bs, c);
5962 }
5963 for (c = limit; c < SINGLE_BYTE_SIZE; c++) {
5964 if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
5965 BITSET_SET_BIT(cc->bs, c);
5966 }
5967 if (ascii_mode != 0 && is_single == 0)
5968 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
5969 }
5970 else {
5971 for (c = 0; c < (int )limit; c++) {
5972 if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
5973 && ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
5974 BITSET_SET_BIT(cc->bs, c);
5975 }
5976 if (ascii_mode == 0 && is_single == 0)
5977 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
5978 }
5979 break;
5980
5981 default:
5982 return ONIGERR_PARSER_BUG;
5983 break;
5984 }
5985
5986 return r;
5987 }
5988
5989 static int
parse_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)5990 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
5991 {
5992 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
5993 #define POSIX_BRACKET_NAME_MIN_LEN 4
5994
5995 static PosixBracketEntryType PBS[] = {
5996 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
5997 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
5998 { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
5999 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
6000 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
6001 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
6002 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
6003 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
6004 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
6005 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
6006 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
6007 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
6008 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
6009 { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 },
6010 { (UChar* )NULL, -1, 0 }
6011 };
6012
6013 PosixBracketEntryType *pb;
6014 int not, i, r;
6015 OnigCodePoint c;
6016 OnigEncoding enc = env->enc;
6017 UChar *p = *src;
6018
6019 if (PPEEK_IS('^')) {
6020 PINC_S;
6021 not = 1;
6022 }
6023 else
6024 not = 0;
6025
6026 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
6027 goto not_posix_bracket;
6028
6029 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
6030 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
6031 p = (UChar* )onigenc_step(enc, p, end, pb->len);
6032 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
6033 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
6034
6035 r = add_ctype_to_cc(cc, pb->ctype, not, env);
6036 if (r != 0) return r;
6037
6038 PINC_S; PINC_S;
6039 *src = p;
6040 return 0;
6041 }
6042 }
6043
6044 not_posix_bracket:
6045 c = 0;
6046 i = 0;
6047 while (!PEND && ((c = PPEEK) != ':') && c != ']') {
6048 PINC_S;
6049 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
6050 }
6051 if (c == ':' && ! PEND) {
6052 PINC_S;
6053 if (! PEND) {
6054 PFETCH_S(c);
6055 if (c == ']')
6056 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
6057 }
6058 }
6059
6060 return 1; /* 1: is not POSIX bracket, but no error. */
6061 }
6062
6063 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)6064 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
6065 {
6066 int r;
6067 OnigCodePoint c;
6068 OnigEncoding enc;
6069 UChar *prev, *start, *p;
6070
6071 p = *src;
6072 enc = env->enc;
6073 r = ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
6074 start = prev = p;
6075
6076 while (!PEND) {
6077 prev = p;
6078 PFETCH_S(c);
6079 if (c == '}') {
6080 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
6081 if (r >= 0) {
6082 *src = p;
6083 }
6084 else {
6085 onig_scan_env_set_error_string(env, r, *src, prev);
6086 }
6087
6088 return r;
6089 }
6090 else if (c == '(' || c == ')' || c == '{' || c == '|') {
6091 break;
6092 }
6093 }
6094
6095 return r;
6096 }
6097
6098 static int
parse_char_property(Node ** np,PToken * tok,UChar ** src,UChar * end,ScanEnv * env)6099 parse_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)
6100 {
6101 int r, ctype;
6102 CClassNode* cc;
6103
6104 ctype = fetch_char_property_to_ctype(src, end, env);
6105 if (ctype < 0) return ctype;
6106
6107 *np = node_new_cclass();
6108 CHECK_NULL_RETURN_MEMERR(*np);
6109 cc = CCLASS_(*np);
6110 r = add_ctype_to_cc(cc, ctype, FALSE, env);
6111 if (r != 0) return r;
6112 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
6113
6114 return 0;
6115 }
6116
6117
/*
 * State of the character class value handling, as used by
 * cc_char_next() and cc_cprop_next().
 */
typedef enum {
  CS_VALUE,    /* a value is pending; it is added to the class when the
                  next item arrives */
  CS_RANGE,    /* cc_char_next() takes the incoming value as the end of a
                  range; cc_cprop_next() rejects this state */
  CS_COMPLETE, /* set by cc_char_next() after a range has been handled */
  CS_START     /* treated like CS_COMPLETE by cc_char_next(); presumably
                  the initial state -- set by the caller, TODO confirm */
} CSTATE;
6124
/*
 * Kind of the value handled by cc_char_next() and cc_cprop_next().
 */
typedef enum {
  CV_UNDEF, /* presumably "no value yet" -- not set by cc_char_next() or
               cc_cprop_next(), TODO confirm against the caller */
  CV_SB,    /* value is stored in the bitset of the class (cc->bs) */
  CV_MB,    /* value is stored as a code range (cc->mbuf) */
  CV_CPROP  /* set by cc_cprop_next() */
} CVAL;
6131
6132 static int
cc_cprop_next(CClassNode * cc,OnigCodePoint * pcode,CVAL * val,CSTATE * state,ScanEnv * env)6133 cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state,
6134 ScanEnv* env)
6135 {
6136 int r;
6137
6138 if (*state == CS_RANGE)
6139 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
6140
6141 if (*state == CS_VALUE) {
6142 if (*val == CV_SB)
6143 BITSET_SET_BIT(cc->bs, (int )(*pcode));
6144 else if (*val == CV_MB) {
6145 r = add_code_range(&(cc->mbuf), env, *pcode, *pcode);
6146 if (r < 0) return r;
6147 }
6148 }
6149
6150 *state = CS_VALUE;
6151 *val = CV_CPROP;
6152 return 0;
6153 }
6154
6155 static int
cc_char_next(CClassNode * cc,OnigCodePoint * from,OnigCodePoint to,int * from_raw,int to_raw,CVAL intype,CVAL * type,CSTATE * state,ScanEnv * env)6156 cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to,
6157 int* from_raw, int to_raw, CVAL intype, CVAL* type,
6158 CSTATE* state, ScanEnv* env)
6159 {
6160 int r;
6161
6162 switch (*state) {
6163 case CS_VALUE:
6164 if (*type == CV_SB) {
6165 if (*from > 0xff)
6166 return ONIGERR_INVALID_CODE_POINT_VALUE;
6167
6168 BITSET_SET_BIT(cc->bs, (int )(*from));
6169 }
6170 else if (*type == CV_MB) {
6171 r = add_code_range(&(cc->mbuf), env, *from, *from);
6172 if (r < 0) return r;
6173 }
6174 break;
6175
6176 case CS_RANGE:
6177 if (intype == *type) {
6178 if (intype == CV_SB) {
6179 if (*from > 0xff || to > 0xff)
6180 return ONIGERR_INVALID_CODE_POINT_VALUE;
6181
6182 if (*from > to) {
6183 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
6184 goto ccs_range_end;
6185 else
6186 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
6187 }
6188 bitset_set_range(cc->bs, (int )*from, (int )to);
6189 }
6190 else {
6191 r = add_code_range(&(cc->mbuf), env, *from, to);
6192 if (r < 0) return r;
6193 }
6194 }
6195 else {
6196 if (*from > to) {
6197 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
6198 goto ccs_range_end;
6199 else
6200 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
6201 }
6202 bitset_set_range(cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff));
6203 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*from, to);
6204 if (r < 0) return r;
6205 }
6206 ccs_range_end:
6207 *state = CS_COMPLETE;
6208 break;
6209
6210 case CS_COMPLETE:
6211 case CS_START:
6212 *state = CS_VALUE;
6213 break;
6214
6215 default:
6216 break;
6217 }
6218
6219 *from_raw = to_raw;
6220 *from = to;
6221 *type = intype;
6222 return 0;
6223 }
6224
6225 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,ScanEnv * env)6226 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
6227 ScanEnv* env)
6228 {
6229 int in_esc;
6230 OnigCodePoint code;
6231 OnigEncoding enc = env->enc;
6232 UChar* p = from;
6233
6234 in_esc = 0;
6235 while (! PEND) {
6236 if (ignore_escaped && in_esc) {
6237 in_esc = 0;
6238 }
6239 else {
6240 PFETCH_S(code);
6241 if (code == c) return 1;
6242 if (code == MC_ESC(env->syntax)) in_esc = 1;
6243 }
6244 }
6245 return 0;
6246 }
6247
6248 static int
parse_cc(Node ** np,PToken * tok,UChar ** src,UChar * end,ScanEnv * env)6249 parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)
6250 {
6251 int r, neg, len, fetched, and_start;
6252 OnigCodePoint in_code, curr_code;
6253 UChar *p;
6254 Node* node;
6255 CClassNode *cc, *prev_cc;
6256 CClassNode work_cc;
6257 int curr_raw, in_raw;
6258 CSTATE state;
6259 CVAL in_type;
6260 CVAL curr_type;
6261
6262 *np = NULL_NODE;
6263 INC_PARSE_DEPTH(env->parse_depth);
6264
6265 prev_cc = (CClassNode* )NULL;
6266 r = fetch_token_in_cc(tok, src, end, env);
6267 if (r == TK_CHAR && tok->u.code == (OnigCodePoint )'^' && tok->escaped == 0) {
6268 neg = 1;
6269 r = fetch_token_in_cc(tok, src, end, env);
6270 }
6271 else {
6272 neg = 0;
6273 }
6274
6275 if (r < 0) return r;
6276 if (r == TK_CC_CLOSE) {
6277 if (! code_exist_check((OnigCodePoint )']',
6278 *src, env->pattern_end, 1, env))
6279 return ONIGERR_EMPTY_CHAR_CLASS;
6280
6281 CC_ESC_WARN(env, (UChar* )"]");
6282 r = tok->type = TK_CHAR; /* allow []...] */
6283 }
6284
6285 *np = node = node_new_cclass();
6286 CHECK_NULL_RETURN_MEMERR(node);
6287 cc = CCLASS_(node);
6288
6289 and_start = 0;
6290 state = CS_START;
6291 curr_type = CV_UNDEF;
6292
6293 p = *src;
6294 while (r != TK_CC_CLOSE) {
6295 fetched = 0;
6296 switch (r) {
6297 case TK_CHAR:
6298 any_char_in:
6299 len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.code);
6300 if (len < 0) {
6301 r = len;
6302 goto err;
6303 }
6304 in_type = (len == 1) ? CV_SB : CV_MB;
6305 in_code = tok->u.code;
6306 in_raw = 0;
6307 goto val_entry2;
6308 break;
6309
6310 case TK_CRUDE_BYTE:
6311 /* tok->base != 0 : octal or hexadec. */
6312 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
6313 int i, j;
6314 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
6315 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
6316 UChar* psave = p;
6317 int base = tok->base;
6318
6319 buf[0] = tok->u.byte;
6320 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
6321 r = fetch_token_in_cc(tok, &p, end, env);
6322 if (r < 0) goto err;
6323 if (r != TK_CRUDE_BYTE || tok->base != base) {
6324 fetched = 1;
6325 break;
6326 }
6327 buf[i] = tok->u.byte;
6328 }
6329
6330 if (i < ONIGENC_MBC_MINLEN(env->enc)) {
6331 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
6332 goto err;
6333 }
6334
6335 /* clear buf tail */
6336 for (j = i; j < ONIGENC_CODE_TO_MBC_MAXLEN; j++) buf[j] = '\0';
6337
6338 len = enclen(env->enc, buf);
6339 if (i < len) {
6340 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
6341 goto err;
6342 }
6343 else if (i > len) { /* fetch back */
6344 p = psave;
6345 for (i = 1; i < len; i++) {
6346 r = fetch_token_in_cc(tok, &p, end, env);
6347 }
6348 fetched = 0;
6349 }
6350
6351 if (i == 1) {
6352 in_code = (OnigCodePoint )buf[0];
6353 goto crude_single;
6354 }
6355 else {
6356 in_code = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
6357 in_type = CV_MB;
6358 }
6359 }
6360 else {
6361 in_code = (OnigCodePoint )tok->u.byte;
6362 crude_single:
6363 in_type = CV_SB;
6364 }
6365 in_raw = 1;
6366 goto val_entry2;
6367 break;
6368
6369 case TK_CODE_POINT:
6370 in_code = tok->u.code;
6371 in_raw = 1;
6372 val_entry:
6373 len = ONIGENC_CODE_TO_MBCLEN(env->enc, in_code);
6374 if (len < 0) {
6375 if (state != CS_RANGE ||
6376 ! IS_SYNTAX_BV(env->syntax,
6377 ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) ||
6378 in_code < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) {
6379 r = len;
6380 goto err;
6381 }
6382 }
6383 in_type = (len == 1 ? CV_SB : CV_MB);
6384 val_entry2:
6385 r = cc_char_next(cc, &curr_code, in_code, &curr_raw, in_raw, in_type,
6386 &curr_type, &state, env);
6387 if (r != 0) goto err;
6388 break;
6389
6390 case TK_CC_POSIX_BRACKET_OPEN:
6391 r = parse_posix_bracket(cc, &p, end, env);
6392 if (r < 0) goto err;
6393 if (r == 1) { /* is not POSIX bracket */
6394 CC_ESC_WARN(env, (UChar* )"[");
6395 p = tok->backp;
6396 in_code = tok->u.code;
6397 in_raw = 0;
6398 goto val_entry;
6399 }
6400 goto next_cprop;
6401 break;
6402
6403 case TK_CHAR_TYPE:
6404 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
6405 if (r != 0) goto err;
6406
6407 next_cprop:
6408 r = cc_cprop_next(cc, &curr_code, &curr_type, &state, env);
6409 if (r != 0) goto err;
6410 break;
6411
6412 case TK_CHAR_PROPERTY:
6413 {
6414 int ctype = fetch_char_property_to_ctype(&p, end, env);
6415 if (ctype < 0) {
6416 r = ctype;
6417 goto err;
6418 }
6419 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
6420 if (r != 0) goto err;
6421 goto next_cprop;
6422 }
6423 break;
6424
6425 case TK_CC_RANGE:
6426 if (state == CS_VALUE) {
6427 r = fetch_token_in_cc(tok, &p, end, env);
6428 if (r < 0) goto err;
6429
6430 fetched = 1;
6431 if (r == TK_CC_CLOSE) { /* allow [x-] */
6432 range_end_val:
6433 in_code = (OnigCodePoint )'-';
6434 in_raw = 0;
6435 goto val_entry;
6436 }
6437 else if (r == TK_CC_AND) {
6438 CC_ESC_WARN(env, (UChar* )"-");
6439 goto range_end_val;
6440 }
6441
6442 if (curr_type == CV_CPROP) {
6443 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
6444 goto err;
6445 }
6446
6447 state = CS_RANGE;
6448 }
6449 else if (state == CS_START) {
6450 /* [-xa] is allowed */
6451 in_code = tok->u.code;
6452 in_raw = 0;
6453
6454 r = fetch_token_in_cc(tok, &p, end, env);
6455 if (r < 0) goto err;
6456
6457 fetched = 1;
6458 /* [--x] or [a&&-x] is warned. */
6459 if (r == TK_CC_RANGE || and_start != 0)
6460 CC_ESC_WARN(env, (UChar* )"-");
6461
6462 goto val_entry;
6463 }
6464 else if (state == CS_RANGE) {
6465 CC_ESC_WARN(env, (UChar* )"-");
6466 goto any_char_in; /* [!--] is allowed */
6467 }
6468 else { /* CS_COMPLETE */
6469 r = fetch_token_in_cc(tok, &p, end, env);
6470 if (r < 0) goto err;
6471
6472 fetched = 1;
6473 if (r == TK_CC_CLOSE)
6474 goto range_end_val; /* allow [a-b-] */
6475 else if (r == TK_CC_AND) {
6476 CC_ESC_WARN(env, (UChar* )"-");
6477 goto range_end_val;
6478 }
6479
6480 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
6481 CC_ESC_WARN(env, (UChar* )"-");
6482 goto range_end_val; /* [0-9-a] is allowed as [0-9\-a] */
6483 }
6484 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
6485 goto err;
6486 }
6487 break;
6488
6489 case TK_CC_OPEN_CC: /* [ */
6490 {
6491 Node *anode;
6492 CClassNode* acc;
6493
6494 if (state == CS_VALUE) {
6495 r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type,
6496 &state, env);
6497 if (r != 0) goto err;
6498 }
6499 state = CS_COMPLETE;
6500
6501 r = parse_cc(&anode, tok, &p, end, env);
6502 if (r != 0) {
6503 onig_node_free(anode);
6504 goto cc_open_err;
6505 }
6506 acc = CCLASS_(anode);
6507 r = or_cclass(cc, acc, env->enc);
6508 onig_node_free(anode);
6509
6510 cc_open_err:
6511 if (r != 0) goto err;
6512 }
6513 break;
6514
6515 case TK_CC_AND: /* && */
6516 {
6517 if (state == CS_VALUE) {
6518 r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type,
6519 &state, env);
6520 if (r != 0) goto err;
6521 }
6522 /* initialize local variables */
6523 and_start = 1;
6524 state = CS_START;
6525
6526 if (IS_NOT_NULL(prev_cc)) {
6527 r = and_cclass(prev_cc, cc, env->enc);
6528 if (r != 0) goto err;
6529 bbuf_free(cc->mbuf);
6530 }
6531 else {
6532 prev_cc = cc;
6533 cc = &work_cc;
6534 }
6535 initialize_cclass(cc);
6536 }
6537 break;
6538
6539 case TK_EOT:
6540 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
6541 goto err;
6542 break;
6543 default:
6544 r = ONIGERR_PARSER_BUG;
6545 goto err;
6546 break;
6547 }
6548
6549 if (fetched)
6550 r = tok->type;
6551 else {
6552 r = fetch_token_in_cc(tok, &p, end, env);
6553 if (r < 0) goto err;
6554 }
6555 }
6556
6557 if (state == CS_VALUE) {
6558 r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type,
6559 &state, env);
6560 if (r != 0) goto err;
6561 }
6562
6563 if (IS_NOT_NULL(prev_cc)) {
6564 r = and_cclass(prev_cc, cc, env->enc);
6565 if (r != 0) goto err;
6566 bbuf_free(cc->mbuf);
6567 cc = prev_cc;
6568 }
6569
6570 if (neg != 0)
6571 NCCLASS_SET_NOT(cc);
6572 else
6573 NCCLASS_CLEAR_NOT(cc);
6574 if (IS_NCCLASS_NOT(cc) &&
6575 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
6576 int is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
6577 if (is_empty != 0)
6578 BITSET_IS_EMPTY(cc->bs, is_empty);
6579
6580 if (is_empty == 0) {
6581 #define NEWLINE_CODE 0x0a
6582
6583 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
6584 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
6585 BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
6586 else
6587 add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
6588 }
6589 }
6590 }
6591 *src = p;
6592 DEC_PARSE_DEPTH(env->parse_depth);
6593 return 0;
6594
6595 err:
6596 if (cc != CCLASS_(*np))
6597 bbuf_free(cc->mbuf);
6598 return r;
6599 }
6600
/* Forward declaration so that parse_bag() (below) can call parse_alts(). */
static int parse_alts(Node** top, PToken* tok, int term,
                      UChar** src, UChar* end, ScanEnv* env, int group_head);
6603
6604 #ifdef USE_CALLOUT
6605
6606 /* (?{...}[tag][+-]) (?{{...}}[tag][+-]) */
6607 static int
parse_callout_of_contents(Node ** np,int cterm,UChar ** src,UChar * end,ScanEnv * env)6608 parse_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env)
6609 {
6610 int r;
6611 int i;
6612 int in;
6613 int num;
6614 OnigCodePoint c;
6615 UChar* code_start;
6616 UChar* code_end;
6617 UChar* contents;
6618 UChar* tag_start;
6619 UChar* tag_end;
6620 int brace_nest;
6621 CalloutListEntry* e;
6622 RegexExt* ext;
6623 OnigEncoding enc = env->enc;
6624 UChar* p = *src;
6625
6626 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
6627
6628 brace_nest = 0;
6629 while (PPEEK_IS('{')) {
6630 brace_nest++;
6631 PINC_S;
6632 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
6633 }
6634
6635 in = ONIG_CALLOUT_IN_PROGRESS;
6636 code_start = p;
6637 while (1) {
6638 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
6639
6640 code_end = p;
6641 PFETCH_S(c);
6642 if (c == '}') {
6643 i = brace_nest;
6644 while (i > 0) {
6645 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
6646 PFETCH_S(c);
6647 if (c == '}') i--;
6648 else break;
6649 }
6650 if (i == 0) break;
6651 }
6652 }
6653
6654 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6655
6656 PFETCH_S(c);
6657 if (c == '[') {
6658 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6659 tag_end = tag_start = p;
6660 while (! PEND) {
6661 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6662 tag_end = p;
6663 PFETCH_S(c);
6664 if (c == ']') break;
6665 }
6666 if (! is_allowed_callout_tag_name(enc, tag_start, tag_end))
6667 return ONIGERR_INVALID_CALLOUT_TAG_NAME;
6668
6669 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6670 PFETCH_S(c);
6671 }
6672 else {
6673 tag_start = tag_end = 0;
6674 }
6675
6676 if (c == 'X') {
6677 in |= ONIG_CALLOUT_IN_RETRACTION;
6678 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6679 PFETCH_S(c);
6680 }
6681 else if (c == '<') {
6682 in = ONIG_CALLOUT_IN_RETRACTION;
6683 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6684 PFETCH_S(c);
6685 }
6686 else if (c == '>') { /* no needs (default) */
6687 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6688 PFETCH_S(c);
6689 }
6690
6691 if (c != cterm)
6692 return ONIGERR_INVALID_CALLOUT_PATTERN;
6693
6694 r = reg_callout_list_entry(env, &num);
6695 if (r != 0) return r;
6696
6697 ext = onig_get_regex_ext(env->reg);
6698 CHECK_NULL_RETURN_MEMERR(ext);
6699 if (IS_NULL(ext->pattern)) {
6700 r = onig_ext_set_pattern(env->reg, env->pattern, env->pattern_end);
6701 if (r != ONIG_NORMAL) return r;
6702 }
6703
6704 if (tag_start != tag_end) {
6705 r = callout_tag_entry(env, env->reg, tag_start, tag_end, num);
6706 if (r != ONIG_NORMAL) return r;
6707 }
6708
6709 contents = onigenc_strdup(enc, code_start, code_end);
6710 CHECK_NULL_RETURN_MEMERR(contents);
6711
6712 r = node_new_callout(np, ONIG_CALLOUT_OF_CONTENTS, num, ONIG_NON_NAME_ID, env);
6713 if (r != 0) {
6714 xfree(contents);
6715 return r;
6716 }
6717
6718 e = onig_reg_callout_list_at(env->reg, num);
6719 if (IS_NULL(e)) {
6720 xfree(contents);
6721 return ONIGERR_MEMORY;
6722 }
6723
6724 e->of = ONIG_CALLOUT_OF_CONTENTS;
6725 e->in = in;
6726 e->name_id = ONIG_NON_NAME_ID;
6727 e->u.content.start = contents;
6728 e->u.content.end = contents + (code_end - code_start);
6729
6730 *src = p;
6731 return 0;
6732 }
6733
6734 static long
parse_long(OnigEncoding enc,UChar * s,UChar * end,int sign_on,long max,long * rl)6735 parse_long(OnigEncoding enc, UChar* s, UChar* end, int sign_on, long max, long* rl)
6736 {
6737 long v;
6738 long d;
6739 int flag;
6740 UChar* p;
6741 OnigCodePoint c;
6742
6743 if (s >= end) return ONIGERR_INVALID_CALLOUT_ARG;
6744
6745 flag = 1;
6746 v = 0;
6747 p = s;
6748 while (p < end) {
6749 c = ONIGENC_MBC_TO_CODE(enc, p, end);
6750 p += ONIGENC_MBC_ENC_LEN(enc, p);
6751 if (c >= '0' && c <= '9') {
6752 d = (long )(c - '0');
6753 if (v > (max - d) / 10)
6754 return ONIGERR_INVALID_CALLOUT_ARG;
6755
6756 v = v * 10 + d;
6757 }
6758 else if (sign_on != 0 && (c == '-' || c == '+')) {
6759 if (c == '-') flag = -1;
6760 }
6761 else
6762 return ONIGERR_INVALID_CALLOUT_ARG;
6763
6764 sign_on = 0;
6765 }
6766
6767 *rl = flag * v;
6768 return ONIG_NORMAL;
6769 }
6770
6771 static int
parse_callout_args(int skip_mode,int cterm,UChar ** src,UChar * end,int max_arg_num,unsigned int types[],OnigValue vals[],ScanEnv * env)6772 parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end,
6773 int max_arg_num, unsigned int types[], OnigValue vals[],
6774 ScanEnv* env)
6775 {
6776 #define MAX_CALLOUT_ARG_BYTE_LENGTH 128
6777
6778 int r;
6779 int n;
6780 int esc;
6781 int cn;
6782 UChar* s;
6783 UChar* e;
6784 UChar* eesc;
6785 OnigCodePoint c;
6786 UChar* bufend;
6787 UChar buf[MAX_CALLOUT_ARG_BYTE_LENGTH];
6788 OnigEncoding enc = env->enc;
6789 UChar* p = *src;
6790
6791 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
6792
6793 c = 0;
6794 n = 0;
6795 while (n < ONIG_CALLOUT_MAX_ARGS_NUM) {
6796 cn = 0;
6797 esc = 0;
6798 eesc = 0;
6799 bufend = buf;
6800 s = e = p;
6801 while (1) {
6802 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
6803
6804 e = p;
6805 PFETCH_S(c);
6806 if (esc != 0) {
6807 esc = 0;
6808 if (c == '\\' || c == cterm || c == ',') {
6809 /* */
6810 }
6811 else {
6812 e = eesc;
6813 cn++;
6814 }
6815 goto add_char;
6816 }
6817 else {
6818 if (c == '\\') {
6819 esc = 1;
6820 eesc = e;
6821 }
6822 else if (c == cterm || c == ',')
6823 break;
6824 else {
6825 size_t clen;
6826
6827 add_char:
6828 if (skip_mode == FALSE) {
6829 clen = p - e;
6830 if (bufend + clen > buf + MAX_CALLOUT_ARG_BYTE_LENGTH)
6831 return ONIGERR_INVALID_CALLOUT_ARG; /* too long argument */
6832
6833 xmemcpy(bufend, e, clen);
6834 bufend += clen;
6835 }
6836 cn++;
6837 }
6838 }
6839 }
6840
6841 if (cn != 0) {
6842 if (max_arg_num >= 0 && n >= max_arg_num)
6843 return ONIGERR_INVALID_CALLOUT_ARG;
6844
6845 if (skip_mode == FALSE) {
6846 if ((types[n] & ONIG_TYPE_LONG) != 0) {
6847 int fixed = 0;
6848 if (cn > 0) {
6849 long rl;
6850 r = parse_long(enc, buf, bufend, 1, LONG_MAX, &rl);
6851 if (r == ONIG_NORMAL) {
6852 vals[n].l = rl;
6853 fixed = 1;
6854 types[n] = ONIG_TYPE_LONG;
6855 }
6856 }
6857
6858 if (fixed == 0) {
6859 types[n] = (types[n] & ~ONIG_TYPE_LONG);
6860 if (types[n] == ONIG_TYPE_VOID)
6861 return ONIGERR_INVALID_CALLOUT_ARG;
6862 }
6863 }
6864
6865 switch (types[n]) {
6866 case ONIG_TYPE_LONG:
6867 break;
6868
6869 case ONIG_TYPE_CHAR:
6870 if (cn != 1) return ONIGERR_INVALID_CALLOUT_ARG;
6871 vals[n].c = ONIGENC_MBC_TO_CODE(enc, buf, bufend);
6872 break;
6873
6874 case ONIG_TYPE_STRING:
6875 {
6876 UChar* rs = onigenc_strdup(enc, buf, bufend);
6877 CHECK_NULL_RETURN_MEMERR(rs);
6878 vals[n].s.start = rs;
6879 vals[n].s.end = rs + (e - s);
6880 }
6881 break;
6882
6883 case ONIG_TYPE_TAG:
6884 if (eesc != 0 || ! is_allowed_callout_tag_name(enc, s, e))
6885 return ONIGERR_INVALID_CALLOUT_TAG_NAME;
6886
6887 vals[n].s.start = s;
6888 vals[n].s.end = e;
6889 break;
6890
6891 case ONIG_TYPE_VOID:
6892 case ONIG_TYPE_POINTER:
6893 return ONIGERR_PARSER_BUG;
6894 break;
6895 }
6896 }
6897
6898 n++;
6899 }
6900
6901 if (c == cterm) break;
6902 }
6903
6904 if (c != cterm) return ONIGERR_INVALID_CALLOUT_PATTERN;
6905
6906 *src = p;
6907 return n;
6908 }
6909
6910 /* (*name[TAG]) (*name[TAG]{a,b,..}) */
6911 static int
parse_callout_of_name(Node ** np,int cterm,UChar ** src,UChar * end,ScanEnv * env)6912 parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env)
6913 {
6914 int r;
6915 int i;
6916 int in;
6917 int num;
6918 int name_id;
6919 int arg_num;
6920 int max_arg_num;
6921 int opt_arg_num;
6922 int is_not_single;
6923 OnigCodePoint c;
6924 UChar* name_start;
6925 UChar* name_end;
6926 UChar* tag_start;
6927 UChar* tag_end;
6928 Node* node;
6929 CalloutListEntry* e;
6930 RegexExt* ext;
6931 unsigned int types[ONIG_CALLOUT_MAX_ARGS_NUM];
6932 OnigValue vals[ONIG_CALLOUT_MAX_ARGS_NUM];
6933 OnigEncoding enc = env->enc;
6934 UChar* p = *src;
6935
6936 /* PFETCH_READY; */
6937 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
6938
6939 node = 0;
6940 name_start = p;
6941 while (1) {
6942 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6943 name_end = p;
6944 PFETCH_S(c);
6945 if (c == cterm || c == '[' || c == '{') break;
6946 }
6947
6948 if (! is_allowed_callout_name(enc, name_start, name_end))
6949 return ONIGERR_INVALID_CALLOUT_NAME;
6950
6951 if (c == '[') {
6952 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6953 tag_end = tag_start = p;
6954 while (! PEND) {
6955 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6956 tag_end = p;
6957 PFETCH_S(c);
6958 if (c == ']') break;
6959 }
6960 if (! is_allowed_callout_tag_name(enc, tag_start, tag_end))
6961 return ONIGERR_INVALID_CALLOUT_TAG_NAME;
6962
6963 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6964 PFETCH_S(c);
6965 }
6966 else {
6967 tag_start = tag_end = 0;
6968 }
6969
6970 if (c == '{') {
6971 UChar* save;
6972
6973 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6974
6975 /* read for single check only */
6976 save = p;
6977 arg_num = parse_callout_args(TRUE, '}', &p, end, -1, NULL, NULL, env);
6978 if (arg_num < 0) return arg_num;
6979
6980 is_not_single = PPEEK_IS(cterm) ? 0 : 1;
6981 p = save;
6982 r = get_callout_name_id_by_name(enc, is_not_single, name_start, name_end,
6983 &name_id);
6984 if (r != ONIG_NORMAL) return r;
6985
6986 max_arg_num = get_callout_arg_num_by_name_id(name_id);
6987 for (i = 0; i < max_arg_num; i++) {
6988 types[i] = get_callout_arg_type_by_name_id(name_id, i);
6989 }
6990
6991 arg_num = parse_callout_args(FALSE, '}', &p, end, max_arg_num, types, vals, env);
6992 if (arg_num < 0) return arg_num;
6993
6994 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6995 PFETCH_S(c);
6996 }
6997 else {
6998 arg_num = 0;
6999
7000 is_not_single = 0;
7001 r = get_callout_name_id_by_name(enc, is_not_single, name_start, name_end,
7002 &name_id);
7003 if (r != ONIG_NORMAL) return r;
7004
7005 max_arg_num = get_callout_arg_num_by_name_id(name_id);
7006 for (i = 0; i < max_arg_num; i++) {
7007 types[i] = get_callout_arg_type_by_name_id(name_id, i);
7008 }
7009 }
7010
7011 in = onig_get_callout_in_by_name_id(name_id);
7012 opt_arg_num = get_callout_opt_arg_num_by_name_id(name_id);
7013 if (arg_num > max_arg_num || arg_num < (max_arg_num - opt_arg_num))
7014 return ONIGERR_INVALID_CALLOUT_ARG;
7015
7016 if (c != cterm)
7017 return ONIGERR_INVALID_CALLOUT_PATTERN;
7018
7019 r = reg_callout_list_entry(env, &num);
7020 if (r != 0) return r;
7021
7022 ext = onig_get_regex_ext(env->reg);
7023 CHECK_NULL_RETURN_MEMERR(ext);
7024 if (IS_NULL(ext->pattern)) {
7025 r = onig_ext_set_pattern(env->reg, env->pattern, env->pattern_end);
7026 if (r != ONIG_NORMAL) return r;
7027 }
7028
7029 if (tag_start != tag_end) {
7030 r = callout_tag_entry(env, env->reg, tag_start, tag_end, num);
7031 if (r != ONIG_NORMAL) return r;
7032 }
7033
7034 r = node_new_callout(&node, ONIG_CALLOUT_OF_NAME, num, name_id, env);
7035 if (r != ONIG_NORMAL) return r;
7036
7037 e = onig_reg_callout_list_at(env->reg, num);
7038 CHECK_NULL_RETURN_MEMERR(e);
7039
7040 e->of = ONIG_CALLOUT_OF_NAME;
7041 e->in = in;
7042 e->name_id = name_id;
7043 e->type = onig_get_callout_type_by_name_id(name_id);
7044 e->start_func = onig_get_callout_start_func_by_name_id(name_id);
7045 e->end_func = onig_get_callout_end_func_by_name_id(name_id);
7046 e->u.arg.num = max_arg_num;
7047 e->u.arg.passed_num = arg_num;
7048 for (i = 0; i < max_arg_num; i++) {
7049 e->u.arg.types[i] = types[i];
7050 if (i < arg_num)
7051 e->u.arg.vals[i] = vals[i];
7052 else
7053 e->u.arg.vals[i] = get_callout_opt_default_by_name_id(name_id, i);
7054 }
7055
7056 *np = node;
7057 *src = p;
7058 return 0;
7059 }
7060 #endif
7061
7062 static int
parse_bag(Node ** np,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)7063 parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
7064 ScanEnv* env)
7065 {
7066 int r, num;
7067 Node *target;
7068 OnigOptionType option;
7069 OnigCodePoint c;
7070 int list_capture;
7071 OnigEncoding enc = env->enc;
7072
7073 UChar* p = *src;
7074 PFETCH_READY;
7075
7076 *np = NULL;
7077 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
7078
7079 option = env->options;
7080 c = PPEEK;
7081 if (c == '?' && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
7082 PINC;
7083 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7084
7085 PFETCH(c);
7086 switch (c) {
7087 case ':': /* (?:...) grouping only */
7088 group:
7089 r = fetch_token(tok, &p, end, env);
7090 if (r < 0) return r;
7091 r = parse_alts(np, tok, term, &p, end, env, FALSE);
7092 if (r < 0) return r;
7093 *src = p;
7094 return 1; /* group */
7095 break;
7096
7097 case '=':
7098 *np = onig_node_new_anchor(ANCR_PREC_READ, FALSE);
7099 break;
7100 case '!': /* preceding read */
7101 *np = onig_node_new_anchor(ANCR_PREC_READ_NOT, FALSE);
7102 break;
7103 case '>': /* (?>...) stop backtrack */
7104 *np = node_new_bag(BAG_STOP_BACKTRACK);
7105 break;
7106
7107 case '\'':
7108 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
7109 goto named_group1;
7110 }
7111 else
7112 return ONIGERR_UNDEFINED_GROUP_OPTION;
7113 break;
7114
7115 case '<': /* look behind (?<=...), (?<!...) */
7116 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
7117 PFETCH(c);
7118 if (c == '=')
7119 *np = onig_node_new_anchor(ANCR_LOOK_BEHIND, FALSE);
7120 else if (c == '!')
7121 *np = onig_node_new_anchor(ANCR_LOOK_BEHIND_NOT, FALSE);
7122 else {
7123 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
7124 UChar *name;
7125 UChar *name_end;
7126 enum REF_NUM num_type;
7127
7128 PUNFETCH;
7129 c = '<';
7130
7131 named_group1:
7132 list_capture = 0;
7133
7134 named_group2:
7135 name = p;
7136 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num,
7137 &num_type, FALSE);
7138 if (r < 0) return r;
7139
7140 num = scan_env_add_mem_entry(env);
7141 if (num < 0) return num;
7142 if (list_capture != 0 && num >= (int )MEM_STATUS_BITS_NUM)
7143 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
7144
7145 r = name_add(env->reg, name, name_end, num, env);
7146 if (r != 0) return r;
7147 *np = node_new_memory(1);
7148 CHECK_NULL_RETURN_MEMERR(*np);
7149 BAG_(*np)->m.regnum = num;
7150 if (list_capture != 0)
7151 MEM_STATUS_ON_SIMPLE(env->cap_history, num);
7152 env->num_named++;
7153 }
7154 else {
7155 return ONIGERR_UNDEFINED_GROUP_OPTION;
7156 }
7157 }
7158 break;
7159
7160 case '~':
7161 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP)) {
7162 Node* absent;
7163 Node* expr;
7164 int head_bar;
7165 int is_range_cutter;
7166
7167 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7168
7169 if (PPEEK_IS('|')) { /* (?~|generator|absent) */
7170 PINC;
7171 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7172
7173 head_bar = 1;
7174 if (PPEEK_IS(')')) { /* (?~|) : range clear */
7175 PINC;
7176 r = make_range_clear(np, env);
7177 if (r != 0) return r;
7178 goto end;
7179 }
7180 }
7181 else
7182 head_bar = 0;
7183
7184 r = fetch_token(tok, &p, end, env);
7185 if (r < 0) return r;
7186 r = parse_alts(&absent, tok, term, &p, end, env, TRUE);
7187 if (r < 0) {
7188 onig_node_free(absent);
7189 return r;
7190 }
7191
7192 expr = NULL_NODE;
7193 is_range_cutter = 0;
7194 if (head_bar != 0) {
7195 Node* top = absent;
7196 if (NODE_TYPE(top) != NODE_ALT || IS_NULL(NODE_CDR(top))) {
7197 expr = NULL_NODE;
7198 is_range_cutter = 1;
7199 /* return ONIGERR_INVALID_ABSENT_GROUP_GENERATOR_PATTERN; */
7200 }
7201 else {
7202 absent = NODE_CAR(top);
7203 expr = NODE_CDR(top);
7204 NODE_CAR(top) = NULL_NODE;
7205 NODE_CDR(top) = NULL_NODE;
7206 onig_node_free(top);
7207 if (IS_NULL(NODE_CDR(expr))) {
7208 top = expr;
7209 expr = NODE_CAR(top);
7210 NODE_CAR(top) = NULL_NODE;
7211 onig_node_free(top);
7212 }
7213 }
7214 }
7215
7216 r = make_absent_tree(np, absent, expr, is_range_cutter, env);
7217 if (r != 0) {
7218 return r;
7219 }
7220 goto end;
7221 }
7222 else {
7223 return ONIGERR_UNDEFINED_GROUP_OPTION;
7224 }
7225 break;
7226
7227 #ifdef USE_CALLOUT
7228 case '{':
7229 if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS))
7230 return ONIGERR_UNDEFINED_GROUP_OPTION;
7231
7232 r = parse_callout_of_contents(np, ')', &p, end, env);
7233 if (r != 0) return r;
7234
7235 goto end;
7236 break;
7237 #endif
7238
7239 case '(':
7240 /* (?()...) */
7241 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE)) {
7242 UChar *prev;
7243 Node* condition;
7244 int condition_is_checker;
7245
7246 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7247 PFETCH(c);
7248 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7249
7250 if (IS_CODE_DIGIT_ASCII(enc, c)
7251 || c == '-' || c == '+' || c == '<' || c == '\'') {
7252 UChar* name_end;
7253 int back_num;
7254 int exist_level;
7255 int level;
7256 enum REF_NUM num_type;
7257 int is_enclosed;
7258
7259 is_enclosed = (c == '<' || c == '\'') ? 1 : 0;
7260 if (! is_enclosed)
7261 PUNFETCH;
7262 prev = p;
7263 exist_level = 0;
7264 #ifdef USE_BACKREF_WITH_LEVEL
7265 name_end = NULL_UCHARP; /* no need. escape gcc warning. */
7266 r = fetch_name_with_level(
7267 (OnigCodePoint )(is_enclosed != 0 ? c : '('),
7268 &p, end, &name_end,
7269 env, &back_num, &level, &num_type);
7270 if (r == 1) exist_level = 1;
7271 #else
7272 r = fetch_name((OnigCodePoint )(is_enclosed != 0 ? c : '('),
7273 &p, end, &name_end, env, &back_num, &num_type, TRUE);
7274 #endif
7275 if (r < 0) {
7276 if (is_enclosed == 0) {
7277 goto any_condition;
7278 }
7279 else
7280 return r;
7281 }
7282
7283 condition_is_checker = 1;
7284 if (num_type != IS_NOT_NUM) {
7285 if (num_type == IS_REL_NUM) {
7286 back_num = backref_rel_to_abs(back_num, env);
7287 }
7288 if (back_num <= 0)
7289 return ONIGERR_INVALID_BACKREF;
7290
7291 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
7292 if (back_num > env->num_mem ||
7293 IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node))
7294 return ONIGERR_INVALID_BACKREF;
7295 }
7296
7297 condition = node_new_backref_checker(1, &back_num, FALSE,
7298 #ifdef USE_BACKREF_WITH_LEVEL
7299 exist_level, level,
7300 #endif
7301 env);
7302 }
7303 else {
7304 int num;
7305 int* backs;
7306
7307 num = name_to_group_numbers(env, prev, name_end, &backs);
7308 if (num <= 0) {
7309 return ONIGERR_UNDEFINED_NAME_REFERENCE;
7310 }
7311 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
7312 int i;
7313 for (i = 0; i < num; i++) {
7314 if (backs[i] > env->num_mem ||
7315 IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node))
7316 return ONIGERR_INVALID_BACKREF;
7317 }
7318 }
7319
7320 condition = node_new_backref_checker(num, backs, TRUE,
7321 #ifdef USE_BACKREF_WITH_LEVEL
7322 exist_level, level,
7323 #endif
7324 env);
7325 }
7326
7327 if (is_enclosed != 0) {
7328 if (PEND) goto err_if_else;
7329 PFETCH(c);
7330 if (c != ')') goto err_if_else;
7331 }
7332 }
7333 #ifdef USE_CALLOUT
7334 else if (c == '?') {
7335 if (IS_SYNTAX_OP2(env->syntax,
7336 ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS)) {
7337 if (! PEND && PPEEK_IS('{')) {
7338 /* condition part is callouts of contents: (?(?{...})THEN|ELSE) */
7339 condition_is_checker = 0;
7340 PFETCH(c);
7341 r = parse_callout_of_contents(&condition, ')', &p, end, env);
7342 if (r != 0) return r;
7343 goto end_condition;
7344 }
7345 }
7346 goto any_condition;
7347 }
7348 else if (c == '*' &&
7349 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME)) {
7350 condition_is_checker = 0;
7351 r = parse_callout_of_name(&condition, ')', &p, end, env);
7352 if (r != 0) return r;
7353 goto end_condition;
7354 }
7355 #endif
7356 else {
7357 any_condition:
7358 PUNFETCH;
7359 condition_is_checker = 0;
7360 r = fetch_token(tok, &p, end, env);
7361 if (r < 0) return r;
7362 r = parse_alts(&condition, tok, term, &p, end, env, FALSE);
7363 if (r < 0) {
7364 onig_node_free(condition);
7365 return r;
7366 }
7367 }
7368
7369 #ifdef USE_CALLOUT
7370 end_condition:
7371 #endif
7372 CHECK_NULL_RETURN_MEMERR(condition);
7373
7374 if (PEND) {
7375 err_if_else:
7376 onig_node_free(condition);
7377 return ONIGERR_END_PATTERN_IN_GROUP;
7378 }
7379
7380 if (PPEEK_IS(')')) { /* case: empty body: make backref checker */
7381 if (condition_is_checker == 0) {
7382 onig_node_free(condition);
7383 return ONIGERR_INVALID_IF_ELSE_SYNTAX;
7384 }
7385 PFETCH(c);
7386 *np = condition;
7387 }
7388 else { /* if-else */
7389 int then_is_empty;
7390 Node *Then, *Else;
7391
7392 Then = 0;
7393 if (PPEEK_IS('|')) {
7394 PFETCH(c);
7395 then_is_empty = 1;
7396 }
7397 else
7398 then_is_empty = 0;
7399
7400 r = fetch_token(tok, &p, end, env);
7401 if (r < 0) {
7402 onig_node_free(condition);
7403 return r;
7404 }
7405 r = parse_alts(&target, tok, term, &p, end, env, TRUE);
7406 if (r < 0) {
7407 onig_node_free(condition);
7408 onig_node_free(target);
7409 return r;
7410 }
7411
7412 if (then_is_empty != 0) {
7413 Else = target;
7414 }
7415 else {
7416 if (NODE_TYPE(target) == NODE_ALT) {
7417 Then = NODE_CAR(target);
7418 if (NODE_CDR(NODE_CDR(target)) == NULL_NODE) {
7419 Else = NODE_CAR(NODE_CDR(target));
7420 cons_node_free_alone(NODE_CDR(target));
7421 }
7422 else {
7423 Else = NODE_CDR(target);
7424 }
7425 cons_node_free_alone(target);
7426 }
7427 else {
7428 Then = target;
7429 Else = 0;
7430 }
7431 }
7432
7433 *np = node_new_bag_if_else(condition, Then, Else);
7434 if (IS_NULL(*np)) {
7435 onig_node_free(condition);
7436 onig_node_free(Then);
7437 onig_node_free(Else);
7438 return ONIGERR_MEMORY;
7439 }
7440 }
7441 goto end;
7442 }
7443 else {
7444 return ONIGERR_UNDEFINED_GROUP_OPTION;
7445 }
7446 break;
7447
7448 #ifdef USE_CAPTURE_HISTORY
7449 case '@':
7450 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
7451 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
7452 PFETCH(c);
7453 if (c == '<' || c == '\'') {
7454 list_capture = 1;
7455 goto named_group2; /* (?@<name>...) */
7456 }
7457 PUNFETCH;
7458 }
7459
7460 *np = node_new_memory(0);
7461 CHECK_NULL_RETURN_MEMERR(*np);
7462 num = scan_env_add_mem_entry(env);
7463 if (num < 0) {
7464 return num;
7465 }
7466 else if (num >= (int )MEM_STATUS_BITS_NUM) {
7467 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
7468 }
7469 BAG_(*np)->m.regnum = num;
7470 MEM_STATUS_ON_SIMPLE(env->cap_history, num);
7471 }
7472 else {
7473 return ONIGERR_UNDEFINED_GROUP_OPTION;
7474 }
7475 break;
7476 #endif
7477
7478 #ifdef USE_POSIXLINE_OPTION
7479 case 'p':
7480 #endif
7481 case '-': case 'i': case 'm': case 's': case 'x':
7482 case 'W': case 'D': case 'S': case 'P':
7483 case 'y':
7484 {
7485 int neg = 0;
7486
7487 while (1) {
7488 switch (c) {
7489 case ':':
7490 case ')':
7491 break;
7492
7493 case '-': neg = 1; break;
7494 case 'x': OPTION_NEGATE(option, ONIG_OPTION_EXTEND, neg); break;
7495 case 'i': OPTION_NEGATE(option, ONIG_OPTION_IGNORECASE, neg); break;
7496 case 's':
7497 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
7498 OPTION_NEGATE(option, ONIG_OPTION_MULTILINE, neg);
7499 }
7500 else
7501 return ONIGERR_UNDEFINED_GROUP_OPTION;
7502 break;
7503
7504 case 'm':
7505 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
7506 OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? TRUE : FALSE));
7507 }
7508 else if (IS_SYNTAX_OP2(env->syntax,
7509 ONIG_SYN_OP2_OPTION_ONIGURUMA|ONIG_SYN_OP2_OPTION_RUBY)) {
7510 OPTION_NEGATE(option, ONIG_OPTION_MULTILINE, neg);
7511 }
7512 else
7513 return ONIGERR_UNDEFINED_GROUP_OPTION;
7514 break;
7515 #ifdef USE_POSIXLINE_OPTION
7516 case 'p':
7517 OPTION_NEGATE(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
7518 break;
7519 #endif
7520 case 'W': OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); break;
7521 case 'D': OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); break;
7522 case 'S': OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); break;
7523 case 'P': OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break;
7524
7525 case 'y': /* y{g}, y{w} */
7526 {
7527 if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA))
7528 return ONIGERR_UNDEFINED_GROUP_OPTION;
7529
7530 if (neg != 0) return ONIGERR_UNDEFINED_GROUP_OPTION;
7531
7532 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7533 if (! PPEEK_IS('{')) return ONIGERR_UNDEFINED_GROUP_OPTION;
7534 PFETCH(c);
7535 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7536 PFETCH(c);
7537 switch (c) {
7538 case 'g':
7539 if (! ONIGENC_IS_UNICODE_ENCODING(enc))
7540 return ONIGERR_UNDEFINED_GROUP_OPTION;
7541
7542 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, FALSE);
7543 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, TRUE);
7544 break;
7545 #ifdef USE_UNICODE_WORD_BREAK
7546 case 'w':
7547 if (! ONIGENC_IS_UNICODE_ENCODING(enc))
7548 return ONIGERR_UNDEFINED_GROUP_OPTION;
7549
7550 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, FALSE);
7551 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, TRUE);
7552 break;
7553 #endif
7554 default:
7555 return ONIGERR_UNDEFINED_GROUP_OPTION;
7556 break;
7557 }
7558 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7559 PFETCH(c);
7560 if (c != '}')
7561 return ONIGERR_UNDEFINED_GROUP_OPTION;
7562 break;
7563 } /* case 'y' */
7564
7565 default:
7566 return ONIGERR_UNDEFINED_GROUP_OPTION;
7567 }
7568
7569 if (c == ')') {
7570 *np = node_new_option(option);
7571 CHECK_NULL_RETURN_MEMERR(*np);
7572 *src = p;
7573 return 2; /* option only */
7574 }
7575 else if (c == ':') {
7576 OnigOptionType prev = env->options;
7577
7578 env->options = option;
7579 r = fetch_token(tok, &p, end, env);
7580 if (r < 0) return r;
7581 r = parse_alts(&target, tok, term, &p, end, env, FALSE);
7582 env->options = prev;
7583 if (r < 0) {
7584 onig_node_free(target);
7585 return r;
7586 }
7587 *np = node_new_option(option);
7588 CHECK_NULL_RETURN_MEMERR(*np);
7589 NODE_BODY(*np) = target;
7590 *src = p;
7591 return 0;
7592 }
7593
7594 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7595 PFETCH(c);
7596 } /* while (1) */
7597 }
7598 break;
7599
7600 default:
7601 return ONIGERR_UNDEFINED_GROUP_OPTION;
7602 }
7603 }
7604 #ifdef USE_CALLOUT
7605 else if (c == '*' &&
7606 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME)) {
7607 PINC;
7608 r = parse_callout_of_name(np, ')', &p, end, env);
7609 if (r != 0) return r;
7610
7611 goto end;
7612 }
7613 #endif
7614 else {
7615 if (ONIG_IS_OPTION_ON(env->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
7616 goto group;
7617
7618 *np = node_new_memory(0);
7619 CHECK_NULL_RETURN_MEMERR(*np);
7620 num = scan_env_add_mem_entry(env);
7621 if (num < 0) return num;
7622 BAG_(*np)->m.regnum = num;
7623 }
7624
7625 CHECK_NULL_RETURN_MEMERR(*np);
7626 r = fetch_token(tok, &p, end, env);
7627 if (r < 0) return r;
7628 r = parse_alts(&target, tok, term, &p, end, env, FALSE);
7629 if (r < 0) {
7630 onig_node_free(target);
7631 return r;
7632 }
7633
7634 NODE_BODY(*np) = target;
7635
7636 if (NODE_TYPE(*np) == NODE_BAG) {
7637 if (BAG_(*np)->type == BAG_MEMORY) {
7638 /* Don't move this to previous of parse_alts() */
7639 r = scan_env_set_mem_node(env, BAG_(*np)->m.regnum, *np);
7640 if (r != 0) return r;
7641 }
7642 }
7643
7644 end:
7645 *src = p;
7646 return 0;
7647 }
7648
/* Printable spellings of the simple quantifiers.  Indexed by the value
   returned from quantifier_type_num() when building the nested-repeat
   warning message in assign_quantifier_body(). */
static const char* PopularQStr[] = {
  "?",
  "*",
  "+",
  "??",
  "*?",
  "+?"
};
7652
/* Text describing what a pair of nested quantifiers was replaced with.
   Indexed by a ReduceTypeTable[][] entry when the warning message is
   formatted in assign_quantifier_body().  The RQ_ASIS and RQ_DEL cases
   there never read this table; presumably they correspond to the two
   leading empty strings -- confirm against the enum definition. */
static const char* ReduceQStr[] = {
  "",
  "",
  "*",
  "*?",
  "??",
  "+ and ??",
  "+? and ?"
};
7656
7657 static int
assign_quantifier_body(Node * qnode,Node * target,int group,ScanEnv * env)7658 assign_quantifier_body(Node* qnode, Node* target, int group, ScanEnv* env)
7659 {
7660 QuantNode* qn;
7661
7662 qn = QUANT_(qnode);
7663 if (qn->lower == 1 && qn->upper == 1)
7664 return 1;
7665
7666 switch (NODE_TYPE(target)) {
7667 case NODE_STRING:
7668 if (group == 0) {
7669 if (str_node_can_be_split(target, env->enc)) {
7670 Node* n = str_node_split_last_char(target, env->enc);
7671 if (IS_NOT_NULL(n)) {
7672 NODE_BODY(qnode) = n;
7673 return 2;
7674 }
7675 }
7676 }
7677 break;
7678
7679 case NODE_QUANT:
7680 { /* check redundant double repeat. */
7681 /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
7682 QuantNode* qnt = QUANT_(target);
7683 int nestq_num = quantifier_type_num(qn);
7684 int targetq_num = quantifier_type_num(qnt);
7685
7686 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
7687 if (targetq_num >= 0 && nestq_num >= 0 &&
7688 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
7689 UChar buf[WARN_BUFSIZE];
7690
7691 switch(ReduceTypeTable[targetq_num][nestq_num]) {
7692 case RQ_ASIS:
7693 break;
7694
7695 case RQ_DEL:
7696 if (onig_verb_warn != onig_null_warn) {
7697 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
7698 env->pattern, env->pattern_end,
7699 (UChar* )"redundant nested repeat operator");
7700 (*onig_verb_warn)((char* )buf);
7701 }
7702 goto warn_exit;
7703 break;
7704
7705 default:
7706 if (onig_verb_warn != onig_null_warn) {
7707 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
7708 env->pattern, env->pattern_end,
7709 (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
7710 PopularQStr[targetq_num], PopularQStr[nestq_num],
7711 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
7712 (*onig_verb_warn)((char* )buf);
7713 }
7714 goto warn_exit;
7715 break;
7716 }
7717 }
7718
7719 warn_exit:
7720 #endif
7721 if (targetq_num >= 0 && nestq_num < 0) {
7722 if (targetq_num == 1 || targetq_num == 2) { /* * or + */
7723 /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
7724 if (! IS_INFINITE_REPEAT(qn->upper) && qn->upper > 1 && qn->greedy) {
7725 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
7726 }
7727 }
7728 }
7729 else {
7730 int r;
7731
7732 NODE_BODY(qnode) = target;
7733 r = onig_reduce_nested_quantifier(qnode);
7734 return r;
7735 }
7736 }
7737 break;
7738
7739 default:
7740 break;
7741 }
7742
7743 NODE_BODY(qnode) = target;
7744 return 0;
7745 }
7746
7747
7748 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
7749 static int
clear_not_flag_cclass(CClassNode * cc,OnigEncoding enc)7750 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
7751 {
7752 BBuf *tbuf;
7753 int r;
7754
7755 if (IS_NCCLASS_NOT(cc)) {
7756 bitset_invert(cc->bs);
7757
7758 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
7759 r = not_code_range_buf(enc, cc->mbuf, &tbuf);
7760 if (r != 0) return r;
7761
7762 bbuf_free(cc->mbuf);
7763 cc->mbuf = tbuf;
7764 }
7765
7766 NCCLASS_CLEAR_NOT(cc);
7767 }
7768
7769 return 0;
7770 }
7771 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
7772
/*
 * Add the single code point `code` to character class `cc`.
 * When the encoding's minimum character length is greater than one, or
 * the code does not encode to exactly one byte, the code goes into the
 * range buffer cc->mbuf; otherwise its bit is set in cc->bs.
 *
 * NOTE: `code` and `enc` are expanded more than once, so pass
 * expressions without side effects.  The value returned by
 * add_code_range_to_buf() is discarded by this macro.
 */
#define ADD_CODE_INTO_CC(cc, code, enc) do {\
  if (ONIGENC_MBC_MINLEN(enc) > 1 || ONIGENC_CODE_TO_MBCLEN(enc, code) != 1) {\
    add_code_range_to_buf(&((cc)->mbuf), code, code);\
  }\
  else {\
    BITSET_SET_BIT((cc)->bs, code);\
  }\
} while (0)
7781
7782 extern int
onig_new_cclass_with_code_list(Node ** rnode,OnigEncoding enc,int n,OnigCodePoint codes[])7783 onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc,
7784 int n, OnigCodePoint codes[])
7785 {
7786 int i;
7787 Node* node;
7788 CClassNode* cc;
7789
7790 *rnode = NULL_NODE;
7791
7792 node = node_new_cclass();
7793 CHECK_NULL_RETURN_MEMERR(node);
7794
7795 cc = CCLASS_(node);
7796
7797 for (i = 0; i < n; i++) {
7798 ADD_CODE_INTO_CC(cc, codes[i], enc);
7799 }
7800
7801 *rnode = node;
7802 return 0;
7803 }
7804
/* State handed to i_apply_case_fold() through its void* argument. */
typedef struct {
  ScanEnv*    env;       /* parse environment; its enc is used for all encoding queries */
  CClassNode* cc;        /* character class being extended with case-fold variants */
  Node*       alt_root;  /* head of the chain of alternative nodes created for
                            multi-code-point folds; NULL_NODE while none exist */
  Node**      ptail;     /* where the next alternative node is stored; starts at
                            &alt_root and is advanced to the cdr of each new node */
} IApplyCaseFoldArg;
7811
7812 static int
i_apply_case_fold(OnigCodePoint from,OnigCodePoint to[],int to_len,void * arg)7813 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg)
7814 {
7815 IApplyCaseFoldArg* iarg;
7816 ScanEnv* env;
7817 CClassNode* cc;
7818
7819 iarg = (IApplyCaseFoldArg* )arg;
7820 env = iarg->env;
7821 cc = iarg->cc;
7822
7823 if (to_len == 1) {
7824 int is_in = onig_is_code_in_cc(env->enc, from, cc);
7825 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
7826 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
7827 (is_in == 0 && IS_NCCLASS_NOT(cc))) {
7828 ADD_CODE_INTO_CC(cc, *to, env->enc);
7829 }
7830 #else
7831 if (is_in != 0) {
7832 if (ONIGENC_MBC_MINLEN(env->enc) > 1 ||
7833 ONIGENC_CODE_TO_MBCLEN(env->enc, *to) != 1) {
7834 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
7835 add_code_range(&(cc->mbuf), env, *to, *to);
7836 }
7837 else {
7838 if (IS_NCCLASS_NOT(cc)) {
7839 BITSET_CLEAR_BIT(cc->bs, *to);
7840 }
7841 else
7842 BITSET_SET_BIT(cc->bs, *to);
7843 }
7844 }
7845 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
7846 }
7847 else {
7848 int r, i, len;
7849 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
7850
7851 if (onig_is_code_in_cc(env->enc, from, cc)
7852 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
7853 && !IS_NCCLASS_NOT(cc)
7854 #endif
7855 ) {
7856 int n, j, m, index;
7857 Node* list_node;
7858 Node* ns[3];
7859
7860 n = 0;
7861 for (i = 0; i < to_len; i++) {
7862 OnigCodePoint code;
7863 Node* csnode;
7864 CClassNode* cs_cc;
7865
7866 index = onigenc_unicode_fold1_key(&to[i]);
7867 if (index >= 0) {
7868 csnode = node_new_cclass();
7869 cs_cc = CCLASS_(csnode);
7870 if (IS_NULL(csnode)) {
7871 err_free_ns:
7872 for (j = 0; j < n; j++) onig_node_free(ns[j]);
7873 return ONIGERR_MEMORY;
7874 }
7875 m = FOLDS1_UNFOLDS_NUM(index);
7876 for (j = 0; j < m; j++) {
7877 code = FOLDS1_UNFOLDS(index)[j];
7878 ADD_CODE_INTO_CC(cs_cc, code, env->enc);
7879 }
7880 ADD_CODE_INTO_CC(cs_cc, to[i], env->enc);
7881 ns[n++] = csnode;
7882 }
7883 else {
7884 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
7885 if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) {
7886 csnode = onig_node_new_str(buf, buf + len);
7887 if (IS_NULL(csnode)) goto err_free_ns;
7888
7889 NODE_STRING_SET_CASE_EXPANDED(csnode);
7890 ns[n++] = csnode;
7891 }
7892 else {
7893 r = onig_node_str_cat(ns[n-1], buf, buf + len);
7894 if (r < 0) goto err_free_ns;
7895 }
7896 }
7897 }
7898
7899 if (n == 1)
7900 list_node = ns[0];
7901 else
7902 list_node = make_list(n, ns);
7903
7904 *(iarg->ptail) = onig_node_new_alt(list_node, NULL_NODE);
7905 if (IS_NULL(*(iarg->ptail))) {
7906 onig_node_free(list_node);
7907 return ONIGERR_MEMORY;
7908 }
7909 iarg->ptail = &(NODE_CDR((*(iarg->ptail))));
7910 }
7911 }
7912
7913 return 0;
7914 }
7915
7916 static int
parse_exp(Node ** np,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env,int group_head)7917 parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,
7918 ScanEnv* env, int group_head)
7919 {
7920 int r, len, group;
7921 Node* qn;
7922 Node** tp;
7923 unsigned int parse_depth;
7924
7925 group = 0;
7926 *np = NULL;
7927 if (tok->type == (enum TokenSyms )term)
7928 goto end_of_token;
7929
7930 parse_depth = env->parse_depth;
7931
7932 switch (tok->type) {
7933 case TK_ALT:
7934 case TK_EOT:
7935 end_of_token:
7936 *np = node_new_empty();
7937 CHECK_NULL_RETURN_MEMERR(*np);
7938 return tok->type;
7939 break;
7940
7941 case TK_SUBEXP_OPEN:
7942 r = parse_bag(np, tok, TK_SUBEXP_CLOSE, src, end, env);
7943 if (r < 0) return r;
7944 if (r == 1) { /* group */
7945 if (group_head == 0)
7946 group = 1;
7947 else {
7948 Node* target = *np;
7949 *np = node_new_group(target);
7950 if (IS_NULL(*np)) {
7951 onig_node_free(target);
7952 return ONIGERR_MEMORY;
7953 }
7954 group = 2;
7955 }
7956 }
7957 else if (r == 2) { /* option only */
7958 Node* target;
7959 OnigOptionType prev = env->options;
7960
7961 env->options = BAG_(*np)->o.options;
7962 r = fetch_token(tok, src, end, env);
7963 if (r < 0) return r;
7964 r = parse_alts(&target, tok, term, src, end, env, FALSE);
7965 env->options = prev;
7966 if (r < 0) {
7967 onig_node_free(target);
7968 return r;
7969 }
7970 NODE_BODY(*np) = target;
7971 return tok->type;
7972 }
7973 break;
7974
7975 case TK_SUBEXP_CLOSE:
7976 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
7977 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
7978
7979 if (tok->escaped) goto tk_crude_byte;
7980 else goto tk_byte;
7981 break;
7982
7983 case TK_STRING:
7984 tk_byte:
7985 {
7986 *np = node_new_str(tok->backp, *src);
7987 CHECK_NULL_RETURN_MEMERR(*np);
7988
7989 while (1) {
7990 r = fetch_token(tok, src, end, env);
7991 if (r < 0) return r;
7992 if (r != TK_STRING) break;
7993
7994 r = onig_node_str_cat(*np, tok->backp, *src);
7995 if (r < 0) return r;
7996 }
7997
7998 string_end:
7999 tp = np;
8000 goto repeat;
8001 }
8002 break;
8003
8004 case TK_CRUDE_BYTE:
8005 tk_crude_byte:
8006 {
8007 *np = node_new_str_crude_char(tok->u.byte);
8008 CHECK_NULL_RETURN_MEMERR(*np);
8009 len = 1;
8010 while (1) {
8011 if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
8012 if (len == enclen(env->enc, STR_(*np)->s)) {
8013 r = fetch_token(tok, src, end, env);
8014 goto tk_crude_byte_end;
8015 }
8016 }
8017
8018 r = fetch_token(tok, src, end, env);
8019 if (r < 0) return r;
8020 if (r != TK_CRUDE_BYTE)
8021 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
8022
8023 r = node_str_cat_char(*np, tok->u.byte);
8024 if (r < 0) return r;
8025
8026 len++;
8027 }
8028
8029 tk_crude_byte_end:
8030 if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end))
8031 return ONIGERR_INVALID_WIDE_CHAR_VALUE;
8032
8033 NODE_STRING_CLEAR_CRUDE(*np);
8034 goto string_end;
8035 }
8036 break;
8037
8038 case TK_CODE_POINT:
8039 {
8040 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
8041 len = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
8042 if (len < 0) return len;
8043 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
8044 *np = node_new_str_crude(buf, buf + len);
8045 #else
8046 *np = node_new_str(buf, buf + len);
8047 #endif
8048 CHECK_NULL_RETURN_MEMERR(*np);
8049 }
8050 break;
8051
8052 case TK_QUOTE_OPEN:
8053 {
8054 OnigCodePoint end_op[2];
8055 UChar *qstart, *qend, *nextp;
8056
8057 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
8058 end_op[1] = (OnigCodePoint )'E';
8059 qstart = *src;
8060 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
8061 if (IS_NULL(qend)) {
8062 nextp = qend = end;
8063 }
8064 *np = node_new_str(qstart, qend);
8065 CHECK_NULL_RETURN_MEMERR(*np);
8066 *src = nextp;
8067 }
8068 break;
8069
8070 case TK_CHAR_TYPE:
8071 {
8072 switch (tok->u.prop.ctype) {
8073 case ONIGENC_CTYPE_WORD:
8074 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not, env->options);
8075 CHECK_NULL_RETURN_MEMERR(*np);
8076 break;
8077
8078 case ONIGENC_CTYPE_SPACE:
8079 case ONIGENC_CTYPE_DIGIT:
8080 case ONIGENC_CTYPE_XDIGIT:
8081 {
8082 CClassNode* cc;
8083
8084 *np = node_new_cclass();
8085 CHECK_NULL_RETURN_MEMERR(*np);
8086 cc = CCLASS_(*np);
8087 add_ctype_to_cc(cc, tok->u.prop.ctype, FALSE, env);
8088 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
8089 }
8090 break;
8091
8092 default:
8093 return ONIGERR_PARSER_BUG;
8094 break;
8095 }
8096 }
8097 break;
8098
8099 case TK_CHAR_PROPERTY:
8100 r = parse_char_property(np, tok, src, end, env);
8101 if (r != 0) return r;
8102 break;
8103
8104 case TK_OPEN_CC:
8105 {
8106 CClassNode* cc;
8107
8108 r = parse_cc(np, tok, src, end, env);
8109 if (r != 0) return r;
8110
8111 cc = CCLASS_(*np);
8112 if (IS_IGNORECASE(env->options)) {
8113 IApplyCaseFoldArg iarg;
8114
8115 iarg.env = env;
8116 iarg.cc = cc;
8117 iarg.alt_root = NULL_NODE;
8118 iarg.ptail = &(iarg.alt_root);
8119
8120 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
8121 i_apply_case_fold, &iarg);
8122 if (r != 0) {
8123 onig_node_free(iarg.alt_root);
8124 return r;
8125 }
8126 if (IS_NOT_NULL(iarg.alt_root)) {
8127 Node* work = onig_node_new_alt(*np, iarg.alt_root);
8128 if (IS_NULL(work)) {
8129 onig_node_free(iarg.alt_root);
8130 return ONIGERR_MEMORY;
8131 }
8132 *np = work;
8133 }
8134 }
8135 }
8136 break;
8137
8138 case TK_ANYCHAR:
8139 *np = node_new_anychar();
8140 CHECK_NULL_RETURN_MEMERR(*np);
8141 break;
8142
8143 case TK_ANYCHAR_ANYTIME:
8144 *np = node_new_anychar();
8145 CHECK_NULL_RETURN_MEMERR(*np);
8146 qn = node_new_quantifier(0, INFINITE_REPEAT, FALSE);
8147 CHECK_NULL_RETURN_MEMERR(qn);
8148 NODE_BODY(qn) = *np;
8149 *np = qn;
8150 break;
8151
8152 case TK_BACKREF:
8153 len = tok->u.backref.num;
8154 *np = node_new_backref(len,
8155 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
8156 tok->u.backref.by_name,
8157 #ifdef USE_BACKREF_WITH_LEVEL
8158 tok->u.backref.exist_level,
8159 tok->u.backref.level,
8160 #endif
8161 env);
8162 CHECK_NULL_RETURN_MEMERR(*np);
8163 break;
8164
8165 #ifdef USE_CALL
8166 case TK_CALL:
8167 {
8168 int gnum = tok->u.call.gnum;
8169
8170 *np = node_new_call(tok->u.call.name, tok->u.call.name_end,
8171 gnum, tok->u.call.by_number);
8172 CHECK_NULL_RETURN_MEMERR(*np);
8173 env->num_call++;
8174 if (tok->u.call.by_number != 0 && gnum == 0) {
8175 env->has_call_zero = 1;
8176 }
8177 }
8178 break;
8179 #endif
8180
8181 case TK_ANCHOR:
8182 {
8183 int ascii_mode =
8184 IS_WORD_ASCII(env->options) && IS_WORD_ANCHOR_TYPE(tok->u.anchor) ? 1 : 0;
8185 *np = onig_node_new_anchor(tok->u.anchor, ascii_mode);
8186 CHECK_NULL_RETURN_MEMERR(*np);
8187 }
8188 break;
8189
8190 case TK_REPEAT:
8191 case TK_INTERVAL:
8192 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
8193 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
8194 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
8195 else {
8196 *np = node_new_empty();
8197 CHECK_NULL_RETURN_MEMERR(*np);
8198 }
8199 }
8200 else {
8201 goto tk_byte;
8202 }
8203 break;
8204
8205 case TK_KEEP:
8206 r = node_new_keep(np, env);
8207 if (r < 0) return r;
8208 break;
8209
8210 case TK_GENERAL_NEWLINE:
8211 r = node_new_general_newline(np, env);
8212 if (r < 0) return r;
8213 break;
8214
8215 case TK_NO_NEWLINE:
8216 r = node_new_no_newline(np, env);
8217 if (r < 0) return r;
8218 break;
8219
8220 case TK_TRUE_ANYCHAR:
8221 r = node_new_true_anychar(np, env);
8222 if (r < 0) return r;
8223 break;
8224
8225 case TK_TEXT_SEGMENT:
8226 r = make_text_segment(np, env);
8227 if (r < 0) return r;
8228 break;
8229
8230 default:
8231 return ONIGERR_PARSER_BUG;
8232 break;
8233 }
8234
8235 {
8236 tp = np;
8237
8238 re_entry:
8239 r = fetch_token(tok, src, end, env);
8240 if (r < 0) return r;
8241
8242 repeat:
8243 if (r == TK_REPEAT || r == TK_INTERVAL) {
8244 Node* target;
8245
8246 if (is_invalid_quantifier_target(*tp))
8247 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
8248
8249 INC_PARSE_DEPTH(parse_depth);
8250
8251 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
8252 r == TK_INTERVAL);
8253 CHECK_NULL_RETURN_MEMERR(qn);
8254 QUANT_(qn)->greedy = tok->u.repeat.greedy;
8255 if (group == 2) {
8256 target = node_drop_group(*tp);
8257 *tp = NULL_NODE;
8258 }
8259 else {
8260 target = *tp;
8261 }
8262 r = assign_quantifier_body(qn, target, group, env);
8263 if (r < 0) {
8264 onig_node_free(qn);
8265 *tp = NULL_NODE;
8266 return r;
8267 }
8268
8269 if (tok->u.repeat.possessive != 0) {
8270 Node* en;
8271 en = node_new_bag(BAG_STOP_BACKTRACK);
8272 if (IS_NULL(en)) {
8273 onig_node_free(qn);
8274 return ONIGERR_MEMORY;
8275 }
8276 NODE_BODY(en) = qn;
8277 qn = en;
8278 }
8279
8280 if (r == 0) {
8281 *tp = qn;
8282 }
8283 else if (r == 1) { /* x{1,1} ==> x */
8284 onig_node_free(qn);
8285 *tp = target;
8286 }
8287 else if (r == 2) { /* split case: /abc+/ */
8288 Node *tmp;
8289
8290 *tp = node_new_list(*tp, NULL);
8291 if (IS_NULL(*tp)) {
8292 onig_node_free(qn);
8293 return ONIGERR_MEMORY;
8294 }
8295 tmp = NODE_CDR(*tp) = node_new_list(qn, NULL);
8296 if (IS_NULL(tmp)) {
8297 onig_node_free(qn);
8298 return ONIGERR_MEMORY;
8299 }
8300 tp = &(NODE_CAR(tmp));
8301 }
8302 group = 0;
8303 goto re_entry;
8304 }
8305 }
8306
8307 return r;
8308 }
8309
8310 static int
parse_branch(Node ** top,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env,int group_head)8311 parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end,
8312 ScanEnv* env, int group_head)
8313 {
8314 int r;
8315 Node *node, **headp;
8316
8317 *top = NULL;
8318 INC_PARSE_DEPTH(env->parse_depth);
8319
8320 r = parse_exp(&node, tok, term, src, end, env, group_head);
8321 if (r < 0) {
8322 onig_node_free(node);
8323 return r;
8324 }
8325
8326 if (r == TK_EOT || r == term || r == TK_ALT) {
8327 *top = node;
8328 }
8329 else {
8330 *top = node_new_list(node, NULL);
8331 if (IS_NULL(*top)) {
8332 onig_node_free(node);
8333 return ONIGERR_MEMORY;
8334 }
8335
8336 headp = &(NODE_CDR(*top));
8337 while (r != TK_EOT && r != term && r != TK_ALT) {
8338 r = parse_exp(&node, tok, term, src, end, env, FALSE);
8339 if (r < 0) {
8340 onig_node_free(node);
8341 return r;
8342 }
8343
8344 if (NODE_TYPE(node) == NODE_LIST) {
8345 *headp = node;
8346 while (IS_NOT_NULL(NODE_CDR(node))) node = NODE_CDR(node);
8347 headp = &(NODE_CDR(node));
8348 }
8349 else {
8350 *headp = node_new_list(node, NULL);
8351 headp = &(NODE_CDR(*headp));
8352 }
8353 }
8354 }
8355
8356 DEC_PARSE_DEPTH(env->parse_depth);
8357 return r;
8358 }
8359
8360 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
8361 static int
parse_alts(Node ** top,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env,int group_head)8362 parse_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end,
8363 ScanEnv* env, int group_head)
8364 {
8365 int r;
8366 Node *node, **headp;
8367
8368 *top = NULL;
8369 INC_PARSE_DEPTH(env->parse_depth);
8370
8371 r = parse_branch(&node, tok, term, src, end, env, group_head);
8372 if (r < 0) {
8373 onig_node_free(node);
8374 return r;
8375 }
8376
8377 if (r == term) {
8378 *top = node;
8379 }
8380 else if (r == TK_ALT) {
8381 *top = onig_node_new_alt(node, NULL);
8382 if (IS_NULL(*top)) {
8383 onig_node_free(node);
8384 return ONIGERR_MEMORY;
8385 }
8386
8387 headp = &(NODE_CDR(*top));
8388 while (r == TK_ALT) {
8389 r = fetch_token(tok, src, end, env);
8390 if (r < 0) return r;
8391 r = parse_branch(&node, tok, term, src, end, env, FALSE);
8392 if (r < 0) {
8393 onig_node_free(node);
8394 return r;
8395 }
8396 *headp = onig_node_new_alt(node, NULL);
8397 if (IS_NULL(*headp)) {
8398 onig_node_free(node);
8399 onig_node_free(*top);
8400 return ONIGERR_MEMORY;
8401 }
8402
8403 headp = &(NODE_CDR(*headp));
8404 }
8405
8406 if (tok->type != (enum TokenSyms )term)
8407 goto err;
8408 }
8409 else {
8410 onig_node_free(node);
8411 err:
8412 if (term == TK_SUBEXP_CLOSE)
8413 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
8414 else
8415 return ONIGERR_PARSER_BUG;
8416 }
8417
8418 DEC_PARSE_DEPTH(env->parse_depth);
8419 return r;
8420 }
8421
8422 static int
parse_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)8423 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
8424 {
8425 int r;
8426 PToken tok;
8427
8428 r = fetch_token(&tok, src, end, env);
8429 if (r < 0) return r;
8430 r = parse_alts(top, &tok, TK_EOT, src, end, env, FALSE);
8431 if (r < 0) return r;
8432
8433 return 0;
8434 }
8435
#ifdef USE_CALL
/*
 * Wrap `node` in an unnamed memory (capture) node with group number 0
 * and register the wrapper as memory node 0 in `env`.
 *
 * On success returns 0 and stores the wrapper in *rnode.  On failure
 * returns a non-zero error code; `node` is NOT freed and stays owned by
 * the caller, and *rnode is left untouched.
 */
static int
make_call_zero_body(Node* node, ScanEnv* env, Node** rnode)
{
  int r;

  Node* x = node_new_memory(0 /* 0: is not named */);
  CHECK_NULL_RETURN_MEMERR(x);

  NODE_BODY(x) = node;
  BAG_(x)->m.regnum = 0;
  r = scan_env_set_mem_node(env, 0, x);
  if (r != 0) {
    /* detach the body first: the caller still holds `node`, so freeing
       it together with the wrapper would leave a dangling pointer */
    NODE_BODY(x) = NULL_NODE;
    onig_node_free(x);
    return r;
  }

  *rnode = x;
  return 0;
}
#endif
8457
8458 extern int
onig_parse_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)8459 onig_parse_tree(Node** root, const UChar* pattern, const UChar* end,
8460 regex_t* reg, ScanEnv* env)
8461 {
8462 int r;
8463 UChar* p;
8464 #ifdef USE_CALLOUT
8465 RegexExt* ext;
8466 #endif
8467
8468 names_clear(reg);
8469
8470 scan_env_clear(env);
8471 env->options = reg->options;
8472 env->case_fold_flag = reg->case_fold_flag;
8473 env->enc = reg->enc;
8474 env->syntax = reg->syntax;
8475 env->pattern = (UChar* )pattern;
8476 env->pattern_end = (UChar* )end;
8477 env->reg = reg;
8478
8479 *root = NULL;
8480
8481 if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, pattern, end))
8482 return ONIGERR_INVALID_WIDE_CHAR_VALUE;
8483
8484 p = (UChar* )pattern;
8485 r = parse_regexp(root, &p, (UChar* )end, env);
8486
8487 #ifdef USE_CALL
8488 if (r != 0) return r;
8489
8490 if (env->has_call_zero != 0) {
8491 Node* zero_node;
8492 r = make_call_zero_body(*root, env, &zero_node);
8493 if (r != 0) return r;
8494
8495 *root = zero_node;
8496 }
8497 #endif
8498
8499 reg->num_mem = env->num_mem;
8500
8501 #ifdef USE_CALLOUT
8502 ext = reg->extp;
8503 if (IS_NOT_NULL(ext) && ext->callout_num > 0) {
8504 r = setup_ext_callout_list_values(reg);
8505 }
8506 #endif
8507
8508 return r;
8509 }
8510
8511 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode ARG_UNUSED,UChar * arg,UChar * arg_end)8512 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
8513 UChar* arg, UChar* arg_end)
8514 {
8515 env->error = arg;
8516 env->error_end = arg_end;
8517 }
8518