1 /**********************************************************************
2 regcomp.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regparse.h"
31
32 OnigAmbigType OnigDefaultAmbigFlag =
33 (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE |
34 ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE);
35
36 extern OnigAmbigType
onig_get_default_ambig_flag(void)37 onig_get_default_ambig_flag(void)
38 {
39 return OnigDefaultAmbigFlag;
40 }
41
42 extern int
onig_set_default_ambig_flag(OnigAmbigType ambig_flag)43 onig_set_default_ambig_flag(OnigAmbigType ambig_flag)
44 {
45 OnigDefaultAmbigFlag = ambig_flag;
46 return 0;
47 }
48
49
50 static UChar*
k_strdup(UChar * s,UChar * end)51 k_strdup(UChar* s, UChar* end)
52 {
53 int len = end - s;
54
55 if (len > 0) {
56 UChar* r = (UChar* )xmalloc(len + 1);
57 CHECK_NULL_RETURN(r);
58 xmemcpy(r, s, len);
59 r[len] = (UChar )0;
60 return r;
61 }
62 else return NULL;
63 }
64
65 /*
66 Caution: node should not be a string node.
67 (s and end member address break)
68 */
69 static void
swap_node(Node * a,Node * b)70 swap_node(Node* a, Node* b)
71 {
72 Node c;
73 c = *a; *a = *b; *b = c;
74 }
75
76 static OnigDistance
distance_add(OnigDistance d1,OnigDistance d2)77 distance_add(OnigDistance d1, OnigDistance d2)
78 {
79 if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
80 return ONIG_INFINITE_DISTANCE;
81 else {
82 if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
83 else return ONIG_INFINITE_DISTANCE;
84 }
85 }
86
87 static OnigDistance
distance_multiply(OnigDistance d,int m)88 distance_multiply(OnigDistance d, int m)
89 {
90 if (m == 0) return 0;
91
92 if (d < ONIG_INFINITE_DISTANCE / m)
93 return d * m;
94 else
95 return ONIG_INFINITE_DISTANCE;
96 }
97
98 static int
bitset_is_empty(BitSetRef bs)99 bitset_is_empty(BitSetRef bs)
100 {
101 int i;
102 for (i = 0; i < BITSET_SIZE; i++) {
103 if (bs[i] != 0) return 0;
104 }
105 return 1;
106 }
107
108 #ifdef ONIG_DEBUG
109 static int
bitset_on_num(BitSetRef bs)110 bitset_on_num(BitSetRef bs)
111 {
112 int i, n;
113
114 n = 0;
115 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
116 if (BITSET_AT(bs, i)) n++;
117 }
118 return n;
119 }
120 #endif
121
122 extern int
onig_bbuf_init(BBuf * buf,int size)123 onig_bbuf_init(BBuf* buf, int size)
124 {
125 buf->p = (UChar* )xmalloc(size);
126 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
127
128 buf->alloc = size;
129 buf->used = 0;
130 return 0;
131 }
132
133
134 #ifdef USE_SUBEXP_CALL
135
136 static int
unset_addr_list_init(UnsetAddrList * uslist,int size)137 unset_addr_list_init(UnsetAddrList* uslist, int size)
138 {
139 UnsetAddr* p;
140
141 p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
142 CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
143 uslist->num = 0;
144 uslist->alloc = size;
145 uslist->us = p;
146 return 0;
147 }
148
149 static void
unset_addr_list_end(UnsetAddrList * uslist)150 unset_addr_list_end(UnsetAddrList* uslist)
151 {
152 if (IS_NOT_NULL(uslist->us))
153 xfree(uslist->us);
154 }
155
156 static int
unset_addr_list_add(UnsetAddrList * uslist,int offset,struct _Node * node)157 unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
158 {
159 UnsetAddr* p;
160 int size;
161
162 if (uslist->num >= uslist->alloc) {
163 size = uslist->alloc * 2;
164 p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
165 CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
166 uslist->alloc = size;
167 uslist->us = p;
168 }
169
170 uslist->us[uslist->num].offset = offset;
171 uslist->us[uslist->num].target = node;
172 uslist->num++;
173 return 0;
174 }
175 #endif /* USE_SUBEXP_CALL */
176
177
178 static int
add_opcode(regex_t * reg,int opcode)179 add_opcode(regex_t* reg, int opcode)
180 {
181 BBUF_ADD1(reg, opcode);
182 return 0;
183 }
184
185 #ifdef USE_COMBINATION_EXPLOSION_CHECK
186 static int
add_state_check_num(regex_t * reg,int num)187 add_state_check_num(regex_t* reg, int num)
188 {
189 StateCheckNumType n = (StateCheckNumType )num;
190
191 BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
192 return 0;
193 }
194 #endif
195
196 static int
add_rel_addr(regex_t * reg,int addr)197 add_rel_addr(regex_t* reg, int addr)
198 {
199 RelAddrType ra = (RelAddrType )addr;
200
201 BBUF_ADD(reg, &ra, SIZE_RELADDR);
202 return 0;
203 }
204
205 static int
add_abs_addr(regex_t * reg,int addr)206 add_abs_addr(regex_t* reg, int addr)
207 {
208 AbsAddrType ra = (AbsAddrType )addr;
209
210 BBUF_ADD(reg, &ra, SIZE_ABSADDR);
211 return 0;
212 }
213
214 static int
add_length(regex_t * reg,int len)215 add_length(regex_t* reg, int len)
216 {
217 LengthType l = (LengthType )len;
218
219 BBUF_ADD(reg, &l, SIZE_LENGTH);
220 return 0;
221 }
222
223 static int
add_mem_num(regex_t * reg,int num)224 add_mem_num(regex_t* reg, int num)
225 {
226 MemNumType n = (MemNumType )num;
227
228 BBUF_ADD(reg, &n, SIZE_MEMNUM);
229 return 0;
230 }
231
232 static int
add_pointer(regex_t * reg,void * addr)233 add_pointer(regex_t* reg, void* addr)
234 {
235 PointerType ptr = (PointerType )addr;
236
237 BBUF_ADD(reg, &ptr, SIZE_POINTER);
238 return 0;
239 }
240
241 static int
add_option(regex_t * reg,OnigOptionType option)242 add_option(regex_t* reg, OnigOptionType option)
243 {
244 BBUF_ADD(reg, &option, SIZE_OPTION);
245 return 0;
246 }
247
248 static int
add_opcode_rel_addr(regex_t * reg,int opcode,int addr)249 add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
250 {
251 int r;
252
253 r = add_opcode(reg, opcode);
254 if (r) return r;
255 r = add_rel_addr(reg, addr);
256 return r;
257 }
258
259 static int
add_bytes(regex_t * reg,UChar * bytes,int len)260 add_bytes(regex_t* reg, UChar* bytes, int len)
261 {
262 BBUF_ADD(reg, bytes, len);
263 return 0;
264 }
265
266 static int
add_bitset(regex_t * reg,BitSetRef bs)267 add_bitset(regex_t* reg, BitSetRef bs)
268 {
269 BBUF_ADD(reg, bs, SIZE_BITSET);
270 return 0;
271 }
272
273 static int
add_opcode_option(regex_t * reg,int opcode,OnigOptionType option)274 add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
275 {
276 int r;
277
278 r = add_opcode(reg, opcode);
279 if (r) return r;
280 r = add_option(reg, option);
281 return r;
282 }
283
284 static int compile_length_tree(Node* node, regex_t* reg);
285 static int compile_tree(Node* node, regex_t* reg);
286
287
288 #define IS_NEED_STR_LEN_OP_EXACT(op) \
289 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
290 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
291
292 static int
select_str_opcode(int mb_len,int str_len,int ignore_case)293 select_str_opcode(int mb_len, int str_len, int ignore_case)
294 {
295 int op;
296
297 if (ignore_case) {
298 switch (str_len) {
299 case 1: op = OP_EXACT1_IC; break;
300 default: op = OP_EXACTN_IC; break;
301 }
302 }
303 else {
304 switch (mb_len) {
305 case 1:
306 switch (str_len) {
307 case 1: op = OP_EXACT1; break;
308 case 2: op = OP_EXACT2; break;
309 case 3: op = OP_EXACT3; break;
310 case 4: op = OP_EXACT4; break;
311 case 5: op = OP_EXACT5; break;
312 default: op = OP_EXACTN; break;
313 }
314 break;
315
316 case 2:
317 switch (str_len) {
318 case 1: op = OP_EXACTMB2N1; break;
319 case 2: op = OP_EXACTMB2N2; break;
320 case 3: op = OP_EXACTMB2N3; break;
321 default: op = OP_EXACTMB2N; break;
322 }
323 break;
324
325 case 3:
326 op = OP_EXACTMB3N;
327 break;
328
329 default:
330 op = OP_EXACTMBN;
331 break;
332 }
333 }
334 return op;
335 }
336
337 static int
compile_tree_empty_check(Node * node,regex_t * reg,int empty_info)338 compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
339 {
340 int r;
341 int saved_num_null_check = reg->num_null_check;
342
343 if (empty_info != 0) {
344 r = add_opcode(reg, OP_NULL_CHECK_START);
345 if (r) return r;
346 r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
347 if (r) return r;
348 reg->num_null_check++;
349 }
350
351 r = compile_tree(node, reg);
352 if (r) return r;
353
354 if (empty_info != 0) {
355 if (empty_info == NQ_TARGET_IS_EMPTY)
356 r = add_opcode(reg, OP_NULL_CHECK_END);
357 else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
358 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
359 else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
360 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
361
362 if (r) return r;
363 r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
364 }
365 return r;
366 }
367
368 #ifdef USE_SUBEXP_CALL
369 static int
compile_call(CallNode * node,regex_t * reg)370 compile_call(CallNode* node, regex_t* reg)
371 {
372 int r;
373
374 r = add_opcode(reg, OP_CALL);
375 if (r) return r;
376 r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
377 node->target);
378 if (r) return r;
379 r = add_abs_addr(reg, 0 /*dummy addr.*/);
380 return r;
381 }
382 #endif
383
384 static int
compile_tree_n_times(Node * node,int n,regex_t * reg)385 compile_tree_n_times(Node* node, int n, regex_t* reg)
386 {
387 int i, r;
388
389 for (i = 0; i < n; i++) {
390 r = compile_tree(node, reg);
391 if (r) return r;
392 }
393 return 0;
394 }
395
396 static int
add_compile_string_length(UChar * s,int mb_len,int str_len,regex_t * reg,int ignore_case)397 add_compile_string_length(UChar* s, int mb_len, int str_len,
398 regex_t* reg, int ignore_case)
399 {
400 int len;
401 int op = select_str_opcode(mb_len, str_len, ignore_case);
402
403 len = SIZE_OPCODE;
404
405 if (op == OP_EXACTMBN) len += SIZE_LENGTH;
406 if (IS_NEED_STR_LEN_OP_EXACT(op))
407 len += SIZE_LENGTH;
408
409 len += mb_len * str_len;
410 return len;
411 }
412
413 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg,int ignore_case)414 add_compile_string(UChar* s, int mb_len, int str_len,
415 regex_t* reg, int ignore_case)
416 {
417 int op = select_str_opcode(mb_len, str_len, ignore_case);
418 add_opcode(reg, op);
419
420 if (op == OP_EXACTMBN)
421 add_length(reg, mb_len);
422
423 if (IS_NEED_STR_LEN_OP_EXACT(op)) {
424 if (op == OP_EXACTN_IC)
425 add_length(reg, mb_len * str_len);
426 else
427 add_length(reg, str_len);
428 }
429
430 add_bytes(reg, s, mb_len * str_len);
431 return 0;
432 }
433
434
435 static int
compile_length_string_node(Node * node,regex_t * reg)436 compile_length_string_node(Node* node, regex_t* reg)
437 {
438 int rlen, r, len, prev_len, slen, ambig;
439 OnigEncoding enc = reg->enc;
440 UChar *p, *prev;
441 StrNode* sn;
442
443 sn = &(NSTRING(node));
444 if (sn->end <= sn->s)
445 return 0;
446
447 ambig = NSTRING_IS_AMBIG(node);
448
449 p = prev = sn->s;
450 prev_len = enc_len(enc, p);
451 p += prev_len;
452 slen = 1;
453 rlen = 0;
454
455 for (; p < sn->end; ) {
456 len = enc_len(enc, p);
457 if (len == prev_len) {
458 slen++;
459 }
460 else {
461 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
462 rlen += r;
463 prev = p;
464 slen = 1;
465 prev_len = len;
466 }
467 p += len;
468 }
469 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
470 rlen += r;
471 return rlen;
472 }
473
474 static int
compile_length_string_raw_node(StrNode * sn,regex_t * reg)475 compile_length_string_raw_node(StrNode* sn, regex_t* reg)
476 {
477 if (sn->end <= sn->s)
478 return 0;
479
480 return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
481 }
482
483 static int
compile_string_node(Node * node,regex_t * reg)484 compile_string_node(Node* node, regex_t* reg)
485 {
486 int r, len, prev_len, slen, ambig;
487 OnigEncoding enc = reg->enc;
488 UChar *p, *prev, *end;
489 StrNode* sn;
490
491 sn = &(NSTRING(node));
492 if (sn->end <= sn->s)
493 return 0;
494
495 end = sn->end;
496 ambig = NSTRING_IS_AMBIG(node);
497
498 p = prev = sn->s;
499 prev_len = enc_len(enc, p);
500 p += prev_len;
501 slen = 1;
502
503 for (; p < end; ) {
504 len = enc_len(enc, p);
505 if (len == prev_len) {
506 slen++;
507 }
508 else {
509 r = add_compile_string(prev, prev_len, slen, reg, ambig);
510 if (r) return r;
511
512 prev = p;
513 slen = 1;
514 prev_len = len;
515 }
516
517 p += len;
518 }
519 return add_compile_string(prev, prev_len, slen, reg, ambig);
520 }
521
522 static int
compile_string_raw_node(StrNode * sn,regex_t * reg)523 compile_string_raw_node(StrNode* sn, regex_t* reg)
524 {
525 if (sn->end <= sn->s)
526 return 0;
527
528 return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
529 }
530
531 static int
add_multi_byte_cclass(BBuf * mbuf,regex_t * reg)532 add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
533 {
534 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
535 add_length(reg, mbuf->used);
536 return add_bytes(reg, mbuf->p, mbuf->used);
537 #else
538 static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
539
540 int r, pad_size;
541 UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
542
543 GET_ALIGNMENT_PAD_SIZE(p, pad_size);
544 add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
545 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
546
547 r = add_bytes(reg, mbuf->p, mbuf->used);
548
549 /* padding for return value from compile_length_cclass_node() to be fix. */
550 pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
551 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
552 return r;
553 #endif
554 }
555
556 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)557 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
558 {
559 int len;
560
561 if (IS_CCLASS_SHARE(cc)) {
562 len = SIZE_OPCODE + SIZE_POINTER;
563 return len;
564 }
565
566 if (IS_NULL(cc->mbuf)) {
567 len = SIZE_OPCODE + SIZE_BITSET;
568 }
569 else {
570 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
571 len = SIZE_OPCODE;
572 }
573 else {
574 len = SIZE_OPCODE + SIZE_BITSET;
575 }
576 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
577 len += SIZE_LENGTH + cc->mbuf->used;
578 #else
579 len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
580 #endif
581 }
582
583 return len;
584 }
585
586 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)587 compile_cclass_node(CClassNode* cc, regex_t* reg)
588 {
589 int r;
590
591 if (IS_CCLASS_SHARE(cc)) {
592 add_opcode(reg, OP_CCLASS_NODE);
593 r = add_pointer(reg, cc);
594 return r;
595 }
596
597 if (IS_NULL(cc->mbuf)) {
598 if (IS_CCLASS_NOT(cc))
599 add_opcode(reg, OP_CCLASS_NOT);
600 else
601 add_opcode(reg, OP_CCLASS);
602
603 r = add_bitset(reg, cc->bs);
604 }
605 else {
606 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
607 if (IS_CCLASS_NOT(cc))
608 add_opcode(reg, OP_CCLASS_MB_NOT);
609 else
610 add_opcode(reg, OP_CCLASS_MB);
611
612 r = add_multi_byte_cclass(cc->mbuf, reg);
613 }
614 else {
615 if (IS_CCLASS_NOT(cc))
616 add_opcode(reg, OP_CCLASS_MIX_NOT);
617 else
618 add_opcode(reg, OP_CCLASS_MIX);
619
620 r = add_bitset(reg, cc->bs);
621 if (r) return r;
622 r = add_multi_byte_cclass(cc->mbuf, reg);
623 }
624 }
625
626 return r;
627 }
628
629 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper)630 entry_repeat_range(regex_t* reg, int id, int lower, int upper)
631 {
632 #define REPEAT_RANGE_ALLOC 4
633
634 OnigRepeatRange* p;
635
636 if (reg->repeat_range_alloc == 0) {
637 p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
638 CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
639 reg->repeat_range = p;
640 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
641 }
642 else if (reg->repeat_range_alloc <= id) {
643 int n;
644 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
645 p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
646 sizeof(OnigRepeatRange) * n);
647 CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
648 reg->repeat_range = p;
649 reg->repeat_range_alloc = n;
650 }
651 else {
652 p = reg->repeat_range;
653 }
654
655 p[id].lower = lower;
656 p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
657 return 0;
658 }
659
660 static int
compile_range_repeat_node(QuantifierNode * qn,int target_len,int empty_info,regex_t * reg)661 compile_range_repeat_node(QuantifierNode* qn, int target_len, int empty_info,
662 regex_t* reg)
663 {
664 int r;
665 int num_repeat = reg->num_repeat;
666
667 r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
668 if (r) return r;
669 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
670 reg->num_repeat++;
671 if (r) return r;
672 r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
673 if (r) return r;
674
675 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
676 if (r) return r;
677
678 r = compile_tree_empty_check(qn->target, reg, empty_info);
679 if (r) return r;
680
681 if (
682 #ifdef USE_SUBEXP_CALL
683 reg->num_call > 0 ||
684 #endif
685 IS_QUANTIFIER_IN_REPEAT(qn)) {
686 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
687 }
688 else {
689 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
690 }
691 if (r) return r;
692 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
693 return r;
694 }
695
696 static int
is_anychar_star_quantifier(QuantifierNode * qn)697 is_anychar_star_quantifier(QuantifierNode* qn)
698 {
699 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
700 NTYPE(qn->target) == N_ANYCHAR)
701 return 1;
702 else
703 return 0;
704 }
705
706 #define QUANTIFIER_EXPAND_LIMIT_SIZE 50
707 #define CKN_ON (ckn > 0)
708
709 #ifdef USE_COMBINATION_EXPLOSION_CHECK
710
711 static int
compile_length_quantifier_node(QuantifierNode * qn,regex_t * reg)712 compile_length_quantifier_node(QuantifierNode* qn, regex_t* reg)
713 {
714 int len, mod_tlen, cklen;
715 int ckn;
716 int infinite = IS_REPEAT_INFINITE(qn->upper);
717 int empty_info = qn->target_empty_info;
718 int tlen = compile_length_tree(qn->target, reg);
719
720 if (tlen < 0) return tlen;
721
722 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
723
724 cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
725
726 /* anychar repeat */
727 if (NTYPE(qn->target) == N_ANYCHAR) {
728 if (qn->greedy && infinite) {
729 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
730 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
731 else
732 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
733 }
734 }
735
736 if (empty_info != 0)
737 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
738 else
739 mod_tlen = tlen;
740
741 if (infinite && qn->lower <= 1) {
742 if (qn->greedy) {
743 if (qn->lower == 1)
744 len = SIZE_OP_JUMP;
745 else
746 len = 0;
747
748 len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
749 }
750 else {
751 if (qn->lower == 0)
752 len = SIZE_OP_JUMP;
753 else
754 len = 0;
755
756 len += mod_tlen + SIZE_OP_PUSH + cklen;
757 }
758 }
759 else if (qn->upper == 0) {
760 if (qn->is_refered != 0) /* /(?<n>..){0}/ */
761 len = SIZE_OP_JUMP + tlen;
762 else
763 len = 0;
764 }
765 else if (qn->upper == 1 && qn->greedy) {
766 if (qn->lower == 0) {
767 if (CKN_ON) {
768 len = SIZE_OP_STATE_CHECK_PUSH + tlen;
769 }
770 else {
771 len = SIZE_OP_PUSH + tlen;
772 }
773 }
774 else {
775 len = tlen;
776 }
777 }
778 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
779 len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
780 }
781 else {
782 len = SIZE_OP_REPEAT_INC
783 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
784 if (CKN_ON)
785 len += SIZE_OP_STATE_CHECK;
786 }
787
788 return len;
789 }
790
791 static int
compile_quantifier_node(QuantifierNode * qn,regex_t * reg)792 compile_quantifier_node(QuantifierNode* qn, regex_t* reg)
793 {
794 int r, mod_tlen;
795 int ckn;
796 int infinite = IS_REPEAT_INFINITE(qn->upper);
797 int empty_info = qn->target_empty_info;
798 int tlen = compile_length_tree(qn->target, reg);
799
800 if (tlen < 0) return tlen;
801
802 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
803
804 if (is_anychar_star_quantifier(qn)) {
805 r = compile_tree_n_times(qn->target, qn->lower, reg);
806 if (r) return r;
807 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
808 if (IS_MULTILINE(reg->options))
809 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
810 else
811 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
812 if (r) return r;
813 if (CKN_ON) {
814 r = add_state_check_num(reg, ckn);
815 if (r) return r;
816 }
817
818 return add_bytes(reg, NSTRING(qn->next_head_exact).s, 1);
819 }
820 else {
821 if (IS_MULTILINE(reg->options)) {
822 r = add_opcode(reg, (CKN_ON ?
823 OP_STATE_CHECK_ANYCHAR_ML_STAR
824 : OP_ANYCHAR_ML_STAR));
825 }
826 else {
827 r = add_opcode(reg, (CKN_ON ?
828 OP_STATE_CHECK_ANYCHAR_STAR
829 : OP_ANYCHAR_STAR));
830 }
831 if (r) return r;
832 if (CKN_ON)
833 r = add_state_check_num(reg, ckn);
834
835 return r;
836 }
837 }
838
839 if (empty_info != 0)
840 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
841 else
842 mod_tlen = tlen;
843
844 if (infinite && qn->lower <= 1) {
845 if (qn->greedy) {
846 if (qn->lower == 1) {
847 r = add_opcode_rel_addr(reg, OP_JUMP,
848 (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
849 if (r) return r;
850 }
851
852 if (CKN_ON) {
853 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
854 if (r) return r;
855 r = add_state_check_num(reg, ckn);
856 if (r) return r;
857 r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
858 }
859 else {
860 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
861 }
862 if (r) return r;
863 r = compile_tree_empty_check(qn->target, reg, empty_info);
864 if (r) return r;
865 r = add_opcode_rel_addr(reg, OP_JUMP,
866 -(mod_tlen + (int )SIZE_OP_JUMP
867 + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
868 }
869 else {
870 if (qn->lower == 0) {
871 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
872 if (r) return r;
873 }
874 r = compile_tree_empty_check(qn->target, reg, empty_info);
875 if (r) return r;
876 if (CKN_ON) {
877 r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
878 if (r) return r;
879 r = add_state_check_num(reg, ckn);
880 if (r) return r;
881 r = add_rel_addr(reg,
882 -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
883 }
884 else
885 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
886 }
887 }
888 else if (qn->upper == 0) {
889 if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
890 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
891 if (r) return r;
892 r = compile_tree(qn->target, reg);
893 }
894 else
895 r = 0;
896 }
897 else if (qn->upper == 1 && qn->greedy) {
898 if (qn->lower == 0) {
899 if (CKN_ON) {
900 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
901 if (r) return r;
902 r = add_state_check_num(reg, ckn);
903 if (r) return r;
904 r = add_rel_addr(reg, tlen);
905 }
906 else {
907 r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
908 }
909 if (r) return r;
910 }
911
912 r = compile_tree(qn->target, reg);
913 }
914 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
915 if (CKN_ON) {
916 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
917 if (r) return r;
918 r = add_state_check_num(reg, ckn);
919 if (r) return r;
920 r = add_rel_addr(reg, SIZE_OP_JUMP);
921 }
922 else {
923 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
924 }
925
926 if (r) return r;
927 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
928 if (r) return r;
929 r = compile_tree(qn->target, reg);
930 }
931 else {
932 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
933 if (CKN_ON) {
934 if (r) return r;
935 r = add_opcode(reg, OP_STATE_CHECK);
936 if (r) return r;
937 r = add_state_check_num(reg, ckn);
938 }
939 }
940 return r;
941 }
942
943 #else /* USE_COMBINATION_EXPLOSION_CHECK */
944
945 static int
compile_length_quantifier_node(QuantifierNode * qn,regex_t * reg)946 compile_length_quantifier_node(QuantifierNode* qn, regex_t* reg)
947 {
948 int len, mod_tlen;
949 int infinite = IS_REPEAT_INFINITE(qn->upper);
950 int empty_info = qn->target_empty_info;
951 int tlen = compile_length_tree(qn->target, reg);
952
953 if (tlen < 0) return tlen;
954
955 /* anychar repeat */
956 if (NTYPE(qn->target) == N_ANYCHAR) {
957 if (qn->greedy && infinite) {
958 if (IS_NOT_NULL(qn->next_head_exact))
959 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
960 else
961 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
962 }
963 }
964
965 if (empty_info != 0)
966 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
967 else
968 mod_tlen = tlen;
969
970 if (infinite &&
971 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
972 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
973 len = SIZE_OP_JUMP;
974 }
975 else {
976 len = tlen * qn->lower;
977 }
978
979 if (qn->greedy) {
980 if (IS_NOT_NULL(qn->head_exact))
981 len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
982 else if (IS_NOT_NULL(qn->next_head_exact))
983 len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
984 else
985 len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
986 }
987 else
988 len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
989 }
990 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
991 len = SIZE_OP_JUMP + tlen;
992 }
993 else if (!infinite && qn->greedy &&
994 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
995 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
996 len = tlen * qn->lower;
997 len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
998 }
999 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1000 len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1001 }
1002 else {
1003 len = SIZE_OP_REPEAT_INC
1004 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1005 }
1006
1007 return len;
1008 }
1009
1010 static int
compile_quantifier_node(QuantifierNode * qn,regex_t * reg)1011 compile_quantifier_node(QuantifierNode* qn, regex_t* reg)
1012 {
1013 int i, r, mod_tlen;
1014 int infinite = IS_REPEAT_INFINITE(qn->upper);
1015 int empty_info = qn->target_empty_info;
1016 int tlen = compile_length_tree(qn->target, reg);
1017
1018 if (tlen < 0) return tlen;
1019
1020 if (is_anychar_star_quantifier(qn)) {
1021 r = compile_tree_n_times(qn->target, qn->lower, reg);
1022 if (r) return r;
1023 if (IS_NOT_NULL(qn->next_head_exact)) {
1024 if (IS_MULTILINE(reg->options))
1025 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1026 else
1027 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1028 if (r) return r;
1029 return add_bytes(reg, NSTRING(qn->next_head_exact).s, 1);
1030 }
1031 else {
1032 if (IS_MULTILINE(reg->options))
1033 return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1034 else
1035 return add_opcode(reg, OP_ANYCHAR_STAR);
1036 }
1037 }
1038
1039 if (empty_info != 0)
1040 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1041 else
1042 mod_tlen = tlen;
1043
1044 if (infinite &&
1045 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1046 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1047 if (qn->greedy) {
1048 if (IS_NOT_NULL(qn->head_exact))
1049 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1050 else if (IS_NOT_NULL(qn->next_head_exact))
1051 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1052 else
1053 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1054 }
1055 else {
1056 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1057 }
1058 if (r) return r;
1059 }
1060 else {
1061 r = compile_tree_n_times(qn->target, qn->lower, reg);
1062 if (r) return r;
1063 }
1064
1065 if (qn->greedy) {
1066 if (IS_NOT_NULL(qn->head_exact)) {
1067 r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1068 mod_tlen + SIZE_OP_JUMP);
1069 if (r) return r;
1070 add_bytes(reg, NSTRING(qn->head_exact).s, 1);
1071 r = compile_tree_empty_check(qn->target, reg, empty_info);
1072 if (r) return r;
1073 r = add_opcode_rel_addr(reg, OP_JUMP,
1074 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1075 }
1076 else if (IS_NOT_NULL(qn->next_head_exact)) {
1077 r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1078 mod_tlen + SIZE_OP_JUMP);
1079 if (r) return r;
1080 add_bytes(reg, NSTRING(qn->next_head_exact).s, 1);
1081 r = compile_tree_empty_check(qn->target, reg, empty_info);
1082 if (r) return r;
1083 r = add_opcode_rel_addr(reg, OP_JUMP,
1084 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1085 }
1086 else {
1087 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1088 if (r) return r;
1089 r = compile_tree_empty_check(qn->target, reg, empty_info);
1090 if (r) return r;
1091 r = add_opcode_rel_addr(reg, OP_JUMP,
1092 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1093 }
1094 }
1095 else {
1096 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1097 if (r) return r;
1098 r = compile_tree_empty_check(qn->target, reg, empty_info);
1099 if (r) return r;
1100 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1101 }
1102 }
1103 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1104 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1105 if (r) return r;
1106 r = compile_tree(qn->target, reg);
1107 }
1108 else if (!infinite && qn->greedy &&
1109 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1110 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1111 int n = qn->upper - qn->lower;
1112
1113 r = compile_tree_n_times(qn->target, qn->lower, reg);
1114 if (r) return r;
1115
1116 for (i = 0; i < n; i++) {
1117 r = add_opcode_rel_addr(reg, OP_PUSH,
1118 (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1119 if (r) return r;
1120 r = compile_tree(qn->target, reg);
1121 if (r) return r;
1122 }
1123 }
1124 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1125 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1126 if (r) return r;
1127 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1128 if (r) return r;
1129 r = compile_tree(qn->target, reg);
1130 }
1131 else {
1132 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1133 }
1134 return r;
1135 }
1136 #endif /* USE_COMBINATION_EXPLOSION_CHECK */
1137
1138 static int
compile_length_option_node(EffectNode * node,regex_t * reg)1139 compile_length_option_node(EffectNode* node, regex_t* reg)
1140 {
1141 int tlen;
1142 OnigOptionType prev = reg->options;
1143
1144 reg->options = node->option;
1145 tlen = compile_length_tree(node->target, reg);
1146 reg->options = prev;
1147
1148 if (tlen < 0) return tlen;
1149
1150 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1151 return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1152 + tlen + SIZE_OP_SET_OPTION;
1153 }
1154 else
1155 return tlen;
1156 }
1157
1158 static int
compile_option_node(EffectNode * node,regex_t * reg)1159 compile_option_node(EffectNode* node, regex_t* reg)
1160 {
1161 int r;
1162 OnigOptionType prev = reg->options;
1163
1164 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1165 r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1166 if (r) return r;
1167 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1168 if (r) return r;
1169 r = add_opcode(reg, OP_FAIL);
1170 if (r) return r;
1171 }
1172
1173 reg->options = node->option;
1174 r = compile_tree(node->target, reg);
1175 reg->options = prev;
1176
1177 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1178 if (r) return r;
1179 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1180 }
1181 return r;
1182 }
1183
1184 static int
compile_length_effect_node(EffectNode * node,regex_t * reg)1185 compile_length_effect_node(EffectNode* node, regex_t* reg)
1186 {
1187 int len;
1188 int tlen;
1189
1190 if (node->type == EFFECT_OPTION)
1191 return compile_length_option_node(node, reg);
1192
1193 if (node->target) {
1194 tlen = compile_length_tree(node->target, reg);
1195 if (tlen < 0) return tlen;
1196 }
1197 else
1198 tlen = 0;
1199
1200 switch (node->type) {
1201 case EFFECT_MEMORY:
1202 #ifdef USE_SUBEXP_CALL
1203 if (IS_EFFECT_CALLED(node)) {
1204 len = SIZE_OP_MEMORY_START_PUSH + tlen
1205 + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1206 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1207 len += (IS_EFFECT_RECURSION(node)
1208 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1209 else
1210 len += (IS_EFFECT_RECURSION(node)
1211 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1212 }
1213 else
1214 #endif
1215 {
1216 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1217 len = SIZE_OP_MEMORY_START_PUSH;
1218 else
1219 len = SIZE_OP_MEMORY_START;
1220
1221 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1222 ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1223 }
1224 break;
1225
1226 case EFFECT_STOP_BACKTRACK:
1227 if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) {
1228 QuantifierNode* qn = &NQUANTIFIER(node->target);
1229 tlen = compile_length_tree(qn->target, reg);
1230 if (tlen < 0) return tlen;
1231
1232 len = tlen * qn->lower
1233 + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1234 }
1235 else {
1236 len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1237 }
1238 break;
1239
1240 default:
1241 return ONIGERR_TYPE_BUG;
1242 break;
1243 }
1244
1245 return len;
1246 }
1247
1248 static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1249
1250 static int
compile_effect_node(EffectNode * node,regex_t * reg)1251 compile_effect_node(EffectNode* node, regex_t* reg)
1252 {
1253 int r, len;
1254
1255 if (node->type == EFFECT_OPTION)
1256 return compile_option_node(node, reg);
1257
1258 switch (node->type) {
1259 case EFFECT_MEMORY:
1260 #ifdef USE_SUBEXP_CALL
1261 if (IS_EFFECT_CALLED(node)) {
1262 r = add_opcode(reg, OP_CALL);
1263 if (r) return r;
1264 node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1265 node->state |= NST_ADDR_FIXED;
1266 r = add_abs_addr(reg, (int )node->call_addr);
1267 if (r) return r;
1268 len = compile_length_tree(node->target, reg);
1269 len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1270 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1271 len += (IS_EFFECT_RECURSION(node)
1272 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1273 else
1274 len += (IS_EFFECT_RECURSION(node)
1275 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1276
1277 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1278 if (r) return r;
1279 }
1280 #endif
1281 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1282 r = add_opcode(reg, OP_MEMORY_START_PUSH);
1283 else
1284 r = add_opcode(reg, OP_MEMORY_START);
1285 if (r) return r;
1286 r = add_mem_num(reg, node->regnum);
1287 if (r) return r;
1288 r = compile_tree(node->target, reg);
1289 if (r) return r;
1290 #ifdef USE_SUBEXP_CALL
1291 if (IS_EFFECT_CALLED(node)) {
1292 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1293 r = add_opcode(reg, (IS_EFFECT_RECURSION(node)
1294 ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1295 else
1296 r = add_opcode(reg, (IS_EFFECT_RECURSION(node)
1297 ? OP_MEMORY_END_REC : OP_MEMORY_END));
1298
1299 if (r) return r;
1300 r = add_mem_num(reg, node->regnum);
1301 if (r) return r;
1302 r = add_opcode(reg, OP_RETURN);
1303 }
1304 else
1305 #endif
1306 {
1307 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1308 r = add_opcode(reg, OP_MEMORY_END_PUSH);
1309 else
1310 r = add_opcode(reg, OP_MEMORY_END);
1311 if (r) return r;
1312 r = add_mem_num(reg, node->regnum);
1313 }
1314 break;
1315
1316 case EFFECT_STOP_BACKTRACK:
1317 if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) {
1318 QuantifierNode* qn = &NQUANTIFIER(node->target);
1319 r = compile_tree_n_times(qn->target, qn->lower, reg);
1320 if (r) return r;
1321
1322 len = compile_length_tree(qn->target, reg);
1323 if (len < 0) return len;
1324
1325 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1326 if (r) return r;
1327 r = compile_tree(qn->target, reg);
1328 if (r) return r;
1329 r = add_opcode(reg, OP_POP);
1330 if (r) return r;
1331 r = add_opcode_rel_addr(reg, OP_JUMP,
1332 -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1333 }
1334 else {
1335 r = add_opcode(reg, OP_PUSH_STOP_BT);
1336 if (r) return r;
1337 r = compile_tree(node->target, reg);
1338 if (r) return r;
1339 r = add_opcode(reg, OP_POP_STOP_BT);
1340 }
1341 break;
1342
1343 default:
1344 return ONIGERR_TYPE_BUG;
1345 break;
1346 }
1347
1348 return r;
1349 }
1350
1351 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1352 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1353 {
1354 int len;
1355 int tlen = 0;
1356
1357 if (node->target) {
1358 tlen = compile_length_tree(node->target, reg);
1359 if (tlen < 0) return tlen;
1360 }
1361
1362 switch (node->type) {
1363 case ANCHOR_PREC_READ:
1364 len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1365 break;
1366 case ANCHOR_PREC_READ_NOT:
1367 len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1368 break;
1369 case ANCHOR_LOOK_BEHIND:
1370 len = SIZE_OP_LOOK_BEHIND + tlen;
1371 break;
1372 case ANCHOR_LOOK_BEHIND_NOT:
1373 len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1374 break;
1375
1376 default:
1377 len = SIZE_OPCODE;
1378 break;
1379 }
1380
1381 return len;
1382 }
1383
1384 static int
compile_anchor_node(AnchorNode * node,regex_t * reg)1385 compile_anchor_node(AnchorNode* node, regex_t* reg)
1386 {
1387 int r, len;
1388
1389 switch (node->type) {
1390 case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
1391 case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
1392 case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
1393 case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
1394 case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
1395 case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1396
1397 case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break;
1398 case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break;
1399 #ifdef USE_WORD_BEGIN_END
1400 case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break;
1401 case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break;
1402 #endif
1403
1404 case ANCHOR_PREC_READ:
1405 r = add_opcode(reg, OP_PUSH_POS);
1406 if (r) return r;
1407 r = compile_tree(node->target, reg);
1408 if (r) return r;
1409 r = add_opcode(reg, OP_POP_POS);
1410 break;
1411
1412 case ANCHOR_PREC_READ_NOT:
1413 len = compile_length_tree(node->target, reg);
1414 if (len < 0) return len;
1415 r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1416 if (r) return r;
1417 r = compile_tree(node->target, reg);
1418 if (r) return r;
1419 r = add_opcode(reg, OP_FAIL_POS);
1420 break;
1421
1422 case ANCHOR_LOOK_BEHIND:
1423 {
1424 int n;
1425 r = add_opcode(reg, OP_LOOK_BEHIND);
1426 if (r) return r;
1427 if (node->char_len < 0) {
1428 r = get_char_length_tree(node->target, reg, &n);
1429 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1430 }
1431 else
1432 n = node->char_len;
1433 r = add_length(reg, n);
1434 if (r) return r;
1435 r = compile_tree(node->target, reg);
1436 }
1437 break;
1438
1439 case ANCHOR_LOOK_BEHIND_NOT:
1440 {
1441 int n;
1442 len = compile_length_tree(node->target, reg);
1443 r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1444 len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1445 if (r) return r;
1446 if (node->char_len < 0) {
1447 r = get_char_length_tree(node->target, reg, &n);
1448 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1449 }
1450 else
1451 n = node->char_len;
1452 r = add_length(reg, n);
1453 if (r) return r;
1454 r = compile_tree(node->target, reg);
1455 if (r) return r;
1456 r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1457 }
1458 break;
1459
1460 default:
1461 return ONIGERR_TYPE_BUG;
1462 break;
1463 }
1464
1465 return r;
1466 }
1467
1468 static int
compile_length_tree(Node * node,regex_t * reg)1469 compile_length_tree(Node* node, regex_t* reg)
1470 {
1471 int len, type, r;
1472
1473 type = NTYPE(node);
1474 switch (type) {
1475 case N_LIST:
1476 len = 0;
1477 do {
1478 r = compile_length_tree(NCONS(node).left, reg);
1479 if (r < 0) return r;
1480 len += r;
1481 } while (IS_NOT_NULL(node = NCONS(node).right));
1482 r = len;
1483 break;
1484
1485 case N_ALT:
1486 {
1487 int n;
1488
1489 n = r = 0;
1490 do {
1491 r += compile_length_tree(NCONS(node).left, reg);
1492 n++;
1493 } while (IS_NOT_NULL(node = NCONS(node).right));
1494 r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1495 }
1496 break;
1497
1498 case N_STRING:
1499 if (NSTRING_IS_RAW(node))
1500 r = compile_length_string_raw_node(&(NSTRING(node)), reg);
1501 else
1502 r = compile_length_string_node(node, reg);
1503 break;
1504
1505 case N_CCLASS:
1506 r = compile_length_cclass_node(&(NCCLASS(node)), reg);
1507 break;
1508
1509 case N_CTYPE:
1510 case N_ANYCHAR:
1511 r = SIZE_OPCODE;
1512 break;
1513
1514 case N_BACKREF:
1515 {
1516 BackrefNode* br = &(NBACKREF(node));
1517
1518 #ifdef USE_BACKREF_AT_LEVEL
1519 if (IS_BACKREF_NEST_LEVEL(br)) {
1520 r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1521 SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1522 }
1523 else
1524 #endif
1525 if (br->back_num == 1) {
1526 r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1527 ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1528 }
1529 else {
1530 r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1531 }
1532 }
1533 break;
1534
1535 #ifdef USE_SUBEXP_CALL
1536 case N_CALL:
1537 r = SIZE_OP_CALL;
1538 break;
1539 #endif
1540
1541 case N_QUANTIFIER:
1542 r = compile_length_quantifier_node(&(NQUANTIFIER(node)), reg);
1543 break;
1544
1545 case N_EFFECT:
1546 r = compile_length_effect_node(&NEFFECT(node), reg);
1547 break;
1548
1549 case N_ANCHOR:
1550 r = compile_length_anchor_node(&(NANCHOR(node)), reg);
1551 break;
1552
1553 default:
1554 return ONIGERR_TYPE_BUG;
1555 break;
1556 }
1557
1558 return r;
1559 }
1560
1561 static int
compile_tree(Node * node,regex_t * reg)1562 compile_tree(Node* node, regex_t* reg)
1563 {
1564 int n, type, len, pos, r = 0;
1565
1566 type = NTYPE(node);
1567 switch (type) {
1568 case N_LIST:
1569 do {
1570 r = compile_tree(NCONS(node).left, reg);
1571 } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
1572 break;
1573
1574 case N_ALT:
1575 {
1576 Node* x = node;
1577 len = 0;
1578 do {
1579 len += compile_length_tree(NCONS(x).left, reg);
1580 if (NCONS(x).right != NULL) {
1581 len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1582 }
1583 } while (IS_NOT_NULL(x = NCONS(x).right));
1584 pos = reg->used + len; /* goal position */
1585
1586 do {
1587 len = compile_length_tree(NCONS(node).left, reg);
1588 if (IS_NOT_NULL(NCONS(node).right)) {
1589 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1590 if (r) break;
1591 }
1592 r = compile_tree(NCONS(node).left, reg);
1593 if (r) break;
1594 if (IS_NOT_NULL(NCONS(node).right)) {
1595 len = pos - (reg->used + SIZE_OP_JUMP);
1596 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1597 if (r) break;
1598 }
1599 } while (IS_NOT_NULL(node = NCONS(node).right));
1600 }
1601 break;
1602
1603 case N_STRING:
1604 if (NSTRING_IS_RAW(node))
1605 r = compile_string_raw_node(&(NSTRING(node)), reg);
1606 else
1607 r = compile_string_node(node, reg);
1608 break;
1609
1610 case N_CCLASS:
1611 r = compile_cclass_node(&(NCCLASS(node)), reg);
1612 break;
1613
1614 case N_CTYPE:
1615 {
1616 int op;
1617
1618 switch (NCTYPE(node).type) {
1619 case CTYPE_WORD: op = OP_WORD; break;
1620 case CTYPE_NOT_WORD: op = OP_NOT_WORD; break;
1621 default:
1622 return ONIGERR_TYPE_BUG;
1623 break;
1624 }
1625 r = add_opcode(reg, op);
1626 }
1627 break;
1628
1629 case N_ANYCHAR:
1630 if (IS_MULTILINE(reg->options))
1631 r = add_opcode(reg, OP_ANYCHAR_ML);
1632 else
1633 r = add_opcode(reg, OP_ANYCHAR);
1634 break;
1635
1636 case N_BACKREF:
1637 {
1638 BackrefNode* br = &(NBACKREF(node));
1639
1640 #ifdef USE_BACKREF_AT_LEVEL
1641 if (IS_BACKREF_NEST_LEVEL(br)) {
1642 r = add_opcode(reg, OP_BACKREF_AT_LEVEL);
1643 if (r) return r;
1644 r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1645 if (r) return r;
1646 r = add_length(reg, br->nest_level);
1647 if (r) return r;
1648
1649 goto add_bacref_mems;
1650 }
1651 else
1652 #endif
1653 if (br->back_num == 1) {
1654 n = br->back_static[0];
1655 if (IS_IGNORECASE(reg->options)) {
1656 r = add_opcode(reg, OP_BACKREFN_IC);
1657 if (r) return r;
1658 r = add_mem_num(reg, n);
1659 }
1660 else {
1661 switch (n) {
1662 case 1: r = add_opcode(reg, OP_BACKREF1); break;
1663 case 2: r = add_opcode(reg, OP_BACKREF2); break;
1664 default:
1665 r = add_opcode(reg, OP_BACKREFN);
1666 if (r) return r;
1667 r = add_mem_num(reg, n);
1668 break;
1669 }
1670 }
1671 }
1672 else {
1673 int i;
1674 int* p;
1675
1676 if (IS_IGNORECASE(reg->options)) {
1677 r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1678 }
1679 else {
1680 r = add_opcode(reg, OP_BACKREF_MULTI);
1681 }
1682 if (r) return r;
1683
1684 #ifdef USE_BACKREF_AT_LEVEL
1685 add_bacref_mems:
1686 #endif
1687 r = add_length(reg, br->back_num);
1688 if (r) return r;
1689 p = BACKREFS_P(br);
1690 for (i = br->back_num - 1; i >= 0; i--) {
1691 r = add_mem_num(reg, p[i]);
1692 if (r) return r;
1693 }
1694 }
1695 }
1696 break;
1697
1698 #ifdef USE_SUBEXP_CALL
1699 case N_CALL:
1700 r = compile_call(&(NCALL(node)), reg);
1701 break;
1702 #endif
1703
1704 case N_QUANTIFIER:
1705 r = compile_quantifier_node(&(NQUANTIFIER(node)), reg);
1706 break;
1707
1708 case N_EFFECT:
1709 r = compile_effect_node(&NEFFECT(node), reg);
1710 break;
1711
1712 case N_ANCHOR:
1713 r = compile_anchor_node(&(NANCHOR(node)), reg);
1714 break;
1715
1716 default:
1717 #ifdef ONIG_DEBUG
1718 fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1719 #endif
1720 break;
1721 }
1722
1723 return r;
1724 }
1725
1726 #ifdef USE_NAMED_GROUP
1727
1728 static int
noname_disable_map(Node ** plink,GroupNumRemap * map,int * counter)1729 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1730 {
1731 int r = 0;
1732 Node* node = *plink;
1733
1734 switch (NTYPE(node)) {
1735 case N_LIST:
1736 case N_ALT:
1737 do {
1738 r = noname_disable_map(&(NCONS(node).left), map, counter);
1739 } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
1740 break;
1741
1742 case N_QUANTIFIER:
1743 {
1744 Node** ptarget = &(NQUANTIFIER(node).target);
1745 Node* old = *ptarget;
1746 r = noname_disable_map(ptarget, map, counter);
1747 if (*ptarget != old && NTYPE(*ptarget) == N_QUANTIFIER) {
1748 onig_reduce_nested_quantifier(node, *ptarget);
1749 }
1750 }
1751 break;
1752
1753 case N_EFFECT:
1754 {
1755 EffectNode* en = &(NEFFECT(node));
1756 if (en->type == EFFECT_MEMORY) {
1757 if (IS_EFFECT_NAMED_GROUP(en)) {
1758 (*counter)++;
1759 map[en->regnum].new_val = *counter;
1760 en->regnum = *counter;
1761 r = noname_disable_map(&(en->target), map, counter);
1762 }
1763 else {
1764 *plink = en->target;
1765 en->target = NULL_NODE;
1766 onig_node_free(node);
1767 r = noname_disable_map(plink, map, counter);
1768 }
1769 }
1770 else
1771 r = noname_disable_map(&(en->target), map, counter);
1772 }
1773 break;
1774
1775 default:
1776 break;
1777 }
1778
1779 return r;
1780 }
1781
1782 static int
renumber_node_backref(Node * node,GroupNumRemap * map)1783 renumber_node_backref(Node* node, GroupNumRemap* map)
1784 {
1785 int i, pos, n, old_num;
1786 int *backs;
1787 BackrefNode* bn = &(NBACKREF(node));
1788
1789 if (! IS_BACKREF_NAME_REF(bn))
1790 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1791
1792 old_num = bn->back_num;
1793 if (IS_NULL(bn->back_dynamic))
1794 backs = bn->back_static;
1795 else
1796 backs = bn->back_dynamic;
1797
1798 for (i = 0, pos = 0; i < old_num; i++) {
1799 n = map[backs[i]].new_val;
1800 if (n > 0) {
1801 backs[pos] = n;
1802 pos++;
1803 }
1804 }
1805
1806 bn->back_num = pos;
1807 return 0;
1808 }
1809
1810 static int
renumber_by_map(Node * node,GroupNumRemap * map)1811 renumber_by_map(Node* node, GroupNumRemap* map)
1812 {
1813 int r = 0;
1814
1815 switch (NTYPE(node)) {
1816 case N_LIST:
1817 case N_ALT:
1818 do {
1819 r = renumber_by_map(NCONS(node).left, map);
1820 } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
1821 break;
1822 case N_QUANTIFIER:
1823 r = renumber_by_map(NQUANTIFIER(node).target, map);
1824 break;
1825 case N_EFFECT:
1826 r = renumber_by_map(NEFFECT(node).target, map);
1827 break;
1828
1829 case N_BACKREF:
1830 r = renumber_node_backref(node, map);
1831 break;
1832
1833 default:
1834 break;
1835 }
1836
1837 return r;
1838 }
1839
1840 static int
numbered_ref_check(Node * node)1841 numbered_ref_check(Node* node)
1842 {
1843 int r = 0;
1844
1845 switch (NTYPE(node)) {
1846 case N_LIST:
1847 case N_ALT:
1848 do {
1849 r = numbered_ref_check(NCONS(node).left);
1850 } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
1851 break;
1852 case N_QUANTIFIER:
1853 r = numbered_ref_check(NQUANTIFIER(node).target);
1854 break;
1855 case N_EFFECT:
1856 r = numbered_ref_check(NEFFECT(node).target);
1857 break;
1858
1859 case N_BACKREF:
1860 if (! IS_BACKREF_NAME_REF(&(NBACKREF(node))))
1861 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1862 break;
1863
1864 default:
1865 break;
1866 }
1867
1868 return r;
1869 }
1870
1871 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ScanEnv * env)1872 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
1873 {
1874 int r, i, pos, counter;
1875 BitStatusType loc;
1876 GroupNumRemap* map;
1877
1878 map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
1879 CHECK_NULL_RETURN_VAL(map, ONIGERR_MEMORY);
1880 for (i = 1; i <= env->num_mem; i++) {
1881 map[i].new_val = 0;
1882 }
1883 counter = 0;
1884 r = noname_disable_map(root, map, &counter);
1885 if (r != 0) return r;
1886
1887 r = renumber_by_map(*root, map);
1888 if (r != 0) return r;
1889
1890 for (i = 1, pos = 1; i <= env->num_mem; i++) {
1891 if (map[i].new_val > 0) {
1892 SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
1893 pos++;
1894 }
1895 }
1896
1897 loc = env->capture_history;
1898 BIT_STATUS_CLEAR(env->capture_history);
1899 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
1900 if (BIT_STATUS_AT(loc, i)) {
1901 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
1902 }
1903 }
1904
1905 env->num_mem = env->num_named;
1906 reg->num_mem = env->num_named;
1907
1908 return onig_renumber_name_table(reg, map);
1909 }
1910 #endif /* USE_NAMED_GROUP */
1911
1912 #ifdef USE_SUBEXP_CALL
1913 static int
unset_addr_list_fix(UnsetAddrList * uslist,regex_t * reg)1914 unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
1915 {
1916 int i, offset;
1917 EffectNode* en;
1918 AbsAddrType addr;
1919
1920 for (i = 0; i < uslist->num; i++) {
1921 en = &(NEFFECT(uslist->us[i].target));
1922 if (! IS_EFFECT_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
1923 addr = en->call_addr;
1924 offset = uslist->us[i].offset;
1925
1926 BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
1927 }
1928 return 0;
1929 }
1930 #endif
1931
1932 #ifdef USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK
1933 static int
quantifiers_memory_node_info(Node * node)1934 quantifiers_memory_node_info(Node* node)
1935 {
1936 int r = 0;
1937
1938 switch (NTYPE(node)) {
1939 case N_LIST:
1940 case N_ALT:
1941 {
1942 int v;
1943 do {
1944 v = quantifiers_memory_node_info(NCONS(node).left);
1945 if (v > r) r = v;
1946 } while (v >= 0 && IS_NOT_NULL(node = NCONS(node).right));
1947 }
1948 break;
1949
1950 #ifdef USE_SUBEXP_CALL
1951 case N_CALL:
1952 if (IS_CALL_RECURSION(&NCALL(node))) {
1953 return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
1954 }
1955 else
1956 r = quantifiers_memory_node_info(NCALL(node).target);
1957 break;
1958 #endif
1959
1960 case N_QUANTIFIER:
1961 {
1962 QuantifierNode* qn = &(NQUANTIFIER(node));
1963 if (qn->upper != 0) {
1964 r = quantifiers_memory_node_info(qn->target);
1965 }
1966 }
1967 break;
1968
1969 case N_EFFECT:
1970 {
1971 EffectNode* en = &(NEFFECT(node));
1972 switch (en->type) {
1973 case EFFECT_MEMORY:
1974 return NQ_TARGET_IS_EMPTY_MEM;
1975 break;
1976
1977 case EFFECT_OPTION:
1978 case EFFECT_STOP_BACKTRACK:
1979 r = quantifiers_memory_node_info(en->target);
1980 break;
1981 default:
1982 break;
1983 }
1984 }
1985 break;
1986
1987 case N_BACKREF:
1988 case N_STRING:
1989 case N_CTYPE:
1990 case N_CCLASS:
1991 case N_ANYCHAR:
1992 case N_ANCHOR:
1993 default:
1994 break;
1995 }
1996
1997 return r;
1998 }
1999 #endif /* USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK */
2000
2001 static int
get_min_match_length(Node * node,OnigDistance * min,ScanEnv * env)2002 get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
2003 {
2004 OnigDistance tmin;
2005 int r = 0;
2006
2007 *min = 0;
2008 switch (NTYPE(node)) {
2009 case N_BACKREF:
2010 {
2011 int i;
2012 int* backs;
2013 Node** nodes = SCANENV_MEM_NODES(env);
2014 BackrefNode* br = &(NBACKREF(node));
2015 if (br->state & NST_RECURSION) break;
2016
2017 backs = BACKREFS_P(br);
2018 if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2019 r = get_min_match_length(nodes[backs[0]], min, env);
2020 if (r != 0) break;
2021 for (i = 1; i < br->back_num; i++) {
2022 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2023 r = get_min_match_length(nodes[backs[i]], &tmin, env);
2024 if (r != 0) break;
2025 if (*min > tmin) *min = tmin;
2026 }
2027 }
2028 break;
2029
2030 #ifdef USE_SUBEXP_CALL
2031 case N_CALL:
2032 if (IS_CALL_RECURSION(&NCALL(node))) {
2033 EffectNode* en = &(NEFFECT(NCALL(node).target));
2034 if (IS_EFFECT_MIN_FIXED(en))
2035 *min = en->min_len;
2036 }
2037 else
2038 r = get_min_match_length(NCALL(node).target, min, env);
2039 break;
2040 #endif
2041
2042 case N_LIST:
2043 do {
2044 r = get_min_match_length(NCONS(node).left, &tmin, env);
2045 if (r == 0) *min += tmin;
2046 } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
2047 break;
2048
2049 case N_ALT:
2050 {
2051 Node *x, *y;
2052 y = node;
2053 do {
2054 x = NCONS(y).left;
2055 r = get_min_match_length(x, &tmin, env);
2056 if (r != 0) break;
2057 if (y == node) *min = tmin;
2058 else if (*min > tmin) *min = tmin;
2059 } while (r == 0 && IS_NOT_NULL(y = NCONS(y).right));
2060 }
2061 break;
2062
2063 case N_STRING:
2064 {
2065 StrNode* sn = &(NSTRING(node));
2066 *min = sn->end - sn->s;
2067 }
2068 break;
2069
2070 case N_CTYPE:
2071 switch (NCTYPE(node).type) {
2072 case CTYPE_WORD: *min = 1; break;
2073 case CTYPE_NOT_WORD: *min = 1; break;
2074 default:
2075 break;
2076 }
2077 break;
2078
2079 case N_CCLASS:
2080 case N_ANYCHAR:
2081 *min = 1;
2082 break;
2083
2084 case N_QUANTIFIER:
2085 {
2086 QuantifierNode* qn = &(NQUANTIFIER(node));
2087
2088 if (qn->lower > 0) {
2089 r = get_min_match_length(qn->target, min, env);
2090 if (r == 0)
2091 *min = distance_multiply(*min, qn->lower);
2092 }
2093 }
2094 break;
2095
2096 case N_EFFECT:
2097 {
2098 EffectNode* en = &(NEFFECT(node));
2099 switch (en->type) {
2100 case EFFECT_MEMORY:
2101 #ifdef USE_SUBEXP_CALL
2102 if (IS_EFFECT_MIN_FIXED(en))
2103 *min = en->min_len;
2104 else {
2105 r = get_min_match_length(en->target, min, env);
2106 if (r == 0) {
2107 en->min_len = *min;
2108 SET_EFFECT_STATUS(node, NST_MIN_FIXED);
2109 }
2110 }
2111 break;
2112 #endif
2113 case EFFECT_OPTION:
2114 case EFFECT_STOP_BACKTRACK:
2115 r = get_min_match_length(en->target, min, env);
2116 break;
2117 }
2118 }
2119 break;
2120
2121 case N_ANCHOR:
2122 default:
2123 break;
2124 }
2125
2126 return r;
2127 }
2128
2129 static int
get_max_match_length(Node * node,OnigDistance * max,ScanEnv * env)2130 get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
2131 {
2132 OnigDistance tmax;
2133 int r = 0;
2134
2135 *max = 0;
2136 switch (NTYPE(node)) {
2137 case N_LIST:
2138 do {
2139 r = get_max_match_length(NCONS(node).left, &tmax, env);
2140 if (r == 0)
2141 *max = distance_add(*max, tmax);
2142 } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
2143 break;
2144
2145 case N_ALT:
2146 do {
2147 r = get_max_match_length(NCONS(node).left, &tmax, env);
2148 if (r == 0 && *max < tmax) *max = tmax;
2149 } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
2150 break;
2151
2152 case N_STRING:
2153 {
2154 StrNode* sn = &(NSTRING(node));
2155 *max = sn->end - sn->s;
2156 }
2157 break;
2158
2159 case N_CTYPE:
2160 switch (NCTYPE(node).type) {
2161 case CTYPE_WORD:
2162 case CTYPE_NOT_WORD:
2163 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2164 break;
2165
2166 default:
2167 break;
2168 }
2169 break;
2170
2171 case N_CCLASS:
2172 case N_ANYCHAR:
2173 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2174 break;
2175
2176 case N_BACKREF:
2177 {
2178 int i;
2179 int* backs;
2180 Node** nodes = SCANENV_MEM_NODES(env);
2181 BackrefNode* br = &(NBACKREF(node));
2182 if (br->state & NST_RECURSION) {
2183 *max = ONIG_INFINITE_DISTANCE;
2184 break;
2185 }
2186 backs = BACKREFS_P(br);
2187 for (i = 0; i < br->back_num; i++) {
2188 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2189 r = get_max_match_length(nodes[backs[i]], &tmax, env);
2190 if (r != 0) break;
2191 if (*max < tmax) *max = tmax;
2192 }
2193 }
2194 break;
2195
2196 #ifdef USE_SUBEXP_CALL
2197 case N_CALL:
2198 if (! IS_CALL_RECURSION(&(NCALL(node))))
2199 r = get_max_match_length(NCALL(node).target, max, env);
2200 else
2201 *max = ONIG_INFINITE_DISTANCE;
2202 break;
2203 #endif
2204
2205 case N_QUANTIFIER:
2206 {
2207 QuantifierNode* qn = &(NQUANTIFIER(node));
2208
2209 if (qn->upper != 0) {
2210 r = get_max_match_length(qn->target, max, env);
2211 if (r == 0 && *max != 0) {
2212 if (! IS_REPEAT_INFINITE(qn->upper))
2213 *max = distance_multiply(*max, qn->upper);
2214 else
2215 *max = ONIG_INFINITE_DISTANCE;
2216 }
2217 }
2218 }
2219 break;
2220
2221 case N_EFFECT:
2222 {
2223 EffectNode* en = &(NEFFECT(node));
2224 switch (en->type) {
2225 case EFFECT_MEMORY:
2226 #ifdef USE_SUBEXP_CALL
2227 if (IS_EFFECT_MAX_FIXED(en))
2228 *max = en->max_len;
2229 else {
2230 r = get_max_match_length(en->target, max, env);
2231 if (r == 0) {
2232 en->max_len = *max;
2233 SET_EFFECT_STATUS(node, NST_MAX_FIXED);
2234 }
2235 }
2236 break;
2237 #endif
2238 case EFFECT_OPTION:
2239 case EFFECT_STOP_BACKTRACK:
2240 r = get_max_match_length(en->target, max, env);
2241 break;
2242 }
2243 }
2244 break;
2245
2246 case N_ANCHOR:
2247 default:
2248 break;
2249 }
2250
2251 return r;
2252 }
2253
2254 #define GET_CHAR_LEN_VARLEN -1
2255 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2256
2257 /* fixed size pattern node only */
2258 static int
get_char_length_tree1(Node * node,regex_t * reg,int * len,int level)2259 get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2260 {
2261 int tlen;
2262 int r = 0;
2263
2264 level++;
2265 *len = 0;
2266 switch (NTYPE(node)) {
2267 case N_LIST:
2268 do {
2269 r = get_char_length_tree1(NCONS(node).left, reg, &tlen, level);
2270 if (r == 0)
2271 *len = distance_add(*len, tlen);
2272 } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
2273 break;
2274
2275 case N_ALT:
2276 {
2277 int tlen2;
2278 int varlen = 0;
2279
2280 r = get_char_length_tree1(NCONS(node).left, reg, &tlen, level);
2281 while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)) {
2282 r = get_char_length_tree1(NCONS(node).left, reg, &tlen2, level);
2283 if (r == 0) {
2284 if (tlen != tlen2)
2285 varlen = 1;
2286 }
2287 }
2288 if (r == 0) {
2289 if (varlen != 0) {
2290 if (level == 1)
2291 r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2292 else
2293 r = GET_CHAR_LEN_VARLEN;
2294 }
2295 else
2296 *len = tlen;
2297 }
2298 }
2299 break;
2300
2301 case N_STRING:
2302 {
2303 StrNode* sn = &(NSTRING(node));
2304 UChar *s = sn->s;
2305 while (s < sn->end) {
2306 s += enc_len(reg->enc, s);
2307 (*len)++;
2308 }
2309 }
2310 break;
2311
2312 case N_QUANTIFIER:
2313 {
2314 QuantifierNode* qn = &(NQUANTIFIER(node));
2315 if (qn->lower == qn->upper) {
2316 r = get_char_length_tree1(qn->target, reg, &tlen, level);
2317 if (r == 0)
2318 *len = distance_multiply(tlen, qn->lower);
2319 }
2320 else
2321 r = GET_CHAR_LEN_VARLEN;
2322 }
2323 break;
2324
2325 #ifdef USE_SUBEXP_CALL
2326 case N_CALL:
2327 if (! IS_CALL_RECURSION(&(NCALL(node))))
2328 r = get_char_length_tree1(NCALL(node).target, reg, len, level);
2329 else
2330 r = GET_CHAR_LEN_VARLEN;
2331 break;
2332 #endif
2333
2334 case N_CTYPE:
2335 switch (NCTYPE(node).type) {
2336 case CTYPE_WORD:
2337 case CTYPE_NOT_WORD:
2338 *len = 1;
2339 break;
2340 }
2341 break;
2342
2343 case N_CCLASS:
2344 case N_ANYCHAR:
2345 *len = 1;
2346 break;
2347
2348 case N_EFFECT:
2349 {
2350 EffectNode* en = &(NEFFECT(node));
2351 switch (en->type) {
2352 case EFFECT_MEMORY:
2353 #ifdef USE_SUBEXP_CALL
2354 if (IS_EFFECT_CLEN_FIXED(en))
2355 *len = en->char_len;
2356 else {
2357 r = get_char_length_tree1(en->target, reg, len, level);
2358 if (r == 0) {
2359 en->char_len = *len;
2360 SET_EFFECT_STATUS(node, NST_CLEN_FIXED);
2361 }
2362 }
2363 break;
2364 #endif
2365 case EFFECT_OPTION:
2366 case EFFECT_STOP_BACKTRACK:
2367 r = get_char_length_tree1(en->target, reg, len, level);
2368 break;
2369 default:
2370 break;
2371 }
2372 }
2373 break;
2374
2375 case N_ANCHOR:
2376 break;
2377
2378 default:
2379 r = GET_CHAR_LEN_VARLEN;
2380 break;
2381 }
2382
2383 return r;
2384 }
2385
2386 static int
get_char_length_tree(Node * node,regex_t * reg,int * len)2387 get_char_length_tree(Node* node, regex_t* reg, int* len)
2388 {
2389 return get_char_length_tree1(node, reg, len, 0);
2390 }
2391
2392 /* x is not included y ==> 1 : 0 */
2393 static int
is_not_included(Node * x,Node * y,regex_t * reg)2394 is_not_included(Node* x, Node* y, regex_t* reg)
2395 {
2396 int i, len;
2397 OnigCodePoint code;
2398 UChar *p, c;
2399 int ytype;
2400
2401 retry:
2402 ytype = NTYPE(y);
2403 switch (NTYPE(x)) {
2404 case N_CTYPE:
2405 {
2406 switch (ytype) {
2407 case N_CTYPE:
2408 switch (NCTYPE(x).type) {
2409 case CTYPE_WORD:
2410 if (NCTYPE(y).type == CTYPE_NOT_WORD)
2411 return 1;
2412 else
2413 return 0;
2414 break;
2415 case CTYPE_NOT_WORD:
2416 if (NCTYPE(y).type == CTYPE_WORD)
2417 return 1;
2418 else
2419 return 0;
2420 break;
2421 default:
2422 break;
2423 }
2424 break;
2425
2426 case N_CCLASS:
2427 swap:
2428 {
2429 Node* tmp;
2430 tmp = x; x = y; y = tmp;
2431 goto retry;
2432 }
2433 break;
2434
2435 case N_STRING:
2436 goto swap;
2437 break;
2438
2439 default:
2440 break;
2441 }
2442 }
2443 break;
2444
2445 case N_CCLASS:
2446 {
2447 CClassNode* xc = &(NCCLASS(x));
2448 switch (ytype) {
2449 case N_CTYPE:
2450 switch (NCTYPE(y).type) {
2451 case CTYPE_WORD:
2452 if (IS_NULL(xc->mbuf) && !IS_CCLASS_NOT(xc)) {
2453 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2454 if (BITSET_AT(xc->bs, i)) {
2455 if (ONIGENC_IS_CODE_SB_WORD(reg->enc, i)) return 0;
2456 }
2457 }
2458 return 1;
2459 }
2460 return 0;
2461 break;
2462 case CTYPE_NOT_WORD:
2463 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2464 if (! ONIGENC_IS_CODE_SB_WORD(reg->enc, i)) {
2465 if (!IS_CCLASS_NOT(xc)) {
2466 if (BITSET_AT(xc->bs, i))
2467 return 0;
2468 }
2469 else {
2470 if (! BITSET_AT(xc->bs, i))
2471 return 0;
2472 }
2473 }
2474 }
2475 return 1;
2476 break;
2477
2478 default:
2479 break;
2480 }
2481 break;
2482
2483 case N_CCLASS:
2484 {
2485 int v;
2486 CClassNode* yc = &(NCCLASS(y));
2487
2488 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2489 v = BITSET_AT(xc->bs, i);
2490 if ((v != 0 && !IS_CCLASS_NOT(xc)) ||
2491 (v == 0 && IS_CCLASS_NOT(xc))) {
2492 v = BITSET_AT(yc->bs, i);
2493 if ((v != 0 && !IS_CCLASS_NOT(yc)) ||
2494 (v == 0 && IS_CCLASS_NOT(yc)))
2495 return 0;
2496 }
2497 }
2498 if ((IS_NULL(xc->mbuf) && !IS_CCLASS_NOT(xc)) ||
2499 (IS_NULL(yc->mbuf) && !IS_CCLASS_NOT(yc)))
2500 return 1;
2501 return 0;
2502 }
2503 break;
2504
2505 case N_STRING:
2506 goto swap;
2507 break;
2508
2509 default:
2510 break;
2511 }
2512 }
2513 break;
2514
2515 case N_STRING:
2516 {
2517 StrNode* xs = &(NSTRING(x));
2518 if (NSTRING_LEN(x) == 0)
2519 break;
2520
2521 c = *(xs->s);
2522 switch (ytype) {
2523 case N_CTYPE:
2524 switch (NCTYPE(y).type) {
2525 case CTYPE_WORD:
2526 return (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end) ? 0 : 1);
2527 break;
2528 case CTYPE_NOT_WORD:
2529 return (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end) ? 1 : 0);
2530 break;
2531 default:
2532 break;
2533 }
2534 break;
2535
2536 case N_CCLASS:
2537 {
2538 CClassNode* cc = &(NCCLASS(y));
2539
2540 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2541 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2542 return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2543 }
2544 break;
2545
2546 case N_STRING:
2547 {
2548 UChar *q;
2549 StrNode* ys = &(NSTRING(y));
2550 len = NSTRING_LEN(x);
2551 if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2552 if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2553 /* tiny version */
2554 return 0;
2555 }
2556 else {
2557 for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2558 if (*p != *q) return 1;
2559 }
2560 }
2561 }
2562 break;
2563
2564 default:
2565 break;
2566 }
2567 }
2568 break;
2569
2570 default:
2571 break;
2572 }
2573
2574 return 0;
2575 }
2576
2577 static Node*
get_head_value_node(Node * node,int exact,regex_t * reg)2578 get_head_value_node(Node* node, int exact, regex_t* reg)
2579 {
2580 Node* n = NULL_NODE;
2581
2582 switch (NTYPE(node)) {
2583 case N_BACKREF:
2584 case N_ALT:
2585 case N_ANYCHAR:
2586 #ifdef USE_SUBEXP_CALL
2587 case N_CALL:
2588 #endif
2589 break;
2590
2591 case N_CTYPE:
2592 case N_CCLASS:
2593 if (exact == 0) {
2594 n = node;
2595 }
2596 break;
2597
2598 case N_LIST:
2599 n = get_head_value_node(NCONS(node).left, exact, reg);
2600 break;
2601
2602 case N_STRING:
2603 {
2604 StrNode* sn = &(NSTRING(node));
2605
2606 if (sn->end <= sn->s)
2607 break;
2608
2609 if (exact != 0 &&
2610 !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
2611 #if 0
2612 UChar* tmp = sn->s;
2613 if (! ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag,
2614 &tmp, sn->end))
2615 n = node;
2616 #endif
2617 }
2618 else {
2619 n = node;
2620 }
2621 }
2622 break;
2623
2624 case N_QUANTIFIER:
2625 {
2626 QuantifierNode* qn = &(NQUANTIFIER(node));
2627 if (qn->lower > 0) {
2628 if (IS_NOT_NULL(qn->head_exact))
2629 n = qn->head_exact;
2630 else
2631 n = get_head_value_node(qn->target, exact, reg);
2632 }
2633 }
2634 break;
2635
2636 case N_EFFECT:
2637 {
2638 EffectNode* en = &(NEFFECT(node));
2639 switch (en->type) {
2640 case EFFECT_OPTION:
2641 {
2642 OnigOptionType options = reg->options;
2643
2644 reg->options = NEFFECT(node).option;
2645 n = get_head_value_node(NEFFECT(node).target, exact, reg);
2646 reg->options = options;
2647 }
2648 break;
2649
2650 case EFFECT_MEMORY:
2651 case EFFECT_STOP_BACKTRACK:
2652 n = get_head_value_node(en->target, exact, reg);
2653 break;
2654 }
2655 }
2656 break;
2657
2658 case N_ANCHOR:
2659 if (NANCHOR(node).type == ANCHOR_PREC_READ)
2660 n = get_head_value_node(NANCHOR(node).target, exact, reg);
2661 break;
2662
2663 default:
2664 break;
2665 }
2666
2667 return n;
2668 }
2669
2670 static int
check_type_tree(Node * node,int type_mask,int effect_mask,int anchor_mask)2671 check_type_tree(Node* node, int type_mask, int effect_mask, int anchor_mask)
2672 {
2673 int type, r = 0;
2674
2675 type = NTYPE(node);
2676 if ((type & type_mask) == 0)
2677 return 1;
2678
2679 switch (type) {
2680 case N_LIST:
2681 case N_ALT:
2682 do {
2683 r = check_type_tree(NCONS(node).left, type_mask, effect_mask, anchor_mask);
2684 } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
2685 break;
2686
2687 case N_QUANTIFIER:
2688 r = check_type_tree(NQUANTIFIER(node).target, type_mask, effect_mask,
2689 anchor_mask);
2690 break;
2691
2692 case N_EFFECT:
2693 {
2694 EffectNode* en = &(NEFFECT(node));
2695 if ((en->type & effect_mask) == 0)
2696 return 1;
2697
2698 r = check_type_tree(en->target, type_mask, effect_mask, anchor_mask);
2699 }
2700 break;
2701
2702 case N_ANCHOR:
2703 type = NANCHOR(node).type;
2704 if ((type & anchor_mask) == 0)
2705 return 1;
2706
2707 if (NANCHOR(node).target)
2708 r = check_type_tree(NANCHOR(node).target,
2709 type_mask, effect_mask, anchor_mask);
2710 break;
2711
2712 default:
2713 break;
2714 }
2715 return r;
2716 }
2717
2718 #ifdef USE_SUBEXP_CALL
2719
2720 #define RECURSION_EXIST 1
2721 #define RECURSION_INFINITE 2
2722
2723 static int
subexp_inf_recursive_check(Node * node,ScanEnv * env,int head)2724 subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2725 {
2726 int type;
2727 int r = 0;
2728
2729 type = NTYPE(node);
2730 switch (type) {
2731 case N_LIST:
2732 {
2733 Node *x;
2734 OnigDistance min;
2735 int ret;
2736
2737 x = node;
2738 do {
2739 ret = subexp_inf_recursive_check(NCONS(x).left, env, head);
2740 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2741 r |= ret;
2742 if (head) {
2743 ret = get_min_match_length(NCONS(x).left, &min, env);
2744 if (ret != 0) return ret;
2745 if (min != 0) head = 0;
2746 }
2747 } while (IS_NOT_NULL(x = NCONS(x).right));
2748 }
2749 break;
2750
2751 case N_ALT:
2752 {
2753 int ret;
2754 r = RECURSION_EXIST;
2755 do {
2756 ret = subexp_inf_recursive_check(NCONS(node).left, env, head);
2757 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2758 r &= ret;
2759 } while (IS_NOT_NULL(node = NCONS(node).right));
2760 }
2761 break;
2762
2763 case N_QUANTIFIER:
2764 r = subexp_inf_recursive_check(NQUANTIFIER(node).target, env, head);
2765 if (r == RECURSION_EXIST) {
2766 if (NQUANTIFIER(node).lower == 0) r = 0;
2767 }
2768 break;
2769
2770 case N_ANCHOR:
2771 {
2772 AnchorNode* an = &(NANCHOR(node));
2773 switch (an->type) {
2774 case ANCHOR_PREC_READ:
2775 case ANCHOR_PREC_READ_NOT:
2776 case ANCHOR_LOOK_BEHIND:
2777 case ANCHOR_LOOK_BEHIND_NOT:
2778 r = subexp_inf_recursive_check(an->target, env, head);
2779 break;
2780 }
2781 }
2782 break;
2783
2784 case N_CALL:
2785 r = subexp_inf_recursive_check(NCALL(node).target, env, head);
2786 break;
2787
2788 case N_EFFECT:
2789 if (IS_EFFECT_MARK2(&(NEFFECT(node))))
2790 return 0;
2791 else if (IS_EFFECT_MARK1(&(NEFFECT(node))))
2792 return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2793 else {
2794 SET_EFFECT_STATUS(node, NST_MARK2);
2795 r = subexp_inf_recursive_check(NEFFECT(node).target, env, head);
2796 CLEAR_EFFECT_STATUS(node, NST_MARK2);
2797 }
2798 break;
2799
2800 default:
2801 break;
2802 }
2803
2804 return r;
2805 }
2806
2807 static int
subexp_inf_recursive_check_trav(Node * node,ScanEnv * env)2808 subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
2809 {
2810 int type;
2811 int r = 0;
2812
2813 type = NTYPE(node);
2814 switch (type) {
2815 case N_LIST:
2816 case N_ALT:
2817 do {
2818 r = subexp_inf_recursive_check_trav(NCONS(node).left, env);
2819 } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
2820 break;
2821
2822 case N_QUANTIFIER:
2823 r = subexp_inf_recursive_check_trav(NQUANTIFIER(node).target, env);
2824 break;
2825
2826 case N_ANCHOR:
2827 {
2828 AnchorNode* an = &(NANCHOR(node));
2829 switch (an->type) {
2830 case ANCHOR_PREC_READ:
2831 case ANCHOR_PREC_READ_NOT:
2832 case ANCHOR_LOOK_BEHIND:
2833 case ANCHOR_LOOK_BEHIND_NOT:
2834 r = subexp_inf_recursive_check_trav(an->target, env);
2835 break;
2836 }
2837 }
2838 break;
2839
2840 case N_EFFECT:
2841 {
2842 EffectNode* en = &(NEFFECT(node));
2843
2844 if (IS_EFFECT_RECURSION(en)) {
2845 SET_EFFECT_STATUS(node, NST_MARK1);
2846 r = subexp_inf_recursive_check(en->target, env, 1);
2847 if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
2848 CLEAR_EFFECT_STATUS(node, NST_MARK1);
2849 }
2850 r = subexp_inf_recursive_check_trav(en->target, env);
2851 }
2852
2853 break;
2854
2855 default:
2856 break;
2857 }
2858
2859 return r;
2860 }
2861
2862 static int
subexp_recursive_check(Node * node)2863 subexp_recursive_check(Node* node)
2864 {
2865 int type;
2866 int r = 0;
2867
2868 type = NTYPE(node);
2869 switch (type) {
2870 case N_LIST:
2871 case N_ALT:
2872 do {
2873 r |= subexp_recursive_check(NCONS(node).left);
2874 } while (IS_NOT_NULL(node = NCONS(node).right));
2875 break;
2876
2877 case N_QUANTIFIER:
2878 r = subexp_recursive_check(NQUANTIFIER(node).target);
2879 break;
2880
2881 case N_ANCHOR:
2882 {
2883 AnchorNode* an = &(NANCHOR(node));
2884 switch (an->type) {
2885 case ANCHOR_PREC_READ:
2886 case ANCHOR_PREC_READ_NOT:
2887 case ANCHOR_LOOK_BEHIND:
2888 case ANCHOR_LOOK_BEHIND_NOT:
2889 r = subexp_recursive_check(an->target);
2890 break;
2891 }
2892 }
2893 break;
2894
2895 case N_CALL:
2896 r = subexp_recursive_check(NCALL(node).target);
2897 if (r != 0) SET_CALL_RECURSION(node);
2898 break;
2899
2900 case N_EFFECT:
2901 if (IS_EFFECT_MARK2(&(NEFFECT(node))))
2902 return 0;
2903 else if (IS_EFFECT_MARK1(&(NEFFECT(node))))
2904 return 1; /* recursion */
2905 else {
2906 SET_EFFECT_STATUS(node, NST_MARK2);
2907 r = subexp_recursive_check(NEFFECT(node).target);
2908 CLEAR_EFFECT_STATUS(node, NST_MARK2);
2909 }
2910 break;
2911
2912 default:
2913 break;
2914 }
2915
2916 return r;
2917 }
2918
2919
2920 static int
subexp_recursive_check_trav(Node * node,ScanEnv * env)2921 subexp_recursive_check_trav(Node* node, ScanEnv* env)
2922 {
2923 #define FOUND_CALLED_NODE 1
2924
2925 int type;
2926 int r = 0;
2927
2928 type = NTYPE(node);
2929 switch (type) {
2930 case N_LIST:
2931 case N_ALT:
2932 {
2933 int ret;
2934 do {
2935 ret = subexp_recursive_check_trav(NCONS(node).left, env);
2936 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
2937 else if (ret < 0) return ret;
2938 } while (IS_NOT_NULL(node = NCONS(node).right));
2939 }
2940 break;
2941
2942 case N_QUANTIFIER:
2943 r = subexp_recursive_check_trav(NQUANTIFIER(node).target, env);
2944 if (NQUANTIFIER(node).upper == 0) {
2945 if (r == FOUND_CALLED_NODE)
2946 NQUANTIFIER(node).is_refered = 1;
2947 }
2948 break;
2949
2950 case N_ANCHOR:
2951 {
2952 AnchorNode* an = &(NANCHOR(node));
2953 switch (an->type) {
2954 case ANCHOR_PREC_READ:
2955 case ANCHOR_PREC_READ_NOT:
2956 case ANCHOR_LOOK_BEHIND:
2957 case ANCHOR_LOOK_BEHIND_NOT:
2958 r = subexp_recursive_check_trav(an->target, env);
2959 break;
2960 }
2961 }
2962 break;
2963
2964 case N_EFFECT:
2965 {
2966 EffectNode* en = &(NEFFECT(node));
2967
2968 if (! IS_EFFECT_RECURSION(en)) {
2969 if (IS_EFFECT_CALLED(en)) {
2970 SET_EFFECT_STATUS(node, NST_MARK1);
2971 r = subexp_recursive_check(en->target);
2972 if (r != 0) SET_EFFECT_STATUS(node, NST_RECURSION);
2973 CLEAR_EFFECT_STATUS(node, NST_MARK1);
2974 }
2975 }
2976 r = subexp_recursive_check_trav(en->target, env);
2977 if (IS_EFFECT_CALLED(en))
2978 r |= FOUND_CALLED_NODE;
2979 }
2980 break;
2981
2982 default:
2983 break;
2984 }
2985
2986 return r;
2987 }
2988
2989 static int
setup_subexp_call(Node * node,ScanEnv * env)2990 setup_subexp_call(Node* node, ScanEnv* env)
2991 {
2992 int type;
2993 int r = 0;
2994
2995 type = NTYPE(node);
2996 switch (type) {
2997 case N_LIST:
2998 do {
2999 r = setup_subexp_call(NCONS(node).left, env);
3000 } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
3001 break;
3002
3003 case N_ALT:
3004 do {
3005 r = setup_subexp_call(NCONS(node).left, env);
3006 } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
3007 break;
3008
3009 case N_QUANTIFIER:
3010 r = setup_subexp_call(NQUANTIFIER(node).target, env);
3011 break;
3012 case N_EFFECT:
3013 r = setup_subexp_call(NEFFECT(node).target, env);
3014 break;
3015
3016 case N_CALL:
3017 {
3018 int n, num, *refs;
3019 UChar *p;
3020 CallNode* cn = &(NCALL(node));
3021 Node** nodes = SCANENV_MEM_NODES(env);
3022
3023 #ifdef USE_NAMED_GROUP
3024 n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end, &refs);
3025 #else
3026 n = -1;
3027 #endif
3028 if (n <= 0) {
3029 /* name not found, check group number. (?*ddd) */
3030 p = cn->name;
3031 num = onig_scan_unsigned_number(&p, cn->name_end, env->enc);
3032 if (num <= 0 || p != cn->name_end) {
3033 onig_scan_env_set_error_string(env,
3034 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3035 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3036 }
3037 #ifdef USE_NAMED_GROUP
3038 if (env->num_named > 0 &&
3039 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3040 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3041 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3042 }
3043 #endif
3044 if (num > env->num_mem) {
3045 onig_scan_env_set_error_string(env,
3046 ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3047 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3048 }
3049 cn->ref_num = num;
3050 goto set_call_attr;
3051 }
3052 else if (n > 1) {
3053 onig_scan_env_set_error_string(env,
3054 ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3055 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3056 }
3057 else {
3058 cn->ref_num = refs[0];
3059 set_call_attr:
3060 cn->target = nodes[cn->ref_num];
3061 if (IS_NULL(cn->target)) {
3062 onig_scan_env_set_error_string(env,
3063 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3064 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3065 }
3066 SET_EFFECT_STATUS(cn->target, NST_CALLED);
3067 BIT_STATUS_ON_AT(env->bt_mem_start, cn->ref_num);
3068 cn->unset_addr_list = env->unset_addr_list;
3069 }
3070 }
3071 break;
3072
3073 case N_ANCHOR:
3074 {
3075 AnchorNode* an = &(NANCHOR(node));
3076
3077 switch (an->type) {
3078 case ANCHOR_PREC_READ:
3079 case ANCHOR_PREC_READ_NOT:
3080 case ANCHOR_LOOK_BEHIND:
3081 case ANCHOR_LOOK_BEHIND_NOT:
3082 r = setup_subexp_call(an->target, env);
3083 break;
3084 }
3085 }
3086 break;
3087
3088 default:
3089 break;
3090 }
3091
3092 return r;
3093 }
3094 #endif
3095
3096 /* divide different length alternatives in look-behind.
3097 (?<=A|B) ==> (?<=A)|(?<=B)
3098 (?<!A|B) ==> (?<!A)(?<!B)
3099 */
3100 static int
divide_look_behind_alternatives(Node * node)3101 divide_look_behind_alternatives(Node* node)
3102 {
3103 Node tmp_node;
3104 Node *head, *np, *insert_node;
3105 AnchorNode* an = &(NANCHOR(node));
3106 int anc_type = an->type;
3107
3108 head = an->target;
3109 np = NCONS(head).left;
3110 tmp_node = *node; *node = *head; *head = tmp_node;
3111 NCONS(node).left = head;
3112 NANCHOR(head).target = np;
3113
3114 np = node;
3115 while ((np = NCONS(np).right) != NULL_NODE) {
3116 insert_node = onig_node_new_anchor(anc_type);
3117 CHECK_NULL_RETURN_VAL(insert_node, ONIGERR_MEMORY);
3118 NANCHOR(insert_node).target = NCONS(np).left;
3119 NCONS(np).left = insert_node;
3120 }
3121
3122 if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3123 np = node;
3124 do {
3125 np->type = N_LIST; /* alt -> list */
3126 } while ((np = NCONS(np).right) != NULL_NODE);
3127 }
3128 return 0;
3129 }
3130
3131 static int
setup_look_behind(Node * node,regex_t * reg,ScanEnv * env)3132 setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3133 {
3134 int r, len;
3135 AnchorNode* an = &(NANCHOR(node));
3136
3137 r = get_char_length_tree(an->target, reg, &len);
3138 if (r == 0)
3139 an->char_len = len;
3140 else if (r == GET_CHAR_LEN_VARLEN)
3141 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3142 else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3143 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3144 r = divide_look_behind_alternatives(node);
3145 else
3146 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3147 }
3148
3149 return r;
3150 }
3151
3152 static int
next_setup(Node * node,Node * next_node,regex_t * reg)3153 next_setup(Node* node, Node* next_node, regex_t* reg)
3154 {
3155 int type;
3156
3157 retry:
3158 type = NTYPE(node);
3159 if (type == N_QUANTIFIER) {
3160 QuantifierNode* qn = &(NQUANTIFIER(node));
3161 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3162 #ifdef USE_QUANTIFIER_PEEK_NEXT
3163 qn->next_head_exact = get_head_value_node(next_node, 1, reg);
3164 #endif
3165 /* automatic posseivation a*b ==> (?>a*)b */
3166 if (qn->lower <= 1) {
3167 int ttype = NTYPE(qn->target);
3168 if (IS_NODE_TYPE_SIMPLE(ttype)) {
3169 Node *x, *y;
3170 x = get_head_value_node(qn->target, 0, reg);
3171 if (IS_NOT_NULL(x)) {
3172 y = get_head_value_node(next_node, 0, reg);
3173 if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3174 Node* en = onig_node_new_effect(EFFECT_STOP_BACKTRACK);
3175 CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY);
3176 SET_EFFECT_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3177 swap_node(node, en);
3178 NEFFECT(node).target = en;
3179 }
3180 }
3181 }
3182 }
3183 }
3184 }
3185 else if (type == N_EFFECT) {
3186 EffectNode* en = &(NEFFECT(node));
3187 if (en->type == EFFECT_MEMORY) {
3188 node = en->target;
3189 goto retry;
3190 }
3191 }
3192 return 0;
3193 }
3194
3195
3196 static int
divide_ambig_string_node_sub(regex_t * reg,int prev_ambig,UChar * prev_start,UChar * prev,UChar * end,Node *** tailp,Node ** root)3197 divide_ambig_string_node_sub(regex_t* reg, int prev_ambig,
3198 UChar* prev_start, UChar* prev,
3199 UChar* end, Node*** tailp, Node** root)
3200 {
3201 UChar *tmp, *wp;
3202 Node* snode;
3203
3204 if (prev_ambig != 0) {
3205 tmp = prev_start;
3206 wp = prev_start;
3207 while (tmp < prev) {
3208 wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag,
3209 &tmp, end, wp);
3210 }
3211 snode = onig_node_new_str(prev_start, wp);
3212 CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY);
3213 NSTRING_SET_AMBIG(snode);
3214 if (wp != prev) NSTRING_SET_AMBIG_REDUCE(snode);
3215 }
3216 else {
3217 snode = onig_node_new_str(prev_start, prev);
3218 CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY);
3219 }
3220
3221 if (*tailp == (Node** )0) {
3222 *root = onig_node_new_list(snode, NULL);
3223 CHECK_NULL_RETURN_VAL(*root, ONIGERR_MEMORY);
3224 *tailp = &(NCONS(*root).right);
3225 }
3226 else {
3227 **tailp = onig_node_new_list(snode, NULL);
3228 CHECK_NULL_RETURN_VAL(**tailp, ONIGERR_MEMORY);
3229 *tailp = &(NCONS(**tailp).right);
3230 }
3231
3232 return 0;
3233 }
3234
3235 static int
divide_ambig_string_node(Node * node,regex_t * reg)3236 divide_ambig_string_node(Node* node, regex_t* reg)
3237 {
3238 StrNode* sn = &NSTRING(node);
3239 int ambig, prev_ambig;
3240 UChar *prev, *p, *end, *prev_start, *start, *tmp, *wp;
3241 Node *root = NULL_NODE;
3242 Node **tailp = (Node** )0;
3243 int r;
3244
3245 start = prev_start = p = sn->s;
3246 end = sn->end;
3247 if (p >= end) return 0;
3248
3249 prev_ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag, &p, end);
3250
3251 while (p < end) {
3252 prev = p;
3253 if (prev_ambig != (ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc,
3254 reg->ambig_flag, &p, end))) {
3255
3256 r = divide_ambig_string_node_sub(reg, prev_ambig, prev_start, prev,
3257 end, &tailp, &root);
3258 if (r != 0) return r;
3259
3260 prev_ambig = ambig;
3261 prev_start = prev;
3262 }
3263 }
3264
3265 if (prev_start == start) {
3266 if (prev_ambig != 0) {
3267 NSTRING_SET_AMBIG(node);
3268 tmp = start;
3269 wp = start;
3270 while (tmp < end) {
3271 wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag,
3272 &tmp, end, wp);
3273 }
3274 if (wp != sn->end) NSTRING_SET_AMBIG_REDUCE(node);
3275 sn->end = wp;
3276 }
3277 }
3278 else {
3279 r = divide_ambig_string_node_sub(reg, prev_ambig, prev_start, end,
3280 end, &tailp, &root);
3281 if (r != 0) return r;
3282
3283 swap_node(node, root);
3284 onig_node_str_clear(root); /* should be after swap! */
3285 onig_node_free(root); /* free original string node */
3286 }
3287
3288 return 0;
3289 }
3290
3291 #ifdef USE_COMBINATION_EXPLOSION_CHECK
3292
3293 #define CEC_THRES_NUM_BIG_REPEAT 512
3294 #define CEC_INFINITE_NUM 0x7fffffff
3295
3296 #define CEC_IN_INFINITE_REPEAT (1<<0)
3297 #define CEC_IN_FINITE_REPEAT (1<<1)
3298 #define CEC_CONT_BIG_REPEAT (1<<2)
3299
3300 static int
setup_comb_exp_check(Node * node,int state,ScanEnv * env)3301 setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3302 {
3303 int type;
3304 int r = state;
3305
3306 type = NTYPE(node);
3307 switch (type) {
3308 case N_LIST:
3309 {
3310 Node* prev = NULL_NODE;
3311 do {
3312 r = setup_comb_exp_check(NCONS(node).left, r, env);
3313 prev = NCONS(node).left;
3314 } while (r >= 0 && IS_NOT_NULL(node = NCONS(node).right));
3315 }
3316 break;
3317
3318 case N_ALT:
3319 {
3320 int ret;
3321 do {
3322 ret = setup_comb_exp_check(NCONS(node).left, state, env);
3323 r |= ret;
3324 } while (ret >= 0 && IS_NOT_NULL(node = NCONS(node).right));
3325 }
3326 break;
3327
3328 case N_QUANTIFIER:
3329 {
3330 int child_state = state;
3331 int add_state = 0;
3332 QuantifierNode* qn = &(NQUANTIFIER(node));
3333 Node* target = qn->target;
3334 int var_num;
3335
3336 if (! IS_REPEAT_INFINITE(qn->upper)) {
3337 if (qn->upper > 1) {
3338 /* {0,1}, {1,1} are allowed */
3339 child_state |= CEC_IN_FINITE_REPEAT;
3340
3341 /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3342 if (env->backrefed_mem == 0) {
3343 if (NTYPE(qn->target) == N_EFFECT) {
3344 EffectNode* en = &(NEFFECT(qn->target));
3345 if (en->type == EFFECT_MEMORY) {
3346 if (NTYPE(en->target) == N_QUANTIFIER) {
3347 QuantifierNode* q = &(NQUANTIFIER(en->target));
3348 if (IS_REPEAT_INFINITE(q->upper)
3349 && q->greedy == qn->greedy) {
3350 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3351 if (qn->upper == 1)
3352 child_state = state;
3353 }
3354 }
3355 }
3356 }
3357 }
3358 }
3359 }
3360
3361 if (state & CEC_IN_FINITE_REPEAT) {
3362 qn->comb_exp_check_num = -1;
3363 }
3364 else {
3365 if (IS_REPEAT_INFINITE(qn->upper)) {
3366 var_num = CEC_INFINITE_NUM;
3367 child_state |= CEC_IN_INFINITE_REPEAT;
3368 }
3369 else {
3370 var_num = qn->upper - qn->lower;
3371 }
3372
3373 if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3374 add_state |= CEC_CONT_BIG_REPEAT;
3375
3376 if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3377 ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3378 var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3379 if (qn->comb_exp_check_num == 0) {
3380 env->num_comb_exp_check++;
3381 qn->comb_exp_check_num = env->num_comb_exp_check;
3382 if (env->curr_max_regnum > env->comb_exp_max_regnum)
3383 env->comb_exp_max_regnum = env->curr_max_regnum;
3384 }
3385 }
3386 }
3387
3388 r = setup_comb_exp_check(target, child_state, env);
3389 r |= add_state;
3390 }
3391 break;
3392
3393 case N_EFFECT:
3394 {
3395 EffectNode* en = &(NEFFECT(node));
3396
3397 switch (en->type) {
3398 case EFFECT_MEMORY:
3399 {
3400 if (env->curr_max_regnum < en->regnum)
3401 env->curr_max_regnum = en->regnum;
3402
3403 r = setup_comb_exp_check(en->target, state, env);
3404 }
3405 break;
3406
3407 default:
3408 r = setup_comb_exp_check(en->target, state, env);
3409 break;
3410 }
3411 }
3412 break;
3413
3414 #ifdef USE_SUBEXP_CALL
3415 case N_CALL:
3416 if (IS_CALL_RECURSION(&(NCALL(node))))
3417 env->has_recursion = 1;
3418 else
3419 r = setup_comb_exp_check(NCALL(node).target, state, env);
3420 break;
3421 #endif
3422
3423 default:
3424 break;
3425 }
3426
3427 return r;
3428 }
3429 #endif
3430
3431 #define IN_ALT (1<<0)
3432 #define IN_NOT (1<<1)
3433 #define IN_REPEAT (1<<2)
3434 #define IN_VAR_REPEAT (1<<3)
3435
3436 /* setup_tree does the following work.
3437 1. check empty loop. (set qn->target_empty_info)
3438 2. expand ignore-case in char class.
3439 3. set memory status bit flags. (reg->mem_stats)
3440 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3441 5. find invalid patterns in look-behind.
3442 6. expand repeated string.
3443 */
3444 static int
setup_tree(Node * node,regex_t * reg,int state,ScanEnv * env)3445 setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3446 {
3447 int type;
3448 int r = 0;
3449
3450 type = NTYPE(node);
3451 switch (type) {
3452 case N_LIST:
3453 {
3454 Node* prev = NULL_NODE;
3455 do {
3456 r = setup_tree(NCONS(node).left, reg, state, env);
3457 if (IS_NOT_NULL(prev) && r == 0) {
3458 r = next_setup(prev, NCONS(node).left, reg);
3459 }
3460 prev = NCONS(node).left;
3461 } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
3462 }
3463 break;
3464
3465 case N_ALT:
3466 do {
3467 r = setup_tree(NCONS(node).left, reg, (state | IN_ALT), env);
3468 } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
3469 break;
3470
3471 case N_CCLASS:
3472 break;
3473
3474 case N_STRING:
3475 if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3476 r = divide_ambig_string_node(node, reg);
3477 }
3478 break;
3479
3480 case N_CTYPE:
3481 case N_ANYCHAR:
3482 break;
3483
3484 #ifdef USE_SUBEXP_CALL
3485 case N_CALL:
3486 break;
3487 #endif
3488
3489 case N_BACKREF:
3490 {
3491 int i;
3492 int* p;
3493 Node** nodes = SCANENV_MEM_NODES(env);
3494 BackrefNode* br = &(NBACKREF(node));
3495 p = BACKREFS_P(br);
3496 for (i = 0; i < br->back_num; i++) {
3497 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
3498 BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3499 BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3500 #ifdef USE_BACKREF_AT_LEVEL
3501 if (IS_BACKREF_NEST_LEVEL(br)) {
3502 BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3503 }
3504 #endif
3505 SET_EFFECT_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3506 }
3507 }
3508 break;
3509
3510 case N_QUANTIFIER:
3511 {
3512 OnigDistance d;
3513 QuantifierNode* qn = &(NQUANTIFIER(node));
3514 Node* target = qn->target;
3515
3516 if ((state & IN_REPEAT) != 0) {
3517 qn->state |= NST_IN_REPEAT;
3518 }
3519
3520 if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3521 r = get_min_match_length(target, &d, env);
3522 if (r) break;
3523 if (d == 0) {
3524 qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3525 #ifdef USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK
3526 r = quantifiers_memory_node_info(target);
3527 if (r < 0) break;
3528 if (r > 0) {
3529 qn->target_empty_info = r;
3530 }
3531 #endif
3532 #if 0
3533 r = get_max_match_length(target, &d, env);
3534 if (r == 0 && d == 0) {
3535 /* ()* ==> ()?, ()+ ==> () */
3536 qn->upper = 1;
3537 if (qn->lower > 1) qn->lower = 1;
3538 if (NTYPE(target) == N_STRING) {
3539 qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
3540 }
3541 }
3542 #endif
3543 }
3544 }
3545
3546 state |= IN_REPEAT;
3547 if (qn->lower != qn->upper)
3548 state |= IN_VAR_REPEAT;
3549 r = setup_tree(target, reg, state, env);
3550 if (r) break;
3551
3552 /* expand string */
3553 #define EXPAND_STRING_MAX_LENGTH 100
3554 if (NTYPE(target) == N_STRING) {
3555 if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper &&
3556 qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3557 int len = NSTRING_LEN(target);
3558 StrNode* sn = &(NSTRING(target));
3559
3560 if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3561 int i, n = qn->lower;
3562 onig_node_conv_to_str_node(node, NSTRING(target).flag);
3563 for (i = 0; i < n; i++) {
3564 r = onig_node_str_cat(node, sn->s, sn->end);
3565 if (r) break;
3566 }
3567 onig_node_free(target);
3568 break; /* break case N_QUANTIFIER: */
3569 }
3570 }
3571 }
3572
3573 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
3574 if (qn->greedy && (qn->target_empty_info != 0)) {
3575 if (NTYPE(target) == N_QUANTIFIER) {
3576 QuantifierNode* tqn = &(NQUANTIFIER(target));
3577 if (IS_NOT_NULL(tqn->head_exact)) {
3578 qn->head_exact = tqn->head_exact;
3579 tqn->head_exact = NULL;
3580 }
3581 }
3582 else {
3583 qn->head_exact = get_head_value_node(qn->target, 1, reg);
3584 }
3585 }
3586 #endif
3587 }
3588 break;
3589
3590 case N_EFFECT:
3591 {
3592 EffectNode* en = &(NEFFECT(node));
3593
3594 switch (en->type) {
3595 case EFFECT_OPTION:
3596 {
3597 OnigOptionType options = reg->options;
3598 reg->options = NEFFECT(node).option;
3599 r = setup_tree(NEFFECT(node).target, reg, state, env);
3600 reg->options = options;
3601 }
3602 break;
3603
3604 case EFFECT_MEMORY:
3605 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) {
3606 BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
3607 /* SET_EFFECT_STATUS(node, NST_MEM_IN_ALT_NOT); */
3608 }
3609 r = setup_tree(en->target, reg, state, env);
3610 break;
3611
3612 case EFFECT_STOP_BACKTRACK:
3613 {
3614 Node* target = en->target;
3615 r = setup_tree(target, reg, state, env);
3616 if (NTYPE(target) == N_QUANTIFIER) {
3617 QuantifierNode* tqn = &(NQUANTIFIER(target));
3618 if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
3619 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
3620 int qtype = NTYPE(tqn->target);
3621 if (IS_NODE_TYPE_SIMPLE(qtype))
3622 SET_EFFECT_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
3623 }
3624 }
3625 }
3626 break;
3627 }
3628 }
3629 break;
3630
3631 case N_ANCHOR:
3632 {
3633 AnchorNode* an = &(NANCHOR(node));
3634
3635 switch (an->type) {
3636 case ANCHOR_PREC_READ:
3637 r = setup_tree(an->target, reg, state, env);
3638 break;
3639 case ANCHOR_PREC_READ_NOT:
3640 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3641 break;
3642
3643 /* allowed node types in look-behind */
3644 #define ALLOWED_TYPE_IN_LB \
3645 ( N_LIST | N_ALT | N_STRING | N_CCLASS | N_CTYPE | \
3646 N_ANYCHAR | N_ANCHOR | N_EFFECT | N_QUANTIFIER | N_CALL )
3647
3648 #define ALLOWED_EFFECT_IN_LB ( EFFECT_MEMORY )
3649 #define ALLOWED_EFFECT_IN_LB_NOT 0
3650
3651 #define ALLOWED_ANCHOR_IN_LB \
3652 ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3653 #define ALLOWED_ANCHOR_IN_LB_NOT \
3654 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3655
3656 case ANCHOR_LOOK_BEHIND:
3657 {
3658 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3659 ALLOWED_EFFECT_IN_LB, ALLOWED_ANCHOR_IN_LB);
3660 if (r < 0) return r;
3661 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3662 r = setup_look_behind(node, reg, env);
3663 if (r != 0) return r;
3664 r = setup_tree(an->target, reg, state, env);
3665 }
3666 break;
3667
3668 case ANCHOR_LOOK_BEHIND_NOT:
3669 {
3670 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3671 ALLOWED_EFFECT_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
3672 if (r < 0) return r;
3673 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3674 r = setup_look_behind(node, reg, env);
3675 if (r != 0) return r;
3676 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3677 }
3678 break;
3679 }
3680 }
3681 break;
3682
3683 default:
3684 break;
3685 }
3686
3687 return r;
3688 }
3689
3690 /* set skip map for Boyer-Moor search */
3691 static int
set_bm_skip(UChar * s,UChar * end,OnigEncoding enc,UChar skip[],int ** int_skip)3692 set_bm_skip(UChar* s, UChar* end, OnigEncoding enc,
3693 UChar skip[], int** int_skip)
3694 {
3695 int i, len;
3696
3697 len = end - s;
3698 if (len < ONIG_CHAR_TABLE_SIZE) {
3699 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len;
3700
3701 for (i = 0; i < len - 1; i++)
3702 skip[s[i]] = len - 1 - i;
3703 }
3704 else {
3705 if (IS_NULL(*int_skip)) {
3706 *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
3707 if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
3708 }
3709 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len;
3710
3711 for (i = 0; i < len - 1; i++)
3712 (*int_skip)[s[i]] = len - 1 - i;
3713 }
3714 return 0;
3715 }
3716
3717 #define OPT_EXACT_MAXLEN 24
3718
3719 typedef struct {
3720 OnigDistance min; /* min byte length */
3721 OnigDistance max; /* max byte length */
3722 } MinMaxLen;
3723
3724 typedef struct {
3725 MinMaxLen mmd;
3726 OnigEncoding enc;
3727 OnigOptionType options;
3728 OnigAmbigType ambig_flag;
3729 ScanEnv* scan_env;
3730 } OptEnv;
3731
3732 typedef struct {
3733 int left_anchor;
3734 int right_anchor;
3735 } OptAncInfo;
3736
3737 typedef struct {
3738 MinMaxLen mmd; /* info position */
3739 OptAncInfo anc;
3740
3741 int reach_end;
3742 int ignore_case;
3743 int len;
3744 UChar s[OPT_EXACT_MAXLEN];
3745 } OptExactInfo;
3746
3747 typedef struct {
3748 MinMaxLen mmd; /* info position */
3749 OptAncInfo anc;
3750
3751 int value; /* weighted value */
3752 UChar map[ONIG_CHAR_TABLE_SIZE];
3753 } OptMapInfo;
3754
3755 typedef struct {
3756 MinMaxLen len;
3757
3758 OptAncInfo anc;
3759 OptExactInfo exb; /* boundary */
3760 OptExactInfo exm; /* middle */
3761 OptExactInfo expr; /* prec read (?=...) */
3762
3763 OptMapInfo map; /* boundary */
3764 } NodeOptInfo;
3765
3766
3767 static int
map_position_value(OnigEncoding enc,int i)3768 map_position_value(OnigEncoding enc, int i)
3769 {
3770 static const short int ByteValTable[] = {
3771 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
3772 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3773 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
3774 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
3775 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
3776 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
3777 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
3778 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
3779 };
3780
3781 if (i < sizeof(ByteValTable)/sizeof(ByteValTable[0])) {
3782 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
3783 return 20;
3784 else
3785 return (int )ByteValTable[i];
3786 }
3787 else
3788 return 4; /* Take it easy. */
3789 }
3790
3791 static int
distance_value(MinMaxLen * mm)3792 distance_value(MinMaxLen* mm)
3793 {
3794 /* 1000 / (min-max-dist + 1) */
3795 static const short int dist_vals[] = {
3796 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
3797 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
3798 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
3799 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
3800 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
3801 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
3802 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
3803 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
3804 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
3805 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
3806 };
3807
3808 int d;
3809
3810 if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
3811
3812 d = mm->max - mm->min;
3813 if (d < sizeof(dist_vals)/sizeof(dist_vals[0]))
3814 /* return dist_vals[d] * 16 / (mm->min + 12); */
3815 return (int )dist_vals[d];
3816 else
3817 return 1;
3818 }
3819
3820 static int
comp_distance_value(MinMaxLen * d1,MinMaxLen * d2,int v1,int v2)3821 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
3822 {
3823 if (v2 <= 0) return -1;
3824 if (v1 <= 0) return 1;
3825
3826 v1 *= distance_value(d1);
3827 v2 *= distance_value(d2);
3828
3829 if (v2 > v1) return 1;
3830 if (v2 < v1) return -1;
3831
3832 if (d2->min < d1->min) return 1;
3833 if (d2->min > d1->min) return -1;
3834 return 0;
3835 }
3836
3837 static int
is_equal_mml(MinMaxLen * a,MinMaxLen * b)3838 is_equal_mml(MinMaxLen* a, MinMaxLen* b)
3839 {
3840 return (a->min == b->min && a->max == b->max) ? 1 : 0;
3841 }
3842
3843
3844 static void
set_mml(MinMaxLen * mml,OnigDistance min,OnigDistance max)3845 set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
3846 {
3847 mml->min = min;
3848 mml->max = max;
3849 }
3850
3851 static void
clear_mml(MinMaxLen * mml)3852 clear_mml(MinMaxLen* mml)
3853 {
3854 mml->min = mml->max = 0;
3855 }
3856
3857 static void
copy_mml(MinMaxLen * to,MinMaxLen * from)3858 copy_mml(MinMaxLen* to, MinMaxLen* from)
3859 {
3860 to->min = from->min;
3861 to->max = from->max;
3862 }
3863
3864 static void
add_mml(MinMaxLen * to,MinMaxLen * from)3865 add_mml(MinMaxLen* to, MinMaxLen* from)
3866 {
3867 to->min = distance_add(to->min, from->min);
3868 to->max = distance_add(to->max, from->max);
3869 }
3870
3871 #if 0
3872 static void
3873 add_len_mml(MinMaxLen* to, OnigDistance len)
3874 {
3875 to->min = distance_add(to->min, len);
3876 to->max = distance_add(to->max, len);
3877 }
3878 #endif
3879
3880 static void
alt_merge_mml(MinMaxLen * to,MinMaxLen * from)3881 alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
3882 {
3883 if (to->min > from->min) to->min = from->min;
3884 if (to->max < from->max) to->max = from->max;
3885 }
3886
3887 static void
copy_opt_env(OptEnv * to,OptEnv * from)3888 copy_opt_env(OptEnv* to, OptEnv* from)
3889 {
3890 *to = *from;
3891 }
3892
3893 static void
clear_opt_anc_info(OptAncInfo * anc)3894 clear_opt_anc_info(OptAncInfo* anc)
3895 {
3896 anc->left_anchor = 0;
3897 anc->right_anchor = 0;
3898 }
3899
3900 static void
copy_opt_anc_info(OptAncInfo * to,OptAncInfo * from)3901 copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
3902 {
3903 *to = *from;
3904 }
3905
3906 static void
concat_opt_anc_info(OptAncInfo * to,OptAncInfo * left,OptAncInfo * right,OnigDistance left_len,OnigDistance right_len)3907 concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
3908 OnigDistance left_len, OnigDistance right_len)
3909 {
3910 clear_opt_anc_info(to);
3911
3912 to->left_anchor = left->left_anchor;
3913 if (left_len == 0) {
3914 to->left_anchor |= right->left_anchor;
3915 }
3916
3917 to->right_anchor = right->right_anchor;
3918 if (right_len == 0) {
3919 to->right_anchor |= left->right_anchor;
3920 }
3921 }
3922
3923 static int
is_left_anchor(int anc)3924 is_left_anchor(int anc)
3925 {
3926 if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
3927 anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
3928 anc == ANCHOR_PREC_READ_NOT)
3929 return 0;
3930
3931 return 1;
3932 }
3933
3934 static int
is_set_opt_anc_info(OptAncInfo * to,int anc)3935 is_set_opt_anc_info(OptAncInfo* to, int anc)
3936 {
3937 if ((to->left_anchor & anc) != 0) return 1;
3938
3939 return ((to->right_anchor & anc) != 0 ? 1 : 0);
3940 }
3941
3942 static void
add_opt_anc_info(OptAncInfo * to,int anc)3943 add_opt_anc_info(OptAncInfo* to, int anc)
3944 {
3945 if (is_left_anchor(anc))
3946 to->left_anchor |= anc;
3947 else
3948 to->right_anchor |= anc;
3949 }
3950
3951 static void
remove_opt_anc_info(OptAncInfo * to,int anc)3952 remove_opt_anc_info(OptAncInfo* to, int anc)
3953 {
3954 if (is_left_anchor(anc))
3955 to->left_anchor &= ~anc;
3956 else
3957 to->right_anchor &= ~anc;
3958 }
3959
3960 static void
alt_merge_opt_anc_info(OptAncInfo * to,OptAncInfo * add)3961 alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
3962 {
3963 to->left_anchor &= add->left_anchor;
3964 to->right_anchor &= add->right_anchor;
3965 }
3966
3967 static int
is_full_opt_exact_info(OptExactInfo * ex)3968 is_full_opt_exact_info(OptExactInfo* ex)
3969 {
3970 return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
3971 }
3972
3973 static void
clear_opt_exact_info(OptExactInfo * ex)3974 clear_opt_exact_info(OptExactInfo* ex)
3975 {
3976 clear_mml(&ex->mmd);
3977 clear_opt_anc_info(&ex->anc);
3978 ex->reach_end = 0;
3979 ex->ignore_case = 0;
3980 ex->len = 0;
3981 ex->s[0] = '\0';
3982 }
3983
3984 static void
copy_opt_exact_info(OptExactInfo * to,OptExactInfo * from)3985 copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
3986 {
3987 *to = *from;
3988 }
3989
3990 static void
concat_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OnigEncoding enc)3991 concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
3992 {
3993 int i, j, len;
3994 UChar *p, *end;
3995 OptAncInfo tanc;
3996
3997 if (! to->ignore_case && add->ignore_case) {
3998 if (to->len >= add->len) return ; /* avoid */
3999
4000 to->ignore_case = 1;
4001 }
4002
4003 p = add->s;
4004 end = p + add->len;
4005 for (i = to->len; p < end; ) {
4006 len = enc_len(enc, p);
4007 if (i + len > OPT_EXACT_MAXLEN) break;
4008 for (j = 0; j < len && p < end; j++)
4009 to->s[i++] = *p++;
4010 }
4011
4012 to->len = i;
4013 to->reach_end = (p == end ? add->reach_end : 0);
4014
4015 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4016 if (! to->reach_end) tanc.right_anchor = 0;
4017 copy_opt_anc_info(&to->anc, &tanc);
4018 }
4019
4020 static void
concat_opt_exact_info_str(OptExactInfo * to,UChar * s,UChar * end,int raw,OnigEncoding enc)4021 concat_opt_exact_info_str(OptExactInfo* to,
4022 UChar* s, UChar* end, int raw, OnigEncoding enc)
4023 {
4024 int i, j, len;
4025 UChar *p;
4026
4027 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4028 len = enc_len(enc, p);
4029 if (i + len > OPT_EXACT_MAXLEN) break;
4030 for (j = 0; j < len && p < end; j++)
4031 to->s[i++] = *p++;
4032 }
4033
4034 to->len = i;
4035 }
4036
4037 static void
alt_merge_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OptEnv * env)4038 alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4039 {
4040 int i, j, len;
4041
4042 if (add->len == 0 || to->len == 0) {
4043 clear_opt_exact_info(to);
4044 return ;
4045 }
4046
4047 if (! is_equal_mml(&to->mmd, &add->mmd)) {
4048 clear_opt_exact_info(to);
4049 return ;
4050 }
4051
4052 for (i = 0; i < to->len && i < add->len; ) {
4053 if (to->s[i] != add->s[i]) break;
4054 len = enc_len(env->enc, to->s + i);
4055
4056 for (j = 1; j < len; j++) {
4057 if (to->s[i+j] != add->s[i+j]) break;
4058 }
4059 if (j < len) break;
4060 i += len;
4061 }
4062
4063 if (! add->reach_end || i < add->len || i < to->len) {
4064 to->reach_end = 0;
4065 }
4066 to->len = i;
4067 to->ignore_case |= add->ignore_case;
4068
4069 alt_merge_opt_anc_info(&to->anc, &add->anc);
4070 if (! to->reach_end) to->anc.right_anchor = 0;
4071 }
4072
4073 static void
select_opt_exact_info(OnigEncoding enc,OptExactInfo * now,OptExactInfo * alt)4074 select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4075 {
4076 int v1, v2;
4077
4078 v1 = now->len;
4079 v2 = alt->len;
4080
4081 if (v2 == 0) {
4082 return ;
4083 }
4084 else if (v1 == 0) {
4085 copy_opt_exact_info(now, alt);
4086 return ;
4087 }
4088 else if (v1 <= 2 && v2 <= 2) {
4089 /* ByteValTable[x] is big value --> low price */
4090 v2 = map_position_value(enc, now->s[0]);
4091 v1 = map_position_value(enc, alt->s[0]);
4092
4093 if (now->len > 1) v1 += 5;
4094 if (alt->len > 1) v2 += 5;
4095 }
4096
4097 if (now->ignore_case == 0) v1 *= 2;
4098 if (alt->ignore_case == 0) v2 *= 2;
4099
4100 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4101 copy_opt_exact_info(now, alt);
4102 }
4103
4104 static void
clear_opt_map_info(OptMapInfo * map)4105 clear_opt_map_info(OptMapInfo* map)
4106 {
4107 static const OptMapInfo clean_info = {
4108 {0, 0}, {0, 0}, 0,
4109 {
4110 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4111 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4124 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4126 }
4127 };
4128
4129 xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4130 }
4131
4132 static void
copy_opt_map_info(OptMapInfo * to,OptMapInfo * from)4133 copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4134 {
4135 *to = *from;
4136 }
4137
4138 static void
add_char_opt_map_info(OptMapInfo * map,UChar c,OnigEncoding enc)4139 add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4140 {
4141 if (map->map[c] == 0) {
4142 map->map[c] = 1;
4143 map->value += map_position_value(enc, c);
4144 }
4145 }
4146
4147 static int
add_char_amb_opt_map_info(OptMapInfo * map,UChar * p,UChar * end,OnigEncoding enc,OnigAmbigType ambig_flag)4148 add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4149 OnigEncoding enc, OnigAmbigType ambig_flag)
4150 {
4151 int i, n, len;
4152 UChar buf[ONIGENC_MBC_NORMALIZE_MAXLEN];
4153 OnigCodePoint code;
4154 const OnigPairAmbigCodes* pccs;
4155 OnigAmbigType amb;
4156
4157 add_char_opt_map_info(map, p[0], enc);
4158 code = ONIGENC_MBC_TO_CODE(enc, p, end);
4159
4160 for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) {
4161 if ((amb & ambig_flag) == 0) continue;
4162
4163 n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc, amb, &pccs);
4164 for (i = 0; i < n; i++) {
4165 if (pccs[i].from == code) {
4166 len = ONIGENC_CODE_TO_MBC(enc, pccs[i].to, buf);
4167 if (len < 0) return len;
4168 add_char_opt_map_info(map, buf[0], enc);
4169 }
4170 }
4171 }
4172 return 0;
4173 }
4174
4175 static void
select_opt_map_info(OptMapInfo * now,OptMapInfo * alt)4176 select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4177 {
4178 static int z = 1<<15; /* 32768: something big value */
4179
4180 int v1, v2;
4181
4182 if (alt->value == 0) return ;
4183 if (now->value == 0) {
4184 copy_opt_map_info(now, alt);
4185 return ;
4186 }
4187
4188 v1 = z / now->value;
4189 v2 = z / alt->value;
4190 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4191 copy_opt_map_info(now, alt);
4192 }
4193
4194 static int
comp_opt_exact_or_map_info(OptExactInfo * e,OptMapInfo * m)4195 comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4196 {
4197 #define COMP_EM_BASE 20
4198 int ve, vm;
4199
4200 if (m->value <= 0) return -1;
4201
4202 ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2);
4203 vm = COMP_EM_BASE * 5 * 2 / m->value;
4204 return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4205 }
4206
4207 static void
alt_merge_opt_map_info(OnigEncoding enc,OptMapInfo * to,OptMapInfo * add)4208 alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4209 {
4210 int i, val;
4211
4212 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4213 if (to->value == 0) return ;
4214 if (add->value == 0 || to->mmd.max < add->mmd.min) {
4215 clear_opt_map_info(to);
4216 return ;
4217 }
4218
4219 alt_merge_mml(&to->mmd, &add->mmd);
4220
4221 val = 0;
4222 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4223 if (add->map[i])
4224 to->map[i] = 1;
4225
4226 if (to->map[i])
4227 val += map_position_value(enc, i);
4228 }
4229 to->value = val;
4230
4231 alt_merge_opt_anc_info(&to->anc, &add->anc);
4232 }
4233
4234 static void
set_bound_node_opt_info(NodeOptInfo * opt,MinMaxLen * mmd)4235 set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4236 {
4237 copy_mml(&(opt->exb.mmd), mmd);
4238 copy_mml(&(opt->expr.mmd), mmd);
4239 copy_mml(&(opt->map.mmd), mmd);
4240 }
4241
4242 static void
clear_node_opt_info(NodeOptInfo * opt)4243 clear_node_opt_info(NodeOptInfo* opt)
4244 {
4245 clear_mml(&opt->len);
4246 clear_opt_anc_info(&opt->anc);
4247 clear_opt_exact_info(&opt->exb);
4248 clear_opt_exact_info(&opt->exm);
4249 clear_opt_exact_info(&opt->expr);
4250 clear_opt_map_info(&opt->map);
4251 }
4252
4253 static void
copy_node_opt_info(NodeOptInfo * to,NodeOptInfo * from)4254 copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4255 {
4256 *to = *from;
4257 }
4258
4259 static void
concat_left_node_opt_info(OnigEncoding enc,NodeOptInfo * to,NodeOptInfo * add)4260 concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4261 {
4262 int exb_reach, exm_reach;
4263 OptAncInfo tanc;
4264
4265 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4266 copy_opt_anc_info(&to->anc, &tanc);
4267
4268 if (add->exb.len > 0 && to->len.max == 0) {
4269 concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4270 to->len.max, add->len.max);
4271 copy_opt_anc_info(&add->exb.anc, &tanc);
4272 }
4273
4274 if (add->map.value > 0 && to->len.max == 0) {
4275 if (add->map.mmd.max == 0)
4276 add->map.anc.left_anchor |= to->anc.left_anchor;
4277 }
4278
4279 exb_reach = to->exb.reach_end;
4280 exm_reach = to->exm.reach_end;
4281
4282 if (add->len.max != 0)
4283 to->exb.reach_end = to->exm.reach_end = 0;
4284
4285 if (add->exb.len > 0) {
4286 if (exb_reach) {
4287 concat_opt_exact_info(&to->exb, &add->exb, enc);
4288 clear_opt_exact_info(&add->exb);
4289 }
4290 else if (exm_reach) {
4291 concat_opt_exact_info(&to->exm, &add->exb, enc);
4292 clear_opt_exact_info(&add->exb);
4293 }
4294 }
4295 select_opt_exact_info(enc, &to->exm, &add->exb);
4296 select_opt_exact_info(enc, &to->exm, &add->exm);
4297
4298 if (to->expr.len > 0) {
4299 if (add->len.max > 0) {
4300 if (to->expr.len > (int )add->len.max)
4301 to->expr.len = add->len.max;
4302
4303 if (to->expr.mmd.max == 0)
4304 select_opt_exact_info(enc, &to->exb, &to->expr);
4305 else
4306 select_opt_exact_info(enc, &to->exm, &to->expr);
4307 }
4308 }
4309 else if (add->expr.len > 0) {
4310 copy_opt_exact_info(&to->expr, &add->expr);
4311 }
4312
4313 select_opt_map_info(&to->map, &add->map);
4314
4315 add_mml(&to->len, &add->len);
4316 }
4317
4318 static void
alt_merge_node_opt_info(NodeOptInfo * to,NodeOptInfo * add,OptEnv * env)4319 alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4320 {
4321 alt_merge_opt_anc_info (&to->anc, &add->anc);
4322 alt_merge_opt_exact_info(&to->exb, &add->exb, env);
4323 alt_merge_opt_exact_info(&to->exm, &add->exm, env);
4324 alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4325 alt_merge_opt_map_info(env->enc, &to->map, &add->map);
4326
4327 alt_merge_mml(&to->len, &add->len);
4328 }
4329
4330
4331 #define MAX_NODE_OPT_INFO_REF_COUNT 5
4332
4333 static int
optimize_node_left(Node * node,NodeOptInfo * opt,OptEnv * env)4334 optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4335 {
4336 int type;
4337 int r = 0;
4338
4339 clear_node_opt_info(opt);
4340 set_bound_node_opt_info(opt, &env->mmd);
4341
4342 type = NTYPE(node);
4343 switch (type) {
4344 case N_LIST:
4345 {
4346 OptEnv nenv;
4347 NodeOptInfo nopt;
4348 Node* nd = node;
4349
4350 copy_opt_env(&nenv, env);
4351 do {
4352 r = optimize_node_left(NCONS(nd).left, &nopt, &nenv);
4353 if (r == 0) {
4354 add_mml(&nenv.mmd, &nopt.len);
4355 concat_left_node_opt_info(env->enc, opt, &nopt);
4356 }
4357 } while (r == 0 && IS_NOT_NULL(nd = NCONS(nd).right));
4358 }
4359 break;
4360
4361 case N_ALT:
4362 {
4363 NodeOptInfo nopt;
4364 Node* nd = node;
4365
4366 do {
4367 r = optimize_node_left(NCONS(nd).left, &nopt, env);
4368 if (r == 0) {
4369 if (nd == node) copy_node_opt_info(opt, &nopt);
4370 else alt_merge_node_opt_info(opt, &nopt, env);
4371 }
4372 } while ((r == 0) && IS_NOT_NULL(nd = NCONS(nd).right));
4373 }
4374 break;
4375
4376 case N_STRING:
4377 {
4378 StrNode* sn = &(NSTRING(node));
4379 int slen = sn->end - sn->s;
4380 int is_raw = NSTRING_IS_RAW(node);
4381
4382 if (! NSTRING_IS_AMBIG(node)) {
4383 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4384 NSTRING_IS_RAW(node), env->enc);
4385 if (slen > 0) {
4386 add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
4387 }
4388 set_mml(&opt->len, slen, slen);
4389 }
4390 else {
4391 int n, max;
4392
4393 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4394 is_raw, env->enc);
4395 opt->exb.ignore_case = 1;
4396
4397 if (slen > 0) {
4398 r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
4399 env->enc, env->ambig_flag);
4400 if (r != 0) break;
4401 }
4402
4403 if (NSTRING_IS_AMBIG_REDUCE(node)) {
4404 n = onigenc_strlen(env->enc, sn->s, sn->end);
4405 max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
4406 }
4407 else {
4408 max = slen;
4409 }
4410 set_mml(&opt->len, slen, max);
4411 }
4412
4413 if (opt->exb.len == slen)
4414 opt->exb.reach_end = 1;
4415 }
4416 break;
4417
4418 case N_CCLASS:
4419 {
4420 int i, z;
4421 CClassNode* cc = &(NCCLASS(node));
4422
4423 /* no need to check ignore case. (setted in setup_tree()) */
4424
4425 if (IS_NOT_NULL(cc->mbuf) || IS_CCLASS_NOT(cc)) {
4426 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4427 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4428
4429 set_mml(&opt->len, min, max);
4430 }
4431 else {
4432 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4433 z = BITSET_AT(cc->bs, i);
4434 if ((z && !IS_CCLASS_NOT(cc)) || (!z && IS_CCLASS_NOT(cc))) {
4435 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4436 }
4437 }
4438 set_mml(&opt->len, 1, 1);
4439 }
4440 }
4441 break;
4442
4443 case N_CTYPE:
4444 {
4445 int i, min, max;
4446
4447 max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4448
4449 if (max == 1) {
4450 min = 1;
4451
4452 switch (NCTYPE(node).type) {
4453 case CTYPE_NOT_WORD:
4454 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4455 if (! ONIGENC_IS_CODE_WORD(env->enc, i)) {
4456 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4457 }
4458 }
4459 break;
4460
4461 case CTYPE_WORD:
4462 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4463 if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
4464 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4465 }
4466 }
4467 break;
4468 }
4469 }
4470 else {
4471 min = ONIGENC_MBC_MINLEN(env->enc);
4472 }
4473 set_mml(&opt->len, min, max);
4474 }
4475 break;
4476
4477 case N_ANYCHAR:
4478 {
4479 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4480 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4481 set_mml(&opt->len, min, max);
4482 }
4483 break;
4484
4485 case N_ANCHOR:
4486 switch (NANCHOR(node).type) {
4487 case ANCHOR_BEGIN_BUF:
4488 case ANCHOR_BEGIN_POSITION:
4489 case ANCHOR_BEGIN_LINE:
4490 case ANCHOR_END_BUF:
4491 case ANCHOR_SEMI_END_BUF:
4492 case ANCHOR_END_LINE:
4493 add_opt_anc_info(&opt->anc, NANCHOR(node).type);
4494 break;
4495
4496 case ANCHOR_PREC_READ:
4497 {
4498 NodeOptInfo nopt;
4499
4500 r = optimize_node_left(NANCHOR(node).target, &nopt, env);
4501 if (r == 0) {
4502 if (nopt.exb.len > 0)
4503 copy_opt_exact_info(&opt->expr, &nopt.exb);
4504 else if (nopt.exm.len > 0)
4505 copy_opt_exact_info(&opt->expr, &nopt.exm);
4506
4507 opt->expr.reach_end = 0;
4508
4509 if (nopt.map.value > 0)
4510 copy_opt_map_info(&opt->map, &nopt.map);
4511 }
4512 }
4513 break;
4514
4515 case ANCHOR_PREC_READ_NOT:
4516 case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. */
4517 case ANCHOR_LOOK_BEHIND_NOT:
4518 break;
4519 }
4520 break;
4521
4522 case N_BACKREF:
4523 {
4524 int i;
4525 int* backs;
4526 OnigDistance min, max, tmin, tmax;
4527 Node** nodes = SCANENV_MEM_NODES(env->scan_env);
4528 BackrefNode* br = &(NBACKREF(node));
4529
4530 if (br->state & NST_RECURSION) {
4531 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4532 break;
4533 }
4534 backs = BACKREFS_P(br);
4535 r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
4536 if (r != 0) break;
4537 r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
4538 if (r != 0) break;
4539 for (i = 1; i < br->back_num; i++) {
4540 r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
4541 if (r != 0) break;
4542 r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
4543 if (r != 0) break;
4544 if (min > tmin) min = tmin;
4545 if (max < tmax) max = tmax;
4546 }
4547 if (r == 0) set_mml(&opt->len, min, max);
4548 }
4549 break;
4550
4551 #ifdef USE_SUBEXP_CALL
4552 case N_CALL:
4553 if (IS_CALL_RECURSION(&(NCALL(node))))
4554 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4555 else {
4556 OnigOptionType save = env->options;
4557 env->options = NEFFECT(NCALL(node).target).option;
4558 r = optimize_node_left(NCALL(node).target, opt, env);
4559 env->options = save;
4560 }
4561 break;
4562 #endif
4563
4564 case N_QUANTIFIER:
4565 {
4566 int i;
4567 OnigDistance min, max;
4568 NodeOptInfo nopt;
4569 QuantifierNode* qn = &(NQUANTIFIER(node));
4570
4571 r = optimize_node_left(qn->target, &nopt, env);
4572 if (r) break;
4573
4574 if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
4575 if (env->mmd.max == 0 &&
4576 NTYPE(qn->target) == N_ANYCHAR && qn->greedy) {
4577 if (IS_MULTILINE(env->options))
4578 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
4579 else
4580 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
4581 }
4582 }
4583 else {
4584 if (qn->lower > 0) {
4585 copy_node_opt_info(opt, &nopt);
4586 if (nopt.exb.len > 0) {
4587 if (nopt.exb.reach_end) {
4588 for (i = 2; i < qn->lower &&
4589 ! is_full_opt_exact_info(&opt->exb); i++) {
4590 concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
4591 }
4592 if (i < qn->lower) {
4593 opt->exb.reach_end = 0;
4594 }
4595 }
4596 }
4597
4598 if (qn->lower != qn->upper) {
4599 opt->exb.reach_end = 0;
4600 opt->exm.reach_end = 0;
4601 }
4602 if (qn->lower > 1)
4603 opt->exm.reach_end = 0;
4604 }
4605 }
4606
4607 min = distance_multiply(nopt.len.min, qn->lower);
4608 if (IS_REPEAT_INFINITE(qn->upper))
4609 max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
4610 else
4611 max = distance_multiply(nopt.len.max, qn->upper);
4612
4613 set_mml(&opt->len, min, max);
4614 }
4615 break;
4616
4617 case N_EFFECT:
4618 {
4619 EffectNode* en = &(NEFFECT(node));
4620
4621 switch (en->type) {
4622 case EFFECT_OPTION:
4623 {
4624 OnigOptionType save = env->options;
4625
4626 env->options = en->option;
4627 r = optimize_node_left(en->target, opt, env);
4628 env->options = save;
4629 }
4630 break;
4631
4632 case EFFECT_MEMORY:
4633 #ifdef USE_SUBEXP_CALL
4634 en->opt_count++;
4635 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
4636 OnigDistance min, max;
4637
4638 min = 0;
4639 max = ONIG_INFINITE_DISTANCE;
4640 if (IS_EFFECT_MIN_FIXED(en)) min = en->min_len;
4641 if (IS_EFFECT_MAX_FIXED(en)) max = en->max_len;
4642 set_mml(&opt->len, min, max);
4643 }
4644 else
4645 #endif
4646 {
4647 r = optimize_node_left(en->target, opt, env);
4648
4649 if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
4650 if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
4651 remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
4652 }
4653 }
4654 break;
4655
4656 case EFFECT_STOP_BACKTRACK:
4657 r = optimize_node_left(en->target, opt, env);
4658 break;
4659 }
4660 }
4661 break;
4662
4663 default:
4664 #ifdef ONIG_DEBUG
4665 fprintf(stderr, "optimize_node_left: undefined node type %d\n",
4666 NTYPE(node));
4667 #endif
4668 r = ONIGERR_TYPE_BUG;
4669 break;
4670 }
4671
4672 return r;
4673 }
4674
4675 static int
set_optimize_exact_info(regex_t * reg,OptExactInfo * e)4676 set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
4677 {
4678 int r;
4679
4680 if (e->len == 0) return 0;
4681
4682 if (e->ignore_case) {
4683 reg->exact = (UChar* )xmalloc(e->len);
4684 CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY);
4685 xmemcpy(reg->exact, e->s, e->len);
4686 reg->exact_end = reg->exact + e->len;
4687 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
4688 }
4689 else {
4690 int allow_reverse;
4691
4692 reg->exact = k_strdup(e->s, e->s + e->len);
4693 CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY);
4694 reg->exact_end = reg->exact + e->len;
4695
4696 allow_reverse =
4697 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
4698
4699 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
4700 r = set_bm_skip(reg->exact, reg->exact_end, reg->enc,
4701 reg->map, &(reg->int_map));
4702 if (r) return r;
4703
4704 reg->optimize = (allow_reverse != 0
4705 ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
4706 }
4707 else {
4708 reg->optimize = ONIG_OPTIMIZE_EXACT;
4709 }
4710 }
4711
4712 reg->dmin = e->mmd.min;
4713 reg->dmax = e->mmd.max;
4714
4715 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4716 reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact);
4717 }
4718
4719 return 0;
4720 }
4721
4722 static void
set_optimize_map_info(regex_t * reg,OptMapInfo * m)4723 set_optimize_map_info(regex_t* reg, OptMapInfo* m)
4724 {
4725 int i;
4726
4727 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4728 reg->map[i] = m->map[i];
4729
4730 reg->optimize = ONIG_OPTIMIZE_MAP;
4731 reg->dmin = m->mmd.min;
4732 reg->dmax = m->mmd.max;
4733
4734 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4735 reg->threshold_len = reg->dmin + 1;
4736 }
4737 }
4738
4739 static void
set_sub_anchor(regex_t * reg,OptAncInfo * anc)4740 set_sub_anchor(regex_t* reg, OptAncInfo* anc)
4741 {
4742 reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
4743 reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
4744 }
4745
4746 #ifdef ONIG_DEBUG
4747 static void print_optimize_info(FILE* f, regex_t* reg);
4748 #endif
4749
4750 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ScanEnv * scan_env)4751 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
4752 {
4753
4754 int r;
4755 NodeOptInfo opt;
4756 OptEnv env;
4757
4758 env.enc = reg->enc;
4759 env.options = reg->options;
4760 env.ambig_flag = reg->ambig_flag;
4761 env.scan_env = scan_env;
4762 clear_mml(&env.mmd);
4763
4764 r = optimize_node_left(node, &opt, &env);
4765 if (r) return r;
4766
4767 reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
4768 ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML);
4769
4770 reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF);
4771
4772 if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
4773 reg->anchor_dmin = opt.len.min;
4774 reg->anchor_dmax = opt.len.max;
4775 }
4776
4777 if (opt.exb.len > 0 || opt.exm.len > 0) {
4778 select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
4779 if (opt.map.value > 0 &&
4780 comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
4781 goto set_map;
4782 }
4783 else {
4784 r = set_optimize_exact_info(reg, &opt.exb);
4785 set_sub_anchor(reg, &opt.exb.anc);
4786 }
4787 }
4788 else if (opt.map.value > 0) {
4789 set_map:
4790 set_optimize_map_info(reg, &opt.map);
4791 set_sub_anchor(reg, &opt.map.anc);
4792 }
4793 else {
4794 reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
4795 if (opt.len.max == 0)
4796 reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
4797 }
4798
4799 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
4800 print_optimize_info(stderr, reg);
4801 #endif
4802 return r;
4803 }
4804
4805 static void
clear_optimize_info(regex_t * reg)4806 clear_optimize_info(regex_t* reg)
4807 {
4808 reg->optimize = ONIG_OPTIMIZE_NONE;
4809 reg->anchor = 0;
4810 reg->anchor_dmin = 0;
4811 reg->anchor_dmax = 0;
4812 reg->sub_anchor = 0;
4813 reg->exact_end = (UChar* )NULL;
4814 reg->threshold_len = 0;
4815 if (IS_NOT_NULL(reg->exact)) {
4816 xfree(reg->exact);
4817 reg->exact = (UChar* )NULL;
4818 }
4819 }
4820
4821 #ifdef ONIG_DEBUG
4822
print_enc_string(FILE * fp,OnigEncoding enc,const UChar * s,const UChar * end)4823 static void print_enc_string(FILE* fp, OnigEncoding enc,
4824 const UChar *s, const UChar *end)
4825 {
4826 fprintf(fp, "\nPATTERN: /");
4827
4828 if (ONIGENC_MBC_MINLEN(enc) > 1) {
4829 const UChar *p;
4830 OnigCodePoint code;
4831
4832 p = s;
4833 while (p < end) {
4834 code = ONIGENC_MBC_TO_CODE(enc, p, end);
4835 if (code >= 0x80) {
4836 fprintf(fp, " 0x%04x ", (int )code);
4837 }
4838 else {
4839 fputc((int )code, fp);
4840 }
4841
4842 p += enc_len(enc, p);
4843 }
4844 }
4845 else {
4846 while (s < end) {
4847 fputc((int )*s, fp);
4848 s++;
4849 }
4850 }
4851
4852 fprintf(fp, "/\n");
4853 }
4854
4855 static void
print_distance_range(FILE * f,OnigDistance a,OnigDistance b)4856 print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
4857 {
4858 if (a == ONIG_INFINITE_DISTANCE)
4859 fputs("inf", f);
4860 else
4861 fprintf(f, "(%u)", a);
4862
4863 fputs("-", f);
4864
4865 if (b == ONIG_INFINITE_DISTANCE)
4866 fputs("inf", f);
4867 else
4868 fprintf(f, "(%u)", b);
4869 }
4870
4871 static void
print_anchor(FILE * f,int anchor)4872 print_anchor(FILE* f, int anchor)
4873 {
4874 int q = 0;
4875
4876 fprintf(f, "[");
4877
4878 if (anchor & ANCHOR_BEGIN_BUF) {
4879 fprintf(f, "begin-buf");
4880 q = 1;
4881 }
4882 if (anchor & ANCHOR_BEGIN_LINE) {
4883 if (q) fprintf(f, ", ");
4884 q = 1;
4885 fprintf(f, "begin-line");
4886 }
4887 if (anchor & ANCHOR_BEGIN_POSITION) {
4888 if (q) fprintf(f, ", ");
4889 q = 1;
4890 fprintf(f, "begin-pos");
4891 }
4892 if (anchor & ANCHOR_END_BUF) {
4893 if (q) fprintf(f, ", ");
4894 q = 1;
4895 fprintf(f, "end-buf");
4896 }
4897 if (anchor & ANCHOR_SEMI_END_BUF) {
4898 if (q) fprintf(f, ", ");
4899 q = 1;
4900 fprintf(f, "semi-end-buf");
4901 }
4902 if (anchor & ANCHOR_END_LINE) {
4903 if (q) fprintf(f, ", ");
4904 q = 1;
4905 fprintf(f, "end-line");
4906 }
4907 if (anchor & ANCHOR_ANYCHAR_STAR) {
4908 if (q) fprintf(f, ", ");
4909 q = 1;
4910 fprintf(f, "anychar-star");
4911 }
4912 if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
4913 if (q) fprintf(f, ", ");
4914 fprintf(f, "anychar-star-pl");
4915 }
4916
4917 fprintf(f, "]");
4918 }
4919
4920 static void
print_optimize_info(FILE * f,regex_t * reg)4921 print_optimize_info(FILE* f, regex_t* reg)
4922 {
4923 static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
4924 "EXACT_IC", "MAP" };
4925
4926 fprintf(f, "optimize: %s\n", on[reg->optimize]);
4927 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
4928 if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
4929 print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
4930 fprintf(f, "\n");
4931
4932 if (reg->optimize) {
4933 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
4934 fprintf(f, "\n");
4935 }
4936 fprintf(f, "\n");
4937
4938 if (reg->exact) {
4939 UChar *p;
4940 fprintf(f, "exact: [");
4941 for (p = reg->exact; p < reg->exact_end; p++) {
4942 fputc(*p, f);
4943 }
4944 fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact));
4945 }
4946 else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
4947 int c, i, n = 0;
4948
4949 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4950 if (reg->map[i]) n++;
4951
4952 fprintf(f, "map: n=%d\n", n);
4953 if (n > 0) {
4954 c = 0;
4955 fputc('[', f);
4956 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4957 if (reg->map[i] != 0) {
4958 if (c > 0) fputs(", ", f);
4959 c++;
4960 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
4961 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
4962 fputc(i, f);
4963 else
4964 fprintf(f, "%d", i);
4965 }
4966 }
4967 fprintf(f, "]\n");
4968 }
4969 }
4970 }
4971 #endif /* ONIG_DEBUG */
4972
4973
4974 static void
onig_free_body(regex_t * reg)4975 onig_free_body(regex_t* reg)
4976 {
4977 if (IS_NOT_NULL(reg->p)) xfree(reg->p);
4978 if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
4979 if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map);
4980 if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
4981 if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
4982 if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain);
4983
4984 #ifdef USE_NAMED_GROUP
4985 onig_names_free(reg);
4986 #endif
4987 }
4988
4989 extern void
onig_free(regex_t * reg)4990 onig_free(regex_t* reg)
4991 {
4992 if (IS_NOT_NULL(reg)) {
4993 onig_free_body(reg);
4994 xfree(reg);
4995 }
4996 }
4997
4998 #define REGEX_TRANSFER(to,from) do {\
4999 (to)->state = ONIG_STATE_MODIFY;\
5000 onig_free_body(to);\
5001 xmemcpy(to, from, sizeof(regex_t));\
5002 xfree(from);\
5003 } while (0)
5004
5005 extern void
onig_transfer(regex_t * to,regex_t * from)5006 onig_transfer(regex_t* to, regex_t* from)
5007 {
5008 THREAD_ATOMIC_START;
5009 REGEX_TRANSFER(to, from);
5010 THREAD_ATOMIC_END;
5011 }
5012
5013 #define REGEX_CHAIN_HEAD(reg) do {\
5014 while (IS_NOT_NULL((reg)->chain)) {\
5015 (reg) = (reg)->chain;\
5016 }\
5017 } while (0)
5018
5019 extern void
onig_chain_link_add(regex_t * to,regex_t * add)5020 onig_chain_link_add(regex_t* to, regex_t* add)
5021 {
5022 THREAD_ATOMIC_START;
5023 REGEX_CHAIN_HEAD(to);
5024 to->chain = add;
5025 THREAD_ATOMIC_END;
5026 }
5027
5028 extern void
onig_chain_reduce(regex_t * reg)5029 onig_chain_reduce(regex_t* reg)
5030 {
5031 regex_t *head, *prev;
5032
5033 prev = reg;
5034 head = prev->chain;
5035 if (IS_NOT_NULL(head)) {
5036 reg->state = ONIG_STATE_MODIFY;
5037 while (IS_NOT_NULL(head->chain)) {
5038 prev = head;
5039 head = head->chain;
5040 }
5041 prev->chain = (regex_t* )NULL;
5042 REGEX_TRANSFER(reg, head);
5043 }
5044 }
5045
5046 #if 0
5047 extern int
5048 onig_clone(regex_t** to, regex_t* from)
5049 {
5050 int r, size;
5051 regex_t* reg;
5052
5053 #ifdef USE_MULTI_THREAD_SYSTEM
5054 if (ONIG_STATE(from) >= ONIG_STATE_NORMAL) {
5055 ONIG_STATE_INC(from);
5056 if (IS_NOT_NULL(from->chain) && ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
5057 onig_chain_reduce(from);
5058 ONIG_STATE_INC(from);
5059 }
5060 }
5061 else {
5062 int n = 0;
5063 while (ONIG_STATE(from) < ONIG_STATE_NORMAL) {
5064 if (++n > THREAD_PASS_LIMIT_COUNT)
5065 return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT;
5066 THREAD_PASS;
5067 }
5068 ONIG_STATE_INC(from);
5069 }
5070 #endif /* USE_MULTI_THREAD_SYSTEM */
5071
5072 r = onig_alloc_init(®, ONIG_OPTION_NONE, ONIGENC_AMBIGUOUS_MATCH_DEFAULT,
5073 from->enc, ONIG_SYNTAX_DEFAULT);
5074 if (r != 0) {
5075 ONIG_STATE_DEC(from);
5076 return r;
5077 }
5078
5079 xmemcpy(reg, from, sizeof(onig_t));
5080 reg->chain = (regex_t* )NULL;
5081 reg->state = ONIG_STATE_NORMAL;
5082
5083 if (from->p) {
5084 reg->p = (UChar* )xmalloc(reg->alloc);
5085 if (IS_NULL(reg->p)) goto mem_error;
5086 xmemcpy(reg->p, from->p, reg->alloc);
5087 }
5088
5089 if (from->exact) {
5090 reg->exact = (UChar* )xmalloc(from->exact_end - from->exact);
5091 if (IS_NULL(reg->exact)) goto mem_error;
5092 reg->exact_end = reg->exact + (from->exact_end - from->exact);
5093 xmemcpy(reg->exact, from->exact, reg->exact_end - reg->exact);
5094 }
5095
5096 if (from->int_map) {
5097 size = sizeof(int) * ONIG_CHAR_TABLE_SIZE;
5098 reg->int_map = (int* )xmalloc(size);
5099 if (IS_NULL(reg->int_map)) goto mem_error;
5100 xmemcpy(reg->int_map, from->int_map, size);
5101 }
5102
5103 if (from->int_map_backward) {
5104 size = sizeof(int) * ONIG_CHAR_TABLE_SIZE;
5105 reg->int_map_backward = (int* )xmalloc(size);
5106 if (IS_NULL(reg->int_map_backward)) goto mem_error;
5107 xmemcpy(reg->int_map_backward, from->int_map_backward, size);
5108 }
5109
5110 #ifdef USE_NAMED_GROUP
5111 reg->name_table = names_clone(from); /* names_clone is not implemented */
5112 #endif
5113
5114 ONIG_STATE_DEC(from);
5115 *to = reg;
5116 return 0;
5117
5118 mem_error:
5119 ONIG_STATE_DEC(from);
5120 return ONIGERR_MEMORY;
5121 }
5122 #endif
5123
5124 #ifdef ONIG_DEBUG
5125 static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg));
5126 #endif
5127 #ifdef ONIG_DEBUG_PARSE_TREE
5128 static void print_tree P_((FILE* f, Node* node));
5129 #endif
5130
5131 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)5132 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5133 OnigErrorInfo* einfo)
5134 {
5135 #define COMPILE_INIT_SIZE 20
5136
5137 int r, init_size;
5138 Node* root;
5139 ScanEnv scan_env;
5140 #ifdef USE_SUBEXP_CALL
5141 UnsetAddrList uslist;
5142 #endif
5143
5144 reg->state = ONIG_STATE_COMPILING;
5145
5146 #ifdef ONIG_DEBUG
5147 print_enc_string(stderr, reg->enc, pattern, pattern_end);
5148 #endif
5149
5150 if (reg->alloc == 0) {
5151 init_size = (pattern_end - pattern) * 2;
5152 if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5153 r = BBUF_INIT(reg, init_size);
5154 if (r != 0) goto end;
5155 }
5156 else
5157 reg->used = 0;
5158
5159 reg->num_mem = 0;
5160 reg->num_repeat = 0;
5161 reg->num_null_check = 0;
5162 reg->repeat_range_alloc = 0;
5163 reg->repeat_range = (OnigRepeatRange* )NULL;
5164 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5165 reg->num_comb_exp_check = 0;
5166 #endif
5167
5168 r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5169 if (r != 0) goto err;
5170
5171 #ifdef USE_NAMED_GROUP
5172 /* mixed use named group and no-named group */
5173 if (scan_env.num_named > 0 &&
5174 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5175 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5176 if (scan_env.num_named != scan_env.num_mem)
5177 r = disable_noname_group_capture(&root, reg, &scan_env);
5178 else
5179 r = numbered_ref_check(root);
5180
5181 if (r != 0) goto err;
5182 }
5183 #endif
5184
5185 #ifdef ONIG_DEBUG_PARSE_TREE
5186 print_tree(stderr, root);
5187 #endif
5188
5189 #ifdef USE_SUBEXP_CALL
5190 if (scan_env.num_call > 0) {
5191 r = unset_addr_list_init(&uslist, scan_env.num_call);
5192 if (r != 0) goto err;
5193 scan_env.unset_addr_list = &uslist;
5194 r = setup_subexp_call(root, &scan_env);
5195 if (r != 0) goto err_unset;
5196 r = subexp_recursive_check_trav(root, &scan_env);
5197 if (r < 0) goto err_unset;
5198 r = subexp_inf_recursive_check_trav(root, &scan_env);
5199 if (r != 0) goto err_unset;
5200
5201 reg->num_call = scan_env.num_call;
5202 }
5203 else
5204 reg->num_call = 0;
5205 #endif
5206
5207 r = setup_tree(root, reg, 0, &scan_env);
5208 if (r != 0) goto err_unset;
5209
5210 reg->capture_history = scan_env.capture_history;
5211 reg->bt_mem_start = scan_env.bt_mem_start;
5212 reg->bt_mem_start |= reg->capture_history;
5213 if (IS_FIND_CONDITION(reg->options))
5214 BIT_STATUS_ON_ALL(reg->bt_mem_end);
5215 else {
5216 reg->bt_mem_end = scan_env.bt_mem_end;
5217 reg->bt_mem_end |= reg->capture_history;
5218 }
5219
5220 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5221 if (scan_env.backrefed_mem == 0
5222 #ifdef USE_SUBEXP_CALL
5223 || scan_env.num_call == 0
5224 #endif
5225 ) {
5226 setup_comb_exp_check(root, 0, &scan_env);
5227 #ifdef USE_SUBEXP_CALL
5228 if (scan_env.has_recursion != 0) {
5229 scan_env.num_comb_exp_check = 0;
5230 }
5231 else
5232 #endif
5233 if (scan_env.comb_exp_max_regnum > 0) {
5234 int i;
5235 for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5236 if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5237 scan_env.num_comb_exp_check = 0;
5238 break;
5239 }
5240 }
5241 }
5242 }
5243
5244 reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5245 #endif
5246
5247 clear_optimize_info(reg);
5248 #ifndef ONIG_DONT_OPTIMIZE
5249 r = set_optimize_info_from_tree(root, reg, &scan_env);
5250 if (r != 0) goto err_unset;
5251 #endif
5252
5253 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5254 xfree(scan_env.mem_nodes_dynamic);
5255 scan_env.mem_nodes_dynamic = (Node** )NULL;
5256 }
5257
5258 r = compile_tree(root, reg);
5259 if (r == 0) {
5260 r = add_opcode(reg, OP_END);
5261 #ifdef USE_SUBEXP_CALL
5262 if (scan_env.num_call > 0) {
5263 r = unset_addr_list_fix(&uslist, reg);
5264 unset_addr_list_end(&uslist);
5265 if (r) goto err;
5266 }
5267 #endif
5268
5269 if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5270 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5271 else {
5272 if (reg->bt_mem_start != 0)
5273 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5274 else
5275 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5276 }
5277 }
5278 #ifdef USE_SUBEXP_CALL
5279 else if (scan_env.num_call > 0) {
5280 unset_addr_list_end(&uslist);
5281 }
5282 #endif
5283 onig_node_free(root);
5284
5285 #ifdef ONIG_DEBUG_COMPILE
5286 #ifdef USE_NAMED_GROUP
5287 onig_print_names(stderr, reg);
5288 #endif
5289 print_compiled_byte_code_list(stderr, reg);
5290 #endif
5291
5292 end:
5293 reg->state = ONIG_STATE_NORMAL;
5294 return r;
5295
5296 err_unset:
5297 #ifdef USE_SUBEXP_CALL
5298 if (scan_env.num_call > 0) {
5299 unset_addr_list_end(&uslist);
5300 }
5301 #endif
5302 err:
5303 if (IS_NOT_NULL(scan_env.error)) {
5304 if (IS_NOT_NULL(einfo)) {
5305 einfo->enc = scan_env.enc;
5306 einfo->par = scan_env.error;
5307 einfo->par_end = scan_env.error_end;
5308 }
5309 }
5310
5311 if (IS_NOT_NULL(root)) onig_node_free(root);
5312 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
5313 xfree(scan_env.mem_nodes_dynamic);
5314 return r;
5315 }
5316
5317 #ifdef USE_RECOMPILE_API
5318 extern int
onig_recompile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5319 onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5320 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5321 OnigErrorInfo* einfo)
5322 {
5323 int r;
5324 regex_t *new_reg;
5325
5326 r = onig_new(&new_reg, pattern, pattern_end, option, enc, syntax, einfo);
5327 if (r) return r;
5328 if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
5329 onig_transfer(reg, new_reg);
5330 }
5331 else {
5332 onig_chain_link_add(reg, new_reg);
5333 }
5334 return 0;
5335 }
5336 #endif
5337
5338 static int onig_inited = 0;
5339
5340 extern int
onig_alloc_init(regex_t ** reg,OnigOptionType option,OnigAmbigType ambig_flag,OnigEncoding enc,OnigSyntaxType * syntax)5341 onig_alloc_init(regex_t** reg, OnigOptionType option, OnigAmbigType ambig_flag,
5342 OnigEncoding enc, OnigSyntaxType* syntax)
5343 {
5344 if (! onig_inited)
5345 onig_init();
5346
5347 if (ONIGENC_IS_UNDEF(enc))
5348 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
5349
5350 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5351 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5352 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5353 }
5354
5355 *reg = (regex_t* )xmalloc(sizeof(regex_t));
5356 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
5357 (*reg)->state = ONIG_STATE_MODIFY;
5358
5359 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5360 option |= syntax->options;
5361 option &= ~ONIG_OPTION_SINGLELINE;
5362 }
5363 else
5364 option |= syntax->options;
5365
5366 (*reg)->enc = enc;
5367 (*reg)->options = option;
5368 (*reg)->syntax = syntax;
5369 (*reg)->optimize = 0;
5370 (*reg)->exact = (UChar* )NULL;
5371 (*reg)->int_map = (int* )NULL;
5372 (*reg)->int_map_backward = (int* )NULL;
5373 (*reg)->chain = (regex_t* )NULL;
5374
5375 (*reg)->p = (UChar* )NULL;
5376 (*reg)->alloc = 0;
5377 (*reg)->used = 0;
5378 (*reg)->name_table = (void* )NULL;
5379
5380 (*reg)->ambig_flag = ambig_flag;
5381 (*reg)->ambig_flag &= ONIGENC_SUPPORT_AMBIG_FLAG(enc);
5382
5383 return 0;
5384 }
5385
5386 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5387 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5388 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5389 OnigErrorInfo* einfo)
5390 {
5391 int r;
5392
5393 if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5394
5395 r = onig_alloc_init(reg, option, ONIGENC_AMBIGUOUS_MATCH_DEFAULT,
5396 enc, syntax);
5397 if (r) return r;
5398
5399 r = onig_compile(*reg, pattern, pattern_end, einfo);
5400 if (r) {
5401 onig_free(*reg);
5402 *reg = NULL;
5403 }
5404 return r;
5405 }
5406
5407 extern int
onig_init(void)5408 onig_init(void)
5409 {
5410 if (onig_inited != 0)
5411 return 0;
5412
5413 onig_inited = 1;
5414
5415 THREAD_SYSTEM_INIT;
5416 THREAD_ATOMIC_START;
5417
5418 onigenc_init();
5419 onigenc_set_default_caseconv_table((UChar* )0);
5420
5421 #ifdef ONIG_DEBUG_STATISTICS
5422 onig_statistics_init();
5423 #endif
5424
5425 THREAD_ATOMIC_END;
5426 return 0;
5427 }
5428
5429
5430 extern int
onig_end(void)5431 onig_end(void)
5432 {
5433 extern int onig_free_shared_cclass_table(void);
5434
5435 THREAD_ATOMIC_START;
5436
5437 #ifdef ONIG_DEBUG_STATISTICS
5438 onig_print_statistics(stderr);
5439 #endif
5440
5441 #ifdef USE_SHARED_CCLASS_TABLE
5442 onig_free_shared_cclass_table();
5443 #endif
5444
5445 #ifdef USE_RECYCLE_NODE
5446 onig_free_node_list();
5447 #endif
5448
5449 onig_inited = 0;
5450
5451 THREAD_ATOMIC_END;
5452 THREAD_SYSTEM_END;
5453 return 0;
5454 }
5455
5456
5457 #ifdef ONIG_DEBUG
5458
5459 /* arguments type */
5460 #define ARG_SPECIAL -1
5461 #define ARG_NON 0
5462 #define ARG_RELADDR 1
5463 #define ARG_ABSADDR 2
5464 #define ARG_LENGTH 3
5465 #define ARG_MEMNUM 4
5466 #define ARG_OPTION 5
5467 #define ARG_STATE_CHECK 6
5468
5469 OnigOpInfoType OnigOpInfo[] = {
5470 { OP_FINISH, "finish", ARG_NON },
5471 { OP_END, "end", ARG_NON },
5472 { OP_EXACT1, "exact1", ARG_SPECIAL },
5473 { OP_EXACT2, "exact2", ARG_SPECIAL },
5474 { OP_EXACT3, "exact3", ARG_SPECIAL },
5475 { OP_EXACT4, "exact4", ARG_SPECIAL },
5476 { OP_EXACT5, "exact5", ARG_SPECIAL },
5477 { OP_EXACTN, "exactn", ARG_SPECIAL },
5478 { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
5479 { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
5480 { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
5481 { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
5482 { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
5483 { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
5484 { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
5485 { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
5486 { OP_CCLASS, "cclass", ARG_SPECIAL },
5487 { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
5488 { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
5489 { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
5490 { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
5491 { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
5492 { OP_CCLASS_NODE, "cclass-node", ARG_SPECIAL },
5493 { OP_ANYCHAR, "anychar", ARG_NON },
5494 { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
5495 { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
5496 { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
5497 { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
5498 { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
5499 { OP_WORD, "word", ARG_NON },
5500 { OP_NOT_WORD, "not-word", ARG_NON },
5501 { OP_WORD_BOUND, "word-bound", ARG_NON },
5502 { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
5503 { OP_WORD_BEGIN, "word-begin", ARG_NON },
5504 { OP_WORD_END, "word-end", ARG_NON },
5505 { OP_BEGIN_BUF, "begin-buf", ARG_NON },
5506 { OP_END_BUF, "end-buf", ARG_NON },
5507 { OP_BEGIN_LINE, "begin-line", ARG_NON },
5508 { OP_END_LINE, "end-line", ARG_NON },
5509 { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
5510 { OP_BEGIN_POSITION, "begin-position", ARG_NON },
5511 { OP_BACKREF1, "backref1", ARG_NON },
5512 { OP_BACKREF2, "backref2", ARG_NON },
5513 { OP_BACKREFN, "backrefn", ARG_MEMNUM },
5514 { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
5515 { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
5516 { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
5517 { OP_BACKREF_AT_LEVEL, "backref_at_level", ARG_SPECIAL },
5518 { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
5519 { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
5520 { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
5521 { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
5522 { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
5523 { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
5524 { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
5525 { OP_SET_OPTION, "set-option", ARG_OPTION },
5526 { OP_FAIL, "fail", ARG_NON },
5527 { OP_JUMP, "jump", ARG_RELADDR },
5528 { OP_PUSH, "push", ARG_RELADDR },
5529 { OP_POP, "pop", ARG_NON },
5530 { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
5531 { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
5532 { OP_REPEAT, "repeat", ARG_SPECIAL },
5533 { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
5534 { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
5535 { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
5536 { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
5537 { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
5538 { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
5539 { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
5540 { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
5541 { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
5542 { OP_PUSH_POS, "push-pos", ARG_NON },
5543 { OP_POP_POS, "pop-pos", ARG_NON },
5544 { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
5545 { OP_FAIL_POS, "fail-pos", ARG_NON },
5546 { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
5547 { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
5548 { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
5549 { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
5550 { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
5551 { OP_CALL, "call", ARG_ABSADDR },
5552 { OP_RETURN, "return", ARG_NON },
5553 { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
5554 { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
5555 { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
5556 { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
5557 { OP_STATE_CHECK_ANYCHAR_ML_STAR,
5558 "state-check-anychar-ml*", ARG_STATE_CHECK },
5559 { -1, "", ARG_NON }
5560 };
5561
5562 static char*
op2name(int opcode)5563 op2name(int opcode)
5564 {
5565 int i;
5566
5567 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5568 if (opcode == OnigOpInfo[i].opcode)
5569 return OnigOpInfo[i].name;
5570 }
5571 return "";
5572 }
5573
5574 static int
op2arg_type(int opcode)5575 op2arg_type(int opcode)
5576 {
5577 int i;
5578
5579 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5580 if (opcode == OnigOpInfo[i].opcode)
5581 return OnigOpInfo[i].arg_type;
5582 }
5583 return ARG_SPECIAL;
5584 }
5585
5586 static void
Indent(FILE * f,int indent)5587 Indent(FILE* f, int indent)
5588 {
5589 int i;
5590 for (i = 0; i < indent; i++) putc(' ', f);
5591 }
5592
5593 static void
p_string(FILE * f,int len,UChar * s)5594 p_string(FILE* f, int len, UChar* s)
5595 {
5596 fputs(":", f);
5597 while (len-- > 0) { fputc(*s++, f); }
5598 }
5599
5600 static void
p_len_string(FILE * f,LengthType len,int mb_len,UChar * s)5601 p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
5602 {
5603 int x = len * mb_len;
5604
5605 fprintf(f, ":%d:", len);
5606 while (x-- > 0) { fputc(*s++, f); }
5607 }
5608
5609 extern void
onig_print_compiled_byte_code(FILE * f,UChar * bp,UChar ** nextp,OnigEncoding enc)5610 onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp,
5611 OnigEncoding enc)
5612 {
5613 int i, n, arg_type;
5614 RelAddrType addr;
5615 LengthType len;
5616 MemNumType mem;
5617 StateCheckNumType scn;
5618 OnigCodePoint code;
5619 UChar *q;
5620
5621 fprintf(f, "[%s", op2name(*bp));
5622 arg_type = op2arg_type(*bp);
5623 if (arg_type != ARG_SPECIAL) {
5624 bp++;
5625 switch (arg_type) {
5626 case ARG_NON:
5627 break;
5628 case ARG_RELADDR:
5629 GET_RELADDR_INC(addr, bp);
5630 fprintf(f, ":(%d)", addr);
5631 break;
5632 case ARG_ABSADDR:
5633 GET_ABSADDR_INC(addr, bp);
5634 fprintf(f, ":(%d)", addr);
5635 break;
5636 case ARG_LENGTH:
5637 GET_LENGTH_INC(len, bp);
5638 fprintf(f, ":%d", len);
5639 break;
5640 case ARG_MEMNUM:
5641 mem = *((MemNumType* )bp);
5642 bp += SIZE_MEMNUM;
5643 fprintf(f, ":%d", mem);
5644 break;
5645 case ARG_OPTION:
5646 {
5647 OnigOptionType option = *((OnigOptionType* )bp);
5648 bp += SIZE_OPTION;
5649 fprintf(f, ":%d", option);
5650 }
5651 break;
5652
5653 case ARG_STATE_CHECK:
5654 scn = *((StateCheckNumType* )bp);
5655 bp += SIZE_STATE_CHECK_NUM;
5656 fprintf(f, ":%d", scn);
5657 break;
5658 }
5659 }
5660 else {
5661 switch (*bp++) {
5662 case OP_EXACT1:
5663 case OP_ANYCHAR_STAR_PEEK_NEXT:
5664 case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
5665 p_string(f, 1, bp++); break;
5666 case OP_EXACT2:
5667 p_string(f, 2, bp); bp += 2; break;
5668 case OP_EXACT3:
5669 p_string(f, 3, bp); bp += 3; break;
5670 case OP_EXACT4:
5671 p_string(f, 4, bp); bp += 4; break;
5672 case OP_EXACT5:
5673 p_string(f, 5, bp); bp += 5; break;
5674 case OP_EXACTN:
5675 GET_LENGTH_INC(len, bp);
5676 p_len_string(f, len, 1, bp);
5677 bp += len;
5678 break;
5679
5680 case OP_EXACTMB2N1:
5681 p_string(f, 2, bp); bp += 2; break;
5682 case OP_EXACTMB2N2:
5683 p_string(f, 4, bp); bp += 4; break;
5684 case OP_EXACTMB2N3:
5685 p_string(f, 6, bp); bp += 6; break;
5686 case OP_EXACTMB2N:
5687 GET_LENGTH_INC(len, bp);
5688 p_len_string(f, len, 2, bp);
5689 bp += len * 2;
5690 break;
5691 case OP_EXACTMB3N:
5692 GET_LENGTH_INC(len, bp);
5693 p_len_string(f, len, 3, bp);
5694 bp += len * 3;
5695 break;
5696 case OP_EXACTMBN:
5697 {
5698 int mb_len;
5699
5700 GET_LENGTH_INC(mb_len, bp);
5701 GET_LENGTH_INC(len, bp);
5702 fprintf(f, ":%d:%d:", mb_len, len);
5703 n = len * mb_len;
5704 while (n-- > 0) { fputc(*bp++, f); }
5705 }
5706 break;
5707
5708 case OP_EXACT1_IC:
5709 len = enc_len(enc, bp);
5710 p_string(f, len, bp);
5711 bp += len;
5712 break;
5713 case OP_EXACTN_IC:
5714 GET_LENGTH_INC(len, bp);
5715 p_len_string(f, len, 1, bp);
5716 bp += len;
5717 break;
5718
5719 case OP_CCLASS:
5720 n = bitset_on_num((BitSetRef )bp);
5721 bp += SIZE_BITSET;
5722 fprintf(f, ":%d", n);
5723 break;
5724
5725 case OP_CCLASS_NOT:
5726 n = bitset_on_num((BitSetRef )bp);
5727 bp += SIZE_BITSET;
5728 fprintf(f, ":%d", n);
5729 break;
5730
5731 case OP_CCLASS_MB:
5732 case OP_CCLASS_MB_NOT:
5733 GET_LENGTH_INC(len, bp);
5734 q = bp;
5735 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5736 ALIGNMENT_RIGHT(q);
5737 #endif
5738 GET_CODE_POINT(code, q);
5739 bp += len;
5740 fprintf(f, ":%d:%d", (int )code, len);
5741 break;
5742
5743 case OP_CCLASS_MIX:
5744 case OP_CCLASS_MIX_NOT:
5745 n = bitset_on_num((BitSetRef )bp);
5746 bp += SIZE_BITSET;
5747 GET_LENGTH_INC(len, bp);
5748 q = bp;
5749 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5750 ALIGNMENT_RIGHT(q);
5751 #endif
5752 GET_CODE_POINT(code, q);
5753 bp += len;
5754 fprintf(f, ":%d:%d:%d", n, (int )code, len);
5755 break;
5756
5757 case OP_CCLASS_NODE:
5758 {
5759 CClassNode *cc;
5760
5761 GET_POINTER_INC(cc, bp);
5762 n = bitset_on_num(cc->bs);
5763 fprintf(f, ":%u:%d", (unsigned int )cc, n);
5764 }
5765 break;
5766
5767 case OP_BACKREFN_IC:
5768 mem = *((MemNumType* )bp);
5769 bp += SIZE_MEMNUM;
5770 fprintf(f, ":%d", mem);
5771 break;
5772
5773 case OP_BACKREF_MULTI_IC:
5774 case OP_BACKREF_MULTI:
5775 fputs(" ", f);
5776 GET_LENGTH_INC(len, bp);
5777 for (i = 0; i < len; i++) {
5778 GET_MEMNUM_INC(mem, bp);
5779 if (i > 0) fputs(", ", f);
5780 fprintf(f, "%d", mem);
5781 }
5782 break;
5783
5784 case OP_BACKREF_AT_LEVEL:
5785 {
5786 OnigOptionType option;
5787 LengthType level;
5788
5789 GET_OPTION_INC(option, bp);
5790 fprintf(f, ":%d", option);
5791 GET_LENGTH_INC(level, bp);
5792 fprintf(f, ":%d", level);
5793
5794 fputs(" ", f);
5795 GET_LENGTH_INC(len, bp);
5796 for (i = 0; i < len; i++) {
5797 GET_MEMNUM_INC(mem, bp);
5798 if (i > 0) fputs(", ", f);
5799 fprintf(f, "%d", mem);
5800 }
5801 }
5802 break;
5803
5804 case OP_REPEAT:
5805 case OP_REPEAT_NG:
5806 {
5807 mem = *((MemNumType* )bp);
5808 bp += SIZE_MEMNUM;
5809 addr = *((RelAddrType* )bp);
5810 bp += SIZE_RELADDR;
5811 fprintf(f, ":%d:%d", mem, addr);
5812 }
5813 break;
5814
5815 case OP_PUSH_OR_JUMP_EXACT1:
5816 case OP_PUSH_IF_PEEK_NEXT:
5817 addr = *((RelAddrType* )bp);
5818 bp += SIZE_RELADDR;
5819 fprintf(f, ":(%d)", addr);
5820 p_string(f, 1, bp);
5821 bp += 1;
5822 break;
5823
5824 case OP_LOOK_BEHIND:
5825 GET_LENGTH_INC(len, bp);
5826 fprintf(f, ":%d", len);
5827 break;
5828
5829 case OP_PUSH_LOOK_BEHIND_NOT:
5830 GET_RELADDR_INC(addr, bp);
5831 GET_LENGTH_INC(len, bp);
5832 fprintf(f, ":%d:(%d)", len, addr);
5833 break;
5834
5835 case OP_STATE_CHECK_PUSH:
5836 case OP_STATE_CHECK_PUSH_OR_JUMP:
5837 scn = *((StateCheckNumType* )bp);
5838 bp += SIZE_STATE_CHECK_NUM;
5839 addr = *((RelAddrType* )bp);
5840 bp += SIZE_RELADDR;
5841 fprintf(f, ":%d:(%d)", scn, addr);
5842 break;
5843
5844 default:
5845 fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
5846 *--bp);
5847 }
5848 }
5849 fputs("]", f);
5850 if (nextp) *nextp = bp;
5851 }
5852
5853 static void
print_compiled_byte_code_list(FILE * f,regex_t * reg)5854 print_compiled_byte_code_list(FILE* f, regex_t* reg)
5855 {
5856 int ncode;
5857 UChar* bp = reg->p;
5858 UChar* end = reg->p + reg->used;
5859
5860 fprintf(f, "code length: %d\n", reg->used);
5861
5862 ncode = 0;
5863 while (bp < end) {
5864 ncode++;
5865 if (bp > reg->p) {
5866 if (ncode % 5 == 0)
5867 fprintf(f, "\n");
5868 else
5869 fputs(" ", f);
5870 }
5871 onig_print_compiled_byte_code(f, bp, &bp, reg->enc);
5872 }
5873
5874 fprintf(f, "\n");
5875 }
5876
5877 static void
print_indent_tree(FILE * f,Node * node,int indent)5878 print_indent_tree(FILE* f, Node* node, int indent)
5879 {
5880 int i, type;
5881 int add = 3;
5882 UChar* p;
5883
5884 Indent(f, indent);
5885 if (IS_NULL(node)) {
5886 fprintf(f, "ERROR: null node!!!\n");
5887 exit (0);
5888 }
5889
5890 type = NTYPE(node);
5891 switch (type) {
5892 case N_LIST:
5893 case N_ALT:
5894 if (NTYPE(node) == N_LIST)
5895 fprintf(f, "<list:%x>\n", (int )node);
5896 else
5897 fprintf(f, "<alt:%x>\n", (int )node);
5898
5899 print_indent_tree(f, NCONS(node).left, indent + add);
5900 while (IS_NOT_NULL(node = NCONS(node).right)) {
5901 if (NTYPE(node) != type) {
5902 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
5903 exit(0);
5904 }
5905 print_indent_tree(f, NCONS(node).left, indent + add);
5906 }
5907 break;
5908
5909 case N_STRING:
5910 fprintf(f, "<string%s:%x>",
5911 (NSTRING_IS_RAW(node) ? "-raw" : ""), (int )node);
5912 for (p = NSTRING(node).s; p < NSTRING(node).end; p++) {
5913 if (*p >= 0x20 && *p < 0x7f)
5914 fputc(*p, f);
5915 else {
5916 fprintf(f, " 0x%02x", *p);
5917 }
5918 }
5919 break;
5920
5921 case N_CCLASS:
5922 fprintf(f, "<cclass:%x>", (int )node);
5923 if (IS_CCLASS_NOT(&NCCLASS(node))) fputs(" not", f);
5924 if (NCCLASS(node).mbuf) {
5925 BBuf* bbuf = NCCLASS(node).mbuf;
5926 for (i = 0; i < bbuf->used; i++) {
5927 if (i > 0) fprintf(f, ",");
5928 fprintf(f, "%0x", bbuf->p[i]);
5929 }
5930 }
5931 break;
5932
5933 case N_CTYPE:
5934 fprintf(f, "<ctype:%x> ", (int )node);
5935 switch (NCTYPE(node).type) {
5936 case CTYPE_WORD: fputs("word", f); break;
5937 case CTYPE_NOT_WORD: fputs("not word", f); break;
5938 default:
5939 fprintf(f, "ERROR: undefined ctype.\n");
5940 exit(0);
5941 }
5942 break;
5943
5944 case N_ANYCHAR:
5945 fprintf(f, "<anychar:%x>", (int )node);
5946 break;
5947
5948 case N_ANCHOR:
5949 fprintf(f, "<anchor:%x> ", (int )node);
5950 switch (NANCHOR(node).type) {
5951 case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
5952 case ANCHOR_END_BUF: fputs("end buf", f); break;
5953 case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
5954 case ANCHOR_END_LINE: fputs("end line", f); break;
5955 case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
5956 case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
5957
5958 case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
5959 case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
5960 #ifdef USE_WORD_BEGIN_END
5961 case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
5962 case ANCHOR_WORD_END: fputs("word end", f); break;
5963 #endif
5964 case ANCHOR_PREC_READ: fputs("prec read", f); break;
5965 case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); break;
5966 case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); break;
5967 case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break;
5968
5969 default:
5970 fprintf(f, "ERROR: undefined anchor type.\n");
5971 break;
5972 }
5973 break;
5974
5975 case N_BACKREF:
5976 {
5977 int* p;
5978 BackrefNode* br = &(NBACKREF(node));
5979 p = BACKREFS_P(br);
5980 fprintf(f, "<backref:%x>", (int )node);
5981 for (i = 0; i < br->back_num; i++) {
5982 if (i > 0) fputs(", ", f);
5983 fprintf(f, "%d", p[i]);
5984 }
5985 }
5986 break;
5987
5988 #ifdef USE_SUBEXP_CALL
5989 case N_CALL:
5990 {
5991 CallNode* cn = &(NCALL(node));
5992 fprintf(f, "<call:%x>", (int )node);
5993 p_string(f, cn->name_end - cn->name, cn->name);
5994 }
5995 break;
5996 #endif
5997
5998 case N_QUANTIFIER:
5999 fprintf(f, "<quantifier:%x>{%d,%d}%s\n", (int )node,
6000 NQUANTIFIER(node).lower, NQUANTIFIER(node).upper,
6001 (NQUANTIFIER(node).greedy ? "" : "?"));
6002 print_indent_tree(f, NQUANTIFIER(node).target, indent + add);
6003 break;
6004
6005 case N_EFFECT:
6006 fprintf(f, "<effect:%x> ", (int )node);
6007 switch (NEFFECT(node).type) {
6008 case EFFECT_OPTION:
6009 fprintf(f, "option:%d\n", NEFFECT(node).option);
6010 print_indent_tree(f, NEFFECT(node).target, indent + add);
6011 break;
6012 case EFFECT_MEMORY:
6013 fprintf(f, "memory:%d", NEFFECT(node).regnum);
6014 break;
6015 case EFFECT_STOP_BACKTRACK:
6016 fprintf(f, "stop-bt");
6017 break;
6018
6019 default:
6020 break;
6021 }
6022 fprintf(f, "\n");
6023 print_indent_tree(f, NEFFECT(node).target, indent + add);
6024 break;
6025
6026 default:
6027 fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6028 break;
6029 }
6030
6031 if (type != N_LIST && type != N_ALT && type != N_QUANTIFIER &&
6032 type != N_EFFECT)
6033 fprintf(f, "\n");
6034 fflush(f);
6035 }
6036 #endif /* ONIG_DEBUG */
6037
6038 #ifdef ONIG_DEBUG_PARSE_TREE
6039 static void
print_tree(FILE * f,Node * node)6040 print_tree(FILE* f, Node* node)
6041 {
6042 print_indent_tree(f, node, 0);
6043 }
6044 #endif
6045