1 /**********************************************************************
2 regcomp.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regparse.h"
31
32 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
33
34 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)35 onig_get_default_case_fold_flag(void)
36 {
37 return OnigDefaultCaseFoldFlag;
38 }
39
40 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)41 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
42 {
43 OnigDefaultCaseFoldFlag = case_fold_flag;
44 return 0;
45 }
46
47
48 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
49 static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
50 #endif
51
52 static UChar*
str_dup(UChar * s,UChar * end)53 str_dup(UChar* s, UChar* end)
54 {
55 int len = end - s;
56
57 if (len > 0) {
58 UChar* r = (UChar* )xmalloc(len + 1);
59 CHECK_NULL_RETURN(r);
60 xmemcpy(r, s, len);
61 r[len] = (UChar )0;
62 return r;
63 }
64 else return NULL;
65 }
66
67 static void
swap_node(Node * a,Node * b)68 swap_node(Node* a, Node* b)
69 {
70 Node c;
71 c = *a; *a = *b; *b = c;
72
73 if (NTYPE(a) == NT_STR) {
74 StrNode* sn = NSTR(a);
75 if (sn->capa == 0) {
76 int len = sn->end - sn->s;
77 sn->s = sn->buf;
78 sn->end = sn->s + len;
79 }
80 }
81
82 if (NTYPE(b) == NT_STR) {
83 StrNode* sn = NSTR(b);
84 if (sn->capa == 0) {
85 int len = sn->end - sn->s;
86 sn->s = sn->buf;
87 sn->end = sn->s + len;
88 }
89 }
90 }
91
92 static OnigDistance
distance_add(OnigDistance d1,OnigDistance d2)93 distance_add(OnigDistance d1, OnigDistance d2)
94 {
95 if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
96 return ONIG_INFINITE_DISTANCE;
97 else {
98 if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
99 else return ONIG_INFINITE_DISTANCE;
100 }
101 }
102
103 static OnigDistance
distance_multiply(OnigDistance d,int m)104 distance_multiply(OnigDistance d, int m)
105 {
106 if (m == 0) return 0;
107
108 if (d < ONIG_INFINITE_DISTANCE / m)
109 return d * m;
110 else
111 return ONIG_INFINITE_DISTANCE;
112 }
113
114 static int
bitset_is_empty(BitSetRef bs)115 bitset_is_empty(BitSetRef bs)
116 {
117 int i;
118 for (i = 0; i < (int )BITSET_SIZE; i++) {
119 if (bs[i] != 0) return 0;
120 }
121 return 1;
122 }
123
124 #ifdef ONIG_DEBUG
125 static int
bitset_on_num(BitSetRef bs)126 bitset_on_num(BitSetRef bs)
127 {
128 int i, n;
129
130 n = 0;
131 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
132 if (BITSET_AT(bs, i)) n++;
133 }
134 return n;
135 }
136 #endif
137
138 extern int
onig_bbuf_init(BBuf * buf,int size)139 onig_bbuf_init(BBuf* buf, int size)
140 {
141 if (size <= 0) {
142 size = 0;
143 buf->p = NULL;
144 }
145 else {
146 buf->p = (UChar* )xmalloc(size);
147 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
148 }
149
150 buf->alloc = size;
151 buf->used = 0;
152 return 0;
153 }
154
155
156 #ifdef USE_SUBEXP_CALL
157
158 static int
unset_addr_list_init(UnsetAddrList * uslist,int size)159 unset_addr_list_init(UnsetAddrList* uslist, int size)
160 {
161 UnsetAddr* p;
162
163 p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
164 CHECK_NULL_RETURN_MEMERR(p);
165 uslist->num = 0;
166 uslist->alloc = size;
167 uslist->us = p;
168 return 0;
169 }
170
171 static void
unset_addr_list_end(UnsetAddrList * uslist)172 unset_addr_list_end(UnsetAddrList* uslist)
173 {
174 if (IS_NOT_NULL(uslist->us))
175 xfree(uslist->us);
176 }
177
178 static int
unset_addr_list_add(UnsetAddrList * uslist,int offset,struct _Node * node)179 unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
180 {
181 UnsetAddr* p;
182 int size;
183
184 if (uslist->num >= uslist->alloc) {
185 size = uslist->alloc * 2;
186 p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
187 CHECK_NULL_RETURN_MEMERR(p);
188 uslist->alloc = size;
189 uslist->us = p;
190 }
191
192 uslist->us[uslist->num].offset = offset;
193 uslist->us[uslist->num].target = node;
194 uslist->num++;
195 return 0;
196 }
197 #endif /* USE_SUBEXP_CALL */
198
199
200 static int
add_opcode(regex_t * reg,int opcode)201 add_opcode(regex_t* reg, int opcode)
202 {
203 BBUF_ADD1(reg, opcode);
204 return 0;
205 }
206
207 #ifdef USE_COMBINATION_EXPLOSION_CHECK
208 static int
add_state_check_num(regex_t * reg,int num)209 add_state_check_num(regex_t* reg, int num)
210 {
211 StateCheckNumType n = (StateCheckNumType )num;
212
213 BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
214 return 0;
215 }
216 #endif
217
218 static int
add_rel_addr(regex_t * reg,int addr)219 add_rel_addr(regex_t* reg, int addr)
220 {
221 RelAddrType ra = (RelAddrType )addr;
222
223 BBUF_ADD(reg, &ra, SIZE_RELADDR);
224 return 0;
225 }
226
227 static int
add_abs_addr(regex_t * reg,int addr)228 add_abs_addr(regex_t* reg, int addr)
229 {
230 AbsAddrType ra = (AbsAddrType )addr;
231
232 BBUF_ADD(reg, &ra, SIZE_ABSADDR);
233 return 0;
234 }
235
236 static int
add_length(regex_t * reg,int len)237 add_length(regex_t* reg, int len)
238 {
239 LengthType l = (LengthType )len;
240
241 BBUF_ADD(reg, &l, SIZE_LENGTH);
242 return 0;
243 }
244
245 static int
add_mem_num(regex_t * reg,int num)246 add_mem_num(regex_t* reg, int num)
247 {
248 MemNumType n = (MemNumType )num;
249
250 BBUF_ADD(reg, &n, SIZE_MEMNUM);
251 return 0;
252 }
253
254 static int
add_pointer(regex_t * reg,void * addr)255 add_pointer(regex_t* reg, void* addr)
256 {
257 PointerType ptr = (PointerType )addr;
258
259 BBUF_ADD(reg, &ptr, SIZE_POINTER);
260 return 0;
261 }
262
263 static int
add_option(regex_t * reg,OnigOptionType option)264 add_option(regex_t* reg, OnigOptionType option)
265 {
266 BBUF_ADD(reg, &option, SIZE_OPTION);
267 return 0;
268 }
269
270 static int
add_opcode_rel_addr(regex_t * reg,int opcode,int addr)271 add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
272 {
273 int r;
274
275 r = add_opcode(reg, opcode);
276 if (r) return r;
277 r = add_rel_addr(reg, addr);
278 return r;
279 }
280
281 static int
add_bytes(regex_t * reg,UChar * bytes,int len)282 add_bytes(regex_t* reg, UChar* bytes, int len)
283 {
284 BBUF_ADD(reg, bytes, len);
285 return 0;
286 }
287
288 static int
add_bitset(regex_t * reg,BitSetRef bs)289 add_bitset(regex_t* reg, BitSetRef bs)
290 {
291 BBUF_ADD(reg, bs, SIZE_BITSET);
292 return 0;
293 }
294
295 static int
add_opcode_option(regex_t * reg,int opcode,OnigOptionType option)296 add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
297 {
298 int r;
299
300 r = add_opcode(reg, opcode);
301 if (r) return r;
302 r = add_option(reg, option);
303 return r;
304 }
305
306 static int compile_length_tree(Node* node, regex_t* reg);
307 static int compile_tree(Node* node, regex_t* reg);
308
309
310 #define IS_NEED_STR_LEN_OP_EXACT(op) \
311 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
312 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
313
314 static int
select_str_opcode(int mb_len,int str_len,int ignore_case)315 select_str_opcode(int mb_len, int str_len, int ignore_case)
316 {
317 int op;
318
319 if (ignore_case) {
320 switch (str_len) {
321 case 1: op = OP_EXACT1_IC; break;
322 default: op = OP_EXACTN_IC; break;
323 }
324 }
325 else {
326 switch (mb_len) {
327 case 1:
328 switch (str_len) {
329 case 1: op = OP_EXACT1; break;
330 case 2: op = OP_EXACT2; break;
331 case 3: op = OP_EXACT3; break;
332 case 4: op = OP_EXACT4; break;
333 case 5: op = OP_EXACT5; break;
334 default: op = OP_EXACTN; break;
335 }
336 break;
337
338 case 2:
339 switch (str_len) {
340 case 1: op = OP_EXACTMB2N1; break;
341 case 2: op = OP_EXACTMB2N2; break;
342 case 3: op = OP_EXACTMB2N3; break;
343 default: op = OP_EXACTMB2N; break;
344 }
345 break;
346
347 case 3:
348 op = OP_EXACTMB3N;
349 break;
350
351 default:
352 op = OP_EXACTMBN;
353 break;
354 }
355 }
356 return op;
357 }
358
359 static int
compile_tree_empty_check(Node * node,regex_t * reg,int empty_info)360 compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
361 {
362 int r;
363 int saved_num_null_check = reg->num_null_check;
364
365 if (empty_info != 0) {
366 r = add_opcode(reg, OP_NULL_CHECK_START);
367 if (r) return r;
368 r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
369 if (r) return r;
370 reg->num_null_check++;
371 }
372
373 r = compile_tree(node, reg);
374 if (r) return r;
375
376 if (empty_info != 0) {
377 if (empty_info == NQ_TARGET_IS_EMPTY)
378 r = add_opcode(reg, OP_NULL_CHECK_END);
379 else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
380 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
381 else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
382 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
383
384 if (r) return r;
385 r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
386 }
387 return r;
388 }
389
390 #ifdef USE_SUBEXP_CALL
391 static int
compile_call(CallNode * node,regex_t * reg)392 compile_call(CallNode* node, regex_t* reg)
393 {
394 int r;
395
396 r = add_opcode(reg, OP_CALL);
397 if (r) return r;
398 r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
399 node->target);
400 if (r) return r;
401 r = add_abs_addr(reg, 0 /*dummy addr.*/);
402 return r;
403 }
404 #endif
405
406 static int
compile_tree_n_times(Node * node,int n,regex_t * reg)407 compile_tree_n_times(Node* node, int n, regex_t* reg)
408 {
409 int i, r;
410
411 for (i = 0; i < n; i++) {
412 r = compile_tree(node, reg);
413 if (r) return r;
414 }
415 return 0;
416 }
417
418 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED,int ignore_case)419 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
420 regex_t* reg ARG_UNUSED, int ignore_case)
421 {
422 int len;
423 int op = select_str_opcode(mb_len, str_len, ignore_case);
424
425 len = SIZE_OPCODE;
426
427 if (op == OP_EXACTMBN) len += SIZE_LENGTH;
428 if (IS_NEED_STR_LEN_OP_EXACT(op))
429 len += SIZE_LENGTH;
430
431 len += mb_len * str_len;
432 return len;
433 }
434
435 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg,int ignore_case)436 add_compile_string(UChar* s, int mb_len, int str_len,
437 regex_t* reg, int ignore_case)
438 {
439 int op = select_str_opcode(mb_len, str_len, ignore_case);
440 add_opcode(reg, op);
441
442 if (op == OP_EXACTMBN)
443 add_length(reg, mb_len);
444
445 if (IS_NEED_STR_LEN_OP_EXACT(op)) {
446 if (op == OP_EXACTN_IC)
447 add_length(reg, mb_len * str_len);
448 else
449 add_length(reg, str_len);
450 }
451
452 add_bytes(reg, s, mb_len * str_len);
453 return 0;
454 }
455
456
457 static int
compile_length_string_node(Node * node,regex_t * reg)458 compile_length_string_node(Node* node, regex_t* reg)
459 {
460 int rlen, r, len, prev_len, slen, ambig;
461 OnigEncoding enc = reg->enc;
462 UChar *p, *prev;
463 StrNode* sn;
464
465 sn = NSTR(node);
466 if (sn->end <= sn->s)
467 return 0;
468
469 ambig = NSTRING_IS_AMBIG(node);
470
471 p = prev = sn->s;
472 prev_len = enclen(enc, p);
473 p += prev_len;
474 slen = 1;
475 rlen = 0;
476
477 for (; p < sn->end; ) {
478 len = enclen(enc, p);
479 if (len == prev_len) {
480 slen++;
481 }
482 else {
483 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
484 rlen += r;
485 prev = p;
486 slen = 1;
487 prev_len = len;
488 }
489 p += len;
490 }
491 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
492 rlen += r;
493 return rlen;
494 }
495
496 static int
compile_length_string_raw_node(StrNode * sn,regex_t * reg)497 compile_length_string_raw_node(StrNode* sn, regex_t* reg)
498 {
499 if (sn->end <= sn->s)
500 return 0;
501
502 return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
503 }
504
505 static int
compile_string_node(Node * node,regex_t * reg)506 compile_string_node(Node* node, regex_t* reg)
507 {
508 int r, len, prev_len, slen, ambig;
509 OnigEncoding enc = reg->enc;
510 UChar *p, *prev, *end;
511 StrNode* sn;
512
513 sn = NSTR(node);
514 if (sn->end <= sn->s)
515 return 0;
516
517 end = sn->end;
518 ambig = NSTRING_IS_AMBIG(node);
519
520 p = prev = sn->s;
521 prev_len = enclen(enc, p);
522 p += prev_len;
523 slen = 1;
524
525 for (; p < end; ) {
526 len = enclen(enc, p);
527 if (len == prev_len) {
528 slen++;
529 }
530 else {
531 r = add_compile_string(prev, prev_len, slen, reg, ambig);
532 if (r) return r;
533
534 prev = p;
535 slen = 1;
536 prev_len = len;
537 }
538
539 p += len;
540 }
541 return add_compile_string(prev, prev_len, slen, reg, ambig);
542 }
543
544 static int
compile_string_raw_node(StrNode * sn,regex_t * reg)545 compile_string_raw_node(StrNode* sn, regex_t* reg)
546 {
547 if (sn->end <= sn->s)
548 return 0;
549
550 return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
551 }
552
553 static int
add_multi_byte_cclass(BBuf * mbuf,regex_t * reg)554 add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
555 {
556 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
557 add_length(reg, mbuf->used);
558 return add_bytes(reg, mbuf->p, mbuf->used);
559 #else
560 int r, pad_size;
561 UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
562
563 GET_ALIGNMENT_PAD_SIZE(p, pad_size);
564 add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
565 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
566
567 r = add_bytes(reg, mbuf->p, mbuf->used);
568
569 /* padding for return value from compile_length_cclass_node() to be fix. */
570 pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
571 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
572 return r;
573 #endif
574 }
575
576 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)577 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
578 {
579 int len;
580
581 if (IS_NCCLASS_SHARE(cc)) {
582 len = SIZE_OPCODE + SIZE_POINTER;
583 return len;
584 }
585
586 if (IS_NULL(cc->mbuf)) {
587 len = SIZE_OPCODE + SIZE_BITSET;
588 }
589 else {
590 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
591 len = SIZE_OPCODE;
592 }
593 else {
594 len = SIZE_OPCODE + SIZE_BITSET;
595 }
596 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
597 len += SIZE_LENGTH + cc->mbuf->used;
598 #else
599 len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
600 #endif
601 }
602
603 return len;
604 }
605
606 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)607 compile_cclass_node(CClassNode* cc, regex_t* reg)
608 {
609 int r;
610
611 if (IS_NCCLASS_SHARE(cc)) {
612 add_opcode(reg, OP_CCLASS_NODE);
613 r = add_pointer(reg, cc);
614 return r;
615 }
616
617 if (IS_NULL(cc->mbuf)) {
618 if (IS_NCCLASS_NOT(cc))
619 add_opcode(reg, OP_CCLASS_NOT);
620 else
621 add_opcode(reg, OP_CCLASS);
622
623 r = add_bitset(reg, cc->bs);
624 }
625 else {
626 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
627 if (IS_NCCLASS_NOT(cc))
628 add_opcode(reg, OP_CCLASS_MB_NOT);
629 else
630 add_opcode(reg, OP_CCLASS_MB);
631
632 r = add_multi_byte_cclass(cc->mbuf, reg);
633 }
634 else {
635 if (IS_NCCLASS_NOT(cc))
636 add_opcode(reg, OP_CCLASS_MIX_NOT);
637 else
638 add_opcode(reg, OP_CCLASS_MIX);
639
640 r = add_bitset(reg, cc->bs);
641 if (r) return r;
642 r = add_multi_byte_cclass(cc->mbuf, reg);
643 }
644 }
645
646 return r;
647 }
648
649 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper)650 entry_repeat_range(regex_t* reg, int id, int lower, int upper)
651 {
652 #define REPEAT_RANGE_ALLOC 4
653
654 OnigRepeatRange* p;
655
656 if (reg->repeat_range_alloc == 0) {
657 p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
658 CHECK_NULL_RETURN_MEMERR(p);
659 reg->repeat_range = p;
660 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
661 }
662 else if (reg->repeat_range_alloc <= id) {
663 int n;
664 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
665 p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
666 sizeof(OnigRepeatRange) * n);
667 CHECK_NULL_RETURN_MEMERR(p);
668 reg->repeat_range = p;
669 reg->repeat_range_alloc = n;
670 }
671 else {
672 p = reg->repeat_range;
673 }
674
675 p[id].lower = lower;
676 p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
677 return 0;
678 }
679
680 static int
compile_range_repeat_node(QtfrNode * qn,int target_len,int empty_info,regex_t * reg)681 compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
682 regex_t* reg)
683 {
684 int r;
685 int num_repeat = reg->num_repeat;
686
687 r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
688 if (r) return r;
689 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
690 reg->num_repeat++;
691 if (r) return r;
692 r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
693 if (r) return r;
694
695 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
696 if (r) return r;
697
698 r = compile_tree_empty_check(qn->target, reg, empty_info);
699 if (r) return r;
700
701 if (
702 #ifdef USE_SUBEXP_CALL
703 reg->num_call > 0 ||
704 #endif
705 IS_QUANTIFIER_IN_REPEAT(qn)) {
706 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
707 }
708 else {
709 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
710 }
711 if (r) return r;
712 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
713 return r;
714 }
715
716 static int
is_anychar_star_quantifier(QtfrNode * qn)717 is_anychar_star_quantifier(QtfrNode* qn)
718 {
719 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
720 NTYPE(qn->target) == NT_CANY)
721 return 1;
722 else
723 return 0;
724 }
725
726 #define QUANTIFIER_EXPAND_LIMIT_SIZE 50
727 #define CKN_ON (ckn > 0)
728
729 #ifdef USE_COMBINATION_EXPLOSION_CHECK
730
731 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)732 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
733 {
734 int len, mod_tlen, cklen;
735 int ckn;
736 int infinite = IS_REPEAT_INFINITE(qn->upper);
737 int empty_info = qn->target_empty_info;
738 int tlen = compile_length_tree(qn->target, reg);
739
740 if (tlen < 0) return tlen;
741
742 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
743
744 cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
745
746 /* anychar repeat */
747 if (NTYPE(qn->target) == NT_CANY) {
748 if (qn->greedy && infinite) {
749 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
750 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
751 else
752 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
753 }
754 }
755
756 if (empty_info != 0)
757 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
758 else
759 mod_tlen = tlen;
760
761 if (infinite && qn->lower <= 1) {
762 if (qn->greedy) {
763 if (qn->lower == 1)
764 len = SIZE_OP_JUMP;
765 else
766 len = 0;
767
768 len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
769 }
770 else {
771 if (qn->lower == 0)
772 len = SIZE_OP_JUMP;
773 else
774 len = 0;
775
776 len += mod_tlen + SIZE_OP_PUSH + cklen;
777 }
778 }
779 else if (qn->upper == 0) {
780 if (qn->is_refered != 0) /* /(?<n>..){0}/ */
781 len = SIZE_OP_JUMP + tlen;
782 else
783 len = 0;
784 }
785 else if (qn->upper == 1 && qn->greedy) {
786 if (qn->lower == 0) {
787 if (CKN_ON) {
788 len = SIZE_OP_STATE_CHECK_PUSH + tlen;
789 }
790 else {
791 len = SIZE_OP_PUSH + tlen;
792 }
793 }
794 else {
795 len = tlen;
796 }
797 }
798 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
799 len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
800 }
801 else {
802 len = SIZE_OP_REPEAT_INC
803 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
804 if (CKN_ON)
805 len += SIZE_OP_STATE_CHECK;
806 }
807
808 return len;
809 }
810
811 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)812 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
813 {
814 int r, mod_tlen;
815 int ckn;
816 int infinite = IS_REPEAT_INFINITE(qn->upper);
817 int empty_info = qn->target_empty_info;
818 int tlen = compile_length_tree(qn->target, reg);
819
820 if (tlen < 0) return tlen;
821
822 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
823
824 if (is_anychar_star_quantifier(qn)) {
825 r = compile_tree_n_times(qn->target, qn->lower, reg);
826 if (r) return r;
827 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
828 if (IS_MULTILINE(reg->options))
829 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
830 else
831 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
832 if (r) return r;
833 if (CKN_ON) {
834 r = add_state_check_num(reg, ckn);
835 if (r) return r;
836 }
837
838 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
839 }
840 else {
841 if (IS_MULTILINE(reg->options)) {
842 r = add_opcode(reg, (CKN_ON ?
843 OP_STATE_CHECK_ANYCHAR_ML_STAR
844 : OP_ANYCHAR_ML_STAR));
845 }
846 else {
847 r = add_opcode(reg, (CKN_ON ?
848 OP_STATE_CHECK_ANYCHAR_STAR
849 : OP_ANYCHAR_STAR));
850 }
851 if (r) return r;
852 if (CKN_ON)
853 r = add_state_check_num(reg, ckn);
854
855 return r;
856 }
857 }
858
859 if (empty_info != 0)
860 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
861 else
862 mod_tlen = tlen;
863
864 if (infinite && qn->lower <= 1) {
865 if (qn->greedy) {
866 if (qn->lower == 1) {
867 r = add_opcode_rel_addr(reg, OP_JUMP,
868 (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
869 if (r) return r;
870 }
871
872 if (CKN_ON) {
873 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
874 if (r) return r;
875 r = add_state_check_num(reg, ckn);
876 if (r) return r;
877 r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
878 }
879 else {
880 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
881 }
882 if (r) return r;
883 r = compile_tree_empty_check(qn->target, reg, empty_info);
884 if (r) return r;
885 r = add_opcode_rel_addr(reg, OP_JUMP,
886 -(mod_tlen + (int )SIZE_OP_JUMP
887 + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
888 }
889 else {
890 if (qn->lower == 0) {
891 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
892 if (r) return r;
893 }
894 r = compile_tree_empty_check(qn->target, reg, empty_info);
895 if (r) return r;
896 if (CKN_ON) {
897 r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
898 if (r) return r;
899 r = add_state_check_num(reg, ckn);
900 if (r) return r;
901 r = add_rel_addr(reg,
902 -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
903 }
904 else
905 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
906 }
907 }
908 else if (qn->upper == 0) {
909 if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
910 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
911 if (r) return r;
912 r = compile_tree(qn->target, reg);
913 }
914 else
915 r = 0;
916 }
917 else if (qn->upper == 1 && qn->greedy) {
918 if (qn->lower == 0) {
919 if (CKN_ON) {
920 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
921 if (r) return r;
922 r = add_state_check_num(reg, ckn);
923 if (r) return r;
924 r = add_rel_addr(reg, tlen);
925 }
926 else {
927 r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
928 }
929 if (r) return r;
930 }
931
932 r = compile_tree(qn->target, reg);
933 }
934 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
935 if (CKN_ON) {
936 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
937 if (r) return r;
938 r = add_state_check_num(reg, ckn);
939 if (r) return r;
940 r = add_rel_addr(reg, SIZE_OP_JUMP);
941 }
942 else {
943 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
944 }
945
946 if (r) return r;
947 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
948 if (r) return r;
949 r = compile_tree(qn->target, reg);
950 }
951 else {
952 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
953 if (CKN_ON) {
954 if (r) return r;
955 r = add_opcode(reg, OP_STATE_CHECK);
956 if (r) return r;
957 r = add_state_check_num(reg, ckn);
958 }
959 }
960 return r;
961 }
962
963 #else /* USE_COMBINATION_EXPLOSION_CHECK */
964
965 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)966 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
967 {
968 int len, mod_tlen;
969 int infinite = IS_REPEAT_INFINITE(qn->upper);
970 int empty_info = qn->target_empty_info;
971 int tlen = compile_length_tree(qn->target, reg);
972
973 if (tlen < 0) return tlen;
974
975 /* anychar repeat */
976 if (NTYPE(qn->target) == NT_CANY) {
977 if (qn->greedy && infinite) {
978 if (IS_NOT_NULL(qn->next_head_exact))
979 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
980 else
981 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
982 }
983 }
984
985 if (empty_info != 0)
986 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
987 else
988 mod_tlen = tlen;
989
990 if (infinite &&
991 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
992 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
993 len = SIZE_OP_JUMP;
994 }
995 else {
996 len = tlen * qn->lower;
997 }
998
999 if (qn->greedy) {
1000 if (IS_NOT_NULL(qn->head_exact))
1001 len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1002 else if (IS_NOT_NULL(qn->next_head_exact))
1003 len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1004 else
1005 len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1006 }
1007 else
1008 len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1009 }
1010 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1011 len = SIZE_OP_JUMP + tlen;
1012 }
1013 else if (!infinite && qn->greedy &&
1014 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1015 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1016 len = tlen * qn->lower;
1017 len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1018 }
1019 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1020 len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1021 }
1022 else {
1023 len = SIZE_OP_REPEAT_INC
1024 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1025 }
1026
1027 return len;
1028 }
1029
1030 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)1031 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
1032 {
1033 int i, r, mod_tlen;
1034 int infinite = IS_REPEAT_INFINITE(qn->upper);
1035 int empty_info = qn->target_empty_info;
1036 int tlen = compile_length_tree(qn->target, reg);
1037
1038 if (tlen < 0) return tlen;
1039
1040 if (is_anychar_star_quantifier(qn)) {
1041 r = compile_tree_n_times(qn->target, qn->lower, reg);
1042 if (r) return r;
1043 if (IS_NOT_NULL(qn->next_head_exact)) {
1044 if (IS_MULTILINE(reg->options))
1045 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1046 else
1047 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1048 if (r) return r;
1049 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1050 }
1051 else {
1052 if (IS_MULTILINE(reg->options))
1053 return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1054 else
1055 return add_opcode(reg, OP_ANYCHAR_STAR);
1056 }
1057 }
1058
1059 if (empty_info != 0)
1060 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1061 else
1062 mod_tlen = tlen;
1063
1064 if (infinite &&
1065 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1066 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1067 if (qn->greedy) {
1068 if (IS_NOT_NULL(qn->head_exact))
1069 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1070 else if (IS_NOT_NULL(qn->next_head_exact))
1071 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1072 else
1073 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1074 }
1075 else {
1076 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1077 }
1078 if (r) return r;
1079 }
1080 else {
1081 r = compile_tree_n_times(qn->target, qn->lower, reg);
1082 if (r) return r;
1083 }
1084
1085 if (qn->greedy) {
1086 if (IS_NOT_NULL(qn->head_exact)) {
1087 r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1088 mod_tlen + SIZE_OP_JUMP);
1089 if (r) return r;
1090 add_bytes(reg, NSTR(qn->head_exact)->s, 1);
1091 r = compile_tree_empty_check(qn->target, reg, empty_info);
1092 if (r) return r;
1093 r = add_opcode_rel_addr(reg, OP_JUMP,
1094 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1095 }
1096 else if (IS_NOT_NULL(qn->next_head_exact)) {
1097 r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1098 mod_tlen + SIZE_OP_JUMP);
1099 if (r) return r;
1100 add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1101 r = compile_tree_empty_check(qn->target, reg, empty_info);
1102 if (r) return r;
1103 r = add_opcode_rel_addr(reg, OP_JUMP,
1104 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1105 }
1106 else {
1107 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1108 if (r) return r;
1109 r = compile_tree_empty_check(qn->target, reg, empty_info);
1110 if (r) return r;
1111 r = add_opcode_rel_addr(reg, OP_JUMP,
1112 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1113 }
1114 }
1115 else {
1116 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1117 if (r) return r;
1118 r = compile_tree_empty_check(qn->target, reg, empty_info);
1119 if (r) return r;
1120 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1121 }
1122 }
1123 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1124 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1125 if (r) return r;
1126 r = compile_tree(qn->target, reg);
1127 }
1128 else if (!infinite && qn->greedy &&
1129 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1130 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1131 int n = qn->upper - qn->lower;
1132
1133 r = compile_tree_n_times(qn->target, qn->lower, reg);
1134 if (r) return r;
1135
1136 for (i = 0; i < n; i++) {
1137 r = add_opcode_rel_addr(reg, OP_PUSH,
1138 (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1139 if (r) return r;
1140 r = compile_tree(qn->target, reg);
1141 if (r) return r;
1142 }
1143 }
1144 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1145 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1146 if (r) return r;
1147 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1148 if (r) return r;
1149 r = compile_tree(qn->target, reg);
1150 }
1151 else {
1152 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1153 }
1154 return r;
1155 }
1156 #endif /* USE_COMBINATION_EXPLOSION_CHECK */
1157
1158 static int
compile_length_option_node(EncloseNode * node,regex_t * reg)1159 compile_length_option_node(EncloseNode* node, regex_t* reg)
1160 {
1161 int tlen;
1162 OnigOptionType prev = reg->options;
1163
1164 reg->options = node->option;
1165 tlen = compile_length_tree(node->target, reg);
1166 reg->options = prev;
1167
1168 if (tlen < 0) return tlen;
1169
1170 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1171 return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1172 + tlen + SIZE_OP_SET_OPTION;
1173 }
1174 else
1175 return tlen;
1176 }
1177
1178 static int
compile_option_node(EncloseNode * node,regex_t * reg)1179 compile_option_node(EncloseNode* node, regex_t* reg)
1180 {
1181 int r;
1182 OnigOptionType prev = reg->options;
1183
1184 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1185 r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1186 if (r) return r;
1187 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1188 if (r) return r;
1189 r = add_opcode(reg, OP_FAIL);
1190 if (r) return r;
1191 }
1192
1193 reg->options = node->option;
1194 r = compile_tree(node->target, reg);
1195 reg->options = prev;
1196
1197 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1198 if (r) return r;
1199 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1200 }
1201 return r;
1202 }
1203
1204 static int
compile_length_enclose_node(EncloseNode * node,regex_t * reg)1205 compile_length_enclose_node(EncloseNode* node, regex_t* reg)
1206 {
1207 int len;
1208 int tlen;
1209
1210 if (node->type == ENCLOSE_OPTION)
1211 return compile_length_option_node(node, reg);
1212
1213 if (node->target) {
1214 tlen = compile_length_tree(node->target, reg);
1215 if (tlen < 0) return tlen;
1216 }
1217 else
1218 tlen = 0;
1219
1220 switch (node->type) {
1221 case ENCLOSE_MEMORY:
1222 #ifdef USE_SUBEXP_CALL
1223 if (IS_ENCLOSE_CALLED(node)) {
1224 len = SIZE_OP_MEMORY_START_PUSH + tlen
1225 + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1226 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1227 len += (IS_ENCLOSE_RECURSION(node)
1228 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1229 else
1230 len += (IS_ENCLOSE_RECURSION(node)
1231 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1232 }
1233 else
1234 #endif
1235 {
1236 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1237 len = SIZE_OP_MEMORY_START_PUSH;
1238 else
1239 len = SIZE_OP_MEMORY_START;
1240
1241 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1242 ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1243 }
1244 break;
1245
1246 case ENCLOSE_STOP_BACKTRACK:
1247 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1248 QtfrNode* qn = NQTFR(node->target);
1249 tlen = compile_length_tree(qn->target, reg);
1250 if (tlen < 0) return tlen;
1251
1252 len = tlen * qn->lower
1253 + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1254 }
1255 else {
1256 len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1257 }
1258 break;
1259
1260 default:
1261 return ONIGERR_TYPE_BUG;
1262 break;
1263 }
1264
1265 return len;
1266 }
1267
1268 static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1269
1270 static int
compile_enclose_node(EncloseNode * node,regex_t * reg)1271 compile_enclose_node(EncloseNode* node, regex_t* reg)
1272 {
1273 int r, len;
1274
1275 if (node->type == ENCLOSE_OPTION)
1276 return compile_option_node(node, reg);
1277
1278 switch (node->type) {
1279 case ENCLOSE_MEMORY:
1280 #ifdef USE_SUBEXP_CALL
1281 if (IS_ENCLOSE_CALLED(node)) {
1282 r = add_opcode(reg, OP_CALL);
1283 if (r) return r;
1284 node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1285 node->state |= NST_ADDR_FIXED;
1286 r = add_abs_addr(reg, (int )node->call_addr);
1287 if (r) return r;
1288 len = compile_length_tree(node->target, reg);
1289 len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1290 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1291 len += (IS_ENCLOSE_RECURSION(node)
1292 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1293 else
1294 len += (IS_ENCLOSE_RECURSION(node)
1295 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1296
1297 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1298 if (r) return r;
1299 }
1300 #endif
1301 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1302 r = add_opcode(reg, OP_MEMORY_START_PUSH);
1303 else
1304 r = add_opcode(reg, OP_MEMORY_START);
1305 if (r) return r;
1306 r = add_mem_num(reg, node->regnum);
1307 if (r) return r;
1308 r = compile_tree(node->target, reg);
1309 if (r) return r;
1310 #ifdef USE_SUBEXP_CALL
1311 if (IS_ENCLOSE_CALLED(node)) {
1312 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1313 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1314 ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1315 else
1316 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1317 ? OP_MEMORY_END_REC : OP_MEMORY_END));
1318
1319 if (r) return r;
1320 r = add_mem_num(reg, node->regnum);
1321 if (r) return r;
1322 r = add_opcode(reg, OP_RETURN);
1323 }
1324 else
1325 #endif
1326 {
1327 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1328 r = add_opcode(reg, OP_MEMORY_END_PUSH);
1329 else
1330 r = add_opcode(reg, OP_MEMORY_END);
1331 if (r) return r;
1332 r = add_mem_num(reg, node->regnum);
1333 }
1334 break;
1335
1336 case ENCLOSE_STOP_BACKTRACK:
1337 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1338 QtfrNode* qn = NQTFR(node->target);
1339 r = compile_tree_n_times(qn->target, qn->lower, reg);
1340 if (r) return r;
1341
1342 len = compile_length_tree(qn->target, reg);
1343 if (len < 0) return len;
1344
1345 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1346 if (r) return r;
1347 r = compile_tree(qn->target, reg);
1348 if (r) return r;
1349 r = add_opcode(reg, OP_POP);
1350 if (r) return r;
1351 r = add_opcode_rel_addr(reg, OP_JUMP,
1352 -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1353 }
1354 else {
1355 r = add_opcode(reg, OP_PUSH_STOP_BT);
1356 if (r) return r;
1357 r = compile_tree(node->target, reg);
1358 if (r) return r;
1359 r = add_opcode(reg, OP_POP_STOP_BT);
1360 }
1361 break;
1362
1363 default:
1364 return ONIGERR_TYPE_BUG;
1365 break;
1366 }
1367
1368 return r;
1369 }
1370
1371 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1372 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1373 {
1374 int len;
1375 int tlen = 0;
1376
1377 if (node->target) {
1378 tlen = compile_length_tree(node->target, reg);
1379 if (tlen < 0) return tlen;
1380 }
1381
1382 switch (node->type) {
1383 case ANCHOR_PREC_READ:
1384 len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1385 break;
1386 case ANCHOR_PREC_READ_NOT:
1387 len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1388 break;
1389 case ANCHOR_LOOK_BEHIND:
1390 len = SIZE_OP_LOOK_BEHIND + tlen;
1391 break;
1392 case ANCHOR_LOOK_BEHIND_NOT:
1393 len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1394 break;
1395
1396 default:
1397 len = SIZE_OPCODE;
1398 break;
1399 }
1400
1401 return len;
1402 }
1403
1404 static int
compile_anchor_node(AnchorNode * node,regex_t * reg)1405 compile_anchor_node(AnchorNode* node, regex_t* reg)
1406 {
1407 int r, len;
1408
1409 switch (node->type) {
1410 case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
1411 case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
1412 case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
1413 case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
1414 case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
1415 case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1416
1417 case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break;
1418 case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break;
1419 #ifdef USE_WORD_BEGIN_END
1420 case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break;
1421 case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break;
1422 #endif
1423
1424 case ANCHOR_PREC_READ:
1425 r = add_opcode(reg, OP_PUSH_POS);
1426 if (r) return r;
1427 r = compile_tree(node->target, reg);
1428 if (r) return r;
1429 r = add_opcode(reg, OP_POP_POS);
1430 break;
1431
1432 case ANCHOR_PREC_READ_NOT:
1433 len = compile_length_tree(node->target, reg);
1434 if (len < 0) return len;
1435 r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1436 if (r) return r;
1437 r = compile_tree(node->target, reg);
1438 if (r) return r;
1439 r = add_opcode(reg, OP_FAIL_POS);
1440 break;
1441
1442 case ANCHOR_LOOK_BEHIND:
1443 {
1444 int n;
1445 r = add_opcode(reg, OP_LOOK_BEHIND);
1446 if (r) return r;
1447 if (node->char_len < 0) {
1448 r = get_char_length_tree(node->target, reg, &n);
1449 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1450 }
1451 else
1452 n = node->char_len;
1453 r = add_length(reg, n);
1454 if (r) return r;
1455 r = compile_tree(node->target, reg);
1456 }
1457 break;
1458
1459 case ANCHOR_LOOK_BEHIND_NOT:
1460 {
1461 int n;
1462 len = compile_length_tree(node->target, reg);
1463 r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1464 len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1465 if (r) return r;
1466 if (node->char_len < 0) {
1467 r = get_char_length_tree(node->target, reg, &n);
1468 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1469 }
1470 else
1471 n = node->char_len;
1472 r = add_length(reg, n);
1473 if (r) return r;
1474 r = compile_tree(node->target, reg);
1475 if (r) return r;
1476 r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1477 }
1478 break;
1479
1480 default:
1481 return ONIGERR_TYPE_BUG;
1482 break;
1483 }
1484
1485 return r;
1486 }
1487
1488 static int
compile_length_tree(Node * node,regex_t * reg)1489 compile_length_tree(Node* node, regex_t* reg)
1490 {
1491 int len, type, r;
1492
1493 type = NTYPE(node);
1494 switch (type) {
1495 case NT_LIST:
1496 len = 0;
1497 do {
1498 r = compile_length_tree(NCAR(node), reg);
1499 if (r < 0) return r;
1500 len += r;
1501 } while (IS_NOT_NULL(node = NCDR(node)));
1502 r = len;
1503 break;
1504
1505 case NT_ALT:
1506 {
1507 int n;
1508
1509 n = r = 0;
1510 do {
1511 r += compile_length_tree(NCAR(node), reg);
1512 n++;
1513 } while (IS_NOT_NULL(node = NCDR(node)));
1514 r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1515 }
1516 break;
1517
1518 case NT_STR:
1519 if (NSTRING_IS_RAW(node))
1520 r = compile_length_string_raw_node(NSTR(node), reg);
1521 else
1522 r = compile_length_string_node(node, reg);
1523 break;
1524
1525 case NT_CCLASS:
1526 r = compile_length_cclass_node(NCCLASS(node), reg);
1527 break;
1528
1529 case NT_CTYPE:
1530 case NT_CANY:
1531 r = SIZE_OPCODE;
1532 break;
1533
1534 case NT_BREF:
1535 {
1536 BRefNode* br = NBREF(node);
1537
1538 #ifdef USE_BACKREF_WITH_LEVEL
1539 if (IS_BACKREF_NEST_LEVEL(br)) {
1540 r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1541 SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1542 }
1543 else
1544 #endif
1545 if (br->back_num == 1) {
1546 r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1547 ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1548 }
1549 else {
1550 r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1551 }
1552 }
1553 break;
1554
1555 #ifdef USE_SUBEXP_CALL
1556 case NT_CALL:
1557 r = SIZE_OP_CALL;
1558 break;
1559 #endif
1560
1561 case NT_QTFR:
1562 r = compile_length_quantifier_node(NQTFR(node), reg);
1563 break;
1564
1565 case NT_ENCLOSE:
1566 r = compile_length_enclose_node(NENCLOSE(node), reg);
1567 break;
1568
1569 case NT_ANCHOR:
1570 r = compile_length_anchor_node(NANCHOR(node), reg);
1571 break;
1572
1573 default:
1574 return ONIGERR_TYPE_BUG;
1575 break;
1576 }
1577
1578 return r;
1579 }
1580
1581 static int
compile_tree(Node * node,regex_t * reg)1582 compile_tree(Node* node, regex_t* reg)
1583 {
1584 int n, type, len, pos, r = 0;
1585
1586 type = NTYPE(node);
1587 switch (type) {
1588 case NT_LIST:
1589 do {
1590 r = compile_tree(NCAR(node), reg);
1591 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1592 break;
1593
1594 case NT_ALT:
1595 {
1596 Node* x = node;
1597 len = 0;
1598 do {
1599 len += compile_length_tree(NCAR(x), reg);
1600 if (NCDR(x) != NULL) {
1601 len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1602 }
1603 } while (IS_NOT_NULL(x = NCDR(x)));
1604 pos = reg->used + len; /* goal position */
1605
1606 do {
1607 len = compile_length_tree(NCAR(node), reg);
1608 if (IS_NOT_NULL(NCDR(node))) {
1609 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1610 if (r) break;
1611 }
1612 r = compile_tree(NCAR(node), reg);
1613 if (r) break;
1614 if (IS_NOT_NULL(NCDR(node))) {
1615 len = pos - (reg->used + SIZE_OP_JUMP);
1616 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1617 if (r) break;
1618 }
1619 } while (IS_NOT_NULL(node = NCDR(node)));
1620 }
1621 break;
1622
1623 case NT_STR:
1624 if (NSTRING_IS_RAW(node))
1625 r = compile_string_raw_node(NSTR(node), reg);
1626 else
1627 r = compile_string_node(node, reg);
1628 break;
1629
1630 case NT_CCLASS:
1631 r = compile_cclass_node(NCCLASS(node), reg);
1632 break;
1633
1634 case NT_CTYPE:
1635 {
1636 int op;
1637
1638 switch (NCTYPE(node)->ctype) {
1639 case ONIGENC_CTYPE_WORD:
1640 if (NCTYPE(node)->not != 0) op = OP_NOT_WORD;
1641 else op = OP_WORD;
1642 break;
1643 default:
1644 return ONIGERR_TYPE_BUG;
1645 break;
1646 }
1647 r = add_opcode(reg, op);
1648 }
1649 break;
1650
1651 case NT_CANY:
1652 if (IS_MULTILINE(reg->options))
1653 r = add_opcode(reg, OP_ANYCHAR_ML);
1654 else
1655 r = add_opcode(reg, OP_ANYCHAR);
1656 break;
1657
1658 case NT_BREF:
1659 {
1660 BRefNode* br = NBREF(node);
1661
1662 #ifdef USE_BACKREF_WITH_LEVEL
1663 if (IS_BACKREF_NEST_LEVEL(br)) {
1664 r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
1665 if (r) return r;
1666 r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1667 if (r) return r;
1668 r = add_length(reg, br->nest_level);
1669 if (r) return r;
1670
1671 goto add_bacref_mems;
1672 }
1673 else
1674 #endif
1675 if (br->back_num == 1) {
1676 n = br->back_static[0];
1677 if (IS_IGNORECASE(reg->options)) {
1678 r = add_opcode(reg, OP_BACKREFN_IC);
1679 if (r) return r;
1680 r = add_mem_num(reg, n);
1681 }
1682 else {
1683 switch (n) {
1684 case 1: r = add_opcode(reg, OP_BACKREF1); break;
1685 case 2: r = add_opcode(reg, OP_BACKREF2); break;
1686 default:
1687 r = add_opcode(reg, OP_BACKREFN);
1688 if (r) return r;
1689 r = add_mem_num(reg, n);
1690 break;
1691 }
1692 }
1693 }
1694 else {
1695 int i;
1696 int* p;
1697
1698 if (IS_IGNORECASE(reg->options)) {
1699 r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1700 }
1701 else {
1702 r = add_opcode(reg, OP_BACKREF_MULTI);
1703 }
1704 if (r) return r;
1705
1706 #ifdef USE_BACKREF_WITH_LEVEL
1707 add_bacref_mems:
1708 #endif
1709 r = add_length(reg, br->back_num);
1710 if (r) return r;
1711 p = BACKREFS_P(br);
1712 for (i = br->back_num - 1; i >= 0; i--) {
1713 r = add_mem_num(reg, p[i]);
1714 if (r) return r;
1715 }
1716 }
1717 }
1718 break;
1719
1720 #ifdef USE_SUBEXP_CALL
1721 case NT_CALL:
1722 r = compile_call(NCALL(node), reg);
1723 break;
1724 #endif
1725
1726 case NT_QTFR:
1727 r = compile_quantifier_node(NQTFR(node), reg);
1728 break;
1729
1730 case NT_ENCLOSE:
1731 r = compile_enclose_node(NENCLOSE(node), reg);
1732 break;
1733
1734 case NT_ANCHOR:
1735 r = compile_anchor_node(NANCHOR(node), reg);
1736 break;
1737
1738 default:
1739 #ifdef ONIG_DEBUG
1740 fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1741 #endif
1742 break;
1743 }
1744
1745 return r;
1746 }
1747
1748 #ifdef USE_NAMED_GROUP
1749
1750 static int
noname_disable_map(Node ** plink,GroupNumRemap * map,int * counter)1751 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1752 {
1753 int r = 0;
1754 Node* node = *plink;
1755
1756 switch (NTYPE(node)) {
1757 case NT_LIST:
1758 case NT_ALT:
1759 do {
1760 r = noname_disable_map(&(NCAR(node)), map, counter);
1761 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1762 break;
1763
1764 case NT_QTFR:
1765 {
1766 Node** ptarget = &(NQTFR(node)->target);
1767 Node* old = *ptarget;
1768 r = noname_disable_map(ptarget, map, counter);
1769 if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
1770 onig_reduce_nested_quantifier(node, *ptarget);
1771 }
1772 }
1773 break;
1774
1775 case NT_ENCLOSE:
1776 {
1777 EncloseNode* en = NENCLOSE(node);
1778 if (en->type == ENCLOSE_MEMORY) {
1779 if (IS_ENCLOSE_NAMED_GROUP(en)) {
1780 (*counter)++;
1781 map[en->regnum].new_val = *counter;
1782 en->regnum = *counter;
1783 r = noname_disable_map(&(en->target), map, counter);
1784 }
1785 else {
1786 *plink = en->target;
1787 en->target = NULL_NODE;
1788 onig_node_free(node);
1789 r = noname_disable_map(plink, map, counter);
1790 }
1791 }
1792 else
1793 r = noname_disable_map(&(en->target), map, counter);
1794 }
1795 break;
1796
1797 default:
1798 break;
1799 }
1800
1801 return r;
1802 }
1803
1804 static int
renumber_node_backref(Node * node,GroupNumRemap * map)1805 renumber_node_backref(Node* node, GroupNumRemap* map)
1806 {
1807 int i, pos, n, old_num;
1808 int *backs;
1809 BRefNode* bn = NBREF(node);
1810
1811 if (! IS_BACKREF_NAME_REF(bn))
1812 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1813
1814 old_num = bn->back_num;
1815 if (IS_NULL(bn->back_dynamic))
1816 backs = bn->back_static;
1817 else
1818 backs = bn->back_dynamic;
1819
1820 for (i = 0, pos = 0; i < old_num; i++) {
1821 n = map[backs[i]].new_val;
1822 if (n > 0) {
1823 backs[pos] = n;
1824 pos++;
1825 }
1826 }
1827
1828 bn->back_num = pos;
1829 return 0;
1830 }
1831
1832 static int
renumber_by_map(Node * node,GroupNumRemap * map)1833 renumber_by_map(Node* node, GroupNumRemap* map)
1834 {
1835 int r = 0;
1836
1837 switch (NTYPE(node)) {
1838 case NT_LIST:
1839 case NT_ALT:
1840 do {
1841 r = renumber_by_map(NCAR(node), map);
1842 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1843 break;
1844 case NT_QTFR:
1845 r = renumber_by_map(NQTFR(node)->target, map);
1846 break;
1847 case NT_ENCLOSE:
1848 r = renumber_by_map(NENCLOSE(node)->target, map);
1849 break;
1850
1851 case NT_BREF:
1852 r = renumber_node_backref(node, map);
1853 break;
1854
1855 default:
1856 break;
1857 }
1858
1859 return r;
1860 }
1861
1862 static int
numbered_ref_check(Node * node)1863 numbered_ref_check(Node* node)
1864 {
1865 int r = 0;
1866
1867 switch (NTYPE(node)) {
1868 case NT_LIST:
1869 case NT_ALT:
1870 do {
1871 r = numbered_ref_check(NCAR(node));
1872 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1873 break;
1874 case NT_QTFR:
1875 r = numbered_ref_check(NQTFR(node)->target);
1876 break;
1877 case NT_ENCLOSE:
1878 r = numbered_ref_check(NENCLOSE(node)->target);
1879 break;
1880
1881 case NT_BREF:
1882 if (! IS_BACKREF_NAME_REF(NBREF(node)))
1883 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1884 break;
1885
1886 default:
1887 break;
1888 }
1889
1890 return r;
1891 }
1892
1893 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ScanEnv * env)1894 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
1895 {
1896 int r, i, pos, counter;
1897 BitStatusType loc;
1898 GroupNumRemap* map;
1899
1900 map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
1901 CHECK_NULL_RETURN_MEMERR(map);
1902 for (i = 1; i <= env->num_mem; i++) {
1903 map[i].new_val = 0;
1904 }
1905 counter = 0;
1906 r = noname_disable_map(root, map, &counter);
1907 if (r != 0) return r;
1908
1909 r = renumber_by_map(*root, map);
1910 if (r != 0) return r;
1911
1912 for (i = 1, pos = 1; i <= env->num_mem; i++) {
1913 if (map[i].new_val > 0) {
1914 SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
1915 pos++;
1916 }
1917 }
1918
1919 loc = env->capture_history;
1920 BIT_STATUS_CLEAR(env->capture_history);
1921 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
1922 if (BIT_STATUS_AT(loc, i)) {
1923 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
1924 }
1925 }
1926
1927 env->num_mem = env->num_named;
1928 reg->num_mem = env->num_named;
1929
1930 return onig_renumber_name_table(reg, map);
1931 }
1932 #endif /* USE_NAMED_GROUP */
1933
1934 #ifdef USE_SUBEXP_CALL
1935 static int
unset_addr_list_fix(UnsetAddrList * uslist,regex_t * reg)1936 unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
1937 {
1938 int i, offset;
1939 EncloseNode* en;
1940 AbsAddrType addr;
1941
1942 for (i = 0; i < uslist->num; i++) {
1943 en = NENCLOSE(uslist->us[i].target);
1944 if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
1945 addr = en->call_addr;
1946 offset = uslist->us[i].offset;
1947
1948 BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
1949 }
1950 return 0;
1951 }
1952 #endif
1953
1954 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
1955 static int
quantifiers_memory_node_info(Node * node)1956 quantifiers_memory_node_info(Node* node)
1957 {
1958 int r = 0;
1959
1960 switch (NTYPE(node)) {
1961 case NT_LIST:
1962 case NT_ALT:
1963 {
1964 int v;
1965 do {
1966 v = quantifiers_memory_node_info(NCAR(node));
1967 if (v > r) r = v;
1968 } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
1969 }
1970 break;
1971
1972 #ifdef USE_SUBEXP_CALL
1973 case NT_CALL:
1974 if (IS_CALL_RECURSION(NCALL(node))) {
1975 return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
1976 }
1977 else
1978 r = quantifiers_memory_node_info(NCALL(node)->target);
1979 break;
1980 #endif
1981
1982 case NT_QTFR:
1983 {
1984 QtfrNode* qn = NQTFR(node);
1985 if (qn->upper != 0) {
1986 r = quantifiers_memory_node_info(qn->target);
1987 }
1988 }
1989 break;
1990
1991 case NT_ENCLOSE:
1992 {
1993 EncloseNode* en = NENCLOSE(node);
1994 switch (en->type) {
1995 case ENCLOSE_MEMORY:
1996 return NQ_TARGET_IS_EMPTY_MEM;
1997 break;
1998
1999 case ENCLOSE_OPTION:
2000 case ENCLOSE_STOP_BACKTRACK:
2001 r = quantifiers_memory_node_info(en->target);
2002 break;
2003 default:
2004 break;
2005 }
2006 }
2007 break;
2008
2009 case NT_BREF:
2010 case NT_STR:
2011 case NT_CTYPE:
2012 case NT_CCLASS:
2013 case NT_CANY:
2014 case NT_ANCHOR:
2015 default:
2016 break;
2017 }
2018
2019 return r;
2020 }
2021 #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
2022
2023 static int
get_min_match_length(Node * node,OnigDistance * min,ScanEnv * env)2024 get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
2025 {
2026 OnigDistance tmin;
2027 int r = 0;
2028
2029 *min = 0;
2030 switch (NTYPE(node)) {
2031 case NT_BREF:
2032 {
2033 int i;
2034 int* backs;
2035 Node** nodes = SCANENV_MEM_NODES(env);
2036 BRefNode* br = NBREF(node);
2037 if (br->state & NST_RECURSION) break;
2038
2039 backs = BACKREFS_P(br);
2040 if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2041 r = get_min_match_length(nodes[backs[0]], min, env);
2042 if (r != 0) break;
2043 for (i = 1; i < br->back_num; i++) {
2044 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2045 r = get_min_match_length(nodes[backs[i]], &tmin, env);
2046 if (r != 0) break;
2047 if (*min > tmin) *min = tmin;
2048 }
2049 }
2050 break;
2051
2052 #ifdef USE_SUBEXP_CALL
2053 case NT_CALL:
2054 if (IS_CALL_RECURSION(NCALL(node))) {
2055 EncloseNode* en = NENCLOSE(NCALL(node)->target);
2056 if (IS_ENCLOSE_MIN_FIXED(en))
2057 *min = en->min_len;
2058 }
2059 else
2060 r = get_min_match_length(NCALL(node)->target, min, env);
2061 break;
2062 #endif
2063
2064 case NT_LIST:
2065 do {
2066 r = get_min_match_length(NCAR(node), &tmin, env);
2067 if (r == 0) *min += tmin;
2068 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2069 break;
2070
2071 case NT_ALT:
2072 {
2073 Node *x, *y;
2074 y = node;
2075 do {
2076 x = NCAR(y);
2077 r = get_min_match_length(x, &tmin, env);
2078 if (r != 0) break;
2079 if (y == node) *min = tmin;
2080 else if (*min > tmin) *min = tmin;
2081 } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
2082 }
2083 break;
2084
2085 case NT_STR:
2086 {
2087 StrNode* sn = NSTR(node);
2088 *min = sn->end - sn->s;
2089 }
2090 break;
2091
2092 case NT_CTYPE:
2093 *min = 1;
2094 break;
2095
2096 case NT_CCLASS:
2097 case NT_CANY:
2098 *min = 1;
2099 break;
2100
2101 case NT_QTFR:
2102 {
2103 QtfrNode* qn = NQTFR(node);
2104
2105 if (qn->lower > 0) {
2106 r = get_min_match_length(qn->target, min, env);
2107 if (r == 0)
2108 *min = distance_multiply(*min, qn->lower);
2109 }
2110 }
2111 break;
2112
2113 case NT_ENCLOSE:
2114 {
2115 EncloseNode* en = NENCLOSE(node);
2116 switch (en->type) {
2117 case ENCLOSE_MEMORY:
2118 #ifdef USE_SUBEXP_CALL
2119 if (IS_ENCLOSE_MIN_FIXED(en))
2120 *min = en->min_len;
2121 else {
2122 r = get_min_match_length(en->target, min, env);
2123 if (r == 0) {
2124 en->min_len = *min;
2125 SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
2126 }
2127 }
2128 break;
2129 #endif
2130 case ENCLOSE_OPTION:
2131 case ENCLOSE_STOP_BACKTRACK:
2132 r = get_min_match_length(en->target, min, env);
2133 break;
2134 }
2135 }
2136 break;
2137
2138 case NT_ANCHOR:
2139 default:
2140 break;
2141 }
2142
2143 return r;
2144 }
2145
2146 static int
get_max_match_length(Node * node,OnigDistance * max,ScanEnv * env)2147 get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
2148 {
2149 OnigDistance tmax;
2150 int r = 0;
2151
2152 *max = 0;
2153 switch (NTYPE(node)) {
2154 case NT_LIST:
2155 do {
2156 r = get_max_match_length(NCAR(node), &tmax, env);
2157 if (r == 0)
2158 *max = distance_add(*max, tmax);
2159 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2160 break;
2161
2162 case NT_ALT:
2163 do {
2164 r = get_max_match_length(NCAR(node), &tmax, env);
2165 if (r == 0 && *max < tmax) *max = tmax;
2166 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2167 break;
2168
2169 case NT_STR:
2170 {
2171 StrNode* sn = NSTR(node);
2172 *max = sn->end - sn->s;
2173 }
2174 break;
2175
2176 case NT_CTYPE:
2177 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2178 break;
2179
2180 case NT_CCLASS:
2181 case NT_CANY:
2182 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2183 break;
2184
2185 case NT_BREF:
2186 {
2187 int i;
2188 int* backs;
2189 Node** nodes = SCANENV_MEM_NODES(env);
2190 BRefNode* br = NBREF(node);
2191 if (br->state & NST_RECURSION) {
2192 *max = ONIG_INFINITE_DISTANCE;
2193 break;
2194 }
2195 backs = BACKREFS_P(br);
2196 for (i = 0; i < br->back_num; i++) {
2197 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2198 r = get_max_match_length(nodes[backs[i]], &tmax, env);
2199 if (r != 0) break;
2200 if (*max < tmax) *max = tmax;
2201 }
2202 }
2203 break;
2204
2205 #ifdef USE_SUBEXP_CALL
2206 case NT_CALL:
2207 if (! IS_CALL_RECURSION(NCALL(node)))
2208 r = get_max_match_length(NCALL(node)->target, max, env);
2209 else
2210 *max = ONIG_INFINITE_DISTANCE;
2211 break;
2212 #endif
2213
2214 case NT_QTFR:
2215 {
2216 QtfrNode* qn = NQTFR(node);
2217
2218 if (qn->upper != 0) {
2219 r = get_max_match_length(qn->target, max, env);
2220 if (r == 0 && *max != 0) {
2221 if (! IS_REPEAT_INFINITE(qn->upper))
2222 *max = distance_multiply(*max, qn->upper);
2223 else
2224 *max = ONIG_INFINITE_DISTANCE;
2225 }
2226 }
2227 }
2228 break;
2229
2230 case NT_ENCLOSE:
2231 {
2232 EncloseNode* en = NENCLOSE(node);
2233 switch (en->type) {
2234 case ENCLOSE_MEMORY:
2235 #ifdef USE_SUBEXP_CALL
2236 if (IS_ENCLOSE_MAX_FIXED(en))
2237 *max = en->max_len;
2238 else {
2239 r = get_max_match_length(en->target, max, env);
2240 if (r == 0) {
2241 en->max_len = *max;
2242 SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
2243 }
2244 }
2245 break;
2246 #endif
2247 case ENCLOSE_OPTION:
2248 case ENCLOSE_STOP_BACKTRACK:
2249 r = get_max_match_length(en->target, max, env);
2250 break;
2251 }
2252 }
2253 break;
2254
2255 case NT_ANCHOR:
2256 default:
2257 break;
2258 }
2259
2260 return r;
2261 }
2262
2263 #define GET_CHAR_LEN_VARLEN -1
2264 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2265
2266 /* fixed size pattern node only */
2267 static int
get_char_length_tree1(Node * node,regex_t * reg,int * len,int level)2268 get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2269 {
2270 int tlen;
2271 int r = 0;
2272
2273 level++;
2274 *len = 0;
2275 switch (NTYPE(node)) {
2276 case NT_LIST:
2277 do {
2278 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2279 if (r == 0)
2280 *len = distance_add(*len, tlen);
2281 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2282 break;
2283
2284 case NT_ALT:
2285 {
2286 int tlen2;
2287 int varlen = 0;
2288
2289 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2290 while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
2291 r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
2292 if (r == 0) {
2293 if (tlen != tlen2)
2294 varlen = 1;
2295 }
2296 }
2297 if (r == 0) {
2298 if (varlen != 0) {
2299 if (level == 1)
2300 r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2301 else
2302 r = GET_CHAR_LEN_VARLEN;
2303 }
2304 else
2305 *len = tlen;
2306 }
2307 }
2308 break;
2309
2310 case NT_STR:
2311 {
2312 StrNode* sn = NSTR(node);
2313 UChar *s = sn->s;
2314 while (s < sn->end) {
2315 s += enclen(reg->enc, s);
2316 (*len)++;
2317 }
2318 }
2319 break;
2320
2321 case NT_QTFR:
2322 {
2323 QtfrNode* qn = NQTFR(node);
2324 if (qn->lower == qn->upper) {
2325 r = get_char_length_tree1(qn->target, reg, &tlen, level);
2326 if (r == 0)
2327 *len = distance_multiply(tlen, qn->lower);
2328 }
2329 else
2330 r = GET_CHAR_LEN_VARLEN;
2331 }
2332 break;
2333
2334 #ifdef USE_SUBEXP_CALL
2335 case NT_CALL:
2336 if (! IS_CALL_RECURSION(NCALL(node)))
2337 r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
2338 else
2339 r = GET_CHAR_LEN_VARLEN;
2340 break;
2341 #endif
2342
2343 case NT_CTYPE:
2344 *len = 1;
2345 break;
2346
2347 case NT_CCLASS:
2348 case NT_CANY:
2349 *len = 1;
2350 break;
2351
2352 case NT_ENCLOSE:
2353 {
2354 EncloseNode* en = NENCLOSE(node);
2355 switch (en->type) {
2356 case ENCLOSE_MEMORY:
2357 #ifdef USE_SUBEXP_CALL
2358 if (IS_ENCLOSE_CLEN_FIXED(en))
2359 *len = en->char_len;
2360 else {
2361 r = get_char_length_tree1(en->target, reg, len, level);
2362 if (r == 0) {
2363 en->char_len = *len;
2364 SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
2365 }
2366 }
2367 break;
2368 #endif
2369 case ENCLOSE_OPTION:
2370 case ENCLOSE_STOP_BACKTRACK:
2371 r = get_char_length_tree1(en->target, reg, len, level);
2372 break;
2373 default:
2374 break;
2375 }
2376 }
2377 break;
2378
2379 case NT_ANCHOR:
2380 break;
2381
2382 default:
2383 r = GET_CHAR_LEN_VARLEN;
2384 break;
2385 }
2386
2387 return r;
2388 }
2389
2390 static int
get_char_length_tree(Node * node,regex_t * reg,int * len)2391 get_char_length_tree(Node* node, regex_t* reg, int* len)
2392 {
2393 return get_char_length_tree1(node, reg, len, 0);
2394 }
2395
2396 /* x is not included y ==> 1 : 0 */
2397 static int
is_not_included(Node * x,Node * y,regex_t * reg)2398 is_not_included(Node* x, Node* y, regex_t* reg)
2399 {
2400 int i, len;
2401 OnigCodePoint code;
2402 UChar *p, c;
2403 int ytype;
2404
2405 retry:
2406 ytype = NTYPE(y);
2407 switch (NTYPE(x)) {
2408 case NT_CTYPE:
2409 {
2410 switch (ytype) {
2411 case NT_CTYPE:
2412 if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
2413 NCTYPE(y)->not != NCTYPE(x)->not)
2414 return 1;
2415 else
2416 return 0;
2417 break;
2418
2419 case NT_CCLASS:
2420 swap:
2421 {
2422 Node* tmp;
2423 tmp = x; x = y; y = tmp;
2424 goto retry;
2425 }
2426 break;
2427
2428 case NT_STR:
2429 goto swap;
2430 break;
2431
2432 default:
2433 break;
2434 }
2435 }
2436 break;
2437
2438 case NT_CCLASS:
2439 {
2440 CClassNode* xc = NCCLASS(x);
2441 switch (ytype) {
2442 case NT_CTYPE:
2443 switch (NCTYPE(y)->ctype) {
2444 case ONIGENC_CTYPE_WORD:
2445 if (NCTYPE(y)->not == 0) {
2446 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2447 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2448 if (BITSET_AT(xc->bs, i)) {
2449 if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
2450 }
2451 }
2452 return 1;
2453 }
2454 return 0;
2455 }
2456 else {
2457 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2458 if (! IS_CODE_SB_WORD(reg->enc, i)) {
2459 if (!IS_NCCLASS_NOT(xc)) {
2460 if (BITSET_AT(xc->bs, i))
2461 return 0;
2462 }
2463 else {
2464 if (! BITSET_AT(xc->bs, i))
2465 return 0;
2466 }
2467 }
2468 }
2469 return 1;
2470 }
2471 break;
2472
2473 default:
2474 break;
2475 }
2476 break;
2477
2478 case NT_CCLASS:
2479 {
2480 int v;
2481 CClassNode* yc = NCCLASS(y);
2482
2483 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2484 v = BITSET_AT(xc->bs, i);
2485 if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
2486 (v == 0 && IS_NCCLASS_NOT(xc))) {
2487 v = BITSET_AT(yc->bs, i);
2488 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2489 (v == 0 && IS_NCCLASS_NOT(yc)))
2490 return 0;
2491 }
2492 }
2493 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2494 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2495 return 1;
2496 return 0;
2497 }
2498 break;
2499
2500 case NT_STR:
2501 goto swap;
2502 break;
2503
2504 default:
2505 break;
2506 }
2507 }
2508 break;
2509
2510 case NT_STR:
2511 {
2512 StrNode* xs = NSTR(x);
2513 if (NSTRING_LEN(x) == 0)
2514 break;
2515
2516 c = *(xs->s);
2517 switch (ytype) {
2518 case NT_CTYPE:
2519 switch (NCTYPE(y)->ctype) {
2520 case ONIGENC_CTYPE_WORD:
2521 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2522 return NCTYPE(y)->not;
2523 else
2524 return !(NCTYPE(y)->not);
2525 break;
2526 default:
2527 break;
2528 }
2529 break;
2530
2531 case NT_CCLASS:
2532 {
2533 CClassNode* cc = NCCLASS(y);
2534
2535 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2536 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2537 return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2538 }
2539 break;
2540
2541 case NT_STR:
2542 {
2543 UChar *q;
2544 StrNode* ys = NSTR(y);
2545 len = NSTRING_LEN(x);
2546 if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2547 if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2548 /* tiny version */
2549 return 0;
2550 }
2551 else {
2552 for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2553 if (*p != *q) return 1;
2554 }
2555 }
2556 }
2557 break;
2558
2559 default:
2560 break;
2561 }
2562 }
2563 break;
2564
2565 default:
2566 break;
2567 }
2568
2569 return 0;
2570 }
2571
2572 static Node*
get_head_value_node(Node * node,int exact,regex_t * reg)2573 get_head_value_node(Node* node, int exact, regex_t* reg)
2574 {
2575 Node* n = NULL_NODE;
2576
2577 switch (NTYPE(node)) {
2578 case NT_BREF:
2579 case NT_ALT:
2580 case NT_CANY:
2581 #ifdef USE_SUBEXP_CALL
2582 case NT_CALL:
2583 #endif
2584 break;
2585
2586 case NT_CTYPE:
2587 case NT_CCLASS:
2588 if (exact == 0) {
2589 n = node;
2590 }
2591 break;
2592
2593 case NT_LIST:
2594 n = get_head_value_node(NCAR(node), exact, reg);
2595 break;
2596
2597 case NT_STR:
2598 {
2599 StrNode* sn = NSTR(node);
2600
2601 if (sn->end <= sn->s)
2602 break;
2603
2604 if (exact != 0 &&
2605 !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
2606 }
2607 else {
2608 n = node;
2609 }
2610 }
2611 break;
2612
2613 case NT_QTFR:
2614 {
2615 QtfrNode* qn = NQTFR(node);
2616 if (qn->lower > 0) {
2617 if (IS_NOT_NULL(qn->head_exact))
2618 n = qn->head_exact;
2619 else
2620 n = get_head_value_node(qn->target, exact, reg);
2621 }
2622 }
2623 break;
2624
2625 case NT_ENCLOSE:
2626 {
2627 EncloseNode* en = NENCLOSE(node);
2628 switch (en->type) {
2629 case ENCLOSE_OPTION:
2630 {
2631 OnigOptionType options = reg->options;
2632
2633 reg->options = NENCLOSE(node)->option;
2634 n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
2635 reg->options = options;
2636 }
2637 break;
2638
2639 case ENCLOSE_MEMORY:
2640 case ENCLOSE_STOP_BACKTRACK:
2641 n = get_head_value_node(en->target, exact, reg);
2642 break;
2643 }
2644 }
2645 break;
2646
2647 case NT_ANCHOR:
2648 if (NANCHOR(node)->type == ANCHOR_PREC_READ)
2649 n = get_head_value_node(NANCHOR(node)->target, exact, reg);
2650 break;
2651
2652 default:
2653 break;
2654 }
2655
2656 return n;
2657 }
2658
2659 static int
check_type_tree(Node * node,int type_mask,int enclose_mask,int anchor_mask)2660 check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
2661 {
2662 int type, r = 0;
2663
2664 type = NTYPE(node);
2665 if ((NTYPE2BIT(type) & type_mask) == 0)
2666 return 1;
2667
2668 switch (type) {
2669 case NT_LIST:
2670 case NT_ALT:
2671 do {
2672 r = check_type_tree(NCAR(node), type_mask, enclose_mask,
2673 anchor_mask);
2674 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2675 break;
2676
2677 case NT_QTFR:
2678 r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
2679 anchor_mask);
2680 break;
2681
2682 case NT_ENCLOSE:
2683 {
2684 EncloseNode* en = NENCLOSE(node);
2685 if ((en->type & enclose_mask) == 0)
2686 return 1;
2687
2688 r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
2689 }
2690 break;
2691
2692 case NT_ANCHOR:
2693 type = NANCHOR(node)->type;
2694 if ((type & anchor_mask) == 0)
2695 return 1;
2696
2697 if (NANCHOR(node)->target)
2698 r = check_type_tree(NANCHOR(node)->target,
2699 type_mask, enclose_mask, anchor_mask);
2700 break;
2701
2702 default:
2703 break;
2704 }
2705 return r;
2706 }
2707
2708 #ifdef USE_SUBEXP_CALL
2709
2710 #define RECURSION_EXIST 1
2711 #define RECURSION_INFINITE 2
2712
2713 static int
subexp_inf_recursive_check(Node * node,ScanEnv * env,int head)2714 subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2715 {
2716 int type;
2717 int r = 0;
2718
2719 type = NTYPE(node);
2720 switch (type) {
2721 case NT_LIST:
2722 {
2723 Node *x;
2724 OnigDistance min;
2725 int ret;
2726
2727 x = node;
2728 do {
2729 ret = subexp_inf_recursive_check(NCAR(x), env, head);
2730 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2731 r |= ret;
2732 if (head) {
2733 ret = get_min_match_length(NCAR(x), &min, env);
2734 if (ret != 0) return ret;
2735 if (min != 0) head = 0;
2736 }
2737 } while (IS_NOT_NULL(x = NCDR(x)));
2738 }
2739 break;
2740
2741 case NT_ALT:
2742 {
2743 int ret;
2744 r = RECURSION_EXIST;
2745 do {
2746 ret = subexp_inf_recursive_check(NCAR(node), env, head);
2747 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2748 r &= ret;
2749 } while (IS_NOT_NULL(node = NCDR(node)));
2750 }
2751 break;
2752
2753 case NT_QTFR:
2754 r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
2755 if (r == RECURSION_EXIST) {
2756 if (NQTFR(node)->lower == 0) r = 0;
2757 }
2758 break;
2759
2760 case NT_ANCHOR:
2761 {
2762 AnchorNode* an = NANCHOR(node);
2763 switch (an->type) {
2764 case ANCHOR_PREC_READ:
2765 case ANCHOR_PREC_READ_NOT:
2766 case ANCHOR_LOOK_BEHIND:
2767 case ANCHOR_LOOK_BEHIND_NOT:
2768 r = subexp_inf_recursive_check(an->target, env, head);
2769 break;
2770 }
2771 }
2772 break;
2773
2774 case NT_CALL:
2775 r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
2776 break;
2777
2778 case NT_ENCLOSE:
2779 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2780 return 0;
2781 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2782 return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2783 else {
2784 SET_ENCLOSE_STATUS(node, NST_MARK2);
2785 r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
2786 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2787 }
2788 break;
2789
2790 default:
2791 break;
2792 }
2793
2794 return r;
2795 }
2796
2797 static int
subexp_inf_recursive_check_trav(Node * node,ScanEnv * env)2798 subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
2799 {
2800 int type;
2801 int r = 0;
2802
2803 type = NTYPE(node);
2804 switch (type) {
2805 case NT_LIST:
2806 case NT_ALT:
2807 do {
2808 r = subexp_inf_recursive_check_trav(NCAR(node), env);
2809 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2810 break;
2811
2812 case NT_QTFR:
2813 r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
2814 break;
2815
2816 case NT_ANCHOR:
2817 {
2818 AnchorNode* an = NANCHOR(node);
2819 switch (an->type) {
2820 case ANCHOR_PREC_READ:
2821 case ANCHOR_PREC_READ_NOT:
2822 case ANCHOR_LOOK_BEHIND:
2823 case ANCHOR_LOOK_BEHIND_NOT:
2824 r = subexp_inf_recursive_check_trav(an->target, env);
2825 break;
2826 }
2827 }
2828 break;
2829
2830 case NT_ENCLOSE:
2831 {
2832 EncloseNode* en = NENCLOSE(node);
2833
2834 if (IS_ENCLOSE_RECURSION(en)) {
2835 SET_ENCLOSE_STATUS(node, NST_MARK1);
2836 r = subexp_inf_recursive_check(en->target, env, 1);
2837 if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
2838 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2839 }
2840 r = subexp_inf_recursive_check_trav(en->target, env);
2841 }
2842
2843 break;
2844
2845 default:
2846 break;
2847 }
2848
2849 return r;
2850 }
2851
2852 static int
subexp_recursive_check(Node * node)2853 subexp_recursive_check(Node* node)
2854 {
2855 int r = 0;
2856
2857 switch (NTYPE(node)) {
2858 case NT_LIST:
2859 case NT_ALT:
2860 do {
2861 r |= subexp_recursive_check(NCAR(node));
2862 } while (IS_NOT_NULL(node = NCDR(node)));
2863 break;
2864
2865 case NT_QTFR:
2866 r = subexp_recursive_check(NQTFR(node)->target);
2867 break;
2868
2869 case NT_ANCHOR:
2870 {
2871 AnchorNode* an = NANCHOR(node);
2872 switch (an->type) {
2873 case ANCHOR_PREC_READ:
2874 case ANCHOR_PREC_READ_NOT:
2875 case ANCHOR_LOOK_BEHIND:
2876 case ANCHOR_LOOK_BEHIND_NOT:
2877 r = subexp_recursive_check(an->target);
2878 break;
2879 }
2880 }
2881 break;
2882
2883 case NT_CALL:
2884 r = subexp_recursive_check(NCALL(node)->target);
2885 if (r != 0) SET_CALL_RECURSION(node);
2886 break;
2887
2888 case NT_ENCLOSE:
2889 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2890 return 0;
2891 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2892 return 1; /* recursion */
2893 else {
2894 SET_ENCLOSE_STATUS(node, NST_MARK2);
2895 r = subexp_recursive_check(NENCLOSE(node)->target);
2896 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2897 }
2898 break;
2899
2900 default:
2901 break;
2902 }
2903
2904 return r;
2905 }
2906
2907
2908 static int
subexp_recursive_check_trav(Node * node,ScanEnv * env)2909 subexp_recursive_check_trav(Node* node, ScanEnv* env)
2910 {
2911 #define FOUND_CALLED_NODE 1
2912
2913 int type;
2914 int r = 0;
2915
2916 type = NTYPE(node);
2917 switch (type) {
2918 case NT_LIST:
2919 case NT_ALT:
2920 {
2921 int ret;
2922 do {
2923 ret = subexp_recursive_check_trav(NCAR(node), env);
2924 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
2925 else if (ret < 0) return ret;
2926 } while (IS_NOT_NULL(node = NCDR(node)));
2927 }
2928 break;
2929
2930 case NT_QTFR:
2931 r = subexp_recursive_check_trav(NQTFR(node)->target, env);
2932 if (NQTFR(node)->upper == 0) {
2933 if (r == FOUND_CALLED_NODE)
2934 NQTFR(node)->is_refered = 1;
2935 }
2936 break;
2937
2938 case NT_ANCHOR:
2939 {
2940 AnchorNode* an = NANCHOR(node);
2941 switch (an->type) {
2942 case ANCHOR_PREC_READ:
2943 case ANCHOR_PREC_READ_NOT:
2944 case ANCHOR_LOOK_BEHIND:
2945 case ANCHOR_LOOK_BEHIND_NOT:
2946 r = subexp_recursive_check_trav(an->target, env);
2947 break;
2948 }
2949 }
2950 break;
2951
2952 case NT_ENCLOSE:
2953 {
2954 EncloseNode* en = NENCLOSE(node);
2955
2956 if (! IS_ENCLOSE_RECURSION(en)) {
2957 if (IS_ENCLOSE_CALLED(en)) {
2958 SET_ENCLOSE_STATUS(node, NST_MARK1);
2959 r = subexp_recursive_check(en->target);
2960 if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
2961 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2962 }
2963 }
2964 r = subexp_recursive_check_trav(en->target, env);
2965 if (IS_ENCLOSE_CALLED(en))
2966 r |= FOUND_CALLED_NODE;
2967 }
2968 break;
2969
2970 default:
2971 break;
2972 }
2973
2974 return r;
2975 }
2976
2977 static int
setup_subexp_call(Node * node,ScanEnv * env)2978 setup_subexp_call(Node* node, ScanEnv* env)
2979 {
2980 int type;
2981 int r = 0;
2982
2983 type = NTYPE(node);
2984 switch (type) {
2985 case NT_LIST:
2986 do {
2987 r = setup_subexp_call(NCAR(node), env);
2988 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2989 break;
2990
2991 case NT_ALT:
2992 do {
2993 r = setup_subexp_call(NCAR(node), env);
2994 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2995 break;
2996
2997 case NT_QTFR:
2998 r = setup_subexp_call(NQTFR(node)->target, env);
2999 break;
3000 case NT_ENCLOSE:
3001 r = setup_subexp_call(NENCLOSE(node)->target, env);
3002 break;
3003
3004 case NT_CALL:
3005 {
3006 CallNode* cn = NCALL(node);
3007 Node** nodes = SCANENV_MEM_NODES(env);
3008
3009 if (cn->group_num != 0) {
3010 int gnum = cn->group_num;
3011
3012 #ifdef USE_NAMED_GROUP
3013 if (env->num_named > 0 &&
3014 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3015 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3016 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3017 }
3018 #endif
3019 if (gnum > env->num_mem) {
3020 onig_scan_env_set_error_string(env,
3021 ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3022 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3023 }
3024
3025 #ifdef USE_NAMED_GROUP
3026 set_call_attr:
3027 #endif
3028 cn->target = nodes[cn->group_num];
3029 if (IS_NULL(cn->target)) {
3030 onig_scan_env_set_error_string(env,
3031 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3032 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3033 }
3034 SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
3035 BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
3036 cn->unset_addr_list = env->unset_addr_list;
3037 }
3038 #ifdef USE_NAMED_GROUP
3039 else {
3040 int *refs;
3041
3042 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
3043 &refs);
3044 if (n <= 0) {
3045 onig_scan_env_set_error_string(env,
3046 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3047 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3048 }
3049 else if (n > 1) {
3050 onig_scan_env_set_error_string(env,
3051 ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3052 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3053 }
3054 else {
3055 cn->group_num = refs[0];
3056 goto set_call_attr;
3057 }
3058 }
3059 #endif
3060 }
3061 break;
3062
3063 case NT_ANCHOR:
3064 {
3065 AnchorNode* an = NANCHOR(node);
3066
3067 switch (an->type) {
3068 case ANCHOR_PREC_READ:
3069 case ANCHOR_PREC_READ_NOT:
3070 case ANCHOR_LOOK_BEHIND:
3071 case ANCHOR_LOOK_BEHIND_NOT:
3072 r = setup_subexp_call(an->target, env);
3073 break;
3074 }
3075 }
3076 break;
3077
3078 default:
3079 break;
3080 }
3081
3082 return r;
3083 }
3084 #endif
3085
3086 /* divide different length alternatives in look-behind.
3087 (?<=A|B) ==> (?<=A)|(?<=B)
3088 (?<!A|B) ==> (?<!A)(?<!B)
3089 */
3090 static int
divide_look_behind_alternatives(Node * node)3091 divide_look_behind_alternatives(Node* node)
3092 {
3093 Node *head, *np, *insert_node;
3094 AnchorNode* an = NANCHOR(node);
3095 int anc_type = an->type;
3096
3097 head = an->target;
3098 np = NCAR(head);
3099 swap_node(node, head);
3100 NCAR(node) = head;
3101 NANCHOR(head)->target = np;
3102
3103 np = node;
3104 while ((np = NCDR(np)) != NULL_NODE) {
3105 insert_node = onig_node_new_anchor(anc_type);
3106 CHECK_NULL_RETURN_MEMERR(insert_node);
3107 NANCHOR(insert_node)->target = NCAR(np);
3108 NCAR(np) = insert_node;
3109 }
3110
3111 if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3112 np = node;
3113 do {
3114 SET_NTYPE(np, NT_LIST); /* alt -> list */
3115 } while ((np = NCDR(np)) != NULL_NODE);
3116 }
3117 return 0;
3118 }
3119
3120 static int
setup_look_behind(Node * node,regex_t * reg,ScanEnv * env)3121 setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3122 {
3123 int r, len;
3124 AnchorNode* an = NANCHOR(node);
3125
3126 r = get_char_length_tree(an->target, reg, &len);
3127 if (r == 0)
3128 an->char_len = len;
3129 else if (r == GET_CHAR_LEN_VARLEN)
3130 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3131 else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3132 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3133 r = divide_look_behind_alternatives(node);
3134 else
3135 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3136 }
3137
3138 return r;
3139 }
3140
3141 static int
next_setup(Node * node,Node * next_node,regex_t * reg)3142 next_setup(Node* node, Node* next_node, regex_t* reg)
3143 {
3144 int type;
3145
3146 retry:
3147 type = NTYPE(node);
3148 if (type == NT_QTFR) {
3149 QtfrNode* qn = NQTFR(node);
3150 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3151 #ifdef USE_QTFR_PEEK_NEXT
3152 Node* n = get_head_value_node(next_node, 1, reg);
3153 /* '\0': for UTF-16BE etc... */
3154 if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
3155 qn->next_head_exact = n;
3156 }
3157 #endif
3158 /* automatic posseivation a*b ==> (?>a*)b */
3159 if (qn->lower <= 1) {
3160 int ttype = NTYPE(qn->target);
3161 if (IS_NODE_TYPE_SIMPLE(ttype)) {
3162 Node *x, *y;
3163 x = get_head_value_node(qn->target, 0, reg);
3164 if (IS_NOT_NULL(x)) {
3165 y = get_head_value_node(next_node, 0, reg);
3166 if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3167 Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
3168 CHECK_NULL_RETURN_MEMERR(en);
3169 SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3170 swap_node(node, en);
3171 NENCLOSE(node)->target = en;
3172 }
3173 }
3174 }
3175 }
3176 }
3177 }
3178 else if (type == NT_ENCLOSE) {
3179 EncloseNode* en = NENCLOSE(node);
3180 if (en->type == ENCLOSE_MEMORY) {
3181 node = en->target;
3182 goto retry;
3183 }
3184 }
3185 return 0;
3186 }
3187
3188
3189 static int
update_string_node_case_fold(regex_t * reg,Node * node)3190 update_string_node_case_fold(regex_t* reg, Node *node)
3191 {
3192 UChar *p, *q, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3193 UChar *sbuf, *ebuf, *sp;
3194 int r, i, len, sbuf_size;
3195 StrNode* sn = NSTR(node);
3196
3197 end = sn->end;
3198 sbuf_size = (end - sn->s) * 2;
3199 sbuf = (UChar* )xmalloc(sbuf_size);
3200 CHECK_NULL_RETURN_MEMERR(sbuf);
3201 ebuf = sbuf + sbuf_size;
3202
3203 sp = sbuf;
3204 p = sn->s;
3205 while (p < end) {
3206 len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3207 q = buf;
3208 for (i = 0; i < len; i++) {
3209 if (sp >= ebuf) {
3210 sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2);
3211 CHECK_NULL_RETURN_MEMERR(sbuf);
3212 sp = sbuf + sbuf_size;
3213 sbuf_size *= 2;
3214 ebuf = sbuf + sbuf_size;
3215 }
3216
3217 *sp++ = buf[i];
3218 }
3219 }
3220
3221 r = onig_node_str_set(node, sbuf, sp);
3222 if (r != 0) {
3223 xfree(sbuf);
3224 return r;
3225 }
3226
3227 xfree(sbuf);
3228 return 0;
3229 }
3230
3231 static int
expand_case_fold_make_rem_string(Node ** rnode,UChar * s,UChar * end,regex_t * reg)3232 expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
3233 regex_t* reg)
3234 {
3235 int r;
3236 Node *node;
3237
3238 node = onig_node_new_str(s, end);
3239 if (IS_NULL(node)) return ONIGERR_MEMORY;
3240
3241 r = update_string_node_case_fold(reg, node);
3242 if (r != 0) {
3243 onig_node_free(node);
3244 return r;
3245 }
3246
3247 NSTRING_SET_AMBIG(node);
3248 NSTRING_SET_DONT_GET_OPT_INFO(node);
3249 *rnode = node;
3250 return 0;
3251 }
3252
3253 static int
expand_case_fold_string_alt(int item_num,OnigCaseFoldCodeItem items[],UChar * p,int slen,UChar * end,regex_t * reg,Node ** rnode)3254 expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
3255 UChar *p, int slen, UChar *end,
3256 regex_t* reg, Node **rnode)
3257 {
3258 int r, i, j, len, varlen;
3259 Node *anode, *var_anode, *snode, *xnode, *an;
3260 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3261
3262 *rnode = var_anode = NULL_NODE;
3263
3264 varlen = 0;
3265 for (i = 0; i < item_num; i++) {
3266 if (items[i].byte_len != slen) {
3267 varlen = 1;
3268 break;
3269 }
3270 }
3271
3272 if (varlen != 0) {
3273 *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3274 if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3275
3276 xnode = onig_node_new_list(NULL, NULL);
3277 if (IS_NULL(xnode)) goto mem_err;
3278 NCAR(var_anode) = xnode;
3279
3280 anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3281 if (IS_NULL(anode)) goto mem_err;
3282 NCAR(xnode) = anode;
3283 }
3284 else {
3285 *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3286 if (IS_NULL(anode)) return ONIGERR_MEMORY;
3287 }
3288
3289 snode = onig_node_new_str(p, p + slen);
3290 if (IS_NULL(snode)) goto mem_err;
3291
3292 NCAR(anode) = snode;
3293
3294 for (i = 0; i < item_num; i++) {
3295 snode = onig_node_new_str(NULL, NULL);
3296 if (IS_NULL(snode)) goto mem_err;
3297
3298 for (j = 0; j < items[i].code_len; j++) {
3299 len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3300 if (len < 0) {
3301 r = len;
3302 goto mem_err2;
3303 }
3304
3305 r = onig_node_str_cat(snode, buf, buf + len);
3306 if (r != 0) goto mem_err2;
3307 }
3308
3309 an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3310 if (IS_NULL(an)) {
3311 goto mem_err2;
3312 }
3313
3314 if (items[i].byte_len != slen) {
3315 Node *rem;
3316 UChar *q = p + items[i].byte_len;
3317
3318 if (q < end) {
3319 r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3320 if (r != 0) {
3321 onig_node_free(an);
3322 goto mem_err2;
3323 }
3324
3325 xnode = onig_node_list_add(NULL_NODE, snode);
3326 if (IS_NULL(xnode)) {
3327 onig_node_free(an);
3328 onig_node_free(rem);
3329 goto mem_err2;
3330 }
3331 if (IS_NULL(onig_node_list_add(xnode, rem))) {
3332 onig_node_free(an);
3333 onig_node_free(xnode);
3334 onig_node_free(rem);
3335 goto mem_err;
3336 }
3337
3338 NCAR(an) = xnode;
3339 }
3340 else {
3341 NCAR(an) = snode;
3342 }
3343
3344 NCDR(var_anode) = an;
3345 var_anode = an;
3346 }
3347 else {
3348 NCAR(an) = snode;
3349 NCDR(anode) = an;
3350 anode = an;
3351 }
3352 }
3353
3354 return varlen;
3355
3356 mem_err2:
3357 onig_node_free(snode);
3358
3359 mem_err:
3360 onig_node_free(*rnode);
3361
3362 return ONIGERR_MEMORY;
3363 }
3364
3365 static int
expand_case_fold_string(Node * node,regex_t * reg)3366 expand_case_fold_string(Node* node, regex_t* reg)
3367 {
3368 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
3369
3370 int r, n, len, alt_num;
3371 UChar *start, *end, *p;
3372 Node *top_root, *root, *snode, *prev_node;
3373 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3374 StrNode* sn = NSTR(node);
3375
3376 if (NSTRING_IS_AMBIG(node)) return 0;
3377
3378 start = sn->s;
3379 end = sn->end;
3380 if (start >= end) return 0;
3381
3382 r = 0;
3383 top_root = root = prev_node = snode = NULL_NODE;
3384 alt_num = 1;
3385 p = start;
3386 while (p < end) {
3387 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3388 p, end, items);
3389 if (n < 0) {
3390 r = n;
3391 goto err;
3392 }
3393
3394 len = enclen(reg->enc, p);
3395
3396 if (n == 0) {
3397 if (IS_NULL(snode)) {
3398 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3399 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3400 if (IS_NULL(root)) {
3401 onig_node_free(prev_node);
3402 goto mem_err;
3403 }
3404 }
3405
3406 prev_node = snode = onig_node_new_str(NULL, NULL);
3407 if (IS_NULL(snode)) goto mem_err;
3408 if (IS_NOT_NULL(root)) {
3409 if (IS_NULL(onig_node_list_add(root, snode))) {
3410 onig_node_free(snode);
3411 goto mem_err;
3412 }
3413 }
3414 }
3415
3416 r = onig_node_str_cat(snode, p, p + len);
3417 if (r != 0) goto err;
3418 }
3419 else {
3420 alt_num *= (n + 1);
3421 if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3422
3423 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3424 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3425 if (IS_NULL(root)) {
3426 onig_node_free(prev_node);
3427 goto mem_err;
3428 }
3429 }
3430
3431 r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3432 if (r < 0) goto mem_err;
3433 if (r == 1) {
3434 if (IS_NULL(root)) {
3435 top_root = prev_node;
3436 }
3437 else {
3438 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3439 onig_node_free(prev_node);
3440 goto mem_err;
3441 }
3442 }
3443
3444 root = NCAR(prev_node);
3445 }
3446 else { /* r == 0 */
3447 if (IS_NOT_NULL(root)) {
3448 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3449 onig_node_free(prev_node);
3450 goto mem_err;
3451 }
3452 }
3453 }
3454
3455 snode = NULL_NODE;
3456 }
3457
3458 p += len;
3459 }
3460
3461 if (p < end) {
3462 Node *srem;
3463
3464 r = expand_case_fold_make_rem_string(&srem, p, end, reg);
3465 if (r != 0) goto mem_err;
3466
3467 if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
3468 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3469 if (IS_NULL(root)) {
3470 onig_node_free(srem);
3471 onig_node_free(prev_node);
3472 goto mem_err;
3473 }
3474 }
3475
3476 if (IS_NULL(root)) {
3477 prev_node = srem;
3478 }
3479 else {
3480 if (IS_NULL(onig_node_list_add(root, srem))) {
3481 onig_node_free(srem);
3482 goto mem_err;
3483 }
3484 }
3485 }
3486
3487 /* ending */
3488 top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
3489 swap_node(node, top_root);
3490 onig_node_free(top_root);
3491 return 0;
3492
3493 mem_err:
3494 r = ONIGERR_MEMORY;
3495
3496 err:
3497 onig_node_free(top_root);
3498 return r;
3499 }
3500
3501
3502 #ifdef USE_COMBINATION_EXPLOSION_CHECK
3503
3504 #define CEC_THRES_NUM_BIG_REPEAT 512
3505 #define CEC_INFINITE_NUM 0x7fffffff
3506
3507 #define CEC_IN_INFINITE_REPEAT (1<<0)
3508 #define CEC_IN_FINITE_REPEAT (1<<1)
3509 #define CEC_CONT_BIG_REPEAT (1<<2)
3510
3511 static int
setup_comb_exp_check(Node * node,int state,ScanEnv * env)3512 setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3513 {
3514 int type;
3515 int r = state;
3516
3517 type = NTYPE(node);
3518 switch (type) {
3519 case NT_LIST:
3520 {
3521 Node* prev = NULL_NODE;
3522 do {
3523 r = setup_comb_exp_check(NCAR(node), r, env);
3524 prev = NCAR(node);
3525 } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
3526 }
3527 break;
3528
3529 case NT_ALT:
3530 {
3531 int ret;
3532 do {
3533 ret = setup_comb_exp_check(NCAR(node), state, env);
3534 r |= ret;
3535 } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
3536 }
3537 break;
3538
3539 case NT_QTFR:
3540 {
3541 int child_state = state;
3542 int add_state = 0;
3543 QtfrNode* qn = NQTFR(node);
3544 Node* target = qn->target;
3545 int var_num;
3546
3547 if (! IS_REPEAT_INFINITE(qn->upper)) {
3548 if (qn->upper > 1) {
3549 /* {0,1}, {1,1} are allowed */
3550 child_state |= CEC_IN_FINITE_REPEAT;
3551
3552 /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3553 if (env->backrefed_mem == 0) {
3554 if (NTYPE(qn->target) == NT_ENCLOSE) {
3555 EncloseNode* en = NENCLOSE(qn->target);
3556 if (en->type == ENCLOSE_MEMORY) {
3557 if (NTYPE(en->target) == NT_QTFR) {
3558 QtfrNode* q = NQTFR(en->target);
3559 if (IS_REPEAT_INFINITE(q->upper)
3560 && q->greedy == qn->greedy) {
3561 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3562 if (qn->upper == 1)
3563 child_state = state;
3564 }
3565 }
3566 }
3567 }
3568 }
3569 }
3570 }
3571
3572 if (state & CEC_IN_FINITE_REPEAT) {
3573 qn->comb_exp_check_num = -1;
3574 }
3575 else {
3576 if (IS_REPEAT_INFINITE(qn->upper)) {
3577 var_num = CEC_INFINITE_NUM;
3578 child_state |= CEC_IN_INFINITE_REPEAT;
3579 }
3580 else {
3581 var_num = qn->upper - qn->lower;
3582 }
3583
3584 if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3585 add_state |= CEC_CONT_BIG_REPEAT;
3586
3587 if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3588 ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3589 var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3590 if (qn->comb_exp_check_num == 0) {
3591 env->num_comb_exp_check++;
3592 qn->comb_exp_check_num = env->num_comb_exp_check;
3593 if (env->curr_max_regnum > env->comb_exp_max_regnum)
3594 env->comb_exp_max_regnum = env->curr_max_regnum;
3595 }
3596 }
3597 }
3598
3599 r = setup_comb_exp_check(target, child_state, env);
3600 r |= add_state;
3601 }
3602 break;
3603
3604 case NT_ENCLOSE:
3605 {
3606 EncloseNode* en = NENCLOSE(node);
3607
3608 switch (en->type) {
3609 case ENCLOSE_MEMORY:
3610 {
3611 if (env->curr_max_regnum < en->regnum)
3612 env->curr_max_regnum = en->regnum;
3613
3614 r = setup_comb_exp_check(en->target, state, env);
3615 }
3616 break;
3617
3618 default:
3619 r = setup_comb_exp_check(en->target, state, env);
3620 break;
3621 }
3622 }
3623 break;
3624
3625 #ifdef USE_SUBEXP_CALL
3626 case NT_CALL:
3627 if (IS_CALL_RECURSION(NCALL(node)))
3628 env->has_recursion = 1;
3629 else
3630 r = setup_comb_exp_check(NCALL(node)->target, state, env);
3631 break;
3632 #endif
3633
3634 default:
3635 break;
3636 }
3637
3638 return r;
3639 }
3640 #endif
3641
3642 #define IN_ALT (1<<0)
3643 #define IN_NOT (1<<1)
3644 #define IN_REPEAT (1<<2)
3645 #define IN_VAR_REPEAT (1<<3)
3646
3647 /* setup_tree does the following work.
3648 1. check empty loop. (set qn->target_empty_info)
3649 2. expand ignore-case in char class.
3650 3. set memory status bit flags. (reg->mem_stats)
3651 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3652 5. find invalid patterns in look-behind.
3653 6. expand repeated string.
3654 */
3655 static int
setup_tree(Node * node,regex_t * reg,int state,ScanEnv * env)3656 setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3657 {
3658 int type;
3659 int r = 0;
3660
3661 type = NTYPE(node);
3662 switch (type) {
3663 case NT_LIST:
3664 {
3665 Node* prev = NULL_NODE;
3666 do {
3667 r = setup_tree(NCAR(node), reg, state, env);
3668 if (IS_NOT_NULL(prev) && r == 0) {
3669 r = next_setup(prev, NCAR(node), reg);
3670 }
3671 prev = NCAR(node);
3672 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3673 }
3674 break;
3675
3676 case NT_ALT:
3677 do {
3678 r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
3679 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3680 break;
3681
3682 case NT_CCLASS:
3683 break;
3684
3685 case NT_STR:
3686 if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3687 r = expand_case_fold_string(node, reg);
3688 }
3689 break;
3690
3691 case NT_CTYPE:
3692 case NT_CANY:
3693 break;
3694
3695 #ifdef USE_SUBEXP_CALL
3696 case NT_CALL:
3697 break;
3698 #endif
3699
3700 case NT_BREF:
3701 {
3702 int i;
3703 int* p;
3704 Node** nodes = SCANENV_MEM_NODES(env);
3705 BRefNode* br = NBREF(node);
3706 p = BACKREFS_P(br);
3707 for (i = 0; i < br->back_num; i++) {
3708 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
3709 BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3710 BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3711 #ifdef USE_BACKREF_WITH_LEVEL
3712 if (IS_BACKREF_NEST_LEVEL(br)) {
3713 BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3714 }
3715 #endif
3716 SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3717 }
3718 }
3719 break;
3720
3721 case NT_QTFR:
3722 {
3723 OnigDistance d;
3724 QtfrNode* qn = NQTFR(node);
3725 Node* target = qn->target;
3726
3727 if ((state & IN_REPEAT) != 0) {
3728 qn->state |= NST_IN_REPEAT;
3729 }
3730
3731 if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3732 r = get_min_match_length(target, &d, env);
3733 if (r) break;
3734 if (d == 0) {
3735 qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3736 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
3737 r = quantifiers_memory_node_info(target);
3738 if (r < 0) break;
3739 if (r > 0) {
3740 qn->target_empty_info = r;
3741 }
3742 #endif
3743 #if 0
3744 r = get_max_match_length(target, &d, env);
3745 if (r == 0 && d == 0) {
3746 /* ()* ==> ()?, ()+ ==> () */
3747 qn->upper = 1;
3748 if (qn->lower > 1) qn->lower = 1;
3749 if (NTYPE(target) == NT_STR) {
3750 qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
3751 }
3752 }
3753 #endif
3754 }
3755 }
3756
3757 state |= IN_REPEAT;
3758 if (qn->lower != qn->upper)
3759 state |= IN_VAR_REPEAT;
3760 r = setup_tree(target, reg, state, env);
3761 if (r) break;
3762
3763 /* expand string */
3764 #define EXPAND_STRING_MAX_LENGTH 100
3765 if (NTYPE(target) == NT_STR) {
3766 if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper &&
3767 qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3768 int len = NSTRING_LEN(target);
3769 StrNode* sn = NSTR(target);
3770
3771 if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3772 int i, n = qn->lower;
3773 onig_node_conv_to_str_node(node, NSTR(target)->flag);
3774 for (i = 0; i < n; i++) {
3775 r = onig_node_str_cat(node, sn->s, sn->end);
3776 if (r) break;
3777 }
3778 onig_node_free(target);
3779 break; /* break case NT_QTFR: */
3780 }
3781 }
3782 }
3783
3784 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
3785 if (qn->greedy && (qn->target_empty_info != 0)) {
3786 if (NTYPE(target) == NT_QTFR) {
3787 QtfrNode* tqn = NQTFR(target);
3788 if (IS_NOT_NULL(tqn->head_exact)) {
3789 qn->head_exact = tqn->head_exact;
3790 tqn->head_exact = NULL;
3791 }
3792 }
3793 else {
3794 qn->head_exact = get_head_value_node(qn->target, 1, reg);
3795 }
3796 }
3797 #endif
3798 }
3799 break;
3800
3801 case NT_ENCLOSE:
3802 {
3803 EncloseNode* en = NENCLOSE(node);
3804
3805 switch (en->type) {
3806 case ENCLOSE_OPTION:
3807 {
3808 OnigOptionType options = reg->options;
3809 reg->options = NENCLOSE(node)->option;
3810 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
3811 reg->options = options;
3812 }
3813 break;
3814
3815 case ENCLOSE_MEMORY:
3816 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) {
3817 BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
3818 /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
3819 }
3820 r = setup_tree(en->target, reg, state, env);
3821 break;
3822
3823 case ENCLOSE_STOP_BACKTRACK:
3824 {
3825 Node* target = en->target;
3826 r = setup_tree(target, reg, state, env);
3827 if (NTYPE(target) == NT_QTFR) {
3828 QtfrNode* tqn = NQTFR(target);
3829 if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
3830 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
3831 int qtype = NTYPE(tqn->target);
3832 if (IS_NODE_TYPE_SIMPLE(qtype))
3833 SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
3834 }
3835 }
3836 }
3837 break;
3838 }
3839 }
3840 break;
3841
3842 case NT_ANCHOR:
3843 {
3844 AnchorNode* an = NANCHOR(node);
3845
3846 switch (an->type) {
3847 case ANCHOR_PREC_READ:
3848 r = setup_tree(an->target, reg, state, env);
3849 break;
3850 case ANCHOR_PREC_READ_NOT:
3851 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3852 break;
3853
3854 /* allowed node types in look-behind */
3855 #define ALLOWED_TYPE_IN_LB \
3856 ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
3857 BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
3858
3859 #define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY )
3860 #define ALLOWED_ENCLOSE_IN_LB_NOT 0
3861
3862 #define ALLOWED_ANCHOR_IN_LB \
3863 ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3864 #define ALLOWED_ANCHOR_IN_LB_NOT \
3865 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3866
3867 case ANCHOR_LOOK_BEHIND:
3868 {
3869 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3870 ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
3871 if (r < 0) return r;
3872 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3873 r = setup_look_behind(node, reg, env);
3874 if (r != 0) return r;
3875 r = setup_tree(an->target, reg, state, env);
3876 }
3877 break;
3878
3879 case ANCHOR_LOOK_BEHIND_NOT:
3880 {
3881 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3882 ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
3883 if (r < 0) return r;
3884 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3885 r = setup_look_behind(node, reg, env);
3886 if (r != 0) return r;
3887 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3888 }
3889 break;
3890 }
3891 }
3892 break;
3893
3894 default:
3895 break;
3896 }
3897
3898 return r;
3899 }
3900
3901 /* set skip map for Boyer-Moor search */
3902 static int
set_bm_skip(UChar * s,UChar * end,OnigEncoding enc ARG_UNUSED,UChar skip[],int ** int_skip)3903 set_bm_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED,
3904 UChar skip[], int** int_skip)
3905 {
3906 int i, len;
3907
3908 len = end - s;
3909 if (len < ONIG_CHAR_TABLE_SIZE) {
3910 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len;
3911
3912 for (i = 0; i < len - 1; i++)
3913 skip[s[i]] = len - 1 - i;
3914 }
3915 else {
3916 if (IS_NULL(*int_skip)) {
3917 *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
3918 if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
3919 }
3920 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len;
3921
3922 for (i = 0; i < len - 1; i++)
3923 (*int_skip)[s[i]] = len - 1 - i;
3924 }
3925 return 0;
3926 }
3927
3928 #define OPT_EXACT_MAXLEN 24
3929
3930 typedef struct {
3931 OnigDistance min; /* min byte length */
3932 OnigDistance max; /* max byte length */
3933 } MinMaxLen;
3934
3935 typedef struct {
3936 MinMaxLen mmd;
3937 OnigEncoding enc;
3938 OnigOptionType options;
3939 OnigCaseFoldType case_fold_flag;
3940 ScanEnv* scan_env;
3941 } OptEnv;
3942
3943 typedef struct {
3944 int left_anchor;
3945 int right_anchor;
3946 } OptAncInfo;
3947
3948 typedef struct {
3949 MinMaxLen mmd; /* info position */
3950 OptAncInfo anc;
3951
3952 int reach_end;
3953 int ignore_case;
3954 int len;
3955 UChar s[OPT_EXACT_MAXLEN];
3956 } OptExactInfo;
3957
3958 typedef struct {
3959 MinMaxLen mmd; /* info position */
3960 OptAncInfo anc;
3961
3962 int value; /* weighted value */
3963 UChar map[ONIG_CHAR_TABLE_SIZE];
3964 } OptMapInfo;
3965
3966 typedef struct {
3967 MinMaxLen len;
3968
3969 OptAncInfo anc;
3970 OptExactInfo exb; /* boundary */
3971 OptExactInfo exm; /* middle */
3972 OptExactInfo expr; /* prec read (?=...) */
3973
3974 OptMapInfo map; /* boundary */
3975 } NodeOptInfo;
3976
3977
3978 static int
map_position_value(OnigEncoding enc,int i)3979 map_position_value(OnigEncoding enc, int i)
3980 {
3981 static const short int ByteValTable[] = {
3982 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
3983 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3984 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
3985 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
3986 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
3987 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
3988 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
3989 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
3990 };
3991
3992 if (i < (int )(sizeof(ByteValTable)/sizeof(ByteValTable[0]))) {
3993 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
3994 return 20;
3995 else
3996 return (int )ByteValTable[i];
3997 }
3998 else
3999 return 4; /* Take it easy. */
4000 }
4001
4002 static int
distance_value(MinMaxLen * mm)4003 distance_value(MinMaxLen* mm)
4004 {
4005 /* 1000 / (min-max-dist + 1) */
4006 static const short int dist_vals[] = {
4007 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
4008 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
4009 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
4010 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
4011 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
4012 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
4013 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
4014 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
4015 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
4016 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
4017 };
4018
4019 int d;
4020
4021 if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
4022
4023 d = mm->max - mm->min;
4024 if (d < (int )(sizeof(dist_vals)/sizeof(dist_vals[0])))
4025 /* return dist_vals[d] * 16 / (mm->min + 12); */
4026 return (int )dist_vals[d];
4027 else
4028 return 1;
4029 }
4030
4031 static int
comp_distance_value(MinMaxLen * d1,MinMaxLen * d2,int v1,int v2)4032 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
4033 {
4034 if (v2 <= 0) return -1;
4035 if (v1 <= 0) return 1;
4036
4037 v1 *= distance_value(d1);
4038 v2 *= distance_value(d2);
4039
4040 if (v2 > v1) return 1;
4041 if (v2 < v1) return -1;
4042
4043 if (d2->min < d1->min) return 1;
4044 if (d2->min > d1->min) return -1;
4045 return 0;
4046 }
4047
4048 static int
is_equal_mml(MinMaxLen * a,MinMaxLen * b)4049 is_equal_mml(MinMaxLen* a, MinMaxLen* b)
4050 {
4051 return (a->min == b->min && a->max == b->max) ? 1 : 0;
4052 }
4053
4054
4055 static void
set_mml(MinMaxLen * mml,OnigDistance min,OnigDistance max)4056 set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
4057 {
4058 mml->min = min;
4059 mml->max = max;
4060 }
4061
4062 static void
clear_mml(MinMaxLen * mml)4063 clear_mml(MinMaxLen* mml)
4064 {
4065 mml->min = mml->max = 0;
4066 }
4067
4068 static void
copy_mml(MinMaxLen * to,MinMaxLen * from)4069 copy_mml(MinMaxLen* to, MinMaxLen* from)
4070 {
4071 to->min = from->min;
4072 to->max = from->max;
4073 }
4074
4075 static void
add_mml(MinMaxLen * to,MinMaxLen * from)4076 add_mml(MinMaxLen* to, MinMaxLen* from)
4077 {
4078 to->min = distance_add(to->min, from->min);
4079 to->max = distance_add(to->max, from->max);
4080 }
4081
4082 #if 0
4083 static void
4084 add_len_mml(MinMaxLen* to, OnigDistance len)
4085 {
4086 to->min = distance_add(to->min, len);
4087 to->max = distance_add(to->max, len);
4088 }
4089 #endif
4090
4091 static void
alt_merge_mml(MinMaxLen * to,MinMaxLen * from)4092 alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
4093 {
4094 if (to->min > from->min) to->min = from->min;
4095 if (to->max < from->max) to->max = from->max;
4096 }
4097
4098 static void
copy_opt_env(OptEnv * to,OptEnv * from)4099 copy_opt_env(OptEnv* to, OptEnv* from)
4100 {
4101 *to = *from;
4102 }
4103
4104 static void
clear_opt_anc_info(OptAncInfo * anc)4105 clear_opt_anc_info(OptAncInfo* anc)
4106 {
4107 anc->left_anchor = 0;
4108 anc->right_anchor = 0;
4109 }
4110
4111 static void
copy_opt_anc_info(OptAncInfo * to,OptAncInfo * from)4112 copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
4113 {
4114 *to = *from;
4115 }
4116
4117 static void
concat_opt_anc_info(OptAncInfo * to,OptAncInfo * left,OptAncInfo * right,OnigDistance left_len,OnigDistance right_len)4118 concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
4119 OnigDistance left_len, OnigDistance right_len)
4120 {
4121 clear_opt_anc_info(to);
4122
4123 to->left_anchor = left->left_anchor;
4124 if (left_len == 0) {
4125 to->left_anchor |= right->left_anchor;
4126 }
4127
4128 to->right_anchor = right->right_anchor;
4129 if (right_len == 0) {
4130 to->right_anchor |= left->right_anchor;
4131 }
4132 }
4133
4134 static int
is_left_anchor(int anc)4135 is_left_anchor(int anc)
4136 {
4137 if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
4138 anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
4139 anc == ANCHOR_PREC_READ_NOT)
4140 return 0;
4141
4142 return 1;
4143 }
4144
4145 static int
is_set_opt_anc_info(OptAncInfo * to,int anc)4146 is_set_opt_anc_info(OptAncInfo* to, int anc)
4147 {
4148 if ((to->left_anchor & anc) != 0) return 1;
4149
4150 return ((to->right_anchor & anc) != 0 ? 1 : 0);
4151 }
4152
4153 static void
add_opt_anc_info(OptAncInfo * to,int anc)4154 add_opt_anc_info(OptAncInfo* to, int anc)
4155 {
4156 if (is_left_anchor(anc))
4157 to->left_anchor |= anc;
4158 else
4159 to->right_anchor |= anc;
4160 }
4161
4162 static void
remove_opt_anc_info(OptAncInfo * to,int anc)4163 remove_opt_anc_info(OptAncInfo* to, int anc)
4164 {
4165 if (is_left_anchor(anc))
4166 to->left_anchor &= ~anc;
4167 else
4168 to->right_anchor &= ~anc;
4169 }
4170
4171 static void
alt_merge_opt_anc_info(OptAncInfo * to,OptAncInfo * add)4172 alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
4173 {
4174 to->left_anchor &= add->left_anchor;
4175 to->right_anchor &= add->right_anchor;
4176 }
4177
4178 static int
is_full_opt_exact_info(OptExactInfo * ex)4179 is_full_opt_exact_info(OptExactInfo* ex)
4180 {
4181 return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
4182 }
4183
4184 static void
clear_opt_exact_info(OptExactInfo * ex)4185 clear_opt_exact_info(OptExactInfo* ex)
4186 {
4187 clear_mml(&ex->mmd);
4188 clear_opt_anc_info(&ex->anc);
4189 ex->reach_end = 0;
4190 ex->ignore_case = 0;
4191 ex->len = 0;
4192 ex->s[0] = '\0';
4193 }
4194
4195 static void
copy_opt_exact_info(OptExactInfo * to,OptExactInfo * from)4196 copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
4197 {
4198 *to = *from;
4199 }
4200
4201 static void
concat_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OnigEncoding enc)4202 concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
4203 {
4204 int i, j, len;
4205 UChar *p, *end;
4206 OptAncInfo tanc;
4207
4208 if (! to->ignore_case && add->ignore_case) {
4209 if (to->len >= add->len) return ; /* avoid */
4210
4211 to->ignore_case = 1;
4212 }
4213
4214 p = add->s;
4215 end = p + add->len;
4216 for (i = to->len; p < end; ) {
4217 len = enclen(enc, p);
4218 if (i + len > OPT_EXACT_MAXLEN) break;
4219 for (j = 0; j < len && p < end; j++)
4220 to->s[i++] = *p++;
4221 }
4222
4223 to->len = i;
4224 to->reach_end = (p == end ? add->reach_end : 0);
4225
4226 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4227 if (! to->reach_end) tanc.right_anchor = 0;
4228 copy_opt_anc_info(&to->anc, &tanc);
4229 }
4230
4231 static void
concat_opt_exact_info_str(OptExactInfo * to,UChar * s,UChar * end,int raw ARG_UNUSED,OnigEncoding enc)4232 concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
4233 int raw ARG_UNUSED, OnigEncoding enc)
4234 {
4235 int i, j, len;
4236 UChar *p;
4237
4238 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4239 len = enclen(enc, p);
4240 if (i + len > OPT_EXACT_MAXLEN) break;
4241 for (j = 0; j < len && p < end; j++)
4242 to->s[i++] = *p++;
4243 }
4244
4245 to->len = i;
4246 }
4247
4248 static void
alt_merge_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OptEnv * env)4249 alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4250 {
4251 int i, j, len;
4252
4253 if (add->len == 0 || to->len == 0) {
4254 clear_opt_exact_info(to);
4255 return ;
4256 }
4257
4258 if (! is_equal_mml(&to->mmd, &add->mmd)) {
4259 clear_opt_exact_info(to);
4260 return ;
4261 }
4262
4263 for (i = 0; i < to->len && i < add->len; ) {
4264 if (to->s[i] != add->s[i]) break;
4265 len = enclen(env->enc, to->s + i);
4266
4267 for (j = 1; j < len; j++) {
4268 if (to->s[i+j] != add->s[i+j]) break;
4269 }
4270 if (j < len) break;
4271 i += len;
4272 }
4273
4274 if (! add->reach_end || i < add->len || i < to->len) {
4275 to->reach_end = 0;
4276 }
4277 to->len = i;
4278 to->ignore_case |= add->ignore_case;
4279
4280 alt_merge_opt_anc_info(&to->anc, &add->anc);
4281 if (! to->reach_end) to->anc.right_anchor = 0;
4282 }
4283
4284 static void
select_opt_exact_info(OnigEncoding enc,OptExactInfo * now,OptExactInfo * alt)4285 select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4286 {
4287 int v1, v2;
4288
4289 v1 = now->len;
4290 v2 = alt->len;
4291
4292 if (v2 == 0) {
4293 return ;
4294 }
4295 else if (v1 == 0) {
4296 copy_opt_exact_info(now, alt);
4297 return ;
4298 }
4299 else if (v1 <= 2 && v2 <= 2) {
4300 /* ByteValTable[x] is big value --> low price */
4301 v2 = map_position_value(enc, now->s[0]);
4302 v1 = map_position_value(enc, alt->s[0]);
4303
4304 if (now->len > 1) v1 += 5;
4305 if (alt->len > 1) v2 += 5;
4306 }
4307
4308 if (now->ignore_case == 0) v1 *= 2;
4309 if (alt->ignore_case == 0) v2 *= 2;
4310
4311 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4312 copy_opt_exact_info(now, alt);
4313 }
4314
4315 static void
clear_opt_map_info(OptMapInfo * map)4316 clear_opt_map_info(OptMapInfo* map)
4317 {
4318 static const OptMapInfo clean_info = {
4319 {0, 0}, {0, 0}, 0,
4320 {
4321 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4322 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4323 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4324 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4325 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4326 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4327 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4328 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4329 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4330 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4331 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4332 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4333 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4334 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4335 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4336 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4337 }
4338 };
4339
4340 xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4341 }
4342
4343 static void
copy_opt_map_info(OptMapInfo * to,OptMapInfo * from)4344 copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4345 {
4346 *to = *from;
4347 }
4348
4349 static void
add_char_opt_map_info(OptMapInfo * map,UChar c,OnigEncoding enc)4350 add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4351 {
4352 if (map->map[c] == 0) {
4353 map->map[c] = 1;
4354 map->value += map_position_value(enc, c);
4355 }
4356 }
4357
4358 static int
add_char_amb_opt_map_info(OptMapInfo * map,UChar * p,UChar * end,OnigEncoding enc,OnigCaseFoldType case_fold_flag)4359 add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4360 OnigEncoding enc, OnigCaseFoldType case_fold_flag)
4361 {
4362 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4363 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4364 int i, n;
4365
4366 add_char_opt_map_info(map, p[0], enc);
4367
4368 case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
4369 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
4370 if (n < 0) return n;
4371
4372 for (i = 0; i < n; i++) {
4373 ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
4374 add_char_opt_map_info(map, buf[0], enc);
4375 }
4376
4377 return 0;
4378 }
4379
4380 static void
select_opt_map_info(OptMapInfo * now,OptMapInfo * alt)4381 select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4382 {
4383 static int z = 1<<15; /* 32768: something big value */
4384
4385 int v1, v2;
4386
4387 if (alt->value == 0) return ;
4388 if (now->value == 0) {
4389 copy_opt_map_info(now, alt);
4390 return ;
4391 }
4392
4393 v1 = z / now->value;
4394 v2 = z / alt->value;
4395 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4396 copy_opt_map_info(now, alt);
4397 }
4398
4399 static int
comp_opt_exact_or_map_info(OptExactInfo * e,OptMapInfo * m)4400 comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4401 {
4402 #define COMP_EM_BASE 20
4403 int ve, vm;
4404
4405 if (m->value <= 0) return -1;
4406
4407 ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2);
4408 vm = COMP_EM_BASE * 5 * 2 / m->value;
4409 return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4410 }
4411
4412 static void
alt_merge_opt_map_info(OnigEncoding enc,OptMapInfo * to,OptMapInfo * add)4413 alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4414 {
4415 int i, val;
4416
4417 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4418 if (to->value == 0) return ;
4419 if (add->value == 0 || to->mmd.max < add->mmd.min) {
4420 clear_opt_map_info(to);
4421 return ;
4422 }
4423
4424 alt_merge_mml(&to->mmd, &add->mmd);
4425
4426 val = 0;
4427 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4428 if (add->map[i])
4429 to->map[i] = 1;
4430
4431 if (to->map[i])
4432 val += map_position_value(enc, i);
4433 }
4434 to->value = val;
4435
4436 alt_merge_opt_anc_info(&to->anc, &add->anc);
4437 }
4438
4439 static void
set_bound_node_opt_info(NodeOptInfo * opt,MinMaxLen * mmd)4440 set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4441 {
4442 copy_mml(&(opt->exb.mmd), mmd);
4443 copy_mml(&(opt->expr.mmd), mmd);
4444 copy_mml(&(opt->map.mmd), mmd);
4445 }
4446
4447 static void
clear_node_opt_info(NodeOptInfo * opt)4448 clear_node_opt_info(NodeOptInfo* opt)
4449 {
4450 clear_mml(&opt->len);
4451 clear_opt_anc_info(&opt->anc);
4452 clear_opt_exact_info(&opt->exb);
4453 clear_opt_exact_info(&opt->exm);
4454 clear_opt_exact_info(&opt->expr);
4455 clear_opt_map_info(&opt->map);
4456 }
4457
4458 static void
copy_node_opt_info(NodeOptInfo * to,NodeOptInfo * from)4459 copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4460 {
4461 *to = *from;
4462 }
4463
4464 static void
concat_left_node_opt_info(OnigEncoding enc,NodeOptInfo * to,NodeOptInfo * add)4465 concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4466 {
4467 int exb_reach, exm_reach;
4468 OptAncInfo tanc;
4469
4470 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4471 copy_opt_anc_info(&to->anc, &tanc);
4472
4473 if (add->exb.len > 0 && to->len.max == 0) {
4474 concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4475 to->len.max, add->len.max);
4476 copy_opt_anc_info(&add->exb.anc, &tanc);
4477 }
4478
4479 if (add->map.value > 0 && to->len.max == 0) {
4480 if (add->map.mmd.max == 0)
4481 add->map.anc.left_anchor |= to->anc.left_anchor;
4482 }
4483
4484 exb_reach = to->exb.reach_end;
4485 exm_reach = to->exm.reach_end;
4486
4487 if (add->len.max != 0)
4488 to->exb.reach_end = to->exm.reach_end = 0;
4489
4490 if (add->exb.len > 0) {
4491 if (exb_reach) {
4492 concat_opt_exact_info(&to->exb, &add->exb, enc);
4493 clear_opt_exact_info(&add->exb);
4494 }
4495 else if (exm_reach) {
4496 concat_opt_exact_info(&to->exm, &add->exb, enc);
4497 clear_opt_exact_info(&add->exb);
4498 }
4499 }
4500 select_opt_exact_info(enc, &to->exm, &add->exb);
4501 select_opt_exact_info(enc, &to->exm, &add->exm);
4502
4503 if (to->expr.len > 0) {
4504 if (add->len.max > 0) {
4505 if (to->expr.len > (int )add->len.max)
4506 to->expr.len = add->len.max;
4507
4508 if (to->expr.mmd.max == 0)
4509 select_opt_exact_info(enc, &to->exb, &to->expr);
4510 else
4511 select_opt_exact_info(enc, &to->exm, &to->expr);
4512 }
4513 }
4514 else if (add->expr.len > 0) {
4515 copy_opt_exact_info(&to->expr, &add->expr);
4516 }
4517
4518 select_opt_map_info(&to->map, &add->map);
4519
4520 add_mml(&to->len, &add->len);
4521 }
4522
4523 static void
alt_merge_node_opt_info(NodeOptInfo * to,NodeOptInfo * add,OptEnv * env)4524 alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4525 {
4526 alt_merge_opt_anc_info (&to->anc, &add->anc);
4527 alt_merge_opt_exact_info(&to->exb, &add->exb, env);
4528 alt_merge_opt_exact_info(&to->exm, &add->exm, env);
4529 alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4530 alt_merge_opt_map_info(env->enc, &to->map, &add->map);
4531
4532 alt_merge_mml(&to->len, &add->len);
4533 }
4534
4535
4536 #define MAX_NODE_OPT_INFO_REF_COUNT 5
4537
4538 static int
optimize_node_left(Node * node,NodeOptInfo * opt,OptEnv * env)4539 optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4540 {
4541 int type;
4542 int r = 0;
4543
4544 clear_node_opt_info(opt);
4545 set_bound_node_opt_info(opt, &env->mmd);
4546
4547 type = NTYPE(node);
4548 switch (type) {
4549 case NT_LIST:
4550 {
4551 OptEnv nenv;
4552 NodeOptInfo nopt;
4553 Node* nd = node;
4554
4555 copy_opt_env(&nenv, env);
4556 do {
4557 r = optimize_node_left(NCAR(nd), &nopt, &nenv);
4558 if (r == 0) {
4559 add_mml(&nenv.mmd, &nopt.len);
4560 concat_left_node_opt_info(env->enc, opt, &nopt);
4561 }
4562 } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
4563 }
4564 break;
4565
4566 case NT_ALT:
4567 {
4568 NodeOptInfo nopt;
4569 Node* nd = node;
4570
4571 do {
4572 r = optimize_node_left(NCAR(nd), &nopt, env);
4573 if (r == 0) {
4574 if (nd == node) copy_node_opt_info(opt, &nopt);
4575 else alt_merge_node_opt_info(opt, &nopt, env);
4576 }
4577 } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
4578 }
4579 break;
4580
4581 case NT_STR:
4582 {
4583 StrNode* sn = NSTR(node);
4584 int slen = sn->end - sn->s;
4585 int is_raw = NSTRING_IS_RAW(node);
4586
4587 if (! NSTRING_IS_AMBIG(node)) {
4588 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4589 NSTRING_IS_RAW(node), env->enc);
4590 if (slen > 0) {
4591 add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
4592 }
4593 set_mml(&opt->len, slen, slen);
4594 }
4595 else {
4596 int max;
4597
4598 if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
4599 int n = onigenc_strlen(env->enc, sn->s, sn->end);
4600 max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
4601 }
4602 else {
4603 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4604 is_raw, env->enc);
4605 opt->exb.ignore_case = 1;
4606
4607 if (slen > 0) {
4608 r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
4609 env->enc, env->case_fold_flag);
4610 if (r != 0) break;
4611 }
4612
4613 max = slen;
4614 }
4615
4616 set_mml(&opt->len, slen, max);
4617 }
4618
4619 if (opt->exb.len == slen)
4620 opt->exb.reach_end = 1;
4621 }
4622 break;
4623
4624 case NT_CCLASS:
4625 {
4626 int i, z;
4627 CClassNode* cc = NCCLASS(node);
4628
4629 /* no need to check ignore case. (setted in setup_tree()) */
4630
4631 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
4632 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4633 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4634
4635 set_mml(&opt->len, min, max);
4636 }
4637 else {
4638 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4639 z = BITSET_AT(cc->bs, i);
4640 if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
4641 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4642 }
4643 }
4644 set_mml(&opt->len, 1, 1);
4645 }
4646 }
4647 break;
4648
4649 case NT_CTYPE:
4650 {
4651 int i, min, max;
4652
4653 max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4654
4655 if (max == 1) {
4656 min = 1;
4657
4658 switch (NCTYPE(node)->ctype) {
4659 case ONIGENC_CTYPE_WORD:
4660 if (NCTYPE(node)->not != 0) {
4661 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4662 if (! ONIGENC_IS_CODE_WORD(env->enc, i)) {
4663 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4664 }
4665 }
4666 }
4667 else {
4668 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4669 if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
4670 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4671 }
4672 }
4673 }
4674 break;
4675 }
4676 }
4677 else {
4678 min = ONIGENC_MBC_MINLEN(env->enc);
4679 }
4680 set_mml(&opt->len, min, max);
4681 }
4682 break;
4683
4684 case NT_CANY:
4685 {
4686 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4687 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4688 set_mml(&opt->len, min, max);
4689 }
4690 break;
4691
4692 case NT_ANCHOR:
4693 switch (NANCHOR(node)->type) {
4694 case ANCHOR_BEGIN_BUF:
4695 case ANCHOR_BEGIN_POSITION:
4696 case ANCHOR_BEGIN_LINE:
4697 case ANCHOR_END_BUF:
4698 case ANCHOR_SEMI_END_BUF:
4699 case ANCHOR_END_LINE:
4700 add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
4701 break;
4702
4703 case ANCHOR_PREC_READ:
4704 {
4705 NodeOptInfo nopt;
4706
4707 r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
4708 if (r == 0) {
4709 if (nopt.exb.len > 0)
4710 copy_opt_exact_info(&opt->expr, &nopt.exb);
4711 else if (nopt.exm.len > 0)
4712 copy_opt_exact_info(&opt->expr, &nopt.exm);
4713
4714 opt->expr.reach_end = 0;
4715
4716 if (nopt.map.value > 0)
4717 copy_opt_map_info(&opt->map, &nopt.map);
4718 }
4719 }
4720 break;
4721
4722 case ANCHOR_PREC_READ_NOT:
4723 case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. */
4724 case ANCHOR_LOOK_BEHIND_NOT:
4725 break;
4726 }
4727 break;
4728
4729 case NT_BREF:
4730 {
4731 int i;
4732 int* backs;
4733 OnigDistance min, max, tmin, tmax;
4734 Node** nodes = SCANENV_MEM_NODES(env->scan_env);
4735 BRefNode* br = NBREF(node);
4736
4737 if (br->state & NST_RECURSION) {
4738 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4739 break;
4740 }
4741 backs = BACKREFS_P(br);
4742 r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
4743 if (r != 0) break;
4744 r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
4745 if (r != 0) break;
4746 for (i = 1; i < br->back_num; i++) {
4747 r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
4748 if (r != 0) break;
4749 r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
4750 if (r != 0) break;
4751 if (min > tmin) min = tmin;
4752 if (max < tmax) max = tmax;
4753 }
4754 if (r == 0) set_mml(&opt->len, min, max);
4755 }
4756 break;
4757
4758 #ifdef USE_SUBEXP_CALL
4759 case NT_CALL:
4760 if (IS_CALL_RECURSION(NCALL(node)))
4761 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4762 else {
4763 OnigOptionType save = env->options;
4764 env->options = NENCLOSE(NCALL(node)->target)->option;
4765 r = optimize_node_left(NCALL(node)->target, opt, env);
4766 env->options = save;
4767 }
4768 break;
4769 #endif
4770
4771 case NT_QTFR:
4772 {
4773 int i;
4774 OnigDistance min, max;
4775 NodeOptInfo nopt;
4776 QtfrNode* qn = NQTFR(node);
4777
4778 r = optimize_node_left(qn->target, &nopt, env);
4779 if (r) break;
4780
4781 if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
4782 if (env->mmd.max == 0 &&
4783 NTYPE(qn->target) == NT_CANY && qn->greedy) {
4784 if (IS_MULTILINE(env->options))
4785 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
4786 else
4787 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
4788 }
4789 }
4790 else {
4791 if (qn->lower > 0) {
4792 copy_node_opt_info(opt, &nopt);
4793 if (nopt.exb.len > 0) {
4794 if (nopt.exb.reach_end) {
4795 for (i = 2; i <= qn->lower &&
4796 ! is_full_opt_exact_info(&opt->exb); i++) {
4797 concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
4798 }
4799 if (i < qn->lower) {
4800 opt->exb.reach_end = 0;
4801 }
4802 }
4803 }
4804
4805 if (qn->lower != qn->upper) {
4806 opt->exb.reach_end = 0;
4807 opt->exm.reach_end = 0;
4808 }
4809 if (qn->lower > 1)
4810 opt->exm.reach_end = 0;
4811 }
4812 }
4813
4814 min = distance_multiply(nopt.len.min, qn->lower);
4815 if (IS_REPEAT_INFINITE(qn->upper))
4816 max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
4817 else
4818 max = distance_multiply(nopt.len.max, qn->upper);
4819
4820 set_mml(&opt->len, min, max);
4821 }
4822 break;
4823
4824 case NT_ENCLOSE:
4825 {
4826 EncloseNode* en = NENCLOSE(node);
4827
4828 switch (en->type) {
4829 case ENCLOSE_OPTION:
4830 {
4831 OnigOptionType save = env->options;
4832
4833 env->options = en->option;
4834 r = optimize_node_left(en->target, opt, env);
4835 env->options = save;
4836 }
4837 break;
4838
4839 case ENCLOSE_MEMORY:
4840 #ifdef USE_SUBEXP_CALL
4841 en->opt_count++;
4842 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
4843 OnigDistance min, max;
4844
4845 min = 0;
4846 max = ONIG_INFINITE_DISTANCE;
4847 if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
4848 if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
4849 set_mml(&opt->len, min, max);
4850 }
4851 else
4852 #endif
4853 {
4854 r = optimize_node_left(en->target, opt, env);
4855
4856 if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
4857 if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
4858 remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
4859 }
4860 }
4861 break;
4862
4863 case ENCLOSE_STOP_BACKTRACK:
4864 r = optimize_node_left(en->target, opt, env);
4865 break;
4866 }
4867 }
4868 break;
4869
4870 default:
4871 #ifdef ONIG_DEBUG
4872 fprintf(stderr, "optimize_node_left: undefined node type %d\n",
4873 NTYPE(node));
4874 #endif
4875 r = ONIGERR_TYPE_BUG;
4876 break;
4877 }
4878
4879 return r;
4880 }
4881
4882 static int
set_optimize_exact_info(regex_t * reg,OptExactInfo * e)4883 set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
4884 {
4885 int r;
4886
4887 if (e->len == 0) return 0;
4888
4889 if (e->ignore_case) {
4890 reg->exact = (UChar* )xmalloc(e->len);
4891 CHECK_NULL_RETURN_MEMERR(reg->exact);
4892 xmemcpy(reg->exact, e->s, e->len);
4893 reg->exact_end = reg->exact + e->len;
4894 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
4895 }
4896 else {
4897 int allow_reverse;
4898
4899 reg->exact = str_dup(e->s, e->s + e->len);
4900 CHECK_NULL_RETURN_MEMERR(reg->exact);
4901 reg->exact_end = reg->exact + e->len;
4902
4903 allow_reverse =
4904 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
4905
4906 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
4907 r = set_bm_skip(reg->exact, reg->exact_end, reg->enc,
4908 reg->map, &(reg->int_map));
4909 if (r) return r;
4910
4911 reg->optimize = (allow_reverse != 0
4912 ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
4913 }
4914 else {
4915 reg->optimize = ONIG_OPTIMIZE_EXACT;
4916 }
4917 }
4918
4919 reg->dmin = e->mmd.min;
4920 reg->dmax = e->mmd.max;
4921
4922 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4923 reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact);
4924 }
4925
4926 return 0;
4927 }
4928
4929 static void
set_optimize_map_info(regex_t * reg,OptMapInfo * m)4930 set_optimize_map_info(regex_t* reg, OptMapInfo* m)
4931 {
4932 int i;
4933
4934 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4935 reg->map[i] = m->map[i];
4936
4937 reg->optimize = ONIG_OPTIMIZE_MAP;
4938 reg->dmin = m->mmd.min;
4939 reg->dmax = m->mmd.max;
4940
4941 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4942 reg->threshold_len = reg->dmin + 1;
4943 }
4944 }
4945
4946 static void
set_sub_anchor(regex_t * reg,OptAncInfo * anc)4947 set_sub_anchor(regex_t* reg, OptAncInfo* anc)
4948 {
4949 reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
4950 reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
4951 }
4952
4953 #ifdef ONIG_DEBUG
4954 static void print_optimize_info(FILE* f, regex_t* reg);
4955 #endif
4956
4957 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ScanEnv * scan_env)4958 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
4959 {
4960
4961 int r;
4962 NodeOptInfo opt;
4963 OptEnv env;
4964
4965 env.enc = reg->enc;
4966 env.options = reg->options;
4967 env.case_fold_flag = reg->case_fold_flag;
4968 env.scan_env = scan_env;
4969 clear_mml(&env.mmd);
4970
4971 r = optimize_node_left(node, &opt, &env);
4972 if (r) return r;
4973
4974 reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
4975 ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML);
4976
4977 reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF);
4978
4979 if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
4980 reg->anchor_dmin = opt.len.min;
4981 reg->anchor_dmax = opt.len.max;
4982 }
4983
4984 if (opt.exb.len > 0 || opt.exm.len > 0) {
4985 select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
4986 if (opt.map.value > 0 &&
4987 comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
4988 goto set_map;
4989 }
4990 else {
4991 r = set_optimize_exact_info(reg, &opt.exb);
4992 set_sub_anchor(reg, &opt.exb.anc);
4993 }
4994 }
4995 else if (opt.map.value > 0) {
4996 set_map:
4997 set_optimize_map_info(reg, &opt.map);
4998 set_sub_anchor(reg, &opt.map.anc);
4999 }
5000 else {
5001 reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
5002 if (opt.len.max == 0)
5003 reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
5004 }
5005
5006 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5007 print_optimize_info(stderr, reg);
5008 #endif
5009 return r;
5010 }
5011
5012 static void
clear_optimize_info(regex_t * reg)5013 clear_optimize_info(regex_t* reg)
5014 {
5015 reg->optimize = ONIG_OPTIMIZE_NONE;
5016 reg->anchor = 0;
5017 reg->anchor_dmin = 0;
5018 reg->anchor_dmax = 0;
5019 reg->sub_anchor = 0;
5020 reg->exact_end = (UChar* )NULL;
5021 reg->threshold_len = 0;
5022 if (IS_NOT_NULL(reg->exact)) {
5023 xfree(reg->exact);
5024 reg->exact = (UChar* )NULL;
5025 }
5026 }
5027
5028 #ifdef ONIG_DEBUG
5029
print_enc_string(FILE * fp,OnigEncoding enc,const UChar * s,const UChar * end)5030 static void print_enc_string(FILE* fp, OnigEncoding enc,
5031 const UChar *s, const UChar *end)
5032 {
5033 fprintf(fp, "\nPATTERN: /");
5034
5035 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5036 const UChar *p;
5037 OnigCodePoint code;
5038
5039 p = s;
5040 while (p < end) {
5041 code = ONIGENC_MBC_TO_CODE(enc, p, end);
5042 if (code >= 0x80) {
5043 fprintf(fp, " 0x%04x ", (int )code);
5044 }
5045 else {
5046 fputc((int )code, fp);
5047 }
5048
5049 p += enclen(enc, p);
5050 }
5051 }
5052 else {
5053 while (s < end) {
5054 fputc((int )*s, fp);
5055 s++;
5056 }
5057 }
5058
5059 fprintf(fp, "/\n");
5060 }
5061
5062 static void
print_distance_range(FILE * f,OnigDistance a,OnigDistance b)5063 print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
5064 {
5065 if (a == ONIG_INFINITE_DISTANCE)
5066 fputs("inf", f);
5067 else
5068 fprintf(f, "(%u)", a);
5069
5070 fputs("-", f);
5071
5072 if (b == ONIG_INFINITE_DISTANCE)
5073 fputs("inf", f);
5074 else
5075 fprintf(f, "(%u)", b);
5076 }
5077
5078 static void
print_anchor(FILE * f,int anchor)5079 print_anchor(FILE* f, int anchor)
5080 {
5081 int q = 0;
5082
5083 fprintf(f, "[");
5084
5085 if (anchor & ANCHOR_BEGIN_BUF) {
5086 fprintf(f, "begin-buf");
5087 q = 1;
5088 }
5089 if (anchor & ANCHOR_BEGIN_LINE) {
5090 if (q) fprintf(f, ", ");
5091 q = 1;
5092 fprintf(f, "begin-line");
5093 }
5094 if (anchor & ANCHOR_BEGIN_POSITION) {
5095 if (q) fprintf(f, ", ");
5096 q = 1;
5097 fprintf(f, "begin-pos");
5098 }
5099 if (anchor & ANCHOR_END_BUF) {
5100 if (q) fprintf(f, ", ");
5101 q = 1;
5102 fprintf(f, "end-buf");
5103 }
5104 if (anchor & ANCHOR_SEMI_END_BUF) {
5105 if (q) fprintf(f, ", ");
5106 q = 1;
5107 fprintf(f, "semi-end-buf");
5108 }
5109 if (anchor & ANCHOR_END_LINE) {
5110 if (q) fprintf(f, ", ");
5111 q = 1;
5112 fprintf(f, "end-line");
5113 }
5114 if (anchor & ANCHOR_ANYCHAR_STAR) {
5115 if (q) fprintf(f, ", ");
5116 q = 1;
5117 fprintf(f, "anychar-star");
5118 }
5119 if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
5120 if (q) fprintf(f, ", ");
5121 fprintf(f, "anychar-star-pl");
5122 }
5123
5124 fprintf(f, "]");
5125 }
5126
5127 static void
print_optimize_info(FILE * f,regex_t * reg)5128 print_optimize_info(FILE* f, regex_t* reg)
5129 {
5130 static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5131 "EXACT_IC", "MAP" };
5132
5133 fprintf(f, "optimize: %s\n", on[reg->optimize]);
5134 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
5135 if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
5136 print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
5137 fprintf(f, "\n");
5138
5139 if (reg->optimize) {
5140 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
5141 fprintf(f, "\n");
5142 }
5143 fprintf(f, "\n");
5144
5145 if (reg->exact) {
5146 UChar *p;
5147 fprintf(f, "exact: [");
5148 for (p = reg->exact; p < reg->exact_end; p++) {
5149 fputc(*p, f);
5150 }
5151 fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact));
5152 }
5153 else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
5154 int c, i, n = 0;
5155
5156 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5157 if (reg->map[i]) n++;
5158
5159 fprintf(f, "map: n=%d\n", n);
5160 if (n > 0) {
5161 c = 0;
5162 fputc('[', f);
5163 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
5164 if (reg->map[i] != 0) {
5165 if (c > 0) fputs(", ", f);
5166 c++;
5167 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
5168 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
5169 fputc(i, f);
5170 else
5171 fprintf(f, "%d", i);
5172 }
5173 }
5174 fprintf(f, "]\n");
5175 }
5176 }
5177 }
5178 #endif /* ONIG_DEBUG */
5179
5180
5181 extern void
onig_free_body(regex_t * reg)5182 onig_free_body(regex_t* reg)
5183 {
5184 if (IS_NOT_NULL(reg)) {
5185 if (IS_NOT_NULL(reg->p)) xfree(reg->p);
5186 if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
5187 if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map);
5188 if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
5189 if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
5190 if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain);
5191
5192 #ifdef USE_NAMED_GROUP
5193 onig_names_free(reg);
5194 #endif
5195 }
5196 }
5197
5198 extern void
onig_free(regex_t * reg)5199 onig_free(regex_t* reg)
5200 {
5201 if (IS_NOT_NULL(reg)) {
5202 onig_free_body(reg);
5203 xfree(reg);
5204 }
5205 }
5206
5207 #define REGEX_TRANSFER(to,from) do {\
5208 (to)->state = ONIG_STATE_MODIFY;\
5209 onig_free_body(to);\
5210 xmemcpy(to, from, sizeof(regex_t));\
5211 xfree(from);\
5212 } while (0)
5213
5214 extern void
onig_transfer(regex_t * to,regex_t * from)5215 onig_transfer(regex_t* to, regex_t* from)
5216 {
5217 THREAD_ATOMIC_START;
5218 REGEX_TRANSFER(to, from);
5219 THREAD_ATOMIC_END;
5220 }
5221
5222 #define REGEX_CHAIN_HEAD(reg) do {\
5223 while (IS_NOT_NULL((reg)->chain)) {\
5224 (reg) = (reg)->chain;\
5225 }\
5226 } while (0)
5227
5228 extern void
onig_chain_link_add(regex_t * to,regex_t * add)5229 onig_chain_link_add(regex_t* to, regex_t* add)
5230 {
5231 THREAD_ATOMIC_START;
5232 REGEX_CHAIN_HEAD(to);
5233 to->chain = add;
5234 THREAD_ATOMIC_END;
5235 }
5236
5237 extern void
onig_chain_reduce(regex_t * reg)5238 onig_chain_reduce(regex_t* reg)
5239 {
5240 regex_t *head, *prev;
5241
5242 prev = reg;
5243 head = prev->chain;
5244 if (IS_NOT_NULL(head)) {
5245 reg->state = ONIG_STATE_MODIFY;
5246 while (IS_NOT_NULL(head->chain)) {
5247 prev = head;
5248 head = head->chain;
5249 }
5250 prev->chain = (regex_t* )NULL;
5251 REGEX_TRANSFER(reg, head);
5252 }
5253 }
5254
5255 #ifdef ONIG_DEBUG
5256 static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg));
5257 #endif
5258 #ifdef ONIG_DEBUG_PARSE_TREE
5259 static void print_tree P_((FILE* f, Node* node));
5260 #endif
5261
5262 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)5263 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5264 OnigErrorInfo* einfo)
5265 {
5266 #define COMPILE_INIT_SIZE 20
5267
5268 int r, init_size;
5269 Node* root;
5270 ScanEnv scan_env;
5271 #ifdef USE_SUBEXP_CALL
5272 UnsetAddrList uslist;
5273 #endif
5274
5275 if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5276
5277 reg->state = ONIG_STATE_COMPILING;
5278
5279 #ifdef ONIG_DEBUG
5280 print_enc_string(stderr, reg->enc, pattern, pattern_end);
5281 #endif
5282
5283 if (reg->alloc == 0) {
5284 init_size = (pattern_end - pattern) * 2;
5285 if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5286 r = BBUF_INIT(reg, init_size);
5287 if (r != 0) goto end;
5288 }
5289 else
5290 reg->used = 0;
5291
5292 reg->num_mem = 0;
5293 reg->num_repeat = 0;
5294 reg->num_null_check = 0;
5295 reg->repeat_range_alloc = 0;
5296 reg->repeat_range = (OnigRepeatRange* )NULL;
5297 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5298 reg->num_comb_exp_check = 0;
5299 #endif
5300
5301 r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5302 if (r != 0) goto err;
5303
5304 #ifdef USE_NAMED_GROUP
5305 /* mixed use named group and no-named group */
5306 if (scan_env.num_named > 0 &&
5307 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5308 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5309 if (scan_env.num_named != scan_env.num_mem)
5310 r = disable_noname_group_capture(&root, reg, &scan_env);
5311 else
5312 r = numbered_ref_check(root);
5313
5314 if (r != 0) goto err;
5315 }
5316 #endif
5317
5318 #ifdef USE_SUBEXP_CALL
5319 if (scan_env.num_call > 0) {
5320 r = unset_addr_list_init(&uslist, scan_env.num_call);
5321 if (r != 0) goto err;
5322 scan_env.unset_addr_list = &uslist;
5323 r = setup_subexp_call(root, &scan_env);
5324 if (r != 0) goto err_unset;
5325 r = subexp_recursive_check_trav(root, &scan_env);
5326 if (r < 0) goto err_unset;
5327 r = subexp_inf_recursive_check_trav(root, &scan_env);
5328 if (r != 0) goto err_unset;
5329
5330 reg->num_call = scan_env.num_call;
5331 }
5332 else
5333 reg->num_call = 0;
5334 #endif
5335
5336 r = setup_tree(root, reg, 0, &scan_env);
5337 if (r != 0) goto err_unset;
5338
5339 #ifdef ONIG_DEBUG_PARSE_TREE
5340 print_tree(stderr, root);
5341 #endif
5342
5343 reg->capture_history = scan_env.capture_history;
5344 reg->bt_mem_start = scan_env.bt_mem_start;
5345 reg->bt_mem_start |= reg->capture_history;
5346 if (IS_FIND_CONDITION(reg->options))
5347 BIT_STATUS_ON_ALL(reg->bt_mem_end);
5348 else {
5349 reg->bt_mem_end = scan_env.bt_mem_end;
5350 reg->bt_mem_end |= reg->capture_history;
5351 }
5352
5353 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5354 if (scan_env.backrefed_mem == 0
5355 #ifdef USE_SUBEXP_CALL
5356 || scan_env.num_call == 0
5357 #endif
5358 ) {
5359 setup_comb_exp_check(root, 0, &scan_env);
5360 #ifdef USE_SUBEXP_CALL
5361 if (scan_env.has_recursion != 0) {
5362 scan_env.num_comb_exp_check = 0;
5363 }
5364 else
5365 #endif
5366 if (scan_env.comb_exp_max_regnum > 0) {
5367 int i;
5368 for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5369 if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5370 scan_env.num_comb_exp_check = 0;
5371 break;
5372 }
5373 }
5374 }
5375 }
5376
5377 reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5378 #endif
5379
5380 clear_optimize_info(reg);
5381 #ifndef ONIG_DONT_OPTIMIZE
5382 r = set_optimize_info_from_tree(root, reg, &scan_env);
5383 if (r != 0) goto err_unset;
5384 #endif
5385
5386 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5387 xfree(scan_env.mem_nodes_dynamic);
5388 scan_env.mem_nodes_dynamic = (Node** )NULL;
5389 }
5390
5391 r = compile_tree(root, reg);
5392 if (r == 0) {
5393 r = add_opcode(reg, OP_END);
5394 #ifdef USE_SUBEXP_CALL
5395 if (scan_env.num_call > 0) {
5396 r = unset_addr_list_fix(&uslist, reg);
5397 unset_addr_list_end(&uslist);
5398 if (r) goto err;
5399 }
5400 #endif
5401
5402 if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5403 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5404 else {
5405 if (reg->bt_mem_start != 0)
5406 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5407 else
5408 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5409 }
5410 }
5411 #ifdef USE_SUBEXP_CALL
5412 else if (scan_env.num_call > 0) {
5413 unset_addr_list_end(&uslist);
5414 }
5415 #endif
5416 onig_node_free(root);
5417
5418 #ifdef ONIG_DEBUG_COMPILE
5419 #ifdef USE_NAMED_GROUP
5420 onig_print_names(stderr, reg);
5421 #endif
5422 print_compiled_byte_code_list(stderr, reg);
5423 #endif
5424
5425 end:
5426 reg->state = ONIG_STATE_NORMAL;
5427 return r;
5428
5429 err_unset:
5430 #ifdef USE_SUBEXP_CALL
5431 if (scan_env.num_call > 0) {
5432 unset_addr_list_end(&uslist);
5433 }
5434 #endif
5435 err:
5436 if (IS_NOT_NULL(scan_env.error)) {
5437 if (IS_NOT_NULL(einfo)) {
5438 einfo->enc = scan_env.enc;
5439 einfo->par = scan_env.error;
5440 einfo->par_end = scan_env.error_end;
5441 }
5442 }
5443
5444 onig_node_free(root);
5445 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
5446 xfree(scan_env.mem_nodes_dynamic);
5447 return r;
5448 }
5449
5450 #ifdef USE_RECOMPILE_API
5451 extern int
onig_recompile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5452 onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5453 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5454 OnigErrorInfo* einfo)
5455 {
5456 int r;
5457 regex_t *new_reg;
5458
5459 r = onig_new(&new_reg, pattern, pattern_end, option, enc, syntax, einfo);
5460 if (r) return r;
5461 if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
5462 onig_transfer(reg, new_reg);
5463 }
5464 else {
5465 onig_chain_link_add(reg, new_reg);
5466 }
5467 return 0;
5468 }
5469 #endif
5470
5471 static int onig_inited = 0;
5472
5473 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)5474 onig_reg_init(regex_t* reg, OnigOptionType option,
5475 OnigCaseFoldType case_fold_flag,
5476 OnigEncoding enc, OnigSyntaxType* syntax)
5477 {
5478 if (! onig_inited)
5479 onig_init();
5480
5481 if (IS_NULL(reg))
5482 return ONIGERR_INVALID_ARGUMENT;
5483
5484 if (ONIGENC_IS_UNDEF(enc))
5485 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
5486
5487 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5488 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5489 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5490 }
5491
5492 (reg)->state = ONIG_STATE_MODIFY;
5493
5494 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5495 option |= syntax->options;
5496 option &= ~ONIG_OPTION_SINGLELINE;
5497 }
5498 else
5499 option |= syntax->options;
5500
5501 (reg)->enc = enc;
5502 (reg)->options = option;
5503 (reg)->syntax = syntax;
5504 (reg)->optimize = 0;
5505 (reg)->exact = (UChar* )NULL;
5506 (reg)->int_map = (int* )NULL;
5507 (reg)->int_map_backward = (int* )NULL;
5508 (reg)->chain = (regex_t* )NULL;
5509
5510 (reg)->p = (UChar* )NULL;
5511 (reg)->alloc = 0;
5512 (reg)->used = 0;
5513 (reg)->name_table = (void* )NULL;
5514
5515 (reg)->case_fold_flag = case_fold_flag;
5516 return 0;
5517 }
5518
5519 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5520 onig_new_without_alloc(regex_t* reg, const UChar* pattern,
5521 const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
5522 OnigSyntaxType* syntax, OnigErrorInfo* einfo)
5523 {
5524 int r;
5525
5526 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5527 if (r) return r;
5528
5529 r = onig_compile(reg, pattern, pattern_end, einfo);
5530 return r;
5531 }
5532
5533 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5534 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5535 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5536 OnigErrorInfo* einfo)
5537 {
5538 int r;
5539
5540 *reg = (regex_t* )xmalloc(sizeof(regex_t));
5541 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
5542
5543 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5544 if (r) goto err;
5545
5546 r = onig_compile(*reg, pattern, pattern_end, einfo);
5547 if (r) {
5548 err:
5549 onig_free(*reg);
5550 *reg = NULL;
5551 }
5552 return r;
5553 }
5554
5555
5556 extern int
onig_init(void)5557 onig_init(void)
5558 {
5559 if (onig_inited != 0)
5560 return 0;
5561
5562 THREAD_SYSTEM_INIT;
5563 THREAD_ATOMIC_START;
5564
5565 onig_inited = 1;
5566
5567 onigenc_init();
5568 /* onigenc_set_default_caseconv_table((UChar* )0); */
5569
5570 #ifdef ONIG_DEBUG_STATISTICS
5571 onig_statistics_init();
5572 #endif
5573
5574 THREAD_ATOMIC_END;
5575 return 0;
5576 }
5577
5578
5579 extern int
onig_end(void)5580 onig_end(void)
5581 {
5582 THREAD_ATOMIC_START;
5583
5584 #ifdef ONIG_DEBUG_STATISTICS
5585 onig_print_statistics(stderr);
5586 #endif
5587
5588 #ifdef USE_SHARED_CCLASS_TABLE
5589 onig_free_shared_cclass_table();
5590 #endif
5591
5592 #ifdef USE_PARSE_TREE_NODE_RECYCLE
5593 onig_free_node_list();
5594 #endif
5595
5596 onig_inited = 0;
5597
5598 THREAD_ATOMIC_END;
5599 THREAD_SYSTEM_END;
5600 return 0;
5601 }
5602
5603 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)5604 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
5605 {
5606 OnigCodePoint n, *data;
5607 OnigCodePoint low, high, x;
5608
5609 GET_CODE_POINT(n, p);
5610 data = (OnigCodePoint* )p;
5611 data++;
5612
5613 for (low = 0, high = n; low < high; ) {
5614 x = (low + high) >> 1;
5615 if (code > data[x * 2 + 1])
5616 low = x + 1;
5617 else
5618 high = x;
5619 }
5620
5621 return ((low < n && code >= data[low * 2]) ? 1 : 0);
5622 }
5623
5624 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,CClassNode * cc)5625 onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
5626 {
5627 int found;
5628
5629 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
5630 if (IS_NULL(cc->mbuf)) {
5631 found = 0;
5632 }
5633 else {
5634 found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
5635 }
5636 }
5637 else {
5638 found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
5639 }
5640
5641 if (IS_NCCLASS_NOT(cc))
5642 return !found;
5643 else
5644 return found;
5645 }
5646
5647 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)5648 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
5649 {
5650 int len;
5651
5652 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5653 len = 2;
5654 }
5655 else {
5656 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
5657 }
5658 return onig_is_code_in_cc_len(len, code, cc);
5659 }
5660
5661
5662 #ifdef ONIG_DEBUG
5663
5664 /* arguments type */
5665 #define ARG_SPECIAL -1
5666 #define ARG_NON 0
5667 #define ARG_RELADDR 1
5668 #define ARG_ABSADDR 2
5669 #define ARG_LENGTH 3
5670 #define ARG_MEMNUM 4
5671 #define ARG_OPTION 5
5672 #define ARG_STATE_CHECK 6
5673
5674 OnigOpInfoType OnigOpInfo[] = {
5675 { OP_FINISH, "finish", ARG_NON },
5676 { OP_END, "end", ARG_NON },
5677 { OP_EXACT1, "exact1", ARG_SPECIAL },
5678 { OP_EXACT2, "exact2", ARG_SPECIAL },
5679 { OP_EXACT3, "exact3", ARG_SPECIAL },
5680 { OP_EXACT4, "exact4", ARG_SPECIAL },
5681 { OP_EXACT5, "exact5", ARG_SPECIAL },
5682 { OP_EXACTN, "exactn", ARG_SPECIAL },
5683 { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
5684 { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
5685 { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
5686 { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
5687 { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
5688 { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
5689 { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
5690 { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
5691 { OP_CCLASS, "cclass", ARG_SPECIAL },
5692 { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
5693 { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
5694 { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
5695 { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
5696 { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
5697 { OP_CCLASS_NODE, "cclass-node", ARG_SPECIAL },
5698 { OP_ANYCHAR, "anychar", ARG_NON },
5699 { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
5700 { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
5701 { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
5702 { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
5703 { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
5704 { OP_WORD, "word", ARG_NON },
5705 { OP_NOT_WORD, "not-word", ARG_NON },
5706 { OP_WORD_BOUND, "word-bound", ARG_NON },
5707 { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
5708 { OP_WORD_BEGIN, "word-begin", ARG_NON },
5709 { OP_WORD_END, "word-end", ARG_NON },
5710 { OP_BEGIN_BUF, "begin-buf", ARG_NON },
5711 { OP_END_BUF, "end-buf", ARG_NON },
5712 { OP_BEGIN_LINE, "begin-line", ARG_NON },
5713 { OP_END_LINE, "end-line", ARG_NON },
5714 { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
5715 { OP_BEGIN_POSITION, "begin-position", ARG_NON },
5716 { OP_BACKREF1, "backref1", ARG_NON },
5717 { OP_BACKREF2, "backref2", ARG_NON },
5718 { OP_BACKREFN, "backrefn", ARG_MEMNUM },
5719 { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
5720 { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
5721 { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
5722 { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL },
5723 { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
5724 { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
5725 { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
5726 { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
5727 { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
5728 { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
5729 { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
5730 { OP_SET_OPTION, "set-option", ARG_OPTION },
5731 { OP_FAIL, "fail", ARG_NON },
5732 { OP_JUMP, "jump", ARG_RELADDR },
5733 { OP_PUSH, "push", ARG_RELADDR },
5734 { OP_POP, "pop", ARG_NON },
5735 { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
5736 { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
5737 { OP_REPEAT, "repeat", ARG_SPECIAL },
5738 { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
5739 { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
5740 { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
5741 { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
5742 { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
5743 { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
5744 { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
5745 { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
5746 { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
5747 { OP_PUSH_POS, "push-pos", ARG_NON },
5748 { OP_POP_POS, "pop-pos", ARG_NON },
5749 { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
5750 { OP_FAIL_POS, "fail-pos", ARG_NON },
5751 { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
5752 { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
5753 { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
5754 { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
5755 { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
5756 { OP_CALL, "call", ARG_ABSADDR },
5757 { OP_RETURN, "return", ARG_NON },
5758 { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
5759 { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
5760 { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
5761 { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
5762 { OP_STATE_CHECK_ANYCHAR_ML_STAR,
5763 "state-check-anychar-ml*", ARG_STATE_CHECK },
5764 { -1, "", ARG_NON }
5765 };
5766
5767 static char*
op2name(int opcode)5768 op2name(int opcode)
5769 {
5770 int i;
5771
5772 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5773 if (opcode == OnigOpInfo[i].opcode)
5774 return OnigOpInfo[i].name;
5775 }
5776 return "";
5777 }
5778
5779 static int
op2arg_type(int opcode)5780 op2arg_type(int opcode)
5781 {
5782 int i;
5783
5784 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5785 if (opcode == OnigOpInfo[i].opcode)
5786 return OnigOpInfo[i].arg_type;
5787 }
5788 return ARG_SPECIAL;
5789 }
5790
5791 static void
Indent(FILE * f,int indent)5792 Indent(FILE* f, int indent)
5793 {
5794 int i;
5795 for (i = 0; i < indent; i++) putc(' ', f);
5796 }
5797
5798 static void
p_string(FILE * f,int len,UChar * s)5799 p_string(FILE* f, int len, UChar* s)
5800 {
5801 fputs(":", f);
5802 while (len-- > 0) { fputc(*s++, f); }
5803 }
5804
5805 static void
p_len_string(FILE * f,LengthType len,int mb_len,UChar * s)5806 p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
5807 {
5808 int x = len * mb_len;
5809
5810 fprintf(f, ":%d:", len);
5811 while (x-- > 0) { fputc(*s++, f); }
5812 }
5813
5814 extern void
onig_print_compiled_byte_code(FILE * f,UChar * bp,UChar ** nextp,OnigEncoding enc)5815 onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp,
5816 OnigEncoding enc)
5817 {
5818 int i, n, arg_type;
5819 RelAddrType addr;
5820 LengthType len;
5821 MemNumType mem;
5822 StateCheckNumType scn;
5823 OnigCodePoint code;
5824 UChar *q;
5825
5826 fprintf(f, "[%s", op2name(*bp));
5827 arg_type = op2arg_type(*bp);
5828 if (arg_type != ARG_SPECIAL) {
5829 bp++;
5830 switch (arg_type) {
5831 case ARG_NON:
5832 break;
5833 case ARG_RELADDR:
5834 GET_RELADDR_INC(addr, bp);
5835 fprintf(f, ":(%d)", addr);
5836 break;
5837 case ARG_ABSADDR:
5838 GET_ABSADDR_INC(addr, bp);
5839 fprintf(f, ":(%d)", addr);
5840 break;
5841 case ARG_LENGTH:
5842 GET_LENGTH_INC(len, bp);
5843 fprintf(f, ":%d", len);
5844 break;
5845 case ARG_MEMNUM:
5846 mem = *((MemNumType* )bp);
5847 bp += SIZE_MEMNUM;
5848 fprintf(f, ":%d", mem);
5849 break;
5850 case ARG_OPTION:
5851 {
5852 OnigOptionType option = *((OnigOptionType* )bp);
5853 bp += SIZE_OPTION;
5854 fprintf(f, ":%d", option);
5855 }
5856 break;
5857
5858 case ARG_STATE_CHECK:
5859 scn = *((StateCheckNumType* )bp);
5860 bp += SIZE_STATE_CHECK_NUM;
5861 fprintf(f, ":%d", scn);
5862 break;
5863 }
5864 }
5865 else {
5866 switch (*bp++) {
5867 case OP_EXACT1:
5868 case OP_ANYCHAR_STAR_PEEK_NEXT:
5869 case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
5870 p_string(f, 1, bp++); break;
5871 case OP_EXACT2:
5872 p_string(f, 2, bp); bp += 2; break;
5873 case OP_EXACT3:
5874 p_string(f, 3, bp); bp += 3; break;
5875 case OP_EXACT4:
5876 p_string(f, 4, bp); bp += 4; break;
5877 case OP_EXACT5:
5878 p_string(f, 5, bp); bp += 5; break;
5879 case OP_EXACTN:
5880 GET_LENGTH_INC(len, bp);
5881 p_len_string(f, len, 1, bp);
5882 bp += len;
5883 break;
5884
5885 case OP_EXACTMB2N1:
5886 p_string(f, 2, bp); bp += 2; break;
5887 case OP_EXACTMB2N2:
5888 p_string(f, 4, bp); bp += 4; break;
5889 case OP_EXACTMB2N3:
5890 p_string(f, 6, bp); bp += 6; break;
5891 case OP_EXACTMB2N:
5892 GET_LENGTH_INC(len, bp);
5893 p_len_string(f, len, 2, bp);
5894 bp += len * 2;
5895 break;
5896 case OP_EXACTMB3N:
5897 GET_LENGTH_INC(len, bp);
5898 p_len_string(f, len, 3, bp);
5899 bp += len * 3;
5900 break;
5901 case OP_EXACTMBN:
5902 {
5903 int mb_len;
5904
5905 GET_LENGTH_INC(mb_len, bp);
5906 GET_LENGTH_INC(len, bp);
5907 fprintf(f, ":%d:%d:", mb_len, len);
5908 n = len * mb_len;
5909 while (n-- > 0) { fputc(*bp++, f); }
5910 }
5911 break;
5912
5913 case OP_EXACT1_IC:
5914 len = enclen(enc, bp);
5915 p_string(f, len, bp);
5916 bp += len;
5917 break;
5918 case OP_EXACTN_IC:
5919 GET_LENGTH_INC(len, bp);
5920 p_len_string(f, len, 1, bp);
5921 bp += len;
5922 break;
5923
5924 case OP_CCLASS:
5925 n = bitset_on_num((BitSetRef )bp);
5926 bp += SIZE_BITSET;
5927 fprintf(f, ":%d", n);
5928 break;
5929
5930 case OP_CCLASS_NOT:
5931 n = bitset_on_num((BitSetRef )bp);
5932 bp += SIZE_BITSET;
5933 fprintf(f, ":%d", n);
5934 break;
5935
5936 case OP_CCLASS_MB:
5937 case OP_CCLASS_MB_NOT:
5938 GET_LENGTH_INC(len, bp);
5939 q = bp;
5940 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5941 ALIGNMENT_RIGHT(q);
5942 #endif
5943 GET_CODE_POINT(code, q);
5944 bp += len;
5945 fprintf(f, ":%d:%d", (int )code, len);
5946 break;
5947
5948 case OP_CCLASS_MIX:
5949 case OP_CCLASS_MIX_NOT:
5950 n = bitset_on_num((BitSetRef )bp);
5951 bp += SIZE_BITSET;
5952 GET_LENGTH_INC(len, bp);
5953 q = bp;
5954 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5955 ALIGNMENT_RIGHT(q);
5956 #endif
5957 GET_CODE_POINT(code, q);
5958 bp += len;
5959 fprintf(f, ":%d:%d:%d", n, (int )code, len);
5960 break;
5961
5962 case OP_CCLASS_NODE:
5963 {
5964 CClassNode *cc;
5965
5966 GET_POINTER_INC(cc, bp);
5967 n = bitset_on_num(cc->bs);
5968 fprintf(f, ":%u:%d", (unsigned int )cc, n);
5969 }
5970 break;
5971
5972 case OP_BACKREFN_IC:
5973 mem = *((MemNumType* )bp);
5974 bp += SIZE_MEMNUM;
5975 fprintf(f, ":%d", mem);
5976 break;
5977
5978 case OP_BACKREF_MULTI_IC:
5979 case OP_BACKREF_MULTI:
5980 fputs(" ", f);
5981 GET_LENGTH_INC(len, bp);
5982 for (i = 0; i < len; i++) {
5983 GET_MEMNUM_INC(mem, bp);
5984 if (i > 0) fputs(", ", f);
5985 fprintf(f, "%d", mem);
5986 }
5987 break;
5988
5989 case OP_BACKREF_WITH_LEVEL:
5990 {
5991 OnigOptionType option;
5992 LengthType level;
5993
5994 GET_OPTION_INC(option, bp);
5995 fprintf(f, ":%d", option);
5996 GET_LENGTH_INC(level, bp);
5997 fprintf(f, ":%d", level);
5998
5999 fputs(" ", f);
6000 GET_LENGTH_INC(len, bp);
6001 for (i = 0; i < len; i++) {
6002 GET_MEMNUM_INC(mem, bp);
6003 if (i > 0) fputs(", ", f);
6004 fprintf(f, "%d", mem);
6005 }
6006 }
6007 break;
6008
6009 case OP_REPEAT:
6010 case OP_REPEAT_NG:
6011 {
6012 mem = *((MemNumType* )bp);
6013 bp += SIZE_MEMNUM;
6014 addr = *((RelAddrType* )bp);
6015 bp += SIZE_RELADDR;
6016 fprintf(f, ":%d:%d", mem, addr);
6017 }
6018 break;
6019
6020 case OP_PUSH_OR_JUMP_EXACT1:
6021 case OP_PUSH_IF_PEEK_NEXT:
6022 addr = *((RelAddrType* )bp);
6023 bp += SIZE_RELADDR;
6024 fprintf(f, ":(%d)", addr);
6025 p_string(f, 1, bp);
6026 bp += 1;
6027 break;
6028
6029 case OP_LOOK_BEHIND:
6030 GET_LENGTH_INC(len, bp);
6031 fprintf(f, ":%d", len);
6032 break;
6033
6034 case OP_PUSH_LOOK_BEHIND_NOT:
6035 GET_RELADDR_INC(addr, bp);
6036 GET_LENGTH_INC(len, bp);
6037 fprintf(f, ":%d:(%d)", len, addr);
6038 break;
6039
6040 case OP_STATE_CHECK_PUSH:
6041 case OP_STATE_CHECK_PUSH_OR_JUMP:
6042 scn = *((StateCheckNumType* )bp);
6043 bp += SIZE_STATE_CHECK_NUM;
6044 addr = *((RelAddrType* )bp);
6045 bp += SIZE_RELADDR;
6046 fprintf(f, ":%d:(%d)", scn, addr);
6047 break;
6048
6049 default:
6050 fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
6051 *--bp);
6052 }
6053 }
6054 fputs("]", f);
6055 if (nextp) *nextp = bp;
6056 }
6057
6058 static void
print_compiled_byte_code_list(FILE * f,regex_t * reg)6059 print_compiled_byte_code_list(FILE* f, regex_t* reg)
6060 {
6061 int ncode;
6062 UChar* bp = reg->p;
6063 UChar* end = reg->p + reg->used;
6064
6065 fprintf(f, "code length: %d\n", reg->used);
6066
6067 ncode = 0;
6068 while (bp < end) {
6069 ncode++;
6070 if (bp > reg->p) {
6071 if (ncode % 5 == 0)
6072 fprintf(f, "\n");
6073 else
6074 fputs(" ", f);
6075 }
6076 onig_print_compiled_byte_code(f, bp, &bp, reg->enc);
6077 }
6078
6079 fprintf(f, "\n");
6080 }
6081
6082 static void
print_indent_tree(FILE * f,Node * node,int indent)6083 print_indent_tree(FILE* f, Node* node, int indent)
6084 {
6085 int i, type;
6086 int add = 3;
6087 UChar* p;
6088
6089 Indent(f, indent);
6090 if (IS_NULL(node)) {
6091 fprintf(f, "ERROR: null node!!!\n");
6092 exit (0);
6093 }
6094
6095 type = NTYPE(node);
6096 switch (type) {
6097 case NT_LIST:
6098 case NT_ALT:
6099 if (NTYPE(node) == NT_LIST)
6100 fprintf(f, "<list:%x>\n", (int )node);
6101 else
6102 fprintf(f, "<alt:%x>\n", (int )node);
6103
6104 print_indent_tree(f, NCAR(node), indent + add);
6105 while (IS_NOT_NULL(node = NCDR(node))) {
6106 if (NTYPE(node) != type) {
6107 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
6108 exit(0);
6109 }
6110 print_indent_tree(f, NCAR(node), indent + add);
6111 }
6112 break;
6113
6114 case NT_STR:
6115 fprintf(f, "<string%s:%x>",
6116 (NSTRING_IS_RAW(node) ? "-raw" : ""), (int )node);
6117 for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
6118 if (*p >= 0x20 && *p < 0x7f)
6119 fputc(*p, f);
6120 else {
6121 fprintf(f, " 0x%02x", *p);
6122 }
6123 }
6124 break;
6125
6126 case NT_CCLASS:
6127 fprintf(f, "<cclass:%x>", (int )node);
6128 if (IS_NCCLASS_NOT(NCCLASS(node))) fputs(" not", f);
6129 if (NCCLASS(node)->mbuf) {
6130 BBuf* bbuf = NCCLASS(node)->mbuf;
6131 for (i = 0; i < bbuf->used; i++) {
6132 if (i > 0) fprintf(f, ",");
6133 fprintf(f, "%0x", bbuf->p[i]);
6134 }
6135 }
6136 break;
6137
6138 case NT_CTYPE:
6139 fprintf(f, "<ctype:%x> ", (int )node);
6140 switch (NCTYPE(node)->ctype) {
6141 case ONIGENC_CTYPE_WORD:
6142 if (NCTYPE(node)->not != 0)
6143 fputs("not word", f);
6144 else
6145 fputs("word", f);
6146 break;
6147
6148 default:
6149 fprintf(f, "ERROR: undefined ctype.\n");
6150 exit(0);
6151 }
6152 break;
6153
6154 case NT_CANY:
6155 fprintf(f, "<anychar:%x>", (int )node);
6156 break;
6157
6158 case NT_ANCHOR:
6159 fprintf(f, "<anchor:%x> ", (int )node);
6160 switch (NANCHOR(node)->type) {
6161 case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
6162 case ANCHOR_END_BUF: fputs("end buf", f); break;
6163 case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
6164 case ANCHOR_END_LINE: fputs("end line", f); break;
6165 case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
6166 case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
6167
6168 case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
6169 case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
6170 #ifdef USE_WORD_BEGIN_END
6171 case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
6172 case ANCHOR_WORD_END: fputs("word end", f); break;
6173 #endif
6174 case ANCHOR_PREC_READ: fputs("prec read", f); break;
6175 case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); break;
6176 case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); break;
6177 case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break;
6178
6179 default:
6180 fprintf(f, "ERROR: undefined anchor type.\n");
6181 break;
6182 }
6183 break;
6184
6185 case NT_BREF:
6186 {
6187 int* p;
6188 BRefNode* br = NBREF(node);
6189 p = BACKREFS_P(br);
6190 fprintf(f, "<backref:%x>", (int )node);
6191 for (i = 0; i < br->back_num; i++) {
6192 if (i > 0) fputs(", ", f);
6193 fprintf(f, "%d", p[i]);
6194 }
6195 }
6196 break;
6197
6198 #ifdef USE_SUBEXP_CALL
6199 case NT_CALL:
6200 {
6201 CallNode* cn = NCALL(node);
6202 fprintf(f, "<call:%x>", (int )node);
6203 p_string(f, cn->name_end - cn->name, cn->name);
6204 }
6205 break;
6206 #endif
6207
6208 case NT_QTFR:
6209 fprintf(f, "<quantifier:%x>{%d,%d}%s\n", (int )node,
6210 NQTFR(node)->lower, NQTFR(node)->upper,
6211 (NQTFR(node)->greedy ? "" : "?"));
6212 print_indent_tree(f, NQTFR(node)->target, indent + add);
6213 break;
6214
6215 case NT_ENCLOSE:
6216 fprintf(f, "<enclose:%x> ", (int )node);
6217 switch (NENCLOSE(node)->type) {
6218 case ENCLOSE_OPTION:
6219 fprintf(f, "option:%d\n", NENCLOSE(node)->option);
6220 print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6221 break;
6222 case ENCLOSE_MEMORY:
6223 fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
6224 break;
6225 case ENCLOSE_STOP_BACKTRACK:
6226 fprintf(f, "stop-bt");
6227 break;
6228
6229 default:
6230 break;
6231 }
6232 fprintf(f, "\n");
6233 print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6234 break;
6235
6236 default:
6237 fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6238 break;
6239 }
6240
6241 if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
6242 type != NT_ENCLOSE)
6243 fprintf(f, "\n");
6244 fflush(f);
6245 }
6246 #endif /* ONIG_DEBUG */
6247
6248 #ifdef ONIG_DEBUG_PARSE_TREE
6249 static void
print_tree(FILE * f,Node * node)6250 print_tree(FILE* f, Node* node)
6251 {
6252 print_indent_tree(f, node, 0);
6253 }
6254 #endif
6255