1 /**********************************************************************
2 regcomp.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regparse.h"
31
32 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
33
34 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)35 onig_get_default_case_fold_flag(void)
36 {
37 return OnigDefaultCaseFoldFlag;
38 }
39
40 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)41 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
42 {
43 OnigDefaultCaseFoldFlag = case_fold_flag;
44 return 0;
45 }
46
47
48 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
49 static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
50 #endif
51
52 static UChar*
str_dup(UChar * s,UChar * end)53 str_dup(UChar* s, UChar* end)
54 {
55 int len = end - s;
56
57 if (len > 0) {
58 UChar* r = (UChar* )xmalloc(len + 1);
59 CHECK_NULL_RETURN(r);
60 xmemcpy(r, s, len);
61 r[len] = (UChar )0;
62 return r;
63 }
64 else return NULL;
65 }
66
67 static void
swap_node(Node * a,Node * b)68 swap_node(Node* a, Node* b)
69 {
70 Node c;
71 c = *a; *a = *b; *b = c;
72
73 if (NTYPE(a) == NT_STR) {
74 StrNode* sn = NSTR(a);
75 if (sn->capa == 0) {
76 int len = sn->end - sn->s;
77 sn->s = sn->buf;
78 sn->end = sn->s + len;
79 }
80 }
81
82 if (NTYPE(b) == NT_STR) {
83 StrNode* sn = NSTR(b);
84 if (sn->capa == 0) {
85 int len = sn->end - sn->s;
86 sn->s = sn->buf;
87 sn->end = sn->s + len;
88 }
89 }
90 }
91
92 static OnigDistance
distance_add(OnigDistance d1,OnigDistance d2)93 distance_add(OnigDistance d1, OnigDistance d2)
94 {
95 if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
96 return ONIG_INFINITE_DISTANCE;
97 else {
98 if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
99 else return ONIG_INFINITE_DISTANCE;
100 }
101 }
102
103 static OnigDistance
distance_multiply(OnigDistance d,int m)104 distance_multiply(OnigDistance d, int m)
105 {
106 if (m == 0) return 0;
107
108 if (d < ONIG_INFINITE_DISTANCE / m)
109 return d * m;
110 else
111 return ONIG_INFINITE_DISTANCE;
112 }
113
114 static int
bitset_is_empty(BitSetRef bs)115 bitset_is_empty(BitSetRef bs)
116 {
117 int i;
118 for (i = 0; i < (int )BITSET_SIZE; i++) {
119 if (bs[i] != 0) return 0;
120 }
121 return 1;
122 }
123
124 #ifdef ONIG_DEBUG
125 static int
bitset_on_num(BitSetRef bs)126 bitset_on_num(BitSetRef bs)
127 {
128 int i, n;
129
130 n = 0;
131 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
132 if (BITSET_AT(bs, i)) n++;
133 }
134 return n;
135 }
136 #endif
137
138 extern int
onig_bbuf_init(BBuf * buf,int size)139 onig_bbuf_init(BBuf* buf, int size)
140 {
141 if (size <= 0) {
142 size = 0;
143 buf->p = NULL;
144 }
145 else {
146 buf->p = (UChar* )xmalloc(size);
147 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
148 }
149
150 buf->alloc = size;
151 buf->used = 0;
152 return 0;
153 }
154
155
156 #ifdef USE_SUBEXP_CALL
157
158 static int
unset_addr_list_init(UnsetAddrList * uslist,int size)159 unset_addr_list_init(UnsetAddrList* uslist, int size)
160 {
161 UnsetAddr* p;
162
163 p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
164 CHECK_NULL_RETURN_MEMERR(p);
165 uslist->num = 0;
166 uslist->alloc = size;
167 uslist->us = p;
168 return 0;
169 }
170
171 static void
unset_addr_list_end(UnsetAddrList * uslist)172 unset_addr_list_end(UnsetAddrList* uslist)
173 {
174 if (IS_NOT_NULL(uslist->us))
175 xfree(uslist->us);
176 }
177
178 static int
unset_addr_list_add(UnsetAddrList * uslist,int offset,struct _Node * node)179 unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
180 {
181 UnsetAddr* p;
182 int size;
183
184 if (uslist->num >= uslist->alloc) {
185 size = uslist->alloc * 2;
186 p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
187 CHECK_NULL_RETURN_MEMERR(p);
188 uslist->alloc = size;
189 uslist->us = p;
190 }
191
192 uslist->us[uslist->num].offset = offset;
193 uslist->us[uslist->num].target = node;
194 uslist->num++;
195 return 0;
196 }
197 #endif /* USE_SUBEXP_CALL */
198
199
200 static int
add_opcode(regex_t * reg,int opcode)201 add_opcode(regex_t* reg, int opcode)
202 {
203 BBUF_ADD1(reg, opcode);
204 return 0;
205 }
206
207 #ifdef USE_COMBINATION_EXPLOSION_CHECK
208 static int
add_state_check_num(regex_t * reg,int num)209 add_state_check_num(regex_t* reg, int num)
210 {
211 StateCheckNumType n = (StateCheckNumType )num;
212
213 BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
214 return 0;
215 }
216 #endif
217
218 static int
add_rel_addr(regex_t * reg,int addr)219 add_rel_addr(regex_t* reg, int addr)
220 {
221 RelAddrType ra = (RelAddrType )addr;
222
223 BBUF_ADD(reg, &ra, SIZE_RELADDR);
224 return 0;
225 }
226
227 static int
add_abs_addr(regex_t * reg,int addr)228 add_abs_addr(regex_t* reg, int addr)
229 {
230 AbsAddrType ra = (AbsAddrType )addr;
231
232 BBUF_ADD(reg, &ra, SIZE_ABSADDR);
233 return 0;
234 }
235
236 static int
add_length(regex_t * reg,int len)237 add_length(regex_t* reg, int len)
238 {
239 LengthType l = (LengthType )len;
240
241 BBUF_ADD(reg, &l, SIZE_LENGTH);
242 return 0;
243 }
244
245 static int
add_mem_num(regex_t * reg,int num)246 add_mem_num(regex_t* reg, int num)
247 {
248 MemNumType n = (MemNumType )num;
249
250 BBUF_ADD(reg, &n, SIZE_MEMNUM);
251 return 0;
252 }
253
254 static int
add_pointer(regex_t * reg,void * addr)255 add_pointer(regex_t* reg, void* addr)
256 {
257 PointerType ptr = (PointerType )addr;
258
259 BBUF_ADD(reg, &ptr, SIZE_POINTER);
260 return 0;
261 }
262
263 static int
add_option(regex_t * reg,OnigOptionType option)264 add_option(regex_t* reg, OnigOptionType option)
265 {
266 BBUF_ADD(reg, &option, SIZE_OPTION);
267 return 0;
268 }
269
270 static int
add_opcode_rel_addr(regex_t * reg,int opcode,int addr)271 add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
272 {
273 int r;
274
275 r = add_opcode(reg, opcode);
276 if (r) return r;
277 r = add_rel_addr(reg, addr);
278 return r;
279 }
280
281 static int
add_bytes(regex_t * reg,UChar * bytes,int len)282 add_bytes(regex_t* reg, UChar* bytes, int len)
283 {
284 BBUF_ADD(reg, bytes, len);
285 return 0;
286 }
287
288 static int
add_bitset(regex_t * reg,BitSetRef bs)289 add_bitset(regex_t* reg, BitSetRef bs)
290 {
291 BBUF_ADD(reg, bs, SIZE_BITSET);
292 return 0;
293 }
294
295 static int
add_opcode_option(regex_t * reg,int opcode,OnigOptionType option)296 add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
297 {
298 int r;
299
300 r = add_opcode(reg, opcode);
301 if (r) return r;
302 r = add_option(reg, option);
303 return r;
304 }
305
306 static int compile_length_tree(Node* node, regex_t* reg);
307 static int compile_tree(Node* node, regex_t* reg);
308
309
310 #define IS_NEED_STR_LEN_OP_EXACT(op) \
311 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
312 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
313
314 static int
select_str_opcode(int mb_len,int str_len,int ignore_case)315 select_str_opcode(int mb_len, int str_len, int ignore_case)
316 {
317 int op;
318
319 if (ignore_case) {
320 switch (str_len) {
321 case 1: op = OP_EXACT1_IC; break;
322 default: op = OP_EXACTN_IC; break;
323 }
324 }
325 else {
326 switch (mb_len) {
327 case 1:
328 switch (str_len) {
329 case 1: op = OP_EXACT1; break;
330 case 2: op = OP_EXACT2; break;
331 case 3: op = OP_EXACT3; break;
332 case 4: op = OP_EXACT4; break;
333 case 5: op = OP_EXACT5; break;
334 default: op = OP_EXACTN; break;
335 }
336 break;
337
338 case 2:
339 switch (str_len) {
340 case 1: op = OP_EXACTMB2N1; break;
341 case 2: op = OP_EXACTMB2N2; break;
342 case 3: op = OP_EXACTMB2N3; break;
343 default: op = OP_EXACTMB2N; break;
344 }
345 break;
346
347 case 3:
348 op = OP_EXACTMB3N;
349 break;
350
351 default:
352 op = OP_EXACTMBN;
353 break;
354 }
355 }
356 return op;
357 }
358
359 static int
compile_tree_empty_check(Node * node,regex_t * reg,int empty_info)360 compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
361 {
362 int r;
363 int saved_num_null_check = reg->num_null_check;
364
365 if (empty_info != 0) {
366 r = add_opcode(reg, OP_NULL_CHECK_START);
367 if (r) return r;
368 r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
369 if (r) return r;
370 reg->num_null_check++;
371 }
372
373 r = compile_tree(node, reg);
374 if (r) return r;
375
376 if (empty_info != 0) {
377 if (empty_info == NQ_TARGET_IS_EMPTY)
378 r = add_opcode(reg, OP_NULL_CHECK_END);
379 else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
380 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
381 else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
382 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
383
384 if (r) return r;
385 r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
386 }
387 return r;
388 }
389
390 #ifdef USE_SUBEXP_CALL
391 static int
compile_call(CallNode * node,regex_t * reg)392 compile_call(CallNode* node, regex_t* reg)
393 {
394 int r;
395
396 r = add_opcode(reg, OP_CALL);
397 if (r) return r;
398 r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
399 node->target);
400 if (r) return r;
401 r = add_abs_addr(reg, 0 /*dummy addr.*/);
402 return r;
403 }
404 #endif
405
406 static int
compile_tree_n_times(Node * node,int n,regex_t * reg)407 compile_tree_n_times(Node* node, int n, regex_t* reg)
408 {
409 int i, r;
410
411 for (i = 0; i < n; i++) {
412 r = compile_tree(node, reg);
413 if (r) return r;
414 }
415 return 0;
416 }
417
418 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED,int ignore_case)419 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
420 regex_t* reg ARG_UNUSED, int ignore_case)
421 {
422 int len;
423 int op = select_str_opcode(mb_len, str_len, ignore_case);
424
425 len = SIZE_OPCODE;
426
427 if (op == OP_EXACTMBN) len += SIZE_LENGTH;
428 if (IS_NEED_STR_LEN_OP_EXACT(op))
429 len += SIZE_LENGTH;
430
431 len += mb_len * str_len;
432 return len;
433 }
434
435 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg,int ignore_case)436 add_compile_string(UChar* s, int mb_len, int str_len,
437 regex_t* reg, int ignore_case)
438 {
439 int op = select_str_opcode(mb_len, str_len, ignore_case);
440 add_opcode(reg, op);
441
442 if (op == OP_EXACTMBN)
443 add_length(reg, mb_len);
444
445 if (IS_NEED_STR_LEN_OP_EXACT(op)) {
446 if (op == OP_EXACTN_IC)
447 add_length(reg, mb_len * str_len);
448 else
449 add_length(reg, str_len);
450 }
451
452 add_bytes(reg, s, mb_len * str_len);
453 return 0;
454 }
455
456
457 static int
compile_length_string_node(Node * node,regex_t * reg)458 compile_length_string_node(Node* node, regex_t* reg)
459 {
460 int rlen, r, len, prev_len, slen, ambig;
461 OnigEncoding enc = reg->enc;
462 UChar *p, *prev;
463 StrNode* sn;
464
465 sn = NSTR(node);
466 if (sn->end <= sn->s)
467 return 0;
468
469 ambig = NSTRING_IS_AMBIG(node);
470
471 p = prev = sn->s;
472 SAFE_ENC_LEN(enc, p, sn->end, prev_len);
473 p += prev_len;
474 slen = 1;
475 rlen = 0;
476
477 for (; p < sn->end; ) {
478 SAFE_ENC_LEN(enc, p, sn->end, len);
479 if (len == prev_len) {
480 slen++;
481 }
482 else {
483 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
484 rlen += r;
485 prev = p;
486 slen = 1;
487 prev_len = len;
488 }
489 p += len;
490 }
491 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
492 rlen += r;
493 return rlen;
494 }
495
496 static int
compile_length_string_raw_node(StrNode * sn,regex_t * reg)497 compile_length_string_raw_node(StrNode* sn, regex_t* reg)
498 {
499 if (sn->end <= sn->s)
500 return 0;
501
502 return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
503 }
504
505 static int
compile_string_node(Node * node,regex_t * reg)506 compile_string_node(Node* node, regex_t* reg)
507 {
508 int r, len, prev_len, slen, ambig;
509 OnigEncoding enc = reg->enc;
510 UChar *p, *prev, *end;
511 StrNode* sn;
512
513 sn = NSTR(node);
514 if (sn->end <= sn->s)
515 return 0;
516
517 end = sn->end;
518 ambig = NSTRING_IS_AMBIG(node);
519
520 p = prev = sn->s;
521 SAFE_ENC_LEN(enc, p, end, prev_len);
522 p += prev_len;
523 slen = 1;
524
525 for (; p < end; ) {
526 SAFE_ENC_LEN(enc, p, end, len);
527 if (len == prev_len) {
528 slen++;
529 }
530 else {
531 r = add_compile_string(prev, prev_len, slen, reg, ambig);
532 if (r) return r;
533
534 prev = p;
535 slen = 1;
536 prev_len = len;
537 }
538
539 p += len;
540 }
541 return add_compile_string(prev, prev_len, slen, reg, ambig);
542 }
543
544 static int
compile_string_raw_node(StrNode * sn,regex_t * reg)545 compile_string_raw_node(StrNode* sn, regex_t* reg)
546 {
547 if (sn->end <= sn->s)
548 return 0;
549
550 return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
551 }
552
553 static int
add_multi_byte_cclass(BBuf * mbuf,regex_t * reg)554 add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
555 {
556 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
557 add_length(reg, mbuf->used);
558 return add_bytes(reg, mbuf->p, mbuf->used);
559 #else
560 int r, pad_size;
561 UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
562
563 GET_ALIGNMENT_PAD_SIZE(p, pad_size);
564 add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
565 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
566
567 r = add_bytes(reg, mbuf->p, mbuf->used);
568
569 /* padding for return value from compile_length_cclass_node() to be fix. */
570 pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
571 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
572 return r;
573 #endif
574 }
575
576 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)577 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
578 {
579 int len;
580
581 if (IS_NCCLASS_SHARE(cc)) {
582 len = SIZE_OPCODE + SIZE_POINTER;
583 return len;
584 }
585
586 if (IS_NULL(cc->mbuf)) {
587 len = SIZE_OPCODE + SIZE_BITSET;
588 }
589 else {
590 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
591 len = SIZE_OPCODE;
592 }
593 else {
594 len = SIZE_OPCODE + SIZE_BITSET;
595 }
596 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
597 len += SIZE_LENGTH + cc->mbuf->used;
598 #else
599 len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
600 #endif
601 }
602
603 return len;
604 }
605
606 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)607 compile_cclass_node(CClassNode* cc, regex_t* reg)
608 {
609 int r;
610
611 if (IS_NCCLASS_SHARE(cc)) {
612 add_opcode(reg, OP_CCLASS_NODE);
613 r = add_pointer(reg, cc);
614 return r;
615 }
616
617 if (IS_NULL(cc->mbuf)) {
618 if (IS_NCCLASS_NOT(cc))
619 add_opcode(reg, OP_CCLASS_NOT);
620 else
621 add_opcode(reg, OP_CCLASS);
622
623 r = add_bitset(reg, cc->bs);
624 }
625 else {
626 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
627 if (IS_NCCLASS_NOT(cc))
628 add_opcode(reg, OP_CCLASS_MB_NOT);
629 else
630 add_opcode(reg, OP_CCLASS_MB);
631
632 r = add_multi_byte_cclass(cc->mbuf, reg);
633 }
634 else {
635 if (IS_NCCLASS_NOT(cc))
636 add_opcode(reg, OP_CCLASS_MIX_NOT);
637 else
638 add_opcode(reg, OP_CCLASS_MIX);
639
640 r = add_bitset(reg, cc->bs);
641 if (r) return r;
642 r = add_multi_byte_cclass(cc->mbuf, reg);
643 }
644 }
645
646 return r;
647 }
648
649 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper)650 entry_repeat_range(regex_t* reg, int id, int lower, int upper)
651 {
652 #define REPEAT_RANGE_ALLOC 4
653
654 OnigRepeatRange* p;
655
656 if (reg->repeat_range_alloc == 0) {
657 p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
658 CHECK_NULL_RETURN_MEMERR(p);
659 reg->repeat_range = p;
660 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
661 }
662 else if (reg->repeat_range_alloc <= id) {
663 int n;
664 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
665 p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
666 sizeof(OnigRepeatRange) * n);
667 CHECK_NULL_RETURN_MEMERR(p);
668 reg->repeat_range = p;
669 reg->repeat_range_alloc = n;
670 }
671 else {
672 p = reg->repeat_range;
673 }
674
675 p[id].lower = lower;
676 p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
677 return 0;
678 }
679
680 static int
compile_range_repeat_node(QtfrNode * qn,int target_len,int empty_info,regex_t * reg)681 compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
682 regex_t* reg)
683 {
684 int r;
685 int num_repeat = reg->num_repeat;
686
687 r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
688 if (r) return r;
689 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
690 reg->num_repeat++;
691 if (r) return r;
692 r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
693 if (r) return r;
694
695 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
696 if (r) return r;
697
698 r = compile_tree_empty_check(qn->target, reg, empty_info);
699 if (r) return r;
700
701 if (
702 #ifdef USE_SUBEXP_CALL
703 reg->num_call > 0 ||
704 #endif
705 IS_QUANTIFIER_IN_REPEAT(qn)) {
706 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
707 }
708 else {
709 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
710 }
711 if (r) return r;
712 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
713 return r;
714 }
715
716 static int
is_anychar_star_quantifier(QtfrNode * qn)717 is_anychar_star_quantifier(QtfrNode* qn)
718 {
719 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
720 NTYPE(qn->target) == NT_CANY)
721 return 1;
722 else
723 return 0;
724 }
725
726 #define QUANTIFIER_EXPAND_LIMIT_SIZE 50
727 #define CKN_ON (ckn > 0)
728
729 #ifdef USE_COMBINATION_EXPLOSION_CHECK
730
731 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)732 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
733 {
734 int len, mod_tlen, cklen;
735 int ckn;
736 int infinite = IS_REPEAT_INFINITE(qn->upper);
737 int empty_info = qn->target_empty_info;
738 int tlen = compile_length_tree(qn->target, reg);
739
740 if (tlen < 0) return tlen;
741
742 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
743
744 cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
745
746 /* anychar repeat */
747 if (NTYPE(qn->target) == NT_CANY) {
748 if (qn->greedy && infinite) {
749 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
750 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
751 else
752 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
753 }
754 }
755
756 if (empty_info != 0)
757 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
758 else
759 mod_tlen = tlen;
760
761 if (infinite && qn->lower <= 1) {
762 if (qn->greedy) {
763 if (qn->lower == 1)
764 len = SIZE_OP_JUMP;
765 else
766 len = 0;
767
768 len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
769 }
770 else {
771 if (qn->lower == 0)
772 len = SIZE_OP_JUMP;
773 else
774 len = 0;
775
776 len += mod_tlen + SIZE_OP_PUSH + cklen;
777 }
778 }
779 else if (qn->upper == 0) {
780 if (qn->is_refered != 0) /* /(?<n>..){0}/ */
781 len = SIZE_OP_JUMP + tlen;
782 else
783 len = 0;
784 }
785 else if (qn->upper == 1 && qn->greedy) {
786 if (qn->lower == 0) {
787 if (CKN_ON) {
788 len = SIZE_OP_STATE_CHECK_PUSH + tlen;
789 }
790 else {
791 len = SIZE_OP_PUSH + tlen;
792 }
793 }
794 else {
795 len = tlen;
796 }
797 }
798 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
799 len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
800 }
801 else {
802 len = SIZE_OP_REPEAT_INC
803 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
804 if (CKN_ON)
805 len += SIZE_OP_STATE_CHECK;
806 }
807
808 return len;
809 }
810
811 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)812 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
813 {
814 int r, mod_tlen;
815 int ckn;
816 int infinite = IS_REPEAT_INFINITE(qn->upper);
817 int empty_info = qn->target_empty_info;
818 int tlen = compile_length_tree(qn->target, reg);
819
820 if (tlen < 0) return tlen;
821
822 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
823
824 if (is_anychar_star_quantifier(qn)) {
825 r = compile_tree_n_times(qn->target, qn->lower, reg);
826 if (r) return r;
827 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
828 if (IS_MULTILINE(reg->options))
829 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
830 else
831 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
832 if (r) return r;
833 if (CKN_ON) {
834 r = add_state_check_num(reg, ckn);
835 if (r) return r;
836 }
837
838 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
839 }
840 else {
841 if (IS_MULTILINE(reg->options)) {
842 r = add_opcode(reg, (CKN_ON ?
843 OP_STATE_CHECK_ANYCHAR_ML_STAR
844 : OP_ANYCHAR_ML_STAR));
845 }
846 else {
847 r = add_opcode(reg, (CKN_ON ?
848 OP_STATE_CHECK_ANYCHAR_STAR
849 : OP_ANYCHAR_STAR));
850 }
851 if (r) return r;
852 if (CKN_ON)
853 r = add_state_check_num(reg, ckn);
854
855 return r;
856 }
857 }
858
859 if (empty_info != 0)
860 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
861 else
862 mod_tlen = tlen;
863
864 if (infinite && qn->lower <= 1) {
865 if (qn->greedy) {
866 if (qn->lower == 1) {
867 r = add_opcode_rel_addr(reg, OP_JUMP,
868 (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
869 if (r) return r;
870 }
871
872 if (CKN_ON) {
873 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
874 if (r) return r;
875 r = add_state_check_num(reg, ckn);
876 if (r) return r;
877 r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
878 }
879 else {
880 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
881 }
882 if (r) return r;
883 r = compile_tree_empty_check(qn->target, reg, empty_info);
884 if (r) return r;
885 r = add_opcode_rel_addr(reg, OP_JUMP,
886 -(mod_tlen + (int )SIZE_OP_JUMP
887 + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
888 }
889 else {
890 if (qn->lower == 0) {
891 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
892 if (r) return r;
893 }
894 r = compile_tree_empty_check(qn->target, reg, empty_info);
895 if (r) return r;
896 if (CKN_ON) {
897 r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
898 if (r) return r;
899 r = add_state_check_num(reg, ckn);
900 if (r) return r;
901 r = add_rel_addr(reg,
902 -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
903 }
904 else
905 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
906 }
907 }
908 else if (qn->upper == 0) {
909 if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
910 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
911 if (r) return r;
912 r = compile_tree(qn->target, reg);
913 }
914 else
915 r = 0;
916 }
917 else if (qn->upper == 1 && qn->greedy) {
918 if (qn->lower == 0) {
919 if (CKN_ON) {
920 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
921 if (r) return r;
922 r = add_state_check_num(reg, ckn);
923 if (r) return r;
924 r = add_rel_addr(reg, tlen);
925 }
926 else {
927 r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
928 }
929 if (r) return r;
930 }
931
932 r = compile_tree(qn->target, reg);
933 }
934 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
935 if (CKN_ON) {
936 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
937 if (r) return r;
938 r = add_state_check_num(reg, ckn);
939 if (r) return r;
940 r = add_rel_addr(reg, SIZE_OP_JUMP);
941 }
942 else {
943 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
944 }
945
946 if (r) return r;
947 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
948 if (r) return r;
949 r = compile_tree(qn->target, reg);
950 }
951 else {
952 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
953 if (CKN_ON) {
954 if (r) return r;
955 r = add_opcode(reg, OP_STATE_CHECK);
956 if (r) return r;
957 r = add_state_check_num(reg, ckn);
958 }
959 }
960 return r;
961 }
962
963 #else /* USE_COMBINATION_EXPLOSION_CHECK */
964
965 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)966 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
967 {
968 int len, mod_tlen;
969 int infinite = IS_REPEAT_INFINITE(qn->upper);
970 int empty_info = qn->target_empty_info;
971 int tlen = compile_length_tree(qn->target, reg);
972
973 if (tlen < 0) return tlen;
974
975 /* anychar repeat */
976 if (NTYPE(qn->target) == NT_CANY) {
977 if (qn->greedy && infinite) {
978 if (IS_NOT_NULL(qn->next_head_exact))
979 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
980 else
981 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
982 }
983 }
984
985 if (empty_info != 0)
986 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
987 else
988 mod_tlen = tlen;
989
990 if (infinite &&
991 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
992 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
993 len = SIZE_OP_JUMP;
994 }
995 else {
996 len = tlen * qn->lower;
997 }
998
999 if (qn->greedy) {
1000 if (IS_NOT_NULL(qn->head_exact))
1001 len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1002 else if (IS_NOT_NULL(qn->next_head_exact))
1003 len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1004 else
1005 len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1006 }
1007 else
1008 len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1009 }
1010 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1011 len = SIZE_OP_JUMP + tlen;
1012 }
1013 else if (!infinite && qn->greedy &&
1014 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1015 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1016 len = tlen * qn->lower;
1017 len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1018 }
1019 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1020 len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1021 }
1022 else {
1023 len = SIZE_OP_REPEAT_INC
1024 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1025 }
1026
1027 return len;
1028 }
1029
1030 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)1031 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
1032 {
1033 int i, r, mod_tlen;
1034 int infinite = IS_REPEAT_INFINITE(qn->upper);
1035 int empty_info = qn->target_empty_info;
1036 int tlen = compile_length_tree(qn->target, reg);
1037
1038 if (tlen < 0) return tlen;
1039
1040 if (is_anychar_star_quantifier(qn)) {
1041 r = compile_tree_n_times(qn->target, qn->lower, reg);
1042 if (r) return r;
1043 if (IS_NOT_NULL(qn->next_head_exact)) {
1044 if (IS_MULTILINE(reg->options))
1045 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1046 else
1047 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1048 if (r) return r;
1049 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1050 }
1051 else {
1052 if (IS_MULTILINE(reg->options))
1053 return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1054 else
1055 return add_opcode(reg, OP_ANYCHAR_STAR);
1056 }
1057 }
1058
1059 if (empty_info != 0)
1060 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1061 else
1062 mod_tlen = tlen;
1063
1064 if (infinite &&
1065 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1066 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1067 if (qn->greedy) {
1068 if (IS_NOT_NULL(qn->head_exact))
1069 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1070 else if (IS_NOT_NULL(qn->next_head_exact))
1071 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1072 else
1073 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1074 }
1075 else {
1076 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1077 }
1078 if (r) return r;
1079 }
1080 else {
1081 r = compile_tree_n_times(qn->target, qn->lower, reg);
1082 if (r) return r;
1083 }
1084
1085 if (qn->greedy) {
1086 if (IS_NOT_NULL(qn->head_exact)) {
1087 r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1088 mod_tlen + SIZE_OP_JUMP);
1089 if (r) return r;
1090 add_bytes(reg, NSTR(qn->head_exact)->s, 1);
1091 r = compile_tree_empty_check(qn->target, reg, empty_info);
1092 if (r) return r;
1093 r = add_opcode_rel_addr(reg, OP_JUMP,
1094 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1095 }
1096 else if (IS_NOT_NULL(qn->next_head_exact)) {
1097 r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1098 mod_tlen + SIZE_OP_JUMP);
1099 if (r) return r;
1100 add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1101 r = compile_tree_empty_check(qn->target, reg, empty_info);
1102 if (r) return r;
1103 r = add_opcode_rel_addr(reg, OP_JUMP,
1104 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1105 }
1106 else {
1107 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1108 if (r) return r;
1109 r = compile_tree_empty_check(qn->target, reg, empty_info);
1110 if (r) return r;
1111 r = add_opcode_rel_addr(reg, OP_JUMP,
1112 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1113 }
1114 }
1115 else {
1116 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1117 if (r) return r;
1118 r = compile_tree_empty_check(qn->target, reg, empty_info);
1119 if (r) return r;
1120 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1121 }
1122 }
1123 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1124 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1125 if (r) return r;
1126 r = compile_tree(qn->target, reg);
1127 }
1128 else if (!infinite && qn->greedy &&
1129 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1130 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1131 int n = qn->upper - qn->lower;
1132
1133 r = compile_tree_n_times(qn->target, qn->lower, reg);
1134 if (r) return r;
1135
1136 for (i = 0; i < n; i++) {
1137 r = add_opcode_rel_addr(reg, OP_PUSH,
1138 (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1139 if (r) return r;
1140 r = compile_tree(qn->target, reg);
1141 if (r) return r;
1142 }
1143 }
1144 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1145 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1146 if (r) return r;
1147 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1148 if (r) return r;
1149 r = compile_tree(qn->target, reg);
1150 }
1151 else {
1152 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1153 }
1154 return r;
1155 }
1156 #endif /* USE_COMBINATION_EXPLOSION_CHECK */
1157
1158 static int
compile_length_option_node(EncloseNode * node,regex_t * reg)1159 compile_length_option_node(EncloseNode* node, regex_t* reg)
1160 {
1161 int tlen;
1162 OnigOptionType prev = reg->options;
1163
1164 reg->options = node->option;
1165 tlen = compile_length_tree(node->target, reg);
1166 reg->options = prev;
1167
1168 if (tlen < 0) return tlen;
1169
1170 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1171 return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1172 + tlen + SIZE_OP_SET_OPTION;
1173 }
1174 else
1175 return tlen;
1176 }
1177
1178 static int
compile_option_node(EncloseNode * node,regex_t * reg)1179 compile_option_node(EncloseNode* node, regex_t* reg)
1180 {
1181 int r;
1182 OnigOptionType prev = reg->options;
1183
1184 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1185 r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1186 if (r) return r;
1187 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1188 if (r) return r;
1189 r = add_opcode(reg, OP_FAIL);
1190 if (r) return r;
1191 }
1192
1193 reg->options = node->option;
1194 r = compile_tree(node->target, reg);
1195 reg->options = prev;
1196
1197 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1198 if (r) return r;
1199 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1200 }
1201 return r;
1202 }
1203
1204 static int
compile_length_enclose_node(EncloseNode * node,regex_t * reg)1205 compile_length_enclose_node(EncloseNode* node, regex_t* reg)
1206 {
1207 int len;
1208 int tlen;
1209
1210 if (node->type == ENCLOSE_OPTION)
1211 return compile_length_option_node(node, reg);
1212
1213 if (node->target) {
1214 tlen = compile_length_tree(node->target, reg);
1215 if (tlen < 0) return tlen;
1216 }
1217 else
1218 tlen = 0;
1219
1220 switch (node->type) {
1221 case ENCLOSE_MEMORY:
1222 #ifdef USE_SUBEXP_CALL
1223 if (IS_ENCLOSE_CALLED(node)) {
1224 len = SIZE_OP_MEMORY_START_PUSH + tlen
1225 + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1226 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1227 len += (IS_ENCLOSE_RECURSION(node)
1228 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1229 else
1230 len += (IS_ENCLOSE_RECURSION(node)
1231 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1232 }
1233 else
1234 #endif
1235 {
1236 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1237 len = SIZE_OP_MEMORY_START_PUSH;
1238 else
1239 len = SIZE_OP_MEMORY_START;
1240
1241 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1242 ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1243 }
1244 break;
1245
1246 case ENCLOSE_STOP_BACKTRACK:
1247 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1248 QtfrNode* qn = NQTFR(node->target);
1249 tlen = compile_length_tree(qn->target, reg);
1250 if (tlen < 0) return tlen;
1251
1252 len = tlen * qn->lower
1253 + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1254 }
1255 else {
1256 len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1257 }
1258 break;
1259
1260 default:
1261 return ONIGERR_TYPE_BUG;
1262 break;
1263 }
1264
1265 return len;
1266 }
1267
1268 static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1269
1270 static int
compile_enclose_node(EncloseNode * node,regex_t * reg)1271 compile_enclose_node(EncloseNode* node, regex_t* reg)
1272 {
1273 int r, len;
1274
1275 if (node->type == ENCLOSE_OPTION)
1276 return compile_option_node(node, reg);
1277
1278 switch (node->type) {
1279 case ENCLOSE_MEMORY:
1280 #ifdef USE_SUBEXP_CALL
1281 if (IS_ENCLOSE_CALLED(node)) {
1282 r = add_opcode(reg, OP_CALL);
1283 if (r) return r;
1284 node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1285 node->state |= NST_ADDR_FIXED;
1286 r = add_abs_addr(reg, (int )node->call_addr);
1287 if (r) return r;
1288 len = compile_length_tree(node->target, reg);
1289 len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1290 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1291 len += (IS_ENCLOSE_RECURSION(node)
1292 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1293 else
1294 len += (IS_ENCLOSE_RECURSION(node)
1295 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1296
1297 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1298 if (r) return r;
1299 }
1300 #endif
1301 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1302 r = add_opcode(reg, OP_MEMORY_START_PUSH);
1303 else
1304 r = add_opcode(reg, OP_MEMORY_START);
1305 if (r) return r;
1306 r = add_mem_num(reg, node->regnum);
1307 if (r) return r;
1308 r = compile_tree(node->target, reg);
1309 if (r) return r;
1310 #ifdef USE_SUBEXP_CALL
1311 if (IS_ENCLOSE_CALLED(node)) {
1312 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1313 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1314 ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1315 else
1316 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1317 ? OP_MEMORY_END_REC : OP_MEMORY_END));
1318
1319 if (r) return r;
1320 r = add_mem_num(reg, node->regnum);
1321 if (r) return r;
1322 r = add_opcode(reg, OP_RETURN);
1323 }
1324 else
1325 #endif
1326 {
1327 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1328 r = add_opcode(reg, OP_MEMORY_END_PUSH);
1329 else
1330 r = add_opcode(reg, OP_MEMORY_END);
1331 if (r) return r;
1332 r = add_mem_num(reg, node->regnum);
1333 }
1334 break;
1335
1336 case ENCLOSE_STOP_BACKTRACK:
1337 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1338 QtfrNode* qn = NQTFR(node->target);
1339 r = compile_tree_n_times(qn->target, qn->lower, reg);
1340 if (r) return r;
1341
1342 len = compile_length_tree(qn->target, reg);
1343 if (len < 0) return len;
1344
1345 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1346 if (r) return r;
1347 r = compile_tree(qn->target, reg);
1348 if (r) return r;
1349 r = add_opcode(reg, OP_POP);
1350 if (r) return r;
1351 r = add_opcode_rel_addr(reg, OP_JUMP,
1352 -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1353 }
1354 else {
1355 r = add_opcode(reg, OP_PUSH_STOP_BT);
1356 if (r) return r;
1357 r = compile_tree(node->target, reg);
1358 if (r) return r;
1359 r = add_opcode(reg, OP_POP_STOP_BT);
1360 }
1361 break;
1362
1363 default:
1364 return ONIGERR_TYPE_BUG;
1365 break;
1366 }
1367
1368 return r;
1369 }
1370
1371 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1372 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1373 {
1374 int len;
1375 int tlen = 0;
1376
1377 if (node->target) {
1378 tlen = compile_length_tree(node->target, reg);
1379 if (tlen < 0) return tlen;
1380 }
1381
1382 switch (node->type) {
1383 case ANCHOR_PREC_READ:
1384 len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1385 break;
1386 case ANCHOR_PREC_READ_NOT:
1387 len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1388 break;
1389 case ANCHOR_LOOK_BEHIND:
1390 len = SIZE_OP_LOOK_BEHIND + tlen;
1391 break;
1392 case ANCHOR_LOOK_BEHIND_NOT:
1393 len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1394 break;
1395
1396 default:
1397 len = SIZE_OPCODE;
1398 break;
1399 }
1400
1401 return len;
1402 }
1403
1404 static int
compile_anchor_node(AnchorNode * node,regex_t * reg)1405 compile_anchor_node(AnchorNode* node, regex_t* reg)
1406 {
1407 int r, len;
1408
1409 switch (node->type) {
1410 case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
1411 case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
1412 case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
1413 case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
1414 case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
1415 case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1416
1417 case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break;
1418 case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break;
1419 #ifdef USE_WORD_BEGIN_END
1420 case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break;
1421 case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break;
1422 #endif
1423
1424 case ANCHOR_PREC_READ:
1425 r = add_opcode(reg, OP_PUSH_POS);
1426 if (r) return r;
1427 r = compile_tree(node->target, reg);
1428 if (r) return r;
1429 r = add_opcode(reg, OP_POP_POS);
1430 break;
1431
1432 case ANCHOR_PREC_READ_NOT:
1433 len = compile_length_tree(node->target, reg);
1434 if (len < 0) return len;
1435 r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1436 if (r) return r;
1437 r = compile_tree(node->target, reg);
1438 if (r) return r;
1439 r = add_opcode(reg, OP_FAIL_POS);
1440 break;
1441
1442 case ANCHOR_LOOK_BEHIND:
1443 {
1444 int n;
1445 r = add_opcode(reg, OP_LOOK_BEHIND);
1446 if (r) return r;
1447 if (node->char_len < 0) {
1448 r = get_char_length_tree(node->target, reg, &n);
1449 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1450 }
1451 else
1452 n = node->char_len;
1453 r = add_length(reg, n);
1454 if (r) return r;
1455 r = compile_tree(node->target, reg);
1456 }
1457 break;
1458
1459 case ANCHOR_LOOK_BEHIND_NOT:
1460 {
1461 int n;
1462 len = compile_length_tree(node->target, reg);
1463 r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1464 len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1465 if (r) return r;
1466 if (node->char_len < 0) {
1467 r = get_char_length_tree(node->target, reg, &n);
1468 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1469 }
1470 else
1471 n = node->char_len;
1472 r = add_length(reg, n);
1473 if (r) return r;
1474 r = compile_tree(node->target, reg);
1475 if (r) return r;
1476 r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1477 }
1478 break;
1479
1480 default:
1481 return ONIGERR_TYPE_BUG;
1482 break;
1483 }
1484
1485 return r;
1486 }
1487
1488 static int
compile_length_tree(Node * node,regex_t * reg)1489 compile_length_tree(Node* node, regex_t* reg)
1490 {
1491 int len, type, r;
1492
1493 type = NTYPE(node);
1494 switch (type) {
1495 case NT_LIST:
1496 len = 0;
1497 do {
1498 r = compile_length_tree(NCAR(node), reg);
1499 if (r < 0) return r;
1500 len += r;
1501 } while (IS_NOT_NULL(node = NCDR(node)));
1502 r = len;
1503 break;
1504
1505 case NT_ALT:
1506 {
1507 int n;
1508
1509 n = r = 0;
1510 do {
1511 r += compile_length_tree(NCAR(node), reg);
1512 n++;
1513 } while (IS_NOT_NULL(node = NCDR(node)));
1514 r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1515 }
1516 break;
1517
1518 case NT_STR:
1519 if (NSTRING_IS_RAW(node))
1520 r = compile_length_string_raw_node(NSTR(node), reg);
1521 else
1522 r = compile_length_string_node(node, reg);
1523 break;
1524
1525 case NT_CCLASS:
1526 r = compile_length_cclass_node(NCCLASS(node), reg);
1527 break;
1528
1529 case NT_CTYPE:
1530 case NT_CANY:
1531 r = SIZE_OPCODE;
1532 break;
1533
1534 case NT_BREF:
1535 {
1536 BRefNode* br = NBREF(node);
1537
1538 #ifdef USE_BACKREF_WITH_LEVEL
1539 if (IS_BACKREF_NEST_LEVEL(br)) {
1540 r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1541 SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1542 }
1543 else
1544 #endif
1545 if (br->back_num == 1) {
1546 r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1547 ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1548 }
1549 else {
1550 r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1551 }
1552 }
1553 break;
1554
1555 #ifdef USE_SUBEXP_CALL
1556 case NT_CALL:
1557 r = SIZE_OP_CALL;
1558 break;
1559 #endif
1560
1561 case NT_QTFR:
1562 r = compile_length_quantifier_node(NQTFR(node), reg);
1563 break;
1564
1565 case NT_ENCLOSE:
1566 r = compile_length_enclose_node(NENCLOSE(node), reg);
1567 break;
1568
1569 case NT_ANCHOR:
1570 r = compile_length_anchor_node(NANCHOR(node), reg);
1571 break;
1572
1573 default:
1574 return ONIGERR_TYPE_BUG;
1575 break;
1576 }
1577
1578 return r;
1579 }
1580
1581 static int
compile_tree(Node * node,regex_t * reg)1582 compile_tree(Node* node, regex_t* reg)
1583 {
1584 int n, type, len, pos, r = 0;
1585
1586 type = NTYPE(node);
1587 switch (type) {
1588 case NT_LIST:
1589 do {
1590 r = compile_tree(NCAR(node), reg);
1591 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1592 break;
1593
1594 case NT_ALT:
1595 {
1596 Node* x = node;
1597 len = 0;
1598 do {
1599 len += compile_length_tree(NCAR(x), reg);
1600 if (NCDR(x) != NULL) {
1601 len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1602 }
1603 } while (IS_NOT_NULL(x = NCDR(x)));
1604 pos = reg->used + len; /* goal position */
1605
1606 do {
1607 len = compile_length_tree(NCAR(node), reg);
1608 if (IS_NOT_NULL(NCDR(node))) {
1609 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1610 if (r) break;
1611 }
1612 r = compile_tree(NCAR(node), reg);
1613 if (r) break;
1614 if (IS_NOT_NULL(NCDR(node))) {
1615 len = pos - (reg->used + SIZE_OP_JUMP);
1616 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1617 if (r) break;
1618 }
1619 } while (IS_NOT_NULL(node = NCDR(node)));
1620 }
1621 break;
1622
1623 case NT_STR:
1624 if (NSTRING_IS_RAW(node))
1625 r = compile_string_raw_node(NSTR(node), reg);
1626 else
1627 r = compile_string_node(node, reg);
1628 break;
1629
1630 case NT_CCLASS:
1631 r = compile_cclass_node(NCCLASS(node), reg);
1632 break;
1633
1634 case NT_CTYPE:
1635 {
1636 int op;
1637
1638 switch (NCTYPE(node)->ctype) {
1639 case ONIGENC_CTYPE_WORD:
1640 if (NCTYPE(node)->not != 0) op = OP_NOT_WORD;
1641 else op = OP_WORD;
1642 break;
1643 default:
1644 return ONIGERR_TYPE_BUG;
1645 break;
1646 }
1647 r = add_opcode(reg, op);
1648 }
1649 break;
1650
1651 case NT_CANY:
1652 if (IS_MULTILINE(reg->options))
1653 r = add_opcode(reg, OP_ANYCHAR_ML);
1654 else
1655 r = add_opcode(reg, OP_ANYCHAR);
1656 break;
1657
1658 case NT_BREF:
1659 {
1660 BRefNode* br = NBREF(node);
1661
1662 #ifdef USE_BACKREF_WITH_LEVEL
1663 if (IS_BACKREF_NEST_LEVEL(br)) {
1664 r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
1665 if (r) return r;
1666 r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1667 if (r) return r;
1668 r = add_length(reg, br->nest_level);
1669 if (r) return r;
1670
1671 goto add_bacref_mems;
1672 }
1673 else
1674 #endif
1675 if (br->back_num == 1) {
1676 n = br->back_static[0];
1677 if (IS_IGNORECASE(reg->options)) {
1678 r = add_opcode(reg, OP_BACKREFN_IC);
1679 if (r) return r;
1680 r = add_mem_num(reg, n);
1681 }
1682 else {
1683 switch (n) {
1684 case 1: r = add_opcode(reg, OP_BACKREF1); break;
1685 case 2: r = add_opcode(reg, OP_BACKREF2); break;
1686 default:
1687 r = add_opcode(reg, OP_BACKREFN);
1688 if (r) return r;
1689 r = add_mem_num(reg, n);
1690 break;
1691 }
1692 }
1693 }
1694 else {
1695 int i;
1696 int* p;
1697
1698 if (IS_IGNORECASE(reg->options)) {
1699 r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1700 }
1701 else {
1702 r = add_opcode(reg, OP_BACKREF_MULTI);
1703 }
1704 if (r) return r;
1705
1706 #ifdef USE_BACKREF_WITH_LEVEL
1707 add_bacref_mems:
1708 #endif
1709 r = add_length(reg, br->back_num);
1710 if (r) return r;
1711 p = BACKREFS_P(br);
1712 for (i = br->back_num - 1; i >= 0; i--) {
1713 r = add_mem_num(reg, p[i]);
1714 if (r) return r;
1715 }
1716 }
1717 }
1718 break;
1719
1720 #ifdef USE_SUBEXP_CALL
1721 case NT_CALL:
1722 r = compile_call(NCALL(node), reg);
1723 break;
1724 #endif
1725
1726 case NT_QTFR:
1727 r = compile_quantifier_node(NQTFR(node), reg);
1728 break;
1729
1730 case NT_ENCLOSE:
1731 r = compile_enclose_node(NENCLOSE(node), reg);
1732 break;
1733
1734 case NT_ANCHOR:
1735 r = compile_anchor_node(NANCHOR(node), reg);
1736 break;
1737
1738 default:
1739 #ifdef ONIG_DEBUG
1740 fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1741 #endif
1742 break;
1743 }
1744
1745 return r;
1746 }
1747
1748 #ifdef USE_NAMED_GROUP
1749
1750 static int
noname_disable_map(Node ** plink,GroupNumRemap * map,int * counter)1751 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1752 {
1753 int r = 0;
1754 Node* node = *plink;
1755
1756 switch (NTYPE(node)) {
1757 case NT_LIST:
1758 case NT_ALT:
1759 do {
1760 r = noname_disable_map(&(NCAR(node)), map, counter);
1761 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1762 break;
1763
1764 case NT_QTFR:
1765 {
1766 Node** ptarget = &(NQTFR(node)->target);
1767 Node* old = *ptarget;
1768 r = noname_disable_map(ptarget, map, counter);
1769 if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
1770 onig_reduce_nested_quantifier(node, *ptarget);
1771 }
1772 }
1773 break;
1774
1775 case NT_ENCLOSE:
1776 {
1777 EncloseNode* en = NENCLOSE(node);
1778 if (en->type == ENCLOSE_MEMORY) {
1779 if (IS_ENCLOSE_NAMED_GROUP(en)) {
1780 (*counter)++;
1781 map[en->regnum].new_val = *counter;
1782 en->regnum = *counter;
1783 r = noname_disable_map(&(en->target), map, counter);
1784 }
1785 else {
1786 *plink = en->target;
1787 en->target = NULL_NODE;
1788 onig_node_free(node);
1789 r = noname_disable_map(plink, map, counter);
1790 }
1791 }
1792 else
1793 r = noname_disable_map(&(en->target), map, counter);
1794 }
1795 break;
1796
1797 default:
1798 break;
1799 }
1800
1801 return r;
1802 }
1803
1804 static int
renumber_node_backref(Node * node,GroupNumRemap * map)1805 renumber_node_backref(Node* node, GroupNumRemap* map)
1806 {
1807 int i, pos, n, old_num;
1808 int *backs;
1809 BRefNode* bn = NBREF(node);
1810
1811 if (! IS_BACKREF_NAME_REF(bn))
1812 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1813
1814 old_num = bn->back_num;
1815 if (IS_NULL(bn->back_dynamic))
1816 backs = bn->back_static;
1817 else
1818 backs = bn->back_dynamic;
1819
1820 for (i = 0, pos = 0; i < old_num; i++) {
1821 n = map[backs[i]].new_val;
1822 if (n > 0) {
1823 backs[pos] = n;
1824 pos++;
1825 }
1826 }
1827
1828 bn->back_num = pos;
1829 return 0;
1830 }
1831
1832 static int
renumber_by_map(Node * node,GroupNumRemap * map)1833 renumber_by_map(Node* node, GroupNumRemap* map)
1834 {
1835 int r = 0;
1836
1837 switch (NTYPE(node)) {
1838 case NT_LIST:
1839 case NT_ALT:
1840 do {
1841 r = renumber_by_map(NCAR(node), map);
1842 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1843 break;
1844 case NT_QTFR:
1845 r = renumber_by_map(NQTFR(node)->target, map);
1846 break;
1847 case NT_ENCLOSE:
1848 r = renumber_by_map(NENCLOSE(node)->target, map);
1849 break;
1850
1851 case NT_BREF:
1852 r = renumber_node_backref(node, map);
1853 break;
1854
1855 default:
1856 break;
1857 }
1858
1859 return r;
1860 }
1861
1862 static int
numbered_ref_check(Node * node)1863 numbered_ref_check(Node* node)
1864 {
1865 int r = 0;
1866
1867 switch (NTYPE(node)) {
1868 case NT_LIST:
1869 case NT_ALT:
1870 do {
1871 r = numbered_ref_check(NCAR(node));
1872 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1873 break;
1874 case NT_QTFR:
1875 r = numbered_ref_check(NQTFR(node)->target);
1876 break;
1877 case NT_ENCLOSE:
1878 r = numbered_ref_check(NENCLOSE(node)->target);
1879 break;
1880
1881 case NT_BREF:
1882 if (! IS_BACKREF_NAME_REF(NBREF(node)))
1883 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1884 break;
1885
1886 default:
1887 break;
1888 }
1889
1890 return r;
1891 }
1892
1893 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ScanEnv * env)1894 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
1895 {
1896 int r, i, pos, counter;
1897 BitStatusType loc;
1898 GroupNumRemap* map;
1899
1900 map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
1901 CHECK_NULL_RETURN_MEMERR(map);
1902 for (i = 1; i <= env->num_mem; i++) {
1903 map[i].new_val = 0;
1904 }
1905 counter = 0;
1906 r = noname_disable_map(root, map, &counter);
1907 if (r != 0) return r;
1908
1909 r = renumber_by_map(*root, map);
1910 if (r != 0) return r;
1911
1912 for (i = 1, pos = 1; i <= env->num_mem; i++) {
1913 if (map[i].new_val > 0) {
1914 SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
1915 pos++;
1916 }
1917 }
1918
1919 loc = env->capture_history;
1920 BIT_STATUS_CLEAR(env->capture_history);
1921 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
1922 if (BIT_STATUS_AT(loc, i)) {
1923 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
1924 }
1925 }
1926
1927 env->num_mem = env->num_named;
1928 reg->num_mem = env->num_named;
1929
1930 return onig_renumber_name_table(reg, map);
1931 }
1932 #endif /* USE_NAMED_GROUP */
1933
1934 #ifdef USE_SUBEXP_CALL
1935 static int
unset_addr_list_fix(UnsetAddrList * uslist,regex_t * reg)1936 unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
1937 {
1938 int i, offset;
1939 EncloseNode* en;
1940 AbsAddrType addr;
1941
1942 for (i = 0; i < uslist->num; i++) {
1943 en = NENCLOSE(uslist->us[i].target);
1944 if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
1945 addr = en->call_addr;
1946 offset = uslist->us[i].offset;
1947
1948 BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
1949 }
1950 return 0;
1951 }
1952 #endif
1953
1954 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
1955 static int
quantifiers_memory_node_info(Node * node)1956 quantifiers_memory_node_info(Node* node)
1957 {
1958 int r = 0;
1959
1960 switch (NTYPE(node)) {
1961 case NT_LIST:
1962 case NT_ALT:
1963 {
1964 int v;
1965 do {
1966 v = quantifiers_memory_node_info(NCAR(node));
1967 if (v > r) r = v;
1968 } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
1969 }
1970 break;
1971
1972 #ifdef USE_SUBEXP_CALL
1973 case NT_CALL:
1974 if (IS_CALL_RECURSION(NCALL(node))) {
1975 return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
1976 }
1977 else
1978 r = quantifiers_memory_node_info(NCALL(node)->target);
1979 break;
1980 #endif
1981
1982 case NT_QTFR:
1983 {
1984 QtfrNode* qn = NQTFR(node);
1985 if (qn->upper != 0) {
1986 r = quantifiers_memory_node_info(qn->target);
1987 }
1988 }
1989 break;
1990
1991 case NT_ENCLOSE:
1992 {
1993 EncloseNode* en = NENCLOSE(node);
1994 switch (en->type) {
1995 case ENCLOSE_MEMORY:
1996 return NQ_TARGET_IS_EMPTY_MEM;
1997 break;
1998
1999 case ENCLOSE_OPTION:
2000 case ENCLOSE_STOP_BACKTRACK:
2001 r = quantifiers_memory_node_info(en->target);
2002 break;
2003 default:
2004 break;
2005 }
2006 }
2007 break;
2008
2009 case NT_BREF:
2010 case NT_STR:
2011 case NT_CTYPE:
2012 case NT_CCLASS:
2013 case NT_CANY:
2014 case NT_ANCHOR:
2015 default:
2016 break;
2017 }
2018
2019 return r;
2020 }
2021 #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
2022
2023 static int
get_min_match_length(Node * node,OnigDistance * min,ScanEnv * env)2024 get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
2025 {
2026 OnigDistance tmin;
2027 int r = 0;
2028
2029 *min = 0;
2030 switch (NTYPE(node)) {
2031 case NT_BREF:
2032 {
2033 int i;
2034 int* backs;
2035 Node** nodes = SCANENV_MEM_NODES(env);
2036 BRefNode* br = NBREF(node);
2037 if (br->state & NST_RECURSION) break;
2038
2039 backs = BACKREFS_P(br);
2040 if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2041 r = get_min_match_length(nodes[backs[0]], min, env);
2042 if (r != 0) break;
2043 for (i = 1; i < br->back_num; i++) {
2044 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2045 r = get_min_match_length(nodes[backs[i]], &tmin, env);
2046 if (r != 0) break;
2047 if (*min > tmin) *min = tmin;
2048 }
2049 }
2050 break;
2051
2052 #ifdef USE_SUBEXP_CALL
2053 case NT_CALL:
2054 if (IS_CALL_RECURSION(NCALL(node))) {
2055 EncloseNode* en = NENCLOSE(NCALL(node)->target);
2056 if (IS_ENCLOSE_MIN_FIXED(en))
2057 *min = en->min_len;
2058 }
2059 else
2060 r = get_min_match_length(NCALL(node)->target, min, env);
2061 break;
2062 #endif
2063
2064 case NT_LIST:
2065 do {
2066 r = get_min_match_length(NCAR(node), &tmin, env);
2067 if (r == 0) *min += tmin;
2068 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2069 break;
2070
2071 case NT_ALT:
2072 {
2073 Node *x, *y;
2074 y = node;
2075 do {
2076 x = NCAR(y);
2077 r = get_min_match_length(x, &tmin, env);
2078 if (r != 0) break;
2079 if (y == node) *min = tmin;
2080 else if (*min > tmin) *min = tmin;
2081 } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
2082 }
2083 break;
2084
2085 case NT_STR:
2086 {
2087 StrNode* sn = NSTR(node);
2088 *min = sn->end - sn->s;
2089 }
2090 break;
2091
2092 case NT_CTYPE:
2093 *min = 1;
2094 break;
2095
2096 case NT_CCLASS:
2097 case NT_CANY:
2098 *min = 1;
2099 break;
2100
2101 case NT_QTFR:
2102 {
2103 QtfrNode* qn = NQTFR(node);
2104
2105 if (qn->lower > 0) {
2106 r = get_min_match_length(qn->target, min, env);
2107 if (r == 0)
2108 *min = distance_multiply(*min, qn->lower);
2109 }
2110 }
2111 break;
2112
2113 case NT_ENCLOSE:
2114 {
2115 EncloseNode* en = NENCLOSE(node);
2116 switch (en->type) {
2117 case ENCLOSE_MEMORY:
2118 #ifdef USE_SUBEXP_CALL
2119 if (IS_ENCLOSE_MIN_FIXED(en))
2120 *min = en->min_len;
2121 else {
2122 r = get_min_match_length(en->target, min, env);
2123 if (r == 0) {
2124 en->min_len = *min;
2125 SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
2126 }
2127 }
2128 break;
2129 #endif
2130 case ENCLOSE_OPTION:
2131 case ENCLOSE_STOP_BACKTRACK:
2132 r = get_min_match_length(en->target, min, env);
2133 break;
2134 }
2135 }
2136 break;
2137
2138 case NT_ANCHOR:
2139 default:
2140 break;
2141 }
2142
2143 return r;
2144 }
2145
2146 static int
get_max_match_length(Node * node,OnigDistance * max,ScanEnv * env)2147 get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
2148 {
2149 OnigDistance tmax;
2150 int r = 0;
2151
2152 *max = 0;
2153 switch (NTYPE(node)) {
2154 case NT_LIST:
2155 do {
2156 r = get_max_match_length(NCAR(node), &tmax, env);
2157 if (r == 0)
2158 *max = distance_add(*max, tmax);
2159 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2160 break;
2161
2162 case NT_ALT:
2163 do {
2164 r = get_max_match_length(NCAR(node), &tmax, env);
2165 if (r == 0 && *max < tmax) *max = tmax;
2166 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2167 break;
2168
2169 case NT_STR:
2170 {
2171 StrNode* sn = NSTR(node);
2172 *max = sn->end - sn->s;
2173 }
2174 break;
2175
2176 case NT_CTYPE:
2177 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2178 break;
2179
2180 case NT_CCLASS:
2181 case NT_CANY:
2182 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2183 break;
2184
2185 case NT_BREF:
2186 {
2187 int i;
2188 int* backs;
2189 Node** nodes = SCANENV_MEM_NODES(env);
2190 BRefNode* br = NBREF(node);
2191 if (br->state & NST_RECURSION) {
2192 *max = ONIG_INFINITE_DISTANCE;
2193 break;
2194 }
2195 backs = BACKREFS_P(br);
2196 for (i = 0; i < br->back_num; i++) {
2197 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2198 r = get_max_match_length(nodes[backs[i]], &tmax, env);
2199 if (r != 0) break;
2200 if (*max < tmax) *max = tmax;
2201 }
2202 }
2203 break;
2204
2205 #ifdef USE_SUBEXP_CALL
2206 case NT_CALL:
2207 if (! IS_CALL_RECURSION(NCALL(node)))
2208 r = get_max_match_length(NCALL(node)->target, max, env);
2209 else
2210 *max = ONIG_INFINITE_DISTANCE;
2211 break;
2212 #endif
2213
2214 case NT_QTFR:
2215 {
2216 QtfrNode* qn = NQTFR(node);
2217
2218 if (qn->upper != 0) {
2219 r = get_max_match_length(qn->target, max, env);
2220 if (r == 0 && *max != 0) {
2221 if (! IS_REPEAT_INFINITE(qn->upper))
2222 *max = distance_multiply(*max, qn->upper);
2223 else
2224 *max = ONIG_INFINITE_DISTANCE;
2225 }
2226 }
2227 }
2228 break;
2229
2230 case NT_ENCLOSE:
2231 {
2232 EncloseNode* en = NENCLOSE(node);
2233 switch (en->type) {
2234 case ENCLOSE_MEMORY:
2235 #ifdef USE_SUBEXP_CALL
2236 if (IS_ENCLOSE_MAX_FIXED(en))
2237 *max = en->max_len;
2238 else {
2239 r = get_max_match_length(en->target, max, env);
2240 if (r == 0) {
2241 en->max_len = *max;
2242 SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
2243 }
2244 }
2245 break;
2246 #endif
2247 case ENCLOSE_OPTION:
2248 case ENCLOSE_STOP_BACKTRACK:
2249 r = get_max_match_length(en->target, max, env);
2250 break;
2251 }
2252 }
2253 break;
2254
2255 case NT_ANCHOR:
2256 default:
2257 break;
2258 }
2259
2260 return r;
2261 }
2262
2263 #define GET_CHAR_LEN_VARLEN -1
2264 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2265
2266 /* fixed size pattern node only */
2267 static int
get_char_length_tree1(Node * node,regex_t * reg,int * len,int level)2268 get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2269 {
2270 int tlen;
2271 int r = 0;
2272
2273 level++;
2274 *len = 0;
2275 switch (NTYPE(node)) {
2276 case NT_LIST:
2277 do {
2278 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2279 if (r == 0)
2280 *len = distance_add(*len, tlen);
2281 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2282 break;
2283
2284 case NT_ALT:
2285 {
2286 int tlen2;
2287 int varlen = 0;
2288
2289 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2290 while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
2291 r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
2292 if (r == 0) {
2293 if (tlen != tlen2)
2294 varlen = 1;
2295 }
2296 }
2297 if (r == 0) {
2298 if (varlen != 0) {
2299 if (level == 1)
2300 r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2301 else
2302 r = GET_CHAR_LEN_VARLEN;
2303 }
2304 else
2305 *len = tlen;
2306 }
2307 }
2308 break;
2309
2310 case NT_STR:
2311 {
2312 StrNode* sn = NSTR(node);
2313 UChar *s = sn->s;
2314 while (s < sn->end) {
2315 s += enclen(reg->enc, s);
2316 (*len)++;
2317 }
2318 }
2319 break;
2320
2321 case NT_QTFR:
2322 {
2323 QtfrNode* qn = NQTFR(node);
2324 if (qn->lower == qn->upper) {
2325 r = get_char_length_tree1(qn->target, reg, &tlen, level);
2326 if (r == 0)
2327 *len = distance_multiply(tlen, qn->lower);
2328 }
2329 else
2330 r = GET_CHAR_LEN_VARLEN;
2331 }
2332 break;
2333
2334 #ifdef USE_SUBEXP_CALL
2335 case NT_CALL:
2336 if (! IS_CALL_RECURSION(NCALL(node)))
2337 r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
2338 else
2339 r = GET_CHAR_LEN_VARLEN;
2340 break;
2341 #endif
2342
2343 case NT_CTYPE:
2344 *len = 1;
2345 break;
2346
2347 case NT_CCLASS:
2348 case NT_CANY:
2349 *len = 1;
2350 break;
2351
2352 case NT_ENCLOSE:
2353 {
2354 EncloseNode* en = NENCLOSE(node);
2355 switch (en->type) {
2356 case ENCLOSE_MEMORY:
2357 #ifdef USE_SUBEXP_CALL
2358 if (IS_ENCLOSE_CLEN_FIXED(en))
2359 *len = en->char_len;
2360 else {
2361 r = get_char_length_tree1(en->target, reg, len, level);
2362 if (r == 0) {
2363 en->char_len = *len;
2364 SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
2365 }
2366 }
2367 break;
2368 #endif
2369 case ENCLOSE_OPTION:
2370 case ENCLOSE_STOP_BACKTRACK:
2371 r = get_char_length_tree1(en->target, reg, len, level);
2372 break;
2373 default:
2374 break;
2375 }
2376 }
2377 break;
2378
2379 case NT_ANCHOR:
2380 break;
2381
2382 default:
2383 r = GET_CHAR_LEN_VARLEN;
2384 break;
2385 }
2386
2387 return r;
2388 }
2389
2390 static int
get_char_length_tree(Node * node,regex_t * reg,int * len)2391 get_char_length_tree(Node* node, regex_t* reg, int* len)
2392 {
2393 return get_char_length_tree1(node, reg, len, 0);
2394 }
2395
2396 /* x is not included y ==> 1 : 0 */
2397 static int
is_not_included(Node * x,Node * y,regex_t * reg)2398 is_not_included(Node* x, Node* y, regex_t* reg)
2399 {
2400 int i, len;
2401 OnigCodePoint code;
2402 UChar *p;
2403 int ytype;
2404
2405 retry:
2406 ytype = NTYPE(y);
2407 switch (NTYPE(x)) {
2408 case NT_CTYPE:
2409 {
2410 switch (ytype) {
2411 case NT_CTYPE:
2412 if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
2413 NCTYPE(y)->not != NCTYPE(x)->not)
2414 return 1;
2415 else
2416 return 0;
2417 break;
2418
2419 case NT_CCLASS:
2420 swap:
2421 {
2422 Node* tmp;
2423 tmp = x; x = y; y = tmp;
2424 goto retry;
2425 }
2426 break;
2427
2428 case NT_STR:
2429 goto swap;
2430 break;
2431
2432 default:
2433 break;
2434 }
2435 }
2436 break;
2437
2438 case NT_CCLASS:
2439 {
2440 CClassNode* xc = NCCLASS(x);
2441 switch (ytype) {
2442 case NT_CTYPE:
2443 switch (NCTYPE(y)->ctype) {
2444 case ONIGENC_CTYPE_WORD:
2445 if (NCTYPE(y)->not == 0) {
2446 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2447 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2448 if (BITSET_AT(xc->bs, i)) {
2449 if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
2450 }
2451 }
2452 return 1;
2453 }
2454 return 0;
2455 }
2456 else {
2457 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2458 if (! IS_CODE_SB_WORD(reg->enc, i)) {
2459 if (!IS_NCCLASS_NOT(xc)) {
2460 if (BITSET_AT(xc->bs, i))
2461 return 0;
2462 }
2463 else {
2464 if (! BITSET_AT(xc->bs, i))
2465 return 0;
2466 }
2467 }
2468 }
2469 return 1;
2470 }
2471 break;
2472
2473 default:
2474 break;
2475 }
2476 break;
2477
2478 case NT_CCLASS:
2479 {
2480 int v;
2481 CClassNode* yc = NCCLASS(y);
2482
2483 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2484 v = BITSET_AT(xc->bs, i);
2485 if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
2486 (v == 0 && IS_NCCLASS_NOT(xc))) {
2487 v = BITSET_AT(yc->bs, i);
2488 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2489 (v == 0 && IS_NCCLASS_NOT(yc)))
2490 return 0;
2491 }
2492 }
2493 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2494 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2495 return 1;
2496 return 0;
2497 }
2498 break;
2499
2500 case NT_STR:
2501 goto swap;
2502 break;
2503
2504 default:
2505 break;
2506 }
2507 }
2508 break;
2509
2510 case NT_STR:
2511 {
2512 StrNode* xs = NSTR(x);
2513 if (NSTRING_LEN(x) == 0)
2514 break;
2515
2516 //c = *(xs->s);
2517 switch (ytype) {
2518 case NT_CTYPE:
2519 switch (NCTYPE(y)->ctype) {
2520 case ONIGENC_CTYPE_WORD:
2521 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2522 return NCTYPE(y)->not;
2523 else
2524 return !(NCTYPE(y)->not);
2525 break;
2526 default:
2527 break;
2528 }
2529 break;
2530
2531 case NT_CCLASS:
2532 {
2533 CClassNode* cc = NCCLASS(y);
2534
2535 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2536 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2537 return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2538 }
2539 break;
2540
2541 case NT_STR:
2542 {
2543 UChar *q;
2544 StrNode* ys = NSTR(y);
2545 len = NSTRING_LEN(x);
2546 if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2547 if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2548 /* tiny version */
2549 return 0;
2550 }
2551 else {
2552 for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2553 if (*p != *q) return 1;
2554 }
2555 }
2556 }
2557 break;
2558
2559 default:
2560 break;
2561 }
2562 }
2563 break;
2564
2565 default:
2566 break;
2567 }
2568
2569 return 0;
2570 }
2571
2572 static Node*
get_head_value_node(Node * node,int exact,regex_t * reg)2573 get_head_value_node(Node* node, int exact, regex_t* reg)
2574 {
2575 Node* n = NULL_NODE;
2576
2577 switch (NTYPE(node)) {
2578 case NT_BREF:
2579 case NT_ALT:
2580 case NT_CANY:
2581 #ifdef USE_SUBEXP_CALL
2582 case NT_CALL:
2583 #endif
2584 break;
2585
2586 case NT_CTYPE:
2587 case NT_CCLASS:
2588 if (exact == 0) {
2589 n = node;
2590 }
2591 break;
2592
2593 case NT_LIST:
2594 n = get_head_value_node(NCAR(node), exact, reg);
2595 break;
2596
2597 case NT_STR:
2598 {
2599 StrNode* sn = NSTR(node);
2600
2601 if (sn->end <= sn->s)
2602 break;
2603
2604 if (exact != 0 &&
2605 !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
2606 }
2607 else {
2608 n = node;
2609 }
2610 }
2611 break;
2612
2613 case NT_QTFR:
2614 {
2615 QtfrNode* qn = NQTFR(node);
2616 if (qn->lower > 0) {
2617 if (IS_NOT_NULL(qn->head_exact))
2618 n = qn->head_exact;
2619 else
2620 n = get_head_value_node(qn->target, exact, reg);
2621 }
2622 }
2623 break;
2624
2625 case NT_ENCLOSE:
2626 {
2627 EncloseNode* en = NENCLOSE(node);
2628 switch (en->type) {
2629 case ENCLOSE_OPTION:
2630 {
2631 OnigOptionType options = reg->options;
2632
2633 reg->options = NENCLOSE(node)->option;
2634 n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
2635 reg->options = options;
2636 }
2637 break;
2638
2639 case ENCLOSE_MEMORY:
2640 case ENCLOSE_STOP_BACKTRACK:
2641 n = get_head_value_node(en->target, exact, reg);
2642 break;
2643 }
2644 }
2645 break;
2646
2647 case NT_ANCHOR:
2648 if (NANCHOR(node)->type == ANCHOR_PREC_READ)
2649 n = get_head_value_node(NANCHOR(node)->target, exact, reg);
2650 break;
2651
2652 default:
2653 break;
2654 }
2655
2656 return n;
2657 }
2658
2659 static int
check_type_tree(Node * node,int type_mask,int enclose_mask,int anchor_mask)2660 check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
2661 {
2662 int type, r = 0;
2663
2664 type = NTYPE(node);
2665 if ((NTYPE2BIT(type) & type_mask) == 0)
2666 return 1;
2667
2668 switch (type) {
2669 case NT_LIST:
2670 case NT_ALT:
2671 do {
2672 r = check_type_tree(NCAR(node), type_mask, enclose_mask,
2673 anchor_mask);
2674 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2675 break;
2676
2677 case NT_QTFR:
2678 r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
2679 anchor_mask);
2680 break;
2681
2682 case NT_ENCLOSE:
2683 {
2684 EncloseNode* en = NENCLOSE(node);
2685 if ((en->type & enclose_mask) == 0)
2686 return 1;
2687
2688 r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
2689 }
2690 break;
2691
2692 case NT_ANCHOR:
2693 type = NANCHOR(node)->type;
2694 if ((type & anchor_mask) == 0)
2695 return 1;
2696
2697 if (NANCHOR(node)->target)
2698 r = check_type_tree(NANCHOR(node)->target,
2699 type_mask, enclose_mask, anchor_mask);
2700 break;
2701
2702 default:
2703 break;
2704 }
2705 return r;
2706 }
2707
2708 #ifdef USE_SUBEXP_CALL
2709
2710 #define RECURSION_EXIST 1
2711 #define RECURSION_INFINITE 2
2712
2713 static int
subexp_inf_recursive_check(Node * node,ScanEnv * env,int head)2714 subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2715 {
2716 int type;
2717 int r = 0;
2718
2719 type = NTYPE(node);
2720 switch (type) {
2721 case NT_LIST:
2722 {
2723 Node *x;
2724 OnigDistance min;
2725 int ret;
2726
2727 x = node;
2728 do {
2729 ret = subexp_inf_recursive_check(NCAR(x), env, head);
2730 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2731 r |= ret;
2732 if (head) {
2733 ret = get_min_match_length(NCAR(x), &min, env);
2734 if (ret != 0) return ret;
2735 if (min != 0) head = 0;
2736 }
2737 } while (IS_NOT_NULL(x = NCDR(x)));
2738 }
2739 break;
2740
2741 case NT_ALT:
2742 {
2743 int ret;
2744 r = RECURSION_EXIST;
2745 do {
2746 ret = subexp_inf_recursive_check(NCAR(node), env, head);
2747 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2748 r &= ret;
2749 } while (IS_NOT_NULL(node = NCDR(node)));
2750 }
2751 break;
2752
2753 case NT_QTFR:
2754 r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
2755 if (r == RECURSION_EXIST) {
2756 if (NQTFR(node)->lower == 0) r = 0;
2757 }
2758 break;
2759
2760 case NT_ANCHOR:
2761 {
2762 AnchorNode* an = NANCHOR(node);
2763 switch (an->type) {
2764 case ANCHOR_PREC_READ:
2765 case ANCHOR_PREC_READ_NOT:
2766 case ANCHOR_LOOK_BEHIND:
2767 case ANCHOR_LOOK_BEHIND_NOT:
2768 r = subexp_inf_recursive_check(an->target, env, head);
2769 break;
2770 }
2771 }
2772 break;
2773
2774 case NT_CALL:
2775 r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
2776 break;
2777
2778 case NT_ENCLOSE:
2779 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2780 return 0;
2781 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2782 return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2783 else {
2784 SET_ENCLOSE_STATUS(node, NST_MARK2);
2785 r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
2786 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2787 }
2788 break;
2789
2790 default:
2791 break;
2792 }
2793
2794 return r;
2795 }
2796
2797 static int
subexp_inf_recursive_check_trav(Node * node,ScanEnv * env)2798 subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
2799 {
2800 int type;
2801 int r = 0;
2802
2803 type = NTYPE(node);
2804 switch (type) {
2805 case NT_LIST:
2806 case NT_ALT:
2807 do {
2808 r = subexp_inf_recursive_check_trav(NCAR(node), env);
2809 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2810 break;
2811
2812 case NT_QTFR:
2813 r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
2814 break;
2815
2816 case NT_ANCHOR:
2817 {
2818 AnchorNode* an = NANCHOR(node);
2819 switch (an->type) {
2820 case ANCHOR_PREC_READ:
2821 case ANCHOR_PREC_READ_NOT:
2822 case ANCHOR_LOOK_BEHIND:
2823 case ANCHOR_LOOK_BEHIND_NOT:
2824 r = subexp_inf_recursive_check_trav(an->target, env);
2825 break;
2826 }
2827 }
2828 break;
2829
2830 case NT_ENCLOSE:
2831 {
2832 EncloseNode* en = NENCLOSE(node);
2833
2834 if (IS_ENCLOSE_RECURSION(en)) {
2835 SET_ENCLOSE_STATUS(node, NST_MARK1);
2836 r = subexp_inf_recursive_check(en->target, env, 1);
2837 if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
2838 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2839 }
2840 r = subexp_inf_recursive_check_trav(en->target, env);
2841 }
2842
2843 break;
2844
2845 default:
2846 break;
2847 }
2848
2849 return r;
2850 }
2851
2852 static int
subexp_recursive_check(Node * node)2853 subexp_recursive_check(Node* node)
2854 {
2855 int r = 0;
2856
2857 switch (NTYPE(node)) {
2858 case NT_LIST:
2859 case NT_ALT:
2860 do {
2861 r |= subexp_recursive_check(NCAR(node));
2862 } while (IS_NOT_NULL(node = NCDR(node)));
2863 break;
2864
2865 case NT_QTFR:
2866 r = subexp_recursive_check(NQTFR(node)->target);
2867 break;
2868
2869 case NT_ANCHOR:
2870 {
2871 AnchorNode* an = NANCHOR(node);
2872 switch (an->type) {
2873 case ANCHOR_PREC_READ:
2874 case ANCHOR_PREC_READ_NOT:
2875 case ANCHOR_LOOK_BEHIND:
2876 case ANCHOR_LOOK_BEHIND_NOT:
2877 r = subexp_recursive_check(an->target);
2878 break;
2879 }
2880 }
2881 break;
2882
2883 case NT_CALL:
2884 r = subexp_recursive_check(NCALL(node)->target);
2885 if (r != 0) SET_CALL_RECURSION(node);
2886 break;
2887
2888 case NT_ENCLOSE:
2889 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2890 return 0;
2891 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2892 return 1; /* recursion */
2893 else {
2894 SET_ENCLOSE_STATUS(node, NST_MARK2);
2895 r = subexp_recursive_check(NENCLOSE(node)->target);
2896 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2897 }
2898 break;
2899
2900 default:
2901 break;
2902 }
2903
2904 return r;
2905 }
2906
2907
2908 static int
subexp_recursive_check_trav(Node * node,ScanEnv * env)2909 subexp_recursive_check_trav(Node* node, ScanEnv* env)
2910 {
2911 #define FOUND_CALLED_NODE 1
2912
2913 int type;
2914 int r = 0;
2915
2916 type = NTYPE(node);
2917 switch (type) {
2918 case NT_LIST:
2919 case NT_ALT:
2920 {
2921 int ret;
2922 do {
2923 ret = subexp_recursive_check_trav(NCAR(node), env);
2924 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
2925 else if (ret < 0) return ret;
2926 } while (IS_NOT_NULL(node = NCDR(node)));
2927 }
2928 break;
2929
2930 case NT_QTFR:
2931 r = subexp_recursive_check_trav(NQTFR(node)->target, env);
2932 if (NQTFR(node)->upper == 0) {
2933 if (r == FOUND_CALLED_NODE)
2934 NQTFR(node)->is_refered = 1;
2935 }
2936 break;
2937
2938 case NT_ANCHOR:
2939 {
2940 AnchorNode* an = NANCHOR(node);
2941 switch (an->type) {
2942 case ANCHOR_PREC_READ:
2943 case ANCHOR_PREC_READ_NOT:
2944 case ANCHOR_LOOK_BEHIND:
2945 case ANCHOR_LOOK_BEHIND_NOT:
2946 r = subexp_recursive_check_trav(an->target, env);
2947 break;
2948 }
2949 }
2950 break;
2951
2952 case NT_ENCLOSE:
2953 {
2954 EncloseNode* en = NENCLOSE(node);
2955
2956 if (! IS_ENCLOSE_RECURSION(en)) {
2957 if (IS_ENCLOSE_CALLED(en)) {
2958 SET_ENCLOSE_STATUS(node, NST_MARK1);
2959 r = subexp_recursive_check(en->target);
2960 if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
2961 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2962 }
2963 }
2964 r = subexp_recursive_check_trav(en->target, env);
2965 if (IS_ENCLOSE_CALLED(en))
2966 r |= FOUND_CALLED_NODE;
2967 }
2968 break;
2969
2970 default:
2971 break;
2972 }
2973
2974 return r;
2975 }
2976
2977 static int
setup_subexp_call(Node * node,ScanEnv * env)2978 setup_subexp_call(Node* node, ScanEnv* env)
2979 {
2980 int type;
2981 int r = 0;
2982
2983 type = NTYPE(node);
2984 switch (type) {
2985 case NT_LIST:
2986 do {
2987 r = setup_subexp_call(NCAR(node), env);
2988 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2989 break;
2990
2991 case NT_ALT:
2992 do {
2993 r = setup_subexp_call(NCAR(node), env);
2994 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2995 break;
2996
2997 case NT_QTFR:
2998 r = setup_subexp_call(NQTFR(node)->target, env);
2999 break;
3000 case NT_ENCLOSE:
3001 r = setup_subexp_call(NENCLOSE(node)->target, env);
3002 break;
3003
3004 case NT_CALL:
3005 {
3006 CallNode* cn = NCALL(node);
3007 Node** nodes = SCANENV_MEM_NODES(env);
3008
3009 if (cn->group_num != 0) {
3010 int gnum = cn->group_num;
3011
3012 #ifdef USE_NAMED_GROUP
3013 if (env->num_named > 0 &&
3014 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3015 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3016 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3017 }
3018 #endif
3019 if (gnum > env->num_mem) {
3020 onig_scan_env_set_error_string(env,
3021 ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3022 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3023 }
3024
3025 #ifdef USE_NAMED_GROUP
3026 set_call_attr:
3027 #endif
3028 cn->target = nodes[cn->group_num];
3029 if (IS_NULL(cn->target)) {
3030 onig_scan_env_set_error_string(env,
3031 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3032 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3033 }
3034 SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
3035 BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
3036 cn->unset_addr_list = env->unset_addr_list;
3037 }
3038 #ifdef USE_NAMED_GROUP
3039 else {
3040 int *refs;
3041
3042 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
3043 &refs);
3044 if (n <= 0) {
3045 onig_scan_env_set_error_string(env,
3046 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3047 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3048 }
3049 else if (n > 1) {
3050 onig_scan_env_set_error_string(env,
3051 ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3052 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3053 }
3054 else {
3055 cn->group_num = refs[0];
3056 goto set_call_attr;
3057 }
3058 }
3059 #endif
3060 }
3061 break;
3062
3063 case NT_ANCHOR:
3064 {
3065 AnchorNode* an = NANCHOR(node);
3066
3067 switch (an->type) {
3068 case ANCHOR_PREC_READ:
3069 case ANCHOR_PREC_READ_NOT:
3070 case ANCHOR_LOOK_BEHIND:
3071 case ANCHOR_LOOK_BEHIND_NOT:
3072 r = setup_subexp_call(an->target, env);
3073 break;
3074 }
3075 }
3076 break;
3077
3078 default:
3079 break;
3080 }
3081
3082 return r;
3083 }
3084 #endif
3085
3086 /* divide different length alternatives in look-behind.
3087 (?<=A|B) ==> (?<=A)|(?<=B)
3088 (?<!A|B) ==> (?<!A)(?<!B)
3089 */
3090 static int
divide_look_behind_alternatives(Node * node)3091 divide_look_behind_alternatives(Node* node)
3092 {
3093 Node *head, *np, *insert_node;
3094 AnchorNode* an = NANCHOR(node);
3095 int anc_type = an->type;
3096
3097 head = an->target;
3098 np = NCAR(head);
3099 swap_node(node, head);
3100 NCAR(node) = head;
3101 NANCHOR(head)->target = np;
3102
3103 np = node;
3104 while ((np = NCDR(np)) != NULL_NODE) {
3105 insert_node = onig_node_new_anchor(anc_type);
3106 CHECK_NULL_RETURN_MEMERR(insert_node);
3107 NANCHOR(insert_node)->target = NCAR(np);
3108 NCAR(np) = insert_node;
3109 }
3110
3111 if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3112 np = node;
3113 do {
3114 SET_NTYPE(np, NT_LIST); /* alt -> list */
3115 } while ((np = NCDR(np)) != NULL_NODE);
3116 }
3117 return 0;
3118 }
3119
3120 static int
setup_look_behind(Node * node,regex_t * reg,ScanEnv * env)3121 setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3122 {
3123 int r, len;
3124 AnchorNode* an = NANCHOR(node);
3125
3126 r = get_char_length_tree(an->target, reg, &len);
3127 if (r == 0)
3128 an->char_len = len;
3129 else if (r == GET_CHAR_LEN_VARLEN)
3130 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3131 else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3132 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3133 r = divide_look_behind_alternatives(node);
3134 else
3135 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3136 }
3137
3138 return r;
3139 }
3140
3141 static int
next_setup(Node * node,Node * next_node,regex_t * reg)3142 next_setup(Node* node, Node* next_node, regex_t* reg)
3143 {
3144 int type;
3145
3146 retry:
3147 type = NTYPE(node);
3148 if (type == NT_QTFR) {
3149 QtfrNode* qn = NQTFR(node);
3150 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3151 #ifdef USE_QTFR_PEEK_NEXT
3152 Node* n = get_head_value_node(next_node, 1, reg);
3153 /* '\0': for UTF-16BE etc... */
3154 if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
3155 qn->next_head_exact = n;
3156 }
3157 #endif
3158 /* automatic posseivation a*b ==> (?>a*)b */
3159 if (qn->lower <= 1) {
3160 int ttype = NTYPE(qn->target);
3161 if (IS_NODE_TYPE_SIMPLE(ttype)) {
3162 Node *x, *y;
3163 x = get_head_value_node(qn->target, 0, reg);
3164 if (IS_NOT_NULL(x)) {
3165 y = get_head_value_node(next_node, 0, reg);
3166 if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3167 Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
3168 CHECK_NULL_RETURN_MEMERR(en);
3169 SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3170 swap_node(node, en);
3171 NENCLOSE(node)->target = en;
3172 }
3173 }
3174 }
3175 }
3176 }
3177 }
3178 else if (type == NT_ENCLOSE) {
3179 EncloseNode* en = NENCLOSE(node);
3180 if (en->type == ENCLOSE_MEMORY) {
3181 node = en->target;
3182 goto retry;
3183 }
3184 }
3185 return 0;
3186 }
3187
3188
3189 static int
update_string_node_case_fold(regex_t * reg,Node * node)3190 update_string_node_case_fold(regex_t* reg, Node *node)
3191 {
3192 UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3193 UChar *sbuf, *ebuf, *sp;
3194 int r, i, len, sbuf_size;
3195 StrNode* sn = NSTR(node);
3196
3197 end = sn->end;
3198 sbuf_size = (end - sn->s) * 2;
3199 sbuf = (UChar* )xmalloc(sbuf_size);
3200 CHECK_NULL_RETURN_MEMERR(sbuf);
3201 ebuf = sbuf + sbuf_size;
3202
3203 sp = sbuf;
3204 p = sn->s;
3205 while (p < end) {
3206 len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3207 for (i = 0; i < len; i++) {
3208 if (sp >= ebuf) {
3209 sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2);
3210 CHECK_NULL_RETURN_MEMERR(sbuf);
3211 sp = sbuf + sbuf_size;
3212 sbuf_size *= 2;
3213 ebuf = sbuf + sbuf_size;
3214 }
3215
3216 *sp++ = buf[i];
3217 }
3218 }
3219
3220 r = onig_node_str_set(node, sbuf, sp);
3221 if (r != 0) {
3222 xfree(sbuf);
3223 return r;
3224 }
3225
3226 xfree(sbuf);
3227 return 0;
3228 }
3229
3230 static int
expand_case_fold_make_rem_string(Node ** rnode,UChar * s,UChar * end,regex_t * reg)3231 expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
3232 regex_t* reg)
3233 {
3234 int r;
3235 Node *node;
3236
3237 node = onig_node_new_str(s, end);
3238 if (IS_NULL(node)) return ONIGERR_MEMORY;
3239
3240 r = update_string_node_case_fold(reg, node);
3241 if (r != 0) {
3242 onig_node_free(node);
3243 return r;
3244 }
3245
3246 NSTRING_SET_AMBIG(node);
3247 NSTRING_SET_DONT_GET_OPT_INFO(node);
3248 *rnode = node;
3249 return 0;
3250 }
3251
3252 static int
expand_case_fold_string_alt(int item_num,OnigCaseFoldCodeItem items[],UChar * p,int slen,UChar * end,regex_t * reg,Node ** rnode)3253 expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
3254 UChar *p, int slen, UChar *end,
3255 regex_t* reg, Node **rnode)
3256 {
3257 int r, i, j, len, varlen;
3258 Node *anode, *var_anode, *snode, *xnode, *an;
3259 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3260
3261 *rnode = var_anode = NULL_NODE;
3262
3263 varlen = 0;
3264 for (i = 0; i < item_num; i++) {
3265 if (items[i].byte_len != slen) {
3266 varlen = 1;
3267 break;
3268 }
3269 }
3270
3271 if (varlen != 0) {
3272 *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3273 if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3274
3275 xnode = onig_node_new_list(NULL, NULL);
3276 if (IS_NULL(xnode)) goto mem_err;
3277 NCAR(var_anode) = xnode;
3278
3279 anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3280 if (IS_NULL(anode)) goto mem_err;
3281 NCAR(xnode) = anode;
3282 }
3283 else {
3284 *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3285 if (IS_NULL(anode)) return ONIGERR_MEMORY;
3286 }
3287
3288 snode = onig_node_new_str(p, p + slen);
3289 if (IS_NULL(snode)) goto mem_err;
3290
3291 NCAR(anode) = snode;
3292
3293 for (i = 0; i < item_num; i++) {
3294 snode = onig_node_new_str(NULL, NULL);
3295 if (IS_NULL(snode)) goto mem_err;
3296
3297 for (j = 0; j < items[i].code_len; j++) {
3298 len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3299 if (len < 0) {
3300 r = len;
3301 goto mem_err2;
3302 }
3303
3304 r = onig_node_str_cat(snode, buf, buf + len);
3305 if (r != 0) goto mem_err2;
3306 }
3307
3308 an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3309 if (IS_NULL(an)) {
3310 goto mem_err2;
3311 }
3312
3313 if (items[i].byte_len != slen) {
3314 Node *rem;
3315 UChar *q = p + items[i].byte_len;
3316
3317 if (q < end) {
3318 r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3319 if (r != 0) {
3320 onig_node_free(an);
3321 goto mem_err2;
3322 }
3323
3324 xnode = onig_node_list_add(NULL_NODE, snode);
3325 if (IS_NULL(xnode)) {
3326 onig_node_free(an);
3327 onig_node_free(rem);
3328 goto mem_err2;
3329 }
3330 if (IS_NULL(onig_node_list_add(xnode, rem))) {
3331 onig_node_free(an);
3332 onig_node_free(xnode);
3333 onig_node_free(rem);
3334 goto mem_err;
3335 }
3336
3337 NCAR(an) = xnode;
3338 }
3339 else {
3340 NCAR(an) = snode;
3341 }
3342
3343 NCDR(var_anode) = an;
3344 var_anode = an;
3345 }
3346 else {
3347 NCAR(an) = snode;
3348 NCDR(anode) = an;
3349 anode = an;
3350 }
3351 }
3352
3353 return varlen;
3354
3355 mem_err2:
3356 onig_node_free(snode);
3357
3358 mem_err:
3359 onig_node_free(*rnode);
3360
3361 return ONIGERR_MEMORY;
3362 }
3363
3364 static int
expand_case_fold_string(Node * node,regex_t * reg)3365 expand_case_fold_string(Node* node, regex_t* reg)
3366 {
3367 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
3368
3369 int r, n, len, alt_num;
3370 UChar *start, *end, *p;
3371 Node *top_root, *root, *snode, *prev_node;
3372 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3373 StrNode* sn = NSTR(node);
3374
3375 if (NSTRING_IS_AMBIG(node)) return 0;
3376
3377 start = sn->s;
3378 end = sn->end;
3379 if (start >= end) return 0;
3380
3381 r = 0;
3382 top_root = root = prev_node = snode = NULL_NODE;
3383 alt_num = 1;
3384 p = start;
3385 while (p < end) {
3386 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3387 p, end, items);
3388 if (n < 0) {
3389 r = n;
3390 goto err;
3391 }
3392
3393 SAFE_ENC_LEN(reg->enc, p, end, len);
3394
3395 if (n == 0) {
3396 if (IS_NULL(snode)) {
3397 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3398 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3399 if (IS_NULL(root)) {
3400 onig_node_free(prev_node);
3401 goto mem_err;
3402 }
3403 }
3404
3405 prev_node = snode = onig_node_new_str(NULL, NULL);
3406 if (IS_NULL(snode)) goto mem_err;
3407 if (IS_NOT_NULL(root)) {
3408 if (IS_NULL(onig_node_list_add(root, snode))) {
3409 onig_node_free(snode);
3410 goto mem_err;
3411 }
3412 }
3413 }
3414
3415 r = onig_node_str_cat(snode, p, p + len);
3416 if (r != 0) goto err;
3417 }
3418 else {
3419 alt_num *= (n + 1);
3420 if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3421
3422 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3423 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3424 if (IS_NULL(root)) {
3425 onig_node_free(prev_node);
3426 goto mem_err;
3427 }
3428 }
3429
3430 r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3431 if (r < 0) goto mem_err;
3432 if (r == 1) {
3433 if (IS_NULL(root)) {
3434 top_root = prev_node;
3435 }
3436 else {
3437 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3438 onig_node_free(prev_node);
3439 goto mem_err;
3440 }
3441 }
3442
3443 root = NCAR(prev_node);
3444 }
3445 else { /* r == 0 */
3446 if (IS_NOT_NULL(root)) {
3447 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3448 onig_node_free(prev_node);
3449 goto mem_err;
3450 }
3451 }
3452 }
3453
3454 snode = NULL_NODE;
3455 }
3456
3457 p += len;
3458 }
3459
3460 if (p < end) {
3461 Node *srem;
3462
3463 r = expand_case_fold_make_rem_string(&srem, p, end, reg);
3464 if (r != 0) goto mem_err;
3465
3466 if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
3467 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3468 if (IS_NULL(root)) {
3469 onig_node_free(srem);
3470 onig_node_free(prev_node);
3471 goto mem_err;
3472 }
3473 }
3474
3475 if (IS_NULL(root)) {
3476 prev_node = srem;
3477 }
3478 else {
3479 if (IS_NULL(onig_node_list_add(root, srem))) {
3480 onig_node_free(srem);
3481 goto mem_err;
3482 }
3483 }
3484 }
3485
3486 /* ending */
3487 top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
3488 swap_node(node, top_root);
3489 onig_node_free(top_root);
3490 return 0;
3491
3492 mem_err:
3493 r = ONIGERR_MEMORY;
3494
3495 err:
3496 onig_node_free(top_root);
3497 return r;
3498 }
3499
3500
3501 #ifdef USE_COMBINATION_EXPLOSION_CHECK
3502
3503 #define CEC_THRES_NUM_BIG_REPEAT 512
3504 #define CEC_INFINITE_NUM 0x7fffffff
3505
3506 #define CEC_IN_INFINITE_REPEAT (1<<0)
3507 #define CEC_IN_FINITE_REPEAT (1<<1)
3508 #define CEC_CONT_BIG_REPEAT (1<<2)
3509
3510 static int
setup_comb_exp_check(Node * node,int state,ScanEnv * env)3511 setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3512 {
3513 int type;
3514 int r = state;
3515
3516 type = NTYPE(node);
3517 switch (type) {
3518 case NT_LIST:
3519 {
3520 Node* prev = NULL_NODE;
3521 do {
3522 r = setup_comb_exp_check(NCAR(node), r, env);
3523 prev = NCAR(node);
3524 } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
3525 }
3526 break;
3527
3528 case NT_ALT:
3529 {
3530 int ret;
3531 do {
3532 ret = setup_comb_exp_check(NCAR(node), state, env);
3533 r |= ret;
3534 } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
3535 }
3536 break;
3537
3538 case NT_QTFR:
3539 {
3540 int child_state = state;
3541 int add_state = 0;
3542 QtfrNode* qn = NQTFR(node);
3543 Node* target = qn->target;
3544 int var_num;
3545
3546 if (! IS_REPEAT_INFINITE(qn->upper)) {
3547 if (qn->upper > 1) {
3548 /* {0,1}, {1,1} are allowed */
3549 child_state |= CEC_IN_FINITE_REPEAT;
3550
3551 /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3552 if (env->backrefed_mem == 0) {
3553 if (NTYPE(qn->target) == NT_ENCLOSE) {
3554 EncloseNode* en = NENCLOSE(qn->target);
3555 if (en->type == ENCLOSE_MEMORY) {
3556 if (NTYPE(en->target) == NT_QTFR) {
3557 QtfrNode* q = NQTFR(en->target);
3558 if (IS_REPEAT_INFINITE(q->upper)
3559 && q->greedy == qn->greedy) {
3560 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3561 if (qn->upper == 1)
3562 child_state = state;
3563 }
3564 }
3565 }
3566 }
3567 }
3568 }
3569 }
3570
3571 if (state & CEC_IN_FINITE_REPEAT) {
3572 qn->comb_exp_check_num = -1;
3573 }
3574 else {
3575 if (IS_REPEAT_INFINITE(qn->upper)) {
3576 var_num = CEC_INFINITE_NUM;
3577 child_state |= CEC_IN_INFINITE_REPEAT;
3578 }
3579 else {
3580 var_num = qn->upper - qn->lower;
3581 }
3582
3583 if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3584 add_state |= CEC_CONT_BIG_REPEAT;
3585
3586 if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3587 ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3588 var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3589 if (qn->comb_exp_check_num == 0) {
3590 env->num_comb_exp_check++;
3591 qn->comb_exp_check_num = env->num_comb_exp_check;
3592 if (env->curr_max_regnum > env->comb_exp_max_regnum)
3593 env->comb_exp_max_regnum = env->curr_max_regnum;
3594 }
3595 }
3596 }
3597
3598 r = setup_comb_exp_check(target, child_state, env);
3599 r |= add_state;
3600 }
3601 break;
3602
3603 case NT_ENCLOSE:
3604 {
3605 EncloseNode* en = NENCLOSE(node);
3606
3607 switch (en->type) {
3608 case ENCLOSE_MEMORY:
3609 {
3610 if (env->curr_max_regnum < en->regnum)
3611 env->curr_max_regnum = en->regnum;
3612
3613 r = setup_comb_exp_check(en->target, state, env);
3614 }
3615 break;
3616
3617 default:
3618 r = setup_comb_exp_check(en->target, state, env);
3619 break;
3620 }
3621 }
3622 break;
3623
3624 #ifdef USE_SUBEXP_CALL
3625 case NT_CALL:
3626 if (IS_CALL_RECURSION(NCALL(node)))
3627 env->has_recursion = 1;
3628 else
3629 r = setup_comb_exp_check(NCALL(node)->target, state, env);
3630 break;
3631 #endif
3632
3633 default:
3634 break;
3635 }
3636
3637 return r;
3638 }
3639 #endif
3640
3641 #define IN_ALT (1<<0)
3642 #define IN_NOT (1<<1)
3643 #define IN_REPEAT (1<<2)
3644 #define IN_VAR_REPEAT (1<<3)
3645
3646 /* setup_tree does the following work.
3647 1. check empty loop. (set qn->target_empty_info)
3648 2. expand ignore-case in char class.
3649 3. set memory status bit flags. (reg->mem_stats)
3650 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3651 5. find invalid patterns in look-behind.
3652 6. expand repeated string.
3653 */
3654 static int
setup_tree(Node * node,regex_t * reg,int state,ScanEnv * env)3655 setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3656 {
3657 int type;
3658 int r = 0;
3659
3660 type = NTYPE(node);
3661 switch (type) {
3662 case NT_LIST:
3663 {
3664 Node* prev = NULL_NODE;
3665 do {
3666 r = setup_tree(NCAR(node), reg, state, env);
3667 if (IS_NOT_NULL(prev) && r == 0) {
3668 r = next_setup(prev, NCAR(node), reg);
3669 }
3670 prev = NCAR(node);
3671 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3672 }
3673 break;
3674
3675 case NT_ALT:
3676 do {
3677 r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
3678 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3679 break;
3680
3681 case NT_CCLASS:
3682 break;
3683
3684 case NT_STR:
3685 if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3686 r = expand_case_fold_string(node, reg);
3687 }
3688 break;
3689
3690 case NT_CTYPE:
3691 case NT_CANY:
3692 break;
3693
3694 #ifdef USE_SUBEXP_CALL
3695 case NT_CALL:
3696 break;
3697 #endif
3698
3699 case NT_BREF:
3700 {
3701 int i;
3702 int* p;
3703 Node** nodes = SCANENV_MEM_NODES(env);
3704 BRefNode* br = NBREF(node);
3705 p = BACKREFS_P(br);
3706 for (i = 0; i < br->back_num; i++) {
3707 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
3708 BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3709 BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3710 #ifdef USE_BACKREF_WITH_LEVEL
3711 if (IS_BACKREF_NEST_LEVEL(br)) {
3712 BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3713 }
3714 #endif
3715 SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3716 }
3717 }
3718 break;
3719
3720 case NT_QTFR:
3721 {
3722 OnigDistance d;
3723 QtfrNode* qn = NQTFR(node);
3724 Node* target = qn->target;
3725
3726 if ((state & IN_REPEAT) != 0) {
3727 qn->state |= NST_IN_REPEAT;
3728 }
3729
3730 if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3731 r = get_min_match_length(target, &d, env);
3732 if (r) break;
3733 if (d == 0) {
3734 qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3735 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
3736 r = quantifiers_memory_node_info(target);
3737 if (r < 0) break;
3738 if (r > 0) {
3739 qn->target_empty_info = r;
3740 }
3741 #endif
3742 #if 0
3743 r = get_max_match_length(target, &d, env);
3744 if (r == 0 && d == 0) {
3745 /* ()* ==> ()?, ()+ ==> () */
3746 qn->upper = 1;
3747 if (qn->lower > 1) qn->lower = 1;
3748 if (NTYPE(target) == NT_STR) {
3749 qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
3750 }
3751 }
3752 #endif
3753 }
3754 }
3755
3756 state |= IN_REPEAT;
3757 if (qn->lower != qn->upper)
3758 state |= IN_VAR_REPEAT;
3759 r = setup_tree(target, reg, state, env);
3760 if (r) break;
3761
3762 /* expand string */
3763 #define EXPAND_STRING_MAX_LENGTH 100
3764 if (NTYPE(target) == NT_STR) {
3765 if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper &&
3766 qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3767 int len = NSTRING_LEN(target);
3768 StrNode* sn = NSTR(target);
3769
3770 if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3771 int i, n = qn->lower;
3772 onig_node_conv_to_str_node(node, NSTR(target)->flag);
3773 for (i = 0; i < n; i++) {
3774 r = onig_node_str_cat(node, sn->s, sn->end);
3775 if (r) break;
3776 }
3777 onig_node_free(target);
3778 break; /* break case NT_QTFR: */
3779 }
3780 }
3781 }
3782
3783 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
3784 if (qn->greedy && (qn->target_empty_info != 0)) {
3785 if (NTYPE(target) == NT_QTFR) {
3786 QtfrNode* tqn = NQTFR(target);
3787 if (IS_NOT_NULL(tqn->head_exact)) {
3788 qn->head_exact = tqn->head_exact;
3789 tqn->head_exact = NULL;
3790 }
3791 }
3792 else {
3793 qn->head_exact = get_head_value_node(qn->target, 1, reg);
3794 }
3795 }
3796 #endif
3797 }
3798 break;
3799
3800 case NT_ENCLOSE:
3801 {
3802 EncloseNode* en = NENCLOSE(node);
3803
3804 switch (en->type) {
3805 case ENCLOSE_OPTION:
3806 {
3807 OnigOptionType options = reg->options;
3808 reg->options = NENCLOSE(node)->option;
3809 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
3810 reg->options = options;
3811 }
3812 break;
3813
3814 case ENCLOSE_MEMORY:
3815 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) {
3816 BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
3817 /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
3818 }
3819 r = setup_tree(en->target, reg, state, env);
3820 break;
3821
3822 case ENCLOSE_STOP_BACKTRACK:
3823 {
3824 Node* target = en->target;
3825 r = setup_tree(target, reg, state, env);
3826 if (NTYPE(target) == NT_QTFR) {
3827 QtfrNode* tqn = NQTFR(target);
3828 if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
3829 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
3830 int qtype = NTYPE(tqn->target);
3831 if (IS_NODE_TYPE_SIMPLE(qtype))
3832 SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
3833 }
3834 }
3835 }
3836 break;
3837 }
3838 }
3839 break;
3840
3841 case NT_ANCHOR:
3842 {
3843 AnchorNode* an = NANCHOR(node);
3844
3845 switch (an->type) {
3846 case ANCHOR_PREC_READ:
3847 r = setup_tree(an->target, reg, state, env);
3848 break;
3849 case ANCHOR_PREC_READ_NOT:
3850 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3851 break;
3852
3853 /* allowed node types in look-behind */
3854 #define ALLOWED_TYPE_IN_LB \
3855 ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
3856 BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
3857
3858 #define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY )
3859 #define ALLOWED_ENCLOSE_IN_LB_NOT 0
3860
3861 #define ALLOWED_ANCHOR_IN_LB \
3862 ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3863 #define ALLOWED_ANCHOR_IN_LB_NOT \
3864 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3865
3866 case ANCHOR_LOOK_BEHIND:
3867 {
3868 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3869 ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
3870 if (r < 0) return r;
3871 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3872 r = setup_look_behind(node, reg, env);
3873 if (r != 0) return r;
3874 r = setup_tree(an->target, reg, state, env);
3875 }
3876 break;
3877
3878 case ANCHOR_LOOK_BEHIND_NOT:
3879 {
3880 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3881 ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
3882 if (r < 0) return r;
3883 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3884 r = setup_look_behind(node, reg, env);
3885 if (r != 0) return r;
3886 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3887 }
3888 break;
3889 }
3890 }
3891 break;
3892
3893 default:
3894 break;
3895 }
3896
3897 return r;
3898 }
3899
3900 /* set skip map for Boyer-Moor search */
3901 static int
set_bm_skip(UChar * s,UChar * end,OnigEncoding enc ARG_UNUSED,UChar skip[],int ** int_skip)3902 set_bm_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED,
3903 UChar skip[], int** int_skip)
3904 {
3905 int i, len;
3906
3907 len = end - s;
3908 if (len < ONIG_CHAR_TABLE_SIZE) {
3909 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len;
3910
3911 for (i = 0; i < len - 1; i++)
3912 skip[s[i]] = len - 1 - i;
3913 }
3914 else {
3915 if (IS_NULL(*int_skip)) {
3916 *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
3917 if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
3918 }
3919 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len;
3920
3921 for (i = 0; i < len - 1; i++)
3922 (*int_skip)[s[i]] = len - 1 - i;
3923 }
3924 return 0;
3925 }
3926
3927 #define OPT_EXACT_MAXLEN 24
3928
3929 typedef struct {
3930 OnigDistance min; /* min byte length */
3931 OnigDistance max; /* max byte length */
3932 } MinMaxLen;
3933
3934 typedef struct {
3935 MinMaxLen mmd;
3936 OnigEncoding enc;
3937 OnigOptionType options;
3938 OnigCaseFoldType case_fold_flag;
3939 ScanEnv* scan_env;
3940 } OptEnv;
3941
3942 typedef struct {
3943 int left_anchor;
3944 int right_anchor;
3945 } OptAncInfo;
3946
3947 typedef struct {
3948 MinMaxLen mmd; /* info position */
3949 OptAncInfo anc;
3950
3951 int reach_end;
3952 int ignore_case;
3953 int len;
3954 UChar s[OPT_EXACT_MAXLEN];
3955 } OptExactInfo;
3956
3957 typedef struct {
3958 MinMaxLen mmd; /* info position */
3959 OptAncInfo anc;
3960
3961 int value; /* weighted value */
3962 UChar map[ONIG_CHAR_TABLE_SIZE];
3963 } OptMapInfo;
3964
3965 typedef struct {
3966 MinMaxLen len;
3967
3968 OptAncInfo anc;
3969 OptExactInfo exb; /* boundary */
3970 OptExactInfo exm; /* middle */
3971 OptExactInfo expr; /* prec read (?=...) */
3972
3973 OptMapInfo map; /* boundary */
3974 } NodeOptInfo;
3975
3976
3977 static int
map_position_value(OnigEncoding enc,int i)3978 map_position_value(OnigEncoding enc, int i)
3979 {
3980 static const short int ByteValTable[] = {
3981 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
3982 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3983 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
3984 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
3985 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
3986 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
3987 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
3988 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
3989 };
3990
3991 if (i < (int )(sizeof(ByteValTable)/sizeof(ByteValTable[0]))) {
3992 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
3993 return 20;
3994 else
3995 return (int )ByteValTable[i];
3996 }
3997 else
3998 return 4; /* Take it easy. */
3999 }
4000
4001 static int
distance_value(MinMaxLen * mm)4002 distance_value(MinMaxLen* mm)
4003 {
4004 /* 1000 / (min-max-dist + 1) */
4005 static const short int dist_vals[] = {
4006 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
4007 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
4008 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
4009 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
4010 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
4011 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
4012 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
4013 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
4014 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
4015 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
4016 };
4017
4018 int d;
4019
4020 if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
4021
4022 d = mm->max - mm->min;
4023 if (d < (int )(sizeof(dist_vals)/sizeof(dist_vals[0])))
4024 /* return dist_vals[d] * 16 / (mm->min + 12); */
4025 return (int )dist_vals[d];
4026 else
4027 return 1;
4028 }
4029
4030 static int
comp_distance_value(MinMaxLen * d1,MinMaxLen * d2,int v1,int v2)4031 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
4032 {
4033 if (v2 <= 0) return -1;
4034 if (v1 <= 0) return 1;
4035
4036 v1 *= distance_value(d1);
4037 v2 *= distance_value(d2);
4038
4039 if (v2 > v1) return 1;
4040 if (v2 < v1) return -1;
4041
4042 if (d2->min < d1->min) return 1;
4043 if (d2->min > d1->min) return -1;
4044 return 0;
4045 }
4046
4047 static int
is_equal_mml(MinMaxLen * a,MinMaxLen * b)4048 is_equal_mml(MinMaxLen* a, MinMaxLen* b)
4049 {
4050 return (a->min == b->min && a->max == b->max) ? 1 : 0;
4051 }
4052
4053
4054 static void
set_mml(MinMaxLen * mml,OnigDistance min,OnigDistance max)4055 set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
4056 {
4057 mml->min = min;
4058 mml->max = max;
4059 }
4060
4061 static void
clear_mml(MinMaxLen * mml)4062 clear_mml(MinMaxLen* mml)
4063 {
4064 mml->min = mml->max = 0;
4065 }
4066
4067 static void
copy_mml(MinMaxLen * to,MinMaxLen * from)4068 copy_mml(MinMaxLen* to, MinMaxLen* from)
4069 {
4070 to->min = from->min;
4071 to->max = from->max;
4072 }
4073
4074 static void
add_mml(MinMaxLen * to,MinMaxLen * from)4075 add_mml(MinMaxLen* to, MinMaxLen* from)
4076 {
4077 to->min = distance_add(to->min, from->min);
4078 to->max = distance_add(to->max, from->max);
4079 }
4080
4081 #if 0
4082 static void
4083 add_len_mml(MinMaxLen* to, OnigDistance len)
4084 {
4085 to->min = distance_add(to->min, len);
4086 to->max = distance_add(to->max, len);
4087 }
4088 #endif
4089
4090 static void
alt_merge_mml(MinMaxLen * to,MinMaxLen * from)4091 alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
4092 {
4093 if (to->min > from->min) to->min = from->min;
4094 if (to->max < from->max) to->max = from->max;
4095 }
4096
4097 static void
copy_opt_env(OptEnv * to,OptEnv * from)4098 copy_opt_env(OptEnv* to, OptEnv* from)
4099 {
4100 *to = *from;
4101 }
4102
4103 static void
clear_opt_anc_info(OptAncInfo * anc)4104 clear_opt_anc_info(OptAncInfo* anc)
4105 {
4106 anc->left_anchor = 0;
4107 anc->right_anchor = 0;
4108 }
4109
4110 static void
copy_opt_anc_info(OptAncInfo * to,OptAncInfo * from)4111 copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
4112 {
4113 *to = *from;
4114 }
4115
4116 static void
concat_opt_anc_info(OptAncInfo * to,OptAncInfo * left,OptAncInfo * right,OnigDistance left_len,OnigDistance right_len)4117 concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
4118 OnigDistance left_len, OnigDistance right_len)
4119 {
4120 clear_opt_anc_info(to);
4121
4122 to->left_anchor = left->left_anchor;
4123 if (left_len == 0) {
4124 to->left_anchor |= right->left_anchor;
4125 }
4126
4127 to->right_anchor = right->right_anchor;
4128 if (right_len == 0) {
4129 to->right_anchor |= left->right_anchor;
4130 }
4131 }
4132
4133 static int
is_left_anchor(int anc)4134 is_left_anchor(int anc)
4135 {
4136 if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
4137 anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
4138 anc == ANCHOR_PREC_READ_NOT)
4139 return 0;
4140
4141 return 1;
4142 }
4143
4144 static int
is_set_opt_anc_info(OptAncInfo * to,int anc)4145 is_set_opt_anc_info(OptAncInfo* to, int anc)
4146 {
4147 if ((to->left_anchor & anc) != 0) return 1;
4148
4149 return ((to->right_anchor & anc) != 0 ? 1 : 0);
4150 }
4151
4152 static void
add_opt_anc_info(OptAncInfo * to,int anc)4153 add_opt_anc_info(OptAncInfo* to, int anc)
4154 {
4155 if (is_left_anchor(anc))
4156 to->left_anchor |= anc;
4157 else
4158 to->right_anchor |= anc;
4159 }
4160
4161 static void
remove_opt_anc_info(OptAncInfo * to,int anc)4162 remove_opt_anc_info(OptAncInfo* to, int anc)
4163 {
4164 if (is_left_anchor(anc))
4165 to->left_anchor &= ~anc;
4166 else
4167 to->right_anchor &= ~anc;
4168 }
4169
4170 static void
alt_merge_opt_anc_info(OptAncInfo * to,OptAncInfo * add)4171 alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
4172 {
4173 to->left_anchor &= add->left_anchor;
4174 to->right_anchor &= add->right_anchor;
4175 }
4176
4177 static int
is_full_opt_exact_info(OptExactInfo * ex)4178 is_full_opt_exact_info(OptExactInfo* ex)
4179 {
4180 return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
4181 }
4182
4183 static void
clear_opt_exact_info(OptExactInfo * ex)4184 clear_opt_exact_info(OptExactInfo* ex)
4185 {
4186 clear_mml(&ex->mmd);
4187 clear_opt_anc_info(&ex->anc);
4188 ex->reach_end = 0;
4189 ex->ignore_case = 0;
4190 ex->len = 0;
4191 ex->s[0] = '\0';
4192 }
4193
4194 static void
copy_opt_exact_info(OptExactInfo * to,OptExactInfo * from)4195 copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
4196 {
4197 *to = *from;
4198 }
4199
4200 static void
concat_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OnigEncoding enc)4201 concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
4202 {
4203 int i, j, len;
4204 UChar *p, *end;
4205 OptAncInfo tanc;
4206
4207 if (! to->ignore_case && add->ignore_case) {
4208 if (to->len >= add->len) return ; /* avoid */
4209
4210 to->ignore_case = 1;
4211 }
4212
4213 p = add->s;
4214 end = p + add->len;
4215 for (i = to->len; p < end; ) {
4216 len = enclen(enc, p);
4217 if (i + len > OPT_EXACT_MAXLEN) break;
4218 for (j = 0; j < len && p < end; j++)
4219 to->s[i++] = *p++;
4220 }
4221
4222 to->len = i;
4223 to->reach_end = (p == end ? add->reach_end : 0);
4224
4225 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4226 if (! to->reach_end) tanc.right_anchor = 0;
4227 copy_opt_anc_info(&to->anc, &tanc);
4228 }
4229
4230 static void
concat_opt_exact_info_str(OptExactInfo * to,UChar * s,UChar * end,int raw ARG_UNUSED,OnigEncoding enc)4231 concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
4232 int raw ARG_UNUSED, OnigEncoding enc)
4233 {
4234 int i, j, len;
4235 UChar *p;
4236
4237 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4238 len = enclen(enc, p);
4239 if (i + len > OPT_EXACT_MAXLEN) break;
4240 for (j = 0; j < len && p < end; j++)
4241 to->s[i++] = *p++;
4242 }
4243
4244 to->len = i;
4245 }
4246
4247 static void
alt_merge_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OptEnv * env)4248 alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4249 {
4250 int i, j, len;
4251
4252 if (add->len == 0 || to->len == 0) {
4253 clear_opt_exact_info(to);
4254 return ;
4255 }
4256
4257 if (! is_equal_mml(&to->mmd, &add->mmd)) {
4258 clear_opt_exact_info(to);
4259 return ;
4260 }
4261
4262 for (i = 0; i < to->len && i < add->len; ) {
4263 if (to->s[i] != add->s[i]) break;
4264 len = enclen(env->enc, to->s + i);
4265
4266 for (j = 1; j < len; j++) {
4267 if (to->s[i+j] != add->s[i+j]) break;
4268 }
4269 if (j < len) break;
4270 i += len;
4271 }
4272
4273 if (! add->reach_end || i < add->len || i < to->len) {
4274 to->reach_end = 0;
4275 }
4276 to->len = i;
4277 to->ignore_case |= add->ignore_case;
4278
4279 alt_merge_opt_anc_info(&to->anc, &add->anc);
4280 if (! to->reach_end) to->anc.right_anchor = 0;
4281 }
4282
4283 static void
select_opt_exact_info(OnigEncoding enc,OptExactInfo * now,OptExactInfo * alt)4284 select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4285 {
4286 int v1, v2;
4287
4288 v1 = now->len;
4289 v2 = alt->len;
4290
4291 if (v2 == 0) {
4292 return ;
4293 }
4294 else if (v1 == 0) {
4295 copy_opt_exact_info(now, alt);
4296 return ;
4297 }
4298 else if (v1 <= 2 && v2 <= 2) {
4299 /* ByteValTable[x] is big value --> low price */
4300 v2 = map_position_value(enc, now->s[0]);
4301 v1 = map_position_value(enc, alt->s[0]);
4302
4303 if (now->len > 1) v1 += 5;
4304 if (alt->len > 1) v2 += 5;
4305 }
4306
4307 if (now->ignore_case == 0) v1 *= 2;
4308 if (alt->ignore_case == 0) v2 *= 2;
4309
4310 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4311 copy_opt_exact_info(now, alt);
4312 }
4313
4314 static void
clear_opt_map_info(OptMapInfo * map)4315 clear_opt_map_info(OptMapInfo* map)
4316 {
4317 static const OptMapInfo clean_info = {
4318 {0, 0}, {0, 0}, 0,
4319 {
4320 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4321 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4322 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4323 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4324 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4325 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4326 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4327 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4328 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4329 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4330 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4331 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4332 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4333 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4334 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4335 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4336 }
4337 };
4338
4339 xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4340 }
4341
4342 static void
copy_opt_map_info(OptMapInfo * to,OptMapInfo * from)4343 copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4344 {
4345 *to = *from;
4346 }
4347
4348 static void
add_char_opt_map_info(OptMapInfo * map,UChar c,OnigEncoding enc)4349 add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4350 {
4351 if (map->map[c] == 0) {
4352 map->map[c] = 1;
4353 map->value += map_position_value(enc, c);
4354 }
4355 }
4356
4357 static int
add_char_amb_opt_map_info(OptMapInfo * map,UChar * p,UChar * end,OnigEncoding enc,OnigCaseFoldType case_fold_flag)4358 add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4359 OnigEncoding enc, OnigCaseFoldType case_fold_flag)
4360 {
4361 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4362 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4363 int i, n;
4364
4365 add_char_opt_map_info(map, p[0], enc);
4366
4367 case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
4368 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
4369 if (n < 0) return n;
4370
4371 for (i = 0; i < n; i++) {
4372 ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
4373 add_char_opt_map_info(map, buf[0], enc);
4374 }
4375
4376 return 0;
4377 }
4378
4379 static void
select_opt_map_info(OptMapInfo * now,OptMapInfo * alt)4380 select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4381 {
4382 static int z = 1<<15; /* 32768: something big value */
4383
4384 int v1, v2;
4385
4386 if (alt->value == 0) return ;
4387 if (now->value == 0) {
4388 copy_opt_map_info(now, alt);
4389 return ;
4390 }
4391
4392 v1 = z / now->value;
4393 v2 = z / alt->value;
4394 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4395 copy_opt_map_info(now, alt);
4396 }
4397
4398 static int
comp_opt_exact_or_map_info(OptExactInfo * e,OptMapInfo * m)4399 comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4400 {
4401 #define COMP_EM_BASE 20
4402 int ve, vm;
4403
4404 if (m->value <= 0) return -1;
4405
4406 ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2);
4407 vm = COMP_EM_BASE * 5 * 2 / m->value;
4408 return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4409 }
4410
4411 static void
alt_merge_opt_map_info(OnigEncoding enc,OptMapInfo * to,OptMapInfo * add)4412 alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4413 {
4414 int i, val;
4415
4416 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4417 if (to->value == 0) return ;
4418 if (add->value == 0 || to->mmd.max < add->mmd.min) {
4419 clear_opt_map_info(to);
4420 return ;
4421 }
4422
4423 alt_merge_mml(&to->mmd, &add->mmd);
4424
4425 val = 0;
4426 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4427 if (add->map[i])
4428 to->map[i] = 1;
4429
4430 if (to->map[i])
4431 val += map_position_value(enc, i);
4432 }
4433 to->value = val;
4434
4435 alt_merge_opt_anc_info(&to->anc, &add->anc);
4436 }
4437
4438 static void
set_bound_node_opt_info(NodeOptInfo * opt,MinMaxLen * mmd)4439 set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4440 {
4441 copy_mml(&(opt->exb.mmd), mmd);
4442 copy_mml(&(opt->expr.mmd), mmd);
4443 copy_mml(&(opt->map.mmd), mmd);
4444 }
4445
4446 static void
clear_node_opt_info(NodeOptInfo * opt)4447 clear_node_opt_info(NodeOptInfo* opt)
4448 {
4449 clear_mml(&opt->len);
4450 clear_opt_anc_info(&opt->anc);
4451 clear_opt_exact_info(&opt->exb);
4452 clear_opt_exact_info(&opt->exm);
4453 clear_opt_exact_info(&opt->expr);
4454 clear_opt_map_info(&opt->map);
4455 }
4456
4457 static void
copy_node_opt_info(NodeOptInfo * to,NodeOptInfo * from)4458 copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4459 {
4460 *to = *from;
4461 }
4462
4463 static void
concat_left_node_opt_info(OnigEncoding enc,NodeOptInfo * to,NodeOptInfo * add)4464 concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4465 {
4466 int exb_reach, exm_reach;
4467 OptAncInfo tanc;
4468
4469 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4470 copy_opt_anc_info(&to->anc, &tanc);
4471
4472 if (add->exb.len > 0 && to->len.max == 0) {
4473 concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4474 to->len.max, add->len.max);
4475 copy_opt_anc_info(&add->exb.anc, &tanc);
4476 }
4477
4478 if (add->map.value > 0 && to->len.max == 0) {
4479 if (add->map.mmd.max == 0)
4480 add->map.anc.left_anchor |= to->anc.left_anchor;
4481 }
4482
4483 exb_reach = to->exb.reach_end;
4484 exm_reach = to->exm.reach_end;
4485
4486 if (add->len.max != 0)
4487 to->exb.reach_end = to->exm.reach_end = 0;
4488
4489 if (add->exb.len > 0) {
4490 if (exb_reach) {
4491 concat_opt_exact_info(&to->exb, &add->exb, enc);
4492 clear_opt_exact_info(&add->exb);
4493 }
4494 else if (exm_reach) {
4495 concat_opt_exact_info(&to->exm, &add->exb, enc);
4496 clear_opt_exact_info(&add->exb);
4497 }
4498 }
4499 select_opt_exact_info(enc, &to->exm, &add->exb);
4500 select_opt_exact_info(enc, &to->exm, &add->exm);
4501
4502 if (to->expr.len > 0) {
4503 if (add->len.max > 0) {
4504 if (to->expr.len > (int )add->len.max)
4505 to->expr.len = add->len.max;
4506
4507 if (to->expr.mmd.max == 0)
4508 select_opt_exact_info(enc, &to->exb, &to->expr);
4509 else
4510 select_opt_exact_info(enc, &to->exm, &to->expr);
4511 }
4512 }
4513 else if (add->expr.len > 0) {
4514 copy_opt_exact_info(&to->expr, &add->expr);
4515 }
4516
4517 select_opt_map_info(&to->map, &add->map);
4518
4519 add_mml(&to->len, &add->len);
4520 }
4521
4522 static void
alt_merge_node_opt_info(NodeOptInfo * to,NodeOptInfo * add,OptEnv * env)4523 alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4524 {
4525 alt_merge_opt_anc_info (&to->anc, &add->anc);
4526 alt_merge_opt_exact_info(&to->exb, &add->exb, env);
4527 alt_merge_opt_exact_info(&to->exm, &add->exm, env);
4528 alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4529 alt_merge_opt_map_info(env->enc, &to->map, &add->map);
4530
4531 alt_merge_mml(&to->len, &add->len);
4532 }
4533
4534
4535 #define MAX_NODE_OPT_INFO_REF_COUNT 5
4536
4537 static int
optimize_node_left(Node * node,NodeOptInfo * opt,OptEnv * env)4538 optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4539 {
4540 int type;
4541 int r = 0;
4542
4543 clear_node_opt_info(opt);
4544 set_bound_node_opt_info(opt, &env->mmd);
4545
4546 type = NTYPE(node);
4547 switch (type) {
4548 case NT_LIST:
4549 {
4550 OptEnv nenv;
4551 NodeOptInfo nopt;
4552 Node* nd = node;
4553
4554 copy_opt_env(&nenv, env);
4555 do {
4556 r = optimize_node_left(NCAR(nd), &nopt, &nenv);
4557 if (r == 0) {
4558 add_mml(&nenv.mmd, &nopt.len);
4559 concat_left_node_opt_info(env->enc, opt, &nopt);
4560 }
4561 } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
4562 }
4563 break;
4564
4565 case NT_ALT:
4566 {
4567 NodeOptInfo nopt;
4568 Node* nd = node;
4569
4570 do {
4571 r = optimize_node_left(NCAR(nd), &nopt, env);
4572 if (r == 0) {
4573 if (nd == node) copy_node_opt_info(opt, &nopt);
4574 else alt_merge_node_opt_info(opt, &nopt, env);
4575 }
4576 } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
4577 }
4578 break;
4579
4580 case NT_STR:
4581 {
4582 StrNode* sn = NSTR(node);
4583 int slen = sn->end - sn->s;
4584 int is_raw = NSTRING_IS_RAW(node);
4585
4586 if (! NSTRING_IS_AMBIG(node)) {
4587 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4588 NSTRING_IS_RAW(node), env->enc);
4589 if (slen > 0) {
4590 add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
4591 }
4592 set_mml(&opt->len, slen, slen);
4593 }
4594 else {
4595 int max;
4596
4597 if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
4598 int n = onigenc_strlen(env->enc, sn->s, sn->end);
4599 max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
4600 }
4601 else {
4602 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4603 is_raw, env->enc);
4604 opt->exb.ignore_case = 1;
4605
4606 if (slen > 0) {
4607 r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
4608 env->enc, env->case_fold_flag);
4609 if (r != 0) break;
4610 }
4611
4612 max = slen;
4613 }
4614
4615 set_mml(&opt->len, slen, max);
4616 }
4617
4618 if (opt->exb.len == slen)
4619 opt->exb.reach_end = 1;
4620 }
4621 break;
4622
4623 case NT_CCLASS:
4624 {
4625 int i, z;
4626 CClassNode* cc = NCCLASS(node);
4627
4628 /* no need to check ignore case. (setted in setup_tree()) */
4629
4630 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
4631 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4632 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4633
4634 set_mml(&opt->len, min, max);
4635 }
4636 else {
4637 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4638 z = BITSET_AT(cc->bs, i);
4639 if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
4640 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4641 }
4642 }
4643 set_mml(&opt->len, 1, 1);
4644 }
4645 }
4646 break;
4647
4648 case NT_CTYPE:
4649 {
4650 int i, min, max;
4651
4652 max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4653
4654 if (max == 1) {
4655 min = 1;
4656
4657 switch (NCTYPE(node)->ctype) {
4658 case ONIGENC_CTYPE_WORD:
4659 if (NCTYPE(node)->not != 0) {
4660 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4661 if (! ONIGENC_IS_CODE_WORD(env->enc, i)) {
4662 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4663 }
4664 }
4665 }
4666 else {
4667 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4668 if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
4669 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4670 }
4671 }
4672 }
4673 break;
4674 }
4675 }
4676 else {
4677 min = ONIGENC_MBC_MINLEN(env->enc);
4678 }
4679 set_mml(&opt->len, min, max);
4680 }
4681 break;
4682
4683 case NT_CANY:
4684 {
4685 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4686 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4687 set_mml(&opt->len, min, max);
4688 }
4689 break;
4690
4691 case NT_ANCHOR:
4692 switch (NANCHOR(node)->type) {
4693 case ANCHOR_BEGIN_BUF:
4694 case ANCHOR_BEGIN_POSITION:
4695 case ANCHOR_BEGIN_LINE:
4696 case ANCHOR_END_BUF:
4697 case ANCHOR_SEMI_END_BUF:
4698 case ANCHOR_END_LINE:
4699 add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
4700 break;
4701
4702 case ANCHOR_PREC_READ:
4703 {
4704 NodeOptInfo nopt;
4705
4706 r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
4707 if (r == 0) {
4708 if (nopt.exb.len > 0)
4709 copy_opt_exact_info(&opt->expr, &nopt.exb);
4710 else if (nopt.exm.len > 0)
4711 copy_opt_exact_info(&opt->expr, &nopt.exm);
4712
4713 opt->expr.reach_end = 0;
4714
4715 if (nopt.map.value > 0)
4716 copy_opt_map_info(&opt->map, &nopt.map);
4717 }
4718 }
4719 break;
4720
4721 case ANCHOR_PREC_READ_NOT:
4722 case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. */
4723 case ANCHOR_LOOK_BEHIND_NOT:
4724 break;
4725 }
4726 break;
4727
4728 case NT_BREF:
4729 {
4730 int i;
4731 int* backs;
4732 OnigDistance min, max, tmin, tmax;
4733 Node** nodes = SCANENV_MEM_NODES(env->scan_env);
4734 BRefNode* br = NBREF(node);
4735
4736 if (br->state & NST_RECURSION) {
4737 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4738 break;
4739 }
4740 backs = BACKREFS_P(br);
4741 r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
4742 if (r != 0) break;
4743 r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
4744 if (r != 0) break;
4745 for (i = 1; i < br->back_num; i++) {
4746 r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
4747 if (r != 0) break;
4748 r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
4749 if (r != 0) break;
4750 if (min > tmin) min = tmin;
4751 if (max < tmax) max = tmax;
4752 }
4753 if (r == 0) set_mml(&opt->len, min, max);
4754 }
4755 break;
4756
4757 #ifdef USE_SUBEXP_CALL
4758 case NT_CALL:
4759 if (IS_CALL_RECURSION(NCALL(node)))
4760 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4761 else {
4762 OnigOptionType save = env->options;
4763 env->options = NENCLOSE(NCALL(node)->target)->option;
4764 r = optimize_node_left(NCALL(node)->target, opt, env);
4765 env->options = save;
4766 }
4767 break;
4768 #endif
4769
4770 case NT_QTFR:
4771 {
4772 int i;
4773 OnigDistance min, max;
4774 NodeOptInfo nopt;
4775 QtfrNode* qn = NQTFR(node);
4776
4777 r = optimize_node_left(qn->target, &nopt, env);
4778 if (r) break;
4779
4780 if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
4781 if (env->mmd.max == 0 &&
4782 NTYPE(qn->target) == NT_CANY && qn->greedy) {
4783 if (IS_MULTILINE(env->options))
4784 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
4785 else
4786 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
4787 }
4788 }
4789 else {
4790 if (qn->lower > 0) {
4791 copy_node_opt_info(opt, &nopt);
4792 if (nopt.exb.len > 0) {
4793 if (nopt.exb.reach_end) {
4794 for (i = 2; i <= qn->lower &&
4795 ! is_full_opt_exact_info(&opt->exb); i++) {
4796 concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
4797 }
4798 if (i < qn->lower) {
4799 opt->exb.reach_end = 0;
4800 }
4801 }
4802 }
4803
4804 if (qn->lower != qn->upper) {
4805 opt->exb.reach_end = 0;
4806 opt->exm.reach_end = 0;
4807 }
4808 if (qn->lower > 1)
4809 opt->exm.reach_end = 0;
4810 }
4811 }
4812
4813 min = distance_multiply(nopt.len.min, qn->lower);
4814 if (IS_REPEAT_INFINITE(qn->upper))
4815 max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
4816 else
4817 max = distance_multiply(nopt.len.max, qn->upper);
4818
4819 set_mml(&opt->len, min, max);
4820 }
4821 break;
4822
4823 case NT_ENCLOSE:
4824 {
4825 EncloseNode* en = NENCLOSE(node);
4826
4827 switch (en->type) {
4828 case ENCLOSE_OPTION:
4829 {
4830 OnigOptionType save = env->options;
4831
4832 env->options = en->option;
4833 r = optimize_node_left(en->target, opt, env);
4834 env->options = save;
4835 }
4836 break;
4837
4838 case ENCLOSE_MEMORY:
4839 #ifdef USE_SUBEXP_CALL
4840 en->opt_count++;
4841 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
4842 OnigDistance min, max;
4843
4844 min = 0;
4845 max = ONIG_INFINITE_DISTANCE;
4846 if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
4847 if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
4848 set_mml(&opt->len, min, max);
4849 }
4850 else
4851 #endif
4852 {
4853 r = optimize_node_left(en->target, opt, env);
4854
4855 if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
4856 if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
4857 remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
4858 }
4859 }
4860 break;
4861
4862 case ENCLOSE_STOP_BACKTRACK:
4863 r = optimize_node_left(en->target, opt, env);
4864 break;
4865 }
4866 }
4867 break;
4868
4869 default:
4870 #ifdef ONIG_DEBUG
4871 fprintf(stderr, "optimize_node_left: undefined node type %d\n",
4872 NTYPE(node));
4873 #endif
4874 r = ONIGERR_TYPE_BUG;
4875 break;
4876 }
4877
4878 return r;
4879 }
4880
4881 static int
set_optimize_exact_info(regex_t * reg,OptExactInfo * e)4882 set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
4883 {
4884 int r;
4885
4886 if (e->len == 0) return 0;
4887
4888 if (e->ignore_case) {
4889 reg->exact = (UChar* )xmalloc(e->len);
4890 CHECK_NULL_RETURN_MEMERR(reg->exact);
4891 xmemcpy(reg->exact, e->s, e->len);
4892 reg->exact_end = reg->exact + e->len;
4893 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
4894 }
4895 else {
4896 int allow_reverse;
4897
4898 reg->exact = str_dup(e->s, e->s + e->len);
4899 CHECK_NULL_RETURN_MEMERR(reg->exact);
4900 reg->exact_end = reg->exact + e->len;
4901
4902 allow_reverse =
4903 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
4904
4905 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
4906 r = set_bm_skip(reg->exact, reg->exact_end, reg->enc,
4907 reg->map, &(reg->int_map));
4908 if (r) return r;
4909
4910 reg->optimize = (allow_reverse != 0
4911 ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
4912 }
4913 else {
4914 reg->optimize = ONIG_OPTIMIZE_EXACT;
4915 }
4916 }
4917
4918 reg->dmin = e->mmd.min;
4919 reg->dmax = e->mmd.max;
4920
4921 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4922 reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact);
4923 }
4924
4925 return 0;
4926 }
4927
4928 static void
set_optimize_map_info(regex_t * reg,OptMapInfo * m)4929 set_optimize_map_info(regex_t* reg, OptMapInfo* m)
4930 {
4931 int i;
4932
4933 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4934 reg->map[i] = m->map[i];
4935
4936 reg->optimize = ONIG_OPTIMIZE_MAP;
4937 reg->dmin = m->mmd.min;
4938 reg->dmax = m->mmd.max;
4939
4940 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4941 reg->threshold_len = reg->dmin + 1;
4942 }
4943 }
4944
4945 static void
set_sub_anchor(regex_t * reg,OptAncInfo * anc)4946 set_sub_anchor(regex_t* reg, OptAncInfo* anc)
4947 {
4948 reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
4949 reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
4950 }
4951
4952 #ifdef ONIG_DEBUG
4953 static void print_optimize_info(FILE* f, regex_t* reg);
4954 #endif
4955
4956 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ScanEnv * scan_env)4957 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
4958 {
4959
4960 int r;
4961 NodeOptInfo opt;
4962 OptEnv env;
4963
4964 env.enc = reg->enc;
4965 env.options = reg->options;
4966 env.case_fold_flag = reg->case_fold_flag;
4967 env.scan_env = scan_env;
4968 clear_mml(&env.mmd);
4969
4970 r = optimize_node_left(node, &opt, &env);
4971 if (r) return r;
4972
4973 reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
4974 ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML);
4975
4976 reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF);
4977
4978 if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
4979 reg->anchor_dmin = opt.len.min;
4980 reg->anchor_dmax = opt.len.max;
4981 }
4982
4983 if (opt.exb.len > 0 || opt.exm.len > 0) {
4984 select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
4985 if (opt.map.value > 0 &&
4986 comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
4987 goto set_map;
4988 }
4989 else {
4990 r = set_optimize_exact_info(reg, &opt.exb);
4991 set_sub_anchor(reg, &opt.exb.anc);
4992 }
4993 }
4994 else if (opt.map.value > 0) {
4995 set_map:
4996 set_optimize_map_info(reg, &opt.map);
4997 set_sub_anchor(reg, &opt.map.anc);
4998 }
4999 else {
5000 reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
5001 if (opt.len.max == 0)
5002 reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
5003 }
5004
5005 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5006 print_optimize_info(stderr, reg);
5007 #endif
5008 return r;
5009 }
5010
5011 static void
clear_optimize_info(regex_t * reg)5012 clear_optimize_info(regex_t* reg)
5013 {
5014 reg->optimize = ONIG_OPTIMIZE_NONE;
5015 reg->anchor = 0;
5016 reg->anchor_dmin = 0;
5017 reg->anchor_dmax = 0;
5018 reg->sub_anchor = 0;
5019 reg->exact_end = (UChar* )NULL;
5020 reg->threshold_len = 0;
5021 if (IS_NOT_NULL(reg->exact)) {
5022 xfree(reg->exact);
5023 reg->exact = (UChar* )NULL;
5024 }
5025 }
5026
5027 #ifdef ONIG_DEBUG
5028
print_enc_string(FILE * fp,OnigEncoding enc,const UChar * s,const UChar * end)5029 static void print_enc_string(FILE* fp, OnigEncoding enc,
5030 const UChar *s, const UChar *end)
5031 {
5032 fprintf(fp, "\nPATTERN: /");
5033
5034 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5035 const UChar *p;
5036 OnigCodePoint code;
5037
5038 p = s;
5039 while (p < end) {
5040 code = ONIGENC_MBC_TO_CODE(enc, p, end);
5041 if (code >= 0x80) {
5042 fprintf(fp, " 0x%04x ", (int )code);
5043 }
5044 else {
5045 fputc((int )code, fp);
5046 }
5047
5048 p += enclen(enc, p);
5049 }
5050 }
5051 else {
5052 while (s < end) {
5053 fputc((int )*s, fp);
5054 s++;
5055 }
5056 }
5057
5058 fprintf(fp, "/\n");
5059 }
5060
5061 static void
print_distance_range(FILE * f,OnigDistance a,OnigDistance b)5062 print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
5063 {
5064 if (a == ONIG_INFINITE_DISTANCE)
5065 fputs("inf", f);
5066 else
5067 fprintf(f, "(%u)", a);
5068
5069 fputs("-", f);
5070
5071 if (b == ONIG_INFINITE_DISTANCE)
5072 fputs("inf", f);
5073 else
5074 fprintf(f, "(%u)", b);
5075 }
5076
5077 static void
print_anchor(FILE * f,int anchor)5078 print_anchor(FILE* f, int anchor)
5079 {
5080 int q = 0;
5081
5082 fprintf(f, "[");
5083
5084 if (anchor & ANCHOR_BEGIN_BUF) {
5085 fprintf(f, "begin-buf");
5086 q = 1;
5087 }
5088 if (anchor & ANCHOR_BEGIN_LINE) {
5089 if (q) fprintf(f, ", ");
5090 q = 1;
5091 fprintf(f, "begin-line");
5092 }
5093 if (anchor & ANCHOR_BEGIN_POSITION) {
5094 if (q) fprintf(f, ", ");
5095 q = 1;
5096 fprintf(f, "begin-pos");
5097 }
5098 if (anchor & ANCHOR_END_BUF) {
5099 if (q) fprintf(f, ", ");
5100 q = 1;
5101 fprintf(f, "end-buf");
5102 }
5103 if (anchor & ANCHOR_SEMI_END_BUF) {
5104 if (q) fprintf(f, ", ");
5105 q = 1;
5106 fprintf(f, "semi-end-buf");
5107 }
5108 if (anchor & ANCHOR_END_LINE) {
5109 if (q) fprintf(f, ", ");
5110 q = 1;
5111 fprintf(f, "end-line");
5112 }
5113 if (anchor & ANCHOR_ANYCHAR_STAR) {
5114 if (q) fprintf(f, ", ");
5115 q = 1;
5116 fprintf(f, "anychar-star");
5117 }
5118 if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
5119 if (q) fprintf(f, ", ");
5120 fprintf(f, "anychar-star-pl");
5121 }
5122
5123 fprintf(f, "]");
5124 }
5125
5126 static void
print_optimize_info(FILE * f,regex_t * reg)5127 print_optimize_info(FILE* f, regex_t* reg)
5128 {
5129 static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5130 "EXACT_IC", "MAP" };
5131
5132 fprintf(f, "optimize: %s\n", on[reg->optimize]);
5133 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
5134 if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
5135 print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
5136 fprintf(f, "\n");
5137
5138 if (reg->optimize) {
5139 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
5140 fprintf(f, "\n");
5141 }
5142 fprintf(f, "\n");
5143
5144 if (reg->exact) {
5145 UChar *p;
5146 fprintf(f, "exact: [");
5147 for (p = reg->exact; p < reg->exact_end; p++) {
5148 fputc(*p, f);
5149 }
5150 fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact));
5151 }
5152 else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
5153 int c, i, n = 0;
5154
5155 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5156 if (reg->map[i]) n++;
5157
5158 fprintf(f, "map: n=%d\n", n);
5159 if (n > 0) {
5160 c = 0;
5161 fputc('[', f);
5162 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
5163 if (reg->map[i] != 0) {
5164 if (c > 0) fputs(", ", f);
5165 c++;
5166 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
5167 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
5168 fputc(i, f);
5169 else
5170 fprintf(f, "%d", i);
5171 }
5172 }
5173 fprintf(f, "]\n");
5174 }
5175 }
5176 }
5177 #endif /* ONIG_DEBUG */
5178
5179
5180 extern void
onig_free_body(regex_t * reg)5181 onig_free_body(regex_t* reg)
5182 {
5183 if (IS_NOT_NULL(reg)) {
5184 if (IS_NOT_NULL(reg->p)) xfree(reg->p);
5185 if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
5186 if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map);
5187 if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
5188 if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
5189 if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain);
5190
5191 #ifdef USE_NAMED_GROUP
5192 onig_names_free(reg);
5193 #endif
5194 }
5195 }
5196
5197 extern void
onig_free(regex_t * reg)5198 onig_free(regex_t* reg)
5199 {
5200 if (IS_NOT_NULL(reg)) {
5201 onig_free_body(reg);
5202 xfree(reg);
5203 }
5204 }
5205
5206 #define REGEX_TRANSFER(to,from) do {\
5207 (to)->state = ONIG_STATE_MODIFY;\
5208 onig_free_body(to);\
5209 xmemcpy(to, from, sizeof(regex_t));\
5210 xfree(from);\
5211 } while (0)
5212
5213 extern void
onig_transfer(regex_t * to,regex_t * from)5214 onig_transfer(regex_t* to, regex_t* from)
5215 {
5216 THREAD_ATOMIC_START;
5217 REGEX_TRANSFER(to, from);
5218 THREAD_ATOMIC_END;
5219 }
5220
5221 #define REGEX_CHAIN_HEAD(reg) do {\
5222 while (IS_NOT_NULL((reg)->chain)) {\
5223 (reg) = (reg)->chain;\
5224 }\
5225 } while (0)
5226
5227 extern void
onig_chain_link_add(regex_t * to,regex_t * add)5228 onig_chain_link_add(regex_t* to, regex_t* add)
5229 {
5230 THREAD_ATOMIC_START;
5231 REGEX_CHAIN_HEAD(to);
5232 to->chain = add;
5233 THREAD_ATOMIC_END;
5234 }
5235
5236 extern void
onig_chain_reduce(regex_t * reg)5237 onig_chain_reduce(regex_t* reg)
5238 {
5239 regex_t *head, *prev;
5240
5241 prev = reg;
5242 head = prev->chain;
5243 if (IS_NOT_NULL(head)) {
5244 reg->state = ONIG_STATE_MODIFY;
5245 while (IS_NOT_NULL(head->chain)) {
5246 prev = head;
5247 head = head->chain;
5248 }
5249 prev->chain = (regex_t* )NULL;
5250 REGEX_TRANSFER(reg, head);
5251 }
5252 }
5253
5254 #ifdef ONIG_DEBUG
5255 static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg));
5256 #endif
5257 #ifdef ONIG_DEBUG_PARSE_TREE
5258 static void print_tree P_((FILE* f, Node* node));
5259 #endif
5260
5261 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)5262 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5263 OnigErrorInfo* einfo)
5264 {
5265 #define COMPILE_INIT_SIZE 20
5266
5267 int r, init_size;
5268 Node* root;
5269 ScanEnv scan_env;
5270 #ifdef USE_SUBEXP_CALL
5271 UnsetAddrList uslist;
5272 #endif
5273
5274 if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5275
5276 reg->state = ONIG_STATE_COMPILING;
5277
5278 #ifdef ONIG_DEBUG
5279 print_enc_string(stderr, reg->enc, pattern, pattern_end);
5280 #endif
5281
5282 if (reg->alloc == 0) {
5283 init_size = (pattern_end - pattern) * 2;
5284 if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5285 r = BBUF_INIT(reg, init_size);
5286 if (r != 0) goto end;
5287 }
5288 else
5289 reg->used = 0;
5290
5291 reg->num_mem = 0;
5292 reg->num_repeat = 0;
5293 reg->num_null_check = 0;
5294 reg->repeat_range_alloc = 0;
5295 reg->repeat_range = (OnigRepeatRange* )NULL;
5296 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5297 reg->num_comb_exp_check = 0;
5298 #endif
5299
5300 r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5301 if (r != 0) goto err;
5302
5303 #ifdef USE_NAMED_GROUP
5304 /* mixed use named group and no-named group */
5305 if (scan_env.num_named > 0 &&
5306 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5307 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5308 if (scan_env.num_named != scan_env.num_mem)
5309 r = disable_noname_group_capture(&root, reg, &scan_env);
5310 else
5311 r = numbered_ref_check(root);
5312
5313 if (r != 0) goto err;
5314 }
5315 #endif
5316
5317 #ifdef USE_SUBEXP_CALL
5318 if (scan_env.num_call > 0) {
5319 r = unset_addr_list_init(&uslist, scan_env.num_call);
5320 if (r != 0) goto err;
5321 scan_env.unset_addr_list = &uslist;
5322 r = setup_subexp_call(root, &scan_env);
5323 if (r != 0) goto err_unset;
5324 r = subexp_recursive_check_trav(root, &scan_env);
5325 if (r < 0) goto err_unset;
5326 r = subexp_inf_recursive_check_trav(root, &scan_env);
5327 if (r != 0) goto err_unset;
5328
5329 reg->num_call = scan_env.num_call;
5330 }
5331 else
5332 reg->num_call = 0;
5333 #endif
5334
5335 r = setup_tree(root, reg, 0, &scan_env);
5336 if (r != 0) goto err_unset;
5337
5338 #ifdef ONIG_DEBUG_PARSE_TREE
5339 print_tree(stderr, root);
5340 #endif
5341
5342 reg->capture_history = scan_env.capture_history;
5343 reg->bt_mem_start = scan_env.bt_mem_start;
5344 reg->bt_mem_start |= reg->capture_history;
5345 if (IS_FIND_CONDITION(reg->options))
5346 BIT_STATUS_ON_ALL(reg->bt_mem_end);
5347 else {
5348 reg->bt_mem_end = scan_env.bt_mem_end;
5349 reg->bt_mem_end |= reg->capture_history;
5350 }
5351
5352 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5353 if (scan_env.backrefed_mem == 0
5354 #ifdef USE_SUBEXP_CALL
5355 || scan_env.num_call == 0
5356 #endif
5357 ) {
5358 setup_comb_exp_check(root, 0, &scan_env);
5359 #ifdef USE_SUBEXP_CALL
5360 if (scan_env.has_recursion != 0) {
5361 scan_env.num_comb_exp_check = 0;
5362 }
5363 else
5364 #endif
5365 if (scan_env.comb_exp_max_regnum > 0) {
5366 int i;
5367 for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5368 if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5369 scan_env.num_comb_exp_check = 0;
5370 break;
5371 }
5372 }
5373 }
5374 }
5375
5376 reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5377 #endif
5378
5379 clear_optimize_info(reg);
5380 #ifndef ONIG_DONT_OPTIMIZE
5381 r = set_optimize_info_from_tree(root, reg, &scan_env);
5382 if (r != 0) goto err_unset;
5383 #endif
5384
5385 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5386 xfree(scan_env.mem_nodes_dynamic);
5387 scan_env.mem_nodes_dynamic = (Node** )NULL;
5388 }
5389
5390 r = compile_tree(root, reg);
5391 if (r == 0) {
5392 r = add_opcode(reg, OP_END);
5393 #ifdef USE_SUBEXP_CALL
5394 if (scan_env.num_call > 0) {
5395 r = unset_addr_list_fix(&uslist, reg);
5396 unset_addr_list_end(&uslist);
5397 if (r) goto err;
5398 }
5399 #endif
5400
5401 if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5402 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5403 else {
5404 if (reg->bt_mem_start != 0)
5405 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5406 else
5407 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5408 }
5409 }
5410 #ifdef USE_SUBEXP_CALL
5411 else if (scan_env.num_call > 0) {
5412 unset_addr_list_end(&uslist);
5413 }
5414 #endif
5415 onig_node_free(root);
5416
5417 #ifdef ONIG_DEBUG_COMPILE
5418 #ifdef USE_NAMED_GROUP
5419 onig_print_names(stderr, reg);
5420 #endif
5421 print_compiled_byte_code_list(stderr, reg);
5422 #endif
5423
5424 end:
5425 reg->state = ONIG_STATE_NORMAL;
5426 return r;
5427
5428 err_unset:
5429 #ifdef USE_SUBEXP_CALL
5430 if (scan_env.num_call > 0) {
5431 unset_addr_list_end(&uslist);
5432 }
5433 #endif
5434 err:
5435 if (IS_NOT_NULL(scan_env.error)) {
5436 if (IS_NOT_NULL(einfo)) {
5437 einfo->enc = scan_env.enc;
5438 einfo->par = scan_env.error;
5439 einfo->par_end = scan_env.error_end;
5440 }
5441 }
5442
5443 onig_node_free(root);
5444 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
5445 xfree(scan_env.mem_nodes_dynamic);
5446 return r;
5447 }
5448
5449 #ifdef USE_RECOMPILE_API
5450 extern int
onig_recompile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5451 onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5452 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5453 OnigErrorInfo* einfo)
5454 {
5455 int r;
5456 regex_t *new_reg;
5457
5458 r = onig_new(&new_reg, pattern, pattern_end, option, enc, syntax, einfo);
5459 if (r) return r;
5460 if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
5461 onig_transfer(reg, new_reg);
5462 }
5463 else {
5464 onig_chain_link_add(reg, new_reg);
5465 }
5466 return 0;
5467 }
5468 #endif
5469
5470 static int onig_inited = 0;
5471
5472 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)5473 onig_reg_init(regex_t* reg, OnigOptionType option,
5474 OnigCaseFoldType case_fold_flag,
5475 OnigEncoding enc, OnigSyntaxType* syntax)
5476 {
5477 if (! onig_inited)
5478 onig_init();
5479
5480 if (IS_NULL(reg))
5481 return ONIGERR_INVALID_ARGUMENT;
5482
5483 if (ONIGENC_IS_UNDEF(enc))
5484 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
5485
5486 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5487 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5488 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5489 }
5490
5491 (reg)->state = ONIG_STATE_MODIFY;
5492
5493 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5494 option |= syntax->options;
5495 option &= ~ONIG_OPTION_SINGLELINE;
5496 }
5497 else
5498 option |= syntax->options;
5499
5500 (reg)->enc = enc;
5501 (reg)->options = option;
5502 (reg)->syntax = syntax;
5503 (reg)->optimize = 0;
5504 (reg)->exact = (UChar* )NULL;
5505 (reg)->int_map = (int* )NULL;
5506 (reg)->int_map_backward = (int* )NULL;
5507 (reg)->chain = (regex_t* )NULL;
5508
5509 (reg)->p = (UChar* )NULL;
5510 (reg)->alloc = 0;
5511 (reg)->used = 0;
5512 (reg)->name_table = (void* )NULL;
5513
5514 (reg)->case_fold_flag = case_fold_flag;
5515 return 0;
5516 }
5517
5518 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5519 onig_new_without_alloc(regex_t* reg, const UChar* pattern,
5520 const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
5521 OnigSyntaxType* syntax, OnigErrorInfo* einfo)
5522 {
5523 int r;
5524
5525 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5526 if (r) return r;
5527
5528 r = onig_compile(reg, pattern, pattern_end, einfo);
5529 return r;
5530 }
5531
5532 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5533 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5534 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5535 OnigErrorInfo* einfo)
5536 {
5537 int r;
5538
5539 *reg = (regex_t* )xmalloc(sizeof(regex_t));
5540 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
5541
5542 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5543 if (r) goto err;
5544
5545 r = onig_compile(*reg, pattern, pattern_end, einfo);
5546 if (r) {
5547 err:
5548 onig_free(*reg);
5549 *reg = NULL;
5550 }
5551 return r;
5552 }
5553
5554
5555 extern int
onig_init(void)5556 onig_init(void)
5557 {
5558 if (onig_inited != 0)
5559 return 0;
5560
5561 THREAD_SYSTEM_INIT;
5562 THREAD_ATOMIC_START;
5563
5564 onig_inited = 1;
5565
5566 onigenc_init();
5567 /* onigenc_set_default_caseconv_table((UChar* )0); */
5568
5569 #ifdef ONIG_DEBUG_STATISTICS
5570 onig_statistics_init();
5571 #endif
5572
5573 THREAD_ATOMIC_END;
5574 return 0;
5575 }
5576
5577
5578 extern int
onig_end(void)5579 onig_end(void)
5580 {
5581 THREAD_ATOMIC_START;
5582
5583 #ifdef ONIG_DEBUG_STATISTICS
5584 onig_print_statistics(stderr);
5585 #endif
5586
5587 #ifdef USE_SHARED_CCLASS_TABLE
5588 onig_free_shared_cclass_table();
5589 #endif
5590
5591 #ifdef USE_PARSE_TREE_NODE_RECYCLE
5592 onig_free_node_list();
5593 #endif
5594
5595 onig_inited = 0;
5596
5597 THREAD_ATOMIC_END;
5598 THREAD_SYSTEM_END;
5599 return 0;
5600 }
5601
5602 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)5603 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
5604 {
5605 OnigCodePoint n, *data;
5606 OnigCodePoint low, high, x;
5607
5608 GET_CODE_POINT(n, p);
5609 data = (OnigCodePoint* )p;
5610 data++;
5611
5612 for (low = 0, high = n; low < high; ) {
5613 x = (low + high) >> 1;
5614 if (code > data[x * 2 + 1])
5615 low = x + 1;
5616 else
5617 high = x;
5618 }
5619
5620 return ((low < n && code >= data[low * 2]) ? 1 : 0);
5621 }
5622
5623 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,CClassNode * cc)5624 onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
5625 {
5626 int found;
5627
5628 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
5629 if (IS_NULL(cc->mbuf)) {
5630 found = 0;
5631 }
5632 else {
5633 found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
5634 }
5635 }
5636 else {
5637 found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
5638 }
5639
5640 if (IS_NCCLASS_NOT(cc))
5641 return !found;
5642 else
5643 return found;
5644 }
5645
5646 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)5647 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
5648 {
5649 int len;
5650
5651 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5652 len = 2;
5653 }
5654 else {
5655 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
5656 }
5657 return onig_is_code_in_cc_len(len, code, cc);
5658 }
5659
5660
5661 #ifdef ONIG_DEBUG
5662
5663 /* arguments type */
5664 #define ARG_SPECIAL -1
5665 #define ARG_NON 0
5666 #define ARG_RELADDR 1
5667 #define ARG_ABSADDR 2
5668 #define ARG_LENGTH 3
5669 #define ARG_MEMNUM 4
5670 #define ARG_OPTION 5
5671 #define ARG_STATE_CHECK 6
5672
5673 OnigOpInfoType OnigOpInfo[] = {
5674 { OP_FINISH, "finish", ARG_NON },
5675 { OP_END, "end", ARG_NON },
5676 { OP_EXACT1, "exact1", ARG_SPECIAL },
5677 { OP_EXACT2, "exact2", ARG_SPECIAL },
5678 { OP_EXACT3, "exact3", ARG_SPECIAL },
5679 { OP_EXACT4, "exact4", ARG_SPECIAL },
5680 { OP_EXACT5, "exact5", ARG_SPECIAL },
5681 { OP_EXACTN, "exactn", ARG_SPECIAL },
5682 { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
5683 { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
5684 { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
5685 { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
5686 { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
5687 { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
5688 { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
5689 { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
5690 { OP_CCLASS, "cclass", ARG_SPECIAL },
5691 { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
5692 { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
5693 { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
5694 { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
5695 { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
5696 { OP_CCLASS_NODE, "cclass-node", ARG_SPECIAL },
5697 { OP_ANYCHAR, "anychar", ARG_NON },
5698 { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
5699 { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
5700 { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
5701 { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
5702 { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
5703 { OP_WORD, "word", ARG_NON },
5704 { OP_NOT_WORD, "not-word", ARG_NON },
5705 { OP_WORD_BOUND, "word-bound", ARG_NON },
5706 { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
5707 { OP_WORD_BEGIN, "word-begin", ARG_NON },
5708 { OP_WORD_END, "word-end", ARG_NON },
5709 { OP_BEGIN_BUF, "begin-buf", ARG_NON },
5710 { OP_END_BUF, "end-buf", ARG_NON },
5711 { OP_BEGIN_LINE, "begin-line", ARG_NON },
5712 { OP_END_LINE, "end-line", ARG_NON },
5713 { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
5714 { OP_BEGIN_POSITION, "begin-position", ARG_NON },
5715 { OP_BACKREF1, "backref1", ARG_NON },
5716 { OP_BACKREF2, "backref2", ARG_NON },
5717 { OP_BACKREFN, "backrefn", ARG_MEMNUM },
5718 { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
5719 { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
5720 { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
5721 { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL },
5722 { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
5723 { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
5724 { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
5725 { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
5726 { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
5727 { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
5728 { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
5729 { OP_SET_OPTION, "set-option", ARG_OPTION },
5730 { OP_FAIL, "fail", ARG_NON },
5731 { OP_JUMP, "jump", ARG_RELADDR },
5732 { OP_PUSH, "push", ARG_RELADDR },
5733 { OP_POP, "pop", ARG_NON },
5734 { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
5735 { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
5736 { OP_REPEAT, "repeat", ARG_SPECIAL },
5737 { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
5738 { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
5739 { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
5740 { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
5741 { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
5742 { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
5743 { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
5744 { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
5745 { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
5746 { OP_PUSH_POS, "push-pos", ARG_NON },
5747 { OP_POP_POS, "pop-pos", ARG_NON },
5748 { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
5749 { OP_FAIL_POS, "fail-pos", ARG_NON },
5750 { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
5751 { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
5752 { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
5753 { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
5754 { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
5755 { OP_CALL, "call", ARG_ABSADDR },
5756 { OP_RETURN, "return", ARG_NON },
5757 { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
5758 { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
5759 { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
5760 { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
5761 { OP_STATE_CHECK_ANYCHAR_ML_STAR,
5762 "state-check-anychar-ml*", ARG_STATE_CHECK },
5763 { -1, "", ARG_NON }
5764 };
5765
5766 static char*
op2name(int opcode)5767 op2name(int opcode)
5768 {
5769 int i;
5770
5771 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5772 if (opcode == OnigOpInfo[i].opcode)
5773 return OnigOpInfo[i].name;
5774 }
5775 return "";
5776 }
5777
5778 static int
op2arg_type(int opcode)5779 op2arg_type(int opcode)
5780 {
5781 int i;
5782
5783 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5784 if (opcode == OnigOpInfo[i].opcode)
5785 return OnigOpInfo[i].arg_type;
5786 }
5787 return ARG_SPECIAL;
5788 }
5789
5790 static void
Indent(FILE * f,int indent)5791 Indent(FILE* f, int indent)
5792 {
5793 int i;
5794 for (i = 0; i < indent; i++) putc(' ', f);
5795 }
5796
5797 static void
p_string(FILE * f,int len,UChar * s)5798 p_string(FILE* f, int len, UChar* s)
5799 {
5800 fputs(":", f);
5801 while (len-- > 0) { fputc(*s++, f); }
5802 }
5803
5804 static void
p_len_string(FILE * f,LengthType len,int mb_len,UChar * s)5805 p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
5806 {
5807 int x = len * mb_len;
5808
5809 fprintf(f, ":%d:", len);
5810 while (x-- > 0) { fputc(*s++, f); }
5811 }
5812
5813 extern void
onig_print_compiled_byte_code(FILE * f,UChar * bp,UChar ** nextp,OnigEncoding enc)5814 onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp,
5815 OnigEncoding enc)
5816 {
5817 int i, n, arg_type;
5818 RelAddrType addr;
5819 LengthType len;
5820 MemNumType mem;
5821 StateCheckNumType scn;
5822 OnigCodePoint code;
5823 UChar *q;
5824
5825 fprintf(f, "[%s", op2name(*bp));
5826 arg_type = op2arg_type(*bp);
5827 if (arg_type != ARG_SPECIAL) {
5828 bp++;
5829 switch (arg_type) {
5830 case ARG_NON:
5831 break;
5832 case ARG_RELADDR:
5833 GET_RELADDR_INC(addr, bp);
5834 fprintf(f, ":(%d)", addr);
5835 break;
5836 case ARG_ABSADDR:
5837 GET_ABSADDR_INC(addr, bp);
5838 fprintf(f, ":(%d)", addr);
5839 break;
5840 case ARG_LENGTH:
5841 GET_LENGTH_INC(len, bp);
5842 fprintf(f, ":%d", len);
5843 break;
5844 case ARG_MEMNUM:
5845 mem = *((MemNumType* )bp);
5846 bp += SIZE_MEMNUM;
5847 fprintf(f, ":%d", mem);
5848 break;
5849 case ARG_OPTION:
5850 {
5851 OnigOptionType option = *((OnigOptionType* )bp);
5852 bp += SIZE_OPTION;
5853 fprintf(f, ":%d", option);
5854 }
5855 break;
5856
5857 case ARG_STATE_CHECK:
5858 scn = *((StateCheckNumType* )bp);
5859 bp += SIZE_STATE_CHECK_NUM;
5860 fprintf(f, ":%d", scn);
5861 break;
5862 }
5863 }
5864 else {
5865 switch (*bp++) {
5866 case OP_EXACT1:
5867 case OP_ANYCHAR_STAR_PEEK_NEXT:
5868 case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
5869 p_string(f, 1, bp++); break;
5870 case OP_EXACT2:
5871 p_string(f, 2, bp); bp += 2; break;
5872 case OP_EXACT3:
5873 p_string(f, 3, bp); bp += 3; break;
5874 case OP_EXACT4:
5875 p_string(f, 4, bp); bp += 4; break;
5876 case OP_EXACT5:
5877 p_string(f, 5, bp); bp += 5; break;
5878 case OP_EXACTN:
5879 GET_LENGTH_INC(len, bp);
5880 p_len_string(f, len, 1, bp);
5881 bp += len;
5882 break;
5883
5884 case OP_EXACTMB2N1:
5885 p_string(f, 2, bp); bp += 2; break;
5886 case OP_EXACTMB2N2:
5887 p_string(f, 4, bp); bp += 4; break;
5888 case OP_EXACTMB2N3:
5889 p_string(f, 6, bp); bp += 6; break;
5890 case OP_EXACTMB2N:
5891 GET_LENGTH_INC(len, bp);
5892 p_len_string(f, len, 2, bp);
5893 bp += len * 2;
5894 break;
5895 case OP_EXACTMB3N:
5896 GET_LENGTH_INC(len, bp);
5897 p_len_string(f, len, 3, bp);
5898 bp += len * 3;
5899 break;
5900 case OP_EXACTMBN:
5901 {
5902 int mb_len;
5903
5904 GET_LENGTH_INC(mb_len, bp);
5905 GET_LENGTH_INC(len, bp);
5906 fprintf(f, ":%d:%d:", mb_len, len);
5907 n = len * mb_len;
5908 while (n-- > 0) { fputc(*bp++, f); }
5909 }
5910 break;
5911
5912 case OP_EXACT1_IC:
5913 len = enclen(enc, bp);
5914 p_string(f, len, bp);
5915 bp += len;
5916 break;
5917 case OP_EXACTN_IC:
5918 GET_LENGTH_INC(len, bp);
5919 p_len_string(f, len, 1, bp);
5920 bp += len;
5921 break;
5922
5923 case OP_CCLASS:
5924 n = bitset_on_num((BitSetRef )bp);
5925 bp += SIZE_BITSET;
5926 fprintf(f, ":%d", n);
5927 break;
5928
5929 case OP_CCLASS_NOT:
5930 n = bitset_on_num((BitSetRef )bp);
5931 bp += SIZE_BITSET;
5932 fprintf(f, ":%d", n);
5933 break;
5934
5935 case OP_CCLASS_MB:
5936 case OP_CCLASS_MB_NOT:
5937 GET_LENGTH_INC(len, bp);
5938 q = bp;
5939 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5940 ALIGNMENT_RIGHT(q);
5941 #endif
5942 GET_CODE_POINT(code, q);
5943 bp += len;
5944 fprintf(f, ":%d:%d", (int )code, len);
5945 break;
5946
5947 case OP_CCLASS_MIX:
5948 case OP_CCLASS_MIX_NOT:
5949 n = bitset_on_num((BitSetRef )bp);
5950 bp += SIZE_BITSET;
5951 GET_LENGTH_INC(len, bp);
5952 q = bp;
5953 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5954 ALIGNMENT_RIGHT(q);
5955 #endif
5956 GET_CODE_POINT(code, q);
5957 bp += len;
5958 fprintf(f, ":%d:%d:%d", n, (int )code, len);
5959 break;
5960
5961 case OP_CCLASS_NODE:
5962 {
5963 CClassNode *cc;
5964
5965 GET_POINTER_INC(cc, bp);
5966 n = bitset_on_num(cc->bs);
5967 fprintf(f, ":%u:%d", (unsigned int )cc, n);
5968 }
5969 break;
5970
5971 case OP_BACKREFN_IC:
5972 mem = *((MemNumType* )bp);
5973 bp += SIZE_MEMNUM;
5974 fprintf(f, ":%d", mem);
5975 break;
5976
5977 case OP_BACKREF_MULTI_IC:
5978 case OP_BACKREF_MULTI:
5979 fputs(" ", f);
5980 GET_LENGTH_INC(len, bp);
5981 for (i = 0; i < len; i++) {
5982 GET_MEMNUM_INC(mem, bp);
5983 if (i > 0) fputs(", ", f);
5984 fprintf(f, "%d", mem);
5985 }
5986 break;
5987
5988 case OP_BACKREF_WITH_LEVEL:
5989 {
5990 OnigOptionType option;
5991 LengthType level;
5992
5993 GET_OPTION_INC(option, bp);
5994 fprintf(f, ":%d", option);
5995 GET_LENGTH_INC(level, bp);
5996 fprintf(f, ":%d", level);
5997
5998 fputs(" ", f);
5999 GET_LENGTH_INC(len, bp);
6000 for (i = 0; i < len; i++) {
6001 GET_MEMNUM_INC(mem, bp);
6002 if (i > 0) fputs(", ", f);
6003 fprintf(f, "%d", mem);
6004 }
6005 }
6006 break;
6007
6008 case OP_REPEAT:
6009 case OP_REPEAT_NG:
6010 {
6011 mem = *((MemNumType* )bp);
6012 bp += SIZE_MEMNUM;
6013 addr = *((RelAddrType* )bp);
6014 bp += SIZE_RELADDR;
6015 fprintf(f, ":%d:%d", mem, addr);
6016 }
6017 break;
6018
6019 case OP_PUSH_OR_JUMP_EXACT1:
6020 case OP_PUSH_IF_PEEK_NEXT:
6021 addr = *((RelAddrType* )bp);
6022 bp += SIZE_RELADDR;
6023 fprintf(f, ":(%d)", addr);
6024 p_string(f, 1, bp);
6025 bp += 1;
6026 break;
6027
6028 case OP_LOOK_BEHIND:
6029 GET_LENGTH_INC(len, bp);
6030 fprintf(f, ":%d", len);
6031 break;
6032
6033 case OP_PUSH_LOOK_BEHIND_NOT:
6034 GET_RELADDR_INC(addr, bp);
6035 GET_LENGTH_INC(len, bp);
6036 fprintf(f, ":%d:(%d)", len, addr);
6037 break;
6038
6039 case OP_STATE_CHECK_PUSH:
6040 case OP_STATE_CHECK_PUSH_OR_JUMP:
6041 scn = *((StateCheckNumType* )bp);
6042 bp += SIZE_STATE_CHECK_NUM;
6043 addr = *((RelAddrType* )bp);
6044 bp += SIZE_RELADDR;
6045 fprintf(f, ":%d:(%d)", scn, addr);
6046 break;
6047
6048 default:
6049 fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
6050 *--bp);
6051 }
6052 }
6053 fputs("]", f);
6054 if (nextp) *nextp = bp;
6055 }
6056
6057 static void
print_compiled_byte_code_list(FILE * f,regex_t * reg)6058 print_compiled_byte_code_list(FILE* f, regex_t* reg)
6059 {
6060 int ncode;
6061 UChar* bp = reg->p;
6062 UChar* end = reg->p + reg->used;
6063
6064 fprintf(f, "code length: %d\n", reg->used);
6065
6066 ncode = 0;
6067 while (bp < end) {
6068 ncode++;
6069 if (bp > reg->p) {
6070 if (ncode % 5 == 0)
6071 fprintf(f, "\n");
6072 else
6073 fputs(" ", f);
6074 }
6075 onig_print_compiled_byte_code(f, bp, &bp, reg->enc);
6076 }
6077
6078 fprintf(f, "\n");
6079 }
6080
6081 static void
print_indent_tree(FILE * f,Node * node,int indent)6082 print_indent_tree(FILE* f, Node* node, int indent)
6083 {
6084 int i, type;
6085 int add = 3;
6086 UChar* p;
6087
6088 Indent(f, indent);
6089 if (IS_NULL(node)) {
6090 fprintf(f, "ERROR: null node!!!\n");
6091 exit (0);
6092 }
6093
6094 type = NTYPE(node);
6095 switch (type) {
6096 case NT_LIST:
6097 case NT_ALT:
6098 if (NTYPE(node) == NT_LIST)
6099 fprintf(f, "<list:%x>\n", (int )node);
6100 else
6101 fprintf(f, "<alt:%x>\n", (int )node);
6102
6103 print_indent_tree(f, NCAR(node), indent + add);
6104 while (IS_NOT_NULL(node = NCDR(node))) {
6105 if (NTYPE(node) != type) {
6106 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
6107 exit(0);
6108 }
6109 print_indent_tree(f, NCAR(node), indent + add);
6110 }
6111 break;
6112
6113 case NT_STR:
6114 fprintf(f, "<string%s:%x>",
6115 (NSTRING_IS_RAW(node) ? "-raw" : ""), (int )node);
6116 for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
6117 if (*p >= 0x20 && *p < 0x7f)
6118 fputc(*p, f);
6119 else {
6120 fprintf(f, " 0x%02x", *p);
6121 }
6122 }
6123 break;
6124
6125 case NT_CCLASS:
6126 fprintf(f, "<cclass:%x>", (int )node);
6127 if (IS_NCCLASS_NOT(NCCLASS(node))) fputs(" not", f);
6128 if (NCCLASS(node)->mbuf) {
6129 BBuf* bbuf = NCCLASS(node)->mbuf;
6130 for (i = 0; i < bbuf->used; i++) {
6131 if (i > 0) fprintf(f, ",");
6132 fprintf(f, "%0x", bbuf->p[i]);
6133 }
6134 }
6135 break;
6136
6137 case NT_CTYPE:
6138 fprintf(f, "<ctype:%x> ", (int )node);
6139 switch (NCTYPE(node)->ctype) {
6140 case ONIGENC_CTYPE_WORD:
6141 if (NCTYPE(node)->not != 0)
6142 fputs("not word", f);
6143 else
6144 fputs("word", f);
6145 break;
6146
6147 default:
6148 fprintf(f, "ERROR: undefined ctype.\n");
6149 exit(0);
6150 }
6151 break;
6152
6153 case NT_CANY:
6154 fprintf(f, "<anychar:%x>", (int )node);
6155 break;
6156
6157 case NT_ANCHOR:
6158 fprintf(f, "<anchor:%x> ", (int )node);
6159 switch (NANCHOR(node)->type) {
6160 case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
6161 case ANCHOR_END_BUF: fputs("end buf", f); break;
6162 case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
6163 case ANCHOR_END_LINE: fputs("end line", f); break;
6164 case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
6165 case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
6166
6167 case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
6168 case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
6169 #ifdef USE_WORD_BEGIN_END
6170 case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
6171 case ANCHOR_WORD_END: fputs("word end", f); break;
6172 #endif
6173 case ANCHOR_PREC_READ: fputs("prec read", f); break;
6174 case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); break;
6175 case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); break;
6176 case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break;
6177
6178 default:
6179 fprintf(f, "ERROR: undefined anchor type.\n");
6180 break;
6181 }
6182 break;
6183
6184 case NT_BREF:
6185 {
6186 int* p;
6187 BRefNode* br = NBREF(node);
6188 p = BACKREFS_P(br);
6189 fprintf(f, "<backref:%x>", (int )node);
6190 for (i = 0; i < br->back_num; i++) {
6191 if (i > 0) fputs(", ", f);
6192 fprintf(f, "%d", p[i]);
6193 }
6194 }
6195 break;
6196
6197 #ifdef USE_SUBEXP_CALL
6198 case NT_CALL:
6199 {
6200 CallNode* cn = NCALL(node);
6201 fprintf(f, "<call:%x>", (int )node);
6202 p_string(f, cn->name_end - cn->name, cn->name);
6203 }
6204 break;
6205 #endif
6206
6207 case NT_QTFR:
6208 fprintf(f, "<quantifier:%x>{%d,%d}%s\n", (int )node,
6209 NQTFR(node)->lower, NQTFR(node)->upper,
6210 (NQTFR(node)->greedy ? "" : "?"));
6211 print_indent_tree(f, NQTFR(node)->target, indent + add);
6212 break;
6213
6214 case NT_ENCLOSE:
6215 fprintf(f, "<enclose:%x> ", (int )node);
6216 switch (NENCLOSE(node)->type) {
6217 case ENCLOSE_OPTION:
6218 fprintf(f, "option:%d", NENCLOSE(node)->option);
6219 break;
6220 case ENCLOSE_MEMORY:
6221 fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
6222 break;
6223 case ENCLOSE_STOP_BACKTRACK:
6224 fprintf(f, "stop-bt");
6225 break;
6226
6227 default:
6228 break;
6229 }
6230 fprintf(f, "\n");
6231 print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6232 break;
6233
6234 default:
6235 fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6236 break;
6237 }
6238
6239 if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
6240 type != NT_ENCLOSE)
6241 fprintf(f, "\n");
6242 fflush(f);
6243 }
6244 #endif /* ONIG_DEBUG */
6245
6246 #ifdef ONIG_DEBUG_PARSE_TREE
6247 static void
print_tree(FILE * f,Node * node)6248 print_tree(FILE* f, Node* node)
6249 {
6250 print_indent_tree(f, node, 0);
6251 }
6252 #endif
6253