1 /**********************************************************************
2 regcomp.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2013 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regparse.h"
31
32 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
33
34 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)35 onig_get_default_case_fold_flag(void)
36 {
37 return OnigDefaultCaseFoldFlag;
38 }
39
40 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)41 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
42 {
43 OnigDefaultCaseFoldFlag = case_fold_flag;
44 return 0;
45 }
46
47
48 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
49 static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
50 #endif
51
52 static UChar*
str_dup(UChar * s,UChar * end)53 str_dup(UChar* s, UChar* end)
54 {
55 int len = end - s;
56
57 if (len > 0) {
58 UChar* r = (UChar* )xmalloc(len + 1);
59 CHECK_NULL_RETURN(r);
60 xmemcpy(r, s, len);
61 r[len] = (UChar )0;
62 return r;
63 }
64 else return NULL;
65 }
66
67 static void
swap_node(Node * a,Node * b)68 swap_node(Node* a, Node* b)
69 {
70 Node c;
71 c = *a; *a = *b; *b = c;
72
73 if (NTYPE(a) == NT_STR) {
74 StrNode* sn = NSTR(a);
75 if (sn->capa == 0) {
76 int len = sn->end - sn->s;
77 sn->s = sn->buf;
78 sn->end = sn->s + len;
79 }
80 }
81
82 if (NTYPE(b) == NT_STR) {
83 StrNode* sn = NSTR(b);
84 if (sn->capa == 0) {
85 int len = sn->end - sn->s;
86 sn->s = sn->buf;
87 sn->end = sn->s + len;
88 }
89 }
90 }
91
92 static OnigDistance
distance_add(OnigDistance d1,OnigDistance d2)93 distance_add(OnigDistance d1, OnigDistance d2)
94 {
95 if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
96 return ONIG_INFINITE_DISTANCE;
97 else {
98 if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
99 else return ONIG_INFINITE_DISTANCE;
100 }
101 }
102
103 static OnigDistance
distance_multiply(OnigDistance d,int m)104 distance_multiply(OnigDistance d, int m)
105 {
106 if (m == 0) return 0;
107
108 if (d < ONIG_INFINITE_DISTANCE / m)
109 return d * m;
110 else
111 return ONIG_INFINITE_DISTANCE;
112 }
113
114 static int
bitset_is_empty(BitSetRef bs)115 bitset_is_empty(BitSetRef bs)
116 {
117 int i;
118 for (i = 0; i < (int )BITSET_SIZE; i++) {
119 if (bs[i] != 0) return 0;
120 }
121 return 1;
122 }
123
124 #ifdef ONIG_DEBUG
125 static int
bitset_on_num(BitSetRef bs)126 bitset_on_num(BitSetRef bs)
127 {
128 int i, n;
129
130 n = 0;
131 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
132 if (BITSET_AT(bs, i)) n++;
133 }
134 return n;
135 }
136 #endif
137
138 extern int
onig_bbuf_init(BBuf * buf,int size)139 onig_bbuf_init(BBuf* buf, int size)
140 {
141 if (size <= 0) {
142 size = 0;
143 buf->p = NULL;
144 }
145 else {
146 buf->p = (UChar* )xmalloc(size);
147 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
148 }
149
150 buf->alloc = size;
151 buf->used = 0;
152 return 0;
153 }
154
155
156 #ifdef USE_SUBEXP_CALL
157
158 static int
unset_addr_list_init(UnsetAddrList * uslist,int size)159 unset_addr_list_init(UnsetAddrList* uslist, int size)
160 {
161 UnsetAddr* p;
162
163 p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
164 CHECK_NULL_RETURN_MEMERR(p);
165 uslist->num = 0;
166 uslist->alloc = size;
167 uslist->us = p;
168 return 0;
169 }
170
171 static void
unset_addr_list_end(UnsetAddrList * uslist)172 unset_addr_list_end(UnsetAddrList* uslist)
173 {
174 if (IS_NOT_NULL(uslist->us))
175 xfree(uslist->us);
176 }
177
178 static int
unset_addr_list_add(UnsetAddrList * uslist,int offset,struct _Node * node)179 unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
180 {
181 UnsetAddr* p;
182 int size;
183
184 if (uslist->num >= uslist->alloc) {
185 size = uslist->alloc * 2;
186 p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
187 CHECK_NULL_RETURN_MEMERR(p);
188 uslist->alloc = size;
189 uslist->us = p;
190 }
191
192 uslist->us[uslist->num].offset = offset;
193 uslist->us[uslist->num].target = node;
194 uslist->num++;
195 return 0;
196 }
197 #endif /* USE_SUBEXP_CALL */
198
199
200 static int
add_opcode(regex_t * reg,int opcode)201 add_opcode(regex_t* reg, int opcode)
202 {
203 BBUF_ADD1(reg, opcode);
204 return 0;
205 }
206
207 #ifdef USE_COMBINATION_EXPLOSION_CHECK
208 static int
add_state_check_num(regex_t * reg,int num)209 add_state_check_num(regex_t* reg, int num)
210 {
211 StateCheckNumType n = (StateCheckNumType )num;
212
213 BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
214 return 0;
215 }
216 #endif
217
218 static int
add_rel_addr(regex_t * reg,int addr)219 add_rel_addr(regex_t* reg, int addr)
220 {
221 RelAddrType ra = (RelAddrType )addr;
222
223 BBUF_ADD(reg, &ra, SIZE_RELADDR);
224 return 0;
225 }
226
227 static int
add_abs_addr(regex_t * reg,int addr)228 add_abs_addr(regex_t* reg, int addr)
229 {
230 AbsAddrType ra = (AbsAddrType )addr;
231
232 BBUF_ADD(reg, &ra, SIZE_ABSADDR);
233 return 0;
234 }
235
236 static int
add_length(regex_t * reg,int len)237 add_length(regex_t* reg, int len)
238 {
239 LengthType l = (LengthType )len;
240
241 BBUF_ADD(reg, &l, SIZE_LENGTH);
242 return 0;
243 }
244
245 static int
add_mem_num(regex_t * reg,int num)246 add_mem_num(regex_t* reg, int num)
247 {
248 MemNumType n = (MemNumType )num;
249
250 BBUF_ADD(reg, &n, SIZE_MEMNUM);
251 return 0;
252 }
253
254 static int
add_pointer(regex_t * reg,void * addr)255 add_pointer(regex_t* reg, void* addr)
256 {
257 PointerType ptr = (PointerType )addr;
258
259 BBUF_ADD(reg, &ptr, SIZE_POINTER);
260 return 0;
261 }
262
263 static int
add_option(regex_t * reg,OnigOptionType option)264 add_option(regex_t* reg, OnigOptionType option)
265 {
266 BBUF_ADD(reg, &option, SIZE_OPTION);
267 return 0;
268 }
269
270 static int
add_opcode_rel_addr(regex_t * reg,int opcode,int addr)271 add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
272 {
273 int r;
274
275 r = add_opcode(reg, opcode);
276 if (r) return r;
277 r = add_rel_addr(reg, addr);
278 return r;
279 }
280
281 static int
add_bytes(regex_t * reg,UChar * bytes,int len)282 add_bytes(regex_t* reg, UChar* bytes, int len)
283 {
284 BBUF_ADD(reg, bytes, len);
285 return 0;
286 }
287
288 static int
add_bitset(regex_t * reg,BitSetRef bs)289 add_bitset(regex_t* reg, BitSetRef bs)
290 {
291 BBUF_ADD(reg, bs, SIZE_BITSET);
292 return 0;
293 }
294
295 static int
add_opcode_option(regex_t * reg,int opcode,OnigOptionType option)296 add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
297 {
298 int r;
299
300 r = add_opcode(reg, opcode);
301 if (r) return r;
302 r = add_option(reg, option);
303 return r;
304 }
305
306 static int compile_length_tree(Node* node, regex_t* reg);
307 static int compile_tree(Node* node, regex_t* reg);
308
309
310 #define IS_NEED_STR_LEN_OP_EXACT(op) \
311 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
312 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
313
314 static int
select_str_opcode(int mb_len,int str_len,int ignore_case)315 select_str_opcode(int mb_len, int str_len, int ignore_case)
316 {
317 int op;
318
319 if (ignore_case) {
320 switch (str_len) {
321 case 1: op = OP_EXACT1_IC; break;
322 default: op = OP_EXACTN_IC; break;
323 }
324 }
325 else {
326 switch (mb_len) {
327 case 1:
328 switch (str_len) {
329 case 1: op = OP_EXACT1; break;
330 case 2: op = OP_EXACT2; break;
331 case 3: op = OP_EXACT3; break;
332 case 4: op = OP_EXACT4; break;
333 case 5: op = OP_EXACT5; break;
334 default: op = OP_EXACTN; break;
335 }
336 break;
337
338 case 2:
339 switch (str_len) {
340 case 1: op = OP_EXACTMB2N1; break;
341 case 2: op = OP_EXACTMB2N2; break;
342 case 3: op = OP_EXACTMB2N3; break;
343 default: op = OP_EXACTMB2N; break;
344 }
345 break;
346
347 case 3:
348 op = OP_EXACTMB3N;
349 break;
350
351 default:
352 op = OP_EXACTMBN;
353 break;
354 }
355 }
356 return op;
357 }
358
359 static int
compile_tree_empty_check(Node * node,regex_t * reg,int empty_info)360 compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
361 {
362 int r;
363 int saved_num_null_check = reg->num_null_check;
364
365 if (empty_info != 0) {
366 r = add_opcode(reg, OP_NULL_CHECK_START);
367 if (r) return r;
368 r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
369 if (r) return r;
370 reg->num_null_check++;
371 }
372
373 r = compile_tree(node, reg);
374 if (r) return r;
375
376 if (empty_info != 0) {
377 if (empty_info == NQ_TARGET_IS_EMPTY)
378 r = add_opcode(reg, OP_NULL_CHECK_END);
379 else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
380 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
381 else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
382 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
383
384 if (r) return r;
385 r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
386 }
387 return r;
388 }
389
390 #ifdef USE_SUBEXP_CALL
391 static int
compile_call(CallNode * node,regex_t * reg)392 compile_call(CallNode* node, regex_t* reg)
393 {
394 int r;
395
396 r = add_opcode(reg, OP_CALL);
397 if (r) return r;
398 r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
399 node->target);
400 if (r) return r;
401 r = add_abs_addr(reg, 0 /*dummy addr.*/);
402 return r;
403 }
404 #endif
405
406 static int
compile_tree_n_times(Node * node,int n,regex_t * reg)407 compile_tree_n_times(Node* node, int n, regex_t* reg)
408 {
409 int i, r;
410
411 for (i = 0; i < n; i++) {
412 r = compile_tree(node, reg);
413 if (r) return r;
414 }
415 return 0;
416 }
417
418 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED,int ignore_case)419 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
420 regex_t* reg ARG_UNUSED, int ignore_case)
421 {
422 int len;
423 int op = select_str_opcode(mb_len, str_len, ignore_case);
424
425 len = SIZE_OPCODE;
426
427 if (op == OP_EXACTMBN) len += SIZE_LENGTH;
428 if (IS_NEED_STR_LEN_OP_EXACT(op))
429 len += SIZE_LENGTH;
430
431 len += mb_len * str_len;
432 return len;
433 }
434
435 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg,int ignore_case)436 add_compile_string(UChar* s, int mb_len, int str_len,
437 regex_t* reg, int ignore_case)
438 {
439 int op = select_str_opcode(mb_len, str_len, ignore_case);
440 add_opcode(reg, op);
441
442 if (op == OP_EXACTMBN)
443 add_length(reg, mb_len);
444
445 if (IS_NEED_STR_LEN_OP_EXACT(op)) {
446 if (op == OP_EXACTN_IC)
447 add_length(reg, mb_len * str_len);
448 else
449 add_length(reg, str_len);
450 }
451
452 add_bytes(reg, s, mb_len * str_len);
453 return 0;
454 }
455
456
457 static int
compile_length_string_node(Node * node,regex_t * reg)458 compile_length_string_node(Node* node, regex_t* reg)
459 {
460 int rlen, r, len, prev_len, slen, ambig;
461 OnigEncoding enc = reg->enc;
462 UChar *p, *prev;
463 StrNode* sn;
464
465 sn = NSTR(node);
466 if (sn->end <= sn->s)
467 return 0;
468
469 ambig = NSTRING_IS_AMBIG(node);
470
471 p = prev = sn->s;
472 SAFE_ENC_LEN(enc, p, sn->end, prev_len);
473 p += prev_len;
474 slen = 1;
475 rlen = 0;
476
477 for (; p < sn->end; ) {
478 SAFE_ENC_LEN(enc, p, sn->end, len);
479 if (len == prev_len) {
480 slen++;
481 }
482 else {
483 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
484 rlen += r;
485 prev = p;
486 slen = 1;
487 prev_len = len;
488 }
489 p += len;
490 }
491 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
492 rlen += r;
493 return rlen;
494 }
495
496 static int
compile_length_string_raw_node(StrNode * sn,regex_t * reg)497 compile_length_string_raw_node(StrNode* sn, regex_t* reg)
498 {
499 if (sn->end <= sn->s)
500 return 0;
501
502 return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
503 }
504
505 static int
compile_string_node(Node * node,regex_t * reg)506 compile_string_node(Node* node, regex_t* reg)
507 {
508 int r, len, prev_len, slen, ambig;
509 OnigEncoding enc = reg->enc;
510 UChar *p, *prev, *end;
511 StrNode* sn;
512
513 sn = NSTR(node);
514 if (sn->end <= sn->s)
515 return 0;
516
517 end = sn->end;
518 ambig = NSTRING_IS_AMBIG(node);
519
520 p = prev = sn->s;
521 SAFE_ENC_LEN(enc, p, end, prev_len);
522 p += prev_len;
523 slen = 1;
524
525 for (; p < end; ) {
526 SAFE_ENC_LEN(enc, p, end, len);
527 if (len == prev_len) {
528 slen++;
529 }
530 else {
531 r = add_compile_string(prev, prev_len, slen, reg, ambig);
532 if (r) return r;
533
534 prev = p;
535 slen = 1;
536 prev_len = len;
537 }
538
539 p += len;
540 }
541 return add_compile_string(prev, prev_len, slen, reg, ambig);
542 }
543
544 static int
compile_string_raw_node(StrNode * sn,regex_t * reg)545 compile_string_raw_node(StrNode* sn, regex_t* reg)
546 {
547 if (sn->end <= sn->s)
548 return 0;
549
550 return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
551 }
552
553 static int
add_multi_byte_cclass(BBuf * mbuf,regex_t * reg)554 add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
555 {
556 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
557 add_length(reg, mbuf->used);
558 return add_bytes(reg, mbuf->p, mbuf->used);
559 #else
560 int r, pad_size;
561 UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
562
563 GET_ALIGNMENT_PAD_SIZE(p, pad_size);
564 add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
565 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
566
567 r = add_bytes(reg, mbuf->p, mbuf->used);
568
569 /* padding for return value from compile_length_cclass_node() to be fix. */
570 pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
571 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
572 return r;
573 #endif
574 }
575
576 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)577 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
578 {
579 int len;
580
581 if (IS_NCCLASS_SHARE(cc)) {
582 len = SIZE_OPCODE + SIZE_POINTER;
583 return len;
584 }
585
586 if (IS_NULL(cc->mbuf)) {
587 len = SIZE_OPCODE + SIZE_BITSET;
588 }
589 else {
590 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
591 len = SIZE_OPCODE;
592 }
593 else {
594 len = SIZE_OPCODE + SIZE_BITSET;
595 }
596 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
597 len += SIZE_LENGTH + cc->mbuf->used;
598 #else
599 len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
600 #endif
601 }
602
603 return len;
604 }
605
606 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)607 compile_cclass_node(CClassNode* cc, regex_t* reg)
608 {
609 int r;
610
611 if (IS_NCCLASS_SHARE(cc)) {
612 add_opcode(reg, OP_CCLASS_NODE);
613 r = add_pointer(reg, cc);
614 return r;
615 }
616
617 if (IS_NULL(cc->mbuf)) {
618 if (IS_NCCLASS_NOT(cc))
619 add_opcode(reg, OP_CCLASS_NOT);
620 else
621 add_opcode(reg, OP_CCLASS);
622
623 r = add_bitset(reg, cc->bs);
624 }
625 else {
626 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
627 if (IS_NCCLASS_NOT(cc))
628 add_opcode(reg, OP_CCLASS_MB_NOT);
629 else
630 add_opcode(reg, OP_CCLASS_MB);
631
632 r = add_multi_byte_cclass(cc->mbuf, reg);
633 }
634 else {
635 if (IS_NCCLASS_NOT(cc))
636 add_opcode(reg, OP_CCLASS_MIX_NOT);
637 else
638 add_opcode(reg, OP_CCLASS_MIX);
639
640 r = add_bitset(reg, cc->bs);
641 if (r) return r;
642 r = add_multi_byte_cclass(cc->mbuf, reg);
643 }
644 }
645
646 return r;
647 }
648
649 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper)650 entry_repeat_range(regex_t* reg, int id, int lower, int upper)
651 {
652 #define REPEAT_RANGE_ALLOC 4
653
654 OnigRepeatRange* p;
655
656 if (reg->repeat_range_alloc == 0) {
657 p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
658 CHECK_NULL_RETURN_MEMERR(p);
659 reg->repeat_range = p;
660 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
661 }
662 else if (reg->repeat_range_alloc <= id) {
663 int n;
664 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
665 p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
666 sizeof(OnigRepeatRange) * n);
667 CHECK_NULL_RETURN_MEMERR(p);
668 reg->repeat_range = p;
669 reg->repeat_range_alloc = n;
670 }
671 else {
672 p = reg->repeat_range;
673 }
674
675 p[id].lower = lower;
676 p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
677 return 0;
678 }
679
680 static int
compile_range_repeat_node(QtfrNode * qn,int target_len,int empty_info,regex_t * reg)681 compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
682 regex_t* reg)
683 {
684 int r;
685 int num_repeat = reg->num_repeat;
686
687 r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
688 if (r) return r;
689 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
690 reg->num_repeat++;
691 if (r) return r;
692 r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
693 if (r) return r;
694
695 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
696 if (r) return r;
697
698 r = compile_tree_empty_check(qn->target, reg, empty_info);
699 if (r) return r;
700
701 if (
702 #ifdef USE_SUBEXP_CALL
703 reg->num_call > 0 ||
704 #endif
705 IS_QUANTIFIER_IN_REPEAT(qn)) {
706 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
707 }
708 else {
709 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
710 }
711 if (r) return r;
712 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
713 return r;
714 }
715
716 static int
is_anychar_star_quantifier(QtfrNode * qn)717 is_anychar_star_quantifier(QtfrNode* qn)
718 {
719 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
720 NTYPE(qn->target) == NT_CANY)
721 return 1;
722 else
723 return 0;
724 }
725
726 #define QUANTIFIER_EXPAND_LIMIT_SIZE 50
727 #define CKN_ON (ckn > 0)
728
729 #ifdef USE_COMBINATION_EXPLOSION_CHECK
730
731 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)732 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
733 {
734 int len, mod_tlen, cklen;
735 int ckn;
736 int infinite = IS_REPEAT_INFINITE(qn->upper);
737 int empty_info = qn->target_empty_info;
738 int tlen = compile_length_tree(qn->target, reg);
739
740 if (tlen < 0) return tlen;
741
742 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
743
744 cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
745
746 /* anychar repeat */
747 if (NTYPE(qn->target) == NT_CANY) {
748 if (qn->greedy && infinite) {
749 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
750 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
751 else
752 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
753 }
754 }
755
756 if (empty_info != 0)
757 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
758 else
759 mod_tlen = tlen;
760
761 if (infinite && qn->lower <= 1) {
762 if (qn->greedy) {
763 if (qn->lower == 1)
764 len = SIZE_OP_JUMP;
765 else
766 len = 0;
767
768 len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
769 }
770 else {
771 if (qn->lower == 0)
772 len = SIZE_OP_JUMP;
773 else
774 len = 0;
775
776 len += mod_tlen + SIZE_OP_PUSH + cklen;
777 }
778 }
779 else if (qn->upper == 0) {
780 if (qn->is_refered != 0) /* /(?<n>..){0}/ */
781 len = SIZE_OP_JUMP + tlen;
782 else
783 len = 0;
784 }
785 else if (qn->upper == 1 && qn->greedy) {
786 if (qn->lower == 0) {
787 if (CKN_ON) {
788 len = SIZE_OP_STATE_CHECK_PUSH + tlen;
789 }
790 else {
791 len = SIZE_OP_PUSH + tlen;
792 }
793 }
794 else {
795 len = tlen;
796 }
797 }
798 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
799 len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
800 }
801 else {
802 len = SIZE_OP_REPEAT_INC
803 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
804 if (CKN_ON)
805 len += SIZE_OP_STATE_CHECK;
806 }
807
808 return len;
809 }
810
811 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)812 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
813 {
814 int r, mod_tlen;
815 int ckn;
816 int infinite = IS_REPEAT_INFINITE(qn->upper);
817 int empty_info = qn->target_empty_info;
818 int tlen = compile_length_tree(qn->target, reg);
819
820 if (tlen < 0) return tlen;
821
822 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
823
824 if (is_anychar_star_quantifier(qn)) {
825 r = compile_tree_n_times(qn->target, qn->lower, reg);
826 if (r) return r;
827 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
828 if (IS_MULTILINE(reg->options))
829 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
830 else
831 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
832 if (r) return r;
833 if (CKN_ON) {
834 r = add_state_check_num(reg, ckn);
835 if (r) return r;
836 }
837
838 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
839 }
840 else {
841 if (IS_MULTILINE(reg->options)) {
842 r = add_opcode(reg, (CKN_ON ?
843 OP_STATE_CHECK_ANYCHAR_ML_STAR
844 : OP_ANYCHAR_ML_STAR));
845 }
846 else {
847 r = add_opcode(reg, (CKN_ON ?
848 OP_STATE_CHECK_ANYCHAR_STAR
849 : OP_ANYCHAR_STAR));
850 }
851 if (r) return r;
852 if (CKN_ON)
853 r = add_state_check_num(reg, ckn);
854
855 return r;
856 }
857 }
858
859 if (empty_info != 0)
860 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
861 else
862 mod_tlen = tlen;
863
864 if (infinite && qn->lower <= 1) {
865 if (qn->greedy) {
866 if (qn->lower == 1) {
867 r = add_opcode_rel_addr(reg, OP_JUMP,
868 (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
869 if (r) return r;
870 }
871
872 if (CKN_ON) {
873 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
874 if (r) return r;
875 r = add_state_check_num(reg, ckn);
876 if (r) return r;
877 r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
878 }
879 else {
880 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
881 }
882 if (r) return r;
883 r = compile_tree_empty_check(qn->target, reg, empty_info);
884 if (r) return r;
885 r = add_opcode_rel_addr(reg, OP_JUMP,
886 -(mod_tlen + (int )SIZE_OP_JUMP
887 + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
888 }
889 else {
890 if (qn->lower == 0) {
891 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
892 if (r) return r;
893 }
894 r = compile_tree_empty_check(qn->target, reg, empty_info);
895 if (r) return r;
896 if (CKN_ON) {
897 r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
898 if (r) return r;
899 r = add_state_check_num(reg, ckn);
900 if (r) return r;
901 r = add_rel_addr(reg,
902 -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
903 }
904 else
905 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
906 }
907 }
908 else if (qn->upper == 0) {
909 if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
910 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
911 if (r) return r;
912 r = compile_tree(qn->target, reg);
913 }
914 else
915 r = 0;
916 }
917 else if (qn->upper == 1 && qn->greedy) {
918 if (qn->lower == 0) {
919 if (CKN_ON) {
920 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
921 if (r) return r;
922 r = add_state_check_num(reg, ckn);
923 if (r) return r;
924 r = add_rel_addr(reg, tlen);
925 }
926 else {
927 r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
928 }
929 if (r) return r;
930 }
931
932 r = compile_tree(qn->target, reg);
933 }
934 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
935 if (CKN_ON) {
936 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
937 if (r) return r;
938 r = add_state_check_num(reg, ckn);
939 if (r) return r;
940 r = add_rel_addr(reg, SIZE_OP_JUMP);
941 }
942 else {
943 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
944 }
945
946 if (r) return r;
947 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
948 if (r) return r;
949 r = compile_tree(qn->target, reg);
950 }
951 else {
952 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
953 if (CKN_ON) {
954 if (r) return r;
955 r = add_opcode(reg, OP_STATE_CHECK);
956 if (r) return r;
957 r = add_state_check_num(reg, ckn);
958 }
959 }
960 return r;
961 }
962
963 #else /* USE_COMBINATION_EXPLOSION_CHECK */
964
965 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)966 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
967 {
968 int len, mod_tlen;
969 int infinite = IS_REPEAT_INFINITE(qn->upper);
970 int empty_info = qn->target_empty_info;
971 int tlen = compile_length_tree(qn->target, reg);
972
973 if (tlen < 0) return tlen;
974
975 /* anychar repeat */
976 if (NTYPE(qn->target) == NT_CANY) {
977 if (qn->greedy && infinite) {
978 if (IS_NOT_NULL(qn->next_head_exact))
979 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
980 else
981 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
982 }
983 }
984
985 if (empty_info != 0)
986 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
987 else
988 mod_tlen = tlen;
989
990 if (infinite &&
991 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
992 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
993 len = SIZE_OP_JUMP;
994 }
995 else {
996 len = tlen * qn->lower;
997 }
998
999 if (qn->greedy) {
1000 if (IS_NOT_NULL(qn->head_exact))
1001 len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1002 else if (IS_NOT_NULL(qn->next_head_exact))
1003 len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1004 else
1005 len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1006 }
1007 else
1008 len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1009 }
1010 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1011 len = SIZE_OP_JUMP + tlen;
1012 }
1013 else if (!infinite && qn->greedy &&
1014 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1015 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1016 len = tlen * qn->lower;
1017 len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1018 }
1019 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1020 len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1021 }
1022 else {
1023 len = SIZE_OP_REPEAT_INC
1024 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1025 }
1026
1027 return len;
1028 }
1029
1030 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)1031 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
1032 {
1033 int i, r, mod_tlen;
1034 int infinite = IS_REPEAT_INFINITE(qn->upper);
1035 int empty_info = qn->target_empty_info;
1036 int tlen = compile_length_tree(qn->target, reg);
1037
1038 if (tlen < 0) return tlen;
1039
1040 if (is_anychar_star_quantifier(qn)) {
1041 r = compile_tree_n_times(qn->target, qn->lower, reg);
1042 if (r) return r;
1043 if (IS_NOT_NULL(qn->next_head_exact)) {
1044 if (IS_MULTILINE(reg->options))
1045 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1046 else
1047 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1048 if (r) return r;
1049 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1050 }
1051 else {
1052 if (IS_MULTILINE(reg->options))
1053 return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1054 else
1055 return add_opcode(reg, OP_ANYCHAR_STAR);
1056 }
1057 }
1058
1059 if (empty_info != 0)
1060 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1061 else
1062 mod_tlen = tlen;
1063
1064 if (infinite &&
1065 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1066 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1067 if (qn->greedy) {
1068 if (IS_NOT_NULL(qn->head_exact))
1069 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1070 else if (IS_NOT_NULL(qn->next_head_exact))
1071 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1072 else
1073 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1074 }
1075 else {
1076 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1077 }
1078 if (r) return r;
1079 }
1080 else {
1081 r = compile_tree_n_times(qn->target, qn->lower, reg);
1082 if (r) return r;
1083 }
1084
1085 if (qn->greedy) {
1086 if (IS_NOT_NULL(qn->head_exact)) {
1087 r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1088 mod_tlen + SIZE_OP_JUMP);
1089 if (r) return r;
1090 add_bytes(reg, NSTR(qn->head_exact)->s, 1);
1091 r = compile_tree_empty_check(qn->target, reg, empty_info);
1092 if (r) return r;
1093 r = add_opcode_rel_addr(reg, OP_JUMP,
1094 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1095 }
1096 else if (IS_NOT_NULL(qn->next_head_exact)) {
1097 r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1098 mod_tlen + SIZE_OP_JUMP);
1099 if (r) return r;
1100 add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1101 r = compile_tree_empty_check(qn->target, reg, empty_info);
1102 if (r) return r;
1103 r = add_opcode_rel_addr(reg, OP_JUMP,
1104 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1105 }
1106 else {
1107 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1108 if (r) return r;
1109 r = compile_tree_empty_check(qn->target, reg, empty_info);
1110 if (r) return r;
1111 r = add_opcode_rel_addr(reg, OP_JUMP,
1112 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1113 }
1114 }
1115 else {
1116 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1117 if (r) return r;
1118 r = compile_tree_empty_check(qn->target, reg, empty_info);
1119 if (r) return r;
1120 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1121 }
1122 }
1123 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1124 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1125 if (r) return r;
1126 r = compile_tree(qn->target, reg);
1127 }
1128 else if (!infinite && qn->greedy &&
1129 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1130 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1131 int n = qn->upper - qn->lower;
1132
1133 r = compile_tree_n_times(qn->target, qn->lower, reg);
1134 if (r) return r;
1135
1136 for (i = 0; i < n; i++) {
1137 r = add_opcode_rel_addr(reg, OP_PUSH,
1138 (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1139 if (r) return r;
1140 r = compile_tree(qn->target, reg);
1141 if (r) return r;
1142 }
1143 }
1144 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1145 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1146 if (r) return r;
1147 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1148 if (r) return r;
1149 r = compile_tree(qn->target, reg);
1150 }
1151 else {
1152 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1153 }
1154 return r;
1155 }
1156 #endif /* USE_COMBINATION_EXPLOSION_CHECK */
1157
1158 static int
compile_length_option_node(EncloseNode * node,regex_t * reg)1159 compile_length_option_node(EncloseNode* node, regex_t* reg)
1160 {
1161 int tlen;
1162 OnigOptionType prev = reg->options;
1163
1164 reg->options = node->option;
1165 tlen = compile_length_tree(node->target, reg);
1166 reg->options = prev;
1167
1168 if (tlen < 0) return tlen;
1169
1170 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1171 return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1172 + tlen + SIZE_OP_SET_OPTION;
1173 }
1174 else
1175 return tlen;
1176 }
1177
1178 static int
compile_option_node(EncloseNode * node,regex_t * reg)1179 compile_option_node(EncloseNode* node, regex_t* reg)
1180 {
1181 int r;
1182 OnigOptionType prev = reg->options;
1183
1184 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1185 r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1186 if (r) return r;
1187 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1188 if (r) return r;
1189 r = add_opcode(reg, OP_FAIL);
1190 if (r) return r;
1191 }
1192
1193 reg->options = node->option;
1194 r = compile_tree(node->target, reg);
1195 reg->options = prev;
1196
1197 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1198 if (r) return r;
1199 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1200 }
1201 return r;
1202 }
1203
1204 static int
compile_length_enclose_node(EncloseNode * node,regex_t * reg)1205 compile_length_enclose_node(EncloseNode* node, regex_t* reg)
1206 {
1207 int len;
1208 int tlen;
1209
1210 if (node->type == ENCLOSE_OPTION)
1211 return compile_length_option_node(node, reg);
1212
1213 if (node->target) {
1214 tlen = compile_length_tree(node->target, reg);
1215 if (tlen < 0) return tlen;
1216 }
1217 else
1218 tlen = 0;
1219
1220 switch (node->type) {
1221 case ENCLOSE_MEMORY:
1222 #ifdef USE_SUBEXP_CALL
1223 if (IS_ENCLOSE_CALLED(node)) {
1224 len = SIZE_OP_MEMORY_START_PUSH + tlen
1225 + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1226 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1227 len += (IS_ENCLOSE_RECURSION(node)
1228 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1229 else
1230 len += (IS_ENCLOSE_RECURSION(node)
1231 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1232 }
1233 else
1234 #endif
1235 {
1236 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1237 len = SIZE_OP_MEMORY_START_PUSH;
1238 else
1239 len = SIZE_OP_MEMORY_START;
1240
1241 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1242 ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1243 }
1244 break;
1245
1246 case ENCLOSE_STOP_BACKTRACK:
1247 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1248 QtfrNode* qn = NQTFR(node->target);
1249 tlen = compile_length_tree(qn->target, reg);
1250 if (tlen < 0) return tlen;
1251
1252 len = tlen * qn->lower
1253 + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1254 }
1255 else {
1256 len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1257 }
1258 break;
1259
1260 default:
1261 return ONIGERR_TYPE_BUG;
1262 break;
1263 }
1264
1265 return len;
1266 }
1267
1268 static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1269
1270 static int
compile_enclose_node(EncloseNode * node,regex_t * reg)1271 compile_enclose_node(EncloseNode* node, regex_t* reg)
1272 {
1273 int r, len;
1274
1275 if (node->type == ENCLOSE_OPTION)
1276 return compile_option_node(node, reg);
1277
1278 switch (node->type) {
1279 case ENCLOSE_MEMORY:
1280 #ifdef USE_SUBEXP_CALL
1281 if (IS_ENCLOSE_CALLED(node)) {
1282 r = add_opcode(reg, OP_CALL);
1283 if (r) return r;
1284 node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1285 node->state |= NST_ADDR_FIXED;
1286 r = add_abs_addr(reg, (int )node->call_addr);
1287 if (r) return r;
1288 len = compile_length_tree(node->target, reg);
1289 len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1290 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1291 len += (IS_ENCLOSE_RECURSION(node)
1292 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1293 else
1294 len += (IS_ENCLOSE_RECURSION(node)
1295 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1296
1297 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1298 if (r) return r;
1299 }
1300 #endif
1301 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1302 r = add_opcode(reg, OP_MEMORY_START_PUSH);
1303 else
1304 r = add_opcode(reg, OP_MEMORY_START);
1305 if (r) return r;
1306 r = add_mem_num(reg, node->regnum);
1307 if (r) return r;
1308 r = compile_tree(node->target, reg);
1309 if (r) return r;
1310 #ifdef USE_SUBEXP_CALL
1311 if (IS_ENCLOSE_CALLED(node)) {
1312 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1313 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1314 ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1315 else
1316 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1317 ? OP_MEMORY_END_REC : OP_MEMORY_END));
1318
1319 if (r) return r;
1320 r = add_mem_num(reg, node->regnum);
1321 if (r) return r;
1322 r = add_opcode(reg, OP_RETURN);
1323 }
1324 else
1325 #endif
1326 {
1327 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1328 r = add_opcode(reg, OP_MEMORY_END_PUSH);
1329 else
1330 r = add_opcode(reg, OP_MEMORY_END);
1331 if (r) return r;
1332 r = add_mem_num(reg, node->regnum);
1333 }
1334 break;
1335
1336 case ENCLOSE_STOP_BACKTRACK:
1337 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1338 QtfrNode* qn = NQTFR(node->target);
1339 r = compile_tree_n_times(qn->target, qn->lower, reg);
1340 if (r) return r;
1341
1342 len = compile_length_tree(qn->target, reg);
1343 if (len < 0) return len;
1344
1345 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1346 if (r) return r;
1347 r = compile_tree(qn->target, reg);
1348 if (r) return r;
1349 r = add_opcode(reg, OP_POP);
1350 if (r) return r;
1351 r = add_opcode_rel_addr(reg, OP_JUMP,
1352 -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1353 }
1354 else {
1355 r = add_opcode(reg, OP_PUSH_STOP_BT);
1356 if (r) return r;
1357 r = compile_tree(node->target, reg);
1358 if (r) return r;
1359 r = add_opcode(reg, OP_POP_STOP_BT);
1360 }
1361 break;
1362
1363 default:
1364 return ONIGERR_TYPE_BUG;
1365 break;
1366 }
1367
1368 return r;
1369 }
1370
1371 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1372 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1373 {
1374 int len;
1375 int tlen = 0;
1376
1377 if (node->target) {
1378 tlen = compile_length_tree(node->target, reg);
1379 if (tlen < 0) return tlen;
1380 }
1381
1382 switch (node->type) {
1383 case ANCHOR_PREC_READ:
1384 len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1385 break;
1386 case ANCHOR_PREC_READ_NOT:
1387 len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1388 break;
1389 case ANCHOR_LOOK_BEHIND:
1390 len = SIZE_OP_LOOK_BEHIND + tlen;
1391 break;
1392 case ANCHOR_LOOK_BEHIND_NOT:
1393 len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1394 break;
1395
1396 default:
1397 len = SIZE_OPCODE;
1398 break;
1399 }
1400
1401 return len;
1402 }
1403
1404 static int
compile_anchor_node(AnchorNode * node,regex_t * reg)1405 compile_anchor_node(AnchorNode* node, regex_t* reg)
1406 {
1407 int r, len;
1408
1409 switch (node->type) {
1410 case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
1411 case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
1412 case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
1413 case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
1414 case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
1415 case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1416
1417 case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break;
1418 case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break;
1419 #ifdef USE_WORD_BEGIN_END
1420 case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break;
1421 case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break;
1422 #endif
1423
1424 case ANCHOR_PREC_READ:
1425 r = add_opcode(reg, OP_PUSH_POS);
1426 if (r) return r;
1427 r = compile_tree(node->target, reg);
1428 if (r) return r;
1429 r = add_opcode(reg, OP_POP_POS);
1430 break;
1431
1432 case ANCHOR_PREC_READ_NOT:
1433 len = compile_length_tree(node->target, reg);
1434 if (len < 0) return len;
1435 r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1436 if (r) return r;
1437 r = compile_tree(node->target, reg);
1438 if (r) return r;
1439 r = add_opcode(reg, OP_FAIL_POS);
1440 break;
1441
1442 case ANCHOR_LOOK_BEHIND:
1443 {
1444 int n;
1445 r = add_opcode(reg, OP_LOOK_BEHIND);
1446 if (r) return r;
1447 if (node->char_len < 0) {
1448 r = get_char_length_tree(node->target, reg, &n);
1449 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1450 }
1451 else
1452 n = node->char_len;
1453 r = add_length(reg, n);
1454 if (r) return r;
1455 r = compile_tree(node->target, reg);
1456 }
1457 break;
1458
1459 case ANCHOR_LOOK_BEHIND_NOT:
1460 {
1461 int n;
1462 len = compile_length_tree(node->target, reg);
1463 r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1464 len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1465 if (r) return r;
1466 if (node->char_len < 0) {
1467 r = get_char_length_tree(node->target, reg, &n);
1468 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1469 }
1470 else
1471 n = node->char_len;
1472 r = add_length(reg, n);
1473 if (r) return r;
1474 r = compile_tree(node->target, reg);
1475 if (r) return r;
1476 r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1477 }
1478 break;
1479
1480 default:
1481 return ONIGERR_TYPE_BUG;
1482 break;
1483 }
1484
1485 return r;
1486 }
1487
1488 static int
compile_length_tree(Node * node,regex_t * reg)1489 compile_length_tree(Node* node, regex_t* reg)
1490 {
1491 int len, type, r;
1492
1493 type = NTYPE(node);
1494 switch (type) {
1495 case NT_LIST:
1496 len = 0;
1497 do {
1498 r = compile_length_tree(NCAR(node), reg);
1499 if (r < 0) return r;
1500 len += r;
1501 } while (IS_NOT_NULL(node = NCDR(node)));
1502 r = len;
1503 break;
1504
1505 case NT_ALT:
1506 {
1507 int n;
1508
1509 n = r = 0;
1510 do {
1511 r += compile_length_tree(NCAR(node), reg);
1512 n++;
1513 } while (IS_NOT_NULL(node = NCDR(node)));
1514 r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1515 }
1516 break;
1517
1518 case NT_STR:
1519 if (NSTRING_IS_RAW(node))
1520 r = compile_length_string_raw_node(NSTR(node), reg);
1521 else
1522 r = compile_length_string_node(node, reg);
1523 break;
1524
1525 case NT_CCLASS:
1526 r = compile_length_cclass_node(NCCLASS(node), reg);
1527 break;
1528
1529 case NT_CTYPE:
1530 case NT_CANY:
1531 r = SIZE_OPCODE;
1532 break;
1533
1534 case NT_BREF:
1535 {
1536 BRefNode* br = NBREF(node);
1537
1538 #ifdef USE_BACKREF_WITH_LEVEL
1539 if (IS_BACKREF_NEST_LEVEL(br)) {
1540 r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1541 SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1542 }
1543 else
1544 #endif
1545 if (br->back_num == 1) {
1546 r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1547 ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1548 }
1549 else {
1550 r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1551 }
1552 }
1553 break;
1554
1555 #ifdef USE_SUBEXP_CALL
1556 case NT_CALL:
1557 r = SIZE_OP_CALL;
1558 break;
1559 #endif
1560
1561 case NT_QTFR:
1562 r = compile_length_quantifier_node(NQTFR(node), reg);
1563 break;
1564
1565 case NT_ENCLOSE:
1566 r = compile_length_enclose_node(NENCLOSE(node), reg);
1567 break;
1568
1569 case NT_ANCHOR:
1570 r = compile_length_anchor_node(NANCHOR(node), reg);
1571 break;
1572
1573 default:
1574 return ONIGERR_TYPE_BUG;
1575 break;
1576 }
1577
1578 return r;
1579 }
1580
1581 static int
compile_tree(Node * node,regex_t * reg)1582 compile_tree(Node* node, regex_t* reg)
1583 {
1584 int n, type, len, pos, r = 0;
1585
1586 type = NTYPE(node);
1587 switch (type) {
1588 case NT_LIST:
1589 do {
1590 r = compile_tree(NCAR(node), reg);
1591 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1592 break;
1593
1594 case NT_ALT:
1595 {
1596 Node* x = node;
1597 len = 0;
1598 do {
1599 len += compile_length_tree(NCAR(x), reg);
1600 if (NCDR(x) != NULL) {
1601 len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1602 }
1603 } while (IS_NOT_NULL(x = NCDR(x)));
1604 pos = reg->used + len; /* goal position */
1605
1606 do {
1607 len = compile_length_tree(NCAR(node), reg);
1608 if (IS_NOT_NULL(NCDR(node))) {
1609 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1610 if (r) break;
1611 }
1612 r = compile_tree(NCAR(node), reg);
1613 if (r) break;
1614 if (IS_NOT_NULL(NCDR(node))) {
1615 len = pos - (reg->used + SIZE_OP_JUMP);
1616 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1617 if (r) break;
1618 }
1619 } while (IS_NOT_NULL(node = NCDR(node)));
1620 }
1621 break;
1622
1623 case NT_STR:
1624 if (NSTRING_IS_RAW(node))
1625 r = compile_string_raw_node(NSTR(node), reg);
1626 else
1627 r = compile_string_node(node, reg);
1628 break;
1629
1630 case NT_CCLASS:
1631 r = compile_cclass_node(NCCLASS(node), reg);
1632 break;
1633
1634 case NT_CTYPE:
1635 {
1636 int op;
1637
1638 switch (NCTYPE(node)->ctype) {
1639 case ONIGENC_CTYPE_WORD:
1640 if (NCTYPE(node)->not != 0) op = OP_NOT_WORD;
1641 else op = OP_WORD;
1642 break;
1643 default:
1644 return ONIGERR_TYPE_BUG;
1645 break;
1646 }
1647 r = add_opcode(reg, op);
1648 }
1649 break;
1650
1651 case NT_CANY:
1652 if (IS_MULTILINE(reg->options))
1653 r = add_opcode(reg, OP_ANYCHAR_ML);
1654 else
1655 r = add_opcode(reg, OP_ANYCHAR);
1656 break;
1657
1658 case NT_BREF:
1659 {
1660 BRefNode* br = NBREF(node);
1661
1662 #ifdef USE_BACKREF_WITH_LEVEL
1663 if (IS_BACKREF_NEST_LEVEL(br)) {
1664 r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
1665 if (r) return r;
1666 r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1667 if (r) return r;
1668 r = add_length(reg, br->nest_level);
1669 if (r) return r;
1670
1671 goto add_bacref_mems;
1672 }
1673 else
1674 #endif
1675 if (br->back_num == 1) {
1676 n = br->back_static[0];
1677 if (IS_IGNORECASE(reg->options)) {
1678 r = add_opcode(reg, OP_BACKREFN_IC);
1679 if (r) return r;
1680 r = add_mem_num(reg, n);
1681 }
1682 else {
1683 switch (n) {
1684 case 1: r = add_opcode(reg, OP_BACKREF1); break;
1685 case 2: r = add_opcode(reg, OP_BACKREF2); break;
1686 default:
1687 r = add_opcode(reg, OP_BACKREFN);
1688 if (r) return r;
1689 r = add_mem_num(reg, n);
1690 break;
1691 }
1692 }
1693 }
1694 else {
1695 int i;
1696 int* p;
1697
1698 if (IS_IGNORECASE(reg->options)) {
1699 r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1700 }
1701 else {
1702 r = add_opcode(reg, OP_BACKREF_MULTI);
1703 }
1704 if (r) return r;
1705
1706 #ifdef USE_BACKREF_WITH_LEVEL
1707 add_bacref_mems:
1708 #endif
1709 r = add_length(reg, br->back_num);
1710 if (r) return r;
1711 p = BACKREFS_P(br);
1712 for (i = br->back_num - 1; i >= 0; i--) {
1713 r = add_mem_num(reg, p[i]);
1714 if (r) return r;
1715 }
1716 }
1717 }
1718 break;
1719
1720 #ifdef USE_SUBEXP_CALL
1721 case NT_CALL:
1722 r = compile_call(NCALL(node), reg);
1723 break;
1724 #endif
1725
1726 case NT_QTFR:
1727 r = compile_quantifier_node(NQTFR(node), reg);
1728 break;
1729
1730 case NT_ENCLOSE:
1731 r = compile_enclose_node(NENCLOSE(node), reg);
1732 break;
1733
1734 case NT_ANCHOR:
1735 r = compile_anchor_node(NANCHOR(node), reg);
1736 break;
1737
1738 default:
1739 #ifdef ONIG_DEBUG
1740 fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1741 #endif
1742 break;
1743 }
1744
1745 return r;
1746 }
1747
1748 #ifdef USE_NAMED_GROUP
1749
1750 static int
noname_disable_map(Node ** plink,GroupNumRemap * map,int * counter)1751 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1752 {
1753 int r = 0;
1754 Node* node = *plink;
1755
1756 switch (NTYPE(node)) {
1757 case NT_LIST:
1758 case NT_ALT:
1759 do {
1760 r = noname_disable_map(&(NCAR(node)), map, counter);
1761 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1762 break;
1763
1764 case NT_QTFR:
1765 {
1766 Node** ptarget = &(NQTFR(node)->target);
1767 Node* old = *ptarget;
1768 r = noname_disable_map(ptarget, map, counter);
1769 if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
1770 onig_reduce_nested_quantifier(node, *ptarget);
1771 }
1772 }
1773 break;
1774
1775 case NT_ENCLOSE:
1776 {
1777 EncloseNode* en = NENCLOSE(node);
1778 if (en->type == ENCLOSE_MEMORY) {
1779 if (IS_ENCLOSE_NAMED_GROUP(en)) {
1780 (*counter)++;
1781 map[en->regnum].new_val = *counter;
1782 en->regnum = *counter;
1783 r = noname_disable_map(&(en->target), map, counter);
1784 }
1785 else {
1786 *plink = en->target;
1787 en->target = NULL_NODE;
1788 onig_node_free(node);
1789 r = noname_disable_map(plink, map, counter);
1790 }
1791 }
1792 else
1793 r = noname_disable_map(&(en->target), map, counter);
1794 }
1795 break;
1796
1797 default:
1798 break;
1799 }
1800
1801 return r;
1802 }
1803
1804 static int
renumber_node_backref(Node * node,GroupNumRemap * map)1805 renumber_node_backref(Node* node, GroupNumRemap* map)
1806 {
1807 int i, pos, n, old_num;
1808 int *backs;
1809 BRefNode* bn = NBREF(node);
1810
1811 if (! IS_BACKREF_NAME_REF(bn))
1812 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1813
1814 old_num = bn->back_num;
1815 if (IS_NULL(bn->back_dynamic))
1816 backs = bn->back_static;
1817 else
1818 backs = bn->back_dynamic;
1819
1820 for (i = 0, pos = 0; i < old_num; i++) {
1821 n = map[backs[i]].new_val;
1822 if (n > 0) {
1823 backs[pos] = n;
1824 pos++;
1825 }
1826 }
1827
1828 bn->back_num = pos;
1829 return 0;
1830 }
1831
1832 static int
renumber_by_map(Node * node,GroupNumRemap * map)1833 renumber_by_map(Node* node, GroupNumRemap* map)
1834 {
1835 int r = 0;
1836
1837 switch (NTYPE(node)) {
1838 case NT_LIST:
1839 case NT_ALT:
1840 do {
1841 r = renumber_by_map(NCAR(node), map);
1842 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1843 break;
1844 case NT_QTFR:
1845 r = renumber_by_map(NQTFR(node)->target, map);
1846 break;
1847 case NT_ENCLOSE:
1848 r = renumber_by_map(NENCLOSE(node)->target, map);
1849 break;
1850
1851 case NT_BREF:
1852 r = renumber_node_backref(node, map);
1853 break;
1854
1855 default:
1856 break;
1857 }
1858
1859 return r;
1860 }
1861
1862 static int
numbered_ref_check(Node * node)1863 numbered_ref_check(Node* node)
1864 {
1865 int r = 0;
1866
1867 switch (NTYPE(node)) {
1868 case NT_LIST:
1869 case NT_ALT:
1870 do {
1871 r = numbered_ref_check(NCAR(node));
1872 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1873 break;
1874 case NT_QTFR:
1875 r = numbered_ref_check(NQTFR(node)->target);
1876 break;
1877 case NT_ENCLOSE:
1878 r = numbered_ref_check(NENCLOSE(node)->target);
1879 break;
1880
1881 case NT_BREF:
1882 if (! IS_BACKREF_NAME_REF(NBREF(node)))
1883 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1884 break;
1885
1886 default:
1887 break;
1888 }
1889
1890 return r;
1891 }
1892
1893 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ScanEnv * env)1894 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
1895 {
1896 int r, i, pos, counter;
1897 BitStatusType loc;
1898 GroupNumRemap* map;
1899
1900 map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
1901 CHECK_NULL_RETURN_MEMERR(map);
1902 for (i = 1; i <= env->num_mem; i++) {
1903 map[i].new_val = 0;
1904 }
1905 counter = 0;
1906 r = noname_disable_map(root, map, &counter);
1907 if (r != 0) return r;
1908
1909 r = renumber_by_map(*root, map);
1910 if (r != 0) return r;
1911
1912 for (i = 1, pos = 1; i <= env->num_mem; i++) {
1913 if (map[i].new_val > 0) {
1914 SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
1915 pos++;
1916 }
1917 }
1918
1919 loc = env->capture_history;
1920 BIT_STATUS_CLEAR(env->capture_history);
1921 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
1922 if (BIT_STATUS_AT(loc, i)) {
1923 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
1924 }
1925 }
1926
1927 env->num_mem = env->num_named;
1928 reg->num_mem = env->num_named;
1929
1930 return onig_renumber_name_table(reg, map);
1931 }
1932 #endif /* USE_NAMED_GROUP */
1933
1934 #ifdef USE_SUBEXP_CALL
1935 static int
unset_addr_list_fix(UnsetAddrList * uslist,regex_t * reg)1936 unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
1937 {
1938 int i, offset;
1939 EncloseNode* en;
1940 AbsAddrType addr;
1941
1942 for (i = 0; i < uslist->num; i++) {
1943 en = NENCLOSE(uslist->us[i].target);
1944 if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
1945 addr = en->call_addr;
1946 offset = uslist->us[i].offset;
1947
1948 BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
1949 }
1950 return 0;
1951 }
1952 #endif
1953
1954 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
1955 static int
quantifiers_memory_node_info(Node * node)1956 quantifiers_memory_node_info(Node* node)
1957 {
1958 int r = 0;
1959
1960 switch (NTYPE(node)) {
1961 case NT_LIST:
1962 case NT_ALT:
1963 {
1964 int v;
1965 do {
1966 v = quantifiers_memory_node_info(NCAR(node));
1967 if (v > r) r = v;
1968 } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
1969 }
1970 break;
1971
1972 #ifdef USE_SUBEXP_CALL
1973 case NT_CALL:
1974 if (IS_CALL_RECURSION(NCALL(node))) {
1975 return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
1976 }
1977 else
1978 r = quantifiers_memory_node_info(NCALL(node)->target);
1979 break;
1980 #endif
1981
1982 case NT_QTFR:
1983 {
1984 QtfrNode* qn = NQTFR(node);
1985 if (qn->upper != 0) {
1986 r = quantifiers_memory_node_info(qn->target);
1987 }
1988 }
1989 break;
1990
1991 case NT_ENCLOSE:
1992 {
1993 EncloseNode* en = NENCLOSE(node);
1994 switch (en->type) {
1995 case ENCLOSE_MEMORY:
1996 return NQ_TARGET_IS_EMPTY_MEM;
1997 break;
1998
1999 case ENCLOSE_OPTION:
2000 case ENCLOSE_STOP_BACKTRACK:
2001 r = quantifiers_memory_node_info(en->target);
2002 break;
2003 default:
2004 break;
2005 }
2006 }
2007 break;
2008
2009 case NT_BREF:
2010 case NT_STR:
2011 case NT_CTYPE:
2012 case NT_CCLASS:
2013 case NT_CANY:
2014 case NT_ANCHOR:
2015 default:
2016 break;
2017 }
2018
2019 return r;
2020 }
2021 #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
2022
2023 static int
get_min_match_length(Node * node,OnigDistance * min,ScanEnv * env)2024 get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
2025 {
2026 OnigDistance tmin;
2027 int r = 0;
2028
2029 *min = 0;
2030 switch (NTYPE(node)) {
2031 case NT_BREF:
2032 {
2033 int i;
2034 int* backs;
2035 Node** nodes = SCANENV_MEM_NODES(env);
2036 BRefNode* br = NBREF(node);
2037 if (br->state & NST_RECURSION) break;
2038
2039 backs = BACKREFS_P(br);
2040 if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2041 r = get_min_match_length(nodes[backs[0]], min, env);
2042 if (r != 0) break;
2043 for (i = 1; i < br->back_num; i++) {
2044 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2045 r = get_min_match_length(nodes[backs[i]], &tmin, env);
2046 if (r != 0) break;
2047 if (*min > tmin) *min = tmin;
2048 }
2049 }
2050 break;
2051
2052 #ifdef USE_SUBEXP_CALL
2053 case NT_CALL:
2054 if (IS_CALL_RECURSION(NCALL(node))) {
2055 EncloseNode* en = NENCLOSE(NCALL(node)->target);
2056 if (IS_ENCLOSE_MIN_FIXED(en))
2057 *min = en->min_len;
2058 }
2059 else
2060 r = get_min_match_length(NCALL(node)->target, min, env);
2061 break;
2062 #endif
2063
2064 case NT_LIST:
2065 do {
2066 r = get_min_match_length(NCAR(node), &tmin, env);
2067 if (r == 0) *min += tmin;
2068 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2069 break;
2070
2071 case NT_ALT:
2072 {
2073 Node *x, *y;
2074 y = node;
2075 do {
2076 x = NCAR(y);
2077 r = get_min_match_length(x, &tmin, env);
2078 if (r != 0) break;
2079 if (y == node) *min = tmin;
2080 else if (*min > tmin) *min = tmin;
2081 } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
2082 }
2083 break;
2084
2085 case NT_STR:
2086 {
2087 StrNode* sn = NSTR(node);
2088 *min = sn->end - sn->s;
2089 }
2090 break;
2091
2092 case NT_CTYPE:
2093 *min = 1;
2094 break;
2095
2096 case NT_CCLASS:
2097 case NT_CANY:
2098 *min = 1;
2099 break;
2100
2101 case NT_QTFR:
2102 {
2103 QtfrNode* qn = NQTFR(node);
2104
2105 if (qn->lower > 0) {
2106 r = get_min_match_length(qn->target, min, env);
2107 if (r == 0)
2108 *min = distance_multiply(*min, qn->lower);
2109 }
2110 }
2111 break;
2112
2113 case NT_ENCLOSE:
2114 {
2115 EncloseNode* en = NENCLOSE(node);
2116 switch (en->type) {
2117 case ENCLOSE_MEMORY:
2118 #ifdef USE_SUBEXP_CALL
2119 if (IS_ENCLOSE_MIN_FIXED(en))
2120 *min = en->min_len;
2121 else {
2122 r = get_min_match_length(en->target, min, env);
2123 if (r == 0) {
2124 en->min_len = *min;
2125 SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
2126 }
2127 }
2128 break;
2129 #endif
2130 case ENCLOSE_OPTION:
2131 case ENCLOSE_STOP_BACKTRACK:
2132 r = get_min_match_length(en->target, min, env);
2133 break;
2134 }
2135 }
2136 break;
2137
2138 case NT_ANCHOR:
2139 default:
2140 break;
2141 }
2142
2143 return r;
2144 }
2145
2146 static int
get_max_match_length(Node * node,OnigDistance * max,ScanEnv * env)2147 get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
2148 {
2149 OnigDistance tmax;
2150 int r = 0;
2151
2152 *max = 0;
2153 switch (NTYPE(node)) {
2154 case NT_LIST:
2155 do {
2156 r = get_max_match_length(NCAR(node), &tmax, env);
2157 if (r == 0)
2158 *max = distance_add(*max, tmax);
2159 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2160 break;
2161
2162 case NT_ALT:
2163 do {
2164 r = get_max_match_length(NCAR(node), &tmax, env);
2165 if (r == 0 && *max < tmax) *max = tmax;
2166 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2167 break;
2168
2169 case NT_STR:
2170 {
2171 StrNode* sn = NSTR(node);
2172 *max = sn->end - sn->s;
2173 }
2174 break;
2175
2176 case NT_CTYPE:
2177 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2178 break;
2179
2180 case NT_CCLASS:
2181 case NT_CANY:
2182 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2183 break;
2184
2185 case NT_BREF:
2186 {
2187 int i;
2188 int* backs;
2189 Node** nodes = SCANENV_MEM_NODES(env);
2190 BRefNode* br = NBREF(node);
2191 if (br->state & NST_RECURSION) {
2192 *max = ONIG_INFINITE_DISTANCE;
2193 break;
2194 }
2195 backs = BACKREFS_P(br);
2196 for (i = 0; i < br->back_num; i++) {
2197 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2198 r = get_max_match_length(nodes[backs[i]], &tmax, env);
2199 if (r != 0) break;
2200 if (*max < tmax) *max = tmax;
2201 }
2202 }
2203 break;
2204
2205 #ifdef USE_SUBEXP_CALL
2206 case NT_CALL:
2207 if (! IS_CALL_RECURSION(NCALL(node)))
2208 r = get_max_match_length(NCALL(node)->target, max, env);
2209 else
2210 *max = ONIG_INFINITE_DISTANCE;
2211 break;
2212 #endif
2213
2214 case NT_QTFR:
2215 {
2216 QtfrNode* qn = NQTFR(node);
2217
2218 if (qn->upper != 0) {
2219 r = get_max_match_length(qn->target, max, env);
2220 if (r == 0 && *max != 0) {
2221 if (! IS_REPEAT_INFINITE(qn->upper))
2222 *max = distance_multiply(*max, qn->upper);
2223 else
2224 *max = ONIG_INFINITE_DISTANCE;
2225 }
2226 }
2227 }
2228 break;
2229
2230 case NT_ENCLOSE:
2231 {
2232 EncloseNode* en = NENCLOSE(node);
2233 switch (en->type) {
2234 case ENCLOSE_MEMORY:
2235 #ifdef USE_SUBEXP_CALL
2236 if (IS_ENCLOSE_MAX_FIXED(en))
2237 *max = en->max_len;
2238 else {
2239 r = get_max_match_length(en->target, max, env);
2240 if (r == 0) {
2241 en->max_len = *max;
2242 SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
2243 }
2244 }
2245 break;
2246 #endif
2247 case ENCLOSE_OPTION:
2248 case ENCLOSE_STOP_BACKTRACK:
2249 r = get_max_match_length(en->target, max, env);
2250 break;
2251 }
2252 }
2253 break;
2254
2255 case NT_ANCHOR:
2256 default:
2257 break;
2258 }
2259
2260 return r;
2261 }
2262
2263 #define GET_CHAR_LEN_VARLEN -1
2264 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2265
2266 /* fixed size pattern node only */
2267 static int
get_char_length_tree1(Node * node,regex_t * reg,int * len,int level)2268 get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2269 {
2270 int tlen;
2271 int r = 0;
2272
2273 level++;
2274 *len = 0;
2275 switch (NTYPE(node)) {
2276 case NT_LIST:
2277 do {
2278 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2279 if (r == 0)
2280 *len = distance_add(*len, tlen);
2281 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2282 break;
2283
2284 case NT_ALT:
2285 {
2286 int tlen2;
2287 int varlen = 0;
2288
2289 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2290 while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
2291 r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
2292 if (r == 0) {
2293 if (tlen != tlen2)
2294 varlen = 1;
2295 }
2296 }
2297 if (r == 0) {
2298 if (varlen != 0) {
2299 if (level == 1)
2300 r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2301 else
2302 r = GET_CHAR_LEN_VARLEN;
2303 }
2304 else
2305 *len = tlen;
2306 }
2307 }
2308 break;
2309
2310 case NT_STR:
2311 {
2312 StrNode* sn = NSTR(node);
2313 UChar *s = sn->s;
2314 while (s < sn->end) {
2315 s += enclen(reg->enc, s);
2316 (*len)++;
2317 }
2318 }
2319 break;
2320
2321 case NT_QTFR:
2322 {
2323 QtfrNode* qn = NQTFR(node);
2324 if (qn->lower == qn->upper) {
2325 r = get_char_length_tree1(qn->target, reg, &tlen, level);
2326 if (r == 0)
2327 *len = distance_multiply(tlen, qn->lower);
2328 }
2329 else
2330 r = GET_CHAR_LEN_VARLEN;
2331 }
2332 break;
2333
2334 #ifdef USE_SUBEXP_CALL
2335 case NT_CALL:
2336 if (! IS_CALL_RECURSION(NCALL(node)))
2337 r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
2338 else
2339 r = GET_CHAR_LEN_VARLEN;
2340 break;
2341 #endif
2342
2343 case NT_CTYPE:
2344 *len = 1;
2345 break;
2346
2347 case NT_CCLASS:
2348 case NT_CANY:
2349 *len = 1;
2350 break;
2351
2352 case NT_ENCLOSE:
2353 {
2354 EncloseNode* en = NENCLOSE(node);
2355 switch (en->type) {
2356 case ENCLOSE_MEMORY:
2357 #ifdef USE_SUBEXP_CALL
2358 if (IS_ENCLOSE_CLEN_FIXED(en))
2359 *len = en->char_len;
2360 else {
2361 r = get_char_length_tree1(en->target, reg, len, level);
2362 if (r == 0) {
2363 en->char_len = *len;
2364 SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
2365 }
2366 }
2367 break;
2368 #endif
2369 case ENCLOSE_OPTION:
2370 case ENCLOSE_STOP_BACKTRACK:
2371 r = get_char_length_tree1(en->target, reg, len, level);
2372 break;
2373 default:
2374 break;
2375 }
2376 }
2377 break;
2378
2379 case NT_ANCHOR:
2380 break;
2381
2382 default:
2383 r = GET_CHAR_LEN_VARLEN;
2384 break;
2385 }
2386
2387 return r;
2388 }
2389
2390 static int
get_char_length_tree(Node * node,regex_t * reg,int * len)2391 get_char_length_tree(Node* node, regex_t* reg, int* len)
2392 {
2393 return get_char_length_tree1(node, reg, len, 0);
2394 }
2395
2396 /* x is not included y ==> 1 : 0 */
2397 static int
is_not_included(Node * x,Node * y,regex_t * reg)2398 is_not_included(Node* x, Node* y, regex_t* reg)
2399 {
2400 int i, len;
2401 OnigCodePoint code;
2402 UChar *p;
2403 int ytype;
2404
2405 retry:
2406 ytype = NTYPE(y);
2407 switch (NTYPE(x)) {
2408 case NT_CTYPE:
2409 {
2410 switch (ytype) {
2411 case NT_CTYPE:
2412 if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
2413 NCTYPE(y)->not != NCTYPE(x)->not)
2414 return 1;
2415 else
2416 return 0;
2417 break;
2418
2419 case NT_CCLASS:
2420 swap:
2421 {
2422 Node* tmp;
2423 tmp = x; x = y; y = tmp;
2424 goto retry;
2425 }
2426 break;
2427
2428 case NT_STR:
2429 goto swap;
2430 break;
2431
2432 default:
2433 break;
2434 }
2435 }
2436 break;
2437
2438 case NT_CCLASS:
2439 {
2440 CClassNode* xc = NCCLASS(x);
2441 switch (ytype) {
2442 case NT_CTYPE:
2443 switch (NCTYPE(y)->ctype) {
2444 case ONIGENC_CTYPE_WORD:
2445 if (NCTYPE(y)->not == 0) {
2446 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2447 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2448 if (BITSET_AT(xc->bs, i)) {
2449 if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
2450 }
2451 }
2452 return 1;
2453 }
2454 return 0;
2455 }
2456 else {
2457 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2458 if (! IS_CODE_SB_WORD(reg->enc, i)) {
2459 if (!IS_NCCLASS_NOT(xc)) {
2460 if (BITSET_AT(xc->bs, i))
2461 return 0;
2462 }
2463 else {
2464 if (! BITSET_AT(xc->bs, i))
2465 return 0;
2466 }
2467 }
2468 }
2469 return 1;
2470 }
2471 break;
2472
2473 default:
2474 break;
2475 }
2476 break;
2477
2478 case NT_CCLASS:
2479 {
2480 int v;
2481 CClassNode* yc = NCCLASS(y);
2482
2483 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2484 v = BITSET_AT(xc->bs, i);
2485 if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
2486 (v == 0 && IS_NCCLASS_NOT(xc))) {
2487 v = BITSET_AT(yc->bs, i);
2488 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2489 (v == 0 && IS_NCCLASS_NOT(yc)))
2490 return 0;
2491 }
2492 }
2493 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2494 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2495 return 1;
2496 return 0;
2497 }
2498 break;
2499
2500 case NT_STR:
2501 goto swap;
2502 break;
2503
2504 default:
2505 break;
2506 }
2507 }
2508 break;
2509
2510 case NT_STR:
2511 {
2512 StrNode* xs = NSTR(x);
2513 if (NSTRING_LEN(x) == 0)
2514 break;
2515
2516 //c = *(xs->s);
2517 switch (ytype) {
2518 case NT_CTYPE:
2519 switch (NCTYPE(y)->ctype) {
2520 case ONIGENC_CTYPE_WORD:
2521 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2522 return NCTYPE(y)->not;
2523 else
2524 return !(NCTYPE(y)->not);
2525 break;
2526 default:
2527 break;
2528 }
2529 break;
2530
2531 case NT_CCLASS:
2532 {
2533 CClassNode* cc = NCCLASS(y);
2534
2535 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2536 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2537 return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2538 }
2539 break;
2540
2541 case NT_STR:
2542 {
2543 UChar *q;
2544 StrNode* ys = NSTR(y);
2545 len = NSTRING_LEN(x);
2546 if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2547 if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2548 /* tiny version */
2549 return 0;
2550 }
2551 else {
2552 for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2553 if (*p != *q) return 1;
2554 }
2555 }
2556 }
2557 break;
2558
2559 default:
2560 break;
2561 }
2562 }
2563 break;
2564
2565 default:
2566 break;
2567 }
2568
2569 return 0;
2570 }
2571
2572 static Node*
get_head_value_node(Node * node,int exact,regex_t * reg)2573 get_head_value_node(Node* node, int exact, regex_t* reg)
2574 {
2575 Node* n = NULL_NODE;
2576
2577 switch (NTYPE(node)) {
2578 case NT_BREF:
2579 case NT_ALT:
2580 case NT_CANY:
2581 #ifdef USE_SUBEXP_CALL
2582 case NT_CALL:
2583 #endif
2584 break;
2585
2586 case NT_CTYPE:
2587 case NT_CCLASS:
2588 if (exact == 0) {
2589 n = node;
2590 }
2591 break;
2592
2593 case NT_LIST:
2594 n = get_head_value_node(NCAR(node), exact, reg);
2595 break;
2596
2597 case NT_STR:
2598 {
2599 StrNode* sn = NSTR(node);
2600
2601 if (sn->end <= sn->s)
2602 break;
2603
2604 if (exact != 0 &&
2605 !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
2606 }
2607 else {
2608 n = node;
2609 }
2610 }
2611 break;
2612
2613 case NT_QTFR:
2614 {
2615 QtfrNode* qn = NQTFR(node);
2616 if (qn->lower > 0) {
2617 if (IS_NOT_NULL(qn->head_exact))
2618 n = qn->head_exact;
2619 else
2620 n = get_head_value_node(qn->target, exact, reg);
2621 }
2622 }
2623 break;
2624
2625 case NT_ENCLOSE:
2626 {
2627 EncloseNode* en = NENCLOSE(node);
2628 switch (en->type) {
2629 case ENCLOSE_OPTION:
2630 {
2631 OnigOptionType options = reg->options;
2632
2633 reg->options = NENCLOSE(node)->option;
2634 n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
2635 reg->options = options;
2636 }
2637 break;
2638
2639 case ENCLOSE_MEMORY:
2640 case ENCLOSE_STOP_BACKTRACK:
2641 n = get_head_value_node(en->target, exact, reg);
2642 break;
2643 }
2644 }
2645 break;
2646
2647 case NT_ANCHOR:
2648 if (NANCHOR(node)->type == ANCHOR_PREC_READ)
2649 n = get_head_value_node(NANCHOR(node)->target, exact, reg);
2650 break;
2651
2652 default:
2653 break;
2654 }
2655
2656 return n;
2657 }
2658
2659 static int
check_type_tree(Node * node,int type_mask,int enclose_mask,int anchor_mask)2660 check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
2661 {
2662 int type, r = 0;
2663
2664 type = NTYPE(node);
2665 if ((NTYPE2BIT(type) & type_mask) == 0)
2666 return 1;
2667
2668 switch (type) {
2669 case NT_LIST:
2670 case NT_ALT:
2671 do {
2672 r = check_type_tree(NCAR(node), type_mask, enclose_mask,
2673 anchor_mask);
2674 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2675 break;
2676
2677 case NT_QTFR:
2678 r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
2679 anchor_mask);
2680 break;
2681
2682 case NT_ENCLOSE:
2683 {
2684 EncloseNode* en = NENCLOSE(node);
2685 if ((en->type & enclose_mask) == 0)
2686 return 1;
2687
2688 r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
2689 }
2690 break;
2691
2692 case NT_ANCHOR:
2693 type = NANCHOR(node)->type;
2694 if ((type & anchor_mask) == 0)
2695 return 1;
2696
2697 if (NANCHOR(node)->target)
2698 r = check_type_tree(NANCHOR(node)->target,
2699 type_mask, enclose_mask, anchor_mask);
2700 break;
2701
2702 default:
2703 break;
2704 }
2705 return r;
2706 }
2707
2708 #ifdef USE_SUBEXP_CALL
2709
2710 #define RECURSION_EXIST 1
2711 #define RECURSION_INFINITE 2
2712
2713 static int
subexp_inf_recursive_check(Node * node,ScanEnv * env,int head)2714 subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2715 {
2716 int type;
2717 int r = 0;
2718
2719 type = NTYPE(node);
2720 switch (type) {
2721 case NT_LIST:
2722 {
2723 Node *x;
2724 OnigDistance min;
2725 int ret;
2726
2727 x = node;
2728 do {
2729 ret = subexp_inf_recursive_check(NCAR(x), env, head);
2730 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2731 r |= ret;
2732 if (head) {
2733 ret = get_min_match_length(NCAR(x), &min, env);
2734 if (ret != 0) return ret;
2735 if (min != 0) head = 0;
2736 }
2737 } while (IS_NOT_NULL(x = NCDR(x)));
2738 }
2739 break;
2740
2741 case NT_ALT:
2742 {
2743 int ret;
2744 r = RECURSION_EXIST;
2745 do {
2746 ret = subexp_inf_recursive_check(NCAR(node), env, head);
2747 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2748 r &= ret;
2749 } while (IS_NOT_NULL(node = NCDR(node)));
2750 }
2751 break;
2752
2753 case NT_QTFR:
2754 r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
2755 if (r == RECURSION_EXIST) {
2756 if (NQTFR(node)->lower == 0) r = 0;
2757 }
2758 break;
2759
2760 case NT_ANCHOR:
2761 {
2762 AnchorNode* an = NANCHOR(node);
2763 switch (an->type) {
2764 case ANCHOR_PREC_READ:
2765 case ANCHOR_PREC_READ_NOT:
2766 case ANCHOR_LOOK_BEHIND:
2767 case ANCHOR_LOOK_BEHIND_NOT:
2768 r = subexp_inf_recursive_check(an->target, env, head);
2769 break;
2770 }
2771 }
2772 break;
2773
2774 case NT_CALL:
2775 r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
2776 break;
2777
2778 case NT_ENCLOSE:
2779 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2780 return 0;
2781 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2782 return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2783 else {
2784 SET_ENCLOSE_STATUS(node, NST_MARK2);
2785 r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
2786 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2787 }
2788 break;
2789
2790 default:
2791 break;
2792 }
2793
2794 return r;
2795 }
2796
2797 static int
subexp_inf_recursive_check_trav(Node * node,ScanEnv * env)2798 subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
2799 {
2800 int type;
2801 int r = 0;
2802
2803 type = NTYPE(node);
2804 switch (type) {
2805 case NT_LIST:
2806 case NT_ALT:
2807 do {
2808 r = subexp_inf_recursive_check_trav(NCAR(node), env);
2809 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2810 break;
2811
2812 case NT_QTFR:
2813 r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
2814 break;
2815
2816 case NT_ANCHOR:
2817 {
2818 AnchorNode* an = NANCHOR(node);
2819 switch (an->type) {
2820 case ANCHOR_PREC_READ:
2821 case ANCHOR_PREC_READ_NOT:
2822 case ANCHOR_LOOK_BEHIND:
2823 case ANCHOR_LOOK_BEHIND_NOT:
2824 r = subexp_inf_recursive_check_trav(an->target, env);
2825 break;
2826 }
2827 }
2828 break;
2829
2830 case NT_ENCLOSE:
2831 {
2832 EncloseNode* en = NENCLOSE(node);
2833
2834 if (IS_ENCLOSE_RECURSION(en)) {
2835 SET_ENCLOSE_STATUS(node, NST_MARK1);
2836 r = subexp_inf_recursive_check(en->target, env, 1);
2837 if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
2838 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2839 }
2840 r = subexp_inf_recursive_check_trav(en->target, env);
2841 }
2842
2843 break;
2844
2845 default:
2846 break;
2847 }
2848
2849 return r;
2850 }
2851
2852 static int
subexp_recursive_check(Node * node)2853 subexp_recursive_check(Node* node)
2854 {
2855 int r = 0;
2856
2857 switch (NTYPE(node)) {
2858 case NT_LIST:
2859 case NT_ALT:
2860 do {
2861 r |= subexp_recursive_check(NCAR(node));
2862 } while (IS_NOT_NULL(node = NCDR(node)));
2863 break;
2864
2865 case NT_QTFR:
2866 r = subexp_recursive_check(NQTFR(node)->target);
2867 break;
2868
2869 case NT_ANCHOR:
2870 {
2871 AnchorNode* an = NANCHOR(node);
2872 switch (an->type) {
2873 case ANCHOR_PREC_READ:
2874 case ANCHOR_PREC_READ_NOT:
2875 case ANCHOR_LOOK_BEHIND:
2876 case ANCHOR_LOOK_BEHIND_NOT:
2877 r = subexp_recursive_check(an->target);
2878 break;
2879 }
2880 }
2881 break;
2882
2883 case NT_CALL:
2884 r = subexp_recursive_check(NCALL(node)->target);
2885 if (r != 0) SET_CALL_RECURSION(node);
2886 break;
2887
2888 case NT_ENCLOSE:
2889 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2890 return 0;
2891 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2892 return 1; /* recursion */
2893 else {
2894 SET_ENCLOSE_STATUS(node, NST_MARK2);
2895 r = subexp_recursive_check(NENCLOSE(node)->target);
2896 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2897 }
2898 break;
2899
2900 default:
2901 break;
2902 }
2903
2904 return r;
2905 }
2906
2907
2908 static int
subexp_recursive_check_trav(Node * node,ScanEnv * env)2909 subexp_recursive_check_trav(Node* node, ScanEnv* env)
2910 {
2911 #define FOUND_CALLED_NODE 1
2912
2913 int type;
2914 int r = 0;
2915
2916 type = NTYPE(node);
2917 switch (type) {
2918 case NT_LIST:
2919 case NT_ALT:
2920 {
2921 int ret;
2922 do {
2923 ret = subexp_recursive_check_trav(NCAR(node), env);
2924 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
2925 else if (ret < 0) return ret;
2926 } while (IS_NOT_NULL(node = NCDR(node)));
2927 }
2928 break;
2929
2930 case NT_QTFR:
2931 r = subexp_recursive_check_trav(NQTFR(node)->target, env);
2932 if (NQTFR(node)->upper == 0) {
2933 if (r == FOUND_CALLED_NODE)
2934 NQTFR(node)->is_refered = 1;
2935 }
2936 break;
2937
2938 case NT_ANCHOR:
2939 {
2940 AnchorNode* an = NANCHOR(node);
2941 switch (an->type) {
2942 case ANCHOR_PREC_READ:
2943 case ANCHOR_PREC_READ_NOT:
2944 case ANCHOR_LOOK_BEHIND:
2945 case ANCHOR_LOOK_BEHIND_NOT:
2946 r = subexp_recursive_check_trav(an->target, env);
2947 break;
2948 }
2949 }
2950 break;
2951
2952 case NT_ENCLOSE:
2953 {
2954 EncloseNode* en = NENCLOSE(node);
2955
2956 if (! IS_ENCLOSE_RECURSION(en)) {
2957 if (IS_ENCLOSE_CALLED(en)) {
2958 SET_ENCLOSE_STATUS(node, NST_MARK1);
2959 r = subexp_recursive_check(en->target);
2960 if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
2961 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2962 }
2963 }
2964 r = subexp_recursive_check_trav(en->target, env);
2965 if (IS_ENCLOSE_CALLED(en))
2966 r |= FOUND_CALLED_NODE;
2967 }
2968 break;
2969
2970 default:
2971 break;
2972 }
2973
2974 return r;
2975 }
2976
2977 static int
setup_subexp_call(Node * node,ScanEnv * env)2978 setup_subexp_call(Node* node, ScanEnv* env)
2979 {
2980 int type;
2981 int r = 0;
2982
2983 type = NTYPE(node);
2984 switch (type) {
2985 case NT_LIST:
2986 do {
2987 r = setup_subexp_call(NCAR(node), env);
2988 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2989 break;
2990
2991 case NT_ALT:
2992 do {
2993 r = setup_subexp_call(NCAR(node), env);
2994 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2995 break;
2996
2997 case NT_QTFR:
2998 r = setup_subexp_call(NQTFR(node)->target, env);
2999 break;
3000 case NT_ENCLOSE:
3001 r = setup_subexp_call(NENCLOSE(node)->target, env);
3002 break;
3003
3004 case NT_CALL:
3005 {
3006 CallNode* cn = NCALL(node);
3007 Node** nodes = SCANENV_MEM_NODES(env);
3008
3009 if (cn->group_num != 0) {
3010 int gnum = cn->group_num;
3011
3012 #ifdef USE_NAMED_GROUP
3013 if (env->num_named > 0 &&
3014 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3015 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3016 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3017 }
3018 #endif
3019 if (gnum > env->num_mem) {
3020 onig_scan_env_set_error_string(env,
3021 ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3022 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3023 }
3024
3025 #ifdef USE_NAMED_GROUP
3026 set_call_attr:
3027 #endif
3028 cn->target = nodes[cn->group_num];
3029 if (IS_NULL(cn->target)) {
3030 onig_scan_env_set_error_string(env,
3031 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3032 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3033 }
3034 SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
3035 BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
3036 cn->unset_addr_list = env->unset_addr_list;
3037 }
3038 #ifdef USE_NAMED_GROUP
3039 else {
3040 int *refs;
3041
3042 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
3043 &refs);
3044 if (n <= 0) {
3045 onig_scan_env_set_error_string(env,
3046 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3047 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3048 }
3049 else if (n > 1) {
3050 onig_scan_env_set_error_string(env,
3051 ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3052 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3053 }
3054 else {
3055 cn->group_num = refs[0];
3056 goto set_call_attr;
3057 }
3058 }
3059 #endif
3060 }
3061 break;
3062
3063 case NT_ANCHOR:
3064 {
3065 AnchorNode* an = NANCHOR(node);
3066
3067 switch (an->type) {
3068 case ANCHOR_PREC_READ:
3069 case ANCHOR_PREC_READ_NOT:
3070 case ANCHOR_LOOK_BEHIND:
3071 case ANCHOR_LOOK_BEHIND_NOT:
3072 r = setup_subexp_call(an->target, env);
3073 break;
3074 }
3075 }
3076 break;
3077
3078 default:
3079 break;
3080 }
3081
3082 return r;
3083 }
3084 #endif
3085
3086 /* divide different length alternatives in look-behind.
3087 (?<=A|B) ==> (?<=A)|(?<=B)
3088 (?<!A|B) ==> (?<!A)(?<!B)
3089 */
3090 static int
divide_look_behind_alternatives(Node * node)3091 divide_look_behind_alternatives(Node* node)
3092 {
3093 Node *head, *np, *insert_node;
3094 AnchorNode* an = NANCHOR(node);
3095 int anc_type = an->type;
3096
3097 head = an->target;
3098 np = NCAR(head);
3099 swap_node(node, head);
3100 NCAR(node) = head;
3101 NANCHOR(head)->target = np;
3102
3103 np = node;
3104 while ((np = NCDR(np)) != NULL_NODE) {
3105 insert_node = onig_node_new_anchor(anc_type);
3106 CHECK_NULL_RETURN_MEMERR(insert_node);
3107 NANCHOR(insert_node)->target = NCAR(np);
3108 NCAR(np) = insert_node;
3109 }
3110
3111 if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3112 np = node;
3113 do {
3114 SET_NTYPE(np, NT_LIST); /* alt -> list */
3115 } while ((np = NCDR(np)) != NULL_NODE);
3116 }
3117 return 0;
3118 }
3119
3120 static int
setup_look_behind(Node * node,regex_t * reg,ScanEnv * env)3121 setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3122 {
3123 int r, len;
3124 AnchorNode* an = NANCHOR(node);
3125
3126 r = get_char_length_tree(an->target, reg, &len);
3127 if (r == 0)
3128 an->char_len = len;
3129 else if (r == GET_CHAR_LEN_VARLEN)
3130 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3131 else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3132 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3133 r = divide_look_behind_alternatives(node);
3134 else
3135 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3136 }
3137
3138 return r;
3139 }
3140
3141 static int
next_setup(Node * node,Node * next_node,regex_t * reg)3142 next_setup(Node* node, Node* next_node, regex_t* reg)
3143 {
3144 int type;
3145
3146 retry:
3147 type = NTYPE(node);
3148 if (type == NT_QTFR) {
3149 QtfrNode* qn = NQTFR(node);
3150 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3151 #ifdef USE_QTFR_PEEK_NEXT
3152 Node* n = get_head_value_node(next_node, 1, reg);
3153 /* '\0': for UTF-16BE etc... */
3154 if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
3155 qn->next_head_exact = n;
3156 }
3157 #endif
3158 /* automatic posseivation a*b ==> (?>a*)b */
3159 if (qn->lower <= 1) {
3160 int ttype = NTYPE(qn->target);
3161 if (IS_NODE_TYPE_SIMPLE(ttype)) {
3162 Node *x, *y;
3163 x = get_head_value_node(qn->target, 0, reg);
3164 if (IS_NOT_NULL(x)) {
3165 y = get_head_value_node(next_node, 0, reg);
3166 if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3167 Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
3168 CHECK_NULL_RETURN_MEMERR(en);
3169 SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3170 swap_node(node, en);
3171 NENCLOSE(node)->target = en;
3172 }
3173 }
3174 }
3175 }
3176 }
3177 }
3178 else if (type == NT_ENCLOSE) {
3179 EncloseNode* en = NENCLOSE(node);
3180 if (en->type == ENCLOSE_MEMORY) {
3181 node = en->target;
3182 goto retry;
3183 }
3184 }
3185 return 0;
3186 }
3187
3188
3189 static int
update_string_node_case_fold(regex_t * reg,Node * node)3190 update_string_node_case_fold(regex_t* reg, Node *node)
3191 {
3192 UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3193 UChar *sbuf, *ebuf, *sp;
3194 int r, i, len, sbuf_size;
3195 StrNode* sn = NSTR(node);
3196
3197 end = sn->end;
3198 sbuf_size = (end - sn->s) * 2;
3199 sbuf = (UChar* )xmalloc(sbuf_size);
3200 CHECK_NULL_RETURN_MEMERR(sbuf);
3201 ebuf = sbuf + sbuf_size;
3202
3203 sp = sbuf;
3204 p = sn->s;
3205 while (p < end) {
3206 len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3207 for (i = 0; i < len; i++) {
3208 if (sp >= ebuf) {
3209 sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2);
3210 CHECK_NULL_RETURN_MEMERR(sbuf);
3211 sp = sbuf + sbuf_size;
3212 sbuf_size *= 2;
3213 ebuf = sbuf + sbuf_size;
3214 }
3215
3216 *sp++ = buf[i];
3217 }
3218 }
3219
3220 r = onig_node_str_set(node, sbuf, sp);
3221 if (r != 0) {
3222 xfree(sbuf);
3223 return r;
3224 }
3225
3226 xfree(sbuf);
3227 return 0;
3228 }
3229
3230 static int
expand_case_fold_make_rem_string(Node ** rnode,UChar * s,UChar * end,regex_t * reg)3231 expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
3232 regex_t* reg)
3233 {
3234 int r;
3235 Node *node;
3236
3237 node = onig_node_new_str(s, end);
3238 if (IS_NULL(node)) return ONIGERR_MEMORY;
3239
3240 r = update_string_node_case_fold(reg, node);
3241 if (r != 0) {
3242 onig_node_free(node);
3243 return r;
3244 }
3245
3246 NSTRING_SET_AMBIG(node);
3247 NSTRING_SET_DONT_GET_OPT_INFO(node);
3248 *rnode = node;
3249 return 0;
3250 }
3251
3252 static int
expand_case_fold_string_alt(int item_num,OnigCaseFoldCodeItem items[],UChar * p,int slen,UChar * end,regex_t * reg,Node ** rnode)3253 expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
3254 UChar *p, int slen, UChar *end,
3255 regex_t* reg, Node **rnode)
3256 {
3257 int r, i, j, len, varlen;
3258 Node *anode, *var_anode, *snode, *xnode, *an;
3259 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3260
3261 *rnode = var_anode = NULL_NODE;
3262
3263 varlen = 0;
3264 for (i = 0; i < item_num; i++) {
3265 if (items[i].byte_len != slen) {
3266 varlen = 1;
3267 break;
3268 }
3269 }
3270
3271 if (varlen != 0) {
3272 *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3273 if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3274
3275 xnode = onig_node_new_list(NULL, NULL);
3276 if (IS_NULL(xnode)) goto mem_err;
3277 NCAR(var_anode) = xnode;
3278
3279 anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3280 if (IS_NULL(anode)) goto mem_err;
3281 NCAR(xnode) = anode;
3282 }
3283 else {
3284 *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3285 if (IS_NULL(anode)) return ONIGERR_MEMORY;
3286 }
3287
3288 snode = onig_node_new_str(p, p + slen);
3289 if (IS_NULL(snode)) goto mem_err;
3290
3291 NCAR(anode) = snode;
3292
3293 for (i = 0; i < item_num; i++) {
3294 snode = onig_node_new_str(NULL, NULL);
3295 if (IS_NULL(snode)) goto mem_err;
3296
3297 for (j = 0; j < items[i].code_len; j++) {
3298 len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3299 if (len < 0) {
3300 r = len;
3301 goto mem_err2;
3302 }
3303
3304 r = onig_node_str_cat(snode, buf, buf + len);
3305 if (r != 0) goto mem_err2;
3306 }
3307
3308 an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3309 if (IS_NULL(an)) {
3310 goto mem_err2;
3311 }
3312
3313 if (items[i].byte_len != slen) {
3314 Node *rem;
3315 UChar *q = p + items[i].byte_len;
3316
3317 if (q < end) {
3318 r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3319 if (r != 0) {
3320 onig_node_free(an);
3321 goto mem_err2;
3322 }
3323
3324 xnode = onig_node_list_add(NULL_NODE, snode);
3325 if (IS_NULL(xnode)) {
3326 onig_node_free(an);
3327 onig_node_free(rem);
3328 goto mem_err2;
3329 }
3330 if (IS_NULL(onig_node_list_add(xnode, rem))) {
3331 onig_node_free(an);
3332 onig_node_free(xnode);
3333 onig_node_free(rem);
3334 goto mem_err;
3335 }
3336
3337 NCAR(an) = xnode;
3338 }
3339 else {
3340 NCAR(an) = snode;
3341 }
3342
3343 NCDR(var_anode) = an;
3344 var_anode = an;
3345 }
3346 else {
3347 NCAR(an) = snode;
3348 NCDR(anode) = an;
3349 anode = an;
3350 }
3351 }
3352
3353 return varlen;
3354
3355 mem_err2:
3356 onig_node_free(snode);
3357
3358 mem_err:
3359 onig_node_free(*rnode);
3360
3361 return ONIGERR_MEMORY;
3362 }
3363
3364 static int
expand_case_fold_string(Node * node,regex_t * reg)3365 expand_case_fold_string(Node* node, regex_t* reg)
3366 {
3367 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
3368
3369 int r, n, len, alt_num;
3370 UChar *start, *end, *p;
3371 Node *top_root, *root, *snode, *prev_node;
3372 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3373 StrNode* sn = NSTR(node);
3374
3375 if (NSTRING_IS_AMBIG(node)) return 0;
3376
3377 start = sn->s;
3378 end = sn->end;
3379 if (start >= end) return 0;
3380
3381 r = 0;
3382 top_root = root = prev_node = snode = NULL_NODE;
3383 alt_num = 1;
3384 p = start;
3385 while (p < end) {
3386 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3387 p, end, items);
3388 if (n < 0) {
3389 r = n;
3390 goto err;
3391 }
3392
3393 SAFE_ENC_LEN(reg->enc, p, end, len);
3394
3395 if (n == 0) {
3396 if (IS_NULL(snode)) {
3397 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3398 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3399 if (IS_NULL(root)) {
3400 onig_node_free(prev_node);
3401 goto mem_err;
3402 }
3403 }
3404
3405 prev_node = snode = onig_node_new_str(NULL, NULL);
3406 if (IS_NULL(snode)) goto mem_err;
3407 if (IS_NOT_NULL(root)) {
3408 if (IS_NULL(onig_node_list_add(root, snode))) {
3409 onig_node_free(snode);
3410 goto mem_err;
3411 }
3412 }
3413 }
3414
3415 r = onig_node_str_cat(snode, p, p + len);
3416 if (r != 0) goto err;
3417 }
3418 else {
3419 alt_num *= (n + 1);
3420 if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3421
3422 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3423 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3424 if (IS_NULL(root)) {
3425 onig_node_free(prev_node);
3426 goto mem_err;
3427 }
3428 }
3429
3430 r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3431 if (r < 0) goto mem_err;
3432 if (r == 1) {
3433 if (IS_NULL(root)) {
3434 top_root = prev_node;
3435 }
3436 else {
3437 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3438 onig_node_free(prev_node);
3439 goto mem_err;
3440 }
3441 }
3442
3443 root = NCAR(prev_node);
3444 }
3445 else { /* r == 0 */
3446 if (IS_NOT_NULL(root)) {
3447 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3448 onig_node_free(prev_node);
3449 goto mem_err;
3450 }
3451 }
3452 }
3453
3454 snode = NULL_NODE;
3455 }
3456
3457 p += len;
3458 }
3459
3460 if (p < end) {
3461 Node *srem;
3462
3463 r = expand_case_fold_make_rem_string(&srem, p, end, reg);
3464 if (r != 0) goto mem_err;
3465
3466 if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
3467 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3468 if (IS_NULL(root)) {
3469 onig_node_free(srem);
3470 onig_node_free(prev_node);
3471 goto mem_err;
3472 }
3473 }
3474
3475 if (IS_NULL(root)) {
3476 prev_node = srem;
3477 }
3478 else {
3479 if (IS_NULL(onig_node_list_add(root, srem))) {
3480 onig_node_free(srem);
3481 goto mem_err;
3482 }
3483 }
3484 }
3485
3486 /* ending */
3487 top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
3488 swap_node(node, top_root);
3489 onig_node_free(top_root);
3490 return 0;
3491
3492 mem_err:
3493 r = ONIGERR_MEMORY;
3494
3495 err:
3496 onig_node_free(top_root);
3497 return r;
3498 }
3499
3500
3501 #ifdef USE_COMBINATION_EXPLOSION_CHECK
3502
3503 #define CEC_THRES_NUM_BIG_REPEAT 512
3504 #define CEC_INFINITE_NUM 0x7fffffff
3505
3506 #define CEC_IN_INFINITE_REPEAT (1<<0)
3507 #define CEC_IN_FINITE_REPEAT (1<<1)
3508 #define CEC_CONT_BIG_REPEAT (1<<2)
3509
3510 static int
setup_comb_exp_check(Node * node,int state,ScanEnv * env)3511 setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3512 {
3513 int type;
3514 int r = state;
3515
3516 type = NTYPE(node);
3517 switch (type) {
3518 case NT_LIST:
3519 {
3520 Node* prev = NULL_NODE;
3521 do {
3522 r = setup_comb_exp_check(NCAR(node), r, env);
3523 prev = NCAR(node);
3524 } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
3525 }
3526 break;
3527
3528 case NT_ALT:
3529 {
3530 int ret;
3531 do {
3532 ret = setup_comb_exp_check(NCAR(node), state, env);
3533 r |= ret;
3534 } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
3535 }
3536 break;
3537
3538 case NT_QTFR:
3539 {
3540 int child_state = state;
3541 int add_state = 0;
3542 QtfrNode* qn = NQTFR(node);
3543 Node* target = qn->target;
3544 int var_num;
3545
3546 if (! IS_REPEAT_INFINITE(qn->upper)) {
3547 if (qn->upper > 1) {
3548 /* {0,1}, {1,1} are allowed */
3549 child_state |= CEC_IN_FINITE_REPEAT;
3550
3551 /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3552 if (env->backrefed_mem == 0) {
3553 if (NTYPE(qn->target) == NT_ENCLOSE) {
3554 EncloseNode* en = NENCLOSE(qn->target);
3555 if (en->type == ENCLOSE_MEMORY) {
3556 if (NTYPE(en->target) == NT_QTFR) {
3557 QtfrNode* q = NQTFR(en->target);
3558 if (IS_REPEAT_INFINITE(q->upper)
3559 && q->greedy == qn->greedy) {
3560 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3561 if (qn->upper == 1)
3562 child_state = state;
3563 }
3564 }
3565 }
3566 }
3567 }
3568 }
3569 }
3570
3571 if (state & CEC_IN_FINITE_REPEAT) {
3572 qn->comb_exp_check_num = -1;
3573 }
3574 else {
3575 if (IS_REPEAT_INFINITE(qn->upper)) {
3576 var_num = CEC_INFINITE_NUM;
3577 child_state |= CEC_IN_INFINITE_REPEAT;
3578 }
3579 else {
3580 var_num = qn->upper - qn->lower;
3581 }
3582
3583 if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3584 add_state |= CEC_CONT_BIG_REPEAT;
3585
3586 if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3587 ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3588 var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3589 if (qn->comb_exp_check_num == 0) {
3590 env->num_comb_exp_check++;
3591 qn->comb_exp_check_num = env->num_comb_exp_check;
3592 if (env->curr_max_regnum > env->comb_exp_max_regnum)
3593 env->comb_exp_max_regnum = env->curr_max_regnum;
3594 }
3595 }
3596 }
3597
3598 r = setup_comb_exp_check(target, child_state, env);
3599 r |= add_state;
3600 }
3601 break;
3602
3603 case NT_ENCLOSE:
3604 {
3605 EncloseNode* en = NENCLOSE(node);
3606
3607 switch (en->type) {
3608 case ENCLOSE_MEMORY:
3609 {
3610 if (env->curr_max_regnum < en->regnum)
3611 env->curr_max_regnum = en->regnum;
3612
3613 r = setup_comb_exp_check(en->target, state, env);
3614 }
3615 break;
3616
3617 default:
3618 r = setup_comb_exp_check(en->target, state, env);
3619 break;
3620 }
3621 }
3622 break;
3623
3624 #ifdef USE_SUBEXP_CALL
3625 case NT_CALL:
3626 if (IS_CALL_RECURSION(NCALL(node)))
3627 env->has_recursion = 1;
3628 else
3629 r = setup_comb_exp_check(NCALL(node)->target, state, env);
3630 break;
3631 #endif
3632
3633 default:
3634 break;
3635 }
3636
3637 return r;
3638 }
3639 #endif
3640
3641 #define IN_ALT (1<<0)
3642 #define IN_NOT (1<<1)
3643 #define IN_REPEAT (1<<2)
3644 #define IN_VAR_REPEAT (1<<3)
3645
3646 /* setup_tree does the following work.
3647 1. check empty loop. (set qn->target_empty_info)
3648 2. expand ignore-case in char class.
3649 3. set memory status bit flags. (reg->mem_stats)
3650 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3651 5. find invalid patterns in look-behind.
3652 6. expand repeated string.
3653 */
3654 static int
setup_tree(Node * node,regex_t * reg,int state,ScanEnv * env)3655 setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3656 {
3657 int type;
3658 int r = 0;
3659
3660 type = NTYPE(node);
3661 switch (type) {
3662 case NT_LIST:
3663 {
3664 Node* prev = NULL_NODE;
3665 do {
3666 r = setup_tree(NCAR(node), reg, state, env);
3667 if (IS_NOT_NULL(prev) && r == 0) {
3668 r = next_setup(prev, NCAR(node), reg);
3669 }
3670 prev = NCAR(node);
3671 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3672 }
3673 break;
3674
3675 case NT_ALT:
3676 do {
3677 r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
3678 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3679 break;
3680
3681 case NT_CCLASS:
3682 break;
3683
3684 case NT_STR:
3685 if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3686 r = expand_case_fold_string(node, reg);
3687 }
3688 break;
3689
3690 case NT_CTYPE:
3691 case NT_CANY:
3692 break;
3693
3694 #ifdef USE_SUBEXP_CALL
3695 case NT_CALL:
3696 break;
3697 #endif
3698
3699 case NT_BREF:
3700 {
3701 int i;
3702 int* p;
3703 Node** nodes = SCANENV_MEM_NODES(env);
3704 BRefNode* br = NBREF(node);
3705 p = BACKREFS_P(br);
3706 for (i = 0; i < br->back_num; i++) {
3707 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
3708 BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3709 BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3710 #ifdef USE_BACKREF_WITH_LEVEL
3711 if (IS_BACKREF_NEST_LEVEL(br)) {
3712 BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3713 }
3714 #endif
3715 SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3716 }
3717 }
3718 break;
3719
3720 case NT_QTFR:
3721 {
3722 OnigDistance d;
3723 QtfrNode* qn = NQTFR(node);
3724 Node* target = qn->target;
3725
3726 if ((state & IN_REPEAT) != 0) {
3727 qn->state |= NST_IN_REPEAT;
3728 }
3729
3730 if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3731 r = get_min_match_length(target, &d, env);
3732 if (r) break;
3733 if (d == 0) {
3734 qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3735 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
3736 r = quantifiers_memory_node_info(target);
3737 if (r < 0) break;
3738 if (r > 0) {
3739 qn->target_empty_info = r;
3740 }
3741 #endif
3742 #if 0
3743 r = get_max_match_length(target, &d, env);
3744 if (r == 0 && d == 0) {
3745 /* ()* ==> ()?, ()+ ==> () */
3746 qn->upper = 1;
3747 if (qn->lower > 1) qn->lower = 1;
3748 if (NTYPE(target) == NT_STR) {
3749 qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
3750 }
3751 }
3752 #endif
3753 }
3754 }
3755
3756 state |= IN_REPEAT;
3757 if (qn->lower != qn->upper)
3758 state |= IN_VAR_REPEAT;
3759 r = setup_tree(target, reg, state, env);
3760 if (r) break;
3761
3762 /* expand string */
3763 #define EXPAND_STRING_MAX_LENGTH 100
3764 if (NTYPE(target) == NT_STR) {
3765 if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper &&
3766 qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3767 int len = NSTRING_LEN(target);
3768 StrNode* sn = NSTR(target);
3769
3770 if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3771 int i, n = qn->lower;
3772 onig_node_conv_to_str_node(node, NSTR(target)->flag);
3773 for (i = 0; i < n; i++) {
3774 r = onig_node_str_cat(node, sn->s, sn->end);
3775 if (r) break;
3776 }
3777 onig_node_free(target);
3778 break; /* break case NT_QTFR: */
3779 }
3780 }
3781 }
3782
3783 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
3784 if (qn->greedy && (qn->target_empty_info != 0)) {
3785 if (NTYPE(target) == NT_QTFR) {
3786 QtfrNode* tqn = NQTFR(target);
3787 if (IS_NOT_NULL(tqn->head_exact)) {
3788 qn->head_exact = tqn->head_exact;
3789 tqn->head_exact = NULL;
3790 }
3791 }
3792 else {
3793 qn->head_exact = get_head_value_node(qn->target, 1, reg);
3794 }
3795 }
3796 #endif
3797 }
3798 break;
3799
3800 case NT_ENCLOSE:
3801 {
3802 EncloseNode* en = NENCLOSE(node);
3803
3804 switch (en->type) {
3805 case ENCLOSE_OPTION:
3806 {
3807 OnigOptionType options = reg->options;
3808 reg->options = NENCLOSE(node)->option;
3809 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
3810 reg->options = options;
3811 }
3812 break;
3813
3814 case ENCLOSE_MEMORY:
3815 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) {
3816 BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
3817 /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
3818 }
3819 r = setup_tree(en->target, reg, state, env);
3820 break;
3821
3822 case ENCLOSE_STOP_BACKTRACK:
3823 {
3824 Node* target = en->target;
3825 r = setup_tree(target, reg, state, env);
3826 if (NTYPE(target) == NT_QTFR) {
3827 QtfrNode* tqn = NQTFR(target);
3828 if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
3829 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
3830 int qtype = NTYPE(tqn->target);
3831 if (IS_NODE_TYPE_SIMPLE(qtype))
3832 SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
3833 }
3834 }
3835 }
3836 break;
3837 }
3838 }
3839 break;
3840
3841 case NT_ANCHOR:
3842 {
3843 AnchorNode* an = NANCHOR(node);
3844
3845 switch (an->type) {
3846 case ANCHOR_PREC_READ:
3847 r = setup_tree(an->target, reg, state, env);
3848 break;
3849 case ANCHOR_PREC_READ_NOT:
3850 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3851 break;
3852
3853 /* allowed node types in look-behind */
3854 #define ALLOWED_TYPE_IN_LB \
3855 ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
3856 BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
3857
3858 #define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY )
3859 #define ALLOWED_ENCLOSE_IN_LB_NOT 0
3860
3861 #define ALLOWED_ANCHOR_IN_LB \
3862 ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3863 #define ALLOWED_ANCHOR_IN_LB_NOT \
3864 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3865
3866 case ANCHOR_LOOK_BEHIND:
3867 {
3868 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3869 ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
3870 if (r < 0) return r;
3871 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3872 r = setup_look_behind(node, reg, env);
3873 if (r != 0) return r;
3874 r = setup_tree(an->target, reg, state, env);
3875 }
3876 break;
3877
3878 case ANCHOR_LOOK_BEHIND_NOT:
3879 {
3880 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3881 ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
3882 if (r < 0) return r;
3883 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3884 r = setup_look_behind(node, reg, env);
3885 if (r != 0) return r;
3886 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3887 }
3888 break;
3889 }
3890 }
3891 break;
3892
3893 default:
3894 break;
3895 }
3896
3897 return r;
3898 }
3899
3900 /* set skip map for Boyer-Moor search */
3901 static int
set_bm_skip(UChar * s,UChar * end,OnigEncoding enc ARG_UNUSED,UChar skip[],int ** int_skip)3902 set_bm_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED,
3903 UChar skip[], int** int_skip)
3904 {
3905 int i, len;
3906
3907 len = end - s;
3908 if (len < ONIG_CHAR_TABLE_SIZE) {
3909 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len;
3910
3911 for (i = 0; i < len - 1; i++)
3912 skip[s[i]] = len - 1 - i;
3913 }
3914 else {
3915 if (IS_NULL(*int_skip)) {
3916 *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
3917 if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
3918 }
3919 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len;
3920
3921 for (i = 0; i < len - 1; i++)
3922 (*int_skip)[s[i]] = len - 1 - i;
3923 }
3924 return 0;
3925 }
3926
3927 #define OPT_EXACT_MAXLEN 24
3928
3929 typedef struct {
3930 OnigDistance min; /* min byte length */
3931 OnigDistance max; /* max byte length */
3932 } MinMaxLen;
3933
3934 typedef struct {
3935 MinMaxLen mmd;
3936 OnigEncoding enc;
3937 OnigOptionType options;
3938 OnigCaseFoldType case_fold_flag;
3939 ScanEnv* scan_env;
3940 } OptEnv;
3941
3942 typedef struct {
3943 int left_anchor;
3944 int right_anchor;
3945 } OptAncInfo;
3946
3947 typedef struct {
3948 MinMaxLen mmd; /* info position */
3949 OptAncInfo anc;
3950
3951 int reach_end;
3952 int ignore_case;
3953 int len;
3954 UChar s[OPT_EXACT_MAXLEN];
3955 } OptExactInfo;
3956
3957 typedef struct {
3958 MinMaxLen mmd; /* info position */
3959 OptAncInfo anc;
3960
3961 int value; /* weighted value */
3962 UChar map[ONIG_CHAR_TABLE_SIZE];
3963 } OptMapInfo;
3964
3965 typedef struct {
3966 MinMaxLen len;
3967
3968 OptAncInfo anc;
3969 OptExactInfo exb; /* boundary */
3970 OptExactInfo exm; /* middle */
3971 OptExactInfo expr; /* prec read (?=...) */
3972
3973 OptMapInfo map; /* boundary */
3974 } NodeOptInfo;
3975
3976
3977 static int
map_position_value(OnigEncoding enc,int i)3978 map_position_value(OnigEncoding enc, int i)
3979 {
3980 static const short int ByteValTable[] = {
3981 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
3982 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3983 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
3984 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
3985 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
3986 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
3987 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
3988 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
3989 };
3990
3991 if (i < (int )(sizeof(ByteValTable)/sizeof(ByteValTable[0]))) {
3992 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
3993 return 20;
3994 else
3995 return (int )ByteValTable[i];
3996 }
3997 else
3998 return 4; /* Take it easy. */
3999 }
4000
4001 static int
distance_value(MinMaxLen * mm)4002 distance_value(MinMaxLen* mm)
4003 {
4004 /* 1000 / (min-max-dist + 1) */
4005 static const short int dist_vals[] = {
4006 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
4007 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
4008 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
4009 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
4010 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
4011 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
4012 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
4013 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
4014 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
4015 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
4016 };
4017
4018 int d;
4019
4020 if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
4021
4022 d = mm->max - mm->min;
4023 if (d < (int )(sizeof(dist_vals)/sizeof(dist_vals[0])))
4024 /* return dist_vals[d] * 16 / (mm->min + 12); */
4025 return (int )dist_vals[d];
4026 else
4027 return 1;
4028 }
4029
4030 static int
comp_distance_value(MinMaxLen * d1,MinMaxLen * d2,int v1,int v2)4031 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
4032 {
4033 if (v2 <= 0) return -1;
4034 if (v1 <= 0) return 1;
4035
4036 v1 *= distance_value(d1);
4037 v2 *= distance_value(d2);
4038
4039 if (v2 > v1) return 1;
4040 if (v2 < v1) return -1;
4041
4042 if (d2->min < d1->min) return 1;
4043 if (d2->min > d1->min) return -1;
4044 return 0;
4045 }
4046
4047 static int
is_equal_mml(MinMaxLen * a,MinMaxLen * b)4048 is_equal_mml(MinMaxLen* a, MinMaxLen* b)
4049 {
4050 return (a->min == b->min && a->max == b->max) ? 1 : 0;
4051 }
4052
4053
4054 static void
set_mml(MinMaxLen * mml,OnigDistance min,OnigDistance max)4055 set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
4056 {
4057 mml->min = min;
4058 mml->max = max;
4059 }
4060
4061 static void
clear_mml(MinMaxLen * mml)4062 clear_mml(MinMaxLen* mml)
4063 {
4064 mml->min = mml->max = 0;
4065 }
4066
4067 static void
copy_mml(MinMaxLen * to,MinMaxLen * from)4068 copy_mml(MinMaxLen* to, MinMaxLen* from)
4069 {
4070 to->min = from->min;
4071 to->max = from->max;
4072 }
4073
4074 static void
add_mml(MinMaxLen * to,MinMaxLen * from)4075 add_mml(MinMaxLen* to, MinMaxLen* from)
4076 {
4077 to->min = distance_add(to->min, from->min);
4078 to->max = distance_add(to->max, from->max);
4079 }
4080
4081 #if 0
4082 static void
4083 add_len_mml(MinMaxLen* to, OnigDistance len)
4084 {
4085 to->min = distance_add(to->min, len);
4086 to->max = distance_add(to->max, len);
4087 }
4088 #endif
4089
4090 static void
alt_merge_mml(MinMaxLen * to,MinMaxLen * from)4091 alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
4092 {
4093 if (to->min > from->min) to->min = from->min;
4094 if (to->max < from->max) to->max = from->max;
4095 }
4096
4097 static void
copy_opt_env(OptEnv * to,OptEnv * from)4098 copy_opt_env(OptEnv* to, OptEnv* from)
4099 {
4100 *to = *from;
4101 }
4102
4103 static void
clear_opt_anc_info(OptAncInfo * anc)4104 clear_opt_anc_info(OptAncInfo* anc)
4105 {
4106 anc->left_anchor = 0;
4107 anc->right_anchor = 0;
4108 }
4109
4110 static void
copy_opt_anc_info(OptAncInfo * to,OptAncInfo * from)4111 copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
4112 {
4113 *to = *from;
4114 }
4115
4116 static void
concat_opt_anc_info(OptAncInfo * to,OptAncInfo * left,OptAncInfo * right,OnigDistance left_len,OnigDistance right_len)4117 concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
4118 OnigDistance left_len, OnigDistance right_len)
4119 {
4120 clear_opt_anc_info(to);
4121
4122 to->left_anchor = left->left_anchor;
4123 if (left_len == 0) {
4124 to->left_anchor |= right->left_anchor;
4125 }
4126
4127 to->right_anchor = right->right_anchor;
4128 if (right_len == 0) {
4129 to->right_anchor |= left->right_anchor;
4130 }
4131 }
4132
4133 static int
is_left_anchor(int anc)4134 is_left_anchor(int anc)
4135 {
4136 if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
4137 anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
4138 anc == ANCHOR_PREC_READ_NOT)
4139 return 0;
4140
4141 return 1;
4142 }
4143
4144 static int
is_set_opt_anc_info(OptAncInfo * to,int anc)4145 is_set_opt_anc_info(OptAncInfo* to, int anc)
4146 {
4147 if ((to->left_anchor & anc) != 0) return 1;
4148
4149 return ((to->right_anchor & anc) != 0 ? 1 : 0);
4150 }
4151
4152 static void
add_opt_anc_info(OptAncInfo * to,int anc)4153 add_opt_anc_info(OptAncInfo* to, int anc)
4154 {
4155 if (is_left_anchor(anc))
4156 to->left_anchor |= anc;
4157 else
4158 to->right_anchor |= anc;
4159 }
4160
4161 static void
remove_opt_anc_info(OptAncInfo * to,int anc)4162 remove_opt_anc_info(OptAncInfo* to, int anc)
4163 {
4164 if (is_left_anchor(anc))
4165 to->left_anchor &= ~anc;
4166 else
4167 to->right_anchor &= ~anc;
4168 }
4169
4170 static void
alt_merge_opt_anc_info(OptAncInfo * to,OptAncInfo * add)4171 alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
4172 {
4173 to->left_anchor &= add->left_anchor;
4174 to->right_anchor &= add->right_anchor;
4175 }
4176
4177 static int
is_full_opt_exact_info(OptExactInfo * ex)4178 is_full_opt_exact_info(OptExactInfo* ex)
4179 {
4180 return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
4181 }
4182
4183 static void
clear_opt_exact_info(OptExactInfo * ex)4184 clear_opt_exact_info(OptExactInfo* ex)
4185 {
4186 clear_mml(&ex->mmd);
4187 clear_opt_anc_info(&ex->anc);
4188 ex->reach_end = 0;
4189 ex->ignore_case = 0;
4190 ex->len = 0;
4191 ex->s[0] = '\0';
4192 }
4193
4194 static void
copy_opt_exact_info(OptExactInfo * to,OptExactInfo * from)4195 copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
4196 {
4197 *to = *from;
4198 }
4199
4200 static void
concat_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OnigEncoding enc)4201 concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
4202 {
4203 int i, j, len;
4204 UChar *p, *end;
4205 OptAncInfo tanc;
4206
4207 if (! to->ignore_case && add->ignore_case) {
4208 if (to->len >= add->len) return ; /* avoid */
4209
4210 to->ignore_case = 1;
4211 }
4212
4213 p = add->s;
4214 end = p + add->len;
4215 for (i = to->len; p < end; ) {
4216 len = enclen(enc, p);
4217 if (i + len > OPT_EXACT_MAXLEN) break;
4218 for (j = 0; j < len && p < end; j++)
4219 to->s[i++] = *p++;
4220 }
4221
4222 to->len = i;
4223 to->reach_end = (p == end ? add->reach_end : 0);
4224
4225 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4226 if (! to->reach_end) tanc.right_anchor = 0;
4227 copy_opt_anc_info(&to->anc, &tanc);
4228 }
4229
4230 static void
concat_opt_exact_info_str(OptExactInfo * to,UChar * s,UChar * end,int raw ARG_UNUSED,OnigEncoding enc)4231 concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
4232 int raw ARG_UNUSED, OnigEncoding enc)
4233 {
4234 int i, j, len;
4235 UChar *p;
4236
4237 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4238 len = enclen(enc, p);
4239 if (i + len > OPT_EXACT_MAXLEN) break;
4240 for (j = 0; j < len && p < end; j++)
4241 to->s[i++] = *p++;
4242 }
4243
4244 to->len = i;
4245 }
4246
4247 static void
alt_merge_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OptEnv * env)4248 alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4249 {
4250 int i, j, len;
4251
4252 if (add->len == 0 || to->len == 0) {
4253 clear_opt_exact_info(to);
4254 return ;
4255 }
4256
4257 if (! is_equal_mml(&to->mmd, &add->mmd)) {
4258 clear_opt_exact_info(to);
4259 return ;
4260 }
4261
4262 for (i = 0; i < to->len && i < add->len; ) {
4263 if (to->s[i] != add->s[i]) break;
4264 len = enclen(env->enc, to->s + i);
4265
4266 for (j = 1; j < len; j++) {
4267 if (to->s[i+j] != add->s[i+j]) break;
4268 }
4269 if (j < len) break;
4270 i += len;
4271 }
4272
4273 if (! add->reach_end || i < add->len || i < to->len) {
4274 to->reach_end = 0;
4275 }
4276 to->len = i;
4277 to->ignore_case |= add->ignore_case;
4278
4279 alt_merge_opt_anc_info(&to->anc, &add->anc);
4280 if (! to->reach_end) to->anc.right_anchor = 0;
4281 }
4282
4283 static void
select_opt_exact_info(OnigEncoding enc,OptExactInfo * now,OptExactInfo * alt)4284 select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4285 {
4286 int v1, v2;
4287
4288 v1 = now->len;
4289 v2 = alt->len;
4290
4291 if (v2 == 0) {
4292 return ;
4293 }
4294 else if (v1 == 0) {
4295 copy_opt_exact_info(now, alt);
4296 return ;
4297 }
4298 else if (v1 <= 2 && v2 <= 2) {
4299 /* ByteValTable[x] is big value --> low price */
4300 v2 = map_position_value(enc, now->s[0]);
4301 v1 = map_position_value(enc, alt->s[0]);
4302
4303 if (now->len > 1) v1 += 5;
4304 if (alt->len > 1) v2 += 5;
4305 }
4306
4307 if (now->ignore_case == 0) v1 *= 2;
4308 if (alt->ignore_case == 0) v2 *= 2;
4309
4310 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4311 copy_opt_exact_info(now, alt);
4312 }
4313
4314 static void
clear_opt_map_info(OptMapInfo * map)4315 clear_opt_map_info(OptMapInfo* map)
4316 {
4317 static const OptMapInfo clean_info = {
4318 {0, 0}, {0, 0}, 0,
4319 {
4320 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4321 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4322 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4323 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4324 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4325 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4326 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4327 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4328 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4329 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4330 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4331 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4332 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4333 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4334 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4335 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4336 }
4337 };
4338
4339 xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4340 }
4341
4342 static void
copy_opt_map_info(OptMapInfo * to,OptMapInfo * from)4343 copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4344 {
4345 *to = *from;
4346 }
4347
4348 static void
add_char_opt_map_info(OptMapInfo * map,UChar c,OnigEncoding enc)4349 add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4350 {
4351 if (map->map[c] == 0) {
4352 map->map[c] = 1;
4353 map->value += map_position_value(enc, c);
4354 }
4355 }
4356
4357 static int
add_char_amb_opt_map_info(OptMapInfo * map,UChar * p,UChar * end,OnigEncoding enc,OnigCaseFoldType case_fold_flag)4358 add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4359 OnigEncoding enc, OnigCaseFoldType case_fold_flag)
4360 {
4361 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4362 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4363 int i, n;
4364
4365 add_char_opt_map_info(map, p[0], enc);
4366
4367 case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
4368 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
4369 if (n < 0) return n;
4370
4371 for (i = 0; i < n; i++) {
4372 ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
4373 add_char_opt_map_info(map, buf[0], enc);
4374 }
4375
4376 return 0;
4377 }
4378
4379 static void
select_opt_map_info(OptMapInfo * now,OptMapInfo * alt)4380 select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4381 {
4382 static int z = 1<<15; /* 32768: something big value */
4383
4384 int v1, v2;
4385
4386 if (alt->value == 0) return ;
4387 if (now->value == 0) {
4388 copy_opt_map_info(now, alt);
4389 return ;
4390 }
4391
4392 v1 = z / now->value;
4393 v2 = z / alt->value;
4394 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4395 copy_opt_map_info(now, alt);
4396 }
4397
4398 static int
comp_opt_exact_or_map_info(OptExactInfo * e,OptMapInfo * m)4399 comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4400 {
4401 #define COMP_EM_BASE 20
4402 int ve, vm;
4403
4404 if (m->value <= 0) return -1;
4405
4406 ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2);
4407 vm = COMP_EM_BASE * 5 * 2 / m->value;
4408 return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4409 }
4410
4411 static void
alt_merge_opt_map_info(OnigEncoding enc,OptMapInfo * to,OptMapInfo * add)4412 alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4413 {
4414 int i, val;
4415
4416 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4417 if (to->value == 0) return ;
4418 if (add->value == 0 || to->mmd.max < add->mmd.min) {
4419 clear_opt_map_info(to);
4420 return ;
4421 }
4422
4423 alt_merge_mml(&to->mmd, &add->mmd);
4424
4425 val = 0;
4426 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4427 if (add->map[i])
4428 to->map[i] = 1;
4429
4430 if (to->map[i])
4431 val += map_position_value(enc, i);
4432 }
4433 to->value = val;
4434
4435 alt_merge_opt_anc_info(&to->anc, &add->anc);
4436 }
4437
4438 static void
set_bound_node_opt_info(NodeOptInfo * opt,MinMaxLen * mmd)4439 set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4440 {
4441 copy_mml(&(opt->exb.mmd), mmd);
4442 copy_mml(&(opt->expr.mmd), mmd);
4443 copy_mml(&(opt->map.mmd), mmd);
4444 }
4445
4446 static void
clear_node_opt_info(NodeOptInfo * opt)4447 clear_node_opt_info(NodeOptInfo* opt)
4448 {
4449 clear_mml(&opt->len);
4450 clear_opt_anc_info(&opt->anc);
4451 clear_opt_exact_info(&opt->exb);
4452 clear_opt_exact_info(&opt->exm);
4453 clear_opt_exact_info(&opt->expr);
4454 clear_opt_map_info(&opt->map);
4455 }
4456
4457 static void
copy_node_opt_info(NodeOptInfo * to,NodeOptInfo * from)4458 copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4459 {
4460 *to = *from;
4461 }
4462
4463 static void
concat_left_node_opt_info(OnigEncoding enc,NodeOptInfo * to,NodeOptInfo * add)4464 concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4465 {
4466 int exb_reach, exm_reach;
4467 OptAncInfo tanc;
4468
4469 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4470 copy_opt_anc_info(&to->anc, &tanc);
4471
4472 if (add->exb.len > 0 && to->len.max == 0) {
4473 concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4474 to->len.max, add->len.max);
4475 copy_opt_anc_info(&add->exb.anc, &tanc);
4476 }
4477
4478 if (add->map.value > 0 && to->len.max == 0) {
4479 if (add->map.mmd.max == 0)
4480 add->map.anc.left_anchor |= to->anc.left_anchor;
4481 }
4482
4483 exb_reach = to->exb.reach_end;
4484 exm_reach = to->exm.reach_end;
4485
4486 if (add->len.max != 0)
4487 to->exb.reach_end = to->exm.reach_end = 0;
4488
4489 if (add->exb.len > 0) {
4490 if (exb_reach) {
4491 concat_opt_exact_info(&to->exb, &add->exb, enc);
4492 clear_opt_exact_info(&add->exb);
4493 }
4494 else if (exm_reach) {
4495 concat_opt_exact_info(&to->exm, &add->exb, enc);
4496 clear_opt_exact_info(&add->exb);
4497 }
4498 }
4499 select_opt_exact_info(enc, &to->exm, &add->exb);
4500 select_opt_exact_info(enc, &to->exm, &add->exm);
4501
4502 if (to->expr.len > 0) {
4503 if (add->len.max > 0) {
4504 if (to->expr.len > (int )add->len.max)
4505 to->expr.len = add->len.max;
4506
4507 if (to->expr.mmd.max == 0)
4508 select_opt_exact_info(enc, &to->exb, &to->expr);
4509 else
4510 select_opt_exact_info(enc, &to->exm, &to->expr);
4511 }
4512 }
4513 else if (add->expr.len > 0) {
4514 copy_opt_exact_info(&to->expr, &add->expr);
4515 }
4516
4517 select_opt_map_info(&to->map, &add->map);
4518
4519 add_mml(&to->len, &add->len);
4520 }
4521
4522 static void
alt_merge_node_opt_info(NodeOptInfo * to,NodeOptInfo * add,OptEnv * env)4523 alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4524 {
4525 alt_merge_opt_anc_info (&to->anc, &add->anc);
4526 alt_merge_opt_exact_info(&to->exb, &add->exb, env);
4527 alt_merge_opt_exact_info(&to->exm, &add->exm, env);
4528 alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4529 alt_merge_opt_map_info(env->enc, &to->map, &add->map);
4530
4531 alt_merge_mml(&to->len, &add->len);
4532 }
4533
4534
4535 #define MAX_NODE_OPT_INFO_REF_COUNT 5
4536
4537 static int
optimize_node_left(Node * node,NodeOptInfo * opt,OptEnv * env)4538 optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4539 {
4540 int type;
4541 int r = 0;
4542
4543 clear_node_opt_info(opt);
4544 set_bound_node_opt_info(opt, &env->mmd);
4545
4546 type = NTYPE(node);
4547 switch (type) {
4548 case NT_LIST:
4549 {
4550 OptEnv nenv;
4551 NodeOptInfo nopt;
4552 Node* nd = node;
4553
4554 copy_opt_env(&nenv, env);
4555 do {
4556 r = optimize_node_left(NCAR(nd), &nopt, &nenv);
4557 if (r == 0) {
4558 add_mml(&nenv.mmd, &nopt.len);
4559 concat_left_node_opt_info(env->enc, opt, &nopt);
4560 }
4561 } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
4562 }
4563 break;
4564
4565 case NT_ALT:
4566 {
4567 NodeOptInfo nopt;
4568 Node* nd = node;
4569
4570 do {
4571 r = optimize_node_left(NCAR(nd), &nopt, env);
4572 if (r == 0) {
4573 if (nd == node) copy_node_opt_info(opt, &nopt);
4574 else alt_merge_node_opt_info(opt, &nopt, env);
4575 }
4576 } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
4577 }
4578 break;
4579
4580 case NT_STR:
4581 {
4582 StrNode* sn = NSTR(node);
4583 int slen = sn->end - sn->s;
4584 int is_raw = NSTRING_IS_RAW(node);
4585
4586 if (! NSTRING_IS_AMBIG(node)) {
4587 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4588 NSTRING_IS_RAW(node), env->enc);
4589 if (slen > 0) {
4590 add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
4591 }
4592 set_mml(&opt->len, slen, slen);
4593 }
4594 else {
4595 int max;
4596
4597 if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
4598 int n = onigenc_strlen(env->enc, sn->s, sn->end);
4599 max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
4600 }
4601 else {
4602 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4603 is_raw, env->enc);
4604 opt->exb.ignore_case = 1;
4605
4606 if (slen > 0) {
4607 r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
4608 env->enc, env->case_fold_flag);
4609 if (r != 0) break;
4610 }
4611
4612 max = slen;
4613 }
4614
4615 set_mml(&opt->len, slen, max);
4616 }
4617
4618 if (opt->exb.len == slen)
4619 opt->exb.reach_end = 1;
4620 }
4621 break;
4622
4623 case NT_CCLASS:
4624 {
4625 int i, z;
4626 CClassNode* cc = NCCLASS(node);
4627
4628 /* no need to check ignore case. (setted in setup_tree()) */
4629
4630 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
4631 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4632 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4633
4634 set_mml(&opt->len, min, max);
4635 }
4636 else {
4637 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4638 z = BITSET_AT(cc->bs, i);
4639 if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
4640 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4641 }
4642 }
4643 set_mml(&opt->len, 1, 1);
4644 }
4645 }
4646 break;
4647
4648 case NT_CTYPE:
4649 {
4650 int i, min, max;
4651
4652 max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4653
4654 if (max == 1) {
4655 min = 1;
4656
4657 switch (NCTYPE(node)->ctype) {
4658 case ONIGENC_CTYPE_WORD:
4659 if (NCTYPE(node)->not != 0) {
4660 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4661 if (! ONIGENC_IS_CODE_WORD(env->enc, i)) {
4662 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4663 }
4664 }
4665 }
4666 else {
4667 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4668 if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
4669 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4670 }
4671 }
4672 }
4673 break;
4674 }
4675 }
4676 else {
4677 min = ONIGENC_MBC_MINLEN(env->enc);
4678 }
4679 set_mml(&opt->len, min, max);
4680 }
4681 break;
4682
4683 case NT_CANY:
4684 {
4685 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4686 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4687 set_mml(&opt->len, min, max);
4688 }
4689 break;
4690
4691 case NT_ANCHOR:
4692 switch (NANCHOR(node)->type) {
4693 case ANCHOR_BEGIN_BUF:
4694 case ANCHOR_BEGIN_POSITION:
4695 case ANCHOR_BEGIN_LINE:
4696 case ANCHOR_END_BUF:
4697 case ANCHOR_SEMI_END_BUF:
4698 case ANCHOR_END_LINE:
4699 add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
4700 break;
4701
4702 case ANCHOR_PREC_READ:
4703 {
4704 NodeOptInfo nopt;
4705
4706 r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
4707 if (r == 0) {
4708 if (nopt.exb.len > 0)
4709 copy_opt_exact_info(&opt->expr, &nopt.exb);
4710 else if (nopt.exm.len > 0)
4711 copy_opt_exact_info(&opt->expr, &nopt.exm);
4712
4713 opt->expr.reach_end = 0;
4714
4715 if (nopt.map.value > 0)
4716 copy_opt_map_info(&opt->map, &nopt.map);
4717 }
4718 }
4719 break;
4720
4721 case ANCHOR_PREC_READ_NOT:
4722 case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. */
4723 case ANCHOR_LOOK_BEHIND_NOT:
4724 break;
4725 }
4726 break;
4727
4728 case NT_BREF:
4729 {
4730 int i;
4731 int* backs;
4732 OnigDistance min, max, tmin, tmax;
4733 Node** nodes = SCANENV_MEM_NODES(env->scan_env);
4734 BRefNode* br = NBREF(node);
4735
4736 if (br->state & NST_RECURSION) {
4737 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4738 break;
4739 }
4740 backs = BACKREFS_P(br);
4741 r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
4742 if (r != 0) break;
4743 r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
4744 if (r != 0) break;
4745 for (i = 1; i < br->back_num; i++) {
4746 r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
4747 if (r != 0) break;
4748 r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
4749 if (r != 0) break;
4750 if (min > tmin) min = tmin;
4751 if (max < tmax) max = tmax;
4752 }
4753 if (r == 0) set_mml(&opt->len, min, max);
4754 }
4755 break;
4756
4757 #ifdef USE_SUBEXP_CALL
4758 case NT_CALL:
4759 if (IS_CALL_RECURSION(NCALL(node)))
4760 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4761 else {
4762 OnigOptionType save = env->options;
4763 env->options = NENCLOSE(NCALL(node)->target)->option;
4764 r = optimize_node_left(NCALL(node)->target, opt, env);
4765 env->options = save;
4766 }
4767 break;
4768 #endif
4769
4770 case NT_QTFR:
4771 {
4772 int i;
4773 OnigDistance min, max;
4774 NodeOptInfo nopt;
4775 QtfrNode* qn = NQTFR(node);
4776
4777 r = optimize_node_left(qn->target, &nopt, env);
4778 if (r) break;
4779
4780 if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
4781 if (env->mmd.max == 0 &&
4782 NTYPE(qn->target) == NT_CANY && qn->greedy) {
4783 if (IS_MULTILINE(env->options))
4784 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
4785 else
4786 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
4787 }
4788 }
4789 else {
4790 if (qn->lower > 0) {
4791 copy_node_opt_info(opt, &nopt);
4792 if (nopt.exb.len > 0) {
4793 if (nopt.exb.reach_end) {
4794 for (i = 2; i <= qn->lower &&
4795 ! is_full_opt_exact_info(&opt->exb); i++) {
4796 concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
4797 }
4798 if (i < qn->lower) {
4799 opt->exb.reach_end = 0;
4800 }
4801 }
4802 }
4803
4804 if (qn->lower != qn->upper) {
4805 opt->exb.reach_end = 0;
4806 opt->exm.reach_end = 0;
4807 }
4808 if (qn->lower > 1)
4809 opt->exm.reach_end = 0;
4810 }
4811 }
4812
4813 min = distance_multiply(nopt.len.min, qn->lower);
4814 if (IS_REPEAT_INFINITE(qn->upper))
4815 max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
4816 else
4817 max = distance_multiply(nopt.len.max, qn->upper);
4818
4819 set_mml(&opt->len, min, max);
4820 }
4821 break;
4822
4823 case NT_ENCLOSE:
4824 {
4825 EncloseNode* en = NENCLOSE(node);
4826
4827 switch (en->type) {
4828 case ENCLOSE_OPTION:
4829 {
4830 OnigOptionType save = env->options;
4831
4832 env->options = en->option;
4833 r = optimize_node_left(en->target, opt, env);
4834 env->options = save;
4835 }
4836 break;
4837
4838 case ENCLOSE_MEMORY:
4839 #ifdef USE_SUBEXP_CALL
4840 en->opt_count++;
4841 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
4842 OnigDistance min, max;
4843
4844 min = 0;
4845 max = ONIG_INFINITE_DISTANCE;
4846 if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
4847 if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
4848 set_mml(&opt->len, min, max);
4849 }
4850 else
4851 #endif
4852 {
4853 r = optimize_node_left(en->target, opt, env);
4854
4855 if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
4856 if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
4857 remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
4858 }
4859 }
4860 break;
4861
4862 case ENCLOSE_STOP_BACKTRACK:
4863 r = optimize_node_left(en->target, opt, env);
4864 break;
4865 }
4866 }
4867 break;
4868
4869 default:
4870 #ifdef ONIG_DEBUG
4871 fprintf(stderr, "optimize_node_left: undefined node type %d\n",
4872 NTYPE(node));
4873 #endif
4874 r = ONIGERR_TYPE_BUG;
4875 break;
4876 }
4877
4878 return r;
4879 }
4880
4881 static int
set_optimize_exact_info(regex_t * reg,OptExactInfo * e)4882 set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
4883 {
4884 int r;
4885
4886 if (e->len == 0) return 0;
4887
4888 if (e->ignore_case) {
4889 reg->exact = (UChar* )xmalloc(e->len);
4890 CHECK_NULL_RETURN_MEMERR(reg->exact);
4891 xmemcpy(reg->exact, e->s, e->len);
4892 reg->exact_end = reg->exact + e->len;
4893 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
4894 }
4895 else {
4896 int allow_reverse;
4897
4898 reg->exact = str_dup(e->s, e->s + e->len);
4899 CHECK_NULL_RETURN_MEMERR(reg->exact);
4900 reg->exact_end = reg->exact + e->len;
4901
4902 allow_reverse =
4903 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
4904
4905 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
4906 r = set_bm_skip(reg->exact, reg->exact_end, reg->enc,
4907 reg->map, &(reg->int_map));
4908 if (r) return r;
4909
4910 reg->optimize = (allow_reverse != 0
4911 ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
4912 }
4913 else {
4914 reg->optimize = ONIG_OPTIMIZE_EXACT;
4915 }
4916 }
4917
4918 reg->dmin = e->mmd.min;
4919 reg->dmax = e->mmd.max;
4920
4921 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4922 reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact);
4923 }
4924
4925 return 0;
4926 }
4927
4928 static void
set_optimize_map_info(regex_t * reg,OptMapInfo * m)4929 set_optimize_map_info(regex_t* reg, OptMapInfo* m)
4930 {
4931 int i;
4932
4933 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4934 reg->map[i] = m->map[i];
4935
4936 reg->optimize = ONIG_OPTIMIZE_MAP;
4937 reg->dmin = m->mmd.min;
4938 reg->dmax = m->mmd.max;
4939
4940 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4941 reg->threshold_len = reg->dmin + 1;
4942 }
4943 }
4944
4945 static void
set_sub_anchor(regex_t * reg,OptAncInfo * anc)4946 set_sub_anchor(regex_t* reg, OptAncInfo* anc)
4947 {
4948 reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
4949 reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
4950 }
4951
4952 #ifdef ONIG_DEBUG
4953 static void print_optimize_info(FILE* f, regex_t* reg);
4954 #endif
4955
4956 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ScanEnv * scan_env)4957 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
4958 {
4959
4960 int r;
4961 NodeOptInfo opt;
4962 OptEnv env;
4963
4964 env.enc = reg->enc;
4965 env.options = reg->options;
4966 env.case_fold_flag = reg->case_fold_flag;
4967 env.scan_env = scan_env;
4968 clear_mml(&env.mmd);
4969
4970 r = optimize_node_left(node, &opt, &env);
4971 if (r) return r;
4972
4973 reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
4974 ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML);
4975
4976 reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF);
4977
4978 if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
4979 reg->anchor_dmin = opt.len.min;
4980 reg->anchor_dmax = opt.len.max;
4981 }
4982
4983 if (opt.exb.len > 0 || opt.exm.len > 0) {
4984 select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
4985 if (opt.map.value > 0 &&
4986 comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
4987 goto set_map;
4988 }
4989 else {
4990 r = set_optimize_exact_info(reg, &opt.exb);
4991 set_sub_anchor(reg, &opt.exb.anc);
4992 }
4993 }
4994 else if (opt.map.value > 0) {
4995 set_map:
4996 set_optimize_map_info(reg, &opt.map);
4997 set_sub_anchor(reg, &opt.map.anc);
4998 }
4999 else {
5000 reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
5001 if (opt.len.max == 0)
5002 reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
5003 }
5004
5005 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5006 print_optimize_info(stderr, reg);
5007 #endif
5008 return r;
5009 }
5010
5011 static void
clear_optimize_info(regex_t * reg)5012 clear_optimize_info(regex_t* reg)
5013 {
5014 reg->optimize = ONIG_OPTIMIZE_NONE;
5015 reg->anchor = 0;
5016 reg->anchor_dmin = 0;
5017 reg->anchor_dmax = 0;
5018 reg->sub_anchor = 0;
5019 reg->exact_end = (UChar* )NULL;
5020 reg->threshold_len = 0;
5021 if (IS_NOT_NULL(reg->exact)) {
5022 xfree(reg->exact);
5023 reg->exact = (UChar* )NULL;
5024 }
5025 }
5026
5027 #ifdef ONIG_DEBUG
5028
print_enc_string(FILE * fp,OnigEncoding enc,const UChar * s,const UChar * end)5029 static void print_enc_string(FILE* fp, OnigEncoding enc,
5030 const UChar *s, const UChar *end)
5031 {
5032 fprintf(fp, "\nPATTERN: /");
5033
5034 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5035 const UChar *p;
5036 OnigCodePoint code;
5037
5038 p = s;
5039 while (p < end) {
5040 code = ONIGENC_MBC_TO_CODE(enc, p, end);
5041 if (code >= 0x80) {
5042 fprintf(fp, " 0x%04x ", (int )code);
5043 }
5044 else {
5045 fputc((int )code, fp);
5046 }
5047
5048 p += enclen(enc, p);
5049 }
5050 }
5051 else {
5052 while (s < end) {
5053 fputc((int )*s, fp);
5054 s++;
5055 }
5056 }
5057
5058 fprintf(fp, "/\n");
5059 }
5060
5061 static void
print_distance_range(FILE * f,OnigDistance a,OnigDistance b)5062 print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
5063 {
5064 if (a == ONIG_INFINITE_DISTANCE)
5065 fputs("inf", f);
5066 else
5067 fprintf(f, "(%u)", a);
5068
5069 fputs("-", f);
5070
5071 if (b == ONIG_INFINITE_DISTANCE)
5072 fputs("inf", f);
5073 else
5074 fprintf(f, "(%u)", b);
5075 }
5076
5077 static void
print_anchor(FILE * f,int anchor)5078 print_anchor(FILE* f, int anchor)
5079 {
5080 int q = 0;
5081
5082 fprintf(f, "[");
5083
5084 if (anchor & ANCHOR_BEGIN_BUF) {
5085 fprintf(f, "begin-buf");
5086 q = 1;
5087 }
5088 if (anchor & ANCHOR_BEGIN_LINE) {
5089 if (q) fprintf(f, ", ");
5090 q = 1;
5091 fprintf(f, "begin-line");
5092 }
5093 if (anchor & ANCHOR_BEGIN_POSITION) {
5094 if (q) fprintf(f, ", ");
5095 q = 1;
5096 fprintf(f, "begin-pos");
5097 }
5098 if (anchor & ANCHOR_END_BUF) {
5099 if (q) fprintf(f, ", ");
5100 q = 1;
5101 fprintf(f, "end-buf");
5102 }
5103 if (anchor & ANCHOR_SEMI_END_BUF) {
5104 if (q) fprintf(f, ", ");
5105 q = 1;
5106 fprintf(f, "semi-end-buf");
5107 }
5108 if (anchor & ANCHOR_END_LINE) {
5109 if (q) fprintf(f, ", ");
5110 q = 1;
5111 fprintf(f, "end-line");
5112 }
5113 if (anchor & ANCHOR_ANYCHAR_STAR) {
5114 if (q) fprintf(f, ", ");
5115 q = 1;
5116 fprintf(f, "anychar-star");
5117 }
5118 if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
5119 if (q) fprintf(f, ", ");
5120 fprintf(f, "anychar-star-pl");
5121 }
5122
5123 fprintf(f, "]");
5124 }
5125
5126 static void
print_optimize_info(FILE * f,regex_t * reg)5127 print_optimize_info(FILE* f, regex_t* reg)
5128 {
5129 static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5130 "EXACT_IC", "MAP" };
5131
5132 fprintf(f, "optimize: %s\n", on[reg->optimize]);
5133 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
5134 if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
5135 print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
5136 fprintf(f, "\n");
5137
5138 if (reg->optimize) {
5139 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
5140 fprintf(f, "\n");
5141 }
5142 fprintf(f, "\n");
5143
5144 if (reg->exact) {
5145 UChar *p;
5146 fprintf(f, "exact: [");
5147 for (p = reg->exact; p < reg->exact_end; p++) {
5148 fputc(*p, f);
5149 }
5150 fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact));
5151 }
5152 else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
5153 int c, i, n = 0;
5154
5155 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5156 if (reg->map[i]) n++;
5157
5158 fprintf(f, "map: n=%d\n", n);
5159 if (n > 0) {
5160 c = 0;
5161 fputc('[', f);
5162 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
5163 if (reg->map[i] != 0) {
5164 if (c > 0) fputs(", ", f);
5165 c++;
5166 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
5167 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
5168 fputc(i, f);
5169 else
5170 fprintf(f, "%d", i);
5171 }
5172 }
5173 fprintf(f, "]\n");
5174 }
5175 }
5176 }
5177 #endif /* ONIG_DEBUG */
5178
5179
5180 extern void
onig_free_body(regex_t * reg)5181 onig_free_body(regex_t* reg)
5182 {
5183 if (IS_NOT_NULL(reg)) {
5184 if (IS_NOT_NULL(reg->p)) xfree(reg->p);
5185 if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
5186 if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map);
5187 if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
5188 if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
5189 if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain);
5190
5191 #ifdef USE_NAMED_GROUP
5192 onig_names_free(reg);
5193 #endif
5194 }
5195 }
5196
5197 extern void
onig_free(regex_t * reg)5198 onig_free(regex_t* reg)
5199 {
5200 if (IS_NOT_NULL(reg)) {
5201 onig_free_body(reg);
5202 xfree(reg);
5203 }
5204 }
5205
5206 #define REGEX_TRANSFER(to,from) do {\
5207 (to)->state = ONIG_STATE_MODIFY;\
5208 onig_free_body(to);\
5209 xmemcpy(to, from, sizeof(regex_t));\
5210 xfree(from);\
5211 } while (0)
5212
5213 extern void
onig_transfer(regex_t * to,regex_t * from)5214 onig_transfer(regex_t* to, regex_t* from)
5215 {
5216 THREAD_ATOMIC_START;
5217 REGEX_TRANSFER(to, from);
5218 THREAD_ATOMIC_END;
5219 }
5220
5221 #define REGEX_CHAIN_HEAD(reg) do {\
5222 while (IS_NOT_NULL((reg)->chain)) {\
5223 (reg) = (reg)->chain;\
5224 }\
5225 } while (0)
5226
5227 extern void
onig_chain_link_add(regex_t * to,regex_t * add)5228 onig_chain_link_add(regex_t* to, regex_t* add)
5229 {
5230 THREAD_ATOMIC_START;
5231 REGEX_CHAIN_HEAD(to);
5232 to->chain = add;
5233 THREAD_ATOMIC_END;
5234 }
5235
5236 extern void
onig_chain_reduce(regex_t * reg)5237 onig_chain_reduce(regex_t* reg)
5238 {
5239 regex_t *head, *prev;
5240
5241 prev = reg;
5242 head = prev->chain;
5243 if (IS_NOT_NULL(head)) {
5244 reg->state = ONIG_STATE_MODIFY;
5245 while (IS_NOT_NULL(head->chain)) {
5246 prev = head;
5247 head = head->chain;
5248 }
5249 prev->chain = (regex_t* )NULL;
5250 REGEX_TRANSFER(reg, head);
5251 }
5252 }
5253
5254 #ifdef ONIG_DEBUG
5255 static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg));
5256 #endif
5257 #ifdef ONIG_DEBUG_PARSE_TREE
5258 static void print_tree P_((FILE* f, Node* node));
5259 #endif
5260
5261 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)5262 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5263 OnigErrorInfo* einfo)
5264 {
5265 #define COMPILE_INIT_SIZE 20
5266
5267 int r, init_size;
5268 Node* root;
5269 ScanEnv scan_env;
5270 #ifdef USE_SUBEXP_CALL
5271 UnsetAddrList uslist;
5272 #endif
5273
5274 if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5275
5276 reg->state = ONIG_STATE_COMPILING;
5277
5278 #ifdef ONIG_DEBUG
5279 print_enc_string(stderr, reg->enc, pattern, pattern_end);
5280 #endif
5281
5282 if (reg->alloc == 0) {
5283 init_size = (pattern_end - pattern) * 2;
5284 if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5285 r = BBUF_INIT(reg, init_size);
5286 if (r != 0) goto end;
5287 }
5288 else
5289 reg->used = 0;
5290
5291 reg->num_mem = 0;
5292 reg->num_repeat = 0;
5293 reg->num_null_check = 0;
5294 reg->repeat_range_alloc = 0;
5295 reg->repeat_range = (OnigRepeatRange* )NULL;
5296 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5297 reg->num_comb_exp_check = 0;
5298 #endif
5299
5300 r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5301 if (r != 0) goto err;
5302
5303 #ifdef USE_NAMED_GROUP
5304 /* mixed use named group and no-named group */
5305 if (scan_env.num_named > 0 &&
5306 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5307 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5308 if (scan_env.num_named != scan_env.num_mem)
5309 r = disable_noname_group_capture(&root, reg, &scan_env);
5310 else
5311 r = numbered_ref_check(root);
5312
5313 if (r != 0) goto err;
5314 }
5315 #endif
5316
5317 #ifdef USE_SUBEXP_CALL
5318 if (scan_env.num_call > 0) {
5319 r = unset_addr_list_init(&uslist, scan_env.num_call);
5320 if (r != 0) goto err;
5321 scan_env.unset_addr_list = &uslist;
5322 r = setup_subexp_call(root, &scan_env);
5323 if (r != 0) goto err_unset;
5324 r = subexp_recursive_check_trav(root, &scan_env);
5325 if (r < 0) goto err_unset;
5326 r = subexp_inf_recursive_check_trav(root, &scan_env);
5327 if (r != 0) goto err_unset;
5328
5329 reg->num_call = scan_env.num_call;
5330 }
5331 else
5332 reg->num_call = 0;
5333 #endif
5334
5335 r = setup_tree(root, reg, 0, &scan_env);
5336 if (r != 0) goto err_unset;
5337
5338 #ifdef ONIG_DEBUG_PARSE_TREE
5339 print_tree(stderr, root);
5340 #endif
5341
5342 reg->capture_history = scan_env.capture_history;
5343 reg->bt_mem_start = scan_env.bt_mem_start;
5344 reg->bt_mem_start |= reg->capture_history;
5345 if (IS_FIND_CONDITION(reg->options))
5346 BIT_STATUS_ON_ALL(reg->bt_mem_end);
5347 else {
5348 reg->bt_mem_end = scan_env.bt_mem_end;
5349 reg->bt_mem_end |= reg->capture_history;
5350 }
5351
5352 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5353 if (scan_env.backrefed_mem == 0
5354 #ifdef USE_SUBEXP_CALL
5355 || scan_env.num_call == 0
5356 #endif
5357 ) {
5358 setup_comb_exp_check(root, 0, &scan_env);
5359 #ifdef USE_SUBEXP_CALL
5360 if (scan_env.has_recursion != 0) {
5361 scan_env.num_comb_exp_check = 0;
5362 }
5363 else
5364 #endif
5365 if (scan_env.comb_exp_max_regnum > 0) {
5366 int i;
5367 for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5368 if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5369 scan_env.num_comb_exp_check = 0;
5370 break;
5371 }
5372 }
5373 }
5374 }
5375
5376 reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5377 #endif
5378
5379 clear_optimize_info(reg);
5380 #ifndef ONIG_DONT_OPTIMIZE
5381 r = set_optimize_info_from_tree(root, reg, &scan_env);
5382 if (r != 0) goto err_unset;
5383 #endif
5384
5385 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5386 xfree(scan_env.mem_nodes_dynamic);
5387 scan_env.mem_nodes_dynamic = (Node** )NULL;
5388 }
5389
5390 r = compile_tree(root, reg);
5391 if (r == 0) {
5392 r = add_opcode(reg, OP_END);
5393 #ifdef USE_SUBEXP_CALL
5394 if (scan_env.num_call > 0) {
5395 r = unset_addr_list_fix(&uslist, reg);
5396 unset_addr_list_end(&uslist);
5397 if (r) goto err;
5398 }
5399 #endif
5400
5401 if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5402 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5403 else {
5404 if (reg->bt_mem_start != 0)
5405 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5406 else
5407 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5408 }
5409 }
5410 #ifdef USE_SUBEXP_CALL
5411 else if (scan_env.num_call > 0) {
5412 unset_addr_list_end(&uslist);
5413 }
5414 #endif
5415 onig_node_free(root);
5416
5417 #ifdef ONIG_DEBUG_COMPILE
5418 #ifdef USE_NAMED_GROUP
5419 onig_print_names(stderr, reg);
5420 #endif
5421 print_compiled_byte_code_list(stderr, reg);
5422 #endif
5423
5424 end:
5425 reg->state = ONIG_STATE_NORMAL;
5426 return r;
5427
5428 err_unset:
5429 #ifdef USE_SUBEXP_CALL
5430 if (scan_env.num_call > 0) {
5431 unset_addr_list_end(&uslist);
5432 }
5433 #endif
5434 err:
5435 if (IS_NOT_NULL(scan_env.error)) {
5436 if (IS_NOT_NULL(einfo)) {
5437 einfo->enc = scan_env.enc;
5438 einfo->par = scan_env.error;
5439 einfo->par_end = scan_env.error_end;
5440 }
5441 }
5442
5443 onig_node_free(root);
5444 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
5445 xfree(scan_env.mem_nodes_dynamic);
5446 return r;
5447 }
5448
5449 #ifdef USE_RECOMPILE_API
5450 extern int
onig_recompile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5451 onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5452 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5453 OnigErrorInfo* einfo)
5454 {
5455 int r;
5456 regex_t *new_reg;
5457
5458 r = onig_new(&new_reg, pattern, pattern_end, option, enc, syntax, einfo);
5459 if (r) return r;
5460 if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
5461 onig_transfer(reg, new_reg);
5462 }
5463 else {
5464 onig_chain_link_add(reg, new_reg);
5465 }
5466 return 0;
5467 }
5468 #endif
5469
5470 static int onig_inited = 0;
5471
5472 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)5473 onig_reg_init(regex_t* reg, OnigOptionType option,
5474 OnigCaseFoldType case_fold_flag,
5475 OnigEncoding enc, OnigSyntaxType* syntax)
5476 {
5477 if (! onig_inited)
5478 onig_init();
5479
5480 if (IS_NULL(reg))
5481 return ONIGERR_INVALID_ARGUMENT;
5482
5483 if (ONIGENC_IS_UNDEF(enc))
5484 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
5485
5486 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5487 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5488 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5489 }
5490
5491 (reg)->state = ONIG_STATE_MODIFY;
5492
5493 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5494 option |= syntax->options;
5495 option &= ~ONIG_OPTION_SINGLELINE;
5496 }
5497 else
5498 option |= syntax->options;
5499
5500 (reg)->enc = enc;
5501 (reg)->options = option;
5502 (reg)->syntax = syntax;
5503 (reg)->optimize = 0;
5504 (reg)->exact = (UChar* )NULL;
5505 (reg)->int_map = (int* )NULL;
5506 (reg)->int_map_backward = (int* )NULL;
5507 (reg)->chain = (regex_t* )NULL;
5508
5509 (reg)->p = (UChar* )NULL;
5510 (reg)->alloc = 0;
5511 (reg)->used = 0;
5512 (reg)->name_table = (void* )NULL;
5513
5514 (reg)->case_fold_flag = case_fold_flag;
5515 return 0;
5516 }
5517
5518 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5519 onig_new_without_alloc(regex_t* reg, const UChar* pattern,
5520 const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
5521 OnigSyntaxType* syntax, OnigErrorInfo* einfo)
5522 {
5523 int r;
5524
5525 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5526 if (r) return r;
5527
5528 r = onig_compile(reg, pattern, pattern_end, einfo);
5529 return r;
5530 }
5531
5532 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5533 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5534 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5535 OnigErrorInfo* einfo)
5536 {
5537 int r;
5538
5539 *reg = (regex_t* )xmalloc(sizeof(regex_t));
5540 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
5541
5542 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5543 if (r) goto err;
5544
5545 r = onig_compile(*reg, pattern, pattern_end, einfo);
5546 if (r) {
5547 err:
5548 onig_free(*reg);
5549 *reg = NULL;
5550 }
5551 return r;
5552 }
5553
5554
5555 extern int
onig_init(void)5556 onig_init(void)
5557 {
5558 if (onig_inited != 0)
5559 return 0;
5560
5561 THREAD_SYSTEM_INIT;
5562 THREAD_ATOMIC_START;
5563
5564 onig_inited = 1;
5565
5566 onigenc_init();
5567 /* onigenc_set_default_caseconv_table((UChar* )0); */
5568
5569 #ifdef ONIG_DEBUG_STATISTICS
5570 onig_statistics_init();
5571 #endif
5572
5573 THREAD_ATOMIC_END;
5574 return 0;
5575 }
5576
5577
5578 static OnigEndCallListItemType* EndCallTop;
5579
onig_add_end_call(void (* func)(void))5580 extern void onig_add_end_call(void (*func)(void))
5581 {
5582 OnigEndCallListItemType* item;
5583
5584 item = (OnigEndCallListItemType* )xmalloc(sizeof(*item));
5585 if (item == 0) return ;
5586
5587 item->next = EndCallTop;
5588 item->func = func;
5589
5590 EndCallTop = item;
5591 }
5592
5593 static void
exec_end_call_list(void)5594 exec_end_call_list(void)
5595 {
5596 OnigEndCallListItemType* prev;
5597 void (*func)(void);
5598
5599 while (EndCallTop != 0) {
5600 func = EndCallTop->func;
5601 (*func)();
5602
5603 prev = EndCallTop;
5604 EndCallTop = EndCallTop->next;
5605 xfree(prev);
5606 }
5607 }
5608
5609 extern int
onig_end(void)5610 onig_end(void)
5611 {
5612 THREAD_ATOMIC_START;
5613
5614 exec_end_call_list();
5615
5616 #ifdef ONIG_DEBUG_STATISTICS
5617 onig_print_statistics(stderr);
5618 #endif
5619
5620 #ifdef USE_SHARED_CCLASS_TABLE
5621 onig_free_shared_cclass_table();
5622 #endif
5623
5624 #ifdef USE_PARSE_TREE_NODE_RECYCLE
5625 onig_free_node_list();
5626 #endif
5627
5628 onig_inited = 0;
5629
5630 THREAD_ATOMIC_END;
5631 THREAD_SYSTEM_END;
5632 return 0;
5633 }
5634
5635 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)5636 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
5637 {
5638 OnigCodePoint n, *data;
5639 OnigCodePoint low, high, x;
5640
5641 GET_CODE_POINT(n, p);
5642 data = (OnigCodePoint* )p;
5643 data++;
5644
5645 for (low = 0, high = n; low < high; ) {
5646 x = (low + high) >> 1;
5647 if (code > data[x * 2 + 1])
5648 low = x + 1;
5649 else
5650 high = x;
5651 }
5652
5653 return ((low < n && code >= data[low * 2]) ? 1 : 0);
5654 }
5655
5656 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,CClassNode * cc)5657 onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
5658 {
5659 int found;
5660
5661 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
5662 if (IS_NULL(cc->mbuf)) {
5663 found = 0;
5664 }
5665 else {
5666 found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
5667 }
5668 }
5669 else {
5670 found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
5671 }
5672
5673 if (IS_NCCLASS_NOT(cc))
5674 return !found;
5675 else
5676 return found;
5677 }
5678
5679 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)5680 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
5681 {
5682 int len;
5683
5684 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5685 len = 2;
5686 }
5687 else {
5688 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
5689 }
5690 return onig_is_code_in_cc_len(len, code, cc);
5691 }
5692
5693
5694 #ifdef ONIG_DEBUG
5695
5696 /* arguments type */
5697 #define ARG_SPECIAL -1
5698 #define ARG_NON 0
5699 #define ARG_RELADDR 1
5700 #define ARG_ABSADDR 2
5701 #define ARG_LENGTH 3
5702 #define ARG_MEMNUM 4
5703 #define ARG_OPTION 5
5704 #define ARG_STATE_CHECK 6
5705
5706 OnigOpInfoType OnigOpInfo[] = {
5707 { OP_FINISH, "finish", ARG_NON },
5708 { OP_END, "end", ARG_NON },
5709 { OP_EXACT1, "exact1", ARG_SPECIAL },
5710 { OP_EXACT2, "exact2", ARG_SPECIAL },
5711 { OP_EXACT3, "exact3", ARG_SPECIAL },
5712 { OP_EXACT4, "exact4", ARG_SPECIAL },
5713 { OP_EXACT5, "exact5", ARG_SPECIAL },
5714 { OP_EXACTN, "exactn", ARG_SPECIAL },
5715 { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
5716 { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
5717 { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
5718 { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
5719 { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
5720 { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
5721 { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
5722 { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
5723 { OP_CCLASS, "cclass", ARG_SPECIAL },
5724 { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
5725 { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
5726 { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
5727 { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
5728 { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
5729 { OP_CCLASS_NODE, "cclass-node", ARG_SPECIAL },
5730 { OP_ANYCHAR, "anychar", ARG_NON },
5731 { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
5732 { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
5733 { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
5734 { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
5735 { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
5736 { OP_WORD, "word", ARG_NON },
5737 { OP_NOT_WORD, "not-word", ARG_NON },
5738 { OP_WORD_BOUND, "word-bound", ARG_NON },
5739 { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
5740 { OP_WORD_BEGIN, "word-begin", ARG_NON },
5741 { OP_WORD_END, "word-end", ARG_NON },
5742 { OP_BEGIN_BUF, "begin-buf", ARG_NON },
5743 { OP_END_BUF, "end-buf", ARG_NON },
5744 { OP_BEGIN_LINE, "begin-line", ARG_NON },
5745 { OP_END_LINE, "end-line", ARG_NON },
5746 { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
5747 { OP_BEGIN_POSITION, "begin-position", ARG_NON },
5748 { OP_BACKREF1, "backref1", ARG_NON },
5749 { OP_BACKREF2, "backref2", ARG_NON },
5750 { OP_BACKREFN, "backrefn", ARG_MEMNUM },
5751 { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
5752 { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
5753 { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
5754 { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL },
5755 { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
5756 { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
5757 { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
5758 { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
5759 { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
5760 { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
5761 { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
5762 { OP_SET_OPTION, "set-option", ARG_OPTION },
5763 { OP_FAIL, "fail", ARG_NON },
5764 { OP_JUMP, "jump", ARG_RELADDR },
5765 { OP_PUSH, "push", ARG_RELADDR },
5766 { OP_POP, "pop", ARG_NON },
5767 { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
5768 { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
5769 { OP_REPEAT, "repeat", ARG_SPECIAL },
5770 { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
5771 { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
5772 { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
5773 { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
5774 { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
5775 { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
5776 { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
5777 { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
5778 { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
5779 { OP_PUSH_POS, "push-pos", ARG_NON },
5780 { OP_POP_POS, "pop-pos", ARG_NON },
5781 { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
5782 { OP_FAIL_POS, "fail-pos", ARG_NON },
5783 { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
5784 { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
5785 { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
5786 { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
5787 { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
5788 { OP_CALL, "call", ARG_ABSADDR },
5789 { OP_RETURN, "return", ARG_NON },
5790 { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
5791 { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
5792 { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
5793 { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
5794 { OP_STATE_CHECK_ANYCHAR_ML_STAR,
5795 "state-check-anychar-ml*", ARG_STATE_CHECK },
5796 { -1, "", ARG_NON }
5797 };
5798
5799 static char*
op2name(int opcode)5800 op2name(int opcode)
5801 {
5802 int i;
5803
5804 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5805 if (opcode == OnigOpInfo[i].opcode)
5806 return OnigOpInfo[i].name;
5807 }
5808 return "";
5809 }
5810
5811 static int
op2arg_type(int opcode)5812 op2arg_type(int opcode)
5813 {
5814 int i;
5815
5816 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5817 if (opcode == OnigOpInfo[i].opcode)
5818 return OnigOpInfo[i].arg_type;
5819 }
5820 return ARG_SPECIAL;
5821 }
5822
5823 static void
Indent(FILE * f,int indent)5824 Indent(FILE* f, int indent)
5825 {
5826 int i;
5827 for (i = 0; i < indent; i++) putc(' ', f);
5828 }
5829
5830 static void
p_string(FILE * f,int len,UChar * s)5831 p_string(FILE* f, int len, UChar* s)
5832 {
5833 fputs(":", f);
5834 while (len-- > 0) { fputc(*s++, f); }
5835 }
5836
5837 static void
p_len_string(FILE * f,LengthType len,int mb_len,UChar * s)5838 p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
5839 {
5840 int x = len * mb_len;
5841
5842 fprintf(f, ":%d:", len);
5843 while (x-- > 0) { fputc(*s++, f); }
5844 }
5845
5846 extern void
onig_print_compiled_byte_code(FILE * f,UChar * bp,UChar ** nextp,OnigEncoding enc)5847 onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp,
5848 OnigEncoding enc)
5849 {
5850 int i, n, arg_type;
5851 RelAddrType addr;
5852 LengthType len;
5853 MemNumType mem;
5854 StateCheckNumType scn;
5855 OnigCodePoint code;
5856 UChar *q;
5857
5858 fprintf(f, "[%s", op2name(*bp));
5859 arg_type = op2arg_type(*bp);
5860 if (arg_type != ARG_SPECIAL) {
5861 bp++;
5862 switch (arg_type) {
5863 case ARG_NON:
5864 break;
5865 case ARG_RELADDR:
5866 GET_RELADDR_INC(addr, bp);
5867 fprintf(f, ":(%d)", addr);
5868 break;
5869 case ARG_ABSADDR:
5870 GET_ABSADDR_INC(addr, bp);
5871 fprintf(f, ":(%d)", addr);
5872 break;
5873 case ARG_LENGTH:
5874 GET_LENGTH_INC(len, bp);
5875 fprintf(f, ":%d", len);
5876 break;
5877 case ARG_MEMNUM:
5878 mem = *((MemNumType* )bp);
5879 bp += SIZE_MEMNUM;
5880 fprintf(f, ":%d", mem);
5881 break;
5882 case ARG_OPTION:
5883 {
5884 OnigOptionType option = *((OnigOptionType* )bp);
5885 bp += SIZE_OPTION;
5886 fprintf(f, ":%d", option);
5887 }
5888 break;
5889
5890 case ARG_STATE_CHECK:
5891 scn = *((StateCheckNumType* )bp);
5892 bp += SIZE_STATE_CHECK_NUM;
5893 fprintf(f, ":%d", scn);
5894 break;
5895 }
5896 }
5897 else {
5898 switch (*bp++) {
5899 case OP_EXACT1:
5900 case OP_ANYCHAR_STAR_PEEK_NEXT:
5901 case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
5902 p_string(f, 1, bp++); break;
5903 case OP_EXACT2:
5904 p_string(f, 2, bp); bp += 2; break;
5905 case OP_EXACT3:
5906 p_string(f, 3, bp); bp += 3; break;
5907 case OP_EXACT4:
5908 p_string(f, 4, bp); bp += 4; break;
5909 case OP_EXACT5:
5910 p_string(f, 5, bp); bp += 5; break;
5911 case OP_EXACTN:
5912 GET_LENGTH_INC(len, bp);
5913 p_len_string(f, len, 1, bp);
5914 bp += len;
5915 break;
5916
5917 case OP_EXACTMB2N1:
5918 p_string(f, 2, bp); bp += 2; break;
5919 case OP_EXACTMB2N2:
5920 p_string(f, 4, bp); bp += 4; break;
5921 case OP_EXACTMB2N3:
5922 p_string(f, 6, bp); bp += 6; break;
5923 case OP_EXACTMB2N:
5924 GET_LENGTH_INC(len, bp);
5925 p_len_string(f, len, 2, bp);
5926 bp += len * 2;
5927 break;
5928 case OP_EXACTMB3N:
5929 GET_LENGTH_INC(len, bp);
5930 p_len_string(f, len, 3, bp);
5931 bp += len * 3;
5932 break;
5933 case OP_EXACTMBN:
5934 {
5935 int mb_len;
5936
5937 GET_LENGTH_INC(mb_len, bp);
5938 GET_LENGTH_INC(len, bp);
5939 fprintf(f, ":%d:%d:", mb_len, len);
5940 n = len * mb_len;
5941 while (n-- > 0) { fputc(*bp++, f); }
5942 }
5943 break;
5944
5945 case OP_EXACT1_IC:
5946 len = enclen(enc, bp);
5947 p_string(f, len, bp);
5948 bp += len;
5949 break;
5950 case OP_EXACTN_IC:
5951 GET_LENGTH_INC(len, bp);
5952 p_len_string(f, len, 1, bp);
5953 bp += len;
5954 break;
5955
5956 case OP_CCLASS:
5957 n = bitset_on_num((BitSetRef )bp);
5958 bp += SIZE_BITSET;
5959 fprintf(f, ":%d", n);
5960 break;
5961
5962 case OP_CCLASS_NOT:
5963 n = bitset_on_num((BitSetRef )bp);
5964 bp += SIZE_BITSET;
5965 fprintf(f, ":%d", n);
5966 break;
5967
5968 case OP_CCLASS_MB:
5969 case OP_CCLASS_MB_NOT:
5970 GET_LENGTH_INC(len, bp);
5971 q = bp;
5972 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5973 ALIGNMENT_RIGHT(q);
5974 #endif
5975 GET_CODE_POINT(code, q);
5976 bp += len;
5977 fprintf(f, ":%d:%d", (int )code, len);
5978 break;
5979
5980 case OP_CCLASS_MIX:
5981 case OP_CCLASS_MIX_NOT:
5982 n = bitset_on_num((BitSetRef )bp);
5983 bp += SIZE_BITSET;
5984 GET_LENGTH_INC(len, bp);
5985 q = bp;
5986 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5987 ALIGNMENT_RIGHT(q);
5988 #endif
5989 GET_CODE_POINT(code, q);
5990 bp += len;
5991 fprintf(f, ":%d:%d:%d", n, (int )code, len);
5992 break;
5993
5994 case OP_CCLASS_NODE:
5995 {
5996 CClassNode *cc;
5997
5998 GET_POINTER_INC(cc, bp);
5999 n = bitset_on_num(cc->bs);
6000 fprintf(f, ":%u:%d", (unsigned int )cc, n);
6001 }
6002 break;
6003
6004 case OP_BACKREFN_IC:
6005 mem = *((MemNumType* )bp);
6006 bp += SIZE_MEMNUM;
6007 fprintf(f, ":%d", mem);
6008 break;
6009
6010 case OP_BACKREF_MULTI_IC:
6011 case OP_BACKREF_MULTI:
6012 fputs(" ", f);
6013 GET_LENGTH_INC(len, bp);
6014 for (i = 0; i < len; i++) {
6015 GET_MEMNUM_INC(mem, bp);
6016 if (i > 0) fputs(", ", f);
6017 fprintf(f, "%d", mem);
6018 }
6019 break;
6020
6021 case OP_BACKREF_WITH_LEVEL:
6022 {
6023 OnigOptionType option;
6024 LengthType level;
6025
6026 GET_OPTION_INC(option, bp);
6027 fprintf(f, ":%d", option);
6028 GET_LENGTH_INC(level, bp);
6029 fprintf(f, ":%d", level);
6030
6031 fputs(" ", f);
6032 GET_LENGTH_INC(len, bp);
6033 for (i = 0; i < len; i++) {
6034 GET_MEMNUM_INC(mem, bp);
6035 if (i > 0) fputs(", ", f);
6036 fprintf(f, "%d", mem);
6037 }
6038 }
6039 break;
6040
6041 case OP_REPEAT:
6042 case OP_REPEAT_NG:
6043 {
6044 mem = *((MemNumType* )bp);
6045 bp += SIZE_MEMNUM;
6046 addr = *((RelAddrType* )bp);
6047 bp += SIZE_RELADDR;
6048 fprintf(f, ":%d:%d", mem, addr);
6049 }
6050 break;
6051
6052 case OP_PUSH_OR_JUMP_EXACT1:
6053 case OP_PUSH_IF_PEEK_NEXT:
6054 addr = *((RelAddrType* )bp);
6055 bp += SIZE_RELADDR;
6056 fprintf(f, ":(%d)", addr);
6057 p_string(f, 1, bp);
6058 bp += 1;
6059 break;
6060
6061 case OP_LOOK_BEHIND:
6062 GET_LENGTH_INC(len, bp);
6063 fprintf(f, ":%d", len);
6064 break;
6065
6066 case OP_PUSH_LOOK_BEHIND_NOT:
6067 GET_RELADDR_INC(addr, bp);
6068 GET_LENGTH_INC(len, bp);
6069 fprintf(f, ":%d:(%d)", len, addr);
6070 break;
6071
6072 case OP_STATE_CHECK_PUSH:
6073 case OP_STATE_CHECK_PUSH_OR_JUMP:
6074 scn = *((StateCheckNumType* )bp);
6075 bp += SIZE_STATE_CHECK_NUM;
6076 addr = *((RelAddrType* )bp);
6077 bp += SIZE_RELADDR;
6078 fprintf(f, ":%d:(%d)", scn, addr);
6079 break;
6080
6081 default:
6082 fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
6083 *--bp);
6084 }
6085 }
6086 fputs("]", f);
6087 if (nextp) *nextp = bp;
6088 }
6089
6090 static void
print_compiled_byte_code_list(FILE * f,regex_t * reg)6091 print_compiled_byte_code_list(FILE* f, regex_t* reg)
6092 {
6093 int ncode;
6094 UChar* bp = reg->p;
6095 UChar* end = reg->p + reg->used;
6096
6097 fprintf(f, "code length: %d\n", reg->used);
6098
6099 ncode = 0;
6100 while (bp < end) {
6101 ncode++;
6102 if (bp > reg->p) {
6103 if (ncode % 5 == 0)
6104 fprintf(f, "\n");
6105 else
6106 fputs(" ", f);
6107 }
6108 onig_print_compiled_byte_code(f, bp, &bp, reg->enc);
6109 }
6110
6111 fprintf(f, "\n");
6112 }
6113
6114 static void
print_indent_tree(FILE * f,Node * node,int indent)6115 print_indent_tree(FILE* f, Node* node, int indent)
6116 {
6117 int i, type;
6118 int add = 3;
6119 UChar* p;
6120
6121 Indent(f, indent);
6122 if (IS_NULL(node)) {
6123 fprintf(f, "ERROR: null node!!!\n");
6124 exit (0);
6125 }
6126
6127 type = NTYPE(node);
6128 switch (type) {
6129 case NT_LIST:
6130 case NT_ALT:
6131 if (NTYPE(node) == NT_LIST)
6132 fprintf(f, "<list:%x>\n", (int )node);
6133 else
6134 fprintf(f, "<alt:%x>\n", (int )node);
6135
6136 print_indent_tree(f, NCAR(node), indent + add);
6137 while (IS_NOT_NULL(node = NCDR(node))) {
6138 if (NTYPE(node) != type) {
6139 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
6140 exit(0);
6141 }
6142 print_indent_tree(f, NCAR(node), indent + add);
6143 }
6144 break;
6145
6146 case NT_STR:
6147 fprintf(f, "<string%s:%x>",
6148 (NSTRING_IS_RAW(node) ? "-raw" : ""), (int )node);
6149 for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
6150 if (*p >= 0x20 && *p < 0x7f)
6151 fputc(*p, f);
6152 else {
6153 fprintf(f, " 0x%02x", *p);
6154 }
6155 }
6156 break;
6157
6158 case NT_CCLASS:
6159 fprintf(f, "<cclass:%x>", (int )node);
6160 if (IS_NCCLASS_NOT(NCCLASS(node))) fputs(" not", f);
6161 if (NCCLASS(node)->mbuf) {
6162 BBuf* bbuf = NCCLASS(node)->mbuf;
6163 for (i = 0; i < bbuf->used; i++) {
6164 if (i > 0) fprintf(f, ",");
6165 fprintf(f, "%0x", bbuf->p[i]);
6166 }
6167 }
6168 break;
6169
6170 case NT_CTYPE:
6171 fprintf(f, "<ctype:%x> ", (int )node);
6172 switch (NCTYPE(node)->ctype) {
6173 case ONIGENC_CTYPE_WORD:
6174 if (NCTYPE(node)->not != 0)
6175 fputs("not word", f);
6176 else
6177 fputs("word", f);
6178 break;
6179
6180 default:
6181 fprintf(f, "ERROR: undefined ctype.\n");
6182 exit(0);
6183 }
6184 break;
6185
6186 case NT_CANY:
6187 fprintf(f, "<anychar:%x>", (int )node);
6188 break;
6189
6190 case NT_ANCHOR:
6191 fprintf(f, "<anchor:%x> ", (int )node);
6192 switch (NANCHOR(node)->type) {
6193 case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
6194 case ANCHOR_END_BUF: fputs("end buf", f); break;
6195 case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
6196 case ANCHOR_END_LINE: fputs("end line", f); break;
6197 case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
6198 case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
6199
6200 case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
6201 case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
6202 #ifdef USE_WORD_BEGIN_END
6203 case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
6204 case ANCHOR_WORD_END: fputs("word end", f); break;
6205 #endif
6206 case ANCHOR_PREC_READ: fputs("prec read", f); break;
6207 case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); break;
6208 case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); break;
6209 case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break;
6210
6211 default:
6212 fprintf(f, "ERROR: undefined anchor type.\n");
6213 break;
6214 }
6215 break;
6216
6217 case NT_BREF:
6218 {
6219 int* p;
6220 BRefNode* br = NBREF(node);
6221 p = BACKREFS_P(br);
6222 fprintf(f, "<backref:%x>", (int )node);
6223 for (i = 0; i < br->back_num; i++) {
6224 if (i > 0) fputs(", ", f);
6225 fprintf(f, "%d", p[i]);
6226 }
6227 }
6228 break;
6229
6230 #ifdef USE_SUBEXP_CALL
6231 case NT_CALL:
6232 {
6233 CallNode* cn = NCALL(node);
6234 fprintf(f, "<call:%x>", (int )node);
6235 p_string(f, cn->name_end - cn->name, cn->name);
6236 }
6237 break;
6238 #endif
6239
6240 case NT_QTFR:
6241 fprintf(f, "<quantifier:%x>{%d,%d}%s\n", (int )node,
6242 NQTFR(node)->lower, NQTFR(node)->upper,
6243 (NQTFR(node)->greedy ? "" : "?"));
6244 print_indent_tree(f, NQTFR(node)->target, indent + add);
6245 break;
6246
6247 case NT_ENCLOSE:
6248 fprintf(f, "<enclose:%x> ", (int )node);
6249 switch (NENCLOSE(node)->type) {
6250 case ENCLOSE_OPTION:
6251 fprintf(f, "option:%d", NENCLOSE(node)->option);
6252 break;
6253 case ENCLOSE_MEMORY:
6254 fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
6255 break;
6256 case ENCLOSE_STOP_BACKTRACK:
6257 fprintf(f, "stop-bt");
6258 break;
6259
6260 default:
6261 break;
6262 }
6263 fprintf(f, "\n");
6264 print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6265 break;
6266
6267 default:
6268 fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6269 break;
6270 }
6271
6272 if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
6273 type != NT_ENCLOSE)
6274 fprintf(f, "\n");
6275 fflush(f);
6276 }
6277 #endif /* ONIG_DEBUG */
6278
6279 #ifdef ONIG_DEBUG_PARSE_TREE
6280 static void
print_tree(FILE * f,Node * node)6281 print_tree(FILE* f, Node* node)
6282 {
6283 print_indent_tree(f, node, 0);
6284 }
6285 #endif
6286