1 /**********************************************************************
2 regcomp.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2017 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regparse.h"
31
32 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
33
34 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)35 onig_get_default_case_fold_flag(void)
36 {
37 return OnigDefaultCaseFoldFlag;
38 }
39
40 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)41 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
42 {
43 OnigDefaultCaseFoldFlag = case_fold_flag;
44 return 0;
45 }
46
47
48 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
49 static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
50 #endif
51
52 static UChar*
str_dup(UChar * s,UChar * end)53 str_dup(UChar* s, UChar* end)
54 {
55 int len = end - s;
56
57 if (len > 0) {
58 UChar* r = (UChar* )xmalloc(len + 1);
59 CHECK_NULL_RETURN(r);
60 xmemcpy(r, s, len);
61 r[len] = (UChar )0;
62 return r;
63 }
64 else return NULL;
65 }
66
67 static void
swap_node(Node * a,Node * b)68 swap_node(Node* a, Node* b)
69 {
70 Node c;
71 c = *a; *a = *b; *b = c;
72
73 if (NTYPE(a) == NT_STR) {
74 StrNode* sn = NSTR(a);
75 if (sn->capa == 0) {
76 int len = sn->end - sn->s;
77 sn->s = sn->buf;
78 sn->end = sn->s + len;
79 }
80 }
81
82 if (NTYPE(b) == NT_STR) {
83 StrNode* sn = NSTR(b);
84 if (sn->capa == 0) {
85 int len = sn->end - sn->s;
86 sn->s = sn->buf;
87 sn->end = sn->s + len;
88 }
89 }
90 }
91
92 static OnigLen
distance_add(OnigLen d1,OnigLen d2)93 distance_add(OnigLen d1, OnigLen d2)
94 {
95 if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
96 return ONIG_INFINITE_DISTANCE;
97 else {
98 if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
99 else return ONIG_INFINITE_DISTANCE;
100 }
101 }
102
103 static OnigLen
distance_multiply(OnigLen d,int m)104 distance_multiply(OnigLen d, int m)
105 {
106 if (m == 0) return 0;
107
108 if (d < ONIG_INFINITE_DISTANCE / m)
109 return d * m;
110 else
111 return ONIG_INFINITE_DISTANCE;
112 }
113
114 static int
bitset_is_empty(BitSetRef bs)115 bitset_is_empty(BitSetRef bs)
116 {
117 int i;
118 for (i = 0; i < (int )BITSET_SIZE; i++) {
119 if (bs[i] != 0) return 0;
120 }
121 return 1;
122 }
123
124 #ifdef ONIG_DEBUG
125 static int
bitset_on_num(BitSetRef bs)126 bitset_on_num(BitSetRef bs)
127 {
128 int i, n;
129
130 n = 0;
131 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
132 if (BITSET_AT(bs, i)) n++;
133 }
134 return n;
135 }
136 #endif
137
138 extern int
onig_bbuf_init(BBuf * buf,int size)139 onig_bbuf_init(BBuf* buf, int size)
140 {
141 if (size <= 0) {
142 size = 0;
143 buf->p = NULL;
144 }
145 else {
146 buf->p = (UChar* )xmalloc(size);
147 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
148 }
149
150 buf->alloc = size;
151 buf->used = 0;
152 return 0;
153 }
154
155
156 #ifdef USE_SUBEXP_CALL
157
158 static int
unset_addr_list_init(UnsetAddrList * uslist,int size)159 unset_addr_list_init(UnsetAddrList* uslist, int size)
160 {
161 UnsetAddr* p;
162
163 p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
164 CHECK_NULL_RETURN_MEMERR(p);
165 uslist->num = 0;
166 uslist->alloc = size;
167 uslist->us = p;
168 return 0;
169 }
170
171 static void
unset_addr_list_end(UnsetAddrList * uslist)172 unset_addr_list_end(UnsetAddrList* uslist)
173 {
174 if (IS_NOT_NULL(uslist->us))
175 xfree(uslist->us);
176 }
177
178 static int
unset_addr_list_add(UnsetAddrList * uslist,int offset,struct _Node * node)179 unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
180 {
181 UnsetAddr* p;
182 int size;
183
184 if (uslist->num >= uslist->alloc) {
185 size = uslist->alloc * 2;
186 p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
187 CHECK_NULL_RETURN_MEMERR(p);
188 uslist->alloc = size;
189 uslist->us = p;
190 }
191
192 uslist->us[uslist->num].offset = offset;
193 uslist->us[uslist->num].target = node;
194 uslist->num++;
195 return 0;
196 }
197 #endif /* USE_SUBEXP_CALL */
198
199
200 static int
add_opcode(regex_t * reg,int opcode)201 add_opcode(regex_t* reg, int opcode)
202 {
203 BBUF_ADD1(reg, opcode);
204 return 0;
205 }
206
207 #ifdef USE_COMBINATION_EXPLOSION_CHECK
208 static int
add_state_check_num(regex_t * reg,int num)209 add_state_check_num(regex_t* reg, int num)
210 {
211 StateCheckNumType n = (StateCheckNumType )num;
212
213 BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
214 return 0;
215 }
216 #endif
217
218 static int
add_rel_addr(regex_t * reg,int addr)219 add_rel_addr(regex_t* reg, int addr)
220 {
221 RelAddrType ra = (RelAddrType )addr;
222
223 BBUF_ADD(reg, &ra, SIZE_RELADDR);
224 return 0;
225 }
226
227 static int
add_abs_addr(regex_t * reg,int addr)228 add_abs_addr(regex_t* reg, int addr)
229 {
230 AbsAddrType ra = (AbsAddrType )addr;
231
232 BBUF_ADD(reg, &ra, SIZE_ABSADDR);
233 return 0;
234 }
235
236 static int
add_length(regex_t * reg,int len)237 add_length(regex_t* reg, int len)
238 {
239 LengthType l = (LengthType )len;
240
241 BBUF_ADD(reg, &l, SIZE_LENGTH);
242 return 0;
243 }
244
245 static int
add_mem_num(regex_t * reg,int num)246 add_mem_num(regex_t* reg, int num)
247 {
248 MemNumType n = (MemNumType )num;
249
250 BBUF_ADD(reg, &n, SIZE_MEMNUM);
251 return 0;
252 }
253
254 static int
add_pointer(regex_t * reg,void * addr)255 add_pointer(regex_t* reg, void* addr)
256 {
257 PointerType ptr = (PointerType )addr;
258
259 BBUF_ADD(reg, &ptr, SIZE_POINTER);
260 return 0;
261 }
262
263 static int
add_option(regex_t * reg,OnigOptionType option)264 add_option(regex_t* reg, OnigOptionType option)
265 {
266 BBUF_ADD(reg, &option, SIZE_OPTION);
267 return 0;
268 }
269
270 static int
add_opcode_rel_addr(regex_t * reg,int opcode,int addr)271 add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
272 {
273 int r;
274
275 r = add_opcode(reg, opcode);
276 if (r) return r;
277 r = add_rel_addr(reg, addr);
278 return r;
279 }
280
281 static int
add_bytes(regex_t * reg,UChar * bytes,int len)282 add_bytes(regex_t* reg, UChar* bytes, int len)
283 {
284 BBUF_ADD(reg, bytes, len);
285 return 0;
286 }
287
288 static int
add_bitset(regex_t * reg,BitSetRef bs)289 add_bitset(regex_t* reg, BitSetRef bs)
290 {
291 BBUF_ADD(reg, bs, SIZE_BITSET);
292 return 0;
293 }
294
295 static int
add_opcode_option(regex_t * reg,int opcode,OnigOptionType option)296 add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
297 {
298 int r;
299
300 r = add_opcode(reg, opcode);
301 if (r) return r;
302 r = add_option(reg, option);
303 return r;
304 }
305
306 static int compile_length_tree(Node* node, regex_t* reg);
307 static int compile_tree(Node* node, regex_t* reg);
308
309
310 #define IS_NEED_STR_LEN_OP_EXACT(op) \
311 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
312 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
313
314 static int
select_str_opcode(int mb_len,int str_len,int ignore_case)315 select_str_opcode(int mb_len, int str_len, int ignore_case)
316 {
317 int op;
318
319 if (ignore_case) {
320 switch (str_len) {
321 case 1: op = OP_EXACT1_IC; break;
322 default: op = OP_EXACTN_IC; break;
323 }
324 }
325 else {
326 switch (mb_len) {
327 case 1:
328 switch (str_len) {
329 case 1: op = OP_EXACT1; break;
330 case 2: op = OP_EXACT2; break;
331 case 3: op = OP_EXACT3; break;
332 case 4: op = OP_EXACT4; break;
333 case 5: op = OP_EXACT5; break;
334 default: op = OP_EXACTN; break;
335 }
336 break;
337
338 case 2:
339 switch (str_len) {
340 case 1: op = OP_EXACTMB2N1; break;
341 case 2: op = OP_EXACTMB2N2; break;
342 case 3: op = OP_EXACTMB2N3; break;
343 default: op = OP_EXACTMB2N; break;
344 }
345 break;
346
347 case 3:
348 op = OP_EXACTMB3N;
349 break;
350
351 default:
352 op = OP_EXACTMBN;
353 break;
354 }
355 }
356 return op;
357 }
358
359 static int
compile_tree_empty_check(Node * node,regex_t * reg,int empty_info)360 compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
361 {
362 int r;
363 int saved_num_null_check = reg->num_null_check;
364
365 if (empty_info != 0) {
366 r = add_opcode(reg, OP_NULL_CHECK_START);
367 if (r) return r;
368 r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
369 if (r) return r;
370 reg->num_null_check++;
371 }
372
373 r = compile_tree(node, reg);
374 if (r) return r;
375
376 if (empty_info != 0) {
377 if (empty_info == NQ_TARGET_IS_EMPTY)
378 r = add_opcode(reg, OP_NULL_CHECK_END);
379 else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
380 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
381 else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
382 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
383
384 if (r) return r;
385 r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
386 }
387 return r;
388 }
389
390 #ifdef USE_SUBEXP_CALL
391 static int
compile_call(CallNode * node,regex_t * reg)392 compile_call(CallNode* node, regex_t* reg)
393 {
394 int r;
395
396 r = add_opcode(reg, OP_CALL);
397 if (r) return r;
398 r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
399 node->target);
400 if (r) return r;
401 r = add_abs_addr(reg, 0 /*dummy addr.*/);
402 return r;
403 }
404 #endif
405
406 static int
compile_tree_n_times(Node * node,int n,regex_t * reg)407 compile_tree_n_times(Node* node, int n, regex_t* reg)
408 {
409 int i, r;
410
411 for (i = 0; i < n; i++) {
412 r = compile_tree(node, reg);
413 if (r) return r;
414 }
415 return 0;
416 }
417
418 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED,int ignore_case)419 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
420 regex_t* reg ARG_UNUSED, int ignore_case)
421 {
422 int len;
423 int op = select_str_opcode(mb_len, str_len, ignore_case);
424
425 len = SIZE_OPCODE;
426
427 if (op == OP_EXACTMBN) len += SIZE_LENGTH;
428 if (IS_NEED_STR_LEN_OP_EXACT(op))
429 len += SIZE_LENGTH;
430
431 len += mb_len * str_len;
432 return len;
433 }
434
435 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg,int ignore_case)436 add_compile_string(UChar* s, int mb_len, int str_len,
437 regex_t* reg, int ignore_case)
438 {
439 int op = select_str_opcode(mb_len, str_len, ignore_case);
440 add_opcode(reg, op);
441
442 if (op == OP_EXACTMBN)
443 add_length(reg, mb_len);
444
445 if (IS_NEED_STR_LEN_OP_EXACT(op)) {
446 if (op == OP_EXACTN_IC)
447 add_length(reg, mb_len * str_len);
448 else
449 add_length(reg, str_len);
450 }
451
452 add_bytes(reg, s, mb_len * str_len);
453 return 0;
454 }
455
456
457 static int
compile_length_string_node(Node * node,regex_t * reg)458 compile_length_string_node(Node* node, regex_t* reg)
459 {
460 int rlen, r, len, prev_len, slen, ambig;
461 OnigEncoding enc = reg->enc;
462 UChar *p, *prev;
463 StrNode* sn;
464
465 sn = NSTR(node);
466 if (sn->end <= sn->s)
467 return 0;
468
469 ambig = NSTRING_IS_AMBIG(node);
470
471 p = prev = sn->s;
472 SAFE_ENC_LEN(enc, p, sn->end, prev_len);
473 p += prev_len;
474 slen = 1;
475 rlen = 0;
476
477 for (; p < sn->end; ) {
478 SAFE_ENC_LEN(enc, p, sn->end, len);
479 if (len == prev_len) {
480 slen++;
481 }
482 else {
483 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
484 rlen += r;
485 prev = p;
486 slen = 1;
487 prev_len = len;
488 }
489 p += len;
490 }
491 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
492 rlen += r;
493 return rlen;
494 }
495
496 static int
compile_length_string_raw_node(StrNode * sn,regex_t * reg)497 compile_length_string_raw_node(StrNode* sn, regex_t* reg)
498 {
499 if (sn->end <= sn->s)
500 return 0;
501
502 return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
503 }
504
505 static int
compile_string_node(Node * node,regex_t * reg)506 compile_string_node(Node* node, regex_t* reg)
507 {
508 int r, len, prev_len, slen, ambig;
509 OnigEncoding enc = reg->enc;
510 UChar *p, *prev, *end;
511 StrNode* sn;
512
513 sn = NSTR(node);
514 if (sn->end <= sn->s)
515 return 0;
516
517 end = sn->end;
518 ambig = NSTRING_IS_AMBIG(node);
519
520 p = prev = sn->s;
521 SAFE_ENC_LEN(enc, p, end, prev_len);
522 p += prev_len;
523 slen = 1;
524
525 for (; p < end; ) {
526 SAFE_ENC_LEN(enc, p, end, len);
527 if (len == prev_len) {
528 slen++;
529 }
530 else {
531 r = add_compile_string(prev, prev_len, slen, reg, ambig);
532 if (r) return r;
533
534 prev = p;
535 slen = 1;
536 prev_len = len;
537 }
538
539 p += len;
540 }
541 return add_compile_string(prev, prev_len, slen, reg, ambig);
542 }
543
544 static int
compile_string_raw_node(StrNode * sn,regex_t * reg)545 compile_string_raw_node(StrNode* sn, regex_t* reg)
546 {
547 if (sn->end <= sn->s)
548 return 0;
549
550 return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
551 }
552
553 static int
add_multi_byte_cclass(BBuf * mbuf,regex_t * reg)554 add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
555 {
556 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
557 add_length(reg, mbuf->used);
558 return add_bytes(reg, mbuf->p, mbuf->used);
559 #else
560 int r, pad_size;
561 UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
562
563 GET_ALIGNMENT_PAD_SIZE(p, pad_size);
564 add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
565 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
566
567 r = add_bytes(reg, mbuf->p, mbuf->used);
568
569 /* padding for return value from compile_length_cclass_node() to be fix. */
570 pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
571 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
572 return r;
573 #endif
574 }
575
576 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)577 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
578 {
579 int len;
580
581 if (IS_NCCLASS_SHARE(cc)) {
582 len = SIZE_OPCODE + SIZE_POINTER;
583 return len;
584 }
585
586 if (IS_NULL(cc->mbuf)) {
587 len = SIZE_OPCODE + SIZE_BITSET;
588 }
589 else {
590 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
591 len = SIZE_OPCODE;
592 }
593 else {
594 len = SIZE_OPCODE + SIZE_BITSET;
595 }
596 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
597 len += SIZE_LENGTH + cc->mbuf->used;
598 #else
599 len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
600 #endif
601 }
602
603 return len;
604 }
605
606 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)607 compile_cclass_node(CClassNode* cc, regex_t* reg)
608 {
609 int r;
610
611 if (IS_NCCLASS_SHARE(cc)) {
612 add_opcode(reg, OP_CCLASS_NODE);
613 r = add_pointer(reg, cc);
614 return r;
615 }
616
617 if (IS_NULL(cc->mbuf)) {
618 if (IS_NCCLASS_NOT(cc))
619 add_opcode(reg, OP_CCLASS_NOT);
620 else
621 add_opcode(reg, OP_CCLASS);
622
623 r = add_bitset(reg, cc->bs);
624 }
625 else {
626 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
627 if (IS_NCCLASS_NOT(cc))
628 add_opcode(reg, OP_CCLASS_MB_NOT);
629 else
630 add_opcode(reg, OP_CCLASS_MB);
631
632 r = add_multi_byte_cclass(cc->mbuf, reg);
633 }
634 else {
635 if (IS_NCCLASS_NOT(cc))
636 add_opcode(reg, OP_CCLASS_MIX_NOT);
637 else
638 add_opcode(reg, OP_CCLASS_MIX);
639
640 r = add_bitset(reg, cc->bs);
641 if (r) return r;
642 r = add_multi_byte_cclass(cc->mbuf, reg);
643 }
644 }
645
646 return r;
647 }
648
649 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper)650 entry_repeat_range(regex_t* reg, int id, int lower, int upper)
651 {
652 #define REPEAT_RANGE_ALLOC 4
653
654 OnigRepeatRange* p;
655
656 if (reg->repeat_range_alloc == 0) {
657 p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
658 CHECK_NULL_RETURN_MEMERR(p);
659 reg->repeat_range = p;
660 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
661 }
662 else if (reg->repeat_range_alloc <= id) {
663 int n;
664 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
665 p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
666 sizeof(OnigRepeatRange) * n);
667 CHECK_NULL_RETURN_MEMERR(p);
668 reg->repeat_range = p;
669 reg->repeat_range_alloc = n;
670 }
671 else {
672 p = reg->repeat_range;
673 }
674
675 p[id].lower = lower;
676 p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
677 return 0;
678 }
679
680 static int
compile_range_repeat_node(QtfrNode * qn,int target_len,int empty_info,regex_t * reg)681 compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
682 regex_t* reg)
683 {
684 int r;
685 int num_repeat = reg->num_repeat;
686
687 r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
688 if (r) return r;
689 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
690 reg->num_repeat++;
691 if (r) return r;
692 r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
693 if (r) return r;
694
695 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
696 if (r) return r;
697
698 r = compile_tree_empty_check(qn->target, reg, empty_info);
699 if (r) return r;
700
701 if (
702 #ifdef USE_SUBEXP_CALL
703 reg->num_call > 0 ||
704 #endif
705 IS_QUANTIFIER_IN_REPEAT(qn)) {
706 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
707 }
708 else {
709 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
710 }
711 if (r) return r;
712 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
713 return r;
714 }
715
716 static int
is_anychar_star_quantifier(QtfrNode * qn)717 is_anychar_star_quantifier(QtfrNode* qn)
718 {
719 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
720 NTYPE(qn->target) == NT_CANY)
721 return 1;
722 else
723 return 0;
724 }
725
726 #define QUANTIFIER_EXPAND_LIMIT_SIZE 50
727 #define CKN_ON (ckn > 0)
728
729 #ifdef USE_COMBINATION_EXPLOSION_CHECK
730
731 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)732 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
733 {
734 int len, mod_tlen, cklen;
735 int ckn;
736 int infinite = IS_REPEAT_INFINITE(qn->upper);
737 int empty_info = qn->target_empty_info;
738 int tlen = compile_length_tree(qn->target, reg);
739
740 if (tlen < 0) return tlen;
741
742 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
743
744 cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
745
746 /* anychar repeat */
747 if (NTYPE(qn->target) == NT_CANY) {
748 if (qn->greedy && infinite) {
749 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
750 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
751 else
752 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
753 }
754 }
755
756 if (empty_info != 0)
757 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
758 else
759 mod_tlen = tlen;
760
761 if (infinite && qn->lower <= 1) {
762 if (qn->greedy) {
763 if (qn->lower == 1)
764 len = SIZE_OP_JUMP;
765 else
766 len = 0;
767
768 len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
769 }
770 else {
771 if (qn->lower == 0)
772 len = SIZE_OP_JUMP;
773 else
774 len = 0;
775
776 len += mod_tlen + SIZE_OP_PUSH + cklen;
777 }
778 }
779 else if (qn->upper == 0) {
780 if (qn->is_refered != 0) /* /(?<n>..){0}/ */
781 len = SIZE_OP_JUMP + tlen;
782 else
783 len = 0;
784 }
785 else if (qn->upper == 1 && qn->greedy) {
786 if (qn->lower == 0) {
787 if (CKN_ON) {
788 len = SIZE_OP_STATE_CHECK_PUSH + tlen;
789 }
790 else {
791 len = SIZE_OP_PUSH + tlen;
792 }
793 }
794 else {
795 len = tlen;
796 }
797 }
798 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
799 len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
800 }
801 else {
802 len = SIZE_OP_REPEAT_INC
803 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
804 if (CKN_ON)
805 len += SIZE_OP_STATE_CHECK;
806 }
807
808 return len;
809 }
810
811 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)812 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
813 {
814 int r, mod_tlen;
815 int ckn;
816 int infinite = IS_REPEAT_INFINITE(qn->upper);
817 int empty_info = qn->target_empty_info;
818 int tlen = compile_length_tree(qn->target, reg);
819
820 if (tlen < 0) return tlen;
821
822 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
823
824 if (is_anychar_star_quantifier(qn)) {
825 r = compile_tree_n_times(qn->target, qn->lower, reg);
826 if (r) return r;
827 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
828 if (IS_MULTILINE(reg->options))
829 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
830 else
831 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
832 if (r) return r;
833 if (CKN_ON) {
834 r = add_state_check_num(reg, ckn);
835 if (r) return r;
836 }
837
838 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
839 }
840 else {
841 if (IS_MULTILINE(reg->options)) {
842 r = add_opcode(reg, (CKN_ON ?
843 OP_STATE_CHECK_ANYCHAR_ML_STAR
844 : OP_ANYCHAR_ML_STAR));
845 }
846 else {
847 r = add_opcode(reg, (CKN_ON ?
848 OP_STATE_CHECK_ANYCHAR_STAR
849 : OP_ANYCHAR_STAR));
850 }
851 if (r) return r;
852 if (CKN_ON)
853 r = add_state_check_num(reg, ckn);
854
855 return r;
856 }
857 }
858
859 if (empty_info != 0)
860 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
861 else
862 mod_tlen = tlen;
863
864 if (infinite && qn->lower <= 1) {
865 if (qn->greedy) {
866 if (qn->lower == 1) {
867 r = add_opcode_rel_addr(reg, OP_JUMP,
868 (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
869 if (r) return r;
870 }
871
872 if (CKN_ON) {
873 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
874 if (r) return r;
875 r = add_state_check_num(reg, ckn);
876 if (r) return r;
877 r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
878 }
879 else {
880 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
881 }
882 if (r) return r;
883 r = compile_tree_empty_check(qn->target, reg, empty_info);
884 if (r) return r;
885 r = add_opcode_rel_addr(reg, OP_JUMP,
886 -(mod_tlen + (int )SIZE_OP_JUMP
887 + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
888 }
889 else {
890 if (qn->lower == 0) {
891 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
892 if (r) return r;
893 }
894 r = compile_tree_empty_check(qn->target, reg, empty_info);
895 if (r) return r;
896 if (CKN_ON) {
897 r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
898 if (r) return r;
899 r = add_state_check_num(reg, ckn);
900 if (r) return r;
901 r = add_rel_addr(reg,
902 -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
903 }
904 else
905 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
906 }
907 }
908 else if (qn->upper == 0) {
909 if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
910 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
911 if (r) return r;
912 r = compile_tree(qn->target, reg);
913 }
914 else
915 r = 0;
916 }
917 else if (qn->upper == 1 && qn->greedy) {
918 if (qn->lower == 0) {
919 if (CKN_ON) {
920 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
921 if (r) return r;
922 r = add_state_check_num(reg, ckn);
923 if (r) return r;
924 r = add_rel_addr(reg, tlen);
925 }
926 else {
927 r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
928 }
929 if (r) return r;
930 }
931
932 r = compile_tree(qn->target, reg);
933 }
934 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
935 if (CKN_ON) {
936 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
937 if (r) return r;
938 r = add_state_check_num(reg, ckn);
939 if (r) return r;
940 r = add_rel_addr(reg, SIZE_OP_JUMP);
941 }
942 else {
943 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
944 }
945
946 if (r) return r;
947 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
948 if (r) return r;
949 r = compile_tree(qn->target, reg);
950 }
951 else {
952 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
953 if (CKN_ON) {
954 if (r) return r;
955 r = add_opcode(reg, OP_STATE_CHECK);
956 if (r) return r;
957 r = add_state_check_num(reg, ckn);
958 }
959 }
960 return r;
961 }
962
963 #else /* USE_COMBINATION_EXPLOSION_CHECK */
964
965 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)966 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
967 {
968 int len, mod_tlen;
969 int infinite = IS_REPEAT_INFINITE(qn->upper);
970 int empty_info = qn->target_empty_info;
971 int tlen = compile_length_tree(qn->target, reg);
972
973 if (tlen < 0) return tlen;
974
975 /* anychar repeat */
976 if (NTYPE(qn->target) == NT_CANY) {
977 if (qn->greedy && infinite) {
978 if (IS_NOT_NULL(qn->next_head_exact))
979 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
980 else
981 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
982 }
983 }
984
985 if (empty_info != 0)
986 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
987 else
988 mod_tlen = tlen;
989
990 if (infinite &&
991 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
992 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
993 len = SIZE_OP_JUMP;
994 }
995 else {
996 len = tlen * qn->lower;
997 }
998
999 if (qn->greedy) {
1000 if (IS_NOT_NULL(qn->head_exact))
1001 len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1002 else if (IS_NOT_NULL(qn->next_head_exact))
1003 len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1004 else
1005 len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1006 }
1007 else
1008 len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1009 }
1010 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1011 len = SIZE_OP_JUMP + tlen;
1012 }
1013 else if (!infinite && qn->greedy &&
1014 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1015 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1016 len = tlen * qn->lower;
1017 len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1018 }
1019 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1020 len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1021 }
1022 else {
1023 len = SIZE_OP_REPEAT_INC
1024 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1025 }
1026
1027 return len;
1028 }
1029
1030 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)1031 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
1032 {
1033 int i, r, mod_tlen;
1034 int infinite = IS_REPEAT_INFINITE(qn->upper);
1035 int empty_info = qn->target_empty_info;
1036 int tlen = compile_length_tree(qn->target, reg);
1037
1038 if (tlen < 0) return tlen;
1039
1040 if (is_anychar_star_quantifier(qn)) {
1041 r = compile_tree_n_times(qn->target, qn->lower, reg);
1042 if (r) return r;
1043 if (IS_NOT_NULL(qn->next_head_exact)) {
1044 if (IS_MULTILINE(reg->options))
1045 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1046 else
1047 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1048 if (r) return r;
1049 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1050 }
1051 else {
1052 if (IS_MULTILINE(reg->options))
1053 return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1054 else
1055 return add_opcode(reg, OP_ANYCHAR_STAR);
1056 }
1057 }
1058
1059 if (empty_info != 0)
1060 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1061 else
1062 mod_tlen = tlen;
1063
1064 if (infinite &&
1065 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1066 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1067 if (qn->greedy) {
1068 if (IS_NOT_NULL(qn->head_exact))
1069 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1070 else if (IS_NOT_NULL(qn->next_head_exact))
1071 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1072 else
1073 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1074 }
1075 else {
1076 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1077 }
1078 if (r) return r;
1079 }
1080 else {
1081 r = compile_tree_n_times(qn->target, qn->lower, reg);
1082 if (r) return r;
1083 }
1084
1085 if (qn->greedy) {
1086 if (IS_NOT_NULL(qn->head_exact)) {
1087 r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1088 mod_tlen + SIZE_OP_JUMP);
1089 if (r) return r;
1090 add_bytes(reg, NSTR(qn->head_exact)->s, 1);
1091 r = compile_tree_empty_check(qn->target, reg, empty_info);
1092 if (r) return r;
1093 r = add_opcode_rel_addr(reg, OP_JUMP,
1094 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1095 }
1096 else if (IS_NOT_NULL(qn->next_head_exact)) {
1097 r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1098 mod_tlen + SIZE_OP_JUMP);
1099 if (r) return r;
1100 add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1101 r = compile_tree_empty_check(qn->target, reg, empty_info);
1102 if (r) return r;
1103 r = add_opcode_rel_addr(reg, OP_JUMP,
1104 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1105 }
1106 else {
1107 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1108 if (r) return r;
1109 r = compile_tree_empty_check(qn->target, reg, empty_info);
1110 if (r) return r;
1111 r = add_opcode_rel_addr(reg, OP_JUMP,
1112 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1113 }
1114 }
1115 else {
1116 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1117 if (r) return r;
1118 r = compile_tree_empty_check(qn->target, reg, empty_info);
1119 if (r) return r;
1120 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1121 }
1122 }
1123 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1124 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1125 if (r) return r;
1126 r = compile_tree(qn->target, reg);
1127 }
1128 else if (!infinite && qn->greedy &&
1129 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1130 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1131 int n = qn->upper - qn->lower;
1132
1133 r = compile_tree_n_times(qn->target, qn->lower, reg);
1134 if (r) return r;
1135
1136 for (i = 0; i < n; i++) {
1137 r = add_opcode_rel_addr(reg, OP_PUSH,
1138 (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1139 if (r) return r;
1140 r = compile_tree(qn->target, reg);
1141 if (r) return r;
1142 }
1143 }
1144 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1145 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1146 if (r) return r;
1147 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1148 if (r) return r;
1149 r = compile_tree(qn->target, reg);
1150 }
1151 else {
1152 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1153 }
1154 return r;
1155 }
1156 #endif /* USE_COMBINATION_EXPLOSION_CHECK */
1157
1158 static int
compile_length_option_node(EncloseNode * node,regex_t * reg)1159 compile_length_option_node(EncloseNode* node, regex_t* reg)
1160 {
1161 int tlen;
1162 OnigOptionType prev = reg->options;
1163
1164 reg->options = node->option;
1165 tlen = compile_length_tree(node->target, reg);
1166 reg->options = prev;
1167
1168 if (tlen < 0) return tlen;
1169
1170 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1171 return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1172 + tlen + SIZE_OP_SET_OPTION;
1173 }
1174 else
1175 return tlen;
1176 }
1177
1178 static int
compile_option_node(EncloseNode * node,regex_t * reg)1179 compile_option_node(EncloseNode* node, regex_t* reg)
1180 {
1181 int r;
1182 OnigOptionType prev = reg->options;
1183
1184 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1185 r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1186 if (r) return r;
1187 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1188 if (r) return r;
1189 r = add_opcode(reg, OP_FAIL);
1190 if (r) return r;
1191 }
1192
1193 reg->options = node->option;
1194 r = compile_tree(node->target, reg);
1195 reg->options = prev;
1196
1197 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1198 if (r) return r;
1199 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1200 }
1201 return r;
1202 }
1203
1204 static int
compile_length_enclose_node(EncloseNode * node,regex_t * reg)1205 compile_length_enclose_node(EncloseNode* node, regex_t* reg)
1206 {
1207 int len;
1208 int tlen;
1209
1210 if (node->type == ENCLOSE_OPTION)
1211 return compile_length_option_node(node, reg);
1212
1213 if (node->target) {
1214 tlen = compile_length_tree(node->target, reg);
1215 if (tlen < 0) return tlen;
1216 }
1217 else
1218 tlen = 0;
1219
1220 switch (node->type) {
1221 case ENCLOSE_MEMORY:
1222 #ifdef USE_SUBEXP_CALL
1223 if (IS_ENCLOSE_CALLED(node)) {
1224 len = SIZE_OP_MEMORY_START_PUSH + tlen
1225 + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1226 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1227 len += (IS_ENCLOSE_RECURSION(node)
1228 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1229 else
1230 len += (IS_ENCLOSE_RECURSION(node)
1231 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1232 }
1233 else if (IS_ENCLOSE_RECURSION(node)) {
1234 len = SIZE_OP_MEMORY_START_PUSH;
1235 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1236 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC);
1237 }
1238 else
1239 #endif
1240 {
1241 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1242 len = SIZE_OP_MEMORY_START_PUSH;
1243 else
1244 len = SIZE_OP_MEMORY_START;
1245
1246 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1247 ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1248 }
1249 break;
1250
1251 case ENCLOSE_STOP_BACKTRACK:
1252 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1253 QtfrNode* qn = NQTFR(node->target);
1254 tlen = compile_length_tree(qn->target, reg);
1255 if (tlen < 0) return tlen;
1256
1257 len = tlen * qn->lower
1258 + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1259 }
1260 else {
1261 len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1262 }
1263 break;
1264
1265 default:
1266 return ONIGERR_TYPE_BUG;
1267 break;
1268 }
1269
1270 return len;
1271 }
1272
1273 static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1274
1275 static int
compile_enclose_node(EncloseNode * node,regex_t * reg)1276 compile_enclose_node(EncloseNode* node, regex_t* reg)
1277 {
1278 int r, len;
1279
1280 if (node->type == ENCLOSE_OPTION)
1281 return compile_option_node(node, reg);
1282
1283 switch (node->type) {
1284 case ENCLOSE_MEMORY:
1285 #ifdef USE_SUBEXP_CALL
1286 if (IS_ENCLOSE_CALLED(node)) {
1287 r = add_opcode(reg, OP_CALL);
1288 if (r) return r;
1289 node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1290 node->state |= NST_ADDR_FIXED;
1291 r = add_abs_addr(reg, (int )node->call_addr);
1292 if (r) return r;
1293 len = compile_length_tree(node->target, reg);
1294 len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1295 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1296 len += (IS_ENCLOSE_RECURSION(node)
1297 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1298 else
1299 len += (IS_ENCLOSE_RECURSION(node)
1300 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1301
1302 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1303 if (r) return r;
1304 }
1305 #endif
1306 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1307 r = add_opcode(reg, OP_MEMORY_START_PUSH);
1308 else
1309 r = add_opcode(reg, OP_MEMORY_START);
1310 if (r) return r;
1311 r = add_mem_num(reg, node->regnum);
1312 if (r) return r;
1313 r = compile_tree(node->target, reg);
1314 if (r) return r;
1315 #ifdef USE_SUBEXP_CALL
1316 if (IS_ENCLOSE_CALLED(node)) {
1317 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1318 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1319 ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1320 else
1321 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1322 ? OP_MEMORY_END_REC : OP_MEMORY_END));
1323
1324 if (r) return r;
1325 r = add_mem_num(reg, node->regnum);
1326 if (r) return r;
1327 r = add_opcode(reg, OP_RETURN);
1328 }
1329 else if (IS_ENCLOSE_RECURSION(node)) {
1330 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1331 r = add_opcode(reg, OP_MEMORY_END_PUSH_REC);
1332 else
1333 r = add_opcode(reg, OP_MEMORY_END_REC);
1334 if (r) return r;
1335 r = add_mem_num(reg, node->regnum);
1336 }
1337 else
1338 #endif
1339 {
1340 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1341 r = add_opcode(reg, OP_MEMORY_END_PUSH);
1342 else
1343 r = add_opcode(reg, OP_MEMORY_END);
1344 if (r) return r;
1345 r = add_mem_num(reg, node->regnum);
1346 }
1347 break;
1348
1349 case ENCLOSE_STOP_BACKTRACK:
1350 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1351 QtfrNode* qn = NQTFR(node->target);
1352 r = compile_tree_n_times(qn->target, qn->lower, reg);
1353 if (r) return r;
1354
1355 len = compile_length_tree(qn->target, reg);
1356 if (len < 0) return len;
1357
1358 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1359 if (r) return r;
1360 r = compile_tree(qn->target, reg);
1361 if (r) return r;
1362 r = add_opcode(reg, OP_POP);
1363 if (r) return r;
1364 r = add_opcode_rel_addr(reg, OP_JUMP,
1365 -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1366 }
1367 else {
1368 r = add_opcode(reg, OP_PUSH_STOP_BT);
1369 if (r) return r;
1370 r = compile_tree(node->target, reg);
1371 if (r) return r;
1372 r = add_opcode(reg, OP_POP_STOP_BT);
1373 }
1374 break;
1375
1376 default:
1377 return ONIGERR_TYPE_BUG;
1378 break;
1379 }
1380
1381 return r;
1382 }
1383
1384 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1385 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1386 {
1387 int len;
1388 int tlen = 0;
1389
1390 if (node->target) {
1391 tlen = compile_length_tree(node->target, reg);
1392 if (tlen < 0) return tlen;
1393 }
1394
1395 switch (node->type) {
1396 case ANCHOR_PREC_READ:
1397 len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1398 break;
1399 case ANCHOR_PREC_READ_NOT:
1400 len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1401 break;
1402 case ANCHOR_LOOK_BEHIND:
1403 len = SIZE_OP_LOOK_BEHIND + tlen;
1404 break;
1405 case ANCHOR_LOOK_BEHIND_NOT:
1406 len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1407 break;
1408
1409 default:
1410 len = SIZE_OPCODE;
1411 break;
1412 }
1413
1414 return len;
1415 }
1416
1417 static int
compile_anchor_node(AnchorNode * node,regex_t * reg)1418 compile_anchor_node(AnchorNode* node, regex_t* reg)
1419 {
1420 int r, len;
1421
1422 switch (node->type) {
1423 case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
1424 case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
1425 case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
1426 case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
1427 case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
1428 case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1429
1430 case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break;
1431 case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break;
1432 #ifdef USE_WORD_BEGIN_END
1433 case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break;
1434 case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break;
1435 #endif
1436
1437 case ANCHOR_PREC_READ:
1438 r = add_opcode(reg, OP_PUSH_POS);
1439 if (r) return r;
1440 r = compile_tree(node->target, reg);
1441 if (r) return r;
1442 r = add_opcode(reg, OP_POP_POS);
1443 break;
1444
1445 case ANCHOR_PREC_READ_NOT:
1446 len = compile_length_tree(node->target, reg);
1447 if (len < 0) return len;
1448 r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1449 if (r) return r;
1450 r = compile_tree(node->target, reg);
1451 if (r) return r;
1452 r = add_opcode(reg, OP_FAIL_POS);
1453 break;
1454
1455 case ANCHOR_LOOK_BEHIND:
1456 {
1457 int n;
1458 r = add_opcode(reg, OP_LOOK_BEHIND);
1459 if (r) return r;
1460 if (node->char_len < 0) {
1461 r = get_char_length_tree(node->target, reg, &n);
1462 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1463 }
1464 else
1465 n = node->char_len;
1466
1467 r = add_length(reg, n);
1468 if (r) return r;
1469 r = compile_tree(node->target, reg);
1470 }
1471 break;
1472
1473 case ANCHOR_LOOK_BEHIND_NOT:
1474 {
1475 int n;
1476 len = compile_length_tree(node->target, reg);
1477 r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1478 len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1479 if (r) return r;
1480 if (node->char_len < 0) {
1481 r = get_char_length_tree(node->target, reg, &n);
1482 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1483 }
1484 else
1485 n = node->char_len;
1486 r = add_length(reg, n);
1487 if (r) return r;
1488 r = compile_tree(node->target, reg);
1489 if (r) return r;
1490 r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1491 }
1492 break;
1493
1494 default:
1495 return ONIGERR_TYPE_BUG;
1496 break;
1497 }
1498
1499 return r;
1500 }
1501
1502 static int
compile_length_tree(Node * node,regex_t * reg)1503 compile_length_tree(Node* node, regex_t* reg)
1504 {
1505 int len, type, r;
1506
1507 type = NTYPE(node);
1508 switch (type) {
1509 case NT_LIST:
1510 len = 0;
1511 do {
1512 r = compile_length_tree(NCAR(node), reg);
1513 if (r < 0) return r;
1514 len += r;
1515 } while (IS_NOT_NULL(node = NCDR(node)));
1516 r = len;
1517 break;
1518
1519 case NT_ALT:
1520 {
1521 int n;
1522
1523 n = r = 0;
1524 do {
1525 r += compile_length_tree(NCAR(node), reg);
1526 n++;
1527 } while (IS_NOT_NULL(node = NCDR(node)));
1528 r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1529 }
1530 break;
1531
1532 case NT_STR:
1533 if (NSTRING_IS_RAW(node))
1534 r = compile_length_string_raw_node(NSTR(node), reg);
1535 else
1536 r = compile_length_string_node(node, reg);
1537 break;
1538
1539 case NT_CCLASS:
1540 r = compile_length_cclass_node(NCCLASS(node), reg);
1541 break;
1542
1543 case NT_CTYPE:
1544 case NT_CANY:
1545 r = SIZE_OPCODE;
1546 break;
1547
1548 case NT_BREF:
1549 {
1550 BRefNode* br = NBREF(node);
1551
1552 #ifdef USE_BACKREF_WITH_LEVEL
1553 if (IS_BACKREF_NEST_LEVEL(br)) {
1554 r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1555 SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1556 }
1557 else
1558 #endif
1559 if (br->back_num == 1) {
1560 r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1561 ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1562 }
1563 else {
1564 r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1565 }
1566 }
1567 break;
1568
1569 #ifdef USE_SUBEXP_CALL
1570 case NT_CALL:
1571 r = SIZE_OP_CALL;
1572 break;
1573 #endif
1574
1575 case NT_QTFR:
1576 r = compile_length_quantifier_node(NQTFR(node), reg);
1577 break;
1578
1579 case NT_ENCLOSE:
1580 r = compile_length_enclose_node(NENCLOSE(node), reg);
1581 break;
1582
1583 case NT_ANCHOR:
1584 r = compile_length_anchor_node(NANCHOR(node), reg);
1585 break;
1586
1587 default:
1588 return ONIGERR_TYPE_BUG;
1589 break;
1590 }
1591
1592 return r;
1593 }
1594
1595 static int
compile_tree(Node * node,regex_t * reg)1596 compile_tree(Node* node, regex_t* reg)
1597 {
1598 int n, type, len, pos, r = 0;
1599
1600 type = NTYPE(node);
1601 switch (type) {
1602 case NT_LIST:
1603 do {
1604 r = compile_tree(NCAR(node), reg);
1605 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1606 break;
1607
1608 case NT_ALT:
1609 {
1610 Node* x = node;
1611 len = 0;
1612 do {
1613 len += compile_length_tree(NCAR(x), reg);
1614 if (NCDR(x) != NULL) {
1615 len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1616 }
1617 } while (IS_NOT_NULL(x = NCDR(x)));
1618 pos = reg->used + len; /* goal position */
1619
1620 do {
1621 len = compile_length_tree(NCAR(node), reg);
1622 if (IS_NOT_NULL(NCDR(node))) {
1623 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1624 if (r) break;
1625 }
1626 r = compile_tree(NCAR(node), reg);
1627 if (r) break;
1628 if (IS_NOT_NULL(NCDR(node))) {
1629 len = pos - (reg->used + SIZE_OP_JUMP);
1630 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1631 if (r) break;
1632 }
1633 } while (IS_NOT_NULL(node = NCDR(node)));
1634 }
1635 break;
1636
1637 case NT_STR:
1638 if (NSTRING_IS_RAW(node))
1639 r = compile_string_raw_node(NSTR(node), reg);
1640 else
1641 r = compile_string_node(node, reg);
1642 break;
1643
1644 case NT_CCLASS:
1645 r = compile_cclass_node(NCCLASS(node), reg);
1646 break;
1647
1648 case NT_CTYPE:
1649 {
1650 int op;
1651
1652 switch (NCTYPE(node)->ctype) {
1653 case ONIGENC_CTYPE_WORD:
1654 if (NCTYPE(node)->not != 0) op = OP_NOT_WORD;
1655 else op = OP_WORD;
1656 break;
1657 default:
1658 return ONIGERR_TYPE_BUG;
1659 break;
1660 }
1661 r = add_opcode(reg, op);
1662 }
1663 break;
1664
1665 case NT_CANY:
1666 if (IS_MULTILINE(reg->options))
1667 r = add_opcode(reg, OP_ANYCHAR_ML);
1668 else
1669 r = add_opcode(reg, OP_ANYCHAR);
1670 break;
1671
1672 case NT_BREF:
1673 {
1674 BRefNode* br = NBREF(node);
1675
1676 #ifdef USE_BACKREF_WITH_LEVEL
1677 if (IS_BACKREF_NEST_LEVEL(br)) {
1678 r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
1679 if (r) return r;
1680 r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1681 if (r) return r;
1682 r = add_length(reg, br->nest_level);
1683 if (r) return r;
1684
1685 goto add_bacref_mems;
1686 }
1687 else
1688 #endif
1689 if (br->back_num == 1) {
1690 n = br->back_static[0];
1691 if (IS_IGNORECASE(reg->options)) {
1692 r = add_opcode(reg, OP_BACKREFN_IC);
1693 if (r) return r;
1694 r = add_mem_num(reg, n);
1695 }
1696 else {
1697 switch (n) {
1698 case 1: r = add_opcode(reg, OP_BACKREF1); break;
1699 case 2: r = add_opcode(reg, OP_BACKREF2); break;
1700 default:
1701 r = add_opcode(reg, OP_BACKREFN);
1702 if (r) return r;
1703 r = add_mem_num(reg, n);
1704 break;
1705 }
1706 }
1707 }
1708 else {
1709 int i;
1710 int* p;
1711
1712 if (IS_IGNORECASE(reg->options)) {
1713 r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1714 }
1715 else {
1716 r = add_opcode(reg, OP_BACKREF_MULTI);
1717 }
1718 if (r) return r;
1719
1720 #ifdef USE_BACKREF_WITH_LEVEL
1721 add_bacref_mems:
1722 #endif
1723 r = add_length(reg, br->back_num);
1724 if (r) return r;
1725 p = BACKREFS_P(br);
1726 for (i = br->back_num - 1; i >= 0; i--) {
1727 r = add_mem_num(reg, p[i]);
1728 if (r) return r;
1729 }
1730 }
1731 }
1732 break;
1733
1734 #ifdef USE_SUBEXP_CALL
1735 case NT_CALL:
1736 r = compile_call(NCALL(node), reg);
1737 break;
1738 #endif
1739
1740 case NT_QTFR:
1741 r = compile_quantifier_node(NQTFR(node), reg);
1742 break;
1743
1744 case NT_ENCLOSE:
1745 r = compile_enclose_node(NENCLOSE(node), reg);
1746 break;
1747
1748 case NT_ANCHOR:
1749 r = compile_anchor_node(NANCHOR(node), reg);
1750 break;
1751
1752 default:
1753 #ifdef ONIG_DEBUG
1754 fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1755 #endif
1756 break;
1757 }
1758
1759 return r;
1760 }
1761
1762 #ifdef USE_NAMED_GROUP
1763
1764 static int
noname_disable_map(Node ** plink,GroupNumRemap * map,int * counter)1765 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1766 {
1767 int r = 0;
1768 Node* node = *plink;
1769
1770 switch (NTYPE(node)) {
1771 case NT_LIST:
1772 case NT_ALT:
1773 do {
1774 r = noname_disable_map(&(NCAR(node)), map, counter);
1775 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1776 break;
1777
1778 case NT_QTFR:
1779 {
1780 Node** ptarget = &(NQTFR(node)->target);
1781 Node* old = *ptarget;
1782 r = noname_disable_map(ptarget, map, counter);
1783 if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
1784 onig_reduce_nested_quantifier(node, *ptarget);
1785 }
1786 }
1787 break;
1788
1789 case NT_ENCLOSE:
1790 {
1791 EncloseNode* en = NENCLOSE(node);
1792 if (en->type == ENCLOSE_MEMORY) {
1793 if (IS_ENCLOSE_NAMED_GROUP(en)) {
1794 (*counter)++;
1795 map[en->regnum].new_val = *counter;
1796 en->regnum = *counter;
1797 r = noname_disable_map(&(en->target), map, counter);
1798 }
1799 else {
1800 *plink = en->target;
1801 en->target = NULL_NODE;
1802 onig_node_free(node);
1803 r = noname_disable_map(plink, map, counter);
1804 }
1805 }
1806 else
1807 r = noname_disable_map(&(en->target), map, counter);
1808 }
1809 break;
1810
1811 case NT_ANCHOR:
1812 if (NANCHOR(node)->target)
1813 r = noname_disable_map(&(NANCHOR(node)->target), map, counter);
1814 break;
1815
1816 default:
1817 break;
1818 }
1819
1820 return r;
1821 }
1822
1823 static int
renumber_node_backref(Node * node,GroupNumRemap * map)1824 renumber_node_backref(Node* node, GroupNumRemap* map)
1825 {
1826 int i, pos, n, old_num;
1827 int *backs;
1828 BRefNode* bn = NBREF(node);
1829
1830 if (! IS_BACKREF_NAME_REF(bn))
1831 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1832
1833 old_num = bn->back_num;
1834 if (IS_NULL(bn->back_dynamic))
1835 backs = bn->back_static;
1836 else
1837 backs = bn->back_dynamic;
1838
1839 for (i = 0, pos = 0; i < old_num; i++) {
1840 n = map[backs[i]].new_val;
1841 if (n > 0) {
1842 backs[pos] = n;
1843 pos++;
1844 }
1845 }
1846
1847 bn->back_num = pos;
1848 return 0;
1849 }
1850
1851 static int
renumber_by_map(Node * node,GroupNumRemap * map)1852 renumber_by_map(Node* node, GroupNumRemap* map)
1853 {
1854 int r = 0;
1855
1856 switch (NTYPE(node)) {
1857 case NT_LIST:
1858 case NT_ALT:
1859 do {
1860 r = renumber_by_map(NCAR(node), map);
1861 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1862 break;
1863 case NT_QTFR:
1864 r = renumber_by_map(NQTFR(node)->target, map);
1865 break;
1866 case NT_ENCLOSE:
1867 r = renumber_by_map(NENCLOSE(node)->target, map);
1868 break;
1869
1870 case NT_BREF:
1871 r = renumber_node_backref(node, map);
1872 break;
1873
1874 case NT_ANCHOR:
1875 if (NANCHOR(node)->target)
1876 r = renumber_by_map(NANCHOR(node)->target, map);
1877 break;
1878
1879 default:
1880 break;
1881 }
1882
1883 return r;
1884 }
1885
1886 static int
numbered_ref_check(Node * node)1887 numbered_ref_check(Node* node)
1888 {
1889 int r = 0;
1890
1891 switch (NTYPE(node)) {
1892 case NT_LIST:
1893 case NT_ALT:
1894 do {
1895 r = numbered_ref_check(NCAR(node));
1896 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1897 break;
1898 case NT_QTFR:
1899 r = numbered_ref_check(NQTFR(node)->target);
1900 break;
1901 case NT_ENCLOSE:
1902 r = numbered_ref_check(NENCLOSE(node)->target);
1903 break;
1904
1905 case NT_BREF:
1906 if (! IS_BACKREF_NAME_REF(NBREF(node)))
1907 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1908 break;
1909
1910 case NT_ANCHOR:
1911 if (NANCHOR(node)->target)
1912 r = numbered_ref_check(NANCHOR(node)->target);
1913 break;
1914
1915 default:
1916 break;
1917 }
1918
1919 return r;
1920 }
1921
1922 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ScanEnv * env)1923 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
1924 {
1925 int r, i, pos, counter;
1926 BitStatusType loc;
1927 GroupNumRemap* map;
1928
1929 map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
1930 CHECK_NULL_RETURN_MEMERR(map);
1931 for (i = 1; i <= env->num_mem; i++) {
1932 map[i].new_val = 0;
1933 }
1934 counter = 0;
1935 r = noname_disable_map(root, map, &counter);
1936 if (r != 0) return r;
1937
1938 r = renumber_by_map(*root, map);
1939 if (r != 0) return r;
1940
1941 for (i = 1, pos = 1; i <= env->num_mem; i++) {
1942 if (map[i].new_val > 0) {
1943 SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
1944 pos++;
1945 }
1946 }
1947
1948 loc = env->capture_history;
1949 BIT_STATUS_CLEAR(env->capture_history);
1950 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
1951 if (BIT_STATUS_AT(loc, i)) {
1952 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
1953 }
1954 }
1955
1956 env->num_mem = env->num_named;
1957 reg->num_mem = env->num_named;
1958
1959 return onig_renumber_name_table(reg, map);
1960 }
1961 #endif /* USE_NAMED_GROUP */
1962
1963 #ifdef USE_SUBEXP_CALL
1964 static int
unset_addr_list_fix(UnsetAddrList * uslist,regex_t * reg)1965 unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
1966 {
1967 int i, offset;
1968 EncloseNode* en;
1969 AbsAddrType addr;
1970
1971 for (i = 0; i < uslist->num; i++) {
1972 en = NENCLOSE(uslist->us[i].target);
1973 if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
1974 addr = en->call_addr;
1975 offset = uslist->us[i].offset;
1976
1977 BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
1978 }
1979 return 0;
1980 }
1981 #endif
1982
1983 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
1984 static int
quantifiers_memory_node_info(Node * node)1985 quantifiers_memory_node_info(Node* node)
1986 {
1987 int r = 0;
1988
1989 switch (NTYPE(node)) {
1990 case NT_LIST:
1991 case NT_ALT:
1992 {
1993 int v;
1994 do {
1995 v = quantifiers_memory_node_info(NCAR(node));
1996 if (v > r) r = v;
1997 } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
1998 }
1999 break;
2000
2001 #ifdef USE_SUBEXP_CALL
2002 case NT_CALL:
2003 if (IS_CALL_RECURSION(NCALL(node))) {
2004 return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
2005 }
2006 else
2007 r = quantifiers_memory_node_info(NCALL(node)->target);
2008 break;
2009 #endif
2010
2011 case NT_QTFR:
2012 {
2013 QtfrNode* qn = NQTFR(node);
2014 if (qn->upper != 0) {
2015 r = quantifiers_memory_node_info(qn->target);
2016 }
2017 }
2018 break;
2019
2020 case NT_ENCLOSE:
2021 {
2022 EncloseNode* en = NENCLOSE(node);
2023 switch (en->type) {
2024 case ENCLOSE_MEMORY:
2025 return NQ_TARGET_IS_EMPTY_MEM;
2026 break;
2027
2028 case ENCLOSE_OPTION:
2029 case ENCLOSE_STOP_BACKTRACK:
2030 r = quantifiers_memory_node_info(en->target);
2031 break;
2032 default:
2033 break;
2034 }
2035 }
2036 break;
2037
2038 case NT_BREF:
2039 case NT_STR:
2040 case NT_CTYPE:
2041 case NT_CCLASS:
2042 case NT_CANY:
2043 case NT_ANCHOR:
2044 default:
2045 break;
2046 }
2047
2048 return r;
2049 }
2050 #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
2051
2052
2053 #define GET_CHAR_LEN_VARLEN -1
2054 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2055
2056 /* fixed size pattern node only */
2057 static int
get_char_length_tree1(Node * node,regex_t * reg,int * len,int level)2058 get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2059 {
2060 int tlen;
2061 int r = 0;
2062
2063 level++;
2064 *len = 0;
2065 switch (NTYPE(node)) {
2066 case NT_LIST:
2067 do {
2068 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2069 if (r == 0)
2070 *len = distance_add(*len, tlen);
2071 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2072 break;
2073
2074 case NT_ALT:
2075 {
2076 int tlen2;
2077 int varlen = 0;
2078
2079 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2080 while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
2081 r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
2082 if (r == 0) {
2083 if (tlen != tlen2)
2084 varlen = 1;
2085 }
2086 }
2087 if (r == 0) {
2088 if (varlen != 0) {
2089 if (level == 1)
2090 r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2091 else
2092 r = GET_CHAR_LEN_VARLEN;
2093 }
2094 else
2095 *len = tlen;
2096 }
2097 }
2098 break;
2099
2100 case NT_STR:
2101 {
2102 StrNode* sn = NSTR(node);
2103 UChar *s = sn->s;
2104 while (s < sn->end) {
2105 s += enclen(reg->enc, s);
2106 (*len)++;
2107 }
2108 }
2109 break;
2110
2111 case NT_QTFR:
2112 {
2113 QtfrNode* qn = NQTFR(node);
2114 if (qn->lower == qn->upper) {
2115 r = get_char_length_tree1(qn->target, reg, &tlen, level);
2116 if (r == 0)
2117 *len = distance_multiply(tlen, qn->lower);
2118 }
2119 else
2120 r = GET_CHAR_LEN_VARLEN;
2121 }
2122 break;
2123
2124 #ifdef USE_SUBEXP_CALL
2125 case NT_CALL:
2126 if (! IS_CALL_RECURSION(NCALL(node)))
2127 r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
2128 else
2129 r = GET_CHAR_LEN_VARLEN;
2130 break;
2131 #endif
2132
2133 case NT_CTYPE:
2134 *len = 1;
2135 break;
2136
2137 case NT_CCLASS:
2138 case NT_CANY:
2139 *len = 1;
2140 break;
2141
2142 case NT_ENCLOSE:
2143 {
2144 EncloseNode* en = NENCLOSE(node);
2145 switch (en->type) {
2146 case ENCLOSE_MEMORY:
2147 #ifdef USE_SUBEXP_CALL
2148 if (IS_ENCLOSE_CLEN_FIXED(en))
2149 *len = en->char_len;
2150 else {
2151 r = get_char_length_tree1(en->target, reg, len, level);
2152 if (r == 0) {
2153 en->char_len = *len;
2154 SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
2155 }
2156 }
2157 break;
2158 #endif
2159 case ENCLOSE_OPTION:
2160 case ENCLOSE_STOP_BACKTRACK:
2161 r = get_char_length_tree1(en->target, reg, len, level);
2162 break;
2163 default:
2164 break;
2165 }
2166 }
2167 break;
2168
2169 case NT_ANCHOR:
2170 break;
2171
2172 default:
2173 r = GET_CHAR_LEN_VARLEN;
2174 break;
2175 }
2176
2177 return r;
2178 }
2179
2180 static int
get_char_length_tree(Node * node,regex_t * reg,int * len)2181 get_char_length_tree(Node* node, regex_t* reg, int* len)
2182 {
2183 return get_char_length_tree1(node, reg, len, 0);
2184 }
2185
2186 /* x is not included y ==> 1 : 0 */
2187 static int
is_not_included(Node * x,Node * y,regex_t * reg)2188 is_not_included(Node* x, Node* y, regex_t* reg)
2189 {
2190 int i, len;
2191 OnigCodePoint code;
2192 UChar *p;
2193 int ytype;
2194
2195 retry:
2196 ytype = NTYPE(y);
2197 switch (NTYPE(x)) {
2198 case NT_CTYPE:
2199 {
2200 switch (ytype) {
2201 case NT_CTYPE:
2202 if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
2203 NCTYPE(y)->not != NCTYPE(x)->not)
2204 return 1;
2205 else
2206 return 0;
2207 break;
2208
2209 case NT_CCLASS:
2210 swap:
2211 {
2212 Node* tmp;
2213 tmp = x; x = y; y = tmp;
2214 goto retry;
2215 }
2216 break;
2217
2218 case NT_STR:
2219 goto swap;
2220 break;
2221
2222 default:
2223 break;
2224 }
2225 }
2226 break;
2227
2228 case NT_CCLASS:
2229 {
2230 CClassNode* xc = NCCLASS(x);
2231 switch (ytype) {
2232 case NT_CTYPE:
2233 switch (NCTYPE(y)->ctype) {
2234 case ONIGENC_CTYPE_WORD:
2235 if (NCTYPE(y)->not == 0) {
2236 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2237 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2238 if (BITSET_AT(xc->bs, i)) {
2239 if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
2240 }
2241 }
2242 return 1;
2243 }
2244 return 0;
2245 }
2246 else {
2247 if (IS_NOT_NULL(xc->mbuf)) return 0;
2248 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2249 if (! IS_CODE_SB_WORD(reg->enc, i)) {
2250 if (!IS_NCCLASS_NOT(xc)) {
2251 if (BITSET_AT(xc->bs, i))
2252 return 0;
2253 }
2254 else {
2255 if (! BITSET_AT(xc->bs, i))
2256 return 0;
2257 }
2258 }
2259 }
2260 return 1;
2261 }
2262 break;
2263
2264 default:
2265 break;
2266 }
2267 break;
2268
2269 case NT_CCLASS:
2270 {
2271 int v;
2272 CClassNode* yc = NCCLASS(y);
2273
2274 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2275 v = BITSET_AT(xc->bs, i);
2276 if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
2277 (v == 0 && IS_NCCLASS_NOT(xc))) {
2278 v = BITSET_AT(yc->bs, i);
2279 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2280 (v == 0 && IS_NCCLASS_NOT(yc)))
2281 return 0;
2282 }
2283 }
2284 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2285 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2286 return 1;
2287 return 0;
2288 }
2289 break;
2290
2291 case NT_STR:
2292 goto swap;
2293 break;
2294
2295 default:
2296 break;
2297 }
2298 }
2299 break;
2300
2301 case NT_STR:
2302 {
2303 StrNode* xs = NSTR(x);
2304 if (NSTRING_LEN(x) == 0)
2305 break;
2306
2307 //c = *(xs->s);
2308 switch (ytype) {
2309 case NT_CTYPE:
2310 switch (NCTYPE(y)->ctype) {
2311 case ONIGENC_CTYPE_WORD:
2312 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2313 return NCTYPE(y)->not;
2314 else
2315 return !(NCTYPE(y)->not);
2316 break;
2317 default:
2318 break;
2319 }
2320 break;
2321
2322 case NT_CCLASS:
2323 {
2324 CClassNode* cc = NCCLASS(y);
2325
2326 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2327 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2328 return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2329 }
2330 break;
2331
2332 case NT_STR:
2333 {
2334 UChar *q;
2335 StrNode* ys = NSTR(y);
2336 len = NSTRING_LEN(x);
2337 if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2338 if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2339 /* tiny version */
2340 return 0;
2341 }
2342 else {
2343 for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2344 if (*p != *q) return 1;
2345 }
2346 }
2347 }
2348 break;
2349
2350 default:
2351 break;
2352 }
2353 }
2354 break;
2355
2356 default:
2357 break;
2358 }
2359
2360 return 0;
2361 }
2362
2363 static Node*
get_head_value_node(Node * node,int exact,regex_t * reg)2364 get_head_value_node(Node* node, int exact, regex_t* reg)
2365 {
2366 Node* n = NULL_NODE;
2367
2368 switch (NTYPE(node)) {
2369 case NT_BREF:
2370 case NT_ALT:
2371 case NT_CANY:
2372 #ifdef USE_SUBEXP_CALL
2373 case NT_CALL:
2374 #endif
2375 break;
2376
2377 case NT_CTYPE:
2378 case NT_CCLASS:
2379 if (exact == 0) {
2380 n = node;
2381 }
2382 break;
2383
2384 case NT_LIST:
2385 n = get_head_value_node(NCAR(node), exact, reg);
2386 break;
2387
2388 case NT_STR:
2389 {
2390 StrNode* sn = NSTR(node);
2391
2392 if (sn->end <= sn->s)
2393 break;
2394
2395 if (exact != 0 &&
2396 !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
2397 }
2398 else {
2399 n = node;
2400 }
2401 }
2402 break;
2403
2404 case NT_QTFR:
2405 {
2406 QtfrNode* qn = NQTFR(node);
2407 if (qn->lower > 0) {
2408 if (IS_NOT_NULL(qn->head_exact))
2409 n = qn->head_exact;
2410 else
2411 n = get_head_value_node(qn->target, exact, reg);
2412 }
2413 }
2414 break;
2415
2416 case NT_ENCLOSE:
2417 {
2418 EncloseNode* en = NENCLOSE(node);
2419 switch (en->type) {
2420 case ENCLOSE_OPTION:
2421 {
2422 OnigOptionType options = reg->options;
2423
2424 reg->options = NENCLOSE(node)->option;
2425 n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
2426 reg->options = options;
2427 }
2428 break;
2429
2430 case ENCLOSE_MEMORY:
2431 case ENCLOSE_STOP_BACKTRACK:
2432 n = get_head_value_node(en->target, exact, reg);
2433 break;
2434 }
2435 }
2436 break;
2437
2438 case NT_ANCHOR:
2439 if (NANCHOR(node)->type == ANCHOR_PREC_READ)
2440 n = get_head_value_node(NANCHOR(node)->target, exact, reg);
2441 break;
2442
2443 default:
2444 break;
2445 }
2446
2447 return n;
2448 }
2449
2450 static int
check_type_tree(Node * node,int type_mask,int enclose_mask,int anchor_mask)2451 check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
2452 {
2453 int type, r = 0;
2454
2455 type = NTYPE(node);
2456 if ((NTYPE2BIT(type) & type_mask) == 0)
2457 return 1;
2458
2459 switch (type) {
2460 case NT_LIST:
2461 case NT_ALT:
2462 do {
2463 r = check_type_tree(NCAR(node), type_mask, enclose_mask,
2464 anchor_mask);
2465 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2466 break;
2467
2468 case NT_QTFR:
2469 r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
2470 anchor_mask);
2471 break;
2472
2473 case NT_ENCLOSE:
2474 {
2475 EncloseNode* en = NENCLOSE(node);
2476 if ((en->type & enclose_mask) == 0)
2477 return 1;
2478
2479 r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
2480 }
2481 break;
2482
2483 case NT_ANCHOR:
2484 type = NANCHOR(node)->type;
2485 if ((type & anchor_mask) == 0)
2486 return 1;
2487
2488 if (NANCHOR(node)->target)
2489 r = check_type_tree(NANCHOR(node)->target,
2490 type_mask, enclose_mask, anchor_mask);
2491 break;
2492
2493 default:
2494 break;
2495 }
2496 return r;
2497 }
2498
2499 static int
get_min_len(Node * node,OnigLen * min,ScanEnv * env)2500 get_min_len(Node* node, OnigLen *min, ScanEnv* env)
2501 {
2502 OnigLen tmin;
2503 int r = 0;
2504
2505 *min = 0;
2506 switch (NTYPE(node)) {
2507 case NT_BREF:
2508 {
2509 int i;
2510 int* backs;
2511 Node** nodes = SCANENV_MEM_NODES(env);
2512 BRefNode* br = NBREF(node);
2513 if (br->state & NST_RECURSION) break;
2514
2515 backs = BACKREFS_P(br);
2516 if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2517 r = get_min_len(nodes[backs[0]], min, env);
2518 if (r != 0) break;
2519 for (i = 1; i < br->back_num; i++) {
2520 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2521 r = get_min_len(nodes[backs[i]], &tmin, env);
2522 if (r != 0) break;
2523 if (*min > tmin) *min = tmin;
2524 }
2525 }
2526 break;
2527
2528 #ifdef USE_SUBEXP_CALL
2529 case NT_CALL:
2530 if (IS_CALL_RECURSION(NCALL(node))) {
2531 EncloseNode* en = NENCLOSE(NCALL(node)->target);
2532 if (IS_ENCLOSE_MIN_FIXED(en))
2533 *min = en->min_len;
2534 }
2535 else
2536 r = get_min_len(NCALL(node)->target, min, env);
2537 break;
2538 #endif
2539
2540 case NT_LIST:
2541 do {
2542 r = get_min_len(NCAR(node), &tmin, env);
2543 if (r == 0) *min += tmin;
2544 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2545 break;
2546
2547 case NT_ALT:
2548 {
2549 Node *x, *y;
2550 y = node;
2551 do {
2552 x = NCAR(y);
2553 r = get_min_len(x, &tmin, env);
2554 if (r != 0) break;
2555 if (y == node) *min = tmin;
2556 else if (*min > tmin) *min = tmin;
2557 } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
2558 }
2559 break;
2560
2561 case NT_STR:
2562 {
2563 StrNode* sn = NSTR(node);
2564 *min = sn->end - sn->s;
2565 }
2566 break;
2567
2568 case NT_CTYPE:
2569 *min = 1;
2570 break;
2571
2572 case NT_CCLASS:
2573 case NT_CANY:
2574 *min = 1;
2575 break;
2576
2577 case NT_QTFR:
2578 {
2579 QtfrNode* qn = NQTFR(node);
2580
2581 if (qn->lower > 0) {
2582 r = get_min_len(qn->target, min, env);
2583 if (r == 0)
2584 *min = distance_multiply(*min, qn->lower);
2585 }
2586 }
2587 break;
2588
2589 case NT_ENCLOSE:
2590 {
2591 EncloseNode* en = NENCLOSE(node);
2592 switch (en->type) {
2593 case ENCLOSE_MEMORY:
2594 if (IS_ENCLOSE_MIN_FIXED(en))
2595 *min = en->min_len;
2596 else {
2597 if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2598 *min = 0; // recursive
2599 else {
2600 SET_ENCLOSE_STATUS(node, NST_MARK1);
2601 r = get_min_len(en->target, min, env);
2602 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2603 if (r == 0) {
2604 en->min_len = *min;
2605 SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
2606 }
2607 }
2608 }
2609 break;
2610
2611 case ENCLOSE_OPTION:
2612 case ENCLOSE_STOP_BACKTRACK:
2613 r = get_min_len(en->target, min, env);
2614 break;
2615 }
2616 }
2617 break;
2618
2619 case NT_ANCHOR:
2620 default:
2621 break;
2622 }
2623
2624 return r;
2625 }
2626
2627 static int
get_max_len(Node * node,OnigLen * max,ScanEnv * env)2628 get_max_len(Node* node, OnigLen *max, ScanEnv* env)
2629 {
2630 OnigLen tmax;
2631 int r = 0;
2632
2633 *max = 0;
2634 switch (NTYPE(node)) {
2635 case NT_LIST:
2636 do {
2637 r = get_max_len(NCAR(node), &tmax, env);
2638 if (r == 0)
2639 *max = distance_add(*max, tmax);
2640 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2641 break;
2642
2643 case NT_ALT:
2644 do {
2645 r = get_max_len(NCAR(node), &tmax, env);
2646 if (r == 0 && *max < tmax) *max = tmax;
2647 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2648 break;
2649
2650 case NT_STR:
2651 {
2652 StrNode* sn = NSTR(node);
2653 *max = sn->end - sn->s;
2654 }
2655 break;
2656
2657 case NT_CTYPE:
2658 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2659 break;
2660
2661 case NT_CCLASS:
2662 case NT_CANY:
2663 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2664 break;
2665
2666 case NT_BREF:
2667 {
2668 int i;
2669 int* backs;
2670 Node** nodes = SCANENV_MEM_NODES(env);
2671 BRefNode* br = NBREF(node);
2672 if (br->state & NST_RECURSION) {
2673 *max = ONIG_INFINITE_DISTANCE;
2674 break;
2675 }
2676 backs = BACKREFS_P(br);
2677 for (i = 0; i < br->back_num; i++) {
2678 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2679 r = get_max_len(nodes[backs[i]], &tmax, env);
2680 if (r != 0) break;
2681 if (*max < tmax) *max = tmax;
2682 }
2683 }
2684 break;
2685
2686 #ifdef USE_SUBEXP_CALL
2687 case NT_CALL:
2688 if (! IS_CALL_RECURSION(NCALL(node)))
2689 r = get_max_len(NCALL(node)->target, max, env);
2690 else
2691 *max = ONIG_INFINITE_DISTANCE;
2692 break;
2693 #endif
2694
2695 case NT_QTFR:
2696 {
2697 QtfrNode* qn = NQTFR(node);
2698
2699 if (qn->upper != 0) {
2700 r = get_max_len(qn->target, max, env);
2701 if (r == 0 && *max != 0) {
2702 if (! IS_REPEAT_INFINITE(qn->upper))
2703 *max = distance_multiply(*max, qn->upper);
2704 else
2705 *max = ONIG_INFINITE_DISTANCE;
2706 }
2707 }
2708 }
2709 break;
2710
2711 case NT_ENCLOSE:
2712 {
2713 EncloseNode* en = NENCLOSE(node);
2714 switch (en->type) {
2715 case ENCLOSE_MEMORY:
2716 if (IS_ENCLOSE_MAX_FIXED(en))
2717 *max = en->max_len;
2718 else {
2719 if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2720 *max = ONIG_INFINITE_DISTANCE;
2721 else {
2722 SET_ENCLOSE_STATUS(node, NST_MARK1);
2723 r = get_max_len(en->target, max, env);
2724 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2725 if (r == 0) {
2726 en->max_len = *max;
2727 SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
2728 }
2729 }
2730 }
2731 break;
2732
2733 case ENCLOSE_OPTION:
2734 case ENCLOSE_STOP_BACKTRACK:
2735 r = get_max_len(en->target, max, env);
2736 break;
2737 }
2738 }
2739 break;
2740
2741 case NT_ANCHOR:
2742 default:
2743 break;
2744 }
2745
2746 return r;
2747 }
2748
2749
2750 #ifdef USE_SUBEXP_CALL
2751
2752 #define RECURSION_EXIST 1
2753 #define RECURSION_INFINITE 2
2754
2755 static int
subexp_inf_recursive_check(Node * node,ScanEnv * env,int head)2756 subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2757 {
2758 int type;
2759 int r = 0;
2760
2761 type = NTYPE(node);
2762 switch (type) {
2763 case NT_LIST:
2764 {
2765 Node *x;
2766 OnigLen min;
2767 int ret;
2768
2769 x = node;
2770 do {
2771 ret = subexp_inf_recursive_check(NCAR(x), env, head);
2772 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2773 r |= ret;
2774 if (head) {
2775 ret = get_min_len(NCAR(x), &min, env);
2776 if (ret != 0) return ret;
2777 if (min != 0) head = 0;
2778 }
2779 } while (IS_NOT_NULL(x = NCDR(x)));
2780 }
2781 break;
2782
2783 case NT_ALT:
2784 {
2785 int ret;
2786 r = RECURSION_EXIST;
2787 do {
2788 ret = subexp_inf_recursive_check(NCAR(node), env, head);
2789 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2790 r &= ret;
2791 } while (IS_NOT_NULL(node = NCDR(node)));
2792 }
2793 break;
2794
2795 case NT_QTFR:
2796 r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
2797 if (r == RECURSION_EXIST) {
2798 if (NQTFR(node)->lower == 0) r = 0;
2799 }
2800 break;
2801
2802 case NT_ANCHOR:
2803 {
2804 AnchorNode* an = NANCHOR(node);
2805 switch (an->type) {
2806 case ANCHOR_PREC_READ:
2807 case ANCHOR_PREC_READ_NOT:
2808 case ANCHOR_LOOK_BEHIND:
2809 case ANCHOR_LOOK_BEHIND_NOT:
2810 r = subexp_inf_recursive_check(an->target, env, head);
2811 break;
2812 }
2813 }
2814 break;
2815
2816 case NT_CALL:
2817 r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
2818 break;
2819
2820 case NT_ENCLOSE:
2821 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2822 return 0;
2823 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2824 return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2825 else {
2826 SET_ENCLOSE_STATUS(node, NST_MARK2);
2827 r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
2828 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2829 }
2830 break;
2831
2832 default:
2833 break;
2834 }
2835
2836 return r;
2837 }
2838
2839 static int
subexp_inf_recursive_check_trav(Node * node,ScanEnv * env)2840 subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
2841 {
2842 int type;
2843 int r = 0;
2844
2845 type = NTYPE(node);
2846 switch (type) {
2847 case NT_LIST:
2848 case NT_ALT:
2849 do {
2850 r = subexp_inf_recursive_check_trav(NCAR(node), env);
2851 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2852 break;
2853
2854 case NT_QTFR:
2855 r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
2856 break;
2857
2858 case NT_ANCHOR:
2859 {
2860 AnchorNode* an = NANCHOR(node);
2861 switch (an->type) {
2862 case ANCHOR_PREC_READ:
2863 case ANCHOR_PREC_READ_NOT:
2864 case ANCHOR_LOOK_BEHIND:
2865 case ANCHOR_LOOK_BEHIND_NOT:
2866 r = subexp_inf_recursive_check_trav(an->target, env);
2867 break;
2868 }
2869 }
2870 break;
2871
2872 case NT_ENCLOSE:
2873 {
2874 EncloseNode* en = NENCLOSE(node);
2875
2876 if (IS_ENCLOSE_RECURSION(en)) {
2877 SET_ENCLOSE_STATUS(node, NST_MARK1);
2878 r = subexp_inf_recursive_check(en->target, env, 1);
2879 if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
2880 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2881 }
2882 r = subexp_inf_recursive_check_trav(en->target, env);
2883 }
2884 break;
2885
2886 default:
2887 break;
2888 }
2889
2890 return r;
2891 }
2892
2893 static int
subexp_recursive_check(Node * node)2894 subexp_recursive_check(Node* node)
2895 {
2896 int r = 0;
2897
2898 switch (NTYPE(node)) {
2899 case NT_LIST:
2900 case NT_ALT:
2901 do {
2902 r |= subexp_recursive_check(NCAR(node));
2903 } while (IS_NOT_NULL(node = NCDR(node)));
2904 break;
2905
2906 case NT_QTFR:
2907 r = subexp_recursive_check(NQTFR(node)->target);
2908 break;
2909
2910 case NT_ANCHOR:
2911 {
2912 AnchorNode* an = NANCHOR(node);
2913 switch (an->type) {
2914 case ANCHOR_PREC_READ:
2915 case ANCHOR_PREC_READ_NOT:
2916 case ANCHOR_LOOK_BEHIND:
2917 case ANCHOR_LOOK_BEHIND_NOT:
2918 r = subexp_recursive_check(an->target);
2919 break;
2920 }
2921 }
2922 break;
2923
2924 case NT_CALL:
2925 r = subexp_recursive_check(NCALL(node)->target);
2926 if (r != 0) SET_CALL_RECURSION(node);
2927 break;
2928
2929 case NT_ENCLOSE:
2930 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2931 return 0;
2932 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2933 return 1; /* recursion */
2934 else {
2935 SET_ENCLOSE_STATUS(node, NST_MARK2);
2936 r = subexp_recursive_check(NENCLOSE(node)->target);
2937 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2938 }
2939 break;
2940
2941 default:
2942 break;
2943 }
2944
2945 return r;
2946 }
2947
2948
2949 static int
subexp_recursive_check_trav(Node * node,ScanEnv * env)2950 subexp_recursive_check_trav(Node* node, ScanEnv* env)
2951 {
2952 #define FOUND_CALLED_NODE 1
2953
2954 int type;
2955 int r = 0;
2956
2957 type = NTYPE(node);
2958 switch (type) {
2959 case NT_LIST:
2960 case NT_ALT:
2961 {
2962 int ret;
2963 do {
2964 ret = subexp_recursive_check_trav(NCAR(node), env);
2965 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
2966 else if (ret < 0) return ret;
2967 } while (IS_NOT_NULL(node = NCDR(node)));
2968 }
2969 break;
2970
2971 case NT_QTFR:
2972 r = subexp_recursive_check_trav(NQTFR(node)->target, env);
2973 if (NQTFR(node)->upper == 0) {
2974 if (r == FOUND_CALLED_NODE)
2975 NQTFR(node)->is_refered = 1;
2976 }
2977 break;
2978
2979 case NT_ANCHOR:
2980 {
2981 AnchorNode* an = NANCHOR(node);
2982 switch (an->type) {
2983 case ANCHOR_PREC_READ:
2984 case ANCHOR_PREC_READ_NOT:
2985 case ANCHOR_LOOK_BEHIND:
2986 case ANCHOR_LOOK_BEHIND_NOT:
2987 r = subexp_recursive_check_trav(an->target, env);
2988 break;
2989 }
2990 }
2991 break;
2992
2993 case NT_ENCLOSE:
2994 {
2995 EncloseNode* en = NENCLOSE(node);
2996
2997 if (! IS_ENCLOSE_RECURSION(en)) {
2998 if (IS_ENCLOSE_CALLED(en)) {
2999 SET_ENCLOSE_STATUS(node, NST_MARK1);
3000 r = subexp_recursive_check(en->target);
3001 if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
3002 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
3003 }
3004 }
3005 r = subexp_recursive_check_trav(en->target, env);
3006 if (IS_ENCLOSE_CALLED(en))
3007 r |= FOUND_CALLED_NODE;
3008 }
3009 break;
3010
3011 default:
3012 break;
3013 }
3014
3015 return r;
3016 }
3017
3018 static int
setup_subexp_call(Node * node,ScanEnv * env)3019 setup_subexp_call(Node* node, ScanEnv* env)
3020 {
3021 int type;
3022 int r = 0;
3023
3024 type = NTYPE(node);
3025 switch (type) {
3026 case NT_LIST:
3027 do {
3028 r = setup_subexp_call(NCAR(node), env);
3029 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3030 break;
3031
3032 case NT_ALT:
3033 do {
3034 r = setup_subexp_call(NCAR(node), env);
3035 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3036 break;
3037
3038 case NT_QTFR:
3039 r = setup_subexp_call(NQTFR(node)->target, env);
3040 break;
3041 case NT_ENCLOSE:
3042 r = setup_subexp_call(NENCLOSE(node)->target, env);
3043 break;
3044
3045 case NT_CALL:
3046 {
3047 CallNode* cn = NCALL(node);
3048 Node** nodes = SCANENV_MEM_NODES(env);
3049
3050 if (cn->group_num != 0) {
3051 int gnum = cn->group_num;
3052
3053 #ifdef USE_NAMED_GROUP
3054 if (env->num_named > 0 &&
3055 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3056 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3057 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3058 }
3059 #endif
3060 if (gnum > env->num_mem) {
3061 onig_scan_env_set_error_string(env,
3062 ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3063 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3064 }
3065
3066 #ifdef USE_NAMED_GROUP
3067 set_call_attr:
3068 #endif
3069 cn->target = nodes[cn->group_num];
3070 if (IS_NULL(cn->target)) {
3071 onig_scan_env_set_error_string(env,
3072 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3073 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3074 }
3075 SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
3076 BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
3077 cn->unset_addr_list = env->unset_addr_list;
3078 }
3079 #ifdef USE_NAMED_GROUP
3080 else {
3081 int *refs;
3082
3083 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
3084 &refs);
3085 if (n <= 0) {
3086 onig_scan_env_set_error_string(env,
3087 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3088 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3089 }
3090 else if (n > 1) {
3091 onig_scan_env_set_error_string(env,
3092 ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3093 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3094 }
3095 else {
3096 cn->group_num = refs[0];
3097 goto set_call_attr;
3098 }
3099 }
3100 #endif
3101 }
3102 break;
3103
3104 case NT_ANCHOR:
3105 {
3106 AnchorNode* an = NANCHOR(node);
3107
3108 switch (an->type) {
3109 case ANCHOR_PREC_READ:
3110 case ANCHOR_PREC_READ_NOT:
3111 case ANCHOR_LOOK_BEHIND:
3112 case ANCHOR_LOOK_BEHIND_NOT:
3113 r = setup_subexp_call(an->target, env);
3114 break;
3115 }
3116 }
3117 break;
3118
3119 default:
3120 break;
3121 }
3122
3123 return r;
3124 }
3125 #endif
3126
3127 /* divide different length alternatives in look-behind.
3128 (?<=A|B) ==> (?<=A)|(?<=B)
3129 (?<!A|B) ==> (?<!A)(?<!B)
3130 */
3131 static int
divide_look_behind_alternatives(Node * node)3132 divide_look_behind_alternatives(Node* node)
3133 {
3134 Node *head, *np, *insert_node;
3135 AnchorNode* an = NANCHOR(node);
3136 int anc_type = an->type;
3137
3138 /* fprintf(stderr, "divide_look_behind: %d\n", (int )node); */
3139
3140 head = an->target;
3141 np = NCAR(head);
3142 swap_node(node, head);
3143 NCAR(node) = head;
3144 NANCHOR(head)->target = np;
3145
3146 np = node;
3147 while ((np = NCDR(np)) != NULL_NODE) {
3148 insert_node = onig_node_new_anchor(anc_type);
3149 CHECK_NULL_RETURN_MEMERR(insert_node);
3150 NANCHOR(insert_node)->target = NCAR(np);
3151 NCAR(np) = insert_node;
3152 }
3153
3154 if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3155 np = node;
3156 do {
3157 SET_NTYPE(np, NT_LIST); /* alt -> list */
3158 } while ((np = NCDR(np)) != NULL_NODE);
3159 }
3160 return 0;
3161 }
3162
3163 static int
setup_look_behind(Node * node,regex_t * reg,ScanEnv * env)3164 setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3165 {
3166 int r, len;
3167 AnchorNode* an = NANCHOR(node);
3168
3169 /* fprintf(stderr, "setup_look_behind: %x\n", (int )node); */
3170
3171 r = get_char_length_tree(an->target, reg, &len);
3172 if (r == 0)
3173 an->char_len = len;
3174 else if (r == GET_CHAR_LEN_VARLEN)
3175 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3176 else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3177 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3178 r = divide_look_behind_alternatives(node);
3179 else
3180 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3181 }
3182
3183 return r;
3184 }
3185
3186 static int
next_setup(Node * node,Node * next_node,regex_t * reg)3187 next_setup(Node* node, Node* next_node, regex_t* reg)
3188 {
3189 int type;
3190
3191 retry:
3192 type = NTYPE(node);
3193 if (type == NT_QTFR) {
3194 QtfrNode* qn = NQTFR(node);
3195 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3196 #ifdef USE_QTFR_PEEK_NEXT
3197 Node* n = get_head_value_node(next_node, 1, reg);
3198 /* '\0': for UTF-16BE etc... */
3199 if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
3200 qn->next_head_exact = n;
3201 }
3202 #endif
3203 /* automatic posseivation a*b ==> (?>a*)b */
3204 if (qn->lower <= 1) {
3205 int ttype = NTYPE(qn->target);
3206 if (IS_NODE_TYPE_SIMPLE(ttype)) {
3207 Node *x, *y;
3208 x = get_head_value_node(qn->target, 0, reg);
3209 if (IS_NOT_NULL(x)) {
3210 y = get_head_value_node(next_node, 0, reg);
3211 if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3212 Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
3213 CHECK_NULL_RETURN_MEMERR(en);
3214 SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3215 swap_node(node, en);
3216 NENCLOSE(node)->target = en;
3217 }
3218 }
3219 }
3220 }
3221 }
3222 }
3223 else if (type == NT_ENCLOSE) {
3224 EncloseNode* en = NENCLOSE(node);
3225 if (en->type == ENCLOSE_MEMORY) {
3226 node = en->target;
3227 goto retry;
3228 }
3229 }
3230 return 0;
3231 }
3232
3233
3234 static int
update_string_node_case_fold(regex_t * reg,Node * node)3235 update_string_node_case_fold(regex_t* reg, Node *node)
3236 {
3237 UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3238 UChar *sbuf, *ebuf, *sp;
3239 int r, i, len, sbuf_size;
3240 StrNode* sn = NSTR(node);
3241
3242 end = sn->end;
3243 sbuf_size = (end - sn->s) * 2;
3244 sbuf = (UChar* )xmalloc(sbuf_size);
3245 CHECK_NULL_RETURN_MEMERR(sbuf);
3246 ebuf = sbuf + sbuf_size;
3247
3248 sp = sbuf;
3249 p = sn->s;
3250 while (p < end) {
3251 len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3252 for (i = 0; i < len; i++) {
3253 if (sp >= ebuf) {
3254 sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2);
3255 CHECK_NULL_RETURN_MEMERR(sbuf);
3256 sp = sbuf + sbuf_size;
3257 sbuf_size *= 2;
3258 ebuf = sbuf + sbuf_size;
3259 }
3260
3261 *sp++ = buf[i];
3262 }
3263 }
3264
3265 r = onig_node_str_set(node, sbuf, sp);
3266 if (r != 0) {
3267 xfree(sbuf);
3268 return r;
3269 }
3270
3271 xfree(sbuf);
3272 return 0;
3273 }
3274
3275 static int
expand_case_fold_make_rem_string(Node ** rnode,UChar * s,UChar * end,regex_t * reg)3276 expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
3277 regex_t* reg)
3278 {
3279 int r;
3280 Node *node;
3281
3282 node = onig_node_new_str(s, end);
3283 if (IS_NULL(node)) return ONIGERR_MEMORY;
3284
3285 r = update_string_node_case_fold(reg, node);
3286 if (r != 0) {
3287 onig_node_free(node);
3288 return r;
3289 }
3290
3291 NSTRING_SET_AMBIG(node);
3292 NSTRING_SET_DONT_GET_OPT_INFO(node);
3293 *rnode = node;
3294 return 0;
3295 }
3296
3297 static int
expand_case_fold_string_alt(int item_num,OnigCaseFoldCodeItem items[],UChar * p,int slen,UChar * end,regex_t * reg,Node ** rnode)3298 expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
3299 UChar *p, int slen, UChar *end,
3300 regex_t* reg, Node **rnode)
3301 {
3302 int r, i, j, len, varlen;
3303 Node *anode, *var_anode, *snode, *xnode, *an;
3304 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3305
3306 *rnode = var_anode = NULL_NODE;
3307
3308 varlen = 0;
3309 for (i = 0; i < item_num; i++) {
3310 if (items[i].byte_len != slen) {
3311 varlen = 1;
3312 break;
3313 }
3314 }
3315
3316 if (varlen != 0) {
3317 *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3318 if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3319
3320 xnode = onig_node_new_list(NULL, NULL);
3321 if (IS_NULL(xnode)) goto mem_err;
3322 NCAR(var_anode) = xnode;
3323
3324 anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3325 if (IS_NULL(anode)) goto mem_err;
3326 NCAR(xnode) = anode;
3327 }
3328 else {
3329 *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3330 if (IS_NULL(anode)) return ONIGERR_MEMORY;
3331 }
3332
3333 snode = onig_node_new_str(p, p + slen);
3334 if (IS_NULL(snode)) goto mem_err;
3335
3336 NCAR(anode) = snode;
3337
3338 for (i = 0; i < item_num; i++) {
3339 snode = onig_node_new_str(NULL, NULL);
3340 if (IS_NULL(snode)) goto mem_err;
3341
3342 for (j = 0; j < items[i].code_len; j++) {
3343 len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3344 if (len < 0) {
3345 r = len;
3346 goto mem_err2;
3347 }
3348
3349 r = onig_node_str_cat(snode, buf, buf + len);
3350 if (r != 0) goto mem_err2;
3351 }
3352
3353 an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3354 if (IS_NULL(an)) {
3355 goto mem_err2;
3356 }
3357
3358 if (items[i].byte_len != slen) {
3359 Node *rem;
3360 UChar *q = p + items[i].byte_len;
3361
3362 if (q < end) {
3363 r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3364 if (r != 0) {
3365 onig_node_free(an);
3366 goto mem_err2;
3367 }
3368
3369 xnode = onig_node_list_add(NULL_NODE, snode);
3370 if (IS_NULL(xnode)) {
3371 onig_node_free(an);
3372 onig_node_free(rem);
3373 goto mem_err2;
3374 }
3375 if (IS_NULL(onig_node_list_add(xnode, rem))) {
3376 onig_node_free(an);
3377 onig_node_free(xnode);
3378 onig_node_free(rem);
3379 goto mem_err;
3380 }
3381
3382 NCAR(an) = xnode;
3383 }
3384 else {
3385 NCAR(an) = snode;
3386 }
3387
3388 NCDR(var_anode) = an;
3389 var_anode = an;
3390 }
3391 else {
3392 NCAR(an) = snode;
3393 NCDR(anode) = an;
3394 anode = an;
3395 }
3396 }
3397
3398 return varlen;
3399
3400 mem_err2:
3401 onig_node_free(snode);
3402
3403 mem_err:
3404 onig_node_free(*rnode);
3405
3406 return ONIGERR_MEMORY;
3407 }
3408
3409 static int
expand_case_fold_string(Node * node,regex_t * reg)3410 expand_case_fold_string(Node* node, regex_t* reg)
3411 {
3412 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
3413
3414 int r, n, len, alt_num;
3415 UChar *start, *end, *p;
3416 Node *top_root, *root, *snode, *prev_node;
3417 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3418 StrNode* sn = NSTR(node);
3419
3420 if (NSTRING_IS_AMBIG(node)) return 0;
3421
3422 start = sn->s;
3423 end = sn->end;
3424 if (start >= end) return 0;
3425
3426 r = 0;
3427 top_root = root = prev_node = snode = NULL_NODE;
3428 alt_num = 1;
3429 p = start;
3430 while (p < end) {
3431 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3432 p, end, items);
3433 if (n < 0) {
3434 r = n;
3435 goto err;
3436 }
3437
3438 SAFE_ENC_LEN(reg->enc, p, end, len);
3439
3440 if (n == 0) {
3441 if (IS_NULL(snode)) {
3442 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3443 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3444 if (IS_NULL(root)) {
3445 onig_node_free(prev_node);
3446 goto mem_err;
3447 }
3448 }
3449
3450 prev_node = snode = onig_node_new_str(NULL, NULL);
3451 if (IS_NULL(snode)) goto mem_err;
3452 if (IS_NOT_NULL(root)) {
3453 if (IS_NULL(onig_node_list_add(root, snode))) {
3454 onig_node_free(snode);
3455 goto mem_err;
3456 }
3457 }
3458 }
3459
3460 r = onig_node_str_cat(snode, p, p + len);
3461 if (r != 0) goto err;
3462 }
3463 else {
3464 alt_num *= (n + 1);
3465 if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3466
3467 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3468 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3469 if (IS_NULL(root)) {
3470 onig_node_free(prev_node);
3471 goto mem_err;
3472 }
3473 }
3474
3475 r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3476 if (r < 0) goto mem_err;
3477 if (r == 1) {
3478 if (IS_NULL(root)) {
3479 top_root = prev_node;
3480 }
3481 else {
3482 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3483 onig_node_free(prev_node);
3484 goto mem_err;
3485 }
3486 }
3487
3488 root = NCAR(prev_node);
3489 }
3490 else { /* r == 0 */
3491 if (IS_NOT_NULL(root)) {
3492 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3493 onig_node_free(prev_node);
3494 goto mem_err;
3495 }
3496 }
3497 }
3498
3499 snode = NULL_NODE;
3500 }
3501
3502 p += len;
3503 }
3504
3505 if (p < end) {
3506 Node *srem;
3507
3508 r = expand_case_fold_make_rem_string(&srem, p, end, reg);
3509 if (r != 0) goto mem_err;
3510
3511 if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
3512 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3513 if (IS_NULL(root)) {
3514 onig_node_free(srem);
3515 onig_node_free(prev_node);
3516 goto mem_err;
3517 }
3518 }
3519
3520 if (IS_NULL(root)) {
3521 prev_node = srem;
3522 }
3523 else {
3524 if (IS_NULL(onig_node_list_add(root, srem))) {
3525 onig_node_free(srem);
3526 goto mem_err;
3527 }
3528 }
3529 }
3530
3531 /* ending */
3532 top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
3533 swap_node(node, top_root);
3534 onig_node_free(top_root);
3535 return 0;
3536
3537 mem_err:
3538 r = ONIGERR_MEMORY;
3539
3540 err:
3541 onig_node_free(top_root);
3542 return r;
3543 }
3544
3545
3546 #ifdef USE_COMBINATION_EXPLOSION_CHECK
3547
3548 #define CEC_THRES_NUM_BIG_REPEAT 512
3549 #define CEC_INFINITE_NUM 0x7fffffff
3550
3551 #define CEC_IN_INFINITE_REPEAT (1<<0)
3552 #define CEC_IN_FINITE_REPEAT (1<<1)
3553 #define CEC_CONT_BIG_REPEAT (1<<2)
3554
3555 static int
setup_comb_exp_check(Node * node,int state,ScanEnv * env)3556 setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3557 {
3558 int type;
3559 int r = state;
3560
3561 type = NTYPE(node);
3562 switch (type) {
3563 case NT_LIST:
3564 {
3565 Node* prev = NULL_NODE;
3566 do {
3567 r = setup_comb_exp_check(NCAR(node), r, env);
3568 prev = NCAR(node);
3569 } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
3570 }
3571 break;
3572
3573 case NT_ALT:
3574 {
3575 int ret;
3576 do {
3577 ret = setup_comb_exp_check(NCAR(node), state, env);
3578 r |= ret;
3579 } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
3580 }
3581 break;
3582
3583 case NT_QTFR:
3584 {
3585 int child_state = state;
3586 int add_state = 0;
3587 QtfrNode* qn = NQTFR(node);
3588 Node* target = qn->target;
3589 int var_num;
3590
3591 if (! IS_REPEAT_INFINITE(qn->upper)) {
3592 if (qn->upper > 1) {
3593 /* {0,1}, {1,1} are allowed */
3594 child_state |= CEC_IN_FINITE_REPEAT;
3595
3596 /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3597 if (env->backrefed_mem == 0) {
3598 if (NTYPE(qn->target) == NT_ENCLOSE) {
3599 EncloseNode* en = NENCLOSE(qn->target);
3600 if (en->type == ENCLOSE_MEMORY) {
3601 if (NTYPE(en->target) == NT_QTFR) {
3602 QtfrNode* q = NQTFR(en->target);
3603 if (IS_REPEAT_INFINITE(q->upper)
3604 && q->greedy == qn->greedy) {
3605 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3606 if (qn->upper == 1)
3607 child_state = state;
3608 }
3609 }
3610 }
3611 }
3612 }
3613 }
3614 }
3615
3616 if (state & CEC_IN_FINITE_REPEAT) {
3617 qn->comb_exp_check_num = -1;
3618 }
3619 else {
3620 if (IS_REPEAT_INFINITE(qn->upper)) {
3621 var_num = CEC_INFINITE_NUM;
3622 child_state |= CEC_IN_INFINITE_REPEAT;
3623 }
3624 else {
3625 var_num = qn->upper - qn->lower;
3626 }
3627
3628 if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3629 add_state |= CEC_CONT_BIG_REPEAT;
3630
3631 if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3632 ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3633 var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3634 if (qn->comb_exp_check_num == 0) {
3635 env->num_comb_exp_check++;
3636 qn->comb_exp_check_num = env->num_comb_exp_check;
3637 if (env->curr_max_regnum > env->comb_exp_max_regnum)
3638 env->comb_exp_max_regnum = env->curr_max_regnum;
3639 }
3640 }
3641 }
3642
3643 r = setup_comb_exp_check(target, child_state, env);
3644 r |= add_state;
3645 }
3646 break;
3647
3648 case NT_ENCLOSE:
3649 {
3650 EncloseNode* en = NENCLOSE(node);
3651
3652 switch (en->type) {
3653 case ENCLOSE_MEMORY:
3654 {
3655 if (env->curr_max_regnum < en->regnum)
3656 env->curr_max_regnum = en->regnum;
3657
3658 r = setup_comb_exp_check(en->target, state, env);
3659 }
3660 break;
3661
3662 default:
3663 r = setup_comb_exp_check(en->target, state, env);
3664 break;
3665 }
3666 }
3667 break;
3668
3669 #ifdef USE_SUBEXP_CALL
3670 case NT_CALL:
3671 if (IS_CALL_RECURSION(NCALL(node)))
3672 env->has_recursion = 1;
3673 else
3674 r = setup_comb_exp_check(NCALL(node)->target, state, env);
3675 break;
3676 #endif
3677
3678 default:
3679 break;
3680 }
3681
3682 return r;
3683 }
3684 #endif
3685
3686 #define IN_ALT (1<<0)
3687 #define IN_NOT (1<<1)
3688 #define IN_REPEAT (1<<2)
3689 #define IN_VAR_REPEAT (1<<3)
3690 #define IN_CALL (1<<4)
3691 #define IN_RECCALL (1<<5)
3692
3693 /* setup_tree does the following work.
3694 1. check empty loop. (set qn->target_empty_info)
3695 2. expand ignore-case in char class.
3696 3. set memory status bit flags. (reg->mem_stats)
3697 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3698 5. find invalid patterns in look-behind.
3699 6. expand repeated string.
3700 */
3701 static int
setup_tree(Node * node,regex_t * reg,int state,ScanEnv * env)3702 setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3703 {
3704 int type;
3705 int r = 0;
3706
3707 type = NTYPE(node);
3708 switch (type) {
3709 case NT_LIST:
3710 {
3711 Node* prev = NULL_NODE;
3712 do {
3713 r = setup_tree(NCAR(node), reg, state, env);
3714 if (IS_NOT_NULL(prev) && r == 0) {
3715 r = next_setup(prev, NCAR(node), reg);
3716 }
3717 prev = NCAR(node);
3718 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3719 }
3720 break;
3721
3722 case NT_ALT:
3723 do {
3724 r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
3725 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3726 break;
3727
3728 case NT_CCLASS:
3729 break;
3730
3731 case NT_STR:
3732 if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3733 r = expand_case_fold_string(node, reg);
3734 }
3735 break;
3736
3737 case NT_CTYPE:
3738 case NT_CANY:
3739 break;
3740
3741 #ifdef USE_SUBEXP_CALL
3742 case NT_CALL:
3743 break;
3744 #endif
3745
3746 case NT_BREF:
3747 {
3748 int i;
3749 int* p;
3750 Node** nodes = SCANENV_MEM_NODES(env);
3751 BRefNode* br = NBREF(node);
3752 p = BACKREFS_P(br);
3753 for (i = 0; i < br->back_num; i++) {
3754 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
3755 BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3756 BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3757 #ifdef USE_BACKREF_WITH_LEVEL
3758 if (IS_BACKREF_NEST_LEVEL(br)) {
3759 BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3760 }
3761 #endif
3762 SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3763 }
3764 }
3765 break;
3766
3767 case NT_QTFR:
3768 {
3769 OnigLen d;
3770 QtfrNode* qn = NQTFR(node);
3771 Node* target = qn->target;
3772
3773 if ((state & IN_REPEAT) != 0) {
3774 qn->state |= NST_IN_REPEAT;
3775 }
3776
3777 if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3778 r = get_min_len(target, &d, env);
3779 if (r) break;
3780 if (d == 0) {
3781 qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3782 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
3783 r = quantifiers_memory_node_info(target);
3784 if (r < 0) break;
3785 if (r > 0) {
3786 qn->target_empty_info = r;
3787 }
3788 #endif
3789 #if 0
3790 r = get_max_len(target, &d, env);
3791 if (r == 0 && d == 0) {
3792 /* ()* ==> ()?, ()+ ==> () */
3793 qn->upper = 1;
3794 if (qn->lower > 1) qn->lower = 1;
3795 if (NTYPE(target) == NT_STR) {
3796 qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
3797 }
3798 }
3799 #endif
3800 }
3801 }
3802
3803 state |= IN_REPEAT;
3804 if (qn->lower != qn->upper)
3805 state |= IN_VAR_REPEAT;
3806 r = setup_tree(target, reg, state, env);
3807 if (r) break;
3808
3809 /* expand string */
3810 #define EXPAND_STRING_MAX_LENGTH 100
3811 if (NTYPE(target) == NT_STR) {
3812 if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper &&
3813 qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3814 int len = NSTRING_LEN(target);
3815 StrNode* sn = NSTR(target);
3816
3817 if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3818 int i, n = qn->lower;
3819 onig_node_conv_to_str_node(node, NSTR(target)->flag);
3820 for (i = 0; i < n; i++) {
3821 r = onig_node_str_cat(node, sn->s, sn->end);
3822 if (r) break;
3823 }
3824 onig_node_free(target);
3825 break; /* break case NT_QTFR: */
3826 }
3827 }
3828 }
3829
3830 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
3831 if (qn->greedy && (qn->target_empty_info != 0)) {
3832 if (NTYPE(target) == NT_QTFR) {
3833 QtfrNode* tqn = NQTFR(target);
3834 if (IS_NOT_NULL(tqn->head_exact)) {
3835 qn->head_exact = tqn->head_exact;
3836 tqn->head_exact = NULL;
3837 }
3838 }
3839 else {
3840 qn->head_exact = get_head_value_node(qn->target, 1, reg);
3841 }
3842 }
3843 #endif
3844 }
3845 break;
3846
3847 case NT_ENCLOSE:
3848 {
3849 EncloseNode* en = NENCLOSE(node);
3850
3851 switch (en->type) {
3852 case ENCLOSE_OPTION:
3853 {
3854 OnigOptionType options = reg->options;
3855 reg->options = NENCLOSE(node)->option;
3856 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
3857 reg->options = options;
3858 }
3859 break;
3860
3861 case ENCLOSE_MEMORY:
3862 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_CALL)) != 0) {
3863 BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
3864 /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
3865 }
3866 if (IS_ENCLOSE_CALLED(en))
3867 state |= IN_CALL;
3868 if (IS_ENCLOSE_RECURSION(en))
3869 state |= IN_RECCALL;
3870 else if ((state & IN_RECCALL) != 0)
3871 SET_CALL_RECURSION(node);
3872 r = setup_tree(en->target, reg, state, env);
3873 break;
3874
3875 case ENCLOSE_STOP_BACKTRACK:
3876 {
3877 Node* target = en->target;
3878 r = setup_tree(target, reg, state, env);
3879 if (NTYPE(target) == NT_QTFR) {
3880 QtfrNode* tqn = NQTFR(target);
3881 if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
3882 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
3883 int qtype = NTYPE(tqn->target);
3884 if (IS_NODE_TYPE_SIMPLE(qtype))
3885 SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
3886 }
3887 }
3888 }
3889 break;
3890 }
3891 }
3892 break;
3893
3894 case NT_ANCHOR:
3895 {
3896 AnchorNode* an = NANCHOR(node);
3897
3898 switch (an->type) {
3899 case ANCHOR_PREC_READ:
3900 r = setup_tree(an->target, reg, state, env);
3901 break;
3902 case ANCHOR_PREC_READ_NOT:
3903 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3904 break;
3905
3906 /* allowed node types in look-behind */
3907 #define ALLOWED_TYPE_IN_LB \
3908 ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
3909 BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
3910
3911 #define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY | ENCLOSE_OPTION )
3912 #define ALLOWED_ENCLOSE_IN_LB_NOT ENCLOSE_OPTION
3913
3914 #define ALLOWED_ANCHOR_IN_LB \
3915 ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
3916
3917 #define ALLOWED_ANCHOR_IN_LB_NOT \
3918 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
3919
3920 case ANCHOR_LOOK_BEHIND:
3921 {
3922 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3923 ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
3924 if (r < 0) return r;
3925 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3926 r = setup_tree(an->target, reg, state, env);
3927 if (r != 0) return r;
3928 r = setup_look_behind(node, reg, env);
3929 }
3930 break;
3931
3932 case ANCHOR_LOOK_BEHIND_NOT:
3933 {
3934 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3935 ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
3936 if (r < 0) return r;
3937 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3938 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3939 if (r != 0) return r;
3940 r = setup_look_behind(node, reg, env);
3941 }
3942 break;
3943 }
3944 }
3945 break;
3946
3947 default:
3948 break;
3949 }
3950
3951 return r;
3952 }
3953
3954 /* set skip map for Boyer-Moore search */
3955 static int
set_bm_skip(UChar * s,UChar * end,OnigEncoding enc ARG_UNUSED,UChar skip[],int ** int_skip)3956 set_bm_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED,
3957 UChar skip[], int** int_skip)
3958 {
3959 int i, len;
3960
3961 len = end - s;
3962 if (len < ONIG_CHAR_TABLE_SIZE) {
3963 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len;
3964
3965 for (i = 0; i < len - 1; i++)
3966 skip[s[i]] = len - 1 - i;
3967 }
3968 else {
3969 if (IS_NULL(*int_skip)) {
3970 *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
3971 if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
3972 }
3973 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len;
3974
3975 for (i = 0; i < len - 1; i++)
3976 (*int_skip)[s[i]] = len - 1 - i;
3977 }
3978 return 0;
3979 }
3980
3981 #define OPT_EXACT_MAXLEN 24
3982
3983 typedef struct {
3984 OnigLen min; /* min byte length */
3985 OnigLen max; /* max byte length */
3986 } MinMaxLen;
3987
3988 typedef struct {
3989 MinMaxLen mmd;
3990 OnigEncoding enc;
3991 OnigOptionType options;
3992 OnigCaseFoldType case_fold_flag;
3993 ScanEnv* scan_env;
3994 } OptEnv;
3995
3996 typedef struct {
3997 int left_anchor;
3998 int right_anchor;
3999 } OptAncInfo;
4000
4001 typedef struct {
4002 MinMaxLen mmd; /* info position */
4003 OptAncInfo anc;
4004
4005 int reach_end;
4006 int ignore_case;
4007 int len;
4008 UChar s[OPT_EXACT_MAXLEN];
4009 } OptExactInfo;
4010
4011 typedef struct {
4012 MinMaxLen mmd; /* info position */
4013 OptAncInfo anc;
4014
4015 int value; /* weighted value */
4016 UChar map[ONIG_CHAR_TABLE_SIZE];
4017 } OptMapInfo;
4018
4019 typedef struct {
4020 MinMaxLen len;
4021
4022 OptAncInfo anc;
4023 OptExactInfo exb; /* boundary */
4024 OptExactInfo exm; /* middle */
4025 OptExactInfo expr; /* prec read (?=...) */
4026
4027 OptMapInfo map; /* boundary */
4028 } NodeOptInfo;
4029
4030
4031 static int
map_position_value(OnigEncoding enc,int i)4032 map_position_value(OnigEncoding enc, int i)
4033 {
4034 static const short int ByteValTable[] = {
4035 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
4036 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4037 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
4038 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
4039 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4040 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
4041 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4042 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
4043 };
4044
4045 if (i < (int )(sizeof(ByteValTable)/sizeof(ByteValTable[0]))) {
4046 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
4047 return 20;
4048 else
4049 return (int )ByteValTable[i];
4050 }
4051 else
4052 return 4; /* Take it easy. */
4053 }
4054
4055 static int
distance_value(MinMaxLen * mm)4056 distance_value(MinMaxLen* mm)
4057 {
4058 /* 1000 / (min-max-dist + 1) */
4059 static const short int dist_vals[] = {
4060 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
4061 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
4062 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
4063 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
4064 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
4065 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
4066 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
4067 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
4068 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
4069 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
4070 };
4071
4072 OnigLen d;
4073
4074 if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
4075
4076 d = mm->max - mm->min;
4077 if (d < (OnigLen )(sizeof(dist_vals)/sizeof(dist_vals[0])))
4078 /* return dist_vals[d] * 16 / (mm->min + 12); */
4079 return (int )dist_vals[d];
4080 else
4081 return 1;
4082 }
4083
4084 static int
comp_distance_value(MinMaxLen * d1,MinMaxLen * d2,int v1,int v2)4085 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
4086 {
4087 if (v2 <= 0) return -1;
4088 if (v1 <= 0) return 1;
4089
4090 v1 *= distance_value(d1);
4091 v2 *= distance_value(d2);
4092
4093 if (v2 > v1) return 1;
4094 if (v2 < v1) return -1;
4095
4096 if (d2->min < d1->min) return 1;
4097 if (d2->min > d1->min) return -1;
4098 return 0;
4099 }
4100
4101 static int
is_equal_mml(MinMaxLen * a,MinMaxLen * b)4102 is_equal_mml(MinMaxLen* a, MinMaxLen* b)
4103 {
4104 return (a->min == b->min && a->max == b->max) ? 1 : 0;
4105 }
4106
4107
4108 static void
set_mml(MinMaxLen * mml,OnigLen min,OnigLen max)4109 set_mml(MinMaxLen* mml, OnigLen min, OnigLen max)
4110 {
4111 mml->min = min;
4112 mml->max = max;
4113 }
4114
4115 static void
clear_mml(MinMaxLen * mml)4116 clear_mml(MinMaxLen* mml)
4117 {
4118 mml->min = mml->max = 0;
4119 }
4120
4121 static void
copy_mml(MinMaxLen * to,MinMaxLen * from)4122 copy_mml(MinMaxLen* to, MinMaxLen* from)
4123 {
4124 to->min = from->min;
4125 to->max = from->max;
4126 }
4127
4128 static void
add_mml(MinMaxLen * to,MinMaxLen * from)4129 add_mml(MinMaxLen* to, MinMaxLen* from)
4130 {
4131 to->min = distance_add(to->min, from->min);
4132 to->max = distance_add(to->max, from->max);
4133 }
4134
4135 #if 0
4136 static void
4137 add_len_mml(MinMaxLen* to, OnigLen len)
4138 {
4139 to->min = distance_add(to->min, len);
4140 to->max = distance_add(to->max, len);
4141 }
4142 #endif
4143
4144 static void
alt_merge_mml(MinMaxLen * to,MinMaxLen * from)4145 alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
4146 {
4147 if (to->min > from->min) to->min = from->min;
4148 if (to->max < from->max) to->max = from->max;
4149 }
4150
4151 static void
copy_opt_env(OptEnv * to,OptEnv * from)4152 copy_opt_env(OptEnv* to, OptEnv* from)
4153 {
4154 *to = *from;
4155 }
4156
4157 static void
clear_opt_anc_info(OptAncInfo * anc)4158 clear_opt_anc_info(OptAncInfo* anc)
4159 {
4160 anc->left_anchor = 0;
4161 anc->right_anchor = 0;
4162 }
4163
4164 static void
copy_opt_anc_info(OptAncInfo * to,OptAncInfo * from)4165 copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
4166 {
4167 *to = *from;
4168 }
4169
4170 static void
concat_opt_anc_info(OptAncInfo * to,OptAncInfo * left,OptAncInfo * right,OnigLen left_len,OnigLen right_len)4171 concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
4172 OnigLen left_len, OnigLen right_len)
4173 {
4174 clear_opt_anc_info(to);
4175
4176 to->left_anchor = left->left_anchor;
4177 if (left_len == 0) {
4178 to->left_anchor |= right->left_anchor;
4179 }
4180
4181 to->right_anchor = right->right_anchor;
4182 if (right_len == 0) {
4183 to->right_anchor |= left->right_anchor;
4184 }
4185 else {
4186 to->right_anchor |= (left->right_anchor & ANCHOR_PREC_READ_NOT);
4187 }
4188 }
4189
4190 static int
is_left_anchor(int anc)4191 is_left_anchor(int anc)
4192 {
4193 if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
4194 anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
4195 anc == ANCHOR_PREC_READ_NOT)
4196 return 0;
4197
4198 return 1;
4199 }
4200
4201 static int
is_set_opt_anc_info(OptAncInfo * to,int anc)4202 is_set_opt_anc_info(OptAncInfo* to, int anc)
4203 {
4204 if ((to->left_anchor & anc) != 0) return 1;
4205
4206 return ((to->right_anchor & anc) != 0 ? 1 : 0);
4207 }
4208
4209 static void
add_opt_anc_info(OptAncInfo * to,int anc)4210 add_opt_anc_info(OptAncInfo* to, int anc)
4211 {
4212 if (is_left_anchor(anc))
4213 to->left_anchor |= anc;
4214 else
4215 to->right_anchor |= anc;
4216 }
4217
4218 static void
remove_opt_anc_info(OptAncInfo * to,int anc)4219 remove_opt_anc_info(OptAncInfo* to, int anc)
4220 {
4221 if (is_left_anchor(anc))
4222 to->left_anchor &= ~anc;
4223 else
4224 to->right_anchor &= ~anc;
4225 }
4226
4227 static void
alt_merge_opt_anc_info(OptAncInfo * to,OptAncInfo * add)4228 alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
4229 {
4230 to->left_anchor &= add->left_anchor;
4231 to->right_anchor &= add->right_anchor;
4232 }
4233
4234 static int
is_full_opt_exact_info(OptExactInfo * ex)4235 is_full_opt_exact_info(OptExactInfo* ex)
4236 {
4237 return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
4238 }
4239
4240 static void
clear_opt_exact_info(OptExactInfo * ex)4241 clear_opt_exact_info(OptExactInfo* ex)
4242 {
4243 clear_mml(&ex->mmd);
4244 clear_opt_anc_info(&ex->anc);
4245 ex->reach_end = 0;
4246 ex->ignore_case = 0;
4247 ex->len = 0;
4248 ex->s[0] = '\0';
4249 }
4250
4251 static void
copy_opt_exact_info(OptExactInfo * to,OptExactInfo * from)4252 copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
4253 {
4254 *to = *from;
4255 }
4256
4257 static void
concat_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OnigEncoding enc)4258 concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
4259 {
4260 int i, j, len;
4261 UChar *p, *end;
4262 OptAncInfo tanc;
4263
4264 if (! to->ignore_case && add->ignore_case) {
4265 if (to->len >= add->len) return ; /* avoid */
4266
4267 to->ignore_case = 1;
4268 }
4269
4270 p = add->s;
4271 end = p + add->len;
4272 for (i = to->len; p < end; ) {
4273 len = enclen(enc, p);
4274 if (i + len > OPT_EXACT_MAXLEN) break;
4275 for (j = 0; j < len && p < end; j++)
4276 to->s[i++] = *p++;
4277 }
4278
4279 to->len = i;
4280 to->reach_end = (p == end ? add->reach_end : 0);
4281
4282 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4283 if (! to->reach_end) tanc.right_anchor = 0;
4284 copy_opt_anc_info(&to->anc, &tanc);
4285 }
4286
4287 static void
concat_opt_exact_info_str(OptExactInfo * to,UChar * s,UChar * end,int raw ARG_UNUSED,OnigEncoding enc)4288 concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
4289 int raw ARG_UNUSED, OnigEncoding enc)
4290 {
4291 int i, j, len;
4292 UChar *p;
4293
4294 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4295 len = enclen(enc, p);
4296 if (i + len > OPT_EXACT_MAXLEN) break;
4297 for (j = 0; j < len && p < end; j++)
4298 to->s[i++] = *p++;
4299 }
4300
4301 to->len = i;
4302 }
4303
4304 static void
alt_merge_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OptEnv * env)4305 alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4306 {
4307 int i, j, len;
4308
4309 if (add->len == 0 || to->len == 0) {
4310 clear_opt_exact_info(to);
4311 return ;
4312 }
4313
4314 if (! is_equal_mml(&to->mmd, &add->mmd)) {
4315 clear_opt_exact_info(to);
4316 return ;
4317 }
4318
4319 for (i = 0; i < to->len && i < add->len; ) {
4320 if (to->s[i] != add->s[i]) break;
4321 len = enclen(env->enc, to->s + i);
4322
4323 for (j = 1; j < len; j++) {
4324 if (to->s[i+j] != add->s[i+j]) break;
4325 }
4326 if (j < len) break;
4327 i += len;
4328 }
4329
4330 if (! add->reach_end || i < add->len || i < to->len) {
4331 to->reach_end = 0;
4332 }
4333 to->len = i;
4334 to->ignore_case |= add->ignore_case;
4335
4336 alt_merge_opt_anc_info(&to->anc, &add->anc);
4337 if (! to->reach_end) to->anc.right_anchor = 0;
4338 }
4339
4340 static void
select_opt_exact_info(OnigEncoding enc,OptExactInfo * now,OptExactInfo * alt)4341 select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4342 {
4343 int v1, v2;
4344
4345 v1 = now->len;
4346 v2 = alt->len;
4347
4348 if (v2 == 0) {
4349 return ;
4350 }
4351 else if (v1 == 0) {
4352 copy_opt_exact_info(now, alt);
4353 return ;
4354 }
4355 else if (v1 <= 2 && v2 <= 2) {
4356 /* ByteValTable[x] is big value --> low price */
4357 v2 = map_position_value(enc, now->s[0]);
4358 v1 = map_position_value(enc, alt->s[0]);
4359
4360 if (now->len > 1) v1 += 5;
4361 if (alt->len > 1) v2 += 5;
4362 }
4363
4364 if (now->ignore_case == 0) v1 *= 2;
4365 if (alt->ignore_case == 0) v2 *= 2;
4366
4367 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4368 copy_opt_exact_info(now, alt);
4369 }
4370
4371 static void
clear_opt_map_info(OptMapInfo * map)4372 clear_opt_map_info(OptMapInfo* map)
4373 {
4374 static const OptMapInfo clean_info = {
4375 {0, 0}, {0, 0}, 0,
4376 {
4377 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4378 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4379 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4380 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4381 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4382 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4383 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4384 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4385 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4386 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4387 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4388 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4389 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4390 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4391 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4392 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4393 }
4394 };
4395
4396 xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4397 }
4398
4399 static void
copy_opt_map_info(OptMapInfo * to,OptMapInfo * from)4400 copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4401 {
4402 *to = *from;
4403 }
4404
4405 static void
add_char_opt_map_info(OptMapInfo * map,UChar c,OnigEncoding enc)4406 add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4407 {
4408 if (map->map[c] == 0) {
4409 map->map[c] = 1;
4410 map->value += map_position_value(enc, c);
4411 }
4412 }
4413
4414 static int
add_char_amb_opt_map_info(OptMapInfo * map,UChar * p,UChar * end,OnigEncoding enc,OnigCaseFoldType case_fold_flag)4415 add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4416 OnigEncoding enc, OnigCaseFoldType case_fold_flag)
4417 {
4418 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4419 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4420 int i, n;
4421
4422 add_char_opt_map_info(map, p[0], enc);
4423
4424 case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
4425 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
4426 if (n < 0) return n;
4427
4428 for (i = 0; i < n; i++) {
4429 ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
4430 add_char_opt_map_info(map, buf[0], enc);
4431 }
4432
4433 return 0;
4434 }
4435
4436 static void
select_opt_map_info(OptMapInfo * now,OptMapInfo * alt)4437 select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4438 {
4439 static int z = 1<<15; /* 32768: something big value */
4440
4441 int v1, v2;
4442
4443 if (alt->value == 0) return ;
4444 if (now->value == 0) {
4445 copy_opt_map_info(now, alt);
4446 return ;
4447 }
4448
4449 v1 = z / now->value;
4450 v2 = z / alt->value;
4451 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4452 copy_opt_map_info(now, alt);
4453 }
4454
4455 static int
comp_opt_exact_or_map_info(OptExactInfo * e,OptMapInfo * m)4456 comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4457 {
4458 #define COMP_EM_BASE 20
4459 int ve, vm;
4460
4461 if (m->value <= 0) return -1;
4462
4463 ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2);
4464 vm = COMP_EM_BASE * 5 * 2 / m->value;
4465 return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4466 }
4467
4468 static void
alt_merge_opt_map_info(OnigEncoding enc,OptMapInfo * to,OptMapInfo * add)4469 alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4470 {
4471 int i, val;
4472
4473 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4474 if (to->value == 0) return ;
4475 if (add->value == 0 || to->mmd.max < add->mmd.min) {
4476 clear_opt_map_info(to);
4477 return ;
4478 }
4479
4480 alt_merge_mml(&to->mmd, &add->mmd);
4481
4482 val = 0;
4483 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4484 if (add->map[i])
4485 to->map[i] = 1;
4486
4487 if (to->map[i])
4488 val += map_position_value(enc, i);
4489 }
4490 to->value = val;
4491
4492 alt_merge_opt_anc_info(&to->anc, &add->anc);
4493 }
4494
4495 static void
set_bound_node_opt_info(NodeOptInfo * opt,MinMaxLen * mmd)4496 set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4497 {
4498 copy_mml(&(opt->exb.mmd), mmd);
4499 copy_mml(&(opt->expr.mmd), mmd);
4500 copy_mml(&(opt->map.mmd), mmd);
4501 }
4502
4503 static void
clear_node_opt_info(NodeOptInfo * opt)4504 clear_node_opt_info(NodeOptInfo* opt)
4505 {
4506 clear_mml(&opt->len);
4507 clear_opt_anc_info(&opt->anc);
4508 clear_opt_exact_info(&opt->exb);
4509 clear_opt_exact_info(&opt->exm);
4510 clear_opt_exact_info(&opt->expr);
4511 clear_opt_map_info(&opt->map);
4512 }
4513
4514 static void
copy_node_opt_info(NodeOptInfo * to,NodeOptInfo * from)4515 copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4516 {
4517 *to = *from;
4518 }
4519
4520 static void
concat_left_node_opt_info(OnigEncoding enc,NodeOptInfo * to,NodeOptInfo * add)4521 concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4522 {
4523 int exb_reach, exm_reach;
4524 OptAncInfo tanc;
4525
4526 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4527 copy_opt_anc_info(&to->anc, &tanc);
4528
4529 if (add->exb.len > 0 && to->len.max == 0) {
4530 concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4531 to->len.max, add->len.max);
4532 copy_opt_anc_info(&add->exb.anc, &tanc);
4533 }
4534
4535 if (add->map.value > 0 && to->len.max == 0) {
4536 if (add->map.mmd.max == 0)
4537 add->map.anc.left_anchor |= to->anc.left_anchor;
4538 }
4539
4540 exb_reach = to->exb.reach_end;
4541 exm_reach = to->exm.reach_end;
4542
4543 if (add->len.max != 0)
4544 to->exb.reach_end = to->exm.reach_end = 0;
4545
4546 if (add->exb.len > 0) {
4547 if (exb_reach) {
4548 concat_opt_exact_info(&to->exb, &add->exb, enc);
4549 clear_opt_exact_info(&add->exb);
4550 }
4551 else if (exm_reach) {
4552 concat_opt_exact_info(&to->exm, &add->exb, enc);
4553 clear_opt_exact_info(&add->exb);
4554 }
4555 }
4556 select_opt_exact_info(enc, &to->exm, &add->exb);
4557 select_opt_exact_info(enc, &to->exm, &add->exm);
4558
4559 if (to->expr.len > 0) {
4560 if (add->len.max > 0) {
4561 if (to->expr.len > (int )add->len.max)
4562 to->expr.len = add->len.max;
4563
4564 if (to->expr.mmd.max == 0)
4565 select_opt_exact_info(enc, &to->exb, &to->expr);
4566 else
4567 select_opt_exact_info(enc, &to->exm, &to->expr);
4568 }
4569 }
4570 else if (add->expr.len > 0) {
4571 copy_opt_exact_info(&to->expr, &add->expr);
4572 }
4573
4574 select_opt_map_info(&to->map, &add->map);
4575
4576 add_mml(&to->len, &add->len);
4577 }
4578
4579 static void
alt_merge_node_opt_info(NodeOptInfo * to,NodeOptInfo * add,OptEnv * env)4580 alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4581 {
4582 alt_merge_opt_anc_info (&to->anc, &add->anc);
4583 alt_merge_opt_exact_info(&to->exb, &add->exb, env);
4584 alt_merge_opt_exact_info(&to->exm, &add->exm, env);
4585 alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4586 alt_merge_opt_map_info(env->enc, &to->map, &add->map);
4587
4588 alt_merge_mml(&to->len, &add->len);
4589 }
4590
4591
4592 #define MAX_NODE_OPT_INFO_REF_COUNT 5
4593
4594 static int
optimize_node_left(Node * node,NodeOptInfo * opt,OptEnv * env)4595 optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4596 {
4597 int type;
4598 int r = 0;
4599
4600 clear_node_opt_info(opt);
4601 set_bound_node_opt_info(opt, &env->mmd);
4602
4603 type = NTYPE(node);
4604 switch (type) {
4605 case NT_LIST:
4606 {
4607 OptEnv nenv;
4608 NodeOptInfo nopt;
4609 Node* nd = node;
4610
4611 copy_opt_env(&nenv, env);
4612 do {
4613 r = optimize_node_left(NCAR(nd), &nopt, &nenv);
4614 if (r == 0) {
4615 add_mml(&nenv.mmd, &nopt.len);
4616 concat_left_node_opt_info(env->enc, opt, &nopt);
4617 }
4618 } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
4619 }
4620 break;
4621
4622 case NT_ALT:
4623 {
4624 NodeOptInfo nopt;
4625 Node* nd = node;
4626
4627 do {
4628 r = optimize_node_left(NCAR(nd), &nopt, env);
4629 if (r == 0) {
4630 if (nd == node) copy_node_opt_info(opt, &nopt);
4631 else alt_merge_node_opt_info(opt, &nopt, env);
4632 }
4633 } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
4634 }
4635 break;
4636
4637 case NT_STR:
4638 {
4639 StrNode* sn = NSTR(node);
4640 int slen = sn->end - sn->s;
4641 int is_raw = NSTRING_IS_RAW(node);
4642
4643 if (! NSTRING_IS_AMBIG(node)) {
4644 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4645 NSTRING_IS_RAW(node), env->enc);
4646 if (slen > 0) {
4647 add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
4648 }
4649 set_mml(&opt->len, slen, slen);
4650 }
4651 else {
4652 int max;
4653
4654 if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
4655 int n = onigenc_strlen(env->enc, sn->s, sn->end);
4656 max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
4657 }
4658 else {
4659 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4660 is_raw, env->enc);
4661 opt->exb.ignore_case = 1;
4662
4663 if (slen > 0) {
4664 r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
4665 env->enc, env->case_fold_flag);
4666 if (r != 0) break;
4667 }
4668
4669 max = slen;
4670 }
4671
4672 set_mml(&opt->len, slen, max);
4673 }
4674
4675 if (opt->exb.len == slen)
4676 opt->exb.reach_end = 1;
4677 }
4678 break;
4679
4680 case NT_CCLASS:
4681 {
4682 int i, z;
4683 CClassNode* cc = NCCLASS(node);
4684
4685 /* no need to check ignore case. (set in setup_tree()) */
4686
4687 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
4688 OnigLen min = ONIGENC_MBC_MINLEN(env->enc);
4689 OnigLen max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4690
4691 set_mml(&opt->len, min, max);
4692 }
4693 else {
4694 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4695 z = BITSET_AT(cc->bs, i);
4696 if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
4697 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4698 }
4699 }
4700 set_mml(&opt->len, 1, 1);
4701 }
4702 }
4703 break;
4704
4705 case NT_CTYPE:
4706 {
4707 int i, min, max;
4708
4709 max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4710
4711 if (max == 1) {
4712 min = 1;
4713
4714 switch (NCTYPE(node)->ctype) {
4715 case ONIGENC_CTYPE_WORD:
4716 if (NCTYPE(node)->not != 0) {
4717 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4718 if (! ONIGENC_IS_CODE_WORD(env->enc, i)) {
4719 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4720 }
4721 }
4722 }
4723 else {
4724 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4725 if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
4726 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4727 }
4728 }
4729 }
4730 break;
4731 }
4732 }
4733 else {
4734 min = ONIGENC_MBC_MINLEN(env->enc);
4735 }
4736 set_mml(&opt->len, min, max);
4737 }
4738 break;
4739
4740 case NT_CANY:
4741 {
4742 OnigLen min = ONIGENC_MBC_MINLEN(env->enc);
4743 OnigLen max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4744 set_mml(&opt->len, min, max);
4745 }
4746 break;
4747
4748 case NT_ANCHOR:
4749 switch (NANCHOR(node)->type) {
4750 case ANCHOR_BEGIN_BUF:
4751 case ANCHOR_BEGIN_POSITION:
4752 case ANCHOR_BEGIN_LINE:
4753 case ANCHOR_END_BUF:
4754 case ANCHOR_SEMI_END_BUF:
4755 case ANCHOR_END_LINE:
4756 case ANCHOR_PREC_READ_NOT:
4757 case ANCHOR_LOOK_BEHIND:
4758 add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
4759 break;
4760
4761 case ANCHOR_PREC_READ:
4762 {
4763 NodeOptInfo nopt;
4764
4765 r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
4766 if (r == 0) {
4767 if (nopt.exb.len > 0)
4768 copy_opt_exact_info(&opt->expr, &nopt.exb);
4769 else if (nopt.exm.len > 0)
4770 copy_opt_exact_info(&opt->expr, &nopt.exm);
4771
4772 opt->expr.reach_end = 0;
4773
4774 if (nopt.map.value > 0)
4775 copy_opt_map_info(&opt->map, &nopt.map);
4776 }
4777 }
4778 break;
4779
4780 case ANCHOR_LOOK_BEHIND_NOT:
4781 break;
4782 }
4783 break;
4784
4785 case NT_BREF:
4786 {
4787 int i;
4788 int* backs;
4789 OnigLen min, max, tmin, tmax;
4790 Node** nodes = SCANENV_MEM_NODES(env->scan_env);
4791 BRefNode* br = NBREF(node);
4792
4793 if (br->state & NST_RECURSION) {
4794 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4795 break;
4796 }
4797 backs = BACKREFS_P(br);
4798 r = get_min_len(nodes[backs[0]], &min, env->scan_env);
4799 if (r != 0) break;
4800 r = get_max_len(nodes[backs[0]], &max, env->scan_env);
4801 if (r != 0) break;
4802 for (i = 1; i < br->back_num; i++) {
4803 r = get_min_len(nodes[backs[i]], &tmin, env->scan_env);
4804 if (r != 0) break;
4805 r = get_max_len(nodes[backs[i]], &tmax, env->scan_env);
4806 if (r != 0) break;
4807 if (min > tmin) min = tmin;
4808 if (max < tmax) max = tmax;
4809 }
4810 if (r == 0) set_mml(&opt->len, min, max);
4811 }
4812 break;
4813
4814 #ifdef USE_SUBEXP_CALL
4815 case NT_CALL:
4816 if (IS_CALL_RECURSION(NCALL(node)))
4817 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4818 else {
4819 OnigOptionType save = env->options;
4820 env->options = NENCLOSE(NCALL(node)->target)->option;
4821 r = optimize_node_left(NCALL(node)->target, opt, env);
4822 env->options = save;
4823 }
4824 break;
4825 #endif
4826
4827 case NT_QTFR:
4828 {
4829 int i;
4830 OnigLen min, max;
4831 NodeOptInfo nopt;
4832 QtfrNode* qn = NQTFR(node);
4833
4834 r = optimize_node_left(qn->target, &nopt, env);
4835 if (r) break;
4836
4837 if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
4838 if (env->mmd.max == 0 &&
4839 NTYPE(qn->target) == NT_CANY && qn->greedy) {
4840 if (IS_MULTILINE(env->options))
4841 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
4842 else
4843 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
4844 }
4845 }
4846 else {
4847 if (qn->lower > 0) {
4848 copy_node_opt_info(opt, &nopt);
4849 if (nopt.exb.len > 0) {
4850 if (nopt.exb.reach_end) {
4851 for (i = 2; i <= qn->lower &&
4852 ! is_full_opt_exact_info(&opt->exb); i++) {
4853 concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
4854 }
4855 if (i < qn->lower) {
4856 opt->exb.reach_end = 0;
4857 }
4858 }
4859 }
4860
4861 if (qn->lower != qn->upper) {
4862 opt->exb.reach_end = 0;
4863 opt->exm.reach_end = 0;
4864 }
4865 if (qn->lower > 1)
4866 opt->exm.reach_end = 0;
4867 }
4868 }
4869
4870 min = distance_multiply(nopt.len.min, qn->lower);
4871 if (IS_REPEAT_INFINITE(qn->upper))
4872 max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
4873 else
4874 max = distance_multiply(nopt.len.max, qn->upper);
4875
4876 set_mml(&opt->len, min, max);
4877 }
4878 break;
4879
4880 case NT_ENCLOSE:
4881 {
4882 EncloseNode* en = NENCLOSE(node);
4883
4884 switch (en->type) {
4885 case ENCLOSE_OPTION:
4886 {
4887 OnigOptionType save = env->options;
4888
4889 env->options = en->option;
4890 r = optimize_node_left(en->target, opt, env);
4891 env->options = save;
4892 }
4893 break;
4894
4895 case ENCLOSE_MEMORY:
4896 #ifdef USE_SUBEXP_CALL
4897 en->opt_count++;
4898 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
4899 OnigLen min, max;
4900
4901 min = 0;
4902 max = ONIG_INFINITE_DISTANCE;
4903 if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
4904 if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
4905 set_mml(&opt->len, min, max);
4906 }
4907 else
4908 #endif
4909 {
4910 r = optimize_node_left(en->target, opt, env);
4911
4912 if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
4913 if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
4914 remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
4915 }
4916 }
4917 break;
4918
4919 case ENCLOSE_STOP_BACKTRACK:
4920 r = optimize_node_left(en->target, opt, env);
4921 break;
4922 }
4923 }
4924 break;
4925
4926 default:
4927 #ifdef ONIG_DEBUG
4928 fprintf(stderr, "optimize_node_left: undefined node type %d\n",
4929 NTYPE(node));
4930 #endif
4931 r = ONIGERR_TYPE_BUG;
4932 break;
4933 }
4934
4935 return r;
4936 }
4937
4938 static int
set_optimize_exact_info(regex_t * reg,OptExactInfo * e)4939 set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
4940 {
4941 int r;
4942
4943 if (e->len == 0) return 0;
4944
4945 if (e->ignore_case) {
4946 reg->exact = (UChar* )xmalloc(e->len);
4947 CHECK_NULL_RETURN_MEMERR(reg->exact);
4948 xmemcpy(reg->exact, e->s, e->len);
4949 reg->exact_end = reg->exact + e->len;
4950 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
4951 }
4952 else {
4953 int allow_reverse;
4954
4955 reg->exact = str_dup(e->s, e->s + e->len);
4956 CHECK_NULL_RETURN_MEMERR(reg->exact);
4957 reg->exact_end = reg->exact + e->len;
4958
4959 allow_reverse =
4960 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
4961
4962 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
4963 r = set_bm_skip(reg->exact, reg->exact_end, reg->enc,
4964 reg->map, &(reg->int_map));
4965 if (r) return r;
4966
4967 reg->optimize = (allow_reverse != 0
4968 ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
4969 }
4970 else {
4971 reg->optimize = ONIG_OPTIMIZE_EXACT;
4972 }
4973 }
4974
4975 reg->dmin = e->mmd.min;
4976 reg->dmax = e->mmd.max;
4977
4978 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4979 reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact);
4980 }
4981
4982 return 0;
4983 }
4984
4985 static void
set_optimize_map_info(regex_t * reg,OptMapInfo * m)4986 set_optimize_map_info(regex_t* reg, OptMapInfo* m)
4987 {
4988 int i;
4989
4990 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4991 reg->map[i] = m->map[i];
4992
4993 reg->optimize = ONIG_OPTIMIZE_MAP;
4994 reg->dmin = m->mmd.min;
4995 reg->dmax = m->mmd.max;
4996
4997 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4998 reg->threshold_len = reg->dmin + 1;
4999 }
5000 }
5001
5002 static void
set_sub_anchor(regex_t * reg,OptAncInfo * anc)5003 set_sub_anchor(regex_t* reg, OptAncInfo* anc)
5004 {
5005 reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
5006 reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
5007 }
5008
5009 #ifdef ONIG_DEBUG
5010 static void print_optimize_info(FILE* f, regex_t* reg);
5011 #endif
5012
5013 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ScanEnv * scan_env)5014 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
5015 {
5016
5017 int r;
5018 NodeOptInfo opt;
5019 OptEnv env;
5020
5021 env.enc = reg->enc;
5022 env.options = reg->options;
5023 env.case_fold_flag = reg->case_fold_flag;
5024 env.scan_env = scan_env;
5025 clear_mml(&env.mmd);
5026
5027 r = optimize_node_left(node, &opt, &env);
5028 if (r) return r;
5029
5030 reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
5031 ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML |
5032 ANCHOR_LOOK_BEHIND);
5033
5034 if ((opt.anc.left_anchor & (ANCHOR_LOOK_BEHIND | ANCHOR_PREC_READ_NOT)) != 0)
5035 reg->anchor &= ~ANCHOR_ANYCHAR_STAR_ML;
5036
5037 reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF |
5038 ANCHOR_PREC_READ_NOT);
5039
5040 if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
5041 reg->anchor_dmin = opt.len.min;
5042 reg->anchor_dmax = opt.len.max;
5043 }
5044
5045 if (opt.exb.len > 0 || opt.exm.len > 0) {
5046 select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
5047 if (opt.map.value > 0 &&
5048 comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
5049 goto set_map;
5050 }
5051 else {
5052 r = set_optimize_exact_info(reg, &opt.exb);
5053 set_sub_anchor(reg, &opt.exb.anc);
5054 }
5055 }
5056 else if (opt.map.value > 0) {
5057 set_map:
5058 set_optimize_map_info(reg, &opt.map);
5059 set_sub_anchor(reg, &opt.map.anc);
5060 }
5061 else {
5062 reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
5063 if (opt.len.max == 0)
5064 reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
5065 }
5066
5067 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5068 print_optimize_info(stderr, reg);
5069 #endif
5070 return r;
5071 }
5072
5073 static void
clear_optimize_info(regex_t * reg)5074 clear_optimize_info(regex_t* reg)
5075 {
5076 reg->optimize = ONIG_OPTIMIZE_NONE;
5077 reg->anchor = 0;
5078 reg->anchor_dmin = 0;
5079 reg->anchor_dmax = 0;
5080 reg->sub_anchor = 0;
5081 reg->exact_end = (UChar* )NULL;
5082 reg->threshold_len = 0;
5083 if (IS_NOT_NULL(reg->exact)) {
5084 xfree(reg->exact);
5085 reg->exact = (UChar* )NULL;
5086 }
5087 }
5088
5089 #ifdef ONIG_DEBUG
5090
print_enc_string(FILE * fp,OnigEncoding enc,const UChar * s,const UChar * end)5091 static void print_enc_string(FILE* fp, OnigEncoding enc,
5092 const UChar *s, const UChar *end)
5093 {
5094 fprintf(fp, "\nPATTERN: /");
5095
5096 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5097 const UChar *p;
5098 OnigCodePoint code;
5099
5100 p = s;
5101 while (p < end) {
5102 code = ONIGENC_MBC_TO_CODE(enc, p, end);
5103 if (code >= 0x80) {
5104 fprintf(fp, " 0x%04x ", (int )code);
5105 }
5106 else {
5107 fputc((int )code, fp);
5108 }
5109
5110 p += enclen(enc, p);
5111 }
5112 }
5113 else {
5114 while (s < end) {
5115 fputc((int )*s, fp);
5116 s++;
5117 }
5118 }
5119
5120 fprintf(fp, "/\n");
5121 }
5122
5123 static void
print_distance_range(FILE * f,OnigLen a,OnigLen b)5124 print_distance_range(FILE* f, OnigLen a, OnigLen b)
5125 {
5126 if (a == ONIG_INFINITE_DISTANCE)
5127 fputs("inf", f);
5128 else
5129 fprintf(f, "(%u)", a);
5130
5131 fputs("-", f);
5132
5133 if (b == ONIG_INFINITE_DISTANCE)
5134 fputs("inf", f);
5135 else
5136 fprintf(f, "(%u)", b);
5137 }
5138
5139 static void
print_anchor(FILE * f,int anchor)5140 print_anchor(FILE* f, int anchor)
5141 {
5142 int q = 0;
5143
5144 fprintf(f, "[");
5145
5146 if (anchor & ANCHOR_BEGIN_BUF) {
5147 fprintf(f, "begin-buf");
5148 q = 1;
5149 }
5150 if (anchor & ANCHOR_BEGIN_LINE) {
5151 if (q) fprintf(f, ", ");
5152 q = 1;
5153 fprintf(f, "begin-line");
5154 }
5155 if (anchor & ANCHOR_BEGIN_POSITION) {
5156 if (q) fprintf(f, ", ");
5157 q = 1;
5158 fprintf(f, "begin-pos");
5159 }
5160 if (anchor & ANCHOR_END_BUF) {
5161 if (q) fprintf(f, ", ");
5162 q = 1;
5163 fprintf(f, "end-buf");
5164 }
5165 if (anchor & ANCHOR_SEMI_END_BUF) {
5166 if (q) fprintf(f, ", ");
5167 q = 1;
5168 fprintf(f, "semi-end-buf");
5169 }
5170 if (anchor & ANCHOR_END_LINE) {
5171 if (q) fprintf(f, ", ");
5172 q = 1;
5173 fprintf(f, "end-line");
5174 }
5175 if (anchor & ANCHOR_ANYCHAR_STAR) {
5176 if (q) fprintf(f, ", ");
5177 q = 1;
5178 fprintf(f, "anychar-star");
5179 }
5180 if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
5181 if (q) fprintf(f, ", ");
5182 fprintf(f, "anychar-star-ml");
5183 }
5184
5185 fprintf(f, "]");
5186 }
5187
5188 static void
print_optimize_info(FILE * f,regex_t * reg)5189 print_optimize_info(FILE* f, regex_t* reg)
5190 {
5191 static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5192 "EXACT_IC", "MAP" };
5193
5194 fprintf(f, "optimize: %s\n", on[reg->optimize]);
5195 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
5196 if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
5197 print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
5198 fprintf(f, "\n");
5199
5200 if (reg->optimize) {
5201 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
5202 fprintf(f, "\n");
5203 }
5204 fprintf(f, "\n");
5205
5206 if (reg->exact) {
5207 UChar *p;
5208 fprintf(f, "exact: [");
5209 for (p = reg->exact; p < reg->exact_end; p++) {
5210 fputc(*p, f);
5211 }
5212 fprintf(f, "]: length: %ld\n", (reg->exact_end - reg->exact));
5213 }
5214 else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
5215 int c, i, n = 0;
5216
5217 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5218 if (reg->map[i]) n++;
5219
5220 fprintf(f, "map: n=%d\n", n);
5221 if (n > 0) {
5222 c = 0;
5223 fputc('[', f);
5224 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
5225 if (reg->map[i] != 0) {
5226 if (c > 0) fputs(", ", f);
5227 c++;
5228 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
5229 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
5230 fputc(i, f);
5231 else
5232 fprintf(f, "%d", i);
5233 }
5234 }
5235 fprintf(f, "]\n");
5236 }
5237 }
5238 }
5239 #endif /* ONIG_DEBUG */
5240
5241
5242 extern void
onig_free_body(regex_t * reg)5243 onig_free_body(regex_t* reg)
5244 {
5245 if (IS_NOT_NULL(reg)) {
5246 if (IS_NOT_NULL(reg->p)) xfree(reg->p);
5247 if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
5248 if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map);
5249 if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
5250 if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
5251 if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain);
5252
5253 #ifdef USE_NAMED_GROUP
5254 onig_names_free(reg);
5255 #endif
5256 }
5257 }
5258
5259 extern void
onig_free(regex_t * reg)5260 onig_free(regex_t* reg)
5261 {
5262 if (IS_NOT_NULL(reg)) {
5263 onig_free_body(reg);
5264 xfree(reg);
5265 }
5266 }
5267
5268 #define REGEX_TRANSFER(to,from) do {\
5269 onig_free_body(to);\
5270 xmemcpy(to, from, sizeof(regex_t));\
5271 xfree(from);\
5272 } while (0)
5273
5274 extern void
onig_transfer(regex_t * to,regex_t * from)5275 onig_transfer(regex_t* to, regex_t* from)
5276 {
5277 REGEX_TRANSFER(to, from);
5278 }
5279
5280
5281 #ifdef ONIG_DEBUG
5282 static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg));
5283 #endif
5284 #ifdef ONIG_DEBUG_PARSE_TREE
5285 static void print_tree P_((FILE* f, Node* node));
5286 #endif
5287
5288 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)5289 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5290 OnigErrorInfo* einfo)
5291 {
5292 #define COMPILE_INIT_SIZE 20
5293
5294 int r, init_size;
5295 Node* root;
5296 ScanEnv scan_env;
5297 #ifdef USE_SUBEXP_CALL
5298 UnsetAddrList uslist;
5299 #endif
5300
5301 root = 0;
5302 if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5303
5304 #ifdef ONIG_DEBUG
5305 print_enc_string(stderr, reg->enc, pattern, pattern_end);
5306 #endif
5307
5308 if (reg->alloc == 0) {
5309 init_size = (pattern_end - pattern) * 2;
5310 if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5311 r = BBUF_INIT(reg, init_size);
5312 if (r != 0) goto end;
5313 }
5314 else
5315 reg->used = 0;
5316
5317 reg->num_mem = 0;
5318 reg->num_repeat = 0;
5319 reg->num_null_check = 0;
5320 reg->repeat_range_alloc = 0;
5321 reg->repeat_range = (OnigRepeatRange* )NULL;
5322 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5323 reg->num_comb_exp_check = 0;
5324 #endif
5325
5326 r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5327 if (r != 0) goto err;
5328
5329 #ifdef USE_NAMED_GROUP
5330 /* mixed use named group and no-named group */
5331 if (scan_env.num_named > 0 &&
5332 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5333 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5334 if (scan_env.num_named != scan_env.num_mem)
5335 r = disable_noname_group_capture(&root, reg, &scan_env);
5336 else
5337 r = numbered_ref_check(root);
5338
5339 if (r != 0) goto err;
5340 }
5341 #endif
5342
5343 #ifdef USE_SUBEXP_CALL
5344 if (scan_env.num_call > 0) {
5345 r = unset_addr_list_init(&uslist, scan_env.num_call);
5346 if (r != 0) goto err;
5347 scan_env.unset_addr_list = &uslist;
5348 r = setup_subexp_call(root, &scan_env);
5349 if (r != 0) goto err_unset;
5350 r = subexp_recursive_check_trav(root, &scan_env);
5351 if (r < 0) goto err_unset;
5352 r = subexp_inf_recursive_check_trav(root, &scan_env);
5353 if (r != 0) goto err_unset;
5354
5355 reg->num_call = scan_env.num_call;
5356 }
5357 else
5358 reg->num_call = 0;
5359 #endif
5360
5361 r = setup_tree(root, reg, 0, &scan_env);
5362 if (r != 0) goto err_unset;
5363
5364 #ifdef ONIG_DEBUG_PARSE_TREE
5365 print_tree(stderr, root);
5366 #endif
5367
5368 reg->capture_history = scan_env.capture_history;
5369 reg->bt_mem_start = scan_env.bt_mem_start;
5370 reg->bt_mem_start |= reg->capture_history;
5371 if (IS_FIND_CONDITION(reg->options))
5372 BIT_STATUS_ON_ALL(reg->bt_mem_end);
5373 else {
5374 reg->bt_mem_end = scan_env.bt_mem_end;
5375 reg->bt_mem_end |= reg->capture_history;
5376 }
5377
5378 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5379 if (scan_env.backrefed_mem == 0
5380 #ifdef USE_SUBEXP_CALL
5381 || scan_env.num_call == 0
5382 #endif
5383 ) {
5384 setup_comb_exp_check(root, 0, &scan_env);
5385 #ifdef USE_SUBEXP_CALL
5386 if (scan_env.has_recursion != 0) {
5387 scan_env.num_comb_exp_check = 0;
5388 }
5389 else
5390 #endif
5391 if (scan_env.comb_exp_max_regnum > 0) {
5392 int i;
5393 for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5394 if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5395 scan_env.num_comb_exp_check = 0;
5396 break;
5397 }
5398 }
5399 }
5400 }
5401
5402 reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5403 #endif
5404
5405 clear_optimize_info(reg);
5406 #ifndef ONIG_DONT_OPTIMIZE
5407 r = set_optimize_info_from_tree(root, reg, &scan_env);
5408 if (r != 0) goto err_unset;
5409 #endif
5410
5411 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5412 xfree(scan_env.mem_nodes_dynamic);
5413 scan_env.mem_nodes_dynamic = (Node** )NULL;
5414 }
5415
5416 r = compile_tree(root, reg);
5417 if (r == 0) {
5418 r = add_opcode(reg, OP_END);
5419 #ifdef USE_SUBEXP_CALL
5420 if (scan_env.num_call > 0) {
5421 r = unset_addr_list_fix(&uslist, reg);
5422 unset_addr_list_end(&uslist);
5423 if (r) goto err;
5424 }
5425 #endif
5426
5427 if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5428 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5429 else {
5430 if (reg->bt_mem_start != 0)
5431 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5432 else
5433 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5434 }
5435 }
5436 #ifdef USE_SUBEXP_CALL
5437 else if (scan_env.num_call > 0) {
5438 unset_addr_list_end(&uslist);
5439 }
5440 #endif
5441 onig_node_free(root);
5442
5443 #ifdef ONIG_DEBUG_COMPILE
5444 #ifdef USE_NAMED_GROUP
5445 onig_print_names(stderr, reg);
5446 #endif
5447 print_compiled_byte_code_list(stderr, reg);
5448 #endif
5449
5450 end:
5451 return r;
5452
5453 err_unset:
5454 #ifdef USE_SUBEXP_CALL
5455 if (scan_env.num_call > 0) {
5456 unset_addr_list_end(&uslist);
5457 }
5458 #endif
5459 err:
5460 if (IS_NOT_NULL(scan_env.error)) {
5461 if (IS_NOT_NULL(einfo)) {
5462 einfo->enc = scan_env.enc;
5463 einfo->par = scan_env.error;
5464 einfo->par_end = scan_env.error_end;
5465 }
5466 }
5467
5468 onig_node_free(root);
5469 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
5470 xfree(scan_env.mem_nodes_dynamic);
5471 return r;
5472 }
5473
5474
5475 static int onig_inited = 0;
5476
5477 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)5478 onig_reg_init(regex_t* reg, OnigOptionType option,
5479 OnigCaseFoldType case_fold_flag,
5480 OnigEncoding enc, OnigSyntaxType* syntax)
5481 {
5482 int r;
5483
5484 xmemset(reg, 0, sizeof(*reg));
5485
5486 if (onig_inited == 0) {
5487 #if 0
5488 return ONIGERR_LIBRARY_IS_NOT_INITIALIZED;
5489 #else
5490 r = onig_initialize(NULL, 0);
5491 if (r != 0)
5492 return ONIGERR_FAIL_TO_INITIALIZE;
5493
5494 r = onig_initialize_encoding(enc);
5495 if (r != 0)
5496 return ONIGERR_FAIL_TO_INITIALIZE;
5497
5498 onig_warning("You didn't call onig_initialize() explicitly");
5499 #endif
5500 }
5501
5502 if (IS_NULL(reg))
5503 return ONIGERR_INVALID_ARGUMENT;
5504
5505 if (ONIGENC_IS_UNDEF(enc))
5506 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
5507
5508 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5509 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5510 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5511 }
5512
5513 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5514 option |= syntax->options;
5515 option &= ~ONIG_OPTION_SINGLELINE;
5516 }
5517 else
5518 option |= syntax->options;
5519
5520 (reg)->enc = enc;
5521 (reg)->options = option;
5522 (reg)->syntax = syntax;
5523 (reg)->optimize = 0;
5524 (reg)->exact = (UChar* )NULL;
5525 (reg)->int_map = (int* )NULL;
5526 (reg)->int_map_backward = (int* )NULL;
5527 (reg)->chain = (regex_t* )NULL;
5528
5529 (reg)->p = (UChar* )NULL;
5530 (reg)->alloc = 0;
5531 (reg)->used = 0;
5532 (reg)->name_table = (void* )NULL;
5533
5534 (reg)->case_fold_flag = case_fold_flag;
5535 return 0;
5536 }
5537
5538 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5539 onig_new_without_alloc(regex_t* reg, const UChar* pattern,
5540 const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
5541 OnigSyntaxType* syntax, OnigErrorInfo* einfo)
5542 {
5543 int r;
5544
5545 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5546 if (r) return r;
5547
5548 r = onig_compile(reg, pattern, pattern_end, einfo);
5549 return r;
5550 }
5551
5552 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5553 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5554 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5555 OnigErrorInfo* einfo)
5556 {
5557 int r;
5558
5559 *reg = (regex_t* )xmalloc(sizeof(regex_t));
5560 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
5561
5562 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5563 if (r) goto err;
5564
5565 r = onig_compile(*reg, pattern, pattern_end, einfo);
5566 if (r) {
5567 err:
5568 onig_free(*reg);
5569 *reg = NULL;
5570 }
5571 return r;
5572 }
5573
5574 extern int
onig_initialize(OnigEncoding encodings[],int n)5575 onig_initialize(OnigEncoding encodings[], int n)
5576 {
5577 int i;
5578 int r;
5579
5580 if (onig_inited != 0)
5581 return 0;
5582
5583 onigenc_init();
5584
5585 onig_inited = 1;
5586
5587 for (i = 0; i < n; i++) {
5588 OnigEncoding enc = encodings[i];
5589 r = onig_initialize_encoding(enc);
5590 if (r != 0)
5591 return r;
5592 }
5593
5594 return 0;
5595 }
5596
5597 static OnigEndCallListItemType* EndCallTop;
5598
onig_add_end_call(void (* func)(void))5599 extern void onig_add_end_call(void (*func)(void))
5600 {
5601 OnigEndCallListItemType* item;
5602
5603 item = (OnigEndCallListItemType* )xmalloc(sizeof(*item));
5604 if (item == 0) return ;
5605
5606 item->next = EndCallTop;
5607 item->func = func;
5608
5609 EndCallTop = item;
5610 }
5611
5612 static void
exec_end_call_list(void)5613 exec_end_call_list(void)
5614 {
5615 OnigEndCallListItemType* prev;
5616 void (*func)(void);
5617
5618 while (EndCallTop != 0) {
5619 func = EndCallTop->func;
5620 (*func)();
5621
5622 prev = EndCallTop;
5623 EndCallTop = EndCallTop->next;
5624 xfree(prev);
5625 }
5626 }
5627
5628 extern int
onig_end(void)5629 onig_end(void)
5630 {
5631 exec_end_call_list();
5632
5633 onig_inited = 0;
5634
5635 return 0;
5636 }
5637
5638 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)5639 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
5640 {
5641 OnigCodePoint n, *data;
5642 OnigCodePoint low, high, x;
5643
5644 GET_CODE_POINT(n, p);
5645 data = (OnigCodePoint* )p;
5646 data++;
5647
5648 for (low = 0, high = n; low < high; ) {
5649 x = (low + high) >> 1;
5650 if (code > data[x * 2 + 1])
5651 low = x + 1;
5652 else
5653 high = x;
5654 }
5655
5656 return ((low < n && code >= data[low * 2]) ? 1 : 0);
5657 }
5658
5659 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,CClassNode * cc)5660 onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
5661 {
5662 int found;
5663
5664 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
5665 if (IS_NULL(cc->mbuf)) {
5666 found = 0;
5667 }
5668 else {
5669 found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
5670 }
5671 }
5672 else {
5673 found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
5674 }
5675
5676 if (IS_NCCLASS_NOT(cc))
5677 return !found;
5678 else
5679 return found;
5680 }
5681
5682 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)5683 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
5684 {
5685 int len;
5686
5687 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5688 len = 2;
5689 }
5690 else {
5691 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
5692 }
5693 return onig_is_code_in_cc_len(len, code, cc);
5694 }
5695
5696
5697 #ifdef ONIG_DEBUG
5698
5699 /* arguments type */
5700 #define ARG_SPECIAL -1
5701 #define ARG_NON 0
5702 #define ARG_RELADDR 1
5703 #define ARG_ABSADDR 2
5704 #define ARG_LENGTH 3
5705 #define ARG_MEMNUM 4
5706 #define ARG_OPTION 5
5707 #define ARG_STATE_CHECK 6
5708
5709 OnigOpInfoType OnigOpInfo[] = {
5710 { OP_FINISH, "finish", ARG_NON },
5711 { OP_END, "end", ARG_NON },
5712 { OP_EXACT1, "exact1", ARG_SPECIAL },
5713 { OP_EXACT2, "exact2", ARG_SPECIAL },
5714 { OP_EXACT3, "exact3", ARG_SPECIAL },
5715 { OP_EXACT4, "exact4", ARG_SPECIAL },
5716 { OP_EXACT5, "exact5", ARG_SPECIAL },
5717 { OP_EXACTN, "exactn", ARG_SPECIAL },
5718 { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
5719 { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
5720 { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
5721 { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
5722 { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
5723 { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
5724 { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
5725 { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
5726 { OP_CCLASS, "cclass", ARG_SPECIAL },
5727 { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
5728 { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
5729 { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
5730 { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
5731 { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
5732 { OP_CCLASS_NODE, "cclass-node", ARG_SPECIAL },
5733 { OP_ANYCHAR, "anychar", ARG_NON },
5734 { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
5735 { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
5736 { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
5737 { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
5738 { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
5739 { OP_WORD, "word", ARG_NON },
5740 { OP_NOT_WORD, "not-word", ARG_NON },
5741 { OP_WORD_BOUND, "word-bound", ARG_NON },
5742 { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
5743 { OP_WORD_BEGIN, "word-begin", ARG_NON },
5744 { OP_WORD_END, "word-end", ARG_NON },
5745 { OP_BEGIN_BUF, "begin-buf", ARG_NON },
5746 { OP_END_BUF, "end-buf", ARG_NON },
5747 { OP_BEGIN_LINE, "begin-line", ARG_NON },
5748 { OP_END_LINE, "end-line", ARG_NON },
5749 { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
5750 { OP_BEGIN_POSITION, "begin-position", ARG_NON },
5751 { OP_BACKREF1, "backref1", ARG_NON },
5752 { OP_BACKREF2, "backref2", ARG_NON },
5753 { OP_BACKREFN, "backrefn", ARG_MEMNUM },
5754 { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
5755 { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
5756 { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
5757 { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL },
5758 { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
5759 { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
5760 { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
5761 { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
5762 { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
5763 { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
5764 { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
5765 { OP_SET_OPTION, "set-option", ARG_OPTION },
5766 { OP_FAIL, "fail", ARG_NON },
5767 { OP_JUMP, "jump", ARG_RELADDR },
5768 { OP_PUSH, "push", ARG_RELADDR },
5769 { OP_POP, "pop", ARG_NON },
5770 { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
5771 { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
5772 { OP_REPEAT, "repeat", ARG_SPECIAL },
5773 { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
5774 { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
5775 { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
5776 { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
5777 { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
5778 { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
5779 { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
5780 { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
5781 { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
5782 { OP_PUSH_POS, "push-pos", ARG_NON },
5783 { OP_POP_POS, "pop-pos", ARG_NON },
5784 { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
5785 { OP_FAIL_POS, "fail-pos", ARG_NON },
5786 { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
5787 { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
5788 { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
5789 { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
5790 { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
5791 { OP_CALL, "call", ARG_ABSADDR },
5792 { OP_RETURN, "return", ARG_NON },
5793 { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
5794 { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
5795 { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
5796 { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
5797 { OP_STATE_CHECK_ANYCHAR_ML_STAR,
5798 "state-check-anychar-ml*", ARG_STATE_CHECK },
5799 { -1, "", ARG_NON }
5800 };
5801
5802 static char*
op2name(int opcode)5803 op2name(int opcode)
5804 {
5805 int i;
5806
5807 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5808 if (opcode == OnigOpInfo[i].opcode)
5809 return OnigOpInfo[i].name;
5810 }
5811 return "";
5812 }
5813
5814 static int
op2arg_type(int opcode)5815 op2arg_type(int opcode)
5816 {
5817 int i;
5818
5819 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5820 if (opcode == OnigOpInfo[i].opcode)
5821 return OnigOpInfo[i].arg_type;
5822 }
5823 return ARG_SPECIAL;
5824 }
5825
5826 static void
Indent(FILE * f,int indent)5827 Indent(FILE* f, int indent)
5828 {
5829 int i;
5830 for (i = 0; i < indent; i++) putc(' ', f);
5831 }
5832
5833 static void
p_string(FILE * f,int len,UChar * s)5834 p_string(FILE* f, int len, UChar* s)
5835 {
5836 fputs(":", f);
5837 while (len-- > 0) { fputc(*s++, f); }
5838 }
5839
5840 static void
p_len_string(FILE * f,LengthType len,int mb_len,UChar * s)5841 p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
5842 {
5843 int x = len * mb_len;
5844
5845 fprintf(f, ":%d:", len);
5846 while (x-- > 0) { fputc(*s++, f); }
5847 }
5848
5849 extern void
onig_print_compiled_byte_code(FILE * f,UChar * bp,UChar ** nextp,OnigEncoding enc)5850 onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp,
5851 OnigEncoding enc)
5852 {
5853 int i, n, arg_type;
5854 RelAddrType addr;
5855 LengthType len;
5856 MemNumType mem;
5857 StateCheckNumType scn;
5858 OnigCodePoint code;
5859 UChar *q;
5860
5861 fprintf(f, "[%s", op2name(*bp));
5862 arg_type = op2arg_type(*bp);
5863 if (arg_type != ARG_SPECIAL) {
5864 bp++;
5865 switch (arg_type) {
5866 case ARG_NON:
5867 break;
5868 case ARG_RELADDR:
5869 GET_RELADDR_INC(addr, bp);
5870 fprintf(f, ":(%d)", addr);
5871 break;
5872 case ARG_ABSADDR:
5873 GET_ABSADDR_INC(addr, bp);
5874 fprintf(f, ":(%d)", addr);
5875 break;
5876 case ARG_LENGTH:
5877 GET_LENGTH_INC(len, bp);
5878 fprintf(f, ":%d", len);
5879 break;
5880 case ARG_MEMNUM:
5881 mem = *((MemNumType* )bp);
5882 bp += SIZE_MEMNUM;
5883 fprintf(f, ":%d", mem);
5884 break;
5885 case ARG_OPTION:
5886 {
5887 OnigOptionType option = *((OnigOptionType* )bp);
5888 bp += SIZE_OPTION;
5889 fprintf(f, ":%d", option);
5890 }
5891 break;
5892
5893 case ARG_STATE_CHECK:
5894 scn = *((StateCheckNumType* )bp);
5895 bp += SIZE_STATE_CHECK_NUM;
5896 fprintf(f, ":%d", scn);
5897 break;
5898 }
5899 }
5900 else {
5901 switch (*bp++) {
5902 case OP_EXACT1:
5903 case OP_ANYCHAR_STAR_PEEK_NEXT:
5904 case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
5905 p_string(f, 1, bp++); break;
5906 case OP_EXACT2:
5907 p_string(f, 2, bp); bp += 2; break;
5908 case OP_EXACT3:
5909 p_string(f, 3, bp); bp += 3; break;
5910 case OP_EXACT4:
5911 p_string(f, 4, bp); bp += 4; break;
5912 case OP_EXACT5:
5913 p_string(f, 5, bp); bp += 5; break;
5914 case OP_EXACTN:
5915 GET_LENGTH_INC(len, bp);
5916 p_len_string(f, len, 1, bp);
5917 bp += len;
5918 break;
5919
5920 case OP_EXACTMB2N1:
5921 p_string(f, 2, bp); bp += 2; break;
5922 case OP_EXACTMB2N2:
5923 p_string(f, 4, bp); bp += 4; break;
5924 case OP_EXACTMB2N3:
5925 p_string(f, 6, bp); bp += 6; break;
5926 case OP_EXACTMB2N:
5927 GET_LENGTH_INC(len, bp);
5928 p_len_string(f, len, 2, bp);
5929 bp += len * 2;
5930 break;
5931 case OP_EXACTMB3N:
5932 GET_LENGTH_INC(len, bp);
5933 p_len_string(f, len, 3, bp);
5934 bp += len * 3;
5935 break;
5936 case OP_EXACTMBN:
5937 {
5938 int mb_len;
5939
5940 GET_LENGTH_INC(mb_len, bp);
5941 GET_LENGTH_INC(len, bp);
5942 fprintf(f, ":%d:%d:", mb_len, len);
5943 n = len * mb_len;
5944 while (n-- > 0) { fputc(*bp++, f); }
5945 }
5946 break;
5947
5948 case OP_EXACT1_IC:
5949 len = enclen(enc, bp);
5950 p_string(f, len, bp);
5951 bp += len;
5952 break;
5953 case OP_EXACTN_IC:
5954 GET_LENGTH_INC(len, bp);
5955 p_len_string(f, len, 1, bp);
5956 bp += len;
5957 break;
5958
5959 case OP_CCLASS:
5960 n = bitset_on_num((BitSetRef )bp);
5961 bp += SIZE_BITSET;
5962 fprintf(f, ":%d", n);
5963 break;
5964
5965 case OP_CCLASS_NOT:
5966 n = bitset_on_num((BitSetRef )bp);
5967 bp += SIZE_BITSET;
5968 fprintf(f, ":%d", n);
5969 break;
5970
5971 case OP_CCLASS_MB:
5972 case OP_CCLASS_MB_NOT:
5973 GET_LENGTH_INC(len, bp);
5974 q = bp;
5975 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5976 ALIGNMENT_RIGHT(q);
5977 #endif
5978 GET_CODE_POINT(code, q);
5979 bp += len;
5980 fprintf(f, ":%d:%d", (int )code, len);
5981 break;
5982
5983 case OP_CCLASS_MIX:
5984 case OP_CCLASS_MIX_NOT:
5985 n = bitset_on_num((BitSetRef )bp);
5986 bp += SIZE_BITSET;
5987 GET_LENGTH_INC(len, bp);
5988 q = bp;
5989 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5990 ALIGNMENT_RIGHT(q);
5991 #endif
5992 GET_CODE_POINT(code, q);
5993 bp += len;
5994 fprintf(f, ":%d:%d:%d", n, (int )code, len);
5995 break;
5996
5997 case OP_CCLASS_NODE:
5998 {
5999 CClassNode *cc;
6000
6001 GET_POINTER_INC(cc, bp);
6002 n = bitset_on_num(cc->bs);
6003 fprintf(f, ":%p:%d", cc, n);
6004 }
6005 break;
6006
6007 case OP_BACKREFN_IC:
6008 mem = *((MemNumType* )bp);
6009 bp += SIZE_MEMNUM;
6010 fprintf(f, ":%d", mem);
6011 break;
6012
6013 case OP_BACKREF_MULTI_IC:
6014 case OP_BACKREF_MULTI:
6015 fputs(" ", f);
6016 GET_LENGTH_INC(len, bp);
6017 for (i = 0; i < len; i++) {
6018 GET_MEMNUM_INC(mem, bp);
6019 if (i > 0) fputs(", ", f);
6020 fprintf(f, "%d", mem);
6021 }
6022 break;
6023
6024 case OP_BACKREF_WITH_LEVEL:
6025 {
6026 OnigOptionType option;
6027 LengthType level;
6028
6029 GET_OPTION_INC(option, bp);
6030 fprintf(f, ":%d", option);
6031 GET_LENGTH_INC(level, bp);
6032 fprintf(f, ":%d", level);
6033
6034 fputs(" ", f);
6035 GET_LENGTH_INC(len, bp);
6036 for (i = 0; i < len; i++) {
6037 GET_MEMNUM_INC(mem, bp);
6038 if (i > 0) fputs(", ", f);
6039 fprintf(f, "%d", mem);
6040 }
6041 }
6042 break;
6043
6044 case OP_REPEAT:
6045 case OP_REPEAT_NG:
6046 {
6047 mem = *((MemNumType* )bp);
6048 bp += SIZE_MEMNUM;
6049 addr = *((RelAddrType* )bp);
6050 bp += SIZE_RELADDR;
6051 fprintf(f, ":%d:%d", mem, addr);
6052 }
6053 break;
6054
6055 case OP_PUSH_OR_JUMP_EXACT1:
6056 case OP_PUSH_IF_PEEK_NEXT:
6057 addr = *((RelAddrType* )bp);
6058 bp += SIZE_RELADDR;
6059 fprintf(f, ":(%d)", addr);
6060 p_string(f, 1, bp);
6061 bp += 1;
6062 break;
6063
6064 case OP_LOOK_BEHIND:
6065 GET_LENGTH_INC(len, bp);
6066 fprintf(f, ":%d", len);
6067 break;
6068
6069 case OP_PUSH_LOOK_BEHIND_NOT:
6070 GET_RELADDR_INC(addr, bp);
6071 GET_LENGTH_INC(len, bp);
6072 fprintf(f, ":%d:(%d)", len, addr);
6073 break;
6074
6075 case OP_STATE_CHECK_PUSH:
6076 case OP_STATE_CHECK_PUSH_OR_JUMP:
6077 scn = *((StateCheckNumType* )bp);
6078 bp += SIZE_STATE_CHECK_NUM;
6079 addr = *((RelAddrType* )bp);
6080 bp += SIZE_RELADDR;
6081 fprintf(f, ":%d:(%d)", scn, addr);
6082 break;
6083
6084 default:
6085 fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
6086 *--bp);
6087 }
6088 }
6089 fputs("]", f);
6090 if (nextp) *nextp = bp;
6091 }
6092
6093 static void
print_compiled_byte_code_list(FILE * f,regex_t * reg)6094 print_compiled_byte_code_list(FILE* f, regex_t* reg)
6095 {
6096 int ncode;
6097 UChar* bp = reg->p;
6098 UChar* end = reg->p + reg->used;
6099
6100 fprintf(f, "code length: %d\n", reg->used);
6101
6102 ncode = 0;
6103 while (bp < end) {
6104 ncode++;
6105 if (bp > reg->p) {
6106 if (ncode % 5 == 0)
6107 fprintf(f, "\n");
6108 else
6109 fputs(" ", f);
6110 }
6111 onig_print_compiled_byte_code(f, bp, &bp, reg->enc);
6112 }
6113
6114 fprintf(f, "\n");
6115 }
6116
6117 static void
print_indent_tree(FILE * f,Node * node,int indent)6118 print_indent_tree(FILE* f, Node* node, int indent)
6119 {
6120 int i, type;
6121 int add = 3;
6122 UChar* p;
6123
6124 Indent(f, indent);
6125 if (IS_NULL(node)) {
6126 fprintf(f, "ERROR: null node!!!\n");
6127 exit (0);
6128 }
6129
6130 type = NTYPE(node);
6131 switch (type) {
6132 case NT_LIST:
6133 case NT_ALT:
6134 if (NTYPE(node) == NT_LIST)
6135 fprintf(f, "<list:%p>\n", node);
6136 else
6137 fprintf(f, "<alt:%p>\n", node);
6138
6139 print_indent_tree(f, NCAR(node), indent + add);
6140 while (IS_NOT_NULL(node = NCDR(node))) {
6141 if (NTYPE(node) != type) {
6142 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
6143 exit(0);
6144 }
6145 print_indent_tree(f, NCAR(node), indent + add);
6146 }
6147 break;
6148
6149 case NT_STR:
6150 fprintf(f, "<string%s:%p>",
6151 (NSTRING_IS_RAW(node) ? "-raw" : ""), node);
6152 for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
6153 if (*p >= 0x20 && *p < 0x7f)
6154 fputc(*p, f);
6155 else {
6156 fprintf(f, " 0x%02x", *p);
6157 }
6158 }
6159 break;
6160
6161 case NT_CCLASS:
6162 fprintf(f, "<cclass:%p>", node);
6163 if (IS_NCCLASS_NOT(NCCLASS(node))) fputs(" not", f);
6164 if (NCCLASS(node)->mbuf) {
6165 BBuf* bbuf = NCCLASS(node)->mbuf;
6166 for (i = 0; i < bbuf->used; i++) {
6167 if (i > 0) fprintf(f, ",");
6168 fprintf(f, "%0x", bbuf->p[i]);
6169 }
6170 }
6171 break;
6172
6173 case NT_CTYPE:
6174 fprintf(f, "<ctype:%p> ", node);
6175 switch (NCTYPE(node)->ctype) {
6176 case ONIGENC_CTYPE_WORD:
6177 if (NCTYPE(node)->not != 0)
6178 fputs("not word", f);
6179 else
6180 fputs("word", f);
6181 break;
6182
6183 default:
6184 fprintf(f, "ERROR: undefined ctype.\n");
6185 exit(0);
6186 }
6187 break;
6188
6189 case NT_CANY:
6190 fprintf(f, "<anychar:%p>", node);
6191 break;
6192
6193 case NT_ANCHOR:
6194 fprintf(f, "<anchor:%p> ", node);
6195 switch (NANCHOR(node)->type) {
6196 case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
6197 case ANCHOR_END_BUF: fputs("end buf", f); break;
6198 case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
6199 case ANCHOR_END_LINE: fputs("end line", f); break;
6200 case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
6201 case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
6202
6203 case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
6204 case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
6205 #ifdef USE_WORD_BEGIN_END
6206 case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
6207 case ANCHOR_WORD_END: fputs("word end", f); break;
6208 #endif
6209 case ANCHOR_PREC_READ:
6210 fprintf(f, "prec read\n");
6211 print_indent_tree(f, NANCHOR(node)->target, indent + add);
6212 break;
6213 case ANCHOR_PREC_READ_NOT:
6214 fprintf(f, "prec read not\n");
6215 print_indent_tree(f, NANCHOR(node)->target, indent + add);
6216 break;
6217 case ANCHOR_LOOK_BEHIND:
6218 fprintf(f, "look behind\n");
6219 print_indent_tree(f, NANCHOR(node)->target, indent + add);
6220 break;
6221 case ANCHOR_LOOK_BEHIND_NOT:
6222 fprintf(f, "look behind not\n");
6223 print_indent_tree(f, NANCHOR(node)->target, indent + add);
6224 break;
6225
6226 default:
6227 fprintf(f, "ERROR: undefined anchor type.\n");
6228 break;
6229 }
6230 break;
6231
6232 case NT_BREF:
6233 {
6234 int* p;
6235 BRefNode* br = NBREF(node);
6236 p = BACKREFS_P(br);
6237 fprintf(f, "<backref:%p>", node);
6238 for (i = 0; i < br->back_num; i++) {
6239 if (i > 0) fputs(", ", f);
6240 fprintf(f, "%d", p[i]);
6241 }
6242 }
6243 break;
6244
6245 #ifdef USE_SUBEXP_CALL
6246 case NT_CALL:
6247 {
6248 CallNode* cn = NCALL(node);
6249 fprintf(f, "<call:%p>", node);
6250 p_string(f, cn->name_end - cn->name, cn->name);
6251 }
6252 break;
6253 #endif
6254
6255 case NT_QTFR:
6256 fprintf(f, "<quantifier:%p>{%d,%d}%s\n", node,
6257 NQTFR(node)->lower, NQTFR(node)->upper,
6258 (NQTFR(node)->greedy ? "" : "?"));
6259 print_indent_tree(f, NQTFR(node)->target, indent + add);
6260 break;
6261
6262 case NT_ENCLOSE:
6263 fprintf(f, "<enclose:%p> ", node);
6264 switch (NENCLOSE(node)->type) {
6265 case ENCLOSE_OPTION:
6266 fprintf(f, "option:%d", NENCLOSE(node)->option);
6267 break;
6268 case ENCLOSE_MEMORY:
6269 fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
6270 break;
6271 case ENCLOSE_STOP_BACKTRACK:
6272 fprintf(f, "stop-bt");
6273 break;
6274
6275 default:
6276 break;
6277 }
6278 fprintf(f, "\n");
6279 print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6280 break;
6281
6282 default:
6283 fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6284 break;
6285 }
6286
6287 if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
6288 type != NT_ENCLOSE)
6289 fprintf(f, "\n");
6290 fflush(f);
6291 }
6292 #endif /* ONIG_DEBUG */
6293
6294 #ifdef ONIG_DEBUG_PARSE_TREE
6295 static void
print_tree(FILE * f,Node * node)6296 print_tree(FILE* f, Node* node)
6297 {
6298 print_indent_tree(f, node, 0);
6299 }
6300 #endif
6301