xref: /PHP-5.5/ext/mbstring/oniguruma/regcomp.c (revision fe92d64a)
1 /**********************************************************************
2   regcomp.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regparse.h"
31 
32 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
33 
34 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)35 onig_get_default_case_fold_flag(void)
36 {
37   return OnigDefaultCaseFoldFlag;
38 }
39 
40 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)41 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
42 {
43   OnigDefaultCaseFoldFlag = case_fold_flag;
44   return 0;
45 }
46 
47 
48 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
49 static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
50 #endif
51 
52 static UChar*
str_dup(UChar * s,UChar * end)53 str_dup(UChar* s, UChar* end)
54 {
55   int len = end - s;
56 
57   if (len > 0) {
58     UChar* r = (UChar* )xmalloc(len + 1);
59     CHECK_NULL_RETURN(r);
60     xmemcpy(r, s, len);
61     r[len] = (UChar )0;
62     return r;
63   }
64   else return NULL;
65 }
66 
67 static void
swap_node(Node * a,Node * b)68 swap_node(Node* a, Node* b)
69 {
70   Node c;
71   c = *a; *a = *b; *b = c;
72 
73   if (NTYPE(a) == NT_STR) {
74     StrNode* sn = NSTR(a);
75     if (sn->capa == 0) {
76       int len = sn->end - sn->s;
77       sn->s   = sn->buf;
78       sn->end = sn->s + len;
79     }
80   }
81 
82   if (NTYPE(b) == NT_STR) {
83     StrNode* sn = NSTR(b);
84     if (sn->capa == 0) {
85       int len = sn->end - sn->s;
86       sn->s   = sn->buf;
87       sn->end = sn->s + len;
88     }
89   }
90 }
91 
92 static OnigDistance
distance_add(OnigDistance d1,OnigDistance d2)93 distance_add(OnigDistance d1, OnigDistance d2)
94 {
95   if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
96     return ONIG_INFINITE_DISTANCE;
97   else {
98     if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
99     else return ONIG_INFINITE_DISTANCE;
100   }
101 }
102 
103 static OnigDistance
distance_multiply(OnigDistance d,int m)104 distance_multiply(OnigDistance d, int m)
105 {
106   if (m == 0) return 0;
107 
108   if (d < ONIG_INFINITE_DISTANCE / m)
109     return d * m;
110   else
111     return ONIG_INFINITE_DISTANCE;
112 }
113 
114 static int
bitset_is_empty(BitSetRef bs)115 bitset_is_empty(BitSetRef bs)
116 {
117   int i;
118   for (i = 0; i < (int )BITSET_SIZE; i++) {
119     if (bs[i] != 0) return 0;
120   }
121   return 1;
122 }
123 
124 #ifdef ONIG_DEBUG
125 static int
bitset_on_num(BitSetRef bs)126 bitset_on_num(BitSetRef bs)
127 {
128   int i, n;
129 
130   n = 0;
131   for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
132     if (BITSET_AT(bs, i)) n++;
133   }
134   return n;
135 }
136 #endif
137 
138 extern int
onig_bbuf_init(BBuf * buf,int size)139 onig_bbuf_init(BBuf* buf, int size)
140 {
141   if (size <= 0) {
142     size   = 0;
143     buf->p = NULL;
144   }
145   else {
146     buf->p = (UChar* )xmalloc(size);
147     if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
148   }
149 
150   buf->alloc = size;
151   buf->used  = 0;
152   return 0;
153 }
154 
155 
156 #ifdef USE_SUBEXP_CALL
157 
158 static int
unset_addr_list_init(UnsetAddrList * uslist,int size)159 unset_addr_list_init(UnsetAddrList* uslist, int size)
160 {
161   UnsetAddr* p;
162 
163   p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
164   CHECK_NULL_RETURN_MEMERR(p);
165   uslist->num   = 0;
166   uslist->alloc = size;
167   uslist->us    = p;
168   return 0;
169 }
170 
171 static void
unset_addr_list_end(UnsetAddrList * uslist)172 unset_addr_list_end(UnsetAddrList* uslist)
173 {
174   if (IS_NOT_NULL(uslist->us))
175     xfree(uslist->us);
176 }
177 
178 static int
unset_addr_list_add(UnsetAddrList * uslist,int offset,struct _Node * node)179 unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
180 {
181   UnsetAddr* p;
182   int size;
183 
184   if (uslist->num >= uslist->alloc) {
185     size = uslist->alloc * 2;
186     p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
187     CHECK_NULL_RETURN_MEMERR(p);
188     uslist->alloc = size;
189     uslist->us    = p;
190   }
191 
192   uslist->us[uslist->num].offset = offset;
193   uslist->us[uslist->num].target = node;
194   uslist->num++;
195   return 0;
196 }
197 #endif /* USE_SUBEXP_CALL */
198 
199 
200 static int
add_opcode(regex_t * reg,int opcode)201 add_opcode(regex_t* reg, int opcode)
202 {
203   BBUF_ADD1(reg, opcode);
204   return 0;
205 }
206 
207 #ifdef USE_COMBINATION_EXPLOSION_CHECK
208 static int
add_state_check_num(regex_t * reg,int num)209 add_state_check_num(regex_t* reg, int num)
210 {
211   StateCheckNumType n = (StateCheckNumType )num;
212 
213   BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
214   return 0;
215 }
216 #endif
217 
218 static int
add_rel_addr(regex_t * reg,int addr)219 add_rel_addr(regex_t* reg, int addr)
220 {
221   RelAddrType ra = (RelAddrType )addr;
222 
223   BBUF_ADD(reg, &ra, SIZE_RELADDR);
224   return 0;
225 }
226 
227 static int
add_abs_addr(regex_t * reg,int addr)228 add_abs_addr(regex_t* reg, int addr)
229 {
230   AbsAddrType ra = (AbsAddrType )addr;
231 
232   BBUF_ADD(reg, &ra, SIZE_ABSADDR);
233   return 0;
234 }
235 
236 static int
add_length(regex_t * reg,int len)237 add_length(regex_t* reg, int len)
238 {
239   LengthType l = (LengthType )len;
240 
241   BBUF_ADD(reg, &l, SIZE_LENGTH);
242   return 0;
243 }
244 
245 static int
add_mem_num(regex_t * reg,int num)246 add_mem_num(regex_t* reg, int num)
247 {
248   MemNumType n = (MemNumType )num;
249 
250   BBUF_ADD(reg, &n, SIZE_MEMNUM);
251   return 0;
252 }
253 
254 static int
add_pointer(regex_t * reg,void * addr)255 add_pointer(regex_t* reg, void* addr)
256 {
257   PointerType ptr = (PointerType )addr;
258 
259   BBUF_ADD(reg, &ptr, SIZE_POINTER);
260   return 0;
261 }
262 
263 static int
add_option(regex_t * reg,OnigOptionType option)264 add_option(regex_t* reg, OnigOptionType option)
265 {
266   BBUF_ADD(reg, &option, SIZE_OPTION);
267   return 0;
268 }
269 
270 static int
add_opcode_rel_addr(regex_t * reg,int opcode,int addr)271 add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
272 {
273   int r;
274 
275   r = add_opcode(reg, opcode);
276   if (r) return r;
277   r = add_rel_addr(reg, addr);
278   return r;
279 }
280 
281 static int
add_bytes(regex_t * reg,UChar * bytes,int len)282 add_bytes(regex_t* reg, UChar* bytes, int len)
283 {
284   BBUF_ADD(reg, bytes, len);
285   return 0;
286 }
287 
288 static int
add_bitset(regex_t * reg,BitSetRef bs)289 add_bitset(regex_t* reg, BitSetRef bs)
290 {
291   BBUF_ADD(reg, bs, SIZE_BITSET);
292   return 0;
293 }
294 
295 static int
add_opcode_option(regex_t * reg,int opcode,OnigOptionType option)296 add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
297 {
298   int r;
299 
300   r = add_opcode(reg, opcode);
301   if (r) return r;
302   r = add_option(reg, option);
303   return r;
304 }
305 
306 static int compile_length_tree(Node* node, regex_t* reg);
307 static int compile_tree(Node* node, regex_t* reg);
308 
309 
310 #define IS_NEED_STR_LEN_OP_EXACT(op) \
311    ((op) == OP_EXACTN    || (op) == OP_EXACTMB2N ||\
312     (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN  || (op) == OP_EXACTN_IC)
313 
314 static int
select_str_opcode(int mb_len,int str_len,int ignore_case)315 select_str_opcode(int mb_len, int str_len, int ignore_case)
316 {
317   int op;
318 
319   if (ignore_case) {
320     switch (str_len) {
321     case 1:  op = OP_EXACT1_IC; break;
322     default: op = OP_EXACTN_IC; break;
323     }
324   }
325   else {
326     switch (mb_len) {
327     case 1:
328       switch (str_len) {
329       case 1:  op = OP_EXACT1; break;
330       case 2:  op = OP_EXACT2; break;
331       case 3:  op = OP_EXACT3; break;
332       case 4:  op = OP_EXACT4; break;
333       case 5:  op = OP_EXACT5; break;
334       default: op = OP_EXACTN; break;
335       }
336       break;
337 
338     case 2:
339       switch (str_len) {
340       case 1:  op = OP_EXACTMB2N1; break;
341       case 2:  op = OP_EXACTMB2N2; break;
342       case 3:  op = OP_EXACTMB2N3; break;
343       default: op = OP_EXACTMB2N;  break;
344       }
345       break;
346 
347     case 3:
348       op = OP_EXACTMB3N;
349       break;
350 
351     default:
352       op = OP_EXACTMBN;
353       break;
354     }
355   }
356   return op;
357 }
358 
359 static int
compile_tree_empty_check(Node * node,regex_t * reg,int empty_info)360 compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
361 {
362   int r;
363   int saved_num_null_check = reg->num_null_check;
364 
365   if (empty_info != 0) {
366     r = add_opcode(reg, OP_NULL_CHECK_START);
367     if (r) return r;
368     r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
369     if (r) return r;
370     reg->num_null_check++;
371   }
372 
373   r = compile_tree(node, reg);
374   if (r) return r;
375 
376   if (empty_info != 0) {
377     if (empty_info == NQ_TARGET_IS_EMPTY)
378       r = add_opcode(reg, OP_NULL_CHECK_END);
379     else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
380       r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
381     else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
382       r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
383 
384     if (r) return r;
385     r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
386   }
387   return r;
388 }
389 
390 #ifdef USE_SUBEXP_CALL
391 static int
compile_call(CallNode * node,regex_t * reg)392 compile_call(CallNode* node, regex_t* reg)
393 {
394   int r;
395 
396   r = add_opcode(reg, OP_CALL);
397   if (r) return r;
398   r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
399                           node->target);
400   if (r) return r;
401   r = add_abs_addr(reg, 0 /*dummy addr.*/);
402   return r;
403 }
404 #endif
405 
406 static int
compile_tree_n_times(Node * node,int n,regex_t * reg)407 compile_tree_n_times(Node* node, int n, regex_t* reg)
408 {
409   int i, r;
410 
411   for (i = 0; i < n; i++) {
412     r = compile_tree(node, reg);
413     if (r) return r;
414   }
415   return 0;
416 }
417 
418 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED,int ignore_case)419 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
420                           regex_t* reg ARG_UNUSED, int ignore_case)
421 {
422   int len;
423   int op = select_str_opcode(mb_len, str_len, ignore_case);
424 
425   len = SIZE_OPCODE;
426 
427   if (op == OP_EXACTMBN)  len += SIZE_LENGTH;
428   if (IS_NEED_STR_LEN_OP_EXACT(op))
429     len += SIZE_LENGTH;
430 
431   len += mb_len * str_len;
432   return len;
433 }
434 
435 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg,int ignore_case)436 add_compile_string(UChar* s, int mb_len, int str_len,
437                    regex_t* reg, int ignore_case)
438 {
439   int op = select_str_opcode(mb_len, str_len, ignore_case);
440   add_opcode(reg, op);
441 
442   if (op == OP_EXACTMBN)
443     add_length(reg, mb_len);
444 
445   if (IS_NEED_STR_LEN_OP_EXACT(op)) {
446     if (op == OP_EXACTN_IC)
447       add_length(reg, mb_len * str_len);
448     else
449       add_length(reg, str_len);
450   }
451 
452   add_bytes(reg, s, mb_len * str_len);
453   return 0;
454 }
455 
456 
457 static int
compile_length_string_node(Node * node,regex_t * reg)458 compile_length_string_node(Node* node, regex_t* reg)
459 {
460   int rlen, r, len, prev_len, slen, ambig;
461   OnigEncoding enc = reg->enc;
462   UChar *p, *prev;
463   StrNode* sn;
464 
465   sn = NSTR(node);
466   if (sn->end <= sn->s)
467     return 0;
468 
469   ambig = NSTRING_IS_AMBIG(node);
470 
471   p = prev = sn->s;
472   prev_len = enclen(enc, p);
473   p += prev_len;
474   slen = 1;
475   rlen = 0;
476 
477   for (; p < sn->end; ) {
478     len = enclen(enc, p);
479     if (len == prev_len) {
480       slen++;
481     }
482     else {
483       r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
484       rlen += r;
485       prev = p;
486       slen = 1;
487       prev_len = len;
488     }
489     p += len;
490   }
491   r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
492   rlen += r;
493   return rlen;
494 }
495 
496 static int
compile_length_string_raw_node(StrNode * sn,regex_t * reg)497 compile_length_string_raw_node(StrNode* sn, regex_t* reg)
498 {
499   if (sn->end <= sn->s)
500     return 0;
501 
502   return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
503 }
504 
505 static int
compile_string_node(Node * node,regex_t * reg)506 compile_string_node(Node* node, regex_t* reg)
507 {
508   int r, len, prev_len, slen, ambig;
509   OnigEncoding enc = reg->enc;
510   UChar *p, *prev, *end;
511   StrNode* sn;
512 
513   sn = NSTR(node);
514   if (sn->end <= sn->s)
515     return 0;
516 
517   end = sn->end;
518   ambig = NSTRING_IS_AMBIG(node);
519 
520   p = prev = sn->s;
521   prev_len = enclen(enc, p);
522   p += prev_len;
523   slen = 1;
524 
525   for (; p < end; ) {
526     len = enclen(enc, p);
527     if (len == prev_len) {
528       slen++;
529     }
530     else {
531       r = add_compile_string(prev, prev_len, slen, reg, ambig);
532       if (r) return r;
533 
534       prev  = p;
535       slen  = 1;
536       prev_len = len;
537     }
538 
539     p += len;
540   }
541   return add_compile_string(prev, prev_len, slen, reg, ambig);
542 }
543 
544 static int
compile_string_raw_node(StrNode * sn,regex_t * reg)545 compile_string_raw_node(StrNode* sn, regex_t* reg)
546 {
547   if (sn->end <= sn->s)
548     return 0;
549 
550   return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
551 }
552 
553 static int
add_multi_byte_cclass(BBuf * mbuf,regex_t * reg)554 add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
555 {
556 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
557   add_length(reg, mbuf->used);
558   return add_bytes(reg, mbuf->p, mbuf->used);
559 #else
560   int r, pad_size;
561   UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
562 
563   GET_ALIGNMENT_PAD_SIZE(p, pad_size);
564   add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
565   if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
566 
567   r = add_bytes(reg, mbuf->p, mbuf->used);
568 
569   /* padding for return value from compile_length_cclass_node() to be fix. */
570   pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
571   if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
572   return r;
573 #endif
574 }
575 
576 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)577 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
578 {
579   int len;
580 
581   if (IS_NCCLASS_SHARE(cc)) {
582     len = SIZE_OPCODE + SIZE_POINTER;
583     return len;
584   }
585 
586   if (IS_NULL(cc->mbuf)) {
587     len = SIZE_OPCODE + SIZE_BITSET;
588   }
589   else {
590     if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
591       len = SIZE_OPCODE;
592     }
593     else {
594       len = SIZE_OPCODE + SIZE_BITSET;
595     }
596 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
597     len += SIZE_LENGTH + cc->mbuf->used;
598 #else
599     len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
600 #endif
601   }
602 
603   return len;
604 }
605 
606 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)607 compile_cclass_node(CClassNode* cc, regex_t* reg)
608 {
609   int r;
610 
611   if (IS_NCCLASS_SHARE(cc)) {
612     add_opcode(reg, OP_CCLASS_NODE);
613     r = add_pointer(reg, cc);
614     return r;
615   }
616 
617   if (IS_NULL(cc->mbuf)) {
618     if (IS_NCCLASS_NOT(cc))
619       add_opcode(reg, OP_CCLASS_NOT);
620     else
621       add_opcode(reg, OP_CCLASS);
622 
623     r = add_bitset(reg, cc->bs);
624   }
625   else {
626     if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
627       if (IS_NCCLASS_NOT(cc))
628         add_opcode(reg, OP_CCLASS_MB_NOT);
629       else
630         add_opcode(reg, OP_CCLASS_MB);
631 
632       r = add_multi_byte_cclass(cc->mbuf, reg);
633     }
634     else {
635       if (IS_NCCLASS_NOT(cc))
636         add_opcode(reg, OP_CCLASS_MIX_NOT);
637       else
638         add_opcode(reg, OP_CCLASS_MIX);
639 
640       r = add_bitset(reg, cc->bs);
641       if (r) return r;
642       r = add_multi_byte_cclass(cc->mbuf, reg);
643     }
644   }
645 
646   return r;
647 }
648 
649 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper)650 entry_repeat_range(regex_t* reg, int id, int lower, int upper)
651 {
652 #define REPEAT_RANGE_ALLOC  4
653 
654   OnigRepeatRange* p;
655 
656   if (reg->repeat_range_alloc == 0) {
657     p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
658     CHECK_NULL_RETURN_MEMERR(p);
659     reg->repeat_range = p;
660     reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
661   }
662   else if (reg->repeat_range_alloc <= id) {
663     int n;
664     n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
665     p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
666                                     sizeof(OnigRepeatRange) * n);
667     CHECK_NULL_RETURN_MEMERR(p);
668     reg->repeat_range = p;
669     reg->repeat_range_alloc = n;
670   }
671   else {
672     p = reg->repeat_range;
673   }
674 
675   p[id].lower = lower;
676   p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
677   return 0;
678 }
679 
680 static int
compile_range_repeat_node(QtfrNode * qn,int target_len,int empty_info,regex_t * reg)681 compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
682                           regex_t* reg)
683 {
684   int r;
685   int num_repeat = reg->num_repeat;
686 
687   r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
688   if (r) return r;
689   r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
690   reg->num_repeat++;
691   if (r) return r;
692   r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
693   if (r) return r;
694 
695   r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
696   if (r) return r;
697 
698   r = compile_tree_empty_check(qn->target, reg, empty_info);
699   if (r) return r;
700 
701   if (
702 #ifdef USE_SUBEXP_CALL
703       reg->num_call > 0 ||
704 #endif
705       IS_QUANTIFIER_IN_REPEAT(qn)) {
706     r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
707   }
708   else {
709     r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
710   }
711   if (r) return r;
712   r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
713   return r;
714 }
715 
716 static int
is_anychar_star_quantifier(QtfrNode * qn)717 is_anychar_star_quantifier(QtfrNode* qn)
718 {
719   if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
720       NTYPE(qn->target) == NT_CANY)
721     return 1;
722   else
723     return 0;
724 }
725 
726 #define QUANTIFIER_EXPAND_LIMIT_SIZE   50
727 #define CKN_ON   (ckn > 0)
728 
729 #ifdef USE_COMBINATION_EXPLOSION_CHECK
730 
731 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)732 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
733 {
734   int len, mod_tlen, cklen;
735   int ckn;
736   int infinite = IS_REPEAT_INFINITE(qn->upper);
737   int empty_info = qn->target_empty_info;
738   int tlen = compile_length_tree(qn->target, reg);
739 
740   if (tlen < 0) return tlen;
741 
742   ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
743 
744   cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
745 
746   /* anychar repeat */
747   if (NTYPE(qn->target) == NT_CANY) {
748     if (qn->greedy && infinite) {
749       if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
750         return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
751       else
752         return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
753     }
754   }
755 
756   if (empty_info != 0)
757     mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
758   else
759     mod_tlen = tlen;
760 
761   if (infinite && qn->lower <= 1) {
762     if (qn->greedy) {
763       if (qn->lower == 1)
764 	len = SIZE_OP_JUMP;
765       else
766 	len = 0;
767 
768       len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
769     }
770     else {
771       if (qn->lower == 0)
772 	len = SIZE_OP_JUMP;
773       else
774 	len = 0;
775 
776       len += mod_tlen + SIZE_OP_PUSH + cklen;
777     }
778   }
779   else if (qn->upper == 0) {
780     if (qn->is_refered != 0) /* /(?<n>..){0}/ */
781       len = SIZE_OP_JUMP + tlen;
782     else
783       len = 0;
784   }
785   else if (qn->upper == 1 && qn->greedy) {
786     if (qn->lower == 0) {
787       if (CKN_ON) {
788 	len = SIZE_OP_STATE_CHECK_PUSH + tlen;
789       }
790       else {
791 	len = SIZE_OP_PUSH + tlen;
792       }
793     }
794     else {
795       len = tlen;
796     }
797   }
798   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
799     len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
800   }
801   else {
802     len = SIZE_OP_REPEAT_INC
803         + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
804     if (CKN_ON)
805       len += SIZE_OP_STATE_CHECK;
806   }
807 
808   return len;
809 }
810 
811 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)812 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
813 {
814   int r, mod_tlen;
815   int ckn;
816   int infinite = IS_REPEAT_INFINITE(qn->upper);
817   int empty_info = qn->target_empty_info;
818   int tlen = compile_length_tree(qn->target, reg);
819 
820   if (tlen < 0) return tlen;
821 
822   ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
823 
824   if (is_anychar_star_quantifier(qn)) {
825     r = compile_tree_n_times(qn->target, qn->lower, reg);
826     if (r) return r;
827     if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
828       if (IS_MULTILINE(reg->options))
829 	r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
830       else
831 	r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
832       if (r) return r;
833       if (CKN_ON) {
834 	r = add_state_check_num(reg, ckn);
835 	if (r) return r;
836       }
837 
838       return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
839     }
840     else {
841       if (IS_MULTILINE(reg->options)) {
842 	r = add_opcode(reg, (CKN_ON ?
843 			       OP_STATE_CHECK_ANYCHAR_ML_STAR
844 			     : OP_ANYCHAR_ML_STAR));
845       }
846       else {
847 	r = add_opcode(reg, (CKN_ON ?
848 			       OP_STATE_CHECK_ANYCHAR_STAR
849 			     : OP_ANYCHAR_STAR));
850       }
851       if (r) return r;
852       if (CKN_ON)
853 	r = add_state_check_num(reg, ckn);
854 
855       return r;
856     }
857   }
858 
859   if (empty_info != 0)
860     mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
861   else
862     mod_tlen = tlen;
863 
864   if (infinite && qn->lower <= 1) {
865     if (qn->greedy) {
866       if (qn->lower == 1) {
867 	r = add_opcode_rel_addr(reg, OP_JUMP,
868 			(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
869 	if (r) return r;
870       }
871 
872       if (CKN_ON) {
873 	r = add_opcode(reg, OP_STATE_CHECK_PUSH);
874 	if (r) return r;
875 	r = add_state_check_num(reg, ckn);
876 	if (r) return r;
877 	r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
878       }
879       else {
880 	r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
881       }
882       if (r) return r;
883       r = compile_tree_empty_check(qn->target, reg, empty_info);
884       if (r) return r;
885       r = add_opcode_rel_addr(reg, OP_JUMP,
886 	      -(mod_tlen + (int )SIZE_OP_JUMP
887 		+ (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
888     }
889     else {
890       if (qn->lower == 0) {
891 	r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
892 	if (r) return r;
893       }
894       r = compile_tree_empty_check(qn->target, reg, empty_info);
895       if (r) return r;
896       if (CKN_ON) {
897 	r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
898 	if (r) return r;
899 	r = add_state_check_num(reg, ckn);
900 	if (r) return r;
901 	r = add_rel_addr(reg,
902 		 -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
903       }
904       else
905 	r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
906     }
907   }
908   else if (qn->upper == 0) {
909     if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
910       r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
911       if (r) return r;
912       r = compile_tree(qn->target, reg);
913     }
914     else
915       r = 0;
916   }
917   else if (qn->upper == 1 && qn->greedy) {
918     if (qn->lower == 0) {
919       if (CKN_ON) {
920 	r = add_opcode(reg, OP_STATE_CHECK_PUSH);
921 	if (r) return r;
922 	r = add_state_check_num(reg, ckn);
923 	if (r) return r;
924 	r = add_rel_addr(reg, tlen);
925       }
926       else {
927 	r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
928       }
929       if (r) return r;
930     }
931 
932     r = compile_tree(qn->target, reg);
933   }
934   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
935     if (CKN_ON) {
936       r = add_opcode(reg, OP_STATE_CHECK_PUSH);
937       if (r) return r;
938       r = add_state_check_num(reg, ckn);
939       if (r) return r;
940       r = add_rel_addr(reg, SIZE_OP_JUMP);
941     }
942     else {
943       r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
944     }
945 
946     if (r) return r;
947     r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
948     if (r) return r;
949     r = compile_tree(qn->target, reg);
950   }
951   else {
952     r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
953     if (CKN_ON) {
954       if (r) return r;
955       r = add_opcode(reg, OP_STATE_CHECK);
956       if (r) return r;
957       r = add_state_check_num(reg, ckn);
958     }
959   }
960   return r;
961 }
962 
963 #else /* USE_COMBINATION_EXPLOSION_CHECK */
964 
965 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)966 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
967 {
968   int len, mod_tlen;
969   int infinite = IS_REPEAT_INFINITE(qn->upper);
970   int empty_info = qn->target_empty_info;
971   int tlen = compile_length_tree(qn->target, reg);
972 
973   if (tlen < 0) return tlen;
974 
975   /* anychar repeat */
976   if (NTYPE(qn->target) == NT_CANY) {
977     if (qn->greedy && infinite) {
978       if (IS_NOT_NULL(qn->next_head_exact))
979         return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
980       else
981         return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
982     }
983   }
984 
985   if (empty_info != 0)
986     mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
987   else
988     mod_tlen = tlen;
989 
990   if (infinite &&
991       (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
992     if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
993       len = SIZE_OP_JUMP;
994     }
995     else {
996       len = tlen * qn->lower;
997     }
998 
999     if (qn->greedy) {
1000       if (IS_NOT_NULL(qn->head_exact))
1001 	len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1002       else if (IS_NOT_NULL(qn->next_head_exact))
1003 	len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1004       else
1005 	len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1006     }
1007     else
1008       len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1009   }
1010   else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1011     len = SIZE_OP_JUMP + tlen;
1012   }
1013   else if (!infinite && qn->greedy &&
1014            (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1015                                       <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1016     len = tlen * qn->lower;
1017     len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1018   }
1019   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1020     len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1021   }
1022   else {
1023     len = SIZE_OP_REPEAT_INC
1024         + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1025   }
1026 
1027   return len;
1028 }
1029 
1030 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)1031 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
1032 {
1033   int i, r, mod_tlen;
1034   int infinite = IS_REPEAT_INFINITE(qn->upper);
1035   int empty_info = qn->target_empty_info;
1036   int tlen = compile_length_tree(qn->target, reg);
1037 
1038   if (tlen < 0) return tlen;
1039 
1040   if (is_anychar_star_quantifier(qn)) {
1041     r = compile_tree_n_times(qn->target, qn->lower, reg);
1042     if (r) return r;
1043     if (IS_NOT_NULL(qn->next_head_exact)) {
1044       if (IS_MULTILINE(reg->options))
1045 	r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1046       else
1047 	r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1048       if (r) return r;
1049       return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1050     }
1051     else {
1052       if (IS_MULTILINE(reg->options))
1053 	return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1054       else
1055 	return add_opcode(reg, OP_ANYCHAR_STAR);
1056     }
1057   }
1058 
1059   if (empty_info != 0)
1060     mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1061   else
1062     mod_tlen = tlen;
1063 
1064   if (infinite &&
1065       (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1066     if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1067       if (qn->greedy) {
1068 	if (IS_NOT_NULL(qn->head_exact))
1069 	  r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1070 	else if (IS_NOT_NULL(qn->next_head_exact))
1071 	  r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1072 	else
1073 	  r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1074       }
1075       else {
1076 	r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1077       }
1078       if (r) return r;
1079     }
1080     else {
1081       r = compile_tree_n_times(qn->target, qn->lower, reg);
1082       if (r) return r;
1083     }
1084 
1085     if (qn->greedy) {
1086       if (IS_NOT_NULL(qn->head_exact)) {
1087 	r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1088 			     mod_tlen + SIZE_OP_JUMP);
1089 	if (r) return r;
1090 	add_bytes(reg, NSTR(qn->head_exact)->s, 1);
1091 	r = compile_tree_empty_check(qn->target, reg, empty_info);
1092 	if (r) return r;
1093 	r = add_opcode_rel_addr(reg, OP_JUMP,
1094 	-(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1095       }
1096       else if (IS_NOT_NULL(qn->next_head_exact)) {
1097 	r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1098 				mod_tlen + SIZE_OP_JUMP);
1099 	if (r) return r;
1100 	add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1101 	r = compile_tree_empty_check(qn->target, reg, empty_info);
1102 	if (r) return r;
1103 	r = add_opcode_rel_addr(reg, OP_JUMP,
1104           -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1105       }
1106       else {
1107 	r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1108 	if (r) return r;
1109 	r = compile_tree_empty_check(qn->target, reg, empty_info);
1110 	if (r) return r;
1111 	r = add_opcode_rel_addr(reg, OP_JUMP,
1112 		     -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1113       }
1114     }
1115     else {
1116       r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1117       if (r) return r;
1118       r = compile_tree_empty_check(qn->target, reg, empty_info);
1119       if (r) return r;
1120       r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1121     }
1122   }
1123   else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1124     r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1125     if (r) return r;
1126     r = compile_tree(qn->target, reg);
1127   }
1128   else if (!infinite && qn->greedy &&
1129            (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1130                                   <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1131     int n = qn->upper - qn->lower;
1132 
1133     r = compile_tree_n_times(qn->target, qn->lower, reg);
1134     if (r) return r;
1135 
1136     for (i = 0; i < n; i++) {
1137       r = add_opcode_rel_addr(reg, OP_PUSH,
1138 			   (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1139       if (r) return r;
1140       r = compile_tree(qn->target, reg);
1141       if (r) return r;
1142     }
1143   }
1144   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1145     r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1146     if (r) return r;
1147     r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1148     if (r) return r;
1149     r = compile_tree(qn->target, reg);
1150   }
1151   else {
1152     r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1153   }
1154   return r;
1155 }
1156 #endif /* USE_COMBINATION_EXPLOSION_CHECK */
1157 
1158 static int
compile_length_option_node(EncloseNode * node,regex_t * reg)1159 compile_length_option_node(EncloseNode* node, regex_t* reg)
1160 {
1161   int tlen;
1162   OnigOptionType prev = reg->options;
1163 
1164   reg->options = node->option;
1165   tlen = compile_length_tree(node->target, reg);
1166   reg->options = prev;
1167 
1168   if (tlen < 0) return tlen;
1169 
1170   if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1171     return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1172            + tlen + SIZE_OP_SET_OPTION;
1173   }
1174   else
1175     return tlen;
1176 }
1177 
1178 static int
compile_option_node(EncloseNode * node,regex_t * reg)1179 compile_option_node(EncloseNode* node, regex_t* reg)
1180 {
1181   int r;
1182   OnigOptionType prev = reg->options;
1183 
1184   if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1185     r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1186     if (r) return r;
1187     r = add_opcode_option(reg, OP_SET_OPTION, prev);
1188     if (r) return r;
1189     r = add_opcode(reg, OP_FAIL);
1190     if (r) return r;
1191   }
1192 
1193   reg->options = node->option;
1194   r = compile_tree(node->target, reg);
1195   reg->options = prev;
1196 
1197   if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1198     if (r) return r;
1199     r = add_opcode_option(reg, OP_SET_OPTION, prev);
1200   }
1201   return r;
1202 }
1203 
1204 static int
compile_length_enclose_node(EncloseNode * node,regex_t * reg)1205 compile_length_enclose_node(EncloseNode* node, regex_t* reg)
1206 {
1207   int len;
1208   int tlen;
1209 
1210   if (node->type == ENCLOSE_OPTION)
1211     return compile_length_option_node(node, reg);
1212 
1213   if (node->target) {
1214     tlen = compile_length_tree(node->target, reg);
1215     if (tlen < 0) return tlen;
1216   }
1217   else
1218     tlen = 0;
1219 
1220   switch (node->type) {
1221   case ENCLOSE_MEMORY:
1222 #ifdef USE_SUBEXP_CALL
1223     if (IS_ENCLOSE_CALLED(node)) {
1224       len = SIZE_OP_MEMORY_START_PUSH + tlen
1225 	  + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1226       if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1227 	len += (IS_ENCLOSE_RECURSION(node)
1228 		? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1229       else
1230 	len += (IS_ENCLOSE_RECURSION(node)
1231 		? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1232     }
1233     else
1234 #endif
1235     {
1236       if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1237 	len = SIZE_OP_MEMORY_START_PUSH;
1238       else
1239 	len = SIZE_OP_MEMORY_START;
1240 
1241       len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1242 		     ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1243     }
1244     break;
1245 
1246   case ENCLOSE_STOP_BACKTRACK:
1247     if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1248       QtfrNode* qn = NQTFR(node->target);
1249       tlen = compile_length_tree(qn->target, reg);
1250       if (tlen < 0) return tlen;
1251 
1252       len = tlen * qn->lower
1253 	  + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1254     }
1255     else {
1256       len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1257     }
1258     break;
1259 
1260   default:
1261     return ONIGERR_TYPE_BUG;
1262     break;
1263   }
1264 
1265   return len;
1266 }
1267 
1268 static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1269 
1270 static int
compile_enclose_node(EncloseNode * node,regex_t * reg)1271 compile_enclose_node(EncloseNode* node, regex_t* reg)
1272 {
1273   int r, len;
1274 
1275   if (node->type == ENCLOSE_OPTION)
1276     return compile_option_node(node, reg);
1277 
1278   switch (node->type) {
1279   case ENCLOSE_MEMORY:
1280 #ifdef USE_SUBEXP_CALL
1281     if (IS_ENCLOSE_CALLED(node)) {
1282       r = add_opcode(reg, OP_CALL);
1283       if (r) return r;
1284       node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1285       node->state |= NST_ADDR_FIXED;
1286       r = add_abs_addr(reg, (int )node->call_addr);
1287       if (r) return r;
1288       len = compile_length_tree(node->target, reg);
1289       len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1290       if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1291 	len += (IS_ENCLOSE_RECURSION(node)
1292 		? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1293       else
1294 	len += (IS_ENCLOSE_RECURSION(node)
1295 		? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1296 
1297       r = add_opcode_rel_addr(reg, OP_JUMP, len);
1298       if (r) return r;
1299     }
1300 #endif
1301     if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1302       r = add_opcode(reg, OP_MEMORY_START_PUSH);
1303     else
1304       r = add_opcode(reg, OP_MEMORY_START);
1305     if (r) return r;
1306     r = add_mem_num(reg, node->regnum);
1307     if (r) return r;
1308     r = compile_tree(node->target, reg);
1309     if (r) return r;
1310 #ifdef USE_SUBEXP_CALL
1311     if (IS_ENCLOSE_CALLED(node)) {
1312       if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1313 	r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1314 			     ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1315       else
1316 	r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1317 			     ? OP_MEMORY_END_REC : OP_MEMORY_END));
1318 
1319       if (r) return r;
1320       r = add_mem_num(reg, node->regnum);
1321       if (r) return r;
1322       r = add_opcode(reg, OP_RETURN);
1323     }
1324     else
1325 #endif
1326     {
1327       if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1328 	r = add_opcode(reg, OP_MEMORY_END_PUSH);
1329       else
1330 	r = add_opcode(reg, OP_MEMORY_END);
1331       if (r) return r;
1332       r = add_mem_num(reg, node->regnum);
1333     }
1334     break;
1335 
1336   case ENCLOSE_STOP_BACKTRACK:
1337     if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1338       QtfrNode* qn = NQTFR(node->target);
1339       r = compile_tree_n_times(qn->target, qn->lower, reg);
1340       if (r) return r;
1341 
1342       len = compile_length_tree(qn->target, reg);
1343       if (len < 0) return len;
1344 
1345       r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1346       if (r) return r;
1347       r = compile_tree(qn->target, reg);
1348       if (r) return r;
1349       r = add_opcode(reg, OP_POP);
1350       if (r) return r;
1351       r = add_opcode_rel_addr(reg, OP_JUMP,
1352 	 -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1353     }
1354     else {
1355       r = add_opcode(reg, OP_PUSH_STOP_BT);
1356       if (r) return r;
1357       r = compile_tree(node->target, reg);
1358       if (r) return r;
1359       r = add_opcode(reg, OP_POP_STOP_BT);
1360     }
1361     break;
1362 
1363   default:
1364     return ONIGERR_TYPE_BUG;
1365     break;
1366   }
1367 
1368   return r;
1369 }
1370 
1371 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1372 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1373 {
1374   int len;
1375   int tlen = 0;
1376 
1377   if (node->target) {
1378     tlen = compile_length_tree(node->target, reg);
1379     if (tlen < 0) return tlen;
1380   }
1381 
1382   switch (node->type) {
1383   case ANCHOR_PREC_READ:
1384     len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1385     break;
1386   case ANCHOR_PREC_READ_NOT:
1387     len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1388     break;
1389   case ANCHOR_LOOK_BEHIND:
1390     len = SIZE_OP_LOOK_BEHIND + tlen;
1391     break;
1392   case ANCHOR_LOOK_BEHIND_NOT:
1393     len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1394     break;
1395 
1396   default:
1397     len = SIZE_OPCODE;
1398     break;
1399   }
1400 
1401   return len;
1402 }
1403 
1404 static int
compile_anchor_node(AnchorNode * node,regex_t * reg)1405 compile_anchor_node(AnchorNode* node, regex_t* reg)
1406 {
1407   int r, len;
1408 
1409   switch (node->type) {
1410   case ANCHOR_BEGIN_BUF:      r = add_opcode(reg, OP_BEGIN_BUF);      break;
1411   case ANCHOR_END_BUF:        r = add_opcode(reg, OP_END_BUF);        break;
1412   case ANCHOR_BEGIN_LINE:     r = add_opcode(reg, OP_BEGIN_LINE);     break;
1413   case ANCHOR_END_LINE:       r = add_opcode(reg, OP_END_LINE);       break;
1414   case ANCHOR_SEMI_END_BUF:   r = add_opcode(reg, OP_SEMI_END_BUF);   break;
1415   case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1416 
1417   case ANCHOR_WORD_BOUND:     r = add_opcode(reg, OP_WORD_BOUND);     break;
1418   case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break;
1419 #ifdef USE_WORD_BEGIN_END
1420   case ANCHOR_WORD_BEGIN:     r = add_opcode(reg, OP_WORD_BEGIN);     break;
1421   case ANCHOR_WORD_END:       r = add_opcode(reg, OP_WORD_END);       break;
1422 #endif
1423 
1424   case ANCHOR_PREC_READ:
1425     r = add_opcode(reg, OP_PUSH_POS);
1426     if (r) return r;
1427     r = compile_tree(node->target, reg);
1428     if (r) return r;
1429     r = add_opcode(reg, OP_POP_POS);
1430     break;
1431 
1432   case ANCHOR_PREC_READ_NOT:
1433     len = compile_length_tree(node->target, reg);
1434     if (len < 0) return len;
1435     r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1436     if (r) return r;
1437     r = compile_tree(node->target, reg);
1438     if (r) return r;
1439     r = add_opcode(reg, OP_FAIL_POS);
1440     break;
1441 
1442   case ANCHOR_LOOK_BEHIND:
1443     {
1444       int n;
1445       r = add_opcode(reg, OP_LOOK_BEHIND);
1446       if (r) return r;
1447       if (node->char_len < 0) {
1448 	r = get_char_length_tree(node->target, reg, &n);
1449 	if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1450       }
1451       else
1452 	n = node->char_len;
1453       r = add_length(reg, n);
1454       if (r) return r;
1455       r = compile_tree(node->target, reg);
1456     }
1457     break;
1458 
1459   case ANCHOR_LOOK_BEHIND_NOT:
1460     {
1461       int n;
1462       len = compile_length_tree(node->target, reg);
1463       r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1464 			   len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1465       if (r) return r;
1466       if (node->char_len < 0) {
1467 	r = get_char_length_tree(node->target, reg, &n);
1468 	if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1469       }
1470       else
1471 	n = node->char_len;
1472       r = add_length(reg, n);
1473       if (r) return r;
1474       r = compile_tree(node->target, reg);
1475       if (r) return r;
1476       r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1477     }
1478     break;
1479 
1480   default:
1481     return ONIGERR_TYPE_BUG;
1482     break;
1483   }
1484 
1485   return r;
1486 }
1487 
1488 static int
compile_length_tree(Node * node,regex_t * reg)1489 compile_length_tree(Node* node, regex_t* reg)
1490 {
1491   int len, type, r;
1492 
1493   type = NTYPE(node);
1494   switch (type) {
1495   case NT_LIST:
1496     len = 0;
1497     do {
1498       r = compile_length_tree(NCAR(node), reg);
1499       if (r < 0) return r;
1500       len += r;
1501     } while (IS_NOT_NULL(node = NCDR(node)));
1502     r = len;
1503     break;
1504 
1505   case NT_ALT:
1506     {
1507       int n;
1508 
1509       n = r = 0;
1510       do {
1511 	r += compile_length_tree(NCAR(node), reg);
1512 	n++;
1513       } while (IS_NOT_NULL(node = NCDR(node)));
1514       r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1515     }
1516     break;
1517 
1518   case NT_STR:
1519     if (NSTRING_IS_RAW(node))
1520       r = compile_length_string_raw_node(NSTR(node), reg);
1521     else
1522       r = compile_length_string_node(node, reg);
1523     break;
1524 
1525   case NT_CCLASS:
1526     r = compile_length_cclass_node(NCCLASS(node), reg);
1527     break;
1528 
1529   case NT_CTYPE:
1530   case NT_CANY:
1531     r = SIZE_OPCODE;
1532     break;
1533 
1534   case NT_BREF:
1535     {
1536       BRefNode* br = NBREF(node);
1537 
1538 #ifdef USE_BACKREF_WITH_LEVEL
1539       if (IS_BACKREF_NEST_LEVEL(br)) {
1540         r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1541             SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1542       }
1543       else
1544 #endif
1545       if (br->back_num == 1) {
1546 	r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1547 	     ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1548       }
1549       else {
1550 	r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1551       }
1552     }
1553     break;
1554 
1555 #ifdef USE_SUBEXP_CALL
1556   case NT_CALL:
1557     r = SIZE_OP_CALL;
1558     break;
1559 #endif
1560 
1561   case NT_QTFR:
1562     r = compile_length_quantifier_node(NQTFR(node), reg);
1563     break;
1564 
1565   case NT_ENCLOSE:
1566     r = compile_length_enclose_node(NENCLOSE(node), reg);
1567     break;
1568 
1569   case NT_ANCHOR:
1570     r = compile_length_anchor_node(NANCHOR(node), reg);
1571     break;
1572 
1573   default:
1574     return ONIGERR_TYPE_BUG;
1575     break;
1576   }
1577 
1578   return r;
1579 }
1580 
1581 static int
compile_tree(Node * node,regex_t * reg)1582 compile_tree(Node* node, regex_t* reg)
1583 {
1584   int n, type, len, pos, r = 0;
1585 
1586   type = NTYPE(node);
1587   switch (type) {
1588   case NT_LIST:
1589     do {
1590       r = compile_tree(NCAR(node), reg);
1591     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1592     break;
1593 
1594   case NT_ALT:
1595     {
1596       Node* x = node;
1597       len = 0;
1598       do {
1599 	len += compile_length_tree(NCAR(x), reg);
1600 	if (NCDR(x) != NULL) {
1601 	  len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1602 	}
1603       } while (IS_NOT_NULL(x = NCDR(x)));
1604       pos = reg->used + len;  /* goal position */
1605 
1606       do {
1607 	len = compile_length_tree(NCAR(node), reg);
1608 	if (IS_NOT_NULL(NCDR(node))) {
1609 	  r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1610 	  if (r) break;
1611 	}
1612 	r = compile_tree(NCAR(node), reg);
1613 	if (r) break;
1614 	if (IS_NOT_NULL(NCDR(node))) {
1615 	  len = pos - (reg->used + SIZE_OP_JUMP);
1616 	  r = add_opcode_rel_addr(reg, OP_JUMP, len);
1617 	  if (r) break;
1618 	}
1619       } while (IS_NOT_NULL(node = NCDR(node)));
1620     }
1621     break;
1622 
1623   case NT_STR:
1624     if (NSTRING_IS_RAW(node))
1625       r = compile_string_raw_node(NSTR(node), reg);
1626     else
1627       r = compile_string_node(node, reg);
1628     break;
1629 
1630   case NT_CCLASS:
1631     r = compile_cclass_node(NCCLASS(node), reg);
1632     break;
1633 
1634   case NT_CTYPE:
1635     {
1636       int op;
1637 
1638       switch (NCTYPE(node)->ctype) {
1639       case ONIGENC_CTYPE_WORD:
1640 	if (NCTYPE(node)->not != 0)  op = OP_NOT_WORD;
1641 	else                         op = OP_WORD;
1642 	break;
1643       default:
1644 	return ONIGERR_TYPE_BUG;
1645 	break;
1646       }
1647       r = add_opcode(reg, op);
1648     }
1649     break;
1650 
1651   case NT_CANY:
1652     if (IS_MULTILINE(reg->options))
1653       r = add_opcode(reg, OP_ANYCHAR_ML);
1654     else
1655       r = add_opcode(reg, OP_ANYCHAR);
1656     break;
1657 
1658   case NT_BREF:
1659     {
1660       BRefNode* br = NBREF(node);
1661 
1662 #ifdef USE_BACKREF_WITH_LEVEL
1663       if (IS_BACKREF_NEST_LEVEL(br)) {
1664 	r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
1665 	if (r) return r;
1666 	r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1667 	if (r) return r;
1668 	r = add_length(reg, br->nest_level);
1669 	if (r) return r;
1670 
1671 	goto add_bacref_mems;
1672       }
1673       else
1674 #endif
1675       if (br->back_num == 1) {
1676 	n = br->back_static[0];
1677 	if (IS_IGNORECASE(reg->options)) {
1678 	  r = add_opcode(reg, OP_BACKREFN_IC);
1679 	  if (r) return r;
1680 	  r = add_mem_num(reg, n);
1681 	}
1682 	else {
1683 	  switch (n) {
1684 	  case 1:  r = add_opcode(reg, OP_BACKREF1); break;
1685 	  case 2:  r = add_opcode(reg, OP_BACKREF2); break;
1686 	  default:
1687 	    r = add_opcode(reg, OP_BACKREFN);
1688 	    if (r) return r;
1689 	    r = add_mem_num(reg, n);
1690 	    break;
1691 	  }
1692 	}
1693       }
1694       else {
1695 	int i;
1696 	int* p;
1697 
1698         if (IS_IGNORECASE(reg->options)) {
1699           r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1700         }
1701         else {
1702           r = add_opcode(reg, OP_BACKREF_MULTI);
1703         }
1704 	if (r) return r;
1705 
1706 #ifdef USE_BACKREF_WITH_LEVEL
1707       add_bacref_mems:
1708 #endif
1709 	r = add_length(reg, br->back_num);
1710 	if (r) return r;
1711 	p = BACKREFS_P(br);
1712 	for (i = br->back_num - 1; i >= 0; i--) {
1713 	  r = add_mem_num(reg, p[i]);
1714 	  if (r) return r;
1715 	}
1716       }
1717     }
1718     break;
1719 
1720 #ifdef USE_SUBEXP_CALL
1721   case NT_CALL:
1722     r = compile_call(NCALL(node), reg);
1723     break;
1724 #endif
1725 
1726   case NT_QTFR:
1727     r = compile_quantifier_node(NQTFR(node), reg);
1728     break;
1729 
1730   case NT_ENCLOSE:
1731     r = compile_enclose_node(NENCLOSE(node), reg);
1732     break;
1733 
1734   case NT_ANCHOR:
1735     r = compile_anchor_node(NANCHOR(node), reg);
1736     break;
1737 
1738   default:
1739 #ifdef ONIG_DEBUG
1740     fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1741 #endif
1742     break;
1743   }
1744 
1745   return r;
1746 }
1747 
1748 #ifdef USE_NAMED_GROUP
1749 
1750 static int
noname_disable_map(Node ** plink,GroupNumRemap * map,int * counter)1751 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1752 {
1753   int r = 0;
1754   Node* node = *plink;
1755 
1756   switch (NTYPE(node)) {
1757   case NT_LIST:
1758   case NT_ALT:
1759     do {
1760       r = noname_disable_map(&(NCAR(node)), map, counter);
1761     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1762     break;
1763 
1764   case NT_QTFR:
1765     {
1766       Node** ptarget = &(NQTFR(node)->target);
1767       Node*  old = *ptarget;
1768       r = noname_disable_map(ptarget, map, counter);
1769       if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
1770 	onig_reduce_nested_quantifier(node, *ptarget);
1771       }
1772     }
1773     break;
1774 
1775   case NT_ENCLOSE:
1776     {
1777       EncloseNode* en = NENCLOSE(node);
1778       if (en->type == ENCLOSE_MEMORY) {
1779 	if (IS_ENCLOSE_NAMED_GROUP(en)) {
1780 	  (*counter)++;
1781 	  map[en->regnum].new_val = *counter;
1782 	  en->regnum = *counter;
1783 	  r = noname_disable_map(&(en->target), map, counter);
1784 	}
1785 	else {
1786 	  *plink = en->target;
1787 	  en->target = NULL_NODE;
1788 	  onig_node_free(node);
1789 	  r = noname_disable_map(plink, map, counter);
1790 	}
1791       }
1792       else
1793 	r = noname_disable_map(&(en->target), map, counter);
1794     }
1795     break;
1796 
1797   default:
1798     break;
1799   }
1800 
1801   return r;
1802 }
1803 
1804 static int
renumber_node_backref(Node * node,GroupNumRemap * map)1805 renumber_node_backref(Node* node, GroupNumRemap* map)
1806 {
1807   int i, pos, n, old_num;
1808   int *backs;
1809   BRefNode* bn = NBREF(node);
1810 
1811   if (! IS_BACKREF_NAME_REF(bn))
1812     return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1813 
1814   old_num = bn->back_num;
1815   if (IS_NULL(bn->back_dynamic))
1816     backs = bn->back_static;
1817   else
1818     backs = bn->back_dynamic;
1819 
1820   for (i = 0, pos = 0; i < old_num; i++) {
1821     n = map[backs[i]].new_val;
1822     if (n > 0) {
1823       backs[pos] = n;
1824       pos++;
1825     }
1826   }
1827 
1828   bn->back_num = pos;
1829   return 0;
1830 }
1831 
1832 static int
renumber_by_map(Node * node,GroupNumRemap * map)1833 renumber_by_map(Node* node, GroupNumRemap* map)
1834 {
1835   int r = 0;
1836 
1837   switch (NTYPE(node)) {
1838   case NT_LIST:
1839   case NT_ALT:
1840     do {
1841       r = renumber_by_map(NCAR(node), map);
1842     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1843     break;
1844   case NT_QTFR:
1845     r = renumber_by_map(NQTFR(node)->target, map);
1846     break;
1847   case NT_ENCLOSE:
1848     r = renumber_by_map(NENCLOSE(node)->target, map);
1849     break;
1850 
1851   case NT_BREF:
1852     r = renumber_node_backref(node, map);
1853     break;
1854 
1855   default:
1856     break;
1857   }
1858 
1859   return r;
1860 }
1861 
1862 static int
numbered_ref_check(Node * node)1863 numbered_ref_check(Node* node)
1864 {
1865   int r = 0;
1866 
1867   switch (NTYPE(node)) {
1868   case NT_LIST:
1869   case NT_ALT:
1870     do {
1871       r = numbered_ref_check(NCAR(node));
1872     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1873     break;
1874   case NT_QTFR:
1875     r = numbered_ref_check(NQTFR(node)->target);
1876     break;
1877   case NT_ENCLOSE:
1878     r = numbered_ref_check(NENCLOSE(node)->target);
1879     break;
1880 
1881   case NT_BREF:
1882     if (! IS_BACKREF_NAME_REF(NBREF(node)))
1883       return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1884     break;
1885 
1886   default:
1887     break;
1888   }
1889 
1890   return r;
1891 }
1892 
1893 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ScanEnv * env)1894 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
1895 {
1896   int r, i, pos, counter;
1897   BitStatusType loc;
1898   GroupNumRemap* map;
1899 
1900   map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
1901   CHECK_NULL_RETURN_MEMERR(map);
1902   for (i = 1; i <= env->num_mem; i++) {
1903     map[i].new_val = 0;
1904   }
1905   counter = 0;
1906   r = noname_disable_map(root, map, &counter);
1907   if (r != 0) return r;
1908 
1909   r = renumber_by_map(*root, map);
1910   if (r != 0) return r;
1911 
1912   for (i = 1, pos = 1; i <= env->num_mem; i++) {
1913     if (map[i].new_val > 0) {
1914       SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
1915       pos++;
1916     }
1917   }
1918 
1919   loc = env->capture_history;
1920   BIT_STATUS_CLEAR(env->capture_history);
1921   for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
1922     if (BIT_STATUS_AT(loc, i)) {
1923       BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
1924     }
1925   }
1926 
1927   env->num_mem = env->num_named;
1928   reg->num_mem = env->num_named;
1929 
1930   return onig_renumber_name_table(reg, map);
1931 }
1932 #endif /* USE_NAMED_GROUP */
1933 
1934 #ifdef USE_SUBEXP_CALL
1935 static int
unset_addr_list_fix(UnsetAddrList * uslist,regex_t * reg)1936 unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
1937 {
1938   int i, offset;
1939   EncloseNode* en;
1940   AbsAddrType addr;
1941 
1942   for (i = 0; i < uslist->num; i++) {
1943     en = NENCLOSE(uslist->us[i].target);
1944     if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
1945     addr = en->call_addr;
1946     offset = uslist->us[i].offset;
1947 
1948     BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
1949   }
1950   return 0;
1951 }
1952 #endif
1953 
1954 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
1955 static int
quantifiers_memory_node_info(Node * node)1956 quantifiers_memory_node_info(Node* node)
1957 {
1958   int r = 0;
1959 
1960   switch (NTYPE(node)) {
1961   case NT_LIST:
1962   case NT_ALT:
1963     {
1964       int v;
1965       do {
1966 	v = quantifiers_memory_node_info(NCAR(node));
1967 	if (v > r) r = v;
1968       } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
1969     }
1970     break;
1971 
1972 #ifdef USE_SUBEXP_CALL
1973   case NT_CALL:
1974     if (IS_CALL_RECURSION(NCALL(node))) {
1975       return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
1976     }
1977     else
1978       r = quantifiers_memory_node_info(NCALL(node)->target);
1979     break;
1980 #endif
1981 
1982   case NT_QTFR:
1983     {
1984       QtfrNode* qn = NQTFR(node);
1985       if (qn->upper != 0) {
1986 	r = quantifiers_memory_node_info(qn->target);
1987       }
1988     }
1989     break;
1990 
1991   case NT_ENCLOSE:
1992     {
1993       EncloseNode* en = NENCLOSE(node);
1994       switch (en->type) {
1995       case ENCLOSE_MEMORY:
1996 	return NQ_TARGET_IS_EMPTY_MEM;
1997 	break;
1998 
1999       case ENCLOSE_OPTION:
2000       case ENCLOSE_STOP_BACKTRACK:
2001 	r = quantifiers_memory_node_info(en->target);
2002 	break;
2003       default:
2004 	break;
2005       }
2006     }
2007     break;
2008 
2009   case NT_BREF:
2010   case NT_STR:
2011   case NT_CTYPE:
2012   case NT_CCLASS:
2013   case NT_CANY:
2014   case NT_ANCHOR:
2015   default:
2016     break;
2017   }
2018 
2019   return r;
2020 }
2021 #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
2022 
2023 static int
get_min_match_length(Node * node,OnigDistance * min,ScanEnv * env)2024 get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
2025 {
2026   OnigDistance tmin;
2027   int r = 0;
2028 
2029   *min = 0;
2030   switch (NTYPE(node)) {
2031   case NT_BREF:
2032     {
2033       int i;
2034       int* backs;
2035       Node** nodes = SCANENV_MEM_NODES(env);
2036       BRefNode* br = NBREF(node);
2037       if (br->state & NST_RECURSION) break;
2038 
2039       backs = BACKREFS_P(br);
2040       if (backs[0] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
2041       r = get_min_match_length(nodes[backs[0]], min, env);
2042       if (r != 0) break;
2043       for (i = 1; i < br->back_num; i++) {
2044 	if (backs[i] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
2045 	r = get_min_match_length(nodes[backs[i]], &tmin, env);
2046 	if (r != 0) break;
2047 	if (*min > tmin) *min = tmin;
2048       }
2049     }
2050     break;
2051 
2052 #ifdef USE_SUBEXP_CALL
2053   case NT_CALL:
2054     if (IS_CALL_RECURSION(NCALL(node))) {
2055       EncloseNode* en = NENCLOSE(NCALL(node)->target);
2056       if (IS_ENCLOSE_MIN_FIXED(en))
2057 	*min = en->min_len;
2058     }
2059     else
2060       r = get_min_match_length(NCALL(node)->target, min, env);
2061     break;
2062 #endif
2063 
2064   case NT_LIST:
2065     do {
2066       r = get_min_match_length(NCAR(node), &tmin, env);
2067       if (r == 0) *min += tmin;
2068     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2069     break;
2070 
2071   case NT_ALT:
2072     {
2073       Node *x, *y;
2074       y = node;
2075       do {
2076 	x = NCAR(y);
2077 	r = get_min_match_length(x, &tmin, env);
2078 	if (r != 0) break;
2079 	if (y == node) *min = tmin;
2080 	else if (*min > tmin) *min = tmin;
2081       } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
2082     }
2083     break;
2084 
2085   case NT_STR:
2086     {
2087       StrNode* sn = NSTR(node);
2088       *min = sn->end - sn->s;
2089     }
2090     break;
2091 
2092   case NT_CTYPE:
2093     *min = 1;
2094     break;
2095 
2096   case NT_CCLASS:
2097   case NT_CANY:
2098     *min = 1;
2099     break;
2100 
2101   case NT_QTFR:
2102     {
2103       QtfrNode* qn = NQTFR(node);
2104 
2105       if (qn->lower > 0) {
2106 	r = get_min_match_length(qn->target, min, env);
2107 	if (r == 0)
2108 	  *min = distance_multiply(*min, qn->lower);
2109       }
2110     }
2111     break;
2112 
2113   case NT_ENCLOSE:
2114     {
2115       EncloseNode* en = NENCLOSE(node);
2116       switch (en->type) {
2117       case ENCLOSE_MEMORY:
2118 #ifdef USE_SUBEXP_CALL
2119 	if (IS_ENCLOSE_MIN_FIXED(en))
2120 	  *min = en->min_len;
2121 	else {
2122 	  r = get_min_match_length(en->target, min, env);
2123 	  if (r == 0) {
2124 	    en->min_len = *min;
2125 	    SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
2126 	  }
2127 	}
2128 	break;
2129 #endif
2130       case ENCLOSE_OPTION:
2131       case ENCLOSE_STOP_BACKTRACK:
2132 	r = get_min_match_length(en->target, min, env);
2133 	break;
2134       }
2135     }
2136     break;
2137 
2138   case NT_ANCHOR:
2139   default:
2140     break;
2141   }
2142 
2143   return r;
2144 }
2145 
2146 static int
get_max_match_length(Node * node,OnigDistance * max,ScanEnv * env)2147 get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
2148 {
2149   OnigDistance tmax;
2150   int r = 0;
2151 
2152   *max = 0;
2153   switch (NTYPE(node)) {
2154   case NT_LIST:
2155     do {
2156       r = get_max_match_length(NCAR(node), &tmax, env);
2157       if (r == 0)
2158 	*max = distance_add(*max, tmax);
2159     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2160     break;
2161 
2162   case NT_ALT:
2163     do {
2164       r = get_max_match_length(NCAR(node), &tmax, env);
2165       if (r == 0 && *max < tmax) *max = tmax;
2166     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2167     break;
2168 
2169   case NT_STR:
2170     {
2171       StrNode* sn = NSTR(node);
2172       *max = sn->end - sn->s;
2173     }
2174     break;
2175 
2176   case NT_CTYPE:
2177     *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2178     break;
2179 
2180   case NT_CCLASS:
2181   case NT_CANY:
2182     *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2183     break;
2184 
2185   case NT_BREF:
2186     {
2187       int i;
2188       int* backs;
2189       Node** nodes = SCANENV_MEM_NODES(env);
2190       BRefNode* br = NBREF(node);
2191       if (br->state & NST_RECURSION) {
2192 	*max = ONIG_INFINITE_DISTANCE;
2193 	break;
2194       }
2195       backs = BACKREFS_P(br);
2196       for (i = 0; i < br->back_num; i++) {
2197 	if (backs[i] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
2198 	r = get_max_match_length(nodes[backs[i]], &tmax, env);
2199 	if (r != 0) break;
2200 	if (*max < tmax) *max = tmax;
2201       }
2202     }
2203     break;
2204 
2205 #ifdef USE_SUBEXP_CALL
2206   case NT_CALL:
2207     if (! IS_CALL_RECURSION(NCALL(node)))
2208       r = get_max_match_length(NCALL(node)->target, max, env);
2209     else
2210       *max = ONIG_INFINITE_DISTANCE;
2211     break;
2212 #endif
2213 
2214   case NT_QTFR:
2215     {
2216       QtfrNode* qn = NQTFR(node);
2217 
2218       if (qn->upper != 0) {
2219 	r = get_max_match_length(qn->target, max, env);
2220 	if (r == 0 && *max != 0) {
2221 	  if (! IS_REPEAT_INFINITE(qn->upper))
2222 	    *max = distance_multiply(*max, qn->upper);
2223 	  else
2224 	    *max = ONIG_INFINITE_DISTANCE;
2225 	}
2226       }
2227     }
2228     break;
2229 
2230   case NT_ENCLOSE:
2231     {
2232       EncloseNode* en = NENCLOSE(node);
2233       switch (en->type) {
2234       case ENCLOSE_MEMORY:
2235 #ifdef USE_SUBEXP_CALL
2236 	if (IS_ENCLOSE_MAX_FIXED(en))
2237 	  *max = en->max_len;
2238 	else {
2239 	  r = get_max_match_length(en->target, max, env);
2240 	  if (r == 0) {
2241 	    en->max_len = *max;
2242 	    SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
2243 	  }
2244 	}
2245 	break;
2246 #endif
2247       case ENCLOSE_OPTION:
2248       case ENCLOSE_STOP_BACKTRACK:
2249 	r = get_max_match_length(en->target, max, env);
2250 	break;
2251       }
2252     }
2253     break;
2254 
2255   case NT_ANCHOR:
2256   default:
2257     break;
2258   }
2259 
2260   return r;
2261 }
2262 
2263 #define GET_CHAR_LEN_VARLEN           -1
2264 #define GET_CHAR_LEN_TOP_ALT_VARLEN   -2
2265 
2266 /* fixed size pattern node only */
2267 static int
get_char_length_tree1(Node * node,regex_t * reg,int * len,int level)2268 get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2269 {
2270   int tlen;
2271   int r = 0;
2272 
2273   level++;
2274   *len = 0;
2275   switch (NTYPE(node)) {
2276   case NT_LIST:
2277     do {
2278       r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2279       if (r == 0)
2280 	*len = distance_add(*len, tlen);
2281     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2282     break;
2283 
2284   case NT_ALT:
2285     {
2286       int tlen2;
2287       int varlen = 0;
2288 
2289       r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2290       while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
2291 	r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
2292 	if (r == 0) {
2293 	  if (tlen != tlen2)
2294 	    varlen = 1;
2295 	}
2296       }
2297       if (r == 0) {
2298 	if (varlen != 0) {
2299 	  if (level == 1)
2300 	    r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2301 	  else
2302 	    r = GET_CHAR_LEN_VARLEN;
2303 	}
2304 	else
2305 	  *len = tlen;
2306       }
2307     }
2308     break;
2309 
2310   case NT_STR:
2311     {
2312       StrNode* sn = NSTR(node);
2313       UChar *s = sn->s;
2314       while (s < sn->end) {
2315 	s += enclen(reg->enc, s);
2316 	(*len)++;
2317       }
2318     }
2319     break;
2320 
2321   case NT_QTFR:
2322     {
2323       QtfrNode* qn = NQTFR(node);
2324       if (qn->lower == qn->upper) {
2325 	r = get_char_length_tree1(qn->target, reg, &tlen, level);
2326 	if (r == 0)
2327 	  *len = distance_multiply(tlen, qn->lower);
2328       }
2329       else
2330 	r = GET_CHAR_LEN_VARLEN;
2331     }
2332     break;
2333 
2334 #ifdef USE_SUBEXP_CALL
2335   case NT_CALL:
2336     if (! IS_CALL_RECURSION(NCALL(node)))
2337       r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
2338     else
2339       r = GET_CHAR_LEN_VARLEN;
2340     break;
2341 #endif
2342 
2343   case NT_CTYPE:
2344     *len = 1;
2345     break;
2346 
2347   case NT_CCLASS:
2348   case NT_CANY:
2349     *len = 1;
2350     break;
2351 
2352   case NT_ENCLOSE:
2353     {
2354       EncloseNode* en = NENCLOSE(node);
2355       switch (en->type) {
2356       case ENCLOSE_MEMORY:
2357 #ifdef USE_SUBEXP_CALL
2358 	if (IS_ENCLOSE_CLEN_FIXED(en))
2359 	  *len = en->char_len;
2360 	else {
2361 	  r = get_char_length_tree1(en->target, reg, len, level);
2362 	  if (r == 0) {
2363 	    en->char_len = *len;
2364 	    SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
2365 	  }
2366 	}
2367 	break;
2368 #endif
2369       case ENCLOSE_OPTION:
2370       case ENCLOSE_STOP_BACKTRACK:
2371 	r = get_char_length_tree1(en->target, reg, len, level);
2372 	break;
2373       default:
2374 	break;
2375       }
2376     }
2377     break;
2378 
2379   case NT_ANCHOR:
2380     break;
2381 
2382   default:
2383     r = GET_CHAR_LEN_VARLEN;
2384     break;
2385   }
2386 
2387   return r;
2388 }
2389 
2390 static int
get_char_length_tree(Node * node,regex_t * reg,int * len)2391 get_char_length_tree(Node* node, regex_t* reg, int* len)
2392 {
2393   return get_char_length_tree1(node, reg, len, 0);
2394 }
2395 
2396 /* x is not included y ==>  1 : 0 */
2397 static int
is_not_included(Node * x,Node * y,regex_t * reg)2398 is_not_included(Node* x, Node* y, regex_t* reg)
2399 {
2400   int i, len;
2401   OnigCodePoint code;
2402   UChar *p, c;
2403   int ytype;
2404 
2405  retry:
2406   ytype = NTYPE(y);
2407   switch (NTYPE(x)) {
2408   case NT_CTYPE:
2409     {
2410       switch (ytype) {
2411       case NT_CTYPE:
2412 	if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
2413 	    NCTYPE(y)->not   != NCTYPE(x)->not)
2414 	  return 1;
2415 	else
2416 	  return 0;
2417 	break;
2418 
2419       case NT_CCLASS:
2420       swap:
2421 	{
2422 	  Node* tmp;
2423 	  tmp = x; x = y; y = tmp;
2424 	  goto retry;
2425 	}
2426 	break;
2427 
2428       case NT_STR:
2429 	goto swap;
2430 	break;
2431 
2432       default:
2433 	break;
2434       }
2435     }
2436     break;
2437 
2438   case NT_CCLASS:
2439     {
2440       CClassNode* xc = NCCLASS(x);
2441       switch (ytype) {
2442       case NT_CTYPE:
2443 	switch (NCTYPE(y)->ctype) {
2444 	case ONIGENC_CTYPE_WORD:
2445 	  if (NCTYPE(y)->not == 0) {
2446 	    if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2447 	      for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2448 		if (BITSET_AT(xc->bs, i)) {
2449 		  if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
2450 		}
2451 	      }
2452 	      return 1;
2453 	    }
2454 	    return 0;
2455 	  }
2456 	  else {
2457 	    for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2458 	      if (! IS_CODE_SB_WORD(reg->enc, i)) {
2459 		if (!IS_NCCLASS_NOT(xc)) {
2460 		  if (BITSET_AT(xc->bs, i))
2461 		    return 0;
2462 		}
2463 		else {
2464 		  if (! BITSET_AT(xc->bs, i))
2465 		    return 0;
2466 		}
2467 	      }
2468 	    }
2469 	    return 1;
2470 	  }
2471 	  break;
2472 
2473 	default:
2474 	  break;
2475 	}
2476 	break;
2477 
2478       case NT_CCLASS:
2479 	{
2480 	  int v;
2481 	  CClassNode* yc = NCCLASS(y);
2482 
2483 	  for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2484 	    v = BITSET_AT(xc->bs, i);
2485 	    if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
2486                 (v == 0 && IS_NCCLASS_NOT(xc))) {
2487 	      v = BITSET_AT(yc->bs, i);
2488 	      if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2489                   (v == 0 && IS_NCCLASS_NOT(yc)))
2490 		return 0;
2491 	    }
2492 	  }
2493 	  if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2494 	      (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2495 	    return 1;
2496 	  return 0;
2497 	}
2498 	break;
2499 
2500       case NT_STR:
2501 	goto swap;
2502 	break;
2503 
2504       default:
2505 	break;
2506       }
2507     }
2508     break;
2509 
2510   case NT_STR:
2511     {
2512       StrNode* xs = NSTR(x);
2513       if (NSTRING_LEN(x) == 0)
2514 	break;
2515 
2516       c = *(xs->s);
2517       switch (ytype) {
2518       case NT_CTYPE:
2519 	switch (NCTYPE(y)->ctype) {
2520 	case ONIGENC_CTYPE_WORD:
2521 	  if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2522 	    return NCTYPE(y)->not;
2523 	  else
2524 	    return !(NCTYPE(y)->not);
2525 	  break;
2526 	default:
2527 	  break;
2528 	}
2529 	break;
2530 
2531       case NT_CCLASS:
2532 	{
2533 	  CClassNode* cc = NCCLASS(y);
2534 
2535 	  code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2536 				     xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2537 	  return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2538 	}
2539 	break;
2540 
2541       case NT_STR:
2542 	{
2543 	  UChar *q;
2544 	  StrNode* ys = NSTR(y);
2545 	  len = NSTRING_LEN(x);
2546 	  if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2547 	  if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2548             /* tiny version */
2549             return 0;
2550 	  }
2551 	  else {
2552 	    for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2553 	      if (*p != *q) return 1;
2554 	    }
2555 	  }
2556 	}
2557 	break;
2558 
2559       default:
2560 	break;
2561       }
2562     }
2563     break;
2564 
2565   default:
2566     break;
2567   }
2568 
2569   return 0;
2570 }
2571 
2572 static Node*
get_head_value_node(Node * node,int exact,regex_t * reg)2573 get_head_value_node(Node* node, int exact, regex_t* reg)
2574 {
2575   Node* n = NULL_NODE;
2576 
2577   switch (NTYPE(node)) {
2578   case NT_BREF:
2579   case NT_ALT:
2580   case NT_CANY:
2581 #ifdef USE_SUBEXP_CALL
2582   case NT_CALL:
2583 #endif
2584     break;
2585 
2586   case NT_CTYPE:
2587   case NT_CCLASS:
2588     if (exact == 0) {
2589       n = node;
2590     }
2591     break;
2592 
2593   case NT_LIST:
2594     n = get_head_value_node(NCAR(node), exact, reg);
2595     break;
2596 
2597   case NT_STR:
2598     {
2599       StrNode* sn = NSTR(node);
2600 
2601       if (sn->end <= sn->s)
2602 	break;
2603 
2604       if (exact != 0 &&
2605 	  !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
2606       }
2607       else {
2608 	n = node;
2609       }
2610     }
2611     break;
2612 
2613   case NT_QTFR:
2614     {
2615       QtfrNode* qn = NQTFR(node);
2616       if (qn->lower > 0) {
2617 	if (IS_NOT_NULL(qn->head_exact))
2618 	  n = qn->head_exact;
2619 	else
2620 	  n = get_head_value_node(qn->target, exact, reg);
2621       }
2622     }
2623     break;
2624 
2625   case NT_ENCLOSE:
2626     {
2627       EncloseNode* en = NENCLOSE(node);
2628       switch (en->type) {
2629       case ENCLOSE_OPTION:
2630 	{
2631 	  OnigOptionType options = reg->options;
2632 
2633 	  reg->options = NENCLOSE(node)->option;
2634 	  n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
2635 	  reg->options = options;
2636 	}
2637 	break;
2638 
2639       case ENCLOSE_MEMORY:
2640       case ENCLOSE_STOP_BACKTRACK:
2641 	n = get_head_value_node(en->target, exact, reg);
2642 	break;
2643       }
2644     }
2645     break;
2646 
2647   case NT_ANCHOR:
2648     if (NANCHOR(node)->type == ANCHOR_PREC_READ)
2649       n = get_head_value_node(NANCHOR(node)->target, exact, reg);
2650     break;
2651 
2652   default:
2653     break;
2654   }
2655 
2656   return n;
2657 }
2658 
2659 static int
check_type_tree(Node * node,int type_mask,int enclose_mask,int anchor_mask)2660 check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
2661 {
2662   int type, r = 0;
2663 
2664   type = NTYPE(node);
2665   if ((NTYPE2BIT(type) & type_mask) == 0)
2666     return 1;
2667 
2668   switch (type) {
2669   case NT_LIST:
2670   case NT_ALT:
2671     do {
2672       r = check_type_tree(NCAR(node), type_mask, enclose_mask,
2673 			  anchor_mask);
2674     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2675     break;
2676 
2677   case NT_QTFR:
2678     r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
2679 			anchor_mask);
2680     break;
2681 
2682   case NT_ENCLOSE:
2683     {
2684       EncloseNode* en = NENCLOSE(node);
2685       if ((en->type & enclose_mask) == 0)
2686 	return 1;
2687 
2688       r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
2689     }
2690     break;
2691 
2692   case NT_ANCHOR:
2693     type = NANCHOR(node)->type;
2694     if ((type & anchor_mask) == 0)
2695       return 1;
2696 
2697     if (NANCHOR(node)->target)
2698       r = check_type_tree(NANCHOR(node)->target,
2699 			  type_mask, enclose_mask, anchor_mask);
2700     break;
2701 
2702   default:
2703     break;
2704   }
2705   return r;
2706 }
2707 
2708 #ifdef USE_SUBEXP_CALL
2709 
2710 #define RECURSION_EXIST       1
2711 #define RECURSION_INFINITE    2
2712 
2713 static int
subexp_inf_recursive_check(Node * node,ScanEnv * env,int head)2714 subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2715 {
2716   int type;
2717   int r = 0;
2718 
2719   type = NTYPE(node);
2720   switch (type) {
2721   case NT_LIST:
2722     {
2723       Node *x;
2724       OnigDistance min;
2725       int ret;
2726 
2727       x = node;
2728       do {
2729 	ret = subexp_inf_recursive_check(NCAR(x), env, head);
2730 	if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2731 	r |= ret;
2732 	if (head) {
2733 	  ret = get_min_match_length(NCAR(x), &min, env);
2734 	  if (ret != 0) return ret;
2735 	  if (min != 0) head = 0;
2736 	}
2737       } while (IS_NOT_NULL(x = NCDR(x)));
2738     }
2739     break;
2740 
2741   case NT_ALT:
2742     {
2743       int ret;
2744       r = RECURSION_EXIST;
2745       do {
2746 	ret = subexp_inf_recursive_check(NCAR(node), env, head);
2747 	if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2748 	r &= ret;
2749       } while (IS_NOT_NULL(node = NCDR(node)));
2750     }
2751     break;
2752 
2753   case NT_QTFR:
2754     r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
2755     if (r == RECURSION_EXIST) {
2756       if (NQTFR(node)->lower == 0) r = 0;
2757     }
2758     break;
2759 
2760   case NT_ANCHOR:
2761     {
2762       AnchorNode* an = NANCHOR(node);
2763       switch (an->type) {
2764       case ANCHOR_PREC_READ:
2765       case ANCHOR_PREC_READ_NOT:
2766       case ANCHOR_LOOK_BEHIND:
2767       case ANCHOR_LOOK_BEHIND_NOT:
2768 	r = subexp_inf_recursive_check(an->target, env, head);
2769 	break;
2770       }
2771     }
2772     break;
2773 
2774   case NT_CALL:
2775     r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
2776     break;
2777 
2778   case NT_ENCLOSE:
2779     if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2780       return 0;
2781     else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2782       return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2783     else {
2784       SET_ENCLOSE_STATUS(node, NST_MARK2);
2785       r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
2786       CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2787     }
2788     break;
2789 
2790   default:
2791     break;
2792   }
2793 
2794   return r;
2795 }
2796 
2797 static int
subexp_inf_recursive_check_trav(Node * node,ScanEnv * env)2798 subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
2799 {
2800   int type;
2801   int r = 0;
2802 
2803   type = NTYPE(node);
2804   switch (type) {
2805   case NT_LIST:
2806   case NT_ALT:
2807     do {
2808       r = subexp_inf_recursive_check_trav(NCAR(node), env);
2809     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2810     break;
2811 
2812   case NT_QTFR:
2813     r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
2814     break;
2815 
2816   case NT_ANCHOR:
2817     {
2818       AnchorNode* an = NANCHOR(node);
2819       switch (an->type) {
2820       case ANCHOR_PREC_READ:
2821       case ANCHOR_PREC_READ_NOT:
2822       case ANCHOR_LOOK_BEHIND:
2823       case ANCHOR_LOOK_BEHIND_NOT:
2824 	r = subexp_inf_recursive_check_trav(an->target, env);
2825 	break;
2826       }
2827     }
2828     break;
2829 
2830   case NT_ENCLOSE:
2831     {
2832       EncloseNode* en = NENCLOSE(node);
2833 
2834       if (IS_ENCLOSE_RECURSION(en)) {
2835 	SET_ENCLOSE_STATUS(node, NST_MARK1);
2836 	r = subexp_inf_recursive_check(en->target, env, 1);
2837 	if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
2838 	CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2839       }
2840       r = subexp_inf_recursive_check_trav(en->target, env);
2841     }
2842 
2843     break;
2844 
2845   default:
2846     break;
2847   }
2848 
2849   return r;
2850 }
2851 
2852 static int
subexp_recursive_check(Node * node)2853 subexp_recursive_check(Node* node)
2854 {
2855   int r = 0;
2856 
2857   switch (NTYPE(node)) {
2858   case NT_LIST:
2859   case NT_ALT:
2860     do {
2861       r |= subexp_recursive_check(NCAR(node));
2862     } while (IS_NOT_NULL(node = NCDR(node)));
2863     break;
2864 
2865   case NT_QTFR:
2866     r = subexp_recursive_check(NQTFR(node)->target);
2867     break;
2868 
2869   case NT_ANCHOR:
2870     {
2871       AnchorNode* an = NANCHOR(node);
2872       switch (an->type) {
2873       case ANCHOR_PREC_READ:
2874       case ANCHOR_PREC_READ_NOT:
2875       case ANCHOR_LOOK_BEHIND:
2876       case ANCHOR_LOOK_BEHIND_NOT:
2877 	r = subexp_recursive_check(an->target);
2878 	break;
2879       }
2880     }
2881     break;
2882 
2883   case NT_CALL:
2884     r = subexp_recursive_check(NCALL(node)->target);
2885     if (r != 0) SET_CALL_RECURSION(node);
2886     break;
2887 
2888   case NT_ENCLOSE:
2889     if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2890       return 0;
2891     else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2892       return 1; /* recursion */
2893     else {
2894       SET_ENCLOSE_STATUS(node, NST_MARK2);
2895       r = subexp_recursive_check(NENCLOSE(node)->target);
2896       CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2897     }
2898     break;
2899 
2900   default:
2901     break;
2902   }
2903 
2904   return r;
2905 }
2906 
2907 
2908 static int
subexp_recursive_check_trav(Node * node,ScanEnv * env)2909 subexp_recursive_check_trav(Node* node, ScanEnv* env)
2910 {
2911 #define FOUND_CALLED_NODE    1
2912 
2913   int type;
2914   int r = 0;
2915 
2916   type = NTYPE(node);
2917   switch (type) {
2918   case NT_LIST:
2919   case NT_ALT:
2920     {
2921       int ret;
2922       do {
2923 	ret = subexp_recursive_check_trav(NCAR(node), env);
2924 	if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
2925 	else if (ret < 0) return ret;
2926       } while (IS_NOT_NULL(node = NCDR(node)));
2927     }
2928     break;
2929 
2930   case NT_QTFR:
2931     r = subexp_recursive_check_trav(NQTFR(node)->target, env);
2932     if (NQTFR(node)->upper == 0) {
2933       if (r == FOUND_CALLED_NODE)
2934 	NQTFR(node)->is_refered = 1;
2935     }
2936     break;
2937 
2938   case NT_ANCHOR:
2939     {
2940       AnchorNode* an = NANCHOR(node);
2941       switch (an->type) {
2942       case ANCHOR_PREC_READ:
2943       case ANCHOR_PREC_READ_NOT:
2944       case ANCHOR_LOOK_BEHIND:
2945       case ANCHOR_LOOK_BEHIND_NOT:
2946 	r = subexp_recursive_check_trav(an->target, env);
2947 	break;
2948       }
2949     }
2950     break;
2951 
2952   case NT_ENCLOSE:
2953     {
2954       EncloseNode* en = NENCLOSE(node);
2955 
2956       if (! IS_ENCLOSE_RECURSION(en)) {
2957 	if (IS_ENCLOSE_CALLED(en)) {
2958 	  SET_ENCLOSE_STATUS(node, NST_MARK1);
2959 	  r = subexp_recursive_check(en->target);
2960 	  if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
2961 	  CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2962 	}
2963       }
2964       r = subexp_recursive_check_trav(en->target, env);
2965       if (IS_ENCLOSE_CALLED(en))
2966 	r |= FOUND_CALLED_NODE;
2967     }
2968     break;
2969 
2970   default:
2971     break;
2972   }
2973 
2974   return r;
2975 }
2976 
2977 static int
setup_subexp_call(Node * node,ScanEnv * env)2978 setup_subexp_call(Node* node, ScanEnv* env)
2979 {
2980   int type;
2981   int r = 0;
2982 
2983   type = NTYPE(node);
2984   switch (type) {
2985   case NT_LIST:
2986     do {
2987       r = setup_subexp_call(NCAR(node), env);
2988     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2989     break;
2990 
2991   case NT_ALT:
2992     do {
2993       r = setup_subexp_call(NCAR(node), env);
2994     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2995     break;
2996 
2997   case NT_QTFR:
2998     r = setup_subexp_call(NQTFR(node)->target, env);
2999     break;
3000   case NT_ENCLOSE:
3001     r = setup_subexp_call(NENCLOSE(node)->target, env);
3002     break;
3003 
3004   case NT_CALL:
3005     {
3006       CallNode* cn = NCALL(node);
3007       Node** nodes = SCANENV_MEM_NODES(env);
3008 
3009       if (cn->group_num != 0) {
3010 	int gnum = cn->group_num;
3011 
3012 #ifdef USE_NAMED_GROUP
3013 	if (env->num_named > 0 &&
3014 	    IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3015 	    !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3016 	  return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3017 	}
3018 #endif
3019 	if (gnum > env->num_mem) {
3020 	  onig_scan_env_set_error_string(env,
3021 		 ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3022 	  return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3023 	}
3024 
3025 #ifdef USE_NAMED_GROUP
3026       set_call_attr:
3027 #endif
3028 	cn->target = nodes[cn->group_num];
3029 	if (IS_NULL(cn->target)) {
3030 	  onig_scan_env_set_error_string(env,
3031 		 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3032 	  return ONIGERR_UNDEFINED_NAME_REFERENCE;
3033 	}
3034 	SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
3035 	BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
3036 	cn->unset_addr_list = env->unset_addr_list;
3037       }
3038 #ifdef USE_NAMED_GROUP
3039       else {
3040 	int *refs;
3041 
3042 	int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
3043 					   &refs);
3044 	if (n <= 0) {
3045 	  onig_scan_env_set_error_string(env,
3046 		 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3047 	  return ONIGERR_UNDEFINED_NAME_REFERENCE;
3048 	}
3049 	else if (n > 1) {
3050 	  onig_scan_env_set_error_string(env,
3051 	    ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3052 	  return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3053 	}
3054 	else {
3055 	  cn->group_num = refs[0];
3056 	  goto set_call_attr;
3057 	}
3058       }
3059 #endif
3060     }
3061     break;
3062 
3063   case NT_ANCHOR:
3064     {
3065       AnchorNode* an = NANCHOR(node);
3066 
3067       switch (an->type) {
3068       case ANCHOR_PREC_READ:
3069       case ANCHOR_PREC_READ_NOT:
3070       case ANCHOR_LOOK_BEHIND:
3071       case ANCHOR_LOOK_BEHIND_NOT:
3072 	r = setup_subexp_call(an->target, env);
3073 	break;
3074       }
3075     }
3076     break;
3077 
3078   default:
3079     break;
3080   }
3081 
3082   return r;
3083 }
3084 #endif
3085 
3086 /* divide different length alternatives in look-behind.
3087   (?<=A|B) ==> (?<=A)|(?<=B)
3088   (?<!A|B) ==> (?<!A)(?<!B)
3089 */
3090 static int
divide_look_behind_alternatives(Node * node)3091 divide_look_behind_alternatives(Node* node)
3092 {
3093   Node *head, *np, *insert_node;
3094   AnchorNode* an = NANCHOR(node);
3095   int anc_type = an->type;
3096 
3097   head = an->target;
3098   np = NCAR(head);
3099   swap_node(node, head);
3100   NCAR(node) = head;
3101   NANCHOR(head)->target = np;
3102 
3103   np = node;
3104   while ((np = NCDR(np)) != NULL_NODE) {
3105     insert_node = onig_node_new_anchor(anc_type);
3106     CHECK_NULL_RETURN_MEMERR(insert_node);
3107     NANCHOR(insert_node)->target = NCAR(np);
3108     NCAR(np) = insert_node;
3109   }
3110 
3111   if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3112     np = node;
3113     do {
3114       SET_NTYPE(np, NT_LIST);  /* alt -> list */
3115     } while ((np = NCDR(np)) != NULL_NODE);
3116   }
3117   return 0;
3118 }
3119 
3120 static int
setup_look_behind(Node * node,regex_t * reg,ScanEnv * env)3121 setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3122 {
3123   int r, len;
3124   AnchorNode* an = NANCHOR(node);
3125 
3126   r = get_char_length_tree(an->target, reg, &len);
3127   if (r == 0)
3128     an->char_len = len;
3129   else if (r == GET_CHAR_LEN_VARLEN)
3130     r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3131   else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3132     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3133       r = divide_look_behind_alternatives(node);
3134     else
3135       r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3136   }
3137 
3138   return r;
3139 }
3140 
3141 static int
next_setup(Node * node,Node * next_node,regex_t * reg)3142 next_setup(Node* node, Node* next_node, regex_t* reg)
3143 {
3144   int type;
3145 
3146  retry:
3147   type = NTYPE(node);
3148   if (type == NT_QTFR) {
3149     QtfrNode* qn = NQTFR(node);
3150     if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3151 #ifdef USE_QTFR_PEEK_NEXT
3152       Node* n = get_head_value_node(next_node, 1, reg);
3153       /* '\0': for UTF-16BE etc... */
3154       if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
3155 	qn->next_head_exact = n;
3156       }
3157 #endif
3158       /* automatic posseivation a*b ==> (?>a*)b */
3159       if (qn->lower <= 1) {
3160 	int ttype = NTYPE(qn->target);
3161 	if (IS_NODE_TYPE_SIMPLE(ttype)) {
3162 	  Node *x, *y;
3163 	  x = get_head_value_node(qn->target, 0, reg);
3164 	  if (IS_NOT_NULL(x)) {
3165 	    y = get_head_value_node(next_node,  0, reg);
3166 	    if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3167 	      Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
3168 	      CHECK_NULL_RETURN_MEMERR(en);
3169 	      SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3170 	      swap_node(node, en);
3171 	      NENCLOSE(node)->target = en;
3172 	    }
3173 	  }
3174 	}
3175       }
3176     }
3177   }
3178   else if (type == NT_ENCLOSE) {
3179     EncloseNode* en = NENCLOSE(node);
3180     if (en->type == ENCLOSE_MEMORY) {
3181       node = en->target;
3182       goto retry;
3183     }
3184   }
3185   return 0;
3186 }
3187 
3188 
3189 static int
update_string_node_case_fold(regex_t * reg,Node * node)3190 update_string_node_case_fold(regex_t* reg, Node *node)
3191 {
3192   UChar *p, *q, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3193   UChar *sbuf, *ebuf, *sp;
3194   int r, i, len, sbuf_size;
3195   StrNode* sn = NSTR(node);
3196 
3197   end = sn->end;
3198   sbuf_size = (end - sn->s) * 2;
3199   sbuf = (UChar* )xmalloc(sbuf_size);
3200   CHECK_NULL_RETURN_MEMERR(sbuf);
3201   ebuf = sbuf + sbuf_size;
3202 
3203   sp = sbuf;
3204   p = sn->s;
3205   while (p < end) {
3206     len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3207     q = buf;
3208     for (i = 0; i < len; i++) {
3209       if (sp >= ebuf) {
3210 	sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2);
3211 	CHECK_NULL_RETURN_MEMERR(sbuf);
3212 	sp = sbuf + sbuf_size;
3213 	sbuf_size *= 2;
3214 	ebuf = sbuf + sbuf_size;
3215       }
3216 
3217       *sp++ = buf[i];
3218     }
3219   }
3220 
3221   r = onig_node_str_set(node, sbuf, sp);
3222   if (r != 0) {
3223     xfree(sbuf);
3224     return r;
3225   }
3226 
3227   xfree(sbuf);
3228   return 0;
3229 }
3230 
3231 static int
expand_case_fold_make_rem_string(Node ** rnode,UChar * s,UChar * end,regex_t * reg)3232 expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
3233 				 regex_t* reg)
3234 {
3235   int r;
3236   Node *node;
3237 
3238   node = onig_node_new_str(s, end);
3239   if (IS_NULL(node)) return ONIGERR_MEMORY;
3240 
3241   r = update_string_node_case_fold(reg, node);
3242   if (r != 0) {
3243     onig_node_free(node);
3244     return r;
3245   }
3246 
3247   NSTRING_SET_AMBIG(node);
3248   NSTRING_SET_DONT_GET_OPT_INFO(node);
3249   *rnode = node;
3250   return 0;
3251 }
3252 
3253 static int
expand_case_fold_string_alt(int item_num,OnigCaseFoldCodeItem items[],UChar * p,int slen,UChar * end,regex_t * reg,Node ** rnode)3254 expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
3255 			    UChar *p, int slen, UChar *end,
3256 			    regex_t* reg, Node **rnode)
3257 {
3258   int r, i, j, len, varlen;
3259   Node *anode, *var_anode, *snode, *xnode, *an;
3260   UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3261 
3262   *rnode = var_anode = NULL_NODE;
3263 
3264   varlen = 0;
3265   for (i = 0; i < item_num; i++) {
3266     if (items[i].byte_len != slen) {
3267       varlen = 1;
3268       break;
3269     }
3270   }
3271 
3272   if (varlen != 0) {
3273     *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3274     if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3275 
3276     xnode = onig_node_new_list(NULL, NULL);
3277     if (IS_NULL(xnode)) goto mem_err;
3278     NCAR(var_anode) = xnode;
3279 
3280     anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3281     if (IS_NULL(anode)) goto mem_err;
3282     NCAR(xnode) = anode;
3283   }
3284   else {
3285     *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3286     if (IS_NULL(anode)) return ONIGERR_MEMORY;
3287   }
3288 
3289   snode = onig_node_new_str(p, p + slen);
3290   if (IS_NULL(snode)) goto mem_err;
3291 
3292   NCAR(anode) = snode;
3293 
3294   for (i = 0; i < item_num; i++) {
3295     snode = onig_node_new_str(NULL, NULL);
3296     if (IS_NULL(snode)) goto mem_err;
3297 
3298     for (j = 0; j < items[i].code_len; j++) {
3299       len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3300       if (len < 0) {
3301 	r = len;
3302 	goto mem_err2;
3303       }
3304 
3305       r = onig_node_str_cat(snode, buf, buf + len);
3306       if (r != 0) goto mem_err2;
3307     }
3308 
3309     an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3310     if (IS_NULL(an)) {
3311       goto mem_err2;
3312     }
3313 
3314     if (items[i].byte_len != slen) {
3315       Node *rem;
3316       UChar *q = p + items[i].byte_len;
3317 
3318       if (q < end) {
3319 	r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3320 	if (r != 0) {
3321 	  onig_node_free(an);
3322 	  goto mem_err2;
3323 	}
3324 
3325 	xnode = onig_node_list_add(NULL_NODE, snode);
3326 	if (IS_NULL(xnode)) {
3327 	  onig_node_free(an);
3328 	  onig_node_free(rem);
3329 	  goto mem_err2;
3330 	}
3331 	if (IS_NULL(onig_node_list_add(xnode, rem))) {
3332 	  onig_node_free(an);
3333 	  onig_node_free(xnode);
3334 	  onig_node_free(rem);
3335 	  goto mem_err;
3336 	}
3337 
3338 	NCAR(an) = xnode;
3339       }
3340       else {
3341 	NCAR(an) = snode;
3342       }
3343 
3344       NCDR(var_anode) = an;
3345       var_anode = an;
3346     }
3347     else {
3348       NCAR(an)     = snode;
3349       NCDR(anode) = an;
3350       anode = an;
3351     }
3352   }
3353 
3354   return varlen;
3355 
3356  mem_err2:
3357   onig_node_free(snode);
3358 
3359  mem_err:
3360   onig_node_free(*rnode);
3361 
3362   return ONIGERR_MEMORY;
3363 }
3364 
3365 static int
expand_case_fold_string(Node * node,regex_t * reg)3366 expand_case_fold_string(Node* node, regex_t* reg)
3367 {
3368 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION  8
3369 
3370   int r, n, len, alt_num;
3371   UChar *start, *end, *p;
3372   Node *top_root, *root, *snode, *prev_node;
3373   OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3374   StrNode* sn = NSTR(node);
3375 
3376   if (NSTRING_IS_AMBIG(node)) return 0;
3377 
3378   start = sn->s;
3379   end   = sn->end;
3380   if (start >= end) return 0;
3381 
3382   r = 0;
3383   top_root = root = prev_node = snode = NULL_NODE;
3384   alt_num = 1;
3385   p = start;
3386   while (p < end) {
3387     n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3388 					   p, end, items);
3389     if (n < 0) {
3390       r = n;
3391       goto err;
3392     }
3393 
3394     len = enclen(reg->enc, p);
3395 
3396     if (n == 0) {
3397       if (IS_NULL(snode)) {
3398 	if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3399 	  top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3400 	  if (IS_NULL(root)) {
3401 	    onig_node_free(prev_node);
3402 	    goto mem_err;
3403 	  }
3404 	}
3405 
3406 	prev_node = snode = onig_node_new_str(NULL, NULL);
3407 	if (IS_NULL(snode)) goto mem_err;
3408 	if (IS_NOT_NULL(root)) {
3409 	  if (IS_NULL(onig_node_list_add(root, snode))) {
3410 	    onig_node_free(snode);
3411 	    goto mem_err;
3412 	  }
3413 	}
3414       }
3415 
3416       r = onig_node_str_cat(snode, p, p + len);
3417       if (r != 0) goto err;
3418     }
3419     else {
3420       alt_num *= (n + 1);
3421       if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3422 
3423       if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3424 	top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3425 	if (IS_NULL(root)) {
3426 	  onig_node_free(prev_node);
3427 	  goto mem_err;
3428 	}
3429       }
3430 
3431       r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3432       if (r < 0) goto mem_err;
3433       if (r == 1) {
3434 	if (IS_NULL(root)) {
3435 	  top_root = prev_node;
3436 	}
3437 	else {
3438 	  if (IS_NULL(onig_node_list_add(root, prev_node))) {
3439 	    onig_node_free(prev_node);
3440 	    goto mem_err;
3441 	  }
3442 	}
3443 
3444 	root = NCAR(prev_node);
3445       }
3446       else { /* r == 0 */
3447 	if (IS_NOT_NULL(root)) {
3448 	  if (IS_NULL(onig_node_list_add(root, prev_node))) {
3449 	    onig_node_free(prev_node);
3450 	    goto mem_err;
3451 	  }
3452 	}
3453       }
3454 
3455       snode = NULL_NODE;
3456     }
3457 
3458     p += len;
3459   }
3460 
3461   if (p < end) {
3462     Node *srem;
3463 
3464     r = expand_case_fold_make_rem_string(&srem, p, end, reg);
3465     if (r != 0) goto mem_err;
3466 
3467     if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
3468       top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3469       if (IS_NULL(root)) {
3470 	onig_node_free(srem);
3471 	onig_node_free(prev_node);
3472 	goto mem_err;
3473       }
3474     }
3475 
3476     if (IS_NULL(root)) {
3477       prev_node = srem;
3478     }
3479     else {
3480       if (IS_NULL(onig_node_list_add(root, srem))) {
3481 	onig_node_free(srem);
3482 	goto mem_err;
3483       }
3484     }
3485   }
3486 
3487   /* ending */
3488   top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
3489   swap_node(node, top_root);
3490   onig_node_free(top_root);
3491   return 0;
3492 
3493  mem_err:
3494   r = ONIGERR_MEMORY;
3495 
3496  err:
3497   onig_node_free(top_root);
3498   return r;
3499 }
3500 
3501 
3502 #ifdef USE_COMBINATION_EXPLOSION_CHECK
3503 
3504 #define CEC_THRES_NUM_BIG_REPEAT         512
3505 #define CEC_INFINITE_NUM          0x7fffffff
3506 
3507 #define CEC_IN_INFINITE_REPEAT    (1<<0)
3508 #define CEC_IN_FINITE_REPEAT      (1<<1)
3509 #define CEC_CONT_BIG_REPEAT       (1<<2)
3510 
3511 static int
setup_comb_exp_check(Node * node,int state,ScanEnv * env)3512 setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3513 {
3514   int type;
3515   int r = state;
3516 
3517   type = NTYPE(node);
3518   switch (type) {
3519   case NT_LIST:
3520     {
3521       Node* prev = NULL_NODE;
3522       do {
3523 	r = setup_comb_exp_check(NCAR(node), r, env);
3524 	prev = NCAR(node);
3525       } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
3526     }
3527     break;
3528 
3529   case NT_ALT:
3530     {
3531       int ret;
3532       do {
3533 	ret = setup_comb_exp_check(NCAR(node), state, env);
3534 	r |= ret;
3535       } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
3536     }
3537     break;
3538 
3539   case NT_QTFR:
3540     {
3541       int child_state = state;
3542       int add_state = 0;
3543       QtfrNode* qn = NQTFR(node);
3544       Node* target = qn->target;
3545       int var_num;
3546 
3547       if (! IS_REPEAT_INFINITE(qn->upper)) {
3548 	if (qn->upper > 1) {
3549 	  /* {0,1}, {1,1} are allowed */
3550 	  child_state |= CEC_IN_FINITE_REPEAT;
3551 
3552 	  /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3553 	  if (env->backrefed_mem == 0) {
3554 	    if (NTYPE(qn->target) == NT_ENCLOSE) {
3555 	      EncloseNode* en = NENCLOSE(qn->target);
3556 	      if (en->type == ENCLOSE_MEMORY) {
3557 		if (NTYPE(en->target) == NT_QTFR) {
3558 		  QtfrNode* q = NQTFR(en->target);
3559 		  if (IS_REPEAT_INFINITE(q->upper)
3560 		      && q->greedy == qn->greedy) {
3561 		    qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3562 		    if (qn->upper == 1)
3563 		      child_state = state;
3564 		  }
3565 		}
3566 	      }
3567 	    }
3568 	  }
3569 	}
3570       }
3571 
3572       if (state & CEC_IN_FINITE_REPEAT) {
3573 	qn->comb_exp_check_num = -1;
3574       }
3575       else {
3576 	if (IS_REPEAT_INFINITE(qn->upper)) {
3577 	  var_num = CEC_INFINITE_NUM;
3578 	  child_state |= CEC_IN_INFINITE_REPEAT;
3579 	}
3580 	else {
3581 	  var_num = qn->upper - qn->lower;
3582 	}
3583 
3584 	if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3585 	  add_state |= CEC_CONT_BIG_REPEAT;
3586 
3587 	if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3588 	    ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3589 	     var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3590 	  if (qn->comb_exp_check_num == 0) {
3591 	    env->num_comb_exp_check++;
3592 	    qn->comb_exp_check_num = env->num_comb_exp_check;
3593 	    if (env->curr_max_regnum > env->comb_exp_max_regnum)
3594 	      env->comb_exp_max_regnum = env->curr_max_regnum;
3595 	  }
3596 	}
3597       }
3598 
3599       r = setup_comb_exp_check(target, child_state, env);
3600       r |= add_state;
3601     }
3602     break;
3603 
3604   case NT_ENCLOSE:
3605     {
3606       EncloseNode* en = NENCLOSE(node);
3607 
3608       switch (en->type) {
3609       case ENCLOSE_MEMORY:
3610 	{
3611 	  if (env->curr_max_regnum < en->regnum)
3612 	    env->curr_max_regnum = en->regnum;
3613 
3614 	  r = setup_comb_exp_check(en->target, state, env);
3615 	}
3616 	break;
3617 
3618       default:
3619 	r = setup_comb_exp_check(en->target, state, env);
3620 	break;
3621       }
3622     }
3623     break;
3624 
3625 #ifdef USE_SUBEXP_CALL
3626   case NT_CALL:
3627     if (IS_CALL_RECURSION(NCALL(node)))
3628       env->has_recursion = 1;
3629     else
3630       r = setup_comb_exp_check(NCALL(node)->target, state, env);
3631     break;
3632 #endif
3633 
3634   default:
3635     break;
3636   }
3637 
3638   return r;
3639 }
3640 #endif
3641 
3642 #define IN_ALT        (1<<0)
3643 #define IN_NOT        (1<<1)
3644 #define IN_REPEAT     (1<<2)
3645 #define IN_VAR_REPEAT (1<<3)
3646 
3647 /* setup_tree does the following work.
3648  1. check empty loop. (set qn->target_empty_info)
3649  2. expand ignore-case in char class.
3650  3. set memory status bit flags. (reg->mem_stats)
3651  4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3652  5. find invalid patterns in look-behind.
3653  6. expand repeated string.
3654  */
3655 static int
setup_tree(Node * node,regex_t * reg,int state,ScanEnv * env)3656 setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3657 {
3658   int type;
3659   int r = 0;
3660 
3661   type = NTYPE(node);
3662   switch (type) {
3663   case NT_LIST:
3664     {
3665       Node* prev = NULL_NODE;
3666       do {
3667 	r = setup_tree(NCAR(node), reg, state, env);
3668 	if (IS_NOT_NULL(prev) && r == 0) {
3669 	  r = next_setup(prev, NCAR(node), reg);
3670 	}
3671 	prev = NCAR(node);
3672       } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3673     }
3674     break;
3675 
3676   case NT_ALT:
3677     do {
3678       r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
3679     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3680     break;
3681 
3682   case NT_CCLASS:
3683     break;
3684 
3685   case NT_STR:
3686     if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3687       r = expand_case_fold_string(node, reg);
3688     }
3689     break;
3690 
3691   case NT_CTYPE:
3692   case NT_CANY:
3693     break;
3694 
3695 #ifdef USE_SUBEXP_CALL
3696   case NT_CALL:
3697     break;
3698 #endif
3699 
3700   case NT_BREF:
3701     {
3702       int i;
3703       int* p;
3704       Node** nodes = SCANENV_MEM_NODES(env);
3705       BRefNode* br = NBREF(node);
3706       p = BACKREFS_P(br);
3707       for (i = 0; i < br->back_num; i++) {
3708 	if (p[i] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
3709 	BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3710 	BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3711 #ifdef USE_BACKREF_WITH_LEVEL
3712 	if (IS_BACKREF_NEST_LEVEL(br)) {
3713 	  BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3714 	}
3715 #endif
3716 	SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3717       }
3718     }
3719     break;
3720 
3721   case NT_QTFR:
3722     {
3723       OnigDistance d;
3724       QtfrNode* qn = NQTFR(node);
3725       Node* target = qn->target;
3726 
3727       if ((state & IN_REPEAT) != 0) {
3728         qn->state |= NST_IN_REPEAT;
3729       }
3730 
3731       if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3732 	r = get_min_match_length(target, &d, env);
3733 	if (r) break;
3734 	if (d == 0) {
3735 	  qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3736 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
3737 	  r = quantifiers_memory_node_info(target);
3738 	  if (r < 0) break;
3739 	  if (r > 0) {
3740 	    qn->target_empty_info = r;
3741 	  }
3742 #endif
3743 #if 0
3744 	  r = get_max_match_length(target, &d, env);
3745 	  if (r == 0 && d == 0) {
3746 	    /*  ()* ==> ()?, ()+ ==> ()  */
3747 	    qn->upper = 1;
3748 	    if (qn->lower > 1) qn->lower = 1;
3749 	    if (NTYPE(target) == NT_STR) {
3750 	      qn->upper = qn->lower = 0;  /* /(?:)+/ ==> // */
3751 	    }
3752 	  }
3753 #endif
3754 	}
3755       }
3756 
3757       state |= IN_REPEAT;
3758       if (qn->lower != qn->upper)
3759 	state |= IN_VAR_REPEAT;
3760       r = setup_tree(target, reg, state, env);
3761       if (r) break;
3762 
3763       /* expand string */
3764 #define EXPAND_STRING_MAX_LENGTH  100
3765       if (NTYPE(target) == NT_STR) {
3766 	if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper &&
3767 	    qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3768 	  int len = NSTRING_LEN(target);
3769 	  StrNode* sn = NSTR(target);
3770 
3771 	  if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3772 	    int i, n = qn->lower;
3773 	    onig_node_conv_to_str_node(node, NSTR(target)->flag);
3774 	    for (i = 0; i < n; i++) {
3775 	      r = onig_node_str_cat(node, sn->s, sn->end);
3776 	      if (r) break;
3777 	    }
3778 	    onig_node_free(target);
3779 	    break; /* break case NT_QTFR: */
3780 	  }
3781 	}
3782       }
3783 
3784 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
3785       if (qn->greedy && (qn->target_empty_info != 0)) {
3786 	if (NTYPE(target) == NT_QTFR) {
3787 	  QtfrNode* tqn = NQTFR(target);
3788 	  if (IS_NOT_NULL(tqn->head_exact)) {
3789 	    qn->head_exact  = tqn->head_exact;
3790 	    tqn->head_exact = NULL;
3791 	  }
3792 	}
3793 	else {
3794 	  qn->head_exact = get_head_value_node(qn->target, 1, reg);
3795 	}
3796       }
3797 #endif
3798     }
3799     break;
3800 
3801   case NT_ENCLOSE:
3802     {
3803       EncloseNode* en = NENCLOSE(node);
3804 
3805       switch (en->type) {
3806       case ENCLOSE_OPTION:
3807 	{
3808 	  OnigOptionType options = reg->options;
3809 	  reg->options = NENCLOSE(node)->option;
3810 	  r = setup_tree(NENCLOSE(node)->target, reg, state, env);
3811 	  reg->options = options;
3812 	}
3813 	break;
3814 
3815       case ENCLOSE_MEMORY:
3816 	if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) {
3817 	  BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
3818 	  /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
3819 	}
3820         r = setup_tree(en->target, reg, state, env);
3821         break;
3822 
3823       case ENCLOSE_STOP_BACKTRACK:
3824 	{
3825 	  Node* target = en->target;
3826 	  r = setup_tree(target, reg, state, env);
3827 	  if (NTYPE(target) == NT_QTFR) {
3828 	    QtfrNode* tqn = NQTFR(target);
3829 	    if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
3830 		tqn->greedy != 0) {  /* (?>a*), a*+ etc... */
3831 	      int qtype = NTYPE(tqn->target);
3832 	      if (IS_NODE_TYPE_SIMPLE(qtype))
3833 		SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
3834 	    }
3835 	  }
3836 	}
3837 	break;
3838       }
3839     }
3840     break;
3841 
3842   case NT_ANCHOR:
3843     {
3844       AnchorNode* an = NANCHOR(node);
3845 
3846       switch (an->type) {
3847       case ANCHOR_PREC_READ:
3848 	r = setup_tree(an->target, reg, state, env);
3849 	break;
3850       case ANCHOR_PREC_READ_NOT:
3851 	r = setup_tree(an->target, reg, (state | IN_NOT), env);
3852 	break;
3853 
3854 /* allowed node types in look-behind */
3855 #define ALLOWED_TYPE_IN_LB  \
3856   ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
3857     BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
3858 
3859 #define ALLOWED_ENCLOSE_IN_LB       ( ENCLOSE_MEMORY )
3860 #define ALLOWED_ENCLOSE_IN_LB_NOT   0
3861 
3862 #define ALLOWED_ANCHOR_IN_LB \
3863 ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3864 #define ALLOWED_ANCHOR_IN_LB_NOT \
3865 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3866 
3867       case ANCHOR_LOOK_BEHIND:
3868 	{
3869 	  r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3870 			      ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
3871 	  if (r < 0) return r;
3872 	  if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3873 	  r = setup_look_behind(node, reg, env);
3874 	  if (r != 0) return r;
3875 	  r = setup_tree(an->target, reg, state, env);
3876 	}
3877 	break;
3878 
3879       case ANCHOR_LOOK_BEHIND_NOT:
3880 	{
3881 	  r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3882 		      ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
3883 	  if (r < 0) return r;
3884 	  if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3885 	  r = setup_look_behind(node, reg, env);
3886 	  if (r != 0) return r;
3887 	  r = setup_tree(an->target, reg, (state | IN_NOT), env);
3888 	}
3889 	break;
3890       }
3891     }
3892     break;
3893 
3894   default:
3895     break;
3896   }
3897 
3898   return r;
3899 }
3900 
3901 /* set skip map for Boyer-Moor search */
3902 static int
set_bm_skip(UChar * s,UChar * end,OnigEncoding enc ARG_UNUSED,UChar skip[],int ** int_skip)3903 set_bm_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED,
3904 	    UChar skip[], int** int_skip)
3905 {
3906   int i, len;
3907 
3908   len = end - s;
3909   if (len < ONIG_CHAR_TABLE_SIZE) {
3910     for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len;
3911 
3912     for (i = 0; i < len - 1; i++)
3913       skip[s[i]] = len - 1 - i;
3914   }
3915   else {
3916     if (IS_NULL(*int_skip)) {
3917       *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
3918       if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
3919     }
3920     for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len;
3921 
3922     for (i = 0; i < len - 1; i++)
3923       (*int_skip)[s[i]] = len - 1 - i;
3924   }
3925   return 0;
3926 }
3927 
3928 #define OPT_EXACT_MAXLEN   24
3929 
3930 typedef struct {
3931   OnigDistance min;  /* min byte length */
3932   OnigDistance max;  /* max byte length */
3933 } MinMaxLen;
3934 
3935 typedef struct {
3936   MinMaxLen        mmd;
3937   OnigEncoding     enc;
3938   OnigOptionType   options;
3939   OnigCaseFoldType case_fold_flag;
3940   ScanEnv*         scan_env;
3941 } OptEnv;
3942 
3943 typedef struct {
3944   int left_anchor;
3945   int right_anchor;
3946 } OptAncInfo;
3947 
3948 typedef struct {
3949   MinMaxLen  mmd; /* info position */
3950   OptAncInfo anc;
3951 
3952   int   reach_end;
3953   int   ignore_case;
3954   int   len;
3955   UChar s[OPT_EXACT_MAXLEN];
3956 } OptExactInfo;
3957 
3958 typedef struct {
3959   MinMaxLen mmd; /* info position */
3960   OptAncInfo anc;
3961 
3962   int   value;      /* weighted value */
3963   UChar map[ONIG_CHAR_TABLE_SIZE];
3964 } OptMapInfo;
3965 
3966 typedef struct {
3967   MinMaxLen    len;
3968 
3969   OptAncInfo   anc;
3970   OptExactInfo exb;    /* boundary */
3971   OptExactInfo exm;    /* middle */
3972   OptExactInfo expr;   /* prec read (?=...) */
3973 
3974   OptMapInfo   map;   /* boundary */
3975 } NodeOptInfo;
3976 
3977 
3978 static int
map_position_value(OnigEncoding enc,int i)3979 map_position_value(OnigEncoding enc, int i)
3980 {
3981   static const short int ByteValTable[] = {
3982      5,  1,  1,  1,  1,  1,  1,  1,  1, 10, 10,  1,  1, 10,  1,  1,
3983      1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
3984     12,  4,  7,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,
3985      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,
3986      5,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
3987      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  6,  5,  5,  5,
3988      5,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
3989      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  1
3990   };
3991 
3992   if (i < (int )(sizeof(ByteValTable)/sizeof(ByteValTable[0]))) {
3993     if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
3994       return 20;
3995     else
3996       return (int )ByteValTable[i];
3997   }
3998   else
3999     return 4;   /* Take it easy. */
4000 }
4001 
4002 static int
distance_value(MinMaxLen * mm)4003 distance_value(MinMaxLen* mm)
4004 {
4005   /* 1000 / (min-max-dist + 1) */
4006   static const short int dist_vals[] = {
4007     1000,  500,  333,  250,  200,  167,  143,  125,  111,  100,
4008       91,   83,   77,   71,   67,   63,   59,   56,   53,   50,
4009       48,   45,   43,   42,   40,   38,   37,   36,   34,   33,
4010       32,   31,   30,   29,   29,   28,   27,   26,   26,   25,
4011       24,   24,   23,   23,   22,   22,   21,   21,   20,   20,
4012       20,   19,   19,   19,   18,   18,   18,   17,   17,   17,
4013       16,   16,   16,   16,   15,   15,   15,   15,   14,   14,
4014       14,   14,   14,   14,   13,   13,   13,   13,   13,   13,
4015       12,   12,   12,   12,   12,   12,   11,   11,   11,   11,
4016       11,   11,   11,   11,   11,   10,   10,   10,   10,   10
4017   };
4018 
4019   int d;
4020 
4021   if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
4022 
4023   d = mm->max - mm->min;
4024   if (d < (int )(sizeof(dist_vals)/sizeof(dist_vals[0])))
4025     /* return dist_vals[d] * 16 / (mm->min + 12); */
4026     return (int )dist_vals[d];
4027   else
4028     return 1;
4029 }
4030 
4031 static int
comp_distance_value(MinMaxLen * d1,MinMaxLen * d2,int v1,int v2)4032 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
4033 {
4034   if (v2 <= 0) return -1;
4035   if (v1 <= 0) return  1;
4036 
4037   v1 *= distance_value(d1);
4038   v2 *= distance_value(d2);
4039 
4040   if (v2 > v1) return  1;
4041   if (v2 < v1) return -1;
4042 
4043   if (d2->min < d1->min) return  1;
4044   if (d2->min > d1->min) return -1;
4045   return 0;
4046 }
4047 
4048 static int
is_equal_mml(MinMaxLen * a,MinMaxLen * b)4049 is_equal_mml(MinMaxLen* a, MinMaxLen* b)
4050 {
4051   return (a->min == b->min && a->max == b->max) ? 1 : 0;
4052 }
4053 
4054 
4055 static void
set_mml(MinMaxLen * mml,OnigDistance min,OnigDistance max)4056 set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
4057 {
4058   mml->min = min;
4059   mml->max = max;
4060 }
4061 
4062 static void
clear_mml(MinMaxLen * mml)4063 clear_mml(MinMaxLen* mml)
4064 {
4065   mml->min = mml->max = 0;
4066 }
4067 
4068 static void
copy_mml(MinMaxLen * to,MinMaxLen * from)4069 copy_mml(MinMaxLen* to, MinMaxLen* from)
4070 {
4071   to->min = from->min;
4072   to->max = from->max;
4073 }
4074 
4075 static void
add_mml(MinMaxLen * to,MinMaxLen * from)4076 add_mml(MinMaxLen* to, MinMaxLen* from)
4077 {
4078   to->min = distance_add(to->min, from->min);
4079   to->max = distance_add(to->max, from->max);
4080 }
4081 
4082 #if 0
4083 static void
4084 add_len_mml(MinMaxLen* to, OnigDistance len)
4085 {
4086   to->min = distance_add(to->min, len);
4087   to->max = distance_add(to->max, len);
4088 }
4089 #endif
4090 
4091 static void
alt_merge_mml(MinMaxLen * to,MinMaxLen * from)4092 alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
4093 {
4094   if (to->min > from->min) to->min = from->min;
4095   if (to->max < from->max) to->max = from->max;
4096 }
4097 
4098 static void
copy_opt_env(OptEnv * to,OptEnv * from)4099 copy_opt_env(OptEnv* to, OptEnv* from)
4100 {
4101   *to = *from;
4102 }
4103 
4104 static void
clear_opt_anc_info(OptAncInfo * anc)4105 clear_opt_anc_info(OptAncInfo* anc)
4106 {
4107   anc->left_anchor  = 0;
4108   anc->right_anchor = 0;
4109 }
4110 
4111 static void
copy_opt_anc_info(OptAncInfo * to,OptAncInfo * from)4112 copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
4113 {
4114   *to = *from;
4115 }
4116 
4117 static void
concat_opt_anc_info(OptAncInfo * to,OptAncInfo * left,OptAncInfo * right,OnigDistance left_len,OnigDistance right_len)4118 concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
4119 		    OnigDistance left_len, OnigDistance right_len)
4120 {
4121   clear_opt_anc_info(to);
4122 
4123   to->left_anchor = left->left_anchor;
4124   if (left_len == 0) {
4125     to->left_anchor |= right->left_anchor;
4126   }
4127 
4128   to->right_anchor = right->right_anchor;
4129   if (right_len == 0) {
4130     to->right_anchor |= left->right_anchor;
4131   }
4132 }
4133 
4134 static int
is_left_anchor(int anc)4135 is_left_anchor(int anc)
4136 {
4137   if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
4138       anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
4139       anc == ANCHOR_PREC_READ_NOT)
4140     return 0;
4141 
4142   return 1;
4143 }
4144 
4145 static int
is_set_opt_anc_info(OptAncInfo * to,int anc)4146 is_set_opt_anc_info(OptAncInfo* to, int anc)
4147 {
4148   if ((to->left_anchor & anc) != 0) return 1;
4149 
4150   return ((to->right_anchor & anc) != 0 ? 1 : 0);
4151 }
4152 
4153 static void
add_opt_anc_info(OptAncInfo * to,int anc)4154 add_opt_anc_info(OptAncInfo* to, int anc)
4155 {
4156   if (is_left_anchor(anc))
4157     to->left_anchor |= anc;
4158   else
4159     to->right_anchor |= anc;
4160 }
4161 
4162 static void
remove_opt_anc_info(OptAncInfo * to,int anc)4163 remove_opt_anc_info(OptAncInfo* to, int anc)
4164 {
4165   if (is_left_anchor(anc))
4166     to->left_anchor &= ~anc;
4167   else
4168     to->right_anchor &= ~anc;
4169 }
4170 
4171 static void
alt_merge_opt_anc_info(OptAncInfo * to,OptAncInfo * add)4172 alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
4173 {
4174   to->left_anchor  &= add->left_anchor;
4175   to->right_anchor &= add->right_anchor;
4176 }
4177 
4178 static int
is_full_opt_exact_info(OptExactInfo * ex)4179 is_full_opt_exact_info(OptExactInfo* ex)
4180 {
4181   return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
4182 }
4183 
4184 static void
clear_opt_exact_info(OptExactInfo * ex)4185 clear_opt_exact_info(OptExactInfo* ex)
4186 {
4187   clear_mml(&ex->mmd);
4188   clear_opt_anc_info(&ex->anc);
4189   ex->reach_end   = 0;
4190   ex->ignore_case = 0;
4191   ex->len         = 0;
4192   ex->s[0]        = '\0';
4193 }
4194 
4195 static void
copy_opt_exact_info(OptExactInfo * to,OptExactInfo * from)4196 copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
4197 {
4198   *to = *from;
4199 }
4200 
4201 static void
concat_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OnigEncoding enc)4202 concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
4203 {
4204   int i, j, len;
4205   UChar *p, *end;
4206   OptAncInfo tanc;
4207 
4208   if (! to->ignore_case && add->ignore_case) {
4209     if (to->len >= add->len) return ;  /* avoid */
4210 
4211     to->ignore_case = 1;
4212   }
4213 
4214   p = add->s;
4215   end = p + add->len;
4216   for (i = to->len; p < end; ) {
4217     len = enclen(enc, p);
4218     if (i + len > OPT_EXACT_MAXLEN) break;
4219     for (j = 0; j < len && p < end; j++)
4220       to->s[i++] = *p++;
4221   }
4222 
4223   to->len = i;
4224   to->reach_end = (p == end ? add->reach_end : 0);
4225 
4226   concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4227   if (! to->reach_end) tanc.right_anchor = 0;
4228   copy_opt_anc_info(&to->anc, &tanc);
4229 }
4230 
4231 static void
concat_opt_exact_info_str(OptExactInfo * to,UChar * s,UChar * end,int raw ARG_UNUSED,OnigEncoding enc)4232 concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
4233 			  int raw ARG_UNUSED, OnigEncoding enc)
4234 {
4235   int i, j, len;
4236   UChar *p;
4237 
4238   for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4239     len = enclen(enc, p);
4240     if (i + len > OPT_EXACT_MAXLEN) break;
4241     for (j = 0; j < len && p < end; j++)
4242       to->s[i++] = *p++;
4243   }
4244 
4245   to->len = i;
4246 }
4247 
4248 static void
alt_merge_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OptEnv * env)4249 alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4250 {
4251   int i, j, len;
4252 
4253   if (add->len == 0 || to->len == 0) {
4254     clear_opt_exact_info(to);
4255     return ;
4256   }
4257 
4258   if (! is_equal_mml(&to->mmd, &add->mmd)) {
4259     clear_opt_exact_info(to);
4260     return ;
4261   }
4262 
4263   for (i = 0; i < to->len && i < add->len; ) {
4264     if (to->s[i] != add->s[i]) break;
4265     len = enclen(env->enc, to->s + i);
4266 
4267     for (j = 1; j < len; j++) {
4268       if (to->s[i+j] != add->s[i+j]) break;
4269     }
4270     if (j < len) break;
4271     i += len;
4272   }
4273 
4274   if (! add->reach_end || i < add->len || i < to->len) {
4275     to->reach_end = 0;
4276   }
4277   to->len = i;
4278   to->ignore_case |= add->ignore_case;
4279 
4280   alt_merge_opt_anc_info(&to->anc, &add->anc);
4281   if (! to->reach_end) to->anc.right_anchor = 0;
4282 }
4283 
4284 static void
select_opt_exact_info(OnigEncoding enc,OptExactInfo * now,OptExactInfo * alt)4285 select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4286 {
4287   int v1, v2;
4288 
4289   v1 = now->len;
4290   v2 = alt->len;
4291 
4292   if (v2 == 0) {
4293     return ;
4294   }
4295   else if (v1 == 0) {
4296     copy_opt_exact_info(now, alt);
4297     return ;
4298   }
4299   else if (v1 <= 2 && v2 <= 2) {
4300     /* ByteValTable[x] is big value --> low price */
4301     v2 = map_position_value(enc, now->s[0]);
4302     v1 = map_position_value(enc, alt->s[0]);
4303 
4304     if (now->len > 1) v1 += 5;
4305     if (alt->len > 1) v2 += 5;
4306   }
4307 
4308   if (now->ignore_case == 0) v1 *= 2;
4309   if (alt->ignore_case == 0) v2 *= 2;
4310 
4311   if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4312     copy_opt_exact_info(now, alt);
4313 }
4314 
4315 static void
clear_opt_map_info(OptMapInfo * map)4316 clear_opt_map_info(OptMapInfo* map)
4317 {
4318   static const OptMapInfo clean_info = {
4319     {0, 0}, {0, 0}, 0,
4320     {
4321       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4322       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4323       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4324       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4325       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4326       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4327       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4328       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4329       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4330       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4331       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4332       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4333       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4334       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4335       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4336       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4337     }
4338   };
4339 
4340   xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4341 }
4342 
4343 static void
copy_opt_map_info(OptMapInfo * to,OptMapInfo * from)4344 copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4345 {
4346   *to = *from;
4347 }
4348 
4349 static void
add_char_opt_map_info(OptMapInfo * map,UChar c,OnigEncoding enc)4350 add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4351 {
4352   if (map->map[c] == 0) {
4353     map->map[c] = 1;
4354     map->value += map_position_value(enc, c);
4355   }
4356 }
4357 
4358 static int
add_char_amb_opt_map_info(OptMapInfo * map,UChar * p,UChar * end,OnigEncoding enc,OnigCaseFoldType case_fold_flag)4359 add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4360                           OnigEncoding enc, OnigCaseFoldType case_fold_flag)
4361 {
4362   OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4363   UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4364   int i, n;
4365 
4366   add_char_opt_map_info(map, p[0], enc);
4367 
4368   case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
4369   n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
4370   if (n < 0) return n;
4371 
4372   for (i = 0; i < n; i++) {
4373     ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
4374     add_char_opt_map_info(map, buf[0], enc);
4375   }
4376 
4377   return 0;
4378 }
4379 
4380 static void
select_opt_map_info(OptMapInfo * now,OptMapInfo * alt)4381 select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4382 {
4383   static int z = 1<<15; /* 32768: something big value */
4384 
4385   int v1, v2;
4386 
4387   if (alt->value == 0) return ;
4388   if (now->value == 0) {
4389     copy_opt_map_info(now, alt);
4390     return ;
4391   }
4392 
4393   v1 = z / now->value;
4394   v2 = z / alt->value;
4395   if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4396     copy_opt_map_info(now, alt);
4397 }
4398 
4399 static int
comp_opt_exact_or_map_info(OptExactInfo * e,OptMapInfo * m)4400 comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4401 {
4402 #define COMP_EM_BASE  20
4403   int ve, vm;
4404 
4405   if (m->value <= 0) return -1;
4406 
4407   ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2);
4408   vm = COMP_EM_BASE * 5 * 2 / m->value;
4409   return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4410 }
4411 
4412 static void
alt_merge_opt_map_info(OnigEncoding enc,OptMapInfo * to,OptMapInfo * add)4413 alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4414 {
4415   int i, val;
4416 
4417   /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4418   if (to->value == 0) return ;
4419   if (add->value == 0 || to->mmd.max < add->mmd.min) {
4420     clear_opt_map_info(to);
4421     return ;
4422   }
4423 
4424   alt_merge_mml(&to->mmd, &add->mmd);
4425 
4426   val = 0;
4427   for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4428     if (add->map[i])
4429       to->map[i] = 1;
4430 
4431     if (to->map[i])
4432       val += map_position_value(enc, i);
4433   }
4434   to->value = val;
4435 
4436   alt_merge_opt_anc_info(&to->anc, &add->anc);
4437 }
4438 
4439 static void
set_bound_node_opt_info(NodeOptInfo * opt,MinMaxLen * mmd)4440 set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4441 {
4442   copy_mml(&(opt->exb.mmd),  mmd);
4443   copy_mml(&(opt->expr.mmd), mmd);
4444   copy_mml(&(opt->map.mmd),  mmd);
4445 }
4446 
4447 static void
clear_node_opt_info(NodeOptInfo * opt)4448 clear_node_opt_info(NodeOptInfo* opt)
4449 {
4450   clear_mml(&opt->len);
4451   clear_opt_anc_info(&opt->anc);
4452   clear_opt_exact_info(&opt->exb);
4453   clear_opt_exact_info(&opt->exm);
4454   clear_opt_exact_info(&opt->expr);
4455   clear_opt_map_info(&opt->map);
4456 }
4457 
4458 static void
copy_node_opt_info(NodeOptInfo * to,NodeOptInfo * from)4459 copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4460 {
4461   *to = *from;
4462 }
4463 
4464 static void
concat_left_node_opt_info(OnigEncoding enc,NodeOptInfo * to,NodeOptInfo * add)4465 concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4466 {
4467   int exb_reach, exm_reach;
4468   OptAncInfo tanc;
4469 
4470   concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4471   copy_opt_anc_info(&to->anc, &tanc);
4472 
4473   if (add->exb.len > 0 && to->len.max == 0) {
4474     concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4475 			to->len.max, add->len.max);
4476     copy_opt_anc_info(&add->exb.anc, &tanc);
4477   }
4478 
4479   if (add->map.value > 0 && to->len.max == 0) {
4480     if (add->map.mmd.max == 0)
4481       add->map.anc.left_anchor |= to->anc.left_anchor;
4482   }
4483 
4484   exb_reach = to->exb.reach_end;
4485   exm_reach = to->exm.reach_end;
4486 
4487   if (add->len.max != 0)
4488     to->exb.reach_end = to->exm.reach_end = 0;
4489 
4490   if (add->exb.len > 0) {
4491     if (exb_reach) {
4492       concat_opt_exact_info(&to->exb, &add->exb, enc);
4493       clear_opt_exact_info(&add->exb);
4494     }
4495     else if (exm_reach) {
4496       concat_opt_exact_info(&to->exm, &add->exb, enc);
4497       clear_opt_exact_info(&add->exb);
4498     }
4499   }
4500   select_opt_exact_info(enc, &to->exm, &add->exb);
4501   select_opt_exact_info(enc, &to->exm, &add->exm);
4502 
4503   if (to->expr.len > 0) {
4504     if (add->len.max > 0) {
4505       if (to->expr.len > (int )add->len.max)
4506 	to->expr.len = add->len.max;
4507 
4508       if (to->expr.mmd.max == 0)
4509 	select_opt_exact_info(enc, &to->exb, &to->expr);
4510       else
4511 	select_opt_exact_info(enc, &to->exm, &to->expr);
4512     }
4513   }
4514   else if (add->expr.len > 0) {
4515     copy_opt_exact_info(&to->expr, &add->expr);
4516   }
4517 
4518   select_opt_map_info(&to->map, &add->map);
4519 
4520   add_mml(&to->len, &add->len);
4521 }
4522 
4523 static void
alt_merge_node_opt_info(NodeOptInfo * to,NodeOptInfo * add,OptEnv * env)4524 alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4525 {
4526   alt_merge_opt_anc_info  (&to->anc,  &add->anc);
4527   alt_merge_opt_exact_info(&to->exb,  &add->exb, env);
4528   alt_merge_opt_exact_info(&to->exm,  &add->exm, env);
4529   alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4530   alt_merge_opt_map_info(env->enc, &to->map,  &add->map);
4531 
4532   alt_merge_mml(&to->len, &add->len);
4533 }
4534 
4535 
4536 #define MAX_NODE_OPT_INFO_REF_COUNT    5
4537 
4538 static int
optimize_node_left(Node * node,NodeOptInfo * opt,OptEnv * env)4539 optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4540 {
4541   int type;
4542   int r = 0;
4543 
4544   clear_node_opt_info(opt);
4545   set_bound_node_opt_info(opt, &env->mmd);
4546 
4547   type = NTYPE(node);
4548   switch (type) {
4549   case NT_LIST:
4550     {
4551       OptEnv nenv;
4552       NodeOptInfo nopt;
4553       Node* nd = node;
4554 
4555       copy_opt_env(&nenv, env);
4556       do {
4557 	r = optimize_node_left(NCAR(nd), &nopt, &nenv);
4558 	if (r == 0) {
4559 	  add_mml(&nenv.mmd, &nopt.len);
4560 	  concat_left_node_opt_info(env->enc, opt, &nopt);
4561 	}
4562       } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
4563     }
4564     break;
4565 
4566   case NT_ALT:
4567     {
4568       NodeOptInfo nopt;
4569       Node* nd = node;
4570 
4571       do {
4572 	r = optimize_node_left(NCAR(nd), &nopt, env);
4573 	if (r == 0) {
4574 	  if (nd == node) copy_node_opt_info(opt, &nopt);
4575 	  else            alt_merge_node_opt_info(opt, &nopt, env);
4576 	}
4577       } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
4578     }
4579     break;
4580 
4581   case NT_STR:
4582     {
4583       StrNode* sn = NSTR(node);
4584       int slen = sn->end - sn->s;
4585       int is_raw = NSTRING_IS_RAW(node);
4586 
4587       if (! NSTRING_IS_AMBIG(node)) {
4588 	concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4589 				  NSTRING_IS_RAW(node), env->enc);
4590 	if (slen > 0) {
4591 	  add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
4592 	}
4593         set_mml(&opt->len, slen, slen);
4594       }
4595       else {
4596         int max;
4597 
4598 	if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
4599           int n = onigenc_strlen(env->enc, sn->s, sn->end);
4600           max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
4601 	}
4602 	else {
4603 	  concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4604 				    is_raw, env->enc);
4605 	  opt->exb.ignore_case = 1;
4606 
4607 	  if (slen > 0) {
4608 	    r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
4609 					  env->enc, env->case_fold_flag);
4610 	    if (r != 0) break;
4611 	  }
4612 
4613 	  max = slen;
4614 	}
4615 
4616         set_mml(&opt->len, slen, max);
4617       }
4618 
4619       if (opt->exb.len == slen)
4620 	opt->exb.reach_end = 1;
4621     }
4622     break;
4623 
4624   case NT_CCLASS:
4625     {
4626       int i, z;
4627       CClassNode* cc = NCCLASS(node);
4628 
4629       /* no need to check ignore case. (setted in setup_tree()) */
4630 
4631       if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
4632         OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4633 	OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4634 
4635 	set_mml(&opt->len, min, max);
4636       }
4637       else {
4638         for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4639           z = BITSET_AT(cc->bs, i);
4640           if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
4641             add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4642           }
4643         }
4644 	set_mml(&opt->len, 1, 1);
4645       }
4646     }
4647     break;
4648 
4649   case NT_CTYPE:
4650     {
4651       int i, min, max;
4652 
4653       max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4654 
4655       if (max == 1) {
4656         min = 1;
4657 
4658 	switch (NCTYPE(node)->ctype) {
4659 	case ONIGENC_CTYPE_WORD:
4660 	  if (NCTYPE(node)->not != 0) {
4661 	    for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4662 	      if (! ONIGENC_IS_CODE_WORD(env->enc, i)) {
4663 		add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4664 	      }
4665 	    }
4666 	  }
4667 	  else {
4668 	    for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4669 	      if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
4670 		add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4671 	      }
4672 	    }
4673 	  }
4674 	  break;
4675 	}
4676       }
4677       else {
4678         min = ONIGENC_MBC_MINLEN(env->enc);
4679       }
4680       set_mml(&opt->len, min, max);
4681     }
4682     break;
4683 
4684   case NT_CANY:
4685     {
4686       OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4687       OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4688       set_mml(&opt->len, min, max);
4689     }
4690     break;
4691 
4692   case NT_ANCHOR:
4693     switch (NANCHOR(node)->type) {
4694     case ANCHOR_BEGIN_BUF:
4695     case ANCHOR_BEGIN_POSITION:
4696     case ANCHOR_BEGIN_LINE:
4697     case ANCHOR_END_BUF:
4698     case ANCHOR_SEMI_END_BUF:
4699     case ANCHOR_END_LINE:
4700       add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
4701       break;
4702 
4703     case ANCHOR_PREC_READ:
4704       {
4705 	NodeOptInfo nopt;
4706 
4707 	r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
4708 	if (r == 0) {
4709 	  if (nopt.exb.len > 0)
4710 	    copy_opt_exact_info(&opt->expr, &nopt.exb);
4711 	  else if (nopt.exm.len > 0)
4712 	    copy_opt_exact_info(&opt->expr, &nopt.exm);
4713 
4714 	  opt->expr.reach_end = 0;
4715 
4716 	  if (nopt.map.value > 0)
4717 	    copy_opt_map_info(&opt->map, &nopt.map);
4718 	}
4719       }
4720       break;
4721 
4722     case ANCHOR_PREC_READ_NOT:
4723     case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. */
4724     case ANCHOR_LOOK_BEHIND_NOT:
4725       break;
4726     }
4727     break;
4728 
4729   case NT_BREF:
4730     {
4731       int i;
4732       int* backs;
4733       OnigDistance min, max, tmin, tmax;
4734       Node** nodes = SCANENV_MEM_NODES(env->scan_env);
4735       BRefNode* br = NBREF(node);
4736 
4737       if (br->state & NST_RECURSION) {
4738 	set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4739 	break;
4740       }
4741       backs = BACKREFS_P(br);
4742       r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
4743       if (r != 0) break;
4744       r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
4745       if (r != 0) break;
4746       for (i = 1; i < br->back_num; i++) {
4747 	r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
4748 	if (r != 0) break;
4749 	r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
4750 	if (r != 0) break;
4751 	if (min > tmin) min = tmin;
4752 	if (max < tmax) max = tmax;
4753       }
4754       if (r == 0) set_mml(&opt->len, min, max);
4755     }
4756     break;
4757 
4758 #ifdef USE_SUBEXP_CALL
4759   case NT_CALL:
4760     if (IS_CALL_RECURSION(NCALL(node)))
4761       set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4762     else {
4763       OnigOptionType save = env->options;
4764       env->options = NENCLOSE(NCALL(node)->target)->option;
4765       r = optimize_node_left(NCALL(node)->target, opt, env);
4766       env->options = save;
4767     }
4768     break;
4769 #endif
4770 
4771   case NT_QTFR:
4772     {
4773       int i;
4774       OnigDistance min, max;
4775       NodeOptInfo nopt;
4776       QtfrNode* qn = NQTFR(node);
4777 
4778       r = optimize_node_left(qn->target, &nopt, env);
4779       if (r) break;
4780 
4781       if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
4782 	if (env->mmd.max == 0 &&
4783 	    NTYPE(qn->target) == NT_CANY && qn->greedy) {
4784 	  if (IS_MULTILINE(env->options))
4785 	    add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
4786 	  else
4787 	    add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
4788 	}
4789       }
4790       else {
4791 	if (qn->lower > 0) {
4792 	  copy_node_opt_info(opt, &nopt);
4793 	  if (nopt.exb.len > 0) {
4794 	    if (nopt.exb.reach_end) {
4795 	      for (i = 2; i <= qn->lower &&
4796 		          ! is_full_opt_exact_info(&opt->exb); i++) {
4797 		concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
4798 	      }
4799 	      if (i < qn->lower) {
4800 		opt->exb.reach_end = 0;
4801 	      }
4802 	    }
4803 	  }
4804 
4805 	  if (qn->lower != qn->upper) {
4806 	    opt->exb.reach_end = 0;
4807 	    opt->exm.reach_end = 0;
4808 	  }
4809 	  if (qn->lower > 1)
4810 	    opt->exm.reach_end = 0;
4811 	}
4812       }
4813 
4814       min = distance_multiply(nopt.len.min, qn->lower);
4815       if (IS_REPEAT_INFINITE(qn->upper))
4816 	max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
4817       else
4818 	max = distance_multiply(nopt.len.max, qn->upper);
4819 
4820       set_mml(&opt->len, min, max);
4821     }
4822     break;
4823 
4824   case NT_ENCLOSE:
4825     {
4826       EncloseNode* en = NENCLOSE(node);
4827 
4828       switch (en->type) {
4829       case ENCLOSE_OPTION:
4830 	{
4831 	  OnigOptionType save = env->options;
4832 
4833 	  env->options = en->option;
4834 	  r = optimize_node_left(en->target, opt, env);
4835 	  env->options = save;
4836 	}
4837 	break;
4838 
4839       case ENCLOSE_MEMORY:
4840 #ifdef USE_SUBEXP_CALL
4841 	en->opt_count++;
4842 	if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
4843 	  OnigDistance min, max;
4844 
4845 	  min = 0;
4846 	  max = ONIG_INFINITE_DISTANCE;
4847 	  if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
4848 	  if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
4849 	  set_mml(&opt->len, min, max);
4850 	}
4851 	else
4852 #endif
4853 	{
4854 	  r = optimize_node_left(en->target, opt, env);
4855 
4856 	  if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
4857 	    if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
4858 	      remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
4859 	  }
4860 	}
4861 	break;
4862 
4863       case ENCLOSE_STOP_BACKTRACK:
4864 	r = optimize_node_left(en->target, opt, env);
4865 	break;
4866       }
4867     }
4868     break;
4869 
4870   default:
4871 #ifdef ONIG_DEBUG
4872     fprintf(stderr, "optimize_node_left: undefined node type %d\n",
4873 	    NTYPE(node));
4874 #endif
4875     r = ONIGERR_TYPE_BUG;
4876     break;
4877   }
4878 
4879   return r;
4880 }
4881 
4882 static int
set_optimize_exact_info(regex_t * reg,OptExactInfo * e)4883 set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
4884 {
4885   int r;
4886 
4887   if (e->len == 0) return 0;
4888 
4889   if (e->ignore_case) {
4890     reg->exact = (UChar* )xmalloc(e->len);
4891     CHECK_NULL_RETURN_MEMERR(reg->exact);
4892     xmemcpy(reg->exact, e->s, e->len);
4893     reg->exact_end = reg->exact + e->len;
4894     reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
4895   }
4896   else {
4897     int allow_reverse;
4898 
4899     reg->exact = str_dup(e->s, e->s + e->len);
4900     CHECK_NULL_RETURN_MEMERR(reg->exact);
4901     reg->exact_end = reg->exact + e->len;
4902 
4903     allow_reverse =
4904 	ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
4905 
4906     if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
4907       r = set_bm_skip(reg->exact, reg->exact_end, reg->enc,
4908 	              reg->map, &(reg->int_map));
4909       if (r) return r;
4910 
4911       reg->optimize = (allow_reverse != 0
4912 		       ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
4913     }
4914     else {
4915       reg->optimize = ONIG_OPTIMIZE_EXACT;
4916     }
4917   }
4918 
4919   reg->dmin = e->mmd.min;
4920   reg->dmax = e->mmd.max;
4921 
4922   if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4923     reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact);
4924   }
4925 
4926   return 0;
4927 }
4928 
4929 static void
set_optimize_map_info(regex_t * reg,OptMapInfo * m)4930 set_optimize_map_info(regex_t* reg, OptMapInfo* m)
4931 {
4932   int i;
4933 
4934   for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4935     reg->map[i] = m->map[i];
4936 
4937   reg->optimize   = ONIG_OPTIMIZE_MAP;
4938   reg->dmin       = m->mmd.min;
4939   reg->dmax       = m->mmd.max;
4940 
4941   if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4942     reg->threshold_len = reg->dmin + 1;
4943   }
4944 }
4945 
4946 static void
set_sub_anchor(regex_t * reg,OptAncInfo * anc)4947 set_sub_anchor(regex_t* reg, OptAncInfo* anc)
4948 {
4949   reg->sub_anchor |= anc->left_anchor  & ANCHOR_BEGIN_LINE;
4950   reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
4951 }
4952 
4953 #ifdef ONIG_DEBUG
4954 static void print_optimize_info(FILE* f, regex_t* reg);
4955 #endif
4956 
4957 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ScanEnv * scan_env)4958 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
4959 {
4960 
4961   int r;
4962   NodeOptInfo opt;
4963   OptEnv env;
4964 
4965   env.enc            = reg->enc;
4966   env.options        = reg->options;
4967   env.case_fold_flag = reg->case_fold_flag;
4968   env.scan_env   = scan_env;
4969   clear_mml(&env.mmd);
4970 
4971   r = optimize_node_left(node, &opt, &env);
4972   if (r) return r;
4973 
4974   reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
4975         ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML);
4976 
4977   reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF);
4978 
4979   if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
4980     reg->anchor_dmin = opt.len.min;
4981     reg->anchor_dmax = opt.len.max;
4982   }
4983 
4984   if (opt.exb.len > 0 || opt.exm.len > 0) {
4985     select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
4986     if (opt.map.value > 0 &&
4987 	comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
4988       goto set_map;
4989     }
4990     else {
4991       r = set_optimize_exact_info(reg, &opt.exb);
4992       set_sub_anchor(reg, &opt.exb.anc);
4993     }
4994   }
4995   else if (opt.map.value > 0) {
4996   set_map:
4997     set_optimize_map_info(reg, &opt.map);
4998     set_sub_anchor(reg, &opt.map.anc);
4999   }
5000   else {
5001     reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
5002     if (opt.len.max == 0)
5003       reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
5004   }
5005 
5006 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5007   print_optimize_info(stderr, reg);
5008 #endif
5009   return r;
5010 }
5011 
5012 static void
clear_optimize_info(regex_t * reg)5013 clear_optimize_info(regex_t* reg)
5014 {
5015   reg->optimize      = ONIG_OPTIMIZE_NONE;
5016   reg->anchor        = 0;
5017   reg->anchor_dmin   = 0;
5018   reg->anchor_dmax   = 0;
5019   reg->sub_anchor    = 0;
5020   reg->exact_end     = (UChar* )NULL;
5021   reg->threshold_len = 0;
5022   if (IS_NOT_NULL(reg->exact)) {
5023     xfree(reg->exact);
5024     reg->exact = (UChar* )NULL;
5025   }
5026 }
5027 
5028 #ifdef ONIG_DEBUG
5029 
print_enc_string(FILE * fp,OnigEncoding enc,const UChar * s,const UChar * end)5030 static void print_enc_string(FILE* fp, OnigEncoding enc,
5031 			     const UChar *s, const UChar *end)
5032 {
5033   fprintf(fp, "\nPATTERN: /");
5034 
5035   if (ONIGENC_MBC_MINLEN(enc) > 1) {
5036     const UChar *p;
5037     OnigCodePoint code;
5038 
5039     p = s;
5040     while (p < end) {
5041       code = ONIGENC_MBC_TO_CODE(enc, p, end);
5042       if (code >= 0x80) {
5043 	fprintf(fp, " 0x%04x ", (int )code);
5044       }
5045       else {
5046 	fputc((int )code, fp);
5047       }
5048 
5049       p += enclen(enc, p);
5050     }
5051   }
5052   else {
5053     while (s < end) {
5054       fputc((int )*s, fp);
5055       s++;
5056     }
5057   }
5058 
5059   fprintf(fp, "/\n");
5060 }
5061 
5062 static void
print_distance_range(FILE * f,OnigDistance a,OnigDistance b)5063 print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
5064 {
5065   if (a == ONIG_INFINITE_DISTANCE)
5066     fputs("inf", f);
5067   else
5068     fprintf(f, "(%u)", a);
5069 
5070   fputs("-", f);
5071 
5072   if (b == ONIG_INFINITE_DISTANCE)
5073     fputs("inf", f);
5074   else
5075     fprintf(f, "(%u)", b);
5076 }
5077 
5078 static void
print_anchor(FILE * f,int anchor)5079 print_anchor(FILE* f, int anchor)
5080 {
5081   int q = 0;
5082 
5083   fprintf(f, "[");
5084 
5085   if (anchor & ANCHOR_BEGIN_BUF) {
5086     fprintf(f, "begin-buf");
5087     q = 1;
5088   }
5089   if (anchor & ANCHOR_BEGIN_LINE) {
5090     if (q) fprintf(f, ", ");
5091     q = 1;
5092     fprintf(f, "begin-line");
5093   }
5094   if (anchor & ANCHOR_BEGIN_POSITION) {
5095     if (q) fprintf(f, ", ");
5096     q = 1;
5097     fprintf(f, "begin-pos");
5098   }
5099   if (anchor & ANCHOR_END_BUF) {
5100     if (q) fprintf(f, ", ");
5101     q = 1;
5102     fprintf(f, "end-buf");
5103   }
5104   if (anchor & ANCHOR_SEMI_END_BUF) {
5105     if (q) fprintf(f, ", ");
5106     q = 1;
5107     fprintf(f, "semi-end-buf");
5108   }
5109   if (anchor & ANCHOR_END_LINE) {
5110     if (q) fprintf(f, ", ");
5111     q = 1;
5112     fprintf(f, "end-line");
5113   }
5114   if (anchor & ANCHOR_ANYCHAR_STAR) {
5115     if (q) fprintf(f, ", ");
5116     q = 1;
5117     fprintf(f, "anychar-star");
5118   }
5119   if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
5120     if (q) fprintf(f, ", ");
5121     fprintf(f, "anychar-star-pl");
5122   }
5123 
5124   fprintf(f, "]");
5125 }
5126 
5127 static void
print_optimize_info(FILE * f,regex_t * reg)5128 print_optimize_info(FILE* f, regex_t* reg)
5129 {
5130   static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5131                               "EXACT_IC", "MAP" };
5132 
5133   fprintf(f, "optimize: %s\n", on[reg->optimize]);
5134   fprintf(f, "  anchor: "); print_anchor(f, reg->anchor);
5135   if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
5136     print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
5137   fprintf(f, "\n");
5138 
5139   if (reg->optimize) {
5140     fprintf(f, "  sub anchor: "); print_anchor(f, reg->sub_anchor);
5141     fprintf(f, "\n");
5142   }
5143   fprintf(f, "\n");
5144 
5145   if (reg->exact) {
5146     UChar *p;
5147     fprintf(f, "exact: [");
5148     for (p = reg->exact; p < reg->exact_end; p++) {
5149       fputc(*p, f);
5150     }
5151     fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact));
5152   }
5153   else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
5154     int c, i, n = 0;
5155 
5156     for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5157       if (reg->map[i]) n++;
5158 
5159     fprintf(f, "map: n=%d\n", n);
5160     if (n > 0) {
5161       c = 0;
5162       fputc('[', f);
5163       for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
5164 	if (reg->map[i] != 0) {
5165           if (c > 0)  fputs(", ", f);
5166           c++;
5167           if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
5168               ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
5169             fputc(i, f);
5170           else
5171             fprintf(f, "%d", i);
5172         }
5173       }
5174       fprintf(f, "]\n");
5175     }
5176   }
5177 }
5178 #endif /* ONIG_DEBUG */
5179 
5180 
5181 extern void
onig_free_body(regex_t * reg)5182 onig_free_body(regex_t* reg)
5183 {
5184   if (IS_NOT_NULL(reg)) {
5185     if (IS_NOT_NULL(reg->p))                xfree(reg->p);
5186     if (IS_NOT_NULL(reg->exact))            xfree(reg->exact);
5187     if (IS_NOT_NULL(reg->int_map))          xfree(reg->int_map);
5188     if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
5189     if (IS_NOT_NULL(reg->repeat_range))     xfree(reg->repeat_range);
5190     if (IS_NOT_NULL(reg->chain))            onig_free(reg->chain);
5191 
5192 #ifdef USE_NAMED_GROUP
5193     onig_names_free(reg);
5194 #endif
5195   }
5196 }
5197 
5198 extern void
onig_free(regex_t * reg)5199 onig_free(regex_t* reg)
5200 {
5201   if (IS_NOT_NULL(reg)) {
5202     onig_free_body(reg);
5203     xfree(reg);
5204   }
5205 }
5206 
5207 #define REGEX_TRANSFER(to,from) do {\
5208   (to)->state = ONIG_STATE_MODIFY;\
5209   onig_free_body(to);\
5210   xmemcpy(to, from, sizeof(regex_t));\
5211   xfree(from);\
5212 } while (0)
5213 
5214 extern void
onig_transfer(regex_t * to,regex_t * from)5215 onig_transfer(regex_t* to, regex_t* from)
5216 {
5217   THREAD_ATOMIC_START;
5218   REGEX_TRANSFER(to, from);
5219   THREAD_ATOMIC_END;
5220 }
5221 
5222 #define REGEX_CHAIN_HEAD(reg) do {\
5223   while (IS_NOT_NULL((reg)->chain)) {\
5224     (reg) = (reg)->chain;\
5225   }\
5226 } while (0)
5227 
5228 extern void
onig_chain_link_add(regex_t * to,regex_t * add)5229 onig_chain_link_add(regex_t* to, regex_t* add)
5230 {
5231   THREAD_ATOMIC_START;
5232   REGEX_CHAIN_HEAD(to);
5233   to->chain = add;
5234   THREAD_ATOMIC_END;
5235 }
5236 
5237 extern void
onig_chain_reduce(regex_t * reg)5238 onig_chain_reduce(regex_t* reg)
5239 {
5240   regex_t *head, *prev;
5241 
5242   prev = reg;
5243   head = prev->chain;
5244   if (IS_NOT_NULL(head)) {
5245     reg->state = ONIG_STATE_MODIFY;
5246     while (IS_NOT_NULL(head->chain)) {
5247       prev = head;
5248       head = head->chain;
5249     }
5250     prev->chain = (regex_t* )NULL;
5251     REGEX_TRANSFER(reg, head);
5252   }
5253 }
5254 
5255 #ifdef ONIG_DEBUG
5256 static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg));
5257 #endif
5258 #ifdef ONIG_DEBUG_PARSE_TREE
5259 static void print_tree P_((FILE* f, Node* node));
5260 #endif
5261 
5262 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)5263 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5264 	      OnigErrorInfo* einfo)
5265 {
5266 #define COMPILE_INIT_SIZE  20
5267 
5268   int r, init_size;
5269   Node*  root;
5270   ScanEnv  scan_env;
5271 #ifdef USE_SUBEXP_CALL
5272   UnsetAddrList  uslist;
5273 #endif
5274 
5275   if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5276 
5277   reg->state = ONIG_STATE_COMPILING;
5278 
5279 #ifdef ONIG_DEBUG
5280   print_enc_string(stderr, reg->enc, pattern, pattern_end);
5281 #endif
5282 
5283   if (reg->alloc == 0) {
5284     init_size = (pattern_end - pattern) * 2;
5285     if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5286     r = BBUF_INIT(reg, init_size);
5287     if (r != 0) goto end;
5288   }
5289   else
5290     reg->used = 0;
5291 
5292   reg->num_mem            = 0;
5293   reg->num_repeat         = 0;
5294   reg->num_null_check     = 0;
5295   reg->repeat_range_alloc = 0;
5296   reg->repeat_range       = (OnigRepeatRange* )NULL;
5297 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5298   reg->num_comb_exp_check = 0;
5299 #endif
5300 
5301   r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5302   if (r != 0) goto err;
5303 
5304 #ifdef USE_NAMED_GROUP
5305   /* mixed use named group and no-named group */
5306   if (scan_env.num_named > 0 &&
5307       IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5308       !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5309     if (scan_env.num_named != scan_env.num_mem)
5310       r = disable_noname_group_capture(&root, reg, &scan_env);
5311     else
5312       r = numbered_ref_check(root);
5313 
5314     if (r != 0) goto err;
5315   }
5316 #endif
5317 
5318 #ifdef USE_SUBEXP_CALL
5319   if (scan_env.num_call > 0) {
5320     r = unset_addr_list_init(&uslist, scan_env.num_call);
5321     if (r != 0) goto err;
5322     scan_env.unset_addr_list = &uslist;
5323     r = setup_subexp_call(root, &scan_env);
5324     if (r != 0) goto err_unset;
5325     r = subexp_recursive_check_trav(root, &scan_env);
5326     if (r  < 0) goto err_unset;
5327     r = subexp_inf_recursive_check_trav(root, &scan_env);
5328     if (r != 0) goto err_unset;
5329 
5330     reg->num_call = scan_env.num_call;
5331   }
5332   else
5333     reg->num_call = 0;
5334 #endif
5335 
5336   r = setup_tree(root, reg, 0, &scan_env);
5337   if (r != 0) goto err_unset;
5338 
5339 #ifdef ONIG_DEBUG_PARSE_TREE
5340   print_tree(stderr, root);
5341 #endif
5342 
5343   reg->capture_history  = scan_env.capture_history;
5344   reg->bt_mem_start     = scan_env.bt_mem_start;
5345   reg->bt_mem_start    |= reg->capture_history;
5346   if (IS_FIND_CONDITION(reg->options))
5347     BIT_STATUS_ON_ALL(reg->bt_mem_end);
5348   else {
5349     reg->bt_mem_end  = scan_env.bt_mem_end;
5350     reg->bt_mem_end |= reg->capture_history;
5351   }
5352 
5353 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5354   if (scan_env.backrefed_mem == 0
5355 #ifdef USE_SUBEXP_CALL
5356       || scan_env.num_call == 0
5357 #endif
5358       ) {
5359     setup_comb_exp_check(root, 0, &scan_env);
5360 #ifdef USE_SUBEXP_CALL
5361     if (scan_env.has_recursion != 0) {
5362       scan_env.num_comb_exp_check = 0;
5363     }
5364     else
5365 #endif
5366     if (scan_env.comb_exp_max_regnum > 0) {
5367       int i;
5368       for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5369 	if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5370 	  scan_env.num_comb_exp_check = 0;
5371 	  break;
5372 	}
5373       }
5374     }
5375   }
5376 
5377   reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5378 #endif
5379 
5380   clear_optimize_info(reg);
5381 #ifndef ONIG_DONT_OPTIMIZE
5382   r = set_optimize_info_from_tree(root, reg, &scan_env);
5383   if (r != 0) goto err_unset;
5384 #endif
5385 
5386   if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5387     xfree(scan_env.mem_nodes_dynamic);
5388     scan_env.mem_nodes_dynamic = (Node** )NULL;
5389   }
5390 
5391   r = compile_tree(root, reg);
5392   if (r == 0) {
5393     r = add_opcode(reg, OP_END);
5394 #ifdef USE_SUBEXP_CALL
5395     if (scan_env.num_call > 0) {
5396       r = unset_addr_list_fix(&uslist, reg);
5397       unset_addr_list_end(&uslist);
5398       if (r) goto err;
5399     }
5400 #endif
5401 
5402     if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5403       reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5404     else {
5405       if (reg->bt_mem_start != 0)
5406 	reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5407       else
5408 	reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5409     }
5410   }
5411 #ifdef USE_SUBEXP_CALL
5412   else if (scan_env.num_call > 0) {
5413     unset_addr_list_end(&uslist);
5414   }
5415 #endif
5416   onig_node_free(root);
5417 
5418 #ifdef ONIG_DEBUG_COMPILE
5419 #ifdef USE_NAMED_GROUP
5420   onig_print_names(stderr, reg);
5421 #endif
5422   print_compiled_byte_code_list(stderr, reg);
5423 #endif
5424 
5425  end:
5426   reg->state = ONIG_STATE_NORMAL;
5427   return r;
5428 
5429  err_unset:
5430 #ifdef USE_SUBEXP_CALL
5431   if (scan_env.num_call > 0) {
5432     unset_addr_list_end(&uslist);
5433   }
5434 #endif
5435  err:
5436   if (IS_NOT_NULL(scan_env.error)) {
5437     if (IS_NOT_NULL(einfo)) {
5438       einfo->enc     = scan_env.enc;
5439       einfo->par     = scan_env.error;
5440       einfo->par_end = scan_env.error_end;
5441     }
5442   }
5443 
5444   onig_node_free(root);
5445   if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
5446       xfree(scan_env.mem_nodes_dynamic);
5447   return r;
5448 }
5449 
5450 #ifdef USE_RECOMPILE_API
5451 extern int
onig_recompile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5452 onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5453 	    OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5454 	    OnigErrorInfo* einfo)
5455 {
5456   int r;
5457   regex_t *new_reg;
5458 
5459   r = onig_new(&new_reg, pattern, pattern_end, option, enc, syntax, einfo);
5460   if (r) return r;
5461   if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
5462     onig_transfer(reg, new_reg);
5463   }
5464   else {
5465     onig_chain_link_add(reg, new_reg);
5466   }
5467   return 0;
5468 }
5469 #endif
5470 
5471 static int onig_inited = 0;
5472 
5473 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)5474 onig_reg_init(regex_t* reg, OnigOptionType option,
5475 	      OnigCaseFoldType case_fold_flag,
5476 	      OnigEncoding enc, OnigSyntaxType* syntax)
5477 {
5478   if (! onig_inited)
5479     onig_init();
5480 
5481   if (IS_NULL(reg))
5482     return ONIGERR_INVALID_ARGUMENT;
5483 
5484   if (ONIGENC_IS_UNDEF(enc))
5485     return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
5486 
5487   if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5488       == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5489     return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5490   }
5491 
5492   (reg)->state = ONIG_STATE_MODIFY;
5493 
5494   if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5495     option |= syntax->options;
5496     option &= ~ONIG_OPTION_SINGLELINE;
5497   }
5498   else
5499     option |= syntax->options;
5500 
5501   (reg)->enc              = enc;
5502   (reg)->options          = option;
5503   (reg)->syntax           = syntax;
5504   (reg)->optimize         = 0;
5505   (reg)->exact            = (UChar* )NULL;
5506   (reg)->int_map          = (int* )NULL;
5507   (reg)->int_map_backward = (int* )NULL;
5508   (reg)->chain            = (regex_t* )NULL;
5509 
5510   (reg)->p                = (UChar* )NULL;
5511   (reg)->alloc            = 0;
5512   (reg)->used             = 0;
5513   (reg)->name_table       = (void* )NULL;
5514 
5515   (reg)->case_fold_flag   = case_fold_flag;
5516   return 0;
5517 }
5518 
5519 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5520 onig_new_without_alloc(regex_t* reg, const UChar* pattern,
5521           const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
5522           OnigSyntaxType* syntax, OnigErrorInfo* einfo)
5523 {
5524   int r;
5525 
5526   r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5527   if (r) return r;
5528 
5529   r = onig_compile(reg, pattern, pattern_end, einfo);
5530   return r;
5531 }
5532 
5533 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5534 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5535 	  OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5536 	  OnigErrorInfo* einfo)
5537 {
5538   int r;
5539 
5540   *reg = (regex_t* )xmalloc(sizeof(regex_t));
5541   if (IS_NULL(*reg)) return ONIGERR_MEMORY;
5542 
5543   r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5544   if (r) goto err;
5545 
5546   r = onig_compile(*reg, pattern, pattern_end, einfo);
5547   if (r) {
5548   err:
5549     onig_free(*reg);
5550     *reg = NULL;
5551   }
5552   return r;
5553 }
5554 
5555 
5556 extern int
onig_init(void)5557 onig_init(void)
5558 {
5559   if (onig_inited != 0)
5560     return 0;
5561 
5562   THREAD_SYSTEM_INIT;
5563   THREAD_ATOMIC_START;
5564 
5565   onig_inited = 1;
5566 
5567   onigenc_init();
5568   /* onigenc_set_default_caseconv_table((UChar* )0); */
5569 
5570 #ifdef ONIG_DEBUG_STATISTICS
5571   onig_statistics_init();
5572 #endif
5573 
5574   THREAD_ATOMIC_END;
5575   return 0;
5576 }
5577 
5578 
5579 extern int
onig_end(void)5580 onig_end(void)
5581 {
5582   THREAD_ATOMIC_START;
5583 
5584 #ifdef ONIG_DEBUG_STATISTICS
5585   onig_print_statistics(stderr);
5586 #endif
5587 
5588 #ifdef USE_SHARED_CCLASS_TABLE
5589   onig_free_shared_cclass_table();
5590 #endif
5591 
5592 #ifdef USE_PARSE_TREE_NODE_RECYCLE
5593   onig_free_node_list();
5594 #endif
5595 
5596   onig_inited = 0;
5597 
5598   THREAD_ATOMIC_END;
5599   THREAD_SYSTEM_END;
5600   return 0;
5601 }
5602 
5603 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)5604 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
5605 {
5606   OnigCodePoint n, *data;
5607   OnigCodePoint low, high, x;
5608 
5609   GET_CODE_POINT(n, p);
5610   data = (OnigCodePoint* )p;
5611   data++;
5612 
5613   for (low = 0, high = n; low < high; ) {
5614     x = (low + high) >> 1;
5615     if (code > data[x * 2 + 1])
5616       low = x + 1;
5617     else
5618       high = x;
5619   }
5620 
5621   return ((low < n && code >= data[low * 2]) ? 1 : 0);
5622 }
5623 
5624 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,CClassNode * cc)5625 onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
5626 {
5627   int found;
5628 
5629   if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
5630     if (IS_NULL(cc->mbuf)) {
5631       found = 0;
5632     }
5633     else {
5634       found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
5635     }
5636   }
5637   else {
5638     found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
5639   }
5640 
5641   if (IS_NCCLASS_NOT(cc))
5642     return !found;
5643   else
5644     return found;
5645 }
5646 
5647 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)5648 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
5649 {
5650   int len;
5651 
5652   if (ONIGENC_MBC_MINLEN(enc) > 1) {
5653     len = 2;
5654   }
5655   else {
5656     len = ONIGENC_CODE_TO_MBCLEN(enc, code);
5657   }
5658   return onig_is_code_in_cc_len(len, code, cc);
5659 }
5660 
5661 
5662 #ifdef ONIG_DEBUG
5663 
5664 /* arguments type */
5665 #define ARG_SPECIAL     -1
5666 #define ARG_NON          0
5667 #define ARG_RELADDR      1
5668 #define ARG_ABSADDR      2
5669 #define ARG_LENGTH       3
5670 #define ARG_MEMNUM       4
5671 #define ARG_OPTION       5
5672 #define ARG_STATE_CHECK  6
5673 
5674 OnigOpInfoType OnigOpInfo[] = {
5675   { OP_FINISH,            "finish",          ARG_NON },
5676   { OP_END,               "end",             ARG_NON },
5677   { OP_EXACT1,            "exact1",          ARG_SPECIAL },
5678   { OP_EXACT2,            "exact2",          ARG_SPECIAL },
5679   { OP_EXACT3,            "exact3",          ARG_SPECIAL },
5680   { OP_EXACT4,            "exact4",          ARG_SPECIAL },
5681   { OP_EXACT5,            "exact5",          ARG_SPECIAL },
5682   { OP_EXACTN,            "exactn",          ARG_SPECIAL },
5683   { OP_EXACTMB2N1,        "exactmb2-n1",     ARG_SPECIAL },
5684   { OP_EXACTMB2N2,        "exactmb2-n2",     ARG_SPECIAL },
5685   { OP_EXACTMB2N3,        "exactmb2-n3",     ARG_SPECIAL },
5686   { OP_EXACTMB2N,         "exactmb2-n",      ARG_SPECIAL },
5687   { OP_EXACTMB3N,         "exactmb3n"  ,     ARG_SPECIAL },
5688   { OP_EXACTMBN,          "exactmbn",        ARG_SPECIAL },
5689   { OP_EXACT1_IC,         "exact1-ic",       ARG_SPECIAL },
5690   { OP_EXACTN_IC,         "exactn-ic",       ARG_SPECIAL },
5691   { OP_CCLASS,            "cclass",          ARG_SPECIAL },
5692   { OP_CCLASS_MB,         "cclass-mb",       ARG_SPECIAL },
5693   { OP_CCLASS_MIX,        "cclass-mix",      ARG_SPECIAL },
5694   { OP_CCLASS_NOT,        "cclass-not",      ARG_SPECIAL },
5695   { OP_CCLASS_MB_NOT,     "cclass-mb-not",   ARG_SPECIAL },
5696   { OP_CCLASS_MIX_NOT,    "cclass-mix-not",  ARG_SPECIAL },
5697   { OP_CCLASS_NODE,       "cclass-node",     ARG_SPECIAL },
5698   { OP_ANYCHAR,           "anychar",         ARG_NON },
5699   { OP_ANYCHAR_ML,        "anychar-ml",      ARG_NON },
5700   { OP_ANYCHAR_STAR,      "anychar*",        ARG_NON },
5701   { OP_ANYCHAR_ML_STAR,   "anychar-ml*",     ARG_NON },
5702   { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
5703   { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
5704   { OP_WORD,                "word",            ARG_NON },
5705   { OP_NOT_WORD,            "not-word",        ARG_NON },
5706   { OP_WORD_BOUND,          "word-bound",      ARG_NON },
5707   { OP_NOT_WORD_BOUND,      "not-word-bound",  ARG_NON },
5708   { OP_WORD_BEGIN,          "word-begin",      ARG_NON },
5709   { OP_WORD_END,            "word-end",        ARG_NON },
5710   { OP_BEGIN_BUF,           "begin-buf",       ARG_NON },
5711   { OP_END_BUF,             "end-buf",         ARG_NON },
5712   { OP_BEGIN_LINE,          "begin-line",      ARG_NON },
5713   { OP_END_LINE,            "end-line",        ARG_NON },
5714   { OP_SEMI_END_BUF,        "semi-end-buf",    ARG_NON },
5715   { OP_BEGIN_POSITION,      "begin-position",  ARG_NON },
5716   { OP_BACKREF1,            "backref1",             ARG_NON },
5717   { OP_BACKREF2,            "backref2",             ARG_NON },
5718   { OP_BACKREFN,            "backrefn",             ARG_MEMNUM  },
5719   { OP_BACKREFN_IC,         "backrefn-ic",          ARG_SPECIAL },
5720   { OP_BACKREF_MULTI,       "backref_multi",        ARG_SPECIAL },
5721   { OP_BACKREF_MULTI_IC,    "backref_multi-ic",     ARG_SPECIAL },
5722   { OP_BACKREF_WITH_LEVEL,    "backref_at_level",     ARG_SPECIAL },
5723   { OP_MEMORY_START_PUSH,   "mem-start-push",       ARG_MEMNUM  },
5724   { OP_MEMORY_START,        "mem-start",            ARG_MEMNUM  },
5725   { OP_MEMORY_END_PUSH,     "mem-end-push",         ARG_MEMNUM  },
5726   { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec",     ARG_MEMNUM  },
5727   { OP_MEMORY_END,          "mem-end",              ARG_MEMNUM  },
5728   { OP_MEMORY_END_REC,      "mem-end-rec",          ARG_MEMNUM  },
5729   { OP_SET_OPTION_PUSH,     "set-option-push",      ARG_OPTION  },
5730   { OP_SET_OPTION,          "set-option",           ARG_OPTION  },
5731   { OP_FAIL,                "fail",                 ARG_NON },
5732   { OP_JUMP,                "jump",                 ARG_RELADDR },
5733   { OP_PUSH,                "push",                 ARG_RELADDR },
5734   { OP_POP,                 "pop",                  ARG_NON },
5735   { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1",      ARG_SPECIAL },
5736   { OP_PUSH_IF_PEEK_NEXT,   "push-if-peek-next",    ARG_SPECIAL },
5737   { OP_REPEAT,              "repeat",               ARG_SPECIAL },
5738   { OP_REPEAT_NG,           "repeat-ng",            ARG_SPECIAL },
5739   { OP_REPEAT_INC,          "repeat-inc",           ARG_MEMNUM  },
5740   { OP_REPEAT_INC_NG,       "repeat-inc-ng",        ARG_MEMNUM  },
5741   { OP_REPEAT_INC_SG,       "repeat-inc-sg",        ARG_MEMNUM  },
5742   { OP_REPEAT_INC_NG_SG,    "repeat-inc-ng-sg",     ARG_MEMNUM  },
5743   { OP_NULL_CHECK_START,    "null-check-start",     ARG_MEMNUM  },
5744   { OP_NULL_CHECK_END,      "null-check-end",       ARG_MEMNUM  },
5745   { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM  },
5746   { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM  },
5747   { OP_PUSH_POS,             "push-pos",             ARG_NON },
5748   { OP_POP_POS,              "pop-pos",              ARG_NON },
5749   { OP_PUSH_POS_NOT,         "push-pos-not",         ARG_RELADDR },
5750   { OP_FAIL_POS,             "fail-pos",             ARG_NON },
5751   { OP_PUSH_STOP_BT,         "push-stop-bt",         ARG_NON },
5752   { OP_POP_STOP_BT,          "pop-stop-bt",          ARG_NON },
5753   { OP_LOOK_BEHIND,          "look-behind",          ARG_SPECIAL },
5754   { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
5755   { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
5756   { OP_CALL,                 "call",                 ARG_ABSADDR },
5757   { OP_RETURN,               "return",               ARG_NON },
5758   { OP_STATE_CHECK_PUSH,         "state-check-push",         ARG_SPECIAL },
5759   { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
5760   { OP_STATE_CHECK,              "state-check",              ARG_STATE_CHECK },
5761   { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*",     ARG_STATE_CHECK },
5762   { OP_STATE_CHECK_ANYCHAR_ML_STAR,
5763     "state-check-anychar-ml*", ARG_STATE_CHECK },
5764   { -1, "", ARG_NON }
5765 };
5766 
5767 static char*
op2name(int opcode)5768 op2name(int opcode)
5769 {
5770   int i;
5771 
5772   for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5773     if (opcode == OnigOpInfo[i].opcode)
5774       return OnigOpInfo[i].name;
5775   }
5776   return "";
5777 }
5778 
5779 static int
op2arg_type(int opcode)5780 op2arg_type(int opcode)
5781 {
5782   int i;
5783 
5784   for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5785     if (opcode == OnigOpInfo[i].opcode)
5786       return OnigOpInfo[i].arg_type;
5787   }
5788   return ARG_SPECIAL;
5789 }
5790 
5791 static void
Indent(FILE * f,int indent)5792 Indent(FILE* f, int indent)
5793 {
5794   int i;
5795   for (i = 0; i < indent; i++) putc(' ', f);
5796 }
5797 
5798 static void
p_string(FILE * f,int len,UChar * s)5799 p_string(FILE* f, int len, UChar* s)
5800 {
5801   fputs(":", f);
5802   while (len-- > 0) { fputc(*s++, f); }
5803 }
5804 
5805 static void
p_len_string(FILE * f,LengthType len,int mb_len,UChar * s)5806 p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
5807 {
5808   int x = len * mb_len;
5809 
5810   fprintf(f, ":%d:", len);
5811   while (x-- > 0) { fputc(*s++, f); }
5812 }
5813 
5814 extern void
onig_print_compiled_byte_code(FILE * f,UChar * bp,UChar ** nextp,OnigEncoding enc)5815 onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp,
5816                               OnigEncoding enc)
5817 {
5818   int i, n, arg_type;
5819   RelAddrType addr;
5820   LengthType len;
5821   MemNumType mem;
5822   StateCheckNumType scn;
5823   OnigCodePoint code;
5824   UChar *q;
5825 
5826   fprintf(f, "[%s", op2name(*bp));
5827   arg_type = op2arg_type(*bp);
5828   if (arg_type != ARG_SPECIAL) {
5829     bp++;
5830     switch (arg_type) {
5831     case ARG_NON:
5832       break;
5833     case ARG_RELADDR:
5834       GET_RELADDR_INC(addr, bp);
5835       fprintf(f, ":(%d)", addr);
5836       break;
5837     case ARG_ABSADDR:
5838       GET_ABSADDR_INC(addr, bp);
5839       fprintf(f, ":(%d)", addr);
5840       break;
5841     case ARG_LENGTH:
5842       GET_LENGTH_INC(len, bp);
5843       fprintf(f, ":%d", len);
5844       break;
5845     case ARG_MEMNUM:
5846       mem = *((MemNumType* )bp);
5847       bp += SIZE_MEMNUM;
5848       fprintf(f, ":%d", mem);
5849       break;
5850     case ARG_OPTION:
5851       {
5852 	OnigOptionType option = *((OnigOptionType* )bp);
5853 	bp += SIZE_OPTION;
5854 	fprintf(f, ":%d", option);
5855       }
5856       break;
5857 
5858     case ARG_STATE_CHECK:
5859       scn = *((StateCheckNumType* )bp);
5860       bp += SIZE_STATE_CHECK_NUM;
5861       fprintf(f, ":%d", scn);
5862       break;
5863     }
5864   }
5865   else {
5866     switch (*bp++) {
5867     case OP_EXACT1:
5868     case OP_ANYCHAR_STAR_PEEK_NEXT:
5869     case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
5870       p_string(f, 1, bp++); break;
5871     case OP_EXACT2:
5872       p_string(f, 2, bp); bp += 2; break;
5873     case OP_EXACT3:
5874       p_string(f, 3, bp); bp += 3; break;
5875     case OP_EXACT4:
5876       p_string(f, 4, bp); bp += 4; break;
5877     case OP_EXACT5:
5878       p_string(f, 5, bp); bp += 5; break;
5879     case OP_EXACTN:
5880       GET_LENGTH_INC(len, bp);
5881       p_len_string(f, len, 1, bp);
5882       bp += len;
5883       break;
5884 
5885     case OP_EXACTMB2N1:
5886       p_string(f, 2, bp); bp += 2; break;
5887     case OP_EXACTMB2N2:
5888       p_string(f, 4, bp); bp += 4; break;
5889     case OP_EXACTMB2N3:
5890       p_string(f, 6, bp); bp += 6; break;
5891     case OP_EXACTMB2N:
5892       GET_LENGTH_INC(len, bp);
5893       p_len_string(f, len, 2, bp);
5894       bp += len * 2;
5895       break;
5896     case OP_EXACTMB3N:
5897       GET_LENGTH_INC(len, bp);
5898       p_len_string(f, len, 3, bp);
5899       bp += len * 3;
5900       break;
5901     case OP_EXACTMBN:
5902       {
5903 	int mb_len;
5904 
5905 	GET_LENGTH_INC(mb_len, bp);
5906 	GET_LENGTH_INC(len, bp);
5907 	fprintf(f, ":%d:%d:", mb_len, len);
5908 	n = len * mb_len;
5909 	while (n-- > 0) { fputc(*bp++, f); }
5910       }
5911       break;
5912 
5913     case OP_EXACT1_IC:
5914       len = enclen(enc, bp);
5915       p_string(f, len, bp);
5916       bp += len;
5917       break;
5918     case OP_EXACTN_IC:
5919       GET_LENGTH_INC(len, bp);
5920       p_len_string(f, len, 1, bp);
5921       bp += len;
5922       break;
5923 
5924     case OP_CCLASS:
5925       n = bitset_on_num((BitSetRef )bp);
5926       bp += SIZE_BITSET;
5927       fprintf(f, ":%d", n);
5928       break;
5929 
5930     case OP_CCLASS_NOT:
5931       n = bitset_on_num((BitSetRef )bp);
5932       bp += SIZE_BITSET;
5933       fprintf(f, ":%d", n);
5934       break;
5935 
5936     case OP_CCLASS_MB:
5937     case OP_CCLASS_MB_NOT:
5938       GET_LENGTH_INC(len, bp);
5939       q = bp;
5940 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5941       ALIGNMENT_RIGHT(q);
5942 #endif
5943       GET_CODE_POINT(code, q);
5944       bp += len;
5945       fprintf(f, ":%d:%d", (int )code, len);
5946       break;
5947 
5948     case OP_CCLASS_MIX:
5949     case OP_CCLASS_MIX_NOT:
5950       n = bitset_on_num((BitSetRef )bp);
5951       bp += SIZE_BITSET;
5952       GET_LENGTH_INC(len, bp);
5953       q = bp;
5954 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5955       ALIGNMENT_RIGHT(q);
5956 #endif
5957       GET_CODE_POINT(code, q);
5958       bp += len;
5959       fprintf(f, ":%d:%d:%d", n, (int )code, len);
5960       break;
5961 
5962     case OP_CCLASS_NODE:
5963       {
5964         CClassNode *cc;
5965 
5966         GET_POINTER_INC(cc, bp);
5967         n = bitset_on_num(cc->bs);
5968         fprintf(f, ":%u:%d", (unsigned int )cc, n);
5969       }
5970       break;
5971 
5972     case OP_BACKREFN_IC:
5973       mem = *((MemNumType* )bp);
5974       bp += SIZE_MEMNUM;
5975       fprintf(f, ":%d", mem);
5976       break;
5977 
5978     case OP_BACKREF_MULTI_IC:
5979     case OP_BACKREF_MULTI:
5980       fputs(" ", f);
5981       GET_LENGTH_INC(len, bp);
5982       for (i = 0; i < len; i++) {
5983 	GET_MEMNUM_INC(mem, bp);
5984 	if (i > 0) fputs(", ", f);
5985 	fprintf(f, "%d", mem);
5986       }
5987       break;
5988 
5989     case OP_BACKREF_WITH_LEVEL:
5990       {
5991 	OnigOptionType option;
5992 	LengthType level;
5993 
5994 	GET_OPTION_INC(option, bp);
5995 	fprintf(f, ":%d", option);
5996 	GET_LENGTH_INC(level, bp);
5997 	fprintf(f, ":%d", level);
5998 
5999 	fputs(" ", f);
6000 	GET_LENGTH_INC(len, bp);
6001 	for (i = 0; i < len; i++) {
6002 	  GET_MEMNUM_INC(mem, bp);
6003 	  if (i > 0) fputs(", ", f);
6004 	  fprintf(f, "%d", mem);
6005 	}
6006       }
6007       break;
6008 
6009     case OP_REPEAT:
6010     case OP_REPEAT_NG:
6011       {
6012 	mem = *((MemNumType* )bp);
6013 	bp += SIZE_MEMNUM;
6014 	addr = *((RelAddrType* )bp);
6015 	bp += SIZE_RELADDR;
6016 	fprintf(f, ":%d:%d", mem, addr);
6017       }
6018       break;
6019 
6020     case OP_PUSH_OR_JUMP_EXACT1:
6021     case OP_PUSH_IF_PEEK_NEXT:
6022       addr = *((RelAddrType* )bp);
6023       bp += SIZE_RELADDR;
6024       fprintf(f, ":(%d)", addr);
6025       p_string(f, 1, bp);
6026       bp += 1;
6027       break;
6028 
6029     case OP_LOOK_BEHIND:
6030       GET_LENGTH_INC(len, bp);
6031       fprintf(f, ":%d", len);
6032       break;
6033 
6034     case OP_PUSH_LOOK_BEHIND_NOT:
6035       GET_RELADDR_INC(addr, bp);
6036       GET_LENGTH_INC(len, bp);
6037       fprintf(f, ":%d:(%d)", len, addr);
6038       break;
6039 
6040     case OP_STATE_CHECK_PUSH:
6041     case OP_STATE_CHECK_PUSH_OR_JUMP:
6042       scn = *((StateCheckNumType* )bp);
6043       bp += SIZE_STATE_CHECK_NUM;
6044       addr = *((RelAddrType* )bp);
6045       bp += SIZE_RELADDR;
6046       fprintf(f, ":%d:(%d)", scn, addr);
6047       break;
6048 
6049     default:
6050       fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
6051 	      *--bp);
6052     }
6053   }
6054   fputs("]", f);
6055   if (nextp) *nextp = bp;
6056 }
6057 
6058 static void
print_compiled_byte_code_list(FILE * f,regex_t * reg)6059 print_compiled_byte_code_list(FILE* f, regex_t* reg)
6060 {
6061   int ncode;
6062   UChar* bp = reg->p;
6063   UChar* end = reg->p + reg->used;
6064 
6065   fprintf(f, "code length: %d\n", reg->used);
6066 
6067   ncode = 0;
6068   while (bp < end) {
6069     ncode++;
6070     if (bp > reg->p) {
6071       if (ncode % 5 == 0)
6072 	fprintf(f, "\n");
6073       else
6074 	fputs(" ", f);
6075     }
6076     onig_print_compiled_byte_code(f, bp, &bp, reg->enc);
6077   }
6078 
6079   fprintf(f, "\n");
6080 }
6081 
6082 static void
print_indent_tree(FILE * f,Node * node,int indent)6083 print_indent_tree(FILE* f, Node* node, int indent)
6084 {
6085   int i, type;
6086   int add = 3;
6087   UChar* p;
6088 
6089   Indent(f, indent);
6090   if (IS_NULL(node)) {
6091     fprintf(f, "ERROR: null node!!!\n");
6092     exit (0);
6093   }
6094 
6095   type = NTYPE(node);
6096   switch (type) {
6097   case NT_LIST:
6098   case NT_ALT:
6099     if (NTYPE(node) == NT_LIST)
6100       fprintf(f, "<list:%x>\n", (int )node);
6101     else
6102       fprintf(f, "<alt:%x>\n", (int )node);
6103 
6104     print_indent_tree(f, NCAR(node), indent + add);
6105     while (IS_NOT_NULL(node = NCDR(node))) {
6106       if (NTYPE(node) != type) {
6107 	fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
6108 	exit(0);
6109       }
6110       print_indent_tree(f, NCAR(node), indent + add);
6111     }
6112     break;
6113 
6114   case NT_STR:
6115     fprintf(f, "<string%s:%x>",
6116 	    (NSTRING_IS_RAW(node) ? "-raw" : ""), (int )node);
6117     for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
6118       if (*p >= 0x20 && *p < 0x7f)
6119 	fputc(*p, f);
6120       else {
6121 	fprintf(f, " 0x%02x", *p);
6122       }
6123     }
6124     break;
6125 
6126   case NT_CCLASS:
6127     fprintf(f, "<cclass:%x>", (int )node);
6128     if (IS_NCCLASS_NOT(NCCLASS(node))) fputs(" not", f);
6129     if (NCCLASS(node)->mbuf) {
6130       BBuf* bbuf = NCCLASS(node)->mbuf;
6131       for (i = 0; i < bbuf->used; i++) {
6132 	if (i > 0) fprintf(f, ",");
6133 	fprintf(f, "%0x", bbuf->p[i]);
6134       }
6135     }
6136     break;
6137 
6138   case NT_CTYPE:
6139     fprintf(f, "<ctype:%x> ", (int )node);
6140     switch (NCTYPE(node)->ctype) {
6141     case ONIGENC_CTYPE_WORD:
6142       if (NCTYPE(node)->not != 0)
6143 	fputs("not word",       f);
6144       else
6145 	fputs("word",           f);
6146       break;
6147 
6148     default:
6149       fprintf(f, "ERROR: undefined ctype.\n");
6150       exit(0);
6151     }
6152     break;
6153 
6154   case NT_CANY:
6155     fprintf(f, "<anychar:%x>", (int )node);
6156     break;
6157 
6158   case NT_ANCHOR:
6159     fprintf(f, "<anchor:%x> ", (int )node);
6160     switch (NANCHOR(node)->type) {
6161     case ANCHOR_BEGIN_BUF:      fputs("begin buf",      f); break;
6162     case ANCHOR_END_BUF:        fputs("end buf",        f); break;
6163     case ANCHOR_BEGIN_LINE:     fputs("begin line",     f); break;
6164     case ANCHOR_END_LINE:       fputs("end line",       f); break;
6165     case ANCHOR_SEMI_END_BUF:   fputs("semi end buf",   f); break;
6166     case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
6167 
6168     case ANCHOR_WORD_BOUND:      fputs("word bound",     f); break;
6169     case ANCHOR_NOT_WORD_BOUND:  fputs("not word bound", f); break;
6170 #ifdef USE_WORD_BEGIN_END
6171     case ANCHOR_WORD_BEGIN:      fputs("word begin", f);     break;
6172     case ANCHOR_WORD_END:        fputs("word end", f);       break;
6173 #endif
6174     case ANCHOR_PREC_READ:       fputs("prec read",      f); break;
6175     case ANCHOR_PREC_READ_NOT:   fputs("prec read not",  f); break;
6176     case ANCHOR_LOOK_BEHIND:     fputs("look_behind",    f); break;
6177     case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break;
6178 
6179     default:
6180       fprintf(f, "ERROR: undefined anchor type.\n");
6181       break;
6182     }
6183     break;
6184 
6185   case NT_BREF:
6186     {
6187       int* p;
6188       BRefNode* br = NBREF(node);
6189       p = BACKREFS_P(br);
6190       fprintf(f, "<backref:%x>", (int )node);
6191       for (i = 0; i < br->back_num; i++) {
6192 	if (i > 0) fputs(", ", f);
6193 	fprintf(f, "%d", p[i]);
6194       }
6195     }
6196     break;
6197 
6198 #ifdef USE_SUBEXP_CALL
6199   case NT_CALL:
6200     {
6201       CallNode* cn = NCALL(node);
6202       fprintf(f, "<call:%x>", (int )node);
6203       p_string(f, cn->name_end - cn->name, cn->name);
6204     }
6205     break;
6206 #endif
6207 
6208   case NT_QTFR:
6209     fprintf(f, "<quantifier:%x>{%d,%d}%s\n", (int )node,
6210 	    NQTFR(node)->lower, NQTFR(node)->upper,
6211 	    (NQTFR(node)->greedy ? "" : "?"));
6212     print_indent_tree(f, NQTFR(node)->target, indent + add);
6213     break;
6214 
6215   case NT_ENCLOSE:
6216     fprintf(f, "<enclose:%x> ", (int )node);
6217     switch (NENCLOSE(node)->type) {
6218     case ENCLOSE_OPTION:
6219       fprintf(f, "option:%d\n", NENCLOSE(node)->option);
6220       print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6221       break;
6222     case ENCLOSE_MEMORY:
6223       fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
6224       break;
6225     case ENCLOSE_STOP_BACKTRACK:
6226       fprintf(f, "stop-bt");
6227       break;
6228 
6229     default:
6230       break;
6231     }
6232     fprintf(f, "\n");
6233     print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6234     break;
6235 
6236   default:
6237     fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6238     break;
6239   }
6240 
6241   if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
6242       type != NT_ENCLOSE)
6243     fprintf(f, "\n");
6244   fflush(f);
6245 }
6246 #endif /* ONIG_DEBUG */
6247 
6248 #ifdef ONIG_DEBUG_PARSE_TREE
6249 static void
print_tree(FILE * f,Node * node)6250 print_tree(FILE* f, Node* node)
6251 {
6252   print_indent_tree(f, node, 0);
6253 }
6254 #endif
6255