xref: /PHP-5.6/ext/mbstring/oniguruma/regcomp.c (revision c95daa9c)
1 /**********************************************************************
2   regcomp.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regparse.h"
31 
32 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
33 
34 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)35 onig_get_default_case_fold_flag(void)
36 {
37   return OnigDefaultCaseFoldFlag;
38 }
39 
40 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)41 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
42 {
43   OnigDefaultCaseFoldFlag = case_fold_flag;
44   return 0;
45 }
46 
47 
48 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
49 static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
50 #endif
51 
52 static UChar*
str_dup(UChar * s,UChar * end)53 str_dup(UChar* s, UChar* end)
54 {
55   int len = end - s;
56 
57   if (len > 0) {
58     UChar* r = (UChar* )xmalloc(len + 1);
59     CHECK_NULL_RETURN(r);
60     xmemcpy(r, s, len);
61     r[len] = (UChar )0;
62     return r;
63   }
64   else return NULL;
65 }
66 
67 static void
swap_node(Node * a,Node * b)68 swap_node(Node* a, Node* b)
69 {
70   Node c;
71   c = *a; *a = *b; *b = c;
72 
73   if (NTYPE(a) == NT_STR) {
74     StrNode* sn = NSTR(a);
75     if (sn->capa == 0) {
76       int len = sn->end - sn->s;
77       sn->s   = sn->buf;
78       sn->end = sn->s + len;
79     }
80   }
81 
82   if (NTYPE(b) == NT_STR) {
83     StrNode* sn = NSTR(b);
84     if (sn->capa == 0) {
85       int len = sn->end - sn->s;
86       sn->s   = sn->buf;
87       sn->end = sn->s + len;
88     }
89   }
90 }
91 
92 static OnigDistance
distance_add(OnigDistance d1,OnigDistance d2)93 distance_add(OnigDistance d1, OnigDistance d2)
94 {
95   if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
96     return ONIG_INFINITE_DISTANCE;
97   else {
98     if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
99     else return ONIG_INFINITE_DISTANCE;
100   }
101 }
102 
103 static OnigDistance
distance_multiply(OnigDistance d,int m)104 distance_multiply(OnigDistance d, int m)
105 {
106   if (m == 0) return 0;
107 
108   if (d < ONIG_INFINITE_DISTANCE / m)
109     return d * m;
110   else
111     return ONIG_INFINITE_DISTANCE;
112 }
113 
114 static int
bitset_is_empty(BitSetRef bs)115 bitset_is_empty(BitSetRef bs)
116 {
117   int i;
118   for (i = 0; i < (int )BITSET_SIZE; i++) {
119     if (bs[i] != 0) return 0;
120   }
121   return 1;
122 }
123 
124 #ifdef ONIG_DEBUG
125 static int
bitset_on_num(BitSetRef bs)126 bitset_on_num(BitSetRef bs)
127 {
128   int i, n;
129 
130   n = 0;
131   for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
132     if (BITSET_AT(bs, i)) n++;
133   }
134   return n;
135 }
136 #endif
137 
138 extern int
onig_bbuf_init(BBuf * buf,int size)139 onig_bbuf_init(BBuf* buf, int size)
140 {
141   if (size <= 0) {
142     size   = 0;
143     buf->p = NULL;
144   }
145   else {
146     buf->p = (UChar* )xmalloc(size);
147     if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
148   }
149 
150   buf->alloc = size;
151   buf->used  = 0;
152   return 0;
153 }
154 
155 
156 #ifdef USE_SUBEXP_CALL
157 
158 static int
unset_addr_list_init(UnsetAddrList * uslist,int size)159 unset_addr_list_init(UnsetAddrList* uslist, int size)
160 {
161   UnsetAddr* p;
162 
163   p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
164   CHECK_NULL_RETURN_MEMERR(p);
165   uslist->num   = 0;
166   uslist->alloc = size;
167   uslist->us    = p;
168   return 0;
169 }
170 
171 static void
unset_addr_list_end(UnsetAddrList * uslist)172 unset_addr_list_end(UnsetAddrList* uslist)
173 {
174   if (IS_NOT_NULL(uslist->us))
175     xfree(uslist->us);
176 }
177 
178 static int
unset_addr_list_add(UnsetAddrList * uslist,int offset,struct _Node * node)179 unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
180 {
181   UnsetAddr* p;
182   int size;
183 
184   if (uslist->num >= uslist->alloc) {
185     size = uslist->alloc * 2;
186     p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
187     CHECK_NULL_RETURN_MEMERR(p);
188     uslist->alloc = size;
189     uslist->us    = p;
190   }
191 
192   uslist->us[uslist->num].offset = offset;
193   uslist->us[uslist->num].target = node;
194   uslist->num++;
195   return 0;
196 }
197 #endif /* USE_SUBEXP_CALL */
198 
199 
200 static int
add_opcode(regex_t * reg,int opcode)201 add_opcode(regex_t* reg, int opcode)
202 {
203   BBUF_ADD1(reg, opcode);
204   return 0;
205 }
206 
207 #ifdef USE_COMBINATION_EXPLOSION_CHECK
208 static int
add_state_check_num(regex_t * reg,int num)209 add_state_check_num(regex_t* reg, int num)
210 {
211   StateCheckNumType n = (StateCheckNumType )num;
212 
213   BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
214   return 0;
215 }
216 #endif
217 
218 static int
add_rel_addr(regex_t * reg,int addr)219 add_rel_addr(regex_t* reg, int addr)
220 {
221   RelAddrType ra = (RelAddrType )addr;
222 
223   BBUF_ADD(reg, &ra, SIZE_RELADDR);
224   return 0;
225 }
226 
227 static int
add_abs_addr(regex_t * reg,int addr)228 add_abs_addr(regex_t* reg, int addr)
229 {
230   AbsAddrType ra = (AbsAddrType )addr;
231 
232   BBUF_ADD(reg, &ra, SIZE_ABSADDR);
233   return 0;
234 }
235 
236 static int
add_length(regex_t * reg,int len)237 add_length(regex_t* reg, int len)
238 {
239   LengthType l = (LengthType )len;
240 
241   BBUF_ADD(reg, &l, SIZE_LENGTH);
242   return 0;
243 }
244 
245 static int
add_mem_num(regex_t * reg,int num)246 add_mem_num(regex_t* reg, int num)
247 {
248   MemNumType n = (MemNumType )num;
249 
250   BBUF_ADD(reg, &n, SIZE_MEMNUM);
251   return 0;
252 }
253 
254 static int
add_pointer(regex_t * reg,void * addr)255 add_pointer(regex_t* reg, void* addr)
256 {
257   PointerType ptr = (PointerType )addr;
258 
259   BBUF_ADD(reg, &ptr, SIZE_POINTER);
260   return 0;
261 }
262 
263 static int
add_option(regex_t * reg,OnigOptionType option)264 add_option(regex_t* reg, OnigOptionType option)
265 {
266   BBUF_ADD(reg, &option, SIZE_OPTION);
267   return 0;
268 }
269 
270 static int
add_opcode_rel_addr(regex_t * reg,int opcode,int addr)271 add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
272 {
273   int r;
274 
275   r = add_opcode(reg, opcode);
276   if (r) return r;
277   r = add_rel_addr(reg, addr);
278   return r;
279 }
280 
281 static int
add_bytes(regex_t * reg,UChar * bytes,int len)282 add_bytes(regex_t* reg, UChar* bytes, int len)
283 {
284   BBUF_ADD(reg, bytes, len);
285   return 0;
286 }
287 
288 static int
add_bitset(regex_t * reg,BitSetRef bs)289 add_bitset(regex_t* reg, BitSetRef bs)
290 {
291   BBUF_ADD(reg, bs, SIZE_BITSET);
292   return 0;
293 }
294 
295 static int
add_opcode_option(regex_t * reg,int opcode,OnigOptionType option)296 add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
297 {
298   int r;
299 
300   r = add_opcode(reg, opcode);
301   if (r) return r;
302   r = add_option(reg, option);
303   return r;
304 }
305 
306 static int compile_length_tree(Node* node, regex_t* reg);
307 static int compile_tree(Node* node, regex_t* reg);
308 
309 
310 #define IS_NEED_STR_LEN_OP_EXACT(op) \
311    ((op) == OP_EXACTN    || (op) == OP_EXACTMB2N ||\
312     (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN  || (op) == OP_EXACTN_IC)
313 
314 static int
select_str_opcode(int mb_len,int str_len,int ignore_case)315 select_str_opcode(int mb_len, int str_len, int ignore_case)
316 {
317   int op;
318 
319   if (ignore_case) {
320     switch (str_len) {
321     case 1:  op = OP_EXACT1_IC; break;
322     default: op = OP_EXACTN_IC; break;
323     }
324   }
325   else {
326     switch (mb_len) {
327     case 1:
328       switch (str_len) {
329       case 1:  op = OP_EXACT1; break;
330       case 2:  op = OP_EXACT2; break;
331       case 3:  op = OP_EXACT3; break;
332       case 4:  op = OP_EXACT4; break;
333       case 5:  op = OP_EXACT5; break;
334       default: op = OP_EXACTN; break;
335       }
336       break;
337 
338     case 2:
339       switch (str_len) {
340       case 1:  op = OP_EXACTMB2N1; break;
341       case 2:  op = OP_EXACTMB2N2; break;
342       case 3:  op = OP_EXACTMB2N3; break;
343       default: op = OP_EXACTMB2N;  break;
344       }
345       break;
346 
347     case 3:
348       op = OP_EXACTMB3N;
349       break;
350 
351     default:
352       op = OP_EXACTMBN;
353       break;
354     }
355   }
356   return op;
357 }
358 
359 static int
compile_tree_empty_check(Node * node,regex_t * reg,int empty_info)360 compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
361 {
362   int r;
363   int saved_num_null_check = reg->num_null_check;
364 
365   if (empty_info != 0) {
366     r = add_opcode(reg, OP_NULL_CHECK_START);
367     if (r) return r;
368     r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
369     if (r) return r;
370     reg->num_null_check++;
371   }
372 
373   r = compile_tree(node, reg);
374   if (r) return r;
375 
376   if (empty_info != 0) {
377     if (empty_info == NQ_TARGET_IS_EMPTY)
378       r = add_opcode(reg, OP_NULL_CHECK_END);
379     else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
380       r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
381     else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
382       r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
383 
384     if (r) return r;
385     r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
386   }
387   return r;
388 }
389 
390 #ifdef USE_SUBEXP_CALL
391 static int
compile_call(CallNode * node,regex_t * reg)392 compile_call(CallNode* node, regex_t* reg)
393 {
394   int r;
395 
396   r = add_opcode(reg, OP_CALL);
397   if (r) return r;
398   r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
399                           node->target);
400   if (r) return r;
401   r = add_abs_addr(reg, 0 /*dummy addr.*/);
402   return r;
403 }
404 #endif
405 
406 static int
compile_tree_n_times(Node * node,int n,regex_t * reg)407 compile_tree_n_times(Node* node, int n, regex_t* reg)
408 {
409   int i, r;
410 
411   for (i = 0; i < n; i++) {
412     r = compile_tree(node, reg);
413     if (r) return r;
414   }
415   return 0;
416 }
417 
418 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED,int ignore_case)419 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
420                           regex_t* reg ARG_UNUSED, int ignore_case)
421 {
422   int len;
423   int op = select_str_opcode(mb_len, str_len, ignore_case);
424 
425   len = SIZE_OPCODE;
426 
427   if (op == OP_EXACTMBN)  len += SIZE_LENGTH;
428   if (IS_NEED_STR_LEN_OP_EXACT(op))
429     len += SIZE_LENGTH;
430 
431   len += mb_len * str_len;
432   return len;
433 }
434 
435 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg,int ignore_case)436 add_compile_string(UChar* s, int mb_len, int str_len,
437                    regex_t* reg, int ignore_case)
438 {
439   int op = select_str_opcode(mb_len, str_len, ignore_case);
440   add_opcode(reg, op);
441 
442   if (op == OP_EXACTMBN)
443     add_length(reg, mb_len);
444 
445   if (IS_NEED_STR_LEN_OP_EXACT(op)) {
446     if (op == OP_EXACTN_IC)
447       add_length(reg, mb_len * str_len);
448     else
449       add_length(reg, str_len);
450   }
451 
452   add_bytes(reg, s, mb_len * str_len);
453   return 0;
454 }
455 
456 
457 static int
compile_length_string_node(Node * node,regex_t * reg)458 compile_length_string_node(Node* node, regex_t* reg)
459 {
460   int rlen, r, len, prev_len, slen, ambig;
461   OnigEncoding enc = reg->enc;
462   UChar *p, *prev;
463   StrNode* sn;
464 
465   sn = NSTR(node);
466   if (sn->end <= sn->s)
467     return 0;
468 
469   ambig = NSTRING_IS_AMBIG(node);
470 
471   p = prev = sn->s;
472   SAFE_ENC_LEN(enc, p, sn->end, prev_len);
473   p += prev_len;
474   slen = 1;
475   rlen = 0;
476 
477   for (; p < sn->end; ) {
478     SAFE_ENC_LEN(enc, p, sn->end, len);
479     if (len == prev_len) {
480       slen++;
481     }
482     else {
483       r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
484       rlen += r;
485       prev = p;
486       slen = 1;
487       prev_len = len;
488     }
489     p += len;
490   }
491   r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
492   rlen += r;
493   return rlen;
494 }
495 
496 static int
compile_length_string_raw_node(StrNode * sn,regex_t * reg)497 compile_length_string_raw_node(StrNode* sn, regex_t* reg)
498 {
499   if (sn->end <= sn->s)
500     return 0;
501 
502   return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
503 }
504 
505 static int
compile_string_node(Node * node,regex_t * reg)506 compile_string_node(Node* node, regex_t* reg)
507 {
508   int r, len, prev_len, slen, ambig;
509   OnigEncoding enc = reg->enc;
510   UChar *p, *prev, *end;
511   StrNode* sn;
512 
513   sn = NSTR(node);
514   if (sn->end <= sn->s)
515     return 0;
516 
517   end = sn->end;
518   ambig = NSTRING_IS_AMBIG(node);
519 
520   p = prev = sn->s;
521   SAFE_ENC_LEN(enc, p, end, prev_len);
522   p += prev_len;
523   slen = 1;
524 
525   for (; p < end; ) {
526     SAFE_ENC_LEN(enc, p, end, len);
527     if (len == prev_len) {
528       slen++;
529     }
530     else {
531       r = add_compile_string(prev, prev_len, slen, reg, ambig);
532       if (r) return r;
533 
534       prev  = p;
535       slen  = 1;
536       prev_len = len;
537     }
538 
539     p += len;
540   }
541   return add_compile_string(prev, prev_len, slen, reg, ambig);
542 }
543 
544 static int
compile_string_raw_node(StrNode * sn,regex_t * reg)545 compile_string_raw_node(StrNode* sn, regex_t* reg)
546 {
547   if (sn->end <= sn->s)
548     return 0;
549 
550   return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
551 }
552 
553 static int
add_multi_byte_cclass(BBuf * mbuf,regex_t * reg)554 add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
555 {
556 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
557   add_length(reg, mbuf->used);
558   return add_bytes(reg, mbuf->p, mbuf->used);
559 #else
560   int r, pad_size;
561   UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
562 
563   GET_ALIGNMENT_PAD_SIZE(p, pad_size);
564   add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
565   if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
566 
567   r = add_bytes(reg, mbuf->p, mbuf->used);
568 
569   /* padding for return value from compile_length_cclass_node() to be fix. */
570   pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
571   if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
572   return r;
573 #endif
574 }
575 
576 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)577 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
578 {
579   int len;
580 
581   if (IS_NCCLASS_SHARE(cc)) {
582     len = SIZE_OPCODE + SIZE_POINTER;
583     return len;
584   }
585 
586   if (IS_NULL(cc->mbuf)) {
587     len = SIZE_OPCODE + SIZE_BITSET;
588   }
589   else {
590     if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
591       len = SIZE_OPCODE;
592     }
593     else {
594       len = SIZE_OPCODE + SIZE_BITSET;
595     }
596 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
597     len += SIZE_LENGTH + cc->mbuf->used;
598 #else
599     len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
600 #endif
601   }
602 
603   return len;
604 }
605 
606 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)607 compile_cclass_node(CClassNode* cc, regex_t* reg)
608 {
609   int r;
610 
611   if (IS_NCCLASS_SHARE(cc)) {
612     add_opcode(reg, OP_CCLASS_NODE);
613     r = add_pointer(reg, cc);
614     return r;
615   }
616 
617   if (IS_NULL(cc->mbuf)) {
618     if (IS_NCCLASS_NOT(cc))
619       add_opcode(reg, OP_CCLASS_NOT);
620     else
621       add_opcode(reg, OP_CCLASS);
622 
623     r = add_bitset(reg, cc->bs);
624   }
625   else {
626     if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
627       if (IS_NCCLASS_NOT(cc))
628         add_opcode(reg, OP_CCLASS_MB_NOT);
629       else
630         add_opcode(reg, OP_CCLASS_MB);
631 
632       r = add_multi_byte_cclass(cc->mbuf, reg);
633     }
634     else {
635       if (IS_NCCLASS_NOT(cc))
636         add_opcode(reg, OP_CCLASS_MIX_NOT);
637       else
638         add_opcode(reg, OP_CCLASS_MIX);
639 
640       r = add_bitset(reg, cc->bs);
641       if (r) return r;
642       r = add_multi_byte_cclass(cc->mbuf, reg);
643     }
644   }
645 
646   return r;
647 }
648 
649 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper)650 entry_repeat_range(regex_t* reg, int id, int lower, int upper)
651 {
652 #define REPEAT_RANGE_ALLOC  4
653 
654   OnigRepeatRange* p;
655 
656   if (reg->repeat_range_alloc == 0) {
657     p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
658     CHECK_NULL_RETURN_MEMERR(p);
659     reg->repeat_range = p;
660     reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
661   }
662   else if (reg->repeat_range_alloc <= id) {
663     int n;
664     n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
665     p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
666                                     sizeof(OnigRepeatRange) * n);
667     CHECK_NULL_RETURN_MEMERR(p);
668     reg->repeat_range = p;
669     reg->repeat_range_alloc = n;
670   }
671   else {
672     p = reg->repeat_range;
673   }
674 
675   p[id].lower = lower;
676   p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
677   return 0;
678 }
679 
680 static int
compile_range_repeat_node(QtfrNode * qn,int target_len,int empty_info,regex_t * reg)681 compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
682                           regex_t* reg)
683 {
684   int r;
685   int num_repeat = reg->num_repeat;
686 
687   r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
688   if (r) return r;
689   r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
690   reg->num_repeat++;
691   if (r) return r;
692   r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
693   if (r) return r;
694 
695   r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
696   if (r) return r;
697 
698   r = compile_tree_empty_check(qn->target, reg, empty_info);
699   if (r) return r;
700 
701   if (
702 #ifdef USE_SUBEXP_CALL
703       reg->num_call > 0 ||
704 #endif
705       IS_QUANTIFIER_IN_REPEAT(qn)) {
706     r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
707   }
708   else {
709     r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
710   }
711   if (r) return r;
712   r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
713   return r;
714 }
715 
716 static int
is_anychar_star_quantifier(QtfrNode * qn)717 is_anychar_star_quantifier(QtfrNode* qn)
718 {
719   if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
720       NTYPE(qn->target) == NT_CANY)
721     return 1;
722   else
723     return 0;
724 }
725 
726 #define QUANTIFIER_EXPAND_LIMIT_SIZE   50
727 #define CKN_ON   (ckn > 0)
728 
729 #ifdef USE_COMBINATION_EXPLOSION_CHECK
730 
731 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)732 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
733 {
734   int len, mod_tlen, cklen;
735   int ckn;
736   int infinite = IS_REPEAT_INFINITE(qn->upper);
737   int empty_info = qn->target_empty_info;
738   int tlen = compile_length_tree(qn->target, reg);
739 
740   if (tlen < 0) return tlen;
741 
742   ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
743 
744   cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
745 
746   /* anychar repeat */
747   if (NTYPE(qn->target) == NT_CANY) {
748     if (qn->greedy && infinite) {
749       if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
750         return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
751       else
752         return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
753     }
754   }
755 
756   if (empty_info != 0)
757     mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
758   else
759     mod_tlen = tlen;
760 
761   if (infinite && qn->lower <= 1) {
762     if (qn->greedy) {
763       if (qn->lower == 1)
764 	len = SIZE_OP_JUMP;
765       else
766 	len = 0;
767 
768       len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
769     }
770     else {
771       if (qn->lower == 0)
772 	len = SIZE_OP_JUMP;
773       else
774 	len = 0;
775 
776       len += mod_tlen + SIZE_OP_PUSH + cklen;
777     }
778   }
779   else if (qn->upper == 0) {
780     if (qn->is_refered != 0) /* /(?<n>..){0}/ */
781       len = SIZE_OP_JUMP + tlen;
782     else
783       len = 0;
784   }
785   else if (qn->upper == 1 && qn->greedy) {
786     if (qn->lower == 0) {
787       if (CKN_ON) {
788 	len = SIZE_OP_STATE_CHECK_PUSH + tlen;
789       }
790       else {
791 	len = SIZE_OP_PUSH + tlen;
792       }
793     }
794     else {
795       len = tlen;
796     }
797   }
798   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
799     len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
800   }
801   else {
802     len = SIZE_OP_REPEAT_INC
803         + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
804     if (CKN_ON)
805       len += SIZE_OP_STATE_CHECK;
806   }
807 
808   return len;
809 }
810 
811 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)812 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
813 {
814   int r, mod_tlen;
815   int ckn;
816   int infinite = IS_REPEAT_INFINITE(qn->upper);
817   int empty_info = qn->target_empty_info;
818   int tlen = compile_length_tree(qn->target, reg);
819 
820   if (tlen < 0) return tlen;
821 
822   ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
823 
824   if (is_anychar_star_quantifier(qn)) {
825     r = compile_tree_n_times(qn->target, qn->lower, reg);
826     if (r) return r;
827     if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
828       if (IS_MULTILINE(reg->options))
829 	r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
830       else
831 	r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
832       if (r) return r;
833       if (CKN_ON) {
834 	r = add_state_check_num(reg, ckn);
835 	if (r) return r;
836       }
837 
838       return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
839     }
840     else {
841       if (IS_MULTILINE(reg->options)) {
842 	r = add_opcode(reg, (CKN_ON ?
843 			       OP_STATE_CHECK_ANYCHAR_ML_STAR
844 			     : OP_ANYCHAR_ML_STAR));
845       }
846       else {
847 	r = add_opcode(reg, (CKN_ON ?
848 			       OP_STATE_CHECK_ANYCHAR_STAR
849 			     : OP_ANYCHAR_STAR));
850       }
851       if (r) return r;
852       if (CKN_ON)
853 	r = add_state_check_num(reg, ckn);
854 
855       return r;
856     }
857   }
858 
859   if (empty_info != 0)
860     mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
861   else
862     mod_tlen = tlen;
863 
864   if (infinite && qn->lower <= 1) {
865     if (qn->greedy) {
866       if (qn->lower == 1) {
867 	r = add_opcode_rel_addr(reg, OP_JUMP,
868 			(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
869 	if (r) return r;
870       }
871 
872       if (CKN_ON) {
873 	r = add_opcode(reg, OP_STATE_CHECK_PUSH);
874 	if (r) return r;
875 	r = add_state_check_num(reg, ckn);
876 	if (r) return r;
877 	r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
878       }
879       else {
880 	r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
881       }
882       if (r) return r;
883       r = compile_tree_empty_check(qn->target, reg, empty_info);
884       if (r) return r;
885       r = add_opcode_rel_addr(reg, OP_JUMP,
886 	      -(mod_tlen + (int )SIZE_OP_JUMP
887 		+ (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
888     }
889     else {
890       if (qn->lower == 0) {
891 	r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
892 	if (r) return r;
893       }
894       r = compile_tree_empty_check(qn->target, reg, empty_info);
895       if (r) return r;
896       if (CKN_ON) {
897 	r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
898 	if (r) return r;
899 	r = add_state_check_num(reg, ckn);
900 	if (r) return r;
901 	r = add_rel_addr(reg,
902 		 -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
903       }
904       else
905 	r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
906     }
907   }
908   else if (qn->upper == 0) {
909     if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
910       r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
911       if (r) return r;
912       r = compile_tree(qn->target, reg);
913     }
914     else
915       r = 0;
916   }
917   else if (qn->upper == 1 && qn->greedy) {
918     if (qn->lower == 0) {
919       if (CKN_ON) {
920 	r = add_opcode(reg, OP_STATE_CHECK_PUSH);
921 	if (r) return r;
922 	r = add_state_check_num(reg, ckn);
923 	if (r) return r;
924 	r = add_rel_addr(reg, tlen);
925       }
926       else {
927 	r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
928       }
929       if (r) return r;
930     }
931 
932     r = compile_tree(qn->target, reg);
933   }
934   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
935     if (CKN_ON) {
936       r = add_opcode(reg, OP_STATE_CHECK_PUSH);
937       if (r) return r;
938       r = add_state_check_num(reg, ckn);
939       if (r) return r;
940       r = add_rel_addr(reg, SIZE_OP_JUMP);
941     }
942     else {
943       r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
944     }
945 
946     if (r) return r;
947     r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
948     if (r) return r;
949     r = compile_tree(qn->target, reg);
950   }
951   else {
952     r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
953     if (CKN_ON) {
954       if (r) return r;
955       r = add_opcode(reg, OP_STATE_CHECK);
956       if (r) return r;
957       r = add_state_check_num(reg, ckn);
958     }
959   }
960   return r;
961 }
962 
963 #else /* USE_COMBINATION_EXPLOSION_CHECK */
964 
965 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)966 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
967 {
968   int len, mod_tlen;
969   int infinite = IS_REPEAT_INFINITE(qn->upper);
970   int empty_info = qn->target_empty_info;
971   int tlen = compile_length_tree(qn->target, reg);
972 
973   if (tlen < 0) return tlen;
974 
975   /* anychar repeat */
976   if (NTYPE(qn->target) == NT_CANY) {
977     if (qn->greedy && infinite) {
978       if (IS_NOT_NULL(qn->next_head_exact))
979         return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
980       else
981         return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
982     }
983   }
984 
985   if (empty_info != 0)
986     mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
987   else
988     mod_tlen = tlen;
989 
990   if (infinite &&
991       (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
992     if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
993       len = SIZE_OP_JUMP;
994     }
995     else {
996       len = tlen * qn->lower;
997     }
998 
999     if (qn->greedy) {
1000       if (IS_NOT_NULL(qn->head_exact))
1001 	len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1002       else if (IS_NOT_NULL(qn->next_head_exact))
1003 	len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1004       else
1005 	len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1006     }
1007     else
1008       len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1009   }
1010   else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1011     len = SIZE_OP_JUMP + tlen;
1012   }
1013   else if (!infinite && qn->greedy &&
1014            (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1015                                       <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1016     len = tlen * qn->lower;
1017     len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1018   }
1019   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1020     len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1021   }
1022   else {
1023     len = SIZE_OP_REPEAT_INC
1024         + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1025   }
1026 
1027   return len;
1028 }
1029 
1030 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)1031 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
1032 {
1033   int i, r, mod_tlen;
1034   int infinite = IS_REPEAT_INFINITE(qn->upper);
1035   int empty_info = qn->target_empty_info;
1036   int tlen = compile_length_tree(qn->target, reg);
1037 
1038   if (tlen < 0) return tlen;
1039 
1040   if (is_anychar_star_quantifier(qn)) {
1041     r = compile_tree_n_times(qn->target, qn->lower, reg);
1042     if (r) return r;
1043     if (IS_NOT_NULL(qn->next_head_exact)) {
1044       if (IS_MULTILINE(reg->options))
1045 	r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1046       else
1047 	r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1048       if (r) return r;
1049       return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1050     }
1051     else {
1052       if (IS_MULTILINE(reg->options))
1053 	return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1054       else
1055 	return add_opcode(reg, OP_ANYCHAR_STAR);
1056     }
1057   }
1058 
1059   if (empty_info != 0)
1060     mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1061   else
1062     mod_tlen = tlen;
1063 
1064   if (infinite &&
1065       (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1066     if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1067       if (qn->greedy) {
1068 	if (IS_NOT_NULL(qn->head_exact))
1069 	  r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1070 	else if (IS_NOT_NULL(qn->next_head_exact))
1071 	  r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1072 	else
1073 	  r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1074       }
1075       else {
1076 	r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1077       }
1078       if (r) return r;
1079     }
1080     else {
1081       r = compile_tree_n_times(qn->target, qn->lower, reg);
1082       if (r) return r;
1083     }
1084 
1085     if (qn->greedy) {
1086       if (IS_NOT_NULL(qn->head_exact)) {
1087 	r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1088 			     mod_tlen + SIZE_OP_JUMP);
1089 	if (r) return r;
1090 	add_bytes(reg, NSTR(qn->head_exact)->s, 1);
1091 	r = compile_tree_empty_check(qn->target, reg, empty_info);
1092 	if (r) return r;
1093 	r = add_opcode_rel_addr(reg, OP_JUMP,
1094 	-(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1095       }
1096       else if (IS_NOT_NULL(qn->next_head_exact)) {
1097 	r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1098 				mod_tlen + SIZE_OP_JUMP);
1099 	if (r) return r;
1100 	add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1101 	r = compile_tree_empty_check(qn->target, reg, empty_info);
1102 	if (r) return r;
1103 	r = add_opcode_rel_addr(reg, OP_JUMP,
1104           -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1105       }
1106       else {
1107 	r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1108 	if (r) return r;
1109 	r = compile_tree_empty_check(qn->target, reg, empty_info);
1110 	if (r) return r;
1111 	r = add_opcode_rel_addr(reg, OP_JUMP,
1112 		     -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1113       }
1114     }
1115     else {
1116       r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1117       if (r) return r;
1118       r = compile_tree_empty_check(qn->target, reg, empty_info);
1119       if (r) return r;
1120       r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1121     }
1122   }
1123   else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1124     r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1125     if (r) return r;
1126     r = compile_tree(qn->target, reg);
1127   }
1128   else if (!infinite && qn->greedy &&
1129            (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1130                                   <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1131     int n = qn->upper - qn->lower;
1132 
1133     r = compile_tree_n_times(qn->target, qn->lower, reg);
1134     if (r) return r;
1135 
1136     for (i = 0; i < n; i++) {
1137       r = add_opcode_rel_addr(reg, OP_PUSH,
1138 			   (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1139       if (r) return r;
1140       r = compile_tree(qn->target, reg);
1141       if (r) return r;
1142     }
1143   }
1144   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1145     r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1146     if (r) return r;
1147     r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1148     if (r) return r;
1149     r = compile_tree(qn->target, reg);
1150   }
1151   else {
1152     r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1153   }
1154   return r;
1155 }
1156 #endif /* USE_COMBINATION_EXPLOSION_CHECK */
1157 
1158 static int
compile_length_option_node(EncloseNode * node,regex_t * reg)1159 compile_length_option_node(EncloseNode* node, regex_t* reg)
1160 {
1161   int tlen;
1162   OnigOptionType prev = reg->options;
1163 
1164   reg->options = node->option;
1165   tlen = compile_length_tree(node->target, reg);
1166   reg->options = prev;
1167 
1168   if (tlen < 0) return tlen;
1169 
1170   if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1171     return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1172            + tlen + SIZE_OP_SET_OPTION;
1173   }
1174   else
1175     return tlen;
1176 }
1177 
1178 static int
compile_option_node(EncloseNode * node,regex_t * reg)1179 compile_option_node(EncloseNode* node, regex_t* reg)
1180 {
1181   int r;
1182   OnigOptionType prev = reg->options;
1183 
1184   if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1185     r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1186     if (r) return r;
1187     r = add_opcode_option(reg, OP_SET_OPTION, prev);
1188     if (r) return r;
1189     r = add_opcode(reg, OP_FAIL);
1190     if (r) return r;
1191   }
1192 
1193   reg->options = node->option;
1194   r = compile_tree(node->target, reg);
1195   reg->options = prev;
1196 
1197   if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1198     if (r) return r;
1199     r = add_opcode_option(reg, OP_SET_OPTION, prev);
1200   }
1201   return r;
1202 }
1203 
1204 static int
compile_length_enclose_node(EncloseNode * node,regex_t * reg)1205 compile_length_enclose_node(EncloseNode* node, regex_t* reg)
1206 {
1207   int len;
1208   int tlen;
1209 
1210   if (node->type == ENCLOSE_OPTION)
1211     return compile_length_option_node(node, reg);
1212 
1213   if (node->target) {
1214     tlen = compile_length_tree(node->target, reg);
1215     if (tlen < 0) return tlen;
1216   }
1217   else
1218     tlen = 0;
1219 
1220   switch (node->type) {
1221   case ENCLOSE_MEMORY:
1222 #ifdef USE_SUBEXP_CALL
1223     if (IS_ENCLOSE_CALLED(node)) {
1224       len = SIZE_OP_MEMORY_START_PUSH + tlen
1225 	  + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1226       if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1227 	len += (IS_ENCLOSE_RECURSION(node)
1228 		? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1229       else
1230 	len += (IS_ENCLOSE_RECURSION(node)
1231 		? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1232     }
1233     else
1234 #endif
1235     {
1236       if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1237 	len = SIZE_OP_MEMORY_START_PUSH;
1238       else
1239 	len = SIZE_OP_MEMORY_START;
1240 
1241       len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1242 		     ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1243     }
1244     break;
1245 
1246   case ENCLOSE_STOP_BACKTRACK:
1247     if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1248       QtfrNode* qn = NQTFR(node->target);
1249       tlen = compile_length_tree(qn->target, reg);
1250       if (tlen < 0) return tlen;
1251 
1252       len = tlen * qn->lower
1253 	  + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1254     }
1255     else {
1256       len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1257     }
1258     break;
1259 
1260   default:
1261     return ONIGERR_TYPE_BUG;
1262     break;
1263   }
1264 
1265   return len;
1266 }
1267 
1268 static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1269 
1270 static int
compile_enclose_node(EncloseNode * node,regex_t * reg)1271 compile_enclose_node(EncloseNode* node, regex_t* reg)
1272 {
1273   int r, len;
1274 
1275   if (node->type == ENCLOSE_OPTION)
1276     return compile_option_node(node, reg);
1277 
1278   switch (node->type) {
1279   case ENCLOSE_MEMORY:
1280 #ifdef USE_SUBEXP_CALL
1281     if (IS_ENCLOSE_CALLED(node)) {
1282       r = add_opcode(reg, OP_CALL);
1283       if (r) return r;
1284       node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1285       node->state |= NST_ADDR_FIXED;
1286       r = add_abs_addr(reg, (int )node->call_addr);
1287       if (r) return r;
1288       len = compile_length_tree(node->target, reg);
1289       len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1290       if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1291 	len += (IS_ENCLOSE_RECURSION(node)
1292 		? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1293       else
1294 	len += (IS_ENCLOSE_RECURSION(node)
1295 		? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1296 
1297       r = add_opcode_rel_addr(reg, OP_JUMP, len);
1298       if (r) return r;
1299     }
1300 #endif
1301     if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1302       r = add_opcode(reg, OP_MEMORY_START_PUSH);
1303     else
1304       r = add_opcode(reg, OP_MEMORY_START);
1305     if (r) return r;
1306     r = add_mem_num(reg, node->regnum);
1307     if (r) return r;
1308     r = compile_tree(node->target, reg);
1309     if (r) return r;
1310 #ifdef USE_SUBEXP_CALL
1311     if (IS_ENCLOSE_CALLED(node)) {
1312       if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1313 	r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1314 			     ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1315       else
1316 	r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1317 			     ? OP_MEMORY_END_REC : OP_MEMORY_END));
1318 
1319       if (r) return r;
1320       r = add_mem_num(reg, node->regnum);
1321       if (r) return r;
1322       r = add_opcode(reg, OP_RETURN);
1323     }
1324     else
1325 #endif
1326     {
1327       if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1328 	r = add_opcode(reg, OP_MEMORY_END_PUSH);
1329       else
1330 	r = add_opcode(reg, OP_MEMORY_END);
1331       if (r) return r;
1332       r = add_mem_num(reg, node->regnum);
1333     }
1334     break;
1335 
1336   case ENCLOSE_STOP_BACKTRACK:
1337     if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1338       QtfrNode* qn = NQTFR(node->target);
1339       r = compile_tree_n_times(qn->target, qn->lower, reg);
1340       if (r) return r;
1341 
1342       len = compile_length_tree(qn->target, reg);
1343       if (len < 0) return len;
1344 
1345       r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1346       if (r) return r;
1347       r = compile_tree(qn->target, reg);
1348       if (r) return r;
1349       r = add_opcode(reg, OP_POP);
1350       if (r) return r;
1351       r = add_opcode_rel_addr(reg, OP_JUMP,
1352 	 -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1353     }
1354     else {
1355       r = add_opcode(reg, OP_PUSH_STOP_BT);
1356       if (r) return r;
1357       r = compile_tree(node->target, reg);
1358       if (r) return r;
1359       r = add_opcode(reg, OP_POP_STOP_BT);
1360     }
1361     break;
1362 
1363   default:
1364     return ONIGERR_TYPE_BUG;
1365     break;
1366   }
1367 
1368   return r;
1369 }
1370 
1371 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1372 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1373 {
1374   int len;
1375   int tlen = 0;
1376 
1377   if (node->target) {
1378     tlen = compile_length_tree(node->target, reg);
1379     if (tlen < 0) return tlen;
1380   }
1381 
1382   switch (node->type) {
1383   case ANCHOR_PREC_READ:
1384     len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1385     break;
1386   case ANCHOR_PREC_READ_NOT:
1387     len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1388     break;
1389   case ANCHOR_LOOK_BEHIND:
1390     len = SIZE_OP_LOOK_BEHIND + tlen;
1391     break;
1392   case ANCHOR_LOOK_BEHIND_NOT:
1393     len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1394     break;
1395 
1396   default:
1397     len = SIZE_OPCODE;
1398     break;
1399   }
1400 
1401   return len;
1402 }
1403 
1404 static int
compile_anchor_node(AnchorNode * node,regex_t * reg)1405 compile_anchor_node(AnchorNode* node, regex_t* reg)
1406 {
1407   int r, len;
1408 
1409   switch (node->type) {
1410   case ANCHOR_BEGIN_BUF:      r = add_opcode(reg, OP_BEGIN_BUF);      break;
1411   case ANCHOR_END_BUF:        r = add_opcode(reg, OP_END_BUF);        break;
1412   case ANCHOR_BEGIN_LINE:     r = add_opcode(reg, OP_BEGIN_LINE);     break;
1413   case ANCHOR_END_LINE:       r = add_opcode(reg, OP_END_LINE);       break;
1414   case ANCHOR_SEMI_END_BUF:   r = add_opcode(reg, OP_SEMI_END_BUF);   break;
1415   case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1416 
1417   case ANCHOR_WORD_BOUND:     r = add_opcode(reg, OP_WORD_BOUND);     break;
1418   case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break;
1419 #ifdef USE_WORD_BEGIN_END
1420   case ANCHOR_WORD_BEGIN:     r = add_opcode(reg, OP_WORD_BEGIN);     break;
1421   case ANCHOR_WORD_END:       r = add_opcode(reg, OP_WORD_END);       break;
1422 #endif
1423 
1424   case ANCHOR_PREC_READ:
1425     r = add_opcode(reg, OP_PUSH_POS);
1426     if (r) return r;
1427     r = compile_tree(node->target, reg);
1428     if (r) return r;
1429     r = add_opcode(reg, OP_POP_POS);
1430     break;
1431 
1432   case ANCHOR_PREC_READ_NOT:
1433     len = compile_length_tree(node->target, reg);
1434     if (len < 0) return len;
1435     r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1436     if (r) return r;
1437     r = compile_tree(node->target, reg);
1438     if (r) return r;
1439     r = add_opcode(reg, OP_FAIL_POS);
1440     break;
1441 
1442   case ANCHOR_LOOK_BEHIND:
1443     {
1444       int n;
1445       r = add_opcode(reg, OP_LOOK_BEHIND);
1446       if (r) return r;
1447       if (node->char_len < 0) {
1448 	r = get_char_length_tree(node->target, reg, &n);
1449 	if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1450       }
1451       else
1452 	n = node->char_len;
1453       r = add_length(reg, n);
1454       if (r) return r;
1455       r = compile_tree(node->target, reg);
1456     }
1457     break;
1458 
1459   case ANCHOR_LOOK_BEHIND_NOT:
1460     {
1461       int n;
1462       len = compile_length_tree(node->target, reg);
1463       r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1464 			   len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1465       if (r) return r;
1466       if (node->char_len < 0) {
1467 	r = get_char_length_tree(node->target, reg, &n);
1468 	if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1469       }
1470       else
1471 	n = node->char_len;
1472       r = add_length(reg, n);
1473       if (r) return r;
1474       r = compile_tree(node->target, reg);
1475       if (r) return r;
1476       r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1477     }
1478     break;
1479 
1480   default:
1481     return ONIGERR_TYPE_BUG;
1482     break;
1483   }
1484 
1485   return r;
1486 }
1487 
1488 static int
compile_length_tree(Node * node,regex_t * reg)1489 compile_length_tree(Node* node, regex_t* reg)
1490 {
1491   int len, type, r;
1492 
1493   type = NTYPE(node);
1494   switch (type) {
1495   case NT_LIST:
1496     len = 0;
1497     do {
1498       r = compile_length_tree(NCAR(node), reg);
1499       if (r < 0) return r;
1500       len += r;
1501     } while (IS_NOT_NULL(node = NCDR(node)));
1502     r = len;
1503     break;
1504 
1505   case NT_ALT:
1506     {
1507       int n;
1508 
1509       n = r = 0;
1510       do {
1511 	r += compile_length_tree(NCAR(node), reg);
1512 	n++;
1513       } while (IS_NOT_NULL(node = NCDR(node)));
1514       r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1515     }
1516     break;
1517 
1518   case NT_STR:
1519     if (NSTRING_IS_RAW(node))
1520       r = compile_length_string_raw_node(NSTR(node), reg);
1521     else
1522       r = compile_length_string_node(node, reg);
1523     break;
1524 
1525   case NT_CCLASS:
1526     r = compile_length_cclass_node(NCCLASS(node), reg);
1527     break;
1528 
1529   case NT_CTYPE:
1530   case NT_CANY:
1531     r = SIZE_OPCODE;
1532     break;
1533 
1534   case NT_BREF:
1535     {
1536       BRefNode* br = NBREF(node);
1537 
1538 #ifdef USE_BACKREF_WITH_LEVEL
1539       if (IS_BACKREF_NEST_LEVEL(br)) {
1540         r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1541             SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1542       }
1543       else
1544 #endif
1545       if (br->back_num == 1) {
1546 	r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1547 	     ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1548       }
1549       else {
1550 	r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1551       }
1552     }
1553     break;
1554 
1555 #ifdef USE_SUBEXP_CALL
1556   case NT_CALL:
1557     r = SIZE_OP_CALL;
1558     break;
1559 #endif
1560 
1561   case NT_QTFR:
1562     r = compile_length_quantifier_node(NQTFR(node), reg);
1563     break;
1564 
1565   case NT_ENCLOSE:
1566     r = compile_length_enclose_node(NENCLOSE(node), reg);
1567     break;
1568 
1569   case NT_ANCHOR:
1570     r = compile_length_anchor_node(NANCHOR(node), reg);
1571     break;
1572 
1573   default:
1574     return ONIGERR_TYPE_BUG;
1575     break;
1576   }
1577 
1578   return r;
1579 }
1580 
1581 static int
compile_tree(Node * node,regex_t * reg)1582 compile_tree(Node* node, regex_t* reg)
1583 {
1584   int n, type, len, pos, r = 0;
1585 
1586   type = NTYPE(node);
1587   switch (type) {
1588   case NT_LIST:
1589     do {
1590       r = compile_tree(NCAR(node), reg);
1591     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1592     break;
1593 
1594   case NT_ALT:
1595     {
1596       Node* x = node;
1597       len = 0;
1598       do {
1599 	len += compile_length_tree(NCAR(x), reg);
1600 	if (NCDR(x) != NULL) {
1601 	  len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1602 	}
1603       } while (IS_NOT_NULL(x = NCDR(x)));
1604       pos = reg->used + len;  /* goal position */
1605 
1606       do {
1607 	len = compile_length_tree(NCAR(node), reg);
1608 	if (IS_NOT_NULL(NCDR(node))) {
1609 	  r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1610 	  if (r) break;
1611 	}
1612 	r = compile_tree(NCAR(node), reg);
1613 	if (r) break;
1614 	if (IS_NOT_NULL(NCDR(node))) {
1615 	  len = pos - (reg->used + SIZE_OP_JUMP);
1616 	  r = add_opcode_rel_addr(reg, OP_JUMP, len);
1617 	  if (r) break;
1618 	}
1619       } while (IS_NOT_NULL(node = NCDR(node)));
1620     }
1621     break;
1622 
1623   case NT_STR:
1624     if (NSTRING_IS_RAW(node))
1625       r = compile_string_raw_node(NSTR(node), reg);
1626     else
1627       r = compile_string_node(node, reg);
1628     break;
1629 
1630   case NT_CCLASS:
1631     r = compile_cclass_node(NCCLASS(node), reg);
1632     break;
1633 
1634   case NT_CTYPE:
1635     {
1636       int op;
1637 
1638       switch (NCTYPE(node)->ctype) {
1639       case ONIGENC_CTYPE_WORD:
1640 	if (NCTYPE(node)->not != 0)  op = OP_NOT_WORD;
1641 	else                         op = OP_WORD;
1642 	break;
1643       default:
1644 	return ONIGERR_TYPE_BUG;
1645 	break;
1646       }
1647       r = add_opcode(reg, op);
1648     }
1649     break;
1650 
1651   case NT_CANY:
1652     if (IS_MULTILINE(reg->options))
1653       r = add_opcode(reg, OP_ANYCHAR_ML);
1654     else
1655       r = add_opcode(reg, OP_ANYCHAR);
1656     break;
1657 
1658   case NT_BREF:
1659     {
1660       BRefNode* br = NBREF(node);
1661 
1662 #ifdef USE_BACKREF_WITH_LEVEL
1663       if (IS_BACKREF_NEST_LEVEL(br)) {
1664 	r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
1665 	if (r) return r;
1666 	r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1667 	if (r) return r;
1668 	r = add_length(reg, br->nest_level);
1669 	if (r) return r;
1670 
1671 	goto add_bacref_mems;
1672       }
1673       else
1674 #endif
1675       if (br->back_num == 1) {
1676 	n = br->back_static[0];
1677 	if (IS_IGNORECASE(reg->options)) {
1678 	  r = add_opcode(reg, OP_BACKREFN_IC);
1679 	  if (r) return r;
1680 	  r = add_mem_num(reg, n);
1681 	}
1682 	else {
1683 	  switch (n) {
1684 	  case 1:  r = add_opcode(reg, OP_BACKREF1); break;
1685 	  case 2:  r = add_opcode(reg, OP_BACKREF2); break;
1686 	  default:
1687 	    r = add_opcode(reg, OP_BACKREFN);
1688 	    if (r) return r;
1689 	    r = add_mem_num(reg, n);
1690 	    break;
1691 	  }
1692 	}
1693       }
1694       else {
1695 	int i;
1696 	int* p;
1697 
1698         if (IS_IGNORECASE(reg->options)) {
1699           r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1700         }
1701         else {
1702           r = add_opcode(reg, OP_BACKREF_MULTI);
1703         }
1704 	if (r) return r;
1705 
1706 #ifdef USE_BACKREF_WITH_LEVEL
1707       add_bacref_mems:
1708 #endif
1709 	r = add_length(reg, br->back_num);
1710 	if (r) return r;
1711 	p = BACKREFS_P(br);
1712 	for (i = br->back_num - 1; i >= 0; i--) {
1713 	  r = add_mem_num(reg, p[i]);
1714 	  if (r) return r;
1715 	}
1716       }
1717     }
1718     break;
1719 
1720 #ifdef USE_SUBEXP_CALL
1721   case NT_CALL:
1722     r = compile_call(NCALL(node), reg);
1723     break;
1724 #endif
1725 
1726   case NT_QTFR:
1727     r = compile_quantifier_node(NQTFR(node), reg);
1728     break;
1729 
1730   case NT_ENCLOSE:
1731     r = compile_enclose_node(NENCLOSE(node), reg);
1732     break;
1733 
1734   case NT_ANCHOR:
1735     r = compile_anchor_node(NANCHOR(node), reg);
1736     break;
1737 
1738   default:
1739 #ifdef ONIG_DEBUG
1740     fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1741 #endif
1742     break;
1743   }
1744 
1745   return r;
1746 }
1747 
1748 #ifdef USE_NAMED_GROUP
1749 
1750 static int
noname_disable_map(Node ** plink,GroupNumRemap * map,int * counter)1751 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1752 {
1753   int r = 0;
1754   Node* node = *plink;
1755 
1756   switch (NTYPE(node)) {
1757   case NT_LIST:
1758   case NT_ALT:
1759     do {
1760       r = noname_disable_map(&(NCAR(node)), map, counter);
1761     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1762     break;
1763 
1764   case NT_QTFR:
1765     {
1766       Node** ptarget = &(NQTFR(node)->target);
1767       Node*  old = *ptarget;
1768       r = noname_disable_map(ptarget, map, counter);
1769       if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
1770 	onig_reduce_nested_quantifier(node, *ptarget);
1771       }
1772     }
1773     break;
1774 
1775   case NT_ENCLOSE:
1776     {
1777       EncloseNode* en = NENCLOSE(node);
1778       if (en->type == ENCLOSE_MEMORY) {
1779 	if (IS_ENCLOSE_NAMED_GROUP(en)) {
1780 	  (*counter)++;
1781 	  map[en->regnum].new_val = *counter;
1782 	  en->regnum = *counter;
1783 	  r = noname_disable_map(&(en->target), map, counter);
1784 	}
1785 	else {
1786 	  *plink = en->target;
1787 	  en->target = NULL_NODE;
1788 	  onig_node_free(node);
1789 	  r = noname_disable_map(plink, map, counter);
1790 	}
1791       }
1792       else
1793 	r = noname_disable_map(&(en->target), map, counter);
1794     }
1795     break;
1796 
1797   default:
1798     break;
1799   }
1800 
1801   return r;
1802 }
1803 
1804 static int
renumber_node_backref(Node * node,GroupNumRemap * map)1805 renumber_node_backref(Node* node, GroupNumRemap* map)
1806 {
1807   int i, pos, n, old_num;
1808   int *backs;
1809   BRefNode* bn = NBREF(node);
1810 
1811   if (! IS_BACKREF_NAME_REF(bn))
1812     return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1813 
1814   old_num = bn->back_num;
1815   if (IS_NULL(bn->back_dynamic))
1816     backs = bn->back_static;
1817   else
1818     backs = bn->back_dynamic;
1819 
1820   for (i = 0, pos = 0; i < old_num; i++) {
1821     n = map[backs[i]].new_val;
1822     if (n > 0) {
1823       backs[pos] = n;
1824       pos++;
1825     }
1826   }
1827 
1828   bn->back_num = pos;
1829   return 0;
1830 }
1831 
1832 static int
renumber_by_map(Node * node,GroupNumRemap * map)1833 renumber_by_map(Node* node, GroupNumRemap* map)
1834 {
1835   int r = 0;
1836 
1837   switch (NTYPE(node)) {
1838   case NT_LIST:
1839   case NT_ALT:
1840     do {
1841       r = renumber_by_map(NCAR(node), map);
1842     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1843     break;
1844   case NT_QTFR:
1845     r = renumber_by_map(NQTFR(node)->target, map);
1846     break;
1847   case NT_ENCLOSE:
1848     r = renumber_by_map(NENCLOSE(node)->target, map);
1849     break;
1850 
1851   case NT_BREF:
1852     r = renumber_node_backref(node, map);
1853     break;
1854 
1855   default:
1856     break;
1857   }
1858 
1859   return r;
1860 }
1861 
1862 static int
numbered_ref_check(Node * node)1863 numbered_ref_check(Node* node)
1864 {
1865   int r = 0;
1866 
1867   switch (NTYPE(node)) {
1868   case NT_LIST:
1869   case NT_ALT:
1870     do {
1871       r = numbered_ref_check(NCAR(node));
1872     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1873     break;
1874   case NT_QTFR:
1875     r = numbered_ref_check(NQTFR(node)->target);
1876     break;
1877   case NT_ENCLOSE:
1878     r = numbered_ref_check(NENCLOSE(node)->target);
1879     break;
1880 
1881   case NT_BREF:
1882     if (! IS_BACKREF_NAME_REF(NBREF(node)))
1883       return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1884     break;
1885 
1886   default:
1887     break;
1888   }
1889 
1890   return r;
1891 }
1892 
1893 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ScanEnv * env)1894 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
1895 {
1896   int r, i, pos, counter;
1897   BitStatusType loc;
1898   GroupNumRemap* map;
1899 
1900   map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
1901   CHECK_NULL_RETURN_MEMERR(map);
1902   for (i = 1; i <= env->num_mem; i++) {
1903     map[i].new_val = 0;
1904   }
1905   counter = 0;
1906   r = noname_disable_map(root, map, &counter);
1907   if (r != 0) return r;
1908 
1909   r = renumber_by_map(*root, map);
1910   if (r != 0) return r;
1911 
1912   for (i = 1, pos = 1; i <= env->num_mem; i++) {
1913     if (map[i].new_val > 0) {
1914       SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
1915       pos++;
1916     }
1917   }
1918 
1919   loc = env->capture_history;
1920   BIT_STATUS_CLEAR(env->capture_history);
1921   for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
1922     if (BIT_STATUS_AT(loc, i)) {
1923       BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
1924     }
1925   }
1926 
1927   env->num_mem = env->num_named;
1928   reg->num_mem = env->num_named;
1929 
1930   return onig_renumber_name_table(reg, map);
1931 }
1932 #endif /* USE_NAMED_GROUP */
1933 
1934 #ifdef USE_SUBEXP_CALL
1935 static int
unset_addr_list_fix(UnsetAddrList * uslist,regex_t * reg)1936 unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
1937 {
1938   int i, offset;
1939   EncloseNode* en;
1940   AbsAddrType addr;
1941 
1942   for (i = 0; i < uslist->num; i++) {
1943     en = NENCLOSE(uslist->us[i].target);
1944     if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
1945     addr = en->call_addr;
1946     offset = uslist->us[i].offset;
1947 
1948     BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
1949   }
1950   return 0;
1951 }
1952 #endif
1953 
1954 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
1955 static int
quantifiers_memory_node_info(Node * node)1956 quantifiers_memory_node_info(Node* node)
1957 {
1958   int r = 0;
1959 
1960   switch (NTYPE(node)) {
1961   case NT_LIST:
1962   case NT_ALT:
1963     {
1964       int v;
1965       do {
1966 	v = quantifiers_memory_node_info(NCAR(node));
1967 	if (v > r) r = v;
1968       } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
1969     }
1970     break;
1971 
1972 #ifdef USE_SUBEXP_CALL
1973   case NT_CALL:
1974     if (IS_CALL_RECURSION(NCALL(node))) {
1975       return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
1976     }
1977     else
1978       r = quantifiers_memory_node_info(NCALL(node)->target);
1979     break;
1980 #endif
1981 
1982   case NT_QTFR:
1983     {
1984       QtfrNode* qn = NQTFR(node);
1985       if (qn->upper != 0) {
1986 	r = quantifiers_memory_node_info(qn->target);
1987       }
1988     }
1989     break;
1990 
1991   case NT_ENCLOSE:
1992     {
1993       EncloseNode* en = NENCLOSE(node);
1994       switch (en->type) {
1995       case ENCLOSE_MEMORY:
1996 	return NQ_TARGET_IS_EMPTY_MEM;
1997 	break;
1998 
1999       case ENCLOSE_OPTION:
2000       case ENCLOSE_STOP_BACKTRACK:
2001 	r = quantifiers_memory_node_info(en->target);
2002 	break;
2003       default:
2004 	break;
2005       }
2006     }
2007     break;
2008 
2009   case NT_BREF:
2010   case NT_STR:
2011   case NT_CTYPE:
2012   case NT_CCLASS:
2013   case NT_CANY:
2014   case NT_ANCHOR:
2015   default:
2016     break;
2017   }
2018 
2019   return r;
2020 }
2021 #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
2022 
2023 static int
get_min_match_length(Node * node,OnigDistance * min,ScanEnv * env)2024 get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
2025 {
2026   OnigDistance tmin;
2027   int r = 0;
2028 
2029   *min = 0;
2030   switch (NTYPE(node)) {
2031   case NT_BREF:
2032     {
2033       int i;
2034       int* backs;
2035       Node** nodes = SCANENV_MEM_NODES(env);
2036       BRefNode* br = NBREF(node);
2037       if (br->state & NST_RECURSION) break;
2038 
2039       backs = BACKREFS_P(br);
2040       if (backs[0] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
2041       r = get_min_match_length(nodes[backs[0]], min, env);
2042       if (r != 0) break;
2043       for (i = 1; i < br->back_num; i++) {
2044 	if (backs[i] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
2045 	r = get_min_match_length(nodes[backs[i]], &tmin, env);
2046 	if (r != 0) break;
2047 	if (*min > tmin) *min = tmin;
2048       }
2049     }
2050     break;
2051 
2052 #ifdef USE_SUBEXP_CALL
2053   case NT_CALL:
2054     if (IS_CALL_RECURSION(NCALL(node))) {
2055       EncloseNode* en = NENCLOSE(NCALL(node)->target);
2056       if (IS_ENCLOSE_MIN_FIXED(en))
2057 	*min = en->min_len;
2058     }
2059     else
2060       r = get_min_match_length(NCALL(node)->target, min, env);
2061     break;
2062 #endif
2063 
2064   case NT_LIST:
2065     do {
2066       r = get_min_match_length(NCAR(node), &tmin, env);
2067       if (r == 0) *min += tmin;
2068     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2069     break;
2070 
2071   case NT_ALT:
2072     {
2073       Node *x, *y;
2074       y = node;
2075       do {
2076 	x = NCAR(y);
2077 	r = get_min_match_length(x, &tmin, env);
2078 	if (r != 0) break;
2079 	if (y == node) *min = tmin;
2080 	else if (*min > tmin) *min = tmin;
2081       } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
2082     }
2083     break;
2084 
2085   case NT_STR:
2086     {
2087       StrNode* sn = NSTR(node);
2088       *min = sn->end - sn->s;
2089     }
2090     break;
2091 
2092   case NT_CTYPE:
2093     *min = 1;
2094     break;
2095 
2096   case NT_CCLASS:
2097   case NT_CANY:
2098     *min = 1;
2099     break;
2100 
2101   case NT_QTFR:
2102     {
2103       QtfrNode* qn = NQTFR(node);
2104 
2105       if (qn->lower > 0) {
2106 	r = get_min_match_length(qn->target, min, env);
2107 	if (r == 0)
2108 	  *min = distance_multiply(*min, qn->lower);
2109       }
2110     }
2111     break;
2112 
2113   case NT_ENCLOSE:
2114     {
2115       EncloseNode* en = NENCLOSE(node);
2116       switch (en->type) {
2117       case ENCLOSE_MEMORY:
2118 #ifdef USE_SUBEXP_CALL
2119 	if (IS_ENCLOSE_MIN_FIXED(en))
2120 	  *min = en->min_len;
2121 	else {
2122 	  r = get_min_match_length(en->target, min, env);
2123 	  if (r == 0) {
2124 	    en->min_len = *min;
2125 	    SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
2126 	  }
2127 	}
2128 	break;
2129 #endif
2130       case ENCLOSE_OPTION:
2131       case ENCLOSE_STOP_BACKTRACK:
2132 	r = get_min_match_length(en->target, min, env);
2133 	break;
2134       }
2135     }
2136     break;
2137 
2138   case NT_ANCHOR:
2139   default:
2140     break;
2141   }
2142 
2143   return r;
2144 }
2145 
2146 static int
get_max_match_length(Node * node,OnigDistance * max,ScanEnv * env)2147 get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
2148 {
2149   OnigDistance tmax;
2150   int r = 0;
2151 
2152   *max = 0;
2153   switch (NTYPE(node)) {
2154   case NT_LIST:
2155     do {
2156       r = get_max_match_length(NCAR(node), &tmax, env);
2157       if (r == 0)
2158 	*max = distance_add(*max, tmax);
2159     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2160     break;
2161 
2162   case NT_ALT:
2163     do {
2164       r = get_max_match_length(NCAR(node), &tmax, env);
2165       if (r == 0 && *max < tmax) *max = tmax;
2166     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2167     break;
2168 
2169   case NT_STR:
2170     {
2171       StrNode* sn = NSTR(node);
2172       *max = sn->end - sn->s;
2173     }
2174     break;
2175 
2176   case NT_CTYPE:
2177     *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2178     break;
2179 
2180   case NT_CCLASS:
2181   case NT_CANY:
2182     *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2183     break;
2184 
2185   case NT_BREF:
2186     {
2187       int i;
2188       int* backs;
2189       Node** nodes = SCANENV_MEM_NODES(env);
2190       BRefNode* br = NBREF(node);
2191       if (br->state & NST_RECURSION) {
2192 	*max = ONIG_INFINITE_DISTANCE;
2193 	break;
2194       }
2195       backs = BACKREFS_P(br);
2196       for (i = 0; i < br->back_num; i++) {
2197 	if (backs[i] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
2198 	r = get_max_match_length(nodes[backs[i]], &tmax, env);
2199 	if (r != 0) break;
2200 	if (*max < tmax) *max = tmax;
2201       }
2202     }
2203     break;
2204 
2205 #ifdef USE_SUBEXP_CALL
2206   case NT_CALL:
2207     if (! IS_CALL_RECURSION(NCALL(node)))
2208       r = get_max_match_length(NCALL(node)->target, max, env);
2209     else
2210       *max = ONIG_INFINITE_DISTANCE;
2211     break;
2212 #endif
2213 
2214   case NT_QTFR:
2215     {
2216       QtfrNode* qn = NQTFR(node);
2217 
2218       if (qn->upper != 0) {
2219 	r = get_max_match_length(qn->target, max, env);
2220 	if (r == 0 && *max != 0) {
2221 	  if (! IS_REPEAT_INFINITE(qn->upper))
2222 	    *max = distance_multiply(*max, qn->upper);
2223 	  else
2224 	    *max = ONIG_INFINITE_DISTANCE;
2225 	}
2226       }
2227     }
2228     break;
2229 
2230   case NT_ENCLOSE:
2231     {
2232       EncloseNode* en = NENCLOSE(node);
2233       switch (en->type) {
2234       case ENCLOSE_MEMORY:
2235 #ifdef USE_SUBEXP_CALL
2236 	if (IS_ENCLOSE_MAX_FIXED(en))
2237 	  *max = en->max_len;
2238 	else {
2239 	  r = get_max_match_length(en->target, max, env);
2240 	  if (r == 0) {
2241 	    en->max_len = *max;
2242 	    SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
2243 	  }
2244 	}
2245 	break;
2246 #endif
2247       case ENCLOSE_OPTION:
2248       case ENCLOSE_STOP_BACKTRACK:
2249 	r = get_max_match_length(en->target, max, env);
2250 	break;
2251       }
2252     }
2253     break;
2254 
2255   case NT_ANCHOR:
2256   default:
2257     break;
2258   }
2259 
2260   return r;
2261 }
2262 
2263 #define GET_CHAR_LEN_VARLEN           -1
2264 #define GET_CHAR_LEN_TOP_ALT_VARLEN   -2
2265 
2266 /* fixed size pattern node only */
2267 static int
get_char_length_tree1(Node * node,regex_t * reg,int * len,int level)2268 get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2269 {
2270   int tlen;
2271   int r = 0;
2272 
2273   level++;
2274   *len = 0;
2275   switch (NTYPE(node)) {
2276   case NT_LIST:
2277     do {
2278       r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2279       if (r == 0)
2280 	*len = distance_add(*len, tlen);
2281     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2282     break;
2283 
2284   case NT_ALT:
2285     {
2286       int tlen2;
2287       int varlen = 0;
2288 
2289       r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2290       while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
2291 	r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
2292 	if (r == 0) {
2293 	  if (tlen != tlen2)
2294 	    varlen = 1;
2295 	}
2296       }
2297       if (r == 0) {
2298 	if (varlen != 0) {
2299 	  if (level == 1)
2300 	    r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2301 	  else
2302 	    r = GET_CHAR_LEN_VARLEN;
2303 	}
2304 	else
2305 	  *len = tlen;
2306       }
2307     }
2308     break;
2309 
2310   case NT_STR:
2311     {
2312       StrNode* sn = NSTR(node);
2313       UChar *s = sn->s;
2314       while (s < sn->end) {
2315 	s += enclen(reg->enc, s);
2316 	(*len)++;
2317       }
2318     }
2319     break;
2320 
2321   case NT_QTFR:
2322     {
2323       QtfrNode* qn = NQTFR(node);
2324       if (qn->lower == qn->upper) {
2325 	r = get_char_length_tree1(qn->target, reg, &tlen, level);
2326 	if (r == 0)
2327 	  *len = distance_multiply(tlen, qn->lower);
2328       }
2329       else
2330 	r = GET_CHAR_LEN_VARLEN;
2331     }
2332     break;
2333 
2334 #ifdef USE_SUBEXP_CALL
2335   case NT_CALL:
2336     if (! IS_CALL_RECURSION(NCALL(node)))
2337       r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
2338     else
2339       r = GET_CHAR_LEN_VARLEN;
2340     break;
2341 #endif
2342 
2343   case NT_CTYPE:
2344     *len = 1;
2345     break;
2346 
2347   case NT_CCLASS:
2348   case NT_CANY:
2349     *len = 1;
2350     break;
2351 
2352   case NT_ENCLOSE:
2353     {
2354       EncloseNode* en = NENCLOSE(node);
2355       switch (en->type) {
2356       case ENCLOSE_MEMORY:
2357 #ifdef USE_SUBEXP_CALL
2358 	if (IS_ENCLOSE_CLEN_FIXED(en))
2359 	  *len = en->char_len;
2360 	else {
2361 	  r = get_char_length_tree1(en->target, reg, len, level);
2362 	  if (r == 0) {
2363 	    en->char_len = *len;
2364 	    SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
2365 	  }
2366 	}
2367 	break;
2368 #endif
2369       case ENCLOSE_OPTION:
2370       case ENCLOSE_STOP_BACKTRACK:
2371 	r = get_char_length_tree1(en->target, reg, len, level);
2372 	break;
2373       default:
2374 	break;
2375       }
2376     }
2377     break;
2378 
2379   case NT_ANCHOR:
2380     break;
2381 
2382   default:
2383     r = GET_CHAR_LEN_VARLEN;
2384     break;
2385   }
2386 
2387   return r;
2388 }
2389 
2390 static int
get_char_length_tree(Node * node,regex_t * reg,int * len)2391 get_char_length_tree(Node* node, regex_t* reg, int* len)
2392 {
2393   return get_char_length_tree1(node, reg, len, 0);
2394 }
2395 
2396 /* x is not included y ==>  1 : 0 */
2397 static int
is_not_included(Node * x,Node * y,regex_t * reg)2398 is_not_included(Node* x, Node* y, regex_t* reg)
2399 {
2400   int i, len;
2401   OnigCodePoint code;
2402   UChar *p;
2403   int ytype;
2404 
2405  retry:
2406   ytype = NTYPE(y);
2407   switch (NTYPE(x)) {
2408   case NT_CTYPE:
2409     {
2410       switch (ytype) {
2411       case NT_CTYPE:
2412 	if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
2413 	    NCTYPE(y)->not   != NCTYPE(x)->not)
2414 	  return 1;
2415 	else
2416 	  return 0;
2417 	break;
2418 
2419       case NT_CCLASS:
2420       swap:
2421 	{
2422 	  Node* tmp;
2423 	  tmp = x; x = y; y = tmp;
2424 	  goto retry;
2425 	}
2426 	break;
2427 
2428       case NT_STR:
2429 	goto swap;
2430 	break;
2431 
2432       default:
2433 	break;
2434       }
2435     }
2436     break;
2437 
2438   case NT_CCLASS:
2439     {
2440       CClassNode* xc = NCCLASS(x);
2441       switch (ytype) {
2442       case NT_CTYPE:
2443 	switch (NCTYPE(y)->ctype) {
2444 	case ONIGENC_CTYPE_WORD:
2445 	  if (NCTYPE(y)->not == 0) {
2446 	    if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2447 	      for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2448 		if (BITSET_AT(xc->bs, i)) {
2449 		  if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
2450 		}
2451 	      }
2452 	      return 1;
2453 	    }
2454 	    return 0;
2455 	  }
2456 	  else {
2457 	    for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2458 	      if (! IS_CODE_SB_WORD(reg->enc, i)) {
2459 		if (!IS_NCCLASS_NOT(xc)) {
2460 		  if (BITSET_AT(xc->bs, i))
2461 		    return 0;
2462 		}
2463 		else {
2464 		  if (! BITSET_AT(xc->bs, i))
2465 		    return 0;
2466 		}
2467 	      }
2468 	    }
2469 	    return 1;
2470 	  }
2471 	  break;
2472 
2473 	default:
2474 	  break;
2475 	}
2476 	break;
2477 
2478       case NT_CCLASS:
2479 	{
2480 	  int v;
2481 	  CClassNode* yc = NCCLASS(y);
2482 
2483 	  for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2484 	    v = BITSET_AT(xc->bs, i);
2485 	    if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
2486                 (v == 0 && IS_NCCLASS_NOT(xc))) {
2487 	      v = BITSET_AT(yc->bs, i);
2488 	      if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2489                   (v == 0 && IS_NCCLASS_NOT(yc)))
2490 		return 0;
2491 	    }
2492 	  }
2493 	  if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2494 	      (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2495 	    return 1;
2496 	  return 0;
2497 	}
2498 	break;
2499 
2500       case NT_STR:
2501 	goto swap;
2502 	break;
2503 
2504       default:
2505 	break;
2506       }
2507     }
2508     break;
2509 
2510   case NT_STR:
2511     {
2512       StrNode* xs = NSTR(x);
2513       if (NSTRING_LEN(x) == 0)
2514 	break;
2515 
2516       //c = *(xs->s);
2517       switch (ytype) {
2518       case NT_CTYPE:
2519         switch (NCTYPE(y)->ctype) {
2520         case ONIGENC_CTYPE_WORD:
2521           if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2522             return NCTYPE(y)->not;
2523           else
2524             return !(NCTYPE(y)->not);
2525           break;
2526         default:
2527           break;
2528         }
2529         break;
2530 
2531       case NT_CCLASS:
2532         {
2533           CClassNode* cc = NCCLASS(y);
2534 
2535           code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2536                                      xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2537           return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2538         }
2539         break;
2540 
2541       case NT_STR:
2542         {
2543           UChar *q;
2544           StrNode* ys = NSTR(y);
2545           len = NSTRING_LEN(x);
2546           if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2547           if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2548             /* tiny version */
2549             return 0;
2550           }
2551           else {
2552             for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2553               if (*p != *q) return 1;
2554             }
2555           }
2556         }
2557         break;
2558 
2559       default:
2560         break;
2561       }
2562     }
2563     break;
2564 
2565   default:
2566     break;
2567   }
2568 
2569   return 0;
2570 }
2571 
2572 static Node*
get_head_value_node(Node * node,int exact,regex_t * reg)2573 get_head_value_node(Node* node, int exact, regex_t* reg)
2574 {
2575   Node* n = NULL_NODE;
2576 
2577   switch (NTYPE(node)) {
2578   case NT_BREF:
2579   case NT_ALT:
2580   case NT_CANY:
2581 #ifdef USE_SUBEXP_CALL
2582   case NT_CALL:
2583 #endif
2584     break;
2585 
2586   case NT_CTYPE:
2587   case NT_CCLASS:
2588     if (exact == 0) {
2589       n = node;
2590     }
2591     break;
2592 
2593   case NT_LIST:
2594     n = get_head_value_node(NCAR(node), exact, reg);
2595     break;
2596 
2597   case NT_STR:
2598     {
2599       StrNode* sn = NSTR(node);
2600 
2601       if (sn->end <= sn->s)
2602 	break;
2603 
2604       if (exact != 0 &&
2605 	  !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
2606       }
2607       else {
2608 	n = node;
2609       }
2610     }
2611     break;
2612 
2613   case NT_QTFR:
2614     {
2615       QtfrNode* qn = NQTFR(node);
2616       if (qn->lower > 0) {
2617 	if (IS_NOT_NULL(qn->head_exact))
2618 	  n = qn->head_exact;
2619 	else
2620 	  n = get_head_value_node(qn->target, exact, reg);
2621       }
2622     }
2623     break;
2624 
2625   case NT_ENCLOSE:
2626     {
2627       EncloseNode* en = NENCLOSE(node);
2628       switch (en->type) {
2629       case ENCLOSE_OPTION:
2630 	{
2631 	  OnigOptionType options = reg->options;
2632 
2633 	  reg->options = NENCLOSE(node)->option;
2634 	  n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
2635 	  reg->options = options;
2636 	}
2637 	break;
2638 
2639       case ENCLOSE_MEMORY:
2640       case ENCLOSE_STOP_BACKTRACK:
2641 	n = get_head_value_node(en->target, exact, reg);
2642 	break;
2643       }
2644     }
2645     break;
2646 
2647   case NT_ANCHOR:
2648     if (NANCHOR(node)->type == ANCHOR_PREC_READ)
2649       n = get_head_value_node(NANCHOR(node)->target, exact, reg);
2650     break;
2651 
2652   default:
2653     break;
2654   }
2655 
2656   return n;
2657 }
2658 
2659 static int
check_type_tree(Node * node,int type_mask,int enclose_mask,int anchor_mask)2660 check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
2661 {
2662   int type, r = 0;
2663 
2664   type = NTYPE(node);
2665   if ((NTYPE2BIT(type) & type_mask) == 0)
2666     return 1;
2667 
2668   switch (type) {
2669   case NT_LIST:
2670   case NT_ALT:
2671     do {
2672       r = check_type_tree(NCAR(node), type_mask, enclose_mask,
2673 			  anchor_mask);
2674     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2675     break;
2676 
2677   case NT_QTFR:
2678     r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
2679 			anchor_mask);
2680     break;
2681 
2682   case NT_ENCLOSE:
2683     {
2684       EncloseNode* en = NENCLOSE(node);
2685       if ((en->type & enclose_mask) == 0)
2686 	return 1;
2687 
2688       r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
2689     }
2690     break;
2691 
2692   case NT_ANCHOR:
2693     type = NANCHOR(node)->type;
2694     if ((type & anchor_mask) == 0)
2695       return 1;
2696 
2697     if (NANCHOR(node)->target)
2698       r = check_type_tree(NANCHOR(node)->target,
2699 			  type_mask, enclose_mask, anchor_mask);
2700     break;
2701 
2702   default:
2703     break;
2704   }
2705   return r;
2706 }
2707 
2708 #ifdef USE_SUBEXP_CALL
2709 
2710 #define RECURSION_EXIST       1
2711 #define RECURSION_INFINITE    2
2712 
2713 static int
subexp_inf_recursive_check(Node * node,ScanEnv * env,int head)2714 subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2715 {
2716   int type;
2717   int r = 0;
2718 
2719   type = NTYPE(node);
2720   switch (type) {
2721   case NT_LIST:
2722     {
2723       Node *x;
2724       OnigDistance min;
2725       int ret;
2726 
2727       x = node;
2728       do {
2729 	ret = subexp_inf_recursive_check(NCAR(x), env, head);
2730 	if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2731 	r |= ret;
2732 	if (head) {
2733 	  ret = get_min_match_length(NCAR(x), &min, env);
2734 	  if (ret != 0) return ret;
2735 	  if (min != 0) head = 0;
2736 	}
2737       } while (IS_NOT_NULL(x = NCDR(x)));
2738     }
2739     break;
2740 
2741   case NT_ALT:
2742     {
2743       int ret;
2744       r = RECURSION_EXIST;
2745       do {
2746 	ret = subexp_inf_recursive_check(NCAR(node), env, head);
2747 	if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2748 	r &= ret;
2749       } while (IS_NOT_NULL(node = NCDR(node)));
2750     }
2751     break;
2752 
2753   case NT_QTFR:
2754     r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
2755     if (r == RECURSION_EXIST) {
2756       if (NQTFR(node)->lower == 0) r = 0;
2757     }
2758     break;
2759 
2760   case NT_ANCHOR:
2761     {
2762       AnchorNode* an = NANCHOR(node);
2763       switch (an->type) {
2764       case ANCHOR_PREC_READ:
2765       case ANCHOR_PREC_READ_NOT:
2766       case ANCHOR_LOOK_BEHIND:
2767       case ANCHOR_LOOK_BEHIND_NOT:
2768 	r = subexp_inf_recursive_check(an->target, env, head);
2769 	break;
2770       }
2771     }
2772     break;
2773 
2774   case NT_CALL:
2775     r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
2776     break;
2777 
2778   case NT_ENCLOSE:
2779     if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2780       return 0;
2781     else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2782       return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2783     else {
2784       SET_ENCLOSE_STATUS(node, NST_MARK2);
2785       r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
2786       CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2787     }
2788     break;
2789 
2790   default:
2791     break;
2792   }
2793 
2794   return r;
2795 }
2796 
2797 static int
subexp_inf_recursive_check_trav(Node * node,ScanEnv * env)2798 subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
2799 {
2800   int type;
2801   int r = 0;
2802 
2803   type = NTYPE(node);
2804   switch (type) {
2805   case NT_LIST:
2806   case NT_ALT:
2807     do {
2808       r = subexp_inf_recursive_check_trav(NCAR(node), env);
2809     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2810     break;
2811 
2812   case NT_QTFR:
2813     r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
2814     break;
2815 
2816   case NT_ANCHOR:
2817     {
2818       AnchorNode* an = NANCHOR(node);
2819       switch (an->type) {
2820       case ANCHOR_PREC_READ:
2821       case ANCHOR_PREC_READ_NOT:
2822       case ANCHOR_LOOK_BEHIND:
2823       case ANCHOR_LOOK_BEHIND_NOT:
2824 	r = subexp_inf_recursive_check_trav(an->target, env);
2825 	break;
2826       }
2827     }
2828     break;
2829 
2830   case NT_ENCLOSE:
2831     {
2832       EncloseNode* en = NENCLOSE(node);
2833 
2834       if (IS_ENCLOSE_RECURSION(en)) {
2835 	SET_ENCLOSE_STATUS(node, NST_MARK1);
2836 	r = subexp_inf_recursive_check(en->target, env, 1);
2837 	if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
2838 	CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2839       }
2840       r = subexp_inf_recursive_check_trav(en->target, env);
2841     }
2842 
2843     break;
2844 
2845   default:
2846     break;
2847   }
2848 
2849   return r;
2850 }
2851 
2852 static int
subexp_recursive_check(Node * node)2853 subexp_recursive_check(Node* node)
2854 {
2855   int r = 0;
2856 
2857   switch (NTYPE(node)) {
2858   case NT_LIST:
2859   case NT_ALT:
2860     do {
2861       r |= subexp_recursive_check(NCAR(node));
2862     } while (IS_NOT_NULL(node = NCDR(node)));
2863     break;
2864 
2865   case NT_QTFR:
2866     r = subexp_recursive_check(NQTFR(node)->target);
2867     break;
2868 
2869   case NT_ANCHOR:
2870     {
2871       AnchorNode* an = NANCHOR(node);
2872       switch (an->type) {
2873       case ANCHOR_PREC_READ:
2874       case ANCHOR_PREC_READ_NOT:
2875       case ANCHOR_LOOK_BEHIND:
2876       case ANCHOR_LOOK_BEHIND_NOT:
2877 	r = subexp_recursive_check(an->target);
2878 	break;
2879       }
2880     }
2881     break;
2882 
2883   case NT_CALL:
2884     r = subexp_recursive_check(NCALL(node)->target);
2885     if (r != 0) SET_CALL_RECURSION(node);
2886     break;
2887 
2888   case NT_ENCLOSE:
2889     if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2890       return 0;
2891     else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2892       return 1; /* recursion */
2893     else {
2894       SET_ENCLOSE_STATUS(node, NST_MARK2);
2895       r = subexp_recursive_check(NENCLOSE(node)->target);
2896       CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2897     }
2898     break;
2899 
2900   default:
2901     break;
2902   }
2903 
2904   return r;
2905 }
2906 
2907 
2908 static int
subexp_recursive_check_trav(Node * node,ScanEnv * env)2909 subexp_recursive_check_trav(Node* node, ScanEnv* env)
2910 {
2911 #define FOUND_CALLED_NODE    1
2912 
2913   int type;
2914   int r = 0;
2915 
2916   type = NTYPE(node);
2917   switch (type) {
2918   case NT_LIST:
2919   case NT_ALT:
2920     {
2921       int ret;
2922       do {
2923 	ret = subexp_recursive_check_trav(NCAR(node), env);
2924 	if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
2925 	else if (ret < 0) return ret;
2926       } while (IS_NOT_NULL(node = NCDR(node)));
2927     }
2928     break;
2929 
2930   case NT_QTFR:
2931     r = subexp_recursive_check_trav(NQTFR(node)->target, env);
2932     if (NQTFR(node)->upper == 0) {
2933       if (r == FOUND_CALLED_NODE)
2934 	NQTFR(node)->is_refered = 1;
2935     }
2936     break;
2937 
2938   case NT_ANCHOR:
2939     {
2940       AnchorNode* an = NANCHOR(node);
2941       switch (an->type) {
2942       case ANCHOR_PREC_READ:
2943       case ANCHOR_PREC_READ_NOT:
2944       case ANCHOR_LOOK_BEHIND:
2945       case ANCHOR_LOOK_BEHIND_NOT:
2946 	r = subexp_recursive_check_trav(an->target, env);
2947 	break;
2948       }
2949     }
2950     break;
2951 
2952   case NT_ENCLOSE:
2953     {
2954       EncloseNode* en = NENCLOSE(node);
2955 
2956       if (! IS_ENCLOSE_RECURSION(en)) {
2957 	if (IS_ENCLOSE_CALLED(en)) {
2958 	  SET_ENCLOSE_STATUS(node, NST_MARK1);
2959 	  r = subexp_recursive_check(en->target);
2960 	  if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
2961 	  CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2962 	}
2963       }
2964       r = subexp_recursive_check_trav(en->target, env);
2965       if (IS_ENCLOSE_CALLED(en))
2966 	r |= FOUND_CALLED_NODE;
2967     }
2968     break;
2969 
2970   default:
2971     break;
2972   }
2973 
2974   return r;
2975 }
2976 
2977 static int
setup_subexp_call(Node * node,ScanEnv * env)2978 setup_subexp_call(Node* node, ScanEnv* env)
2979 {
2980   int type;
2981   int r = 0;
2982 
2983   type = NTYPE(node);
2984   switch (type) {
2985   case NT_LIST:
2986     do {
2987       r = setup_subexp_call(NCAR(node), env);
2988     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2989     break;
2990 
2991   case NT_ALT:
2992     do {
2993       r = setup_subexp_call(NCAR(node), env);
2994     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2995     break;
2996 
2997   case NT_QTFR:
2998     r = setup_subexp_call(NQTFR(node)->target, env);
2999     break;
3000   case NT_ENCLOSE:
3001     r = setup_subexp_call(NENCLOSE(node)->target, env);
3002     break;
3003 
3004   case NT_CALL:
3005     {
3006       CallNode* cn = NCALL(node);
3007       Node** nodes = SCANENV_MEM_NODES(env);
3008 
3009       if (cn->group_num != 0) {
3010 	int gnum = cn->group_num;
3011 
3012 #ifdef USE_NAMED_GROUP
3013 	if (env->num_named > 0 &&
3014 	    IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3015 	    !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3016 	  return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3017 	}
3018 #endif
3019 	if (gnum > env->num_mem) {
3020 	  onig_scan_env_set_error_string(env,
3021 		 ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3022 	  return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3023 	}
3024 
3025 #ifdef USE_NAMED_GROUP
3026       set_call_attr:
3027 #endif
3028 	cn->target = nodes[cn->group_num];
3029 	if (IS_NULL(cn->target)) {
3030 	  onig_scan_env_set_error_string(env,
3031 		 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3032 	  return ONIGERR_UNDEFINED_NAME_REFERENCE;
3033 	}
3034 	SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
3035 	BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
3036 	cn->unset_addr_list = env->unset_addr_list;
3037       }
3038 #ifdef USE_NAMED_GROUP
3039       else {
3040 	int *refs;
3041 
3042 	int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
3043 					   &refs);
3044 	if (n <= 0) {
3045 	  onig_scan_env_set_error_string(env,
3046 		 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3047 	  return ONIGERR_UNDEFINED_NAME_REFERENCE;
3048 	}
3049 	else if (n > 1) {
3050 	  onig_scan_env_set_error_string(env,
3051 	    ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3052 	  return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3053 	}
3054 	else {
3055 	  cn->group_num = refs[0];
3056 	  goto set_call_attr;
3057 	}
3058       }
3059 #endif
3060     }
3061     break;
3062 
3063   case NT_ANCHOR:
3064     {
3065       AnchorNode* an = NANCHOR(node);
3066 
3067       switch (an->type) {
3068       case ANCHOR_PREC_READ:
3069       case ANCHOR_PREC_READ_NOT:
3070       case ANCHOR_LOOK_BEHIND:
3071       case ANCHOR_LOOK_BEHIND_NOT:
3072 	r = setup_subexp_call(an->target, env);
3073 	break;
3074       }
3075     }
3076     break;
3077 
3078   default:
3079     break;
3080   }
3081 
3082   return r;
3083 }
3084 #endif
3085 
3086 /* divide different length alternatives in look-behind.
3087   (?<=A|B) ==> (?<=A)|(?<=B)
3088   (?<!A|B) ==> (?<!A)(?<!B)
3089 */
3090 static int
divide_look_behind_alternatives(Node * node)3091 divide_look_behind_alternatives(Node* node)
3092 {
3093   Node *head, *np, *insert_node;
3094   AnchorNode* an = NANCHOR(node);
3095   int anc_type = an->type;
3096 
3097   head = an->target;
3098   np = NCAR(head);
3099   swap_node(node, head);
3100   NCAR(node) = head;
3101   NANCHOR(head)->target = np;
3102 
3103   np = node;
3104   while ((np = NCDR(np)) != NULL_NODE) {
3105     insert_node = onig_node_new_anchor(anc_type);
3106     CHECK_NULL_RETURN_MEMERR(insert_node);
3107     NANCHOR(insert_node)->target = NCAR(np);
3108     NCAR(np) = insert_node;
3109   }
3110 
3111   if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3112     np = node;
3113     do {
3114       SET_NTYPE(np, NT_LIST);  /* alt -> list */
3115     } while ((np = NCDR(np)) != NULL_NODE);
3116   }
3117   return 0;
3118 }
3119 
3120 static int
setup_look_behind(Node * node,regex_t * reg,ScanEnv * env)3121 setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3122 {
3123   int r, len;
3124   AnchorNode* an = NANCHOR(node);
3125 
3126   r = get_char_length_tree(an->target, reg, &len);
3127   if (r == 0)
3128     an->char_len = len;
3129   else if (r == GET_CHAR_LEN_VARLEN)
3130     r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3131   else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3132     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3133       r = divide_look_behind_alternatives(node);
3134     else
3135       r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3136   }
3137 
3138   return r;
3139 }
3140 
3141 static int
next_setup(Node * node,Node * next_node,regex_t * reg)3142 next_setup(Node* node, Node* next_node, regex_t* reg)
3143 {
3144   int type;
3145 
3146  retry:
3147   type = NTYPE(node);
3148   if (type == NT_QTFR) {
3149     QtfrNode* qn = NQTFR(node);
3150     if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3151 #ifdef USE_QTFR_PEEK_NEXT
3152       Node* n = get_head_value_node(next_node, 1, reg);
3153       /* '\0': for UTF-16BE etc... */
3154       if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
3155 	qn->next_head_exact = n;
3156       }
3157 #endif
3158       /* automatic posseivation a*b ==> (?>a*)b */
3159       if (qn->lower <= 1) {
3160 	int ttype = NTYPE(qn->target);
3161 	if (IS_NODE_TYPE_SIMPLE(ttype)) {
3162 	  Node *x, *y;
3163 	  x = get_head_value_node(qn->target, 0, reg);
3164 	  if (IS_NOT_NULL(x)) {
3165 	    y = get_head_value_node(next_node,  0, reg);
3166 	    if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3167 	      Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
3168 	      CHECK_NULL_RETURN_MEMERR(en);
3169 	      SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3170 	      swap_node(node, en);
3171 	      NENCLOSE(node)->target = en;
3172 	    }
3173 	  }
3174 	}
3175       }
3176     }
3177   }
3178   else if (type == NT_ENCLOSE) {
3179     EncloseNode* en = NENCLOSE(node);
3180     if (en->type == ENCLOSE_MEMORY) {
3181       node = en->target;
3182       goto retry;
3183     }
3184   }
3185   return 0;
3186 }
3187 
3188 
3189 static int
update_string_node_case_fold(regex_t * reg,Node * node)3190 update_string_node_case_fold(regex_t* reg, Node *node)
3191 {
3192   UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3193   UChar *sbuf, *ebuf, *sp;
3194   int r, i, len, sbuf_size;
3195   StrNode* sn = NSTR(node);
3196 
3197   end = sn->end;
3198   sbuf_size = (end - sn->s) * 2;
3199   sbuf = (UChar* )xmalloc(sbuf_size);
3200   CHECK_NULL_RETURN_MEMERR(sbuf);
3201   ebuf = sbuf + sbuf_size;
3202 
3203   sp = sbuf;
3204   p = sn->s;
3205   while (p < end) {
3206     len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3207     for (i = 0; i < len; i++) {
3208       if (sp >= ebuf) {
3209         sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2);
3210         CHECK_NULL_RETURN_MEMERR(sbuf);
3211         sp = sbuf + sbuf_size;
3212         sbuf_size *= 2;
3213         ebuf = sbuf + sbuf_size;
3214       }
3215 
3216       *sp++ = buf[i];
3217     }
3218   }
3219 
3220   r = onig_node_str_set(node, sbuf, sp);
3221   if (r != 0) {
3222     xfree(sbuf);
3223     return r;
3224   }
3225 
3226   xfree(sbuf);
3227   return 0;
3228 }
3229 
3230 static int
expand_case_fold_make_rem_string(Node ** rnode,UChar * s,UChar * end,regex_t * reg)3231 expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
3232 				 regex_t* reg)
3233 {
3234   int r;
3235   Node *node;
3236 
3237   node = onig_node_new_str(s, end);
3238   if (IS_NULL(node)) return ONIGERR_MEMORY;
3239 
3240   r = update_string_node_case_fold(reg, node);
3241   if (r != 0) {
3242     onig_node_free(node);
3243     return r;
3244   }
3245 
3246   NSTRING_SET_AMBIG(node);
3247   NSTRING_SET_DONT_GET_OPT_INFO(node);
3248   *rnode = node;
3249   return 0;
3250 }
3251 
3252 static int
expand_case_fold_string_alt(int item_num,OnigCaseFoldCodeItem items[],UChar * p,int slen,UChar * end,regex_t * reg,Node ** rnode)3253 expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
3254 			    UChar *p, int slen, UChar *end,
3255 			    regex_t* reg, Node **rnode)
3256 {
3257   int r, i, j, len, varlen;
3258   Node *anode, *var_anode, *snode, *xnode, *an;
3259   UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3260 
3261   *rnode = var_anode = NULL_NODE;
3262 
3263   varlen = 0;
3264   for (i = 0; i < item_num; i++) {
3265     if (items[i].byte_len != slen) {
3266       varlen = 1;
3267       break;
3268     }
3269   }
3270 
3271   if (varlen != 0) {
3272     *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3273     if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3274 
3275     xnode = onig_node_new_list(NULL, NULL);
3276     if (IS_NULL(xnode)) goto mem_err;
3277     NCAR(var_anode) = xnode;
3278 
3279     anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3280     if (IS_NULL(anode)) goto mem_err;
3281     NCAR(xnode) = anode;
3282   }
3283   else {
3284     *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3285     if (IS_NULL(anode)) return ONIGERR_MEMORY;
3286   }
3287 
3288   snode = onig_node_new_str(p, p + slen);
3289   if (IS_NULL(snode)) goto mem_err;
3290 
3291   NCAR(anode) = snode;
3292 
3293   for (i = 0; i < item_num; i++) {
3294     snode = onig_node_new_str(NULL, NULL);
3295     if (IS_NULL(snode)) goto mem_err;
3296 
3297     for (j = 0; j < items[i].code_len; j++) {
3298       len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3299       if (len < 0) {
3300 	r = len;
3301 	goto mem_err2;
3302       }
3303 
3304       r = onig_node_str_cat(snode, buf, buf + len);
3305       if (r != 0) goto mem_err2;
3306     }
3307 
3308     an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3309     if (IS_NULL(an)) {
3310       goto mem_err2;
3311     }
3312 
3313     if (items[i].byte_len != slen) {
3314       Node *rem;
3315       UChar *q = p + items[i].byte_len;
3316 
3317       if (q < end) {
3318         r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3319         if (r != 0) {
3320           onig_node_free(an);
3321           goto mem_err2;
3322         }
3323 
3324         xnode = onig_node_list_add(NULL_NODE, snode);
3325         if (IS_NULL(xnode)) {
3326           onig_node_free(an);
3327           onig_node_free(rem);
3328           goto mem_err2;
3329         }
3330         if (IS_NULL(onig_node_list_add(xnode, rem))) {
3331           onig_node_free(an);
3332           onig_node_free(xnode);
3333           onig_node_free(rem);
3334           goto mem_err;
3335         }
3336 
3337         NCAR(an) = xnode;
3338       }
3339       else {
3340         NCAR(an) = snode;
3341       }
3342 
3343       NCDR(var_anode) = an;
3344       var_anode = an;
3345     }
3346     else {
3347       NCAR(an)     = snode;
3348       NCDR(anode) = an;
3349       anode = an;
3350     }
3351   }
3352 
3353   return varlen;
3354 
3355  mem_err2:
3356   onig_node_free(snode);
3357 
3358  mem_err:
3359   onig_node_free(*rnode);
3360 
3361   return ONIGERR_MEMORY;
3362 }
3363 
3364 static int
expand_case_fold_string(Node * node,regex_t * reg)3365 expand_case_fold_string(Node* node, regex_t* reg)
3366 {
3367 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION  8
3368 
3369   int r, n, len, alt_num;
3370   UChar *start, *end, *p;
3371   Node *top_root, *root, *snode, *prev_node;
3372   OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3373   StrNode* sn = NSTR(node);
3374 
3375   if (NSTRING_IS_AMBIG(node)) return 0;
3376 
3377   start = sn->s;
3378   end   = sn->end;
3379   if (start >= end) return 0;
3380 
3381   r = 0;
3382   top_root = root = prev_node = snode = NULL_NODE;
3383   alt_num = 1;
3384   p = start;
3385   while (p < end) {
3386     n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3387 					   p, end, items);
3388     if (n < 0) {
3389       r = n;
3390       goto err;
3391     }
3392 
3393 	SAFE_ENC_LEN(reg->enc, p, end, len);
3394 
3395     if (n == 0) {
3396       if (IS_NULL(snode)) {
3397 	if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3398 	  top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3399 	  if (IS_NULL(root)) {
3400 	    onig_node_free(prev_node);
3401 	    goto mem_err;
3402 	  }
3403 	}
3404 
3405 	prev_node = snode = onig_node_new_str(NULL, NULL);
3406 	if (IS_NULL(snode)) goto mem_err;
3407 	if (IS_NOT_NULL(root)) {
3408 	  if (IS_NULL(onig_node_list_add(root, snode))) {
3409 	    onig_node_free(snode);
3410 	    goto mem_err;
3411 	  }
3412 	}
3413       }
3414 
3415       r = onig_node_str_cat(snode, p, p + len);
3416       if (r != 0) goto err;
3417     }
3418     else {
3419       alt_num *= (n + 1);
3420       if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3421 
3422       if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3423 	top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3424 	if (IS_NULL(root)) {
3425 	  onig_node_free(prev_node);
3426 	  goto mem_err;
3427 	}
3428       }
3429 
3430       r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3431       if (r < 0) goto mem_err;
3432       if (r == 1) {
3433 	if (IS_NULL(root)) {
3434 	  top_root = prev_node;
3435 	}
3436 	else {
3437 	  if (IS_NULL(onig_node_list_add(root, prev_node))) {
3438 	    onig_node_free(prev_node);
3439 	    goto mem_err;
3440 	  }
3441 	}
3442 
3443 	root = NCAR(prev_node);
3444       }
3445       else { /* r == 0 */
3446 	if (IS_NOT_NULL(root)) {
3447 	  if (IS_NULL(onig_node_list_add(root, prev_node))) {
3448 	    onig_node_free(prev_node);
3449 	    goto mem_err;
3450 	  }
3451 	}
3452       }
3453 
3454       snode = NULL_NODE;
3455     }
3456 
3457     p += len;
3458   }
3459 
3460   if (p < end) {
3461     Node *srem;
3462 
3463     r = expand_case_fold_make_rem_string(&srem, p, end, reg);
3464     if (r != 0) goto mem_err;
3465 
3466     if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
3467       top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3468       if (IS_NULL(root)) {
3469 	onig_node_free(srem);
3470 	onig_node_free(prev_node);
3471 	goto mem_err;
3472       }
3473     }
3474 
3475     if (IS_NULL(root)) {
3476       prev_node = srem;
3477     }
3478     else {
3479       if (IS_NULL(onig_node_list_add(root, srem))) {
3480 	onig_node_free(srem);
3481 	goto mem_err;
3482       }
3483     }
3484   }
3485 
3486   /* ending */
3487   top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
3488   swap_node(node, top_root);
3489   onig_node_free(top_root);
3490   return 0;
3491 
3492  mem_err:
3493   r = ONIGERR_MEMORY;
3494 
3495  err:
3496   onig_node_free(top_root);
3497   return r;
3498 }
3499 
3500 
3501 #ifdef USE_COMBINATION_EXPLOSION_CHECK
3502 
3503 #define CEC_THRES_NUM_BIG_REPEAT         512
3504 #define CEC_INFINITE_NUM          0x7fffffff
3505 
3506 #define CEC_IN_INFINITE_REPEAT    (1<<0)
3507 #define CEC_IN_FINITE_REPEAT      (1<<1)
3508 #define CEC_CONT_BIG_REPEAT       (1<<2)
3509 
3510 static int
setup_comb_exp_check(Node * node,int state,ScanEnv * env)3511 setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3512 {
3513   int type;
3514   int r = state;
3515 
3516   type = NTYPE(node);
3517   switch (type) {
3518   case NT_LIST:
3519     {
3520       Node* prev = NULL_NODE;
3521       do {
3522 	r = setup_comb_exp_check(NCAR(node), r, env);
3523 	prev = NCAR(node);
3524       } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
3525     }
3526     break;
3527 
3528   case NT_ALT:
3529     {
3530       int ret;
3531       do {
3532 	ret = setup_comb_exp_check(NCAR(node), state, env);
3533 	r |= ret;
3534       } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
3535     }
3536     break;
3537 
3538   case NT_QTFR:
3539     {
3540       int child_state = state;
3541       int add_state = 0;
3542       QtfrNode* qn = NQTFR(node);
3543       Node* target = qn->target;
3544       int var_num;
3545 
3546       if (! IS_REPEAT_INFINITE(qn->upper)) {
3547 	if (qn->upper > 1) {
3548 	  /* {0,1}, {1,1} are allowed */
3549 	  child_state |= CEC_IN_FINITE_REPEAT;
3550 
3551 	  /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3552 	  if (env->backrefed_mem == 0) {
3553 	    if (NTYPE(qn->target) == NT_ENCLOSE) {
3554 	      EncloseNode* en = NENCLOSE(qn->target);
3555 	      if (en->type == ENCLOSE_MEMORY) {
3556 		if (NTYPE(en->target) == NT_QTFR) {
3557 		  QtfrNode* q = NQTFR(en->target);
3558 		  if (IS_REPEAT_INFINITE(q->upper)
3559 		      && q->greedy == qn->greedy) {
3560 		    qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3561 		    if (qn->upper == 1)
3562 		      child_state = state;
3563 		  }
3564 		}
3565 	      }
3566 	    }
3567 	  }
3568 	}
3569       }
3570 
3571       if (state & CEC_IN_FINITE_REPEAT) {
3572 	qn->comb_exp_check_num = -1;
3573       }
3574       else {
3575 	if (IS_REPEAT_INFINITE(qn->upper)) {
3576 	  var_num = CEC_INFINITE_NUM;
3577 	  child_state |= CEC_IN_INFINITE_REPEAT;
3578 	}
3579 	else {
3580 	  var_num = qn->upper - qn->lower;
3581 	}
3582 
3583 	if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3584 	  add_state |= CEC_CONT_BIG_REPEAT;
3585 
3586 	if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3587 	    ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3588 	     var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3589 	  if (qn->comb_exp_check_num == 0) {
3590 	    env->num_comb_exp_check++;
3591 	    qn->comb_exp_check_num = env->num_comb_exp_check;
3592 	    if (env->curr_max_regnum > env->comb_exp_max_regnum)
3593 	      env->comb_exp_max_regnum = env->curr_max_regnum;
3594 	  }
3595 	}
3596       }
3597 
3598       r = setup_comb_exp_check(target, child_state, env);
3599       r |= add_state;
3600     }
3601     break;
3602 
3603   case NT_ENCLOSE:
3604     {
3605       EncloseNode* en = NENCLOSE(node);
3606 
3607       switch (en->type) {
3608       case ENCLOSE_MEMORY:
3609 	{
3610 	  if (env->curr_max_regnum < en->regnum)
3611 	    env->curr_max_regnum = en->regnum;
3612 
3613 	  r = setup_comb_exp_check(en->target, state, env);
3614 	}
3615 	break;
3616 
3617       default:
3618 	r = setup_comb_exp_check(en->target, state, env);
3619 	break;
3620       }
3621     }
3622     break;
3623 
3624 #ifdef USE_SUBEXP_CALL
3625   case NT_CALL:
3626     if (IS_CALL_RECURSION(NCALL(node)))
3627       env->has_recursion = 1;
3628     else
3629       r = setup_comb_exp_check(NCALL(node)->target, state, env);
3630     break;
3631 #endif
3632 
3633   default:
3634     break;
3635   }
3636 
3637   return r;
3638 }
3639 #endif
3640 
3641 #define IN_ALT        (1<<0)
3642 #define IN_NOT        (1<<1)
3643 #define IN_REPEAT     (1<<2)
3644 #define IN_VAR_REPEAT (1<<3)
3645 
3646 /* setup_tree does the following work.
3647  1. check empty loop. (set qn->target_empty_info)
3648  2. expand ignore-case in char class.
3649  3. set memory status bit flags. (reg->mem_stats)
3650  4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3651  5. find invalid patterns in look-behind.
3652  6. expand repeated string.
3653  */
3654 static int
setup_tree(Node * node,regex_t * reg,int state,ScanEnv * env)3655 setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3656 {
3657   int type;
3658   int r = 0;
3659 
3660   type = NTYPE(node);
3661   switch (type) {
3662   case NT_LIST:
3663     {
3664       Node* prev = NULL_NODE;
3665       do {
3666 	r = setup_tree(NCAR(node), reg, state, env);
3667 	if (IS_NOT_NULL(prev) && r == 0) {
3668 	  r = next_setup(prev, NCAR(node), reg);
3669 	}
3670 	prev = NCAR(node);
3671       } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3672     }
3673     break;
3674 
3675   case NT_ALT:
3676     do {
3677       r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
3678     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3679     break;
3680 
3681   case NT_CCLASS:
3682     break;
3683 
3684   case NT_STR:
3685     if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3686       r = expand_case_fold_string(node, reg);
3687     }
3688     break;
3689 
3690   case NT_CTYPE:
3691   case NT_CANY:
3692     break;
3693 
3694 #ifdef USE_SUBEXP_CALL
3695   case NT_CALL:
3696     break;
3697 #endif
3698 
3699   case NT_BREF:
3700     {
3701       int i;
3702       int* p;
3703       Node** nodes = SCANENV_MEM_NODES(env);
3704       BRefNode* br = NBREF(node);
3705       p = BACKREFS_P(br);
3706       for (i = 0; i < br->back_num; i++) {
3707 	if (p[i] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
3708 	BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3709 	BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3710 #ifdef USE_BACKREF_WITH_LEVEL
3711 	if (IS_BACKREF_NEST_LEVEL(br)) {
3712 	  BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3713 	}
3714 #endif
3715 	SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3716       }
3717     }
3718     break;
3719 
3720   case NT_QTFR:
3721     {
3722       OnigDistance d;
3723       QtfrNode* qn = NQTFR(node);
3724       Node* target = qn->target;
3725 
3726       if ((state & IN_REPEAT) != 0) {
3727         qn->state |= NST_IN_REPEAT;
3728       }
3729 
3730       if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3731 	r = get_min_match_length(target, &d, env);
3732 	if (r) break;
3733 	if (d == 0) {
3734 	  qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3735 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
3736 	  r = quantifiers_memory_node_info(target);
3737 	  if (r < 0) break;
3738 	  if (r > 0) {
3739 	    qn->target_empty_info = r;
3740 	  }
3741 #endif
3742 #if 0
3743 	  r = get_max_match_length(target, &d, env);
3744 	  if (r == 0 && d == 0) {
3745 	    /*  ()* ==> ()?, ()+ ==> ()  */
3746 	    qn->upper = 1;
3747 	    if (qn->lower > 1) qn->lower = 1;
3748 	    if (NTYPE(target) == NT_STR) {
3749 	      qn->upper = qn->lower = 0;  /* /(?:)+/ ==> // */
3750 	    }
3751 	  }
3752 #endif
3753 	}
3754       }
3755 
3756       state |= IN_REPEAT;
3757       if (qn->lower != qn->upper)
3758 	state |= IN_VAR_REPEAT;
3759       r = setup_tree(target, reg, state, env);
3760       if (r) break;
3761 
3762       /* expand string */
3763 #define EXPAND_STRING_MAX_LENGTH  100
3764       if (NTYPE(target) == NT_STR) {
3765 	if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper &&
3766 	    qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3767 	  int len = NSTRING_LEN(target);
3768 	  StrNode* sn = NSTR(target);
3769 
3770 	  if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3771 	    int i, n = qn->lower;
3772 	    onig_node_conv_to_str_node(node, NSTR(target)->flag);
3773 	    for (i = 0; i < n; i++) {
3774 	      r = onig_node_str_cat(node, sn->s, sn->end);
3775 	      if (r) break;
3776 	    }
3777 	    onig_node_free(target);
3778 	    break; /* break case NT_QTFR: */
3779 	  }
3780 	}
3781       }
3782 
3783 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
3784       if (qn->greedy && (qn->target_empty_info != 0)) {
3785 	if (NTYPE(target) == NT_QTFR) {
3786 	  QtfrNode* tqn = NQTFR(target);
3787 	  if (IS_NOT_NULL(tqn->head_exact)) {
3788 	    qn->head_exact  = tqn->head_exact;
3789 	    tqn->head_exact = NULL;
3790 	  }
3791 	}
3792 	else {
3793 	  qn->head_exact = get_head_value_node(qn->target, 1, reg);
3794 	}
3795       }
3796 #endif
3797     }
3798     break;
3799 
3800   case NT_ENCLOSE:
3801     {
3802       EncloseNode* en = NENCLOSE(node);
3803 
3804       switch (en->type) {
3805       case ENCLOSE_OPTION:
3806 	{
3807 	  OnigOptionType options = reg->options;
3808 	  reg->options = NENCLOSE(node)->option;
3809 	  r = setup_tree(NENCLOSE(node)->target, reg, state, env);
3810 	  reg->options = options;
3811 	}
3812 	break;
3813 
3814       case ENCLOSE_MEMORY:
3815 	if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) {
3816 	  BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
3817 	  /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
3818 	}
3819         r = setup_tree(en->target, reg, state, env);
3820         break;
3821 
3822       case ENCLOSE_STOP_BACKTRACK:
3823 	{
3824 	  Node* target = en->target;
3825 	  r = setup_tree(target, reg, state, env);
3826 	  if (NTYPE(target) == NT_QTFR) {
3827 	    QtfrNode* tqn = NQTFR(target);
3828 	    if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
3829 		tqn->greedy != 0) {  /* (?>a*), a*+ etc... */
3830 	      int qtype = NTYPE(tqn->target);
3831 	      if (IS_NODE_TYPE_SIMPLE(qtype))
3832 		SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
3833 	    }
3834 	  }
3835 	}
3836 	break;
3837       }
3838     }
3839     break;
3840 
3841   case NT_ANCHOR:
3842     {
3843       AnchorNode* an = NANCHOR(node);
3844 
3845       switch (an->type) {
3846       case ANCHOR_PREC_READ:
3847 	r = setup_tree(an->target, reg, state, env);
3848 	break;
3849       case ANCHOR_PREC_READ_NOT:
3850 	r = setup_tree(an->target, reg, (state | IN_NOT), env);
3851 	break;
3852 
3853 /* allowed node types in look-behind */
3854 #define ALLOWED_TYPE_IN_LB  \
3855   ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
3856     BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
3857 
3858 #define ALLOWED_ENCLOSE_IN_LB       ( ENCLOSE_MEMORY )
3859 #define ALLOWED_ENCLOSE_IN_LB_NOT   0
3860 
3861 #define ALLOWED_ANCHOR_IN_LB \
3862 ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3863 #define ALLOWED_ANCHOR_IN_LB_NOT \
3864 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3865 
3866       case ANCHOR_LOOK_BEHIND:
3867 	{
3868 	  r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3869 			      ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
3870 	  if (r < 0) return r;
3871 	  if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3872 	  r = setup_look_behind(node, reg, env);
3873 	  if (r != 0) return r;
3874 	  r = setup_tree(an->target, reg, state, env);
3875 	}
3876 	break;
3877 
3878       case ANCHOR_LOOK_BEHIND_NOT:
3879 	{
3880 	  r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3881 		      ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
3882 	  if (r < 0) return r;
3883 	  if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3884 	  r = setup_look_behind(node, reg, env);
3885 	  if (r != 0) return r;
3886 	  r = setup_tree(an->target, reg, (state | IN_NOT), env);
3887 	}
3888 	break;
3889       }
3890     }
3891     break;
3892 
3893   default:
3894     break;
3895   }
3896 
3897   return r;
3898 }
3899 
3900 /* set skip map for Boyer-Moor search */
3901 static int
set_bm_skip(UChar * s,UChar * end,OnigEncoding enc ARG_UNUSED,UChar skip[],int ** int_skip)3902 set_bm_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED,
3903 	    UChar skip[], int** int_skip)
3904 {
3905   int i, len;
3906 
3907   len = end - s;
3908   if (len < ONIG_CHAR_TABLE_SIZE) {
3909     for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len;
3910 
3911     for (i = 0; i < len - 1; i++)
3912       skip[s[i]] = len - 1 - i;
3913   }
3914   else {
3915     if (IS_NULL(*int_skip)) {
3916       *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
3917       if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
3918     }
3919     for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len;
3920 
3921     for (i = 0; i < len - 1; i++)
3922       (*int_skip)[s[i]] = len - 1 - i;
3923   }
3924   return 0;
3925 }
3926 
3927 #define OPT_EXACT_MAXLEN   24
3928 
3929 typedef struct {
3930   OnigDistance min;  /* min byte length */
3931   OnigDistance max;  /* max byte length */
3932 } MinMaxLen;
3933 
3934 typedef struct {
3935   MinMaxLen        mmd;
3936   OnigEncoding     enc;
3937   OnigOptionType   options;
3938   OnigCaseFoldType case_fold_flag;
3939   ScanEnv*         scan_env;
3940 } OptEnv;
3941 
3942 typedef struct {
3943   int left_anchor;
3944   int right_anchor;
3945 } OptAncInfo;
3946 
3947 typedef struct {
3948   MinMaxLen  mmd; /* info position */
3949   OptAncInfo anc;
3950 
3951   int   reach_end;
3952   int   ignore_case;
3953   int   len;
3954   UChar s[OPT_EXACT_MAXLEN];
3955 } OptExactInfo;
3956 
3957 typedef struct {
3958   MinMaxLen mmd; /* info position */
3959   OptAncInfo anc;
3960 
3961   int   value;      /* weighted value */
3962   UChar map[ONIG_CHAR_TABLE_SIZE];
3963 } OptMapInfo;
3964 
3965 typedef struct {
3966   MinMaxLen    len;
3967 
3968   OptAncInfo   anc;
3969   OptExactInfo exb;    /* boundary */
3970   OptExactInfo exm;    /* middle */
3971   OptExactInfo expr;   /* prec read (?=...) */
3972 
3973   OptMapInfo   map;   /* boundary */
3974 } NodeOptInfo;
3975 
3976 
3977 static int
map_position_value(OnigEncoding enc,int i)3978 map_position_value(OnigEncoding enc, int i)
3979 {
3980   static const short int ByteValTable[] = {
3981      5,  1,  1,  1,  1,  1,  1,  1,  1, 10, 10,  1,  1, 10,  1,  1,
3982      1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
3983     12,  4,  7,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,
3984      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,
3985      5,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
3986      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  6,  5,  5,  5,
3987      5,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
3988      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  1
3989   };
3990 
3991   if (i < (int )(sizeof(ByteValTable)/sizeof(ByteValTable[0]))) {
3992     if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
3993       return 20;
3994     else
3995       return (int )ByteValTable[i];
3996   }
3997   else
3998     return 4;   /* Take it easy. */
3999 }
4000 
4001 static int
distance_value(MinMaxLen * mm)4002 distance_value(MinMaxLen* mm)
4003 {
4004   /* 1000 / (min-max-dist + 1) */
4005   static const short int dist_vals[] = {
4006     1000,  500,  333,  250,  200,  167,  143,  125,  111,  100,
4007       91,   83,   77,   71,   67,   63,   59,   56,   53,   50,
4008       48,   45,   43,   42,   40,   38,   37,   36,   34,   33,
4009       32,   31,   30,   29,   29,   28,   27,   26,   26,   25,
4010       24,   24,   23,   23,   22,   22,   21,   21,   20,   20,
4011       20,   19,   19,   19,   18,   18,   18,   17,   17,   17,
4012       16,   16,   16,   16,   15,   15,   15,   15,   14,   14,
4013       14,   14,   14,   14,   13,   13,   13,   13,   13,   13,
4014       12,   12,   12,   12,   12,   12,   11,   11,   11,   11,
4015       11,   11,   11,   11,   11,   10,   10,   10,   10,   10
4016   };
4017 
4018   int d;
4019 
4020   if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
4021 
4022   d = mm->max - mm->min;
4023   if (d < (int )(sizeof(dist_vals)/sizeof(dist_vals[0])))
4024     /* return dist_vals[d] * 16 / (mm->min + 12); */
4025     return (int )dist_vals[d];
4026   else
4027     return 1;
4028 }
4029 
4030 static int
comp_distance_value(MinMaxLen * d1,MinMaxLen * d2,int v1,int v2)4031 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
4032 {
4033   if (v2 <= 0) return -1;
4034   if (v1 <= 0) return  1;
4035 
4036   v1 *= distance_value(d1);
4037   v2 *= distance_value(d2);
4038 
4039   if (v2 > v1) return  1;
4040   if (v2 < v1) return -1;
4041 
4042   if (d2->min < d1->min) return  1;
4043   if (d2->min > d1->min) return -1;
4044   return 0;
4045 }
4046 
4047 static int
is_equal_mml(MinMaxLen * a,MinMaxLen * b)4048 is_equal_mml(MinMaxLen* a, MinMaxLen* b)
4049 {
4050   return (a->min == b->min && a->max == b->max) ? 1 : 0;
4051 }
4052 
4053 
4054 static void
set_mml(MinMaxLen * mml,OnigDistance min,OnigDistance max)4055 set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
4056 {
4057   mml->min = min;
4058   mml->max = max;
4059 }
4060 
4061 static void
clear_mml(MinMaxLen * mml)4062 clear_mml(MinMaxLen* mml)
4063 {
4064   mml->min = mml->max = 0;
4065 }
4066 
4067 static void
copy_mml(MinMaxLen * to,MinMaxLen * from)4068 copy_mml(MinMaxLen* to, MinMaxLen* from)
4069 {
4070   to->min = from->min;
4071   to->max = from->max;
4072 }
4073 
4074 static void
add_mml(MinMaxLen * to,MinMaxLen * from)4075 add_mml(MinMaxLen* to, MinMaxLen* from)
4076 {
4077   to->min = distance_add(to->min, from->min);
4078   to->max = distance_add(to->max, from->max);
4079 }
4080 
4081 #if 0
4082 static void
4083 add_len_mml(MinMaxLen* to, OnigDistance len)
4084 {
4085   to->min = distance_add(to->min, len);
4086   to->max = distance_add(to->max, len);
4087 }
4088 #endif
4089 
4090 static void
alt_merge_mml(MinMaxLen * to,MinMaxLen * from)4091 alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
4092 {
4093   if (to->min > from->min) to->min = from->min;
4094   if (to->max < from->max) to->max = from->max;
4095 }
4096 
4097 static void
copy_opt_env(OptEnv * to,OptEnv * from)4098 copy_opt_env(OptEnv* to, OptEnv* from)
4099 {
4100   *to = *from;
4101 }
4102 
4103 static void
clear_opt_anc_info(OptAncInfo * anc)4104 clear_opt_anc_info(OptAncInfo* anc)
4105 {
4106   anc->left_anchor  = 0;
4107   anc->right_anchor = 0;
4108 }
4109 
4110 static void
copy_opt_anc_info(OptAncInfo * to,OptAncInfo * from)4111 copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
4112 {
4113   *to = *from;
4114 }
4115 
4116 static void
concat_opt_anc_info(OptAncInfo * to,OptAncInfo * left,OptAncInfo * right,OnigDistance left_len,OnigDistance right_len)4117 concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
4118 		    OnigDistance left_len, OnigDistance right_len)
4119 {
4120   clear_opt_anc_info(to);
4121 
4122   to->left_anchor = left->left_anchor;
4123   if (left_len == 0) {
4124     to->left_anchor |= right->left_anchor;
4125   }
4126 
4127   to->right_anchor = right->right_anchor;
4128   if (right_len == 0) {
4129     to->right_anchor |= left->right_anchor;
4130   }
4131 }
4132 
4133 static int
is_left_anchor(int anc)4134 is_left_anchor(int anc)
4135 {
4136   if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
4137       anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
4138       anc == ANCHOR_PREC_READ_NOT)
4139     return 0;
4140 
4141   return 1;
4142 }
4143 
4144 static int
is_set_opt_anc_info(OptAncInfo * to,int anc)4145 is_set_opt_anc_info(OptAncInfo* to, int anc)
4146 {
4147   if ((to->left_anchor & anc) != 0) return 1;
4148 
4149   return ((to->right_anchor & anc) != 0 ? 1 : 0);
4150 }
4151 
4152 static void
add_opt_anc_info(OptAncInfo * to,int anc)4153 add_opt_anc_info(OptAncInfo* to, int anc)
4154 {
4155   if (is_left_anchor(anc))
4156     to->left_anchor |= anc;
4157   else
4158     to->right_anchor |= anc;
4159 }
4160 
4161 static void
remove_opt_anc_info(OptAncInfo * to,int anc)4162 remove_opt_anc_info(OptAncInfo* to, int anc)
4163 {
4164   if (is_left_anchor(anc))
4165     to->left_anchor &= ~anc;
4166   else
4167     to->right_anchor &= ~anc;
4168 }
4169 
4170 static void
alt_merge_opt_anc_info(OptAncInfo * to,OptAncInfo * add)4171 alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
4172 {
4173   to->left_anchor  &= add->left_anchor;
4174   to->right_anchor &= add->right_anchor;
4175 }
4176 
4177 static int
is_full_opt_exact_info(OptExactInfo * ex)4178 is_full_opt_exact_info(OptExactInfo* ex)
4179 {
4180   return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
4181 }
4182 
4183 static void
clear_opt_exact_info(OptExactInfo * ex)4184 clear_opt_exact_info(OptExactInfo* ex)
4185 {
4186   clear_mml(&ex->mmd);
4187   clear_opt_anc_info(&ex->anc);
4188   ex->reach_end   = 0;
4189   ex->ignore_case = 0;
4190   ex->len         = 0;
4191   ex->s[0]        = '\0';
4192 }
4193 
4194 static void
copy_opt_exact_info(OptExactInfo * to,OptExactInfo * from)4195 copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
4196 {
4197   *to = *from;
4198 }
4199 
4200 static void
concat_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OnigEncoding enc)4201 concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
4202 {
4203   int i, j, len;
4204   UChar *p, *end;
4205   OptAncInfo tanc;
4206 
4207   if (! to->ignore_case && add->ignore_case) {
4208     if (to->len >= add->len) return ;  /* avoid */
4209 
4210     to->ignore_case = 1;
4211   }
4212 
4213   p = add->s;
4214   end = p + add->len;
4215   for (i = to->len; p < end; ) {
4216     len = enclen(enc, p);
4217     if (i + len > OPT_EXACT_MAXLEN) break;
4218     for (j = 0; j < len && p < end; j++)
4219       to->s[i++] = *p++;
4220   }
4221 
4222   to->len = i;
4223   to->reach_end = (p == end ? add->reach_end : 0);
4224 
4225   concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4226   if (! to->reach_end) tanc.right_anchor = 0;
4227   copy_opt_anc_info(&to->anc, &tanc);
4228 }
4229 
4230 static void
concat_opt_exact_info_str(OptExactInfo * to,UChar * s,UChar * end,int raw ARG_UNUSED,OnigEncoding enc)4231 concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
4232 			  int raw ARG_UNUSED, OnigEncoding enc)
4233 {
4234   int i, j, len;
4235   UChar *p;
4236 
4237   for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4238     len = enclen(enc, p);
4239     if (i + len > OPT_EXACT_MAXLEN) break;
4240     for (j = 0; j < len && p < end; j++)
4241       to->s[i++] = *p++;
4242   }
4243 
4244   to->len = i;
4245 }
4246 
4247 static void
alt_merge_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OptEnv * env)4248 alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4249 {
4250   int i, j, len;
4251 
4252   if (add->len == 0 || to->len == 0) {
4253     clear_opt_exact_info(to);
4254     return ;
4255   }
4256 
4257   if (! is_equal_mml(&to->mmd, &add->mmd)) {
4258     clear_opt_exact_info(to);
4259     return ;
4260   }
4261 
4262   for (i = 0; i < to->len && i < add->len; ) {
4263     if (to->s[i] != add->s[i]) break;
4264     len = enclen(env->enc, to->s + i);
4265 
4266     for (j = 1; j < len; j++) {
4267       if (to->s[i+j] != add->s[i+j]) break;
4268     }
4269     if (j < len) break;
4270     i += len;
4271   }
4272 
4273   if (! add->reach_end || i < add->len || i < to->len) {
4274     to->reach_end = 0;
4275   }
4276   to->len = i;
4277   to->ignore_case |= add->ignore_case;
4278 
4279   alt_merge_opt_anc_info(&to->anc, &add->anc);
4280   if (! to->reach_end) to->anc.right_anchor = 0;
4281 }
4282 
4283 static void
select_opt_exact_info(OnigEncoding enc,OptExactInfo * now,OptExactInfo * alt)4284 select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4285 {
4286   int v1, v2;
4287 
4288   v1 = now->len;
4289   v2 = alt->len;
4290 
4291   if (v2 == 0) {
4292     return ;
4293   }
4294   else if (v1 == 0) {
4295     copy_opt_exact_info(now, alt);
4296     return ;
4297   }
4298   else if (v1 <= 2 && v2 <= 2) {
4299     /* ByteValTable[x] is big value --> low price */
4300     v2 = map_position_value(enc, now->s[0]);
4301     v1 = map_position_value(enc, alt->s[0]);
4302 
4303     if (now->len > 1) v1 += 5;
4304     if (alt->len > 1) v2 += 5;
4305   }
4306 
4307   if (now->ignore_case == 0) v1 *= 2;
4308   if (alt->ignore_case == 0) v2 *= 2;
4309 
4310   if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4311     copy_opt_exact_info(now, alt);
4312 }
4313 
4314 static void
clear_opt_map_info(OptMapInfo * map)4315 clear_opt_map_info(OptMapInfo* map)
4316 {
4317   static const OptMapInfo clean_info = {
4318     {0, 0}, {0, 0}, 0,
4319     {
4320       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4321       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4322       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4323       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4324       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4325       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4326       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4327       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4328       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4329       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4330       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4331       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4332       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4333       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4334       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4335       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4336     }
4337   };
4338 
4339   xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4340 }
4341 
4342 static void
copy_opt_map_info(OptMapInfo * to,OptMapInfo * from)4343 copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4344 {
4345   *to = *from;
4346 }
4347 
4348 static void
add_char_opt_map_info(OptMapInfo * map,UChar c,OnigEncoding enc)4349 add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4350 {
4351   if (map->map[c] == 0) {
4352     map->map[c] = 1;
4353     map->value += map_position_value(enc, c);
4354   }
4355 }
4356 
4357 static int
add_char_amb_opt_map_info(OptMapInfo * map,UChar * p,UChar * end,OnigEncoding enc,OnigCaseFoldType case_fold_flag)4358 add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4359                           OnigEncoding enc, OnigCaseFoldType case_fold_flag)
4360 {
4361   OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4362   UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4363   int i, n;
4364 
4365   add_char_opt_map_info(map, p[0], enc);
4366 
4367   case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
4368   n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
4369   if (n < 0) return n;
4370 
4371   for (i = 0; i < n; i++) {
4372     ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
4373     add_char_opt_map_info(map, buf[0], enc);
4374   }
4375 
4376   return 0;
4377 }
4378 
4379 static void
select_opt_map_info(OptMapInfo * now,OptMapInfo * alt)4380 select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4381 {
4382   static int z = 1<<15; /* 32768: something big value */
4383 
4384   int v1, v2;
4385 
4386   if (alt->value == 0) return ;
4387   if (now->value == 0) {
4388     copy_opt_map_info(now, alt);
4389     return ;
4390   }
4391 
4392   v1 = z / now->value;
4393   v2 = z / alt->value;
4394   if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4395     copy_opt_map_info(now, alt);
4396 }
4397 
4398 static int
comp_opt_exact_or_map_info(OptExactInfo * e,OptMapInfo * m)4399 comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4400 {
4401 #define COMP_EM_BASE  20
4402   int ve, vm;
4403 
4404   if (m->value <= 0) return -1;
4405 
4406   ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2);
4407   vm = COMP_EM_BASE * 5 * 2 / m->value;
4408   return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4409 }
4410 
4411 static void
alt_merge_opt_map_info(OnigEncoding enc,OptMapInfo * to,OptMapInfo * add)4412 alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4413 {
4414   int i, val;
4415 
4416   /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4417   if (to->value == 0) return ;
4418   if (add->value == 0 || to->mmd.max < add->mmd.min) {
4419     clear_opt_map_info(to);
4420     return ;
4421   }
4422 
4423   alt_merge_mml(&to->mmd, &add->mmd);
4424 
4425   val = 0;
4426   for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4427     if (add->map[i])
4428       to->map[i] = 1;
4429 
4430     if (to->map[i])
4431       val += map_position_value(enc, i);
4432   }
4433   to->value = val;
4434 
4435   alt_merge_opt_anc_info(&to->anc, &add->anc);
4436 }
4437 
4438 static void
set_bound_node_opt_info(NodeOptInfo * opt,MinMaxLen * mmd)4439 set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4440 {
4441   copy_mml(&(opt->exb.mmd),  mmd);
4442   copy_mml(&(opt->expr.mmd), mmd);
4443   copy_mml(&(opt->map.mmd),  mmd);
4444 }
4445 
4446 static void
clear_node_opt_info(NodeOptInfo * opt)4447 clear_node_opt_info(NodeOptInfo* opt)
4448 {
4449   clear_mml(&opt->len);
4450   clear_opt_anc_info(&opt->anc);
4451   clear_opt_exact_info(&opt->exb);
4452   clear_opt_exact_info(&opt->exm);
4453   clear_opt_exact_info(&opt->expr);
4454   clear_opt_map_info(&opt->map);
4455 }
4456 
4457 static void
copy_node_opt_info(NodeOptInfo * to,NodeOptInfo * from)4458 copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4459 {
4460   *to = *from;
4461 }
4462 
4463 static void
concat_left_node_opt_info(OnigEncoding enc,NodeOptInfo * to,NodeOptInfo * add)4464 concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4465 {
4466   int exb_reach, exm_reach;
4467   OptAncInfo tanc;
4468 
4469   concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4470   copy_opt_anc_info(&to->anc, &tanc);
4471 
4472   if (add->exb.len > 0 && to->len.max == 0) {
4473     concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4474 			to->len.max, add->len.max);
4475     copy_opt_anc_info(&add->exb.anc, &tanc);
4476   }
4477 
4478   if (add->map.value > 0 && to->len.max == 0) {
4479     if (add->map.mmd.max == 0)
4480       add->map.anc.left_anchor |= to->anc.left_anchor;
4481   }
4482 
4483   exb_reach = to->exb.reach_end;
4484   exm_reach = to->exm.reach_end;
4485 
4486   if (add->len.max != 0)
4487     to->exb.reach_end = to->exm.reach_end = 0;
4488 
4489   if (add->exb.len > 0) {
4490     if (exb_reach) {
4491       concat_opt_exact_info(&to->exb, &add->exb, enc);
4492       clear_opt_exact_info(&add->exb);
4493     }
4494     else if (exm_reach) {
4495       concat_opt_exact_info(&to->exm, &add->exb, enc);
4496       clear_opt_exact_info(&add->exb);
4497     }
4498   }
4499   select_opt_exact_info(enc, &to->exm, &add->exb);
4500   select_opt_exact_info(enc, &to->exm, &add->exm);
4501 
4502   if (to->expr.len > 0) {
4503     if (add->len.max > 0) {
4504       if (to->expr.len > (int )add->len.max)
4505 	to->expr.len = add->len.max;
4506 
4507       if (to->expr.mmd.max == 0)
4508 	select_opt_exact_info(enc, &to->exb, &to->expr);
4509       else
4510 	select_opt_exact_info(enc, &to->exm, &to->expr);
4511     }
4512   }
4513   else if (add->expr.len > 0) {
4514     copy_opt_exact_info(&to->expr, &add->expr);
4515   }
4516 
4517   select_opt_map_info(&to->map, &add->map);
4518 
4519   add_mml(&to->len, &add->len);
4520 }
4521 
4522 static void
alt_merge_node_opt_info(NodeOptInfo * to,NodeOptInfo * add,OptEnv * env)4523 alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4524 {
4525   alt_merge_opt_anc_info  (&to->anc,  &add->anc);
4526   alt_merge_opt_exact_info(&to->exb,  &add->exb, env);
4527   alt_merge_opt_exact_info(&to->exm,  &add->exm, env);
4528   alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4529   alt_merge_opt_map_info(env->enc, &to->map,  &add->map);
4530 
4531   alt_merge_mml(&to->len, &add->len);
4532 }
4533 
4534 
4535 #define MAX_NODE_OPT_INFO_REF_COUNT    5
4536 
4537 static int
optimize_node_left(Node * node,NodeOptInfo * opt,OptEnv * env)4538 optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4539 {
4540   int type;
4541   int r = 0;
4542 
4543   clear_node_opt_info(opt);
4544   set_bound_node_opt_info(opt, &env->mmd);
4545 
4546   type = NTYPE(node);
4547   switch (type) {
4548   case NT_LIST:
4549     {
4550       OptEnv nenv;
4551       NodeOptInfo nopt;
4552       Node* nd = node;
4553 
4554       copy_opt_env(&nenv, env);
4555       do {
4556 	r = optimize_node_left(NCAR(nd), &nopt, &nenv);
4557 	if (r == 0) {
4558 	  add_mml(&nenv.mmd, &nopt.len);
4559 	  concat_left_node_opt_info(env->enc, opt, &nopt);
4560 	}
4561       } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
4562     }
4563     break;
4564 
4565   case NT_ALT:
4566     {
4567       NodeOptInfo nopt;
4568       Node* nd = node;
4569 
4570       do {
4571 	r = optimize_node_left(NCAR(nd), &nopt, env);
4572 	if (r == 0) {
4573 	  if (nd == node) copy_node_opt_info(opt, &nopt);
4574 	  else            alt_merge_node_opt_info(opt, &nopt, env);
4575 	}
4576       } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
4577     }
4578     break;
4579 
4580   case NT_STR:
4581     {
4582       StrNode* sn = NSTR(node);
4583       int slen = sn->end - sn->s;
4584       int is_raw = NSTRING_IS_RAW(node);
4585 
4586       if (! NSTRING_IS_AMBIG(node)) {
4587 	concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4588 				  NSTRING_IS_RAW(node), env->enc);
4589 	if (slen > 0) {
4590 	  add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
4591 	}
4592         set_mml(&opt->len, slen, slen);
4593       }
4594       else {
4595         int max;
4596 
4597 	if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
4598           int n = onigenc_strlen(env->enc, sn->s, sn->end);
4599           max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
4600 	}
4601 	else {
4602 	  concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4603 				    is_raw, env->enc);
4604 	  opt->exb.ignore_case = 1;
4605 
4606 	  if (slen > 0) {
4607 	    r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
4608 					  env->enc, env->case_fold_flag);
4609 	    if (r != 0) break;
4610 	  }
4611 
4612 	  max = slen;
4613 	}
4614 
4615         set_mml(&opt->len, slen, max);
4616       }
4617 
4618       if (opt->exb.len == slen)
4619 	opt->exb.reach_end = 1;
4620     }
4621     break;
4622 
4623   case NT_CCLASS:
4624     {
4625       int i, z;
4626       CClassNode* cc = NCCLASS(node);
4627 
4628       /* no need to check ignore case. (setted in setup_tree()) */
4629 
4630       if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
4631         OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4632 	OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4633 
4634 	set_mml(&opt->len, min, max);
4635       }
4636       else {
4637         for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4638           z = BITSET_AT(cc->bs, i);
4639           if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
4640             add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4641           }
4642         }
4643 	set_mml(&opt->len, 1, 1);
4644       }
4645     }
4646     break;
4647 
4648   case NT_CTYPE:
4649     {
4650       int i, min, max;
4651 
4652       max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4653 
4654       if (max == 1) {
4655         min = 1;
4656 
4657 	switch (NCTYPE(node)->ctype) {
4658 	case ONIGENC_CTYPE_WORD:
4659 	  if (NCTYPE(node)->not != 0) {
4660 	    for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4661 	      if (! ONIGENC_IS_CODE_WORD(env->enc, i)) {
4662 		add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4663 	      }
4664 	    }
4665 	  }
4666 	  else {
4667 	    for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4668 	      if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
4669 		add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4670 	      }
4671 	    }
4672 	  }
4673 	  break;
4674 	}
4675       }
4676       else {
4677         min = ONIGENC_MBC_MINLEN(env->enc);
4678       }
4679       set_mml(&opt->len, min, max);
4680     }
4681     break;
4682 
4683   case NT_CANY:
4684     {
4685       OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4686       OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4687       set_mml(&opt->len, min, max);
4688     }
4689     break;
4690 
4691   case NT_ANCHOR:
4692     switch (NANCHOR(node)->type) {
4693     case ANCHOR_BEGIN_BUF:
4694     case ANCHOR_BEGIN_POSITION:
4695     case ANCHOR_BEGIN_LINE:
4696     case ANCHOR_END_BUF:
4697     case ANCHOR_SEMI_END_BUF:
4698     case ANCHOR_END_LINE:
4699       add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
4700       break;
4701 
4702     case ANCHOR_PREC_READ:
4703       {
4704 	NodeOptInfo nopt;
4705 
4706 	r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
4707 	if (r == 0) {
4708 	  if (nopt.exb.len > 0)
4709 	    copy_opt_exact_info(&opt->expr, &nopt.exb);
4710 	  else if (nopt.exm.len > 0)
4711 	    copy_opt_exact_info(&opt->expr, &nopt.exm);
4712 
4713 	  opt->expr.reach_end = 0;
4714 
4715 	  if (nopt.map.value > 0)
4716 	    copy_opt_map_info(&opt->map, &nopt.map);
4717 	}
4718       }
4719       break;
4720 
4721     case ANCHOR_PREC_READ_NOT:
4722     case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. */
4723     case ANCHOR_LOOK_BEHIND_NOT:
4724       break;
4725     }
4726     break;
4727 
4728   case NT_BREF:
4729     {
4730       int i;
4731       int* backs;
4732       OnigDistance min, max, tmin, tmax;
4733       Node** nodes = SCANENV_MEM_NODES(env->scan_env);
4734       BRefNode* br = NBREF(node);
4735 
4736       if (br->state & NST_RECURSION) {
4737 	set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4738 	break;
4739       }
4740       backs = BACKREFS_P(br);
4741       r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
4742       if (r != 0) break;
4743       r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
4744       if (r != 0) break;
4745       for (i = 1; i < br->back_num; i++) {
4746 	r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
4747 	if (r != 0) break;
4748 	r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
4749 	if (r != 0) break;
4750 	if (min > tmin) min = tmin;
4751 	if (max < tmax) max = tmax;
4752       }
4753       if (r == 0) set_mml(&opt->len, min, max);
4754     }
4755     break;
4756 
4757 #ifdef USE_SUBEXP_CALL
4758   case NT_CALL:
4759     if (IS_CALL_RECURSION(NCALL(node)))
4760       set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4761     else {
4762       OnigOptionType save = env->options;
4763       env->options = NENCLOSE(NCALL(node)->target)->option;
4764       r = optimize_node_left(NCALL(node)->target, opt, env);
4765       env->options = save;
4766     }
4767     break;
4768 #endif
4769 
4770   case NT_QTFR:
4771     {
4772       int i;
4773       OnigDistance min, max;
4774       NodeOptInfo nopt;
4775       QtfrNode* qn = NQTFR(node);
4776 
4777       r = optimize_node_left(qn->target, &nopt, env);
4778       if (r) break;
4779 
4780       if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
4781 	if (env->mmd.max == 0 &&
4782 	    NTYPE(qn->target) == NT_CANY && qn->greedy) {
4783 	  if (IS_MULTILINE(env->options))
4784 	    add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
4785 	  else
4786 	    add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
4787 	}
4788       }
4789       else {
4790 	if (qn->lower > 0) {
4791 	  copy_node_opt_info(opt, &nopt);
4792 	  if (nopt.exb.len > 0) {
4793 	    if (nopt.exb.reach_end) {
4794 	      for (i = 2; i <= qn->lower &&
4795 		          ! is_full_opt_exact_info(&opt->exb); i++) {
4796 		concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
4797 	      }
4798 	      if (i < qn->lower) {
4799 		opt->exb.reach_end = 0;
4800 	      }
4801 	    }
4802 	  }
4803 
4804 	  if (qn->lower != qn->upper) {
4805 	    opt->exb.reach_end = 0;
4806 	    opt->exm.reach_end = 0;
4807 	  }
4808 	  if (qn->lower > 1)
4809 	    opt->exm.reach_end = 0;
4810 	}
4811       }
4812 
4813       min = distance_multiply(nopt.len.min, qn->lower);
4814       if (IS_REPEAT_INFINITE(qn->upper))
4815 	max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
4816       else
4817 	max = distance_multiply(nopt.len.max, qn->upper);
4818 
4819       set_mml(&opt->len, min, max);
4820     }
4821     break;
4822 
4823   case NT_ENCLOSE:
4824     {
4825       EncloseNode* en = NENCLOSE(node);
4826 
4827       switch (en->type) {
4828       case ENCLOSE_OPTION:
4829 	{
4830 	  OnigOptionType save = env->options;
4831 
4832 	  env->options = en->option;
4833 	  r = optimize_node_left(en->target, opt, env);
4834 	  env->options = save;
4835 	}
4836 	break;
4837 
4838       case ENCLOSE_MEMORY:
4839 #ifdef USE_SUBEXP_CALL
4840 	en->opt_count++;
4841 	if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
4842 	  OnigDistance min, max;
4843 
4844 	  min = 0;
4845 	  max = ONIG_INFINITE_DISTANCE;
4846 	  if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
4847 	  if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
4848 	  set_mml(&opt->len, min, max);
4849 	}
4850 	else
4851 #endif
4852 	{
4853 	  r = optimize_node_left(en->target, opt, env);
4854 
4855 	  if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
4856 	    if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
4857 	      remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
4858 	  }
4859 	}
4860 	break;
4861 
4862       case ENCLOSE_STOP_BACKTRACK:
4863 	r = optimize_node_left(en->target, opt, env);
4864 	break;
4865       }
4866     }
4867     break;
4868 
4869   default:
4870 #ifdef ONIG_DEBUG
4871     fprintf(stderr, "optimize_node_left: undefined node type %d\n",
4872 	    NTYPE(node));
4873 #endif
4874     r = ONIGERR_TYPE_BUG;
4875     break;
4876   }
4877 
4878   return r;
4879 }
4880 
4881 static int
set_optimize_exact_info(regex_t * reg,OptExactInfo * e)4882 set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
4883 {
4884   int r;
4885 
4886   if (e->len == 0) return 0;
4887 
4888   if (e->ignore_case) {
4889     reg->exact = (UChar* )xmalloc(e->len);
4890     CHECK_NULL_RETURN_MEMERR(reg->exact);
4891     xmemcpy(reg->exact, e->s, e->len);
4892     reg->exact_end = reg->exact + e->len;
4893     reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
4894   }
4895   else {
4896     int allow_reverse;
4897 
4898     reg->exact = str_dup(e->s, e->s + e->len);
4899     CHECK_NULL_RETURN_MEMERR(reg->exact);
4900     reg->exact_end = reg->exact + e->len;
4901 
4902     allow_reverse =
4903 	ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
4904 
4905     if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
4906       r = set_bm_skip(reg->exact, reg->exact_end, reg->enc,
4907 	              reg->map, &(reg->int_map));
4908       if (r) return r;
4909 
4910       reg->optimize = (allow_reverse != 0
4911 		       ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
4912     }
4913     else {
4914       reg->optimize = ONIG_OPTIMIZE_EXACT;
4915     }
4916   }
4917 
4918   reg->dmin = e->mmd.min;
4919   reg->dmax = e->mmd.max;
4920 
4921   if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4922     reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact);
4923   }
4924 
4925   return 0;
4926 }
4927 
4928 static void
set_optimize_map_info(regex_t * reg,OptMapInfo * m)4929 set_optimize_map_info(regex_t* reg, OptMapInfo* m)
4930 {
4931   int i;
4932 
4933   for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4934     reg->map[i] = m->map[i];
4935 
4936   reg->optimize   = ONIG_OPTIMIZE_MAP;
4937   reg->dmin       = m->mmd.min;
4938   reg->dmax       = m->mmd.max;
4939 
4940   if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4941     reg->threshold_len = reg->dmin + 1;
4942   }
4943 }
4944 
4945 static void
set_sub_anchor(regex_t * reg,OptAncInfo * anc)4946 set_sub_anchor(regex_t* reg, OptAncInfo* anc)
4947 {
4948   reg->sub_anchor |= anc->left_anchor  & ANCHOR_BEGIN_LINE;
4949   reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
4950 }
4951 
4952 #ifdef ONIG_DEBUG
4953 static void print_optimize_info(FILE* f, regex_t* reg);
4954 #endif
4955 
4956 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ScanEnv * scan_env)4957 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
4958 {
4959 
4960   int r;
4961   NodeOptInfo opt;
4962   OptEnv env;
4963 
4964   env.enc            = reg->enc;
4965   env.options        = reg->options;
4966   env.case_fold_flag = reg->case_fold_flag;
4967   env.scan_env   = scan_env;
4968   clear_mml(&env.mmd);
4969 
4970   r = optimize_node_left(node, &opt, &env);
4971   if (r) return r;
4972 
4973   reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
4974         ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML);
4975 
4976   reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF);
4977 
4978   if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
4979     reg->anchor_dmin = opt.len.min;
4980     reg->anchor_dmax = opt.len.max;
4981   }
4982 
4983   if (opt.exb.len > 0 || opt.exm.len > 0) {
4984     select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
4985     if (opt.map.value > 0 &&
4986 	comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
4987       goto set_map;
4988     }
4989     else {
4990       r = set_optimize_exact_info(reg, &opt.exb);
4991       set_sub_anchor(reg, &opt.exb.anc);
4992     }
4993   }
4994   else if (opt.map.value > 0) {
4995   set_map:
4996     set_optimize_map_info(reg, &opt.map);
4997     set_sub_anchor(reg, &opt.map.anc);
4998   }
4999   else {
5000     reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
5001     if (opt.len.max == 0)
5002       reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
5003   }
5004 
5005 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5006   print_optimize_info(stderr, reg);
5007 #endif
5008   return r;
5009 }
5010 
5011 static void
clear_optimize_info(regex_t * reg)5012 clear_optimize_info(regex_t* reg)
5013 {
5014   reg->optimize      = ONIG_OPTIMIZE_NONE;
5015   reg->anchor        = 0;
5016   reg->anchor_dmin   = 0;
5017   reg->anchor_dmax   = 0;
5018   reg->sub_anchor    = 0;
5019   reg->exact_end     = (UChar* )NULL;
5020   reg->threshold_len = 0;
5021   if (IS_NOT_NULL(reg->exact)) {
5022     xfree(reg->exact);
5023     reg->exact = (UChar* )NULL;
5024   }
5025 }
5026 
5027 #ifdef ONIG_DEBUG
5028 
print_enc_string(FILE * fp,OnigEncoding enc,const UChar * s,const UChar * end)5029 static void print_enc_string(FILE* fp, OnigEncoding enc,
5030 			     const UChar *s, const UChar *end)
5031 {
5032   fprintf(fp, "\nPATTERN: /");
5033 
5034   if (ONIGENC_MBC_MINLEN(enc) > 1) {
5035     const UChar *p;
5036     OnigCodePoint code;
5037 
5038     p = s;
5039     while (p < end) {
5040       code = ONIGENC_MBC_TO_CODE(enc, p, end);
5041       if (code >= 0x80) {
5042 	fprintf(fp, " 0x%04x ", (int )code);
5043       }
5044       else {
5045 	fputc((int )code, fp);
5046       }
5047 
5048       p += enclen(enc, p);
5049     }
5050   }
5051   else {
5052     while (s < end) {
5053       fputc((int )*s, fp);
5054       s++;
5055     }
5056   }
5057 
5058   fprintf(fp, "/\n");
5059 }
5060 
5061 static void
print_distance_range(FILE * f,OnigDistance a,OnigDistance b)5062 print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
5063 {
5064   if (a == ONIG_INFINITE_DISTANCE)
5065     fputs("inf", f);
5066   else
5067     fprintf(f, "(%u)", a);
5068 
5069   fputs("-", f);
5070 
5071   if (b == ONIG_INFINITE_DISTANCE)
5072     fputs("inf", f);
5073   else
5074     fprintf(f, "(%u)", b);
5075 }
5076 
5077 static void
print_anchor(FILE * f,int anchor)5078 print_anchor(FILE* f, int anchor)
5079 {
5080   int q = 0;
5081 
5082   fprintf(f, "[");
5083 
5084   if (anchor & ANCHOR_BEGIN_BUF) {
5085     fprintf(f, "begin-buf");
5086     q = 1;
5087   }
5088   if (anchor & ANCHOR_BEGIN_LINE) {
5089     if (q) fprintf(f, ", ");
5090     q = 1;
5091     fprintf(f, "begin-line");
5092   }
5093   if (anchor & ANCHOR_BEGIN_POSITION) {
5094     if (q) fprintf(f, ", ");
5095     q = 1;
5096     fprintf(f, "begin-pos");
5097   }
5098   if (anchor & ANCHOR_END_BUF) {
5099     if (q) fprintf(f, ", ");
5100     q = 1;
5101     fprintf(f, "end-buf");
5102   }
5103   if (anchor & ANCHOR_SEMI_END_BUF) {
5104     if (q) fprintf(f, ", ");
5105     q = 1;
5106     fprintf(f, "semi-end-buf");
5107   }
5108   if (anchor & ANCHOR_END_LINE) {
5109     if (q) fprintf(f, ", ");
5110     q = 1;
5111     fprintf(f, "end-line");
5112   }
5113   if (anchor & ANCHOR_ANYCHAR_STAR) {
5114     if (q) fprintf(f, ", ");
5115     q = 1;
5116     fprintf(f, "anychar-star");
5117   }
5118   if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
5119     if (q) fprintf(f, ", ");
5120     fprintf(f, "anychar-star-pl");
5121   }
5122 
5123   fprintf(f, "]");
5124 }
5125 
5126 static void
print_optimize_info(FILE * f,regex_t * reg)5127 print_optimize_info(FILE* f, regex_t* reg)
5128 {
5129   static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5130                               "EXACT_IC", "MAP" };
5131 
5132   fprintf(f, "optimize: %s\n", on[reg->optimize]);
5133   fprintf(f, "  anchor: "); print_anchor(f, reg->anchor);
5134   if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
5135     print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
5136   fprintf(f, "\n");
5137 
5138   if (reg->optimize) {
5139     fprintf(f, "  sub anchor: "); print_anchor(f, reg->sub_anchor);
5140     fprintf(f, "\n");
5141   }
5142   fprintf(f, "\n");
5143 
5144   if (reg->exact) {
5145     UChar *p;
5146     fprintf(f, "exact: [");
5147     for (p = reg->exact; p < reg->exact_end; p++) {
5148       fputc(*p, f);
5149     }
5150     fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact));
5151   }
5152   else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
5153     int c, i, n = 0;
5154 
5155     for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5156       if (reg->map[i]) n++;
5157 
5158     fprintf(f, "map: n=%d\n", n);
5159     if (n > 0) {
5160       c = 0;
5161       fputc('[', f);
5162       for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
5163 	if (reg->map[i] != 0) {
5164           if (c > 0)  fputs(", ", f);
5165           c++;
5166           if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
5167               ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
5168             fputc(i, f);
5169           else
5170             fprintf(f, "%d", i);
5171         }
5172       }
5173       fprintf(f, "]\n");
5174     }
5175   }
5176 }
5177 #endif /* ONIG_DEBUG */
5178 
5179 
5180 extern void
onig_free_body(regex_t * reg)5181 onig_free_body(regex_t* reg)
5182 {
5183   if (IS_NOT_NULL(reg)) {
5184     if (IS_NOT_NULL(reg->p))                xfree(reg->p);
5185     if (IS_NOT_NULL(reg->exact))            xfree(reg->exact);
5186     if (IS_NOT_NULL(reg->int_map))          xfree(reg->int_map);
5187     if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
5188     if (IS_NOT_NULL(reg->repeat_range))     xfree(reg->repeat_range);
5189     if (IS_NOT_NULL(reg->chain))            onig_free(reg->chain);
5190 
5191 #ifdef USE_NAMED_GROUP
5192     onig_names_free(reg);
5193 #endif
5194   }
5195 }
5196 
5197 extern void
onig_free(regex_t * reg)5198 onig_free(regex_t* reg)
5199 {
5200   if (IS_NOT_NULL(reg)) {
5201     onig_free_body(reg);
5202     xfree(reg);
5203   }
5204 }
5205 
5206 #define REGEX_TRANSFER(to,from) do {\
5207   (to)->state = ONIG_STATE_MODIFY;\
5208   onig_free_body(to);\
5209   xmemcpy(to, from, sizeof(regex_t));\
5210   xfree(from);\
5211 } while (0)
5212 
5213 extern void
onig_transfer(regex_t * to,regex_t * from)5214 onig_transfer(regex_t* to, regex_t* from)
5215 {
5216   THREAD_ATOMIC_START;
5217   REGEX_TRANSFER(to, from);
5218   THREAD_ATOMIC_END;
5219 }
5220 
5221 #define REGEX_CHAIN_HEAD(reg) do {\
5222   while (IS_NOT_NULL((reg)->chain)) {\
5223     (reg) = (reg)->chain;\
5224   }\
5225 } while (0)
5226 
5227 extern void
onig_chain_link_add(regex_t * to,regex_t * add)5228 onig_chain_link_add(regex_t* to, regex_t* add)
5229 {
5230   THREAD_ATOMIC_START;
5231   REGEX_CHAIN_HEAD(to);
5232   to->chain = add;
5233   THREAD_ATOMIC_END;
5234 }
5235 
5236 extern void
onig_chain_reduce(regex_t * reg)5237 onig_chain_reduce(regex_t* reg)
5238 {
5239   regex_t *head, *prev;
5240 
5241   prev = reg;
5242   head = prev->chain;
5243   if (IS_NOT_NULL(head)) {
5244     reg->state = ONIG_STATE_MODIFY;
5245     while (IS_NOT_NULL(head->chain)) {
5246       prev = head;
5247       head = head->chain;
5248     }
5249     prev->chain = (regex_t* )NULL;
5250     REGEX_TRANSFER(reg, head);
5251   }
5252 }
5253 
5254 #ifdef ONIG_DEBUG
5255 static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg));
5256 #endif
5257 #ifdef ONIG_DEBUG_PARSE_TREE
5258 static void print_tree P_((FILE* f, Node* node));
5259 #endif
5260 
5261 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)5262 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5263 	      OnigErrorInfo* einfo)
5264 {
5265 #define COMPILE_INIT_SIZE  20
5266 
5267   int r, init_size;
5268   Node*  root;
5269   ScanEnv  scan_env;
5270 #ifdef USE_SUBEXP_CALL
5271   UnsetAddrList  uslist;
5272 #endif
5273 
5274   if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5275 
5276   reg->state = ONIG_STATE_COMPILING;
5277 
5278 #ifdef ONIG_DEBUG
5279   print_enc_string(stderr, reg->enc, pattern, pattern_end);
5280 #endif
5281 
5282   if (reg->alloc == 0) {
5283     init_size = (pattern_end - pattern) * 2;
5284     if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5285     r = BBUF_INIT(reg, init_size);
5286     if (r != 0) goto end;
5287   }
5288   else
5289     reg->used = 0;
5290 
5291   reg->num_mem            = 0;
5292   reg->num_repeat         = 0;
5293   reg->num_null_check     = 0;
5294   reg->repeat_range_alloc = 0;
5295   reg->repeat_range       = (OnigRepeatRange* )NULL;
5296 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5297   reg->num_comb_exp_check = 0;
5298 #endif
5299 
5300   r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5301   if (r != 0) goto err;
5302 
5303 #ifdef USE_NAMED_GROUP
5304   /* mixed use named group and no-named group */
5305   if (scan_env.num_named > 0 &&
5306       IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5307       !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5308     if (scan_env.num_named != scan_env.num_mem)
5309       r = disable_noname_group_capture(&root, reg, &scan_env);
5310     else
5311       r = numbered_ref_check(root);
5312 
5313     if (r != 0) goto err;
5314   }
5315 #endif
5316 
5317 #ifdef USE_SUBEXP_CALL
5318   if (scan_env.num_call > 0) {
5319     r = unset_addr_list_init(&uslist, scan_env.num_call);
5320     if (r != 0) goto err;
5321     scan_env.unset_addr_list = &uslist;
5322     r = setup_subexp_call(root, &scan_env);
5323     if (r != 0) goto err_unset;
5324     r = subexp_recursive_check_trav(root, &scan_env);
5325     if (r  < 0) goto err_unset;
5326     r = subexp_inf_recursive_check_trav(root, &scan_env);
5327     if (r != 0) goto err_unset;
5328 
5329     reg->num_call = scan_env.num_call;
5330   }
5331   else
5332     reg->num_call = 0;
5333 #endif
5334 
5335   r = setup_tree(root, reg, 0, &scan_env);
5336   if (r != 0) goto err_unset;
5337 
5338 #ifdef ONIG_DEBUG_PARSE_TREE
5339   print_tree(stderr, root);
5340 #endif
5341 
5342   reg->capture_history  = scan_env.capture_history;
5343   reg->bt_mem_start     = scan_env.bt_mem_start;
5344   reg->bt_mem_start    |= reg->capture_history;
5345   if (IS_FIND_CONDITION(reg->options))
5346     BIT_STATUS_ON_ALL(reg->bt_mem_end);
5347   else {
5348     reg->bt_mem_end  = scan_env.bt_mem_end;
5349     reg->bt_mem_end |= reg->capture_history;
5350   }
5351 
5352 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5353   if (scan_env.backrefed_mem == 0
5354 #ifdef USE_SUBEXP_CALL
5355       || scan_env.num_call == 0
5356 #endif
5357       ) {
5358     setup_comb_exp_check(root, 0, &scan_env);
5359 #ifdef USE_SUBEXP_CALL
5360     if (scan_env.has_recursion != 0) {
5361       scan_env.num_comb_exp_check = 0;
5362     }
5363     else
5364 #endif
5365     if (scan_env.comb_exp_max_regnum > 0) {
5366       int i;
5367       for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5368 	if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5369 	  scan_env.num_comb_exp_check = 0;
5370 	  break;
5371 	}
5372       }
5373     }
5374   }
5375 
5376   reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5377 #endif
5378 
5379   clear_optimize_info(reg);
5380 #ifndef ONIG_DONT_OPTIMIZE
5381   r = set_optimize_info_from_tree(root, reg, &scan_env);
5382   if (r != 0) goto err_unset;
5383 #endif
5384 
5385   if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5386     xfree(scan_env.mem_nodes_dynamic);
5387     scan_env.mem_nodes_dynamic = (Node** )NULL;
5388   }
5389 
5390   r = compile_tree(root, reg);
5391   if (r == 0) {
5392     r = add_opcode(reg, OP_END);
5393 #ifdef USE_SUBEXP_CALL
5394     if (scan_env.num_call > 0) {
5395       r = unset_addr_list_fix(&uslist, reg);
5396       unset_addr_list_end(&uslist);
5397       if (r) goto err;
5398     }
5399 #endif
5400 
5401     if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5402       reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5403     else {
5404       if (reg->bt_mem_start != 0)
5405 	reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5406       else
5407 	reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5408     }
5409   }
5410 #ifdef USE_SUBEXP_CALL
5411   else if (scan_env.num_call > 0) {
5412     unset_addr_list_end(&uslist);
5413   }
5414 #endif
5415   onig_node_free(root);
5416 
5417 #ifdef ONIG_DEBUG_COMPILE
5418 #ifdef USE_NAMED_GROUP
5419   onig_print_names(stderr, reg);
5420 #endif
5421   print_compiled_byte_code_list(stderr, reg);
5422 #endif
5423 
5424  end:
5425   reg->state = ONIG_STATE_NORMAL;
5426   return r;
5427 
5428  err_unset:
5429 #ifdef USE_SUBEXP_CALL
5430   if (scan_env.num_call > 0) {
5431     unset_addr_list_end(&uslist);
5432   }
5433 #endif
5434  err:
5435   if (IS_NOT_NULL(scan_env.error)) {
5436     if (IS_NOT_NULL(einfo)) {
5437       einfo->enc     = scan_env.enc;
5438       einfo->par     = scan_env.error;
5439       einfo->par_end = scan_env.error_end;
5440     }
5441   }
5442 
5443   onig_node_free(root);
5444   if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
5445       xfree(scan_env.mem_nodes_dynamic);
5446   return r;
5447 }
5448 
5449 #ifdef USE_RECOMPILE_API
5450 extern int
onig_recompile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5451 onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5452 	    OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5453 	    OnigErrorInfo* einfo)
5454 {
5455   int r;
5456   regex_t *new_reg;
5457 
5458   r = onig_new(&new_reg, pattern, pattern_end, option, enc, syntax, einfo);
5459   if (r) return r;
5460   if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
5461     onig_transfer(reg, new_reg);
5462   }
5463   else {
5464     onig_chain_link_add(reg, new_reg);
5465   }
5466   return 0;
5467 }
5468 #endif
5469 
5470 static int onig_inited = 0;
5471 
5472 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)5473 onig_reg_init(regex_t* reg, OnigOptionType option,
5474 	      OnigCaseFoldType case_fold_flag,
5475 	      OnigEncoding enc, OnigSyntaxType* syntax)
5476 {
5477   if (! onig_inited)
5478     onig_init();
5479 
5480   if (IS_NULL(reg))
5481     return ONIGERR_INVALID_ARGUMENT;
5482 
5483   if (ONIGENC_IS_UNDEF(enc))
5484     return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
5485 
5486   if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5487       == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5488     return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5489   }
5490 
5491   (reg)->state = ONIG_STATE_MODIFY;
5492 
5493   if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5494     option |= syntax->options;
5495     option &= ~ONIG_OPTION_SINGLELINE;
5496   }
5497   else
5498     option |= syntax->options;
5499 
5500   (reg)->enc              = enc;
5501   (reg)->options          = option;
5502   (reg)->syntax           = syntax;
5503   (reg)->optimize         = 0;
5504   (reg)->exact            = (UChar* )NULL;
5505   (reg)->int_map          = (int* )NULL;
5506   (reg)->int_map_backward = (int* )NULL;
5507   (reg)->chain            = (regex_t* )NULL;
5508 
5509   (reg)->p                = (UChar* )NULL;
5510   (reg)->alloc            = 0;
5511   (reg)->used             = 0;
5512   (reg)->name_table       = (void* )NULL;
5513 
5514   (reg)->case_fold_flag   = case_fold_flag;
5515   return 0;
5516 }
5517 
5518 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5519 onig_new_without_alloc(regex_t* reg, const UChar* pattern,
5520           const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
5521           OnigSyntaxType* syntax, OnigErrorInfo* einfo)
5522 {
5523   int r;
5524 
5525   r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5526   if (r) return r;
5527 
5528   r = onig_compile(reg, pattern, pattern_end, einfo);
5529   return r;
5530 }
5531 
5532 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5533 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5534 	  OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5535 	  OnigErrorInfo* einfo)
5536 {
5537   int r;
5538 
5539   *reg = (regex_t* )xmalloc(sizeof(regex_t));
5540   if (IS_NULL(*reg)) return ONIGERR_MEMORY;
5541 
5542   r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5543   if (r) goto err;
5544 
5545   r = onig_compile(*reg, pattern, pattern_end, einfo);
5546   if (r) {
5547   err:
5548     onig_free(*reg);
5549     *reg = NULL;
5550   }
5551   return r;
5552 }
5553 
5554 
5555 extern int
onig_init(void)5556 onig_init(void)
5557 {
5558   if (onig_inited != 0)
5559     return 0;
5560 
5561   THREAD_SYSTEM_INIT;
5562   THREAD_ATOMIC_START;
5563 
5564   onig_inited = 1;
5565 
5566   onigenc_init();
5567   /* onigenc_set_default_caseconv_table((UChar* )0); */
5568 
5569 #ifdef ONIG_DEBUG_STATISTICS
5570   onig_statistics_init();
5571 #endif
5572 
5573   THREAD_ATOMIC_END;
5574   return 0;
5575 }
5576 
5577 
5578 extern int
onig_end(void)5579 onig_end(void)
5580 {
5581   THREAD_ATOMIC_START;
5582 
5583 #ifdef ONIG_DEBUG_STATISTICS
5584   onig_print_statistics(stderr);
5585 #endif
5586 
5587 #ifdef USE_SHARED_CCLASS_TABLE
5588   onig_free_shared_cclass_table();
5589 #endif
5590 
5591 #ifdef USE_PARSE_TREE_NODE_RECYCLE
5592   onig_free_node_list();
5593 #endif
5594 
5595   onig_inited = 0;
5596 
5597   THREAD_ATOMIC_END;
5598   THREAD_SYSTEM_END;
5599   return 0;
5600 }
5601 
5602 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)5603 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
5604 {
5605   OnigCodePoint n, *data;
5606   OnigCodePoint low, high, x;
5607 
5608   GET_CODE_POINT(n, p);
5609   data = (OnigCodePoint* )p;
5610   data++;
5611 
5612   for (low = 0, high = n; low < high; ) {
5613     x = (low + high) >> 1;
5614     if (code > data[x * 2 + 1])
5615       low = x + 1;
5616     else
5617       high = x;
5618   }
5619 
5620   return ((low < n && code >= data[low * 2]) ? 1 : 0);
5621 }
5622 
5623 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,CClassNode * cc)5624 onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
5625 {
5626   int found;
5627 
5628   if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
5629     if (IS_NULL(cc->mbuf)) {
5630       found = 0;
5631     }
5632     else {
5633       found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
5634     }
5635   }
5636   else {
5637     found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
5638   }
5639 
5640   if (IS_NCCLASS_NOT(cc))
5641     return !found;
5642   else
5643     return found;
5644 }
5645 
5646 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)5647 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
5648 {
5649   int len;
5650 
5651   if (ONIGENC_MBC_MINLEN(enc) > 1) {
5652     len = 2;
5653   }
5654   else {
5655     len = ONIGENC_CODE_TO_MBCLEN(enc, code);
5656   }
5657   return onig_is_code_in_cc_len(len, code, cc);
5658 }
5659 
5660 
5661 #ifdef ONIG_DEBUG
5662 
5663 /* arguments type */
5664 #define ARG_SPECIAL     -1
5665 #define ARG_NON          0
5666 #define ARG_RELADDR      1
5667 #define ARG_ABSADDR      2
5668 #define ARG_LENGTH       3
5669 #define ARG_MEMNUM       4
5670 #define ARG_OPTION       5
5671 #define ARG_STATE_CHECK  6
5672 
5673 OnigOpInfoType OnigOpInfo[] = {
5674   { OP_FINISH,            "finish",          ARG_NON },
5675   { OP_END,               "end",             ARG_NON },
5676   { OP_EXACT1,            "exact1",          ARG_SPECIAL },
5677   { OP_EXACT2,            "exact2",          ARG_SPECIAL },
5678   { OP_EXACT3,            "exact3",          ARG_SPECIAL },
5679   { OP_EXACT4,            "exact4",          ARG_SPECIAL },
5680   { OP_EXACT5,            "exact5",          ARG_SPECIAL },
5681   { OP_EXACTN,            "exactn",          ARG_SPECIAL },
5682   { OP_EXACTMB2N1,        "exactmb2-n1",     ARG_SPECIAL },
5683   { OP_EXACTMB2N2,        "exactmb2-n2",     ARG_SPECIAL },
5684   { OP_EXACTMB2N3,        "exactmb2-n3",     ARG_SPECIAL },
5685   { OP_EXACTMB2N,         "exactmb2-n",      ARG_SPECIAL },
5686   { OP_EXACTMB3N,         "exactmb3n"  ,     ARG_SPECIAL },
5687   { OP_EXACTMBN,          "exactmbn",        ARG_SPECIAL },
5688   { OP_EXACT1_IC,         "exact1-ic",       ARG_SPECIAL },
5689   { OP_EXACTN_IC,         "exactn-ic",       ARG_SPECIAL },
5690   { OP_CCLASS,            "cclass",          ARG_SPECIAL },
5691   { OP_CCLASS_MB,         "cclass-mb",       ARG_SPECIAL },
5692   { OP_CCLASS_MIX,        "cclass-mix",      ARG_SPECIAL },
5693   { OP_CCLASS_NOT,        "cclass-not",      ARG_SPECIAL },
5694   { OP_CCLASS_MB_NOT,     "cclass-mb-not",   ARG_SPECIAL },
5695   { OP_CCLASS_MIX_NOT,    "cclass-mix-not",  ARG_SPECIAL },
5696   { OP_CCLASS_NODE,       "cclass-node",     ARG_SPECIAL },
5697   { OP_ANYCHAR,           "anychar",         ARG_NON },
5698   { OP_ANYCHAR_ML,        "anychar-ml",      ARG_NON },
5699   { OP_ANYCHAR_STAR,      "anychar*",        ARG_NON },
5700   { OP_ANYCHAR_ML_STAR,   "anychar-ml*",     ARG_NON },
5701   { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
5702   { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
5703   { OP_WORD,                "word",            ARG_NON },
5704   { OP_NOT_WORD,            "not-word",        ARG_NON },
5705   { OP_WORD_BOUND,          "word-bound",      ARG_NON },
5706   { OP_NOT_WORD_BOUND,      "not-word-bound",  ARG_NON },
5707   { OP_WORD_BEGIN,          "word-begin",      ARG_NON },
5708   { OP_WORD_END,            "word-end",        ARG_NON },
5709   { OP_BEGIN_BUF,           "begin-buf",       ARG_NON },
5710   { OP_END_BUF,             "end-buf",         ARG_NON },
5711   { OP_BEGIN_LINE,          "begin-line",      ARG_NON },
5712   { OP_END_LINE,            "end-line",        ARG_NON },
5713   { OP_SEMI_END_BUF,        "semi-end-buf",    ARG_NON },
5714   { OP_BEGIN_POSITION,      "begin-position",  ARG_NON },
5715   { OP_BACKREF1,            "backref1",             ARG_NON },
5716   { OP_BACKREF2,            "backref2",             ARG_NON },
5717   { OP_BACKREFN,            "backrefn",             ARG_MEMNUM  },
5718   { OP_BACKREFN_IC,         "backrefn-ic",          ARG_SPECIAL },
5719   { OP_BACKREF_MULTI,       "backref_multi",        ARG_SPECIAL },
5720   { OP_BACKREF_MULTI_IC,    "backref_multi-ic",     ARG_SPECIAL },
5721   { OP_BACKREF_WITH_LEVEL,    "backref_at_level",     ARG_SPECIAL },
5722   { OP_MEMORY_START_PUSH,   "mem-start-push",       ARG_MEMNUM  },
5723   { OP_MEMORY_START,        "mem-start",            ARG_MEMNUM  },
5724   { OP_MEMORY_END_PUSH,     "mem-end-push",         ARG_MEMNUM  },
5725   { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec",     ARG_MEMNUM  },
5726   { OP_MEMORY_END,          "mem-end",              ARG_MEMNUM  },
5727   { OP_MEMORY_END_REC,      "mem-end-rec",          ARG_MEMNUM  },
5728   { OP_SET_OPTION_PUSH,     "set-option-push",      ARG_OPTION  },
5729   { OP_SET_OPTION,          "set-option",           ARG_OPTION  },
5730   { OP_FAIL,                "fail",                 ARG_NON },
5731   { OP_JUMP,                "jump",                 ARG_RELADDR },
5732   { OP_PUSH,                "push",                 ARG_RELADDR },
5733   { OP_POP,                 "pop",                  ARG_NON },
5734   { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1",      ARG_SPECIAL },
5735   { OP_PUSH_IF_PEEK_NEXT,   "push-if-peek-next",    ARG_SPECIAL },
5736   { OP_REPEAT,              "repeat",               ARG_SPECIAL },
5737   { OP_REPEAT_NG,           "repeat-ng",            ARG_SPECIAL },
5738   { OP_REPEAT_INC,          "repeat-inc",           ARG_MEMNUM  },
5739   { OP_REPEAT_INC_NG,       "repeat-inc-ng",        ARG_MEMNUM  },
5740   { OP_REPEAT_INC_SG,       "repeat-inc-sg",        ARG_MEMNUM  },
5741   { OP_REPEAT_INC_NG_SG,    "repeat-inc-ng-sg",     ARG_MEMNUM  },
5742   { OP_NULL_CHECK_START,    "null-check-start",     ARG_MEMNUM  },
5743   { OP_NULL_CHECK_END,      "null-check-end",       ARG_MEMNUM  },
5744   { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM  },
5745   { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM  },
5746   { OP_PUSH_POS,             "push-pos",             ARG_NON },
5747   { OP_POP_POS,              "pop-pos",              ARG_NON },
5748   { OP_PUSH_POS_NOT,         "push-pos-not",         ARG_RELADDR },
5749   { OP_FAIL_POS,             "fail-pos",             ARG_NON },
5750   { OP_PUSH_STOP_BT,         "push-stop-bt",         ARG_NON },
5751   { OP_POP_STOP_BT,          "pop-stop-bt",          ARG_NON },
5752   { OP_LOOK_BEHIND,          "look-behind",          ARG_SPECIAL },
5753   { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
5754   { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
5755   { OP_CALL,                 "call",                 ARG_ABSADDR },
5756   { OP_RETURN,               "return",               ARG_NON },
5757   { OP_STATE_CHECK_PUSH,         "state-check-push",         ARG_SPECIAL },
5758   { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
5759   { OP_STATE_CHECK,              "state-check",              ARG_STATE_CHECK },
5760   { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*",     ARG_STATE_CHECK },
5761   { OP_STATE_CHECK_ANYCHAR_ML_STAR,
5762     "state-check-anychar-ml*", ARG_STATE_CHECK },
5763   { -1, "", ARG_NON }
5764 };
5765 
5766 static char*
op2name(int opcode)5767 op2name(int opcode)
5768 {
5769   int i;
5770 
5771   for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5772     if (opcode == OnigOpInfo[i].opcode)
5773       return OnigOpInfo[i].name;
5774   }
5775   return "";
5776 }
5777 
5778 static int
op2arg_type(int opcode)5779 op2arg_type(int opcode)
5780 {
5781   int i;
5782 
5783   for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5784     if (opcode == OnigOpInfo[i].opcode)
5785       return OnigOpInfo[i].arg_type;
5786   }
5787   return ARG_SPECIAL;
5788 }
5789 
5790 static void
Indent(FILE * f,int indent)5791 Indent(FILE* f, int indent)
5792 {
5793   int i;
5794   for (i = 0; i < indent; i++) putc(' ', f);
5795 }
5796 
5797 static void
p_string(FILE * f,int len,UChar * s)5798 p_string(FILE* f, int len, UChar* s)
5799 {
5800   fputs(":", f);
5801   while (len-- > 0) { fputc(*s++, f); }
5802 }
5803 
5804 static void
p_len_string(FILE * f,LengthType len,int mb_len,UChar * s)5805 p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
5806 {
5807   int x = len * mb_len;
5808 
5809   fprintf(f, ":%d:", len);
5810   while (x-- > 0) { fputc(*s++, f); }
5811 }
5812 
5813 extern void
onig_print_compiled_byte_code(FILE * f,UChar * bp,UChar ** nextp,OnigEncoding enc)5814 onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp,
5815                               OnigEncoding enc)
5816 {
5817   int i, n, arg_type;
5818   RelAddrType addr;
5819   LengthType len;
5820   MemNumType mem;
5821   StateCheckNumType scn;
5822   OnigCodePoint code;
5823   UChar *q;
5824 
5825   fprintf(f, "[%s", op2name(*bp));
5826   arg_type = op2arg_type(*bp);
5827   if (arg_type != ARG_SPECIAL) {
5828     bp++;
5829     switch (arg_type) {
5830     case ARG_NON:
5831       break;
5832     case ARG_RELADDR:
5833       GET_RELADDR_INC(addr, bp);
5834       fprintf(f, ":(%d)", addr);
5835       break;
5836     case ARG_ABSADDR:
5837       GET_ABSADDR_INC(addr, bp);
5838       fprintf(f, ":(%d)", addr);
5839       break;
5840     case ARG_LENGTH:
5841       GET_LENGTH_INC(len, bp);
5842       fprintf(f, ":%d", len);
5843       break;
5844     case ARG_MEMNUM:
5845       mem = *((MemNumType* )bp);
5846       bp += SIZE_MEMNUM;
5847       fprintf(f, ":%d", mem);
5848       break;
5849     case ARG_OPTION:
5850       {
5851 	OnigOptionType option = *((OnigOptionType* )bp);
5852 	bp += SIZE_OPTION;
5853 	fprintf(f, ":%d", option);
5854       }
5855       break;
5856 
5857     case ARG_STATE_CHECK:
5858       scn = *((StateCheckNumType* )bp);
5859       bp += SIZE_STATE_CHECK_NUM;
5860       fprintf(f, ":%d", scn);
5861       break;
5862     }
5863   }
5864   else {
5865     switch (*bp++) {
5866     case OP_EXACT1:
5867     case OP_ANYCHAR_STAR_PEEK_NEXT:
5868     case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
5869       p_string(f, 1, bp++); break;
5870     case OP_EXACT2:
5871       p_string(f, 2, bp); bp += 2; break;
5872     case OP_EXACT3:
5873       p_string(f, 3, bp); bp += 3; break;
5874     case OP_EXACT4:
5875       p_string(f, 4, bp); bp += 4; break;
5876     case OP_EXACT5:
5877       p_string(f, 5, bp); bp += 5; break;
5878     case OP_EXACTN:
5879       GET_LENGTH_INC(len, bp);
5880       p_len_string(f, len, 1, bp);
5881       bp += len;
5882       break;
5883 
5884     case OP_EXACTMB2N1:
5885       p_string(f, 2, bp); bp += 2; break;
5886     case OP_EXACTMB2N2:
5887       p_string(f, 4, bp); bp += 4; break;
5888     case OP_EXACTMB2N3:
5889       p_string(f, 6, bp); bp += 6; break;
5890     case OP_EXACTMB2N:
5891       GET_LENGTH_INC(len, bp);
5892       p_len_string(f, len, 2, bp);
5893       bp += len * 2;
5894       break;
5895     case OP_EXACTMB3N:
5896       GET_LENGTH_INC(len, bp);
5897       p_len_string(f, len, 3, bp);
5898       bp += len * 3;
5899       break;
5900     case OP_EXACTMBN:
5901       {
5902 	int mb_len;
5903 
5904 	GET_LENGTH_INC(mb_len, bp);
5905 	GET_LENGTH_INC(len, bp);
5906 	fprintf(f, ":%d:%d:", mb_len, len);
5907 	n = len * mb_len;
5908 	while (n-- > 0) { fputc(*bp++, f); }
5909       }
5910       break;
5911 
5912     case OP_EXACT1_IC:
5913       len = enclen(enc, bp);
5914       p_string(f, len, bp);
5915       bp += len;
5916       break;
5917     case OP_EXACTN_IC:
5918       GET_LENGTH_INC(len, bp);
5919       p_len_string(f, len, 1, bp);
5920       bp += len;
5921       break;
5922 
5923     case OP_CCLASS:
5924       n = bitset_on_num((BitSetRef )bp);
5925       bp += SIZE_BITSET;
5926       fprintf(f, ":%d", n);
5927       break;
5928 
5929     case OP_CCLASS_NOT:
5930       n = bitset_on_num((BitSetRef )bp);
5931       bp += SIZE_BITSET;
5932       fprintf(f, ":%d", n);
5933       break;
5934 
5935     case OP_CCLASS_MB:
5936     case OP_CCLASS_MB_NOT:
5937       GET_LENGTH_INC(len, bp);
5938       q = bp;
5939 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5940       ALIGNMENT_RIGHT(q);
5941 #endif
5942       GET_CODE_POINT(code, q);
5943       bp += len;
5944       fprintf(f, ":%d:%d", (int )code, len);
5945       break;
5946 
5947     case OP_CCLASS_MIX:
5948     case OP_CCLASS_MIX_NOT:
5949       n = bitset_on_num((BitSetRef )bp);
5950       bp += SIZE_BITSET;
5951       GET_LENGTH_INC(len, bp);
5952       q = bp;
5953 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5954       ALIGNMENT_RIGHT(q);
5955 #endif
5956       GET_CODE_POINT(code, q);
5957       bp += len;
5958       fprintf(f, ":%d:%d:%d", n, (int )code, len);
5959       break;
5960 
5961     case OP_CCLASS_NODE:
5962       {
5963         CClassNode *cc;
5964 
5965         GET_POINTER_INC(cc, bp);
5966         n = bitset_on_num(cc->bs);
5967         fprintf(f, ":%u:%d", (unsigned int )cc, n);
5968       }
5969       break;
5970 
5971     case OP_BACKREFN_IC:
5972       mem = *((MemNumType* )bp);
5973       bp += SIZE_MEMNUM;
5974       fprintf(f, ":%d", mem);
5975       break;
5976 
5977     case OP_BACKREF_MULTI_IC:
5978     case OP_BACKREF_MULTI:
5979       fputs(" ", f);
5980       GET_LENGTH_INC(len, bp);
5981       for (i = 0; i < len; i++) {
5982 	GET_MEMNUM_INC(mem, bp);
5983 	if (i > 0) fputs(", ", f);
5984 	fprintf(f, "%d", mem);
5985       }
5986       break;
5987 
5988     case OP_BACKREF_WITH_LEVEL:
5989       {
5990 	OnigOptionType option;
5991 	LengthType level;
5992 
5993 	GET_OPTION_INC(option, bp);
5994 	fprintf(f, ":%d", option);
5995 	GET_LENGTH_INC(level, bp);
5996 	fprintf(f, ":%d", level);
5997 
5998 	fputs(" ", f);
5999 	GET_LENGTH_INC(len, bp);
6000 	for (i = 0; i < len; i++) {
6001 	  GET_MEMNUM_INC(mem, bp);
6002 	  if (i > 0) fputs(", ", f);
6003 	  fprintf(f, "%d", mem);
6004 	}
6005       }
6006       break;
6007 
6008     case OP_REPEAT:
6009     case OP_REPEAT_NG:
6010       {
6011 	mem = *((MemNumType* )bp);
6012 	bp += SIZE_MEMNUM;
6013 	addr = *((RelAddrType* )bp);
6014 	bp += SIZE_RELADDR;
6015 	fprintf(f, ":%d:%d", mem, addr);
6016       }
6017       break;
6018 
6019     case OP_PUSH_OR_JUMP_EXACT1:
6020     case OP_PUSH_IF_PEEK_NEXT:
6021       addr = *((RelAddrType* )bp);
6022       bp += SIZE_RELADDR;
6023       fprintf(f, ":(%d)", addr);
6024       p_string(f, 1, bp);
6025       bp += 1;
6026       break;
6027 
6028     case OP_LOOK_BEHIND:
6029       GET_LENGTH_INC(len, bp);
6030       fprintf(f, ":%d", len);
6031       break;
6032 
6033     case OP_PUSH_LOOK_BEHIND_NOT:
6034       GET_RELADDR_INC(addr, bp);
6035       GET_LENGTH_INC(len, bp);
6036       fprintf(f, ":%d:(%d)", len, addr);
6037       break;
6038 
6039     case OP_STATE_CHECK_PUSH:
6040     case OP_STATE_CHECK_PUSH_OR_JUMP:
6041       scn = *((StateCheckNumType* )bp);
6042       bp += SIZE_STATE_CHECK_NUM;
6043       addr = *((RelAddrType* )bp);
6044       bp += SIZE_RELADDR;
6045       fprintf(f, ":%d:(%d)", scn, addr);
6046       break;
6047 
6048     default:
6049       fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
6050 	      *--bp);
6051     }
6052   }
6053   fputs("]", f);
6054   if (nextp) *nextp = bp;
6055 }
6056 
6057 static void
print_compiled_byte_code_list(FILE * f,regex_t * reg)6058 print_compiled_byte_code_list(FILE* f, regex_t* reg)
6059 {
6060   int ncode;
6061   UChar* bp = reg->p;
6062   UChar* end = reg->p + reg->used;
6063 
6064   fprintf(f, "code length: %d\n", reg->used);
6065 
6066   ncode = 0;
6067   while (bp < end) {
6068     ncode++;
6069     if (bp > reg->p) {
6070       if (ncode % 5 == 0)
6071 	fprintf(f, "\n");
6072       else
6073 	fputs(" ", f);
6074     }
6075     onig_print_compiled_byte_code(f, bp, &bp, reg->enc);
6076   }
6077 
6078   fprintf(f, "\n");
6079 }
6080 
6081 static void
print_indent_tree(FILE * f,Node * node,int indent)6082 print_indent_tree(FILE* f, Node* node, int indent)
6083 {
6084   int i, type;
6085   int add = 3;
6086   UChar* p;
6087 
6088   Indent(f, indent);
6089   if (IS_NULL(node)) {
6090     fprintf(f, "ERROR: null node!!!\n");
6091     exit (0);
6092   }
6093 
6094   type = NTYPE(node);
6095   switch (type) {
6096   case NT_LIST:
6097   case NT_ALT:
6098     if (NTYPE(node) == NT_LIST)
6099       fprintf(f, "<list:%x>\n", (int )node);
6100     else
6101       fprintf(f, "<alt:%x>\n", (int )node);
6102 
6103     print_indent_tree(f, NCAR(node), indent + add);
6104     while (IS_NOT_NULL(node = NCDR(node))) {
6105       if (NTYPE(node) != type) {
6106 	fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
6107 	exit(0);
6108       }
6109       print_indent_tree(f, NCAR(node), indent + add);
6110     }
6111     break;
6112 
6113   case NT_STR:
6114     fprintf(f, "<string%s:%x>",
6115 	    (NSTRING_IS_RAW(node) ? "-raw" : ""), (int )node);
6116     for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
6117       if (*p >= 0x20 && *p < 0x7f)
6118 	fputc(*p, f);
6119       else {
6120 	fprintf(f, " 0x%02x", *p);
6121       }
6122     }
6123     break;
6124 
6125   case NT_CCLASS:
6126     fprintf(f, "<cclass:%x>", (int )node);
6127     if (IS_NCCLASS_NOT(NCCLASS(node))) fputs(" not", f);
6128     if (NCCLASS(node)->mbuf) {
6129       BBuf* bbuf = NCCLASS(node)->mbuf;
6130       for (i = 0; i < bbuf->used; i++) {
6131 	if (i > 0) fprintf(f, ",");
6132 	fprintf(f, "%0x", bbuf->p[i]);
6133       }
6134     }
6135     break;
6136 
6137   case NT_CTYPE:
6138     fprintf(f, "<ctype:%x> ", (int )node);
6139     switch (NCTYPE(node)->ctype) {
6140     case ONIGENC_CTYPE_WORD:
6141       if (NCTYPE(node)->not != 0)
6142 	fputs("not word",       f);
6143       else
6144 	fputs("word",           f);
6145       break;
6146 
6147     default:
6148       fprintf(f, "ERROR: undefined ctype.\n");
6149       exit(0);
6150     }
6151     break;
6152 
6153   case NT_CANY:
6154     fprintf(f, "<anychar:%x>", (int )node);
6155     break;
6156 
6157   case NT_ANCHOR:
6158     fprintf(f, "<anchor:%x> ", (int )node);
6159     switch (NANCHOR(node)->type) {
6160     case ANCHOR_BEGIN_BUF:      fputs("begin buf",      f); break;
6161     case ANCHOR_END_BUF:        fputs("end buf",        f); break;
6162     case ANCHOR_BEGIN_LINE:     fputs("begin line",     f); break;
6163     case ANCHOR_END_LINE:       fputs("end line",       f); break;
6164     case ANCHOR_SEMI_END_BUF:   fputs("semi end buf",   f); break;
6165     case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
6166 
6167     case ANCHOR_WORD_BOUND:      fputs("word bound",     f); break;
6168     case ANCHOR_NOT_WORD_BOUND:  fputs("not word bound", f); break;
6169 #ifdef USE_WORD_BEGIN_END
6170     case ANCHOR_WORD_BEGIN:      fputs("word begin", f);     break;
6171     case ANCHOR_WORD_END:        fputs("word end", f);       break;
6172 #endif
6173     case ANCHOR_PREC_READ:       fputs("prec read",      f); break;
6174     case ANCHOR_PREC_READ_NOT:   fputs("prec read not",  f); break;
6175     case ANCHOR_LOOK_BEHIND:     fputs("look_behind",    f); break;
6176     case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break;
6177 
6178     default:
6179       fprintf(f, "ERROR: undefined anchor type.\n");
6180       break;
6181     }
6182     break;
6183 
6184   case NT_BREF:
6185     {
6186       int* p;
6187       BRefNode* br = NBREF(node);
6188       p = BACKREFS_P(br);
6189       fprintf(f, "<backref:%x>", (int )node);
6190       for (i = 0; i < br->back_num; i++) {
6191 	if (i > 0) fputs(", ", f);
6192 	fprintf(f, "%d", p[i]);
6193       }
6194     }
6195     break;
6196 
6197 #ifdef USE_SUBEXP_CALL
6198   case NT_CALL:
6199     {
6200       CallNode* cn = NCALL(node);
6201       fprintf(f, "<call:%x>", (int )node);
6202       p_string(f, cn->name_end - cn->name, cn->name);
6203     }
6204     break;
6205 #endif
6206 
6207   case NT_QTFR:
6208     fprintf(f, "<quantifier:%x>{%d,%d}%s\n", (int )node,
6209 	    NQTFR(node)->lower, NQTFR(node)->upper,
6210 	    (NQTFR(node)->greedy ? "" : "?"));
6211     print_indent_tree(f, NQTFR(node)->target, indent + add);
6212     break;
6213 
6214   case NT_ENCLOSE:
6215     fprintf(f, "<enclose:%x> ", (int )node);
6216     switch (NENCLOSE(node)->type) {
6217     case ENCLOSE_OPTION:
6218       fprintf(f, "option:%d", NENCLOSE(node)->option);
6219       break;
6220     case ENCLOSE_MEMORY:
6221       fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
6222       break;
6223     case ENCLOSE_STOP_BACKTRACK:
6224       fprintf(f, "stop-bt");
6225       break;
6226 
6227     default:
6228       break;
6229     }
6230     fprintf(f, "\n");
6231     print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6232     break;
6233 
6234   default:
6235     fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6236     break;
6237   }
6238 
6239   if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
6240       type != NT_ENCLOSE)
6241     fprintf(f, "\n");
6242   fflush(f);
6243 }
6244 #endif /* ONIG_DEBUG */
6245 
6246 #ifdef ONIG_DEBUG_PARSE_TREE
6247 static void
print_tree(FILE * f,Node * node)6248 print_tree(FILE* f, Node* node)
6249 {
6250   print_indent_tree(f, node, 0);
6251 }
6252 #endif
6253