xref: /PHP-7.2/ext/mbstring/oniguruma/src/regcomp.c (revision 0ae2f95b)
1 /**********************************************************************
2   regcomp.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2017  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regparse.h"
31 
32 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
33 
34 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)35 onig_get_default_case_fold_flag(void)
36 {
37   return OnigDefaultCaseFoldFlag;
38 }
39 
40 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)41 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
42 {
43   OnigDefaultCaseFoldFlag = case_fold_flag;
44   return 0;
45 }
46 
47 
48 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
49 static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
50 #endif
51 
52 static UChar*
str_dup(UChar * s,UChar * end)53 str_dup(UChar* s, UChar* end)
54 {
55   int len = end - s;
56 
57   if (len > 0) {
58     UChar* r = (UChar* )xmalloc(len + 1);
59     CHECK_NULL_RETURN(r);
60     xmemcpy(r, s, len);
61     r[len] = (UChar )0;
62     return r;
63   }
64   else return NULL;
65 }
66 
67 static void
swap_node(Node * a,Node * b)68 swap_node(Node* a, Node* b)
69 {
70   Node c;
71   c = *a; *a = *b; *b = c;
72 
73   if (NTYPE(a) == NT_STR) {
74     StrNode* sn = NSTR(a);
75     if (sn->capa == 0) {
76       int len = sn->end - sn->s;
77       sn->s   = sn->buf;
78       sn->end = sn->s + len;
79     }
80   }
81 
82   if (NTYPE(b) == NT_STR) {
83     StrNode* sn = NSTR(b);
84     if (sn->capa == 0) {
85       int len = sn->end - sn->s;
86       sn->s   = sn->buf;
87       sn->end = sn->s + len;
88     }
89   }
90 }
91 
92 static OnigLen
distance_add(OnigLen d1,OnigLen d2)93 distance_add(OnigLen d1, OnigLen d2)
94 {
95   if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
96     return ONIG_INFINITE_DISTANCE;
97   else {
98     if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
99     else return ONIG_INFINITE_DISTANCE;
100   }
101 }
102 
103 static OnigLen
distance_multiply(OnigLen d,int m)104 distance_multiply(OnigLen d, int m)
105 {
106   if (m == 0) return 0;
107 
108   if (d < ONIG_INFINITE_DISTANCE / m)
109     return d * m;
110   else
111     return ONIG_INFINITE_DISTANCE;
112 }
113 
114 static int
bitset_is_empty(BitSetRef bs)115 bitset_is_empty(BitSetRef bs)
116 {
117   int i;
118   for (i = 0; i < (int )BITSET_SIZE; i++) {
119     if (bs[i] != 0) return 0;
120   }
121   return 1;
122 }
123 
124 #ifdef ONIG_DEBUG
125 static int
bitset_on_num(BitSetRef bs)126 bitset_on_num(BitSetRef bs)
127 {
128   int i, n;
129 
130   n = 0;
131   for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
132     if (BITSET_AT(bs, i)) n++;
133   }
134   return n;
135 }
136 #endif
137 
138 extern int
onig_bbuf_init(BBuf * buf,int size)139 onig_bbuf_init(BBuf* buf, int size)
140 {
141   if (size <= 0) {
142     size   = 0;
143     buf->p = NULL;
144   }
145   else {
146     buf->p = (UChar* )xmalloc(size);
147     if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
148   }
149 
150   buf->alloc = size;
151   buf->used  = 0;
152   return 0;
153 }
154 
155 
156 #ifdef USE_SUBEXP_CALL
157 
158 static int
unset_addr_list_init(UnsetAddrList * uslist,int size)159 unset_addr_list_init(UnsetAddrList* uslist, int size)
160 {
161   UnsetAddr* p;
162 
163   p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
164   CHECK_NULL_RETURN_MEMERR(p);
165   uslist->num   = 0;
166   uslist->alloc = size;
167   uslist->us    = p;
168   return 0;
169 }
170 
171 static void
unset_addr_list_end(UnsetAddrList * uslist)172 unset_addr_list_end(UnsetAddrList* uslist)
173 {
174   if (IS_NOT_NULL(uslist->us))
175     xfree(uslist->us);
176 }
177 
178 static int
unset_addr_list_add(UnsetAddrList * uslist,int offset,struct _Node * node)179 unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
180 {
181   UnsetAddr* p;
182   int size;
183 
184   if (uslist->num >= uslist->alloc) {
185     size = uslist->alloc * 2;
186     p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
187     CHECK_NULL_RETURN_MEMERR(p);
188     uslist->alloc = size;
189     uslist->us    = p;
190   }
191 
192   uslist->us[uslist->num].offset = offset;
193   uslist->us[uslist->num].target = node;
194   uslist->num++;
195   return 0;
196 }
197 #endif /* USE_SUBEXP_CALL */
198 
199 
200 static int
add_opcode(regex_t * reg,int opcode)201 add_opcode(regex_t* reg, int opcode)
202 {
203   BBUF_ADD1(reg, opcode);
204   return 0;
205 }
206 
207 #ifdef USE_COMBINATION_EXPLOSION_CHECK
208 static int
add_state_check_num(regex_t * reg,int num)209 add_state_check_num(regex_t* reg, int num)
210 {
211   StateCheckNumType n = (StateCheckNumType )num;
212 
213   BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
214   return 0;
215 }
216 #endif
217 
218 static int
add_rel_addr(regex_t * reg,int addr)219 add_rel_addr(regex_t* reg, int addr)
220 {
221   RelAddrType ra = (RelAddrType )addr;
222 
223   BBUF_ADD(reg, &ra, SIZE_RELADDR);
224   return 0;
225 }
226 
227 static int
add_abs_addr(regex_t * reg,int addr)228 add_abs_addr(regex_t* reg, int addr)
229 {
230   AbsAddrType ra = (AbsAddrType )addr;
231 
232   BBUF_ADD(reg, &ra, SIZE_ABSADDR);
233   return 0;
234 }
235 
236 static int
add_length(regex_t * reg,int len)237 add_length(regex_t* reg, int len)
238 {
239   LengthType l = (LengthType )len;
240 
241   BBUF_ADD(reg, &l, SIZE_LENGTH);
242   return 0;
243 }
244 
245 static int
add_mem_num(regex_t * reg,int num)246 add_mem_num(regex_t* reg, int num)
247 {
248   MemNumType n = (MemNumType )num;
249 
250   BBUF_ADD(reg, &n, SIZE_MEMNUM);
251   return 0;
252 }
253 
254 static int
add_pointer(regex_t * reg,void * addr)255 add_pointer(regex_t* reg, void* addr)
256 {
257   PointerType ptr = (PointerType )addr;
258 
259   BBUF_ADD(reg, &ptr, SIZE_POINTER);
260   return 0;
261 }
262 
263 static int
add_option(regex_t * reg,OnigOptionType option)264 add_option(regex_t* reg, OnigOptionType option)
265 {
266   BBUF_ADD(reg, &option, SIZE_OPTION);
267   return 0;
268 }
269 
270 static int
add_opcode_rel_addr(regex_t * reg,int opcode,int addr)271 add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
272 {
273   int r;
274 
275   r = add_opcode(reg, opcode);
276   if (r) return r;
277   r = add_rel_addr(reg, addr);
278   return r;
279 }
280 
281 static int
add_bytes(regex_t * reg,UChar * bytes,int len)282 add_bytes(regex_t* reg, UChar* bytes, int len)
283 {
284   BBUF_ADD(reg, bytes, len);
285   return 0;
286 }
287 
288 static int
add_bitset(regex_t * reg,BitSetRef bs)289 add_bitset(regex_t* reg, BitSetRef bs)
290 {
291   BBUF_ADD(reg, bs, SIZE_BITSET);
292   return 0;
293 }
294 
295 static int
add_opcode_option(regex_t * reg,int opcode,OnigOptionType option)296 add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
297 {
298   int r;
299 
300   r = add_opcode(reg, opcode);
301   if (r) return r;
302   r = add_option(reg, option);
303   return r;
304 }
305 
306 static int compile_length_tree(Node* node, regex_t* reg);
307 static int compile_tree(Node* node, regex_t* reg);
308 
309 
310 #define IS_NEED_STR_LEN_OP_EXACT(op) \
311    ((op) == OP_EXACTN    || (op) == OP_EXACTMB2N ||\
312     (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN  || (op) == OP_EXACTN_IC)
313 
314 static int
select_str_opcode(int mb_len,int str_len,int ignore_case)315 select_str_opcode(int mb_len, int str_len, int ignore_case)
316 {
317   int op;
318 
319   if (ignore_case) {
320     switch (str_len) {
321     case 1:  op = OP_EXACT1_IC; break;
322     default: op = OP_EXACTN_IC; break;
323     }
324   }
325   else {
326     switch (mb_len) {
327     case 1:
328       switch (str_len) {
329       case 1:  op = OP_EXACT1; break;
330       case 2:  op = OP_EXACT2; break;
331       case 3:  op = OP_EXACT3; break;
332       case 4:  op = OP_EXACT4; break;
333       case 5:  op = OP_EXACT5; break;
334       default: op = OP_EXACTN; break;
335       }
336       break;
337 
338     case 2:
339       switch (str_len) {
340       case 1:  op = OP_EXACTMB2N1; break;
341       case 2:  op = OP_EXACTMB2N2; break;
342       case 3:  op = OP_EXACTMB2N3; break;
343       default: op = OP_EXACTMB2N;  break;
344       }
345       break;
346 
347     case 3:
348       op = OP_EXACTMB3N;
349       break;
350 
351     default:
352       op = OP_EXACTMBN;
353       break;
354     }
355   }
356   return op;
357 }
358 
359 static int
compile_tree_empty_check(Node * node,regex_t * reg,int empty_info)360 compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
361 {
362   int r;
363   int saved_num_null_check = reg->num_null_check;
364 
365   if (empty_info != 0) {
366     r = add_opcode(reg, OP_NULL_CHECK_START);
367     if (r) return r;
368     r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
369     if (r) return r;
370     reg->num_null_check++;
371   }
372 
373   r = compile_tree(node, reg);
374   if (r) return r;
375 
376   if (empty_info != 0) {
377     if (empty_info == NQ_TARGET_IS_EMPTY)
378       r = add_opcode(reg, OP_NULL_CHECK_END);
379     else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
380       r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
381     else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
382       r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
383 
384     if (r) return r;
385     r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
386   }
387   return r;
388 }
389 
390 #ifdef USE_SUBEXP_CALL
391 static int
compile_call(CallNode * node,regex_t * reg)392 compile_call(CallNode* node, regex_t* reg)
393 {
394   int r;
395 
396   r = add_opcode(reg, OP_CALL);
397   if (r) return r;
398   r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
399                           node->target);
400   if (r) return r;
401   r = add_abs_addr(reg, 0 /*dummy addr.*/);
402   return r;
403 }
404 #endif
405 
406 static int
compile_tree_n_times(Node * node,int n,regex_t * reg)407 compile_tree_n_times(Node* node, int n, regex_t* reg)
408 {
409   int i, r;
410 
411   for (i = 0; i < n; i++) {
412     r = compile_tree(node, reg);
413     if (r) return r;
414   }
415   return 0;
416 }
417 
418 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED,int ignore_case)419 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
420                           regex_t* reg ARG_UNUSED, int ignore_case)
421 {
422   int len;
423   int op = select_str_opcode(mb_len, str_len, ignore_case);
424 
425   len = SIZE_OPCODE;
426 
427   if (op == OP_EXACTMBN)  len += SIZE_LENGTH;
428   if (IS_NEED_STR_LEN_OP_EXACT(op))
429     len += SIZE_LENGTH;
430 
431   len += mb_len * str_len;
432   return len;
433 }
434 
435 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg,int ignore_case)436 add_compile_string(UChar* s, int mb_len, int str_len,
437                    regex_t* reg, int ignore_case)
438 {
439   int op = select_str_opcode(mb_len, str_len, ignore_case);
440   add_opcode(reg, op);
441 
442   if (op == OP_EXACTMBN)
443     add_length(reg, mb_len);
444 
445   if (IS_NEED_STR_LEN_OP_EXACT(op)) {
446     if (op == OP_EXACTN_IC)
447       add_length(reg, mb_len * str_len);
448     else
449       add_length(reg, str_len);
450   }
451 
452   add_bytes(reg, s, mb_len * str_len);
453   return 0;
454 }
455 
456 
457 static int
compile_length_string_node(Node * node,regex_t * reg)458 compile_length_string_node(Node* node, regex_t* reg)
459 {
460   int rlen, r, len, prev_len, slen, ambig;
461   OnigEncoding enc = reg->enc;
462   UChar *p, *prev;
463   StrNode* sn;
464 
465   sn = NSTR(node);
466   if (sn->end <= sn->s)
467     return 0;
468 
469   ambig = NSTRING_IS_AMBIG(node);
470 
471   p = prev = sn->s;
472   SAFE_ENC_LEN(enc, p, sn->end, prev_len);
473   p += prev_len;
474   slen = 1;
475   rlen = 0;
476 
477   for (; p < sn->end; ) {
478     SAFE_ENC_LEN(enc, p, sn->end, len);
479     if (len == prev_len) {
480       slen++;
481     }
482     else {
483       r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
484       rlen += r;
485       prev = p;
486       slen = 1;
487       prev_len = len;
488     }
489     p += len;
490   }
491   r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
492   rlen += r;
493   return rlen;
494 }
495 
496 static int
compile_length_string_raw_node(StrNode * sn,regex_t * reg)497 compile_length_string_raw_node(StrNode* sn, regex_t* reg)
498 {
499   if (sn->end <= sn->s)
500     return 0;
501 
502   return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
503 }
504 
505 static int
compile_string_node(Node * node,regex_t * reg)506 compile_string_node(Node* node, regex_t* reg)
507 {
508   int r, len, prev_len, slen, ambig;
509   OnigEncoding enc = reg->enc;
510   UChar *p, *prev, *end;
511   StrNode* sn;
512 
513   sn = NSTR(node);
514   if (sn->end <= sn->s)
515     return 0;
516 
517   end = sn->end;
518   ambig = NSTRING_IS_AMBIG(node);
519 
520   p = prev = sn->s;
521   SAFE_ENC_LEN(enc, p, end, prev_len);
522   p += prev_len;
523   slen = 1;
524 
525   for (; p < end; ) {
526     SAFE_ENC_LEN(enc, p, end, len);
527     if (len == prev_len) {
528       slen++;
529     }
530     else {
531       r = add_compile_string(prev, prev_len, slen, reg, ambig);
532       if (r) return r;
533 
534       prev  = p;
535       slen  = 1;
536       prev_len = len;
537     }
538 
539     p += len;
540   }
541   return add_compile_string(prev, prev_len, slen, reg, ambig);
542 }
543 
544 static int
compile_string_raw_node(StrNode * sn,regex_t * reg)545 compile_string_raw_node(StrNode* sn, regex_t* reg)
546 {
547   if (sn->end <= sn->s)
548     return 0;
549 
550   return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
551 }
552 
553 static int
add_multi_byte_cclass(BBuf * mbuf,regex_t * reg)554 add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
555 {
556 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
557   add_length(reg, mbuf->used);
558   return add_bytes(reg, mbuf->p, mbuf->used);
559 #else
560   int r, pad_size;
561   UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
562 
563   GET_ALIGNMENT_PAD_SIZE(p, pad_size);
564   add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
565   if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
566 
567   r = add_bytes(reg, mbuf->p, mbuf->used);
568 
569   /* padding for return value from compile_length_cclass_node() to be fix. */
570   pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
571   if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
572   return r;
573 #endif
574 }
575 
576 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)577 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
578 {
579   int len;
580 
581   if (IS_NCCLASS_SHARE(cc)) {
582     len = SIZE_OPCODE + SIZE_POINTER;
583     return len;
584   }
585 
586   if (IS_NULL(cc->mbuf)) {
587     len = SIZE_OPCODE + SIZE_BITSET;
588   }
589   else {
590     if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
591       len = SIZE_OPCODE;
592     }
593     else {
594       len = SIZE_OPCODE + SIZE_BITSET;
595     }
596 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
597     len += SIZE_LENGTH + cc->mbuf->used;
598 #else
599     len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
600 #endif
601   }
602 
603   return len;
604 }
605 
606 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)607 compile_cclass_node(CClassNode* cc, regex_t* reg)
608 {
609   int r;
610 
611   if (IS_NCCLASS_SHARE(cc)) {
612     add_opcode(reg, OP_CCLASS_NODE);
613     r = add_pointer(reg, cc);
614     return r;
615   }
616 
617   if (IS_NULL(cc->mbuf)) {
618     if (IS_NCCLASS_NOT(cc))
619       add_opcode(reg, OP_CCLASS_NOT);
620     else
621       add_opcode(reg, OP_CCLASS);
622 
623     r = add_bitset(reg, cc->bs);
624   }
625   else {
626     if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
627       if (IS_NCCLASS_NOT(cc))
628         add_opcode(reg, OP_CCLASS_MB_NOT);
629       else
630         add_opcode(reg, OP_CCLASS_MB);
631 
632       r = add_multi_byte_cclass(cc->mbuf, reg);
633     }
634     else {
635       if (IS_NCCLASS_NOT(cc))
636         add_opcode(reg, OP_CCLASS_MIX_NOT);
637       else
638         add_opcode(reg, OP_CCLASS_MIX);
639 
640       r = add_bitset(reg, cc->bs);
641       if (r) return r;
642       r = add_multi_byte_cclass(cc->mbuf, reg);
643     }
644   }
645 
646   return r;
647 }
648 
649 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper)650 entry_repeat_range(regex_t* reg, int id, int lower, int upper)
651 {
652 #define REPEAT_RANGE_ALLOC  4
653 
654   OnigRepeatRange* p;
655 
656   if (reg->repeat_range_alloc == 0) {
657     p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
658     CHECK_NULL_RETURN_MEMERR(p);
659     reg->repeat_range = p;
660     reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
661   }
662   else if (reg->repeat_range_alloc <= id) {
663     int n;
664     n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
665     p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
666                                     sizeof(OnigRepeatRange) * n);
667     CHECK_NULL_RETURN_MEMERR(p);
668     reg->repeat_range = p;
669     reg->repeat_range_alloc = n;
670   }
671   else {
672     p = reg->repeat_range;
673   }
674 
675   p[id].lower = lower;
676   p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
677   return 0;
678 }
679 
680 static int
compile_range_repeat_node(QtfrNode * qn,int target_len,int empty_info,regex_t * reg)681 compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
682                           regex_t* reg)
683 {
684   int r;
685   int num_repeat = reg->num_repeat;
686 
687   r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
688   if (r) return r;
689   r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
690   reg->num_repeat++;
691   if (r) return r;
692   r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
693   if (r) return r;
694 
695   r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
696   if (r) return r;
697 
698   r = compile_tree_empty_check(qn->target, reg, empty_info);
699   if (r) return r;
700 
701   if (
702 #ifdef USE_SUBEXP_CALL
703       reg->num_call > 0 ||
704 #endif
705       IS_QUANTIFIER_IN_REPEAT(qn)) {
706     r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
707   }
708   else {
709     r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
710   }
711   if (r) return r;
712   r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
713   return r;
714 }
715 
716 static int
is_anychar_star_quantifier(QtfrNode * qn)717 is_anychar_star_quantifier(QtfrNode* qn)
718 {
719   if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
720       NTYPE(qn->target) == NT_CANY)
721     return 1;
722   else
723     return 0;
724 }
725 
726 #define QUANTIFIER_EXPAND_LIMIT_SIZE   50
727 #define CKN_ON   (ckn > 0)
728 
729 #ifdef USE_COMBINATION_EXPLOSION_CHECK
730 
731 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)732 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
733 {
734   int len, mod_tlen, cklen;
735   int ckn;
736   int infinite = IS_REPEAT_INFINITE(qn->upper);
737   int empty_info = qn->target_empty_info;
738   int tlen = compile_length_tree(qn->target, reg);
739 
740   if (tlen < 0) return tlen;
741 
742   ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
743 
744   cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
745 
746   /* anychar repeat */
747   if (NTYPE(qn->target) == NT_CANY) {
748     if (qn->greedy && infinite) {
749       if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
750         return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
751       else
752         return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
753     }
754   }
755 
756   if (empty_info != 0)
757     mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
758   else
759     mod_tlen = tlen;
760 
761   if (infinite && qn->lower <= 1) {
762     if (qn->greedy) {
763       if (qn->lower == 1)
764         len = SIZE_OP_JUMP;
765       else
766         len = 0;
767 
768       len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
769     }
770     else {
771       if (qn->lower == 0)
772         len = SIZE_OP_JUMP;
773       else
774         len = 0;
775 
776       len += mod_tlen + SIZE_OP_PUSH + cklen;
777     }
778   }
779   else if (qn->upper == 0) {
780     if (qn->is_refered != 0) /* /(?<n>..){0}/ */
781       len = SIZE_OP_JUMP + tlen;
782     else
783       len = 0;
784   }
785   else if (qn->upper == 1 && qn->greedy) {
786     if (qn->lower == 0) {
787       if (CKN_ON) {
788         len = SIZE_OP_STATE_CHECK_PUSH + tlen;
789       }
790       else {
791         len = SIZE_OP_PUSH + tlen;
792       }
793     }
794     else {
795       len = tlen;
796     }
797   }
798   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
799     len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
800   }
801   else {
802     len = SIZE_OP_REPEAT_INC
803         + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
804     if (CKN_ON)
805       len += SIZE_OP_STATE_CHECK;
806   }
807 
808   return len;
809 }
810 
811 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)812 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
813 {
814   int r, mod_tlen;
815   int ckn;
816   int infinite = IS_REPEAT_INFINITE(qn->upper);
817   int empty_info = qn->target_empty_info;
818   int tlen = compile_length_tree(qn->target, reg);
819 
820   if (tlen < 0) return tlen;
821 
822   ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
823 
824   if (is_anychar_star_quantifier(qn)) {
825     r = compile_tree_n_times(qn->target, qn->lower, reg);
826     if (r) return r;
827     if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
828       if (IS_MULTILINE(reg->options))
829         r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
830       else
831         r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
832       if (r) return r;
833       if (CKN_ON) {
834         r = add_state_check_num(reg, ckn);
835         if (r) return r;
836       }
837 
838       return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
839     }
840     else {
841       if (IS_MULTILINE(reg->options)) {
842         r = add_opcode(reg, (CKN_ON ?
843                              OP_STATE_CHECK_ANYCHAR_ML_STAR
844                              : OP_ANYCHAR_ML_STAR));
845       }
846       else {
847         r = add_opcode(reg, (CKN_ON ?
848                              OP_STATE_CHECK_ANYCHAR_STAR
849                              : OP_ANYCHAR_STAR));
850       }
851       if (r) return r;
852       if (CKN_ON)
853         r = add_state_check_num(reg, ckn);
854 
855       return r;
856     }
857   }
858 
859   if (empty_info != 0)
860     mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
861   else
862     mod_tlen = tlen;
863 
864   if (infinite && qn->lower <= 1) {
865     if (qn->greedy) {
866       if (qn->lower == 1) {
867         r = add_opcode_rel_addr(reg, OP_JUMP,
868                        (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
869         if (r) return r;
870       }
871 
872       if (CKN_ON) {
873         r = add_opcode(reg, OP_STATE_CHECK_PUSH);
874         if (r) return r;
875         r = add_state_check_num(reg, ckn);
876         if (r) return r;
877         r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
878       }
879       else {
880         r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
881       }
882       if (r) return r;
883       r = compile_tree_empty_check(qn->target, reg, empty_info);
884       if (r) return r;
885       r = add_opcode_rel_addr(reg, OP_JUMP,
886                 -(mod_tlen + (int )SIZE_OP_JUMP
887                 + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
888     }
889     else {
890       if (qn->lower == 0) {
891         r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
892         if (r) return r;
893       }
894       r = compile_tree_empty_check(qn->target, reg, empty_info);
895       if (r) return r;
896       if (CKN_ON) {
897         r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
898         if (r) return r;
899         r = add_state_check_num(reg, ckn);
900         if (r) return r;
901         r = add_rel_addr(reg,
902                          -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
903       }
904       else
905         r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
906     }
907   }
908   else if (qn->upper == 0) {
909     if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
910       r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
911       if (r) return r;
912       r = compile_tree(qn->target, reg);
913     }
914     else
915       r = 0;
916   }
917   else if (qn->upper == 1 && qn->greedy) {
918     if (qn->lower == 0) {
919       if (CKN_ON) {
920         r = add_opcode(reg, OP_STATE_CHECK_PUSH);
921         if (r) return r;
922         r = add_state_check_num(reg, ckn);
923         if (r) return r;
924         r = add_rel_addr(reg, tlen);
925       }
926       else {
927         r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
928       }
929       if (r) return r;
930     }
931 
932     r = compile_tree(qn->target, reg);
933   }
934   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
935     if (CKN_ON) {
936       r = add_opcode(reg, OP_STATE_CHECK_PUSH);
937       if (r) return r;
938       r = add_state_check_num(reg, ckn);
939       if (r) return r;
940       r = add_rel_addr(reg, SIZE_OP_JUMP);
941     }
942     else {
943       r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
944     }
945 
946     if (r) return r;
947     r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
948     if (r) return r;
949     r = compile_tree(qn->target, reg);
950   }
951   else {
952     r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
953     if (CKN_ON) {
954       if (r) return r;
955       r = add_opcode(reg, OP_STATE_CHECK);
956       if (r) return r;
957       r = add_state_check_num(reg, ckn);
958     }
959   }
960   return r;
961 }
962 
963 #else /* USE_COMBINATION_EXPLOSION_CHECK */
964 
965 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)966 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
967 {
968   int len, mod_tlen;
969   int infinite = IS_REPEAT_INFINITE(qn->upper);
970   int empty_info = qn->target_empty_info;
971   int tlen = compile_length_tree(qn->target, reg);
972 
973   if (tlen < 0) return tlen;
974 
975   /* anychar repeat */
976   if (NTYPE(qn->target) == NT_CANY) {
977     if (qn->greedy && infinite) {
978       if (IS_NOT_NULL(qn->next_head_exact))
979         return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
980       else
981         return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
982     }
983   }
984 
985   if (empty_info != 0)
986     mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
987   else
988     mod_tlen = tlen;
989 
990   if (infinite &&
991       (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
992     if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
993       len = SIZE_OP_JUMP;
994     }
995     else {
996       len = tlen * qn->lower;
997     }
998 
999     if (qn->greedy) {
1000       if (IS_NOT_NULL(qn->head_exact))
1001         len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1002       else if (IS_NOT_NULL(qn->next_head_exact))
1003         len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1004       else
1005         len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1006     }
1007     else
1008       len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1009   }
1010   else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1011     len = SIZE_OP_JUMP + tlen;
1012   }
1013   else if (!infinite && qn->greedy &&
1014            (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1015                                       <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1016     len = tlen * qn->lower;
1017     len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1018   }
1019   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1020     len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1021   }
1022   else {
1023     len = SIZE_OP_REPEAT_INC
1024         + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1025   }
1026 
1027   return len;
1028 }
1029 
1030 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)1031 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
1032 {
1033   int i, r, mod_tlen;
1034   int infinite = IS_REPEAT_INFINITE(qn->upper);
1035   int empty_info = qn->target_empty_info;
1036   int tlen = compile_length_tree(qn->target, reg);
1037 
1038   if (tlen < 0) return tlen;
1039 
1040   if (is_anychar_star_quantifier(qn)) {
1041     r = compile_tree_n_times(qn->target, qn->lower, reg);
1042     if (r) return r;
1043     if (IS_NOT_NULL(qn->next_head_exact)) {
1044       if (IS_MULTILINE(reg->options))
1045         r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1046       else
1047         r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1048       if (r) return r;
1049       return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1050     }
1051     else {
1052       if (IS_MULTILINE(reg->options))
1053         return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1054       else
1055         return add_opcode(reg, OP_ANYCHAR_STAR);
1056     }
1057   }
1058 
1059   if (empty_info != 0)
1060     mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1061   else
1062     mod_tlen = tlen;
1063 
1064   if (infinite &&
1065       (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1066     if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1067       if (qn->greedy) {
1068         if (IS_NOT_NULL(qn->head_exact))
1069           r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1070         else if (IS_NOT_NULL(qn->next_head_exact))
1071           r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1072         else
1073           r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1074       }
1075       else {
1076         r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1077       }
1078       if (r) return r;
1079     }
1080     else {
1081       r = compile_tree_n_times(qn->target, qn->lower, reg);
1082       if (r) return r;
1083     }
1084 
1085     if (qn->greedy) {
1086       if (IS_NOT_NULL(qn->head_exact)) {
1087         r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1088                                 mod_tlen + SIZE_OP_JUMP);
1089         if (r) return r;
1090         add_bytes(reg, NSTR(qn->head_exact)->s, 1);
1091         r = compile_tree_empty_check(qn->target, reg, empty_info);
1092         if (r) return r;
1093         r = add_opcode_rel_addr(reg, OP_JUMP,
1094          -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1095       }
1096       else if (IS_NOT_NULL(qn->next_head_exact)) {
1097         r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1098                                 mod_tlen + SIZE_OP_JUMP);
1099         if (r) return r;
1100         add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1101         r = compile_tree_empty_check(qn->target, reg, empty_info);
1102         if (r) return r;
1103         r = add_opcode_rel_addr(reg, OP_JUMP,
1104           -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1105       }
1106       else {
1107         r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1108         if (r) return r;
1109         r = compile_tree_empty_check(qn->target, reg, empty_info);
1110         if (r) return r;
1111         r = add_opcode_rel_addr(reg, OP_JUMP,
1112                    -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1113       }
1114     }
1115     else {
1116       r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1117       if (r) return r;
1118       r = compile_tree_empty_check(qn->target, reg, empty_info);
1119       if (r) return r;
1120       r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1121     }
1122   }
1123   else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1124     r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1125     if (r) return r;
1126     r = compile_tree(qn->target, reg);
1127   }
1128   else if (!infinite && qn->greedy &&
1129            (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1130                                   <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1131     int n = qn->upper - qn->lower;
1132 
1133     r = compile_tree_n_times(qn->target, qn->lower, reg);
1134     if (r) return r;
1135 
1136     for (i = 0; i < n; i++) {
1137       r = add_opcode_rel_addr(reg, OP_PUSH,
1138 			   (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1139       if (r) return r;
1140       r = compile_tree(qn->target, reg);
1141       if (r) return r;
1142     }
1143   }
1144   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1145     r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1146     if (r) return r;
1147     r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1148     if (r) return r;
1149     r = compile_tree(qn->target, reg);
1150   }
1151   else {
1152     r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1153   }
1154   return r;
1155 }
1156 #endif /* USE_COMBINATION_EXPLOSION_CHECK */
1157 
1158 static int
compile_length_option_node(EncloseNode * node,regex_t * reg)1159 compile_length_option_node(EncloseNode* node, regex_t* reg)
1160 {
1161   int tlen;
1162   OnigOptionType prev = reg->options;
1163 
1164   reg->options = node->option;
1165   tlen = compile_length_tree(node->target, reg);
1166   reg->options = prev;
1167 
1168   if (tlen < 0) return tlen;
1169 
1170   if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1171     return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1172            + tlen + SIZE_OP_SET_OPTION;
1173   }
1174   else
1175     return tlen;
1176 }
1177 
1178 static int
compile_option_node(EncloseNode * node,regex_t * reg)1179 compile_option_node(EncloseNode* node, regex_t* reg)
1180 {
1181   int r;
1182   OnigOptionType prev = reg->options;
1183 
1184   if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1185     r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1186     if (r) return r;
1187     r = add_opcode_option(reg, OP_SET_OPTION, prev);
1188     if (r) return r;
1189     r = add_opcode(reg, OP_FAIL);
1190     if (r) return r;
1191   }
1192 
1193   reg->options = node->option;
1194   r = compile_tree(node->target, reg);
1195   reg->options = prev;
1196 
1197   if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1198     if (r) return r;
1199     r = add_opcode_option(reg, OP_SET_OPTION, prev);
1200   }
1201   return r;
1202 }
1203 
1204 static int
compile_length_enclose_node(EncloseNode * node,regex_t * reg)1205 compile_length_enclose_node(EncloseNode* node, regex_t* reg)
1206 {
1207   int len;
1208   int tlen;
1209 
1210   if (node->type == ENCLOSE_OPTION)
1211     return compile_length_option_node(node, reg);
1212 
1213   if (node->target) {
1214     tlen = compile_length_tree(node->target, reg);
1215     if (tlen < 0) return tlen;
1216   }
1217   else
1218     tlen = 0;
1219 
1220   switch (node->type) {
1221   case ENCLOSE_MEMORY:
1222 #ifdef USE_SUBEXP_CALL
1223     if (IS_ENCLOSE_CALLED(node)) {
1224       len = SIZE_OP_MEMORY_START_PUSH + tlen
1225         + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1226       if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1227         len += (IS_ENCLOSE_RECURSION(node)
1228                 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1229       else
1230         len += (IS_ENCLOSE_RECURSION(node)
1231                 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1232     }
1233     else if (IS_ENCLOSE_RECURSION(node)) {
1234       len = SIZE_OP_MEMORY_START_PUSH;
1235       len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1236                      ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC);
1237     }
1238     else
1239 #endif
1240     {
1241       if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1242         len = SIZE_OP_MEMORY_START_PUSH;
1243       else
1244         len = SIZE_OP_MEMORY_START;
1245 
1246       len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1247 		     ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1248     }
1249     break;
1250 
1251   case ENCLOSE_STOP_BACKTRACK:
1252     if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1253       QtfrNode* qn = NQTFR(node->target);
1254       tlen = compile_length_tree(qn->target, reg);
1255       if (tlen < 0) return tlen;
1256 
1257       len = tlen * qn->lower
1258         + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1259     }
1260     else {
1261       len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1262     }
1263     break;
1264 
1265   default:
1266     return ONIGERR_TYPE_BUG;
1267     break;
1268   }
1269 
1270   return len;
1271 }
1272 
1273 static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1274 
1275 static int
compile_enclose_node(EncloseNode * node,regex_t * reg)1276 compile_enclose_node(EncloseNode* node, regex_t* reg)
1277 {
1278   int r, len;
1279 
1280   if (node->type == ENCLOSE_OPTION)
1281     return compile_option_node(node, reg);
1282 
1283   switch (node->type) {
1284   case ENCLOSE_MEMORY:
1285 #ifdef USE_SUBEXP_CALL
1286     if (IS_ENCLOSE_CALLED(node)) {
1287       r = add_opcode(reg, OP_CALL);
1288       if (r) return r;
1289       node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1290       node->state |= NST_ADDR_FIXED;
1291       r = add_abs_addr(reg, (int )node->call_addr);
1292       if (r) return r;
1293       len = compile_length_tree(node->target, reg);
1294       len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1295       if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1296         len += (IS_ENCLOSE_RECURSION(node)
1297                 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1298       else
1299         len += (IS_ENCLOSE_RECURSION(node)
1300                 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1301 
1302       r = add_opcode_rel_addr(reg, OP_JUMP, len);
1303       if (r) return r;
1304     }
1305 #endif
1306     if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1307       r = add_opcode(reg, OP_MEMORY_START_PUSH);
1308     else
1309       r = add_opcode(reg, OP_MEMORY_START);
1310     if (r) return r;
1311     r = add_mem_num(reg, node->regnum);
1312     if (r) return r;
1313     r = compile_tree(node->target, reg);
1314     if (r) return r;
1315 #ifdef USE_SUBEXP_CALL
1316     if (IS_ENCLOSE_CALLED(node)) {
1317       if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1318         r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1319                              ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1320       else
1321         r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1322                              ? OP_MEMORY_END_REC : OP_MEMORY_END));
1323 
1324       if (r) return r;
1325       r = add_mem_num(reg, node->regnum);
1326       if (r) return r;
1327       r = add_opcode(reg, OP_RETURN);
1328     }
1329     else if (IS_ENCLOSE_RECURSION(node)) {
1330       if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1331         r = add_opcode(reg, OP_MEMORY_END_PUSH_REC);
1332       else
1333         r = add_opcode(reg, OP_MEMORY_END_REC);
1334       if (r) return r;
1335       r = add_mem_num(reg, node->regnum);
1336     }
1337     else
1338 #endif
1339     {
1340       if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1341         r = add_opcode(reg, OP_MEMORY_END_PUSH);
1342       else
1343         r = add_opcode(reg, OP_MEMORY_END);
1344       if (r) return r;
1345       r = add_mem_num(reg, node->regnum);
1346     }
1347     break;
1348 
1349   case ENCLOSE_STOP_BACKTRACK:
1350     if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1351       QtfrNode* qn = NQTFR(node->target);
1352       r = compile_tree_n_times(qn->target, qn->lower, reg);
1353       if (r) return r;
1354 
1355       len = compile_length_tree(qn->target, reg);
1356       if (len < 0) return len;
1357 
1358       r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1359       if (r) return r;
1360       r = compile_tree(qn->target, reg);
1361       if (r) return r;
1362       r = add_opcode(reg, OP_POP);
1363       if (r) return r;
1364       r = add_opcode_rel_addr(reg, OP_JUMP,
1365              -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1366     }
1367     else {
1368       r = add_opcode(reg, OP_PUSH_STOP_BT);
1369       if (r) return r;
1370       r = compile_tree(node->target, reg);
1371       if (r) return r;
1372       r = add_opcode(reg, OP_POP_STOP_BT);
1373     }
1374     break;
1375 
1376   default:
1377     return ONIGERR_TYPE_BUG;
1378     break;
1379   }
1380 
1381   return r;
1382 }
1383 
1384 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1385 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1386 {
1387   int len;
1388   int tlen = 0;
1389 
1390   if (node->target) {
1391     tlen = compile_length_tree(node->target, reg);
1392     if (tlen < 0) return tlen;
1393   }
1394 
1395   switch (node->type) {
1396   case ANCHOR_PREC_READ:
1397     len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1398     break;
1399   case ANCHOR_PREC_READ_NOT:
1400     len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1401     break;
1402   case ANCHOR_LOOK_BEHIND:
1403     len = SIZE_OP_LOOK_BEHIND + tlen;
1404     break;
1405   case ANCHOR_LOOK_BEHIND_NOT:
1406     len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1407     break;
1408 
1409   default:
1410     len = SIZE_OPCODE;
1411     break;
1412   }
1413 
1414   return len;
1415 }
1416 
1417 static int
compile_anchor_node(AnchorNode * node,regex_t * reg)1418 compile_anchor_node(AnchorNode* node, regex_t* reg)
1419 {
1420   int r, len;
1421 
1422   switch (node->type) {
1423   case ANCHOR_BEGIN_BUF:      r = add_opcode(reg, OP_BEGIN_BUF);      break;
1424   case ANCHOR_END_BUF:        r = add_opcode(reg, OP_END_BUF);        break;
1425   case ANCHOR_BEGIN_LINE:     r = add_opcode(reg, OP_BEGIN_LINE);     break;
1426   case ANCHOR_END_LINE:       r = add_opcode(reg, OP_END_LINE);       break;
1427   case ANCHOR_SEMI_END_BUF:   r = add_opcode(reg, OP_SEMI_END_BUF);   break;
1428   case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1429 
1430   case ANCHOR_WORD_BOUND:     r = add_opcode(reg, OP_WORD_BOUND);     break;
1431   case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break;
1432 #ifdef USE_WORD_BEGIN_END
1433   case ANCHOR_WORD_BEGIN:     r = add_opcode(reg, OP_WORD_BEGIN);     break;
1434   case ANCHOR_WORD_END:       r = add_opcode(reg, OP_WORD_END);       break;
1435 #endif
1436 
1437   case ANCHOR_PREC_READ:
1438     r = add_opcode(reg, OP_PUSH_POS);
1439     if (r) return r;
1440     r = compile_tree(node->target, reg);
1441     if (r) return r;
1442     r = add_opcode(reg, OP_POP_POS);
1443     break;
1444 
1445   case ANCHOR_PREC_READ_NOT:
1446     len = compile_length_tree(node->target, reg);
1447     if (len < 0) return len;
1448     r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1449     if (r) return r;
1450     r = compile_tree(node->target, reg);
1451     if (r) return r;
1452     r = add_opcode(reg, OP_FAIL_POS);
1453     break;
1454 
1455   case ANCHOR_LOOK_BEHIND:
1456     {
1457       int n;
1458       r = add_opcode(reg, OP_LOOK_BEHIND);
1459       if (r) return r;
1460       if (node->char_len < 0) {
1461         r = get_char_length_tree(node->target, reg, &n);
1462         if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1463       }
1464       else
1465         n = node->char_len;
1466 
1467       r = add_length(reg, n);
1468       if (r) return r;
1469       r = compile_tree(node->target, reg);
1470     }
1471     break;
1472 
1473   case ANCHOR_LOOK_BEHIND_NOT:
1474     {
1475       int n;
1476       len = compile_length_tree(node->target, reg);
1477       r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1478 			   len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1479       if (r) return r;
1480       if (node->char_len < 0) {
1481         r = get_char_length_tree(node->target, reg, &n);
1482         if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1483       }
1484       else
1485         n = node->char_len;
1486       r = add_length(reg, n);
1487       if (r) return r;
1488       r = compile_tree(node->target, reg);
1489       if (r) return r;
1490       r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1491     }
1492     break;
1493 
1494   default:
1495     return ONIGERR_TYPE_BUG;
1496     break;
1497   }
1498 
1499   return r;
1500 }
1501 
1502 static int
compile_length_tree(Node * node,regex_t * reg)1503 compile_length_tree(Node* node, regex_t* reg)
1504 {
1505   int len, type, r;
1506 
1507   type = NTYPE(node);
1508   switch (type) {
1509   case NT_LIST:
1510     len = 0;
1511     do {
1512       r = compile_length_tree(NCAR(node), reg);
1513       if (r < 0) return r;
1514       len += r;
1515     } while (IS_NOT_NULL(node = NCDR(node)));
1516     r = len;
1517     break;
1518 
1519   case NT_ALT:
1520     {
1521       int n;
1522 
1523       n = r = 0;
1524       do {
1525         r += compile_length_tree(NCAR(node), reg);
1526         n++;
1527       } while (IS_NOT_NULL(node = NCDR(node)));
1528       r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1529     }
1530     break;
1531 
1532   case NT_STR:
1533     if (NSTRING_IS_RAW(node))
1534       r = compile_length_string_raw_node(NSTR(node), reg);
1535     else
1536       r = compile_length_string_node(node, reg);
1537     break;
1538 
1539   case NT_CCLASS:
1540     r = compile_length_cclass_node(NCCLASS(node), reg);
1541     break;
1542 
1543   case NT_CTYPE:
1544   case NT_CANY:
1545     r = SIZE_OPCODE;
1546     break;
1547 
1548   case NT_BREF:
1549     {
1550       BRefNode* br = NBREF(node);
1551 
1552 #ifdef USE_BACKREF_WITH_LEVEL
1553       if (IS_BACKREF_NEST_LEVEL(br)) {
1554         r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1555             SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1556       }
1557       else
1558 #endif
1559       if (br->back_num == 1) {
1560         r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1561              ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1562       }
1563       else {
1564         r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1565       }
1566     }
1567     break;
1568 
1569 #ifdef USE_SUBEXP_CALL
1570   case NT_CALL:
1571     r = SIZE_OP_CALL;
1572     break;
1573 #endif
1574 
1575   case NT_QTFR:
1576     r = compile_length_quantifier_node(NQTFR(node), reg);
1577     break;
1578 
1579   case NT_ENCLOSE:
1580     r = compile_length_enclose_node(NENCLOSE(node), reg);
1581     break;
1582 
1583   case NT_ANCHOR:
1584     r = compile_length_anchor_node(NANCHOR(node), reg);
1585     break;
1586 
1587   default:
1588     return ONIGERR_TYPE_BUG;
1589     break;
1590   }
1591 
1592   return r;
1593 }
1594 
1595 static int
compile_tree(Node * node,regex_t * reg)1596 compile_tree(Node* node, regex_t* reg)
1597 {
1598   int n, type, len, pos, r = 0;
1599 
1600   type = NTYPE(node);
1601   switch (type) {
1602   case NT_LIST:
1603     do {
1604       r = compile_tree(NCAR(node), reg);
1605     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1606     break;
1607 
1608   case NT_ALT:
1609     {
1610       Node* x = node;
1611       len = 0;
1612       do {
1613         len += compile_length_tree(NCAR(x), reg);
1614         if (NCDR(x) != NULL) {
1615           len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1616         }
1617       } while (IS_NOT_NULL(x = NCDR(x)));
1618       pos = reg->used + len;  /* goal position */
1619 
1620       do {
1621         len = compile_length_tree(NCAR(node), reg);
1622         if (IS_NOT_NULL(NCDR(node))) {
1623           r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1624           if (r) break;
1625         }
1626         r = compile_tree(NCAR(node), reg);
1627         if (r) break;
1628         if (IS_NOT_NULL(NCDR(node))) {
1629           len = pos - (reg->used + SIZE_OP_JUMP);
1630           r = add_opcode_rel_addr(reg, OP_JUMP, len);
1631           if (r) break;
1632         }
1633       } while (IS_NOT_NULL(node = NCDR(node)));
1634     }
1635     break;
1636 
1637   case NT_STR:
1638     if (NSTRING_IS_RAW(node))
1639       r = compile_string_raw_node(NSTR(node), reg);
1640     else
1641       r = compile_string_node(node, reg);
1642     break;
1643 
1644   case NT_CCLASS:
1645     r = compile_cclass_node(NCCLASS(node), reg);
1646     break;
1647 
1648   case NT_CTYPE:
1649     {
1650       int op;
1651 
1652       switch (NCTYPE(node)->ctype) {
1653       case ONIGENC_CTYPE_WORD:
1654         if (NCTYPE(node)->not != 0)  op = OP_NOT_WORD;
1655         else                         op = OP_WORD;
1656         break;
1657       default:
1658         return ONIGERR_TYPE_BUG;
1659         break;
1660       }
1661       r = add_opcode(reg, op);
1662     }
1663     break;
1664 
1665   case NT_CANY:
1666     if (IS_MULTILINE(reg->options))
1667       r = add_opcode(reg, OP_ANYCHAR_ML);
1668     else
1669       r = add_opcode(reg, OP_ANYCHAR);
1670     break;
1671 
1672   case NT_BREF:
1673     {
1674       BRefNode* br = NBREF(node);
1675 
1676 #ifdef USE_BACKREF_WITH_LEVEL
1677       if (IS_BACKREF_NEST_LEVEL(br)) {
1678         r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
1679         if (r) return r;
1680         r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1681         if (r) return r;
1682         r = add_length(reg, br->nest_level);
1683         if (r) return r;
1684 
1685         goto add_bacref_mems;
1686       }
1687       else
1688 #endif
1689       if (br->back_num == 1) {
1690         n = br->back_static[0];
1691         if (IS_IGNORECASE(reg->options)) {
1692           r = add_opcode(reg, OP_BACKREFN_IC);
1693           if (r) return r;
1694           r = add_mem_num(reg, n);
1695         }
1696         else {
1697           switch (n) {
1698           case 1:  r = add_opcode(reg, OP_BACKREF1); break;
1699           case 2:  r = add_opcode(reg, OP_BACKREF2); break;
1700           default:
1701             r = add_opcode(reg, OP_BACKREFN);
1702             if (r) return r;
1703             r = add_mem_num(reg, n);
1704             break;
1705           }
1706         }
1707       }
1708       else {
1709         int i;
1710         int* p;
1711 
1712         if (IS_IGNORECASE(reg->options)) {
1713           r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1714         }
1715         else {
1716           r = add_opcode(reg, OP_BACKREF_MULTI);
1717         }
1718         if (r) return r;
1719 
1720 #ifdef USE_BACKREF_WITH_LEVEL
1721       add_bacref_mems:
1722 #endif
1723         r = add_length(reg, br->back_num);
1724         if (r) return r;
1725         p = BACKREFS_P(br);
1726         for (i = br->back_num - 1; i >= 0; i--) {
1727           r = add_mem_num(reg, p[i]);
1728           if (r) return r;
1729         }
1730       }
1731     }
1732     break;
1733 
1734 #ifdef USE_SUBEXP_CALL
1735   case NT_CALL:
1736     r = compile_call(NCALL(node), reg);
1737     break;
1738 #endif
1739 
1740   case NT_QTFR:
1741     r = compile_quantifier_node(NQTFR(node), reg);
1742     break;
1743 
1744   case NT_ENCLOSE:
1745     r = compile_enclose_node(NENCLOSE(node), reg);
1746     break;
1747 
1748   case NT_ANCHOR:
1749     r = compile_anchor_node(NANCHOR(node), reg);
1750     break;
1751 
1752   default:
1753 #ifdef ONIG_DEBUG
1754     fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1755 #endif
1756     break;
1757   }
1758 
1759   return r;
1760 }
1761 
1762 #ifdef USE_NAMED_GROUP
1763 
1764 static int
noname_disable_map(Node ** plink,GroupNumRemap * map,int * counter)1765 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1766 {
1767   int r = 0;
1768   Node* node = *plink;
1769 
1770   switch (NTYPE(node)) {
1771   case NT_LIST:
1772   case NT_ALT:
1773     do {
1774       r = noname_disable_map(&(NCAR(node)), map, counter);
1775     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1776     break;
1777 
1778   case NT_QTFR:
1779     {
1780       Node** ptarget = &(NQTFR(node)->target);
1781       Node*  old = *ptarget;
1782       r = noname_disable_map(ptarget, map, counter);
1783       if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
1784         onig_reduce_nested_quantifier(node, *ptarget);
1785       }
1786     }
1787     break;
1788 
1789   case NT_ENCLOSE:
1790     {
1791       EncloseNode* en = NENCLOSE(node);
1792       if (en->type == ENCLOSE_MEMORY) {
1793         if (IS_ENCLOSE_NAMED_GROUP(en)) {
1794           (*counter)++;
1795           map[en->regnum].new_val = *counter;
1796           en->regnum = *counter;
1797           r = noname_disable_map(&(en->target), map, counter);
1798         }
1799         else {
1800           *plink = en->target;
1801           en->target = NULL_NODE;
1802           onig_node_free(node);
1803           r = noname_disable_map(plink, map, counter);
1804         }
1805       }
1806       else
1807         r = noname_disable_map(&(en->target), map, counter);
1808     }
1809     break;
1810 
1811   case NT_ANCHOR:
1812     if (NANCHOR(node)->target)
1813       r = noname_disable_map(&(NANCHOR(node)->target), map, counter);
1814     break;
1815 
1816   default:
1817     break;
1818   }
1819 
1820   return r;
1821 }
1822 
1823 static int
renumber_node_backref(Node * node,GroupNumRemap * map)1824 renumber_node_backref(Node* node, GroupNumRemap* map)
1825 {
1826   int i, pos, n, old_num;
1827   int *backs;
1828   BRefNode* bn = NBREF(node);
1829 
1830   if (! IS_BACKREF_NAME_REF(bn))
1831     return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1832 
1833   old_num = bn->back_num;
1834   if (IS_NULL(bn->back_dynamic))
1835     backs = bn->back_static;
1836   else
1837     backs = bn->back_dynamic;
1838 
1839   for (i = 0, pos = 0; i < old_num; i++) {
1840     n = map[backs[i]].new_val;
1841     if (n > 0) {
1842       backs[pos] = n;
1843       pos++;
1844     }
1845   }
1846 
1847   bn->back_num = pos;
1848   return 0;
1849 }
1850 
1851 static int
renumber_by_map(Node * node,GroupNumRemap * map)1852 renumber_by_map(Node* node, GroupNumRemap* map)
1853 {
1854   int r = 0;
1855 
1856   switch (NTYPE(node)) {
1857   case NT_LIST:
1858   case NT_ALT:
1859     do {
1860       r = renumber_by_map(NCAR(node), map);
1861     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1862     break;
1863   case NT_QTFR:
1864     r = renumber_by_map(NQTFR(node)->target, map);
1865     break;
1866   case NT_ENCLOSE:
1867     r = renumber_by_map(NENCLOSE(node)->target, map);
1868     break;
1869 
1870   case NT_BREF:
1871     r = renumber_node_backref(node, map);
1872     break;
1873 
1874   case NT_ANCHOR:
1875     if (NANCHOR(node)->target)
1876       r = renumber_by_map(NANCHOR(node)->target, map);
1877     break;
1878 
1879   default:
1880     break;
1881   }
1882 
1883   return r;
1884 }
1885 
1886 static int
numbered_ref_check(Node * node)1887 numbered_ref_check(Node* node)
1888 {
1889   int r = 0;
1890 
1891   switch (NTYPE(node)) {
1892   case NT_LIST:
1893   case NT_ALT:
1894     do {
1895       r = numbered_ref_check(NCAR(node));
1896     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1897     break;
1898   case NT_QTFR:
1899     r = numbered_ref_check(NQTFR(node)->target);
1900     break;
1901   case NT_ENCLOSE:
1902     r = numbered_ref_check(NENCLOSE(node)->target);
1903     break;
1904 
1905   case NT_BREF:
1906     if (! IS_BACKREF_NAME_REF(NBREF(node)))
1907       return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1908     break;
1909 
1910   case NT_ANCHOR:
1911     if (NANCHOR(node)->target)
1912       r = numbered_ref_check(NANCHOR(node)->target);
1913     break;
1914 
1915   default:
1916     break;
1917   }
1918 
1919   return r;
1920 }
1921 
1922 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ScanEnv * env)1923 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
1924 {
1925   int r, i, pos, counter;
1926   BitStatusType loc;
1927   GroupNumRemap* map;
1928 
1929   map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
1930   CHECK_NULL_RETURN_MEMERR(map);
1931   for (i = 1; i <= env->num_mem; i++) {
1932     map[i].new_val = 0;
1933   }
1934   counter = 0;
1935   r = noname_disable_map(root, map, &counter);
1936   if (r != 0) return r;
1937 
1938   r = renumber_by_map(*root, map);
1939   if (r != 0) return r;
1940 
1941   for (i = 1, pos = 1; i <= env->num_mem; i++) {
1942     if (map[i].new_val > 0) {
1943       SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
1944       pos++;
1945     }
1946   }
1947 
1948   loc = env->capture_history;
1949   BIT_STATUS_CLEAR(env->capture_history);
1950   for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
1951     if (BIT_STATUS_AT(loc, i)) {
1952       BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
1953     }
1954   }
1955 
1956   env->num_mem = env->num_named;
1957   reg->num_mem = env->num_named;
1958 
1959   return onig_renumber_name_table(reg, map);
1960 }
1961 #endif /* USE_NAMED_GROUP */
1962 
1963 #ifdef USE_SUBEXP_CALL
1964 static int
unset_addr_list_fix(UnsetAddrList * uslist,regex_t * reg)1965 unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
1966 {
1967   int i, offset;
1968   EncloseNode* en;
1969   AbsAddrType addr;
1970 
1971   for (i = 0; i < uslist->num; i++) {
1972     en = NENCLOSE(uslist->us[i].target);
1973     if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
1974     addr = en->call_addr;
1975     offset = uslist->us[i].offset;
1976 
1977     BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
1978   }
1979   return 0;
1980 }
1981 #endif
1982 
1983 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
1984 static int
quantifiers_memory_node_info(Node * node)1985 quantifiers_memory_node_info(Node* node)
1986 {
1987   int r = 0;
1988 
1989   switch (NTYPE(node)) {
1990   case NT_LIST:
1991   case NT_ALT:
1992     {
1993       int v;
1994       do {
1995         v = quantifiers_memory_node_info(NCAR(node));
1996         if (v > r) r = v;
1997       } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
1998     }
1999     break;
2000 
2001 #ifdef USE_SUBEXP_CALL
2002   case NT_CALL:
2003     if (IS_CALL_RECURSION(NCALL(node))) {
2004       return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
2005     }
2006     else
2007       r = quantifiers_memory_node_info(NCALL(node)->target);
2008     break;
2009 #endif
2010 
2011   case NT_QTFR:
2012     {
2013       QtfrNode* qn = NQTFR(node);
2014       if (qn->upper != 0) {
2015         r = quantifiers_memory_node_info(qn->target);
2016       }
2017     }
2018     break;
2019 
2020   case NT_ENCLOSE:
2021     {
2022       EncloseNode* en = NENCLOSE(node);
2023       switch (en->type) {
2024       case ENCLOSE_MEMORY:
2025         return NQ_TARGET_IS_EMPTY_MEM;
2026         break;
2027 
2028       case ENCLOSE_OPTION:
2029       case ENCLOSE_STOP_BACKTRACK:
2030         r = quantifiers_memory_node_info(en->target);
2031         break;
2032       default:
2033         break;
2034       }
2035     }
2036     break;
2037 
2038   case NT_BREF:
2039   case NT_STR:
2040   case NT_CTYPE:
2041   case NT_CCLASS:
2042   case NT_CANY:
2043   case NT_ANCHOR:
2044   default:
2045     break;
2046   }
2047 
2048   return r;
2049 }
2050 #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
2051 
2052 
2053 #define GET_CHAR_LEN_VARLEN           -1
2054 #define GET_CHAR_LEN_TOP_ALT_VARLEN   -2
2055 
2056 /* fixed size pattern node only */
2057 static int
get_char_length_tree1(Node * node,regex_t * reg,int * len,int level)2058 get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2059 {
2060   int tlen;
2061   int r = 0;
2062 
2063   level++;
2064   *len = 0;
2065   switch (NTYPE(node)) {
2066   case NT_LIST:
2067     do {
2068       r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2069       if (r == 0)
2070         *len = distance_add(*len, tlen);
2071     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2072     break;
2073 
2074   case NT_ALT:
2075     {
2076       int tlen2;
2077       int varlen = 0;
2078 
2079       r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2080       while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
2081         r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
2082         if (r == 0) {
2083           if (tlen != tlen2)
2084             varlen = 1;
2085         }
2086       }
2087       if (r == 0) {
2088         if (varlen != 0) {
2089           if (level == 1)
2090             r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2091           else
2092             r = GET_CHAR_LEN_VARLEN;
2093         }
2094         else
2095           *len = tlen;
2096       }
2097     }
2098     break;
2099 
2100   case NT_STR:
2101     {
2102       StrNode* sn = NSTR(node);
2103       UChar *s = sn->s;
2104       while (s < sn->end) {
2105         s += enclen(reg->enc, s);
2106         (*len)++;
2107       }
2108     }
2109     break;
2110 
2111   case NT_QTFR:
2112     {
2113       QtfrNode* qn = NQTFR(node);
2114       if (qn->lower == qn->upper) {
2115         r = get_char_length_tree1(qn->target, reg, &tlen, level);
2116         if (r == 0)
2117           *len = distance_multiply(tlen, qn->lower);
2118       }
2119       else
2120         r = GET_CHAR_LEN_VARLEN;
2121     }
2122     break;
2123 
2124 #ifdef USE_SUBEXP_CALL
2125   case NT_CALL:
2126     if (! IS_CALL_RECURSION(NCALL(node)))
2127       r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
2128     else
2129       r = GET_CHAR_LEN_VARLEN;
2130     break;
2131 #endif
2132 
2133   case NT_CTYPE:
2134     *len = 1;
2135     break;
2136 
2137   case NT_CCLASS:
2138   case NT_CANY:
2139     *len = 1;
2140     break;
2141 
2142   case NT_ENCLOSE:
2143     {
2144       EncloseNode* en = NENCLOSE(node);
2145       switch (en->type) {
2146       case ENCLOSE_MEMORY:
2147 #ifdef USE_SUBEXP_CALL
2148         if (IS_ENCLOSE_CLEN_FIXED(en))
2149           *len = en->char_len;
2150         else {
2151           r = get_char_length_tree1(en->target, reg, len, level);
2152           if (r == 0) {
2153             en->char_len = *len;
2154             SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
2155           }
2156         }
2157         break;
2158 #endif
2159       case ENCLOSE_OPTION:
2160       case ENCLOSE_STOP_BACKTRACK:
2161         r = get_char_length_tree1(en->target, reg, len, level);
2162         break;
2163       default:
2164         break;
2165       }
2166     }
2167     break;
2168 
2169   case NT_ANCHOR:
2170     break;
2171 
2172   default:
2173     r = GET_CHAR_LEN_VARLEN;
2174     break;
2175   }
2176 
2177   return r;
2178 }
2179 
2180 static int
get_char_length_tree(Node * node,regex_t * reg,int * len)2181 get_char_length_tree(Node* node, regex_t* reg, int* len)
2182 {
2183   return get_char_length_tree1(node, reg, len, 0);
2184 }
2185 
2186 /* x is not included y ==>  1 : 0 */
2187 static int
is_not_included(Node * x,Node * y,regex_t * reg)2188 is_not_included(Node* x, Node* y, regex_t* reg)
2189 {
2190   int i, len;
2191   OnigCodePoint code;
2192   UChar *p;
2193   int ytype;
2194 
2195  retry:
2196   ytype = NTYPE(y);
2197   switch (NTYPE(x)) {
2198   case NT_CTYPE:
2199     {
2200       switch (ytype) {
2201       case NT_CTYPE:
2202         if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
2203             NCTYPE(y)->not   != NCTYPE(x)->not)
2204           return 1;
2205         else
2206           return 0;
2207         break;
2208 
2209       case NT_CCLASS:
2210       swap:
2211         {
2212           Node* tmp;
2213           tmp = x; x = y; y = tmp;
2214           goto retry;
2215         }
2216         break;
2217 
2218       case NT_STR:
2219         goto swap;
2220         break;
2221 
2222       default:
2223         break;
2224       }
2225     }
2226     break;
2227 
2228   case NT_CCLASS:
2229     {
2230       CClassNode* xc = NCCLASS(x);
2231       switch (ytype) {
2232       case NT_CTYPE:
2233         switch (NCTYPE(y)->ctype) {
2234         case ONIGENC_CTYPE_WORD:
2235           if (NCTYPE(y)->not == 0) {
2236             if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2237               for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2238                 if (BITSET_AT(xc->bs, i)) {
2239                   if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
2240                 }
2241               }
2242               return 1;
2243             }
2244             return 0;
2245           }
2246           else {
2247             if (IS_NOT_NULL(xc->mbuf)) return 0;
2248             for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2249               if (! IS_CODE_SB_WORD(reg->enc, i)) {
2250                 if (!IS_NCCLASS_NOT(xc)) {
2251                   if (BITSET_AT(xc->bs, i))
2252                     return 0;
2253                 }
2254                 else {
2255                   if (! BITSET_AT(xc->bs, i))
2256                     return 0;
2257                 }
2258               }
2259             }
2260             return 1;
2261           }
2262           break;
2263 
2264         default:
2265           break;
2266         }
2267         break;
2268 
2269       case NT_CCLASS:
2270         {
2271           int v;
2272           CClassNode* yc = NCCLASS(y);
2273 
2274           for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2275             v = BITSET_AT(xc->bs, i);
2276             if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
2277                 (v == 0 && IS_NCCLASS_NOT(xc))) {
2278               v = BITSET_AT(yc->bs, i);
2279               if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2280                   (v == 0 && IS_NCCLASS_NOT(yc)))
2281                 return 0;
2282             }
2283           }
2284           if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2285               (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2286             return 1;
2287           return 0;
2288         }
2289         break;
2290 
2291       case NT_STR:
2292         goto swap;
2293         break;
2294 
2295       default:
2296         break;
2297       }
2298     }
2299     break;
2300 
2301   case NT_STR:
2302     {
2303       StrNode* xs = NSTR(x);
2304       if (NSTRING_LEN(x) == 0)
2305         break;
2306 
2307       //c = *(xs->s);
2308       switch (ytype) {
2309       case NT_CTYPE:
2310         switch (NCTYPE(y)->ctype) {
2311         case ONIGENC_CTYPE_WORD:
2312           if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2313             return NCTYPE(y)->not;
2314           else
2315             return !(NCTYPE(y)->not);
2316           break;
2317         default:
2318           break;
2319         }
2320         break;
2321 
2322       case NT_CCLASS:
2323         {
2324           CClassNode* cc = NCCLASS(y);
2325 
2326           code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2327                                      xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2328           return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2329         }
2330         break;
2331 
2332       case NT_STR:
2333         {
2334           UChar *q;
2335           StrNode* ys = NSTR(y);
2336           len = NSTRING_LEN(x);
2337           if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2338           if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2339             /* tiny version */
2340             return 0;
2341           }
2342           else {
2343             for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2344               if (*p != *q) return 1;
2345             }
2346           }
2347         }
2348         break;
2349 
2350       default:
2351         break;
2352       }
2353     }
2354     break;
2355 
2356   default:
2357     break;
2358   }
2359 
2360   return 0;
2361 }
2362 
2363 static Node*
get_head_value_node(Node * node,int exact,regex_t * reg)2364 get_head_value_node(Node* node, int exact, regex_t* reg)
2365 {
2366   Node* n = NULL_NODE;
2367 
2368   switch (NTYPE(node)) {
2369   case NT_BREF:
2370   case NT_ALT:
2371   case NT_CANY:
2372 #ifdef USE_SUBEXP_CALL
2373   case NT_CALL:
2374 #endif
2375     break;
2376 
2377   case NT_CTYPE:
2378   case NT_CCLASS:
2379     if (exact == 0) {
2380       n = node;
2381     }
2382     break;
2383 
2384   case NT_LIST:
2385     n = get_head_value_node(NCAR(node), exact, reg);
2386     break;
2387 
2388   case NT_STR:
2389     {
2390       StrNode* sn = NSTR(node);
2391 
2392       if (sn->end <= sn->s)
2393         break;
2394 
2395       if (exact != 0 &&
2396           !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
2397       }
2398       else {
2399         n = node;
2400       }
2401     }
2402     break;
2403 
2404   case NT_QTFR:
2405     {
2406       QtfrNode* qn = NQTFR(node);
2407       if (qn->lower > 0) {
2408         if (IS_NOT_NULL(qn->head_exact))
2409           n = qn->head_exact;
2410         else
2411           n = get_head_value_node(qn->target, exact, reg);
2412       }
2413     }
2414     break;
2415 
2416   case NT_ENCLOSE:
2417     {
2418       EncloseNode* en = NENCLOSE(node);
2419       switch (en->type) {
2420       case ENCLOSE_OPTION:
2421         {
2422           OnigOptionType options = reg->options;
2423 
2424           reg->options = NENCLOSE(node)->option;
2425           n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
2426           reg->options = options;
2427         }
2428         break;
2429 
2430       case ENCLOSE_MEMORY:
2431       case ENCLOSE_STOP_BACKTRACK:
2432         n = get_head_value_node(en->target, exact, reg);
2433         break;
2434       }
2435     }
2436     break;
2437 
2438   case NT_ANCHOR:
2439     if (NANCHOR(node)->type == ANCHOR_PREC_READ)
2440       n = get_head_value_node(NANCHOR(node)->target, exact, reg);
2441     break;
2442 
2443   default:
2444     break;
2445   }
2446 
2447   return n;
2448 }
2449 
2450 static int
check_type_tree(Node * node,int type_mask,int enclose_mask,int anchor_mask)2451 check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
2452 {
2453   int type, r = 0;
2454 
2455   type = NTYPE(node);
2456   if ((NTYPE2BIT(type) & type_mask) == 0)
2457     return 1;
2458 
2459   switch (type) {
2460   case NT_LIST:
2461   case NT_ALT:
2462     do {
2463       r = check_type_tree(NCAR(node), type_mask, enclose_mask,
2464                           anchor_mask);
2465     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2466     break;
2467 
2468   case NT_QTFR:
2469     r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
2470                         anchor_mask);
2471     break;
2472 
2473   case NT_ENCLOSE:
2474     {
2475       EncloseNode* en = NENCLOSE(node);
2476       if ((en->type & enclose_mask) == 0)
2477         return 1;
2478 
2479       r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
2480     }
2481     break;
2482 
2483   case NT_ANCHOR:
2484     type = NANCHOR(node)->type;
2485     if ((type & anchor_mask) == 0)
2486       return 1;
2487 
2488     if (NANCHOR(node)->target)
2489       r = check_type_tree(NANCHOR(node)->target,
2490 			  type_mask, enclose_mask, anchor_mask);
2491     break;
2492 
2493   default:
2494     break;
2495   }
2496   return r;
2497 }
2498 
2499 static int
get_min_len(Node * node,OnigLen * min,ScanEnv * env)2500 get_min_len(Node* node, OnigLen *min, ScanEnv* env)
2501 {
2502   OnigLen tmin;
2503   int r = 0;
2504 
2505   *min = 0;
2506   switch (NTYPE(node)) {
2507   case NT_BREF:
2508     {
2509       int i;
2510       int* backs;
2511       Node** nodes = SCANENV_MEM_NODES(env);
2512       BRefNode* br = NBREF(node);
2513       if (br->state & NST_RECURSION) break;
2514 
2515       backs = BACKREFS_P(br);
2516       if (backs[0] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
2517       r = get_min_len(nodes[backs[0]], min, env);
2518       if (r != 0) break;
2519       for (i = 1; i < br->back_num; i++) {
2520         if (backs[i] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
2521         r = get_min_len(nodes[backs[i]], &tmin, env);
2522         if (r != 0) break;
2523         if (*min > tmin) *min = tmin;
2524       }
2525     }
2526     break;
2527 
2528 #ifdef USE_SUBEXP_CALL
2529   case NT_CALL:
2530     if (IS_CALL_RECURSION(NCALL(node))) {
2531       EncloseNode* en = NENCLOSE(NCALL(node)->target);
2532       if (IS_ENCLOSE_MIN_FIXED(en))
2533         *min = en->min_len;
2534     }
2535     else
2536       r = get_min_len(NCALL(node)->target, min, env);
2537     break;
2538 #endif
2539 
2540   case NT_LIST:
2541     do {
2542       r = get_min_len(NCAR(node), &tmin, env);
2543       if (r == 0) *min += tmin;
2544     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2545     break;
2546 
2547   case NT_ALT:
2548     {
2549       Node *x, *y;
2550       y = node;
2551       do {
2552         x = NCAR(y);
2553         r = get_min_len(x, &tmin, env);
2554         if (r != 0) break;
2555         if (y == node) *min = tmin;
2556         else if (*min > tmin) *min = tmin;
2557       } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
2558     }
2559     break;
2560 
2561   case NT_STR:
2562     {
2563       StrNode* sn = NSTR(node);
2564       *min = sn->end - sn->s;
2565     }
2566     break;
2567 
2568   case NT_CTYPE:
2569     *min = 1;
2570     break;
2571 
2572   case NT_CCLASS:
2573   case NT_CANY:
2574     *min = 1;
2575     break;
2576 
2577   case NT_QTFR:
2578     {
2579       QtfrNode* qn = NQTFR(node);
2580 
2581       if (qn->lower > 0) {
2582         r = get_min_len(qn->target, min, env);
2583         if (r == 0)
2584           *min = distance_multiply(*min, qn->lower);
2585       }
2586     }
2587     break;
2588 
2589   case NT_ENCLOSE:
2590     {
2591       EncloseNode* en = NENCLOSE(node);
2592       switch (en->type) {
2593       case ENCLOSE_MEMORY:
2594         if (IS_ENCLOSE_MIN_FIXED(en))
2595           *min = en->min_len;
2596         else {
2597           if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2598             *min = 0;  // recursive
2599           else {
2600             SET_ENCLOSE_STATUS(node, NST_MARK1);
2601             r = get_min_len(en->target, min, env);
2602             CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2603             if (r == 0) {
2604               en->min_len = *min;
2605               SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
2606             }
2607           }
2608         }
2609         break;
2610 
2611       case ENCLOSE_OPTION:
2612       case ENCLOSE_STOP_BACKTRACK:
2613         r = get_min_len(en->target, min, env);
2614         break;
2615       }
2616     }
2617     break;
2618 
2619   case NT_ANCHOR:
2620   default:
2621     break;
2622   }
2623 
2624   return r;
2625 }
2626 
2627 static int
get_max_len(Node * node,OnigLen * max,ScanEnv * env)2628 get_max_len(Node* node, OnigLen *max, ScanEnv* env)
2629 {
2630   OnigLen tmax;
2631   int r = 0;
2632 
2633   *max = 0;
2634   switch (NTYPE(node)) {
2635   case NT_LIST:
2636     do {
2637       r = get_max_len(NCAR(node), &tmax, env);
2638       if (r == 0)
2639         *max = distance_add(*max, tmax);
2640     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2641     break;
2642 
2643   case NT_ALT:
2644     do {
2645       r = get_max_len(NCAR(node), &tmax, env);
2646       if (r == 0 && *max < tmax) *max = tmax;
2647     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2648     break;
2649 
2650   case NT_STR:
2651     {
2652       StrNode* sn = NSTR(node);
2653       *max = sn->end - sn->s;
2654     }
2655     break;
2656 
2657   case NT_CTYPE:
2658     *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2659     break;
2660 
2661   case NT_CCLASS:
2662   case NT_CANY:
2663     *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2664     break;
2665 
2666   case NT_BREF:
2667     {
2668       int i;
2669       int* backs;
2670       Node** nodes = SCANENV_MEM_NODES(env);
2671       BRefNode* br = NBREF(node);
2672       if (br->state & NST_RECURSION) {
2673         *max = ONIG_INFINITE_DISTANCE;
2674         break;
2675       }
2676       backs = BACKREFS_P(br);
2677       for (i = 0; i < br->back_num; i++) {
2678         if (backs[i] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
2679         r = get_max_len(nodes[backs[i]], &tmax, env);
2680         if (r != 0) break;
2681         if (*max < tmax) *max = tmax;
2682       }
2683     }
2684     break;
2685 
2686 #ifdef USE_SUBEXP_CALL
2687   case NT_CALL:
2688     if (! IS_CALL_RECURSION(NCALL(node)))
2689       r = get_max_len(NCALL(node)->target, max, env);
2690     else
2691       *max = ONIG_INFINITE_DISTANCE;
2692     break;
2693 #endif
2694 
2695   case NT_QTFR:
2696     {
2697       QtfrNode* qn = NQTFR(node);
2698 
2699       if (qn->upper != 0) {
2700         r = get_max_len(qn->target, max, env);
2701         if (r == 0 && *max != 0) {
2702           if (! IS_REPEAT_INFINITE(qn->upper))
2703             *max = distance_multiply(*max, qn->upper);
2704           else
2705             *max = ONIG_INFINITE_DISTANCE;
2706         }
2707       }
2708     }
2709     break;
2710 
2711   case NT_ENCLOSE:
2712     {
2713       EncloseNode* en = NENCLOSE(node);
2714       switch (en->type) {
2715       case ENCLOSE_MEMORY:
2716         if (IS_ENCLOSE_MAX_FIXED(en))
2717           *max = en->max_len;
2718         else {
2719           if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2720             *max = ONIG_INFINITE_DISTANCE;
2721           else {
2722             SET_ENCLOSE_STATUS(node, NST_MARK1);
2723             r = get_max_len(en->target, max, env);
2724             CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2725             if (r == 0) {
2726               en->max_len = *max;
2727               SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
2728             }
2729           }
2730         }
2731         break;
2732 
2733       case ENCLOSE_OPTION:
2734       case ENCLOSE_STOP_BACKTRACK:
2735         r = get_max_len(en->target, max, env);
2736         break;
2737       }
2738     }
2739     break;
2740 
2741   case NT_ANCHOR:
2742   default:
2743     break;
2744   }
2745 
2746   return r;
2747 }
2748 
2749 
2750 #ifdef USE_SUBEXP_CALL
2751 
2752 #define RECURSION_EXIST       1
2753 #define RECURSION_INFINITE    2
2754 
2755 static int
subexp_inf_recursive_check(Node * node,ScanEnv * env,int head)2756 subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2757 {
2758   int type;
2759   int r = 0;
2760 
2761   type = NTYPE(node);
2762   switch (type) {
2763   case NT_LIST:
2764     {
2765       Node *x;
2766       OnigLen min;
2767       int ret;
2768 
2769       x = node;
2770       do {
2771         ret = subexp_inf_recursive_check(NCAR(x), env, head);
2772         if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2773         r |= ret;
2774         if (head) {
2775           ret = get_min_len(NCAR(x), &min, env);
2776           if (ret != 0) return ret;
2777           if (min != 0) head = 0;
2778         }
2779       } while (IS_NOT_NULL(x = NCDR(x)));
2780     }
2781     break;
2782 
2783   case NT_ALT:
2784     {
2785       int ret;
2786       r = RECURSION_EXIST;
2787       do {
2788         ret = subexp_inf_recursive_check(NCAR(node), env, head);
2789         if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2790         r &= ret;
2791       } while (IS_NOT_NULL(node = NCDR(node)));
2792     }
2793     break;
2794 
2795   case NT_QTFR:
2796     r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
2797     if (r == RECURSION_EXIST) {
2798       if (NQTFR(node)->lower == 0) r = 0;
2799     }
2800     break;
2801 
2802   case NT_ANCHOR:
2803     {
2804       AnchorNode* an = NANCHOR(node);
2805       switch (an->type) {
2806       case ANCHOR_PREC_READ:
2807       case ANCHOR_PREC_READ_NOT:
2808       case ANCHOR_LOOK_BEHIND:
2809       case ANCHOR_LOOK_BEHIND_NOT:
2810         r = subexp_inf_recursive_check(an->target, env, head);
2811         break;
2812       }
2813     }
2814     break;
2815 
2816   case NT_CALL:
2817     r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
2818     break;
2819 
2820   case NT_ENCLOSE:
2821     if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2822       return 0;
2823     else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2824       return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2825     else {
2826       SET_ENCLOSE_STATUS(node, NST_MARK2);
2827       r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
2828       CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2829     }
2830     break;
2831 
2832   default:
2833     break;
2834   }
2835 
2836   return r;
2837 }
2838 
2839 static int
subexp_inf_recursive_check_trav(Node * node,ScanEnv * env)2840 subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
2841 {
2842   int type;
2843   int r = 0;
2844 
2845   type = NTYPE(node);
2846   switch (type) {
2847   case NT_LIST:
2848   case NT_ALT:
2849     do {
2850       r = subexp_inf_recursive_check_trav(NCAR(node), env);
2851     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2852     break;
2853 
2854   case NT_QTFR:
2855     r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
2856     break;
2857 
2858   case NT_ANCHOR:
2859     {
2860       AnchorNode* an = NANCHOR(node);
2861       switch (an->type) {
2862       case ANCHOR_PREC_READ:
2863       case ANCHOR_PREC_READ_NOT:
2864       case ANCHOR_LOOK_BEHIND:
2865       case ANCHOR_LOOK_BEHIND_NOT:
2866         r = subexp_inf_recursive_check_trav(an->target, env);
2867         break;
2868       }
2869     }
2870     break;
2871 
2872   case NT_ENCLOSE:
2873     {
2874       EncloseNode* en = NENCLOSE(node);
2875 
2876       if (IS_ENCLOSE_RECURSION(en)) {
2877         SET_ENCLOSE_STATUS(node, NST_MARK1);
2878         r = subexp_inf_recursive_check(en->target, env, 1);
2879         if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
2880         CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2881       }
2882       r = subexp_inf_recursive_check_trav(en->target, env);
2883     }
2884     break;
2885 
2886   default:
2887     break;
2888   }
2889 
2890   return r;
2891 }
2892 
2893 static int
subexp_recursive_check(Node * node)2894 subexp_recursive_check(Node* node)
2895 {
2896   int r = 0;
2897 
2898   switch (NTYPE(node)) {
2899   case NT_LIST:
2900   case NT_ALT:
2901     do {
2902       r |= subexp_recursive_check(NCAR(node));
2903     } while (IS_NOT_NULL(node = NCDR(node)));
2904     break;
2905 
2906   case NT_QTFR:
2907     r = subexp_recursive_check(NQTFR(node)->target);
2908     break;
2909 
2910   case NT_ANCHOR:
2911     {
2912       AnchorNode* an = NANCHOR(node);
2913       switch (an->type) {
2914       case ANCHOR_PREC_READ:
2915       case ANCHOR_PREC_READ_NOT:
2916       case ANCHOR_LOOK_BEHIND:
2917       case ANCHOR_LOOK_BEHIND_NOT:
2918         r = subexp_recursive_check(an->target);
2919         break;
2920       }
2921     }
2922     break;
2923 
2924   case NT_CALL:
2925     r = subexp_recursive_check(NCALL(node)->target);
2926     if (r != 0) SET_CALL_RECURSION(node);
2927     break;
2928 
2929   case NT_ENCLOSE:
2930     if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2931       return 0;
2932     else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2933       return 1; /* recursion */
2934     else {
2935       SET_ENCLOSE_STATUS(node, NST_MARK2);
2936       r = subexp_recursive_check(NENCLOSE(node)->target);
2937       CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2938     }
2939     break;
2940 
2941   default:
2942     break;
2943   }
2944 
2945   return r;
2946 }
2947 
2948 
2949 static int
subexp_recursive_check_trav(Node * node,ScanEnv * env)2950 subexp_recursive_check_trav(Node* node, ScanEnv* env)
2951 {
2952 #define FOUND_CALLED_NODE    1
2953 
2954   int type;
2955   int r = 0;
2956 
2957   type = NTYPE(node);
2958   switch (type) {
2959   case NT_LIST:
2960   case NT_ALT:
2961     {
2962       int ret;
2963       do {
2964         ret = subexp_recursive_check_trav(NCAR(node), env);
2965         if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
2966         else if (ret < 0) return ret;
2967       } while (IS_NOT_NULL(node = NCDR(node)));
2968     }
2969     break;
2970 
2971   case NT_QTFR:
2972     r = subexp_recursive_check_trav(NQTFR(node)->target, env);
2973     if (NQTFR(node)->upper == 0) {
2974       if (r == FOUND_CALLED_NODE)
2975         NQTFR(node)->is_refered = 1;
2976     }
2977     break;
2978 
2979   case NT_ANCHOR:
2980     {
2981       AnchorNode* an = NANCHOR(node);
2982       switch (an->type) {
2983       case ANCHOR_PREC_READ:
2984       case ANCHOR_PREC_READ_NOT:
2985       case ANCHOR_LOOK_BEHIND:
2986       case ANCHOR_LOOK_BEHIND_NOT:
2987         r = subexp_recursive_check_trav(an->target, env);
2988         break;
2989       }
2990     }
2991     break;
2992 
2993   case NT_ENCLOSE:
2994     {
2995       EncloseNode* en = NENCLOSE(node);
2996 
2997       if (! IS_ENCLOSE_RECURSION(en)) {
2998         if (IS_ENCLOSE_CALLED(en)) {
2999           SET_ENCLOSE_STATUS(node, NST_MARK1);
3000           r = subexp_recursive_check(en->target);
3001           if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
3002           CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
3003         }
3004       }
3005       r = subexp_recursive_check_trav(en->target, env);
3006       if (IS_ENCLOSE_CALLED(en))
3007         r |= FOUND_CALLED_NODE;
3008     }
3009     break;
3010 
3011   default:
3012     break;
3013   }
3014 
3015   return r;
3016 }
3017 
3018 static int
setup_subexp_call(Node * node,ScanEnv * env)3019 setup_subexp_call(Node* node, ScanEnv* env)
3020 {
3021   int type;
3022   int r = 0;
3023 
3024   type = NTYPE(node);
3025   switch (type) {
3026   case NT_LIST:
3027     do {
3028       r = setup_subexp_call(NCAR(node), env);
3029     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3030     break;
3031 
3032   case NT_ALT:
3033     do {
3034       r = setup_subexp_call(NCAR(node), env);
3035     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3036     break;
3037 
3038   case NT_QTFR:
3039     r = setup_subexp_call(NQTFR(node)->target, env);
3040     break;
3041   case NT_ENCLOSE:
3042     r = setup_subexp_call(NENCLOSE(node)->target, env);
3043     break;
3044 
3045   case NT_CALL:
3046     {
3047       CallNode* cn = NCALL(node);
3048       Node** nodes = SCANENV_MEM_NODES(env);
3049 
3050       if (cn->group_num != 0) {
3051         int gnum = cn->group_num;
3052 
3053 #ifdef USE_NAMED_GROUP
3054         if (env->num_named > 0 &&
3055             IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3056             !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3057           return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3058         }
3059 #endif
3060         if (gnum > env->num_mem) {
3061           onig_scan_env_set_error_string(env,
3062                    ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3063           return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3064         }
3065 
3066 #ifdef USE_NAMED_GROUP
3067       set_call_attr:
3068 #endif
3069         cn->target = nodes[cn->group_num];
3070         if (IS_NULL(cn->target)) {
3071           onig_scan_env_set_error_string(env,
3072                    ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3073           return ONIGERR_UNDEFINED_NAME_REFERENCE;
3074         }
3075         SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
3076         BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
3077         cn->unset_addr_list = env->unset_addr_list;
3078       }
3079 #ifdef USE_NAMED_GROUP
3080       else {
3081         int *refs;
3082 
3083         int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
3084                                            &refs);
3085         if (n <= 0) {
3086           onig_scan_env_set_error_string(env,
3087                    ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3088           return ONIGERR_UNDEFINED_NAME_REFERENCE;
3089         }
3090         else if (n > 1) {
3091           onig_scan_env_set_error_string(env,
3092                ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3093           return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3094         }
3095         else {
3096           cn->group_num = refs[0];
3097           goto set_call_attr;
3098         }
3099       }
3100 #endif
3101     }
3102     break;
3103 
3104   case NT_ANCHOR:
3105     {
3106       AnchorNode* an = NANCHOR(node);
3107 
3108       switch (an->type) {
3109       case ANCHOR_PREC_READ:
3110       case ANCHOR_PREC_READ_NOT:
3111       case ANCHOR_LOOK_BEHIND:
3112       case ANCHOR_LOOK_BEHIND_NOT:
3113         r = setup_subexp_call(an->target, env);
3114         break;
3115       }
3116     }
3117     break;
3118 
3119   default:
3120     break;
3121   }
3122 
3123   return r;
3124 }
3125 #endif
3126 
3127 /* divide different length alternatives in look-behind.
3128   (?<=A|B) ==> (?<=A)|(?<=B)
3129   (?<!A|B) ==> (?<!A)(?<!B)
3130 */
3131 static int
divide_look_behind_alternatives(Node * node)3132 divide_look_behind_alternatives(Node* node)
3133 {
3134   Node *head, *np, *insert_node;
3135   AnchorNode* an = NANCHOR(node);
3136   int anc_type = an->type;
3137 
3138   /* fprintf(stderr, "divide_look_behind: %d\n", (int )node); */
3139 
3140   head = an->target;
3141   np = NCAR(head);
3142   swap_node(node, head);
3143   NCAR(node) = head;
3144   NANCHOR(head)->target = np;
3145 
3146   np = node;
3147   while ((np = NCDR(np)) != NULL_NODE) {
3148     insert_node = onig_node_new_anchor(anc_type);
3149     CHECK_NULL_RETURN_MEMERR(insert_node);
3150     NANCHOR(insert_node)->target = NCAR(np);
3151     NCAR(np) = insert_node;
3152   }
3153 
3154   if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3155     np = node;
3156     do {
3157       SET_NTYPE(np, NT_LIST);  /* alt -> list */
3158     } while ((np = NCDR(np)) != NULL_NODE);
3159   }
3160   return 0;
3161 }
3162 
3163 static int
setup_look_behind(Node * node,regex_t * reg,ScanEnv * env)3164 setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3165 {
3166   int r, len;
3167   AnchorNode* an = NANCHOR(node);
3168 
3169   /* fprintf(stderr, "setup_look_behind: %x\n", (int )node); */
3170 
3171   r = get_char_length_tree(an->target, reg, &len);
3172   if (r == 0)
3173     an->char_len = len;
3174   else if (r == GET_CHAR_LEN_VARLEN)
3175     r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3176   else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3177     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3178       r = divide_look_behind_alternatives(node);
3179     else
3180       r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3181   }
3182 
3183   return r;
3184 }
3185 
3186 static int
next_setup(Node * node,Node * next_node,regex_t * reg)3187 next_setup(Node* node, Node* next_node, regex_t* reg)
3188 {
3189   int type;
3190 
3191  retry:
3192   type = NTYPE(node);
3193   if (type == NT_QTFR) {
3194     QtfrNode* qn = NQTFR(node);
3195     if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3196 #ifdef USE_QTFR_PEEK_NEXT
3197       Node* n = get_head_value_node(next_node, 1, reg);
3198       /* '\0': for UTF-16BE etc... */
3199       if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
3200         qn->next_head_exact = n;
3201       }
3202 #endif
3203       /* automatic posseivation a*b ==> (?>a*)b */
3204       if (qn->lower <= 1) {
3205         int ttype = NTYPE(qn->target);
3206         if (IS_NODE_TYPE_SIMPLE(ttype)) {
3207           Node *x, *y;
3208           x = get_head_value_node(qn->target, 0, reg);
3209           if (IS_NOT_NULL(x)) {
3210             y = get_head_value_node(next_node,  0, reg);
3211             if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3212               Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
3213               CHECK_NULL_RETURN_MEMERR(en);
3214               SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3215               swap_node(node, en);
3216               NENCLOSE(node)->target = en;
3217             }
3218           }
3219         }
3220       }
3221     }
3222   }
3223   else if (type == NT_ENCLOSE) {
3224     EncloseNode* en = NENCLOSE(node);
3225     if (en->type == ENCLOSE_MEMORY) {
3226       node = en->target;
3227       goto retry;
3228     }
3229   }
3230   return 0;
3231 }
3232 
3233 
3234 static int
update_string_node_case_fold(regex_t * reg,Node * node)3235 update_string_node_case_fold(regex_t* reg, Node *node)
3236 {
3237   UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3238   UChar *sbuf, *ebuf, *sp;
3239   int r, i, len, sbuf_size;
3240   StrNode* sn = NSTR(node);
3241 
3242   end = sn->end;
3243   sbuf_size = (end - sn->s) * 2;
3244   sbuf = (UChar* )xmalloc(sbuf_size);
3245   CHECK_NULL_RETURN_MEMERR(sbuf);
3246   ebuf = sbuf + sbuf_size;
3247 
3248   sp = sbuf;
3249   p = sn->s;
3250   while (p < end) {
3251     len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3252     for (i = 0; i < len; i++) {
3253       if (sp >= ebuf) {
3254         sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2);
3255         CHECK_NULL_RETURN_MEMERR(sbuf);
3256         sp = sbuf + sbuf_size;
3257         sbuf_size *= 2;
3258         ebuf = sbuf + sbuf_size;
3259       }
3260 
3261       *sp++ = buf[i];
3262     }
3263   }
3264 
3265   r = onig_node_str_set(node, sbuf, sp);
3266   if (r != 0) {
3267     xfree(sbuf);
3268     return r;
3269   }
3270 
3271   xfree(sbuf);
3272   return 0;
3273 }
3274 
3275 static int
expand_case_fold_make_rem_string(Node ** rnode,UChar * s,UChar * end,regex_t * reg)3276 expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
3277 				 regex_t* reg)
3278 {
3279   int r;
3280   Node *node;
3281 
3282   node = onig_node_new_str(s, end);
3283   if (IS_NULL(node)) return ONIGERR_MEMORY;
3284 
3285   r = update_string_node_case_fold(reg, node);
3286   if (r != 0) {
3287     onig_node_free(node);
3288     return r;
3289   }
3290 
3291   NSTRING_SET_AMBIG(node);
3292   NSTRING_SET_DONT_GET_OPT_INFO(node);
3293   *rnode = node;
3294   return 0;
3295 }
3296 
3297 static int
expand_case_fold_string_alt(int item_num,OnigCaseFoldCodeItem items[],UChar * p,int slen,UChar * end,regex_t * reg,Node ** rnode)3298 expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
3299 			    UChar *p, int slen, UChar *end,
3300 			    regex_t* reg, Node **rnode)
3301 {
3302   int r, i, j, len, varlen;
3303   Node *anode, *var_anode, *snode, *xnode, *an;
3304   UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3305 
3306   *rnode = var_anode = NULL_NODE;
3307 
3308   varlen = 0;
3309   for (i = 0; i < item_num; i++) {
3310     if (items[i].byte_len != slen) {
3311       varlen = 1;
3312       break;
3313     }
3314   }
3315 
3316   if (varlen != 0) {
3317     *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3318     if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3319 
3320     xnode = onig_node_new_list(NULL, NULL);
3321     if (IS_NULL(xnode)) goto mem_err;
3322     NCAR(var_anode) = xnode;
3323 
3324     anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3325     if (IS_NULL(anode)) goto mem_err;
3326     NCAR(xnode) = anode;
3327   }
3328   else {
3329     *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3330     if (IS_NULL(anode)) return ONIGERR_MEMORY;
3331   }
3332 
3333   snode = onig_node_new_str(p, p + slen);
3334   if (IS_NULL(snode)) goto mem_err;
3335 
3336   NCAR(anode) = snode;
3337 
3338   for (i = 0; i < item_num; i++) {
3339     snode = onig_node_new_str(NULL, NULL);
3340     if (IS_NULL(snode)) goto mem_err;
3341 
3342     for (j = 0; j < items[i].code_len; j++) {
3343       len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3344       if (len < 0) {
3345         r = len;
3346         goto mem_err2;
3347       }
3348 
3349       r = onig_node_str_cat(snode, buf, buf + len);
3350       if (r != 0) goto mem_err2;
3351     }
3352 
3353     an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3354     if (IS_NULL(an)) {
3355       goto mem_err2;
3356     }
3357 
3358     if (items[i].byte_len != slen) {
3359       Node *rem;
3360       UChar *q = p + items[i].byte_len;
3361 
3362       if (q < end) {
3363         r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3364         if (r != 0) {
3365           onig_node_free(an);
3366           goto mem_err2;
3367         }
3368 
3369         xnode = onig_node_list_add(NULL_NODE, snode);
3370         if (IS_NULL(xnode)) {
3371           onig_node_free(an);
3372           onig_node_free(rem);
3373           goto mem_err2;
3374         }
3375         if (IS_NULL(onig_node_list_add(xnode, rem))) {
3376           onig_node_free(an);
3377           onig_node_free(xnode);
3378           onig_node_free(rem);
3379           goto mem_err;
3380         }
3381 
3382         NCAR(an) = xnode;
3383       }
3384       else {
3385         NCAR(an) = snode;
3386       }
3387 
3388       NCDR(var_anode) = an;
3389       var_anode = an;
3390     }
3391     else {
3392       NCAR(an)     = snode;
3393       NCDR(anode) = an;
3394       anode = an;
3395     }
3396   }
3397 
3398   return varlen;
3399 
3400  mem_err2:
3401   onig_node_free(snode);
3402 
3403  mem_err:
3404   onig_node_free(*rnode);
3405 
3406   return ONIGERR_MEMORY;
3407 }
3408 
3409 static int
expand_case_fold_string(Node * node,regex_t * reg)3410 expand_case_fold_string(Node* node, regex_t* reg)
3411 {
3412 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION  8
3413 
3414   int r, n, len, alt_num;
3415   UChar *start, *end, *p;
3416   Node *top_root, *root, *snode, *prev_node;
3417   OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3418   StrNode* sn = NSTR(node);
3419 
3420   if (NSTRING_IS_AMBIG(node)) return 0;
3421 
3422   start = sn->s;
3423   end   = sn->end;
3424   if (start >= end) return 0;
3425 
3426   r = 0;
3427   top_root = root = prev_node = snode = NULL_NODE;
3428   alt_num = 1;
3429   p = start;
3430   while (p < end) {
3431     n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3432 					   p, end, items);
3433     if (n < 0) {
3434       r = n;
3435       goto err;
3436     }
3437 
3438 	SAFE_ENC_LEN(reg->enc, p, end, len);
3439 
3440     if (n == 0) {
3441       if (IS_NULL(snode)) {
3442         if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3443           top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3444           if (IS_NULL(root)) {
3445             onig_node_free(prev_node);
3446             goto mem_err;
3447           }
3448         }
3449 
3450         prev_node = snode = onig_node_new_str(NULL, NULL);
3451         if (IS_NULL(snode)) goto mem_err;
3452         if (IS_NOT_NULL(root)) {
3453           if (IS_NULL(onig_node_list_add(root, snode))) {
3454             onig_node_free(snode);
3455             goto mem_err;
3456           }
3457         }
3458       }
3459 
3460       r = onig_node_str_cat(snode, p, p + len);
3461       if (r != 0) goto err;
3462     }
3463     else {
3464       alt_num *= (n + 1);
3465       if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3466 
3467       if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3468         top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3469         if (IS_NULL(root)) {
3470           onig_node_free(prev_node);
3471           goto mem_err;
3472         }
3473       }
3474 
3475       r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3476       if (r < 0) goto mem_err;
3477       if (r == 1) {
3478         if (IS_NULL(root)) {
3479           top_root = prev_node;
3480         }
3481         else {
3482           if (IS_NULL(onig_node_list_add(root, prev_node))) {
3483             onig_node_free(prev_node);
3484             goto mem_err;
3485           }
3486         }
3487 
3488         root = NCAR(prev_node);
3489       }
3490       else { /* r == 0 */
3491         if (IS_NOT_NULL(root)) {
3492           if (IS_NULL(onig_node_list_add(root, prev_node))) {
3493             onig_node_free(prev_node);
3494             goto mem_err;
3495           }
3496         }
3497       }
3498 
3499       snode = NULL_NODE;
3500     }
3501 
3502     p += len;
3503   }
3504 
3505   if (p < end) {
3506     Node *srem;
3507 
3508     r = expand_case_fold_make_rem_string(&srem, p, end, reg);
3509     if (r != 0) goto mem_err;
3510 
3511     if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
3512       top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3513       if (IS_NULL(root)) {
3514         onig_node_free(srem);
3515         onig_node_free(prev_node);
3516         goto mem_err;
3517       }
3518     }
3519 
3520     if (IS_NULL(root)) {
3521       prev_node = srem;
3522     }
3523     else {
3524       if (IS_NULL(onig_node_list_add(root, srem))) {
3525         onig_node_free(srem);
3526         goto mem_err;
3527       }
3528     }
3529   }
3530 
3531   /* ending */
3532   top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
3533   swap_node(node, top_root);
3534   onig_node_free(top_root);
3535   return 0;
3536 
3537  mem_err:
3538   r = ONIGERR_MEMORY;
3539 
3540  err:
3541   onig_node_free(top_root);
3542   return r;
3543 }
3544 
3545 
3546 #ifdef USE_COMBINATION_EXPLOSION_CHECK
3547 
3548 #define CEC_THRES_NUM_BIG_REPEAT         512
3549 #define CEC_INFINITE_NUM          0x7fffffff
3550 
3551 #define CEC_IN_INFINITE_REPEAT    (1<<0)
3552 #define CEC_IN_FINITE_REPEAT      (1<<1)
3553 #define CEC_CONT_BIG_REPEAT       (1<<2)
3554 
3555 static int
setup_comb_exp_check(Node * node,int state,ScanEnv * env)3556 setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3557 {
3558   int type;
3559   int r = state;
3560 
3561   type = NTYPE(node);
3562   switch (type) {
3563   case NT_LIST:
3564     {
3565       Node* prev = NULL_NODE;
3566       do {
3567         r = setup_comb_exp_check(NCAR(node), r, env);
3568         prev = NCAR(node);
3569       } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
3570     }
3571     break;
3572 
3573   case NT_ALT:
3574     {
3575       int ret;
3576       do {
3577         ret = setup_comb_exp_check(NCAR(node), state, env);
3578         r |= ret;
3579       } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
3580     }
3581     break;
3582 
3583   case NT_QTFR:
3584     {
3585       int child_state = state;
3586       int add_state = 0;
3587       QtfrNode* qn = NQTFR(node);
3588       Node* target = qn->target;
3589       int var_num;
3590 
3591       if (! IS_REPEAT_INFINITE(qn->upper)) {
3592         if (qn->upper > 1) {
3593           /* {0,1}, {1,1} are allowed */
3594           child_state |= CEC_IN_FINITE_REPEAT;
3595 
3596           /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3597           if (env->backrefed_mem == 0) {
3598             if (NTYPE(qn->target) == NT_ENCLOSE) {
3599               EncloseNode* en = NENCLOSE(qn->target);
3600               if (en->type == ENCLOSE_MEMORY) {
3601                 if (NTYPE(en->target) == NT_QTFR) {
3602                   QtfrNode* q = NQTFR(en->target);
3603                   if (IS_REPEAT_INFINITE(q->upper)
3604                       && q->greedy == qn->greedy) {
3605                     qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3606                     if (qn->upper == 1)
3607                       child_state = state;
3608                   }
3609                 }
3610               }
3611             }
3612           }
3613         }
3614       }
3615 
3616       if (state & CEC_IN_FINITE_REPEAT) {
3617         qn->comb_exp_check_num = -1;
3618       }
3619       else {
3620         if (IS_REPEAT_INFINITE(qn->upper)) {
3621           var_num = CEC_INFINITE_NUM;
3622           child_state |= CEC_IN_INFINITE_REPEAT;
3623         }
3624         else {
3625           var_num = qn->upper - qn->lower;
3626         }
3627 
3628         if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3629           add_state |= CEC_CONT_BIG_REPEAT;
3630 
3631         if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3632             ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3633              var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3634           if (qn->comb_exp_check_num == 0) {
3635             env->num_comb_exp_check++;
3636             qn->comb_exp_check_num = env->num_comb_exp_check;
3637             if (env->curr_max_regnum > env->comb_exp_max_regnum)
3638               env->comb_exp_max_regnum = env->curr_max_regnum;
3639           }
3640         }
3641       }
3642 
3643       r = setup_comb_exp_check(target, child_state, env);
3644       r |= add_state;
3645     }
3646     break;
3647 
3648   case NT_ENCLOSE:
3649     {
3650       EncloseNode* en = NENCLOSE(node);
3651 
3652       switch (en->type) {
3653       case ENCLOSE_MEMORY:
3654         {
3655           if (env->curr_max_regnum < en->regnum)
3656             env->curr_max_regnum = en->regnum;
3657 
3658           r = setup_comb_exp_check(en->target, state, env);
3659         }
3660         break;
3661 
3662       default:
3663         r = setup_comb_exp_check(en->target, state, env);
3664         break;
3665       }
3666     }
3667     break;
3668 
3669 #ifdef USE_SUBEXP_CALL
3670   case NT_CALL:
3671     if (IS_CALL_RECURSION(NCALL(node)))
3672       env->has_recursion = 1;
3673     else
3674       r = setup_comb_exp_check(NCALL(node)->target, state, env);
3675     break;
3676 #endif
3677 
3678   default:
3679     break;
3680   }
3681 
3682   return r;
3683 }
3684 #endif
3685 
3686 #define IN_ALT        (1<<0)
3687 #define IN_NOT        (1<<1)
3688 #define IN_REPEAT     (1<<2)
3689 #define IN_VAR_REPEAT (1<<3)
3690 #define IN_CALL       (1<<4)
3691 #define IN_RECCALL    (1<<5)
3692 
3693 /* setup_tree does the following work.
3694  1. check empty loop. (set qn->target_empty_info)
3695  2. expand ignore-case in char class.
3696  3. set memory status bit flags. (reg->mem_stats)
3697  4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3698  5. find invalid patterns in look-behind.
3699  6. expand repeated string.
3700  */
3701 static int
setup_tree(Node * node,regex_t * reg,int state,ScanEnv * env)3702 setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3703 {
3704   int type;
3705   int r = 0;
3706 
3707   type = NTYPE(node);
3708   switch (type) {
3709   case NT_LIST:
3710     {
3711       Node* prev = NULL_NODE;
3712       do {
3713         r = setup_tree(NCAR(node), reg, state, env);
3714         if (IS_NOT_NULL(prev) && r == 0) {
3715           r = next_setup(prev, NCAR(node), reg);
3716         }
3717         prev = NCAR(node);
3718       } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3719     }
3720     break;
3721 
3722   case NT_ALT:
3723     do {
3724       r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
3725     } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3726     break;
3727 
3728   case NT_CCLASS:
3729     break;
3730 
3731   case NT_STR:
3732     if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3733       r = expand_case_fold_string(node, reg);
3734     }
3735     break;
3736 
3737   case NT_CTYPE:
3738   case NT_CANY:
3739     break;
3740 
3741 #ifdef USE_SUBEXP_CALL
3742   case NT_CALL:
3743     break;
3744 #endif
3745 
3746   case NT_BREF:
3747     {
3748       int i;
3749       int* p;
3750       Node** nodes = SCANENV_MEM_NODES(env);
3751       BRefNode* br = NBREF(node);
3752       p = BACKREFS_P(br);
3753       for (i = 0; i < br->back_num; i++) {
3754         if (p[i] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
3755         BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3756         BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3757 #ifdef USE_BACKREF_WITH_LEVEL
3758         if (IS_BACKREF_NEST_LEVEL(br)) {
3759           BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3760         }
3761 #endif
3762         SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3763       }
3764     }
3765     break;
3766 
3767   case NT_QTFR:
3768     {
3769       OnigLen d;
3770       QtfrNode* qn = NQTFR(node);
3771       Node* target = qn->target;
3772 
3773       if ((state & IN_REPEAT) != 0) {
3774         qn->state |= NST_IN_REPEAT;
3775       }
3776 
3777       if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3778         r = get_min_len(target, &d, env);
3779         if (r) break;
3780         if (d == 0) {
3781           qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3782 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
3783           r = quantifiers_memory_node_info(target);
3784           if (r < 0) break;
3785           if (r > 0) {
3786             qn->target_empty_info = r;
3787           }
3788 #endif
3789 #if 0
3790           r = get_max_len(target, &d, env);
3791           if (r == 0 && d == 0) {
3792             /*  ()* ==> ()?, ()+ ==> ()  */
3793             qn->upper = 1;
3794             if (qn->lower > 1) qn->lower = 1;
3795             if (NTYPE(target) == NT_STR) {
3796               qn->upper = qn->lower = 0;  /* /(?:)+/ ==> // */
3797             }
3798           }
3799 #endif
3800         }
3801       }
3802 
3803       state |= IN_REPEAT;
3804       if (qn->lower != qn->upper)
3805         state |= IN_VAR_REPEAT;
3806       r = setup_tree(target, reg, state, env);
3807       if (r) break;
3808 
3809       /* expand string */
3810 #define EXPAND_STRING_MAX_LENGTH  100
3811       if (NTYPE(target) == NT_STR) {
3812         if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper &&
3813             qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3814           int len = NSTRING_LEN(target);
3815           StrNode* sn = NSTR(target);
3816 
3817           if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3818             int i, n = qn->lower;
3819             onig_node_conv_to_str_node(node, NSTR(target)->flag);
3820             for (i = 0; i < n; i++) {
3821               r = onig_node_str_cat(node, sn->s, sn->end);
3822               if (r) break;
3823             }
3824             onig_node_free(target);
3825             break; /* break case NT_QTFR: */
3826           }
3827         }
3828       }
3829 
3830 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
3831       if (qn->greedy && (qn->target_empty_info != 0)) {
3832         if (NTYPE(target) == NT_QTFR) {
3833           QtfrNode* tqn = NQTFR(target);
3834           if (IS_NOT_NULL(tqn->head_exact)) {
3835             qn->head_exact  = tqn->head_exact;
3836             tqn->head_exact = NULL;
3837           }
3838         }
3839         else {
3840           qn->head_exact = get_head_value_node(qn->target, 1, reg);
3841         }
3842       }
3843 #endif
3844     }
3845     break;
3846 
3847   case NT_ENCLOSE:
3848     {
3849       EncloseNode* en = NENCLOSE(node);
3850 
3851       switch (en->type) {
3852       case ENCLOSE_OPTION:
3853         {
3854           OnigOptionType options = reg->options;
3855           reg->options = NENCLOSE(node)->option;
3856           r = setup_tree(NENCLOSE(node)->target, reg, state, env);
3857           reg->options = options;
3858         }
3859         break;
3860 
3861       case ENCLOSE_MEMORY:
3862         if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_CALL)) != 0) {
3863           BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
3864           /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
3865         }
3866         if (IS_ENCLOSE_CALLED(en))
3867           state |= IN_CALL;
3868         if (IS_ENCLOSE_RECURSION(en))
3869           state |= IN_RECCALL;
3870         else if ((state & IN_RECCALL) != 0)
3871           SET_CALL_RECURSION(node);
3872         r = setup_tree(en->target, reg, state, env);
3873         break;
3874 
3875       case ENCLOSE_STOP_BACKTRACK:
3876         {
3877           Node* target = en->target;
3878           r = setup_tree(target, reg, state, env);
3879           if (NTYPE(target) == NT_QTFR) {
3880             QtfrNode* tqn = NQTFR(target);
3881             if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
3882                 tqn->greedy != 0) {  /* (?>a*), a*+ etc... */
3883               int qtype = NTYPE(tqn->target);
3884               if (IS_NODE_TYPE_SIMPLE(qtype))
3885                 SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
3886             }
3887           }
3888         }
3889         break;
3890       }
3891     }
3892     break;
3893 
3894   case NT_ANCHOR:
3895     {
3896       AnchorNode* an = NANCHOR(node);
3897 
3898       switch (an->type) {
3899       case ANCHOR_PREC_READ:
3900         r = setup_tree(an->target, reg, state, env);
3901         break;
3902       case ANCHOR_PREC_READ_NOT:
3903         r = setup_tree(an->target, reg, (state | IN_NOT), env);
3904         break;
3905 
3906 /* allowed node types in look-behind */
3907 #define ALLOWED_TYPE_IN_LB  \
3908   ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
3909     BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
3910 
3911 #define ALLOWED_ENCLOSE_IN_LB       ( ENCLOSE_MEMORY | ENCLOSE_OPTION )
3912 #define ALLOWED_ENCLOSE_IN_LB_NOT   ENCLOSE_OPTION
3913 
3914 #define ALLOWED_ANCHOR_IN_LB \
3915 ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
3916 
3917 #define ALLOWED_ANCHOR_IN_LB_NOT \
3918 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
3919 
3920       case ANCHOR_LOOK_BEHIND:
3921         {
3922           r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3923                               ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
3924           if (r < 0) return r;
3925           if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3926           r = setup_tree(an->target, reg, state, env);
3927           if (r != 0) return r;
3928           r = setup_look_behind(node, reg, env);
3929         }
3930         break;
3931 
3932       case ANCHOR_LOOK_BEHIND_NOT:
3933         {
3934           r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3935                    ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
3936           if (r < 0) return r;
3937           if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3938           r = setup_tree(an->target, reg, (state | IN_NOT), env);
3939           if (r != 0) return r;
3940           r = setup_look_behind(node, reg, env);
3941         }
3942         break;
3943       }
3944     }
3945     break;
3946 
3947   default:
3948     break;
3949   }
3950 
3951   return r;
3952 }
3953 
3954 /* set skip map for Boyer-Moore search */
3955 static int
set_bm_skip(UChar * s,UChar * end,OnigEncoding enc ARG_UNUSED,UChar skip[],int ** int_skip)3956 set_bm_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED,
3957 	    UChar skip[], int** int_skip)
3958 {
3959   int i, len;
3960 
3961   len = end - s;
3962   if (len < ONIG_CHAR_TABLE_SIZE) {
3963     for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len;
3964 
3965     for (i = 0; i < len - 1; i++)
3966       skip[s[i]] = len - 1 - i;
3967   }
3968   else {
3969     if (IS_NULL(*int_skip)) {
3970       *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
3971       if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
3972     }
3973     for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len;
3974 
3975     for (i = 0; i < len - 1; i++)
3976       (*int_skip)[s[i]] = len - 1 - i;
3977   }
3978   return 0;
3979 }
3980 
3981 #define OPT_EXACT_MAXLEN   24
3982 
3983 typedef struct {
3984   OnigLen min;  /* min byte length */
3985   OnigLen max;  /* max byte length */
3986 } MinMaxLen;
3987 
3988 typedef struct {
3989   MinMaxLen        mmd;
3990   OnigEncoding     enc;
3991   OnigOptionType   options;
3992   OnigCaseFoldType case_fold_flag;
3993   ScanEnv*         scan_env;
3994 } OptEnv;
3995 
3996 typedef struct {
3997   int left_anchor;
3998   int right_anchor;
3999 } OptAncInfo;
4000 
4001 typedef struct {
4002   MinMaxLen  mmd; /* info position */
4003   OptAncInfo anc;
4004 
4005   int   reach_end;
4006   int   ignore_case;
4007   int   len;
4008   UChar s[OPT_EXACT_MAXLEN];
4009 } OptExactInfo;
4010 
4011 typedef struct {
4012   MinMaxLen mmd; /* info position */
4013   OptAncInfo anc;
4014 
4015   int   value;      /* weighted value */
4016   UChar map[ONIG_CHAR_TABLE_SIZE];
4017 } OptMapInfo;
4018 
4019 typedef struct {
4020   MinMaxLen    len;
4021 
4022   OptAncInfo   anc;
4023   OptExactInfo exb;    /* boundary */
4024   OptExactInfo exm;    /* middle */
4025   OptExactInfo expr;   /* prec read (?=...) */
4026 
4027   OptMapInfo   map;   /* boundary */
4028 } NodeOptInfo;
4029 
4030 
4031 static int
map_position_value(OnigEncoding enc,int i)4032 map_position_value(OnigEncoding enc, int i)
4033 {
4034   static const short int ByteValTable[] = {
4035      5,  1,  1,  1,  1,  1,  1,  1,  1, 10, 10,  1,  1, 10,  1,  1,
4036      1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
4037     12,  4,  7,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,
4038      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,
4039      5,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
4040      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  6,  5,  5,  5,
4041      5,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
4042      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  1
4043   };
4044 
4045   if (i < (int )(sizeof(ByteValTable)/sizeof(ByteValTable[0]))) {
4046     if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
4047       return 20;
4048     else
4049       return (int )ByteValTable[i];
4050   }
4051   else
4052     return 4;   /* Take it easy. */
4053 }
4054 
4055 static int
distance_value(MinMaxLen * mm)4056 distance_value(MinMaxLen* mm)
4057 {
4058   /* 1000 / (min-max-dist + 1) */
4059   static const short int dist_vals[] = {
4060     1000,  500,  333,  250,  200,  167,  143,  125,  111,  100,
4061       91,   83,   77,   71,   67,   63,   59,   56,   53,   50,
4062       48,   45,   43,   42,   40,   38,   37,   36,   34,   33,
4063       32,   31,   30,   29,   29,   28,   27,   26,   26,   25,
4064       24,   24,   23,   23,   22,   22,   21,   21,   20,   20,
4065       20,   19,   19,   19,   18,   18,   18,   17,   17,   17,
4066       16,   16,   16,   16,   15,   15,   15,   15,   14,   14,
4067       14,   14,   14,   14,   13,   13,   13,   13,   13,   13,
4068       12,   12,   12,   12,   12,   12,   11,   11,   11,   11,
4069       11,   11,   11,   11,   11,   10,   10,   10,   10,   10
4070   };
4071 
4072   OnigLen d;
4073 
4074   if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
4075 
4076   d = mm->max - mm->min;
4077   if (d < (OnigLen )(sizeof(dist_vals)/sizeof(dist_vals[0])))
4078     /* return dist_vals[d] * 16 / (mm->min + 12); */
4079     return (int )dist_vals[d];
4080   else
4081     return 1;
4082 }
4083 
4084 static int
comp_distance_value(MinMaxLen * d1,MinMaxLen * d2,int v1,int v2)4085 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
4086 {
4087   if (v2 <= 0) return -1;
4088   if (v1 <= 0) return  1;
4089 
4090   v1 *= distance_value(d1);
4091   v2 *= distance_value(d2);
4092 
4093   if (v2 > v1) return  1;
4094   if (v2 < v1) return -1;
4095 
4096   if (d2->min < d1->min) return  1;
4097   if (d2->min > d1->min) return -1;
4098   return 0;
4099 }
4100 
4101 static int
is_equal_mml(MinMaxLen * a,MinMaxLen * b)4102 is_equal_mml(MinMaxLen* a, MinMaxLen* b)
4103 {
4104   return (a->min == b->min && a->max == b->max) ? 1 : 0;
4105 }
4106 
4107 
4108 static void
set_mml(MinMaxLen * mml,OnigLen min,OnigLen max)4109 set_mml(MinMaxLen* mml, OnigLen min, OnigLen max)
4110 {
4111   mml->min = min;
4112   mml->max = max;
4113 }
4114 
4115 static void
clear_mml(MinMaxLen * mml)4116 clear_mml(MinMaxLen* mml)
4117 {
4118   mml->min = mml->max = 0;
4119 }
4120 
4121 static void
copy_mml(MinMaxLen * to,MinMaxLen * from)4122 copy_mml(MinMaxLen* to, MinMaxLen* from)
4123 {
4124   to->min = from->min;
4125   to->max = from->max;
4126 }
4127 
4128 static void
add_mml(MinMaxLen * to,MinMaxLen * from)4129 add_mml(MinMaxLen* to, MinMaxLen* from)
4130 {
4131   to->min = distance_add(to->min, from->min);
4132   to->max = distance_add(to->max, from->max);
4133 }
4134 
4135 #if 0
4136 static void
4137 add_len_mml(MinMaxLen* to, OnigLen len)
4138 {
4139   to->min = distance_add(to->min, len);
4140   to->max = distance_add(to->max, len);
4141 }
4142 #endif
4143 
4144 static void
alt_merge_mml(MinMaxLen * to,MinMaxLen * from)4145 alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
4146 {
4147   if (to->min > from->min) to->min = from->min;
4148   if (to->max < from->max) to->max = from->max;
4149 }
4150 
4151 static void
copy_opt_env(OptEnv * to,OptEnv * from)4152 copy_opt_env(OptEnv* to, OptEnv* from)
4153 {
4154   *to = *from;
4155 }
4156 
4157 static void
clear_opt_anc_info(OptAncInfo * anc)4158 clear_opt_anc_info(OptAncInfo* anc)
4159 {
4160   anc->left_anchor  = 0;
4161   anc->right_anchor = 0;
4162 }
4163 
4164 static void
copy_opt_anc_info(OptAncInfo * to,OptAncInfo * from)4165 copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
4166 {
4167   *to = *from;
4168 }
4169 
4170 static void
concat_opt_anc_info(OptAncInfo * to,OptAncInfo * left,OptAncInfo * right,OnigLen left_len,OnigLen right_len)4171 concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
4172 		    OnigLen left_len, OnigLen right_len)
4173 {
4174   clear_opt_anc_info(to);
4175 
4176   to->left_anchor = left->left_anchor;
4177   if (left_len == 0) {
4178     to->left_anchor |= right->left_anchor;
4179   }
4180 
4181   to->right_anchor = right->right_anchor;
4182   if (right_len == 0) {
4183     to->right_anchor |= left->right_anchor;
4184   }
4185   else {
4186     to->right_anchor |= (left->right_anchor & ANCHOR_PREC_READ_NOT);
4187   }
4188 }
4189 
4190 static int
is_left_anchor(int anc)4191 is_left_anchor(int anc)
4192 {
4193   if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
4194       anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
4195       anc == ANCHOR_PREC_READ_NOT)
4196     return 0;
4197 
4198   return 1;
4199 }
4200 
4201 static int
is_set_opt_anc_info(OptAncInfo * to,int anc)4202 is_set_opt_anc_info(OptAncInfo* to, int anc)
4203 {
4204   if ((to->left_anchor & anc) != 0) return 1;
4205 
4206   return ((to->right_anchor & anc) != 0 ? 1 : 0);
4207 }
4208 
4209 static void
add_opt_anc_info(OptAncInfo * to,int anc)4210 add_opt_anc_info(OptAncInfo* to, int anc)
4211 {
4212   if (is_left_anchor(anc))
4213     to->left_anchor |= anc;
4214   else
4215     to->right_anchor |= anc;
4216 }
4217 
4218 static void
remove_opt_anc_info(OptAncInfo * to,int anc)4219 remove_opt_anc_info(OptAncInfo* to, int anc)
4220 {
4221   if (is_left_anchor(anc))
4222     to->left_anchor &= ~anc;
4223   else
4224     to->right_anchor &= ~anc;
4225 }
4226 
4227 static void
alt_merge_opt_anc_info(OptAncInfo * to,OptAncInfo * add)4228 alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
4229 {
4230   to->left_anchor  &= add->left_anchor;
4231   to->right_anchor &= add->right_anchor;
4232 }
4233 
4234 static int
is_full_opt_exact_info(OptExactInfo * ex)4235 is_full_opt_exact_info(OptExactInfo* ex)
4236 {
4237   return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
4238 }
4239 
4240 static void
clear_opt_exact_info(OptExactInfo * ex)4241 clear_opt_exact_info(OptExactInfo* ex)
4242 {
4243   clear_mml(&ex->mmd);
4244   clear_opt_anc_info(&ex->anc);
4245   ex->reach_end   = 0;
4246   ex->ignore_case = 0;
4247   ex->len         = 0;
4248   ex->s[0]        = '\0';
4249 }
4250 
4251 static void
copy_opt_exact_info(OptExactInfo * to,OptExactInfo * from)4252 copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
4253 {
4254   *to = *from;
4255 }
4256 
4257 static void
concat_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OnigEncoding enc)4258 concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
4259 {
4260   int i, j, len;
4261   UChar *p, *end;
4262   OptAncInfo tanc;
4263 
4264   if (! to->ignore_case && add->ignore_case) {
4265     if (to->len >= add->len) return ;  /* avoid */
4266 
4267     to->ignore_case = 1;
4268   }
4269 
4270   p = add->s;
4271   end = p + add->len;
4272   for (i = to->len; p < end; ) {
4273     len = enclen(enc, p);
4274     if (i + len > OPT_EXACT_MAXLEN) break;
4275     for (j = 0; j < len && p < end; j++)
4276       to->s[i++] = *p++;
4277   }
4278 
4279   to->len = i;
4280   to->reach_end = (p == end ? add->reach_end : 0);
4281 
4282   concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4283   if (! to->reach_end) tanc.right_anchor = 0;
4284   copy_opt_anc_info(&to->anc, &tanc);
4285 }
4286 
4287 static void
concat_opt_exact_info_str(OptExactInfo * to,UChar * s,UChar * end,int raw ARG_UNUSED,OnigEncoding enc)4288 concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
4289 			  int raw ARG_UNUSED, OnigEncoding enc)
4290 {
4291   int i, j, len;
4292   UChar *p;
4293 
4294   for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4295     len = enclen(enc, p);
4296     if (i + len > OPT_EXACT_MAXLEN) break;
4297     for (j = 0; j < len && p < end; j++)
4298       to->s[i++] = *p++;
4299   }
4300 
4301   to->len = i;
4302 }
4303 
4304 static void
alt_merge_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OptEnv * env)4305 alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4306 {
4307   int i, j, len;
4308 
4309   if (add->len == 0 || to->len == 0) {
4310     clear_opt_exact_info(to);
4311     return ;
4312   }
4313 
4314   if (! is_equal_mml(&to->mmd, &add->mmd)) {
4315     clear_opt_exact_info(to);
4316     return ;
4317   }
4318 
4319   for (i = 0; i < to->len && i < add->len; ) {
4320     if (to->s[i] != add->s[i]) break;
4321     len = enclen(env->enc, to->s + i);
4322 
4323     for (j = 1; j < len; j++) {
4324       if (to->s[i+j] != add->s[i+j]) break;
4325     }
4326     if (j < len) break;
4327     i += len;
4328   }
4329 
4330   if (! add->reach_end || i < add->len || i < to->len) {
4331     to->reach_end = 0;
4332   }
4333   to->len = i;
4334   to->ignore_case |= add->ignore_case;
4335 
4336   alt_merge_opt_anc_info(&to->anc, &add->anc);
4337   if (! to->reach_end) to->anc.right_anchor = 0;
4338 }
4339 
4340 static void
select_opt_exact_info(OnigEncoding enc,OptExactInfo * now,OptExactInfo * alt)4341 select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4342 {
4343   int v1, v2;
4344 
4345   v1 = now->len;
4346   v2 = alt->len;
4347 
4348   if (v2 == 0) {
4349     return ;
4350   }
4351   else if (v1 == 0) {
4352     copy_opt_exact_info(now, alt);
4353     return ;
4354   }
4355   else if (v1 <= 2 && v2 <= 2) {
4356     /* ByteValTable[x] is big value --> low price */
4357     v2 = map_position_value(enc, now->s[0]);
4358     v1 = map_position_value(enc, alt->s[0]);
4359 
4360     if (now->len > 1) v1 += 5;
4361     if (alt->len > 1) v2 += 5;
4362   }
4363 
4364   if (now->ignore_case == 0) v1 *= 2;
4365   if (alt->ignore_case == 0) v2 *= 2;
4366 
4367   if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4368     copy_opt_exact_info(now, alt);
4369 }
4370 
4371 static void
clear_opt_map_info(OptMapInfo * map)4372 clear_opt_map_info(OptMapInfo* map)
4373 {
4374   static const OptMapInfo clean_info = {
4375     {0, 0}, {0, 0}, 0,
4376     {
4377       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4378       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4379       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4380       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4381       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4382       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4383       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4384       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4385       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4386       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4387       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4388       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4389       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4390       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4391       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4392       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4393     }
4394   };
4395 
4396   xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4397 }
4398 
4399 static void
copy_opt_map_info(OptMapInfo * to,OptMapInfo * from)4400 copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4401 {
4402   *to = *from;
4403 }
4404 
4405 static void
add_char_opt_map_info(OptMapInfo * map,UChar c,OnigEncoding enc)4406 add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4407 {
4408   if (map->map[c] == 0) {
4409     map->map[c] = 1;
4410     map->value += map_position_value(enc, c);
4411   }
4412 }
4413 
4414 static int
add_char_amb_opt_map_info(OptMapInfo * map,UChar * p,UChar * end,OnigEncoding enc,OnigCaseFoldType case_fold_flag)4415 add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4416                           OnigEncoding enc, OnigCaseFoldType case_fold_flag)
4417 {
4418   OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4419   UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4420   int i, n;
4421 
4422   add_char_opt_map_info(map, p[0], enc);
4423 
4424   case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
4425   n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
4426   if (n < 0) return n;
4427 
4428   for (i = 0; i < n; i++) {
4429     ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
4430     add_char_opt_map_info(map, buf[0], enc);
4431   }
4432 
4433   return 0;
4434 }
4435 
4436 static void
select_opt_map_info(OptMapInfo * now,OptMapInfo * alt)4437 select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4438 {
4439   static int z = 1<<15; /* 32768: something big value */
4440 
4441   int v1, v2;
4442 
4443   if (alt->value == 0) return ;
4444   if (now->value == 0) {
4445     copy_opt_map_info(now, alt);
4446     return ;
4447   }
4448 
4449   v1 = z / now->value;
4450   v2 = z / alt->value;
4451   if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4452     copy_opt_map_info(now, alt);
4453 }
4454 
4455 static int
comp_opt_exact_or_map_info(OptExactInfo * e,OptMapInfo * m)4456 comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4457 {
4458 #define COMP_EM_BASE  20
4459   int ve, vm;
4460 
4461   if (m->value <= 0) return -1;
4462 
4463   ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2);
4464   vm = COMP_EM_BASE * 5 * 2 / m->value;
4465   return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4466 }
4467 
4468 static void
alt_merge_opt_map_info(OnigEncoding enc,OptMapInfo * to,OptMapInfo * add)4469 alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4470 {
4471   int i, val;
4472 
4473   /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4474   if (to->value == 0) return ;
4475   if (add->value == 0 || to->mmd.max < add->mmd.min) {
4476     clear_opt_map_info(to);
4477     return ;
4478   }
4479 
4480   alt_merge_mml(&to->mmd, &add->mmd);
4481 
4482   val = 0;
4483   for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4484     if (add->map[i])
4485       to->map[i] = 1;
4486 
4487     if (to->map[i])
4488       val += map_position_value(enc, i);
4489   }
4490   to->value = val;
4491 
4492   alt_merge_opt_anc_info(&to->anc, &add->anc);
4493 }
4494 
4495 static void
set_bound_node_opt_info(NodeOptInfo * opt,MinMaxLen * mmd)4496 set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4497 {
4498   copy_mml(&(opt->exb.mmd),  mmd);
4499   copy_mml(&(opt->expr.mmd), mmd);
4500   copy_mml(&(opt->map.mmd),  mmd);
4501 }
4502 
4503 static void
clear_node_opt_info(NodeOptInfo * opt)4504 clear_node_opt_info(NodeOptInfo* opt)
4505 {
4506   clear_mml(&opt->len);
4507   clear_opt_anc_info(&opt->anc);
4508   clear_opt_exact_info(&opt->exb);
4509   clear_opt_exact_info(&opt->exm);
4510   clear_opt_exact_info(&opt->expr);
4511   clear_opt_map_info(&opt->map);
4512 }
4513 
4514 static void
copy_node_opt_info(NodeOptInfo * to,NodeOptInfo * from)4515 copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4516 {
4517   *to = *from;
4518 }
4519 
4520 static void
concat_left_node_opt_info(OnigEncoding enc,NodeOptInfo * to,NodeOptInfo * add)4521 concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4522 {
4523   int exb_reach, exm_reach;
4524   OptAncInfo tanc;
4525 
4526   concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4527   copy_opt_anc_info(&to->anc, &tanc);
4528 
4529   if (add->exb.len > 0 && to->len.max == 0) {
4530     concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4531 			to->len.max, add->len.max);
4532     copy_opt_anc_info(&add->exb.anc, &tanc);
4533   }
4534 
4535   if (add->map.value > 0 && to->len.max == 0) {
4536     if (add->map.mmd.max == 0)
4537       add->map.anc.left_anchor |= to->anc.left_anchor;
4538   }
4539 
4540   exb_reach = to->exb.reach_end;
4541   exm_reach = to->exm.reach_end;
4542 
4543   if (add->len.max != 0)
4544     to->exb.reach_end = to->exm.reach_end = 0;
4545 
4546   if (add->exb.len > 0) {
4547     if (exb_reach) {
4548       concat_opt_exact_info(&to->exb, &add->exb, enc);
4549       clear_opt_exact_info(&add->exb);
4550     }
4551     else if (exm_reach) {
4552       concat_opt_exact_info(&to->exm, &add->exb, enc);
4553       clear_opt_exact_info(&add->exb);
4554     }
4555   }
4556   select_opt_exact_info(enc, &to->exm, &add->exb);
4557   select_opt_exact_info(enc, &to->exm, &add->exm);
4558 
4559   if (to->expr.len > 0) {
4560     if (add->len.max > 0) {
4561       if (to->expr.len > (int )add->len.max)
4562         to->expr.len = add->len.max;
4563 
4564       if (to->expr.mmd.max == 0)
4565         select_opt_exact_info(enc, &to->exb, &to->expr);
4566       else
4567         select_opt_exact_info(enc, &to->exm, &to->expr);
4568     }
4569   }
4570   else if (add->expr.len > 0) {
4571     copy_opt_exact_info(&to->expr, &add->expr);
4572   }
4573 
4574   select_opt_map_info(&to->map, &add->map);
4575 
4576   add_mml(&to->len, &add->len);
4577 }
4578 
4579 static void
alt_merge_node_opt_info(NodeOptInfo * to,NodeOptInfo * add,OptEnv * env)4580 alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4581 {
4582   alt_merge_opt_anc_info  (&to->anc,  &add->anc);
4583   alt_merge_opt_exact_info(&to->exb,  &add->exb, env);
4584   alt_merge_opt_exact_info(&to->exm,  &add->exm, env);
4585   alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4586   alt_merge_opt_map_info(env->enc, &to->map,  &add->map);
4587 
4588   alt_merge_mml(&to->len, &add->len);
4589 }
4590 
4591 
4592 #define MAX_NODE_OPT_INFO_REF_COUNT    5
4593 
4594 static int
optimize_node_left(Node * node,NodeOptInfo * opt,OptEnv * env)4595 optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4596 {
4597   int type;
4598   int r = 0;
4599 
4600   clear_node_opt_info(opt);
4601   set_bound_node_opt_info(opt, &env->mmd);
4602 
4603   type = NTYPE(node);
4604   switch (type) {
4605   case NT_LIST:
4606     {
4607       OptEnv nenv;
4608       NodeOptInfo nopt;
4609       Node* nd = node;
4610 
4611       copy_opt_env(&nenv, env);
4612       do {
4613         r = optimize_node_left(NCAR(nd), &nopt, &nenv);
4614         if (r == 0) {
4615           add_mml(&nenv.mmd, &nopt.len);
4616           concat_left_node_opt_info(env->enc, opt, &nopt);
4617         }
4618       } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
4619     }
4620     break;
4621 
4622   case NT_ALT:
4623     {
4624       NodeOptInfo nopt;
4625       Node* nd = node;
4626 
4627       do {
4628         r = optimize_node_left(NCAR(nd), &nopt, env);
4629         if (r == 0) {
4630           if (nd == node) copy_node_opt_info(opt, &nopt);
4631           else            alt_merge_node_opt_info(opt, &nopt, env);
4632         }
4633       } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
4634     }
4635     break;
4636 
4637   case NT_STR:
4638     {
4639       StrNode* sn = NSTR(node);
4640       int slen = sn->end - sn->s;
4641       int is_raw = NSTRING_IS_RAW(node);
4642 
4643       if (! NSTRING_IS_AMBIG(node)) {
4644         concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4645                                   NSTRING_IS_RAW(node), env->enc);
4646         if (slen > 0) {
4647           add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
4648         }
4649         set_mml(&opt->len, slen, slen);
4650       }
4651       else {
4652         int max;
4653 
4654         if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
4655           int n = onigenc_strlen(env->enc, sn->s, sn->end);
4656           max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
4657         }
4658         else {
4659           concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4660                                     is_raw, env->enc);
4661           opt->exb.ignore_case = 1;
4662 
4663           if (slen > 0) {
4664             r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
4665                                           env->enc, env->case_fold_flag);
4666             if (r != 0) break;
4667           }
4668 
4669           max = slen;
4670         }
4671 
4672         set_mml(&opt->len, slen, max);
4673       }
4674 
4675       if (opt->exb.len == slen)
4676         opt->exb.reach_end = 1;
4677     }
4678     break;
4679 
4680   case NT_CCLASS:
4681     {
4682       int i, z;
4683       CClassNode* cc = NCCLASS(node);
4684 
4685       /* no need to check ignore case. (set in setup_tree()) */
4686 
4687       if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
4688         OnigLen min = ONIGENC_MBC_MINLEN(env->enc);
4689         OnigLen max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4690 
4691         set_mml(&opt->len, min, max);
4692       }
4693       else {
4694         for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4695           z = BITSET_AT(cc->bs, i);
4696           if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
4697             add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4698           }
4699         }
4700         set_mml(&opt->len, 1, 1);
4701       }
4702     }
4703     break;
4704 
4705   case NT_CTYPE:
4706     {
4707       int i, min, max;
4708 
4709       max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4710 
4711       if (max == 1) {
4712         min = 1;
4713 
4714         switch (NCTYPE(node)->ctype) {
4715         case ONIGENC_CTYPE_WORD:
4716           if (NCTYPE(node)->not != 0) {
4717             for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4718               if (! ONIGENC_IS_CODE_WORD(env->enc, i)) {
4719                 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4720               }
4721             }
4722           }
4723           else {
4724             for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4725               if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
4726                 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4727               }
4728             }
4729           }
4730           break;
4731         }
4732       }
4733       else {
4734         min = ONIGENC_MBC_MINLEN(env->enc);
4735       }
4736       set_mml(&opt->len, min, max);
4737     }
4738     break;
4739 
4740   case NT_CANY:
4741     {
4742       OnigLen min = ONIGENC_MBC_MINLEN(env->enc);
4743       OnigLen max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4744       set_mml(&opt->len, min, max);
4745     }
4746     break;
4747 
4748   case NT_ANCHOR:
4749     switch (NANCHOR(node)->type) {
4750     case ANCHOR_BEGIN_BUF:
4751     case ANCHOR_BEGIN_POSITION:
4752     case ANCHOR_BEGIN_LINE:
4753     case ANCHOR_END_BUF:
4754     case ANCHOR_SEMI_END_BUF:
4755     case ANCHOR_END_LINE:
4756     case ANCHOR_PREC_READ_NOT:
4757     case ANCHOR_LOOK_BEHIND:
4758       add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
4759       break;
4760 
4761     case ANCHOR_PREC_READ:
4762       {
4763         NodeOptInfo nopt;
4764 
4765         r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
4766         if (r == 0) {
4767           if (nopt.exb.len > 0)
4768             copy_opt_exact_info(&opt->expr, &nopt.exb);
4769           else if (nopt.exm.len > 0)
4770             copy_opt_exact_info(&opt->expr, &nopt.exm);
4771 
4772           opt->expr.reach_end = 0;
4773 
4774           if (nopt.map.value > 0)
4775             copy_opt_map_info(&opt->map, &nopt.map);
4776         }
4777       }
4778       break;
4779 
4780     case ANCHOR_LOOK_BEHIND_NOT:
4781       break;
4782     }
4783     break;
4784 
4785   case NT_BREF:
4786     {
4787       int i;
4788       int* backs;
4789       OnigLen min, max, tmin, tmax;
4790       Node** nodes = SCANENV_MEM_NODES(env->scan_env);
4791       BRefNode* br = NBREF(node);
4792 
4793       if (br->state & NST_RECURSION) {
4794         set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4795         break;
4796       }
4797       backs = BACKREFS_P(br);
4798       r = get_min_len(nodes[backs[0]], &min, env->scan_env);
4799       if (r != 0) break;
4800       r = get_max_len(nodes[backs[0]], &max, env->scan_env);
4801       if (r != 0) break;
4802       for (i = 1; i < br->back_num; i++) {
4803         r = get_min_len(nodes[backs[i]], &tmin, env->scan_env);
4804         if (r != 0) break;
4805         r = get_max_len(nodes[backs[i]], &tmax, env->scan_env);
4806         if (r != 0) break;
4807         if (min > tmin) min = tmin;
4808         if (max < tmax) max = tmax;
4809       }
4810       if (r == 0) set_mml(&opt->len, min, max);
4811     }
4812     break;
4813 
4814 #ifdef USE_SUBEXP_CALL
4815   case NT_CALL:
4816     if (IS_CALL_RECURSION(NCALL(node)))
4817       set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4818     else {
4819       OnigOptionType save = env->options;
4820       env->options = NENCLOSE(NCALL(node)->target)->option;
4821       r = optimize_node_left(NCALL(node)->target, opt, env);
4822       env->options = save;
4823     }
4824     break;
4825 #endif
4826 
4827   case NT_QTFR:
4828     {
4829       int i;
4830       OnigLen min, max;
4831       NodeOptInfo nopt;
4832       QtfrNode* qn = NQTFR(node);
4833 
4834       r = optimize_node_left(qn->target, &nopt, env);
4835       if (r) break;
4836 
4837       if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
4838         if (env->mmd.max == 0 &&
4839             NTYPE(qn->target) == NT_CANY && qn->greedy) {
4840           if (IS_MULTILINE(env->options))
4841             add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
4842           else
4843             add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
4844         }
4845       }
4846       else {
4847         if (qn->lower > 0) {
4848           copy_node_opt_info(opt, &nopt);
4849           if (nopt.exb.len > 0) {
4850             if (nopt.exb.reach_end) {
4851               for (i = 2; i <= qn->lower &&
4852                      ! is_full_opt_exact_info(&opt->exb); i++) {
4853                 concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
4854               }
4855               if (i < qn->lower) {
4856                 opt->exb.reach_end = 0;
4857               }
4858             }
4859           }
4860 
4861           if (qn->lower != qn->upper) {
4862             opt->exb.reach_end = 0;
4863             opt->exm.reach_end = 0;
4864           }
4865           if (qn->lower > 1)
4866             opt->exm.reach_end = 0;
4867         }
4868       }
4869 
4870       min = distance_multiply(nopt.len.min, qn->lower);
4871       if (IS_REPEAT_INFINITE(qn->upper))
4872         max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
4873       else
4874         max = distance_multiply(nopt.len.max, qn->upper);
4875 
4876       set_mml(&opt->len, min, max);
4877     }
4878     break;
4879 
4880   case NT_ENCLOSE:
4881     {
4882       EncloseNode* en = NENCLOSE(node);
4883 
4884       switch (en->type) {
4885       case ENCLOSE_OPTION:
4886         {
4887           OnigOptionType save = env->options;
4888 
4889           env->options = en->option;
4890           r = optimize_node_left(en->target, opt, env);
4891           env->options = save;
4892         }
4893         break;
4894 
4895       case ENCLOSE_MEMORY:
4896 #ifdef USE_SUBEXP_CALL
4897         en->opt_count++;
4898         if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
4899           OnigLen min, max;
4900 
4901           min = 0;
4902           max = ONIG_INFINITE_DISTANCE;
4903           if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
4904           if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
4905           set_mml(&opt->len, min, max);
4906         }
4907         else
4908 #endif
4909           {
4910             r = optimize_node_left(en->target, opt, env);
4911 
4912             if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
4913               if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
4914                 remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
4915             }
4916           }
4917         break;
4918 
4919       case ENCLOSE_STOP_BACKTRACK:
4920         r = optimize_node_left(en->target, opt, env);
4921         break;
4922       }
4923     }
4924     break;
4925 
4926   default:
4927 #ifdef ONIG_DEBUG
4928     fprintf(stderr, "optimize_node_left: undefined node type %d\n",
4929 	    NTYPE(node));
4930 #endif
4931     r = ONIGERR_TYPE_BUG;
4932     break;
4933   }
4934 
4935   return r;
4936 }
4937 
4938 static int
set_optimize_exact_info(regex_t * reg,OptExactInfo * e)4939 set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
4940 {
4941   int r;
4942 
4943   if (e->len == 0) return 0;
4944 
4945   if (e->ignore_case) {
4946     reg->exact = (UChar* )xmalloc(e->len);
4947     CHECK_NULL_RETURN_MEMERR(reg->exact);
4948     xmemcpy(reg->exact, e->s, e->len);
4949     reg->exact_end = reg->exact + e->len;
4950     reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
4951   }
4952   else {
4953     int allow_reverse;
4954 
4955     reg->exact = str_dup(e->s, e->s + e->len);
4956     CHECK_NULL_RETURN_MEMERR(reg->exact);
4957     reg->exact_end = reg->exact + e->len;
4958 
4959     allow_reverse =
4960       ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
4961 
4962     if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
4963       r = set_bm_skip(reg->exact, reg->exact_end, reg->enc,
4964 	              reg->map, &(reg->int_map));
4965       if (r) return r;
4966 
4967       reg->optimize = (allow_reverse != 0
4968 		       ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
4969     }
4970     else {
4971       reg->optimize = ONIG_OPTIMIZE_EXACT;
4972     }
4973   }
4974 
4975   reg->dmin = e->mmd.min;
4976   reg->dmax = e->mmd.max;
4977 
4978   if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4979     reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact);
4980   }
4981 
4982   return 0;
4983 }
4984 
4985 static void
set_optimize_map_info(regex_t * reg,OptMapInfo * m)4986 set_optimize_map_info(regex_t* reg, OptMapInfo* m)
4987 {
4988   int i;
4989 
4990   for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4991     reg->map[i] = m->map[i];
4992 
4993   reg->optimize   = ONIG_OPTIMIZE_MAP;
4994   reg->dmin       = m->mmd.min;
4995   reg->dmax       = m->mmd.max;
4996 
4997   if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4998     reg->threshold_len = reg->dmin + 1;
4999   }
5000 }
5001 
5002 static void
set_sub_anchor(regex_t * reg,OptAncInfo * anc)5003 set_sub_anchor(regex_t* reg, OptAncInfo* anc)
5004 {
5005   reg->sub_anchor |= anc->left_anchor  & ANCHOR_BEGIN_LINE;
5006   reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
5007 }
5008 
5009 #ifdef ONIG_DEBUG
5010 static void print_optimize_info(FILE* f, regex_t* reg);
5011 #endif
5012 
5013 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ScanEnv * scan_env)5014 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
5015 {
5016 
5017   int r;
5018   NodeOptInfo opt;
5019   OptEnv env;
5020 
5021   env.enc            = reg->enc;
5022   env.options        = reg->options;
5023   env.case_fold_flag = reg->case_fold_flag;
5024   env.scan_env   = scan_env;
5025   clear_mml(&env.mmd);
5026 
5027   r = optimize_node_left(node, &opt, &env);
5028   if (r) return r;
5029 
5030   reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
5031         ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML |
5032         ANCHOR_LOOK_BEHIND);
5033 
5034   if ((opt.anc.left_anchor & (ANCHOR_LOOK_BEHIND | ANCHOR_PREC_READ_NOT)) != 0)
5035     reg->anchor &= ~ANCHOR_ANYCHAR_STAR_ML;
5036 
5037   reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF |
5038        ANCHOR_PREC_READ_NOT);
5039 
5040   if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
5041     reg->anchor_dmin = opt.len.min;
5042     reg->anchor_dmax = opt.len.max;
5043   }
5044 
5045   if (opt.exb.len > 0 || opt.exm.len > 0) {
5046     select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
5047     if (opt.map.value > 0 &&
5048         comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
5049       goto set_map;
5050     }
5051     else {
5052       r = set_optimize_exact_info(reg, &opt.exb);
5053       set_sub_anchor(reg, &opt.exb.anc);
5054     }
5055   }
5056   else if (opt.map.value > 0) {
5057   set_map:
5058     set_optimize_map_info(reg, &opt.map);
5059     set_sub_anchor(reg, &opt.map.anc);
5060   }
5061   else {
5062     reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
5063     if (opt.len.max == 0)
5064       reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
5065   }
5066 
5067 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5068   print_optimize_info(stderr, reg);
5069 #endif
5070   return r;
5071 }
5072 
5073 static void
clear_optimize_info(regex_t * reg)5074 clear_optimize_info(regex_t* reg)
5075 {
5076   reg->optimize      = ONIG_OPTIMIZE_NONE;
5077   reg->anchor        = 0;
5078   reg->anchor_dmin   = 0;
5079   reg->anchor_dmax   = 0;
5080   reg->sub_anchor    = 0;
5081   reg->exact_end     = (UChar* )NULL;
5082   reg->threshold_len = 0;
5083   if (IS_NOT_NULL(reg->exact)) {
5084     xfree(reg->exact);
5085     reg->exact = (UChar* )NULL;
5086   }
5087 }
5088 
5089 #ifdef ONIG_DEBUG
5090 
print_enc_string(FILE * fp,OnigEncoding enc,const UChar * s,const UChar * end)5091 static void print_enc_string(FILE* fp, OnigEncoding enc,
5092 			     const UChar *s, const UChar *end)
5093 {
5094   fprintf(fp, "\nPATTERN: /");
5095 
5096   if (ONIGENC_MBC_MINLEN(enc) > 1) {
5097     const UChar *p;
5098     OnigCodePoint code;
5099 
5100     p = s;
5101     while (p < end) {
5102       code = ONIGENC_MBC_TO_CODE(enc, p, end);
5103       if (code >= 0x80) {
5104         fprintf(fp, " 0x%04x ", (int )code);
5105       }
5106       else {
5107         fputc((int )code, fp);
5108       }
5109 
5110       p += enclen(enc, p);
5111     }
5112   }
5113   else {
5114     while (s < end) {
5115       fputc((int )*s, fp);
5116       s++;
5117     }
5118   }
5119 
5120   fprintf(fp, "/\n");
5121 }
5122 
5123 static void
print_distance_range(FILE * f,OnigLen a,OnigLen b)5124 print_distance_range(FILE* f, OnigLen a, OnigLen b)
5125 {
5126   if (a == ONIG_INFINITE_DISTANCE)
5127     fputs("inf", f);
5128   else
5129     fprintf(f, "(%u)", a);
5130 
5131   fputs("-", f);
5132 
5133   if (b == ONIG_INFINITE_DISTANCE)
5134     fputs("inf", f);
5135   else
5136     fprintf(f, "(%u)", b);
5137 }
5138 
5139 static void
print_anchor(FILE * f,int anchor)5140 print_anchor(FILE* f, int anchor)
5141 {
5142   int q = 0;
5143 
5144   fprintf(f, "[");
5145 
5146   if (anchor & ANCHOR_BEGIN_BUF) {
5147     fprintf(f, "begin-buf");
5148     q = 1;
5149   }
5150   if (anchor & ANCHOR_BEGIN_LINE) {
5151     if (q) fprintf(f, ", ");
5152     q = 1;
5153     fprintf(f, "begin-line");
5154   }
5155   if (anchor & ANCHOR_BEGIN_POSITION) {
5156     if (q) fprintf(f, ", ");
5157     q = 1;
5158     fprintf(f, "begin-pos");
5159   }
5160   if (anchor & ANCHOR_END_BUF) {
5161     if (q) fprintf(f, ", ");
5162     q = 1;
5163     fprintf(f, "end-buf");
5164   }
5165   if (anchor & ANCHOR_SEMI_END_BUF) {
5166     if (q) fprintf(f, ", ");
5167     q = 1;
5168     fprintf(f, "semi-end-buf");
5169   }
5170   if (anchor & ANCHOR_END_LINE) {
5171     if (q) fprintf(f, ", ");
5172     q = 1;
5173     fprintf(f, "end-line");
5174   }
5175   if (anchor & ANCHOR_ANYCHAR_STAR) {
5176     if (q) fprintf(f, ", ");
5177     q = 1;
5178     fprintf(f, "anychar-star");
5179   }
5180   if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
5181     if (q) fprintf(f, ", ");
5182     fprintf(f, "anychar-star-ml");
5183   }
5184 
5185   fprintf(f, "]");
5186 }
5187 
5188 static void
print_optimize_info(FILE * f,regex_t * reg)5189 print_optimize_info(FILE* f, regex_t* reg)
5190 {
5191   static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5192                               "EXACT_IC", "MAP" };
5193 
5194   fprintf(f, "optimize: %s\n", on[reg->optimize]);
5195   fprintf(f, "  anchor: "); print_anchor(f, reg->anchor);
5196   if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
5197     print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
5198   fprintf(f, "\n");
5199 
5200   if (reg->optimize) {
5201     fprintf(f, "  sub anchor: "); print_anchor(f, reg->sub_anchor);
5202     fprintf(f, "\n");
5203   }
5204   fprintf(f, "\n");
5205 
5206   if (reg->exact) {
5207     UChar *p;
5208     fprintf(f, "exact: [");
5209     for (p = reg->exact; p < reg->exact_end; p++) {
5210       fputc(*p, f);
5211     }
5212     fprintf(f, "]: length: %ld\n", (reg->exact_end - reg->exact));
5213   }
5214   else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
5215     int c, i, n = 0;
5216 
5217     for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5218       if (reg->map[i]) n++;
5219 
5220     fprintf(f, "map: n=%d\n", n);
5221     if (n > 0) {
5222       c = 0;
5223       fputc('[', f);
5224       for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
5225         if (reg->map[i] != 0) {
5226           if (c > 0)  fputs(", ", f);
5227           c++;
5228           if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
5229               ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
5230             fputc(i, f);
5231           else
5232             fprintf(f, "%d", i);
5233         }
5234       }
5235       fprintf(f, "]\n");
5236     }
5237   }
5238 }
5239 #endif /* ONIG_DEBUG */
5240 
5241 
5242 extern void
onig_free_body(regex_t * reg)5243 onig_free_body(regex_t* reg)
5244 {
5245   if (IS_NOT_NULL(reg)) {
5246     if (IS_NOT_NULL(reg->p))                xfree(reg->p);
5247     if (IS_NOT_NULL(reg->exact))            xfree(reg->exact);
5248     if (IS_NOT_NULL(reg->int_map))          xfree(reg->int_map);
5249     if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
5250     if (IS_NOT_NULL(reg->repeat_range))     xfree(reg->repeat_range);
5251     if (IS_NOT_NULL(reg->chain))            onig_free(reg->chain);
5252 
5253 #ifdef USE_NAMED_GROUP
5254     onig_names_free(reg);
5255 #endif
5256   }
5257 }
5258 
5259 extern void
onig_free(regex_t * reg)5260 onig_free(regex_t* reg)
5261 {
5262   if (IS_NOT_NULL(reg)) {
5263     onig_free_body(reg);
5264     xfree(reg);
5265   }
5266 }
5267 
5268 #define REGEX_TRANSFER(to,from) do {\
5269   onig_free_body(to);\
5270   xmemcpy(to, from, sizeof(regex_t));\
5271   xfree(from);\
5272 } while (0)
5273 
5274 extern void
onig_transfer(regex_t * to,regex_t * from)5275 onig_transfer(regex_t* to, regex_t* from)
5276 {
5277   REGEX_TRANSFER(to, from);
5278 }
5279 
5280 
5281 #ifdef ONIG_DEBUG
5282 static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg));
5283 #endif
5284 #ifdef ONIG_DEBUG_PARSE_TREE
5285 static void print_tree P_((FILE* f, Node* node));
5286 #endif
5287 
5288 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)5289 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5290 	      OnigErrorInfo* einfo)
5291 {
5292 #define COMPILE_INIT_SIZE  20
5293 
5294   int r, init_size;
5295   Node*  root;
5296   ScanEnv  scan_env;
5297 #ifdef USE_SUBEXP_CALL
5298   UnsetAddrList  uslist;
5299 #endif
5300 
5301   root = 0;
5302   if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5303 
5304 #ifdef ONIG_DEBUG
5305   print_enc_string(stderr, reg->enc, pattern, pattern_end);
5306 #endif
5307 
5308   if (reg->alloc == 0) {
5309     init_size = (pattern_end - pattern) * 2;
5310     if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5311     r = BBUF_INIT(reg, init_size);
5312     if (r != 0) goto end;
5313   }
5314   else
5315     reg->used = 0;
5316 
5317   reg->num_mem            = 0;
5318   reg->num_repeat         = 0;
5319   reg->num_null_check     = 0;
5320   reg->repeat_range_alloc = 0;
5321   reg->repeat_range       = (OnigRepeatRange* )NULL;
5322 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5323   reg->num_comb_exp_check = 0;
5324 #endif
5325 
5326   r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5327   if (r != 0) goto err;
5328 
5329 #ifdef USE_NAMED_GROUP
5330   /* mixed use named group and no-named group */
5331   if (scan_env.num_named > 0 &&
5332       IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5333       !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5334     if (scan_env.num_named != scan_env.num_mem)
5335       r = disable_noname_group_capture(&root, reg, &scan_env);
5336     else
5337       r = numbered_ref_check(root);
5338 
5339     if (r != 0) goto err;
5340   }
5341 #endif
5342 
5343 #ifdef USE_SUBEXP_CALL
5344   if (scan_env.num_call > 0) {
5345     r = unset_addr_list_init(&uslist, scan_env.num_call);
5346     if (r != 0) goto err;
5347     scan_env.unset_addr_list = &uslist;
5348     r = setup_subexp_call(root, &scan_env);
5349     if (r != 0) goto err_unset;
5350     r = subexp_recursive_check_trav(root, &scan_env);
5351     if (r  < 0) goto err_unset;
5352     r = subexp_inf_recursive_check_trav(root, &scan_env);
5353     if (r != 0) goto err_unset;
5354 
5355     reg->num_call = scan_env.num_call;
5356   }
5357   else
5358     reg->num_call = 0;
5359 #endif
5360 
5361   r = setup_tree(root, reg, 0, &scan_env);
5362   if (r != 0) goto err_unset;
5363 
5364 #ifdef ONIG_DEBUG_PARSE_TREE
5365   print_tree(stderr, root);
5366 #endif
5367 
5368   reg->capture_history  = scan_env.capture_history;
5369   reg->bt_mem_start     = scan_env.bt_mem_start;
5370   reg->bt_mem_start    |= reg->capture_history;
5371   if (IS_FIND_CONDITION(reg->options))
5372     BIT_STATUS_ON_ALL(reg->bt_mem_end);
5373   else {
5374     reg->bt_mem_end  = scan_env.bt_mem_end;
5375     reg->bt_mem_end |= reg->capture_history;
5376   }
5377 
5378 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5379   if (scan_env.backrefed_mem == 0
5380 #ifdef USE_SUBEXP_CALL
5381       || scan_env.num_call == 0
5382 #endif
5383       ) {
5384     setup_comb_exp_check(root, 0, &scan_env);
5385 #ifdef USE_SUBEXP_CALL
5386     if (scan_env.has_recursion != 0) {
5387       scan_env.num_comb_exp_check = 0;
5388     }
5389     else
5390 #endif
5391     if (scan_env.comb_exp_max_regnum > 0) {
5392       int i;
5393       for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5394         if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5395           scan_env.num_comb_exp_check = 0;
5396           break;
5397         }
5398       }
5399     }
5400   }
5401 
5402   reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5403 #endif
5404 
5405   clear_optimize_info(reg);
5406 #ifndef ONIG_DONT_OPTIMIZE
5407   r = set_optimize_info_from_tree(root, reg, &scan_env);
5408   if (r != 0) goto err_unset;
5409 #endif
5410 
5411   if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5412     xfree(scan_env.mem_nodes_dynamic);
5413     scan_env.mem_nodes_dynamic = (Node** )NULL;
5414   }
5415 
5416   r = compile_tree(root, reg);
5417   if (r == 0) {
5418     r = add_opcode(reg, OP_END);
5419 #ifdef USE_SUBEXP_CALL
5420     if (scan_env.num_call > 0) {
5421       r = unset_addr_list_fix(&uslist, reg);
5422       unset_addr_list_end(&uslist);
5423       if (r) goto err;
5424     }
5425 #endif
5426 
5427     if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5428       reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5429     else {
5430       if (reg->bt_mem_start != 0)
5431         reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5432       else
5433         reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5434     }
5435   }
5436 #ifdef USE_SUBEXP_CALL
5437   else if (scan_env.num_call > 0) {
5438     unset_addr_list_end(&uslist);
5439   }
5440 #endif
5441   onig_node_free(root);
5442 
5443 #ifdef ONIG_DEBUG_COMPILE
5444 #ifdef USE_NAMED_GROUP
5445   onig_print_names(stderr, reg);
5446 #endif
5447   print_compiled_byte_code_list(stderr, reg);
5448 #endif
5449 
5450  end:
5451   return r;
5452 
5453  err_unset:
5454 #ifdef USE_SUBEXP_CALL
5455   if (scan_env.num_call > 0) {
5456     unset_addr_list_end(&uslist);
5457   }
5458 #endif
5459  err:
5460   if (IS_NOT_NULL(scan_env.error)) {
5461     if (IS_NOT_NULL(einfo)) {
5462       einfo->enc     = scan_env.enc;
5463       einfo->par     = scan_env.error;
5464       einfo->par_end = scan_env.error_end;
5465     }
5466   }
5467 
5468   onig_node_free(root);
5469   if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
5470       xfree(scan_env.mem_nodes_dynamic);
5471   return r;
5472 }
5473 
5474 
5475 static int onig_inited = 0;
5476 
5477 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)5478 onig_reg_init(regex_t* reg, OnigOptionType option,
5479 	      OnigCaseFoldType case_fold_flag,
5480 	      OnigEncoding enc, OnigSyntaxType* syntax)
5481 {
5482   int r;
5483 
5484   xmemset(reg, 0, sizeof(*reg));
5485 
5486   if (onig_inited == 0) {
5487 #if 0
5488     return ONIGERR_LIBRARY_IS_NOT_INITIALIZED;
5489 #else
5490     r = onig_initialize(NULL, 0);
5491     if (r != 0)
5492       return ONIGERR_FAIL_TO_INITIALIZE;
5493 
5494     r = onig_initialize_encoding(enc);
5495     if (r != 0)
5496       return ONIGERR_FAIL_TO_INITIALIZE;
5497 
5498     onig_warning("You didn't call onig_initialize() explicitly");
5499 #endif
5500   }
5501 
5502   if (IS_NULL(reg))
5503     return ONIGERR_INVALID_ARGUMENT;
5504 
5505   if (ONIGENC_IS_UNDEF(enc))
5506     return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
5507 
5508   if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5509       == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5510     return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5511   }
5512 
5513   if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5514     option |= syntax->options;
5515     option &= ~ONIG_OPTION_SINGLELINE;
5516   }
5517   else
5518     option |= syntax->options;
5519 
5520   (reg)->enc              = enc;
5521   (reg)->options          = option;
5522   (reg)->syntax           = syntax;
5523   (reg)->optimize         = 0;
5524   (reg)->exact            = (UChar* )NULL;
5525   (reg)->int_map          = (int* )NULL;
5526   (reg)->int_map_backward = (int* )NULL;
5527   (reg)->chain            = (regex_t* )NULL;
5528 
5529   (reg)->p                = (UChar* )NULL;
5530   (reg)->alloc            = 0;
5531   (reg)->used             = 0;
5532   (reg)->name_table       = (void* )NULL;
5533 
5534   (reg)->case_fold_flag   = case_fold_flag;
5535   return 0;
5536 }
5537 
5538 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5539 onig_new_without_alloc(regex_t* reg, const UChar* pattern,
5540           const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
5541           OnigSyntaxType* syntax, OnigErrorInfo* einfo)
5542 {
5543   int r;
5544 
5545   r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5546   if (r) return r;
5547 
5548   r = onig_compile(reg, pattern, pattern_end, einfo);
5549   return r;
5550 }
5551 
5552 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5553 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5554 	  OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5555 	  OnigErrorInfo* einfo)
5556 {
5557   int r;
5558 
5559   *reg = (regex_t* )xmalloc(sizeof(regex_t));
5560   if (IS_NULL(*reg)) return ONIGERR_MEMORY;
5561 
5562   r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5563   if (r) goto err;
5564 
5565   r = onig_compile(*reg, pattern, pattern_end, einfo);
5566   if (r) {
5567   err:
5568     onig_free(*reg);
5569     *reg = NULL;
5570   }
5571   return r;
5572 }
5573 
5574 extern int
onig_initialize(OnigEncoding encodings[],int n)5575 onig_initialize(OnigEncoding encodings[], int n)
5576 {
5577   int i;
5578   int r;
5579 
5580   if (onig_inited != 0)
5581     return 0;
5582 
5583   onigenc_init();
5584 
5585   onig_inited = 1;
5586 
5587   for (i = 0; i < n; i++) {
5588     OnigEncoding enc = encodings[i];
5589     r = onig_initialize_encoding(enc);
5590     if (r != 0)
5591       return r;
5592   }
5593 
5594   return 0;
5595 }
5596 
5597 static OnigEndCallListItemType* EndCallTop;
5598 
onig_add_end_call(void (* func)(void))5599 extern void onig_add_end_call(void (*func)(void))
5600 {
5601   OnigEndCallListItemType* item;
5602 
5603   item = (OnigEndCallListItemType* )xmalloc(sizeof(*item));
5604   if (item == 0) return ;
5605 
5606   item->next = EndCallTop;
5607   item->func = func;
5608 
5609   EndCallTop = item;
5610 }
5611 
5612 static void
exec_end_call_list(void)5613 exec_end_call_list(void)
5614 {
5615   OnigEndCallListItemType* prev;
5616   void (*func)(void);
5617 
5618   while (EndCallTop != 0) {
5619     func = EndCallTop->func;
5620     (*func)();
5621 
5622     prev = EndCallTop;
5623     EndCallTop = EndCallTop->next;
5624     xfree(prev);
5625   }
5626 }
5627 
5628 extern int
onig_end(void)5629 onig_end(void)
5630 {
5631   exec_end_call_list();
5632 
5633   onig_inited = 0;
5634 
5635   return 0;
5636 }
5637 
5638 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)5639 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
5640 {
5641   OnigCodePoint n, *data;
5642   OnigCodePoint low, high, x;
5643 
5644   GET_CODE_POINT(n, p);
5645   data = (OnigCodePoint* )p;
5646   data++;
5647 
5648   for (low = 0, high = n; low < high; ) {
5649     x = (low + high) >> 1;
5650     if (code > data[x * 2 + 1])
5651       low = x + 1;
5652     else
5653       high = x;
5654   }
5655 
5656   return ((low < n && code >= data[low * 2]) ? 1 : 0);
5657 }
5658 
5659 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,CClassNode * cc)5660 onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
5661 {
5662   int found;
5663 
5664   if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
5665     if (IS_NULL(cc->mbuf)) {
5666       found = 0;
5667     }
5668     else {
5669       found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
5670     }
5671   }
5672   else {
5673     found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
5674   }
5675 
5676   if (IS_NCCLASS_NOT(cc))
5677     return !found;
5678   else
5679     return found;
5680 }
5681 
5682 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)5683 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
5684 {
5685   int len;
5686 
5687   if (ONIGENC_MBC_MINLEN(enc) > 1) {
5688     len = 2;
5689   }
5690   else {
5691     len = ONIGENC_CODE_TO_MBCLEN(enc, code);
5692   }
5693   return onig_is_code_in_cc_len(len, code, cc);
5694 }
5695 
5696 
5697 #ifdef ONIG_DEBUG
5698 
5699 /* arguments type */
5700 #define ARG_SPECIAL     -1
5701 #define ARG_NON          0
5702 #define ARG_RELADDR      1
5703 #define ARG_ABSADDR      2
5704 #define ARG_LENGTH       3
5705 #define ARG_MEMNUM       4
5706 #define ARG_OPTION       5
5707 #define ARG_STATE_CHECK  6
5708 
5709 OnigOpInfoType OnigOpInfo[] = {
5710   { OP_FINISH,            "finish",          ARG_NON },
5711   { OP_END,               "end",             ARG_NON },
5712   { OP_EXACT1,            "exact1",          ARG_SPECIAL },
5713   { OP_EXACT2,            "exact2",          ARG_SPECIAL },
5714   { OP_EXACT3,            "exact3",          ARG_SPECIAL },
5715   { OP_EXACT4,            "exact4",          ARG_SPECIAL },
5716   { OP_EXACT5,            "exact5",          ARG_SPECIAL },
5717   { OP_EXACTN,            "exactn",          ARG_SPECIAL },
5718   { OP_EXACTMB2N1,        "exactmb2-n1",     ARG_SPECIAL },
5719   { OP_EXACTMB2N2,        "exactmb2-n2",     ARG_SPECIAL },
5720   { OP_EXACTMB2N3,        "exactmb2-n3",     ARG_SPECIAL },
5721   { OP_EXACTMB2N,         "exactmb2-n",      ARG_SPECIAL },
5722   { OP_EXACTMB3N,         "exactmb3n"  ,     ARG_SPECIAL },
5723   { OP_EXACTMBN,          "exactmbn",        ARG_SPECIAL },
5724   { OP_EXACT1_IC,         "exact1-ic",       ARG_SPECIAL },
5725   { OP_EXACTN_IC,         "exactn-ic",       ARG_SPECIAL },
5726   { OP_CCLASS,            "cclass",          ARG_SPECIAL },
5727   { OP_CCLASS_MB,         "cclass-mb",       ARG_SPECIAL },
5728   { OP_CCLASS_MIX,        "cclass-mix",      ARG_SPECIAL },
5729   { OP_CCLASS_NOT,        "cclass-not",      ARG_SPECIAL },
5730   { OP_CCLASS_MB_NOT,     "cclass-mb-not",   ARG_SPECIAL },
5731   { OP_CCLASS_MIX_NOT,    "cclass-mix-not",  ARG_SPECIAL },
5732   { OP_CCLASS_NODE,       "cclass-node",     ARG_SPECIAL },
5733   { OP_ANYCHAR,           "anychar",         ARG_NON },
5734   { OP_ANYCHAR_ML,        "anychar-ml",      ARG_NON },
5735   { OP_ANYCHAR_STAR,      "anychar*",        ARG_NON },
5736   { OP_ANYCHAR_ML_STAR,   "anychar-ml*",     ARG_NON },
5737   { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
5738   { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
5739   { OP_WORD,                "word",            ARG_NON },
5740   { OP_NOT_WORD,            "not-word",        ARG_NON },
5741   { OP_WORD_BOUND,          "word-bound",      ARG_NON },
5742   { OP_NOT_WORD_BOUND,      "not-word-bound",  ARG_NON },
5743   { OP_WORD_BEGIN,          "word-begin",      ARG_NON },
5744   { OP_WORD_END,            "word-end",        ARG_NON },
5745   { OP_BEGIN_BUF,           "begin-buf",       ARG_NON },
5746   { OP_END_BUF,             "end-buf",         ARG_NON },
5747   { OP_BEGIN_LINE,          "begin-line",      ARG_NON },
5748   { OP_END_LINE,            "end-line",        ARG_NON },
5749   { OP_SEMI_END_BUF,        "semi-end-buf",    ARG_NON },
5750   { OP_BEGIN_POSITION,      "begin-position",  ARG_NON },
5751   { OP_BACKREF1,            "backref1",             ARG_NON },
5752   { OP_BACKREF2,            "backref2",             ARG_NON },
5753   { OP_BACKREFN,            "backrefn",             ARG_MEMNUM  },
5754   { OP_BACKREFN_IC,         "backrefn-ic",          ARG_SPECIAL },
5755   { OP_BACKREF_MULTI,       "backref_multi",        ARG_SPECIAL },
5756   { OP_BACKREF_MULTI_IC,    "backref_multi-ic",     ARG_SPECIAL },
5757   { OP_BACKREF_WITH_LEVEL,    "backref_at_level",     ARG_SPECIAL },
5758   { OP_MEMORY_START_PUSH,   "mem-start-push",       ARG_MEMNUM  },
5759   { OP_MEMORY_START,        "mem-start",            ARG_MEMNUM  },
5760   { OP_MEMORY_END_PUSH,     "mem-end-push",         ARG_MEMNUM  },
5761   { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec",     ARG_MEMNUM  },
5762   { OP_MEMORY_END,          "mem-end",              ARG_MEMNUM  },
5763   { OP_MEMORY_END_REC,      "mem-end-rec",          ARG_MEMNUM  },
5764   { OP_SET_OPTION_PUSH,     "set-option-push",      ARG_OPTION  },
5765   { OP_SET_OPTION,          "set-option",           ARG_OPTION  },
5766   { OP_FAIL,                "fail",                 ARG_NON },
5767   { OP_JUMP,                "jump",                 ARG_RELADDR },
5768   { OP_PUSH,                "push",                 ARG_RELADDR },
5769   { OP_POP,                 "pop",                  ARG_NON },
5770   { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1",      ARG_SPECIAL },
5771   { OP_PUSH_IF_PEEK_NEXT,   "push-if-peek-next",    ARG_SPECIAL },
5772   { OP_REPEAT,              "repeat",               ARG_SPECIAL },
5773   { OP_REPEAT_NG,           "repeat-ng",            ARG_SPECIAL },
5774   { OP_REPEAT_INC,          "repeat-inc",           ARG_MEMNUM  },
5775   { OP_REPEAT_INC_NG,       "repeat-inc-ng",        ARG_MEMNUM  },
5776   { OP_REPEAT_INC_SG,       "repeat-inc-sg",        ARG_MEMNUM  },
5777   { OP_REPEAT_INC_NG_SG,    "repeat-inc-ng-sg",     ARG_MEMNUM  },
5778   { OP_NULL_CHECK_START,    "null-check-start",     ARG_MEMNUM  },
5779   { OP_NULL_CHECK_END,      "null-check-end",       ARG_MEMNUM  },
5780   { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM  },
5781   { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM  },
5782   { OP_PUSH_POS,             "push-pos",             ARG_NON },
5783   { OP_POP_POS,              "pop-pos",              ARG_NON },
5784   { OP_PUSH_POS_NOT,         "push-pos-not",         ARG_RELADDR },
5785   { OP_FAIL_POS,             "fail-pos",             ARG_NON },
5786   { OP_PUSH_STOP_BT,         "push-stop-bt",         ARG_NON },
5787   { OP_POP_STOP_BT,          "pop-stop-bt",          ARG_NON },
5788   { OP_LOOK_BEHIND,          "look-behind",          ARG_SPECIAL },
5789   { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
5790   { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
5791   { OP_CALL,                 "call",                 ARG_ABSADDR },
5792   { OP_RETURN,               "return",               ARG_NON },
5793   { OP_STATE_CHECK_PUSH,         "state-check-push",         ARG_SPECIAL },
5794   { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
5795   { OP_STATE_CHECK,              "state-check",              ARG_STATE_CHECK },
5796   { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*",     ARG_STATE_CHECK },
5797   { OP_STATE_CHECK_ANYCHAR_ML_STAR,
5798     "state-check-anychar-ml*", ARG_STATE_CHECK },
5799   { -1, "", ARG_NON }
5800 };
5801 
5802 static char*
op2name(int opcode)5803 op2name(int opcode)
5804 {
5805   int i;
5806 
5807   for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5808     if (opcode == OnigOpInfo[i].opcode)
5809       return OnigOpInfo[i].name;
5810   }
5811   return "";
5812 }
5813 
5814 static int
op2arg_type(int opcode)5815 op2arg_type(int opcode)
5816 {
5817   int i;
5818 
5819   for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5820     if (opcode == OnigOpInfo[i].opcode)
5821       return OnigOpInfo[i].arg_type;
5822   }
5823   return ARG_SPECIAL;
5824 }
5825 
5826 static void
Indent(FILE * f,int indent)5827 Indent(FILE* f, int indent)
5828 {
5829   int i;
5830   for (i = 0; i < indent; i++) putc(' ', f);
5831 }
5832 
5833 static void
p_string(FILE * f,int len,UChar * s)5834 p_string(FILE* f, int len, UChar* s)
5835 {
5836   fputs(":", f);
5837   while (len-- > 0) { fputc(*s++, f); }
5838 }
5839 
5840 static void
p_len_string(FILE * f,LengthType len,int mb_len,UChar * s)5841 p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
5842 {
5843   int x = len * mb_len;
5844 
5845   fprintf(f, ":%d:", len);
5846   while (x-- > 0) { fputc(*s++, f); }
5847 }
5848 
5849 extern void
onig_print_compiled_byte_code(FILE * f,UChar * bp,UChar ** nextp,OnigEncoding enc)5850 onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp,
5851                               OnigEncoding enc)
5852 {
5853   int i, n, arg_type;
5854   RelAddrType addr;
5855   LengthType len;
5856   MemNumType mem;
5857   StateCheckNumType scn;
5858   OnigCodePoint code;
5859   UChar *q;
5860 
5861   fprintf(f, "[%s", op2name(*bp));
5862   arg_type = op2arg_type(*bp);
5863   if (arg_type != ARG_SPECIAL) {
5864     bp++;
5865     switch (arg_type) {
5866     case ARG_NON:
5867       break;
5868     case ARG_RELADDR:
5869       GET_RELADDR_INC(addr, bp);
5870       fprintf(f, ":(%d)", addr);
5871       break;
5872     case ARG_ABSADDR:
5873       GET_ABSADDR_INC(addr, bp);
5874       fprintf(f, ":(%d)", addr);
5875       break;
5876     case ARG_LENGTH:
5877       GET_LENGTH_INC(len, bp);
5878       fprintf(f, ":%d", len);
5879       break;
5880     case ARG_MEMNUM:
5881       mem = *((MemNumType* )bp);
5882       bp += SIZE_MEMNUM;
5883       fprintf(f, ":%d", mem);
5884       break;
5885     case ARG_OPTION:
5886       {
5887         OnigOptionType option = *((OnigOptionType* )bp);
5888         bp += SIZE_OPTION;
5889         fprintf(f, ":%d", option);
5890       }
5891       break;
5892 
5893     case ARG_STATE_CHECK:
5894       scn = *((StateCheckNumType* )bp);
5895       bp += SIZE_STATE_CHECK_NUM;
5896       fprintf(f, ":%d", scn);
5897       break;
5898     }
5899   }
5900   else {
5901     switch (*bp++) {
5902     case OP_EXACT1:
5903     case OP_ANYCHAR_STAR_PEEK_NEXT:
5904     case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
5905       p_string(f, 1, bp++); break;
5906     case OP_EXACT2:
5907       p_string(f, 2, bp); bp += 2; break;
5908     case OP_EXACT3:
5909       p_string(f, 3, bp); bp += 3; break;
5910     case OP_EXACT4:
5911       p_string(f, 4, bp); bp += 4; break;
5912     case OP_EXACT5:
5913       p_string(f, 5, bp); bp += 5; break;
5914     case OP_EXACTN:
5915       GET_LENGTH_INC(len, bp);
5916       p_len_string(f, len, 1, bp);
5917       bp += len;
5918       break;
5919 
5920     case OP_EXACTMB2N1:
5921       p_string(f, 2, bp); bp += 2; break;
5922     case OP_EXACTMB2N2:
5923       p_string(f, 4, bp); bp += 4; break;
5924     case OP_EXACTMB2N3:
5925       p_string(f, 6, bp); bp += 6; break;
5926     case OP_EXACTMB2N:
5927       GET_LENGTH_INC(len, bp);
5928       p_len_string(f, len, 2, bp);
5929       bp += len * 2;
5930       break;
5931     case OP_EXACTMB3N:
5932       GET_LENGTH_INC(len, bp);
5933       p_len_string(f, len, 3, bp);
5934       bp += len * 3;
5935       break;
5936     case OP_EXACTMBN:
5937       {
5938         int mb_len;
5939 
5940         GET_LENGTH_INC(mb_len, bp);
5941         GET_LENGTH_INC(len, bp);
5942         fprintf(f, ":%d:%d:", mb_len, len);
5943         n = len * mb_len;
5944         while (n-- > 0) { fputc(*bp++, f); }
5945       }
5946       break;
5947 
5948     case OP_EXACT1_IC:
5949       len = enclen(enc, bp);
5950       p_string(f, len, bp);
5951       bp += len;
5952       break;
5953     case OP_EXACTN_IC:
5954       GET_LENGTH_INC(len, bp);
5955       p_len_string(f, len, 1, bp);
5956       bp += len;
5957       break;
5958 
5959     case OP_CCLASS:
5960       n = bitset_on_num((BitSetRef )bp);
5961       bp += SIZE_BITSET;
5962       fprintf(f, ":%d", n);
5963       break;
5964 
5965     case OP_CCLASS_NOT:
5966       n = bitset_on_num((BitSetRef )bp);
5967       bp += SIZE_BITSET;
5968       fprintf(f, ":%d", n);
5969       break;
5970 
5971     case OP_CCLASS_MB:
5972     case OP_CCLASS_MB_NOT:
5973       GET_LENGTH_INC(len, bp);
5974       q = bp;
5975 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5976       ALIGNMENT_RIGHT(q);
5977 #endif
5978       GET_CODE_POINT(code, q);
5979       bp += len;
5980       fprintf(f, ":%d:%d", (int )code, len);
5981       break;
5982 
5983     case OP_CCLASS_MIX:
5984     case OP_CCLASS_MIX_NOT:
5985       n = bitset_on_num((BitSetRef )bp);
5986       bp += SIZE_BITSET;
5987       GET_LENGTH_INC(len, bp);
5988       q = bp;
5989 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5990       ALIGNMENT_RIGHT(q);
5991 #endif
5992       GET_CODE_POINT(code, q);
5993       bp += len;
5994       fprintf(f, ":%d:%d:%d", n, (int )code, len);
5995       break;
5996 
5997     case OP_CCLASS_NODE:
5998       {
5999         CClassNode *cc;
6000 
6001         GET_POINTER_INC(cc, bp);
6002         n = bitset_on_num(cc->bs);
6003         fprintf(f, ":%p:%d", cc, n);
6004       }
6005       break;
6006 
6007     case OP_BACKREFN_IC:
6008       mem = *((MemNumType* )bp);
6009       bp += SIZE_MEMNUM;
6010       fprintf(f, ":%d", mem);
6011       break;
6012 
6013     case OP_BACKREF_MULTI_IC:
6014     case OP_BACKREF_MULTI:
6015       fputs(" ", f);
6016       GET_LENGTH_INC(len, bp);
6017       for (i = 0; i < len; i++) {
6018         GET_MEMNUM_INC(mem, bp);
6019         if (i > 0) fputs(", ", f);
6020         fprintf(f, "%d", mem);
6021       }
6022       break;
6023 
6024     case OP_BACKREF_WITH_LEVEL:
6025       {
6026         OnigOptionType option;
6027         LengthType level;
6028 
6029         GET_OPTION_INC(option, bp);
6030         fprintf(f, ":%d", option);
6031         GET_LENGTH_INC(level, bp);
6032         fprintf(f, ":%d", level);
6033 
6034         fputs(" ", f);
6035         GET_LENGTH_INC(len, bp);
6036         for (i = 0; i < len; i++) {
6037           GET_MEMNUM_INC(mem, bp);
6038           if (i > 0) fputs(", ", f);
6039           fprintf(f, "%d", mem);
6040         }
6041       }
6042       break;
6043 
6044     case OP_REPEAT:
6045     case OP_REPEAT_NG:
6046       {
6047         mem = *((MemNumType* )bp);
6048         bp += SIZE_MEMNUM;
6049         addr = *((RelAddrType* )bp);
6050         bp += SIZE_RELADDR;
6051         fprintf(f, ":%d:%d", mem, addr);
6052       }
6053       break;
6054 
6055     case OP_PUSH_OR_JUMP_EXACT1:
6056     case OP_PUSH_IF_PEEK_NEXT:
6057       addr = *((RelAddrType* )bp);
6058       bp += SIZE_RELADDR;
6059       fprintf(f, ":(%d)", addr);
6060       p_string(f, 1, bp);
6061       bp += 1;
6062       break;
6063 
6064     case OP_LOOK_BEHIND:
6065       GET_LENGTH_INC(len, bp);
6066       fprintf(f, ":%d", len);
6067       break;
6068 
6069     case OP_PUSH_LOOK_BEHIND_NOT:
6070       GET_RELADDR_INC(addr, bp);
6071       GET_LENGTH_INC(len, bp);
6072       fprintf(f, ":%d:(%d)", len, addr);
6073       break;
6074 
6075     case OP_STATE_CHECK_PUSH:
6076     case OP_STATE_CHECK_PUSH_OR_JUMP:
6077       scn = *((StateCheckNumType* )bp);
6078       bp += SIZE_STATE_CHECK_NUM;
6079       addr = *((RelAddrType* )bp);
6080       bp += SIZE_RELADDR;
6081       fprintf(f, ":%d:(%d)", scn, addr);
6082       break;
6083 
6084     default:
6085       fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
6086 	      *--bp);
6087     }
6088   }
6089   fputs("]", f);
6090   if (nextp) *nextp = bp;
6091 }
6092 
6093 static void
print_compiled_byte_code_list(FILE * f,regex_t * reg)6094 print_compiled_byte_code_list(FILE* f, regex_t* reg)
6095 {
6096   int ncode;
6097   UChar* bp = reg->p;
6098   UChar* end = reg->p + reg->used;
6099 
6100   fprintf(f, "code length: %d\n", reg->used);
6101 
6102   ncode = 0;
6103   while (bp < end) {
6104     ncode++;
6105     if (bp > reg->p) {
6106       if (ncode % 5 == 0)
6107         fprintf(f, "\n");
6108       else
6109         fputs(" ", f);
6110     }
6111     onig_print_compiled_byte_code(f, bp, &bp, reg->enc);
6112   }
6113 
6114   fprintf(f, "\n");
6115 }
6116 
6117 static void
print_indent_tree(FILE * f,Node * node,int indent)6118 print_indent_tree(FILE* f, Node* node, int indent)
6119 {
6120   int i, type;
6121   int add = 3;
6122   UChar* p;
6123 
6124   Indent(f, indent);
6125   if (IS_NULL(node)) {
6126     fprintf(f, "ERROR: null node!!!\n");
6127     exit (0);
6128   }
6129 
6130   type = NTYPE(node);
6131   switch (type) {
6132   case NT_LIST:
6133   case NT_ALT:
6134     if (NTYPE(node) == NT_LIST)
6135       fprintf(f, "<list:%p>\n", node);
6136     else
6137       fprintf(f, "<alt:%p>\n", node);
6138 
6139     print_indent_tree(f, NCAR(node), indent + add);
6140     while (IS_NOT_NULL(node = NCDR(node))) {
6141       if (NTYPE(node) != type) {
6142         fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
6143         exit(0);
6144       }
6145       print_indent_tree(f, NCAR(node), indent + add);
6146     }
6147     break;
6148 
6149   case NT_STR:
6150     fprintf(f, "<string%s:%p>",
6151 	    (NSTRING_IS_RAW(node) ? "-raw" : ""), node);
6152     for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
6153       if (*p >= 0x20 && *p < 0x7f)
6154         fputc(*p, f);
6155       else {
6156         fprintf(f, " 0x%02x", *p);
6157       }
6158     }
6159     break;
6160 
6161   case NT_CCLASS:
6162     fprintf(f, "<cclass:%p>", node);
6163     if (IS_NCCLASS_NOT(NCCLASS(node))) fputs(" not", f);
6164     if (NCCLASS(node)->mbuf) {
6165       BBuf* bbuf = NCCLASS(node)->mbuf;
6166       for (i = 0; i < bbuf->used; i++) {
6167         if (i > 0) fprintf(f, ",");
6168         fprintf(f, "%0x", bbuf->p[i]);
6169       }
6170     }
6171     break;
6172 
6173   case NT_CTYPE:
6174     fprintf(f, "<ctype:%p> ", node);
6175     switch (NCTYPE(node)->ctype) {
6176     case ONIGENC_CTYPE_WORD:
6177       if (NCTYPE(node)->not != 0)
6178         fputs("not word",       f);
6179       else
6180         fputs("word",           f);
6181       break;
6182 
6183     default:
6184       fprintf(f, "ERROR: undefined ctype.\n");
6185       exit(0);
6186     }
6187     break;
6188 
6189   case NT_CANY:
6190     fprintf(f, "<anychar:%p>", node);
6191     break;
6192 
6193   case NT_ANCHOR:
6194     fprintf(f, "<anchor:%p> ", node);
6195     switch (NANCHOR(node)->type) {
6196     case ANCHOR_BEGIN_BUF:      fputs("begin buf",      f); break;
6197     case ANCHOR_END_BUF:        fputs("end buf",        f); break;
6198     case ANCHOR_BEGIN_LINE:     fputs("begin line",     f); break;
6199     case ANCHOR_END_LINE:       fputs("end line",       f); break;
6200     case ANCHOR_SEMI_END_BUF:   fputs("semi end buf",   f); break;
6201     case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
6202 
6203     case ANCHOR_WORD_BOUND:      fputs("word bound",     f); break;
6204     case ANCHOR_NOT_WORD_BOUND:  fputs("not word bound", f); break;
6205 #ifdef USE_WORD_BEGIN_END
6206     case ANCHOR_WORD_BEGIN:      fputs("word begin", f);     break;
6207     case ANCHOR_WORD_END:        fputs("word end", f);       break;
6208 #endif
6209     case ANCHOR_PREC_READ:
6210       fprintf(f, "prec read\n");
6211       print_indent_tree(f, NANCHOR(node)->target, indent + add);
6212       break;
6213     case ANCHOR_PREC_READ_NOT:
6214       fprintf(f, "prec read not\n");
6215       print_indent_tree(f, NANCHOR(node)->target, indent + add);
6216       break;
6217     case ANCHOR_LOOK_BEHIND:
6218       fprintf(f, "look behind\n");
6219       print_indent_tree(f, NANCHOR(node)->target, indent + add);
6220       break;
6221     case ANCHOR_LOOK_BEHIND_NOT:
6222       fprintf(f, "look behind not\n");
6223       print_indent_tree(f, NANCHOR(node)->target, indent + add);
6224       break;
6225 
6226     default:
6227       fprintf(f, "ERROR: undefined anchor type.\n");
6228       break;
6229     }
6230     break;
6231 
6232   case NT_BREF:
6233     {
6234       int* p;
6235       BRefNode* br = NBREF(node);
6236       p = BACKREFS_P(br);
6237       fprintf(f, "<backref:%p>", node);
6238       for (i = 0; i < br->back_num; i++) {
6239         if (i > 0) fputs(", ", f);
6240         fprintf(f, "%d", p[i]);
6241       }
6242     }
6243     break;
6244 
6245 #ifdef USE_SUBEXP_CALL
6246   case NT_CALL:
6247     {
6248       CallNode* cn = NCALL(node);
6249       fprintf(f, "<call:%p>", node);
6250       p_string(f, cn->name_end - cn->name, cn->name);
6251     }
6252     break;
6253 #endif
6254 
6255   case NT_QTFR:
6256     fprintf(f, "<quantifier:%p>{%d,%d}%s\n", node,
6257 	    NQTFR(node)->lower, NQTFR(node)->upper,
6258 	    (NQTFR(node)->greedy ? "" : "?"));
6259     print_indent_tree(f, NQTFR(node)->target, indent + add);
6260     break;
6261 
6262   case NT_ENCLOSE:
6263     fprintf(f, "<enclose:%p> ", node);
6264     switch (NENCLOSE(node)->type) {
6265     case ENCLOSE_OPTION:
6266       fprintf(f, "option:%d", NENCLOSE(node)->option);
6267       break;
6268     case ENCLOSE_MEMORY:
6269       fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
6270       break;
6271     case ENCLOSE_STOP_BACKTRACK:
6272       fprintf(f, "stop-bt");
6273       break;
6274 
6275     default:
6276       break;
6277     }
6278     fprintf(f, "\n");
6279     print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6280     break;
6281 
6282   default:
6283     fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6284     break;
6285   }
6286 
6287   if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
6288       type != NT_ENCLOSE)
6289     fprintf(f, "\n");
6290   fflush(f);
6291 }
6292 #endif /* ONIG_DEBUG */
6293 
6294 #ifdef ONIG_DEBUG_PARSE_TREE
6295 static void
print_tree(FILE * f,Node * node)6296 print_tree(FILE* f, Node* node)
6297 {
6298   print_indent_tree(f, node, 0);
6299 }
6300 #endif
6301