xref: /PHP-7.3/ext/mbstring/oniguruma/src/regparse.h (revision 1979c5d1)
1 #ifndef REGPARSE_H
2 #define REGPARSE_H
3 /**********************************************************************
4   regparse.h -  Oniguruma (regular expression library)
5 **********************************************************************/
6 /*-
7  * Copyright (c) 2002-2019  K.Kosako
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include "regint.h"
33 
34 #define NODE_STRING_MARGIN         16
35 #define NODE_STRING_BUF_SIZE       20  /* sizeof(CClassNode) - sizeof(int)*4 */
36 #define NODE_BACKREFS_SIZE          6
37 
38 /* node type */
39 typedef enum {
40   NODE_STRING    =  0,
41   NODE_CCLASS    =  1,
42   NODE_CTYPE     =  2,
43   NODE_BACKREF   =  3,
44   NODE_QUANT     =  4,
45   NODE_BAG       =  5,
46   NODE_ANCHOR    =  6,
47   NODE_LIST      =  7,
48   NODE_ALT       =  8,
49   NODE_CALL      =  9,
50   NODE_GIMMICK   = 10
51 } NodeType;
52 
53 enum BagType {
54   BAG_MEMORY         = 0,
55   BAG_OPTION         = 1,
56   BAG_STOP_BACKTRACK = 2,
57   BAG_IF_ELSE        = 3,
58 };
59 
60 enum GimmickType {
61   GIMMICK_FAIL       = 0,
62   GIMMICK_SAVE       = 1,
63   GIMMICK_UPDATE_VAR = 2,
64 #ifdef USE_CALLOUT
65   GIMMICK_CALLOUT    = 3,
66 #endif
67 };
68 
69 enum BodyEmptyType {
70   BODY_IS_NOT_EMPTY             = 0,
71   BODY_IS_EMPTY_POSSIBILITY     = 1,
72   BODY_IS_EMPTY_POSSIBILITY_MEM = 2,
73   BODY_IS_EMPTY_POSSIBILITY_REC = 3
74 };
75 
76 struct _Node;
77 
78 typedef struct {
79   NodeType node_type;
80   int status;
81   struct _Node* parent;
82 
83   UChar* s;
84   UChar* end;
85   unsigned int flag;
86   UChar  buf[NODE_STRING_BUF_SIZE];
87   int    capacity;    /* (allocated size - 1) or 0: use buf[] */
88   int    case_min_len;
89 } StrNode;
90 
91 typedef struct {
92   NodeType node_type;
93   int status;
94   struct _Node* parent;
95 
96   unsigned int flags;
97   BitSet bs;
98   BBuf*  mbuf;   /* multi-byte info or NULL */
99 } CClassNode;
100 
101 typedef struct {
102   NodeType node_type;
103   int status;
104   struct _Node* parent;
105   struct _Node* body;
106 
107   int lower;
108   int upper;
109   int greedy;
110   enum BodyEmptyType emptiness;
111   struct _Node* head_exact;
112   struct _Node* next_head_exact;
113   int include_referred;   /* include called node. don't eliminate even if {0} */
114 } QuantNode;
115 
116 typedef struct {
117   NodeType node_type;
118   int status;
119   struct _Node* parent;
120   struct _Node* body;
121 
122   enum BagType type;
123   union {
124     struct {
125       int regnum;
126       AbsAddrType called_addr;
127       int entry_count;
128       int called_state;
129     } m;
130     struct {
131       OnigOptionType options;
132     } o;
133     struct {
134       /* body is condition */
135       struct _Node* Then;
136       struct _Node* Else;
137     } te;
138   };
139   /* for multiple call reference */
140   OnigLen min_len;   /* min length (byte) */
141   OnigLen max_len;   /* max length (byte) */
142   int char_len;      /* character length  */
143   int opt_count;     /* referenced count in optimize_nodes() */
144 } BagNode;
145 
146 #ifdef USE_CALL
147 
148 typedef struct {
149   int           offset;
150   struct _Node* target;
151 } UnsetAddr;
152 
153 typedef struct {
154   int        num;
155   int        alloc;
156   UnsetAddr* us;
157 } UnsetAddrList;
158 
159 typedef struct {
160   NodeType node_type;
161   int status;
162   struct _Node* parent;
163   struct _Node* body; /* to BagNode : BAG_MEMORY */
164 
165   int     by_number;
166   int     group_num;
167   UChar*  name;
168   UChar*  name_end;
169   int     entry_count;
170 } CallNode;
171 
172 #endif
173 
174 typedef struct {
175   NodeType node_type;
176   int status;
177   struct _Node* parent;
178 
179   int  back_num;
180   int  back_static[NODE_BACKREFS_SIZE];
181   int* back_dynamic;
182   int  nest_level;
183 } BackRefNode;
184 
185 typedef struct {
186   NodeType node_type;
187   int status;
188   struct _Node* parent;
189   struct _Node* body;
190 
191   int type;
192   int char_len;
193   int ascii_mode;
194 } AnchorNode;
195 
196 typedef struct {
197   NodeType node_type;
198   int status;
199   struct _Node* parent;
200 
201   struct _Node* car;
202   struct _Node* cdr;
203 } ConsAltNode;
204 
205 typedef struct {
206   NodeType node_type;
207   int status;
208   struct _Node* parent;
209 
210   int ctype;
211   int not;
212   OnigOptionType options;
213   int ascii_mode;
214 } CtypeNode;
215 
216 typedef struct {
217   NodeType node_type;
218   int status;
219   struct _Node* parent;
220 
221   enum GimmickType type;
222   int  detail_type;
223   int  num;
224   int  id;
225 } GimmickNode;
226 
227 typedef struct _Node {
228   union {
229     struct {
230       NodeType node_type;
231       int status;
232       struct _Node* parent;
233       struct _Node* body;
234     } base;
235 
236     StrNode       str;
237     CClassNode    cclass;
238     QuantNode     quant;
239     BagNode       bag;
240     BackRefNode   backref;
241     AnchorNode    anchor;
242     ConsAltNode   cons;
243     CtypeNode     ctype;
244 #ifdef USE_CALL
245     CallNode      call;
246 #endif
247     GimmickNode   gimmick;
248   } u;
249 } Node;
250 
251 #define NULL_NODE  ((Node* )0)
252 
253 
254 /* node type bit */
255 #define NODE_TYPE2BIT(type)      (1<<(type))
256 
257 #define NODE_BIT_STRING     NODE_TYPE2BIT(NODE_STRING)
258 #define NODE_BIT_CCLASS     NODE_TYPE2BIT(NODE_CCLASS)
259 #define NODE_BIT_CTYPE      NODE_TYPE2BIT(NODE_CTYPE)
260 #define NODE_BIT_BACKREF    NODE_TYPE2BIT(NODE_BACKREF)
261 #define NODE_BIT_QUANT      NODE_TYPE2BIT(NODE_QUANT)
262 #define NODE_BIT_BAG        NODE_TYPE2BIT(NODE_BAG)
263 #define NODE_BIT_ANCHOR     NODE_TYPE2BIT(NODE_ANCHOR)
264 #define NODE_BIT_LIST       NODE_TYPE2BIT(NODE_LIST)
265 #define NODE_BIT_ALT        NODE_TYPE2BIT(NODE_ALT)
266 #define NODE_BIT_CALL       NODE_TYPE2BIT(NODE_CALL)
267 #define NODE_BIT_GIMMICK    NODE_TYPE2BIT(NODE_GIMMICK)
268 
269 #define NODE_TYPE(node)             ((node)->u.base.node_type)
270 #define NODE_SET_TYPE(node, ntype)   (node)->u.base.node_type = (ntype)
271 
272 #define STR_(node)         (&((node)->u.str))
273 #define CCLASS_(node)      (&((node)->u.cclass))
274 #define CTYPE_(node)       (&((node)->u.ctype))
275 #define BACKREF_(node)     (&((node)->u.backref))
276 #define QUANT_(node)       (&((node)->u.quant))
277 #define BAG_(node)         (&((node)->u.bag))
278 #define ANCHOR_(node)      (&((node)->u.anchor))
279 #define CONS_(node)        (&((node)->u.cons))
280 #define CALL_(node)        (&((node)->u.call))
281 #define GIMMICK_(node)     (&((node)->u.gimmick))
282 
283 #define NODE_CAR(node)         (CONS_(node)->car)
284 #define NODE_CDR(node)         (CONS_(node)->cdr)
285 
286 #define CTYPE_ANYCHAR      -1
287 #define NODE_IS_ANYCHAR(node) \
288   (NODE_TYPE(node) == NODE_CTYPE && CTYPE_(node)->ctype == CTYPE_ANYCHAR)
289 
290 #define CTYPE_OPTION(node, reg) \
291   (NODE_IS_FIXED_OPTION(node) ? CTYPE_(node)->options : reg->options)
292 
293 
294 #define ANCR_ANYCHAR_INF_MASK  (ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML)
295 #define ANCR_END_BUF_MASK      (ANCR_END_BUF | ANCR_SEMI_END_BUF)
296 
297 #define NODE_STRING_CRUDE              (1<<0)
298 #define NODE_STRING_CASE_EXPANDED      (1<<1)
299 #define NODE_STRING_CASE_FOLD_MATCH    (1<<2)
300 
301 #define NODE_STRING_LEN(node)            (int )((node)->u.str.end - (node)->u.str.s)
302 #define NODE_STRING_SET_CRUDE(node)         (node)->u.str.flag |= NODE_STRING_CRUDE
303 #define NODE_STRING_CLEAR_CRUDE(node)       (node)->u.str.flag &= ~NODE_STRING_CRUDE
304 #define NODE_STRING_SET_CASE_EXPANDED(node) (node)->u.str.flag |= NODE_STRING_CASE_EXPANDED
305 #define NODE_STRING_SET_CASE_FOLD_MATCH(node) (node)->u.str.flag |= NODE_STRING_CASE_FOLD_MATCH
306 #define NODE_STRING_IS_CRUDE(node) \
307   (((node)->u.str.flag & NODE_STRING_CRUDE) != 0)
308 #define NODE_STRING_IS_CASE_EXPANDED(node) \
309   (((node)->u.str.flag & NODE_STRING_CASE_EXPANDED) != 0)
310 #define NODE_STRING_IS_CASE_FOLD_MATCH(node) \
311   (((node)->u.str.flag & NODE_STRING_CASE_FOLD_MATCH) != 0)
312 
313 #define BACKREFS_P(br) \
314   (IS_NOT_NULL((br)->back_dynamic) ? (br)->back_dynamic : (br)->back_static)
315 
316 /* node status bits */
317 #define NODE_ST_MIN_FIXED             (1<<0)
318 #define NODE_ST_MAX_FIXED             (1<<1)
319 #define NODE_ST_CLEN_FIXED            (1<<2)
320 #define NODE_ST_MARK1                 (1<<3)
321 #define NODE_ST_MARK2                 (1<<4)
322 #define NODE_ST_STRICT_REAL_REPEAT    (1<<5)
323 #define NODE_ST_RECURSION             (1<<6)
324 #define NODE_ST_CALLED                (1<<7)
325 #define NODE_ST_ADDR_FIXED            (1<<8)
326 #define NODE_ST_NAMED_GROUP           (1<<9)
327 #define NODE_ST_IN_REAL_REPEAT        (1<<10) /* STK_REPEAT is nested in stack. */
328 #define NODE_ST_IN_ZERO_REPEAT        (1<<11) /* (....){0} */
329 #define NODE_ST_IN_MULTI_ENTRY        (1<<12)
330 #define NODE_ST_NEST_LEVEL            (1<<13)
331 #define NODE_ST_BY_NUMBER             (1<<14) /* {n,m} */
332 #define NODE_ST_BY_NAME               (1<<15) /* backref by name */
333 #define NODE_ST_BACKREF               (1<<16)
334 #define NODE_ST_CHECKER               (1<<17)
335 #define NODE_ST_FIXED_OPTION          (1<<18)
336 #define NODE_ST_PROHIBIT_RECURSION    (1<<19)
337 #define NODE_ST_SUPER                 (1<<20)
338 #define NODE_ST_EMPTY_STATUS_CHECK    (1<<21)
339 
340 
341 #define NODE_STATUS(node)           (((Node* )node)->u.base.status)
342 #define NODE_STATUS_ADD(node,f)     (NODE_STATUS(node) |= (NODE_ST_ ## f))
343 #define NODE_STATUS_REMOVE(node,f)  (NODE_STATUS(node) &= ~(NODE_ST_ ## f))
344 
345 #define NODE_IS_BY_NUMBER(node)       ((NODE_STATUS(node) & NODE_ST_BY_NUMBER)      != 0)
346 #define NODE_IS_IN_REAL_REPEAT(node)  ((NODE_STATUS(node) & NODE_ST_IN_REAL_REPEAT) != 0)
347 #define NODE_IS_CALLED(node)          ((NODE_STATUS(node) & NODE_ST_CALLED)         != 0)
348 #define NODE_IS_IN_MULTI_ENTRY(node)  ((NODE_STATUS(node) & NODE_ST_IN_MULTI_ENTRY) != 0)
349 #define NODE_IS_RECURSION(node)       ((NODE_STATUS(node) & NODE_ST_RECURSION)      != 0)
350 #define NODE_IS_IN_ZERO_REPEAT(node)  ((NODE_STATUS(node) & NODE_ST_IN_ZERO_REPEAT) != 0)
351 #define NODE_IS_NAMED_GROUP(node)     ((NODE_STATUS(node) & NODE_ST_NAMED_GROUP)  != 0)
352 #define NODE_IS_ADDR_FIXED(node)      ((NODE_STATUS(node) & NODE_ST_ADDR_FIXED)   != 0)
353 #define NODE_IS_CLEN_FIXED(node)      ((NODE_STATUS(node) & NODE_ST_CLEN_FIXED)   != 0)
354 #define NODE_IS_MIN_FIXED(node)       ((NODE_STATUS(node) & NODE_ST_MIN_FIXED)    != 0)
355 #define NODE_IS_MAX_FIXED(node)       ((NODE_STATUS(node) & NODE_ST_MAX_FIXED)    != 0)
356 #define NODE_IS_MARK1(node)           ((NODE_STATUS(node) & NODE_ST_MARK1)        != 0)
357 #define NODE_IS_MARK2(node)           ((NODE_STATUS(node) & NODE_ST_MARK2)        != 0)
358 #define NODE_IS_NEST_LEVEL(node)      ((NODE_STATUS(node) & NODE_ST_NEST_LEVEL)   != 0)
359 #define NODE_IS_BY_NAME(node)         ((NODE_STATUS(node) & NODE_ST_BY_NAME)      != 0)
360 #define NODE_IS_BACKREF(node)         ((NODE_STATUS(node) & NODE_ST_BACKREF)      != 0)
361 #define NODE_IS_CHECKER(node)         ((NODE_STATUS(node) & NODE_ST_CHECKER)      != 0)
362 #define NODE_IS_FIXED_OPTION(node)    ((NODE_STATUS(node) & NODE_ST_FIXED_OPTION) != 0)
363 #define NODE_IS_SUPER(node)           ((NODE_STATUS(node) & NODE_ST_SUPER)        != 0)
364 #define NODE_IS_PROHIBIT_RECURSION(node) \
365     ((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0)
366 #define NODE_IS_STRICT_REAL_REPEAT(node) \
367     ((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0)
368 #define NODE_IS_EMPTY_STATUS_CHECK(node) \
369     ((NODE_STATUS(node) & NODE_ST_EMPTY_STATUS_CHECK) != 0)
370 
371 #define NODE_PARENT(node)         ((node)->u.base.parent)
372 #define NODE_BODY(node)           ((node)->u.base.body)
373 #define NODE_QUANT_BODY(node)     ((node)->body)
374 #define NODE_BAG_BODY(node)       ((node)->body)
375 #define NODE_CALL_BODY(node)      ((node)->body)
376 #define NODE_ANCHOR_BODY(node)    ((node)->body)
377 
378 #define SCANENV_MEMENV_SIZE               8
379 #define SCANENV_MEMENV(senv) \
380  (IS_NOT_NULL((senv)->mem_env_dynamic) ? \
381     (senv)->mem_env_dynamic : (senv)->mem_env_static)
382 
383 typedef struct {
384   Node* mem_node;
385   Node* empty_repeat_node;
386 } MemEnv;
387 
388 typedef struct {
389   enum SaveType type;
390 } SaveItem;
391 
392 typedef struct {
393   OnigOptionType   options;
394   OnigCaseFoldType case_fold_flag;
395   OnigEncoding     enc;
396   OnigSyntaxType*  syntax;
397   MemStatusType    cap_history;
398   MemStatusType    backtrack_mem; /* backtrack/recursion */
399   MemStatusType    backrefed_mem;
400   UChar*           pattern;
401   UChar*           pattern_end;
402   UChar*           error;
403   UChar*           error_end;
404   regex_t*         reg;       /* for reg->names only */
405   int              num_call;
406 #ifdef USE_CALL
407   UnsetAddrList*   unset_addr_list;
408   int              has_call_zero;
409 #endif
410   int              num_mem;
411   int              num_named;
412   int              mem_alloc;
413   MemEnv           mem_env_static[SCANENV_MEMENV_SIZE];
414   MemEnv*          mem_env_dynamic;
415   unsigned int     parse_depth;
416 #ifdef ONIG_DEBUG_PARSE
417   unsigned int     max_parse_depth;
418 #endif
419   int backref_num;
420   int keep_num;
421   int save_num;
422   int save_alloc_num;
423   SaveItem* saves;
424 } ScanEnv;
425 
426 
427 #define IS_SYNTAX_OP(syn, opm)    (((syn)->op  & (opm)) != 0)
428 #define IS_SYNTAX_OP2(syn, opm)   (((syn)->op2 & (opm)) != 0)
429 #define IS_SYNTAX_BV(syn, bvm)    (((syn)->behavior & (bvm)) != 0)
430 
431 typedef struct {
432   int new_val;
433 } GroupNumRemap;
434 
435 extern int    onig_renumber_name_table P_((regex_t* reg, GroupNumRemap* map));
436 
437 extern int    onig_strncmp P_((const UChar* s1, const UChar* s2, int n));
438 extern void   onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end));
439 extern void   onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end));
440 extern int    onig_reduce_nested_quantifier P_((Node* pnode));
441 extern int    onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end));
442 extern int    onig_node_str_set P_((Node* node, const UChar* s, const UChar* end));
443 extern void   onig_node_free P_((Node* node));
444 extern Node*  onig_node_new_bag P_((enum BagType type));
445 extern Node*  onig_node_new_anchor P_((int type, int ascii_mode));
446 extern Node*  onig_node_new_str P_((const UChar* s, const UChar* end));
447 extern Node*  onig_node_new_list P_((Node* left, Node* right));
448 extern Node*  onig_node_new_alt P_((Node* left, Node* right));
449 extern void   onig_node_str_clear P_((Node* node));
450 extern int    onig_names_free P_((regex_t* reg));
451 extern int    onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env));
452 extern int    onig_free_shared_cclass_table P_((void));
453 extern int    onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc));
454 extern int    onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, int n, OnigCodePoint codes[]);
455 extern OnigLen onig_get_tiny_min_len(Node* node, unsigned int inhibit_node_types, int* invalid_node);
456 
457 #ifdef USE_CALLOUT
458 extern int onig_global_callout_names_free(void);
459 #endif
460 
461 #ifdef ONIG_DEBUG
462 extern int onig_print_names(FILE*, regex_t*);
463 #endif
464 
465 #endif /* REGPARSE_H */
466