xref: /php-src/ext/dom/lexbor/lexbor/html/tokenizer.h (revision bffab33a)
1 /*
2  * Copyright (C) 2018-2020 Alexander Borisov
3  *
4  * Author: Alexander Borisov <borisov@lexbor.com>
5  */
6 
7 #ifndef LEXBOR_HTML_TOKENIZER_H
8 #define LEXBOR_HTML_TOKENIZER_H
9 
10 #ifdef __cplusplus
11 extern "C" {
12 #endif
13 
14 #include "lexbor/core/sbst.h"
15 #include "lexbor/core/array_obj.h"
16 
17 #include "lexbor/html/base.h"
18 #include "lexbor/html/token.h"
19 
20 #include "lexbor/tag/tag.h"
21 #include "lexbor/ns/ns.h"
22 
23 
24 /* State */
25 typedef const lxb_char_t *
26 (*lxb_html_tokenizer_state_f)(lxb_html_tokenizer_t *tkz,
27                               const lxb_char_t *data, const lxb_char_t *end);
28 
29 typedef lxb_html_token_t *
30 (*lxb_html_tokenizer_token_f)(lxb_html_tokenizer_t *tkz,
31                               lxb_html_token_t *token, void *ctx);
32 
33 
34 struct lxb_html_tokenizer {
35     lxb_html_tokenizer_state_f       state;
36     lxb_html_tokenizer_state_f       state_return;
37 
38     lxb_html_tokenizer_token_f       callback_token_done;
39     void                             *callback_token_ctx;
40 
41     lexbor_hash_t                    *tags;
42     lexbor_hash_t                    *attrs;
43     lexbor_mraw_t                    *attrs_mraw;
44 
45     /* For a temp strings and other templary data */
46     lexbor_mraw_t                    *mraw;
47 
48     /* Current process token */
49     lxb_html_token_t                 *token;
50 
51     /* Memory for token and attr */
52     lexbor_dobject_t                 *dobj_token;
53     lexbor_dobject_t                 *dobj_token_attr;
54 
55     /* Parse error */
56     lexbor_array_obj_t               *parse_errors;
57 
58     /*
59      * Leak abstractions.
60      * The only place where the specification causes mixing Tree Builder
61      * and Tokenizer. We kill all beauty.
62      * Current Tree parser. This is not ref (not ref count).
63      */
64     lxb_html_tree_t                  *tree;
65 
66     /* Temp */
67     const lxb_char_t                 *markup;
68     const lxb_char_t                 *temp;
69     lxb_tag_id_t                     tmp_tag_id;
70 
71     lxb_char_t                       *start;
72     lxb_char_t                       *pos;
73     const lxb_char_t                 *end;
74     const lxb_char_t                 *begin;
75     const lxb_char_t                 *last;
76     size_t                           current_line;
77     size_t                           current_column;
78 
79     /* Entities */
80     const lexbor_sbst_entry_static_t *entity;
81     const lexbor_sbst_entry_static_t *entity_match;
82     uintptr_t                        entity_start;
83     uintptr_t                        entity_end;
84     uint32_t                         entity_length;
85     uint32_t                         entity_number;
86     bool                             is_attribute;
87 
88     /* Process */
89     lxb_html_tokenizer_opt_t         opt;
90     lxb_status_t                     status;
91     bool                             is_eof;
92 
93     lxb_html_tokenizer_t             *base;
94     size_t                           ref_count;
95 };
96 
97 
98 #include "lexbor/html/tokenizer/error.h"
99 
100 
101 extern const lxb_char_t *lxb_html_tokenizer_eof;
102 
103 LXB_API lxb_html_tokenizer_t *
104 lxb_html_tokenizer_create(void);
105 
106 LXB_API lxb_status_t
107 lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz);
108 
109 LXB_API lxb_status_t
110 lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to,
111                            lxb_html_tokenizer_t *tkz_from);
112 
113 LXB_API lxb_html_tokenizer_t *
114 lxb_html_tokenizer_ref(lxb_html_tokenizer_t *tkz);
115 
116 LXB_API lxb_html_tokenizer_t *
117 lxb_html_tokenizer_unref(lxb_html_tokenizer_t *tkz);
118 
119 LXB_API void
120 lxb_html_tokenizer_clean(lxb_html_tokenizer_t *tkz);
121 
122 LXB_API lxb_html_tokenizer_t *
123 lxb_html_tokenizer_destroy(lxb_html_tokenizer_t *tkz);
124 
125 LXB_API lxb_status_t
126 lxb_html_tokenizer_tags_make(lxb_html_tokenizer_t *tkz, size_t table_size);
127 
128 LXB_API void
129 lxb_html_tokenizer_tags_destroy(lxb_html_tokenizer_t *tkz);
130 
131 LXB_API lxb_status_t
132 lxb_html_tokenizer_attrs_make(lxb_html_tokenizer_t *tkz, size_t table_size);
133 
134 LXB_API void
135 lxb_html_tokenizer_attrs_destroy(lxb_html_tokenizer_t *tkz);
136 
137 LXB_API lxb_status_t
138 lxb_html_tokenizer_begin(lxb_html_tokenizer_t *tkz);
139 
140 LXB_API lxb_status_t
141 lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz,
142                          const lxb_char_t *data, size_t size);
143 
144 LXB_API lxb_status_t
145 lxb_html_tokenizer_end(lxb_html_tokenizer_t *tkz);
146 
147 
148 LXB_API const lxb_char_t *
149 lxb_html_tokenizer_change_incoming(lxb_html_tokenizer_t *tkz,
150                                    const lxb_char_t *pos);
151 
152 LXB_API lxb_ns_id_t
153 lxb_html_tokenizer_current_namespace(lxb_html_tokenizer_t *tkz);
154 
155 LXB_API void
156 lxb_html_tokenizer_set_state_by_tag(lxb_html_tokenizer_t *tkz, bool scripting,
157                                     lxb_tag_id_t tag_id, lxb_ns_id_t ns);
158 
159 
160 /*
161  * Inline functions
162  */
163 lxb_inline void
lxb_html_tokenizer_status_set(lxb_html_tokenizer_t * tkz,lxb_status_t status)164 lxb_html_tokenizer_status_set(lxb_html_tokenizer_t *tkz, lxb_status_t status)
165 {
166     tkz->status = status;
167 }
168 
169 lxb_inline void
lxb_html_tokenizer_tags_set(lxb_html_tokenizer_t * tkz,lexbor_hash_t * tags)170 lxb_html_tokenizer_tags_set(lxb_html_tokenizer_t *tkz, lexbor_hash_t *tags)
171 {
172     tkz->tags = tags;
173 }
174 
175 lxb_inline lexbor_hash_t *
lxb_html_tokenizer_tags(lxb_html_tokenizer_t * tkz)176 lxb_html_tokenizer_tags(lxb_html_tokenizer_t *tkz)
177 {
178     return tkz->tags;
179 }
180 
181 lxb_inline void
lxb_html_tokenizer_attrs_set(lxb_html_tokenizer_t * tkz,lexbor_hash_t * attrs)182 lxb_html_tokenizer_attrs_set(lxb_html_tokenizer_t *tkz, lexbor_hash_t *attrs)
183 {
184     tkz->attrs = attrs;
185 }
186 
187 lxb_inline lexbor_hash_t *
lxb_html_tokenizer_attrs(lxb_html_tokenizer_t * tkz)188 lxb_html_tokenizer_attrs(lxb_html_tokenizer_t *tkz)
189 {
190     return tkz->attrs;
191 }
192 
193 lxb_inline void
lxb_html_tokenizer_attrs_mraw_set(lxb_html_tokenizer_t * tkz,lexbor_mraw_t * mraw)194 lxb_html_tokenizer_attrs_mraw_set(lxb_html_tokenizer_t *tkz,
195                                   lexbor_mraw_t *mraw)
196 {
197     tkz->attrs_mraw = mraw;
198 }
199 
200 lxb_inline lexbor_mraw_t *
lxb_html_tokenizer_attrs_mraw(lxb_html_tokenizer_t * tkz)201 lxb_html_tokenizer_attrs_mraw(lxb_html_tokenizer_t *tkz)
202 {
203     return tkz->attrs_mraw;
204 }
205 
206 lxb_inline void
lxb_html_tokenizer_callback_token_done_set(lxb_html_tokenizer_t * tkz,lxb_html_tokenizer_token_f call_func,void * ctx)207 lxb_html_tokenizer_callback_token_done_set(lxb_html_tokenizer_t *tkz,
208                                            lxb_html_tokenizer_token_f call_func,
209                                            void *ctx)
210 {
211     tkz->callback_token_done = call_func;
212     tkz->callback_token_ctx = ctx;
213 }
214 
215 lxb_inline void *
lxb_html_tokenizer_callback_token_done_ctx(lxb_html_tokenizer_t * tkz)216 lxb_html_tokenizer_callback_token_done_ctx(lxb_html_tokenizer_t *tkz)
217 {
218     return tkz->callback_token_ctx;
219 }
220 
221 lxb_inline void
lxb_html_tokenizer_state_set(lxb_html_tokenizer_t * tkz,lxb_html_tokenizer_state_f state)222 lxb_html_tokenizer_state_set(lxb_html_tokenizer_t *tkz,
223                              lxb_html_tokenizer_state_f state)
224 {
225     tkz->state = state;
226 }
227 
228 lxb_inline void
lxb_html_tokenizer_tmp_tag_id_set(lxb_html_tokenizer_t * tkz,lxb_tag_id_t tag_id)229 lxb_html_tokenizer_tmp_tag_id_set(lxb_html_tokenizer_t *tkz,
230                                   lxb_tag_id_t tag_id)
231 {
232     tkz->tmp_tag_id = tag_id;
233 }
234 
235 lxb_inline lxb_html_tree_t *
lxb_html_tokenizer_tree(lxb_html_tokenizer_t * tkz)236 lxb_html_tokenizer_tree(lxb_html_tokenizer_t *tkz)
237 {
238     return tkz->tree;
239 }
240 
241 lxb_inline void
lxb_html_tokenizer_tree_set(lxb_html_tokenizer_t * tkz,lxb_html_tree_t * tree)242 lxb_html_tokenizer_tree_set(lxb_html_tokenizer_t *tkz, lxb_html_tree_t *tree)
243 {
244     tkz->tree = tree;
245 }
246 
247 lxb_inline lexbor_mraw_t *
lxb_html_tokenizer_mraw(lxb_html_tokenizer_t * tkz)248 lxb_html_tokenizer_mraw(lxb_html_tokenizer_t *tkz)
249 {
250     return tkz->mraw;
251 }
252 
253 lxb_inline lxb_status_t
lxb_html_tokenizer_temp_realloc(lxb_html_tokenizer_t * tkz,size_t size)254 lxb_html_tokenizer_temp_realloc(lxb_html_tokenizer_t *tkz, size_t size)
255 {
256     size_t length = tkz->pos - tkz->start;
257     size_t new_size = (tkz->end - tkz->start) + size + 4096;
258 
259     tkz->start = (lxb_char_t *)lexbor_realloc(tkz->start, new_size);
260     if (tkz->start == NULL) {
261         tkz->status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
262         return tkz->status;
263     }
264 
265     tkz->pos = tkz->start + length;
266     tkz->end = tkz->start + new_size;
267 
268     return LXB_STATUS_OK;
269 }
270 
271 lxb_inline lxb_status_t
lxb_html_tokenizer_temp_append_data(lxb_html_tokenizer_t * tkz,const lxb_char_t * data)272 lxb_html_tokenizer_temp_append_data(lxb_html_tokenizer_t *tkz,
273                                     const lxb_char_t *data)
274 {
275     size_t size = data - tkz->begin;
276 
277     if ((tkz->pos + size) > tkz->end) {
278         if(lxb_html_tokenizer_temp_realloc(tkz, size)) {
279             return tkz->status;
280         }
281     }
282 
283     tkz->pos = (lxb_char_t *) memcpy(tkz->pos, tkz->begin, size) + size;
284 
285     return LXB_STATUS_OK;
286 }
287 
288 lxb_inline lxb_status_t
lxb_html_tokenizer_temp_append(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,size_t size)289 lxb_html_tokenizer_temp_append(lxb_html_tokenizer_t *tkz,
290                                const lxb_char_t *data, size_t size)
291 {
292     if ((tkz->pos + size) > tkz->end) {
293         if(lxb_html_tokenizer_temp_realloc(tkz, size)) {
294             return tkz->status;
295         }
296     }
297 
298     tkz->pos = (lxb_char_t *) memcpy(tkz->pos, data, size) + size;
299 
300     return LXB_STATUS_OK;
301 }
302 
303 
/*
 * Non-inline (_noi) versions of the inline functions above, exported for ABI.
 */
307 LXB_API void
308 lxb_html_tokenizer_status_set_noi(lxb_html_tokenizer_t *tkz,
309                                   lxb_status_t status);
310 
311 LXB_API void
312 lxb_html_tokenizer_callback_token_done_set_noi(lxb_html_tokenizer_t *tkz,
313                                                lxb_html_tokenizer_token_f call_func,
314                                                void *ctx);
315 
316 LXB_API void *
317 lxb_html_tokenizer_callback_token_done_ctx_noi(lxb_html_tokenizer_t *tkz);
318 
319 LXB_API void
320 lxb_html_tokenizer_state_set_noi(lxb_html_tokenizer_t *tkz,
321                                  lxb_html_tokenizer_state_f state);
322 
323 LXB_API void
324 lxb_html_tokenizer_tmp_tag_id_set_noi(lxb_html_tokenizer_t *tkz,
325                                       lxb_tag_id_t tag_id);
326 
327 LXB_API lxb_html_tree_t *
328 lxb_html_tokenizer_tree_noi(lxb_html_tokenizer_t *tkz);
329 
330 LXB_API void
331 lxb_html_tokenizer_tree_set_noi(lxb_html_tokenizer_t *tkz,
332                                 lxb_html_tree_t *tree);
333 
334 LXB_API lexbor_mraw_t *
335 lxb_html_tokenizer_mraw_noi(lxb_html_tokenizer_t *tkz);
336 
337 LXB_API lexbor_hash_t *
338 lxb_html_tokenizer_tags_noi(lxb_html_tokenizer_t *tkz);
339 
340 
341 #ifdef __cplusplus
342 } /* extern "C" */
343 #endif
344 
345 #endif /* LEXBOR_HTML_TOKENIZER_H */
346