1 /*
2 * Copyright (C) 2018-2020 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
7 #ifndef LEXBOR_HTML_TOKENIZER_H
8 #define LEXBOR_HTML_TOKENIZER_H
9
10 #ifdef __cplusplus
11 extern "C" {
12 #endif
13
14 #include "lexbor/core/sbst.h"
15 #include "lexbor/core/array_obj.h"
16
17 #include "lexbor/html/base.h"
18 #include "lexbor/html/token.h"
19
20 #include "lexbor/tag/tag.h"
21 #include "lexbor/ns/ns.h"
22
23
24 /* State */
25 typedef const lxb_char_t *
26 (*lxb_html_tokenizer_state_f)(lxb_html_tokenizer_t *tkz,
27 const lxb_char_t *data, const lxb_char_t *end);
28
29 typedef lxb_html_token_t *
30 (*lxb_html_tokenizer_token_f)(lxb_html_tokenizer_t *tkz,
31 lxb_html_token_t *token, void *ctx);
32
33
34 struct lxb_html_tokenizer {
35 lxb_html_tokenizer_state_f state;
36 lxb_html_tokenizer_state_f state_return;
37
38 lxb_html_tokenizer_token_f callback_token_done;
39 void *callback_token_ctx;
40
41 lexbor_hash_t *tags;
42 lexbor_hash_t *attrs;
43 lexbor_mraw_t *attrs_mraw;
44
45 /* For a temp strings and other templary data */
46 lexbor_mraw_t *mraw;
47
48 /* Current process token */
49 lxb_html_token_t *token;
50
51 /* Memory for token and attr */
52 lexbor_dobject_t *dobj_token;
53 lexbor_dobject_t *dobj_token_attr;
54
55 /* Parse error */
56 lexbor_array_obj_t *parse_errors;
57
58 /*
59 * Leak abstractions.
60 * The only place where the specification causes mixing Tree Builder
61 * and Tokenizer. We kill all beauty.
62 * Current Tree parser. This is not ref (not ref count).
63 */
64 lxb_html_tree_t *tree;
65
66 /* Temp */
67 const lxb_char_t *markup;
68 const lxb_char_t *temp;
69 lxb_tag_id_t tmp_tag_id;
70
71 lxb_char_t *start;
72 lxb_char_t *pos;
73 const lxb_char_t *end;
74 const lxb_char_t *begin;
75 const lxb_char_t *last;
76 size_t current_line;
77 size_t current_column;
78
79 /* Entities */
80 const lexbor_sbst_entry_static_t *entity;
81 const lexbor_sbst_entry_static_t *entity_match;
82 uintptr_t entity_start;
83 uintptr_t entity_end;
84 uint32_t entity_length;
85 uint32_t entity_number;
86 bool is_attribute;
87
88 /* Process */
89 lxb_html_tokenizer_opt_t opt;
90 lxb_status_t status;
91 bool is_eof;
92
93 lxb_html_tokenizer_t *base;
94 size_t ref_count;
95 };
96
97
98 #include "lexbor/html/tokenizer/error.h"
99
100
101 extern const lxb_char_t *lxb_html_tokenizer_eof;
102
103 LXB_API lxb_html_tokenizer_t *
104 lxb_html_tokenizer_create(void);
105
106 LXB_API lxb_status_t
107 lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz);
108
109 LXB_API lxb_status_t
110 lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to,
111 lxb_html_tokenizer_t *tkz_from);
112
113 LXB_API lxb_html_tokenizer_t *
114 lxb_html_tokenizer_ref(lxb_html_tokenizer_t *tkz);
115
116 LXB_API lxb_html_tokenizer_t *
117 lxb_html_tokenizer_unref(lxb_html_tokenizer_t *tkz);
118
119 LXB_API void
120 lxb_html_tokenizer_clean(lxb_html_tokenizer_t *tkz);
121
122 LXB_API lxb_html_tokenizer_t *
123 lxb_html_tokenizer_destroy(lxb_html_tokenizer_t *tkz);
124
125 LXB_API lxb_status_t
126 lxb_html_tokenizer_tags_make(lxb_html_tokenizer_t *tkz, size_t table_size);
127
128 LXB_API void
129 lxb_html_tokenizer_tags_destroy(lxb_html_tokenizer_t *tkz);
130
131 LXB_API lxb_status_t
132 lxb_html_tokenizer_attrs_make(lxb_html_tokenizer_t *tkz, size_t table_size);
133
134 LXB_API void
135 lxb_html_tokenizer_attrs_destroy(lxb_html_tokenizer_t *tkz);
136
137 LXB_API lxb_status_t
138 lxb_html_tokenizer_begin(lxb_html_tokenizer_t *tkz);
139
140 LXB_API lxb_status_t
141 lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz,
142 const lxb_char_t *data, size_t size);
143
144 LXB_API lxb_status_t
145 lxb_html_tokenizer_end(lxb_html_tokenizer_t *tkz);
146
147
148 LXB_API const lxb_char_t *
149 lxb_html_tokenizer_change_incoming(lxb_html_tokenizer_t *tkz,
150 const lxb_char_t *pos);
151
152 LXB_API lxb_ns_id_t
153 lxb_html_tokenizer_current_namespace(lxb_html_tokenizer_t *tkz);
154
155 LXB_API void
156 lxb_html_tokenizer_set_state_by_tag(lxb_html_tokenizer_t *tkz, bool scripting,
157 lxb_tag_id_t tag_id, lxb_ns_id_t ns);
158
159
160 /*
161 * Inline functions
162 */
163 lxb_inline void
lxb_html_tokenizer_status_set(lxb_html_tokenizer_t * tkz,lxb_status_t status)164 lxb_html_tokenizer_status_set(lxb_html_tokenizer_t *tkz, lxb_status_t status)
165 {
166 tkz->status = status;
167 }
168
169 lxb_inline void
lxb_html_tokenizer_tags_set(lxb_html_tokenizer_t * tkz,lexbor_hash_t * tags)170 lxb_html_tokenizer_tags_set(lxb_html_tokenizer_t *tkz, lexbor_hash_t *tags)
171 {
172 tkz->tags = tags;
173 }
174
175 lxb_inline lexbor_hash_t *
lxb_html_tokenizer_tags(lxb_html_tokenizer_t * tkz)176 lxb_html_tokenizer_tags(lxb_html_tokenizer_t *tkz)
177 {
178 return tkz->tags;
179 }
180
181 lxb_inline void
lxb_html_tokenizer_attrs_set(lxb_html_tokenizer_t * tkz,lexbor_hash_t * attrs)182 lxb_html_tokenizer_attrs_set(lxb_html_tokenizer_t *tkz, lexbor_hash_t *attrs)
183 {
184 tkz->attrs = attrs;
185 }
186
187 lxb_inline lexbor_hash_t *
lxb_html_tokenizer_attrs(lxb_html_tokenizer_t * tkz)188 lxb_html_tokenizer_attrs(lxb_html_tokenizer_t *tkz)
189 {
190 return tkz->attrs;
191 }
192
193 lxb_inline void
lxb_html_tokenizer_attrs_mraw_set(lxb_html_tokenizer_t * tkz,lexbor_mraw_t * mraw)194 lxb_html_tokenizer_attrs_mraw_set(lxb_html_tokenizer_t *tkz,
195 lexbor_mraw_t *mraw)
196 {
197 tkz->attrs_mraw = mraw;
198 }
199
200 lxb_inline lexbor_mraw_t *
lxb_html_tokenizer_attrs_mraw(lxb_html_tokenizer_t * tkz)201 lxb_html_tokenizer_attrs_mraw(lxb_html_tokenizer_t *tkz)
202 {
203 return tkz->attrs_mraw;
204 }
205
206 lxb_inline void
lxb_html_tokenizer_callback_token_done_set(lxb_html_tokenizer_t * tkz,lxb_html_tokenizer_token_f call_func,void * ctx)207 lxb_html_tokenizer_callback_token_done_set(lxb_html_tokenizer_t *tkz,
208 lxb_html_tokenizer_token_f call_func,
209 void *ctx)
210 {
211 tkz->callback_token_done = call_func;
212 tkz->callback_token_ctx = ctx;
213 }
214
215 lxb_inline void *
lxb_html_tokenizer_callback_token_done_ctx(lxb_html_tokenizer_t * tkz)216 lxb_html_tokenizer_callback_token_done_ctx(lxb_html_tokenizer_t *tkz)
217 {
218 return tkz->callback_token_ctx;
219 }
220
221 lxb_inline void
lxb_html_tokenizer_state_set(lxb_html_tokenizer_t * tkz,lxb_html_tokenizer_state_f state)222 lxb_html_tokenizer_state_set(lxb_html_tokenizer_t *tkz,
223 lxb_html_tokenizer_state_f state)
224 {
225 tkz->state = state;
226 }
227
228 lxb_inline void
lxb_html_tokenizer_tmp_tag_id_set(lxb_html_tokenizer_t * tkz,lxb_tag_id_t tag_id)229 lxb_html_tokenizer_tmp_tag_id_set(lxb_html_tokenizer_t *tkz,
230 lxb_tag_id_t tag_id)
231 {
232 tkz->tmp_tag_id = tag_id;
233 }
234
235 lxb_inline lxb_html_tree_t *
lxb_html_tokenizer_tree(lxb_html_tokenizer_t * tkz)236 lxb_html_tokenizer_tree(lxb_html_tokenizer_t *tkz)
237 {
238 return tkz->tree;
239 }
240
241 lxb_inline void
lxb_html_tokenizer_tree_set(lxb_html_tokenizer_t * tkz,lxb_html_tree_t * tree)242 lxb_html_tokenizer_tree_set(lxb_html_tokenizer_t *tkz, lxb_html_tree_t *tree)
243 {
244 tkz->tree = tree;
245 }
246
247 lxb_inline lexbor_mraw_t *
lxb_html_tokenizer_mraw(lxb_html_tokenizer_t * tkz)248 lxb_html_tokenizer_mraw(lxb_html_tokenizer_t *tkz)
249 {
250 return tkz->mraw;
251 }
252
253 lxb_inline lxb_status_t
lxb_html_tokenizer_temp_realloc(lxb_html_tokenizer_t * tkz,size_t size)254 lxb_html_tokenizer_temp_realloc(lxb_html_tokenizer_t *tkz, size_t size)
255 {
256 size_t length = tkz->pos - tkz->start;
257 size_t new_size = (tkz->end - tkz->start) + size + 4096;
258
259 tkz->start = (lxb_char_t *)lexbor_realloc(tkz->start, new_size);
260 if (tkz->start == NULL) {
261 tkz->status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
262 return tkz->status;
263 }
264
265 tkz->pos = tkz->start + length;
266 tkz->end = tkz->start + new_size;
267
268 return LXB_STATUS_OK;
269 }
270
271 lxb_inline lxb_status_t
lxb_html_tokenizer_temp_append_data(lxb_html_tokenizer_t * tkz,const lxb_char_t * data)272 lxb_html_tokenizer_temp_append_data(lxb_html_tokenizer_t *tkz,
273 const lxb_char_t *data)
274 {
275 size_t size = data - tkz->begin;
276
277 if ((tkz->pos + size) > tkz->end) {
278 if(lxb_html_tokenizer_temp_realloc(tkz, size)) {
279 return tkz->status;
280 }
281 }
282
283 tkz->pos = (lxb_char_t *) memcpy(tkz->pos, tkz->begin, size) + size;
284
285 return LXB_STATUS_OK;
286 }
287
288 lxb_inline lxb_status_t
lxb_html_tokenizer_temp_append(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,size_t size)289 lxb_html_tokenizer_temp_append(lxb_html_tokenizer_t *tkz,
290 const lxb_char_t *data, size_t size)
291 {
292 if ((tkz->pos + size) > tkz->end) {
293 if(lxb_html_tokenizer_temp_realloc(tkz, size)) {
294 return tkz->status;
295 }
296 }
297
298 tkz->pos = (lxb_char_t *) memcpy(tkz->pos, data, size) + size;
299
300 return LXB_STATUS_OK;
301 }
302
303
304 /*
305 * No inline functions for ABI.
306 */
307 LXB_API void
308 lxb_html_tokenizer_status_set_noi(lxb_html_tokenizer_t *tkz,
309 lxb_status_t status);
310
311 LXB_API void
312 lxb_html_tokenizer_callback_token_done_set_noi(lxb_html_tokenizer_t *tkz,
313 lxb_html_tokenizer_token_f call_func,
314 void *ctx);
315
316 LXB_API void *
317 lxb_html_tokenizer_callback_token_done_ctx_noi(lxb_html_tokenizer_t *tkz);
318
319 LXB_API void
320 lxb_html_tokenizer_state_set_noi(lxb_html_tokenizer_t *tkz,
321 lxb_html_tokenizer_state_f state);
322
323 LXB_API void
324 lxb_html_tokenizer_tmp_tag_id_set_noi(lxb_html_tokenizer_t *tkz,
325 lxb_tag_id_t tag_id);
326
327 LXB_API lxb_html_tree_t *
328 lxb_html_tokenizer_tree_noi(lxb_html_tokenizer_t *tkz);
329
330 LXB_API void
331 lxb_html_tokenizer_tree_set_noi(lxb_html_tokenizer_t *tkz,
332 lxb_html_tree_t *tree);
333
334 LXB_API lexbor_mraw_t *
335 lxb_html_tokenizer_mraw_noi(lxb_html_tokenizer_t *tkz);
336
337 LXB_API lexbor_hash_t *
338 lxb_html_tokenizer_tags_noi(lxb_html_tokenizer_t *tkz);
339
340
341 #ifdef __cplusplus
342 } /* extern "C" */
343 #endif
344
345 #endif /* LEXBOR_HTML_TOKENIZER_H */
346