1 /*
2 * Copyright (C) 2018-2021 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
7 #include "lexbor/html/parser.h"
8 #include "lexbor/html/node.h"
9 #include "lexbor/html/tree/open_elements.h"
10 #include "lexbor/html/interfaces/element.h"
11 #include "lexbor/html/interfaces/html_element.h"
12 #include "lexbor/html/interfaces/form_element.h"
13 #include "lexbor/html/tree/template_insertion.h"
14 #include "lexbor/html/tree/insertion_mode.h"
15
16 #define LXB_HTML_TAG_RES_DATA
17 #define LXB_HTML_TAG_RES_SHS_DATA
18 #include "lexbor/html/tag_res.h"
19
20
21 static void
22 lxb_html_parse_fragment_chunk_destroy(lxb_html_parser_t *parser);
23
24
25 lxb_html_parser_t *
lxb_html_parser_create(void)26 lxb_html_parser_create(void)
27 {
28 return lexbor_calloc(1, sizeof(lxb_html_parser_t));
29 }
30
31 lxb_status_t
lxb_html_parser_init(lxb_html_parser_t * parser)32 lxb_html_parser_init(lxb_html_parser_t *parser)
33 {
34 if (parser == NULL) {
35 return LXB_STATUS_ERROR_OBJECT_IS_NULL;
36 }
37
38 /* Tokenizer */
39 parser->tkz = lxb_html_tokenizer_create();
40 lxb_status_t status = lxb_html_tokenizer_init(parser->tkz);
41
42 if (status != LXB_STATUS_OK) {
43 return status;
44 }
45
46 /* Tree */
47 parser->tree = lxb_html_tree_create();
48 status = lxb_html_tree_init(parser->tree, parser->tkz);
49
50 if (status != LXB_STATUS_OK) {
51 return status;
52 }
53
54 parser->original_tree = NULL;
55 parser->form = NULL;
56 parser->root = NULL;
57
58 parser->state = LXB_HTML_PARSER_STATE_BEGIN;
59
60 parser->ref_count = 1;
61
62 return LXB_STATUS_OK;
63 }
64
65 void
lxb_html_parser_clean(lxb_html_parser_t * parser)66 lxb_html_parser_clean(lxb_html_parser_t *parser)
67 {
68 parser->original_tree = NULL;
69 parser->form = NULL;
70 parser->root = NULL;
71
72 parser->state = LXB_HTML_PARSER_STATE_BEGIN;
73
74 lxb_html_tokenizer_clean(parser->tkz);
75 lxb_html_tree_clean(parser->tree);
76 }
77
78 lxb_html_parser_t *
lxb_html_parser_destroy(lxb_html_parser_t * parser)79 lxb_html_parser_destroy(lxb_html_parser_t *parser)
80 {
81 if (parser == NULL) {
82 return NULL;
83 }
84
85 parser->tkz = lxb_html_tokenizer_unref(parser->tkz);
86 parser->tree = lxb_html_tree_unref(parser->tree);
87
88 return lexbor_free(parser);
89 }
90
91 lxb_html_parser_t *
lxb_html_parser_ref(lxb_html_parser_t * parser)92 lxb_html_parser_ref(lxb_html_parser_t *parser)
93 {
94 if (parser == NULL) {
95 return NULL;
96 }
97
98 parser->ref_count++;
99
100 return parser;
101 }
102
103 lxb_html_parser_t *
lxb_html_parser_unref(lxb_html_parser_t * parser)104 lxb_html_parser_unref(lxb_html_parser_t *parser)
105 {
106 if (parser == NULL || parser->ref_count == 0) {
107 return NULL;
108 }
109
110 parser->ref_count--;
111
112 if (parser->ref_count == 0) {
113 lxb_html_parser_destroy(parser);
114 }
115
116 return NULL;
117 }
118
119
120 lxb_html_document_t *
lxb_html_parse(lxb_html_parser_t * parser,const lxb_char_t * html,size_t size)121 lxb_html_parse(lxb_html_parser_t *parser, const lxb_char_t *html, size_t size)
122 {
123 lxb_html_document_t *document = lxb_html_parse_chunk_begin(parser);
124 if (document == NULL) {
125 return NULL;
126 }
127
128 lxb_html_parse_chunk_process(parser, html, size);
129 if (parser->status != LXB_STATUS_OK) {
130 goto failed;
131 }
132
133 lxb_html_parse_chunk_end(parser);
134 if (parser->status != LXB_STATUS_OK) {
135 goto failed;
136 }
137
138 return document;
139
140 failed:
141
142 lxb_html_document_interface_destroy(document);
143
144 return NULL;
145 }
146
147 lxb_dom_node_t *
lxb_html_parse_fragment(lxb_html_parser_t * parser,lxb_html_element_t * element,const lxb_char_t * html,size_t size)148 lxb_html_parse_fragment(lxb_html_parser_t *parser, lxb_html_element_t *element,
149 const lxb_char_t *html, size_t size)
150 {
151 return lxb_html_parse_fragment_by_tag_id(parser,
152 parser->tree->document,
153 element->element.node.local_name,
154 element->element.node.ns,
155 html, size);
156 }
157
158 lxb_dom_node_t *
lxb_html_parse_fragment_by_tag_id(lxb_html_parser_t * parser,lxb_html_document_t * document,lxb_tag_id_t tag_id,lxb_ns_id_t ns,const lxb_char_t * html,size_t size)159 lxb_html_parse_fragment_by_tag_id(lxb_html_parser_t *parser,
160 lxb_html_document_t *document,
161 lxb_tag_id_t tag_id, lxb_ns_id_t ns,
162 const lxb_char_t *html, size_t size)
163 {
164 lxb_html_parse_fragment_chunk_begin(parser, document, tag_id, ns);
165 if (parser->status != LXB_STATUS_OK) {
166 return NULL;
167 }
168
169 lxb_html_parse_fragment_chunk_process(parser, html, size);
170 if (parser->status != LXB_STATUS_OK) {
171 return NULL;
172 }
173
174 return lxb_html_parse_fragment_chunk_end(parser);
175 }
176
177 lxb_status_t
lxb_html_parse_fragment_chunk_begin(lxb_html_parser_t * parser,lxb_html_document_t * document,lxb_tag_id_t tag_id,lxb_ns_id_t ns)178 lxb_html_parse_fragment_chunk_begin(lxb_html_parser_t *parser,
179 lxb_html_document_t *document,
180 lxb_tag_id_t tag_id, lxb_ns_id_t ns)
181 {
182 lxb_dom_document_t *doc;
183 lxb_html_document_t *new_doc;
184
185 if (parser->state != LXB_HTML_PARSER_STATE_BEGIN) {
186 lxb_html_parser_clean(parser);
187 }
188
189 parser->state = LXB_HTML_PARSER_STATE_FRAGMENT_PROCESS;
190
191 new_doc = lxb_html_document_interface_create(document);
192 if (new_doc == NULL) {
193 parser->state = LXB_HTML_PARSER_STATE_ERROR;
194 return parser->status;
195 }
196
197 doc = lxb_dom_interface_document(new_doc);
198
199 if (document == NULL) {
200 doc->scripting = parser->tree->scripting;
201 doc->compat_mode = LXB_DOM_DOCUMENT_CMODE_NO_QUIRKS;
202 }
203
204 lxb_html_tokenizer_set_state_by_tag(parser->tkz, doc->scripting, tag_id, ns);
205
206 parser->root = lxb_html_interface_create(new_doc, LXB_TAG_HTML, LXB_NS_HTML);
207 if (parser->root == NULL) {
208 parser->status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
209
210 goto done;
211 }
212
213 lxb_dom_node_insert_child_wo_events(lxb_dom_interface_node(new_doc),
214 parser->root);
215 lxb_dom_document_attach_element(doc, lxb_dom_interface_element(parser->root));
216
217 parser->tree->fragment = lxb_html_interface_create(new_doc, tag_id, ns);
218 if (parser->tree->fragment == NULL) {
219 parser->status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
220
221 goto done;
222 }
223
224 /* Contains just the single element root */
225 parser->status = lxb_html_tree_open_elements_push(parser->tree, parser->root);
226 if (parser->status != LXB_STATUS_OK) {
227 goto done;
228 }
229
230 if (tag_id == LXB_TAG_TEMPLATE && ns == LXB_NS_HTML) {
231 parser->status = lxb_html_tree_template_insertion_push(parser->tree,
232 lxb_html_tree_insertion_mode_in_template);
233 if (parser->status != LXB_STATUS_OK) {
234 goto done;
235 }
236 }
237
238 lxb_html_tree_attach_document(parser->tree, new_doc);
239 lxb_html_tree_reset_insertion_mode_appropriately(parser->tree);
240
241 if (tag_id == LXB_TAG_FORM && ns == LXB_NS_HTML) {
242 parser->form = lxb_html_interface_create(new_doc,
243 LXB_TAG_FORM, LXB_NS_HTML);
244 if (parser->form == NULL) {
245 parser->status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
246
247 goto done;
248 }
249
250 parser->tree->form = lxb_html_interface_form(parser->form);
251 }
252
253 parser->original_tree = lxb_html_tokenizer_tree(parser->tkz);
254 lxb_html_tokenizer_tree_set(parser->tkz, parser->tree);
255
256 lxb_html_tokenizer_tags_set(parser->tkz, doc->tags);
257 lxb_html_tokenizer_attrs_set(parser->tkz, doc->attrs);
258 lxb_html_tokenizer_attrs_mraw_set(parser->tkz, doc->text);
259
260 parser->status = lxb_html_tree_begin(parser->tree, new_doc);
261
262 done:
263
264 if (parser->status != LXB_STATUS_OK) {
265 if (parser->root != NULL) {
266 lxb_html_html_element_interface_destroy(lxb_html_interface_html(parser->root));
267 }
268
269 parser->state = LXB_HTML_PARSER_STATE_ERROR;
270 parser->root = NULL;
271
272 lxb_html_parse_fragment_chunk_destroy(parser);
273 }
274
275 return parser->status;
276 }
277
278 lxb_status_t
lxb_html_parse_fragment_chunk_process(lxb_html_parser_t * parser,const lxb_char_t * html,size_t size)279 lxb_html_parse_fragment_chunk_process(lxb_html_parser_t *parser,
280 const lxb_char_t *html, size_t size)
281 {
282 if (parser->state != LXB_HTML_PARSER_STATE_FRAGMENT_PROCESS) {
283 return LXB_STATUS_ERROR_WRONG_STAGE;
284 }
285
286 parser->status = lxb_html_tree_chunk(parser->tree, html, size);
287 if (parser->status != LXB_STATUS_OK) {
288 lxb_html_html_element_interface_destroy(lxb_html_interface_html(parser->root));
289
290 parser->state = LXB_HTML_PARSER_STATE_ERROR;
291 parser->root = NULL;
292
293 lxb_html_parse_fragment_chunk_destroy(parser);
294 }
295
296 return parser->status;
297 }
298
299 lxb_dom_node_t *
lxb_html_parse_fragment_chunk_end(lxb_html_parser_t * parser)300 lxb_html_parse_fragment_chunk_end(lxb_html_parser_t *parser)
301 {
302 if (parser->state != LXB_HTML_PARSER_STATE_FRAGMENT_PROCESS) {
303 parser->status = LXB_STATUS_ERROR_WRONG_STAGE;
304
305 return NULL;
306 }
307
308 parser->status = lxb_html_tree_end(parser->tree);
309 if (parser->status != LXB_STATUS_OK) {
310 lxb_html_html_element_interface_destroy(lxb_html_interface_html(parser->root));
311
312 parser->root = NULL;
313 }
314
315 lxb_html_parse_fragment_chunk_destroy(parser);
316
317 lxb_html_tokenizer_tree_set(parser->tkz, parser->original_tree);
318
319 parser->state = LXB_HTML_PARSER_STATE_END;
320
321 return parser->root;
322 }
323
324 static void
lxb_html_parse_fragment_chunk_destroy(lxb_html_parser_t * parser)325 lxb_html_parse_fragment_chunk_destroy(lxb_html_parser_t *parser)
326 {
327 lxb_dom_document_t *doc;
328
329 if (parser->form != NULL) {
330 lxb_html_form_element_interface_destroy(lxb_html_interface_form(parser->form));
331
332 parser->form = NULL;
333 }
334
335 if (parser->tree->fragment != NULL) {
336 lxb_html_interface_destroy(parser->tree->fragment);
337
338 parser->tree->fragment = NULL;
339 }
340
341 if (lxb_html_document_is_original(parser->tree->document) == false) {
342 if (parser->root != NULL) {
343 doc = lxb_dom_interface_node(parser->tree->document)->owner_document;
344 parser->root->parent = &doc->node;
345 }
346
347 lxb_html_document_interface_destroy(parser->tree->document);
348
349 parser->tree->document = NULL;
350 }
351 }
352
353 LXB_API lxb_status_t
lxb_html_parse_chunk_prepare(lxb_html_parser_t * parser,lxb_html_document_t * document)354 lxb_html_parse_chunk_prepare(lxb_html_parser_t *parser,
355 lxb_html_document_t *document)
356 {
357 parser->state = LXB_HTML_PARSER_STATE_PROCESS;
358
359 parser->original_tree = lxb_html_tokenizer_tree(parser->tkz);
360 lxb_html_tokenizer_tree_set(parser->tkz, parser->tree);
361
362 lxb_html_tokenizer_tags_set(parser->tkz, document->dom_document.tags);
363 lxb_html_tokenizer_attrs_set(parser->tkz, document->dom_document.attrs);
364 lxb_html_tokenizer_attrs_mraw_set(parser->tkz, document->dom_document.text);
365
366 parser->status = lxb_html_tree_begin(parser->tree, document);
367 if (parser->status != LXB_STATUS_OK) {
368 parser->state = LXB_HTML_PARSER_STATE_ERROR;
369 }
370
371 return parser->status;
372 }
373
374 lxb_html_document_t *
lxb_html_parse_chunk_begin(lxb_html_parser_t * parser)375 lxb_html_parse_chunk_begin(lxb_html_parser_t *parser)
376 {
377 lxb_html_document_t *document;
378
379 if (parser->state != LXB_HTML_PARSER_STATE_BEGIN) {
380 lxb_html_parser_clean(parser);
381 }
382
383 document = lxb_html_document_interface_create(NULL);
384 if (document == NULL) {
385 parser->state = LXB_HTML_PARSER_STATE_ERROR;
386 parser->status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
387
388 return lxb_html_document_destroy(document);
389 }
390
391 document->dom_document.scripting = parser->tree->scripting;
392
393 parser->status = lxb_html_parse_chunk_prepare(parser, document);
394 if (parser->status != LXB_STATUS_OK) {
395 return lxb_html_document_destroy(document);
396 }
397
398 return document;
399 }
400
401 lxb_status_t
lxb_html_parse_chunk_process(lxb_html_parser_t * parser,const lxb_char_t * html,size_t size)402 lxb_html_parse_chunk_process(lxb_html_parser_t *parser,
403 const lxb_char_t *html, size_t size)
404 {
405 if (parser->state != LXB_HTML_PARSER_STATE_PROCESS) {
406 return LXB_STATUS_ERROR_WRONG_STAGE;
407 }
408
409 parser->status = lxb_html_tree_chunk(parser->tree, html, size);
410 if (parser->status != LXB_STATUS_OK) {
411 parser->state = LXB_HTML_PARSER_STATE_ERROR;
412 }
413
414 return parser->status;
415 }
416
417 lxb_status_t
lxb_html_parse_chunk_end(lxb_html_parser_t * parser)418 lxb_html_parse_chunk_end(lxb_html_parser_t *parser)
419 {
420 if (parser->state != LXB_HTML_PARSER_STATE_PROCESS) {
421 return LXB_STATUS_ERROR_WRONG_STAGE;
422 }
423
424 parser->status = lxb_html_tree_end(parser->tree);
425
426 lxb_html_tokenizer_tree_set(parser->tkz, parser->original_tree);
427
428 parser->state = LXB_HTML_PARSER_STATE_END;
429
430 return parser->status;
431 }
432
/*
 * Non-inline versions of the parser accessors, provided for the ABI.
 */
436 lxb_html_tokenizer_t *
lxb_html_parser_tokenizer_noi(lxb_html_parser_t * parser)437 lxb_html_parser_tokenizer_noi(lxb_html_parser_t *parser)
438 {
439 return lxb_html_parser_tokenizer(parser);
440 }
441
442 lxb_html_tree_t *
lxb_html_parser_tree_noi(lxb_html_parser_t * parser)443 lxb_html_parser_tree_noi(lxb_html_parser_t *parser)
444 {
445 return lxb_html_parser_tree(parser);
446 }
447
448 lxb_status_t
lxb_html_parser_status_noi(lxb_html_parser_t * parser)449 lxb_html_parser_status_noi(lxb_html_parser_t *parser)
450 {
451 return lxb_html_parser_status(parser);
452 }
453
454 lxb_status_t
lxb_html_parser_state_noi(lxb_html_parser_t * parser)455 lxb_html_parser_state_noi(lxb_html_parser_t *parser)
456 {
457 return lxb_html_parser_state(parser);
458 }
459
460 bool
lxb_html_parser_scripting_noi(lxb_html_parser_t * parser)461 lxb_html_parser_scripting_noi(lxb_html_parser_t *parser)
462 {
463 return lxb_html_parser_scripting(parser);
464 }
465
466 void
lxb_html_parser_scripting_set_noi(lxb_html_parser_t * parser,bool scripting)467 lxb_html_parser_scripting_set_noi(lxb_html_parser_t *parser, bool scripting)
468 {
469 lxb_html_parser_scripting_set(parser, scripting);
470 }
471