xref: /php-src/ext/dom/lexbor/lexbor/html/parser.c (revision bffab33a)
1 /*
2  * Copyright (C) 2018-2021 Alexander Borisov
3  *
4  * Author: Alexander Borisov <borisov@lexbor.com>
5  */
6 
7 #include "lexbor/html/parser.h"
8 #include "lexbor/html/node.h"
9 #include "lexbor/html/tree/open_elements.h"
10 #include "lexbor/html/interfaces/element.h"
11 #include "lexbor/html/interfaces/html_element.h"
12 #include "lexbor/html/interfaces/form_element.h"
13 #include "lexbor/html/tree/template_insertion.h"
14 #include "lexbor/html/tree/insertion_mode.h"
15 
16 #define LXB_HTML_TAG_RES_DATA
17 #define LXB_HTML_TAG_RES_SHS_DATA
18 #include "lexbor/html/tag_res.h"
19 
20 
21 static void
22 lxb_html_parse_fragment_chunk_destroy(lxb_html_parser_t *parser);
23 
24 
25 lxb_html_parser_t *
lxb_html_parser_create(void)26 lxb_html_parser_create(void)
27 {
28     return lexbor_calloc(1, sizeof(lxb_html_parser_t));
29 }
30 
31 lxb_status_t
lxb_html_parser_init(lxb_html_parser_t * parser)32 lxb_html_parser_init(lxb_html_parser_t *parser)
33 {
34     if (parser == NULL) {
35         return LXB_STATUS_ERROR_OBJECT_IS_NULL;
36     }
37 
38     /* Tokenizer */
39     parser->tkz = lxb_html_tokenizer_create();
40     lxb_status_t status = lxb_html_tokenizer_init(parser->tkz);
41 
42     if (status != LXB_STATUS_OK) {
43         return status;
44     }
45 
46     /* Tree */
47     parser->tree = lxb_html_tree_create();
48     status = lxb_html_tree_init(parser->tree, parser->tkz);
49 
50     if (status != LXB_STATUS_OK) {
51         return status;
52     }
53 
54     parser->original_tree = NULL;
55     parser->form = NULL;
56     parser->root = NULL;
57 
58     parser->state = LXB_HTML_PARSER_STATE_BEGIN;
59 
60     parser->ref_count = 1;
61 
62     return LXB_STATUS_OK;
63 }
64 
65 void
lxb_html_parser_clean(lxb_html_parser_t * parser)66 lxb_html_parser_clean(lxb_html_parser_t *parser)
67 {
68     parser->original_tree = NULL;
69     parser->form = NULL;
70     parser->root = NULL;
71 
72     parser->state = LXB_HTML_PARSER_STATE_BEGIN;
73 
74     lxb_html_tokenizer_clean(parser->tkz);
75     lxb_html_tree_clean(parser->tree);
76 }
77 
78 lxb_html_parser_t *
lxb_html_parser_destroy(lxb_html_parser_t * parser)79 lxb_html_parser_destroy(lxb_html_parser_t *parser)
80 {
81     if (parser == NULL) {
82         return NULL;
83     }
84 
85     parser->tkz = lxb_html_tokenizer_unref(parser->tkz);
86     parser->tree = lxb_html_tree_unref(parser->tree);
87 
88     return lexbor_free(parser);
89 }
90 
91 lxb_html_parser_t *
lxb_html_parser_ref(lxb_html_parser_t * parser)92 lxb_html_parser_ref(lxb_html_parser_t *parser)
93 {
94     if (parser == NULL) {
95         return NULL;
96     }
97 
98     parser->ref_count++;
99 
100     return parser;
101 }
102 
103 lxb_html_parser_t *
lxb_html_parser_unref(lxb_html_parser_t * parser)104 lxb_html_parser_unref(lxb_html_parser_t *parser)
105 {
106     if (parser == NULL || parser->ref_count == 0) {
107         return NULL;
108     }
109 
110     parser->ref_count--;
111 
112     if (parser->ref_count == 0) {
113         lxb_html_parser_destroy(parser);
114     }
115 
116     return NULL;
117 }
118 
119 
120 lxb_html_document_t *
lxb_html_parse(lxb_html_parser_t * parser,const lxb_char_t * html,size_t size)121 lxb_html_parse(lxb_html_parser_t *parser, const lxb_char_t *html, size_t size)
122 {
123     lxb_html_document_t *document = lxb_html_parse_chunk_begin(parser);
124     if (document == NULL) {
125         return NULL;
126     }
127 
128     lxb_html_parse_chunk_process(parser, html, size);
129     if (parser->status != LXB_STATUS_OK) {
130         goto failed;
131     }
132 
133     lxb_html_parse_chunk_end(parser);
134     if (parser->status != LXB_STATUS_OK) {
135         goto failed;
136     }
137 
138     return document;
139 
140 failed:
141 
142     lxb_html_document_interface_destroy(document);
143 
144     return NULL;
145 }
146 
147 lxb_dom_node_t *
lxb_html_parse_fragment(lxb_html_parser_t * parser,lxb_html_element_t * element,const lxb_char_t * html,size_t size)148 lxb_html_parse_fragment(lxb_html_parser_t *parser, lxb_html_element_t *element,
149                         const lxb_char_t *html, size_t size)
150 {
151     return lxb_html_parse_fragment_by_tag_id(parser,
152                                              parser->tree->document,
153                                              element->element.node.local_name,
154                                              element->element.node.ns,
155                                              html, size);
156 }
157 
158 lxb_dom_node_t *
lxb_html_parse_fragment_by_tag_id(lxb_html_parser_t * parser,lxb_html_document_t * document,lxb_tag_id_t tag_id,lxb_ns_id_t ns,const lxb_char_t * html,size_t size)159 lxb_html_parse_fragment_by_tag_id(lxb_html_parser_t *parser,
160                                   lxb_html_document_t *document,
161                                   lxb_tag_id_t tag_id, lxb_ns_id_t ns,
162                                   const lxb_char_t *html, size_t size)
163 {
164     lxb_html_parse_fragment_chunk_begin(parser, document, tag_id, ns);
165     if (parser->status != LXB_STATUS_OK) {
166         return NULL;
167     }
168 
169     lxb_html_parse_fragment_chunk_process(parser, html, size);
170     if (parser->status != LXB_STATUS_OK) {
171         return NULL;
172     }
173 
174     return lxb_html_parse_fragment_chunk_end(parser);
175 }
176 
177 lxb_status_t
lxb_html_parse_fragment_chunk_begin(lxb_html_parser_t * parser,lxb_html_document_t * document,lxb_tag_id_t tag_id,lxb_ns_id_t ns)178 lxb_html_parse_fragment_chunk_begin(lxb_html_parser_t *parser,
179                                     lxb_html_document_t *document,
180                                     lxb_tag_id_t tag_id, lxb_ns_id_t ns)
181 {
182     lxb_dom_document_t *doc;
183     lxb_html_document_t *new_doc;
184 
185     if (parser->state != LXB_HTML_PARSER_STATE_BEGIN) {
186         lxb_html_parser_clean(parser);
187     }
188 
189     parser->state = LXB_HTML_PARSER_STATE_FRAGMENT_PROCESS;
190 
191     new_doc = lxb_html_document_interface_create(document);
192     if (new_doc == NULL) {
193         parser->state = LXB_HTML_PARSER_STATE_ERROR;
194         return parser->status;
195     }
196 
197     doc = lxb_dom_interface_document(new_doc);
198 
199     if (document == NULL) {
200         doc->scripting = parser->tree->scripting;
201         doc->compat_mode = LXB_DOM_DOCUMENT_CMODE_NO_QUIRKS;
202     }
203 
204     lxb_html_tokenizer_set_state_by_tag(parser->tkz, doc->scripting, tag_id, ns);
205 
206     parser->root = lxb_html_interface_create(new_doc, LXB_TAG_HTML, LXB_NS_HTML);
207     if (parser->root == NULL) {
208         parser->status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
209 
210         goto done;
211     }
212 
213     lxb_dom_node_insert_child_wo_events(lxb_dom_interface_node(new_doc),
214                                         parser->root);
215     lxb_dom_document_attach_element(doc, lxb_dom_interface_element(parser->root));
216 
217     parser->tree->fragment = lxb_html_interface_create(new_doc, tag_id, ns);
218     if (parser->tree->fragment == NULL) {
219         parser->status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
220 
221         goto done;
222     }
223 
224     /* Contains just the single element root */
225     parser->status = lxb_html_tree_open_elements_push(parser->tree, parser->root);
226     if (parser->status != LXB_STATUS_OK) {
227         goto done;
228     }
229 
230     if (tag_id == LXB_TAG_TEMPLATE && ns == LXB_NS_HTML) {
231         parser->status = lxb_html_tree_template_insertion_push(parser->tree,
232                                       lxb_html_tree_insertion_mode_in_template);
233         if (parser->status != LXB_STATUS_OK) {
234             goto done;
235         }
236     }
237 
238     lxb_html_tree_attach_document(parser->tree, new_doc);
239     lxb_html_tree_reset_insertion_mode_appropriately(parser->tree);
240 
241     if (tag_id == LXB_TAG_FORM && ns == LXB_NS_HTML) {
242         parser->form = lxb_html_interface_create(new_doc,
243                                                  LXB_TAG_FORM, LXB_NS_HTML);
244         if (parser->form == NULL) {
245             parser->status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
246 
247             goto done;
248         }
249 
250         parser->tree->form = lxb_html_interface_form(parser->form);
251     }
252 
253     parser->original_tree = lxb_html_tokenizer_tree(parser->tkz);
254     lxb_html_tokenizer_tree_set(parser->tkz, parser->tree);
255 
256     lxb_html_tokenizer_tags_set(parser->tkz, doc->tags);
257     lxb_html_tokenizer_attrs_set(parser->tkz, doc->attrs);
258     lxb_html_tokenizer_attrs_mraw_set(parser->tkz, doc->text);
259 
260     parser->status = lxb_html_tree_begin(parser->tree, new_doc);
261 
262 done:
263 
264     if (parser->status != LXB_STATUS_OK) {
265         if (parser->root != NULL) {
266             lxb_html_html_element_interface_destroy(lxb_html_interface_html(parser->root));
267         }
268 
269         parser->state = LXB_HTML_PARSER_STATE_ERROR;
270         parser->root = NULL;
271 
272         lxb_html_parse_fragment_chunk_destroy(parser);
273     }
274 
275     return parser->status;
276 }
277 
278 lxb_status_t
lxb_html_parse_fragment_chunk_process(lxb_html_parser_t * parser,const lxb_char_t * html,size_t size)279 lxb_html_parse_fragment_chunk_process(lxb_html_parser_t *parser,
280                                       const lxb_char_t *html, size_t size)
281 {
282     if (parser->state != LXB_HTML_PARSER_STATE_FRAGMENT_PROCESS) {
283         return LXB_STATUS_ERROR_WRONG_STAGE;
284     }
285 
286     parser->status = lxb_html_tree_chunk(parser->tree, html, size);
287     if (parser->status != LXB_STATUS_OK) {
288         lxb_html_html_element_interface_destroy(lxb_html_interface_html(parser->root));
289 
290         parser->state = LXB_HTML_PARSER_STATE_ERROR;
291         parser->root = NULL;
292 
293         lxb_html_parse_fragment_chunk_destroy(parser);
294     }
295 
296     return parser->status;
297 }
298 
299 lxb_dom_node_t *
lxb_html_parse_fragment_chunk_end(lxb_html_parser_t * parser)300 lxb_html_parse_fragment_chunk_end(lxb_html_parser_t *parser)
301 {
302     if (parser->state != LXB_HTML_PARSER_STATE_FRAGMENT_PROCESS) {
303         parser->status = LXB_STATUS_ERROR_WRONG_STAGE;
304 
305         return NULL;
306     }
307 
308     parser->status = lxb_html_tree_end(parser->tree);
309     if (parser->status != LXB_STATUS_OK) {
310         lxb_html_html_element_interface_destroy(lxb_html_interface_html(parser->root));
311 
312         parser->root = NULL;
313     }
314 
315     lxb_html_parse_fragment_chunk_destroy(parser);
316 
317     lxb_html_tokenizer_tree_set(parser->tkz, parser->original_tree);
318 
319     parser->state = LXB_HTML_PARSER_STATE_END;
320 
321     return parser->root;
322 }
323 
324 static void
lxb_html_parse_fragment_chunk_destroy(lxb_html_parser_t * parser)325 lxb_html_parse_fragment_chunk_destroy(lxb_html_parser_t *parser)
326 {
327     lxb_dom_document_t *doc;
328 
329     if (parser->form != NULL) {
330         lxb_html_form_element_interface_destroy(lxb_html_interface_form(parser->form));
331 
332         parser->form = NULL;
333     }
334 
335     if (parser->tree->fragment != NULL) {
336         lxb_html_interface_destroy(parser->tree->fragment);
337 
338         parser->tree->fragment = NULL;
339     }
340 
341     if (lxb_html_document_is_original(parser->tree->document) == false) {
342         if (parser->root != NULL) {
343             doc = lxb_dom_interface_node(parser->tree->document)->owner_document;
344             parser->root->parent = &doc->node;
345         }
346 
347         lxb_html_document_interface_destroy(parser->tree->document);
348 
349         parser->tree->document = NULL;
350     }
351 }
352 
353 LXB_API lxb_status_t
lxb_html_parse_chunk_prepare(lxb_html_parser_t * parser,lxb_html_document_t * document)354 lxb_html_parse_chunk_prepare(lxb_html_parser_t *parser,
355                              lxb_html_document_t *document)
356 {
357     parser->state = LXB_HTML_PARSER_STATE_PROCESS;
358 
359     parser->original_tree = lxb_html_tokenizer_tree(parser->tkz);
360     lxb_html_tokenizer_tree_set(parser->tkz, parser->tree);
361 
362     lxb_html_tokenizer_tags_set(parser->tkz, document->dom_document.tags);
363     lxb_html_tokenizer_attrs_set(parser->tkz, document->dom_document.attrs);
364     lxb_html_tokenizer_attrs_mraw_set(parser->tkz, document->dom_document.text);
365 
366     parser->status = lxb_html_tree_begin(parser->tree, document);
367     if (parser->status != LXB_STATUS_OK) {
368         parser->state = LXB_HTML_PARSER_STATE_ERROR;
369     }
370 
371     return parser->status;
372 }
373 
374 lxb_html_document_t *
lxb_html_parse_chunk_begin(lxb_html_parser_t * parser)375 lxb_html_parse_chunk_begin(lxb_html_parser_t *parser)
376 {
377     lxb_html_document_t *document;
378 
379     if (parser->state != LXB_HTML_PARSER_STATE_BEGIN) {
380         lxb_html_parser_clean(parser);
381     }
382 
383     document = lxb_html_document_interface_create(NULL);
384     if (document == NULL) {
385         parser->state = LXB_HTML_PARSER_STATE_ERROR;
386         parser->status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
387 
388         return lxb_html_document_destroy(document);
389     }
390 
391     document->dom_document.scripting = parser->tree->scripting;
392 
393     parser->status = lxb_html_parse_chunk_prepare(parser, document);
394     if (parser->status != LXB_STATUS_OK) {
395         return lxb_html_document_destroy(document);
396     }
397 
398     return document;
399 }
400 
401 lxb_status_t
lxb_html_parse_chunk_process(lxb_html_parser_t * parser,const lxb_char_t * html,size_t size)402 lxb_html_parse_chunk_process(lxb_html_parser_t *parser,
403                              const lxb_char_t *html, size_t size)
404 {
405     if (parser->state != LXB_HTML_PARSER_STATE_PROCESS) {
406         return LXB_STATUS_ERROR_WRONG_STAGE;
407     }
408 
409     parser->status = lxb_html_tree_chunk(parser->tree, html, size);
410     if (parser->status != LXB_STATUS_OK) {
411         parser->state = LXB_HTML_PARSER_STATE_ERROR;
412     }
413 
414     return parser->status;
415 }
416 
417 lxb_status_t
lxb_html_parse_chunk_end(lxb_html_parser_t * parser)418 lxb_html_parse_chunk_end(lxb_html_parser_t *parser)
419 {
420     if (parser->state != LXB_HTML_PARSER_STATE_PROCESS) {
421         return LXB_STATUS_ERROR_WRONG_STAGE;
422     }
423 
424     parser->status = lxb_html_tree_end(parser->tree);
425 
426     lxb_html_tokenizer_tree_set(parser->tkz, parser->original_tree);
427 
428     parser->state = LXB_HTML_PARSER_STATE_END;
429 
430     return parser->status;
431 }
432 
433 /*
434  * No inline functions for ABI.
435  */
436 lxb_html_tokenizer_t *
lxb_html_parser_tokenizer_noi(lxb_html_parser_t * parser)437 lxb_html_parser_tokenizer_noi(lxb_html_parser_t *parser)
438 {
439     return lxb_html_parser_tokenizer(parser);
440 }
441 
442 lxb_html_tree_t *
lxb_html_parser_tree_noi(lxb_html_parser_t * parser)443 lxb_html_parser_tree_noi(lxb_html_parser_t *parser)
444 {
445     return lxb_html_parser_tree(parser);
446 }
447 
448 lxb_status_t
lxb_html_parser_status_noi(lxb_html_parser_t * parser)449 lxb_html_parser_status_noi(lxb_html_parser_t *parser)
450 {
451     return lxb_html_parser_status(parser);
452 }
453 
454 lxb_status_t
lxb_html_parser_state_noi(lxb_html_parser_t * parser)455 lxb_html_parser_state_noi(lxb_html_parser_t *parser)
456 {
457     return lxb_html_parser_state(parser);
458 }
459 
460 bool
lxb_html_parser_scripting_noi(lxb_html_parser_t * parser)461 lxb_html_parser_scripting_noi(lxb_html_parser_t *parser)
462 {
463     return lxb_html_parser_scripting(parser);
464 }
465 
466 void
lxb_html_parser_scripting_set_noi(lxb_html_parser_t * parser,bool scripting)467 lxb_html_parser_scripting_set_noi(lxb_html_parser_t *parser, bool scripting)
468 {
469     lxb_html_parser_scripting_set(parser, scripting);
470 }
471