xref: /php-src/ext/dom/html5_parser.c (revision 6980eba8)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Authors: Niels Dossche <nielsdos@php.net>                            |
14    +----------------------------------------------------------------------+
15 */
16 
17 #ifdef HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20 
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "php_dom.h"
24 #include "html5_parser.h"
25 #include "private_data.h"
26 #include <lexbor/html/parser.h>
27 #include <lexbor/html/interfaces/element.h>
28 #include <lexbor/html/interfaces/template_element.h>
29 #include <lexbor/dom/dom.h>
30 #include <libxml/parserInternals.h>
31 #include <libxml/HTMLtree.h>
32 
33 #define WORK_LIST_INIT_SIZE 128
34 /* libxml2 reserves 2 pointer-sized words for interned strings */
35 #define LXML_INTERNED_STRINGS_SIZE (sizeof(void *) * 2)
36 
37 typedef struct work_list_item {
38     lxb_dom_node_t *node;
39     uintptr_t current_active_namespace;
40     xmlNodePtr lxml_parent;
41     xmlNsPtr lxml_ns;
42 } work_list_item;
43 
/* Appends one entry to the work list.
 *
 * The slot is obtained with lexbor_array_obj_push_wo_cls() and every field of
 * it is then filled in from the arguments.
 *
 * array:                    work list holding work_list_item entries
 * node:                     lexbor node to convert later
 * current_active_namespace: lexbor namespace id active at the parent of `node`
 * lxml_parent:              libxml2 node the converted node will be added to
 * lxml_ns:                  libxml2 namespace inherited from the parent, may be NULL */
static void lexbor_libxml2_bridge_work_list_item_push(
    lexbor_array_obj_t *array,
    lxb_dom_node_t *node,
    uintptr_t current_active_namespace,
    xmlNodePtr lxml_parent,
    xmlNsPtr lxml_ns
)
{
    work_list_item *slot = lexbor_array_obj_push_wo_cls(array);

    *slot = (work_list_item) {
        .node = node,
        .current_active_namespace = current_active_namespace,
        .lxml_parent = lxml_parent,
        .lxml_ns = lxml_ns,
    };
}
58 
/* Clamps a line number to the range of an unsigned short.
 * Values above USHRT_MAX saturate to USHRT_MAX; everything else is returned as-is. */
static unsigned short sanitize_line_nr(size_t line)
{
    return line > USHRT_MAX ? USHRT_MAX : (unsigned short) line;
}
66 
67 struct lxml_ns {
68 	const php_dom_ns_magic_token *token;
69 	const char *href;
70 	size_t href_len;
71 };
72 
get_libxml_namespace_href(uintptr_t lexbor_namespace)73 static struct lxml_ns get_libxml_namespace_href(uintptr_t lexbor_namespace)
74 {
75     if (lexbor_namespace == LXB_NS_SVG) {
76         return (struct lxml_ns) { php_dom_ns_is_svg_magic_token, ZEND_STRL(DOM_SVG_NS_URI) };
77     } else if (lexbor_namespace == LXB_NS_MATH) {
78         return (struct lxml_ns) { php_dom_ns_is_mathml_magic_token, ZEND_STRL(DOM_MATHML_NS_URI) };
79     } else {
80         return (struct lxml_ns) { php_dom_ns_is_html_magic_token, ZEND_STRL(DOM_XHTML_NS_URI) };
81     }
82 }
83 
lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc,const lxb_char_t * data,size_t data_length,bool compact_text_nodes)84 static zend_always_inline xmlNodePtr lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc, const lxb_char_t *data, size_t data_length, bool compact_text_nodes)
85 {
86     if (compact_text_nodes && data_length < LXML_INTERNED_STRINGS_SIZE) {
87         /* See xmlSAX2TextNode() in libxml2 */
88         xmlNodePtr lxml_text = xmlMalloc(sizeof(*lxml_text));
89         if (UNEXPECTED(lxml_text == NULL)) {
90             return NULL;
91         }
92         memset(lxml_text, 0, sizeof(*lxml_text));
93         lxml_text->name = xmlStringText;
94         lxml_text->type = XML_TEXT_NODE;
95         lxml_text->doc = lxml_doc;
96         lxml_text->content = BAD_CAST &lxml_text->properties;
97         if (data != NULL) {
98             memcpy(lxml_text->content, data, data_length);
99         }
100         return lxml_text;
101     } else {
102         return xmlNewDocTextLen(lxml_doc, (const xmlChar *) data, data_length);
103     }
104 }
105 
lexbor_libxml2_bridge_convert(lxb_dom_node_t * start_node,xmlDocPtr lxml_doc,xmlNodePtr root,bool compact_text_nodes,bool create_default_ns,php_dom_private_data * private_data)106 static lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(
107     lxb_dom_node_t *start_node,
108     xmlDocPtr lxml_doc,
109     xmlNodePtr root,
110     bool compact_text_nodes,
111     bool create_default_ns,
112     php_dom_private_data *private_data
113 )
114 {
115     lexbor_libxml2_bridge_status retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
116 
117 	php_dom_libxml_ns_mapper *ns_mapper = php_dom_ns_mapper_from_private(private_data);
118     xmlNsPtr html_ns = php_dom_libxml_ns_mapper_ensure_html_ns(ns_mapper);
119     xmlNsPtr xlink_ns = NULL;
120     xmlNsPtr prefixed_xmlns_ns = NULL;
121 
122     lexbor_array_obj_t work_list;
123     lexbor_array_obj_init(&work_list, WORK_LIST_INIT_SIZE, sizeof(work_list_item));
124 
125     for (lxb_dom_node_t *node = start_node; node != NULL; node = node->prev) {
126         lexbor_libxml2_bridge_work_list_item_push(&work_list, node, LXB_NS__UNDEF, root, NULL);
127     }
128 
129     work_list_item *current_stack_item;
130     while ((current_stack_item = lexbor_array_obj_pop(&work_list)) != NULL) {
131         lxb_dom_node_t *node = current_stack_item->node;
132         xmlNodePtr lxml_parent = current_stack_item->lxml_parent;
133 
134         /* CDATA section and processing instructions don't occur in parsed HTML documents.
135          * The historical types are not emitted by the parser either. */
136         if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
137             /* Note: HTML isn't exactly XML-namespace-aware; as this is an HTML parser we only care about the local name.
138              * If a prefix:name format is used, then the local name will be "prefix:name" and the prefix will be empty.
139              * There is however still somewhat of a concept of namespaces. There are three: HTML (the default), SVG, and MATHML. */
140             lxb_dom_element_t *element = lxb_dom_interface_element(node);
141             const lxb_char_t *name = lxb_dom_element_local_name(element, NULL);
142             xmlNodePtr lxml_element = xmlNewDocNode(lxml_doc, NULL, name, NULL);
143             if (UNEXPECTED(lxml_element == NULL)) {
144                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
145                 break;
146             }
147             xmlAddChild(lxml_parent, lxml_element);
148             lxml_element->line = sanitize_line_nr(node->line);
149 
150             /* Namespaces, note: namespace switches are uncommon */
151             uintptr_t entering_namespace = element->node.ns;
152             xmlNsPtr current_lxml_ns = current_stack_item->lxml_ns;
153             if (create_default_ns && UNEXPECTED(entering_namespace != current_stack_item->current_active_namespace)) {
154                 if (entering_namespace == LXB_NS_HTML) {
155                     current_lxml_ns = html_ns;
156                 } else {
157 					struct lxml_ns ns = get_libxml_namespace_href(entering_namespace);
158                     zend_string *uri = zend_string_init(ns.href, ns.href_len, false);
159                     current_lxml_ns = php_dom_libxml_ns_mapper_get_ns(ns_mapper, NULL, uri);
160                     zend_string_release_ex(uri, false);
161                     if (EXPECTED(current_lxml_ns != NULL)) {
162                         current_lxml_ns->_private = (void *) ns.token;
163                     }
164                 }
165             }
166             /* Instead of xmlSetNs() because we know the arguments are valid. Prevents overhead. */
167             lxml_element->ns = current_lxml_ns;
168 
169 			/* Handle template element by creating a fragment node to contain its children.
170 			 * Other types of nodes contain their children directly. */
171 			xmlNodePtr lxml_child_parent = lxml_element;
172 			lxb_dom_node_t *child_node = element->node.last_child;
173 			if (lxb_html_tree_node_is(&element->node, LXB_TAG_TEMPLATE)) {
174 				if (create_default_ns) {
175 					lxml_child_parent = xmlNewDocFragment(lxml_doc);
176 					if (UNEXPECTED(lxml_child_parent == NULL)) {
177 						retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
178 						break;
179 					}
180 
181 					lxml_child_parent->parent = lxml_element;
182 					dom_add_element_ns_hook(private_data, lxml_element);
183 					php_dom_add_templated_content(private_data, lxml_element, lxml_child_parent);
184 				}
185 
186 				lxb_html_template_element_t *template = lxb_html_interface_template(&element->node);
187 				if (template->content != NULL) {
188 					child_node = template->content->node.last_child;
189 				}
190 			}
191 
192             for (; child_node != NULL; child_node = child_node->prev) {
193                 lexbor_libxml2_bridge_work_list_item_push(
194                     &work_list,
195                     child_node,
196                     entering_namespace,
197                     lxml_child_parent,
198                     current_lxml_ns
199                 );
200             }
201 
202             xmlAttrPtr last_added_attr = NULL;
203             for (lxb_dom_attr_t *attr = element->first_attr; attr != NULL; attr = attr->next) {
204                 /* Same namespace remark as for elements */
205                 size_t local_name_length, value_length;
206                 const lxb_char_t *local_name = lxb_dom_attr_local_name(attr, &local_name_length);
207                 const lxb_char_t *value = lxb_dom_attr_value(attr, &value_length);
208 
209                 if (UNEXPECTED(local_name_length >= INT_MAX || value_length >= INT_MAX)) {
210                     retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
211                     break;
212                 }
213 
214                 xmlAttrPtr lxml_attr = xmlMalloc(sizeof(xmlAttr));
215                 if (UNEXPECTED(lxml_attr == NULL)) {
216                     retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
217                     break;
218                 }
219 
220                 memset(lxml_attr, 0, sizeof(xmlAttr));
221                 lxml_attr->type = XML_ATTRIBUTE_NODE;
222                 lxml_attr->parent = lxml_element;
223                 lxml_attr->name = xmlDictLookup(lxml_doc->dict, local_name, local_name_length);
224                 lxml_attr->doc = lxml_doc;
225                 xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, value, value_length, true /* Always true for optimization purposes */);
226                 if (UNEXPECTED(lxml_text == NULL)) {
227                     xmlFreeProp(lxml_attr);
228                     retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
229                     break;
230                 }
231 
232                 lxml_attr->children = lxml_attr->last = lxml_text;
233                 lxml_text->parent = (xmlNodePtr) lxml_attr;
234 
235                 if (attr->node.ns == LXB_NS_XMLNS) {
236                     if (strcmp((const char *) local_name, "xmlns") != 0) {
237                         if (prefixed_xmlns_ns == NULL) {
238                             prefixed_xmlns_ns = php_dom_libxml_ns_mapper_get_ns_raw_strings_nullsafe(ns_mapper, "xmlns", DOM_XMLNS_NS_URI);
239                         }
240                         lxml_attr->ns = prefixed_xmlns_ns;
241                     } else {
242                         lxml_attr->ns = php_dom_libxml_ns_mapper_ensure_prefixless_xmlns_ns(ns_mapper);
243                     }
244                     lxml_attr->ns->_private = (void *) php_dom_ns_is_xmlns_magic_token;
245                 } else if (attr->node.ns == LXB_NS_XLINK) {
246                     if (xlink_ns == NULL) {
247                         xlink_ns = php_dom_libxml_ns_mapper_get_ns_raw_strings_nullsafe(ns_mapper, "xlink", DOM_XLINK_NS_URI);
248                         xlink_ns->_private = (void *) php_dom_ns_is_xlink_magic_token;
249                     }
250                     lxml_attr->ns = xlink_ns;
251                 }
252 
253                 if (last_added_attr == NULL) {
254                     lxml_element->properties = lxml_attr;
255                 } else {
256                     last_added_attr->next = lxml_attr;
257                     lxml_attr->prev = last_added_attr;
258                 }
259                 last_added_attr = lxml_attr;
260 
261                 /* xmlIsID does some other stuff too that is irrelevant here. */
262                 if (local_name_length == 2 && local_name[0] == 'i' && local_name[1] == 'd' && attr->node.ns == LXB_NS_HTML) {
263                     xmlAddID(NULL, lxml_doc, value, lxml_attr);
264                 }
265 
266                 /* libxml2 doesn't support line numbers on this anyway, it derives them instead, so don't bother */
267             }
268         } else if (node->type == LXB_DOM_NODE_TYPE_TEXT) {
269             lxb_dom_text_t *text = lxb_dom_interface_text(node);
270             const lxb_char_t *data = text->char_data.data.data;
271             size_t data_length = text->char_data.data.length;
272             if (UNEXPECTED(data_length >= INT_MAX)) {
273                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
274                 break;
275             }
276             xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, data, data_length, compact_text_nodes);
277             if (UNEXPECTED(lxml_text == NULL)) {
278                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
279                 break;
280             }
281             xmlAddChild(lxml_parent, lxml_text);
282             if (node->line >= USHRT_MAX) {
283                 lxml_text->line = USHRT_MAX;
284                 lxml_text->psvi = (void *) (ptrdiff_t) node->line;
285             } else {
286                 lxml_text->line = (unsigned short) node->line;
287             }
288         } else if (node->type == LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
289             lxb_dom_document_type_t *doctype = lxb_dom_interface_document_type(node);
290             const lxb_char_t *name = lxb_dom_document_type_name(doctype, NULL);
291             size_t public_id_len, system_id_len;
292             const lxb_char_t *public_id = lxb_dom_document_type_public_id(doctype, &public_id_len);
293             const lxb_char_t *system_id = lxb_dom_document_type_system_id(doctype, &system_id_len);
294             xmlDtdPtr lxml_dtd = xmlCreateIntSubset(
295                 lxml_doc,
296                 name,
297                 public_id_len ? public_id : NULL,
298                 system_id_len ? system_id : NULL
299             );
300             if (UNEXPECTED(lxml_dtd == NULL)) {
301                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
302                 break;
303             }
304             /* libxml2 doesn't support line numbers on this anyway, it returns -1 instead, so don't bother */
305         } else if (node->type == LXB_DOM_NODE_TYPE_COMMENT) {
306             lxb_dom_comment_t *comment = lxb_dom_interface_comment(node);
307             xmlNodePtr lxml_comment = xmlNewDocComment(lxml_doc, comment->char_data.data.data);
308             if (UNEXPECTED(lxml_comment == NULL)) {
309                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
310                 break;
311             }
312             xmlAddChild(lxml_parent, lxml_comment);
313             lxml_comment->line = sanitize_line_nr(node->line);
314         }
315     }
316 
317     lexbor_array_obj_destroy(&work_list, false);
318     return retval;
319 }
320 
/* Resets a parse context by zero-filling the whole structure. */
void lexbor_libxml2_bridge_parse_context_init(lexbor_libxml2_bridge_parse_context *ctx)
{
    memset(ctx, 0, sizeof *ctx);
}
325 
/* Stores the error reporters in the parse context.
 *
 * ctx:                      parse context to update
 * tokenizer_error_reporter: callback for tokenizer errors
 * tree_error_reporter:      callback for tree construction errors
 *
 * lexbor_libxml2_bridge_report_errors() skips a reporter that is NULL. */
void lexbor_libxml2_bridge_parse_set_error_callbacks(
    lexbor_libxml2_bridge_parse_context *ctx,
    lexbor_libxml2_bridge_tokenizer_error_reporter tokenizer_error_reporter,
    lexbor_libxml2_bridge_tree_error_reporter tree_error_reporter
)
{
    ctx->tree_error_reporter = tree_error_reporter;
    ctx->tokenizer_error_reporter = tokenizer_error_reporter;
}
335 
lexbor_libxml2_bridge_convert_document(lxb_html_document_t * document,xmlDocPtr * doc_out,bool compact_text_nodes,bool create_default_ns,php_dom_private_data * private_data)336 lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_document(
337     lxb_html_document_t *document,
338     xmlDocPtr *doc_out,
339     bool compact_text_nodes,
340     bool create_default_ns,
341 	php_dom_private_data *private_data
342 )
343 {
344     xmlDocPtr lxml_doc = php_dom_create_html_doc();
345     if (UNEXPECTED(!lxml_doc)) {
346         return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
347     }
348     lexbor_libxml2_bridge_status status = lexbor_libxml2_bridge_convert(
349         lxb_dom_interface_node(document)->last_child,
350         lxml_doc,
351         (xmlNodePtr) lxml_doc,
352         compact_text_nodes,
353         create_default_ns,
354         private_data
355     );
356     if (status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK) {
357         xmlFreeDoc(lxml_doc);
358         return status;
359     }
360     *doc_out = lxml_doc;
361     return LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
362 }
363 
lexbor_libxml2_bridge_convert_fragment(lxb_dom_node_t * start_node,xmlDocPtr lxml_doc,xmlNodePtr * fragment_out,bool compact_text_nodes,bool create_default_ns,php_dom_private_data * private_data)364 lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_fragment(
365     lxb_dom_node_t *start_node,
366     xmlDocPtr lxml_doc,
367     xmlNodePtr *fragment_out,
368     bool compact_text_nodes,
369     bool create_default_ns,
370 	php_dom_private_data *private_data
371 )
372 {
373     xmlNodePtr fragment = xmlNewDocFragment(lxml_doc);
374     if (UNEXPECTED(fragment == NULL)) {
375         return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
376     }
377     lexbor_libxml2_bridge_status status = lexbor_libxml2_bridge_convert(
378         start_node,
379         lxml_doc,
380         fragment,
381         compact_text_nodes,
382         create_default_ns,
383         private_data
384     );
385     if (status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK) {
386         xmlFreeNode(fragment);
387         return status;
388     }
389     *fragment_out = fragment;
390     return LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
391 }
392 
/* Forwards the parse errors collected so far to the reporters stored in `ctx`.
 *
 * Reporting resumes at the indices stored in `*error_index_offset_tokenizer`
 * and `*error_index_offset_tree`. Both are advanced past the last error seen,
 * also when the matching reporter is NULL.
 *
 * ctx:          parse context holding the reporters and the application data
 * parser:       lexbor parser whose tokenizer and tree error lists are read
 * input_html:   start of the current input chunk; tokenizer error positions
 *               are reported relative to it
 * chunk_offset: added to the tokenizer error offset that is reported */
void lexbor_libxml2_bridge_report_errors(
    const lexbor_libxml2_bridge_parse_context *ctx,
    lxb_html_parser_t *parser,
    const lxb_char_t *input_html,
    size_t chunk_offset,
    size_t *error_index_offset_tokenizer,
    size_t *error_index_offset_tree
)
{
    size_t cursor;

    /* Tokenizer errors */
    lexbor_array_obj_t *tokenizer_errors = lxb_html_parser_tokenizer(parser)->parse_errors;
    cursor = *error_index_offset_tokenizer;
    for (;;) {
        /* See https://github.com/lexbor/lexbor/blob/master/source/lexbor/html/tokenizer/error.h */
        lxb_html_tokenizer_error_t *token_error = lexbor_array_obj_get(tokenizer_errors, cursor);
        if (token_error == NULL) {
            break;
        }
        if (ctx->tokenizer_error_reporter) {
            ctx->tokenizer_error_reporter(
                ctx->application_data,
                token_error,
                token_error->pos - input_html + chunk_offset
            );
        }
        cursor++;
    }
    *error_index_offset_tokenizer = cursor;

    /* Tree parser errors */
    lexbor_array_obj_t *tree_errors = lxb_html_parser_tree(parser)->parse_errors;
    cursor = *error_index_offset_tree;
    for (;;) {
        /* See https://github.com/lexbor/lexbor/blob/master/source/lexbor/html/tree/error.h */
        lxb_html_tree_error_t *tree_error = lexbor_array_obj_get(tree_errors, cursor);
        if (tree_error == NULL) {
            break;
        }
        if (ctx->tree_error_reporter) {
            /* Line and column are passed on incremented by one. */
            ctx->tree_error_reporter(
                ctx->application_data,
                tree_error,
                tree_error->line + 1,
                tree_error->column + 1,
                tree_error->length
            );
        }
        cursor++;
    }
    *error_index_offset_tree = cursor;
}
440 
dom_translate_quirks_mode(lxb_dom_document_cmode_t quirks_mode)441 static php_libxml_quirks_mode dom_translate_quirks_mode(lxb_dom_document_cmode_t quirks_mode)
442 {
443 	switch (quirks_mode) {
444 		case LXB_DOM_DOCUMENT_CMODE_NO_QUIRKS: return PHP_LIBXML_NO_QUIRKS;
445 		case LXB_DOM_DOCUMENT_CMODE_LIMITED_QUIRKS: return PHP_LIBXML_LIMITED_QUIRKS;
446 		case LXB_DOM_DOCUMENT_CMODE_QUIRKS: return PHP_LIBXML_QUIRKS;
447 		EMPTY_SWITCH_DEFAULT_CASE();
448 	}
449 }
450 
/* Copies the parser's observations out of the lexbor tree: whether explicit
 * <html>, <head> and <body> tags were seen, and the document's quirks mode
 * translated to the php_libxml representation. */
void lexbor_libxml2_bridge_copy_observations(lxb_html_tree_t *tree, lexbor_libxml2_bridge_extracted_observations *observations)
{
    lxb_dom_document_t *dom_document = lxb_dom_interface_document(tree->document);

    observations->quirks_mode = dom_translate_quirks_mode(dom_document->compat_mode);
    observations->has_explicit_body_tag = tree->has_explicit_body_tag;
    observations->has_explicit_head_tag = tree->has_explicit_head_tag;
    observations->has_explicit_html_tag = tree->has_explicit_html_tag;
}
458 
459 #endif  /* HAVE_LIBXML && HAVE_DOM */
460