xref: /php-src/ext/dom/html5_parser.c (revision b0da6ed6)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Authors: Niels Dossche <nielsdos@php.net>                            |
14    +----------------------------------------------------------------------+
15 */
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "php_dom.h"
24 #include "html5_parser.h"
25 #include <lexbor/html/parser.h>
26 #include <lexbor/html/interfaces/element.h>
27 #include <libxml/parserInternals.h>
28 #include <libxml/HTMLtree.h>
29 #include <Zend/zend.h>
30 
/* Initial size passed to lexbor_array_obj_init() for the conversion work list. */
#define WORK_LIST_INIT_SIZE 128
/* Text shorter than this many bytes can be stored inside the text node itself:
 * libxml2 reserves 2 pointer-sized words for interned strings. */
#define LXML_INTERNED_STRINGS_SIZE (2 * sizeof(void *))
34 
35 typedef struct _work_list_item {
36     lxb_dom_node_t *node;
37     uintptr_t current_active_namespace;
38     xmlNodePtr lxml_parent;
39     xmlNsPtr lxml_ns;
40 } work_list_item;
41 
/* Queues one node for conversion.
 * A new slot is taken from the array and every field of it is filled in here. */
static void lexbor_libxml2_bridge_work_list_item_push(
    lexbor_array_obj_t *array,
    lxb_dom_node_t *node,
    uintptr_t current_active_namespace,
    xmlNodePtr lxml_parent,
    xmlNsPtr lxml_ns
)
{
    work_list_item *slot = (work_list_item *) lexbor_array_obj_push_wo_cls(array);

    slot->lxml_ns = lxml_ns;
    slot->lxml_parent = lxml_parent;
    slot->current_active_namespace = current_active_namespace;
    slot->node = node;
}
56 
/* Clamps a line number to the range of unsigned short:
 * values above USHRT_MAX saturate to USHRT_MAX, everything else is returned unchanged. */
static unsigned short sanitize_line_nr(size_t line)
{
    return (line > USHRT_MAX) ? USHRT_MAX : (unsigned short) line;
}
64 
get_libxml_namespace_href(uintptr_t lexbor_namespace)65 static const php_dom_ns_magic_token *get_libxml_namespace_href(uintptr_t lexbor_namespace)
66 {
67     if (lexbor_namespace == LXB_NS_SVG) {
68         return php_dom_ns_is_svg_magic_token;
69     } else if (lexbor_namespace == LXB_NS_MATH) {
70         return php_dom_ns_is_mathml_magic_token;
71     } else {
72         return php_dom_ns_is_html_magic_token;
73     }
74 }
75 
lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc,const lxb_char_t * data,size_t data_length,bool compact_text_nodes)76 static zend_always_inline xmlNodePtr lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc, const lxb_char_t *data, size_t data_length, bool compact_text_nodes)
77 {
78     if (compact_text_nodes && data_length < LXML_INTERNED_STRINGS_SIZE) {
79         /* See xmlSAX2TextNode() in libxml2 */
80         xmlNodePtr lxml_text = xmlMalloc(sizeof(*lxml_text));
81         if (UNEXPECTED(lxml_text == NULL)) {
82             return NULL;
83         }
84         memset(lxml_text, 0, sizeof(*lxml_text));
85         lxml_text->name = xmlStringText;
86         lxml_text->type = XML_TEXT_NODE;
87         lxml_text->doc = lxml_doc;
88         lxml_text->content = BAD_CAST &lxml_text->properties;
89         if (data != NULL) {
90             memcpy(lxml_text->content, data, data_length);
91         }
92         return lxml_text;
93     } else {
94         return xmlNewDocTextLen(lxml_doc, (const xmlChar *) data, data_length);
95     }
96 }
97 
lexbor_libxml2_bridge_convert(lxb_dom_node_t * start_node,xmlDocPtr lxml_doc,bool compact_text_nodes,bool create_default_ns,php_dom_libxml_ns_mapper * ns_mapper)98 static lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(
99     lxb_dom_node_t *start_node,
100     xmlDocPtr lxml_doc,
101     bool compact_text_nodes,
102     bool create_default_ns,
103     php_dom_libxml_ns_mapper *ns_mapper
104 )
105 {
106     lexbor_libxml2_bridge_status retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
107 
108     xmlNsPtr html_ns = php_dom_libxml_ns_mapper_ensure_html_ns(ns_mapper);
109     xmlNsPtr xlink_ns = NULL;
110     xmlNsPtr prefixed_xmlns_ns = NULL;
111 
112     lexbor_array_obj_t work_list;
113     lexbor_array_obj_init(&work_list, WORK_LIST_INIT_SIZE, sizeof(work_list_item));
114 
115     for (lxb_dom_node_t *node = start_node; node != NULL; node = node->prev) {
116         lexbor_libxml2_bridge_work_list_item_push(&work_list, node, LXB_NS__UNDEF, (xmlNodePtr) lxml_doc, NULL);
117     }
118 
119     work_list_item *current_stack_item;
120     while ((current_stack_item = lexbor_array_obj_pop(&work_list)) != NULL) {
121         lxb_dom_node_t *node = current_stack_item->node;
122         xmlNodePtr lxml_parent = current_stack_item->lxml_parent;
123 
124         /* CDATA section and processing instructions don't occur in parsed HTML documents.
125          * The historical types are not emitted by the parser either. */
126         if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
127             /* Note: HTML isn't exactly XML-namespace-aware; as this is an HTML parser we only care about the local name.
128              * If a prefix:name format is used, then the local name will be "prefix:name" and the prefix will be empty.
129              * There is however still somewhat of a concept of namespaces. There are three: HTML (the default), SVG, and MATHML. */
130             lxb_dom_element_t *element = lxb_dom_interface_element(node);
131             const lxb_char_t *name = lxb_dom_element_local_name(element, NULL);
132             xmlNodePtr lxml_element = xmlNewDocNode(lxml_doc, NULL, name, NULL);
133             if (UNEXPECTED(lxml_element == NULL)) {
134                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
135                 break;
136             }
137             xmlAddChild(lxml_parent, lxml_element);
138             lxml_element->line = sanitize_line_nr(node->line);
139 
140             /* Namespaces, note: namespace switches are uncommon */
141             uintptr_t entering_namespace = element->node.ns;
142             xmlNsPtr current_lxml_ns = current_stack_item->lxml_ns;
143             if (create_default_ns && UNEXPECTED(entering_namespace != current_stack_item->current_active_namespace)) {
144                 if (entering_namespace == LXB_NS_HTML) {
145                     current_lxml_ns = html_ns;
146                 } else {
147                     const php_dom_ns_magic_token *magic_token = get_libxml_namespace_href(entering_namespace);
148                     zend_string *uri = zend_string_init((char *) magic_token, strlen((char *) magic_token), false);
149                     current_lxml_ns = php_dom_libxml_ns_mapper_get_ns(ns_mapper, NULL, uri);
150                     zend_string_release_ex(uri, false);
151                     if (EXPECTED(current_lxml_ns != NULL)) {
152                         current_lxml_ns->_private = (void *) magic_token;
153                     }
154                 }
155             }
156             /* Instead of xmlSetNs() because we know the arguments are valid. Prevents overhead. */
157             lxml_element->ns = current_lxml_ns;
158 
159             for (lxb_dom_node_t *child_node = element->node.last_child; child_node != NULL; child_node = child_node->prev) {
160                 lexbor_libxml2_bridge_work_list_item_push(
161                     &work_list,
162                     child_node,
163                     entering_namespace,
164                     lxml_element,
165                     current_lxml_ns
166                 );
167             }
168 
169             xmlAttrPtr last_added_attr = NULL;
170             for (lxb_dom_attr_t *attr = element->first_attr; attr != NULL; attr = attr->next) {
171                 /* Same namespace remark as for elements */
172                 size_t local_name_length, value_length;
173                 const lxb_char_t *local_name = lxb_dom_attr_local_name(attr, &local_name_length);
174                 const lxb_char_t *value = lxb_dom_attr_value(attr, &value_length);
175 
176                 if (UNEXPECTED(local_name_length >= INT_MAX || value_length >= INT_MAX)) {
177                     retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
178                     break;
179                 }
180 
181                 xmlAttrPtr lxml_attr = xmlMalloc(sizeof(xmlAttr));
182                 if (UNEXPECTED(lxml_attr == NULL)) {
183                     retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
184                     break;
185                 }
186 
187                 memset(lxml_attr, 0, sizeof(xmlAttr));
188                 lxml_attr->type = XML_ATTRIBUTE_NODE;
189                 lxml_attr->parent = lxml_element;
190                 lxml_attr->name = xmlDictLookup(lxml_doc->dict, local_name, local_name_length);
191                 lxml_attr->doc = lxml_doc;
192                 xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, value, value_length, true /* Always true for optimization purposes */);
193                 if (UNEXPECTED(lxml_text == NULL)) {
194                     xmlFreeProp(lxml_attr);
195                     retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
196                     break;
197                 }
198 
199                 lxml_attr->children = lxml_attr->last = lxml_text;
200                 lxml_text->parent = (xmlNodePtr) lxml_attr;
201 
202                 if (attr->node.ns == LXB_NS_XMLNS) {
203                     if (strcmp((const char *) local_name, "xmlns") != 0) {
204                         if (prefixed_xmlns_ns == NULL) {
205                             prefixed_xmlns_ns = php_dom_libxml_ns_mapper_get_ns_raw_strings_nullsafe(ns_mapper, "xmlns", DOM_XMLNS_NS_URI);
206                         }
207                         lxml_attr->ns = prefixed_xmlns_ns;
208                     } else {
209                         lxml_attr->ns = php_dom_libxml_ns_mapper_ensure_prefixless_xmlns_ns(ns_mapper);
210                     }
211                     lxml_attr->ns->_private = (void *) php_dom_ns_is_xmlns_magic_token;
212                 } else if (attr->node.ns == LXB_NS_XLINK) {
213                     if (xlink_ns == NULL) {
214                         xlink_ns = php_dom_libxml_ns_mapper_get_ns_raw_strings_nullsafe(ns_mapper, "xlink", DOM_XLINK_NS_URI);
215                         xlink_ns->_private = (void *) php_dom_ns_is_xlink_magic_token;
216                     }
217                     lxml_attr->ns = xlink_ns;
218                 }
219 
220                 if (last_added_attr == NULL) {
221                     lxml_element->properties = lxml_attr;
222                 } else {
223                     last_added_attr->next = lxml_attr;
224                     lxml_attr->prev = last_added_attr;
225                 }
226                 last_added_attr = lxml_attr;
227 
228                 /* xmlIsID does some other stuff too that is irrelevant here. */
229                 if (local_name_length == 2 && local_name[0] == 'i' && local_name[1] == 'd' && attr->node.ns == LXB_NS_HTML) {
230                     xmlAddID(NULL, lxml_doc, value, lxml_attr);
231                 }
232 
233                 /* libxml2 doesn't support line numbers on this anyway, it derives them instead, so don't bother */
234             }
235         } else if (node->type == LXB_DOM_NODE_TYPE_TEXT) {
236             lxb_dom_text_t *text = lxb_dom_interface_text(node);
237             const lxb_char_t *data = text->char_data.data.data;
238             size_t data_length = text->char_data.data.length;
239             if (UNEXPECTED(data_length >= INT_MAX)) {
240                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
241                 break;
242             }
243             xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, data, data_length, compact_text_nodes);
244             if (UNEXPECTED(lxml_text == NULL)) {
245                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
246                 break;
247             }
248             xmlAddChild(lxml_parent, lxml_text);
249             if (node->line >= USHRT_MAX) {
250                 lxml_text->line = USHRT_MAX;
251                 lxml_text->psvi = (void *) (ptrdiff_t) node->line;
252             } else {
253                 lxml_text->line = (unsigned short) node->line;
254             }
255         } else if (node->type == LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
256             lxb_dom_document_type_t *doctype = lxb_dom_interface_document_type(node);
257             const lxb_char_t *name = lxb_dom_document_type_name(doctype, NULL);
258             size_t public_id_len, system_id_len;
259             const lxb_char_t *public_id = lxb_dom_document_type_public_id(doctype, &public_id_len);
260             const lxb_char_t *system_id = lxb_dom_document_type_system_id(doctype, &system_id_len);
261             xmlDtdPtr lxml_dtd = xmlCreateIntSubset(
262                 lxml_doc,
263                 name,
264                 public_id_len ? public_id : NULL,
265                 system_id_len ? system_id : NULL
266             );
267             if (UNEXPECTED(lxml_dtd == NULL)) {
268                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
269                 break;
270             }
271             /* libxml2 doesn't support line numbers on this anyway, it returns -1 instead, so don't bother */
272         } else if (node->type == LXB_DOM_NODE_TYPE_COMMENT) {
273             lxb_dom_comment_t *comment = lxb_dom_interface_comment(node);
274             xmlNodePtr lxml_comment = xmlNewDocComment(lxml_doc, comment->char_data.data.data);
275             if (UNEXPECTED(lxml_comment == NULL)) {
276                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
277                 break;
278             }
279             xmlAddChild(lxml_parent, lxml_comment);
280             lxml_comment->line = sanitize_line_nr(node->line);
281         }
282     }
283 
284     lexbor_array_obj_destroy(&work_list, false);
285     return retval;
286 }
287 
/* Resets a parse context to its initial state by zero-filling the whole structure. */
void lexbor_libxml2_bridge_parse_context_init(lexbor_libxml2_bridge_parse_context *ctx)
{
    memset(ctx, 0, sizeof(lexbor_libxml2_bridge_parse_context));
}
292 
/* Stores the two error reporters in the parse context.
 * lexbor_libxml2_bridge_report_errors() skips a reporter that is NULL. */
void lexbor_libxml2_bridge_parse_set_error_callbacks(
    lexbor_libxml2_bridge_parse_context *ctx,
    lexbor_libxml2_bridge_tokenizer_error_reporter tokenizer_error_reporter,
    lexbor_libxml2_bridge_tree_error_reporter tree_error_reporter
)
{
    ctx->tree_error_reporter = tree_error_reporter;
    ctx->tokenizer_error_reporter = tokenizer_error_reporter;
}
302 
lexbor_libxml2_bridge_convert_document(lxb_html_document_t * document,xmlDocPtr * doc_out,bool compact_text_nodes,bool create_default_ns,php_dom_libxml_ns_mapper * ns_mapper)303 lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_document(
304     lxb_html_document_t *document,
305     xmlDocPtr *doc_out,
306     bool compact_text_nodes,
307     bool create_default_ns,
308     php_dom_libxml_ns_mapper *ns_mapper
309 )
310 {
311     xmlDocPtr lxml_doc = php_dom_create_html_doc();
312     if (UNEXPECTED(!lxml_doc)) {
313         return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
314     }
315     lexbor_libxml2_bridge_status status = lexbor_libxml2_bridge_convert(
316         lxb_dom_interface_node(document)->last_child,
317         lxml_doc,
318         compact_text_nodes,
319         create_default_ns,
320         ns_mapper
321     );
322     if (status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK) {
323         xmlFreeDoc(lxml_doc);
324         return status;
325     }
326     *doc_out = lxml_doc;
327     return LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
328 }
329 
/* Forwards parse errors that have not been reported yet to the callbacks stored in ctx.
 *
 * error_index_offset_tokenizer and error_index_offset_tree hold the index of the first
 * unreported tokenizer / tree error on entry, and are advanced past the last available
 * error on return, also when the corresponding reporter is not set.
 *
 * The offset handed to the tokenizer reporter is the error position relative to input_html
 * plus chunk_offset. */
void lexbor_libxml2_bridge_report_errors(
    const lexbor_libxml2_bridge_parse_context *ctx,
    lxb_html_parser_t *parser,
    const lxb_char_t *input_html,
    size_t chunk_offset,
    size_t *error_index_offset_tokenizer,
    size_t *error_index_offset_tree
)
{
    void *entry;
    size_t position;

    /* Tokenizer errors */
    lexbor_array_obj_t *tokenizer_errors = lxb_html_parser_tokenizer(parser)->parse_errors;
    for (position = *error_index_offset_tokenizer; (entry = lexbor_array_obj_get(tokenizer_errors, position)) != NULL; position++) {
        if (ctx->tokenizer_error_reporter == NULL) {
            continue;
        }
        /* See https://github.com/lexbor/lexbor/blob/master/source/lexbor/html/tokenizer/error.h */
        lxb_html_tokenizer_error_t *token_error = entry;
        ctx->tokenizer_error_reporter(
            ctx->application_data,
            token_error,
            token_error->pos - input_html + chunk_offset
        );
    }
    *error_index_offset_tokenizer = position;

    /* Tree parser errors */
    lexbor_array_obj_t *tree_errors = lxb_html_parser_tree(parser)->parse_errors;
    for (position = *error_index_offset_tree; (entry = lexbor_array_obj_get(tree_errors, position)) != NULL; position++) {
        if (ctx->tree_error_reporter == NULL) {
            continue;
        }
        /* See https://github.com/lexbor/lexbor/blob/master/source/lexbor/html/tree/error.h */
        lxb_html_tree_error_t *tree_error = entry;
        /* Line and column are passed on incremented by one. */
        ctx->tree_error_reporter(
            ctx->application_data,
            tree_error,
            tree_error->line + 1,
            tree_error->column + 1,
            tree_error->length
        );
    }
    *error_index_offset_tree = position;
}
377 
/* Copies the "explicit <html>/<head>/<body> tag seen" flags from the lexbor tree
 * into the caller-provided observations structure. */
void lexbor_libxml2_bridge_copy_observations(lxb_html_tree_t *tree, lexbor_libxml2_bridge_extracted_observations *observations)
{
    observations->has_explicit_body_tag = tree->has_explicit_body_tag;
    observations->has_explicit_head_tag = tree->has_explicit_head_tag;
    observations->has_explicit_html_tag = tree->has_explicit_html_tag;
}
384 
385 #endif  /* HAVE_LIBXML && HAVE_DOM */
386