xref: /PHP-8.2/ext/dom/html5_parser.c (revision 89ea24f6)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Authors: Niels Dossche <nielsdos@php.net>                            |
14    +----------------------------------------------------------------------+
15 */
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "html5_parser.h"
24 #include "namespace_compat.h"
25 #include <lexbor/html/parser.h>
26 #include <lexbor/html/interfaces/element.h>
27 #include <libxml/tree.h>
28 #include <libxml/parserInternals.h>
29 #include <libxml/HTMLtree.h>
30 #include <Zend/zend.h>
31 
/* Number of work list slots allocated up front for the conversion stack. */
#define WORK_LIST_INIT_SIZE 128
/* libxml2 text nodes have two pointer-sized fields that can be reused to hold
 * very short strings inline; this is the byte size of that area. */
#define LXML_INTERNED_STRINGS_SIZE (2 * sizeof(void *))
35 
36 typedef struct _work_list_item {
37     lxb_dom_node_t *node;
38     uintptr_t current_active_namespace;
39     xmlNodePtr lxml_parent;
40     xmlNsPtr lxml_ns;
41 } work_list_item;
42 
/* Appends a new entry to the conversion work list and fills in all of its fields.
 *
 * array:                    work list holding work_list_item entries
 * node:                     Lexbor node to convert later
 * current_active_namespace: Lexbor namespace id active at the point of queuing
 * lxml_parent:              libxml2 node the converted node will be attached to
 * lxml_ns:                  libxml2 namespace active on the parent (may be NULL)
 *
 * NOTE(review): the result of lexbor_array_obj_push_wo_cls() is used without a
 * NULL check, exactly as before; confirm it cannot fail in this build. */
static void lexbor_libxml2_bridge_work_list_item_push(
    lexbor_array_obj_t *array,
    lxb_dom_node_t *node,
    uintptr_t current_active_namespace,
    xmlNodePtr lxml_parent,
    xmlNsPtr lxml_ns
)
{
    work_list_item *slot = (work_list_item *) lexbor_array_obj_push_wo_cls(array);

    slot->lxml_ns = lxml_ns;
    slot->lxml_parent = lxml_parent;
    slot->current_active_namespace = current_active_namespace;
    slot->node = node;
}
57 
/* Clamps a line number to the range of an unsigned short.
 * Values above USHRT_MAX saturate to USHRT_MAX; everything else is returned unchanged. */
static unsigned short sanitize_line_nr(size_t line)
{
    return (line <= USHRT_MAX) ? (unsigned short) line : (unsigned short) USHRT_MAX;
}
65 
get_libxml_namespace_href(uintptr_t lexbor_namespace)66 static const xmlChar *get_libxml_namespace_href(uintptr_t lexbor_namespace)
67 {
68     if (lexbor_namespace == LXB_NS_SVG) {
69         return (const xmlChar *) DOM_SVG_NS_URI;
70     } else if (lexbor_namespace == LXB_NS_MATH) {
71         return (const xmlChar *) DOM_MATHML_NS_URI;
72     } else {
73         return (const xmlChar *) DOM_XHTML_NS_URI;
74     }
75 }
76 
lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc,const lxb_char_t * data,size_t data_length,bool compact_text_nodes)77 static xmlNodePtr lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc, const lxb_char_t *data, size_t data_length, bool compact_text_nodes)
78 {
79     if (compact_text_nodes && data_length < LXML_INTERNED_STRINGS_SIZE) {
80         /* See xmlSAX2TextNode() in libxml2 */
81         xmlNodePtr lxml_text = xmlMalloc(sizeof(*lxml_text));
82         if (UNEXPECTED(lxml_text == NULL)) {
83             return NULL;
84         }
85         memset(lxml_text, 0, sizeof(*lxml_text));
86         lxml_text->name = xmlStringText;
87         lxml_text->type = XML_TEXT_NODE;
88         lxml_text->doc = lxml_doc;
89         lxml_text->content = (xmlChar *) &lxml_text->properties;
90         memcpy(lxml_text->content, data, data_length);
91         return lxml_text;
92     } else {
93         return xmlNewDocTextLen(lxml_doc, (const xmlChar *) data, data_length);
94     }
95 }
96 
lexbor_libxml2_bridge_convert(lxb_dom_node_t * start_node,xmlDocPtr lxml_doc,bool compact_text_nodes,bool create_default_ns)97 static lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(
98     lxb_dom_node_t *start_node,
99     xmlDocPtr lxml_doc,
100     bool compact_text_nodes,
101     bool create_default_ns
102 )
103 {
104     lexbor_libxml2_bridge_status retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
105 
106     lexbor_array_obj_t work_list;
107     lexbor_array_obj_init(&work_list, WORK_LIST_INIT_SIZE, sizeof(work_list_item));
108 
109     for (lxb_dom_node_t *node = start_node; node != NULL; node = node->prev) {
110         lexbor_libxml2_bridge_work_list_item_push(&work_list, node, LXB_NS__UNDEF, (xmlNodePtr) lxml_doc, NULL);
111     }
112 
113     work_list_item *current_stack_item;
114     while ((current_stack_item = lexbor_array_obj_pop(&work_list)) != NULL) {
115         lxb_dom_node_t *node = current_stack_item->node;
116         xmlNodePtr lxml_parent = current_stack_item->lxml_parent;
117 
118         /* CDATA section and processing instructions don't occur in parsed HTML documents.
119          * The historical types are not emitted by the parser either. */
120         if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
121             /* Note: HTML isn't exactly XML-namespace-aware; as this is an HTML parser we only care about the local name.
122              * If a prefix:name format is used, then the local name will be "prefix:name" and the prefix will be empty.
123              * There is however still somewhat of a concept of namespaces. There are three: HTML (the default), SVG, and MATHML. */
124             lxb_dom_element_t *element = lxb_dom_interface_element(node);
125             const lxb_char_t *name = lxb_dom_element_local_name(element, NULL);
126             xmlNodePtr lxml_element = xmlNewDocNode(lxml_doc, NULL, name, NULL);
127             if (UNEXPECTED(lxml_element == NULL)) {
128                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
129                 goto out;
130             }
131             xmlAddChild(lxml_parent, lxml_element);
132             lxml_element->line = sanitize_line_nr(node->line);
133 
134             /* Namespaces, note: namespace switches are uncommon */
135             uintptr_t entering_namespace = element->node.ns;
136             xmlNsPtr current_lxml_ns = current_stack_item->lxml_ns;
137             if (create_default_ns && UNEXPECTED(entering_namespace != current_stack_item->current_active_namespace)) {
138                 current_lxml_ns = xmlNewNs(lxml_element, get_libxml_namespace_href(entering_namespace), NULL);
139             }
140             /* Instead of xmlSetNs() because we know the arguments are valid. Prevents overhead. */
141             lxml_element->ns = current_lxml_ns;
142 
143             for (lxb_dom_node_t *child_node = element->node.last_child; child_node != NULL; child_node = child_node->prev) {
144                 lexbor_libxml2_bridge_work_list_item_push(
145                     &work_list,
146                     child_node,
147                     entering_namespace,
148                     lxml_element,
149                     current_lxml_ns
150                 );
151             }
152 
153             xmlAttrPtr last_added_attr = NULL;
154             for (lxb_dom_attr_t *attr = element->first_attr; attr != NULL; attr = attr->next) {
155                 /* Same namespace remark as for elements */
156                 size_t local_name_length, value_length;
157                 const lxb_char_t *local_name = lxb_dom_attr_local_name(attr, &local_name_length);
158                 const lxb_char_t *value = lxb_dom_attr_value(attr, &value_length);
159 
160                 if (UNEXPECTED(local_name_length >= INT_MAX || value_length >= INT_MAX)) {
161                     retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
162                     goto out;
163                 }
164 
165                 xmlAttrPtr lxml_attr = xmlMalloc(sizeof(xmlAttr));
166                 if (UNEXPECTED(lxml_attr == NULL)) {
167                     retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
168                     goto out;
169                 }
170 
171                 memset(lxml_attr, 0, sizeof(xmlAttr));
172                 lxml_attr->type = XML_ATTRIBUTE_NODE;
173                 lxml_attr->parent = lxml_element;
174                 lxml_attr->name = xmlDictLookup(lxml_doc->dict, local_name, local_name_length);
175                 lxml_attr->doc = lxml_doc;
176                 xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, value, value_length, true /* Always true for optimization purposes */);
177                 if (UNEXPECTED(lxml_text == NULL)) {
178                     xmlFreeProp(lxml_attr);
179                     retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
180                     goto out;
181                 }
182 
183                 lxml_attr->children = lxml_attr->last = lxml_text;
184 
185                 if (last_added_attr == NULL) {
186                     lxml_element->properties = lxml_attr;
187                 } else {
188                     last_added_attr->next = lxml_attr;
189                     lxml_attr->prev = last_added_attr;
190                 }
191                 last_added_attr = lxml_attr;
192 
193                 /* xmlIsID does some other stuff too that is irrelevant here. */
194                 if (local_name_length == 2 && local_name[0] == 'i' && local_name[1] == 'd') {
195                     xmlAddID(NULL, lxml_doc, value, lxml_attr);
196                 }
197 
198                 /* libxml2 doesn't support line numbers on this anyway, it derives them instead, so don't bother */
199             }
200         } else if (node->type == LXB_DOM_NODE_TYPE_TEXT) {
201             lxb_dom_text_t *text = lxb_dom_interface_text(node);
202             const lxb_char_t *data = text->char_data.data.data;
203             size_t data_length = text->char_data.data.length;
204             if (UNEXPECTED(data_length >= INT_MAX)) {
205                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
206                 goto out;
207             }
208             xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, data, data_length, compact_text_nodes);
209             if (UNEXPECTED(lxml_text == NULL)) {
210                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
211                 goto out;
212             }
213             xmlAddChild(lxml_parent, lxml_text);
214             if (node->line >= USHRT_MAX) {
215                 lxml_text->line = USHRT_MAX;
216                 lxml_text->psvi = (void *) (ptrdiff_t) node->line;
217             } else {
218                 lxml_text->line = (unsigned short) node->line;
219             }
220         } else if (node->type == LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
221             lxb_dom_document_type_t *doctype = lxb_dom_interface_document_type(node);
222             const lxb_char_t *name = lxb_dom_document_type_name(doctype, NULL);
223             size_t public_id_len, system_id_len;
224             const lxb_char_t *public_id = lxb_dom_document_type_public_id(doctype, &public_id_len);
225             const lxb_char_t *system_id = lxb_dom_document_type_system_id(doctype, &system_id_len);
226             xmlDtdPtr lxml_dtd = xmlCreateIntSubset(
227                 lxml_doc,
228                 name,
229                 public_id_len ? public_id : NULL,
230                 system_id_len ? system_id : NULL
231             );
232             if (UNEXPECTED(lxml_dtd == NULL)) {
233                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
234                 goto out;
235             }
236             /* libxml2 doesn't support line numbers on this anyway, it returns -1 instead, so don't bother */
237         } else if (node->type == LXB_DOM_NODE_TYPE_COMMENT) {
238             lxb_dom_comment_t *comment = lxb_dom_interface_comment(node);
239             xmlNodePtr lxml_comment = xmlNewDocComment(lxml_doc, comment->char_data.data.data);
240             if (UNEXPECTED(lxml_comment == NULL)) {
241                 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
242                 goto out;
243             }
244             xmlAddChild(lxml_parent, lxml_comment);
245             lxml_comment->line = sanitize_line_nr(node->line);
246         }
247     }
248 
249 out:
250     lexbor_array_obj_destroy(&work_list, false);
251     return retval;
252 }
253 
/* Resets a parse context to all-zero bytes, which leaves every field
 * (including the error reporter callbacks) cleared. */
void lexbor_libxml2_bridge_parse_context_init(lexbor_libxml2_bridge_parse_context *ctx)
{
    memset(ctx, 0, sizeof(lexbor_libxml2_bridge_parse_context));
}
258 
/* Installs the callbacks used when reporting parse errors.
 *
 * ctx:                      parse context to update
 * tokenizer_error_reporter: called for each tokenizer error; NULL disables reporting
 * tree_error_reporter:      called for each tree construction error; NULL disables reporting */
void lexbor_libxml2_bridge_parse_set_error_callbacks(
    lexbor_libxml2_bridge_parse_context *ctx,
    lexbor_libxml2_bridge_tokenizer_error_reporter tokenizer_error_reporter,
    lexbor_libxml2_bridge_tree_error_reporter tree_error_reporter
)
{
    ctx->tree_error_reporter = tree_error_reporter;
    ctx->tokenizer_error_reporter = tokenizer_error_reporter;
}
268 
lexbor_libxml2_bridge_convert_document(lxb_html_document_t * document,xmlDocPtr * doc_out,bool compact_text_nodes,bool create_default_ns)269 lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_document(
270     lxb_html_document_t *document,
271     xmlDocPtr *doc_out,
272     bool compact_text_nodes,
273     bool create_default_ns
274 )
275 {
276 #ifdef LIBXML_HTML_ENABLED
277     xmlDocPtr lxml_doc = htmlNewDocNoDtD(NULL, NULL);
278     if (UNEXPECTED(!lxml_doc)) {
279         return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
280     }
281 #else
282     /* If HTML support is not enabled, then htmlNewDocNoDtD() is not available.
283      * This code mimics the behaviour. */
284     xmlDocPtr lxml_doc = xmlNewDoc((const xmlChar *) "1.0");
285     if (UNEXPECTED(!lxml_doc)) {
286         return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
287     }
288     lxml_doc->type = XML_HTML_DOCUMENT_NODE;
289 #endif
290     lxml_doc->dict = xmlDictCreate();
291     lexbor_libxml2_bridge_status status = lexbor_libxml2_bridge_convert(
292         lxb_dom_interface_node(document)->last_child,
293         lxml_doc,
294         compact_text_nodes,
295         create_default_ns
296     );
297     if (status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK) {
298         xmlFreeDoc(lxml_doc);
299         return status;
300     }
301     *doc_out = lxml_doc;
302     return LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
303 }
304 
/* Forwards the parse errors collected since the previous call to the reporters in ctx.
 *
 * ctx:                          parse context holding the reporters and application data
 * parser:                       Lexbor parser whose tokenizer and tree error lists are read
 * input_html:                   start of the current input chunk; tokenizer error positions
 *                               are reported relative to it
 * chunk_offset:                 added to each tokenizer error offset
 * error_index_offset_tokenizer: in: index of the first unreported tokenizer error;
 *                               out: index one past the last entry visited
 * error_index_offset_tree:      same, for the tree construction errors
 *
 * The indices advance even when the corresponding reporter is not set. */
void lexbor_libxml2_bridge_report_errors(
    const lexbor_libxml2_bridge_parse_context *ctx,
    lxb_html_parser_t *parser,
    const lxb_char_t *input_html,
    size_t chunk_offset,
    size_t *error_index_offset_tokenizer,
    size_t *error_index_offset_tree
)
{
    size_t position;

    /* Tokenizer errors */
    /* See https://github.com/lexbor/lexbor/blob/master/source/lexbor/html/tokenizer/error.h */
    lexbor_array_obj_t *tokenizer_errors = lxb_html_parser_tokenizer(parser)->parse_errors;
    for (position = *error_index_offset_tokenizer; ; position++) {
        lxb_html_tokenizer_error_t *token_error = lexbor_array_obj_get(tokenizer_errors, position);
        if (token_error == NULL) {
            break;
        }
        if (ctx->tokenizer_error_reporter) {
            ctx->tokenizer_error_reporter(
                ctx->application_data,
                token_error,
                token_error->pos - input_html + chunk_offset
            );
        }
    }
    *error_index_offset_tokenizer = position;

    /* Tree parser errors */
    /* See https://github.com/lexbor/lexbor/blob/master/source/lexbor/html/tree/error.h */
    lexbor_array_obj_t *tree_errors = lxb_html_parser_tree(parser)->parse_errors;
    for (position = *error_index_offset_tree; ; position++) {
        lxb_html_tree_error_t *tree_error = lexbor_array_obj_get(tree_errors, position);
        if (tree_error == NULL) {
            break;
        }
        if (ctx->tree_error_reporter) {
            /* Line and column are passed on incremented by one. */
            ctx->tree_error_reporter(
                ctx->application_data,
                tree_error,
                tree_error->line + 1,
                tree_error->column + 1,
                tree_error->length
            );
        }
    }
    *error_index_offset_tree = position;
}
352 
/* Copies the "explicit <html>/<head>/<body> tag seen" flags from the Lexbor
 * tree builder state into the caller's observations structure. */
void lexbor_libxml2_bridge_copy_observations(lxb_html_tree_t *tree, lexbor_libxml2_bridge_extracted_observations *observations)
{
    observations->has_explicit_body_tag = tree->has_explicit_body_tag;
    observations->has_explicit_head_tag = tree->has_explicit_head_tag;
    observations->has_explicit_html_tag = tree->has_explicit_html_tag;
}
359 
360 #endif  /* HAVE_LIBXML && HAVE_DOM */
361