1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Authors: Niels Dossche <nielsdos@php.net> |
14 +----------------------------------------------------------------------+
15 */
16
17 #ifdef HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "php_dom.h"
24 #include "html5_parser.h"
25 #include "private_data.h"
26 #include <lexbor/html/parser.h>
27 #include <lexbor/html/interfaces/element.h>
28 #include <lexbor/html/interfaces/template_element.h>
29 #include <lexbor/dom/dom.h>
30 #include <libxml/parserInternals.h>
31 #include <libxml/HTMLtree.h>
32
/* Initial size passed to lexbor_array_obj_init() for the conversion work list. */
#define WORK_LIST_INIT_SIZE 128
/* libxml2 reserves 2 pointer-sized words for interned strings.
 * Text shorter than this many bytes is stored inline in the text node by
 * lexbor_libxml2_bridge_new_text_node_fast() when compact text nodes are enabled. */
#define LXML_INTERNED_STRINGS_SIZE (sizeof(void *) * 2)
36
/* One pending unit of work for lexbor_libxml2_bridge_convert(): a lexbor node that
 * still has to be converted, plus the libxml2 context it has to be converted into. */
typedef struct work_list_item {
	/* The lexbor node to convert. */
	lxb_dom_node_t *node;
	/* lexbor namespace id of the parent element (LXB_NS__UNDEF for the start nodes). */
	uintptr_t current_active_namespace;
	/* libxml2 node that the converted node gets added to. */
	xmlNodePtr lxml_parent;
	/* libxml2 namespace of the parent element (NULL for the start nodes);
	 * reused for the child when the namespace does not change. */
	xmlNsPtr lxml_ns;
} work_list_item;
43
/* Appends one entry to the work list and fills in every field of it.
 *
 * NOTE(review): the slot returned by lexbor_array_obj_push_wo_cls() is used without
 * a NULL check; confirm whether that call can fail with the allocator in use. */
static void lexbor_libxml2_bridge_work_list_item_push(
	lexbor_array_obj_t *array,
	lxb_dom_node_t *node,
	uintptr_t current_active_namespace,
	xmlNodePtr lxml_parent,
	xmlNsPtr lxml_ns
)
{
	work_list_item *slot = (work_list_item *) lexbor_array_obj_push_wo_cls(array);

	/* The slot is not cleared by the push, so all fields are written here. */
	*slot = (work_list_item) {
		.node = node,
		.current_active_namespace = current_active_namespace,
		.lxml_parent = lxml_parent,
		.lxml_ns = lxml_ns,
	};
}
58
/* Clamps a line number to the range of an unsigned short:
 * values above USHRT_MAX saturate to USHRT_MAX, everything else is returned as is. */
static unsigned short sanitize_line_nr(size_t line)
{
	const size_t upper_bound = USHRT_MAX;

	return (unsigned short) (line < upper_bound ? line : upper_bound);
}
66
/* Describes the libxml2-side namespace that corresponds to a lexbor namespace id,
 * as returned by get_libxml_namespace_href(). */
struct lxml_ns {
	/* Magic token; the converter stores it in the `_private` field of the xmlNs. */
	const php_dom_ns_magic_token *token;
	/* Namespace URI. */
	const char *href;
	/* Length of `href`. */
	size_t href_len;
};
72
get_libxml_namespace_href(uintptr_t lexbor_namespace)73 static struct lxml_ns get_libxml_namespace_href(uintptr_t lexbor_namespace)
74 {
75 if (lexbor_namespace == LXB_NS_SVG) {
76 return (struct lxml_ns) { php_dom_ns_is_svg_magic_token, ZEND_STRL(DOM_SVG_NS_URI) };
77 } else if (lexbor_namespace == LXB_NS_MATH) {
78 return (struct lxml_ns) { php_dom_ns_is_mathml_magic_token, ZEND_STRL(DOM_MATHML_NS_URI) };
79 } else {
80 return (struct lxml_ns) { php_dom_ns_is_html_magic_token, ZEND_STRL(DOM_XHTML_NS_URI) };
81 }
82 }
83
lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc,const lxb_char_t * data,size_t data_length,bool compact_text_nodes)84 static zend_always_inline xmlNodePtr lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc, const lxb_char_t *data, size_t data_length, bool compact_text_nodes)
85 {
86 if (compact_text_nodes && data_length < LXML_INTERNED_STRINGS_SIZE) {
87 /* See xmlSAX2TextNode() in libxml2 */
88 xmlNodePtr lxml_text = xmlMalloc(sizeof(*lxml_text));
89 if (UNEXPECTED(lxml_text == NULL)) {
90 return NULL;
91 }
92 memset(lxml_text, 0, sizeof(*lxml_text));
93 lxml_text->name = xmlStringText;
94 lxml_text->type = XML_TEXT_NODE;
95 lxml_text->doc = lxml_doc;
96 lxml_text->content = BAD_CAST &lxml_text->properties;
97 if (data != NULL) {
98 memcpy(lxml_text->content, data, data_length);
99 }
100 return lxml_text;
101 } else {
102 return xmlNewDocTextLen(lxml_doc, (const xmlChar *) data, data_length);
103 }
104 }
105
lexbor_libxml2_bridge_convert(lxb_dom_node_t * start_node,xmlDocPtr lxml_doc,xmlNodePtr root,bool compact_text_nodes,bool create_default_ns,php_dom_private_data * private_data)106 static lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(
107 lxb_dom_node_t *start_node,
108 xmlDocPtr lxml_doc,
109 xmlNodePtr root,
110 bool compact_text_nodes,
111 bool create_default_ns,
112 php_dom_private_data *private_data
113 )
114 {
115 lexbor_libxml2_bridge_status retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
116
117 php_dom_libxml_ns_mapper *ns_mapper = php_dom_ns_mapper_from_private(private_data);
118 xmlNsPtr html_ns = php_dom_libxml_ns_mapper_ensure_html_ns(ns_mapper);
119 xmlNsPtr xlink_ns = NULL;
120 xmlNsPtr prefixed_xmlns_ns = NULL;
121
122 lexbor_array_obj_t work_list;
123 lexbor_array_obj_init(&work_list, WORK_LIST_INIT_SIZE, sizeof(work_list_item));
124
125 for (lxb_dom_node_t *node = start_node; node != NULL; node = node->prev) {
126 lexbor_libxml2_bridge_work_list_item_push(&work_list, node, LXB_NS__UNDEF, root, NULL);
127 }
128
129 work_list_item *current_stack_item;
130 while ((current_stack_item = lexbor_array_obj_pop(&work_list)) != NULL) {
131 lxb_dom_node_t *node = current_stack_item->node;
132 xmlNodePtr lxml_parent = current_stack_item->lxml_parent;
133
134 /* CDATA section and processing instructions don't occur in parsed HTML documents.
135 * The historical types are not emitted by the parser either. */
136 if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
137 /* Note: HTML isn't exactly XML-namespace-aware; as this is an HTML parser we only care about the local name.
138 * If a prefix:name format is used, then the local name will be "prefix:name" and the prefix will be empty.
139 * There is however still somewhat of a concept of namespaces. There are three: HTML (the default), SVG, and MATHML. */
140 lxb_dom_element_t *element = lxb_dom_interface_element(node);
141 const lxb_char_t *name = lxb_dom_element_local_name(element, NULL);
142 xmlNodePtr lxml_element = xmlNewDocNode(lxml_doc, NULL, name, NULL);
143 if (UNEXPECTED(lxml_element == NULL)) {
144 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
145 break;
146 }
147 xmlAddChild(lxml_parent, lxml_element);
148 lxml_element->line = sanitize_line_nr(node->line);
149
150 /* Namespaces, note: namespace switches are uncommon */
151 uintptr_t entering_namespace = element->node.ns;
152 xmlNsPtr current_lxml_ns = current_stack_item->lxml_ns;
153 if (create_default_ns && UNEXPECTED(entering_namespace != current_stack_item->current_active_namespace)) {
154 if (entering_namespace == LXB_NS_HTML) {
155 current_lxml_ns = html_ns;
156 } else {
157 struct lxml_ns ns = get_libxml_namespace_href(entering_namespace);
158 zend_string *uri = zend_string_init(ns.href, ns.href_len, false);
159 current_lxml_ns = php_dom_libxml_ns_mapper_get_ns(ns_mapper, NULL, uri);
160 zend_string_release_ex(uri, false);
161 if (EXPECTED(current_lxml_ns != NULL)) {
162 current_lxml_ns->_private = (void *) ns.token;
163 }
164 }
165 }
166 /* Instead of xmlSetNs() because we know the arguments are valid. Prevents overhead. */
167 lxml_element->ns = current_lxml_ns;
168
169 /* Handle template element by creating a fragment node to contain its children.
170 * Other types of nodes contain their children directly. */
171 xmlNodePtr lxml_child_parent = lxml_element;
172 lxb_dom_node_t *child_node = element->node.last_child;
173 if (lxb_html_tree_node_is(&element->node, LXB_TAG_TEMPLATE)) {
174 if (create_default_ns) {
175 lxml_child_parent = xmlNewDocFragment(lxml_doc);
176 if (UNEXPECTED(lxml_child_parent == NULL)) {
177 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
178 break;
179 }
180
181 lxml_child_parent->parent = lxml_element;
182 dom_add_element_ns_hook(private_data, lxml_element);
183 php_dom_add_templated_content(private_data, lxml_element, lxml_child_parent);
184 }
185
186 lxb_html_template_element_t *template = lxb_html_interface_template(&element->node);
187 if (template->content != NULL) {
188 child_node = template->content->node.last_child;
189 }
190 }
191
192 for (; child_node != NULL; child_node = child_node->prev) {
193 lexbor_libxml2_bridge_work_list_item_push(
194 &work_list,
195 child_node,
196 entering_namespace,
197 lxml_child_parent,
198 current_lxml_ns
199 );
200 }
201
202 xmlAttrPtr last_added_attr = NULL;
203 for (lxb_dom_attr_t *attr = element->first_attr; attr != NULL; attr = attr->next) {
204 /* Same namespace remark as for elements */
205 size_t local_name_length, value_length;
206 const lxb_char_t *local_name = lxb_dom_attr_local_name(attr, &local_name_length);
207 const lxb_char_t *value = lxb_dom_attr_value(attr, &value_length);
208
209 if (UNEXPECTED(local_name_length >= INT_MAX || value_length >= INT_MAX)) {
210 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
211 break;
212 }
213
214 xmlAttrPtr lxml_attr = xmlMalloc(sizeof(xmlAttr));
215 if (UNEXPECTED(lxml_attr == NULL)) {
216 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
217 break;
218 }
219
220 memset(lxml_attr, 0, sizeof(xmlAttr));
221 lxml_attr->type = XML_ATTRIBUTE_NODE;
222 lxml_attr->parent = lxml_element;
223 lxml_attr->name = xmlDictLookup(lxml_doc->dict, local_name, local_name_length);
224 lxml_attr->doc = lxml_doc;
225 xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, value, value_length, true /* Always true for optimization purposes */);
226 if (UNEXPECTED(lxml_text == NULL)) {
227 xmlFreeProp(lxml_attr);
228 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
229 break;
230 }
231
232 lxml_attr->children = lxml_attr->last = lxml_text;
233 lxml_text->parent = (xmlNodePtr) lxml_attr;
234
235 if (attr->node.ns == LXB_NS_XMLNS) {
236 if (strcmp((const char *) local_name, "xmlns") != 0) {
237 if (prefixed_xmlns_ns == NULL) {
238 prefixed_xmlns_ns = php_dom_libxml_ns_mapper_get_ns_raw_strings_nullsafe(ns_mapper, "xmlns", DOM_XMLNS_NS_URI);
239 }
240 lxml_attr->ns = prefixed_xmlns_ns;
241 } else {
242 lxml_attr->ns = php_dom_libxml_ns_mapper_ensure_prefixless_xmlns_ns(ns_mapper);
243 }
244 lxml_attr->ns->_private = (void *) php_dom_ns_is_xmlns_magic_token;
245 } else if (attr->node.ns == LXB_NS_XLINK) {
246 if (xlink_ns == NULL) {
247 xlink_ns = php_dom_libxml_ns_mapper_get_ns_raw_strings_nullsafe(ns_mapper, "xlink", DOM_XLINK_NS_URI);
248 xlink_ns->_private = (void *) php_dom_ns_is_xlink_magic_token;
249 }
250 lxml_attr->ns = xlink_ns;
251 }
252
253 if (last_added_attr == NULL) {
254 lxml_element->properties = lxml_attr;
255 } else {
256 last_added_attr->next = lxml_attr;
257 lxml_attr->prev = last_added_attr;
258 }
259 last_added_attr = lxml_attr;
260
261 /* xmlIsID does some other stuff too that is irrelevant here. */
262 if (local_name_length == 2 && local_name[0] == 'i' && local_name[1] == 'd' && attr->node.ns == LXB_NS_HTML) {
263 xmlAddID(NULL, lxml_doc, value, lxml_attr);
264 }
265
266 /* libxml2 doesn't support line numbers on this anyway, it derives them instead, so don't bother */
267 }
268 } else if (node->type == LXB_DOM_NODE_TYPE_TEXT) {
269 lxb_dom_text_t *text = lxb_dom_interface_text(node);
270 const lxb_char_t *data = text->char_data.data.data;
271 size_t data_length = text->char_data.data.length;
272 if (UNEXPECTED(data_length >= INT_MAX)) {
273 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
274 break;
275 }
276 xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, data, data_length, compact_text_nodes);
277 if (UNEXPECTED(lxml_text == NULL)) {
278 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
279 break;
280 }
281 xmlAddChild(lxml_parent, lxml_text);
282 if (node->line >= USHRT_MAX) {
283 lxml_text->line = USHRT_MAX;
284 lxml_text->psvi = (void *) (ptrdiff_t) node->line;
285 } else {
286 lxml_text->line = (unsigned short) node->line;
287 }
288 } else if (node->type == LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
289 lxb_dom_document_type_t *doctype = lxb_dom_interface_document_type(node);
290 const lxb_char_t *name = lxb_dom_document_type_name(doctype, NULL);
291 size_t public_id_len, system_id_len;
292 const lxb_char_t *public_id = lxb_dom_document_type_public_id(doctype, &public_id_len);
293 const lxb_char_t *system_id = lxb_dom_document_type_system_id(doctype, &system_id_len);
294 xmlDtdPtr lxml_dtd = xmlCreateIntSubset(
295 lxml_doc,
296 name,
297 public_id_len ? public_id : NULL,
298 system_id_len ? system_id : NULL
299 );
300 if (UNEXPECTED(lxml_dtd == NULL)) {
301 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
302 break;
303 }
304 /* libxml2 doesn't support line numbers on this anyway, it returns -1 instead, so don't bother */
305 } else if (node->type == LXB_DOM_NODE_TYPE_COMMENT) {
306 lxb_dom_comment_t *comment = lxb_dom_interface_comment(node);
307 xmlNodePtr lxml_comment = xmlNewDocComment(lxml_doc, comment->char_data.data.data);
308 if (UNEXPECTED(lxml_comment == NULL)) {
309 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
310 break;
311 }
312 xmlAddChild(lxml_parent, lxml_comment);
313 lxml_comment->line = sanitize_line_nr(node->line);
314 }
315 }
316
317 lexbor_array_obj_destroy(&work_list, false);
318 return retval;
319 }
320
/* Resets a parse context to an all-zero state, so no error reporters are set. */
void lexbor_libxml2_bridge_parse_context_init(lexbor_libxml2_bridge_parse_context *ctx)
{
	memset(ctx, 0, sizeof(lexbor_libxml2_bridge_parse_context));
}
325
/* Installs the tokenizer and tree error reporters on the parse context.
 * Either reporter may be NULL; lexbor_libxml2_bridge_report_errors() skips NULL reporters. */
void lexbor_libxml2_bridge_parse_set_error_callbacks(
	lexbor_libxml2_bridge_parse_context *ctx,
	lexbor_libxml2_bridge_tokenizer_error_reporter tokenizer_error_reporter,
	lexbor_libxml2_bridge_tree_error_reporter tree_error_reporter
)
{
	ctx->tree_error_reporter = tree_error_reporter;
	ctx->tokenizer_error_reporter = tokenizer_error_reporter;
}
335
lexbor_libxml2_bridge_convert_document(lxb_html_document_t * document,xmlDocPtr * doc_out,bool compact_text_nodes,bool create_default_ns,php_dom_private_data * private_data)336 lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_document(
337 lxb_html_document_t *document,
338 xmlDocPtr *doc_out,
339 bool compact_text_nodes,
340 bool create_default_ns,
341 php_dom_private_data *private_data
342 )
343 {
344 xmlDocPtr lxml_doc = php_dom_create_html_doc();
345 if (UNEXPECTED(!lxml_doc)) {
346 return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
347 }
348 lexbor_libxml2_bridge_status status = lexbor_libxml2_bridge_convert(
349 lxb_dom_interface_node(document)->last_child,
350 lxml_doc,
351 (xmlNodePtr) lxml_doc,
352 compact_text_nodes,
353 create_default_ns,
354 private_data
355 );
356 if (status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK) {
357 xmlFreeDoc(lxml_doc);
358 return status;
359 }
360 *doc_out = lxml_doc;
361 return LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
362 }
363
lexbor_libxml2_bridge_convert_fragment(lxb_dom_node_t * start_node,xmlDocPtr lxml_doc,xmlNodePtr * fragment_out,bool compact_text_nodes,bool create_default_ns,php_dom_private_data * private_data)364 lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_fragment(
365 lxb_dom_node_t *start_node,
366 xmlDocPtr lxml_doc,
367 xmlNodePtr *fragment_out,
368 bool compact_text_nodes,
369 bool create_default_ns,
370 php_dom_private_data *private_data
371 )
372 {
373 xmlNodePtr fragment = xmlNewDocFragment(lxml_doc);
374 if (UNEXPECTED(fragment == NULL)) {
375 return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
376 }
377 lexbor_libxml2_bridge_status status = lexbor_libxml2_bridge_convert(
378 start_node,
379 lxml_doc,
380 fragment,
381 compact_text_nodes,
382 create_default_ns,
383 private_data
384 );
385 if (status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK) {
386 xmlFreeNode(fragment);
387 return status;
388 }
389 *fragment_out = fragment;
390 return LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
391 }
392
/* Forwards parse errors collected by the lexbor parser to the reporters stored in `ctx`.
 *
 * Reporting starts at `*error_index_offset_tokenizer` / `*error_index_offset_tree` and both
 * offsets are advanced past the last entry seen, so a later call only reports new errors.
 * The offsets are advanced even when the corresponding reporter is not set.
 *
 * Tokenizer errors are reported with a position computed as the error's offset from
 * `input_html` plus `chunk_offset`. Tree errors are reported with line and column
 * incremented by one. */
void lexbor_libxml2_bridge_report_errors(
	const lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_parser_t *parser,
	const lxb_char_t *input_html,
	size_t chunk_offset,
	size_t *error_index_offset_tokenizer,
	size_t *error_index_offset_tree
)
{
	/* Tokenizer errors.
	 * See https://github.com/lexbor/lexbor/blob/master/source/lexbor/html/tokenizer/error.h */
	lexbor_array_obj_t *tokenizer_errors = lxb_html_parser_tokenizer(parser)->parse_errors;
	size_t tokenizer_index = *error_index_offset_tokenizer;
	for (;; tokenizer_index++) {
		lxb_html_tokenizer_error_t *token_error = lexbor_array_obj_get(tokenizer_errors, tokenizer_index);
		if (token_error == NULL) {
			break;
		}
		if (ctx->tokenizer_error_reporter) {
			ctx->tokenizer_error_reporter(
				ctx->application_data,
				token_error,
				token_error->pos - input_html + chunk_offset
			);
		}
	}
	*error_index_offset_tokenizer = tokenizer_index;

	/* Tree parser errors.
	 * See https://github.com/lexbor/lexbor/blob/master/source/lexbor/html/tree/error.h */
	lexbor_array_obj_t *tree_errors = lxb_html_parser_tree(parser)->parse_errors;
	size_t tree_index = *error_index_offset_tree;
	for (;; tree_index++) {
		lxb_html_tree_error_t *tree_error = lexbor_array_obj_get(tree_errors, tree_index);
		if (tree_error == NULL) {
			break;
		}
		if (ctx->tree_error_reporter) {
			ctx->tree_error_reporter(
				ctx->application_data,
				tree_error,
				tree_error->line + 1,
				tree_error->column + 1,
				tree_error->length
			);
		}
	}
	*error_index_offset_tree = tree_index;
}
440
dom_translate_quirks_mode(lxb_dom_document_cmode_t quirks_mode)441 static php_libxml_quirks_mode dom_translate_quirks_mode(lxb_dom_document_cmode_t quirks_mode)
442 {
443 switch (quirks_mode) {
444 case LXB_DOM_DOCUMENT_CMODE_NO_QUIRKS: return PHP_LIBXML_NO_QUIRKS;
445 case LXB_DOM_DOCUMENT_CMODE_LIMITED_QUIRKS: return PHP_LIBXML_LIMITED_QUIRKS;
446 case LXB_DOM_DOCUMENT_CMODE_QUIRKS: return PHP_LIBXML_QUIRKS;
447 EMPTY_SWITCH_DEFAULT_CASE();
448 }
449 }
450
/* Copies what the tree builder recorded about the parsed input into `observations`:
 * whether html/head/body tags were explicitly present, and the document's quirks mode. */
void lexbor_libxml2_bridge_copy_observations(lxb_html_tree_t *tree, lexbor_libxml2_bridge_extracted_observations *observations)
{
	const lxb_dom_document_cmode_t compat_mode = lxb_dom_interface_document(tree->document)->compat_mode;

	observations->has_explicit_html_tag = tree->has_explicit_html_tag;
	observations->has_explicit_head_tag = tree->has_explicit_head_tag;
	observations->has_explicit_body_tag = tree->has_explicit_body_tag;
	observations->quirks_mode = dom_translate_quirks_mode(compat_mode);
}
458
459 #endif /* HAVE_LIBXML && HAVE_DOM */
460