xref: /php-src/ext/dom/inner_html_mixin.c (revision 1b077725)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Authors: Niels Dossche <nielsdos@php.net>                            |
14    +----------------------------------------------------------------------+
15 */
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "php_dom.h"
24 #include "dom_properties.h"
25 #include "html5_parser.h"
26 #include "html5_serializer.h"
27 #include "xml_serializer.h"
28 #include "domexception.h"
29 #include <libxml/xmlsave.h>
30 #include <lexbor/dom/interfaces/element.h>
31 #include <lexbor/html/interfaces/document.h>
32 #include <lexbor/tag/tag.h>
33 #include <lexbor/encoding/encoding.h>
34 
35 /* Spec date: 2024-04-14 */
36 
dom_inner_html_write_string(void * application_data,const char * buf)37 static zend_result dom_inner_html_write_string(void *application_data, const char *buf)
38 {
39 	smart_str *output = application_data;
40 	smart_str_appends(output, buf);
41 	return SUCCESS;
42 }
43 
dom_inner_html_write_string_len(void * application_data,const char * buf,size_t len)44 static zend_result dom_inner_html_write_string_len(void *application_data, const char *buf, size_t len)
45 {
46 	smart_str *output = application_data;
47 	smart_str_appendl(output, buf, len);
48 	return SUCCESS;
49 }
50 
dom_write_smart_str(void * context,const char * buffer,int len)51 static int dom_write_smart_str(void *context, const char *buffer, int len)
52 {
53 	smart_str *str = context;
54 	smart_str_appendl(str, buffer, len);
55 	return len;
56 }
57 
58 /* https://w3c.github.io/DOM-Parsing/#the-innerhtml-mixin
59  * and https://w3c.github.io/DOM-Parsing/#dfn-fragment-serializing-algorithm */
dom_element_inner_html_read(dom_object * obj,zval * retval)60 zend_result dom_element_inner_html_read(dom_object *obj, zval *retval)
61 {
62 	DOM_PROP_NODE(xmlNodePtr, node, obj);
63 
64 	/* 1. Let context document be the value of node's node document. */
65 	const xmlDoc *context_document = node->doc;
66 
67 	/* 2. If context document is an HTML document, return an HTML serialization of node. */
68 	if (context_document->type == XML_HTML_DOCUMENT_NODE) {
69 		smart_str output = {0};
70 		dom_html5_serialize_context ctx;
71 		ctx.private_data = php_dom_get_private_data(obj);
72 		ctx.application_data = &output;
73 		ctx.write_string = dom_inner_html_write_string;
74 		ctx.write_string_len = dom_inner_html_write_string_len;
75 		dom_html5_serialize(&ctx, node);
76 		ZVAL_STR(retval, smart_str_extract(&output));
77 	}
78 	/* 3. Otherwise, context document is an XML document; return an XML serialization of node passing the flag require well-formed. */
79 	else {
80 		ZEND_ASSERT(context_document->type == XML_DOCUMENT_NODE);
81 
82 		int status = -1;
83 		smart_str str = {0};
84 		/* No need to check buf's return value, as xmlSaveToBuffer() will fail instead. */
85 		xmlSaveCtxtPtr ctxt = xmlSaveToIO(dom_write_smart_str, NULL, &str, "UTF-8", XML_SAVE_AS_XML);
86 		if (EXPECTED(ctxt != NULL)) {
87 			xmlCharEncodingHandlerPtr handler = xmlFindCharEncodingHandler("UTF-8");
88 			xmlOutputBufferPtr out = xmlOutputBufferCreateIO(dom_write_smart_str, NULL, &str, handler);
89 			if (EXPECTED(out != NULL)) {
90 				php_dom_private_data *private_data = php_dom_get_private_data(obj);
91 				/* Note: the innerHTML mixin sets the well-formed flag to true. */
92 				xmlNodePtr child = node->children;
93 				status = 0;
94 				while (child != NULL && status == 0) {
95 					status = dom_xml_serialize(ctxt, out, child, false, true, private_data);
96 					child = child->next;
97 				}
98 				status |= xmlOutputBufferFlush(out);
99 				status |= xmlOutputBufferClose(out);
100 			}
101 			(void) xmlSaveClose(ctxt);
102 			xmlCharEncCloseFunc(handler);
103 		}
104 		if (UNEXPECTED(status < 0)) {
105 			smart_str_free_ex(&str, false);
106 			php_dom_throw_error_with_message(SYNTAX_ERR, "The resulting XML serialization is not well-formed", true);
107 			return FAILURE;
108 		}
109 		ZVAL_STR(retval, smart_str_extract(&str));
110 	}
111 
112 	return SUCCESS;
113 }
114 
dom_html_fragment_lexbor_parse(lxb_html_document_t * document,lxb_dom_element_t * element,const zend_string * input)115 static lxb_dom_node_t *dom_html_fragment_lexbor_parse(lxb_html_document_t *document, lxb_dom_element_t *element, const zend_string *input)
116 {
117 	lxb_status_t status = lxb_html_document_parse_fragment_chunk_begin(document, element);
118 	if (status != LXB_STATUS_OK) {
119 		return NULL;
120 	}
121 
122 	const lxb_encoding_data_t *encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
123 	lxb_encoding_decode_t decode;
124 	lxb_encoding_decode_init_single(&decode, encoding_data);
125 
126 	const lxb_char_t *buf_ref = (const lxb_char_t *) ZSTR_VAL(input);
127 	if (ZSTR_IS_VALID_UTF8(input)) {
128 		/* If we know the input is valid UTF-8, we don't have to perform checks and replace invalid sequences. */
129 		status = lxb_html_document_parse_fragment_chunk(document, buf_ref, ZSTR_LEN(input));
130 		if (UNEXPECTED(status != LXB_STATUS_OK)) {
131 			return NULL;
132 		}
133 	} else {
134 		/* See dom_decode_encode_fast_path(), simplified version for in-memory use-case. */
135 		const lxb_char_t *buf_end = buf_ref + ZSTR_LEN(input);
136 		const lxb_char_t *last_output = buf_ref;
137 		while (buf_ref < buf_end) {
138 			if (decode.u.utf_8.need == 0 && *buf_ref < 0x80) {
139 				buf_ref++;
140 				continue;
141 			}
142 
143 			const lxb_char_t *buf_ref_backup = buf_ref;
144 			lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decode, &buf_ref, buf_end);
145 			if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {
146 				status = lxb_html_document_parse_fragment_chunk(document, last_output, buf_ref_backup - last_output);
147 				if (UNEXPECTED(status != LXB_STATUS_OK)) {
148 					return NULL;
149 				}
150 
151 				status = lxb_html_document_parse_fragment_chunk(document, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
152 				if (UNEXPECTED(status != LXB_STATUS_OK)) {
153 					return NULL;
154 				}
155 
156 				last_output = buf_ref;
157 			}
158 		}
159 
160 		if (buf_ref != last_output) {
161 			status = lxb_html_document_parse_fragment_chunk(document, last_output, buf_ref - last_output);
162 			if (UNEXPECTED(status != LXB_STATUS_OK)) {
163 				return NULL;
164 			}
165 		}
166 	}
167 
168 	return lxb_html_document_parse_fragment_chunk_end(document);
169 }
170 
dom_translate_quirks_mode(php_libxml_quirks_mode quirks_mode)171 static lxb_dom_document_cmode_t dom_translate_quirks_mode(php_libxml_quirks_mode quirks_mode)
172 {
173 	switch (quirks_mode) {
174 		case PHP_LIBXML_NO_QUIRKS: return LXB_DOM_DOCUMENT_CMODE_NO_QUIRKS;
175 		case PHP_LIBXML_LIMITED_QUIRKS: return LXB_DOM_DOCUMENT_CMODE_LIMITED_QUIRKS;
176 		case PHP_LIBXML_QUIRKS: return LXB_DOM_DOCUMENT_CMODE_QUIRKS;
177 		EMPTY_SWITCH_DEFAULT_CASE();
178 	}
179 }
180 
181 /* https://html.spec.whatwg.org/#html-fragment-parsing-algorithm */
dom_html_fragment_parsing_algorithm(dom_object * obj,xmlNodePtr context_node,const zend_string * input,php_libxml_quirks_mode quirks_mode)182 static xmlNodePtr dom_html_fragment_parsing_algorithm(dom_object *obj, xmlNodePtr context_node, const zend_string *input, php_libxml_quirks_mode quirks_mode)
183 {
184 	/* The whole algorithm is implemented in Lexbor, we just have to be the adapter between the
185 	 * data structures used in PHP and what Lexbor expects. */
186 
187 	lxb_html_document_t *document = lxb_html_document_create();
188 	document->dom_document.compat_mode = dom_translate_quirks_mode(quirks_mode);
189 	lxb_dom_element_t *element = lxb_dom_element_interface_create(&document->dom_document);
190 
191 	const lxb_tag_data_t *tag_data = lxb_tag_data_by_name(document->dom_document.tags, (lxb_char_t *) context_node->name, xmlStrlen(context_node->name));
192 	element->node.local_name = tag_data == NULL ? LXB_TAG__UNDEF : tag_data->tag_id;
193 
194 	const lxb_char_t *ns_uri;
195 	size_t ns_uri_len;
196 	if (context_node->ns == NULL || context_node->ns->href == NULL) {
197 		ns_uri = (lxb_char_t *) "";
198 		ns_uri_len = 0;
199 	} else {
200 		ns_uri = context_node->ns->href;
201 		ns_uri_len = xmlStrlen(ns_uri);
202 	}
203 	const lxb_ns_data_t *ns_data = lxb_ns_data_by_link(document->dom_document.ns, ns_uri, ns_uri_len);
204 	element->node.ns = ns_data == NULL ? LXB_NS__UNDEF : ns_data->ns_id;
205 
206 	lxb_dom_node_t *node = dom_html_fragment_lexbor_parse(document, element, input);
207 	xmlNodePtr fragment = NULL;
208 	if (node != NULL) {
209 		/* node->last_child could be NULL, but that is allowed. */
210 		lexbor_libxml2_bridge_status status = lexbor_libxml2_bridge_convert_fragment(node->last_child, context_node->doc, &fragment, true, true, php_dom_get_private_data(obj));
211 		if (UNEXPECTED(status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) {
212 			php_dom_throw_error(INVALID_STATE_ERR, true);
213 		}
214 	} else {
215 		php_dom_throw_error(INVALID_STATE_ERR, true);
216 	}
217 
218 	lxb_html_document_destroy(document);
219 
220 	return fragment;
221 }
222 
/* Pushes the qualified name of `context_node` into the push parser: "prefix:" when the
 * node has a namespace with a prefix, followed by the node's name. */
static void dom_xml_parser_tag_name(const xmlNode *context_node, xmlParserCtxtPtr parser)
{
	const xmlNs *node_ns = context_node->ns;

	if (node_ns != NULL && node_ns->prefix != NULL) {
		const xmlChar *prefix = node_ns->prefix;
		xmlParseChunk(parser, (const char *) prefix, xmlStrlen(prefix), 0);
		xmlParseChunk(parser, ":", 1, 0);
	}

	const xmlChar *local_name = context_node->name;
	xmlParseChunk(parser, (const char *) local_name, xmlStrlen(local_name), 0);
}
232 
/* Feeds the push parser a synthetic document of the form
 *   <name xmlns:p1="..." ... xmlns="...">INPUT</name>
 * where `name` is the qualified name of `context_node`, the xmlns attributes declare the
 * namespaces returned by php_dom_get_in_scope_ns() plus the default namespace found by
 * dom_locate_a_namespace(), and INPUT is the raw bytes of `input`.
 *
 * The return values of xmlParseChunk() are not inspected here; the caller checks the
 * parser's well-formedness flags afterwards. */
static void dom_xml_fragment_parsing_algorithm_parse(php_dom_libxml_ns_mapper *ns_mapper, const xmlNode *context_node, const zend_string *input, xmlParserCtxtPtr parser)
{
	xmlParseChunk(parser, "<", 1, 0);
	dom_xml_parser_tag_name(context_node, parser);

	/* Namespaces: we have to declare all in-scope namespaces including the default namespace */
	/* xmlns attributes */
	php_dom_in_scope_ns in_scope_ns = php_dom_get_in_scope_ns(ns_mapper, context_node, true);
	for (size_t i = 0; i < in_scope_ns.count; i++) {
		const xmlNs *ns = in_scope_ns.list[i];
		xmlParseChunk(parser, " xmlns:", 7, 0);
		ZEND_ASSERT(ns->prefix != NULL);
		xmlParseChunk(parser, (const char *) ns->prefix, xmlStrlen(ns->prefix), 0);
		xmlParseChunk(parser, "=\"", 2, 0);
		xmlParseChunk(parser, (const char *) ns->href, xmlStrlen(ns->href), 0);
		xmlParseChunk(parser, "\"", 1, 0);
	}
	php_dom_in_scope_ns_destroy(&in_scope_ns);
	/* default namespace */
	const char *default_ns = dom_locate_a_namespace(context_node, NULL);
	if (default_ns != NULL) {
		xmlParseChunk(parser, " xmlns=\"", 8, 0);
		xmlParseChunk(parser, default_ns, strlen(default_ns), 0);
		xmlParseChunk(parser, "\"", 1, 0);
	}

	xmlParseChunk(parser, ">", 1, 0);

	/* xmlParseChunk() takes the chunk size as an int, whereas a zend_string length is a
	 * size_t. Feed anything longer than INT_MAX in several pieces so the length is never
	 * truncated by the conversion. */
	const char *input_ptr = ZSTR_VAL(input);
	size_t input_left = ZSTR_LEN(input);
	while (input_left > (size_t) INT_MAX) {
		xmlParseChunk(parser, input_ptr, INT_MAX, 0);
		input_ptr += INT_MAX;
		input_left -= (size_t) INT_MAX;
	}
	xmlParseChunk(parser, input_ptr, (int) input_left, 0);

	xmlParseChunk(parser, "</", 2, 0);
	dom_xml_parser_tag_name(context_node, parser);
	/* Last chunk: terminate the parse. */
	xmlParseChunk(parser, ">", 1, 1);
}
267 
268 /* https://html.spec.whatwg.org/#xml-fragment-parsing-algorithm */
dom_xml_fragment_parsing_algorithm(dom_object * obj,const xmlNode * context_node,const zend_string * input)269 static xmlNodePtr dom_xml_fragment_parsing_algorithm(dom_object *obj, const xmlNode *context_node, const zend_string *input)
270 {
271 	/* Steps 1-4 below */
272 	xmlParserCtxtPtr parser = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
273 	if (UNEXPECTED(parser == NULL)) {
274 		php_dom_throw_error(INVALID_STATE_ERR, true);
275 		return NULL;
276 	}
277 
278 	/* This is not only good to avoid a performance cost of changing the tree, but also to work around an old bug
279 	 * in xmlSetTreeDoc(). */
280 	xmlDictFree(parser->dict);
281 	if (context_node->doc->dict == NULL) {
282 		context_node->doc->dict = xmlDictCreate();
283 		xmlDictSetLimit(context_node->doc->dict, XML_MAX_DICTIONARY_LIMIT);
284 	}
285 	parser->dict = context_node->doc->dict;
286 
287 	php_libxml_sanitize_parse_ctxt_options(parser);
288 	xmlCtxtUseOptions(parser, XML_PARSE_IGNORE_ENC | XML_PARSE_NOERROR | XML_PARSE_NOWARNING);
289 
290 	xmlCharEncodingHandlerPtr encoding = xmlFindCharEncodingHandler("UTF-8");
291 	(void) xmlSwitchToEncoding(parser, encoding);
292 
293 	php_dom_libxml_ns_mapper *ns_mapper = php_dom_get_ns_mapper(obj);
294 	dom_xml_fragment_parsing_algorithm_parse(ns_mapper, context_node, input, parser);
295 
296 	/* 5. If there is an XML well-formedness or XML namespace well-formedness error, then throw a "SyntaxError" DOMException. */
297 	if (!parser->wellFormed || !parser->nsWellFormed) {
298 		parser->dict = NULL;
299 		xmlFreeDoc(parser->myDoc);
300 		xmlFreeParserCtxt(parser);
301 		php_dom_throw_error_with_message(SYNTAX_ERR, "XML fragment is not well-formed", true);
302 		return NULL;
303 	}
304 
305 	xmlDocPtr doc = parser->myDoc;
306 	xmlFreeParserCtxt(parser);
307 
308 	if (EXPECTED(doc != NULL)) {
309 		doc->dict = NULL;
310 
311 		/* 6. If the document element of the resulting Document has any sibling nodes, then throw a "SyntaxError" DOMException. */
312 		xmlNodePtr document_element = doc->children;
313 		if (document_element == NULL || document_element->next != NULL) {
314 			xmlFreeDoc(doc);
315 			php_dom_throw_error_with_message(SYNTAX_ERR, "XML fragment is not well-formed", true);
316 			return NULL;
317 		}
318 
319 		/* 7. Return the child nodes of the document element of the resulting Document, in tree order. */
320 		xmlNodePtr fragment = xmlNewDocFragment(context_node->doc);
321 		if (EXPECTED(fragment != NULL)) {
322 			xmlNodePtr child = document_element->children;
323 			/* Yes, we have to call both xmlSetTreeDoc() prior to xmlAddChildList()
324 			 * because xmlAddChildList() _only_ sets the tree for the topmost elements in the subtree! */
325 			xmlSetTreeDoc(document_element, context_node->doc);
326 			xmlAddChildList(fragment, child);
327 			dom_mark_namespaces_as_attributes_too(ns_mapper, doc);
328 			document_element->children = NULL;
329 			document_element->last = NULL;
330 		}
331 		xmlFreeDoc(doc);
332 		return fragment;
333 	}
334 	return NULL;
335 }
336 
337 /* https://w3c.github.io/DOM-Parsing/#the-innerhtml-mixin
338  * and https://w3c.github.io/DOM-Parsing/#dfn-fragment-parsing-algorithm */
dom_element_inner_html_write(dom_object * obj,zval * newval)339 zend_result dom_element_inner_html_write(dom_object *obj, zval *newval)
340 {
341 	DOM_PROP_NODE(xmlNodePtr, context_node, obj);
342 
343 	xmlNodePtr fragment;
344 	if (context_node->doc->type == XML_DOCUMENT_NODE) {
345 		fragment = dom_xml_fragment_parsing_algorithm(obj, context_node, Z_STR_P(newval));
346 	} else {
347 		fragment = dom_html_fragment_parsing_algorithm(obj, context_node, Z_STR_P(newval), obj->document->quirks_mode);
348 	}
349 
350 	if (fragment == NULL) {
351 		return FAILURE;
352 	}
353 
354 	if (php_dom_ns_is_fast(context_node, php_dom_ns_is_html_magic_token) && xmlStrEqual(context_node->name, BAD_CAST "template")) {
355 		context_node = php_dom_ensure_templated_content(php_dom_get_private_data(obj), context_node);
356 		if (context_node == NULL) {
357 			xmlFreeNode(fragment);
358 			return FAILURE;
359 		}
360 	}
361 
362 	dom_remove_all_children(context_node);
363 	return php_dom_pre_insert(obj->document, fragment, context_node, NULL) ? SUCCESS : FAILURE;
364 }
365 
366 #endif
367