xref: /php-src/ext/dom/html5_serializer.c (revision 690ce6d5)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Authors: Niels Dossche <nielsdos@php.net>                            |
14    +----------------------------------------------------------------------+
15 */
16 
17 #ifdef HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20 
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "php_dom.h"
24 #include "html5_serializer.h"
25 #include "namespace_compat.h"
26 #include "serialize_common.h"
27 #include <lexbor/encoding/encoding.h>
28 
29 /* This file implements the HTML 5 serialization algorithm.
30  * https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments (Date 2023-12-14)
31  */
32 
/* Evaluates x once; if the result is not SUCCESS, the enclosing function returns FAILURE. */
#define TRY(x) \
	do { \
		if (UNEXPECTED((x) != SUCCESS)) { \
			return FAILURE; \
		} \
	} while (0)
34 
dom_html5_serialize_doctype(dom_html5_serialize_context * ctx,const xmlDtd * dtd)35 static zend_result dom_html5_serialize_doctype(dom_html5_serialize_context *ctx, const xmlDtd *dtd)
36 {
37 	TRY(ctx->write_string_len(ctx->application_data, "<!DOCTYPE ", strlen("<!DOCTYPE ")));
38 	TRY(ctx->write_string(ctx->application_data, (const char *) dtd->name));
39 	return ctx->write_string_len(ctx->application_data, ">", strlen(">"));
40 }
41 
dom_html5_serialize_comment(dom_html5_serialize_context * ctx,const xmlNode * node)42 static zend_result dom_html5_serialize_comment(dom_html5_serialize_context *ctx, const xmlNode *node)
43 {
44 	TRY(ctx->write_string_len(ctx->application_data, "<!--", strlen("<!--")));
45 	if (node->content) {
46 		TRY(ctx->write_string(ctx->application_data, (const char*) node->content));
47 	}
48 	return ctx->write_string_len(ctx->application_data, "-->", strlen("-->"));
49 }
50 
dom_html5_serialize_processing_instruction(dom_html5_serialize_context * ctx,const xmlNode * node)51 static zend_result dom_html5_serialize_processing_instruction(dom_html5_serialize_context *ctx, const xmlNode *node)
52 {
53 	TRY(ctx->write_string_len(ctx->application_data, "<?", strlen("<?")));
54 	TRY(ctx->write_string(ctx->application_data, (const char *) node->name));
55 	TRY(ctx->write_string_len(ctx->application_data, " ", strlen(" ")));
56 	if (node->content) {
57 		TRY(ctx->write_string(ctx->application_data, (const char *) node->content));
58 	}
59 	return ctx->write_string_len(ctx->application_data, ">", strlen(">"));
60 }
61 
dom_html5_serialize_entity_ref(dom_html5_serialize_context * ctx,const xmlNode * node)62 static zend_result dom_html5_serialize_entity_ref(dom_html5_serialize_context *ctx, const xmlNode *node)
63 {
64 	TRY(ctx->write_string_len(ctx->application_data, "&", strlen("&")));
65 	TRY(ctx->write_string(ctx->application_data, (const char *) node->name));
66 	return ctx->write_string_len(ctx->application_data, ";", strlen(";"));
67 }
68 
69 /* https://html.spec.whatwg.org/multipage/parsing.html#escapingString */
dom_html5_escape_string(dom_html5_serialize_context * ctx,const char * content,bool attribute_mode)70 static zend_result dom_html5_escape_string(dom_html5_serialize_context *ctx, const char *content, bool attribute_mode)
71 {
72 	const char *last_output = content;
73 
74 	/* Note: uses UTF-8 internally, so <C2 A0> indicates a non-breaking space */
75 	const char *mask = attribute_mode ? "&\xC2\"" : "&\xC2<>";
76 
77 	while (true) {
78 		size_t chunk_length = strcspn(content, mask);
79 
80 		content += chunk_length;
81 		if (*content == '\0') {
82 			break;
83 		}
84 
85 		switch (*content) {
86 			/* Step 1 */
87 			case '&': {
88 				TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
89 				TRY(ctx->write_string_len(ctx->application_data, "&amp;", strlen("&amp;")));
90 				last_output = content + 1;
91 				break;
92 			}
93 
94 			/* Step 2 (non-breaking space) (note: uses UTF-8 internally) */
95 			case '\xC2': {
96 				if (content[1] == '\xA0') {
97 					TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
98 					TRY(ctx->write_string_len(ctx->application_data, "&nbsp;", strlen("&nbsp;")));
99 					content++; /* Consume A0 too */
100 					last_output = content + 1;
101 				}
102 				break;
103 			}
104 
105 			/* Step 3 */
106 			case '"': {
107 				TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
108 				TRY(ctx->write_string_len(ctx->application_data, "&quot;", strlen("&quot;")));
109 				last_output = content + 1;
110 				break;
111 			}
112 
113 			/* Step 4 */
114 			case '<': {
115 				TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
116 				TRY(ctx->write_string_len(ctx->application_data, "&lt;", strlen("&lt;")));
117 				last_output = content + 1;
118 				break;
119 			}
120 			case '>': {
121 				TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
122 				TRY(ctx->write_string_len(ctx->application_data, "&gt;", strlen("&gt;")));
123 				last_output = content + 1;
124 				break;
125 			}
126 		}
127 
128 		content++;
129 	}
130 
131 	return ctx->write_string_len(ctx->application_data, last_output, content - last_output);
132 }
133 
dom_html5_serialize_text_node(dom_html5_serialize_context * ctx,const xmlNode * node)134 static zend_result dom_html5_serialize_text_node(dom_html5_serialize_context *ctx, const xmlNode *node)
135 {
136 	if (!node->content) {
137 		return SUCCESS;
138 	}
139 
140 	const xmlNode *parent = node->parent;
141 	if (parent != NULL && parent->type == XML_ELEMENT_NODE && php_dom_ns_is_fast(parent, php_dom_ns_is_html_magic_token)) {
142 		size_t name_length = strlen((const char *) parent->name);
143 		/* Spec tells us to only emit noscript content as-is if scripting is enabled.
144 		 * However, the user agent (PHP) does not support (JS) scripting.
145 		 * Furthermore, if actually consumed by a browser then we should err on the safe side and not emit the content as-is. */
146 		if (dom_local_name_compare_ex(parent, "style", strlen("style"), name_length)
147 			|| dom_local_name_compare_ex(parent, "script", strlen("script"), name_length)
148 			|| dom_local_name_compare_ex(parent, "xmp", strlen("xmp"), name_length)
149 			|| dom_local_name_compare_ex(parent, "iframe", strlen("iframe"), name_length)
150 			|| dom_local_name_compare_ex(parent, "noembed", strlen("noembed"), name_length)
151 			|| dom_local_name_compare_ex(parent, "noframes", strlen("noframes"), name_length)
152 			|| dom_local_name_compare_ex(parent, "plaintext", strlen("plaintext"), name_length)) {
153 			return ctx->write_string(ctx->application_data, (const char *) node->content);
154 		}
155 	}
156 
157 	return dom_html5_escape_string(ctx, (const char *) node->content, false);
158 }
159 
dom_html5_serialize_element_tag_name(dom_html5_serialize_context * ctx,const xmlNode * node)160 static zend_result dom_html5_serialize_element_tag_name(dom_html5_serialize_context *ctx, const xmlNode *node)
161 {
162 	/* Note: it is not the serializer's responsibility to care about uppercase/lowercase (see createElement() note) */
163 	if (node->ns != NULL && node->ns->prefix != NULL
164 		&& !(php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token) || php_dom_ns_is_fast(node, php_dom_ns_is_mathml_magic_token) || php_dom_ns_is_fast(node, php_dom_ns_is_svg_magic_token))) {
165 		TRY(ctx->write_string(ctx->application_data, (const char *) node->ns->prefix));
166 		TRY(ctx->write_string_len(ctx->application_data, ":", strlen(":")));
167 	}
168 	return ctx->write_string(ctx->application_data, (const char *) node->name);
169 }
170 
dom_html5_serialize_element_start(dom_html5_serialize_context * ctx,const xmlNode * node)171 static zend_result dom_html5_serialize_element_start(dom_html5_serialize_context *ctx, const xmlNode *node)
172 {
173 	TRY(ctx->write_string_len(ctx->application_data, "<", strlen("<")));
174 	TRY(dom_html5_serialize_element_tag_name(ctx, node));
175 
176 	/* We don't support the "is" value during element creation, so no handling here. */
177 
178 	for (const xmlAttr *attr = node->properties; attr; attr = attr->next) {
179 		TRY(ctx->write_string_len(ctx->application_data, " ", strlen(" ")));
180 		if (attr->ns == NULL) {
181 			TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
182 		} else {
183 			if (php_dom_ns_is_fast((const xmlNode *) attr, php_dom_ns_is_xml_magic_token)) {
184 				TRY(ctx->write_string_len(ctx->application_data, "xml:", strlen("xml:")));
185 				TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
186 			} else if (php_dom_ns_is_fast((const xmlNode *) attr, php_dom_ns_is_xmlns_magic_token)) {
187 				/* Compatibility for real attributes */
188 				if (strcmp((const char *) attr->name, "xmlns") == 0) {
189 					TRY(ctx->write_string_len(ctx->application_data, "xmlns", strlen("xmlns")));
190 				} else {
191 					TRY(ctx->write_string_len(ctx->application_data, "xmlns:", strlen("xmlns:")));
192 					TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
193 				}
194 			} else if (php_dom_ns_is_fast((const xmlNode *) attr, php_dom_ns_is_xlink_magic_token)) {
195 				TRY(ctx->write_string_len(ctx->application_data, "xlink:", strlen("xlink:")));
196 				TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
197 			} else if (attr->ns->prefix == NULL) {
198 				TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
199 			} else {
200 				TRY(ctx->write_string(ctx->application_data, (const char *) attr->ns->prefix));
201 				TRY(ctx->write_string_len(ctx->application_data, ":", strlen(":")));
202 				TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
203 			}
204 		}
205 
206 		TRY(ctx->write_string_len(ctx->application_data, "=\"", strlen("=\"")));
207 
208 		for (xmlNodePtr child = attr->children; child != NULL; child = child->next) {
209 			if (child->type == XML_TEXT_NODE) {
210 				if (child->content != NULL) {
211 					TRY(dom_html5_escape_string(ctx, (const char *) child->content, true));
212 				}
213 			} else if (child->type == XML_ENTITY_REF_NODE) {
214 				TRY(ctx->write_string_len(ctx->application_data, "&", strlen("&")));
215 				TRY(dom_html5_escape_string(ctx, (const char *) child->name, true));
216 				TRY(ctx->write_string_len(ctx->application_data, ";", strlen(";")));
217 			}
218 		}
219 
220 		TRY(ctx->write_string_len(ctx->application_data, "\"", strlen("\"")));
221 	}
222 
223 	return ctx->write_string_len(ctx->application_data, ">", strlen(">"));
224 
225 	/* Note: "continue on to the next child if the element is void" is handled in the iteration and dom_html5_serialize_element_end() */
226 }
227 
228 /* https://html.spec.whatwg.org/multipage/syntax.html#void-elements
229  * https://html.spec.whatwg.org/multipage/parsing.html#serializes-as-void */
dom_html5_serializes_as_void(const xmlNode * node)230 static bool dom_html5_serializes_as_void(const xmlNode *node)
231 {
232 	if (php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token)) {
233 		size_t name_length = strlen((const char *) node->name);
234 		if (/* These are the void elements from https://html.spec.whatwg.org/multipage/syntax.html#void-elements */
235 			dom_local_name_compare_ex(node, "area", strlen("area"), name_length)
236 			|| dom_local_name_compare_ex(node, "base", strlen("base"), name_length)
237 			|| dom_local_name_compare_ex(node, "br", strlen("br"), name_length)
238 			|| dom_local_name_compare_ex(node, "col", strlen("col"), name_length)
239 			|| dom_local_name_compare_ex(node, "embed", strlen("embed"), name_length)
240 			|| dom_local_name_compare_ex(node, "hr", strlen("hr"), name_length)
241 			|| dom_local_name_compare_ex(node, "img", strlen("img"), name_length)
242 			|| dom_local_name_compare_ex(node, "input", strlen("input"), name_length)
243 			|| dom_local_name_compare_ex(node, "link", strlen("link"), name_length)
244 			|| dom_local_name_compare_ex(node, "meta", strlen("meta"), name_length)
245 			|| dom_local_name_compare_ex(node, "source", strlen("source"), name_length)
246 			|| dom_local_name_compare_ex(node, "track", strlen("track"), name_length)
247 			|| dom_local_name_compare_ex(node, "wbr", strlen("wbr"), name_length)
248 			/* These are the additional names from https://html.spec.whatwg.org/multipage/parsing.html#serializes-as-void */
249 			|| dom_local_name_compare_ex(node, "basefont", strlen("basefont"), name_length)
250 			|| dom_local_name_compare_ex(node, "bgsound", strlen("bgsound"), name_length)
251 			|| dom_local_name_compare_ex(node, "frame", strlen("frame"), name_length)
252 			|| dom_local_name_compare_ex(node, "keygen", strlen("keygen"), name_length)
253 			|| dom_local_name_compare_ex(node, "param", strlen("param"), name_length)) {
254 			return true;
255 		}
256 	}
257 	return false;
258 }
259 
dom_html5_serialize_element_end(dom_html5_serialize_context * ctx,const xmlNode * node)260 static zend_result dom_html5_serialize_element_end(dom_html5_serialize_context *ctx, const xmlNode *node)
261 {
262 	if (!dom_html5_serializes_as_void(node)) {
263 		TRY(ctx->write_string_len(ctx->application_data, "</", strlen("</")));
264 		TRY(dom_html5_serialize_element_tag_name(ctx, node));
265 		return ctx->write_string_len(ctx->application_data, ">", strlen(">"));
266 	}
267 	return SUCCESS;
268 }
269 
270 /* https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-serialisation-algorithm */
dom_html5_serialize_node(dom_html5_serialize_context * ctx,const xmlNode * node,const xmlNode * bound)271 static zend_result dom_html5_serialize_node(dom_html5_serialize_context *ctx, const xmlNode *node, const xmlNode *bound)
272 {
273 	while (node != NULL) {
274 		switch (node->type) {
275 			case XML_DTD_NODE: {
276 				TRY(dom_html5_serialize_doctype(ctx, (const xmlDtd *) node));
277 				break;
278 			}
279 
280 			case XML_CDATA_SECTION_NODE:
281 			case XML_TEXT_NODE: {
282 				TRY(dom_html5_serialize_text_node(ctx, node));
283 				break;
284 			}
285 
286 			case XML_PI_NODE: {
287 				TRY(dom_html5_serialize_processing_instruction(ctx, node));
288 				break;
289 			}
290 
291 			case XML_COMMENT_NODE: {
292 				TRY(dom_html5_serialize_comment(ctx, node));
293 				break;
294 			}
295 
296 			case XML_ELEMENT_NODE: {
297 				TRY(dom_html5_serialize_element_start(ctx, node));
298 				const xmlNode *children = node->children;
299 				if (php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token) && xmlStrEqual(node->name, BAD_CAST "template")) {
300 					children = php_dom_retrieve_templated_content(ctx->private_data, node);
301 				}
302 				if (children) {
303 					if (!dom_html5_serializes_as_void(node)) {
304 						node = children;
305 						continue;
306 					}
307 				} else {
308 					/* Not descended, so wouldn't put the closing tag as it's normally only done when going back upwards. */
309 					TRY(dom_html5_serialize_element_end(ctx, node));
310 				}
311 				break;
312 			}
313 
314 			case XML_DOCUMENT_FRAG_NODE: {
315 				if (node->children) {
316 					node = node->children;
317 					continue;
318 				}
319 				break;
320 			}
321 
322 			/* Only exists for compatibility with XML and old DOM. */
323 			case XML_ENTITY_REF_NODE: {
324 				TRY(dom_html5_serialize_entity_ref(ctx, node));
325 				break;
326 			}
327 
328 			default:
329 				break;
330 		}
331 
332 		if (node->next) {
333 			node = node->next;
334 		} else {
335 			/* Go upwards, until we find a parent node with a next sibling, or until we hit the bound. */
336 			do {
337 				node = node->parent;
338 				if (node == bound) {
339 					return SUCCESS;
340 				}
341 				if (node->type == XML_ELEMENT_NODE) {
342 					TRY(dom_html5_serialize_element_end(ctx, node));
343 				}
344 			} while (node->next == NULL);
345 			node = node->next;
346 		}
347 	}
348 
349 	return SUCCESS;
350 }
351 
352 /* Note: this serializes the _children_, excluding the node itself! */
dom_html5_serialize(dom_html5_serialize_context * ctx,const xmlNode * node)353 zend_result dom_html5_serialize(dom_html5_serialize_context *ctx, const xmlNode *node)
354 {
355 	/* Step 1. Note that this algorithm serializes children. Only elements, documents, and fragments can have children. */
356 	if (node->type != XML_ELEMENT_NODE
357 		&& node->type != XML_DOCUMENT_FRAG_NODE
358 		&& node->type != XML_DOCUMENT_NODE
359 		&& node->type != XML_HTML_DOCUMENT_NODE) {
360 		return SUCCESS;
361 	}
362 	if (node->type == XML_ELEMENT_NODE && dom_html5_serializes_as_void(node)) {
363 		return SUCCESS;
364 	}
365 
366 	/* Step 2 not needed because we're not using a string to store the serialized data */
367 
368 	/* Step 3. If the node is a template element, then let the node instead be the template element's template contents (a DocumentFragment node). */
369 	xmlNodePtr children = php_dom_retrieve_templated_content(ctx->private_data, node);
370 	if (!children) {
371 		children = node->children;
372 	}
373 
374 	/* Step 4 */
375 	return dom_html5_serialize_node(ctx, children, node);
376 }
377 
378 /* Variant on the above that is equivalent to the "outer HTML". */
dom_html5_serialize_outer(dom_html5_serialize_context * ctx,const xmlNode * node)379 zend_result dom_html5_serialize_outer(dom_html5_serialize_context *ctx, const xmlNode *node)
380 {
381 	if (node->type == XML_DOCUMENT_NODE || node->type == XML_HTML_DOCUMENT_NODE || node->type == XML_DOCUMENT_FRAG_NODE) {
382 		node = node->children;
383 		if (!node) {
384 			return SUCCESS;
385 		}
386 		return dom_html5_serialize_node(ctx, node, node->parent);
387 	} else {
388 		xmlNodePtr old_next = node->next;
389 		((xmlNodePtr) node)->next = NULL;
390 		zend_result result = dom_html5_serialize_node(ctx, node, node->parent);
391 		((xmlNodePtr) node)->next = old_next;
392 		return result;
393 	}
394 }
395 
396 #endif  /* HAVE_LIBXML && HAVE_DOM */
397