xref: /PHP-8.2/ext/dom/html5_serializer.c (revision 0870da33)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Authors: Niels Dossche <nielsdos@php.net>                            |
14    +----------------------------------------------------------------------+
15 */
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "php_dom.h"
24 #include "html5_serializer.h"
25 #include "namespace_compat.h"
26 #include <lexbor/encoding/encoding.h>
27 
/* Evaluates x once; when the result is anything other than SUCCESS, the
 * enclosing function immediately returns FAILURE. */
#define TRY(x) \
	do { \
		if (UNEXPECTED((x) != SUCCESS)) { \
			return FAILURE; \
		} \
	} while (0)
29 
dom_is_ns(const xmlNode * node,const char * uri)30 static bool dom_is_ns(const xmlNode *node, const char *uri)
31 {
32 	return node->ns != NULL && strcmp((const char *) node->ns->href, uri) == 0;
33 }
34 
dom_is_html_ns(const xmlNode * node)35 static bool dom_is_html_ns(const xmlNode *node)
36 {
37 	return node->ns == NULL || dom_is_ns(node, DOM_XHTML_NS_URI);
38 }
39 
dom_local_name_compare_ex(const xmlNode * node,const char * tag,size_t tag_length,size_t name_length)40 static bool dom_local_name_compare_ex(const xmlNode *node, const char *tag, size_t tag_length, size_t name_length)
41 {
42 	return name_length == tag_length && zend_binary_strcmp((const char *) node->name, name_length, tag, tag_length) == 0;
43 }
44 
dom_html5_serialize_doctype(dom_html5_serialize_context * ctx,const xmlDtd * dtd)45 static zend_result dom_html5_serialize_doctype(dom_html5_serialize_context *ctx, const xmlDtd *dtd)
46 {
47 	TRY(ctx->write_string_len(ctx->application_data, "<!DOCTYPE ", strlen("<!DOCTYPE ")));
48 	TRY(ctx->write_string(ctx->application_data, (const char *) dtd->name));
49 	return ctx->write_string_len(ctx->application_data, ">", strlen(">"));
50 }
51 
dom_html5_serialize_comment(dom_html5_serialize_context * ctx,const xmlNode * node)52 static zend_result dom_html5_serialize_comment(dom_html5_serialize_context *ctx, const xmlNode *node)
53 {
54 	TRY(ctx->write_string_len(ctx->application_data, "<!--", strlen("<!--")));
55 	TRY(ctx->write_string(ctx->application_data, (const char *) node->content));
56 	return ctx->write_string_len(ctx->application_data, "-->", strlen("-->"));
57 }
58 
dom_html5_serialize_processing_instruction(dom_html5_serialize_context * ctx,const xmlNode * node)59 static zend_result dom_html5_serialize_processing_instruction(dom_html5_serialize_context *ctx, const xmlNode *node)
60 {
61 	TRY(ctx->write_string_len(ctx->application_data, "<?", strlen("<?")));
62 	TRY(ctx->write_string(ctx->application_data, (const char *) node->name));
63 	TRY(ctx->write_string_len(ctx->application_data, " ", strlen(" ")));
64 	TRY(ctx->write_string(ctx->application_data, (const char *) node->content));
65 	return ctx->write_string_len(ctx->application_data, ">", strlen(">"));
66 }
67 
68 /* https://html.spec.whatwg.org/multipage/parsing.html#escapingString */
dom_html5_escape_string(dom_html5_serialize_context * ctx,const char * content,bool attribute_mode)69 static zend_result dom_html5_escape_string(dom_html5_serialize_context *ctx, const char *content, bool attribute_mode)
70 {
71 	const char *last_output = content;
72 
73 	/* Note: uses UTF-8 internally, so <C2 A0> indicates a non-breaking space */
74 	const char *mask = attribute_mode ? "&\xC2\"" : "&\xC2<>";
75 
76 	while (true) {
77 		size_t chunk_length = strcspn(content, mask);
78 
79 		content += chunk_length;
80 		if (*content == '\0') {
81 			break;
82 		}
83 
84 		switch (*content) {
85 			/* Step 1 */
86 			case '&': {
87 				TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
88 				TRY(ctx->write_string_len(ctx->application_data, "&amp;", strlen("&amp;")));
89 				last_output = content + 1;
90 				break;
91 			}
92 
93 			/* Step 2 (non-breaking space) (note: uses UTF-8 internally) */
94 			case '\xC2': {
95 				if (content[1] == '\xA0') {
96 					TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
97 					TRY(ctx->write_string_len(ctx->application_data, "&nbsp;", strlen("&nbsp;")));
98 					content++; /* Consume A0 too */
99 					last_output = content + 1;
100 				}
101 				break;
102 			}
103 
104 			/* Step 3 */
105 			case '"': {
106 				TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
107 				TRY(ctx->write_string_len(ctx->application_data, "&quot;", strlen("&quot;")));
108 				last_output = content + 1;
109 				break;
110 			}
111 
112 			/* Step 4 */
113 			case '<': {
114 				TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
115 				TRY(ctx->write_string_len(ctx->application_data, "&lt;", strlen("&lt;")));
116 				last_output = content + 1;
117 				break;
118 			}
119 			case '>': {
120 				TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
121 				TRY(ctx->write_string_len(ctx->application_data, "&gt;", strlen("&gt;")));
122 				last_output = content + 1;
123 				break;
124 			}
125 		}
126 
127 		content++;
128 	}
129 
130 	return ctx->write_string_len(ctx->application_data, last_output, content - last_output);
131 }
132 
dom_html5_serialize_text_node(dom_html5_serialize_context * ctx,const xmlNode * node)133 static zend_result dom_html5_serialize_text_node(dom_html5_serialize_context *ctx, const xmlNode *node)
134 {
135 	if (node->parent->type == XML_ELEMENT_NODE && dom_is_html_ns(node->parent)) {
136 		const xmlNode *parent = node->parent;
137 		size_t name_length = strlen((const char *) parent->name);
138 		/* Spec tells us to only emit noscript content as-is if scripting is enabled.
139 		 * However, the user agent (PHP) does not support (JS) scripting.
140 		 * Furthermore, if actually consumed by a browser then we should err on the safe side and not emit the content as-is. */
141 		if (dom_local_name_compare_ex(parent, "style", strlen("style"), name_length)
142 			|| dom_local_name_compare_ex(parent, "script", strlen("script"), name_length)
143 			|| dom_local_name_compare_ex(parent, "xmp", strlen("xmp"), name_length)
144 			|| dom_local_name_compare_ex(parent, "iframe", strlen("iframe"), name_length)
145 			|| dom_local_name_compare_ex(parent, "noembed", strlen("noembed"), name_length)
146 			|| dom_local_name_compare_ex(parent, "noframes", strlen("noframes"), name_length)
147 			|| dom_local_name_compare_ex(parent, "plaintext", strlen("plaintext"), name_length)) {
148 			return ctx->write_string(ctx->application_data, (const char *) node->content);
149 		}
150 	}
151 
152 	return dom_html5_escape_string(ctx, (const char *) node->content, false);
153 }
154 
dom_html5_serialize_element_tag_name(dom_html5_serialize_context * ctx,const xmlNode * node)155 static zend_result dom_html5_serialize_element_tag_name(dom_html5_serialize_context *ctx, const xmlNode *node)
156 {
157 	/* Note: it is not the serializer's responsibility to care about uppercase/lowercase (see createElement() note) */
158 	if (node->ns != NULL && node->ns->prefix != NULL
159 		&& !(dom_is_html_ns(node) || dom_is_ns(node, DOM_MATHML_NS_URI) || dom_is_ns(node, DOM_SVG_NS_URI))) {
160 		TRY(ctx->write_string(ctx->application_data, (const char *) node->ns->prefix));
161 		TRY(ctx->write_string_len(ctx->application_data, ":", strlen(":")));
162 	}
163 	return ctx->write_string(ctx->application_data, (const char *) node->name);
164 }
165 
dom_html5_serialize_element_start(dom_html5_serialize_context * ctx,const xmlNode * node)166 static zend_result dom_html5_serialize_element_start(dom_html5_serialize_context *ctx, const xmlNode *node)
167 {
168 	TRY(ctx->write_string_len(ctx->application_data, "<", strlen("<")));
169 	TRY(dom_html5_serialize_element_tag_name(ctx, node));
170 
171 	/* We don't support the "is" value during element creation, so no handling here. */
172 
173 	/* Some namespace declarations are also attributes (see https://html.spec.whatwg.org/multipage/parsing.html#create-an-element-for-the-token) */
174 	for (const xmlNs *ns = node->nsDef; ns != NULL; ns = ns->next) {
175 		if (!dom_ns_is_also_an_attribute(ns)) {
176 			continue;
177 		}
178 
179 		if (ns->prefix != NULL) {
180 			TRY(ctx->write_string_len(ctx->application_data, " xmlns:", strlen(" xmlns:")));
181 			TRY(ctx->write_string(ctx->application_data, (const char *) ns->prefix));
182 			TRY(ctx->write_string_len(ctx->application_data, "=\"", strlen("=\"")));
183 		} else {
184 			TRY(ctx->write_string_len(ctx->application_data, " xmlns=\"", strlen(" xmlns=\"")));
185 		}
186 		TRY(ctx->write_string(ctx->application_data, (const char *) ns->href));
187 		TRY(ctx->write_string_len(ctx->application_data, "\"", strlen("\"")));
188 	}
189 
190 	for (const xmlAttr *attr = node->properties; attr; attr = attr->next) {
191 		TRY(ctx->write_string_len(ctx->application_data, " ", strlen(" ")));
192 		if (attr->ns == NULL) {
193 			TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
194 		} else {
195 			if (dom_is_ns((const xmlNode *) attr, DOM_XML_NS_URI)) {
196 				TRY(ctx->write_string_len(ctx->application_data, "xml:", strlen("xml:")));
197 				TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
198 			} else if (dom_is_ns((const xmlNode *) attr, DOM_XMLNS_NS_URI)) {
199 				/* Compatibility for real attributes */
200 				if (strcmp((const char *) attr->name, "xmlns") == 0) {
201 					TRY(ctx->write_string_len(ctx->application_data, "xmlns", strlen("xmlns")));
202 				} else {
203 					TRY(ctx->write_string_len(ctx->application_data, "xmlns:", strlen("xmlns:")));
204 					TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
205 				}
206 			} else if (dom_is_ns((const xmlNode *) attr, DOM_XLINK_NS_URI)) {
207 				TRY(ctx->write_string_len(ctx->application_data, "xlink:", strlen("xlink:")));
208 				TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
209 			} else if (attr->ns->prefix == NULL) {
210 				TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
211 			} else {
212 				TRY(ctx->write_string(ctx->application_data, (const char *) attr->ns->prefix));
213 				TRY(ctx->write_string_len(ctx->application_data, ":", strlen(":")));
214 				TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
215 			}
216 		}
217 		TRY(ctx->write_string_len(ctx->application_data, "=\"", strlen("=\"")));
218 		xmlChar *content = xmlNodeGetContent((const xmlNode *) attr);
219 		if (content != NULL) {
220 			zend_result result = dom_html5_escape_string(ctx, (const char *) content, true);
221 			xmlFree(content);
222 			TRY(result);
223 		}
224 		TRY(ctx->write_string_len(ctx->application_data, "\"", strlen("\"")));
225 	}
226 
227 	return ctx->write_string_len(ctx->application_data, ">", strlen(">"));
228 
229 	/* Note: "continue on to the next child if the element is void" is handled in the iteration and dom_html5_serialize_element_end() */
230 }
231 
232 /* https://html.spec.whatwg.org/multipage/syntax.html#void-elements
233  * https://html.spec.whatwg.org/multipage/parsing.html#serializes-as-void */
dom_html5_serializes_as_void(const xmlNode * node)234 static bool dom_html5_serializes_as_void(const xmlNode *node)
235 {
236 	if (dom_is_html_ns(node)) {
237 		size_t name_length = strlen((const char *) node->name);
238 		if (/* These are the void elements from https://html.spec.whatwg.org/multipage/syntax.html#void-elements */
239 			dom_local_name_compare_ex(node, "area", strlen("area"), name_length)
240 			|| dom_local_name_compare_ex(node, "base", strlen("base"), name_length)
241 			|| dom_local_name_compare_ex(node, "br", strlen("br"), name_length)
242 			|| dom_local_name_compare_ex(node, "col", strlen("col"), name_length)
243 			|| dom_local_name_compare_ex(node, "embed", strlen("embed"), name_length)
244 			|| dom_local_name_compare_ex(node, "hr", strlen("hr"), name_length)
245 			|| dom_local_name_compare_ex(node, "img", strlen("img"), name_length)
246 			|| dom_local_name_compare_ex(node, "input", strlen("input"), name_length)
247 			|| dom_local_name_compare_ex(node, "link", strlen("link"), name_length)
248 			|| dom_local_name_compare_ex(node, "meta", strlen("meta"), name_length)
249 			|| dom_local_name_compare_ex(node, "source", strlen("source"), name_length)
250 			|| dom_local_name_compare_ex(node, "track", strlen("track"), name_length)
251 			|| dom_local_name_compare_ex(node, "wbr", strlen("wbr"), name_length)
252 			/* These are the additional names from https://html.spec.whatwg.org/multipage/parsing.html#serializes-as-void */
253 			|| dom_local_name_compare_ex(node, "basefont", strlen("basefont"), name_length)
254 			|| dom_local_name_compare_ex(node, "bgsound", strlen("bgsound"), name_length)
255 			|| dom_local_name_compare_ex(node, "frame", strlen("frame"), name_length)
256 			|| dom_local_name_compare_ex(node, "keygen", strlen("keygen"), name_length)
257 			|| dom_local_name_compare_ex(node, "param", strlen("param"), name_length)) {
258 			return true;
259 		}
260 	}
261 	return false;
262 }
263 
dom_html5_serialize_element_end(dom_html5_serialize_context * ctx,const xmlNode * node)264 static zend_result dom_html5_serialize_element_end(dom_html5_serialize_context *ctx, const xmlNode *node)
265 {
266 	if (!dom_html5_serializes_as_void(node)) {
267 		TRY(ctx->write_string_len(ctx->application_data, "</", strlen("</")));
268 		TRY(dom_html5_serialize_element_tag_name(ctx, node));
269 		return ctx->write_string_len(ctx->application_data, ">", strlen(">"));
270 	}
271 	return SUCCESS;
272 }
273 
274 /* https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-serialisation-algorithm */
dom_html5_serialize_node(dom_html5_serialize_context * ctx,const xmlNode * node,const xmlNode * bound)275 static zend_result dom_html5_serialize_node(dom_html5_serialize_context *ctx, const xmlNode *node, const xmlNode *bound)
276 {
277 	while (node != NULL) {
278 		switch (node->type) {
279 			case XML_DTD_NODE: {
280 				TRY(dom_html5_serialize_doctype(ctx, (const xmlDtd *) node));
281 				break;
282 			}
283 
284 			case XML_CDATA_SECTION_NODE:
285 			case XML_TEXT_NODE: {
286 				TRY(dom_html5_serialize_text_node(ctx, node));
287 				break;
288 			}
289 
290 			case XML_PI_NODE: {
291 				TRY(dom_html5_serialize_processing_instruction(ctx, node));
292 				break;
293 			}
294 
295 			case XML_COMMENT_NODE: {
296 				TRY(dom_html5_serialize_comment(ctx, node));
297 				break;
298 			}
299 
300 			case XML_ELEMENT_NODE: {
301 				TRY(dom_html5_serialize_element_start(ctx, node));
302 				if (node->children) {
303 					if (!dom_html5_serializes_as_void(node)) {
304 						node = node->children;
305 						continue;
306 					}
307 				} else {
308 					/* Not descended, so wouldn't put the closing tag as it's normally only done when going back upwards. */
309 					TRY(dom_html5_serialize_element_end(ctx, node));
310 				}
311 				break;
312 			}
313 
314 			default:
315 				break;
316 		}
317 
318 		if (node->next) {
319 			node = node->next;
320 		} else {
321 			/* Go upwards, until we find a parent node with a next sibling, or until we hit the bound. */
322 			do {
323 				node = node->parent;
324 				if (node == bound) {
325 					return SUCCESS;
326 				}
327 				if (node->type == XML_ELEMENT_NODE) {
328 					TRY(dom_html5_serialize_element_end(ctx, node));
329 				}
330 			} while (node->next == NULL);
331 			node = node->next;
332 		}
333 	}
334 
335 	return SUCCESS;
336 }
337 
338 /* https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments (Date 2023-10-18)
339  * Note: this serializes the _children_, excluding the node itself! */
dom_html5_serialize(dom_html5_serialize_context * ctx,const xmlNode * node)340 zend_result dom_html5_serialize(dom_html5_serialize_context *ctx, const xmlNode *node)
341 {
342 	/* Step 1. Note that this algorithm serializes children. Only elements, documents, and fragments can have children. */
343 	if (node->type != XML_ELEMENT_NODE
344 		&& node->type != XML_DOCUMENT_FRAG_NODE
345 		&& node->type != XML_DOCUMENT_NODE
346 		&& node->type != XML_HTML_DOCUMENT_NODE) {
347 		return SUCCESS;
348 	}
349 	if (node->type == XML_ELEMENT_NODE && dom_html5_serializes_as_void(node)) {
350 		return SUCCESS;
351 	}
352 
353 	/* Step 2 not needed because we're not using a string to store the serialized data */
354 	/* Step 3 not needed because we don't support template contents yet */
355 
356 	/* Step 4 */
357 	return dom_html5_serialize_node(ctx, node->children, node);
358 }
359 
360 #endif  /* HAVE_LIBXML && HAVE_DOM */
361