1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Authors: Niels Dossche <nielsdos@php.net> |
14 +----------------------------------------------------------------------+
15 */
16
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "php_dom.h"
24 #include "html5_serializer.h"
25 #include "namespace_compat.h"
26 #include "serialize_common.h"
27 #include <lexbor/encoding/encoding.h>
28
29 /* This file implements the HTML 5 serialization algorithm.
30 * https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments (Date 2023-12-14)
31 */
32
/* Evaluates x once; makes the enclosing function return FAILURE unless x yielded SUCCESS. */
#define TRY(x) \
	do { \
		if (UNEXPECTED((x) != SUCCESS)) { \
			return FAILURE; \
		} \
	} while (0)
34
dom_html5_serialize_doctype(dom_html5_serialize_context * ctx,const xmlDtd * dtd)35 static zend_result dom_html5_serialize_doctype(dom_html5_serialize_context *ctx, const xmlDtd *dtd)
36 {
37 TRY(ctx->write_string_len(ctx->application_data, "<!DOCTYPE ", strlen("<!DOCTYPE ")));
38 TRY(ctx->write_string(ctx->application_data, (const char *) dtd->name));
39 return ctx->write_string_len(ctx->application_data, ">", strlen(">"));
40 }
41
dom_html5_serialize_comment(dom_html5_serialize_context * ctx,const xmlNode * node)42 static zend_result dom_html5_serialize_comment(dom_html5_serialize_context *ctx, const xmlNode *node)
43 {
44 TRY(ctx->write_string_len(ctx->application_data, "<!--", strlen("<!--")));
45 TRY(ctx->write_string(ctx->application_data, (const char *) node->content));
46 return ctx->write_string_len(ctx->application_data, "-->", strlen("-->"));
47 }
48
dom_html5_serialize_processing_instruction(dom_html5_serialize_context * ctx,const xmlNode * node)49 static zend_result dom_html5_serialize_processing_instruction(dom_html5_serialize_context *ctx, const xmlNode *node)
50 {
51 TRY(ctx->write_string_len(ctx->application_data, "<?", strlen("<?")));
52 TRY(ctx->write_string(ctx->application_data, (const char *) node->name));
53 TRY(ctx->write_string_len(ctx->application_data, " ", strlen(" ")));
54 if (node->content) {
55 TRY(ctx->write_string(ctx->application_data, (const char *) node->content));
56 }
57 return ctx->write_string_len(ctx->application_data, ">", strlen(">"));
58 }
59
dom_html5_serialize_entity_ref(dom_html5_serialize_context * ctx,const xmlNode * node)60 static zend_result dom_html5_serialize_entity_ref(dom_html5_serialize_context *ctx, const xmlNode *node)
61 {
62 TRY(ctx->write_string_len(ctx->application_data, "&", strlen("&")));
63 TRY(ctx->write_string(ctx->application_data, (const char *) node->name));
64 return ctx->write_string_len(ctx->application_data, ";", strlen(";"));
65 }
66
67 /* https://html.spec.whatwg.org/multipage/parsing.html#escapingString */
dom_html5_escape_string(dom_html5_serialize_context * ctx,const char * content,bool attribute_mode)68 static zend_result dom_html5_escape_string(dom_html5_serialize_context *ctx, const char *content, bool attribute_mode)
69 {
70 const char *last_output = content;
71
72 /* Note: uses UTF-8 internally, so <C2 A0> indicates a non-breaking space */
73 const char *mask = attribute_mode ? "&\xC2\"" : "&\xC2<>";
74
75 while (true) {
76 size_t chunk_length = strcspn(content, mask);
77
78 content += chunk_length;
79 if (*content == '\0') {
80 break;
81 }
82
83 switch (*content) {
84 /* Step 1 */
85 case '&': {
86 TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
87 TRY(ctx->write_string_len(ctx->application_data, "&", strlen("&")));
88 last_output = content + 1;
89 break;
90 }
91
92 /* Step 2 (non-breaking space) (note: uses UTF-8 internally) */
93 case '\xC2': {
94 if (content[1] == '\xA0') {
95 TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
96 TRY(ctx->write_string_len(ctx->application_data, " ", strlen(" ")));
97 content++; /* Consume A0 too */
98 last_output = content + 1;
99 }
100 break;
101 }
102
103 /* Step 3 */
104 case '"': {
105 TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
106 TRY(ctx->write_string_len(ctx->application_data, """, strlen(""")));
107 last_output = content + 1;
108 break;
109 }
110
111 /* Step 4 */
112 case '<': {
113 TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
114 TRY(ctx->write_string_len(ctx->application_data, "<", strlen("<")));
115 last_output = content + 1;
116 break;
117 }
118 case '>': {
119 TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
120 TRY(ctx->write_string_len(ctx->application_data, ">", strlen(">")));
121 last_output = content + 1;
122 break;
123 }
124 }
125
126 content++;
127 }
128
129 return ctx->write_string_len(ctx->application_data, last_output, content - last_output);
130 }
131
dom_html5_serialize_text_node(dom_html5_serialize_context * ctx,const xmlNode * node)132 static zend_result dom_html5_serialize_text_node(dom_html5_serialize_context *ctx, const xmlNode *node)
133 {
134 if (node->parent->type == XML_ELEMENT_NODE && php_dom_ns_is_fast(node->parent, php_dom_ns_is_html_magic_token)) {
135 const xmlNode *parent = node->parent;
136 size_t name_length = strlen((const char *) parent->name);
137 /* Spec tells us to only emit noscript content as-is if scripting is enabled.
138 * However, the user agent (PHP) does not support (JS) scripting.
139 * Furthermore, if actually consumed by a browser then we should err on the safe side and not emit the content as-is. */
140 if (dom_local_name_compare_ex(parent, "style", strlen("style"), name_length)
141 || dom_local_name_compare_ex(parent, "script", strlen("script"), name_length)
142 || dom_local_name_compare_ex(parent, "xmp", strlen("xmp"), name_length)
143 || dom_local_name_compare_ex(parent, "iframe", strlen("iframe"), name_length)
144 || dom_local_name_compare_ex(parent, "noembed", strlen("noembed"), name_length)
145 || dom_local_name_compare_ex(parent, "noframes", strlen("noframes"), name_length)
146 || dom_local_name_compare_ex(parent, "plaintext", strlen("plaintext"), name_length)) {
147 return ctx->write_string(ctx->application_data, (const char *) node->content);
148 }
149 }
150
151 return dom_html5_escape_string(ctx, (const char *) node->content, false);
152 }
153
dom_html5_serialize_element_tag_name(dom_html5_serialize_context * ctx,const xmlNode * node)154 static zend_result dom_html5_serialize_element_tag_name(dom_html5_serialize_context *ctx, const xmlNode *node)
155 {
156 /* Note: it is not the serializer's responsibility to care about uppercase/lowercase (see createElement() note) */
157 if (node->ns != NULL && node->ns->prefix != NULL
158 && !(php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token) || php_dom_ns_is_fast(node, php_dom_ns_is_mathml_magic_token) || php_dom_ns_is_fast(node, php_dom_ns_is_svg_magic_token))) {
159 TRY(ctx->write_string(ctx->application_data, (const char *) node->ns->prefix));
160 TRY(ctx->write_string_len(ctx->application_data, ":", strlen(":")));
161 }
162 return ctx->write_string(ctx->application_data, (const char *) node->name);
163 }
164
dom_html5_serialize_element_start(dom_html5_serialize_context * ctx,const xmlNode * node)165 static zend_result dom_html5_serialize_element_start(dom_html5_serialize_context *ctx, const xmlNode *node)
166 {
167 TRY(ctx->write_string_len(ctx->application_data, "<", strlen("<")));
168 TRY(dom_html5_serialize_element_tag_name(ctx, node));
169
170 /* We don't support the "is" value during element creation, so no handling here. */
171
172 for (const xmlAttr *attr = node->properties; attr; attr = attr->next) {
173 TRY(ctx->write_string_len(ctx->application_data, " ", strlen(" ")));
174 if (attr->ns == NULL) {
175 TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
176 } else {
177 if (php_dom_ns_is_fast((const xmlNode *) attr, php_dom_ns_is_xml_magic_token)) {
178 TRY(ctx->write_string_len(ctx->application_data, "xml:", strlen("xml:")));
179 TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
180 } else if (php_dom_ns_is_fast((const xmlNode *) attr, php_dom_ns_is_xmlns_magic_token)) {
181 /* Compatibility for real attributes */
182 if (strcmp((const char *) attr->name, "xmlns") == 0) {
183 TRY(ctx->write_string_len(ctx->application_data, "xmlns", strlen("xmlns")));
184 } else {
185 TRY(ctx->write_string_len(ctx->application_data, "xmlns:", strlen("xmlns:")));
186 TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
187 }
188 } else if (php_dom_ns_is_fast((const xmlNode *) attr, php_dom_ns_is_xlink_magic_token)) {
189 TRY(ctx->write_string_len(ctx->application_data, "xlink:", strlen("xlink:")));
190 TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
191 } else if (attr->ns->prefix == NULL) {
192 TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
193 } else {
194 TRY(ctx->write_string(ctx->application_data, (const char *) attr->ns->prefix));
195 TRY(ctx->write_string_len(ctx->application_data, ":", strlen(":")));
196 TRY(ctx->write_string(ctx->application_data, (const char *) attr->name));
197 }
198 }
199
200 TRY(ctx->write_string_len(ctx->application_data, "=\"", strlen("=\"")));
201
202 for (xmlNodePtr child = attr->children; child != NULL; child = child->next) {
203 if (child->type == XML_TEXT_NODE) {
204 if (child->content != NULL) {
205 TRY(dom_html5_escape_string(ctx, (const char *) child->content, true));
206 }
207 } else if (child->type == XML_ENTITY_REF_NODE) {
208 TRY(ctx->write_string_len(ctx->application_data, "&", strlen("&")));
209 TRY(dom_html5_escape_string(ctx, (const char *) child->name, true));
210 TRY(ctx->write_string_len(ctx->application_data, ";", strlen(";")));
211 }
212 }
213
214 TRY(ctx->write_string_len(ctx->application_data, "\"", strlen("\"")));
215 }
216
217 return ctx->write_string_len(ctx->application_data, ">", strlen(">"));
218
219 /* Note: "continue on to the next child if the element is void" is handled in the iteration and dom_html5_serialize_element_end() */
220 }
221
222 /* https://html.spec.whatwg.org/multipage/syntax.html#void-elements
223 * https://html.spec.whatwg.org/multipage/parsing.html#serializes-as-void */
dom_html5_serializes_as_void(const xmlNode * node)224 static bool dom_html5_serializes_as_void(const xmlNode *node)
225 {
226 if (php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token)) {
227 size_t name_length = strlen((const char *) node->name);
228 if (/* These are the void elements from https://html.spec.whatwg.org/multipage/syntax.html#void-elements */
229 dom_local_name_compare_ex(node, "area", strlen("area"), name_length)
230 || dom_local_name_compare_ex(node, "base", strlen("base"), name_length)
231 || dom_local_name_compare_ex(node, "br", strlen("br"), name_length)
232 || dom_local_name_compare_ex(node, "col", strlen("col"), name_length)
233 || dom_local_name_compare_ex(node, "embed", strlen("embed"), name_length)
234 || dom_local_name_compare_ex(node, "hr", strlen("hr"), name_length)
235 || dom_local_name_compare_ex(node, "img", strlen("img"), name_length)
236 || dom_local_name_compare_ex(node, "input", strlen("input"), name_length)
237 || dom_local_name_compare_ex(node, "link", strlen("link"), name_length)
238 || dom_local_name_compare_ex(node, "meta", strlen("meta"), name_length)
239 || dom_local_name_compare_ex(node, "source", strlen("source"), name_length)
240 || dom_local_name_compare_ex(node, "track", strlen("track"), name_length)
241 || dom_local_name_compare_ex(node, "wbr", strlen("wbr"), name_length)
242 /* These are the additional names from https://html.spec.whatwg.org/multipage/parsing.html#serializes-as-void */
243 || dom_local_name_compare_ex(node, "basefont", strlen("basefont"), name_length)
244 || dom_local_name_compare_ex(node, "bgsound", strlen("bgsound"), name_length)
245 || dom_local_name_compare_ex(node, "frame", strlen("frame"), name_length)
246 || dom_local_name_compare_ex(node, "keygen", strlen("keygen"), name_length)
247 || dom_local_name_compare_ex(node, "param", strlen("param"), name_length)) {
248 return true;
249 }
250 }
251 return false;
252 }
253
dom_html5_serialize_element_end(dom_html5_serialize_context * ctx,const xmlNode * node)254 static zend_result dom_html5_serialize_element_end(dom_html5_serialize_context *ctx, const xmlNode *node)
255 {
256 if (!dom_html5_serializes_as_void(node)) {
257 TRY(ctx->write_string_len(ctx->application_data, "</", strlen("</")));
258 TRY(dom_html5_serialize_element_tag_name(ctx, node));
259 return ctx->write_string_len(ctx->application_data, ">", strlen(">"));
260 }
261 return SUCCESS;
262 }
263
264 /* https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-serialisation-algorithm */
dom_html5_serialize_node(dom_html5_serialize_context * ctx,const xmlNode * node,const xmlNode * bound)265 static zend_result dom_html5_serialize_node(dom_html5_serialize_context *ctx, const xmlNode *node, const xmlNode *bound)
266 {
267 while (node != NULL) {
268 switch (node->type) {
269 case XML_DTD_NODE: {
270 TRY(dom_html5_serialize_doctype(ctx, (const xmlDtd *) node));
271 break;
272 }
273
274 case XML_CDATA_SECTION_NODE:
275 case XML_TEXT_NODE: {
276 TRY(dom_html5_serialize_text_node(ctx, node));
277 break;
278 }
279
280 case XML_PI_NODE: {
281 TRY(dom_html5_serialize_processing_instruction(ctx, node));
282 break;
283 }
284
285 case XML_COMMENT_NODE: {
286 TRY(dom_html5_serialize_comment(ctx, node));
287 break;
288 }
289
290 case XML_ELEMENT_NODE: {
291 TRY(dom_html5_serialize_element_start(ctx, node));
292 if (node->children) {
293 if (!dom_html5_serializes_as_void(node)) {
294 node = node->children;
295 continue;
296 }
297 } else {
298 /* Not descended, so wouldn't put the closing tag as it's normally only done when going back upwards. */
299 TRY(dom_html5_serialize_element_end(ctx, node));
300 }
301 break;
302 }
303
304 /* Only exists for compatibility with XML and old DOM. */
305 case XML_ENTITY_REF_NODE: {
306 TRY(dom_html5_serialize_entity_ref(ctx, node));
307 break;
308 }
309
310 default:
311 break;
312 }
313
314 if (node->next) {
315 node = node->next;
316 } else {
317 /* Go upwards, until we find a parent node with a next sibling, or until we hit the bound. */
318 do {
319 node = node->parent;
320 if (node == bound) {
321 return SUCCESS;
322 }
323 if (node->type == XML_ELEMENT_NODE) {
324 TRY(dom_html5_serialize_element_end(ctx, node));
325 }
326 } while (node->next == NULL);
327 node = node->next;
328 }
329 }
330
331 return SUCCESS;
332 }
333
334 /* Note: this serializes the _children_, excluding the node itself! */
dom_html5_serialize(dom_html5_serialize_context * ctx,const xmlNode * node)335 zend_result dom_html5_serialize(dom_html5_serialize_context *ctx, const xmlNode *node)
336 {
337 /* Step 1. Note that this algorithm serializes children. Only elements, documents, and fragments can have children. */
338 if (node->type != XML_ELEMENT_NODE
339 && node->type != XML_DOCUMENT_FRAG_NODE
340 && node->type != XML_DOCUMENT_NODE
341 && node->type != XML_HTML_DOCUMENT_NODE) {
342 return SUCCESS;
343 }
344 if (node->type == XML_ELEMENT_NODE && dom_html5_serializes_as_void(node)) {
345 return SUCCESS;
346 }
347
348 /* Step 2 not needed because we're not using a string to store the serialized data */
349 /* Step 3 not needed because we don't support template contents yet */
350
351 /* Step 4 */
352 return dom_html5_serialize_node(ctx, node->children, node);
353 }
354
355 /* Variant on the above that is equivalent to the "outer HTML". */
dom_html5_serialize_outer(dom_html5_serialize_context * ctx,const xmlNode * node)356 zend_result dom_html5_serialize_outer(dom_html5_serialize_context *ctx, const xmlNode *node)
357 {
358 if (node->type == XML_DOCUMENT_NODE || node->type == XML_HTML_DOCUMENT_NODE || node->type == XML_DOCUMENT_FRAG_NODE) {
359 node = node->children;
360 if (!node) {
361 return SUCCESS;
362 }
363 return dom_html5_serialize_node(ctx, node, node->parent);
364 } else {
365 xmlNodePtr old_next = node->next;
366 ((xmlNodePtr) node)->next = NULL;
367 zend_result result = dom_html5_serialize_node(ctx, node, node->parent);
368 ((xmlNodePtr) node)->next = old_next;
369 return result;
370 }
371 }
372
373 #endif /* HAVE_LIBXML && HAVE_DOM */
374