xref: /php-src/ext/dom/xml_document.c (revision 6980eba8)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Authors: Niels Dossche <nielsdos@php.net>                            |
14    +----------------------------------------------------------------------+
15 */
16 
17 #ifdef HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20 
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "php_dom.h"
24 #include "namespace_compat.h"
25 #include "private_data.h"
26 #include "xml_serializer.h"
27 #include <libxml/xmlsave.h>
28 
/* Checks that `options` only contains parser flags supported by this API.
 *
 * arg_num: 1-based argument position used when reporting the error.
 * options: bitmask of XML_PARSE_* flags to validate.
 *
 * Returns true when every set bit is in the allowed set. Otherwise reports an
 * argument value error through zend_argument_value_error() and returns false. */
static bool check_options_validity(uint32_t arg_num, zend_long options)
{
	zend_long allowed_mask = XML_PARSE_RECOVER;
	allowed_mask |= XML_PARSE_NOENT;
#if LIBXML_VERSION >= 21300
	allowed_mask |= XML_PARSE_NO_XXE;
#endif
	allowed_mask |= XML_PARSE_DTDLOAD;
	allowed_mask |= XML_PARSE_DTDATTR;
	allowed_mask |= XML_PARSE_DTDVALID;
	allowed_mask |= XML_PARSE_NOERROR;
	allowed_mask |= XML_PARSE_NOWARNING;
	allowed_mask |= XML_PARSE_NOBLANKS;
	allowed_mask |= XML_PARSE_XINCLUDE;
	allowed_mask |= XML_PARSE_NSCLEAN;
	allowed_mask |= XML_PARSE_NOCDATA;
	allowed_mask |= XML_PARSE_NONET;
	allowed_mask |= XML_PARSE_PEDANTIC;
	allowed_mask |= XML_PARSE_COMPACT;
	allowed_mask |= XML_PARSE_HUGE;
	allowed_mask |= XML_PARSE_BIG_LINES;

	/* No bit outside the allowed set: nothing to report. */
	if ((options & ~allowed_mask) == 0) {
		return true;
	}

	zend_argument_value_error(arg_num, "contains invalid flags (allowed flags: "
									   "LIBXML_RECOVER, "
									   "LIBXML_NOENT, "
#if LIBXML_VERSION >= 21300
									   "LIBXML_NO_XXE, "
#endif
									   "LIBXML_DTDLOAD, "
									   "LIBXML_DTDATTR, "
									   "LIBXML_DTDVALID, "
									   "LIBXML_NOERROR, "
									   "LIBXML_NOWARNING, "
									   "LIBXML_NOBLANKS, "
									   "LIBXML_XINCLUDE, "
									   "LIBXML_NSCLEAN, "
									   "LIBXML_NOCDATA, "
									   "LIBXML_NONET, "
									   "LIBXML_PEDANTIC, "
									   "LIBXML_COMPACT, "
									   "LIBXML_PARSEHUGE, "
									   "LIBXML_BIGLINES)");
	return false;
}
75 
76 /* Living spec never creates explicit namespace declaration nodes.
77  * They are only written upon serialization but never appear in the tree.
78  * So in principle we could just ignore them outright.
79  * However, step 10 in https://html.spec.whatwg.org/multipage/parsing.html#create-an-element-for-the-token (Date 2023-12-15)
80  * requires us to have the declaration as an attribute available */
dom_mark_namespaces_as_attributes_too(php_dom_libxml_ns_mapper * ns_mapper,xmlDocPtr doc)81 void dom_mark_namespaces_as_attributes_too(php_dom_libxml_ns_mapper *ns_mapper, xmlDocPtr doc)
82 {
83 	xmlNodePtr node = doc->children;
84 	while (node != NULL) {
85 		if (node->type == XML_ELEMENT_NODE) {
86 			php_dom_ns_compat_mark_attribute_list(ns_mapper, node);
87 		}
88 
89 		node = php_dom_next_in_tree_order(node, NULL);
90 	}
91 }
92 
PHP_METHOD(Dom_XMLDocument,createEmpty)93 PHP_METHOD(Dom_XMLDocument, createEmpty)
94 {
95 	const char *version = NULL;
96 	size_t encoding_len = strlen("UTF-8");
97 	const char *encoding = "UTF-8";
98 	size_t version_len;
99 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|sp", &version, &version_len, &encoding, &encoding_len) == FAILURE) {
100 		RETURN_THROWS();
101 	}
102 
103 	xmlCharEncodingHandlerPtr handler = xmlFindCharEncodingHandler(encoding);
104 
105 	if (handler != NULL) {
106 		xmlCharEncCloseFunc(handler);
107 	} else {
108 		zend_argument_value_error(2, "is not a valid document encoding");
109 		RETURN_THROWS();
110 	}
111 
112 	xmlDocPtr lxml_doc = xmlNewDoc((const xmlChar *) version);
113 	if (UNEXPECTED(lxml_doc == NULL)) {
114 		goto oom;
115 	}
116 
117 	lxml_doc->encoding = xmlStrdup((const xmlChar *) encoding);
118 
119 	dom_object *intern = php_dom_instantiate_object_helper(
120 		return_value,
121 		dom_xml_document_class_entry,
122 		(xmlNodePtr) lxml_doc,
123 		NULL
124 	);
125 	dom_set_xml_class(intern->document);
126 	intern->document->private_data = php_dom_libxml_private_data_header(php_dom_private_data_create());
127 	return;
128 
129 oom:
130 	php_dom_throw_error(INVALID_STATE_ERR, true);
131 	RETURN_THROWS();
132 }
133 
load_from_helper(INTERNAL_FUNCTION_PARAMETERS,int mode)134 static void load_from_helper(INTERNAL_FUNCTION_PARAMETERS, int mode)
135 {
136 	const char *source, *override_encoding = NULL;
137 	size_t source_len, override_encoding_len;
138 	zend_long options = 0;
139 	if (zend_parse_parameters(
140 		ZEND_NUM_ARGS(),
141 		"s|lp!",
142 		&source,
143 		&source_len,
144 		&options,
145 		&override_encoding,
146 		&override_encoding_len
147 	) == FAILURE) {
148 		RETURN_THROWS();
149 	}
150 
151 	if (!source_len) {
152 		zend_argument_value_error(1, "must not be empty");
153 		RETURN_THROWS();
154 	}
155 
156 	if (ZEND_SIZE_T_INT_OVFL(source_len)) {
157 		zend_argument_value_error(1, "is too long");
158 		RETURN_THROWS();
159 	}
160 
161 	/* See php_libxml_streams_IO_open_wrapper(), apparently this caused issues in the past. */
162 	if (mode == DOM_LOAD_FILE && strstr(source, "%00")) {
163 		zend_argument_value_error(1, "must not contain percent-encoded NUL bytes");
164 		RETURN_THROWS();
165 	}
166 
167 	if (!check_options_validity(2, options)) {
168 		RETURN_THROWS();
169 	}
170 
171 	xmlCharEncodingHandlerPtr encoding = NULL;
172 	if (override_encoding != NULL) {
173 		encoding = xmlFindCharEncodingHandler(override_encoding);
174 		if (!encoding) {
175 			zend_argument_value_error(3, "must be a valid document encoding");
176 			RETURN_THROWS();
177 		}
178 		options |= XML_PARSE_IGNORE_ENC;
179 	}
180 
181 	xmlDocPtr lxml_doc = dom_document_parser(NULL, mode, source, source_len, options, encoding);
182 	if (UNEXPECTED(lxml_doc == NULL || lxml_doc == DOM_DOCUMENT_MALFORMED)) {
183 		if (!EG(exception)) {
184 			if (lxml_doc == DOM_DOCUMENT_MALFORMED) {
185 				php_dom_throw_error_with_message(SYNTAX_ERR, "XML fragment is not well-formed", true);
186 			} else {
187 				if (mode == DOM_LOAD_FILE) {
188 					zend_throw_exception_ex(NULL, 0, "Cannot open file '%s'", source);
189 				} else {
190 					php_dom_throw_error(INVALID_STATE_ERR, true);
191 				}
192 			}
193 		}
194 		RETURN_THROWS();
195 	}
196 	if (lxml_doc->encoding == NULL) {
197 		if (override_encoding) {
198 			lxml_doc->encoding = xmlStrdup((const xmlChar *) override_encoding);
199 		} else {
200 			lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
201 		}
202 	}
203 	if (mode == DOM_LOAD_FILE && lxml_doc->URL != NULL) {
204 		if (!php_is_stream_path((char *) lxml_doc->URL)) {
205 			/* Check for "file:/" instead of "file://" because of libxml2 quirk */
206 			if (strncmp((const char *) lxml_doc->URL, "file:/", sizeof("file:/") - 1) != 0) {
207 #ifdef PHP_WIN32
208 				xmlChar *buffer = xmlStrdup((const xmlChar *) "file:///");
209 #else
210 				xmlChar *buffer = xmlStrdup((const xmlChar *) "file://");
211 #endif
212 				if (buffer != NULL) {
213 					xmlChar *new_buffer = xmlStrcat(buffer, lxml_doc->URL);
214 					if (new_buffer != NULL) {
215 						xmlFree(BAD_CAST lxml_doc->URL);
216 						lxml_doc->URL = new_buffer;
217 					} else {
218 						xmlFree(buffer);
219 					}
220 				}
221 			} else {
222 #ifdef PHP_WIN32
223 				lxml_doc->URL = php_dom_libxml_fix_file_path(BAD_CAST lxml_doc->URL);
224 #endif
225 			}
226 		}
227 	}
228 	dom_object *intern = php_dom_instantiate_object_helper(
229 		return_value,
230 		dom_xml_document_class_entry,
231 		(xmlNodePtr) lxml_doc,
232 		NULL
233 	);
234 	dom_set_xml_class(intern->document);
235 	dom_document_convert_to_modern(intern->document, lxml_doc);
236 }
237 
/* Attaches freshly created private data to `document` and then runs
 * dom_mark_namespaces_as_attributes_too() over lxml_doc using the namespace
 * mapper obtained from that private data. */
void dom_document_convert_to_modern(php_libxml_ref_obj *document, xmlDocPtr lxml_doc)
{
	php_dom_private_data *priv = php_dom_private_data_create();
	php_dom_libxml_ns_mapper *mapper = php_dom_ns_mapper_from_private(priv);

	document->private_data = php_dom_libxml_private_data_header(priv);

	dom_mark_namespaces_as_attributes_too(mapper, lxml_doc);
}
245 
PHP_METHOD(Dom_XMLDocument,createFromString)246 PHP_METHOD(Dom_XMLDocument, createFromString)
247 {
248 	load_from_helper(INTERNAL_FUNCTION_PARAM_PASSTHRU, DOM_LOAD_STRING);
249 }
250 
PHP_METHOD(Dom_XMLDocument,createFromFile)251 PHP_METHOD(Dom_XMLDocument, createFromFile)
252 {
253 	load_from_helper(INTERNAL_FUNCTION_PARAM_PASSTHRU, DOM_LOAD_FILE);
254 }
255 
php_new_dom_write_smart_str(void * context,const char * buffer,int len)256 static int php_new_dom_write_smart_str(void *context, const char *buffer, int len)
257 {
258 	smart_str *str = context;
259 	smart_str_appendl(str, buffer, len);
260 	return len;
261 }
262 
get_private_data_from_node(xmlNodePtr node)263 static php_dom_private_data *get_private_data_from_node(xmlNodePtr node)
264 {
265 	dom_object *intern = php_dom_object_get_data(node);
266 	return intern != NULL ? php_dom_get_private_data(intern) : NULL;
267 }
268 
php_new_dom_dump_node_to_str_ex(xmlNodePtr node,int options,bool format,const char * encoding)269 static zend_string *php_new_dom_dump_node_to_str_ex(xmlNodePtr node, int options, bool format, const char *encoding)
270 {
271 	smart_str str = {0};
272 
273 	int status = -1;
274 	xmlSaveCtxtPtr ctxt = xmlSaveToIO(php_new_dom_write_smart_str, NULL, &str, encoding, XML_SAVE_AS_XML | options);
275 	if (EXPECTED(ctxt != NULL)) {
276 		xmlCharEncodingHandlerPtr handler = xmlFindCharEncodingHandler(encoding);
277 		xmlOutputBufferPtr out = xmlOutputBufferCreateIO(php_new_dom_write_smart_str, NULL, &str, handler);
278 		if (EXPECTED(out != NULL)) {
279 			status = dom_xml_serialize(ctxt, out, node, format, false, get_private_data_from_node(node));
280 			status |= xmlOutputBufferFlush(out);
281 			status |= xmlOutputBufferClose(out);
282 		} else {
283 			xmlCharEncCloseFunc(handler);
284 		}
285 		(void) xmlSaveClose(ctxt);
286 	}
287 
288 	if (UNEXPECTED(status < 0)) {
289 		smart_str_free_ex(&str, false);
290 		return NULL;
291 	}
292 
293 	return smart_str_extract(&str);
294 }
295 
php_new_dom_dump_node_to_str(xmlDocPtr doc,xmlNodePtr node,bool format,const char * encoding)296 static zend_string *php_new_dom_dump_node_to_str(xmlDocPtr doc, xmlNodePtr node, bool format, const char *encoding)
297 {
298 	return php_new_dom_dump_node_to_str_ex(node, 0, format, encoding);
299 }
300 
php_new_dom_dump_doc_to_str(xmlDocPtr doc,int options,const char * encoding)301 static zend_string *php_new_dom_dump_doc_to_str(xmlDocPtr doc, int options, const char *encoding)
302 {
303 	return php_new_dom_dump_node_to_str_ex((xmlNodePtr) doc, options, options & XML_SAVE_FORMAT, encoding);
304 }
305 
php_new_dom_dump_node_to_file(const char * filename,xmlDocPtr doc,xmlNodePtr node,bool format,const char * encoding)306 zend_long php_new_dom_dump_node_to_file(const char *filename, xmlDocPtr doc, xmlNodePtr node, bool format, const char *encoding)
307 {
308 	xmlCharEncodingHandlerPtr handler = xmlFindCharEncodingHandler(encoding);
309 	xmlOutputBufferPtr out = xmlOutputBufferCreateFilename(filename, handler, 0);
310 	if (!out) {
311 		xmlCharEncCloseFunc(handler);
312 		return -1;
313 	}
314 
315 	php_stream *stream = out->context;
316 
317 	int status = -1;
318 	xmlSaveCtxtPtr ctxt = xmlSaveToIO(out->writecallback, NULL, stream, encoding, XML_SAVE_AS_XML);
319 	if (EXPECTED(ctxt != NULL)) {
320 		status = dom_xml_serialize(ctxt, out, node, format, false, get_private_data_from_node(node));
321 		status |= xmlOutputBufferFlush(out);
322 		(void) xmlSaveClose(ctxt);
323 	}
324 
325 	size_t offset = php_stream_tell(stream);
326 
327 	(void) xmlOutputBufferClose(out);
328 
329 	return status < 0 ? status : (zend_long) offset;
330 }
331 
php_new_dom_dump_doc_to_file(const char * filename,xmlDocPtr doc,bool format,const char * encoding)332 static zend_long php_new_dom_dump_doc_to_file(const char *filename, xmlDocPtr doc, bool format, const char *encoding)
333 {
334 	return php_new_dom_dump_node_to_file(filename, doc, (xmlNodePtr) doc, format, encoding);
335 }
336 
337 static const php_libxml_document_handlers php_new_dom_default_document_handlers = {
338 	.dump_node_to_str = php_new_dom_dump_node_to_str,
339 	.dump_doc_to_str = php_new_dom_dump_doc_to_str,
340 	.dump_node_to_file = php_new_dom_dump_node_to_file,
341 	.dump_doc_to_file = php_new_dom_dump_doc_to_file,
342 };
343 
/* Marks `document` as PHP_LIBXML_CLASS_MODERN and installs the serialization
 * handlers from php_new_dom_default_document_handlers. */
void dom_set_xml_class(php_libxml_ref_obj *document)
{
	document->handlers = &php_new_dom_default_document_handlers;
	document->class_type = PHP_LIBXML_CLASS_MODERN;
}
349 
350 #endif  /* HAVE_LIBXML && HAVE_DOM */
351