xref: /PHP-8.2/ext/dom/xml_document.c (revision 1492be52)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Authors: Niels Dossche <nielsdos@php.net>                            |
14    +----------------------------------------------------------------------+
15 */
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "php_dom.h"
24 #include "namespace_compat.h"
25 
/* Checks that the parser option bitmask only contains flags from the allow-list below.
 *
 * arg_num: position of the options argument in the calling method; used when reporting the error.
 * options: bitmask of XML_PARSE_* flags to validate.
 *
 * Returns true when no bit outside the allow-list is set. Otherwise raises an argument value
 * error that enumerates the allowed LIBXML_* constants and returns false. */
static bool check_options_validity(uint32_t arg_num, zend_long options)
{
	const zend_long VALID_OPTIONS = XML_PARSE_NOENT
								  | XML_PARSE_DTDLOAD
								  | XML_PARSE_DTDATTR
								  | XML_PARSE_DTDVALID
								  | XML_PARSE_NOERROR
								  | XML_PARSE_NOWARNING
								  | XML_PARSE_NOBLANKS
								  | XML_PARSE_XINCLUDE
								  | XML_PARSE_NSCLEAN
								  | XML_PARSE_NOCDATA
								  | XML_PARSE_NONET
								  | XML_PARSE_PEDANTIC
								  | XML_PARSE_COMPACT
								  | XML_PARSE_HUGE
								  | XML_PARSE_BIG_LINES;
	if ((options & ~VALID_OPTIONS) != 0) {
		/* Report against the position supplied by the caller rather than a hard-coded index,
		 * so the message stays correct if this helper is reused with a different signature. */
		zend_argument_value_error(arg_num, "contains invalid flags (allowed flags: "
									 "LIBXML_NOENT, "
									 "LIBXML_DTDLOAD, "
									 "LIBXML_DTDATTR, "
									 "LIBXML_DTDVALID, "
									 "LIBXML_NOERROR, "
									 "LIBXML_NOWARNING, "
									 "LIBXML_NOBLANKS, "
									 "LIBXML_XINCLUDE, "
									 "LIBXML_NSCLEAN, "
									 "LIBXML_NOCDATA, "
									 "LIBXML_NONET, "
									 "LIBXML_PEDANTIC, "
									 "LIBXML_COMPACT, "
									 "LIBXML_PARSEHUGE, "
									 "LIBXML_BIGLINES)");
		return false;
	}
	return true;
}
64 
65 /* Living spec never creates explicit namespace declaration nodes.
66  * They are only written upon serialization but never appear in the tree.
67  * So in principle we could just ignore them outright.
68  * However, step 10 in https://html.spec.whatwg.org/multipage/parsing.html#create-an-element-for-the-token
69  * requires us to have the declaration as an attribute available */
dom_mark_namespaces_as_attributes_too(xmlDocPtr doc)70 static void dom_mark_namespaces_as_attributes_too(xmlDocPtr doc)
71 {
72 	xmlNodePtr node = doc->children;
73 	while (node != NULL) {
74 		if (node->type == XML_ELEMENT_NODE) {
75 			dom_ns_compat_mark_attribute_list(node->nsDef);
76 
77 			if (node->children) {
78 				node = node->children;
79 				continue;
80 			}
81 		}
82 
83 		if (node->next) {
84 			node = node->next;
85 		} else {
86 			/* Go upwards, until we find a parent node with a next sibling, or until we hit the base. */
87 			do {
88 				node = node->parent;
89 				if (node == NULL) {
90 					return;
91 				}
92 			} while (node->next == NULL);
93 			node = node->next;
94 		}
95 	}
96 }
97 
/* Walks the tree rooted at copy and the tree rooted at original in lock-step and, for every
 * element node encountered in the copy, calls dom_ns_compat_copy_attribute_list_mark() with the
 * nsDef list of the copy node and the nsDef list of the node at the same position in original.
 *
 * The traversal is steered by the copy only; original is asserted (ZEND_ASSERT) to have a node
 * wherever the copy has one, i.e. both trees are expected to have the same shape. */
void dom_mark_namespaces_for_copy_based_on_copy(xmlNodePtr copy, const xmlNode *original)
{
	xmlNodePtr dst = copy;
	const xmlNode *src = original;

	while (dst != NULL) {
		ZEND_ASSERT(src != NULL);

		if (dst->type == XML_ELEMENT_NODE) {
			dom_ns_compat_copy_attribute_list_mark(dst->nsDef, src->nsDef);

			/* Only element nodes are descended into. */
			if (dst->children != NULL) {
				dst = dst->children;
				src = src->children;
				continue;
			}
		}

		/* No sibling left at this level: move both cursors up together until the copy side
		 * has a next sibling, or stop once the copy side runs out of ancestors. */
		while (dst->next == NULL) {
			dst = dst->parent;
			if (dst == NULL) {
				return;
			}
			src = src->parent;
		}
		dst = dst->next;
		src = src->next;
	}
}
132 
PHP_METHOD(DOM_XMLDocument,createEmpty)133 PHP_METHOD(DOM_XMLDocument, createEmpty)
134 {
135 	const char *version = NULL;
136 	size_t encoding_len = strlen("UTF-8");
137 	const char *encoding = "UTF-8";
138 	size_t version_len;
139 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|sp", &version, &version_len, &encoding, &encoding_len) == FAILURE) {
140 		RETURN_THROWS();
141 	}
142 
143 	xmlCharEncodingHandlerPtr handler = xmlFindCharEncodingHandler(encoding);
144 
145 	if (handler != NULL) {
146 		xmlCharEncCloseFunc(handler);
147 	} else {
148 		zend_argument_value_error(2, "is not a valid document encoding");
149 		RETURN_THROWS();
150 	}
151 
152 	xmlDocPtr lxml_doc = xmlNewDoc((const xmlChar *) version);
153 	if (UNEXPECTED(lxml_doc == NULL)) {
154 		goto oom;
155 	}
156 
157 	lxml_doc->encoding = xmlStrdup((const xmlChar *) encoding);
158 
159 	dom_object *intern = php_dom_instantiate_object_helper(
160 		return_value,
161 		dom_xml_document_class_entry,
162 		(xmlNodePtr) lxml_doc,
163 		NULL
164 	);
165 	intern->document->is_modern_api_class = true;
166 	return;
167 
168 oom:
169 	php_dom_throw_error(INVALID_STATE_ERR, 1);
170 	RETURN_THROWS();
171 }
172 
/* Shared implementation of the createFromString() and createFromFile() factory methods.
 *
 * mode selects how the first argument is interpreted: DOM_LOAD_STRING treats it as the XML
 * source itself, DOM_LOAD_FILE treats it as the location to load from.
 *
 * Userland arguments: source (required), options (integer bitmask, default 0) and an optional,
 * nullable override encoding name.
 *
 * Raises an argument value error when the source is empty, too long, (file mode) contains NUL
 * bytes or a percent-encoded NUL byte, when options contains flags outside the allow-list, or
 * when the override encoding is unknown to libxml. If parsing yields no document and no
 * exception is pending yet, an exception is thrown ("Cannot open file" in file mode,
 * INVALID_STATE_ERR otherwise). On success return_value holds the new document object. */
static void load_from_helper(INTERNAL_FUNCTION_PARAMETERS, int mode)
{
	const char *source, *override_encoding = NULL;
	size_t source_len, override_encoding_len;
	zend_long options = 0;
	if (zend_parse_parameters(
		ZEND_NUM_ARGS(),
		"s|lp!",
		&source,
		&source_len,
		&options,
		&override_encoding,
		&override_encoding_len
	) == FAILURE) {
		RETURN_THROWS();
	}

	if (!source_len) {
		zend_argument_value_error(1, "must not be empty");
		RETURN_THROWS();
	}

	if (ZEND_SIZE_T_INT_OVFL(source_len)) {
		zend_argument_value_error(1, "is too long");
		RETURN_THROWS();
	}

	/* The source is parsed as a binary-safe string because string mode may legitimately carry
	 * arbitrary bytes. In file mode it is used as a C string, so an embedded NUL byte would
	 * silently truncate the location; reject it explicitly instead. */
	if (mode == DOM_LOAD_FILE && strlen(source) != source_len) {
		zend_argument_value_error(1, "must not contain any null bytes");
		RETURN_THROWS();
	}

	/* See php_libxml_streams_IO_open_wrapper(), apparently this caused issues in the past. */
	if (mode == DOM_LOAD_FILE && strstr(source, "%00")) {
		zend_argument_value_error(1, "must not contain percent-encoded NUL bytes");
		RETURN_THROWS();
	}

	if (!check_options_validity(2, options)) {
		RETURN_THROWS();
	}

	xmlCharEncodingHandlerPtr encoding = NULL;
	if (override_encoding != NULL) {
		encoding = xmlFindCharEncodingHandler(override_encoding);
		if (!encoding) {
			zend_argument_value_error(3, "must be a valid document encoding");
			RETURN_THROWS();
		}
		/* An explicit override was requested, so ask the parser to ignore the encoding
		 * declared by the document itself. */
		options |= XML_PARSE_IGNORE_ENC;
	}

	/* NOTE(review): the encoding handler is handed to dom_document_parser() and not closed
	 * here; presumably the parser takes ownership of it -- confirm against its implementation. */
	xmlDocPtr lxml_doc = dom_document_parser(NULL, mode, source, source_len, options, encoding);
	if (UNEXPECTED(lxml_doc == NULL)) {
		/* Only throw a generic error if the parser did not already raise something more specific. */
		if (!EG(exception)) {
			if (mode == DOM_LOAD_FILE) {
				zend_throw_exception_ex(NULL, 0, "Cannot open file '%s'", source);
			} else {
				php_dom_throw_error(INVALID_STATE_ERR, 1);
			}
		}
		RETURN_THROWS();
	}
	/* Make sure the document always carries an encoding: prefer the override, else UTF-8. */
	if (lxml_doc->encoding == NULL) {
		if (override_encoding) {
			lxml_doc->encoding = xmlStrdup((const xmlChar *) override_encoding);
		} else {
			lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
		}
	}
	dom_object *intern = php_dom_instantiate_object_helper(
		return_value,
		dom_xml_document_class_entry,
		(xmlNodePtr) lxml_doc,
		NULL
	);
	intern->document->is_modern_api_class = true;
	dom_mark_namespaces_as_attributes_too(lxml_doc);
}
247 
PHP_METHOD(DOM_XMLDocument,createFromString)248 PHP_METHOD(DOM_XMLDocument, createFromString)
249 {
250 	load_from_helper(INTERNAL_FUNCTION_PARAM_PASSTHRU, DOM_LOAD_STRING);
251 }
252 
PHP_METHOD(DOM_XMLDocument,createFromFile)253 PHP_METHOD(DOM_XMLDocument, createFromFile)
254 {
255 	load_from_helper(INTERNAL_FUNCTION_PARAM_PASSTHRU, DOM_LOAD_FILE);
256 }
257 
258 #endif  /* HAVE_LIBXML && HAVE_DOM */
259