xref: /php-src/ext/dom/html_document.c (revision 88393cfa)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Authors: Niels Dossche <nielsdos@php.net>                            |
14    +----------------------------------------------------------------------+
15 */
16 
17 #ifdef HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20 
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "php_dom.h"
24 #include "infra.h"
25 #include "html5_parser.h"
26 #include "html5_serializer.h"
27 #include "namespace_compat.h"
28 #include "private_data.h"
29 #include "dom_properties.h"
30 #include <Zend/zend_smart_string.h>
31 #include <lexbor/html/encoding.h>
32 #include <lexbor/encoding/encoding.h>
33 
34 /* Implementation defined, but as HTML5 defaults in all other cases to UTF-8, we'll do the same. */
35 #define DOM_FALLBACK_ENCODING_ID LXB_ENCODING_UTF_8
36 
/* Remembers the line and column that were reached at a given offset, so that
 * dom_find_line_and_column_using_cache() can resume scanning from there
 * instead of starting over. */
typedef struct dom_line_column_cache {
	size_t last_line;   /* Line at last_offset; reset value is 1 */
	size_t last_column; /* Column at last_offset; reset value is 1 */
	size_t last_offset; /* Offset into the current input buffer up to which line/column were computed */
} dom_line_column_cache;
42 
/* Per-parse state that the error reporters and the chunk feeding code share. */
typedef struct dom_lexbor_libxml2_bridge_application_data {
	/* Name of the input, included in reported error messages */
	const char *input_name;
	/* Exactly one of the two input pointers below is non-NULL:
	 * codepoints when the input is decoded first, characters when UTF-8 input is passed through. */
	const lxb_codepoint_t *current_input_codepoints;
	const char *current_input_characters;
	/* Length of the input buffer belonging to the chunk currently being parsed */
	size_t current_input_length;
	/* Sum of the input buffer lengths of all chunks that were fully processed */
	size_t current_total_offset;
	/* Line/column position cache used while reporting errors */
	dom_line_column_cache cache_tokenizer;
	/* When set, the tree error for a missing doctype on line 1 is not reported */
	bool html_no_implied;
} dom_lexbor_libxml2_bridge_application_data;
52 
/* Result of dom_determine_encoding(). */
typedef struct dom_character_encoding_data {
	const lxb_encoding_data_t *encoding_data; /* The encoding that was selected for the input */
	size_t bom_shift; /* Number of leading BOM bytes to skip: 3, 2, or 0 when no BOM was found */
} dom_character_encoding_data;
57 
/* Output callback type.
 * NOTE(review): presumably called as (context, data, length) — the callers are not visible here, confirm. */
typedef zend_result (*dom_write_output)(void*, const char *, size_t);
59 
/* Bundles encoder/decoder state with an output callback.
 * NOTE(review): none of these fields are used in the part of the file visible here;
 * the per-field notes below are read off the names and types only — confirm against the users. */
typedef struct dom_output_ctx {
	const lxb_encoding_data_t *encoding_data;
	const lxb_encoding_data_t *decoding_data;
	lxb_encoding_encode_t *encode;
	lxb_encoding_decode_t *decode;
	lxb_codepoint_t *codepoints;   /* presumably the decoder's codepoint buffer */
	lxb_char_t *encoding_output;   /* presumably the encoder's byte buffer */
	void *output_data;             /* presumably the opaque first argument for write_output */
	dom_write_output write_output;
} dom_output_ctx;
70 
/* State for turning input in some encoding into the UTF-8 bytes that are fed to the parser. */
typedef struct dom_decoding_encoding_ctx {
	/* We can skip some conversion if the input and output encoding are both UTF-8,
	 * we only have to validate and substitute replacement characters */
	bool fast_path; /* Put first, near the encode & decode structures, for cache locality */
	lxb_encoding_encode_t encode;
	lxb_encoding_decode_t decode;
	/* Output encoding; dom_decoding_encoding_ctx_init() sets this to UTF-8 */
	const lxb_encoding_data_t *encode_data;
	/* Input encoding; NULL until one of the dom_setup_parser_encoding_*() functions ran */
	const lxb_encoding_data_t *decode_data;
	/* Buffer the encoder writes its bytes into */
	lxb_char_t encoding_output[4096];
	/* Buffer the decoder writes its codepoints into */
	lxb_codepoint_t codepoints[4096];
} dom_decoding_encoding_ctx;
82 
83 /* https://dom.spec.whatwg.org/#dom-document-implementation */
dom_modern_document_implementation_read(dom_object * obj,zval * retval)84 zend_result dom_modern_document_implementation_read(dom_object *obj, zval *retval)
85 {
86 	const uint32_t PROP_INDEX = 0;
87 
88 #if ZEND_DEBUG
89 	zend_string *implementation_str = ZSTR_INIT_LITERAL("implementation", false);
90 	const zend_property_info *prop_info = zend_get_property_info(dom_abstract_base_document_class_entry, implementation_str, 0);
91 	zend_string_release_ex(implementation_str, false);
92 	ZEND_ASSERT(OBJ_PROP_TO_NUM(prop_info->offset) == PROP_INDEX);
93 #endif
94 
95 	zval *cached_implementation = OBJ_PROP_NUM(&obj->std, PROP_INDEX);
96 	if (Z_ISUNDEF_P(cached_implementation)) {
97 		php_dom_create_implementation(cached_implementation, true);
98 	}
99 
100 	ZVAL_OBJ_COPY(retval, Z_OBJ_P(cached_implementation));
101 
102 	return SUCCESS;
103 }
104 
/*
 * Prepares a conversion context: the encoder side is set up as UTF-8 writing
 * into ctx->encoding_output, the decoder side is left unset (decode_data is
 * NULL) until an input encoding has been chosen.
 */
static void dom_decoding_encoding_ctx_init(dom_decoding_encoding_ctx *ctx)
{
	const size_t output_capacity = sizeof(ctx->encoding_output) / sizeof(ctx->encoding_output[0]);

	/* Set fast path on by default so that the decoder finishing is skipped if this was never initialised properly. */
	ctx->fast_path = true;
	ctx->decode_data = NULL;
	ctx->encode_data = lxb_encoding_data(LXB_ENCODING_UTF_8);

	(void) lxb_encoding_encode_init(&ctx->encode, ctx->encode_data, ctx->encoding_output, output_capacity);
	(void) lxb_encoding_encode_replace_set(&ctx->encode, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
}
119 
/*
 * Maps a Lexbor tokenizer error id to the textual name used in reported
 * messages. Ids without an entry yield "unknown error".
 */
static const char *dom_lexbor_tokenizer_error_code_to_string(lxb_html_tokenizer_error_id_t id)
{
	const char *name;

	switch (id) {
		case LXB_HTML_TOKENIZER_ERROR_ABCLOFEMCO: name = "abrupt-closing-of-empty-comment"; break;
		case LXB_HTML_TOKENIZER_ERROR_ABDOPUID: name = "abrupt-doctype-public-identifier"; break;
		case LXB_HTML_TOKENIZER_ERROR_ABDOSYID: name = "abrupt-doctype-system-identifier"; break;
		case LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE: name = "absence-of-digits-in-numeric-character-reference"; break;
		case LXB_HTML_TOKENIZER_ERROR_CDINHTCO: name = "cdata-in-html-content"; break;
		case LXB_HTML_TOKENIZER_ERROR_CHREOUUNRA: name = "character-reference-outside-unicode-range"; break;
		case LXB_HTML_TOKENIZER_ERROR_COCHININST: name = "control-character-in-input-stream"; break;
		case LXB_HTML_TOKENIZER_ERROR_COCHRE: name = "control-character-reference"; break;
		case LXB_HTML_TOKENIZER_ERROR_ENTAWIAT: name = "end-tag-with-attributes"; break;
		case LXB_HTML_TOKENIZER_ERROR_DUAT: name = "duplicate-attribute"; break;
		case LXB_HTML_TOKENIZER_ERROR_ENTAWITRSO: name = "end-tag-with-trailing-solidus"; break;
		case LXB_HTML_TOKENIZER_ERROR_EOBETANA: name = "eof-before-tag-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_EOINCD: name = "eof-in-cdata"; break;
		case LXB_HTML_TOKENIZER_ERROR_EOINCO: name = "eof-in-comment"; break;
		case LXB_HTML_TOKENIZER_ERROR_EOINDO: name = "eof-in-doctype"; break;
		case LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE: name = "eof-in-script-html-comment-like-text"; break;
		case LXB_HTML_TOKENIZER_ERROR_EOINTA: name = "eof-in-tag"; break;
		case LXB_HTML_TOKENIZER_ERROR_INCLCO: name = "incorrectly-closed-comment"; break;
		case LXB_HTML_TOKENIZER_ERROR_INOPCO: name = "incorrectly-opened-comment"; break;
		case LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA: name = "invalid-character-sequence-after-doctype-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA: name = "invalid-first-character-of-tag-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIATVA: name = "missing-attribute-value"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIDONA: name = "missing-doctype-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIDOPUID: name = "missing-doctype-public-identifier"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIDOSYID: name = "missing-doctype-system-identifier"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIENTANA: name = "missing-end-tag-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIQUBEDOPUID: name = "missing-quote-before-doctype-public-identifier"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID: name = "missing-quote-before-doctype-system-identifier"; break;
		case LXB_HTML_TOKENIZER_ERROR_MISEAFCHRE: name = "missing-semicolon-after-character-reference"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHAFDOPUKE: name = "missing-whitespace-after-doctype-public-keyword"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHAFDOSYKE: name = "missing-whitespace-after-doctype-system-keyword"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHBEDONA: name = "missing-whitespace-before-doctype-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHBEAT: name = "missing-whitespace-between-attributes"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHBEDOPUANSYID: name = "missing-whitespace-between-doctype-public-and-system-identifiers"; break;
		case LXB_HTML_TOKENIZER_ERROR_NECO: name = "nested-comment"; break;
		case LXB_HTML_TOKENIZER_ERROR_NOCHRE: name = "noncharacter-character-reference"; break;
		case LXB_HTML_TOKENIZER_ERROR_NOININST: name = "noncharacter-in-input-stream"; break;
		case LXB_HTML_TOKENIZER_ERROR_NOVOHTELSTTAWITRSO: name = "non-void-html-element-start-tag-with-trailing-solidus"; break;
		case LXB_HTML_TOKENIZER_ERROR_NUCHRE: name = "null-character-reference"; break;
		case LXB_HTML_TOKENIZER_ERROR_SUCHRE: name = "surrogate-character-reference"; break;
		case LXB_HTML_TOKENIZER_ERROR_SUININST: name = "surrogate-in-input-stream"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNCHAFDOSYID: name = "unexpected-character-after-doctype-system-identifier"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNCHINATNA: name = "unexpected-character-in-attribute-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNCHINUNATVA: name = "unexpected-character-in-unquoted-attribute-value"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNEQSIBEATNA: name = "unexpected-equals-sign-before-attribute-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNNUCH: name = "unexpected-null-character"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNQUMAINOFTANA: name = "unexpected-question-mark-instead-of-tag-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNSOINTA: name = "unexpected-solidus-in-tag"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNNACHRE: name = "unknown-named-character-reference"; break;
		default: name = "unknown error"; break;
	}

	return name;
}
175 
/*
 * Maps a Lexbor tree-construction error id to the textual name used in
 * reported messages. Ids without an entry yield "unknown error".
 *
 * All names are lowercase words joined by hyphens.
 */
static const char *dom_lexbor_tree_error_code_to_string(lxb_html_tree_error_id_t id)
{
	switch (id) {
		case LXB_HTML_RULES_ERROR_UNTO: return "unexpected-token";
		case LXB_HTML_RULES_ERROR_UNCLTO: return "unexpected-closed-token";
		case LXB_HTML_RULES_ERROR_NUCH: return "null-character";
		case LXB_HTML_RULES_ERROR_UNCHTO: return "unexpected-character-token";
		case LXB_HTML_RULES_ERROR_UNTOININMO: return "unexpected-token-in-initial-mode";
		case LXB_HTML_RULES_ERROR_BADOTOININMO: return "bad-doctype-token-in-initial-mode";
		case LXB_HTML_RULES_ERROR_DOTOINBEHTMO: return "doctype-token-in-before-html-mode";
		case LXB_HTML_RULES_ERROR_UNCLTOINBEHTMO: return "unexpected-closed-token-in-before-html-mode";
		case LXB_HTML_RULES_ERROR_DOTOINBEHEMO: return "doctype-token-in-before-head-mode";
		/* Spelled with a hyphen like its "before-html" and "head" siblings (was "closed_token"). */
		case LXB_HTML_RULES_ERROR_UNCLTOINBEHEMO: return "unexpected-closed-token-in-before-head-mode";
		case LXB_HTML_RULES_ERROR_DOTOINHEMO: return "doctype-token-in-head-mode";
		case LXB_HTML_RULES_ERROR_NOVOHTELSTTAWITRSO: return "non-void-html-element-start-tag-with-trailing-solidus";
		case LXB_HTML_RULES_ERROR_HETOINHEMO: return "head-token-in-head-mode";
		case LXB_HTML_RULES_ERROR_UNCLTOINHEMO: return "unexpected-closed-token-in-head-mode";
		case LXB_HTML_RULES_ERROR_TECLTOWIOPINHEMO: return "template-closed-token-without-opening-in-head-mode";
		case LXB_HTML_RULES_ERROR_TEELISNOCUINHEMO: return "template-element-is-not-current-in-head-mode";
		case LXB_HTML_RULES_ERROR_DOTOINHENOMO: return "doctype-token-in-head-noscript-mode";
		case LXB_HTML_RULES_ERROR_DOTOAFHEMO: return "doctype-token-after-head-mode";
		case LXB_HTML_RULES_ERROR_HETOAFHEMO: return "head-token-after-head-mode";
		case LXB_HTML_RULES_ERROR_DOTOINBOMO: return "doctype-token-in-body-mode";
		case LXB_HTML_RULES_ERROR_BAENOPELISWR: return "bad-ending-open-elements-is-wrong";
		case LXB_HTML_RULES_ERROR_OPELISWR: return "open-elements-is-wrong";
		case LXB_HTML_RULES_ERROR_UNELINOPELST: return "unexpected-element-in-open-elements-stack";
		case LXB_HTML_RULES_ERROR_MIELINOPELST: return "missing-element-in-open-elements-stack";
		case LXB_HTML_RULES_ERROR_NOBOELINSC: return "no-body-element-in-scope";
		case LXB_HTML_RULES_ERROR_MIELINSC: return "missing-element-in-scope";
		case LXB_HTML_RULES_ERROR_UNELINSC: return "unexpected-element-in-scope";
		case LXB_HTML_RULES_ERROR_UNELINACFOST: return "unexpected-element-in-active-formatting-stack";
		case LXB_HTML_RULES_ERROR_UNENOFFI: return "unexpected-end-of-file";
		case LXB_HTML_RULES_ERROR_CHINTATE: return "characters-in-table-text";
		case LXB_HTML_RULES_ERROR_DOTOINTAMO: return "doctype-token-in-table-mode";
		case LXB_HTML_RULES_ERROR_DOTOINSEMO: return "doctype-token-in-select-mode";
		case LXB_HTML_RULES_ERROR_DOTOAFBOMO: return "doctype-token-after-body-mode";
		case LXB_HTML_RULES_ERROR_DOTOINFRMO: return "doctype-token-in-frameset-mode";
		case LXB_HTML_RULES_ERROR_DOTOAFFRMO: return "doctype-token-after-frameset-mode";
		case LXB_HTML_RULES_ERROR_DOTOFOCOMO: return "doctype-token-foreign-content-mode";
		default: return "unknown error";
	}
}
218 
/*
 * Maps a Lexbor-to-libxml2 bridge status to a human-readable description.
 * Statuses without an entry yield "unknown error".
 */
static const char *dom_lexbor_libxml2_bridge_status_code_to_string(lexbor_libxml2_bridge_status status)
{
	const char *description;

	switch (status) {
		case LEXBOR_LIBXML2_BRIDGE_STATUS_CANNOT_INIT:
			description = "cannot initialize data structures";
			break;
		case LEXBOR_LIBXML2_BRIDGE_STATUS_FATAL_PARSE:
			description = "fatal error in parsing";
			break;
		case LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW:
			description = "string length overflow";
			break;
		case LEXBOR_LIBXML2_BRIDGE_STATUS_OOM:
			description = "out of memory";
			break;
		default:
			description = "unknown error";
			break;
	}

	return description;
}
229 
/* Puts the cache back at the start of the input: line 1, column 1, offset 0. */
static void dom_reset_line_column_cache(dom_line_column_cache *cache)
{
	*cache = (dom_line_column_cache) {
		.last_line = 1,
		.last_column = 1,
		.last_offset = 0,
	};
}
236 
/*
 * Advances the cached line/column position up to the given offset.
 *
 * offset is first made relative to the current input buffer by subtracting
 * application_data->current_total_offset, and is clamped to the length of that
 * buffer. Scanning resumes at cache->last_offset; when the target lies at or
 * before that point the cache is left as it was.
 *
 * The input is read either as codepoints or as UTF-8 bytes, depending on which
 * of the two input pointers in application_data is set.
 */
static void dom_find_line_and_column_using_cache(
	const dom_lexbor_libxml2_bridge_application_data *application_data,
	dom_line_column_cache *cache,
	size_t offset
)
{
	size_t target = offset - application_data->current_total_offset;
	if (target > application_data->current_input_length) {
		/* Possible with empty input, also just good for general safety */
		target = application_data->current_input_length;
	}

	size_t line = cache->last_line;
	size_t column = cache->last_column;
	size_t position = cache->last_offset;

	/* Either unicode or UTF-8 data */
	const lxb_codepoint_t *codepoints = application_data->current_input_codepoints;
	if (codepoints != NULL) {
		for (; position < target; position++) {
			if (codepoints[position] == 0x000A /* Unicode codepoint for line feed */) {
				line++;
				column = 1;
			} else {
				column++;
			}
		}
	} else {
		const char *bytes = application_data->current_input_characters;
		for (; position < target; position++) {
			const lxb_char_t byte = bytes[position];
			if (byte == '\n') {
				line++;
				column = 1;
			} else if ((byte & 0xC0) != 0x80) {
				/* See Lexbor tokenizer patch
				 * Only bytes that are not UTF-8 continuation bytes (10xxxxxx) start a new column.
				 * Note for future self: branchlessly computing the length and jumping by the length would be nice,
				 * however it takes so many instructions to do so that it is slower than this naive method. */
				column++;
			}
		}
	}

	cache->last_line = line;
	cache->last_column = column;
	cache->last_offset = position;
}
287 
/*
 * Tokenizer error callback: resolves the error offset to a line and column via
 * the tokenizer position cache and reports the error through
 * php_libxml_pretend_ctx_error_ex().
 */
static void dom_lexbor_libxml2_bridge_tokenizer_error_reporter(
	void *application_data_voidptr,
	lxb_html_tokenizer_error_t *error,
	size_t offset
)
{
	dom_lexbor_libxml2_bridge_application_data *application_data = application_data_voidptr;
	dom_line_column_cache *position = &application_data->cache_tokenizer;

	/* Updates the cache in place; line and column are read from it afterwards. */
	dom_find_line_and_column_using_cache(application_data, position, offset);

	php_libxml_pretend_ctx_error_ex(
		application_data->input_name,
		position->last_line,
		position->last_column,
		"tokenizer error %s in %s, line: %zu, column: %zu\n",
		dom_lexbor_tokenizer_error_code_to_string(error->id),
		application_data->input_name,
		position->last_line,
		position->last_column
	);
}
298 
/*
 * Tree-construction error callback: reports the error at the given line and
 * column through php_libxml_pretend_ctx_error_ex(). When len is larger than 1
 * the message shows a column range (column to column + len - 1).
 */
static void dom_lexbor_libxml2_bridge_tree_error_reporter(
	void *application_data_voidptr,
	lxb_html_tree_error_t *error,
	size_t line,
	size_t column,
	size_t len
)
{
	dom_lexbor_libxml2_bridge_application_data *application_data = application_data_voidptr;

	/* For no implied mode, we want to mimic libxml's behaviour of not reporting an error for a lacking doctype. */
	if (line == 1 && application_data->html_no_implied && error->id == LXB_HTML_RULES_ERROR_UNTOININMO) {
		return;
	}

	const char *input_name = application_data->input_name;
	const char *error_name = dom_lexbor_tree_error_code_to_string(error->id);

	if (len > 1) {
		php_libxml_pretend_ctx_error_ex(
			input_name,
			line,
			column,
			"tree error %s in %s, line: %zu, column: %zu-%zu\n",
			error_name,
			input_name,
			line,
			column,
			column + len - 1
		);
		return;
	}

	/* Possible with EOF, or single-character tokens, don't use a range in the error display in this case */
	php_libxml_pretend_ctx_error_ex(
		input_name,
		line,
		column,
		"tree error %s in %s, line: %zu, column: %zu\n",
		error_name,
		input_name,
		line,
		column
	);
}
340 
/*
 * Returns the first direct child of parent that is an element whose name
 * equals searching_for, or NULL when there is none. Only the name is compared.
 */
static xmlNodePtr dom_search_child(xmlNodePtr parent, const char *searching_for)
{
	for (xmlNodePtr candidate = parent->children; candidate != NULL; candidate = candidate->next) {
		if (candidate->type != XML_ELEMENT_NODE) {
			continue;
		}
		if (strcmp((const char *) candidate->name, searching_for) == 0) {
			return candidate;
		}
	}

	return NULL;
}
352 
/*
 * Removes the first child element of parent named searching_for and re-adds
 * each of that element's children to parent via xmlAddChild(). The removed
 * element itself is freed. Does nothing when no such child exists.
 */
static void dom_place_remove_element_and_hoist_children(xmlNodePtr parent, const char *searching_for)
{
	xmlNodePtr wrapper = dom_search_child(parent, searching_for);
	if (wrapper == NULL) {
		return;
	}

	xmlUnlinkNode(wrapper);

	/* Always take the current first child: unlinking it exposes the next one. */
	xmlNodePtr moved;
	while ((moved = wrapper->children) != NULL) {
		xmlUnlinkNode(moved);
		xmlAddChild(parent, moved);
	}

	xmlFreeNode(wrapper);
}
369 
/*
 * Post-processing of a freshly loaded document.
 *
 * Only acts when HTML_PARSE_NOIMPLIED is set: the head, body and html elements
 * that were not explicitly present in the input (according to observations)
 * are removed and their children hoisted into the respective parent.
 */
static void dom_post_process_html5_loading(
	xmlDocPtr lxml_doc,
	zend_long options,
	const lexbor_libxml2_bridge_extracted_observations *observations
)
{
	if (!(options & HTML_PARSE_NOIMPLIED)) {
		return;
	}

	xmlNodePtr document_node = (xmlNodePtr) lxml_doc;
	xmlNodePtr html_node = dom_search_child(document_node, "html");

	/* head and body are handled before html, as they are looked up inside the html element. */
	if (!observations->has_explicit_head_tag) {
		dom_place_remove_element_and_hoist_children(html_node, "head");
	}
	if (!observations->has_explicit_body_tag) {
		dom_place_remove_element_and_hoist_children(html_node, "body");
	}
	if (!observations->has_explicit_html_tag) {
		dom_place_remove_element_and_hoist_children(document_node, "html");
	}
}
389 
390 /* https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding */
dom_determine_encoding(const char * source,size_t source_len)391 static dom_character_encoding_data dom_determine_encoding(const char *source, size_t source_len)
392 {
393 	dom_character_encoding_data result;
394 
395 	/* BOM sniffing */
396 	if (source_len >= 3 && source[0] == '\xEF' && source[1] == '\xBB' && source[2] == '\xBF') {
397 		result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
398 		result.bom_shift = 3;
399 		return result;
400 	} else if (source_len >= 2) {
401 		if (source[0] == '\xFE' && source[1] == '\xFF') {
402 			result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_16BE);
403 			result.bom_shift = 2;
404 			return result;
405 		} else if (source[0] == '\xFF' && source[1] == '\xFE') {
406 			result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_16LE);
407 			result.bom_shift = 2;
408 			return result;
409 		}
410 	}
411 
412 	/* Perform prescan */
413 	lxb_html_encoding_t encoding;
414 	lxb_status_t status = lxb_html_encoding_init(&encoding);
415 	if (status != LXB_STATUS_OK) {
416 		goto fallback_uninit;
417 	}
418 	/* This is the "wait either for 1024 bytes or 500ms" part */
419 	if (source_len > 1024) {
420 		source_len = 1024;
421 	}
422 	status = lxb_html_encoding_determine(&encoding, (const lxb_char_t *) source, (const lxb_char_t *) source + source_len);
423 	if (status != LXB_STATUS_OK) {
424 		goto fallback;
425 	}
426 	lxb_html_encoding_entry_t *entry = lxb_html_encoding_meta_entry(&encoding, 0);
427 	if (entry == NULL) {
428 		goto fallback;
429 	}
430 	result.encoding_data = lxb_encoding_data_by_pre_name(entry->name, entry->end - entry->name);
431 	if (!result.encoding_data) {
432 		goto fallback;
433 	}
434 	result.bom_shift = 0;
435 	lxb_html_encoding_destroy(&encoding, false);
436 	return result;
437 
438 fallback:
439 	lxb_html_encoding_destroy(&encoding, false);
440 fallback_uninit:
441 	result.encoding_data = lxb_encoding_data(DOM_FALLBACK_ENCODING_ID);
442 	result.bom_shift = 0;
443 	return result;
444 }
445 
/*
 * Configures the decoder side of the conversion context for the given input
 * encoding and records in application_data how the input has to be read for
 * line/column computation: as raw UTF-8 bytes starting at buf_start when the
 * input encoding equals the output encoding (fast path), as decoded codepoints
 * otherwise.
 */
static void dom_setup_parser_encoding_manually(const lxb_char_t *buf_start, const lxb_encoding_data_t *encoding_data, dom_decoding_encoding_ctx *decoding_encoding_ctx, dom_lexbor_libxml2_bridge_application_data *application_data)
{
	static const lxb_codepoint_t replacement_codepoint = LXB_ENCODING_REPLACEMENT_CODEPOINT;
	const size_t codepoint_capacity = sizeof(decoding_encoding_ctx->codepoints) / sizeof(decoding_encoding_ctx->codepoints[0]);

	decoding_encoding_ctx->decode_data = encoding_data;

	(void) lxb_encoding_decode_init(&decoding_encoding_ctx->decode, encoding_data, decoding_encoding_ctx->codepoints, codepoint_capacity);
	(void) lxb_encoding_decode_replace_set(&decoding_encoding_ctx->decode, &replacement_codepoint, LXB_ENCODING_REPLACEMENT_BUFFER_LEN);

	/* Note: encode_data is for UTF-8 */
	const bool same_encoding = encoding_data == decoding_encoding_ctx->encode_data;
	decoding_encoding_ctx->fast_path = same_encoding;

	application_data->current_input_codepoints = same_encoding ? NULL : decoding_encoding_ctx->codepoints;
	application_data->current_input_characters = same_encoding ? (const char *) buf_start : NULL;
}
474 
/*
 * Detects the input encoding from the buffer itself and configures the
 * conversion context for it. On return *buf_ref and *read have been adjusted
 * to skip a byte order mark, if one was found.
 */
static void dom_setup_parser_encoding_implicitly(
	const lxb_char_t **buf_ref,
	size_t *read,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	dom_lexbor_libxml2_bridge_application_data *application_data
)
{
	const lxb_char_t *original_start = *buf_ref;
	dom_character_encoding_data detected = dom_determine_encoding((const char *) original_start, *read);

	*buf_ref = original_start + detected.bom_shift;
	*read -= detected.bom_shift;

	/* The setup receives the start of the buffer as it was before skipping the BOM. */
	dom_setup_parser_encoding_manually(original_start, detected.encoding_data, decoding_encoding_ctx, application_data);
}
488 
/*
 * Feeds one chunk of encoded bytes to the parser and does the bookkeeping
 * around it.
 *
 * encoded_length is the number of bytes in encoding_output handed to the
 * parser; input_buffer_length is the length of the input buffer this chunk
 * corresponds to and is what the running total offset is advanced by.
 * Pending errors are only reported when at least one error reporter is set.
 *
 * Returns false when the parser rejects the chunk, true otherwise.
 */
static bool dom_process_parse_chunk(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	size_t encoded_length,
	const lxb_char_t *encoding_output,
	size_t input_buffer_length,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	dom_lexbor_libxml2_bridge_application_data *application_data = ctx->application_data;
	application_data->current_input_length = input_buffer_length;

	if (UNEXPECTED(lxb_html_document_parse_chunk(document, encoding_output, encoded_length) != LXB_STATUS_OK)) {
		return false;
	}

	const bool has_reporter = ctx->tokenizer_error_reporter || ctx->tree_error_reporter;
	if (has_reporter) {
		lexbor_libxml2_bridge_report_errors(
			ctx,
			parser,
			encoding_output,
			application_data->current_total_offset,
			tokenizer_error_offset,
			tree_error_offset
		);
		/* Bring the position cache to the end of this chunk before moving on to the next one. */
		dom_find_line_and_column_using_cache(
			application_data,
			&application_data->cache_tokenizer,
			application_data->current_total_offset + input_buffer_length
		);
	}

	/* The next chunk starts a new input buffer: advance the total and restart the in-buffer offset. */
	application_data->current_total_offset += input_buffer_length;
	application_data->cache_tokenizer.last_offset = 0;

	return true;
}
514 
/*
 * UTF-8 to UTF-8 path: validates the input and passes valid stretches straight
 * to the parser. Every invalid sequence is dropped and the UTF-8 replacement
 * bytes are fed to the parser in its place.
 *
 * *buf_ref_ref is updated to the position reached, on success and on failure.
 * Returns false when handing a chunk to the parser failed, true otherwise.
 */
static bool dom_decode_encode_fast_path(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	const lxb_char_t **buf_ref_ref,
	const lxb_char_t *buf_end,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	const lxb_char_t *cursor = *buf_ref_ref;
	/* Start of the bytes that were validated but not yet handed to the parser */
	const lxb_char_t *pending_start = cursor;
	bool success = false;

	while (cursor != buf_end) {
		/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
		if (decoding_encoding_ctx->decode.u.utf_8.need == 0 && *cursor < 0x80) {
			/* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we
			 * need more UTF-8 bytes to complete a sequence.
			 * It might be tempting to use SIMD here, but it turns out that this is less efficient because
			 * we need to process the same byte multiple times sometimes when mixing ASCII with multibyte. */
			cursor++;
			continue;
		}

		const lxb_char_t *sequence_start = cursor;
		lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &cursor, buf_end);
		/* NOTE(review): every result above the maximum codepoint is handled as invalid data here;
		 * confirm this is the intended handling for a sequence that is cut off at buf_end. */
		if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {
			/* The valid bytes stop where the bad sequence began; the input length still covers the skipped bytes. */
			if (!dom_process_parse_chunk(
				ctx,
				document,
				parser,
				sequence_start - pending_start,
				pending_start,
				cursor - pending_start,
				tokenizer_error_offset,
				tree_error_offset
			)) {
				goto done;
			}
			/* Invalid data is replaced by the UTF-8 replacement bytes, which consume no input */
			if (!dom_process_parse_chunk(
				ctx,
				document,
				parser,
				LXB_ENCODING_REPLACEMENT_SIZE,
				LXB_ENCODING_REPLACEMENT_BYTES,
				0,
				tokenizer_error_offset,
				tree_error_offset
			)) {
				goto done;
			}
			pending_start = cursor;
		}
	}

	/* Hand over what is left after the last invalid sequence, if anything */
	if (cursor != pending_start
		&& !dom_process_parse_chunk(
			ctx,
			document,
			parser,
			cursor - pending_start,
			pending_start,
			cursor - pending_start,
			tokenizer_error_offset,
			tree_error_offset
	)) {
		goto done;
	}

	success = true;

done:
	*buf_ref_ref = cursor;
	return success;
}
588 
/*
 * Generic path: decodes the input into codepoints, encodes those with the
 * context's encoder and feeds the resulting bytes to the parser. Both steps
 * are repeated for as long as their buffer turns out to be too small.
 *
 * *buf_ref_ref is updated to the position reached, on success and on failure.
 * Returns false when handing a chunk to the parser failed, true otherwise.
 */
static bool dom_decode_encode_slow_path(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	const lxb_char_t **buf_ref_ref,
	const lxb_char_t *buf_end,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	const lxb_char_t *cursor = *buf_ref_ref;
	bool success = false;
	lexbor_status_t decode_status;

	do {
		decode_status = decoding_encoding_ctx->decode_data->decode(&decoding_encoding_ctx->decode, &cursor, buf_end);

		const size_t decoded_count = lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode);
		const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) decoding_encoding_ctx->codepoints;
		const lxb_codepoint_t *codepoints_end = decoding_encoding_ctx->codepoints + decoded_count;

		lexbor_status_t encode_status;
		do {
			encode_status = decoding_encoding_ctx->encode_data->encode(&decoding_encoding_ctx->encode, &codepoints_ref, codepoints_end);
			ZEND_ASSERT(encode_status != LXB_STATUS_ERROR && "parameters and replacements should be valid");

			if (!dom_process_parse_chunk(
				ctx,
				document,
				parser,
				lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode),
				decoding_encoding_ctx->encoding_output,
				decoded_count,
				tokenizer_error_offset,
				tree_error_offset
			)) {
				goto done;
			}

			/* The output buffer was consumed, start filling it from the beginning again */
			lxb_encoding_encode_buf_used_set(&decoding_encoding_ctx->encode, 0);
		} while (encode_status == LXB_STATUS_SMALL_BUFFER);

		/* Same for the codepoint buffer */
		lxb_encoding_decode_buf_used_set(&decoding_encoding_ctx->decode, 0);
	} while (decode_status == LXB_STATUS_SMALL_BUFFER);

	success = true;

done:
	*buf_ref_ref = cursor;
	return success;
}
633 
/*
 * Processes one input buffer, using the fast or the slow conversion path
 * depending on decoding_encoding_ctx->fast_path. Arguments and return value
 * are passed through unchanged.
 */
static bool dom_parse_decode_encode_step(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	const lxb_char_t **buf_ref_ref,
	const lxb_char_t *buf_end,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	if (!decoding_encoding_ctx->fast_path) {
		return dom_decode_encode_slow_path(
			ctx, document, parser,
			buf_ref_ref, buf_end,
			decoding_encoding_ctx,
			tokenizer_error_offset, tree_error_offset
		);
	}

	return dom_decode_encode_fast_path(
		ctx, document, parser,
		buf_ref_ref, buf_end,
		decoding_encoding_ctx,
		tokenizer_error_offset, tree_error_offset
	);
}
669 
/*
 * Flushes what the decoder and the encoder still hold after all input has been
 * processed.
 *
 * On the slow path the decoder is finished first; any codepoints this produces
 * are encoded and fed to the parser. After that the encoder is finished and any
 * bytes that this adds are fed to the parser as well.
 *
 * Returns false when handing a chunk to the parser failed, true otherwise.
 */
static bool dom_parse_decode_encode_finish(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	if (!decoding_encoding_ctx->fast_path) {
		/* Fast path handles codepoints one by one, so this part is not applicable in that case */
		(void) lxb_encoding_decode_finish(&decoding_encoding_ctx->decode);
		size_t decoding_buffer_size = lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode);
		if (decoding_buffer_size > 0) {
			const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) decoding_encoding_ctx->codepoints;
			const lxb_codepoint_t *codepoints_end = codepoints_ref + decoding_buffer_size;
			(void) decoding_encoding_ctx->encode_data->encode(&decoding_encoding_ctx->encode, &codepoints_ref, codepoints_end);
			if (!dom_process_parse_chunk(
				ctx,
				document,
				parser,
				lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode),
				decoding_encoding_ctx->encoding_output,
				decoding_buffer_size,
				tokenizer_error_offset,
				tree_error_offset
			)) {
				return false;
			}
			/* This chunk has been consumed. Mark both buffers as empty, like the slow path does after
			 * each chunk, so that the flush below only sees bytes added by finishing the encoder and
			 * does not feed the same output to the parser a second time. */
			lxb_encoding_encode_buf_used_set(&decoding_encoding_ctx->encode, 0);
			lxb_encoding_decode_buf_used_set(&decoding_encoding_ctx->decode, 0);
		}
	}
	(void) lxb_encoding_encode_finish(&decoding_encoding_ctx->encode);
	if (lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode)
		&& !dom_process_parse_chunk(
			ctx,
			document,
			parser,
			lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode),
			decoding_encoding_ctx->encoding_output,
			lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode),
			tokenizer_error_offset,
			tree_error_offset
	)) {
		return false;
	}
	return true;
}
717 
/* Verifies that `options` only contains flags the HTML5 loaders understand.
 * On an unknown flag, a ValueError is raised for argument `arg_num` and false
 * is returned; otherwise returns true. */
static bool check_options_validity(uint32_t arg_num, zend_long options)
{
	const zend_long allowed_mask = XML_PARSE_NOERROR | XML_PARSE_COMPACT | HTML_PARSE_NOIMPLIED | DOM_HTML_NO_DEFAULT_NS;
	const zend_long unknown_bits = options & ~allowed_mask;

	if (unknown_bits == 0) {
		return true;
	}

	zend_argument_value_error(arg_num, "contains invalid flags (allowed flags: "
									   "LIBXML_NOERROR, "
									   "LIBXML_COMPACT, "
									   "LIBXML_HTML_NOIMPLIED, "
									   "Dom\\NO_DEFAULT_NS)");
	return false;
}
731 
PHP_METHOD(Dom_HTMLDocument,createEmpty)732 PHP_METHOD(Dom_HTMLDocument, createEmpty)
733 {
734 	const char *encoding = "UTF-8";
735 	size_t encoding_len = strlen("UTF-8");
736 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|p", &encoding, &encoding_len) == FAILURE) {
737 		RETURN_THROWS();
738 	}
739 
740 	const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) encoding, encoding_len);
741 
742 	if (encoding_data == NULL) {
743 		zend_argument_value_error(1, "must be a valid document encoding");
744 		RETURN_THROWS();
745 	}
746 
747 	xmlDocPtr lxml_doc = php_dom_create_html_doc();
748 	if (UNEXPECTED(lxml_doc == NULL)) {
749 		goto oom;
750 	}
751 
752 	lxml_doc->encoding = xmlStrdup((const xmlChar *) encoding);
753 
754 	dom_object *intern = php_dom_instantiate_object_helper(
755 		return_value,
756 		dom_html_document_class_entry,
757 		(xmlNodePtr) lxml_doc,
758 		NULL
759 	);
760 	dom_set_xml_class(intern->document);
761 	intern->document->private_data = php_dom_libxml_private_data_header(php_dom_private_data_create());
762 	return;
763 
764 oom:
765 	php_dom_throw_error(INVALID_STATE_ERR, true);
766 	RETURN_THROWS();
767 }
768 
769 /* Only bother to register error handling when the error reports can become observable. */
dom_should_register_error_handlers(zend_long options)770 static bool dom_should_register_error_handlers(zend_long options)
771 {
772 	if (options & XML_PARSE_NOERROR) {
773 		return false;
774 	}
775 
776 	return php_libxml_uses_internal_errors() || ((EG(error_reporting) | EG(user_error_handler_error_reporting)) & E_WARNING);
777 }
778 
PHP_METHOD(Dom_HTMLDocument,createFromString)779 PHP_METHOD(Dom_HTMLDocument, createFromString)
780 {
781 	const char *source, *override_encoding = NULL;
782 	size_t source_len, override_encoding_len;
783 	zend_long options = 0;
784 	if (zend_parse_parameters(
785 		ZEND_NUM_ARGS(),
786 		"s|lp!",
787 		&source,
788 		&source_len,
789 		&options,
790 		&override_encoding,
791 		&override_encoding_len
792 	) == FAILURE) {
793 		RETURN_THROWS();
794 	}
795 
796 	if (!check_options_validity(2, options)) {
797 		RETURN_THROWS();
798 	}
799 
800 	dom_lexbor_libxml2_bridge_application_data application_data;
801 	application_data.input_name = "Entity";
802 	application_data.current_total_offset = 0;
803 	application_data.html_no_implied = options & HTML_PARSE_NOIMPLIED;
804 	dom_reset_line_column_cache(&application_data.cache_tokenizer);
805 	lexbor_libxml2_bridge_parse_context ctx;
806 	lexbor_libxml2_bridge_parse_context_init(&ctx);
807 	if (dom_should_register_error_handlers(options)) {
808 		lexbor_libxml2_bridge_parse_set_error_callbacks(
809 			&ctx,
810 			dom_lexbor_libxml2_bridge_tokenizer_error_reporter,
811 			dom_lexbor_libxml2_bridge_tree_error_reporter
812 		);
813 	}
814 	ctx.application_data = &application_data;
815 
816 	size_t tokenizer_error_offset = 0;
817 	size_t tree_error_offset = 0;
818 
819 	/* Setup everything encoding & decoding related */
820 	const lxb_char_t *buf_ref = (const lxb_char_t *) source;
821 	dom_decoding_encoding_ctx decoding_encoding_ctx;
822 	dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
823 	if (override_encoding != NULL) {
824 		const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
825 			(const lxb_char_t *) override_encoding,
826 			override_encoding_len
827 		);
828 		if (!encoding_data) {
829 			zend_argument_value_error(3, "must be a valid document encoding");
830 			RETURN_THROWS();
831 		}
832 		dom_setup_parser_encoding_manually(buf_ref, encoding_data, &decoding_encoding_ctx, &application_data);
833 	} else {
834 		dom_setup_parser_encoding_implicitly(&buf_ref, &source_len, &decoding_encoding_ctx, &application_data);
835 	}
836 
837 	lxb_html_document_t *document = lxb_html_document_create();
838 	if (UNEXPECTED(document == NULL)) {
839 		goto fail_oom;
840 	}
841 
842 	lxb_status_t lexbor_status = lxb_html_document_parse_chunk_begin(document);
843 	if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
844 		goto fail_oom;
845 	}
846 
847 	lxb_html_parser_t *parser = document->dom_document.parser;
848 
849 	while (source_len > 0) {
850 		size_t chunk_size = source_len;
851 		const size_t MAX_CHUNK_SIZE = sizeof(decoding_encoding_ctx.encoding_output) / sizeof(*decoding_encoding_ctx.encoding_output);
852 		if (chunk_size > MAX_CHUNK_SIZE) {
853 			chunk_size = MAX_CHUNK_SIZE;
854 		}
855 		source_len -= chunk_size;
856 
857 		const lxb_char_t *buf_end = buf_ref + chunk_size;
858 		bool result = dom_parse_decode_encode_step(
859 			&ctx,
860 			document,
861 			parser,
862 			&buf_ref,
863 			buf_end,
864 			&decoding_encoding_ctx,
865 			&tokenizer_error_offset,
866 			&tree_error_offset
867 		);
868 		if (!result) {
869 			goto fail_oom;
870 		}
871 	}
872 
873 	if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
874 		goto fail_oom;
875 	}
876 
877 	lexbor_status = lxb_html_document_parse_chunk_end(document);
878 	if (lexbor_status != LXB_STATUS_OK) {
879 		goto fail_oom;
880 	}
881 
882 	php_dom_private_data *private_data = php_dom_private_data_create();
883 
884 	xmlDocPtr lxml_doc;
885 	lexbor_libxml2_bridge_status bridge_status = lexbor_libxml2_bridge_convert_document(
886 		document,
887 		&lxml_doc,
888 		options & XML_PARSE_COMPACT,
889 		!(options & DOM_HTML_NO_DEFAULT_NS),
890 		private_data
891 	);
892 	lexbor_libxml2_bridge_copy_observations(parser->tree, &ctx.observations);
893 	if (UNEXPECTED(bridge_status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) {
894 		php_dom_private_data_destroy(private_data);
895 		php_libxml_ctx_error(
896 			NULL,
897 			"%s in %s",
898 			dom_lexbor_libxml2_bridge_status_code_to_string(bridge_status),
899 			application_data.input_name
900 		);
901 		lxb_html_document_destroy(document);
902 		RETURN_FALSE;
903 	}
904 	lxb_html_document_destroy(document);
905 
906 	dom_post_process_html5_loading(lxml_doc, options, &ctx.observations);
907 
908 	if (decoding_encoding_ctx.decode_data) {
909 		lxml_doc->encoding = xmlStrdup((const xmlChar *) decoding_encoding_ctx.decode_data->name);
910 	} else {
911 		lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
912 	}
913 
914 	dom_object *intern = php_dom_instantiate_object_helper(
915 		return_value,
916 		dom_html_document_class_entry,
917 		(xmlNodePtr) lxml_doc,
918 		NULL
919 	);
920 	dom_set_xml_class(intern->document);
921 	intern->document->quirks_mode = ctx.observations.quirks_mode;
922 	intern->document->private_data = php_dom_libxml_private_data_header(private_data);
923 	return;
924 
925 fail_oom:
926 	lxb_html_document_destroy(document);
927 	php_dom_throw_error(INVALID_STATE_ERR, true);
928 	RETURN_THROWS();
929 }
930 
PHP_METHOD(Dom_HTMLDocument,createFromFile)931 PHP_METHOD(Dom_HTMLDocument, createFromFile)
932 {
933 	const char *filename, *override_encoding = NULL;
934 	php_dom_private_data *private_data = NULL;
935 	size_t filename_len, override_encoding_len;
936 	zend_long options = 0;
937 	php_stream *stream = NULL;
938 	if (zend_parse_parameters(
939 		ZEND_NUM_ARGS(),
940 		"p|lp!",
941 		&filename,
942 		&filename_len,
943 		&options,
944 		&override_encoding,
945 		&override_encoding_len
946 	) == FAILURE) {
947 		RETURN_THROWS();
948 	}
949 
950 	/* See php_libxml_streams_IO_open_wrapper(), apparently this caused issues in the past. */
951 	if (strstr(filename, "%00")) {
952 		zend_argument_value_error(1, "must not contain percent-encoded NUL bytes");
953 		RETURN_THROWS();
954 	}
955 
956 	if (!check_options_validity(2, options)) {
957 		RETURN_THROWS();
958 	}
959 
960 	dom_lexbor_libxml2_bridge_application_data application_data;
961 	application_data.input_name = filename;
962 	application_data.current_total_offset = 0;
963 	application_data.html_no_implied = options & HTML_PARSE_NOIMPLIED;
964 	dom_reset_line_column_cache(&application_data.cache_tokenizer);
965 	lexbor_libxml2_bridge_parse_context ctx;
966 	lexbor_libxml2_bridge_parse_context_init(&ctx);
967 	if (dom_should_register_error_handlers(options)) {
968 		lexbor_libxml2_bridge_parse_set_error_callbacks(
969 			&ctx,
970 			dom_lexbor_libxml2_bridge_tokenizer_error_reporter,
971 			dom_lexbor_libxml2_bridge_tree_error_reporter
972 		);
973 	}
974 	ctx.application_data = &application_data;
975 
976 	char buf[4096];
977 
978 	/* Setup everything encoding & decoding related */
979 	dom_decoding_encoding_ctx decoding_encoding_ctx;
980 	dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
981 	bool should_determine_encoding_implicitly = true; /* First read => determine encoding implicitly */
982 	if (override_encoding != NULL) {
983 		const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
984 			(const lxb_char_t *) override_encoding,
985 			override_encoding_len
986 		);
987 		if (!encoding_data) {
988 			zend_argument_value_error(3, "must be a valid document encoding");
989 			RETURN_THROWS();
990 		}
991 		should_determine_encoding_implicitly = false;
992 		dom_setup_parser_encoding_manually((const lxb_char_t *) buf, encoding_data, &decoding_encoding_ctx, &application_data);
993 	}
994 
995 	zend_string *opened_path = NULL;
996 	stream = php_stream_open_wrapper_ex(filename, "rb", REPORT_ERRORS, &opened_path, php_libxml_get_stream_context());
997 	if (!stream) {
998 		if (!EG(exception)) {
999 			zend_throw_exception_ex(NULL, 0, "Cannot open file '%s'", filename);
1000 		}
1001 		RETURN_THROWS();
1002 	}
1003 
1004 	/* MIME sniff */
1005 	if (should_determine_encoding_implicitly) {
1006 		zend_string *charset = php_libxml_sniff_charset_from_stream(stream);
1007 		if (charset != NULL) {
1008 			const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
1009 				(const lxb_char_t *) ZSTR_VAL(charset),
1010 				ZSTR_LEN(charset)
1011 			);
1012 			if (encoding_data != NULL) {
1013 				should_determine_encoding_implicitly = false;
1014 				dom_setup_parser_encoding_manually(
1015 					(const lxb_char_t *) buf,
1016 					encoding_data,
1017 					&decoding_encoding_ctx,
1018 					&application_data
1019 				);
1020 			}
1021 			zend_string_release_ex(charset, false);
1022 		}
1023 	}
1024 
1025 	lxb_html_document_t *document = lxb_html_document_create();
1026 	if (UNEXPECTED(document == NULL)) {
1027 		goto fail_oom;
1028 	}
1029 
1030 	lxb_status_t lexbor_status = lxb_html_document_parse_chunk_begin(document);
1031 	if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
1032 		goto fail_oom;
1033 	}
1034 
1035 	size_t tokenizer_error_offset = 0;
1036 	size_t tree_error_offset = 0;
1037 	ssize_t read;
1038 	lxb_html_parser_t *parser = document->dom_document.parser;
1039 
1040 	while ((read = php_stream_read(stream, buf, sizeof(buf))) > 0) {
1041 		const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
1042 
1043 		if (should_determine_encoding_implicitly) {
1044 			should_determine_encoding_implicitly = false;
1045 			dom_setup_parser_encoding_implicitly(&buf_ref, (size_t *) &read, &decoding_encoding_ctx, &application_data);
1046 		}
1047 
1048 		const lxb_char_t *buf_end = buf_ref + read;
1049 		bool result = dom_parse_decode_encode_step(
1050 			&ctx,
1051 			document,
1052 			parser,
1053 			&buf_ref,
1054 			buf_end,
1055 			&decoding_encoding_ctx,
1056 			&tokenizer_error_offset,
1057 			&tree_error_offset
1058 		);
1059 		if (!result) {
1060 			goto fail_oom;
1061 		}
1062 	}
1063 
1064 	if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
1065 		goto fail_oom;
1066 	}
1067 
1068 	lexbor_status = lxb_html_document_parse_chunk_end(document);
1069 	if (lexbor_status != LXB_STATUS_OK) {
1070 		goto fail_oom;
1071 	}
1072 
1073 	private_data = php_dom_private_data_create();
1074 
1075 	xmlDocPtr lxml_doc;
1076 	lexbor_libxml2_bridge_status bridge_status = lexbor_libxml2_bridge_convert_document(
1077 		document,
1078 		&lxml_doc,
1079 		options & XML_PARSE_COMPACT,
1080 		!(options & DOM_HTML_NO_DEFAULT_NS),
1081 		private_data
1082 	);
1083 	lexbor_libxml2_bridge_copy_observations(parser->tree, &ctx.observations);
1084 	if (UNEXPECTED(bridge_status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) {
1085 		php_libxml_ctx_error(NULL, "%s in %s", dom_lexbor_libxml2_bridge_status_code_to_string(bridge_status), filename);
1086 		RETVAL_FALSE;
1087 		goto fail_general;
1088 	}
1089 	lxb_html_document_destroy(document);
1090 
1091 	dom_post_process_html5_loading(lxml_doc, options, &ctx.observations);
1092 
1093 	if (decoding_encoding_ctx.decode_data) {
1094 		lxml_doc->encoding = xmlStrdup((const xmlChar *) decoding_encoding_ctx.decode_data->name);
1095 	} else {
1096 		lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
1097 	}
1098 
1099 	if (stream->wrapper == &php_plain_files_wrapper && opened_path != NULL) {
1100 		xmlChar *converted = xmlPathToURI((const xmlChar *) ZSTR_VAL(opened_path));
1101 		if (UNEXPECTED(!converted)) {
1102 			goto fail_oom;
1103 		}
1104 		/* Check for "file:/" instead of "file://" because of libxml2 quirk */
1105 		if (strncmp((const char *) converted, "file:/", sizeof("file:/") - 1) != 0) {
1106 			xmlChar *buffer = xmlStrdup((const xmlChar *) "file://");
1107 			if (UNEXPECTED(!buffer)) {
1108 				xmlFree(converted);
1109 				goto fail_oom;
1110 			}
1111 			xmlChar *new_buffer = xmlStrcat(buffer, converted);
1112 			if (UNEXPECTED(!new_buffer)) {
1113 				xmlFree(buffer);
1114 				xmlFree(converted);
1115 				goto fail_oom;
1116 			}
1117 			xmlFree(converted);
1118 			lxml_doc->URL = new_buffer;
1119 		} else {
1120 #ifdef PHP_WIN32
1121 			converted = php_dom_libxml_fix_file_path(converted);
1122 #endif
1123 			lxml_doc->URL = converted;
1124 		}
1125 	} else {
1126 		lxml_doc->URL = xmlStrdup((const xmlChar *) filename);
1127 	}
1128 
1129 	if (opened_path != NULL) {
1130 		zend_string_release_ex(opened_path, false);
1131 	}
1132 	php_stream_close(stream);
1133 	stream = NULL;
1134 
1135 	dom_object *intern = php_dom_instantiate_object_helper(
1136 		return_value,
1137 		dom_html_document_class_entry,
1138 		(xmlNodePtr) lxml_doc,
1139 		NULL
1140 	);
1141 	dom_set_xml_class(intern->document);
1142 	intern->document->quirks_mode = ctx.observations.quirks_mode;
1143 	intern->document->private_data = php_dom_libxml_private_data_header(private_data);
1144 	return;
1145 
1146 fail_oom:
1147 	php_dom_throw_error(INVALID_STATE_ERR, true);
1148 fail_general:
1149 	if (private_data != NULL) {
1150 		php_dom_private_data_destroy(private_data);
1151 	}
1152 	lxb_html_document_destroy(document);
1153 	php_stream_close(stream);
1154 	if (opened_path != NULL) {
1155 		zend_string_release_ex(opened_path, false);
1156 	}
1157 }
1158 
dom_write_output_smart_str(void * ctx,const char * buf,size_t size)1159 static zend_result dom_write_output_smart_str(void *ctx, const char *buf, size_t size)
1160 {
1161 	smart_str_appendl((smart_str *) ctx, buf, size);
1162 	return SUCCESS;
1163 }
1164 
dom_write_output_stream(void * application_data,const char * buf,size_t len)1165 static zend_result dom_write_output_stream(void *application_data, const char *buf, size_t len)
1166 {
1167 	php_stream *stream = (php_stream *) application_data;
1168 	if (UNEXPECTED(php_stream_write(stream, buf, len) < 0)) {
1169 		return FAILURE;
1170 	}
1171 	return SUCCESS;
1172 }
1173 
dom_saveHTML_write_string_len(void * application_data,const char * buf,size_t len)1174 static zend_result dom_saveHTML_write_string_len(void *application_data, const char *buf, size_t len)
1175 {
1176 	dom_output_ctx *output = (dom_output_ctx *) application_data;
1177 	lxb_status_t decode_status, encode_status;
1178 	const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
1179 	const lxb_char_t *buf_end = buf_ref + len;
1180 
1181 	do {
1182 		decode_status = output->decoding_data->decode(output->decode, &buf_ref, buf_end);
1183 
1184 		const lxb_codepoint_t *codepoints_ref = output->codepoints;
1185 		const lxb_codepoint_t *codepoints_end = codepoints_ref + lxb_encoding_decode_buf_used(output->decode);
1186 		do {
1187 			encode_status = output->encoding_data->encode(output->encode, &codepoints_ref, codepoints_end);
1188 			if (UNEXPECTED(output->write_output(
1189 				output->output_data,
1190 				(const char *) output->encoding_output,
1191 				lxb_encoding_encode_buf_used(output->encode)
1192 			) != SUCCESS)) {
1193 				return FAILURE;
1194 			}
1195 			lxb_encoding_encode_buf_used_set(output->encode, 0);
1196 		} while (encode_status == LXB_STATUS_SMALL_BUFFER);
1197 		lxb_encoding_decode_buf_used_set(output->decode, 0);
1198 	} while (decode_status == LXB_STATUS_SMALL_BUFFER);
1199 
1200 	return SUCCESS;
1201 }
1202 
dom_saveHTML_write_string(void * application_data,const char * buf)1203 static zend_result dom_saveHTML_write_string(void *application_data, const char *buf)
1204 {
1205 	return dom_saveHTML_write_string_len(application_data, buf, strlen(buf));
1206 }
1207 
dom_common_save(dom_output_ctx * output_ctx,dom_object * intern,const xmlDoc * docp,const xmlNode * node)1208 static zend_result dom_common_save(dom_output_ctx *output_ctx, dom_object *intern, const xmlDoc *docp, const xmlNode *node)
1209 {
1210 	/* Initialize everything related to encoding & decoding */
1211 	const lxb_encoding_data_t *decoding_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
1212 	const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
1213 		(const lxb_char_t *) docp->encoding,
1214 		strlen((const char *) docp->encoding)
1215 	);
1216 	lxb_encoding_encode_t encode;
1217 	lxb_encoding_decode_t decode;
1218 	lxb_char_t encoding_output[4096];
1219 	lxb_codepoint_t codepoints[4096];
1220 	(void) lxb_encoding_encode_init(&encode, encoding_data, encoding_output, sizeof(encoding_output) / sizeof(*encoding_output));
1221 	(void) lxb_encoding_decode_init(&decode, decoding_data, codepoints, sizeof(codepoints) / sizeof(*codepoints));
1222 	if (encoding_data->encoding == LXB_ENCODING_UTF_8) {
1223 		lxb_encoding_encode_replace_set(&encode, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
1224 	} else {
1225 		/* Fallback if there is no replacement by default */
1226 		lxb_encoding_encode_replace_set(&encode, (const lxb_char_t *) "?", 1);
1227 	}
1228 	lxb_encoding_decode_replace_set(&decode, LXB_ENCODING_REPLACEMENT_BUFFER, LXB_ENCODING_REPLACEMENT_BUFFER_LEN);
1229 
1230 	output_ctx->encoding_data = encoding_data;
1231 	output_ctx->decoding_data = decoding_data;
1232 	output_ctx->encode = &encode;
1233 	output_ctx->decode = &decode;
1234 	output_ctx->codepoints = codepoints;
1235 	output_ctx->encoding_output = encoding_output;
1236 
1237 	dom_html5_serialize_context ctx;
1238 	ctx.write_string_len = dom_saveHTML_write_string_len;
1239 	ctx.write_string = dom_saveHTML_write_string;
1240 	ctx.application_data = output_ctx;
1241 	ctx.private_data = php_dom_get_private_data(intern);
1242 	if (UNEXPECTED(dom_html5_serialize_outer(&ctx, node) != SUCCESS)) {
1243 		return FAILURE;
1244 	}
1245 
1246 	(void) lxb_encoding_decode_finish(&decode);
1247 	if (lxb_encoding_decode_buf_used(&decode)) {
1248 		const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) codepoints;
1249 		(void) encoding_data->encode(&encode, &codepoints_ref, codepoints_ref + lxb_encoding_decode_buf_used(&decode));
1250 		if (UNEXPECTED(output_ctx->write_output(
1251 			output_ctx->output_data,
1252 			(const char *) encoding_output,
1253 			lxb_encoding_encode_buf_used(&encode)) != SUCCESS
1254 		)) {
1255 			return FAILURE;
1256 		}
1257 	}
1258 	(void) lxb_encoding_encode_finish(&encode);
1259 	if (lxb_encoding_encode_buf_used(&encode)) {
1260 		if (UNEXPECTED(output_ctx->write_output(
1261 			output_ctx->output_data,
1262 			(const char *) encoding_output,
1263 			lxb_encoding_encode_buf_used(&encode)) != SUCCESS
1264 		)) {
1265 			return FAILURE;
1266 		}
1267 	}
1268 
1269 	return SUCCESS;
1270 }
1271 
PHP_METHOD(Dom_HTMLDocument,saveHtmlFile)1272 PHP_METHOD(Dom_HTMLDocument, saveHtmlFile)
1273 {
1274 	zval *id;
1275 	xmlDoc *docp;
1276 	size_t file_len;
1277 	dom_object *intern;
1278 	char *file;
1279 
1280 	id = ZEND_THIS;
1281 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "p", &file, &file_len) == FAILURE) {
1282 		RETURN_THROWS();
1283 	}
1284 
1285 	if (file_len == 0) {
1286 		zend_argument_must_not_be_empty_error(1);
1287 		RETURN_THROWS();
1288 	}
1289 
1290 	php_stream *stream = php_stream_open_wrapper_ex(file, "wb", REPORT_ERRORS, /* opened_path */ NULL, php_libxml_get_stream_context());
1291 	if (!stream) {
1292 		RETURN_FALSE;
1293 	}
1294 
1295 	DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
1296 
1297 	dom_output_ctx output_ctx;
1298 	output_ctx.output_data = stream;
1299 	output_ctx.write_output = dom_write_output_stream;
1300 	if (UNEXPECTED(dom_common_save(&output_ctx, intern, docp, (const xmlNode *) docp) != SUCCESS)) {
1301 		php_stream_close(stream);
1302 		RETURN_FALSE;
1303 	}
1304 
1305 	zend_long bytes = php_stream_tell(stream);
1306 	php_stream_close(stream);
1307 
1308 	RETURN_LONG(bytes);
1309 }
1310 
PHP_METHOD(Dom_HTMLDocument,saveHtml)1311 PHP_METHOD(Dom_HTMLDocument, saveHtml)
1312 {
1313 	zval *nodep = NULL;
1314 	const xmlDoc *docp;
1315 	const xmlNode *node;
1316 	dom_object *intern, *nodeobj;
1317 
1318 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|O!", &nodep, dom_modern_node_class_entry) == FAILURE) {
1319 		RETURN_THROWS();
1320 	}
1321 
1322 	DOM_GET_OBJ(docp, ZEND_THIS, xmlDocPtr, intern);
1323 
1324 	if (nodep != NULL) {
1325 		DOM_GET_OBJ(node, nodep, xmlNodePtr, nodeobj);
1326 		if (node->doc != docp) {
1327 			php_dom_throw_error(WRONG_DOCUMENT_ERR, true);
1328 			RETURN_THROWS();
1329 		}
1330 	} else {
1331 		node = (const xmlNode *) docp;
1332 	}
1333 
1334 	smart_str buf = {0};
1335 	dom_output_ctx output_ctx;
1336 	output_ctx.output_data = &buf;
1337 	output_ctx.write_output = dom_write_output_smart_str;
1338 	/* Can't fail because dom_write_output_smart_str() can't fail. */
1339 	zend_result result = dom_common_save(&output_ctx, intern, docp, node);
1340 	ZEND_ASSERT(result == SUCCESS);
1341 
1342 	RETURN_STR(smart_str_extract(&buf));
1343 }
1344 
dom_html_document_encoding_write(dom_object * obj,zval * newval)1345 zend_result dom_html_document_encoding_write(dom_object *obj, zval *newval)
1346 {
1347 	DOM_PROP_NODE(xmlDocPtr, docp, obj);
1348 
1349 	/* Typed property, can only be IS_STRING. */
1350 	ZEND_ASSERT(Z_TYPE_P(newval) == IS_STRING);
1351 
1352 	zend_string *str = Z_STR_P(newval);
1353 	const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) ZSTR_VAL(str), ZSTR_LEN(str));
1354 
1355 	if (encoding_data != NULL) {
1356 		xmlFree(BAD_CAST docp->encoding);
1357 		docp->encoding = xmlStrdup((const xmlChar *) encoding_data->name);
1358 	} else {
1359 		zend_value_error("Invalid document encoding");
1360 		return FAILURE;
1361 	}
1362 
1363 	return SUCCESS;
1364 }
1365 
dom_html_document_element_read_raw(const xmlDoc * docp,bool (* accept)(const xmlChar *))1366 static xmlNodePtr dom_html_document_element_read_raw(const xmlDoc *docp, bool (*accept)(const xmlChar *))
1367 {
1368 	const xmlNode *root = xmlDocGetRootElement(docp);
1369 	if (root == NULL || !(php_dom_ns_is_fast(root, php_dom_ns_is_html_magic_token) && xmlStrEqual(root->name, BAD_CAST "html"))) {
1370 		return NULL;
1371 	}
1372 
1373 	xmlNodePtr cur = root->children;
1374 	while (cur != NULL) {
1375 		if (cur->type == XML_ELEMENT_NODE && php_dom_ns_is_fast(cur, php_dom_ns_is_html_magic_token) && accept(cur->name)) {
1376 			return cur;
1377 		}
1378 		cur = cur->next;
1379 	}
1380 
1381 	return NULL;
1382 }
1383 
dom_html_document_element_read_helper(dom_object * obj,zval * retval,bool (* accept)(const xmlChar *))1384 zend_result dom_html_document_element_read_helper(dom_object *obj, zval *retval, bool (*accept)(const xmlChar *))
1385 {
1386 	DOM_PROP_NODE(const xmlDoc *, docp, obj);
1387 
1388 	const xmlNode *element = dom_html_document_element_read_raw(docp, accept);
1389 	php_dom_create_nullable_object((xmlNodePtr) element, retval, obj);
1390 
1391 	return SUCCESS;
1392 }
1393 
dom_accept_body_name(const xmlChar * name)1394 static bool dom_accept_body_name(const xmlChar *name)
1395 {
1396 	return xmlStrEqual(name, BAD_CAST "body") || xmlStrEqual(name, BAD_CAST "frameset");
1397 }
1398 
dom_accept_head_name(const xmlChar * name)1399 static bool dom_accept_head_name(const xmlChar *name)
1400 {
1401 	return xmlStrEqual(name, BAD_CAST "head");
1402 }
1403 
1404 /* https://html.spec.whatwg.org/#dom-document-body */
dom_html_document_body_read(dom_object * obj,zval * retval)1405 zend_result dom_html_document_body_read(dom_object *obj, zval *retval)
1406 {
1407 	return dom_html_document_element_read_helper(obj, retval, dom_accept_body_name);
1408 }
1409 
1410 /* https://html.spec.whatwg.org/#dom-document-head */
dom_html_document_head_read(dom_object * obj,zval * retval)1411 zend_result dom_html_document_head_read(dom_object *obj, zval *retval)
1412 {
1413 	return dom_html_document_element_read_helper(obj, retval, dom_accept_head_name);
1414 }
1415 
1416 /* https://html.spec.whatwg.org/#dom-document-body */
dom_html_document_body_write(dom_object * obj,zval * newval)1417 zend_result dom_html_document_body_write(dom_object *obj, zval *newval)
1418 {
1419 	DOM_PROP_NODE(xmlDocPtr, docp, obj);
1420 
1421 	/* 1. If the new value is not a body or frameset element, then throw a "HierarchyRequestError" DOMException. */
1422 	if (Z_TYPE_P(newval) != IS_NULL) {
1423 		dom_object *newval_intern = Z_DOMOBJ_P(newval);
1424 		if (newval_intern->ptr != NULL) {
1425 			xmlNodePtr newval_node = ((php_libxml_node_ptr *) newval_intern->ptr)->node;
1426 			/* Note: because this property has type HTMLElement, we know the namespace is correct. */
1427 			if (dom_accept_body_name(newval_node->name)) {
1428 				/* 2. If the new value is the same as the body element, return. */
1429 				const xmlNode *current_body_element = dom_html_document_element_read_raw(docp, dom_accept_body_name);
1430 				if (current_body_element == newval_node) {
1431 					return SUCCESS;
1432 				}
1433 
1434 				/* 3. If the body element is not null, then replace the body element with the new value within the body element's parent and return. */
1435 				if (current_body_element != NULL) {
1436 					php_dom_adopt_node(newval_node, obj, docp);
1437 					xmlNodePtr old = xmlReplaceNode((xmlNodePtr) current_body_element, newval_node);
1438 					if (old != NULL && old->_private == NULL) {
1439 						php_libxml_node_free_resource(old);
1440 					}
1441 					return SUCCESS;
1442 				}
1443 
1444 				/* 4. If there is no document element, throw a "HierarchyRequestError" DOMException. */
1445 				xmlNodePtr root = xmlDocGetRootElement(docp);
1446 				if (root == NULL) {
1447 					php_dom_throw_error_with_message(HIERARCHY_REQUEST_ERR, "A body can only be set if there is a document element", true);
1448 					return FAILURE;
1449 				}
1450 
1451 				/* 5. Append the new value to the document element. */
1452 				php_dom_adopt_node(newval_node, obj, docp);
1453 				xmlAddChild(root, newval_node);
1454 				return SUCCESS;
1455 			}
1456 		}
1457 	}
1458 
1459 	php_dom_throw_error_with_message(HIERARCHY_REQUEST_ERR, "The new body must either be a body or a frameset tag", true);
1460 	return FAILURE;
1461 }
1462 
1463 /* https://dom.spec.whatwg.org/#concept-child-text-content */
dom_get_child_text_content(const xmlNode * node)1464 static zend_string *dom_get_child_text_content(const xmlNode *node)
1465 {
1466 	smart_str content = {0};
1467 
1468 	const xmlNode *text = node->children;
1469 	while (text != NULL) {
1470 		if ((text->type == XML_TEXT_NODE || text->type == XML_CDATA_SECTION_NODE) && text->content != NULL) {
1471 			smart_str_appends(&content, (const char *) text->content);
1472 		}
1473 		text = text->next;
1474 	}
1475 
1476 	return smart_str_extract(&content);
1477 }
1478 
1479 /* https://html.spec.whatwg.org/#the-title-element-2 */
dom_get_title_element(const xmlDoc * doc)1480 static xmlNodePtr dom_get_title_element(const xmlDoc *doc)
1481 {
1482 	xmlNodePtr node = doc->children;
1483 
1484 	while (node != NULL) {
1485 		if (node->type == XML_ELEMENT_NODE) {
1486 			if (php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token) && xmlStrEqual(node->name, BAD_CAST "title")) {
1487 				break;
1488 			}
1489 		}
1490 
1491 		node = php_dom_next_in_tree_order(node, NULL);
1492 	}
1493 
1494 	return node;
1495 }
1496 
1497 /* The subtle difference is that this is about the direct title descendant of the svg element,
1498  * whereas the html variant of this function is about the first in-tree title element. */
dom_get_svg_title_element(xmlNodePtr svg)1499 static xmlNodePtr dom_get_svg_title_element(xmlNodePtr svg)
1500 {
1501 	xmlNodePtr cur = svg->children;
1502 
1503 	while (cur != NULL) {
1504 		if (cur->type == XML_ELEMENT_NODE
1505 			&& php_dom_ns_is_fast(cur, php_dom_ns_is_svg_magic_token) && xmlStrEqual(cur->name, BAD_CAST "title")) {
1506 			break;
1507 		}
1508 		cur = cur->next;
1509 	}
1510 
1511 	return cur;
1512 }
1513 
1514 /* https://html.spec.whatwg.org/#document.title */
dom_html_document_title_read(dom_object * obj,zval * retval)1515 zend_result dom_html_document_title_read(dom_object *obj, zval *retval)
1516 {
1517 	DOM_PROP_NODE(const xmlDoc *, docp, obj);
1518 	xmlNodePtr root = xmlDocGetRootElement(docp);
1519 
1520 	if (root == NULL) {
1521 		ZVAL_EMPTY_STRING(retval);
1522 		return SUCCESS;
1523 	}
1524 
1525 	zend_string *value = zend_empty_string;
1526 
1527 	/* 1. If the document element is an SVG svg element,
1528 	 *    then let value be the child text content of the first SVG title element that is a child of the document element. */
1529 	if (php_dom_ns_is_fast(root, php_dom_ns_is_svg_magic_token) && xmlStrEqual(root->name, BAD_CAST "svg")) {
1530 		const xmlNode *title = dom_get_svg_title_element(root);
1531 		if (title != NULL) {
1532 			value = dom_get_child_text_content(title);
1533 		}
1534 	} else {
1535 		/* 2. Otherwise, let value be the child text content of the title element,
1536 		 *    or the empty string if the title element is null. */
1537 		const xmlNode *title = dom_get_title_element(docp);
1538 		if (title != NULL) {
1539 			value = dom_get_child_text_content(title);
1540 		}
1541 	}
1542 
1543 	/* 3. Strip and collapse ASCII whitespace in value. */
1544 	value = dom_strip_and_collapse_ascii_whitespace(value);
1545 
1546 	/* 4. Return value. */
1547 	ZVAL_STR(retval, value);
1548 
1549 	return SUCCESS;
1550 }
1551 
/* Implements "string replace all" (https://dom.spec.whatwg.org/#string-replace-all) within `element`.
 *
 * docp:    document in which the replacement text node is created.
 * element: parent whose children are replaced.
 * zv:      zval holding the replacement string.
 *
 * All existing children of `element` are removed first. Per the specification, a text node is
 * only created when the string is not the empty string; for an empty string the element is
 * simply left without children instead of receiving an empty text node. */
static void dom_string_replace_all(xmlDocPtr docp, xmlNodePtr element, zval *zv)
{
	dom_remove_all_children(element);

	/* 2. If string is not the empty string, then set node to a new Text node. */
	if (Z_STRLEN_P(zv) == 0) {
		return;
	}

	xmlNode *text = xmlNewDocText(docp, BAD_CAST Z_STRVAL_P(zv));
	/* xmlAddChild() is a no-op returning NULL if the text node could not be allocated. */
	xmlAddChild(element, text);
}
1558 
1559 /* https://html.spec.whatwg.org/#document.title */
dom_html_document_title_write(dom_object * obj,zval * newval)1560 zend_result dom_html_document_title_write(dom_object *obj, zval *newval)
1561 {
1562 	DOM_PROP_NODE(xmlDocPtr, docp, obj);
1563 	xmlNodePtr root = xmlDocGetRootElement(docp);
1564 
1565 	if (root == NULL) {
1566 		return SUCCESS;
1567 	}
1568 
1569 	/* If the document element is an SVG svg element */
1570 	if (php_dom_ns_is_fast(root, php_dom_ns_is_svg_magic_token) && xmlStrEqual(root->name, BAD_CAST "svg")) {
1571 		/* 1. If there is an SVG title element that is a child of the document element, let element be the first such element. */
1572 		xmlNodePtr element = dom_get_svg_title_element(root);
1573 
1574 		/* 2. Otherwise: */
1575 		if (element == NULL) {
1576 			/* 2.1. Let element be the result of creating an element given the document element's node document,
1577 			 *      title, and the SVG namespace. */
1578 
1579 			/* Annoyingly, we must create it in the svg namespace _without_ prefix... */
1580 			xmlNsPtr ns = root->ns;
1581 			if (ns->prefix != NULL) {
1582 				/* Slow path... */
1583 				php_dom_libxml_ns_mapper *ns_mapper = php_dom_get_ns_mapper(obj);
1584 				zend_string *href = ZSTR_INIT_LITERAL(DOM_SVG_NS_URI, false);
1585 				ns = php_dom_libxml_ns_mapper_get_ns(ns_mapper, zend_empty_string, href);
1586 				zend_string_release_ex(href, false);
1587 			}
1588 
1589 			element = xmlNewDocNode(docp, ns, BAD_CAST "title", NULL);
1590 			if (UNEXPECTED(element == NULL)) {
1591 				php_dom_throw_error(INVALID_STATE_ERR, true);
1592 				return FAILURE;
1593 			}
1594 
1595 			/* 2.2. Insert element as the first child of the document element. */
1596 			if (root->children == NULL) {
1597 				root->last = element;
1598 			} else {
1599 				element->next = root->children;
1600 				root->children->prev = element;
1601 			}
1602 			root->children = element;
1603 			element->parent = root;
1604 		}
1605 
1606 		/* 3. String replace all with the given value within element. */
1607 		dom_string_replace_all(docp, element, newval);
1608 	}
1609 	/* If the document element is in the HTML namespace */
1610 	else if (php_dom_ns_is_fast(root, php_dom_ns_is_html_magic_token)) {
1611 		/* 1. If the title element is null and the head element is null, then return. */
1612 		xmlNodePtr title = dom_get_title_element(docp);
1613 		xmlNodePtr head = dom_html_document_element_read_raw(docp, dom_accept_head_name);
1614 		if (title == NULL && head == NULL) {
1615 			return SUCCESS;
1616 		}
1617 
1618 		/* 2. If the title element is non-null, let element be the title element. */
1619 		xmlNodePtr element = title;
1620 
1621 		/* 3. Otherwise: */
1622 		if (element == NULL) {
1623 			/* 3.1. Let element be the result of creating an element given the document element's node document, title,
1624 			 *      and the HTML namespace. */
1625 			php_dom_libxml_ns_mapper *ns_mapper = php_dom_get_ns_mapper(obj);
1626 			element = xmlNewDocNode(docp, php_dom_libxml_ns_mapper_ensure_html_ns(ns_mapper), BAD_CAST "title", NULL);
1627 			if (UNEXPECTED(element == NULL)) {
1628 				php_dom_throw_error(INVALID_STATE_ERR, true);
1629 				return FAILURE;
1630 			}
1631 
1632 			/* 3.2. Append element to the head element. */
1633 			xmlAddChild(head, element);
1634 		}
1635 
1636 		/* 4. String replace all with the given value within element. */
1637 		dom_string_replace_all(docp, element, newval);
1638 	}
1639 
1640 	return SUCCESS;
1641 }
1642 
#if ZEND_DEBUG
/* Only compiled when ZEND_DEBUG is set.
 * Takes no arguments and returns, as an integer, the result of php_dom_get_template_count()
 * for the private data attached to this object's document. */
PHP_METHOD(Dom_HTMLDocument, debugGetTemplateCount)
{
	ZEND_PARSE_PARAMETERS_NONE();

	dom_object *dom_obj;
	xmlDocPtr docp;

	/* The document pointer itself is not needed; DOM_GET_OBJ is used to fetch the dom_object. */
	DOM_GET_OBJ(docp, ZEND_THIS, xmlDocPtr, dom_obj);
	ZEND_IGNORE_VALUE(docp);

	const php_dom_private_data *private_data = (const php_dom_private_data *) dom_obj->document->private_data;

	RETURN_LONG((zend_long) php_dom_get_template_count(private_data));
}
#endif
1657 
1658 #endif  /* HAVE_LIBXML && HAVE_DOM */
1659