xref: /php-src/ext/dom/html_document.c (revision 935fef29)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Authors: Niels Dossche <nielsdos@php.net>                            |
14    +----------------------------------------------------------------------+
15 */
16 
17 #ifdef HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20 
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "php_dom.h"
24 #include "infra.h"
25 #include "html5_parser.h"
26 #include "html5_serializer.h"
27 #include "namespace_compat.h"
28 #include "private_data.h"
29 #include "dom_properties.h"
30 #include <Zend/zend_smart_string.h>
31 #include <lexbor/html/encoding.h>
32 #include <lexbor/encoding/encoding.h>
33 #include <lexbor/core/swar.h>
34 
/* Encoding used when none could be determined. The choice is implementation-defined;
 * HTML5 defaults to UTF-8 in all other cases, so the same default is used here. */
#define DOM_FALLBACK_ENCODING_ID LXB_ENCODING_UTF_8
37 
/* Position reached by the most recent line/column lookup, so that a later lookup
 * can continue scanning from there instead of starting over. */
typedef struct dom_line_column_cache {
	/* Line number at last_offset. */
	size_t last_line;
	/* Column number at last_offset. */
	size_t last_column;
	/* Offset into the current input up to which the scan has advanced. */
	size_t last_offset;
} dom_line_column_cache;
43 
44 typedef struct dom_lexbor_libxml2_bridge_application_data {
45 	const char *input_name;
46 	const lxb_codepoint_t *current_input_codepoints;
47 	const char *current_input_characters;
48 	size_t current_input_length;
49 	size_t current_total_offset;
50 	dom_line_column_cache cache_tokenizer;
51 	bool html_no_implied;
52 } dom_lexbor_libxml2_bridge_application_data;
53 
54 typedef struct dom_character_encoding_data {
55 	const lxb_encoding_data_t *encoding_data;
56 	size_t bom_shift;
57 } dom_character_encoding_data;
58 
59 typedef zend_result (*dom_write_output)(void*, const char *, size_t);
60 
61 typedef struct dom_output_ctx {
62 	const lxb_encoding_data_t *encoding_data;
63 	const lxb_encoding_data_t *decoding_data;
64 	lxb_encoding_encode_t *encode;
65 	lxb_encoding_decode_t *decode;
66 	lxb_codepoint_t *codepoints;
67 	lxb_char_t *encoding_output;
68 	void *output_data;
69 	dom_write_output write_output;
70 } dom_output_ctx;
71 
72 typedef struct dom_decoding_encoding_ctx {
73 	/* We can skip some conversion if the input and output encoding are both UTF-8,
74 	 * we only have to validate and substitute replacement characters */
75 	bool fast_path; /* Put first, near the encode & decode structures, for cache locality */
76 	lxb_encoding_encode_t encode;
77 	lxb_encoding_decode_t decode;
78 	const lxb_encoding_data_t *encode_data;
79 	const lxb_encoding_data_t *decode_data;
80 	lxb_char_t encoding_output[4096];
81 	lxb_codepoint_t codepoints[4096];
82 } dom_decoding_encoding_ctx;
83 
84 /* https://dom.spec.whatwg.org/#dom-document-implementation */
dom_modern_document_implementation_read(dom_object * obj,zval * retval)85 zend_result dom_modern_document_implementation_read(dom_object *obj, zval *retval)
86 {
87 	const uint32_t PROP_INDEX = 0;
88 
89 #if ZEND_DEBUG
90 	zend_string *implementation_str = ZSTR_INIT_LITERAL("implementation", false);
91 	const zend_property_info *prop_info = zend_get_property_info(dom_abstract_base_document_class_entry, implementation_str, 0);
92 	zend_string_release_ex(implementation_str, false);
93 	ZEND_ASSERT(OBJ_PROP_TO_NUM(prop_info->offset) == PROP_INDEX);
94 #endif
95 
96 	zval *cached_implementation = OBJ_PROP_NUM(&obj->std, PROP_INDEX);
97 	if (Z_ISUNDEF_P(cached_implementation)) {
98 		php_dom_create_implementation(cached_implementation, true);
99 	}
100 
101 	ZVAL_OBJ_COPY(retval, Z_OBJ_P(cached_implementation));
102 
103 	return SUCCESS;
104 }
105 
/* Initializes a decoding/encoding context for the default case of UTF-8 input and UTF-8 output,
 * with the fast path enabled. The encoder and decoder are bound to the buffers embedded in ctx
 * and configured to emit the replacement character. */
static void dom_decoding_encoding_ctx_init(dom_decoding_encoding_ctx *ctx)
{
	/* The decoder is handed a pointer to its replacement code point, so that code point gets static
	 * storage duration (as in dom_setup_parser_encoding_manually()) instead of pointing at a temporary.
	 * NOTE(review): relies on LXB_ENCODING_REPLACEMENT_BUFFER_LEN being 1, as the sibling setup
	 * function does — confirm. */
	static const lxb_codepoint_t replacement_codepoint = LXB_ENCODING_REPLACEMENT_CODEPOINT;

	ctx->decode_data = ctx->encode_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
	ctx->fast_path = true;

	(void) lxb_encoding_encode_init(
		&ctx->encode,
		ctx->encode_data,
		ctx->encoding_output,
		sizeof(ctx->encoding_output) / sizeof(*ctx->encoding_output)
	);
	(void) lxb_encoding_encode_replace_set(&ctx->encode, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);

	(void) lxb_encoding_decode_init(
		&ctx->decode,
		ctx->decode_data,
		ctx->codepoints,
		sizeof(ctx->codepoints) / sizeof(*ctx->codepoints)
	);
	(void) lxb_encoding_decode_replace_set(&ctx->decode, &replacement_codepoint, LXB_ENCODING_REPLACEMENT_BUFFER_LEN);
}
125 
/* Maps a Lexbor tokenizer error id to its dash-separated error name.
 * Ids without a dedicated case yield "unknown error". The result is a string literal. */
static const char *dom_lexbor_tokenizer_error_code_to_string(lxb_html_tokenizer_error_id_t id)
{
	const char *name = "unknown error";

	switch (id) {
		case LXB_HTML_TOKENIZER_ERROR_ABCLOFEMCO: name = "abrupt-closing-of-empty-comment"; break;
		case LXB_HTML_TOKENIZER_ERROR_ABDOPUID: name = "abrupt-doctype-public-identifier"; break;
		case LXB_HTML_TOKENIZER_ERROR_ABDOSYID: name = "abrupt-doctype-system-identifier"; break;
		case LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE: name = "absence-of-digits-in-numeric-character-reference"; break;
		case LXB_HTML_TOKENIZER_ERROR_CDINHTCO: name = "cdata-in-html-content"; break;
		case LXB_HTML_TOKENIZER_ERROR_CHREOUUNRA: name = "character-reference-outside-unicode-range"; break;
		case LXB_HTML_TOKENIZER_ERROR_COCHININST: name = "control-character-in-input-stream"; break;
		case LXB_HTML_TOKENIZER_ERROR_COCHRE: name = "control-character-reference"; break;
		case LXB_HTML_TOKENIZER_ERROR_ENTAWIAT: name = "end-tag-with-attributes"; break;
		case LXB_HTML_TOKENIZER_ERROR_DUAT: name = "duplicate-attribute"; break;
		case LXB_HTML_TOKENIZER_ERROR_ENTAWITRSO: name = "end-tag-with-trailing-solidus"; break;
		case LXB_HTML_TOKENIZER_ERROR_EOBETANA: name = "eof-before-tag-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_EOINCD: name = "eof-in-cdata"; break;
		case LXB_HTML_TOKENIZER_ERROR_EOINCO: name = "eof-in-comment"; break;
		case LXB_HTML_TOKENIZER_ERROR_EOINDO: name = "eof-in-doctype"; break;
		case LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE: name = "eof-in-script-html-comment-like-text"; break;
		case LXB_HTML_TOKENIZER_ERROR_EOINTA: name = "eof-in-tag"; break;
		case LXB_HTML_TOKENIZER_ERROR_INCLCO: name = "incorrectly-closed-comment"; break;
		case LXB_HTML_TOKENIZER_ERROR_INOPCO: name = "incorrectly-opened-comment"; break;
		case LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA: name = "invalid-character-sequence-after-doctype-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA: name = "invalid-first-character-of-tag-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIATVA: name = "missing-attribute-value"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIDONA: name = "missing-doctype-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIDOPUID: name = "missing-doctype-public-identifier"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIDOSYID: name = "missing-doctype-system-identifier"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIENTANA: name = "missing-end-tag-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIQUBEDOPUID: name = "missing-quote-before-doctype-public-identifier"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID: name = "missing-quote-before-doctype-system-identifier"; break;
		case LXB_HTML_TOKENIZER_ERROR_MISEAFCHRE: name = "missing-semicolon-after-character-reference"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHAFDOPUKE: name = "missing-whitespace-after-doctype-public-keyword"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHAFDOSYKE: name = "missing-whitespace-after-doctype-system-keyword"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHBEDONA: name = "missing-whitespace-before-doctype-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHBEAT: name = "missing-whitespace-between-attributes"; break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHBEDOPUANSYID: name = "missing-whitespace-between-doctype-public-and-system-identifiers"; break;
		case LXB_HTML_TOKENIZER_ERROR_NECO: name = "nested-comment"; break;
		case LXB_HTML_TOKENIZER_ERROR_NOCHRE: name = "noncharacter-character-reference"; break;
		case LXB_HTML_TOKENIZER_ERROR_NOININST: name = "noncharacter-in-input-stream"; break;
		case LXB_HTML_TOKENIZER_ERROR_NOVOHTELSTTAWITRSO: name = "non-void-html-element-start-tag-with-trailing-solidus"; break;
		case LXB_HTML_TOKENIZER_ERROR_NUCHRE: name = "null-character-reference"; break;
		case LXB_HTML_TOKENIZER_ERROR_SUCHRE: name = "surrogate-character-reference"; break;
		case LXB_HTML_TOKENIZER_ERROR_SUININST: name = "surrogate-in-input-stream"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNCHAFDOSYID: name = "unexpected-character-after-doctype-system-identifier"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNCHINATNA: name = "unexpected-character-in-attribute-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNCHINUNATVA: name = "unexpected-character-in-unquoted-attribute-value"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNEQSIBEATNA: name = "unexpected-equals-sign-before-attribute-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNNUCH: name = "unexpected-null-character"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNQUMAINOFTANA: name = "unexpected-question-mark-instead-of-tag-name"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNSOINTA: name = "unexpected-solidus-in-tag"; break;
		case LXB_HTML_TOKENIZER_ERROR_UNNACHRE: name = "unknown-named-character-reference"; break;
		default: break;
	}

	return name;
}
181 
/* Maps a Lexbor tree-construction error id to a dash-separated error name.
 * Ids without a dedicated case yield "unknown error". The result is a string literal. */
static const char *dom_lexbor_tree_error_code_to_string(lxb_html_tree_error_id_t id)
{
	switch (id) {
		case LXB_HTML_RULES_ERROR_UNTO: return "unexpected-token";
		case LXB_HTML_RULES_ERROR_UNCLTO: return "unexpected-closed-token";
		case LXB_HTML_RULES_ERROR_NUCH: return "null-character";
		case LXB_HTML_RULES_ERROR_UNCHTO: return "unexpected-character-token";
		case LXB_HTML_RULES_ERROR_UNTOININMO: return "unexpected-token-in-initial-mode";
		case LXB_HTML_RULES_ERROR_BADOTOININMO: return "bad-doctype-token-in-initial-mode";
		case LXB_HTML_RULES_ERROR_DOTOINBEHTMO: return "doctype-token-in-before-html-mode";
		case LXB_HTML_RULES_ERROR_UNCLTOINBEHTMO: return "unexpected-closed-token-in-before-html-mode";
		case LXB_HTML_RULES_ERROR_DOTOINBEHEMO: return "doctype-token-in-before-head-mode";
		/* Spelled with dashes throughout, like every other "closed-token" message in this table. */
		case LXB_HTML_RULES_ERROR_UNCLTOINBEHEMO: return "unexpected-closed-token-in-before-head-mode";
		case LXB_HTML_RULES_ERROR_DOTOINHEMO: return "doctype-token-in-head-mode";
		case LXB_HTML_RULES_ERROR_NOVOHTELSTTAWITRSO: return "non-void-html-element-start-tag-with-trailing-solidus";
		case LXB_HTML_RULES_ERROR_HETOINHEMO: return "head-token-in-head-mode";
		case LXB_HTML_RULES_ERROR_UNCLTOINHEMO: return "unexpected-closed-token-in-head-mode";
		case LXB_HTML_RULES_ERROR_TECLTOWIOPINHEMO: return "template-closed-token-without-opening-in-head-mode";
		case LXB_HTML_RULES_ERROR_TEELISNOCUINHEMO: return "template-element-is-not-current-in-head-mode";
		case LXB_HTML_RULES_ERROR_DOTOINHENOMO: return "doctype-token-in-head-noscript-mode";
		case LXB_HTML_RULES_ERROR_DOTOAFHEMO: return "doctype-token-after-head-mode";
		case LXB_HTML_RULES_ERROR_HETOAFHEMO: return "head-token-after-head-mode";
		case LXB_HTML_RULES_ERROR_DOTOINBOMO: return "doctype-token-in-body-mode";
		case LXB_HTML_RULES_ERROR_BAENOPELISWR: return "bad-ending-open-elements-is-wrong";
		case LXB_HTML_RULES_ERROR_OPELISWR: return "open-elements-is-wrong";
		case LXB_HTML_RULES_ERROR_UNELINOPELST: return "unexpected-element-in-open-elements-stack";
		case LXB_HTML_RULES_ERROR_MIELINOPELST: return "missing-element-in-open-elements-stack";
		case LXB_HTML_RULES_ERROR_NOBOELINSC: return "no-body-element-in-scope";
		case LXB_HTML_RULES_ERROR_MIELINSC: return "missing-element-in-scope";
		case LXB_HTML_RULES_ERROR_UNELINSC: return "unexpected-element-in-scope";
		case LXB_HTML_RULES_ERROR_UNELINACFOST: return "unexpected-element-in-active-formatting-stack";
		case LXB_HTML_RULES_ERROR_UNENOFFI: return "unexpected-end-of-file";
		case LXB_HTML_RULES_ERROR_CHINTATE: return "characters-in-table-text";
		case LXB_HTML_RULES_ERROR_DOTOINTAMO: return "doctype-token-in-table-mode";
		case LXB_HTML_RULES_ERROR_DOTOINSEMO: return "doctype-token-in-select-mode";
		case LXB_HTML_RULES_ERROR_DOTOAFBOMO: return "doctype-token-after-body-mode";
		case LXB_HTML_RULES_ERROR_DOTOINFRMO: return "doctype-token-in-frameset-mode";
		case LXB_HTML_RULES_ERROR_DOTOAFFRMO: return "doctype-token-after-frameset-mode";
		case LXB_HTML_RULES_ERROR_DOTOFOCOMO: return "doctype-token-foreign-content-mode";
		default: return "unknown error";
	}
}
224 
/* Maps a Lexbor/libxml2 bridge status to a human-readable description.
 * Statuses without a dedicated case yield "unknown error". */
static const char *dom_lexbor_libxml2_bridge_status_code_to_string(lexbor_libxml2_bridge_status status)
{
	const char *description = "unknown error";

	switch (status) {
		case LEXBOR_LIBXML2_BRIDGE_STATUS_CANNOT_INIT:
			description = "cannot initialize data structures";
			break;
		case LEXBOR_LIBXML2_BRIDGE_STATUS_FATAL_PARSE:
			description = "fatal error in parsing";
			break;
		case LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW:
			description = "string length overflow";
			break;
		case LEXBOR_LIBXML2_BRIDGE_STATUS_OOM:
			description = "out of memory";
			break;
		default:
			break;
	}

	return description;
}
235 
/* Rewinds the cache to the start of the input: line 1, column 1, offset 0. */
static void dom_reset_line_column_cache(dom_line_column_cache *cache)
{
	*cache = (dom_line_column_cache) {
		.last_line = 1,
		.last_column = 1,
		.last_offset = 0,
	};
}
242 
/* Advances the line/column cache up to the given offset.
 *
 * The offset is first made relative to the current input by subtracting current_total_offset,
 * and is clamped to current_input_length. Scanning resumes from cache->last_offset; the updated
 * line, column and offset are stored back into the cache. */
static void dom_find_line_and_column_using_cache(
	const dom_lexbor_libxml2_bridge_application_data *application_data,
	dom_line_column_cache *cache,
	size_t offset
)
{
	size_t target = offset - application_data->current_total_offset;
	if (target > application_data->current_input_length) {
		/* Possible with empty input, also just good for general safety */
		target = application_data->current_input_length;
	}

	size_t line = cache->last_line;
	size_t column = cache->last_column;
	size_t position = cache->last_offset;

	/* The input is either a code point buffer or UTF-8 data */
	const lxb_codepoint_t *codepoints = application_data->current_input_codepoints;
	if (codepoints != NULL) {
		for (; position < target; position++) {
			if (codepoints[position] == 0x000A /* Unicode codepoint for line feed */) {
				line++;
				column = 1;
			} else {
				column++;
			}
		}
	} else {
		const char *characters = application_data->current_input_characters;
		for (; position < target; position++) {
			const lxb_char_t byte = characters[position];
			if (byte == '\n') {
				line++;
				column = 1;
			} else if ((byte & 0xC0) != 0x80) {
				/* See Lexbor tokenizer patch
				 * Only bytes that are not UTF-8 continuation bytes (10xxxxxx) start a new column.
				 * Note for future self: branchlessly computing the length and jumping by the length would be nice,
				 * however it takes so many instructions to do so that it is slower than this naive method. */
				column++;
			}
		}
	}

	cache->last_line = line;
	cache->last_column = column;
	cache->last_offset = position;
}
293 
/* Tokenizer error callback: resolves the error offset to a line and column through the
 * tokenizer cache and reports the error by name together with the input name and position. */
static void dom_lexbor_libxml2_bridge_tokenizer_error_reporter(
	void *application_data_voidptr,
	lxb_html_tokenizer_error_t *error,
	size_t offset
)
{
	dom_lexbor_libxml2_bridge_application_data *app = application_data_voidptr;
	dom_line_column_cache *position = &app->cache_tokenizer;

	dom_find_line_and_column_using_cache(app, position, offset);

	const char *error_name = dom_lexbor_tokenizer_error_code_to_string(error->id);
	php_libxml_pretend_ctx_error_ex(
		app->input_name,
		position->last_line,
		position->last_column,
		"tokenizer error %s in %s, line: %zu, column: %zu\n",
		error_name,
		app->input_name,
		position->last_line,
		position->last_column
	);
}
304 
/* Tree-construction error callback: reports the error by name together with the input name
 * and position. When len is greater than 1 the column is shown as a range. */
static void dom_lexbor_libxml2_bridge_tree_error_reporter(
	void *application_data_voidptr,
	lxb_html_tree_error_t *error,
	size_t line,
	size_t column,
	size_t len
)
{
	dom_lexbor_libxml2_bridge_application_data *app = application_data_voidptr;

	/* For no implied mode, we want to mimic libxml's behaviour of not reporting an error for a lacking doctype. */
	const bool is_lacking_doctype_report = line == 1
		&& app->html_no_implied
		&& error->id == LXB_HTML_RULES_ERROR_UNTOININMO;
	if (is_lacking_doctype_report) {
		return;
	}

	const char *error_name = dom_lexbor_tree_error_code_to_string(error->id);

	if (len > 1) {
		php_libxml_pretend_ctx_error_ex(
			app->input_name,
			line,
			column,
			"tree error %s in %s, line: %zu, column: %zu-%zu\n",
			error_name,
			app->input_name,
			line,
			column,
			column + len - 1
		);
		return;
	}

	/* Possible with EOF, or single-character tokens, don't use a range in the error display in this case */
	php_libxml_pretend_ctx_error_ex(
		app->input_name,
		line,
		column,
		"tree error %s in %s, line: %zu, column: %zu\n",
		error_name,
		app->input_name,
		line,
		column
	);
}
346 
/* Returns the first direct child of parent that is an element whose name equals
 * searching_for, or NULL when there is none. Only the name is compared. */
static xmlNodePtr dom_search_child(xmlNodePtr parent, const char *searching_for)
{
	for (xmlNodePtr candidate = parent->children; candidate != NULL; candidate = candidate->next) {
		if (candidate->type != XML_ELEMENT_NODE) {
			continue;
		}
		if (strcmp((const char *) candidate->name, searching_for) == 0) {
			return candidate;
		}
	}
	return NULL;
}
358 
/* Removes the first child element of parent named searching_for, if any, and re-attaches its
 * children to parent via xmlAddChild(). The removed element is freed. */
static void dom_place_remove_element_and_hoist_children(xmlNodePtr parent, const char *searching_for)
{
	xmlNodePtr wrapper = dom_search_child(parent, searching_for);
	if (wrapper == NULL) {
		return;
	}

	xmlUnlinkNode(wrapper);

	/* Always take the current first child: the previous one has just been moved away. */
	for (xmlNodePtr child = wrapper->children; child != NULL; child = wrapper->children) {
		xmlUnlinkNode(child);
		xmlAddChild(parent, child);
	}

	xmlFreeNode(wrapper);
}
375 
/* Post-processes a freshly loaded HTML5 document.
 *
 * With HTML_PARSE_NOIMPLIED set, the <head>, <body> and <html> elements that were not
 * explicitly present in the input (according to observations) are removed and their children
 * hoisted into the parent. Without that option this is a no-op. */
static void dom_post_process_html5_loading(
	xmlDocPtr lxml_doc,
	zend_long options,
	const lexbor_libxml2_bridge_extracted_observations *observations
)
{
	if (!(options & HTML_PARSE_NOIMPLIED)) {
		return;
	}

	xmlNodePtr html_node = dom_search_child((xmlNodePtr) lxml_doc, "html");
	if (html_node == NULL) {
		/* dom_search_child() returns NULL when there is no such child; without an <html>
		 * element there is nothing to strip, and it must not be used as a parent below. */
		return;
	}

	/* <head> and <body> are handled before <html>, as they are looked up as its children. */
	if (!observations->has_explicit_head_tag) {
		dom_place_remove_element_and_hoist_children(html_node, "head");
	}
	if (!observations->has_explicit_body_tag) {
		dom_place_remove_element_and_hoist_children(html_node, "body");
	}
	if (!observations->has_explicit_html_tag) {
		dom_place_remove_element_and_hoist_children((xmlNodePtr) lxml_doc, "html");
	}
}
395 
396 /* https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding */
dom_determine_encoding(const char * source,size_t source_len)397 static dom_character_encoding_data dom_determine_encoding(const char *source, size_t source_len)
398 {
399 	dom_character_encoding_data result;
400 
401 	/* BOM sniffing */
402 	if (source_len >= 3 && source[0] == '\xEF' && source[1] == '\xBB' && source[2] == '\xBF') {
403 		result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
404 		result.bom_shift = 3;
405 		return result;
406 	} else if (source_len >= 2) {
407 		if (source[0] == '\xFE' && source[1] == '\xFF') {
408 			result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_16BE);
409 			result.bom_shift = 2;
410 			return result;
411 		} else if (source[0] == '\xFF' && source[1] == '\xFE') {
412 			result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_16LE);
413 			result.bom_shift = 2;
414 			return result;
415 		}
416 	}
417 
418 	/* Perform prescan */
419 	lxb_html_encoding_t encoding;
420 	lxb_status_t status = lxb_html_encoding_init(&encoding);
421 	if (status != LXB_STATUS_OK) {
422 		goto fallback_uninit;
423 	}
424 	/* This is the "wait either for 1024 bytes or 500ms" part */
425 	if (source_len > 1024) {
426 		source_len = 1024;
427 	}
428 	status = lxb_html_encoding_determine(&encoding, (const lxb_char_t *) source, (const lxb_char_t *) source + source_len);
429 	if (status != LXB_STATUS_OK) {
430 		goto fallback;
431 	}
432 	lxb_html_encoding_entry_t *entry = lxb_html_encoding_meta_entry(&encoding, 0);
433 	if (entry == NULL) {
434 		goto fallback;
435 	}
436 	result.encoding_data = lxb_encoding_data_by_pre_name(entry->name, entry->end - entry->name);
437 	if (!result.encoding_data) {
438 		goto fallback;
439 	}
440 	result.bom_shift = 0;
441 	lxb_html_encoding_destroy(&encoding, false);
442 	return result;
443 
444 fallback:
445 	lxb_html_encoding_destroy(&encoding, false);
446 fallback_uninit:
447 	result.encoding_data = lxb_encoding_data(DOM_FALLBACK_ENCODING_ID);
448 	result.bom_shift = 0;
449 	return result;
450 }
451 
/* Configures the context to decode input in the given encoding.
 *
 * The decoder is re-initialized for encoding_data, and the fast path is enabled exactly when
 * that encoding is the same as the encoder's (UTF-8). application_data is pointed at the raw
 * bytes (fast path) or at the code point buffer (slow path) for line/column lookups. */
static void dom_setup_parser_encoding_manually(const lxb_char_t *buf_start, const lxb_encoding_data_t *encoding_data, dom_decoding_encoding_ctx *decoding_encoding_ctx, dom_lexbor_libxml2_bridge_application_data *application_data)
{
	static const lxb_codepoint_t replacement_codepoint = LXB_ENCODING_REPLACEMENT_CODEPOINT;

	lxb_encoding_decode_t *decoder = &decoding_encoding_ctx->decode;

	decoding_encoding_ctx->decode_data = encoding_data;
	(void) lxb_encoding_decode_init(
		decoder,
		encoding_data,
		decoding_encoding_ctx->codepoints,
		sizeof(decoding_encoding_ctx->codepoints) / sizeof(*decoding_encoding_ctx->codepoints)
	);
	(void) lxb_encoding_decode_replace_set(decoder, &replacement_codepoint, LXB_ENCODING_REPLACEMENT_BUFFER_LEN);

	/* Note: encode_data is for UTF-8 */
	const bool input_is_utf8 = encoding_data == decoding_encoding_ctx->encode_data;
	decoding_encoding_ctx->fast_path = input_is_utf8;

	application_data->current_input_codepoints = input_is_utf8 ? NULL : decoding_encoding_ctx->codepoints;
	application_data->current_input_characters = input_is_utf8 ? (const char *) buf_start : NULL;
}
480 
/* Detects the input encoding and configures the context for it.
 *
 * On return *buf_ref has been advanced past any byte order mark and *read reduced by the
 * same amount. */
static void dom_setup_parser_encoding_implicitly(
	const lxb_char_t **buf_ref,
	size_t *read,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	dom_lexbor_libxml2_bridge_application_data *application_data
)
{
	const lxb_char_t *input_start = *buf_ref;
	const dom_character_encoding_data detected = dom_determine_encoding((const char *) input_start, *read);

	*buf_ref = input_start + detected.bom_shift;
	*read -= detected.bom_shift;

	/* NOTE(review): the pointer passed on is the start of the buffer before the BOM was skipped,
	 * so on the fast path line/column lookups index from the BOM rather than from the first
	 * parsed byte — confirm this is intended. */
	dom_setup_parser_encoding_manually(input_start, detected.encoding_data, decoding_encoding_ctx, application_data);
}
494 
/* Feeds one chunk of UTF-8 data to the parser and updates the position bookkeeping.
 *
 * encoded_length is the size of encoding_output handed to the parser; input_buffer_length is
 * the amount of input this chunk accounts for in offset tracking. If any error reporter is
 * installed, pending errors are reported and the tokenizer cache is advanced to the end of
 * the chunk.
 *
 * Returns false when the parser does not return LXB_STATUS_OK, true otherwise. */
static bool dom_process_parse_chunk(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	size_t encoded_length,
	const lxb_char_t *encoding_output,
	size_t input_buffer_length,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	dom_lexbor_libxml2_bridge_application_data *app = ctx->application_data;
	app->current_input_length = input_buffer_length;

	const lexbor_status_t parse_status = lxb_html_document_parse_chunk(document, encoding_output, encoded_length);
	if (UNEXPECTED(parse_status != LXB_STATUS_OK)) {
		return false;
	}

	if (ctx->tokenizer_error_reporter || ctx->tree_error_reporter) {
		const size_t chunk_start = app->current_total_offset;
		lexbor_libxml2_bridge_report_errors(ctx, parser, encoding_output, chunk_start, tokenizer_error_offset, tree_error_offset);
		/* Bring the cached line/column up to the end of this chunk. */
		dom_find_line_and_column_using_cache(app, &app->cache_tokenizer, chunk_start + input_buffer_length);
	}

	app->current_total_offset += input_buffer_length;
	/* Cache offsets are relative to the current chunk; line and column carry over. */
	app->cache_tokenizer.last_offset = 0;
	return true;
}
520 
521 /* This seeks, using SWAR techniques, to the first non-ASCII byte in a UTF-8 input.
522  * Returns true if the entire input was consumed without encountering non-ASCII, false otherwise. */
dom_seek_utf8_non_ascii(const lxb_char_t ** data,const lxb_char_t * end)523 static zend_always_inline bool dom_seek_utf8_non_ascii(const lxb_char_t **data, const lxb_char_t *end)
524 {
525 	while (*data + sizeof(size_t) <= end) {
526 		size_t bytes;
527 		memcpy(&bytes, *data, sizeof(bytes));
528 		/* If the top bit is set, it's not ASCII. */
529 		if ((bytes & LEXBOR_SWAR_REPEAT(0x80)) != 0) {
530 			return false;
531 		}
532 		*data += sizeof(size_t);
533 	}
534 
535 	while (*data < end) {
536 		if (**data & 0x80) {
537 			return false;
538 		}
539 		(*data)++;
540 	}
541 
542 	return true;
543 }
544 
/* Fast path for UTF-8 input: converts non-validated UTF-8 into validated UTF-8.
 *
 * Runs of valid input are passed to the parser directly from the input buffer; each invalid
 * sequence is replaced by the replacement bytes. If the buffer ends in the middle of a
 * sequence, the decoder status is set to LXB_STATUS_CONTINUE and true is returned.
 *
 * *buf_ref_ref is updated to the position reached on every return path.
 * Returns false when processing a chunk fails, true otherwise. */
static bool dom_decode_encode_fast_path(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	const lxb_char_t **buf_ref_ref,
	const lxb_char_t *buf_end,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	lxb_encoding_decode_t *decoder = &decoding_encoding_ctx->decode;
	const lxb_char_t *cursor = *buf_ref_ref;
	/* Start of the input that has not been handed to the parser yet. */
	const lxb_char_t *pending_start = cursor;
	bool succeeded = false;

	decoder->status = LXB_STATUS_OK;

	while (cursor != buf_end) {
		/* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we
		 * need more UTF-8 bytes to complete a sequence. */
		if (decoder->u.utf_8.need == 0 && dom_seek_utf8_non_ascii(&cursor, buf_end)) {
			ZEND_ASSERT(cursor == buf_end);
			break;
		}

		const lxb_char_t *sequence_start = cursor;
		const lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(decoder, &cursor, buf_end);
		if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {
			/* Output everything before the offending sequence, while accounting for the
			 * consumed input up to the cursor. */
			if (!dom_process_parse_chunk(
				ctx,
				document,
				parser,
				sequence_start - pending_start,
				pending_start,
				cursor - pending_start,
				tokenizer_error_offset,
				tree_error_offset
			)) {
				goto finish;
			}

			if (codepoint == LXB_ENCODING_DECODE_CONTINUE) {
				ZEND_ASSERT(cursor == buf_end);
				/* The decoder needs more data but the entire buffer is consumed.
				 * All valid data is outputted, and if the remaining data for the code point
				 * is invalid, the next call will output the replacement bytes. */
				decoder->status = LXB_STATUS_CONTINUE;
				succeeded = true;
				goto finish;
			}

			/* Substitute the invalid sequence; it accounts for no additional input. */
			if (!dom_process_parse_chunk(
				ctx,
				document,
				parser,
				LXB_ENCODING_REPLACEMENT_SIZE,
				LXB_ENCODING_REPLACEMENT_BYTES,
				0,
				tokenizer_error_offset,
				tree_error_offset
			)) {
				goto finish;
			}

			pending_start = cursor;
		}
	}

	/* Flush whatever valid input is still pending. */
	if (cursor != pending_start) {
		const size_t pending_length = cursor - pending_start;
		if (!dom_process_parse_chunk(
			ctx,
			document,
			parser,
			pending_length,
			pending_start,
			pending_length,
			tokenizer_error_offset,
			tree_error_offset
		)) {
			goto finish;
		}
	}
	succeeded = true;

finish:
	*buf_ref_ref = cursor;
	return succeeded;
}
631 
/* Slow path for non-UTF-8 input: decodes the input into code points, encodes those into the
 * output encoding and passes the result to the parser, repeating while either buffer
 * reports LXB_STATUS_SMALL_BUFFER.
 *
 * *buf_ref_ref is updated to the position reached on every return path.
 * Returns false when processing a chunk fails, true otherwise. */
static bool dom_decode_encode_slow_path(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	const lxb_char_t **buf_ref_ref,
	const lxb_char_t *buf_end,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	const lxb_char_t *cursor = *buf_ref_ref;
	bool succeeded = false;
	lexbor_status_t decode_status;

	do {
		decode_status = decoding_encoding_ctx->decode_data->decode(&decoding_encoding_ctx->decode, &cursor, buf_end);

		const size_t decoded_count = lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode);
		const lxb_codepoint_t *codepoint_cursor = decoding_encoding_ctx->codepoints;
		const lxb_codepoint_t *codepoint_end = decoding_encoding_ctx->codepoints + decoded_count;
		lexbor_status_t encode_status;

		/* Drain the decoded code points through the encoder, one output buffer at a time.
		 * NOTE(review): each pass reports decoded_count as the consumed input length, so the
		 * running total offset advances once per encoder pass — confirm this is intended when
		 * the encoder needs several passes. */
		do {
			encode_status = decoding_encoding_ctx->encode_data->encode(&decoding_encoding_ctx->encode, &codepoint_cursor, codepoint_end);
			ZEND_ASSERT(encode_status != LXB_STATUS_ERROR && "parameters and replacements should be valid");

			const size_t encoded_length = lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode);
			if (!dom_process_parse_chunk(
				ctx,
				document,
				parser,
				encoded_length,
				decoding_encoding_ctx->encoding_output,
				decoded_count,
				tokenizer_error_offset,
				tree_error_offset
			)) {
				goto finish;
			}
			lxb_encoding_encode_buf_used_set(&decoding_encoding_ctx->encode, 0);
		} while (encode_status == LXB_STATUS_SMALL_BUFFER);

		lxb_encoding_decode_buf_used_set(&decoding_encoding_ctx->decode, 0);
	} while (decode_status == LXB_STATUS_SMALL_BUFFER);

	succeeded = true;

finish:
	*buf_ref_ref = cursor;
	return succeeded;
}
676 
/* Processes one input range by dispatching to the fast or the slow transcoding routine,
 * as selected by the fast_path flag of the decoding/encoding context.
 * Returns the result of the routine that was invoked. */
static bool dom_parse_decode_encode_step(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	const lxb_char_t **buf_ref_ref,
	const lxb_char_t *buf_end,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	if (!decoding_encoding_ctx->fast_path) {
		return dom_decode_encode_slow_path(
			ctx,
			document,
			parser,
			buf_ref_ref,
			buf_end,
			decoding_encoding_ctx,
			tokenizer_error_offset,
			tree_error_offset
		);
	}

	return dom_decode_encode_fast_path(
		ctx,
		document,
		parser,
		buf_ref_ref,
		buf_end,
		decoding_encoding_ctx,
		tokenizer_error_offset,
		tree_error_offset
	);
}
712 
/* Finishes the decoder and the encoder after the last input chunk and passes any bytes
 * that are still pending in the encoder's output buffer to dom_process_parse_chunk().
 * Returns true when nothing was pending, otherwise the result of processing that chunk. */
static bool dom_parse_decode_encode_finish(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	lxb_status_t status = lxb_encoding_decode_finish(&decoding_encoding_ctx->decode);
	ZEND_ASSERT(status == LXB_STATUS_OK);

	/* Finishing the decoder may have left code points in its buffer; run them through the encoder. */
	const size_t leftover_codepoints = lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode);
	if (leftover_codepoints != 0) {
		const lxb_codepoint_t *codepoint_cursor = (const lxb_codepoint_t *) decoding_encoding_ctx->codepoints;
		const lxb_codepoint_t *codepoint_limit = codepoint_cursor + leftover_codepoints;
		status = decoding_encoding_ctx->encode_data->encode(&decoding_encoding_ctx->encode, &codepoint_cursor, codepoint_limit);
		ZEND_ASSERT(status == LXB_STATUS_OK);
		/* Nothing is emitted yet: the encoder is finished below and its buffer is passed on in one go. */
	}

	status = lxb_encoding_encode_finish(&decoding_encoding_ctx->encode);
	ZEND_ASSERT(status == LXB_STATUS_OK);

	const size_t pending_bytes = lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode);
	if (pending_bytes == 0) {
		return true;
	}

	return dom_process_parse_chunk(
		ctx,
		document,
		parser,
		pending_bytes,
		decoding_encoding_ctx->encoding_output,
		lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode),
		tokenizer_error_offset,
		tree_error_offset
	);
}
753 
/* Validates the parser option bitmask passed as argument number arg_num.
 * Raises an argument value error and returns false when a bit outside of the
 * supported set is present; returns true otherwise. */
static bool check_options_validity(uint32_t arg_num, zend_long options)
{
	const zend_long allowed_mask = XML_PARSE_NOERROR | XML_PARSE_COMPACT | HTML_PARSE_NOIMPLIED | DOM_HTML_NO_DEFAULT_NS;
	const zend_long unknown_bits = options & ~allowed_mask;

	if (unknown_bits == 0) {
		return true;
	}

	zend_argument_value_error(
		arg_num,
		"contains invalid flags (allowed flags: "
		"LIBXML_NOERROR, "
		"LIBXML_COMPACT, "
		"LIBXML_HTML_NOIMPLIED, "
		"Dom\\NO_DEFAULT_NS)"
	);
	return false;
}
767 
PHP_METHOD(Dom_HTMLDocument,createEmpty)768 PHP_METHOD(Dom_HTMLDocument, createEmpty)
769 {
770 	const char *encoding = "UTF-8";
771 	size_t encoding_len = strlen("UTF-8");
772 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|p", &encoding, &encoding_len) == FAILURE) {
773 		RETURN_THROWS();
774 	}
775 
776 	const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) encoding, encoding_len);
777 
778 	if (encoding_data == NULL) {
779 		zend_argument_value_error(1, "must be a valid document encoding");
780 		RETURN_THROWS();
781 	}
782 
783 	xmlDocPtr lxml_doc = php_dom_create_html_doc();
784 	if (UNEXPECTED(lxml_doc == NULL)) {
785 		goto oom;
786 	}
787 
788 	lxml_doc->encoding = xmlStrdup((const xmlChar *) encoding);
789 
790 	dom_object *intern = php_dom_instantiate_object_helper(
791 		return_value,
792 		dom_html_document_class_entry,
793 		(xmlNodePtr) lxml_doc,
794 		NULL
795 	);
796 	dom_set_xml_class(intern->document);
797 	intern->document->private_data = php_dom_libxml_private_data_header(php_dom_private_data_create());
798 	return;
799 
800 oom:
801 	php_dom_throw_error(INVALID_STATE_ERR, true);
802 	RETURN_THROWS();
803 }
804 
805 /* Only bother to register error handling when the error reports can become observable. */
dom_should_register_error_handlers(zend_long options)806 static bool dom_should_register_error_handlers(zend_long options)
807 {
808 	if (options & XML_PARSE_NOERROR) {
809 		return false;
810 	}
811 
812 	return php_libxml_uses_internal_errors() || ((EG(error_reporting) | EG(user_error_handler_error_reporting)) & E_WARNING);
813 }
814 
/* Dom\HTMLDocument::createFromString(string $source, int $options = 0, ?string $overrideEncoding = null)
 *
 * Parses an HTML document from an in-memory string:
 *  - the input is transcoded chunk by chunk and fed to the lexbor parser,
 *  - the resulting lexbor tree is converted into a libxml2 document,
 *  - the libxml2 document is wrapped in a Dom\HTMLDocument object.
 *
 * Throws a ValueError for invalid options (argument 2) or an unknown override encoding (argument 3).
 * Any failure in the lexbor parsing pipeline ends up at fail_oom and throws INVALID_STATE_ERR.
 * A failed tree conversion reports a libxml error and returns false. */
PHP_METHOD(Dom_HTMLDocument, createFromString)
{
	const char *source, *override_encoding = NULL;
	size_t source_len, override_encoding_len;
	zend_long options = 0;
	if (zend_parse_parameters(
		ZEND_NUM_ARGS(),
		"s|lp!",
		&source,
		&source_len,
		&options,
		&override_encoding,
		&override_encoding_len
	) == FAILURE) {
		RETURN_THROWS();
	}

	if (!check_options_validity(2, options)) {
		RETURN_THROWS();
	}

	/* State shared with the error reporters; the input has no file name, so errors refer to "Entity". */
	dom_lexbor_libxml2_bridge_application_data application_data;
	application_data.input_name = "Entity";
	application_data.current_total_offset = 0;
	application_data.html_no_implied = options & HTML_PARSE_NOIMPLIED;
	dom_reset_line_column_cache(&application_data.cache_tokenizer);
	lexbor_libxml2_bridge_parse_context ctx;
	lexbor_libxml2_bridge_parse_context_init(&ctx);
	if (dom_should_register_error_handlers(options)) {
		lexbor_libxml2_bridge_parse_set_error_callbacks(
			&ctx,
			dom_lexbor_libxml2_bridge_tokenizer_error_reporter,
			dom_lexbor_libxml2_bridge_tree_error_reporter
		);
	}
	ctx.application_data = &application_data;

	size_t tokenizer_error_offset = 0;
	size_t tree_error_offset = 0;

	/* Setup everything encoding & decoding related */
	const lxb_char_t *buf_ref = (const lxb_char_t *) source;
	dom_decoding_encoding_ctx decoding_encoding_ctx;
	dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
	if (override_encoding != NULL) {
		const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
			(const lxb_char_t *) override_encoding,
			override_encoding_len
		);
		if (!encoding_data) {
			/* Nothing has been allocated yet, so a plain return is safe here. */
			zend_argument_value_error(3, "must be a valid document encoding");
			RETURN_THROWS();
		}
		dom_setup_parser_encoding_manually(buf_ref, encoding_data, &decoding_encoding_ctx, &application_data);
	} else {
		/* Receives pointers to both the input position and the remaining length, so it may adjust them. */
		dom_setup_parser_encoding_implicitly(&buf_ref, &source_len, &decoding_encoding_ctx, &application_data);
	}

	lxb_html_document_t *document = lxb_html_document_create();
	if (UNEXPECTED(document == NULL)) {
		goto fail_oom;
	}

	lxb_status_t lexbor_status = lxb_html_document_parse_chunk_begin(document);
	if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
		goto fail_oom;
	}

	lxb_html_parser_t *parser = document->dom_document.parser;

	/* Feed the input in pieces of at most as many bytes as the encoding output buffer has elements. */
	while (source_len > 0) {
		size_t chunk_size = source_len;
		const size_t MAX_CHUNK_SIZE = sizeof(decoding_encoding_ctx.encoding_output) / sizeof(*decoding_encoding_ctx.encoding_output);
		if (chunk_size > MAX_CHUNK_SIZE) {
			chunk_size = MAX_CHUNK_SIZE;
		}
		source_len -= chunk_size;

		const lxb_char_t *buf_end = buf_ref + chunk_size;
		bool result = dom_parse_decode_encode_step(
			&ctx,
			document,
			parser,
			&buf_ref,
			buf_end,
			&decoding_encoding_ctx,
			&tokenizer_error_offset,
			&tree_error_offset
		);
		if (!result) {
			goto fail_oom;
		}
	}

	/* Flush whatever the decoder and encoder still hold after the last chunk. */
	if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
		goto fail_oom;
	}

	lexbor_status = lxb_html_document_parse_chunk_end(document);
	if (lexbor_status != LXB_STATUS_OK) {
		goto fail_oom;
	}

	/* The private data is handed to the bridge for the conversion and attached to the document on success. */
	php_dom_private_data *private_data = php_dom_private_data_create();

	xmlDocPtr lxml_doc;
	lexbor_libxml2_bridge_status bridge_status = lexbor_libxml2_bridge_convert_document(
		document,
		&lxml_doc,
		options & XML_PARSE_COMPACT,
		!(options & DOM_HTML_NO_DEFAULT_NS),
		private_data
	);
	/* Must happen before the lexbor document is destroyed, as it reads from the parser's tree. */
	lexbor_libxml2_bridge_copy_observations(parser->tree, &ctx.observations);
	if (UNEXPECTED(bridge_status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) {
		php_dom_private_data_destroy(private_data);
		php_libxml_ctx_error(
			NULL,
			"%s in %s",
			dom_lexbor_libxml2_bridge_status_code_to_string(bridge_status),
			application_data.input_name
		);
		lxb_html_document_destroy(document);
		RETURN_FALSE;
	}
	lxb_html_document_destroy(document);

	dom_post_process_html5_loading(lxml_doc, options, &ctx.observations);

	/* Record the name of the encoding that was used for decoding; UTF-8 when no decoder was selected. */
	if (decoding_encoding_ctx.decode_data) {
		lxml_doc->encoding = xmlStrdup((const xmlChar *) decoding_encoding_ctx.decode_data->name);
	} else {
		lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
	}

	dom_object *intern = php_dom_instantiate_object_helper(
		return_value,
		dom_html_document_class_entry,
		(xmlNodePtr) lxml_doc,
		NULL
	);
	dom_set_xml_class(intern->document);
	intern->document->quirks_mode = ctx.observations.quirks_mode;
	intern->document->private_data = php_dom_libxml_private_data_header(private_data);
	return;

fail_oom:
	/* Reached only before the libxml2 document and the private data exist, so only the lexbor document needs cleanup. */
	lxb_html_document_destroy(document);
	php_dom_throw_error(INVALID_STATE_ERR, true);
	RETURN_THROWS();
}
966 
PHP_METHOD(Dom_HTMLDocument,createFromFile)967 PHP_METHOD(Dom_HTMLDocument, createFromFile)
968 {
969 	const char *filename, *override_encoding = NULL;
970 	php_dom_private_data *private_data = NULL;
971 	size_t filename_len, override_encoding_len;
972 	zend_long options = 0;
973 	php_stream *stream = NULL;
974 	if (zend_parse_parameters(
975 		ZEND_NUM_ARGS(),
976 		"p|lp!",
977 		&filename,
978 		&filename_len,
979 		&options,
980 		&override_encoding,
981 		&override_encoding_len
982 	) == FAILURE) {
983 		RETURN_THROWS();
984 	}
985 
986 	/* See php_libxml_streams_IO_open_wrapper(), apparently this caused issues in the past. */
987 	if (strstr(filename, "%00")) {
988 		zend_argument_value_error(1, "must not contain percent-encoded NUL bytes");
989 		RETURN_THROWS();
990 	}
991 
992 	if (!check_options_validity(2, options)) {
993 		RETURN_THROWS();
994 	}
995 
996 	dom_lexbor_libxml2_bridge_application_data application_data;
997 	application_data.input_name = filename;
998 	application_data.current_total_offset = 0;
999 	application_data.html_no_implied = options & HTML_PARSE_NOIMPLIED;
1000 	dom_reset_line_column_cache(&application_data.cache_tokenizer);
1001 	lexbor_libxml2_bridge_parse_context ctx;
1002 	lexbor_libxml2_bridge_parse_context_init(&ctx);
1003 	if (dom_should_register_error_handlers(options)) {
1004 		lexbor_libxml2_bridge_parse_set_error_callbacks(
1005 			&ctx,
1006 			dom_lexbor_libxml2_bridge_tokenizer_error_reporter,
1007 			dom_lexbor_libxml2_bridge_tree_error_reporter
1008 		);
1009 	}
1010 	ctx.application_data = &application_data;
1011 
1012 	char buf[4096];
1013 
1014 	/* Setup everything encoding & decoding related */
1015 	dom_decoding_encoding_ctx decoding_encoding_ctx;
1016 	dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
1017 	bool should_determine_encoding_implicitly = true; /* First read => determine encoding implicitly */
1018 	if (override_encoding != NULL) {
1019 		const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
1020 			(const lxb_char_t *) override_encoding,
1021 			override_encoding_len
1022 		);
1023 		if (!encoding_data) {
1024 			zend_argument_value_error(3, "must be a valid document encoding");
1025 			RETURN_THROWS();
1026 		}
1027 		should_determine_encoding_implicitly = false;
1028 		dom_setup_parser_encoding_manually((const lxb_char_t *) buf, encoding_data, &decoding_encoding_ctx, &application_data);
1029 	}
1030 
1031 	zend_string *opened_path = NULL;
1032 	stream = php_stream_open_wrapper_ex(filename, "rb", REPORT_ERRORS, &opened_path, php_libxml_get_stream_context());
1033 	if (!stream) {
1034 		if (!EG(exception)) {
1035 			zend_throw_exception_ex(NULL, 0, "Cannot open file '%s'", filename);
1036 		}
1037 		RETURN_THROWS();
1038 	}
1039 
1040 	/* MIME sniff */
1041 	if (should_determine_encoding_implicitly) {
1042 		zend_string *charset = php_libxml_sniff_charset_from_stream(stream);
1043 		if (charset != NULL) {
1044 			const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
1045 				(const lxb_char_t *) ZSTR_VAL(charset),
1046 				ZSTR_LEN(charset)
1047 			);
1048 			if (encoding_data != NULL) {
1049 				should_determine_encoding_implicitly = false;
1050 				dom_setup_parser_encoding_manually(
1051 					(const lxb_char_t *) buf,
1052 					encoding_data,
1053 					&decoding_encoding_ctx,
1054 					&application_data
1055 				);
1056 			}
1057 			zend_string_release_ex(charset, false);
1058 		}
1059 	}
1060 
1061 	lxb_html_document_t *document = lxb_html_document_create();
1062 	if (UNEXPECTED(document == NULL)) {
1063 		goto fail_oom;
1064 	}
1065 
1066 	lxb_status_t lexbor_status = lxb_html_document_parse_chunk_begin(document);
1067 	if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
1068 		goto fail_oom;
1069 	}
1070 
1071 	size_t tokenizer_error_offset = 0;
1072 	size_t tree_error_offset = 0;
1073 	ssize_t read;
1074 	lxb_html_parser_t *parser = document->dom_document.parser;
1075 
1076 	while ((read = php_stream_read(stream, buf, sizeof(buf))) > 0) {
1077 		const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
1078 
1079 		if (should_determine_encoding_implicitly) {
1080 			should_determine_encoding_implicitly = false;
1081 			dom_setup_parser_encoding_implicitly(&buf_ref, (size_t *) &read, &decoding_encoding_ctx, &application_data);
1082 		}
1083 
1084 		const lxb_char_t *buf_end = buf_ref + read;
1085 		bool result = dom_parse_decode_encode_step(
1086 			&ctx,
1087 			document,
1088 			parser,
1089 			&buf_ref,
1090 			buf_end,
1091 			&decoding_encoding_ctx,
1092 			&tokenizer_error_offset,
1093 			&tree_error_offset
1094 		);
1095 		if (!result) {
1096 			goto fail_oom;
1097 		}
1098 	}
1099 
1100 	if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
1101 		goto fail_oom;
1102 	}
1103 
1104 	lexbor_status = lxb_html_document_parse_chunk_end(document);
1105 	if (lexbor_status != LXB_STATUS_OK) {
1106 		goto fail_oom;
1107 	}
1108 
1109 	private_data = php_dom_private_data_create();
1110 
1111 	xmlDocPtr lxml_doc;
1112 	lexbor_libxml2_bridge_status bridge_status = lexbor_libxml2_bridge_convert_document(
1113 		document,
1114 		&lxml_doc,
1115 		options & XML_PARSE_COMPACT,
1116 		!(options & DOM_HTML_NO_DEFAULT_NS),
1117 		private_data
1118 	);
1119 	lexbor_libxml2_bridge_copy_observations(parser->tree, &ctx.observations);
1120 	if (UNEXPECTED(bridge_status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) {
1121 		php_libxml_ctx_error(NULL, "%s in %s", dom_lexbor_libxml2_bridge_status_code_to_string(bridge_status), filename);
1122 		RETVAL_FALSE;
1123 		goto fail_general;
1124 	}
1125 	lxb_html_document_destroy(document);
1126 
1127 	dom_post_process_html5_loading(lxml_doc, options, &ctx.observations);
1128 
1129 	if (decoding_encoding_ctx.decode_data) {
1130 		lxml_doc->encoding = xmlStrdup((const xmlChar *) decoding_encoding_ctx.decode_data->name);
1131 	} else {
1132 		lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
1133 	}
1134 
1135 	if (stream->wrapper == &php_plain_files_wrapper && opened_path != NULL) {
1136 		xmlChar *converted = xmlPathToURI((const xmlChar *) ZSTR_VAL(opened_path));
1137 		if (UNEXPECTED(!converted)) {
1138 			goto fail_oom;
1139 		}
1140 		/* Check for "file:/" instead of "file://" because of libxml2 quirk */
1141 		if (strncmp((const char *) converted, "file:/", sizeof("file:/") - 1) != 0) {
1142 			xmlChar *buffer = xmlStrdup((const xmlChar *) "file://");
1143 			if (UNEXPECTED(!buffer)) {
1144 				xmlFree(converted);
1145 				goto fail_oom;
1146 			}
1147 			xmlChar *new_buffer = xmlStrcat(buffer, converted);
1148 			if (UNEXPECTED(!new_buffer)) {
1149 				xmlFree(buffer);
1150 				xmlFree(converted);
1151 				goto fail_oom;
1152 			}
1153 			xmlFree(converted);
1154 			lxml_doc->URL = new_buffer;
1155 		} else {
1156 #ifdef PHP_WIN32
1157 			converted = php_dom_libxml_fix_file_path(converted);
1158 #endif
1159 			lxml_doc->URL = converted;
1160 		}
1161 	} else {
1162 		lxml_doc->URL = xmlStrdup((const xmlChar *) filename);
1163 	}
1164 
1165 	if (opened_path != NULL) {
1166 		zend_string_release_ex(opened_path, false);
1167 	}
1168 	php_stream_close(stream);
1169 	stream = NULL;
1170 
1171 	dom_object *intern = php_dom_instantiate_object_helper(
1172 		return_value,
1173 		dom_html_document_class_entry,
1174 		(xmlNodePtr) lxml_doc,
1175 		NULL
1176 	);
1177 	dom_set_xml_class(intern->document);
1178 	intern->document->quirks_mode = ctx.observations.quirks_mode;
1179 	intern->document->private_data = php_dom_libxml_private_data_header(private_data);
1180 	return;
1181 
1182 fail_oom:
1183 	php_dom_throw_error(INVALID_STATE_ERR, true);
1184 fail_general:
1185 	if (private_data != NULL) {
1186 		php_dom_private_data_destroy(private_data);
1187 	}
1188 	lxb_html_document_destroy(document);
1189 	php_stream_close(stream);
1190 	if (opened_path != NULL) {
1191 		zend_string_release_ex(opened_path, false);
1192 	}
1193 }
1194 
dom_write_output_smart_str(void * ctx,const char * buf,size_t size)1195 static zend_result dom_write_output_smart_str(void *ctx, const char *buf, size_t size)
1196 {
1197 	smart_str_appendl((smart_str *) ctx, buf, size);
1198 	return SUCCESS;
1199 }
1200 
dom_write_output_stream(void * application_data,const char * buf,size_t len)1201 static zend_result dom_write_output_stream(void *application_data, const char *buf, size_t len)
1202 {
1203 	php_stream *stream = (php_stream *) application_data;
1204 	if (UNEXPECTED(php_stream_write(stream, buf, len) < 0)) {
1205 		return FAILURE;
1206 	}
1207 	return SUCCESS;
1208 }
1209 
/* Fast path when the output encoding is UTF-8.
 *
 * The input is scanned one code point at a time with the UTF-8 decoder. Runs of bytes that decode
 * fine are forwarded unchanged to output->write_output; a sequence for which the decoder yields an
 * error value is not forwarded, and LXB_ENCODING_REPLACEMENT_BYTES is written in its place.
 * If the buffer ends while the decoder still wants more input, the decoder status is set to
 * LXB_STATUS_CONTINUE and the function returns successfully.
 *
 * Returns FAILURE as soon as write_output fails, SUCCESS otherwise. */
static zend_result dom_saveHTML_write_string_len_utf8_output(void *application_data, const char *buf, size_t len)
{
	dom_output_ctx *output = (dom_output_ctx *) application_data;

	/* Clear the status a previous call may have left behind (see the LXB_STATUS_CONTINUE case below). */
	output->decode->status = LXB_STATUS_OK;

	const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
	/* Start of the bytes that have been scanned but not yet written. */
	const lxb_char_t *last_output = buf_ref;
	const lxb_char_t *buf_end = buf_ref + len;

	while (buf_ref != buf_end) {
		/* Remember where this code point started, so the pending run can be cut off right before it. */
		const lxb_char_t *buf_ref_backup = buf_ref;
		lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(output->decode, &buf_ref, buf_end);
		/* Values above the maximum code point are the decoder's special (error / continue) results. */
		if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {
			/* Write out the pending run that precedes the problematic sequence. */
			if (UNEXPECTED(output->write_output(
				output->output_data,
				(const char *) last_output,
				buf_ref_backup - last_output
			) != SUCCESS)) {
				return FAILURE;
			}

			if (codepoint == LXB_ENCODING_DECODE_CONTINUE) {
				ZEND_ASSERT(buf_ref == buf_end);
				/* The decoder needs more data but the entire buffer is consumed.
				 * All valid data is outputted, and if the remaining data for the code point
				 * is invalid, the next call will output the replacement bytes. */
				output->decode->status = LXB_STATUS_CONTINUE;
				return SUCCESS;
			}

			if (UNEXPECTED(output->write_output(
				output->output_data,
				(const char *) LXB_ENCODING_REPLACEMENT_BYTES,
				LXB_ENCODING_REPLACEMENT_SIZE
			) != SUCCESS)) {
				return FAILURE;
			}

			/* The next pending run starts at the current decoder position. */
			last_output = buf_ref;
		}
	}

	/* Write the final pending run, if any. */
	if (buf_ref != last_output) {
		if (UNEXPECTED(output->write_output(
			output->output_data,
			(const char *) last_output,
			buf_ref - last_output
		) != SUCCESS)) {
			return FAILURE;
		}
	}

	return SUCCESS;
}
1266 
dom_saveHTML_write_string_utf8_output(void * application_data,const char * buf)1267 static zend_result dom_saveHTML_write_string_utf8_output(void *application_data, const char *buf)
1268 {
1269 	return dom_saveHTML_write_string_len_utf8_output(application_data, buf, strlen(buf));
1270 }
1271 
dom_saveHTML_write_string_len(void * application_data,const char * buf,size_t len)1272 static zend_result dom_saveHTML_write_string_len(void *application_data, const char *buf, size_t len)
1273 {
1274 	dom_output_ctx *output = (dom_output_ctx *) application_data;
1275 	lxb_status_t decode_status, encode_status;
1276 	const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
1277 	const lxb_char_t *buf_end = buf_ref + len;
1278 
1279 	do {
1280 		decode_status = lxb_encoding_decode_utf_8(output->decode, &buf_ref, buf_end);
1281 
1282 		const lxb_codepoint_t *codepoints_ref = output->codepoints;
1283 		const lxb_codepoint_t *codepoints_end = codepoints_ref + lxb_encoding_decode_buf_used(output->decode);
1284 		do {
1285 			encode_status = output->encoding_data->encode(output->encode, &codepoints_ref, codepoints_end);
1286 			if (UNEXPECTED(output->write_output(
1287 				output->output_data,
1288 				(const char *) output->encoding_output,
1289 				lxb_encoding_encode_buf_used(output->encode)
1290 			) != SUCCESS)) {
1291 				return FAILURE;
1292 			}
1293 			lxb_encoding_encode_buf_used_set(output->encode, 0);
1294 		} while (encode_status == LXB_STATUS_SMALL_BUFFER);
1295 		lxb_encoding_decode_buf_used_set(output->decode, 0);
1296 	} while (decode_status == LXB_STATUS_SMALL_BUFFER);
1297 
1298 	return SUCCESS;
1299 }
1300 
dom_saveHTML_write_string(void * application_data,const char * buf)1301 static zend_result dom_saveHTML_write_string(void *application_data, const char *buf)
1302 {
1303 	return dom_saveHTML_write_string_len(application_data, buf, strlen(buf));
1304 }
1305 
dom_common_save(dom_output_ctx * output_ctx,dom_object * intern,const xmlDoc * docp,const xmlNode * node)1306 static zend_result dom_common_save(dom_output_ctx *output_ctx, dom_object *intern, const xmlDoc *docp, const xmlNode *node)
1307 {
1308 	/* Initialize everything related to encoding & decoding */
1309 	const lxb_encoding_data_t *decoding_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
1310 	const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
1311 		(const lxb_char_t *) docp->encoding,
1312 		strlen((const char *) docp->encoding)
1313 	);
1314 	lxb_encoding_encode_t encode;
1315 	lxb_encoding_decode_t decode;
1316 	lxb_char_t encoding_output[4096];
1317 	lxb_codepoint_t codepoints[4096];
1318 	(void) lxb_encoding_encode_init(&encode, encoding_data, encoding_output, sizeof(encoding_output) / sizeof(*encoding_output));
1319 	(void) lxb_encoding_decode_init(&decode, decoding_data, codepoints, sizeof(codepoints) / sizeof(*codepoints));
1320 	if (encoding_data->encoding == LXB_ENCODING_UTF_8) {
1321 		lxb_encoding_encode_replace_set(&encode, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
1322 	} else {
1323 		/* Fallback if there is no replacement by default */
1324 		lxb_encoding_encode_replace_set(&encode, (const lxb_char_t *) "?", 1);
1325 	}
1326 	lxb_encoding_decode_replace_set(&decode, LXB_ENCODING_REPLACEMENT_BUFFER, LXB_ENCODING_REPLACEMENT_BUFFER_LEN);
1327 
1328 	output_ctx->encoding_data = encoding_data;
1329 	output_ctx->decoding_data = decoding_data;
1330 	output_ctx->encode = &encode;
1331 	output_ctx->decode = &decode;
1332 	output_ctx->codepoints = codepoints;
1333 	output_ctx->encoding_output = encoding_output;
1334 
1335 	dom_html5_serialize_context ctx;
1336 	if (encoding_data->encoding == LXB_ENCODING_UTF_8) {
1337 		/* Fast path */
1338 		ctx.write_string_len = dom_saveHTML_write_string_len_utf8_output;
1339 		ctx.write_string = dom_saveHTML_write_string_utf8_output;
1340 	} else {
1341 		/* Slow path */
1342 		ctx.write_string_len = dom_saveHTML_write_string_len;
1343 		ctx.write_string = dom_saveHTML_write_string;
1344 	}
1345 	ctx.application_data = output_ctx;
1346 	ctx.private_data = php_dom_get_private_data(intern);
1347 	if (UNEXPECTED(dom_html5_serialize_outer(&ctx, node) != SUCCESS)) {
1348 		return FAILURE;
1349 	}
1350 
1351 	(void) lxb_encoding_decode_finish(&decode);
1352 	if (lxb_encoding_decode_buf_used(&decode)) {
1353 		const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) codepoints;
1354 		(void) encoding_data->encode(&encode, &codepoints_ref, codepoints_ref + lxb_encoding_decode_buf_used(&decode));
1355 		if (UNEXPECTED(output_ctx->write_output(
1356 			output_ctx->output_data,
1357 			(const char *) encoding_output,
1358 			lxb_encoding_encode_buf_used(&encode)) != SUCCESS
1359 		)) {
1360 			return FAILURE;
1361 		}
1362 	}
1363 	(void) lxb_encoding_encode_finish(&encode);
1364 	if (lxb_encoding_encode_buf_used(&encode)) {
1365 		if (UNEXPECTED(output_ctx->write_output(
1366 			output_ctx->output_data,
1367 			(const char *) encoding_output,
1368 			lxb_encoding_encode_buf_used(&encode)) != SUCCESS
1369 		)) {
1370 			return FAILURE;
1371 		}
1372 	}
1373 
1374 	return SUCCESS;
1375 }
1376 
PHP_METHOD(Dom_HTMLDocument,saveHtmlFile)1377 PHP_METHOD(Dom_HTMLDocument, saveHtmlFile)
1378 {
1379 	zval *id;
1380 	xmlDoc *docp;
1381 	size_t file_len;
1382 	dom_object *intern;
1383 	char *file;
1384 
1385 	id = ZEND_THIS;
1386 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "p", &file, &file_len) == FAILURE) {
1387 		RETURN_THROWS();
1388 	}
1389 
1390 	if (file_len == 0) {
1391 		zend_argument_must_not_be_empty_error(1);
1392 		RETURN_THROWS();
1393 	}
1394 
1395 	php_stream *stream = php_stream_open_wrapper_ex(file, "wb", REPORT_ERRORS, /* opened_path */ NULL, php_libxml_get_stream_context());
1396 	if (!stream) {
1397 		RETURN_FALSE;
1398 	}
1399 
1400 	DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
1401 
1402 	dom_output_ctx output_ctx;
1403 	output_ctx.output_data = stream;
1404 	output_ctx.write_output = dom_write_output_stream;
1405 	if (UNEXPECTED(dom_common_save(&output_ctx, intern, docp, (const xmlNode *) docp) != SUCCESS)) {
1406 		php_stream_close(stream);
1407 		RETURN_FALSE;
1408 	}
1409 
1410 	zend_long bytes = php_stream_tell(stream);
1411 	php_stream_close(stream);
1412 
1413 	RETURN_LONG(bytes);
1414 }
1415 
PHP_METHOD(Dom_HTMLDocument,saveHtml)1416 PHP_METHOD(Dom_HTMLDocument, saveHtml)
1417 {
1418 	zval *nodep = NULL;
1419 	const xmlDoc *docp;
1420 	const xmlNode *node;
1421 	dom_object *intern, *nodeobj;
1422 
1423 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|O!", &nodep, dom_modern_node_class_entry) == FAILURE) {
1424 		RETURN_THROWS();
1425 	}
1426 
1427 	DOM_GET_OBJ(docp, ZEND_THIS, xmlDocPtr, intern);
1428 
1429 	if (nodep != NULL) {
1430 		DOM_GET_OBJ(node, nodep, xmlNodePtr, nodeobj);
1431 		if (node->doc != docp) {
1432 			php_dom_throw_error(WRONG_DOCUMENT_ERR, true);
1433 			RETURN_THROWS();
1434 		}
1435 	} else {
1436 		node = (const xmlNode *) docp;
1437 	}
1438 
1439 	smart_str buf = {0};
1440 	dom_output_ctx output_ctx;
1441 	output_ctx.output_data = &buf;
1442 	output_ctx.write_output = dom_write_output_smart_str;
1443 	/* Can't fail because dom_write_output_smart_str() can't fail. */
1444 	zend_result result = dom_common_save(&output_ctx, intern, docp, node);
1445 	ZEND_ASSERT(result == SUCCESS);
1446 
1447 	RETURN_STR(smart_str_extract(&buf));
1448 }
1449 
dom_html_document_encoding_write(dom_object * obj,zval * newval)1450 zend_result dom_html_document_encoding_write(dom_object *obj, zval *newval)
1451 {
1452 	DOM_PROP_NODE(xmlDocPtr, docp, obj);
1453 
1454 	/* Typed property, can only be IS_STRING. */
1455 	ZEND_ASSERT(Z_TYPE_P(newval) == IS_STRING);
1456 
1457 	zend_string *str = Z_STR_P(newval);
1458 	const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) ZSTR_VAL(str), ZSTR_LEN(str));
1459 
1460 	if (encoding_data != NULL) {
1461 		xmlFree(BAD_CAST docp->encoding);
1462 		docp->encoding = xmlStrdup((const xmlChar *) encoding_data->name);
1463 	} else {
1464 		zend_value_error("Invalid document encoding");
1465 		return FAILURE;
1466 	}
1467 
1468 	return SUCCESS;
1469 }
1470 
dom_html_document_element_read_raw(const xmlDoc * docp,bool (* accept)(const xmlChar *))1471 static xmlNodePtr dom_html_document_element_read_raw(const xmlDoc *docp, bool (*accept)(const xmlChar *))
1472 {
1473 	const xmlNode *root = xmlDocGetRootElement(docp);
1474 	if (root == NULL || !(php_dom_ns_is_fast(root, php_dom_ns_is_html_magic_token) && xmlStrEqual(root->name, BAD_CAST "html"))) {
1475 		return NULL;
1476 	}
1477 
1478 	xmlNodePtr cur = root->children;
1479 	while (cur != NULL) {
1480 		if (cur->type == XML_ELEMENT_NODE && php_dom_ns_is_fast(cur, php_dom_ns_is_html_magic_token) && accept(cur->name)) {
1481 			return cur;
1482 		}
1483 		cur = cur->next;
1484 	}
1485 
1486 	return NULL;
1487 }
1488 
dom_html_document_element_read_helper(dom_object * obj,zval * retval,bool (* accept)(const xmlChar *))1489 zend_result dom_html_document_element_read_helper(dom_object *obj, zval *retval, bool (*accept)(const xmlChar *))
1490 {
1491 	DOM_PROP_NODE(const xmlDoc *, docp, obj);
1492 
1493 	const xmlNode *element = dom_html_document_element_read_raw(docp, accept);
1494 	php_dom_create_nullable_object((xmlNodePtr) element, retval, obj);
1495 
1496 	return SUCCESS;
1497 }
1498 
dom_accept_body_name(const xmlChar * name)1499 static bool dom_accept_body_name(const xmlChar *name)
1500 {
1501 	return xmlStrEqual(name, BAD_CAST "body") || xmlStrEqual(name, BAD_CAST "frameset");
1502 }
1503 
dom_accept_head_name(const xmlChar * name)1504 static bool dom_accept_head_name(const xmlChar *name)
1505 {
1506 	return xmlStrEqual(name, BAD_CAST "head");
1507 }
1508 
1509 /* https://html.spec.whatwg.org/#dom-document-body */
dom_html_document_body_read(dom_object * obj,zval * retval)1510 zend_result dom_html_document_body_read(dom_object *obj, zval *retval)
1511 {
1512 	return dom_html_document_element_read_helper(obj, retval, dom_accept_body_name);
1513 }
1514 
1515 /* https://html.spec.whatwg.org/#dom-document-head */
dom_html_document_head_read(dom_object * obj,zval * retval)1516 zend_result dom_html_document_head_read(dom_object *obj, zval *retval)
1517 {
1518 	return dom_html_document_element_read_helper(obj, retval, dom_accept_head_name);
1519 }
1520 
1521 /* https://html.spec.whatwg.org/#dom-document-body */
dom_html_document_body_write(dom_object * obj,zval * newval)1522 zend_result dom_html_document_body_write(dom_object *obj, zval *newval)
1523 {
1524 	DOM_PROP_NODE(xmlDocPtr, docp, obj);
1525 
1526 	/* 1. If the new value is not a body or frameset element, then throw a "HierarchyRequestError" DOMException. */
1527 	if (Z_TYPE_P(newval) != IS_NULL) {
1528 		dom_object *newval_intern = Z_DOMOBJ_P(newval);
1529 		if (newval_intern->ptr != NULL) {
1530 			xmlNodePtr newval_node = ((php_libxml_node_ptr *) newval_intern->ptr)->node;
1531 			/* Note: because this property has type HTMLElement, we know the namespace is correct. */
1532 			if (dom_accept_body_name(newval_node->name)) {
1533 				/* 2. If the new value is the same as the body element, return. */
1534 				const xmlNode *current_body_element = dom_html_document_element_read_raw(docp, dom_accept_body_name);
1535 				if (current_body_element == newval_node) {
1536 					return SUCCESS;
1537 				}
1538 
1539 				/* 3. If the body element is not null, then replace the body element with the new value within the body element's parent and return. */
1540 				if (current_body_element != NULL) {
1541 					php_dom_adopt_node(newval_node, obj, docp);
1542 					xmlNodePtr old = xmlReplaceNode((xmlNodePtr) current_body_element, newval_node);
1543 					if (old != NULL && old->_private == NULL) {
1544 						php_libxml_node_free_resource(old);
1545 					}
1546 					return SUCCESS;
1547 				}
1548 
1549 				/* 4. If there is no document element, throw a "HierarchyRequestError" DOMException. */
1550 				xmlNodePtr root = xmlDocGetRootElement(docp);
1551 				if (root == NULL) {
1552 					php_dom_throw_error_with_message(HIERARCHY_REQUEST_ERR, "A body can only be set if there is a document element", true);
1553 					return FAILURE;
1554 				}
1555 
1556 				/* 5. Append the new value to the document element. */
1557 				php_dom_adopt_node(newval_node, obj, docp);
1558 				xmlAddChild(root, newval_node);
1559 				return SUCCESS;
1560 			}
1561 		}
1562 	}
1563 
1564 	php_dom_throw_error_with_message(HIERARCHY_REQUEST_ERR, "The new body must either be a body or a frameset tag", true);
1565 	return FAILURE;
1566 }
1567 
1568 /* https://dom.spec.whatwg.org/#concept-child-text-content */
dom_get_child_text_content(const xmlNode * node)1569 static zend_string *dom_get_child_text_content(const xmlNode *node)
1570 {
1571 	smart_str content = {0};
1572 
1573 	const xmlNode *text = node->children;
1574 	while (text != NULL) {
1575 		if ((text->type == XML_TEXT_NODE || text->type == XML_CDATA_SECTION_NODE) && text->content != NULL) {
1576 			smart_str_appends(&content, (const char *) text->content);
1577 		}
1578 		text = text->next;
1579 	}
1580 
1581 	return smart_str_extract(&content);
1582 }
1583 
1584 /* https://html.spec.whatwg.org/#the-title-element-2 */
dom_get_title_element(const xmlDoc * doc)1585 static xmlNodePtr dom_get_title_element(const xmlDoc *doc)
1586 {
1587 	xmlNodePtr node = doc->children;
1588 
1589 	while (node != NULL) {
1590 		if (node->type == XML_ELEMENT_NODE) {
1591 			if (php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token) && xmlStrEqual(node->name, BAD_CAST "title")) {
1592 				break;
1593 			}
1594 		}
1595 
1596 		node = php_dom_next_in_tree_order(node, NULL);
1597 	}
1598 
1599 	return node;
1600 }
1601 
1602 /* The subtle difference is that this is about the direct title descendant of the svg element,
1603  * whereas the html variant of this function is about the first in-tree title element. */
dom_get_svg_title_element(xmlNodePtr svg)1604 static xmlNodePtr dom_get_svg_title_element(xmlNodePtr svg)
1605 {
1606 	xmlNodePtr cur = svg->children;
1607 
1608 	while (cur != NULL) {
1609 		if (cur->type == XML_ELEMENT_NODE
1610 			&& php_dom_ns_is_fast(cur, php_dom_ns_is_svg_magic_token) && xmlStrEqual(cur->name, BAD_CAST "title")) {
1611 			break;
1612 		}
1613 		cur = cur->next;
1614 	}
1615 
1616 	return cur;
1617 }
1618 
1619 /* https://html.spec.whatwg.org/#document.title */
dom_html_document_title_read(dom_object * obj,zval * retval)1620 zend_result dom_html_document_title_read(dom_object *obj, zval *retval)
1621 {
1622 	DOM_PROP_NODE(const xmlDoc *, docp, obj);
1623 	xmlNodePtr root = xmlDocGetRootElement(docp);
1624 
1625 	if (root == NULL) {
1626 		ZVAL_EMPTY_STRING(retval);
1627 		return SUCCESS;
1628 	}
1629 
1630 	zend_string *value = zend_empty_string;
1631 
1632 	/* 1. If the document element is an SVG svg element,
1633 	 *    then let value be the child text content of the first SVG title element that is a child of the document element. */
1634 	if (php_dom_ns_is_fast(root, php_dom_ns_is_svg_magic_token) && xmlStrEqual(root->name, BAD_CAST "svg")) {
1635 		const xmlNode *title = dom_get_svg_title_element(root);
1636 		if (title != NULL) {
1637 			value = dom_get_child_text_content(title);
1638 		}
1639 	} else {
1640 		/* 2. Otherwise, let value be the child text content of the title element,
1641 		 *    or the empty string if the title element is null. */
1642 		const xmlNode *title = dom_get_title_element(docp);
1643 		if (title != NULL) {
1644 			value = dom_get_child_text_content(title);
1645 		}
1646 	}
1647 
1648 	/* 3. Strip and collapse ASCII whitespace in value. */
1649 	value = dom_strip_and_collapse_ascii_whitespace(value);
1650 
1651 	/* 4. Return value. */
1652 	ZVAL_STR(retval, value);
1653 
1654 	return SUCCESS;
1655 }
1656 
/* https://dom.spec.whatwg.org/#string-replace-all
 * Removes every child of `element` and, if the string held by `zv` is non-empty, appends a
 * single new text node with that string. `zv` is read through Z_STRVAL_P, so it must hold a string. */
static void dom_string_replace_all(xmlDocPtr docp, xmlNodePtr element, zval *zv)
{
	dom_remove_all_children(element);

	/* "If string is not the empty string, then set node to a new Text node":
	 * for an empty value the element must end up without children rather than with an
	 * empty text node. The value is handed to libxml2 as a C string, so emptiness is
	 * judged the same way (by its first byte). */
	if (*Z_STRVAL_P(zv) == '\0') {
		return;
	}

	xmlNodePtr text = xmlNewDocText(docp, BAD_CAST Z_STRVAL_P(zv));
	/* Node creation can fail; in that case there is nothing to append. */
	if (text != NULL) {
		xmlAddChild(element, text);
	}
}
1663 
1664 /* https://html.spec.whatwg.org/#document.title */
dom_html_document_title_write(dom_object * obj,zval * newval)1665 zend_result dom_html_document_title_write(dom_object *obj, zval *newval)
1666 {
1667 	DOM_PROP_NODE(xmlDocPtr, docp, obj);
1668 	xmlNodePtr root = xmlDocGetRootElement(docp);
1669 
1670 	if (root == NULL) {
1671 		return SUCCESS;
1672 	}
1673 
1674 	/* If the document element is an SVG svg element */
1675 	if (php_dom_ns_is_fast(root, php_dom_ns_is_svg_magic_token) && xmlStrEqual(root->name, BAD_CAST "svg")) {
1676 		/* 1. If there is an SVG title element that is a child of the document element, let element be the first such element. */
1677 		xmlNodePtr element = dom_get_svg_title_element(root);
1678 
1679 		/* 2. Otherwise: */
1680 		if (element == NULL) {
1681 			/* 2.1. Let element be the result of creating an element given the document element's node document,
1682 			 *      title, and the SVG namespace. */
1683 
1684 			/* Annoyingly, we must create it in the svg namespace _without_ prefix... */
1685 			xmlNsPtr ns = root->ns;
1686 			if (ns->prefix != NULL) {
1687 				/* Slow path... */
1688 				php_dom_libxml_ns_mapper *ns_mapper = php_dom_get_ns_mapper(obj);
1689 				zend_string *href = ZSTR_INIT_LITERAL(DOM_SVG_NS_URI, false);
1690 				ns = php_dom_libxml_ns_mapper_get_ns(ns_mapper, zend_empty_string, href);
1691 				zend_string_release_ex(href, false);
1692 			}
1693 
1694 			element = xmlNewDocNode(docp, ns, BAD_CAST "title", NULL);
1695 			if (UNEXPECTED(element == NULL)) {
1696 				php_dom_throw_error(INVALID_STATE_ERR, true);
1697 				return FAILURE;
1698 			}
1699 
1700 			/* 2.2. Insert element as the first child of the document element. */
1701 			if (root->children == NULL) {
1702 				root->last = element;
1703 			} else {
1704 				element->next = root->children;
1705 				root->children->prev = element;
1706 			}
1707 			root->children = element;
1708 			element->parent = root;
1709 		}
1710 
1711 		/* 3. String replace all with the given value within element. */
1712 		dom_string_replace_all(docp, element, newval);
1713 	}
1714 	/* If the document element is in the HTML namespace */
1715 	else if (php_dom_ns_is_fast(root, php_dom_ns_is_html_magic_token)) {
1716 		/* 1. If the title element is null and the head element is null, then return. */
1717 		xmlNodePtr title = dom_get_title_element(docp);
1718 		xmlNodePtr head = dom_html_document_element_read_raw(docp, dom_accept_head_name);
1719 		if (title == NULL && head == NULL) {
1720 			return SUCCESS;
1721 		}
1722 
1723 		/* 2. If the title element is non-null, let element be the title element. */
1724 		xmlNodePtr element = title;
1725 
1726 		/* 3. Otherwise: */
1727 		if (element == NULL) {
1728 			/* 3.1. Let element be the result of creating an element given the document element's node document, title,
1729 			 *      and the HTML namespace. */
1730 			php_dom_libxml_ns_mapper *ns_mapper = php_dom_get_ns_mapper(obj);
1731 			element = xmlNewDocNode(docp, php_dom_libxml_ns_mapper_ensure_html_ns(ns_mapper), BAD_CAST "title", NULL);
1732 			if (UNEXPECTED(element == NULL)) {
1733 				php_dom_throw_error(INVALID_STATE_ERR, true);
1734 				return FAILURE;
1735 			}
1736 
1737 			/* 3.2. Append element to the head element. */
1738 			xmlAddChild(head, element);
1739 		}
1740 
1741 		/* 4. String replace all with the given value within element. */
1742 		dom_string_replace_all(docp, element, newval);
1743 	}
1744 
1745 	return SUCCESS;
1746 }
1747 
1748 #if ZEND_DEBUG
PHP_METHOD(Dom_HTMLDocument,debugGetTemplateCount)1749 PHP_METHOD(Dom_HTMLDocument, debugGetTemplateCount)
1750 {
1751 	xmlDocPtr doc;
1752 	dom_object *intern;
1753 
1754 	ZEND_PARSE_PARAMETERS_NONE();
1755 
1756 	DOM_GET_OBJ(doc, ZEND_THIS, xmlDocPtr, intern);
1757 	ZEND_IGNORE_VALUE(doc);
1758 
1759 	RETURN_LONG((zend_long) php_dom_get_template_count((const php_dom_private_data *) intern->document->private_data));
1760 }
1761 #endif
1762 
1763 #endif  /* HAVE_LIBXML && HAVE_DOM */
1764