xref: /PHP-8.2/ext/dom/html_document.c (revision 2f1fe320)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Authors: Niels Dossche <nielsdos@php.net>                            |
14    +----------------------------------------------------------------------+
15 */
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "php_dom.h"
24 #include "html5_parser.h"
25 #include "html5_serializer.h"
26 #include "namespace_compat.h"
27 #include <Zend/zend_smart_string.h>
28 #include <lexbor/html/encoding.h>
29 #include <lexbor/encoding/encoding.h>
30 
/* Encoding chosen when the encoding detection below finds neither a BOM nor a usable prescan result.
 * Implementation defined, but as HTML5 defaults in all other cases to UTF-8, we'll do the same. */
#define DOM_FALLBACK_ENCODING_NAME "UTF-8"
#define DOM_FALLBACK_ENCODING_ID LXB_ENCODING_UTF_8
34 
/* Incremental line/column tracker so that successive error positions within one input
 * chunk do not require rescanning the chunk from its start. */
typedef struct _dom_line_column_cache {
	size_t last_line;   /* Line of the last computed position; starts at 1. */
	size_t last_column; /* Column of the last computed position; starts at 1, reset to 1 after a line feed. */
	size_t last_offset; /* Index into the current input chunk up to which line/column were computed. */
} dom_line_column_cache;
40 
/* State shared with the error reporter callbacks (passed to them as an opaque pointer). */
typedef struct _dom_lexbor_libxml2_bridge_application_data {
	const char *input_name;                         /* Name of the input, used in the error messages. */
	const lxb_codepoint_t *current_input_codepoints; /* Current chunk as codepoints; NULL when the UTF-8 fast path is used. */
	const char *current_input_characters;           /* Current chunk as UTF-8 bytes; NULL when the codepoint path is used. */
	size_t current_input_length;                    /* Length of the current chunk, set before each chunk is parsed. */
	size_t current_total_offset;                    /* Sum of the input lengths of all previously processed chunks. */
	dom_line_column_cache cache_tokenizer;          /* Line/column cache used for tokenizer error positions. */
	bool html_no_implied;                           /* When set, the missing-doctype tree error on line 1 is not reported. */
} dom_lexbor_libxml2_bridge_application_data;
50 
/* Result of the encoding detection performed by dom_determine_encoding(). */
typedef struct _dom_character_encoding_data {
	const lxb_encoding_data_t *encoding_data; /* Lexbor descriptor of the detected (or fallback) encoding. */
	size_t bom_shift;                         /* Number of leading BOM bytes to skip: 3, 2, or 0 when no BOM was found. */
} dom_character_encoding_data;
55 
/* Output callback: receives an opaque pointer, a character buffer and a length, and reports success/failure.
 * NOTE(review): presumably the opaque pointer is dom_output_ctx::output_data and the size_t is the
 * byte length of the buffer — the callers are outside this part of the file, confirm there. */
typedef zend_result (*dom_write_output)(void*, const char *, size_t);
57 
/* Bundles the encoding/decoding state and the output callback used when producing output.
 * NOTE(review): the users of this struct are outside this part of the file; the per-field
 * descriptions below are derived from the field names and types only — confirm against the callers. */
typedef struct _dom_output_ctx {
	const lxb_encoding_data_t *encoding_data; /* Descriptor of the encoding to encode to (presumably the output encoding). */
	const lxb_encoding_data_t *decoding_data; /* Descriptor of the encoding to decode from. */
	lxb_encoding_encode_t *encode;            /* Encoder state. */
	lxb_encoding_decode_t *decode;            /* Decoder state. */
	lxb_codepoint_t *codepoints;              /* Codepoint buffer (presumably the decoder's output buffer). */
	lxb_char_t *encoding_output;              /* Byte buffer (presumably the encoder's output buffer). */
	void *output_data;                        /* Opaque pointer, presumably handed to write_output. */
	dom_write_output write_output;            /* Callback that writes a piece of output. */
} dom_output_ctx;
68 
/* State for converting the input from its source encoding into the UTF-8 that is fed to the parser. */
typedef struct _dom_decoding_encoding_ctx {
	/* We can skip some conversion if the input and output encoding are both UTF-8,
	 * we only have to validate and substitute replacement characters */
	bool fast_path; /* Put first, near the encode & decode structures, for cache locality */
	lxb_encoding_encode_t encode;           /* Encoder state; writes into encoding_output. */
	lxb_encoding_decode_t decode;           /* Decoder state; writes into codepoints. */
	const lxb_encoding_data_t *encode_data; /* Descriptor of the target encoding (UTF-8). */
	const lxb_encoding_data_t *decode_data; /* Descriptor of the source encoding; NULL until the parser encoding is set up. */
	lxb_char_t encoding_output[4096];       /* Output buffer of the encoder. */
	lxb_codepoint_t codepoints[4096];       /* Output buffer of the decoder. */
} dom_decoding_encoding_ctx;
80 
/* Prepares a conversion context: the encoder is configured to emit UTF-8 into
 * ctx->encoding_output (with the replacement bytes registered), while the decoder
 * side is left unset until the source encoding is known. */
static void dom_decoding_encoding_ctx_init(dom_decoding_encoding_ctx *ctx)
{
	const lxb_encoding_data_t *utf8_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
	const size_t output_capacity = sizeof(ctx->encoding_output) / sizeof(ctx->encoding_output[0]);

	/* The fast path is the default so that finishing the decoder is skipped
	 * when the decoder was never initialised properly. */
	ctx->fast_path = true;
	ctx->decode_data = NULL;
	ctx->encode_data = utf8_data;

	(void) lxb_encoding_encode_init(&ctx->encode, utf8_data, ctx->encoding_output, output_capacity);
	(void) lxb_encoding_encode_replace_set(&ctx->encode, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
}
95 
/* Translates a Lexbor tokenizer error id into the dash-separated parse error name
 * used in the error messages. Ids without a mapping yield "unknown error". */
static const char *dom_lexbor_tokenizer_error_code_to_string(lxb_html_tokenizer_error_id_t id)
{
	const char *name = "unknown error";

	switch (id) {
		case LXB_HTML_TOKENIZER_ERROR_ABCLOFEMCO:
			name = "abrupt-closing-of-empty-comment";
			break;
		case LXB_HTML_TOKENIZER_ERROR_ABDOPUID:
			name = "abrupt-doctype-public-identifier";
			break;
		case LXB_HTML_TOKENIZER_ERROR_ABDOSYID:
			name = "abrupt-doctype-system-identifier";
			break;
		case LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE:
			name = "absence-of-digits-in-numeric-character-reference";
			break;
		case LXB_HTML_TOKENIZER_ERROR_CDINHTCO:
			name = "cdata-in-html-content";
			break;
		case LXB_HTML_TOKENIZER_ERROR_CHREOUUNRA:
			name = "character-reference-outside-unicode-range";
			break;
		case LXB_HTML_TOKENIZER_ERROR_COCHININST:
			name = "control-character-in-input-stream";
			break;
		case LXB_HTML_TOKENIZER_ERROR_COCHRE:
			name = "control-character-reference";
			break;
		case LXB_HTML_TOKENIZER_ERROR_ENTAWIAT:
			name = "end-tag-with-attributes";
			break;
		case LXB_HTML_TOKENIZER_ERROR_DUAT:
			name = "duplicate-attribute";
			break;
		case LXB_HTML_TOKENIZER_ERROR_ENTAWITRSO:
			name = "end-tag-with-trailing-solidus";
			break;
		case LXB_HTML_TOKENIZER_ERROR_EOBETANA:
			name = "eof-before-tag-name";
			break;
		case LXB_HTML_TOKENIZER_ERROR_EOINCD:
			name = "eof-in-cdata";
			break;
		case LXB_HTML_TOKENIZER_ERROR_EOINCO:
			name = "eof-in-comment";
			break;
		case LXB_HTML_TOKENIZER_ERROR_EOINDO:
			name = "eof-in-doctype";
			break;
		case LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE:
			name = "eof-in-script-html-comment-like-text";
			break;
		case LXB_HTML_TOKENIZER_ERROR_EOINTA:
			name = "eof-in-tag";
			break;
		case LXB_HTML_TOKENIZER_ERROR_INCLCO:
			name = "incorrectly-closed-comment";
			break;
		case LXB_HTML_TOKENIZER_ERROR_INOPCO:
			name = "incorrectly-opened-comment";
			break;
		case LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA:
			name = "invalid-character-sequence-after-doctype-name";
			break;
		case LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA:
			name = "invalid-first-character-of-tag-name";
			break;
		case LXB_HTML_TOKENIZER_ERROR_MIATVA:
			name = "missing-attribute-value";
			break;
		case LXB_HTML_TOKENIZER_ERROR_MIDONA:
			name = "missing-doctype-name";
			break;
		case LXB_HTML_TOKENIZER_ERROR_MIDOPUID:
			name = "missing-doctype-public-identifier";
			break;
		case LXB_HTML_TOKENIZER_ERROR_MIDOSYID:
			name = "missing-doctype-system-identifier";
			break;
		case LXB_HTML_TOKENIZER_ERROR_MIENTANA:
			name = "missing-end-tag-name";
			break;
		case LXB_HTML_TOKENIZER_ERROR_MIQUBEDOPUID:
			name = "missing-quote-before-doctype-public-identifier";
			break;
		case LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID:
			name = "missing-quote-before-doctype-system-identifier";
			break;
		case LXB_HTML_TOKENIZER_ERROR_MISEAFCHRE:
			name = "missing-semicolon-after-character-reference";
			break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHAFDOPUKE:
			name = "missing-whitespace-after-doctype-public-keyword";
			break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHAFDOSYKE:
			name = "missing-whitespace-after-doctype-system-keyword";
			break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHBEDONA:
			name = "missing-whitespace-before-doctype-name";
			break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHBEAT:
			name = "missing-whitespace-between-attributes";
			break;
		case LXB_HTML_TOKENIZER_ERROR_MIWHBEDOPUANSYID:
			name = "missing-whitespace-between-doctype-public-and-system-identifiers";
			break;
		case LXB_HTML_TOKENIZER_ERROR_NECO:
			name = "nested-comment";
			break;
		case LXB_HTML_TOKENIZER_ERROR_NOCHRE:
			name = "noncharacter-character-reference";
			break;
		case LXB_HTML_TOKENIZER_ERROR_NOININST:
			name = "noncharacter-in-input-stream";
			break;
		case LXB_HTML_TOKENIZER_ERROR_NOVOHTELSTTAWITRSO:
			name = "non-void-html-element-start-tag-with-trailing-solidus";
			break;
		case LXB_HTML_TOKENIZER_ERROR_NUCHRE:
			name = "null-character-reference";
			break;
		case LXB_HTML_TOKENIZER_ERROR_SUCHRE:
			name = "surrogate-character-reference";
			break;
		case LXB_HTML_TOKENIZER_ERROR_SUININST:
			name = "surrogate-in-input-stream";
			break;
		case LXB_HTML_TOKENIZER_ERROR_UNCHAFDOSYID:
			name = "unexpected-character-after-doctype-system-identifier";
			break;
		case LXB_HTML_TOKENIZER_ERROR_UNCHINATNA:
			name = "unexpected-character-in-attribute-name";
			break;
		case LXB_HTML_TOKENIZER_ERROR_UNCHINUNATVA:
			name = "unexpected-character-in-unquoted-attribute-value";
			break;
		case LXB_HTML_TOKENIZER_ERROR_UNEQSIBEATNA:
			name = "unexpected-equals-sign-before-attribute-name";
			break;
		case LXB_HTML_TOKENIZER_ERROR_UNNUCH:
			name = "unexpected-null-character";
			break;
		case LXB_HTML_TOKENIZER_ERROR_UNQUMAINOFTANA:
			name = "unexpected-question-mark-instead-of-tag-name";
			break;
		case LXB_HTML_TOKENIZER_ERROR_UNSOINTA:
			name = "unexpected-solidus-in-tag";
			break;
		case LXB_HTML_TOKENIZER_ERROR_UNNACHRE:
			name = "unknown-named-character-reference";
			break;
		default:
			break;
	}

	return name;
}
151 
/* Translates a Lexbor tree-construction error id into the dash-separated error name
 * used in the error messages. Ids without a mapping yield "unknown error".
 * All names consistently use dashes as word separators. */
static const char *dom_lexbor_tree_error_code_to_string(lxb_html_tree_error_id_t id)
{
	switch (id) {
		case LXB_HTML_RULES_ERROR_UNTO: return "unexpected-token";
		case LXB_HTML_RULES_ERROR_UNCLTO: return "unexpected-closed-token";
		case LXB_HTML_RULES_ERROR_NUCH: return "null-character";
		case LXB_HTML_RULES_ERROR_UNCHTO: return "unexpected-character-token";
		case LXB_HTML_RULES_ERROR_UNTOININMO: return "unexpected-token-in-initial-mode";
		case LXB_HTML_RULES_ERROR_BADOTOININMO: return "bad-doctype-token-in-initial-mode";
		case LXB_HTML_RULES_ERROR_DOTOINBEHTMO: return "doctype-token-in-before-html-mode";
		case LXB_HTML_RULES_ERROR_UNCLTOINBEHTMO: return "unexpected-closed-token-in-before-html-mode";
		case LXB_HTML_RULES_ERROR_DOTOINBEHEMO: return "doctype-token-in-before-head-mode";
		/* Previously spelled with an underscore ("closed_token"), unlike every sibling message. */
		case LXB_HTML_RULES_ERROR_UNCLTOINBEHEMO: return "unexpected-closed-token-in-before-head-mode";
		case LXB_HTML_RULES_ERROR_DOTOINHEMO: return "doctype-token-in-head-mode";
		case LXB_HTML_RULES_ERROR_NOVOHTELSTTAWITRSO: return "non-void-html-element-start-tag-with-trailing-solidus";
		case LXB_HTML_RULES_ERROR_HETOINHEMO: return "head-token-in-head-mode";
		case LXB_HTML_RULES_ERROR_UNCLTOINHEMO: return "unexpected-closed-token-in-head-mode";
		case LXB_HTML_RULES_ERROR_TECLTOWIOPINHEMO: return "template-closed-token-without-opening-in-head-mode";
		case LXB_HTML_RULES_ERROR_TEELISNOCUINHEMO: return "template-element-is-not-current-in-head-mode";
		case LXB_HTML_RULES_ERROR_DOTOINHENOMO: return "doctype-token-in-head-noscript-mode";
		case LXB_HTML_RULES_ERROR_DOTOAFHEMO: return "doctype-token-after-head-mode";
		case LXB_HTML_RULES_ERROR_HETOAFHEMO: return "head-token-after-head-mode";
		case LXB_HTML_RULES_ERROR_DOTOINBOMO: return "doctype-token-in-body-mode";
		case LXB_HTML_RULES_ERROR_BAENOPELISWR: return "bad-ending-open-elements-is-wrong";
		case LXB_HTML_RULES_ERROR_OPELISWR: return "open-elements-is-wrong";
		case LXB_HTML_RULES_ERROR_UNELINOPELST: return "unexpected-element-in-open-elements-stack";
		case LXB_HTML_RULES_ERROR_MIELINOPELST: return "missing-element-in-open-elements-stack";
		case LXB_HTML_RULES_ERROR_NOBOELINSC: return "no-body-element-in-scope";
		case LXB_HTML_RULES_ERROR_MIELINSC: return "missing-element-in-scope";
		case LXB_HTML_RULES_ERROR_UNELINSC: return "unexpected-element-in-scope";
		case LXB_HTML_RULES_ERROR_UNELINACFOST: return "unexpected-element-in-active-formatting-stack";
		case LXB_HTML_RULES_ERROR_UNENOFFI: return "unexpected-end-of-file";
		case LXB_HTML_RULES_ERROR_CHINTATE: return "characters-in-table-text";
		case LXB_HTML_RULES_ERROR_DOTOINTAMO: return "doctype-token-in-table-mode";
		case LXB_HTML_RULES_ERROR_DOTOINSEMO: return "doctype-token-in-select-mode";
		case LXB_HTML_RULES_ERROR_DOTOAFBOMO: return "doctype-token-after-body-mode";
		case LXB_HTML_RULES_ERROR_DOTOINFRMO: return "doctype-token-in-frameset-mode";
		case LXB_HTML_RULES_ERROR_DOTOAFFRMO: return "doctype-token-after-frameset-mode";
		case LXB_HTML_RULES_ERROR_DOTOFOCOMO: return "doctype-token-foreign-content-mode";
		default: return "unknown error";
	}
}
194 
/* Gives a human-readable description for a failure status of the Lexbor/libxml2 bridge.
 * Statuses without a mapping yield "unknown error". */
static const char *dom_lexbor_libxml2_bridge_status_code_to_string(lexbor_libxml2_bridge_status status)
{
	const char *description = "unknown error";

	switch (status) {
		case LEXBOR_LIBXML2_BRIDGE_STATUS_CANNOT_INIT:
			description = "cannot initialize data structures";
			break;
		case LEXBOR_LIBXML2_BRIDGE_STATUS_FATAL_PARSE:
			description = "fatal error in parsing";
			break;
		case LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW:
			description = "string length overflow";
			break;
		case LEXBOR_LIBXML2_BRIDGE_STATUS_OOM:
			description = "out of memory";
			break;
		default:
			break;
	}

	return description;
}
205 
/* Rewinds the cache to the very start of the input: line 1, column 1, offset 0. */
static void dom_reset_line_column_cache(dom_line_column_cache *cache)
{
	*cache = (dom_line_column_cache) {
		.last_line = 1,
		.last_column = 1,
		.last_offset = 0,
	};
}
212 
/* Advances the line/column cache up to the given absolute input offset.
 *
 * The offset is first made relative to the current chunk (by subtracting the length of
 * all previous chunks) and clamped to the chunk length. The chunk is then scanned from
 * the cached position onwards: a line feed starts a new line at column 1, anything else
 * advances the column. In byte mode, UTF-8 continuation bytes do not advance the column. */
static void dom_find_line_and_column_using_cache(
	const dom_lexbor_libxml2_bridge_application_data *application_data,
	dom_line_column_cache *cache,
	size_t offset
)
{
	size_t target = offset - application_data->current_total_offset;
	if (target > application_data->current_input_length) {
		/* Possible with empty input, also just good for general safety */
		target = application_data->current_input_length;
	}

	/* The chunk is available either as codepoints or as UTF-8 bytes. */
	const lxb_codepoint_t *codepoints = application_data->current_input_codepoints;
	if (codepoints != NULL) {
		for (; cache->last_offset < target; cache->last_offset++) {
			if (codepoints[cache->last_offset] == 0x000A /* Unicode codepoint for line feed */) {
				cache->last_line++;
				cache->last_column = 1;
			} else {
				cache->last_column++;
			}
		}
		return;
	}

	const char *bytes = application_data->current_input_characters;
	for (; cache->last_offset < target; cache->last_offset++) {
		const lxb_char_t byte = bytes[cache->last_offset];
		if (byte == '\n') {
			cache->last_line++;
			cache->last_column = 1;
		} else if ((byte & 0xC0) != 0x80) {
			/* See Lexbor tokenizer patch: only bytes that are not of the form 10xxxxxx
			 * (i.e. not UTF-8 continuation bytes) count as a column.
			 * Note for future self: branchlessly computing the length and jumping by the length would be nice,
			 * however it takes so many instructions to do so that it is slower than this naive method. */
			cache->last_column++;
		}
	}
}
255 
/* Tokenizer error callback: resolves the error offset to a line/column pair using the
 * tokenizer cache and reports the error, named by its parse error name. */
static void dom_lexbor_libxml2_bridge_tokenizer_error_reporter(
	void *application_data_voidptr,
	lxb_html_tokenizer_error_t *error,
	size_t offset
)
{
	dom_lexbor_libxml2_bridge_application_data *application_data = application_data_voidptr;
	dom_line_column_cache *position = &application_data->cache_tokenizer;

	dom_find_line_and_column_using_cache(application_data, position, offset);

	php_libxml_pretend_ctx_error_ex(
		application_data->input_name,
		position->last_line,
		position->last_column,
		"tokenizer error %s in %s, line: %zu, column: %zu\n",
		dom_lexbor_tokenizer_error_code_to_string(error->id),
		application_data->input_name,
		position->last_line,
		position->last_column
	);
}
266 
/* Tree-construction error callback: reports the error at the given line and column.
 * When the offending token spans more than one character, the message shows a column range. */
static void dom_lexbor_libxml2_bridge_tree_error_reporter(
	void *application_data_voidptr,
	lxb_html_tree_error_t *error,
	size_t line,
	size_t column,
	size_t len
)
{
	dom_lexbor_libxml2_bridge_application_data *application_data = application_data_voidptr;

	/* For no implied mode, we want to mimick libxml's behaviour of not reporting an error for a lacking doctype. */
	if (line == 1 && application_data->html_no_implied && error->id == LXB_HTML_RULES_ERROR_UNTOININMO) {
		return;
	}

	const char *input_name = application_data->input_name;
	const char *error_name = dom_lexbor_tree_error_code_to_string(error->id);

	if (UNEXPECTED(len <= 1)) {
		/* Possible with EOF, or single-character tokens, don't use a range in the error display in this case */
		php_libxml_pretend_ctx_error_ex(
			input_name, line, column,
			"tree error %s in %s, line: %zu, column: %zu\n",
			error_name, input_name, line, column
		);
		return;
	}

	const size_t last_column = column + len - 1;
	php_libxml_pretend_ctx_error_ex(
		input_name, line, column,
		"tree error %s in %s, line: %zu, column: %zu-%zu\n",
		error_name, input_name, line, column, last_column
	);
}
308 
/* Returns the first direct child element of parent whose name equals searching_for,
 * or NULL when there is no such child. Only element nodes are considered. */
static xmlNodePtr dom_search_child(xmlNodePtr parent, const char *searching_for)
{
	for (xmlNodePtr candidate = parent->children; candidate != NULL; candidate = candidate->next) {
		if (candidate->type != XML_ELEMENT_NODE) {
			continue;
		}
		if (strcmp((const char *) candidate->name, searching_for) == 0) {
			return candidate;
		}
	}
	return NULL;
}
320 
/* Removes the first child element of parent named searching_for, re-attaching its
 * children to parent (appended, in their original order). Does nothing when no such
 * child element exists. */
static void dom_place_remove_element_and_hoist_children(xmlNodePtr parent, const char *searching_for)
{
	xmlNodePtr removed = dom_search_child(parent, searching_for);
	if (removed == NULL) {
		return;
	}

	xmlUnlinkNode(removed);

	/* Always take the current first child: the child pointer is not reused after it was re-added. */
	for (xmlNodePtr child = removed->children; child != NULL; child = removed->children) {
		xmlUnlinkNode(child);
		xmlAddChild(parent, child);
	}

	xmlFreeNode(removed);
}
337 
/* Post-processes a freshly parsed document for HTML_PARSE_NOIMPLIED: the html, head and
 * body elements that were not explicitly present in the input are removed, and their
 * children are hoisted into the respective parent. Without that option nothing is done. */
static void dom_post_process_html5_loading(
	xmlDocPtr lxml_doc,
	zend_long options,
	const lexbor_libxml2_bridge_extracted_observations *observations
)
{
	if (!(options & HTML_PARSE_NOIMPLIED)) {
		return;
	}

	xmlNodePtr doc_node = (xmlNodePtr) lxml_doc;
	xmlNodePtr html_node = dom_search_child(doc_node, "html");

	if (!observations->has_explicit_head_tag) {
		dom_place_remove_element_and_hoist_children(html_node, "head");
	}
	if (!observations->has_explicit_body_tag) {
		dom_place_remove_element_and_hoist_children(html_node, "body");
	}
	if (observations->has_explicit_html_tag) {
		return;
	}

	/* The HTML node has a single namespace declaration, that we must preserve after removing the node.
	 * However, it's possible the namespace is NULL if DOM\HTML_NO_DEFAULT_NS was set. */
	const bool uses_default_ns = !(options & DOM_HTML_NO_DEFAULT_NS);
	if (uses_default_ns) {
		php_libxml_set_old_ns(lxml_doc, html_node->nsDef);
		html_node->nsDef = NULL;
	}

	dom_place_remove_element_and_hoist_children(doc_node, "html");

	if (uses_default_ns) {
		for (xmlNodePtr node = lxml_doc->children; node != NULL; node = node->next) {
			/* Fine to use the DOM wrap reconciliation here because it's the "modern" world of DOM,
			 * and no user manipulation happened yet. */
			xmlDOMWrapCtxt dummy_ctxt = {0};
			xmlDOMWrapReconcileNamespaces(&dummy_ctxt, node, /* options */ 0);
		}
	}
}
373 
374 /* https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding */
dom_determine_encoding(const char * source,size_t source_len)375 static dom_character_encoding_data dom_determine_encoding(const char *source, size_t source_len)
376 {
377 	dom_character_encoding_data result;
378 
379 	/* BOM sniffing */
380 	if (source_len >= 3 && source[0] == '\xEF' && source[1] == '\xBB' && source[2] == '\xBF') {
381 		result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
382 		result.bom_shift = 3;
383 		return result;
384 	} else if (source_len >= 2) {
385 		if (source[0] == '\xFE' && source[1] == '\xFF') {
386 			result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_16BE);
387 			result.bom_shift = 2;
388 			return result;
389 		} else if (source[0] == '\xFF' && source[1] == '\xFE') {
390 			result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_16LE);
391 			result.bom_shift = 2;
392 			return result;
393 		}
394 	}
395 
396 	/* Perform prescan */
397 	lxb_html_encoding_t encoding;
398 	lxb_status_t status = lxb_html_encoding_init(&encoding);
399 	if (status != LXB_STATUS_OK) {
400 		goto fallback_uninit;
401 	}
402 	/* This is the "wait either for 1024 bytes or 500ms" part */
403 	if (source_len > 1024) {
404 		source_len = 1024;
405 	}
406 	status = lxb_html_encoding_determine(&encoding, (const lxb_char_t *) source, (const lxb_char_t *) source + source_len);
407 	if (status != LXB_STATUS_OK) {
408 		goto fallback;
409 	}
410 	lxb_html_encoding_entry_t *entry = lxb_html_encoding_meta_entry(&encoding, 0);
411 	if (entry == NULL) {
412 		goto fallback;
413 	}
414 	result.encoding_data = lxb_encoding_data_by_pre_name(entry->name, entry->end - entry->name);
415 	if (!result.encoding_data) {
416 		goto fallback;
417 	}
418 	result.bom_shift = 0;
419 	lxb_html_encoding_destroy(&encoding, false);
420 	return result;
421 
422 fallback:
423 	lxb_html_encoding_destroy(&encoding, false);
424 fallback_uninit:
425 	result.encoding_data = lxb_encoding_data(DOM_FALLBACK_ENCODING_ID);
426 	result.bom_shift = 0;
427 	return result;
428 }
429 
/* Configures the decoder for the given source encoding and decides between the two
 * conversion paths: when the source encoding is the same as the (UTF-8) target encoding,
 * the fast path is used and error positions are computed on the raw bytes starting at
 * buf_start; otherwise they are computed on the decoded codepoints. */
static void dom_setup_parser_encoding_manually(const lxb_char_t *buf_start, const lxb_encoding_data_t *encoding_data, dom_decoding_encoding_ctx *decoding_encoding_ctx, dom_lexbor_libxml2_bridge_application_data *application_data)
{
	static const lxb_codepoint_t replacement_codepoint = LXB_ENCODING_REPLACEMENT_CODEPOINT;
	const size_t codepoints_capacity =
		sizeof(decoding_encoding_ctx->codepoints) / sizeof(decoding_encoding_ctx->codepoints[0]);

	decoding_encoding_ctx->decode_data = encoding_data;

	(void) lxb_encoding_decode_init(
		&decoding_encoding_ctx->decode,
		encoding_data,
		decoding_encoding_ctx->codepoints,
		codepoints_capacity
	);
	(void) lxb_encoding_decode_replace_set(
		&decoding_encoding_ctx->decode,
		&replacement_codepoint,
		LXB_ENCODING_REPLACEMENT_BUFFER_LEN
	);

	/* Note: encode_data is for UTF-8 */
	const bool source_is_utf8 = (encoding_data == decoding_encoding_ctx->encode_data);
	decoding_encoding_ctx->fast_path = source_is_utf8;

	/* Exactly one of the two input views is set, the other one is NULL. */
	application_data->current_input_codepoints = source_is_utf8 ? NULL : decoding_encoding_ctx->codepoints;
	application_data->current_input_characters = source_is_utf8 ? (const char *) buf_start : NULL;
}
458 
/* Detects the encoding from the start of the input and sets up the parser encoding
 * accordingly. A detected BOM is skipped by advancing *buf_ref and shrinking *read.
 * NOTE(review): the pointer handed on for position tracking is the start of the buffer
 * before the BOM was skipped — confirm that this is what the position tracking expects. */
static void dom_setup_parser_encoding_implicitly(
	const lxb_char_t **buf_ref,
	size_t *read,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	dom_lexbor_libxml2_bridge_application_data *application_data
)
{
	const lxb_char_t *original_start = *buf_ref;
	dom_character_encoding_data detected = dom_determine_encoding((const char *) original_start, *read);

	*buf_ref = original_start + detected.bom_shift;
	*read -= detected.bom_shift;

	dom_setup_parser_encoding_manually(
		original_start,
		detected.encoding_data,
		decoding_encoding_ctx,
		application_data
	);
}
472 
/* Feeds one chunk of UTF-8 (encoding_output, encoded_length bytes) to the parser and,
 * when an error reporter is installed, reports the errors found so far.
 *
 * input_buffer_length is the amount of original input this chunk corresponds to; it bounds
 * the position lookups for this chunk and is added to the running total offset afterwards.
 * Returns false when the parser rejects the chunk, true otherwise. */
static bool dom_process_parse_chunk(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	size_t encoded_length,
	const lxb_char_t *encoding_output,
	size_t input_buffer_length,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	dom_lexbor_libxml2_bridge_application_data *app_data = ctx->application_data;

	/* Must be known before any error position of this chunk is resolved. */
	app_data->current_input_length = input_buffer_length;

	if (UNEXPECTED(lxb_html_document_parse_chunk(document, encoding_output, encoded_length) != LXB_STATUS_OK)) {
		return false;
	}

	if (ctx->tokenizer_error_reporter || ctx->tree_error_reporter) {
		lexbor_libxml2_bridge_report_errors(
			ctx, parser, encoding_output, app_data->current_total_offset, tokenizer_error_offset, tree_error_offset
		);
		/* Advance the cache to the end of this chunk so line/column carry over to the next chunk. */
		const size_t chunk_end_offset = app_data->current_total_offset + input_buffer_length;
		dom_find_line_and_column_using_cache(app_data, &app_data->cache_tokenizer, chunk_end_offset);
	}

	/* The next chunk starts at index 0 again, but further along in the overall input. */
	app_data->current_total_offset += input_buffer_length;
	app_data->cache_tokenizer.last_offset = 0;
	return true;
}
498 
/* UTF-8 to UTF-8 conversion: validates the input in [*buf_ref_ref, buf_end) and feeds it
 * to the parser without copying. Whenever the decoder reports something that is not a
 * valid codepoint, the valid bytes gathered so far are parsed, followed by the UTF-8
 * replacement bytes in place of the offending bytes.
 *
 * On return *buf_ref_ref holds the position up to which the input was consumed.
 * Returns false as soon as parsing a chunk fails, true otherwise. */
static bool dom_decode_encode_fast_path(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	const lxb_char_t **buf_ref_ref,
	const lxb_char_t *buf_end,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	const lxb_char_t *cursor = *buf_ref_ref;
	/* Start of the bytes that were validated but not yet handed to the parser. */
	const lxb_char_t *pending_start = cursor;
	bool success = true;

	while (cursor != buf_end) {
		const lxb_char_t *sequence_start = cursor;
		/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
		lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &cursor, buf_end);
		if (EXPECTED(codepoint <= LXB_ENCODING_MAX_CODEPOINT)) {
			continue;
		}

		/* Skip invalid data, it's replaced by the UTF-8 replacement bytes */
		size_t invalid_length = cursor - sequence_start;
		size_t consumed_length = cursor - pending_start;

		/* First the valid prefix (which accounts for all consumed input, including the
		 * skipped bytes), then the replacement (which accounts for no input at all). */
		if (!dom_process_parse_chunk(
				ctx, document, parser,
				consumed_length - invalid_length, pending_start, consumed_length,
				tokenizer_error_offset, tree_error_offset)
			|| !dom_process_parse_chunk(
				ctx, document, parser,
				LXB_ENCODING_REPLACEMENT_SIZE, LXB_ENCODING_REPLACEMENT_BYTES, 0,
				tokenizer_error_offset, tree_error_offset)
		) {
			success = false;
			break;
		}
		pending_start = cursor;
	}

	/* Flush whatever valid data is left after the last replacement. */
	if (success && cursor != pending_start) {
		size_t remaining_length = cursor - pending_start;
		success = dom_process_parse_chunk(
			ctx, document, parser,
			remaining_length, pending_start, remaining_length,
			tokenizer_error_offset, tree_error_offset
		);
	}

	*buf_ref_ref = cursor;
	return success;
}
564 
/* Generic conversion: decodes the input in [*buf_ref_ref, buf_end) into the codepoint
 * buffer, re-encodes those codepoints into the output buffer and feeds each filled output
 * buffer to the parser. Both steps are repeated for as long as the respective buffer
 * turns out to be too small to take everything at once.
 *
 * On return *buf_ref_ref holds the position up to which the input was consumed.
 * Returns false as soon as parsing a chunk fails, true otherwise. */
static bool dom_decode_encode_slow_path(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	const lxb_char_t **buf_ref_ref,
	const lxb_char_t *buf_end,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	const lxb_char_t *cursor = *buf_ref_ref;
	bool success = false;
	lexbor_status_t decode_status;

	do {
		decode_status = decoding_encoding_ctx->decode_data->decode(&decoding_encoding_ctx->decode, &cursor, buf_end);

		const size_t decoded_count = lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode);
		const lxb_codepoint_t *codepoint_cursor = (const lxb_codepoint_t *) decoding_encoding_ctx->codepoints;
		const lxb_codepoint_t *codepoint_end = decoding_encoding_ctx->codepoints + decoded_count;
		lexbor_status_t encode_status;

		do {
			encode_status = decoding_encoding_ctx->encode_data->encode(&decoding_encoding_ctx->encode, &codepoint_cursor, codepoint_end);
			ZEND_ASSERT(encode_status != LXB_STATUS_ERROR && "parameters and replacements should be valid");

			/* NOTE(review): every pass of this inner loop reports the full decoded_count as
			 * consumed input — confirm this is intended when the encoder needs several passes. */
			if (!dom_process_parse_chunk(
				ctx, document, parser,
				lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode),
				decoding_encoding_ctx->encoding_output,
				decoded_count,
				tokenizer_error_offset, tree_error_offset
			)) {
				goto done;
			}

			/* The output buffer was handed to the parser, start filling it from the beginning again. */
			lxb_encoding_encode_buf_used_set(&decoding_encoding_ctx->encode, 0);
		} while (encode_status == LXB_STATUS_SMALL_BUFFER);

		/* Same for the codepoint buffer. */
		lxb_encoding_decode_buf_used_set(&decoding_encoding_ctx->decode, 0);
	} while (decode_status == LXB_STATUS_SMALL_BUFFER);

	success = true;

done:
	*buf_ref_ref = cursor;
	return success;
}
609 
/* Converts and parses one buffer of input, dispatching to the UTF-8 fast path or to the
 * generic slow path depending on how the conversion context was set up.
 * Returns the result of the chosen path. */
static bool dom_parse_decode_encode_step(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	const lxb_char_t **buf_ref_ref,
	const lxb_char_t *buf_end,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	if (!decoding_encoding_ctx->fast_path) {
		return dom_decode_encode_slow_path(
			ctx, document, parser,
			buf_ref_ref, buf_end, decoding_encoding_ctx,
			tokenizer_error_offset, tree_error_offset
		);
	}

	return dom_decode_encode_fast_path(
		ctx, document, parser,
		buf_ref_ref, buf_end, decoding_encoding_ctx,
		tokenizer_error_offset, tree_error_offset
	);
}
645 
/* Finishes the conversion once all input was seen: flushes what the decoder still produces
 * when it is finished (slow path only), then flushes what the encoder still produces when
 * it is finished, feeding both to the parser.
 *
 * Returns false when parsing one of these final chunks fails, true otherwise. */
static bool dom_parse_decode_encode_finish(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	if (!decoding_encoding_ctx->fast_path) {
		/* Fast path handles codepoints one by one, so this part is not applicable in that case */
		(void) lxb_encoding_decode_finish(&decoding_encoding_ctx->decode);
		size_t decoding_buffer_size = lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode);
		if (decoding_buffer_size > 0) {
			const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) decoding_encoding_ctx->codepoints;
			const lxb_codepoint_t *codepoints_end = codepoints_ref + decoding_buffer_size;
			(void) decoding_encoding_ctx->encode_data->encode(&decoding_encoding_ctx->encode, &codepoints_ref, codepoints_end);
			if (!dom_process_parse_chunk(
				ctx,
				document,
				parser,
				lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode),
				decoding_encoding_ctx->encoding_output,
				decoding_buffer_size,
				tokenizer_error_offset,
				tree_error_offset
			)) {
				return false;
			}
			/* Both buffers were consumed by the chunk above. Mark them as empty, exactly like the
			 * slow path does after each chunk: otherwise the check below still sees the bytes that
			 * were just parsed and would parse them a second time, and would count the decoded
			 * codepoints as consumed input once more. */
			lxb_encoding_encode_buf_used_set(&decoding_encoding_ctx->encode, 0);
			lxb_encoding_decode_buf_used_set(&decoding_encoding_ctx->decode, 0);
		}
	}

	/* Only bytes that finishing the encoder itself produced are left to be parsed here. */
	(void) lxb_encoding_encode_finish(&decoding_encoding_ctx->encode);
	if (lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode)
		&& !dom_process_parse_chunk(
			ctx,
			document,
			parser,
			lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode),
			decoding_encoding_ctx->encoding_output,
			lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode),
			tokenizer_error_offset,
			tree_error_offset
	)) {
		return false;
	}
	return true;
}
693 
/* Verifies that `options` only contains flags this parser accepts.
 * On success returns true. Otherwise raises an argument value error for argument
 * number `arg_num` and returns false. */
static bool check_options_validity(uint32_t arg_num, zend_long options)
{
	const zend_long accepted_flags =
		XML_PARSE_NOERROR
		| XML_PARSE_COMPACT
		| HTML_PARSE_NOIMPLIED
		| DOM_HTML_NO_DEFAULT_NS;
	const zend_long rejected_flags = options & ~accepted_flags;

	if (rejected_flags == 0) {
		return true;
	}

	zend_argument_value_error(
		arg_num,
		"contains invalid flags (allowed flags: "
		"LIBXML_NOERROR, "
		"LIBXML_COMPACT, "
		"LIBXML_HTML_NOIMPLIED, "
		"DOM\\NO_DEFAULT_NS)"
	);
	return false;
}
707 
PHP_METHOD(DOM_HTMLDocument,createEmpty)708 PHP_METHOD(DOM_HTMLDocument, createEmpty)
709 {
710 	const char *encoding = "UTF-8";
711 	size_t encoding_len = strlen("UTF-8");
712 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|p", &encoding, &encoding_len) == FAILURE) {
713 		RETURN_THROWS();
714 	}
715 
716 	const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) encoding, encoding_len);
717 
718 	if (encoding_data == NULL) {
719 		zend_argument_value_error(1, "must be a valid document encoding");
720 		RETURN_THROWS();
721 	}
722 
723 #ifdef LIBXML_HTML_ENABLED
724 	xmlDocPtr lxml_doc = htmlNewDocNoDtD(NULL, NULL);
725 	if (UNEXPECTED(lxml_doc == NULL)) {
726 		goto oom;
727 	}
728 #else
729 	xmlDocPtr lxml_doc = xmlNewDoc((const xmlChar *) "1.0");
730 	if (UNEXPECTED(lxml_doc == NULL)) {
731 		goto oom;
732 	}
733 	lxml_doc->type = XML_HTML_DOCUMENT_NODE;
734 #endif
735 
736 	lxml_doc->encoding = xmlStrdup((const xmlChar *) encoding);
737 
738 	dom_object *intern = php_dom_instantiate_object_helper(
739 		return_value,
740 		dom_html_document_class_entry,
741 		(xmlNodePtr) lxml_doc,
742 		NULL
743 	);
744 	intern->document->is_modern_api_class = true;
745 	return;
746 
747 oom:
748 	php_dom_throw_error(INVALID_STATE_ERR, 1);
749 	RETURN_THROWS();
750 }
751 
PHP_METHOD(DOM_HTMLDocument,createFromString)752 PHP_METHOD(DOM_HTMLDocument, createFromString)
753 {
754 	const char *source, *override_encoding = NULL;
755 	size_t source_len, override_encoding_len;
756 	zend_long options = 0;
757 	if (zend_parse_parameters(
758 		ZEND_NUM_ARGS(),
759 		"s|lp!",
760 		&source,
761 		&source_len,
762 		&options,
763 		&override_encoding,
764 		&override_encoding_len
765 	) == FAILURE) {
766 		RETURN_THROWS();
767 	}
768 
769 	if (!check_options_validity(2, options)) {
770 		RETURN_THROWS();
771 	}
772 
773 	dom_lexbor_libxml2_bridge_application_data application_data;
774 	application_data.input_name = "Entity";
775 	application_data.current_total_offset = 0;
776 	application_data.html_no_implied = options & HTML_PARSE_NOIMPLIED;
777 	dom_reset_line_column_cache(&application_data.cache_tokenizer);
778 	lexbor_libxml2_bridge_parse_context ctx;
779 	lexbor_libxml2_bridge_parse_context_init(&ctx);
780 	if (!(options & XML_PARSE_NOERROR)) {
781 		lexbor_libxml2_bridge_parse_set_error_callbacks(
782 			&ctx,
783 			dom_lexbor_libxml2_bridge_tokenizer_error_reporter,
784 			dom_lexbor_libxml2_bridge_tree_error_reporter
785 		);
786 	}
787 	ctx.application_data = &application_data;
788 
789 	size_t tokenizer_error_offset = 0;
790 	size_t tree_error_offset = 0;
791 
792 	/* Setup everything encoding & decoding related */
793 	const lxb_char_t *buf_ref = (const lxb_char_t *) source;
794 	dom_decoding_encoding_ctx decoding_encoding_ctx;
795 	dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
796 	if (override_encoding != NULL) {
797 		const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
798 			(const lxb_char_t *) override_encoding,
799 			override_encoding_len
800 		);
801 		if (!encoding_data) {
802 			zend_argument_value_error(3, "must be a valid document encoding");
803 			RETURN_THROWS();
804 		}
805 		dom_setup_parser_encoding_manually(buf_ref, encoding_data, &decoding_encoding_ctx, &application_data);
806 	} else {
807 		dom_setup_parser_encoding_implicitly(&buf_ref, &source_len, &decoding_encoding_ctx, &application_data);
808 	}
809 
810 	lxb_html_document_t *document = lxb_html_document_create();
811 	if (UNEXPECTED(document == NULL)) {
812 		goto fail_oom;
813 	}
814 
815 	lxb_status_t lexbor_status = lxb_html_document_parse_chunk_begin(document);
816 	if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
817 		goto fail_oom;
818 	}
819 
820 	lxb_html_parser_t *parser = document->dom_document.parser;
821 
822 	while (source_len > 0) {
823 		size_t chunk_size = source_len;
824 		const size_t MAX_CHUNK_SIZE = sizeof(decoding_encoding_ctx.encoding_output) / sizeof(*decoding_encoding_ctx.encoding_output);
825 		if (chunk_size > MAX_CHUNK_SIZE) {
826 			chunk_size = MAX_CHUNK_SIZE;
827 		}
828 		source_len -= chunk_size;
829 
830 		const lxb_char_t *buf_end = buf_ref + chunk_size;
831 		bool result = dom_parse_decode_encode_step(
832 			&ctx,
833 			document,
834 			parser,
835 			&buf_ref,
836 			buf_end,
837 			&decoding_encoding_ctx,
838 			&tokenizer_error_offset,
839 			&tree_error_offset
840 		);
841 		if (!result) {
842 			goto fail_oom;
843 		}
844 	}
845 
846 	if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
847 		goto fail_oom;
848 	}
849 
850 	lexbor_status = lxb_html_document_parse_chunk_end(document);
851 	if (lexbor_status != LXB_STATUS_OK) {
852 		goto fail_oom;
853 	}
854 
855 	xmlDocPtr lxml_doc;
856 	lexbor_libxml2_bridge_status bridge_status = lexbor_libxml2_bridge_convert_document(
857 		document,
858 		&lxml_doc,
859 		options & XML_PARSE_COMPACT,
860 		!(options & DOM_HTML_NO_DEFAULT_NS)
861 	);
862 	lexbor_libxml2_bridge_copy_observations(parser->tree, &ctx.observations);
863 	if (UNEXPECTED(bridge_status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) {
864 		php_libxml_ctx_error(
865 			NULL,
866 			"%s in %s",
867 			dom_lexbor_libxml2_bridge_status_code_to_string(bridge_status),
868 			application_data.input_name
869 		);
870 		lxb_html_document_destroy(document);
871 		RETURN_FALSE;
872 	}
873 	lxb_html_document_destroy(document);
874 
875 	dom_post_process_html5_loading(lxml_doc, options, &ctx.observations);
876 
877 	if (decoding_encoding_ctx.decode_data) {
878 		lxml_doc->encoding = xmlStrdup((const xmlChar *) decoding_encoding_ctx.decode_data->name);
879 	} else {
880 		lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
881 	}
882 
883 	dom_object *intern = php_dom_instantiate_object_helper(
884 		return_value,
885 		dom_html_document_class_entry,
886 		(xmlNodePtr) lxml_doc,
887 		NULL
888 	);
889 	intern->document->is_modern_api_class = true;
890 	return;
891 
892 fail_oom:
893 	lxb_html_document_destroy(document);
894 	php_dom_throw_error(INVALID_STATE_ERR, 1);
895 	RETURN_THROWS();
896 }
897 
PHP_METHOD(DOM_HTMLDocument,createFromFile)898 PHP_METHOD(DOM_HTMLDocument, createFromFile)
899 {
900 	const char *filename, *override_encoding = NULL;
901 	size_t filename_len, override_encoding_len;
902 	zend_long options = 0;
903 	php_stream *stream = NULL;
904 	if (zend_parse_parameters(
905 		ZEND_NUM_ARGS(),
906 		"p|lp!",
907 		&filename,
908 		&filename_len,
909 		&options,
910 		&override_encoding,
911 		&override_encoding_len
912 	) == FAILURE) {
913 		RETURN_THROWS();
914 	}
915 
916 	/* See php_libxml_streams_IO_open_wrapper(), apparently this caused issues in the past. */
917 	if (strstr(filename, "%00")) {
918 		zend_argument_value_error(1, "must not contain percent-encoded NUL bytes");
919 		RETURN_THROWS();
920 	}
921 
922 	if (!check_options_validity(2, options)) {
923 		RETURN_THROWS();
924 	}
925 
926 	dom_lexbor_libxml2_bridge_application_data application_data;
927 	application_data.input_name = filename;
928 	application_data.current_total_offset = 0;
929 	application_data.html_no_implied = options & HTML_PARSE_NOIMPLIED;
930 	dom_reset_line_column_cache(&application_data.cache_tokenizer);
931 	lexbor_libxml2_bridge_parse_context ctx;
932 	lexbor_libxml2_bridge_parse_context_init(&ctx);
933 	if (!(options & XML_PARSE_NOERROR)) {
934 		lexbor_libxml2_bridge_parse_set_error_callbacks(
935 			&ctx,
936 			dom_lexbor_libxml2_bridge_tokenizer_error_reporter,
937 			dom_lexbor_libxml2_bridge_tree_error_reporter
938 		);
939 	}
940 	ctx.application_data = &application_data;
941 
942 	char buf[4096];
943 
944 	/* Setup everything encoding & decoding related */
945 	dom_decoding_encoding_ctx decoding_encoding_ctx;
946 	dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
947 	bool should_determine_encoding_implicitly = true; /* First read => determine encoding implicitly */
948 	if (override_encoding != NULL) {
949 		const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
950 			(const lxb_char_t *) override_encoding,
951 			override_encoding_len
952 		);
953 		if (!encoding_data) {
954 			zend_argument_value_error(3, "must be a valid document encoding");
955 			RETURN_THROWS();
956 		}
957 		should_determine_encoding_implicitly = false;
958 		dom_setup_parser_encoding_manually((const lxb_char_t *) buf, encoding_data, &decoding_encoding_ctx, &application_data);
959 	}
960 
961 	stream = php_stream_open_wrapper_ex(filename, "rb", REPORT_ERRORS, /* opened_path */ NULL, php_libxml_get_stream_context());
962 	if (!stream) {
963 		if (!EG(exception)) {
964 			zend_throw_exception_ex(NULL, 0, "Cannot open file '%s'", filename);
965 		}
966 		RETURN_THROWS();
967 	}
968 
969 	/* MIME sniff */
970 	if (should_determine_encoding_implicitly) {
971 		zend_string *charset = php_libxml_sniff_charset_from_stream(stream);
972 		if (charset != NULL) {
973 			const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
974 				(const lxb_char_t *) ZSTR_VAL(charset),
975 				ZSTR_LEN(charset)
976 			);
977 			if (encoding_data != NULL) {
978 				should_determine_encoding_implicitly = false;
979 				dom_setup_parser_encoding_manually(
980 					(const lxb_char_t *) buf,
981 					encoding_data,
982 					&decoding_encoding_ctx,
983 					&application_data
984 				);
985 			}
986 			zend_string_release_ex(charset, false);
987 		}
988 	}
989 
990 	lxb_html_document_t *document = lxb_html_document_create();
991 	if (UNEXPECTED(document == NULL)) {
992 		goto fail_oom;
993 	}
994 
995 	lxb_status_t lexbor_status = lxb_html_document_parse_chunk_begin(document);
996 	if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
997 		goto fail_oom;
998 	}
999 
1000 	size_t tokenizer_error_offset = 0;
1001 	size_t tree_error_offset = 0;
1002 	ssize_t read;
1003 	lxb_html_parser_t *parser = document->dom_document.parser;
1004 
1005 	while ((read = php_stream_read(stream, buf, sizeof(buf))) > 0) {
1006 		const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
1007 
1008 		if (should_determine_encoding_implicitly) {
1009 			should_determine_encoding_implicitly = false;
1010 			dom_setup_parser_encoding_implicitly(&buf_ref, (size_t *) &read, &decoding_encoding_ctx, &application_data);
1011 		}
1012 
1013 		const lxb_char_t *buf_end = buf_ref + read;
1014 		bool result = dom_parse_decode_encode_step(
1015 			&ctx,
1016 			document,
1017 			parser,
1018 			&buf_ref,
1019 			buf_end,
1020 			&decoding_encoding_ctx,
1021 			&tokenizer_error_offset,
1022 			&tree_error_offset
1023 		);
1024 		if (!result) {
1025 			goto fail_oom;
1026 		}
1027 	}
1028 
1029 	if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
1030 		goto fail_oom;
1031 	}
1032 
1033 	lexbor_status = lxb_html_document_parse_chunk_end(document);
1034 	if (lexbor_status != LXB_STATUS_OK) {
1035 		goto fail_oom;
1036 	}
1037 
1038 	xmlDocPtr lxml_doc;
1039 	lexbor_libxml2_bridge_status bridge_status = lexbor_libxml2_bridge_convert_document(
1040 		document,
1041 		&lxml_doc,
1042 		options & XML_PARSE_COMPACT,
1043 		!(options & DOM_HTML_NO_DEFAULT_NS)
1044 	);
1045 	lexbor_libxml2_bridge_copy_observations(parser->tree, &ctx.observations);
1046 	if (UNEXPECTED(bridge_status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) {
1047 		php_libxml_ctx_error(NULL, "%s in %s", dom_lexbor_libxml2_bridge_status_code_to_string(bridge_status), filename);
1048 		lxb_html_document_destroy(document);
1049 		php_stream_close(stream);
1050 		RETURN_FALSE;
1051 	}
1052 	lxb_html_document_destroy(document);
1053 
1054 	dom_post_process_html5_loading(lxml_doc, options, &ctx.observations);
1055 
1056 	if (decoding_encoding_ctx.decode_data) {
1057 		lxml_doc->encoding = xmlStrdup((const xmlChar *) decoding_encoding_ctx.decode_data->name);
1058 	} else {
1059 		lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
1060 	}
1061 
1062 	if (stream->wrapper == &php_plain_files_wrapper) {
1063 		xmlChar *converted = xmlPathToURI((const xmlChar *) filename);
1064 		if (UNEXPECTED(!converted)) {
1065 			goto fail_oom;
1066 		}
1067 		/* Check for "file:/" instead of "file://" because of libxml2 quirk */
1068 		if (strncmp((const char *) converted, "file:/", sizeof("file:/") - 1) != 0) {
1069 			xmlChar *buffer = xmlStrdup((const xmlChar *) "file://");
1070 			if (UNEXPECTED(!buffer)) {
1071 				xmlFree(converted);
1072 				goto fail_oom;
1073 			}
1074 			xmlChar *new_buffer = xmlStrcat(buffer, converted);
1075 			if (UNEXPECTED(!new_buffer)) {
1076 				xmlFree(buffer);
1077 				xmlFree(converted);
1078 				goto fail_oom;
1079 			}
1080 			xmlFree(converted);
1081 			lxml_doc->URL = new_buffer;
1082 		} else {
1083 			lxml_doc->URL = converted;
1084 		}
1085 	} else {
1086 		lxml_doc->URL = xmlStrdup((const xmlChar *) filename);
1087 	}
1088 
1089 	php_stream_close(stream);
1090 	stream = NULL;
1091 
1092 	dom_object *intern = php_dom_instantiate_object_helper(
1093 		return_value,
1094 		dom_html_document_class_entry,
1095 		(xmlNodePtr) lxml_doc,
1096 		NULL
1097 	);
1098 	intern->document->is_modern_api_class = true;
1099 	return;
1100 
1101 fail_oom:
1102 	php_dom_throw_error(INVALID_STATE_ERR, 1);
1103 	lxb_html_document_destroy(document);
1104 	if (stream) {
1105 		php_stream_close(stream);
1106 	}
1107 	RETURN_THROWS();
1108 }
1109 
dom_write_output_smart_str(void * ctx,const char * buf,size_t size)1110 static zend_result dom_write_output_smart_str(void *ctx, const char *buf, size_t size)
1111 {
1112 	smart_str_appendl((smart_str *) ctx, buf, size);
1113 	return SUCCESS;
1114 }
1115 
dom_write_output_stream(void * application_data,const char * buf,size_t len)1116 static zend_result dom_write_output_stream(void *application_data, const char *buf, size_t len)
1117 {
1118 	php_stream *stream = (php_stream *) application_data;
1119 	if (UNEXPECTED(php_stream_write(stream, buf, len) < 0)) {
1120 		return FAILURE;
1121 	}
1122 	return SUCCESS;
1123 }
1124 
dom_saveHTML_write_string_len(void * application_data,const char * buf,size_t len)1125 static zend_result dom_saveHTML_write_string_len(void *application_data, const char *buf, size_t len)
1126 {
1127 	dom_output_ctx *output = (dom_output_ctx *) application_data;
1128 	lxb_status_t decode_status, encode_status;
1129 	const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
1130 	const lxb_char_t *buf_end = buf_ref + len;
1131 
1132 	do {
1133 		decode_status = output->decoding_data->decode(output->decode, &buf_ref, buf_end);
1134 
1135 		const lxb_codepoint_t *codepoints_ref = output->codepoints;
1136 		const lxb_codepoint_t *codepoints_end = codepoints_ref + lxb_encoding_decode_buf_used(output->decode);
1137 		do {
1138 			encode_status = output->encoding_data->encode(output->encode, &codepoints_ref, codepoints_end);
1139 			if (UNEXPECTED(output->write_output(
1140 				output->output_data,
1141 				(const char *) output->encoding_output,
1142 				lxb_encoding_encode_buf_used(output->encode)
1143 			) != SUCCESS)) {
1144 				return FAILURE;
1145 			}
1146 			lxb_encoding_encode_buf_used_set(output->encode, 0);
1147 		} while (encode_status == LXB_STATUS_SMALL_BUFFER);
1148 		lxb_encoding_decode_buf_used_set(output->decode, 0);
1149 	} while (decode_status == LXB_STATUS_SMALL_BUFFER);
1150 
1151 	return SUCCESS;
1152 }
1153 
dom_saveHTML_write_string(void * application_data,const char * buf)1154 static zend_result dom_saveHTML_write_string(void *application_data, const char *buf)
1155 {
1156 	return dom_saveHTML_write_string_len(application_data, buf, strlen(buf));
1157 }
1158 
dom_common_save(dom_output_ctx * output_ctx,const xmlDoc * docp,const xmlNode * node)1159 static zend_result dom_common_save(dom_output_ctx *output_ctx, const xmlDoc *docp, const xmlNode *node)
1160 {
1161 	/* Initialize everything related to encoding & decoding */
1162 	const lxb_encoding_data_t *decoding_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
1163 	const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
1164 		(const lxb_char_t *) docp->encoding,
1165 		strlen((const char *) docp->encoding)
1166 	);
1167 	lxb_encoding_encode_t encode;
1168 	lxb_encoding_decode_t decode;
1169 	lxb_char_t encoding_output[4096];
1170 	lxb_codepoint_t codepoints[4096];
1171 	(void) lxb_encoding_encode_init(&encode, encoding_data, encoding_output, sizeof(encoding_output) / sizeof(*encoding_output));
1172 	(void) lxb_encoding_decode_init(&decode, decoding_data, codepoints, sizeof(codepoints) / sizeof(*codepoints));
1173 	if (encoding_data->encoding == LXB_ENCODING_UTF_8) {
1174 		lxb_encoding_encode_replace_set(&encode, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
1175 	} else {
1176 		/* Fallback if there is no replacement by default */
1177 		lxb_encoding_encode_replace_set(&encode, (const lxb_char_t *) "?", 1);
1178 	}
1179 	lxb_encoding_decode_replace_set(&decode, LXB_ENCODING_REPLACEMENT_BUFFER, LXB_ENCODING_REPLACEMENT_BUFFER_LEN);
1180 
1181 	output_ctx->encoding_data = encoding_data;
1182 	output_ctx->decoding_data = decoding_data;
1183 	output_ctx->encode = &encode;
1184 	output_ctx->decode = &decode;
1185 	output_ctx->codepoints = codepoints;
1186 	output_ctx->encoding_output = encoding_output;
1187 
1188 	dom_html5_serialize_context ctx;
1189 	ctx.write_string_len = dom_saveHTML_write_string_len;
1190 	ctx.write_string = dom_saveHTML_write_string;
1191 	ctx.application_data = output_ctx;
1192 	if (UNEXPECTED(dom_html5_serialize(&ctx, node) != SUCCESS)) {
1193 		return FAILURE;
1194 	}
1195 
1196 	(void) lxb_encoding_decode_finish(&decode);
1197 	if (lxb_encoding_decode_buf_used(&decode)) {
1198 		const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) codepoints;
1199 		(void) encoding_data->encode(&encode, &codepoints_ref, codepoints_ref + lxb_encoding_decode_buf_used(&decode));
1200 		if (UNEXPECTED(output_ctx->write_output(
1201 			output_ctx->output_data,
1202 			(const char *) encoding_output,
1203 			lxb_encoding_encode_buf_used(&encode)) != SUCCESS
1204 		)) {
1205 			return FAILURE;
1206 		}
1207 	}
1208 	(void) lxb_encoding_encode_finish(&encode);
1209 	if (lxb_encoding_encode_buf_used(&encode)) {
1210 		if (UNEXPECTED(output_ctx->write_output(
1211 			output_ctx->output_data,
1212 			(const char *) encoding_output,
1213 			lxb_encoding_encode_buf_used(&encode)) != SUCCESS
1214 		)) {
1215 			return FAILURE;
1216 		}
1217 	}
1218 
1219 	return SUCCESS;
1220 }
1221 
PHP_METHOD(DOM_HTMLDocument,saveHTMLFile)1222 PHP_METHOD(DOM_HTMLDocument, saveHTMLFile)
1223 {
1224 	zval *id;
1225 	xmlDoc *docp;
1226 	size_t file_len;
1227 	dom_object *intern;
1228 	char *file;
1229 
1230 	id = ZEND_THIS;
1231 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "p", &file, &file_len) == FAILURE) {
1232 		RETURN_THROWS();
1233 	}
1234 
1235 	if (file_len == 0) {
1236 		zend_argument_value_error(1, "must not be empty");
1237 		RETURN_THROWS();
1238 	}
1239 
1240 	php_stream *stream = php_stream_open_wrapper_ex(file, "wb", REPORT_ERRORS, /* opened_path */ NULL, php_libxml_get_stream_context());
1241 	if (!stream) {
1242 		RETURN_FALSE;
1243 	}
1244 
1245 	DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
1246 
1247 	dom_output_ctx output_ctx;
1248 	output_ctx.output_data = stream;
1249 	output_ctx.write_output = dom_write_output_stream;
1250 	if (UNEXPECTED(dom_common_save(&output_ctx, docp, (const xmlNode *) docp) != SUCCESS)) {
1251 		php_stream_close(stream);
1252 		RETURN_FALSE;
1253 	}
1254 
1255 	zend_long bytes = php_stream_tell(stream);
1256 	php_stream_close(stream);
1257 
1258 	RETURN_LONG(bytes);
1259 }
1260 
PHP_METHOD(DOM_HTMLDocument,saveHTML)1261 PHP_METHOD(DOM_HTMLDocument, saveHTML)
1262 {
1263 	zval *nodep = NULL;
1264 	const xmlDoc *docp;
1265 	const xmlNode *node;
1266 	dom_object *intern, *nodeobj;
1267 
1268 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|O!", &nodep, dom_node_class_entry) == FAILURE) {
1269 		RETURN_THROWS();
1270 	}
1271 
1272 	DOM_GET_OBJ(docp, ZEND_THIS, xmlDocPtr, intern);
1273 
1274 	if (nodep != NULL) {
1275 		DOM_GET_OBJ(node, nodep, xmlNodePtr, nodeobj);
1276 		if (node->doc != docp) {
1277 			php_dom_throw_error(WRONG_DOCUMENT_ERR, dom_get_strict_error(intern->document));
1278 			RETURN_FALSE;
1279 		}
1280 	} else {
1281 		node = (const xmlNode *) docp;
1282 	}
1283 
1284 	smart_str buf = {0};
1285 	dom_output_ctx output_ctx;
1286 	output_ctx.output_data = &buf;
1287 	output_ctx.write_output = dom_write_output_smart_str;
1288 	/* Can't fail because dom_write_output_smart_str() can't fail. */
1289 	zend_result result = dom_common_save(&output_ctx, docp, node);
1290 	ZEND_ASSERT(result == SUCCESS);
1291 
1292 	RETURN_STR(smart_str_extract(&buf));
1293 }
1294 
/* Constructor stub: the constructor is private, so this body is never expected to run. */
PHP_METHOD(DOM_HTMLDocument, __construct)
{
	/* A private constructor cannot be invoked; reaching this point is a bug. */
	ZEND_UNREACHABLE();
}
1300 
dom_html_document_encoding_write(dom_object * obj,zval * newval)1301 zend_result dom_html_document_encoding_write(dom_object *obj, zval *newval)
1302 {
1303 	xmlDoc *docp = (xmlDocPtr) dom_object_get_node(obj);
1304 	if (docp == NULL) {
1305 		php_dom_throw_error(INVALID_STATE_ERR, 1);
1306 		return FAILURE;
1307 	}
1308 
1309 	/* Typed property, can only be IS_STRING or IS_NULL. */
1310 	ZEND_ASSERT(Z_TYPE_P(newval) == IS_STRING || Z_TYPE_P(newval) == IS_NULL);
1311 
1312 	if (Z_TYPE_P(newval) == IS_NULL) {
1313 		goto invalid_encoding;
1314 	}
1315 
1316 	zend_string *str = Z_STR_P(newval);
1317 	const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) ZSTR_VAL(str), ZSTR_LEN(str));
1318 
1319 	if (encoding_data != NULL) {
1320 		xmlFree((xmlChar *) docp->encoding);
1321 		docp->encoding = xmlStrdup((const xmlChar *) encoding_data->name);
1322 	} else {
1323 		goto invalid_encoding;
1324 	}
1325 
1326 	return SUCCESS;
1327 
1328 invalid_encoding:
1329 	zend_value_error("Invalid document encoding");
1330 	return FAILURE;
1331 }
1332 
1333 #endif  /* HAVE_LIBXML && HAVE_DOM */
1334