xref: /php-src/ext/dom/html_document.c (revision 1fdbb0ab)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Authors: Niels Dossche <nielsdos@php.net>                            |
14    +----------------------------------------------------------------------+
15 */
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "php_dom.h"
24 #include "html5_parser.h"
25 #include "html5_serializer.h"
26 #include "namespace_compat.h"
27 #include "dom_properties.h"
28 #include <Zend/zend_smart_string.h>
29 #include <lexbor/html/encoding.h>
30 #include <lexbor/encoding/encoding.h>
31 
/* Encoding assumed when encoding detection finds nothing usable.
 * The choice is implementation defined; HTML5 defaults to UTF-8 in all other cases, so we do too. */
#define DOM_FALLBACK_ENCODING_ID LXB_ENCODING_UTF_8
34 
/* Position reached by the previous line/column lookup, so that a following lookup
 * can continue scanning from there instead of starting over. */
typedef struct dom_line_column_cache {
	size_t last_line;   /* Line number at last_offset (starts at 1) */
	size_t last_column; /* Column number at last_offset (starts at 1) */
	size_t last_offset; /* Offset within the current input chunk that was scanned up to */
} dom_line_column_cache;
40 
41 typedef struct dom_lexbor_libxml2_bridge_application_data {
42 	const char *input_name;
43 	const lxb_codepoint_t *current_input_codepoints;
44 	const char *current_input_characters;
45 	size_t current_input_length;
46 	size_t current_total_offset;
47 	dom_line_column_cache cache_tokenizer;
48 	bool html_no_implied;
49 } dom_lexbor_libxml2_bridge_application_data;
50 
51 typedef struct dom_character_encoding_data {
52 	const lxb_encoding_data_t *encoding_data;
53 	size_t bom_shift;
54 } dom_character_encoding_data;
55 
56 typedef zend_result (*dom_write_output)(void*, const char *, size_t);
57 
58 typedef struct dom_output_ctx {
59 	const lxb_encoding_data_t *encoding_data;
60 	const lxb_encoding_data_t *decoding_data;
61 	lxb_encoding_encode_t *encode;
62 	lxb_encoding_decode_t *decode;
63 	lxb_codepoint_t *codepoints;
64 	lxb_char_t *encoding_output;
65 	void *output_data;
66 	dom_write_output write_output;
67 } dom_output_ctx;
68 
69 typedef struct dom_decoding_encoding_ctx {
70 	/* We can skip some conversion if the input and output encoding are both UTF-8,
71 	 * we only have to validate and substitute replacement characters */
72 	bool fast_path; /* Put first, near the encode & decode structures, for cache locality */
73 	lxb_encoding_encode_t encode;
74 	lxb_encoding_decode_t decode;
75 	const lxb_encoding_data_t *encode_data;
76 	const lxb_encoding_data_t *decode_data;
77 	lxb_char_t encoding_output[4096];
78 	lxb_codepoint_t codepoints[4096];
79 } dom_decoding_encoding_ctx;
80 
81 /* https://dom.spec.whatwg.org/#dom-document-implementation */
dom_modern_document_implementation_read(dom_object * obj,zval * retval)82 zend_result dom_modern_document_implementation_read(dom_object *obj, zval *retval)
83 {
84 	const uint32_t PROP_INDEX = 14;
85 
86 #if ZEND_DEBUG
87 	zend_string *implementation_str = ZSTR_INIT_LITERAL("implementation", false);
88 	const zend_property_info *prop_info = zend_get_property_info(dom_abstract_base_document_class_entry, implementation_str, 0);
89 	zend_string_release_ex(implementation_str, false);
90 	ZEND_ASSERT(OBJ_PROP_TO_NUM(prop_info->offset) == PROP_INDEX);
91 #endif
92 
93 	zval *cached_implementation = OBJ_PROP_NUM(&obj->std, PROP_INDEX);
94 	if (Z_ISUNDEF_P(cached_implementation)) {
95 		php_dom_create_implementation(cached_implementation, true);
96 	}
97 
98 	ZVAL_OBJ_COPY(retval, Z_OBJ_P(cached_implementation));
99 
100 	return SUCCESS;
101 }
102 
/* Prepares the UTF-8 encoder half of the context. The decoder half is left untouched
 * until the input encoding is known. */
static void dom_decoding_encoding_ctx_init(dom_decoding_encoding_ctx *ctx)
{
	/* Fast path is on by default so that the decoder finishing is skipped if the decoder was never set up. */
	ctx->fast_path = true;
	ctx->decode_data = NULL;
	ctx->encode_data = lxb_encoding_data(LXB_ENCODING_UTF_8);

	const size_t output_capacity = sizeof(ctx->encoding_output) / sizeof(ctx->encoding_output[0]);
	(void) lxb_encoding_encode_init(&ctx->encode, ctx->encode_data, ctx->encoding_output, output_capacity);
	(void) lxb_encoding_encode_replace_set(&ctx->encode, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
}
117 
/* Maps a Lexbor tokenizer error id to its textual error name.
 * Ids without an entry in the table yield "unknown error". */
static const char *dom_lexbor_tokenizer_error_code_to_string(lxb_html_tokenizer_error_id_t id)
{
	static const struct {
		lxb_html_tokenizer_error_id_t id;
		const char *name;
	} error_names[] = {
		{ LXB_HTML_TOKENIZER_ERROR_ABCLOFEMCO, "abrupt-closing-of-empty-comment" },
		{ LXB_HTML_TOKENIZER_ERROR_ABDOPUID, "abrupt-doctype-public-identifier" },
		{ LXB_HTML_TOKENIZER_ERROR_ABDOSYID, "abrupt-doctype-system-identifier" },
		{ LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE, "absence-of-digits-in-numeric-character-reference" },
		{ LXB_HTML_TOKENIZER_ERROR_CDINHTCO, "cdata-in-html-content" },
		{ LXB_HTML_TOKENIZER_ERROR_CHREOUUNRA, "character-reference-outside-unicode-range" },
		{ LXB_HTML_TOKENIZER_ERROR_COCHININST, "control-character-in-input-stream" },
		{ LXB_HTML_TOKENIZER_ERROR_COCHRE, "control-character-reference" },
		{ LXB_HTML_TOKENIZER_ERROR_ENTAWIAT, "end-tag-with-attributes" },
		{ LXB_HTML_TOKENIZER_ERROR_DUAT, "duplicate-attribute" },
		{ LXB_HTML_TOKENIZER_ERROR_ENTAWITRSO, "end-tag-with-trailing-solidus" },
		{ LXB_HTML_TOKENIZER_ERROR_EOBETANA, "eof-before-tag-name" },
		{ LXB_HTML_TOKENIZER_ERROR_EOINCD, "eof-in-cdata" },
		{ LXB_HTML_TOKENIZER_ERROR_EOINCO, "eof-in-comment" },
		{ LXB_HTML_TOKENIZER_ERROR_EOINDO, "eof-in-doctype" },
		{ LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE, "eof-in-script-html-comment-like-text" },
		{ LXB_HTML_TOKENIZER_ERROR_EOINTA, "eof-in-tag" },
		{ LXB_HTML_TOKENIZER_ERROR_INCLCO, "incorrectly-closed-comment" },
		{ LXB_HTML_TOKENIZER_ERROR_INOPCO, "incorrectly-opened-comment" },
		{ LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA, "invalid-character-sequence-after-doctype-name" },
		{ LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA, "invalid-first-character-of-tag-name" },
		{ LXB_HTML_TOKENIZER_ERROR_MIATVA, "missing-attribute-value" },
		{ LXB_HTML_TOKENIZER_ERROR_MIDONA, "missing-doctype-name" },
		{ LXB_HTML_TOKENIZER_ERROR_MIDOPUID, "missing-doctype-public-identifier" },
		{ LXB_HTML_TOKENIZER_ERROR_MIDOSYID, "missing-doctype-system-identifier" },
		{ LXB_HTML_TOKENIZER_ERROR_MIENTANA, "missing-end-tag-name" },
		{ LXB_HTML_TOKENIZER_ERROR_MIQUBEDOPUID, "missing-quote-before-doctype-public-identifier" },
		{ LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID, "missing-quote-before-doctype-system-identifier" },
		{ LXB_HTML_TOKENIZER_ERROR_MISEAFCHRE, "missing-semicolon-after-character-reference" },
		{ LXB_HTML_TOKENIZER_ERROR_MIWHAFDOPUKE, "missing-whitespace-after-doctype-public-keyword" },
		{ LXB_HTML_TOKENIZER_ERROR_MIWHAFDOSYKE, "missing-whitespace-after-doctype-system-keyword" },
		{ LXB_HTML_TOKENIZER_ERROR_MIWHBEDONA, "missing-whitespace-before-doctype-name" },
		{ LXB_HTML_TOKENIZER_ERROR_MIWHBEAT, "missing-whitespace-between-attributes" },
		{ LXB_HTML_TOKENIZER_ERROR_MIWHBEDOPUANSYID, "missing-whitespace-between-doctype-public-and-system-identifiers" },
		{ LXB_HTML_TOKENIZER_ERROR_NECO, "nested-comment" },
		{ LXB_HTML_TOKENIZER_ERROR_NOCHRE, "noncharacter-character-reference" },
		{ LXB_HTML_TOKENIZER_ERROR_NOININST, "noncharacter-in-input-stream" },
		{ LXB_HTML_TOKENIZER_ERROR_NOVOHTELSTTAWITRSO, "non-void-html-element-start-tag-with-trailing-solidus" },
		{ LXB_HTML_TOKENIZER_ERROR_NUCHRE, "null-character-reference" },
		{ LXB_HTML_TOKENIZER_ERROR_SUCHRE, "surrogate-character-reference" },
		{ LXB_HTML_TOKENIZER_ERROR_SUININST, "surrogate-in-input-stream" },
		{ LXB_HTML_TOKENIZER_ERROR_UNCHAFDOSYID, "unexpected-character-after-doctype-system-identifier" },
		{ LXB_HTML_TOKENIZER_ERROR_UNCHINATNA, "unexpected-character-in-attribute-name" },
		{ LXB_HTML_TOKENIZER_ERROR_UNCHINUNATVA, "unexpected-character-in-unquoted-attribute-value" },
		{ LXB_HTML_TOKENIZER_ERROR_UNEQSIBEATNA, "unexpected-equals-sign-before-attribute-name" },
		{ LXB_HTML_TOKENIZER_ERROR_UNNUCH, "unexpected-null-character" },
		{ LXB_HTML_TOKENIZER_ERROR_UNQUMAINOFTANA, "unexpected-question-mark-instead-of-tag-name" },
		{ LXB_HTML_TOKENIZER_ERROR_UNSOINTA, "unexpected-solidus-in-tag" },
		{ LXB_HTML_TOKENIZER_ERROR_UNNACHRE, "unknown-named-character-reference" },
	};

	for (size_t i = 0; i < sizeof(error_names) / sizeof(error_names[0]); i++) {
		if (error_names[i].id == id) {
			return error_names[i].name;
		}
	}

	return "unknown error";
}
173 
/* Maps a Lexbor tree construction error id to its textual error name.
 * Ids that are not listed yield "unknown error".
 * All names are spelled with hyphens only, including the "before-head-mode" closed-token
 * error, which previously contained a stray underscore. */
static const char *dom_lexbor_tree_error_code_to_string(lxb_html_tree_error_id_t id)
{
	switch (id) {
		case LXB_HTML_RULES_ERROR_UNTO: return "unexpected-token";
		case LXB_HTML_RULES_ERROR_UNCLTO: return "unexpected-closed-token";
		case LXB_HTML_RULES_ERROR_NUCH: return "null-character";
		case LXB_HTML_RULES_ERROR_UNCHTO: return "unexpected-character-token";
		case LXB_HTML_RULES_ERROR_UNTOININMO: return "unexpected-token-in-initial-mode";
		case LXB_HTML_RULES_ERROR_BADOTOININMO: return "bad-doctype-token-in-initial-mode";
		case LXB_HTML_RULES_ERROR_DOTOINBEHTMO: return "doctype-token-in-before-html-mode";
		case LXB_HTML_RULES_ERROR_UNCLTOINBEHTMO: return "unexpected-closed-token-in-before-html-mode";
		case LXB_HTML_RULES_ERROR_DOTOINBEHEMO: return "doctype-token-in-before-head-mode";
		case LXB_HTML_RULES_ERROR_UNCLTOINBEHEMO: return "unexpected-closed-token-in-before-head-mode";
		case LXB_HTML_RULES_ERROR_DOTOINHEMO: return "doctype-token-in-head-mode";
		case LXB_HTML_RULES_ERROR_NOVOHTELSTTAWITRSO: return "non-void-html-element-start-tag-with-trailing-solidus";
		case LXB_HTML_RULES_ERROR_HETOINHEMO: return "head-token-in-head-mode";
		case LXB_HTML_RULES_ERROR_UNCLTOINHEMO: return "unexpected-closed-token-in-head-mode";
		case LXB_HTML_RULES_ERROR_TECLTOWIOPINHEMO: return "template-closed-token-without-opening-in-head-mode";
		case LXB_HTML_RULES_ERROR_TEELISNOCUINHEMO: return "template-element-is-not-current-in-head-mode";
		case LXB_HTML_RULES_ERROR_DOTOINHENOMO: return "doctype-token-in-head-noscript-mode";
		case LXB_HTML_RULES_ERROR_DOTOAFHEMO: return "doctype-token-after-head-mode";
		case LXB_HTML_RULES_ERROR_HETOAFHEMO: return "head-token-after-head-mode";
		case LXB_HTML_RULES_ERROR_DOTOINBOMO: return "doctype-token-in-body-mode";
		case LXB_HTML_RULES_ERROR_BAENOPELISWR: return "bad-ending-open-elements-is-wrong";
		case LXB_HTML_RULES_ERROR_OPELISWR: return "open-elements-is-wrong";
		case LXB_HTML_RULES_ERROR_UNELINOPELST: return "unexpected-element-in-open-elements-stack";
		case LXB_HTML_RULES_ERROR_MIELINOPELST: return "missing-element-in-open-elements-stack";
		case LXB_HTML_RULES_ERROR_NOBOELINSC: return "no-body-element-in-scope";
		case LXB_HTML_RULES_ERROR_MIELINSC: return "missing-element-in-scope";
		case LXB_HTML_RULES_ERROR_UNELINSC: return "unexpected-element-in-scope";
		case LXB_HTML_RULES_ERROR_UNELINACFOST: return "unexpected-element-in-active-formatting-stack";
		case LXB_HTML_RULES_ERROR_UNENOFFI: return "unexpected-end-of-file";
		case LXB_HTML_RULES_ERROR_CHINTATE: return "characters-in-table-text";
		case LXB_HTML_RULES_ERROR_DOTOINTAMO: return "doctype-token-in-table-mode";
		case LXB_HTML_RULES_ERROR_DOTOINSEMO: return "doctype-token-in-select-mode";
		case LXB_HTML_RULES_ERROR_DOTOAFBOMO: return "doctype-token-after-body-mode";
		case LXB_HTML_RULES_ERROR_DOTOINFRMO: return "doctype-token-in-frameset-mode";
		case LXB_HTML_RULES_ERROR_DOTOAFFRMO: return "doctype-token-after-frameset-mode";
		case LXB_HTML_RULES_ERROR_DOTOFOCOMO: return "doctype-token-foreign-content-mode";
		default: return "unknown error";
	}
}
216 
/* Returns a human readable description for a bridge status code.
 * Statuses without a dedicated description yield "unknown error". */
static const char *dom_lexbor_libxml2_bridge_status_code_to_string(lexbor_libxml2_bridge_status status)
{
	const char *description = "unknown error";

	switch (status) {
		case LEXBOR_LIBXML2_BRIDGE_STATUS_CANNOT_INIT:
			description = "cannot initialize data structures";
			break;
		case LEXBOR_LIBXML2_BRIDGE_STATUS_FATAL_PARSE:
			description = "fatal error in parsing";
			break;
		case LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW:
			description = "string length overflow";
			break;
		case LEXBOR_LIBXML2_BRIDGE_STATUS_OOM:
			description = "out of memory";
			break;
		default:
			break;
	}

	return description;
}
227 
/* Puts the cache back at the very start of the input: line 1, column 1, offset 0. */
static void dom_reset_line_column_cache(dom_line_column_cache *cache)
{
	*cache = (dom_line_column_cache) {
		.last_line = 1,
		.last_column = 1,
		.last_offset = 0,
	};
}
234 
/* Advances the line/column cache up to the given offset.
 *
 * offset is an absolute offset; it is first made relative to the current input chunk by
 * subtracting application_data->current_total_offset and then clamped to the chunk length.
 * Scanning resumes at cache->last_offset, so nothing happens when the cache is already at
 * or past the requested position. The result is stored back into the cache. */
static void dom_find_line_and_column_using_cache(
	const dom_lexbor_libxml2_bridge_application_data *application_data,
	dom_line_column_cache *cache,
	size_t offset
)
{
	offset -= application_data->current_total_offset;
	if (offset > application_data->current_input_length) {
		/* Possible with empty input, also just good for general safety */
		offset = application_data->current_input_length;
	}

	size_t line = cache->last_line;
	size_t column = cache->last_column;
	size_t position = cache->last_offset;

	/* The input is either a codepoint array or UTF-8 bytes */
	const lxb_codepoint_t *codepoints = application_data->current_input_codepoints;
	if (codepoints != NULL) {
		for (; position < offset; position++) {
			if (codepoints[position] == 0x000A /* Unicode codepoint for line feed */) {
				line++;
				column = 1;
			} else {
				column++;
			}
		}
	} else {
		const char *characters = application_data->current_input_characters;
		for (; position < offset; position++) {
			const lxb_char_t byte = characters[position];
			if (byte == '\n') {
				line++;
				column = 1;
			} else if ((byte & 0xC0) != 0x80) {
				/* Only bytes that are not UTF-8 continuation bytes (10xxxxxx) start a new column.
				 * See Lexbor tokenizer patch.
				 * Note for future self: branchlessly computing the length and jumping by the length would be nice,
				 * however it takes so many instructions to do so that it is slower than this naive method. */
				column++;
			}
		}
	}

	cache->last_line = line;
	cache->last_column = column;
	cache->last_offset = position;
}
285 
/* Tokenizer error callback: resolves the error offset to a line and column through the
 * tokenizer cache and reports the error together with the input name. */
static void dom_lexbor_libxml2_bridge_tokenizer_error_reporter(
	void *application_data_voidptr,
	lxb_html_tokenizer_error_t *error,
	size_t offset
)
{
	dom_lexbor_libxml2_bridge_application_data *application_data = application_data_voidptr;
	dom_line_column_cache *cache = &application_data->cache_tokenizer;

	dom_find_line_and_column_using_cache(application_data, cache, offset);

	php_libxml_pretend_ctx_error_ex(
		application_data->input_name,
		cache->last_line,
		cache->last_column,
		"tokenizer error %s in %s, line: %zu, column: %zu\n",
		dom_lexbor_tokenizer_error_code_to_string(error->id),
		application_data->input_name,
		cache->last_line,
		cache->last_column
	);
}
296 
/* Tree construction error callback: reports the error with its line and column.
 * Errors spanning more than one column are reported with a column range. */
static void dom_lexbor_libxml2_bridge_tree_error_reporter(
	void *application_data_voidptr,
	lxb_html_tree_error_t *error,
	size_t line,
	size_t column,
	size_t len
)
{
	dom_lexbor_libxml2_bridge_application_data *application_data = application_data_voidptr;

	/* In no implied mode we follow libxml's behaviour of staying silent about a lacking doctype. */
	const bool is_lacking_doctype_report =
		line == 1 && application_data->html_no_implied && error->id == LXB_HTML_RULES_ERROR_UNTOININMO;
	if (is_lacking_doctype_report) {
		return;
	}

	const char *input_name = application_data->input_name;
	const char *error_name = dom_lexbor_tree_error_code_to_string(error->id);

	if (len > 1) {
		php_libxml_pretend_ctx_error_ex(
			input_name,
			line,
			column,
			"tree error %s in %s, line: %zu, column: %zu-%zu\n",
			error_name,
			input_name,
			line,
			column,
			column + len - 1
		);
		return;
	}

	/* EOF or a single-character token: a column range would be pointless here */
	php_libxml_pretend_ctx_error_ex(
		input_name,
		line,
		column,
		"tree error %s in %s, line: %zu, column: %zu\n",
		error_name,
		input_name,
		line,
		column
	);
}
338 
/* Returns the first direct child of parent that is an element with exactly the given name,
 * or NULL if there is none. Only direct children are inspected; parent must not be NULL. */
static xmlNodePtr dom_search_child(xmlNodePtr parent, const char *searching_for)
{
	for (xmlNodePtr child = parent->children; child != NULL; child = child->next) {
		if (child->type != XML_ELEMENT_NODE) {
			continue;
		}
		if (strcmp((const char *) child->name, searching_for) == 0) {
			return child;
		}
	}
	return NULL;
}
350 
/* Removes the first child element of parent with the given name, if any, and re-adds the
 * children it had to parent with xmlAddChild(). The removed element itself is freed. */
static void dom_place_remove_element_and_hoist_children(xmlNodePtr parent, const char *searching_for)
{
	xmlNodePtr wrapper = dom_search_child(parent, searching_for);
	if (wrapper == NULL) {
		return;
	}

	xmlUnlinkNode(wrapper);

	/* Always take the current first child: the child list shrinks with every unlink */
	xmlNodePtr child;
	while ((child = wrapper->children) != NULL) {
		xmlUnlinkNode(child);
		xmlAddChild(parent, child);
	}

	xmlFreeNode(wrapper);
}
367 
/* Post-processing of a freshly loaded HTML5 document.
 *
 * Only acts when HTML_PARSE_NOIMPLIED is set in options: the html, head and body elements
 * that were not explicitly present in the input (according to observations) are removed
 * and their children are hoisted into the parent. head and body are handled before html,
 * because they are looked up as children of the html element. */
static void dom_post_process_html5_loading(
	xmlDocPtr lxml_doc,
	zend_long options,
	const lexbor_libxml2_bridge_extracted_observations *observations
)
{
	if (!(options & HTML_PARSE_NOIMPLIED)) {
		return;
	}

	xmlNodePtr html_node = dom_search_child((xmlNodePtr) lxml_doc, "html");
	if (html_node == NULL) {
		/* Without an html element there is nothing to strip. Bailing out here also keeps a NULL
		 * parent away from the hoisting helper, which dereferences its parent unconditionally. */
		return;
	}

	if (!observations->has_explicit_head_tag) {
		dom_place_remove_element_and_hoist_children(html_node, "head");
	}
	if (!observations->has_explicit_body_tag) {
		dom_place_remove_element_and_hoist_children(html_node, "body");
	}
	if (!observations->has_explicit_html_tag) {
		dom_place_remove_element_and_hoist_children((xmlNodePtr) lxml_doc, "html");
	}
}
387 
388 /* https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding */
dom_determine_encoding(const char * source,size_t source_len)389 static dom_character_encoding_data dom_determine_encoding(const char *source, size_t source_len)
390 {
391 	dom_character_encoding_data result;
392 
393 	/* BOM sniffing */
394 	if (source_len >= 3 && source[0] == '\xEF' && source[1] == '\xBB' && source[2] == '\xBF') {
395 		result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
396 		result.bom_shift = 3;
397 		return result;
398 	} else if (source_len >= 2) {
399 		if (source[0] == '\xFE' && source[1] == '\xFF') {
400 			result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_16BE);
401 			result.bom_shift = 2;
402 			return result;
403 		} else if (source[0] == '\xFF' && source[1] == '\xFE') {
404 			result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_16LE);
405 			result.bom_shift = 2;
406 			return result;
407 		}
408 	}
409 
410 	/* Perform prescan */
411 	lxb_html_encoding_t encoding;
412 	lxb_status_t status = lxb_html_encoding_init(&encoding);
413 	if (status != LXB_STATUS_OK) {
414 		goto fallback_uninit;
415 	}
416 	/* This is the "wait either for 1024 bytes or 500ms" part */
417 	if (source_len > 1024) {
418 		source_len = 1024;
419 	}
420 	status = lxb_html_encoding_determine(&encoding, (const lxb_char_t *) source, (const lxb_char_t *) source + source_len);
421 	if (status != LXB_STATUS_OK) {
422 		goto fallback;
423 	}
424 	lxb_html_encoding_entry_t *entry = lxb_html_encoding_meta_entry(&encoding, 0);
425 	if (entry == NULL) {
426 		goto fallback;
427 	}
428 	result.encoding_data = lxb_encoding_data_by_pre_name(entry->name, entry->end - entry->name);
429 	if (!result.encoding_data) {
430 		goto fallback;
431 	}
432 	result.bom_shift = 0;
433 	lxb_html_encoding_destroy(&encoding, false);
434 	return result;
435 
436 fallback:
437 	lxb_html_encoding_destroy(&encoding, false);
438 fallback_uninit:
439 	result.encoding_data = lxb_encoding_data(DOM_FALLBACK_ENCODING_ID);
440 	result.bom_shift = 0;
441 	return result;
442 }
443 
/* Sets up the decoder for a known input encoding and selects the processing path.
 *
 * The fast path is chosen when the input encoding equals the encoder's encoding (UTF-8);
 * in that case position tracking works on the raw bytes starting at buf_start. Otherwise
 * position tracking works on the context's decoded codepoint buffer. */
static void dom_setup_parser_encoding_manually(const lxb_char_t *buf_start, const lxb_encoding_data_t *encoding_data, dom_decoding_encoding_ctx *decoding_encoding_ctx, dom_lexbor_libxml2_bridge_application_data *application_data)
{
	static const lxb_codepoint_t replacement_codepoint = LXB_ENCODING_REPLACEMENT_CODEPOINT;

	lxb_encoding_decode_t *decoder = &decoding_encoding_ctx->decode;
	const size_t codepoints_capacity =
		sizeof(decoding_encoding_ctx->codepoints) / sizeof(decoding_encoding_ctx->codepoints[0]);

	decoding_encoding_ctx->decode_data = encoding_data;
	(void) lxb_encoding_decode_init(decoder, encoding_data, decoding_encoding_ctx->codepoints, codepoints_capacity);
	(void) lxb_encoding_decode_replace_set(decoder, &replacement_codepoint, LXB_ENCODING_REPLACEMENT_BUFFER_LEN);

	/* Note: encode_data is for UTF-8 */
	const bool input_is_utf8 = (encoding_data == decoding_encoding_ctx->encode_data);
	decoding_encoding_ctx->fast_path = input_is_utf8;

	application_data->current_input_codepoints = input_is_utf8 ? NULL : decoding_encoding_ctx->codepoints;
	application_data->current_input_characters = input_is_utf8 ? (const char *) buf_start : NULL;
}
472 
/* Detects the input encoding and sets up the decoder for it.
 *
 * On return *buf_ref has been moved past a byte order mark, if there was one, and *read has
 * been reduced by the same amount. The decoder setup itself is handed the start of the
 * buffer as it was before skipping the BOM. */
static void dom_setup_parser_encoding_implicitly(
	const lxb_char_t **buf_ref,
	size_t *read,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	dom_lexbor_libxml2_bridge_application_data *application_data
)
{
	const lxb_char_t *input_start = *buf_ref;
	const dom_character_encoding_data detected = dom_determine_encoding((const char *) input_start, *read);

	*buf_ref = input_start + detected.bom_shift;
	*read -= detected.bom_shift;

	dom_setup_parser_encoding_manually(input_start, detected.encoding_data, decoding_encoding_ctx, application_data);
}
486 
/* Feeds one chunk of UTF-8 data to the parser and keeps the position bookkeeping in sync.
 *
 * encoding_output / encoded_length: the bytes handed to the parser.
 * input_buffer_length: how much of the original input this chunk corresponds to; the
 *                      running total offset is advanced by this amount.
 * Errors are only reported when at least one error reporter is installed in ctx.
 * Returns false when the parser rejects the chunk, true otherwise. */
static bool dom_process_parse_chunk(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	size_t encoded_length,
	const lxb_char_t *encoding_output,
	size_t input_buffer_length,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	dom_lexbor_libxml2_bridge_application_data *app = ctx->application_data;
	app->current_input_length = input_buffer_length;

	if (UNEXPECTED(lxb_html_document_parse_chunk(document, encoding_output, encoded_length) != LXB_STATUS_OK)) {
		return false;
	}

	const bool has_reporter = ctx->tokenizer_error_reporter || ctx->tree_error_reporter;
	if (has_reporter) {
		lexbor_libxml2_bridge_report_errors(ctx, parser, encoding_output, app->current_total_offset, tokenizer_error_offset, tree_error_offset);
		/* Move the line/column cache to the end of this chunk before the chunk is left behind */
		dom_find_line_and_column_using_cache(app, &app->cache_tokenizer, app->current_total_offset + input_buffer_length);
	}

	/* The next chunk starts at relative offset 0 again */
	app->current_total_offset += input_buffer_length;
	app->cache_tokenizer.last_offset = 0;
	return true;
}
512 
/* Fast path: turns non-validated UTF-8 into validated UTF-8 without transcoding.
 *
 * Valid stretches of the input are passed to the parser as they are. Whenever the decoder
 * returns something above LXB_ENCODING_MAX_CODEPOINT, the valid data before the offending
 * bytes is flushed, followed by the UTF-8 replacement bytes in place of the offending bytes.
 * *buf_ref_ref is updated to the position reached, on success and on failure.
 * Returns false as soon as a chunk cannot be processed.
 *
 * NOTE(review): a sequence that is merely cut off at buf_end presumably makes the decoder
 * return a value above LXB_ENCODING_MAX_CODEPOINT as well and would then be replaced too;
 * confirm against the decoder's behaviour for input that arrives in several chunks. */
static bool dom_decode_encode_fast_path(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	const lxb_char_t **buf_ref_ref,
	const lxb_char_t *buf_end,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	const lxb_char_t *cursor = *buf_ref_ref;
	/* Start of the data that has been validated but not yet given to the parser */
	const lxb_char_t *pending_start = cursor;
	bool ok = true;

	while (cursor != buf_end) {
		if (decoding_encoding_ctx->decode.u.utf_8.need == 0 && *cursor < 0x80) {
			/* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we
			 * need more UTF-8 bytes to complete a sequence.
			 * It might be tempting to use SIMD here, but it turns out that this is less efficient because
			 * we need to process the same byte multiple times sometimes when mixing ASCII with multibyte. */
			cursor++;
			continue;
		}

		const lxb_char_t *sequence_start = cursor;
		lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &cursor, buf_end);
		if (!UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {
			continue;
		}

		/* The offending bytes are not passed on, the UTF-8 replacement bytes take their place */
		const size_t invalid_length = cursor - sequence_start;
		const size_t consumed_length = cursor - pending_start;
		const bool flushed =
			dom_process_parse_chunk(
				ctx,
				document,
				parser,
				consumed_length - invalid_length,
				pending_start,
				consumed_length,
				tokenizer_error_offset,
				tree_error_offset
			)
			&& dom_process_parse_chunk(
				ctx,
				document,
				parser,
				LXB_ENCODING_REPLACEMENT_SIZE,
				LXB_ENCODING_REPLACEMENT_BYTES,
				0,
				tokenizer_error_offset,
				tree_error_offset
			);
		if (!flushed) {
			ok = false;
			break;
		}
		pending_start = cursor;
	}

	/* Flush whatever valid data is left at the end of the buffer */
	if (ok && cursor != pending_start) {
		ok = dom_process_parse_chunk(
			ctx,
			document,
			parser,
			cursor - pending_start,
			pending_start,
			cursor - pending_start,
			tokenizer_error_offset,
			tree_error_offset
		);
	}

	*buf_ref_ref = cursor;
	return ok;
}
586 
/* Slow path: decodes the input to codepoints and re-encodes them with the context's encoder
 * before parsing.
 *
 * Both conversions work on fixed-size buffers, so each of them is repeated for as long as it
 * reports LXB_STATUS_SMALL_BUFFER. Every encoder run is passed to the parser right away and
 * the respective buffer is emptied afterwards.
 * *buf_ref_ref is updated to the position reached, on success and on failure.
 * Returns false as soon as a chunk cannot be processed. */
static bool dom_decode_encode_slow_path(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	const lxb_char_t **buf_ref_ref,
	const lxb_char_t *buf_end,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	lxb_encoding_decode_t *decoder = &decoding_encoding_ctx->decode;
	lxb_encoding_encode_t *encoder = &decoding_encoding_ctx->encode;
	const lxb_char_t *cursor = *buf_ref_ref;
	bool ok = true;
	lexbor_status_t decode_status;

	do {
		decode_status = decoding_encoding_ctx->decode_data->decode(decoder, &cursor, buf_end);

		const size_t decoded_count = lxb_encoding_decode_buf_used(decoder);
		const lxb_codepoint_t *codepoint_cursor = (const lxb_codepoint_t *) decoding_encoding_ctx->codepoints;
		const lxb_codepoint_t *codepoint_end = decoding_encoding_ctx->codepoints + decoded_count;

		lexbor_status_t encode_status;
		do {
			encode_status = decoding_encoding_ctx->encode_data->encode(encoder, &codepoint_cursor, codepoint_end);
			ZEND_ASSERT(encode_status != LXB_STATUS_ERROR && "parameters and replacements should be valid");

			const bool processed = dom_process_parse_chunk(
				ctx,
				document,
				parser,
				lxb_encoding_encode_buf_used(encoder),
				decoding_encoding_ctx->encoding_output,
				decoded_count,
				tokenizer_error_offset,
				tree_error_offset
			);
			if (!processed) {
				ok = false;
				goto done;
			}
			/* The encoded bytes were consumed, make room for the next run */
			lxb_encoding_encode_buf_used_set(encoder, 0);
		} while (encode_status == LXB_STATUS_SMALL_BUFFER);

		lxb_encoding_decode_buf_used_set(decoder, 0);
	} while (decode_status == LXB_STATUS_SMALL_BUFFER);

done:
	*buf_ref_ref = cursor;
	return ok;
}
631 
/* Processes one input buffer, dispatching to the fast or the slow path depending on the
 * context's fast_path flag. Arguments and return value are passed through unchanged. */
static bool dom_parse_decode_encode_step(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	const lxb_char_t **buf_ref_ref,
	const lxb_char_t *buf_end,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	if (!decoding_encoding_ctx->fast_path) {
		return dom_decode_encode_slow_path(
			ctx, document, parser,
			buf_ref_ref, buf_end,
			decoding_encoding_ctx,
			tokenizer_error_offset, tree_error_offset
		);
	}

	return dom_decode_encode_fast_path(
		ctx, document, parser,
		buf_ref_ref, buf_end,
		decoding_encoding_ctx,
		tokenizer_error_offset, tree_error_offset
	);
}
667 
/* Finishes decoding and encoding after the last input buffer was processed.
 *
 * On the slow path the decoder is finished first; codepoints it still emits are encoded and
 * passed to the parser. Afterwards the encoder is finished and anything it added is passed
 * to the parser as well.
 *
 * The encoder and decoder buffers are emptied after the first hand-over, just like the
 * streaming slow path does after every chunk. Without that, the final flush would still see
 * the already delivered bytes counted as used and would feed them to the parser a second time.
 *
 * Returns false when a chunk cannot be processed, true otherwise. */
static bool dom_parse_decode_encode_finish(
	lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_document_t *document,
	lxb_html_parser_t *parser,
	dom_decoding_encoding_ctx *decoding_encoding_ctx,
	size_t *tokenizer_error_offset,
	size_t *tree_error_offset
)
{
	lxb_encoding_encode_t *encoder = &decoding_encoding_ctx->encode;

	if (!decoding_encoding_ctx->fast_path) {
		/* Fast path handles codepoints one by one, so this part is not applicable in that case */
		lxb_encoding_decode_t *decoder = &decoding_encoding_ctx->decode;

		(void) lxb_encoding_decode_finish(decoder);
		size_t decoding_buffer_size = lxb_encoding_decode_buf_used(decoder);
		if (decoding_buffer_size > 0) {
			const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) decoding_encoding_ctx->codepoints;
			const lxb_codepoint_t *codepoints_end = codepoints_ref + decoding_buffer_size;
			(void) decoding_encoding_ctx->encode_data->encode(encoder, &codepoints_ref, codepoints_end);
			if (!dom_process_parse_chunk(
				ctx,
				document,
				parser,
				lxb_encoding_encode_buf_used(encoder),
				decoding_encoding_ctx->encoding_output,
				decoding_buffer_size,
				tokenizer_error_offset,
				tree_error_offset
			)) {
				return false;
			}
			/* Both buffers were consumed by the chunk above; only new output may be flushed below */
			lxb_encoding_encode_buf_used_set(encoder, 0);
			lxb_encoding_decode_buf_used_set(decoder, 0);
		}
	}

	(void) lxb_encoding_encode_finish(encoder);

	/* The decoder is only looked at when there is encoder output to flush */
	const size_t trailing_length = lxb_encoding_encode_buf_used(encoder);
	if (trailing_length > 0
		&& !dom_process_parse_chunk(
			ctx,
			document,
			parser,
			trailing_length,
			decoding_encoding_ctx->encoding_output,
			lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode),
			tokenizer_error_offset,
			tree_error_offset
	)) {
		return false;
	}
	return true;
}
715 
/*
 * Checks that `options` only contains flags accepted by the HTML document factories.
 * When a flag outside the accepted set is present, a value error is raised for
 * argument number `arg_num` through zend_argument_value_error() and false is returned;
 * otherwise the function returns true.
 */
static bool check_options_validity(uint32_t arg_num, zend_long options)
{
	const zend_long accepted_flags = XML_PARSE_NOERROR | XML_PARSE_COMPACT | HTML_PARSE_NOIMPLIED | DOM_HTML_NO_DEFAULT_NS;
	const zend_long rejected_flags = options & ~accepted_flags;

	if (rejected_flags == 0) {
		return true;
	}

	zend_argument_value_error(
		arg_num,
		"contains invalid flags (allowed flags: "
		"LIBXML_NOERROR, "
		"LIBXML_COMPACT, "
		"LIBXML_HTML_NOIMPLIED, "
		"Dom\\NO_DEFAULT_NS)"
	);
	return false;
}
729 
PHP_METHOD(Dom_HTMLDocument,createEmpty)730 PHP_METHOD(Dom_HTMLDocument, createEmpty)
731 {
732 	const char *encoding = "UTF-8";
733 	size_t encoding_len = strlen("UTF-8");
734 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|p", &encoding, &encoding_len) == FAILURE) {
735 		RETURN_THROWS();
736 	}
737 
738 	const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) encoding, encoding_len);
739 
740 	if (encoding_data == NULL) {
741 		zend_argument_value_error(1, "must be a valid document encoding");
742 		RETURN_THROWS();
743 	}
744 
745 	xmlDocPtr lxml_doc = php_dom_create_html_doc();
746 	if (UNEXPECTED(lxml_doc == NULL)) {
747 		goto oom;
748 	}
749 
750 	lxml_doc->encoding = xmlStrdup((const xmlChar *) encoding);
751 
752 	dom_object *intern = php_dom_instantiate_object_helper(
753 		return_value,
754 		dom_html_document_class_entry,
755 		(xmlNodePtr) lxml_doc,
756 		NULL
757 	);
758 	dom_set_xml_class(intern->document);
759 	intern->document->private_data = php_dom_libxml_ns_mapper_header(php_dom_libxml_ns_mapper_create());
760 	return;
761 
762 oom:
763 	php_dom_throw_error(INVALID_STATE_ERR, true);
764 	RETURN_THROWS();
765 }
766 
767 /* Only bother to register error handling when the error reports can become observable. */
dom_should_register_error_handlers(zend_long options)768 static bool dom_should_register_error_handlers(zend_long options)
769 {
770 	if (options & XML_PARSE_NOERROR) {
771 		return false;
772 	}
773 
774 	return php_libxml_uses_internal_errors() || ((EG(error_reporting) | EG(user_error_handler_error_reporting)) & E_WARNING);
775 }
776 
PHP_METHOD(Dom_HTMLDocument,createFromString)777 PHP_METHOD(Dom_HTMLDocument, createFromString)
778 {
779 	const char *source, *override_encoding = NULL;
780 	size_t source_len, override_encoding_len;
781 	zend_long options = 0;
782 	if (zend_parse_parameters(
783 		ZEND_NUM_ARGS(),
784 		"s|lp!",
785 		&source,
786 		&source_len,
787 		&options,
788 		&override_encoding,
789 		&override_encoding_len
790 	) == FAILURE) {
791 		RETURN_THROWS();
792 	}
793 
794 	if (!check_options_validity(2, options)) {
795 		RETURN_THROWS();
796 	}
797 
798 	dom_lexbor_libxml2_bridge_application_data application_data;
799 	application_data.input_name = "Entity";
800 	application_data.current_total_offset = 0;
801 	application_data.html_no_implied = options & HTML_PARSE_NOIMPLIED;
802 	dom_reset_line_column_cache(&application_data.cache_tokenizer);
803 	lexbor_libxml2_bridge_parse_context ctx;
804 	lexbor_libxml2_bridge_parse_context_init(&ctx);
805 	if (dom_should_register_error_handlers(options)) {
806 		lexbor_libxml2_bridge_parse_set_error_callbacks(
807 			&ctx,
808 			dom_lexbor_libxml2_bridge_tokenizer_error_reporter,
809 			dom_lexbor_libxml2_bridge_tree_error_reporter
810 		);
811 	}
812 	ctx.application_data = &application_data;
813 
814 	size_t tokenizer_error_offset = 0;
815 	size_t tree_error_offset = 0;
816 
817 	/* Setup everything encoding & decoding related */
818 	const lxb_char_t *buf_ref = (const lxb_char_t *) source;
819 	dom_decoding_encoding_ctx decoding_encoding_ctx;
820 	dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
821 	if (override_encoding != NULL) {
822 		const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
823 			(const lxb_char_t *) override_encoding,
824 			override_encoding_len
825 		);
826 		if (!encoding_data) {
827 			zend_argument_value_error(3, "must be a valid document encoding");
828 			RETURN_THROWS();
829 		}
830 		dom_setup_parser_encoding_manually(buf_ref, encoding_data, &decoding_encoding_ctx, &application_data);
831 	} else {
832 		dom_setup_parser_encoding_implicitly(&buf_ref, &source_len, &decoding_encoding_ctx, &application_data);
833 	}
834 
835 	lxb_html_document_t *document = lxb_html_document_create();
836 	if (UNEXPECTED(document == NULL)) {
837 		goto fail_oom;
838 	}
839 
840 	lxb_status_t lexbor_status = lxb_html_document_parse_chunk_begin(document);
841 	if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
842 		goto fail_oom;
843 	}
844 
845 	lxb_html_parser_t *parser = document->dom_document.parser;
846 
847 	while (source_len > 0) {
848 		size_t chunk_size = source_len;
849 		const size_t MAX_CHUNK_SIZE = sizeof(decoding_encoding_ctx.encoding_output) / sizeof(*decoding_encoding_ctx.encoding_output);
850 		if (chunk_size > MAX_CHUNK_SIZE) {
851 			chunk_size = MAX_CHUNK_SIZE;
852 		}
853 		source_len -= chunk_size;
854 
855 		const lxb_char_t *buf_end = buf_ref + chunk_size;
856 		bool result = dom_parse_decode_encode_step(
857 			&ctx,
858 			document,
859 			parser,
860 			&buf_ref,
861 			buf_end,
862 			&decoding_encoding_ctx,
863 			&tokenizer_error_offset,
864 			&tree_error_offset
865 		);
866 		if (!result) {
867 			goto fail_oom;
868 		}
869 	}
870 
871 	if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
872 		goto fail_oom;
873 	}
874 
875 	lexbor_status = lxb_html_document_parse_chunk_end(document);
876 	if (lexbor_status != LXB_STATUS_OK) {
877 		goto fail_oom;
878 	}
879 
880 	php_dom_libxml_ns_mapper *ns_mapper = php_dom_libxml_ns_mapper_create();
881 
882 	xmlDocPtr lxml_doc;
883 	lexbor_libxml2_bridge_status bridge_status = lexbor_libxml2_bridge_convert_document(
884 		document,
885 		&lxml_doc,
886 		options & XML_PARSE_COMPACT,
887 		!(options & DOM_HTML_NO_DEFAULT_NS),
888 		ns_mapper
889 	);
890 	lexbor_libxml2_bridge_copy_observations(parser->tree, &ctx.observations);
891 	if (UNEXPECTED(bridge_status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) {
892 		php_dom_libxml_ns_mapper_destroy(ns_mapper);
893 		php_libxml_ctx_error(
894 			NULL,
895 			"%s in %s",
896 			dom_lexbor_libxml2_bridge_status_code_to_string(bridge_status),
897 			application_data.input_name
898 		);
899 		lxb_html_document_destroy(document);
900 		RETURN_FALSE;
901 	}
902 	lxb_html_document_destroy(document);
903 
904 	dom_post_process_html5_loading(lxml_doc, options, &ctx.observations);
905 
906 	if (decoding_encoding_ctx.decode_data) {
907 		lxml_doc->encoding = xmlStrdup((const xmlChar *) decoding_encoding_ctx.decode_data->name);
908 	} else {
909 		lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
910 	}
911 
912 	dom_object *intern = php_dom_instantiate_object_helper(
913 		return_value,
914 		dom_html_document_class_entry,
915 		(xmlNodePtr) lxml_doc,
916 		NULL
917 	);
918 	dom_set_xml_class(intern->document);
919 	intern->document->private_data = php_dom_libxml_ns_mapper_header(ns_mapper);
920 	return;
921 
922 fail_oom:
923 	lxb_html_document_destroy(document);
924 	php_dom_throw_error(INVALID_STATE_ERR, true);
925 	RETURN_THROWS();
926 }
927 
PHP_METHOD(Dom_HTMLDocument,createFromFile)928 PHP_METHOD(Dom_HTMLDocument, createFromFile)
929 {
930 	const char *filename, *override_encoding = NULL;
931 	php_dom_libxml_ns_mapper *ns_mapper = NULL;
932 	size_t filename_len, override_encoding_len;
933 	zend_long options = 0;
934 	php_stream *stream = NULL;
935 	if (zend_parse_parameters(
936 		ZEND_NUM_ARGS(),
937 		"p|lp!",
938 		&filename,
939 		&filename_len,
940 		&options,
941 		&override_encoding,
942 		&override_encoding_len
943 	) == FAILURE) {
944 		RETURN_THROWS();
945 	}
946 
947 	/* See php_libxml_streams_IO_open_wrapper(), apparently this caused issues in the past. */
948 	if (strstr(filename, "%00")) {
949 		zend_argument_value_error(1, "must not contain percent-encoded NUL bytes");
950 		RETURN_THROWS();
951 	}
952 
953 	if (!check_options_validity(2, options)) {
954 		RETURN_THROWS();
955 	}
956 
957 	dom_lexbor_libxml2_bridge_application_data application_data;
958 	application_data.input_name = filename;
959 	application_data.current_total_offset = 0;
960 	application_data.html_no_implied = options & HTML_PARSE_NOIMPLIED;
961 	dom_reset_line_column_cache(&application_data.cache_tokenizer);
962 	lexbor_libxml2_bridge_parse_context ctx;
963 	lexbor_libxml2_bridge_parse_context_init(&ctx);
964 	if (dom_should_register_error_handlers(options)) {
965 		lexbor_libxml2_bridge_parse_set_error_callbacks(
966 			&ctx,
967 			dom_lexbor_libxml2_bridge_tokenizer_error_reporter,
968 			dom_lexbor_libxml2_bridge_tree_error_reporter
969 		);
970 	}
971 	ctx.application_data = &application_data;
972 
973 	char buf[4096];
974 
975 	/* Setup everything encoding & decoding related */
976 	dom_decoding_encoding_ctx decoding_encoding_ctx;
977 	dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
978 	bool should_determine_encoding_implicitly = true; /* First read => determine encoding implicitly */
979 	if (override_encoding != NULL) {
980 		const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
981 			(const lxb_char_t *) override_encoding,
982 			override_encoding_len
983 		);
984 		if (!encoding_data) {
985 			zend_argument_value_error(3, "must be a valid document encoding");
986 			RETURN_THROWS();
987 		}
988 		should_determine_encoding_implicitly = false;
989 		dom_setup_parser_encoding_manually((const lxb_char_t *) buf, encoding_data, &decoding_encoding_ctx, &application_data);
990 	}
991 
992 	zend_string *opened_path = NULL;
993 	stream = php_stream_open_wrapper_ex(filename, "rb", REPORT_ERRORS, &opened_path, php_libxml_get_stream_context());
994 	if (!stream) {
995 		if (!EG(exception)) {
996 			zend_throw_exception_ex(NULL, 0, "Cannot open file '%s'", filename);
997 		}
998 		RETURN_THROWS();
999 	}
1000 
1001 	/* MIME sniff */
1002 	if (should_determine_encoding_implicitly) {
1003 		zend_string *charset = php_libxml_sniff_charset_from_stream(stream);
1004 		if (charset != NULL) {
1005 			const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
1006 				(const lxb_char_t *) ZSTR_VAL(charset),
1007 				ZSTR_LEN(charset)
1008 			);
1009 			if (encoding_data != NULL) {
1010 				should_determine_encoding_implicitly = false;
1011 				dom_setup_parser_encoding_manually(
1012 					(const lxb_char_t *) buf,
1013 					encoding_data,
1014 					&decoding_encoding_ctx,
1015 					&application_data
1016 				);
1017 			}
1018 			zend_string_release_ex(charset, false);
1019 		}
1020 	}
1021 
1022 	lxb_html_document_t *document = lxb_html_document_create();
1023 	if (UNEXPECTED(document == NULL)) {
1024 		goto fail_oom;
1025 	}
1026 
1027 	lxb_status_t lexbor_status = lxb_html_document_parse_chunk_begin(document);
1028 	if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
1029 		goto fail_oom;
1030 	}
1031 
1032 	size_t tokenizer_error_offset = 0;
1033 	size_t tree_error_offset = 0;
1034 	ssize_t read;
1035 	lxb_html_parser_t *parser = document->dom_document.parser;
1036 
1037 	while ((read = php_stream_read(stream, buf, sizeof(buf))) > 0) {
1038 		const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
1039 
1040 		if (should_determine_encoding_implicitly) {
1041 			should_determine_encoding_implicitly = false;
1042 			dom_setup_parser_encoding_implicitly(&buf_ref, (size_t *) &read, &decoding_encoding_ctx, &application_data);
1043 		}
1044 
1045 		const lxb_char_t *buf_end = buf_ref + read;
1046 		bool result = dom_parse_decode_encode_step(
1047 			&ctx,
1048 			document,
1049 			parser,
1050 			&buf_ref,
1051 			buf_end,
1052 			&decoding_encoding_ctx,
1053 			&tokenizer_error_offset,
1054 			&tree_error_offset
1055 		);
1056 		if (!result) {
1057 			goto fail_oom;
1058 		}
1059 	}
1060 
1061 	if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
1062 		goto fail_oom;
1063 	}
1064 
1065 	lexbor_status = lxb_html_document_parse_chunk_end(document);
1066 	if (lexbor_status != LXB_STATUS_OK) {
1067 		goto fail_oom;
1068 	}
1069 
1070 	ns_mapper = php_dom_libxml_ns_mapper_create();
1071 
1072 	xmlDocPtr lxml_doc;
1073 	lexbor_libxml2_bridge_status bridge_status = lexbor_libxml2_bridge_convert_document(
1074 		document,
1075 		&lxml_doc,
1076 		options & XML_PARSE_COMPACT,
1077 		!(options & DOM_HTML_NO_DEFAULT_NS),
1078 		ns_mapper
1079 	);
1080 	lexbor_libxml2_bridge_copy_observations(parser->tree, &ctx.observations);
1081 	if (UNEXPECTED(bridge_status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) {
1082 		php_libxml_ctx_error(NULL, "%s in %s", dom_lexbor_libxml2_bridge_status_code_to_string(bridge_status), filename);
1083 		RETVAL_FALSE;
1084 		goto fail_general;
1085 	}
1086 	lxb_html_document_destroy(document);
1087 
1088 	dom_post_process_html5_loading(lxml_doc, options, &ctx.observations);
1089 
1090 	if (decoding_encoding_ctx.decode_data) {
1091 		lxml_doc->encoding = xmlStrdup((const xmlChar *) decoding_encoding_ctx.decode_data->name);
1092 	} else {
1093 		lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
1094 	}
1095 
1096 	if (stream->wrapper == &php_plain_files_wrapper && opened_path != NULL) {
1097 		xmlChar *converted = xmlPathToURI((const xmlChar *) ZSTR_VAL(opened_path));
1098 		if (UNEXPECTED(!converted)) {
1099 			goto fail_oom;
1100 		}
1101 		/* Check for "file:/" instead of "file://" because of libxml2 quirk */
1102 		if (strncmp((const char *) converted, "file:/", sizeof("file:/") - 1) != 0) {
1103 			xmlChar *buffer = xmlStrdup((const xmlChar *) "file://");
1104 			if (UNEXPECTED(!buffer)) {
1105 				xmlFree(converted);
1106 				goto fail_oom;
1107 			}
1108 			xmlChar *new_buffer = xmlStrcat(buffer, converted);
1109 			if (UNEXPECTED(!new_buffer)) {
1110 				xmlFree(buffer);
1111 				xmlFree(converted);
1112 				goto fail_oom;
1113 			}
1114 			xmlFree(converted);
1115 			lxml_doc->URL = new_buffer;
1116 		} else {
1117 #if PHP_WIN32
1118 			converted = php_dom_libxml_fix_file_path(converted);
1119 #endif
1120 			lxml_doc->URL = converted;
1121 		}
1122 	} else {
1123 		lxml_doc->URL = xmlStrdup((const xmlChar *) filename);
1124 	}
1125 
1126 	if (opened_path != NULL) {
1127 		zend_string_release_ex(opened_path, false);
1128 	}
1129 	php_stream_close(stream);
1130 	stream = NULL;
1131 
1132 	dom_object *intern = php_dom_instantiate_object_helper(
1133 		return_value,
1134 		dom_html_document_class_entry,
1135 		(xmlNodePtr) lxml_doc,
1136 		NULL
1137 	);
1138 	dom_set_xml_class(intern->document);
1139 	intern->document->private_data = php_dom_libxml_ns_mapper_header(ns_mapper);
1140 	return;
1141 
1142 fail_oom:
1143 	php_dom_throw_error(INVALID_STATE_ERR, true);
1144 fail_general:
1145 	if (ns_mapper != NULL) {
1146 		php_dom_libxml_ns_mapper_destroy(ns_mapper);
1147 	}
1148 	lxb_html_document_destroy(document);
1149 	php_stream_close(stream);
1150 	if (opened_path != NULL) {
1151 		zend_string_release_ex(opened_path, false);
1152 	}
1153 }
1154 
dom_write_output_smart_str(void * ctx,const char * buf,size_t size)1155 static zend_result dom_write_output_smart_str(void *ctx, const char *buf, size_t size)
1156 {
1157 	smart_str_appendl((smart_str *) ctx, buf, size);
1158 	return SUCCESS;
1159 }
1160 
dom_write_output_stream(void * application_data,const char * buf,size_t len)1161 static zend_result dom_write_output_stream(void *application_data, const char *buf, size_t len)
1162 {
1163 	php_stream *stream = (php_stream *) application_data;
1164 	if (UNEXPECTED(php_stream_write(stream, buf, len) < 0)) {
1165 		return FAILURE;
1166 	}
1167 	return SUCCESS;
1168 }
1169 
dom_saveHTML_write_string_len(void * application_data,const char * buf,size_t len)1170 static zend_result dom_saveHTML_write_string_len(void *application_data, const char *buf, size_t len)
1171 {
1172 	dom_output_ctx *output = (dom_output_ctx *) application_data;
1173 	lxb_status_t decode_status, encode_status;
1174 	const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
1175 	const lxb_char_t *buf_end = buf_ref + len;
1176 
1177 	do {
1178 		decode_status = output->decoding_data->decode(output->decode, &buf_ref, buf_end);
1179 
1180 		const lxb_codepoint_t *codepoints_ref = output->codepoints;
1181 		const lxb_codepoint_t *codepoints_end = codepoints_ref + lxb_encoding_decode_buf_used(output->decode);
1182 		do {
1183 			encode_status = output->encoding_data->encode(output->encode, &codepoints_ref, codepoints_end);
1184 			if (UNEXPECTED(output->write_output(
1185 				output->output_data,
1186 				(const char *) output->encoding_output,
1187 				lxb_encoding_encode_buf_used(output->encode)
1188 			) != SUCCESS)) {
1189 				return FAILURE;
1190 			}
1191 			lxb_encoding_encode_buf_used_set(output->encode, 0);
1192 		} while (encode_status == LXB_STATUS_SMALL_BUFFER);
1193 		lxb_encoding_decode_buf_used_set(output->decode, 0);
1194 	} while (decode_status == LXB_STATUS_SMALL_BUFFER);
1195 
1196 	return SUCCESS;
1197 }
1198 
dom_saveHTML_write_string(void * application_data,const char * buf)1199 static zend_result dom_saveHTML_write_string(void *application_data, const char *buf)
1200 {
1201 	return dom_saveHTML_write_string_len(application_data, buf, strlen(buf));
1202 }
1203 
dom_common_save(dom_output_ctx * output_ctx,const xmlDoc * docp,const xmlNode * node)1204 static zend_result dom_common_save(dom_output_ctx *output_ctx, const xmlDoc *docp, const xmlNode *node)
1205 {
1206 	/* Initialize everything related to encoding & decoding */
1207 	const lxb_encoding_data_t *decoding_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
1208 	const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
1209 		(const lxb_char_t *) docp->encoding,
1210 		strlen((const char *) docp->encoding)
1211 	);
1212 	lxb_encoding_encode_t encode;
1213 	lxb_encoding_decode_t decode;
1214 	lxb_char_t encoding_output[4096];
1215 	lxb_codepoint_t codepoints[4096];
1216 	(void) lxb_encoding_encode_init(&encode, encoding_data, encoding_output, sizeof(encoding_output) / sizeof(*encoding_output));
1217 	(void) lxb_encoding_decode_init(&decode, decoding_data, codepoints, sizeof(codepoints) / sizeof(*codepoints));
1218 	if (encoding_data->encoding == LXB_ENCODING_UTF_8) {
1219 		lxb_encoding_encode_replace_set(&encode, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
1220 	} else {
1221 		/* Fallback if there is no replacement by default */
1222 		lxb_encoding_encode_replace_set(&encode, (const lxb_char_t *) "?", 1);
1223 	}
1224 	lxb_encoding_decode_replace_set(&decode, LXB_ENCODING_REPLACEMENT_BUFFER, LXB_ENCODING_REPLACEMENT_BUFFER_LEN);
1225 
1226 	output_ctx->encoding_data = encoding_data;
1227 	output_ctx->decoding_data = decoding_data;
1228 	output_ctx->encode = &encode;
1229 	output_ctx->decode = &decode;
1230 	output_ctx->codepoints = codepoints;
1231 	output_ctx->encoding_output = encoding_output;
1232 
1233 	dom_html5_serialize_context ctx;
1234 	ctx.write_string_len = dom_saveHTML_write_string_len;
1235 	ctx.write_string = dom_saveHTML_write_string;
1236 	ctx.application_data = output_ctx;
1237 	if (UNEXPECTED(dom_html5_serialize_outer(&ctx, node) != SUCCESS)) {
1238 		return FAILURE;
1239 	}
1240 
1241 	(void) lxb_encoding_decode_finish(&decode);
1242 	if (lxb_encoding_decode_buf_used(&decode)) {
1243 		const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) codepoints;
1244 		(void) encoding_data->encode(&encode, &codepoints_ref, codepoints_ref + lxb_encoding_decode_buf_used(&decode));
1245 		if (UNEXPECTED(output_ctx->write_output(
1246 			output_ctx->output_data,
1247 			(const char *) encoding_output,
1248 			lxb_encoding_encode_buf_used(&encode)) != SUCCESS
1249 		)) {
1250 			return FAILURE;
1251 		}
1252 	}
1253 	(void) lxb_encoding_encode_finish(&encode);
1254 	if (lxb_encoding_encode_buf_used(&encode)) {
1255 		if (UNEXPECTED(output_ctx->write_output(
1256 			output_ctx->output_data,
1257 			(const char *) encoding_output,
1258 			lxb_encoding_encode_buf_used(&encode)) != SUCCESS
1259 		)) {
1260 			return FAILURE;
1261 		}
1262 	}
1263 
1264 	return SUCCESS;
1265 }
1266 
PHP_METHOD(Dom_HTMLDocument,saveHtmlFile)1267 PHP_METHOD(Dom_HTMLDocument, saveHtmlFile)
1268 {
1269 	zval *id;
1270 	xmlDoc *docp;
1271 	size_t file_len;
1272 	dom_object *intern;
1273 	char *file;
1274 
1275 	id = ZEND_THIS;
1276 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "p", &file, &file_len) == FAILURE) {
1277 		RETURN_THROWS();
1278 	}
1279 
1280 	if (file_len == 0) {
1281 		zend_argument_value_error(1, "must not be empty");
1282 		RETURN_THROWS();
1283 	}
1284 
1285 	php_stream *stream = php_stream_open_wrapper_ex(file, "wb", REPORT_ERRORS, /* opened_path */ NULL, php_libxml_get_stream_context());
1286 	if (!stream) {
1287 		RETURN_FALSE;
1288 	}
1289 
1290 	DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
1291 
1292 	dom_output_ctx output_ctx;
1293 	output_ctx.output_data = stream;
1294 	output_ctx.write_output = dom_write_output_stream;
1295 	if (UNEXPECTED(dom_common_save(&output_ctx, docp, (const xmlNode *) docp) != SUCCESS)) {
1296 		php_stream_close(stream);
1297 		RETURN_FALSE;
1298 	}
1299 
1300 	zend_long bytes = php_stream_tell(stream);
1301 	php_stream_close(stream);
1302 
1303 	RETURN_LONG(bytes);
1304 }
1305 
PHP_METHOD(Dom_HTMLDocument,saveHtml)1306 PHP_METHOD(Dom_HTMLDocument, saveHtml)
1307 {
1308 	zval *nodep = NULL;
1309 	const xmlDoc *docp;
1310 	const xmlNode *node;
1311 	dom_object *intern, *nodeobj;
1312 
1313 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|O!", &nodep, dom_modern_node_class_entry) == FAILURE) {
1314 		RETURN_THROWS();
1315 	}
1316 
1317 	DOM_GET_OBJ(docp, ZEND_THIS, xmlDocPtr, intern);
1318 
1319 	if (nodep != NULL) {
1320 		DOM_GET_OBJ(node, nodep, xmlNodePtr, nodeobj);
1321 		if (node->doc != docp) {
1322 			php_dom_throw_error(WRONG_DOCUMENT_ERR, true);
1323 			RETURN_THROWS();
1324 		}
1325 	} else {
1326 		node = (const xmlNode *) docp;
1327 	}
1328 
1329 	smart_str buf = {0};
1330 	dom_output_ctx output_ctx;
1331 	output_ctx.output_data = &buf;
1332 	output_ctx.write_output = dom_write_output_smart_str;
1333 	/* Can't fail because dom_write_output_smart_str() can't fail. */
1334 	zend_result result = dom_common_save(&output_ctx, docp, node);
1335 	ZEND_ASSERT(result == SUCCESS);
1336 
1337 	RETURN_STR(smart_str_extract(&buf));
1338 }
1339 
dom_html_document_encoding_write(dom_object * obj,zval * newval)1340 zend_result dom_html_document_encoding_write(dom_object *obj, zval *newval)
1341 {
1342 	DOM_PROP_NODE(xmlDocPtr, docp, obj);
1343 
1344 	/* Typed property, can only be IS_STRING. */
1345 	ZEND_ASSERT(Z_TYPE_P(newval) == IS_STRING);
1346 
1347 	zend_string *str = Z_STR_P(newval);
1348 	const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) ZSTR_VAL(str), ZSTR_LEN(str));
1349 
1350 	if (encoding_data != NULL) {
1351 		xmlFree(BAD_CAST docp->encoding);
1352 		docp->encoding = xmlStrdup((const xmlChar *) encoding_data->name);
1353 	} else {
1354 		zend_value_error("Invalid document encoding");
1355 		return FAILURE;
1356 	}
1357 
1358 	return SUCCESS;
1359 }
1360 
1361 #endif  /* HAVE_LIBXML && HAVE_DOM */
1362