1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Authors: Niels Dossche <nielsdos@php.net> |
14 +----------------------------------------------------------------------+
15 */
16
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "php_dom.h"
24 #include "html5_parser.h"
25 #include <lexbor/html/parser.h>
26 #include <lexbor/html/interfaces/element.h>
27 #include <libxml/parserInternals.h>
28 #include <libxml/HTMLtree.h>
29 #include <Zend/zend.h>
30
/* Initial number of entries reserved for the conversion work list. */
#define WORK_LIST_INIT_SIZE 128
/* libxml2 reserves 2 pointer-sized words for interned strings; text shorter
 * than this many bytes can be stored inside the text node itself. */
#define LXML_INTERNED_STRINGS_SIZE (2 * sizeof(void *))
34
35 typedef struct _work_list_item {
36 lxb_dom_node_t *node;
37 uintptr_t current_active_namespace;
38 xmlNodePtr lxml_parent;
39 xmlNsPtr lxml_ns;
40 } work_list_item;
41
/*
 * Appends one entry to the work list and fills in all of its fields.
 *
 * array:                    work list to append to.
 * node:                     Lexbor node to convert later.
 * current_active_namespace: Lexbor namespace id active at the time of queuing.
 * lxml_parent:              libxml2 node the converted node will be attached to.
 * lxml_ns:                  libxml2 namespace active at the time of queuing.
 *
 * NOTE(review): the slot returned by the push is used without a NULL check;
 * confirm whether lexbor_array_obj_push_wo_cls() can fail here.
 */
static void lexbor_libxml2_bridge_work_list_item_push(
	lexbor_array_obj_t *array,
	lxb_dom_node_t *node,
	uintptr_t current_active_namespace,
	xmlNodePtr lxml_parent,
	xmlNsPtr lxml_ns
)
{
	work_list_item *slot = lexbor_array_obj_push_wo_cls(array);

	/* Every field is assigned, so the slot needs no prior clearing. */
	*slot = (work_list_item) {
		.node = node,
		.current_active_namespace = current_active_namespace,
		.lxml_parent = lxml_parent,
		.lxml_ns = lxml_ns,
	};
}
56
/* Clamps a line number to the range of the `unsigned short` line field:
 * values above USHRT_MAX are reported as USHRT_MAX. */
static unsigned short sanitize_line_nr(size_t line)
{
	return line > USHRT_MAX ? USHRT_MAX : (unsigned short) line;
}
64
get_libxml_namespace_href(uintptr_t lexbor_namespace)65 static const php_dom_ns_magic_token *get_libxml_namespace_href(uintptr_t lexbor_namespace)
66 {
67 if (lexbor_namespace == LXB_NS_SVG) {
68 return php_dom_ns_is_svg_magic_token;
69 } else if (lexbor_namespace == LXB_NS_MATH) {
70 return php_dom_ns_is_mathml_magic_token;
71 } else {
72 return php_dom_ns_is_html_magic_token;
73 }
74 }
75
lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc,const lxb_char_t * data,size_t data_length,bool compact_text_nodes)76 static zend_always_inline xmlNodePtr lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc, const lxb_char_t *data, size_t data_length, bool compact_text_nodes)
77 {
78 if (compact_text_nodes && data_length < LXML_INTERNED_STRINGS_SIZE) {
79 /* See xmlSAX2TextNode() in libxml2 */
80 xmlNodePtr lxml_text = xmlMalloc(sizeof(*lxml_text));
81 if (UNEXPECTED(lxml_text == NULL)) {
82 return NULL;
83 }
84 memset(lxml_text, 0, sizeof(*lxml_text));
85 lxml_text->name = xmlStringText;
86 lxml_text->type = XML_TEXT_NODE;
87 lxml_text->doc = lxml_doc;
88 lxml_text->content = BAD_CAST &lxml_text->properties;
89 if (data != NULL) {
90 memcpy(lxml_text->content, data, data_length);
91 }
92 return lxml_text;
93 } else {
94 return xmlNewDocTextLen(lxml_doc, (const xmlChar *) data, data_length);
95 }
96 }
97
lexbor_libxml2_bridge_convert(lxb_dom_node_t * start_node,xmlDocPtr lxml_doc,bool compact_text_nodes,bool create_default_ns,php_dom_libxml_ns_mapper * ns_mapper)98 static lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(
99 lxb_dom_node_t *start_node,
100 xmlDocPtr lxml_doc,
101 bool compact_text_nodes,
102 bool create_default_ns,
103 php_dom_libxml_ns_mapper *ns_mapper
104 )
105 {
106 lexbor_libxml2_bridge_status retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
107
108 xmlNsPtr html_ns = php_dom_libxml_ns_mapper_ensure_html_ns(ns_mapper);
109 xmlNsPtr xlink_ns = NULL;
110 xmlNsPtr prefixed_xmlns_ns = NULL;
111
112 lexbor_array_obj_t work_list;
113 lexbor_array_obj_init(&work_list, WORK_LIST_INIT_SIZE, sizeof(work_list_item));
114
115 for (lxb_dom_node_t *node = start_node; node != NULL; node = node->prev) {
116 lexbor_libxml2_bridge_work_list_item_push(&work_list, node, LXB_NS__UNDEF, (xmlNodePtr) lxml_doc, NULL);
117 }
118
119 work_list_item *current_stack_item;
120 while ((current_stack_item = lexbor_array_obj_pop(&work_list)) != NULL) {
121 lxb_dom_node_t *node = current_stack_item->node;
122 xmlNodePtr lxml_parent = current_stack_item->lxml_parent;
123
124 /* CDATA section and processing instructions don't occur in parsed HTML documents.
125 * The historical types are not emitted by the parser either. */
126 if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
127 /* Note: HTML isn't exactly XML-namespace-aware; as this is an HTML parser we only care about the local name.
128 * If a prefix:name format is used, then the local name will be "prefix:name" and the prefix will be empty.
129 * There is however still somewhat of a concept of namespaces. There are three: HTML (the default), SVG, and MATHML. */
130 lxb_dom_element_t *element = lxb_dom_interface_element(node);
131 const lxb_char_t *name = lxb_dom_element_local_name(element, NULL);
132 xmlNodePtr lxml_element = xmlNewDocNode(lxml_doc, NULL, name, NULL);
133 if (UNEXPECTED(lxml_element == NULL)) {
134 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
135 break;
136 }
137 xmlAddChild(lxml_parent, lxml_element);
138 lxml_element->line = sanitize_line_nr(node->line);
139
140 /* Namespaces, note: namespace switches are uncommon */
141 uintptr_t entering_namespace = element->node.ns;
142 xmlNsPtr current_lxml_ns = current_stack_item->lxml_ns;
143 if (create_default_ns && UNEXPECTED(entering_namespace != current_stack_item->current_active_namespace)) {
144 if (entering_namespace == LXB_NS_HTML) {
145 current_lxml_ns = html_ns;
146 } else {
147 const php_dom_ns_magic_token *magic_token = get_libxml_namespace_href(entering_namespace);
148 zend_string *uri = zend_string_init((char *) magic_token, strlen((char *) magic_token), false);
149 current_lxml_ns = php_dom_libxml_ns_mapper_get_ns(ns_mapper, NULL, uri);
150 zend_string_release_ex(uri, false);
151 if (EXPECTED(current_lxml_ns != NULL)) {
152 current_lxml_ns->_private = (void *) magic_token;
153 }
154 }
155 }
156 /* Instead of xmlSetNs() because we know the arguments are valid. Prevents overhead. */
157 lxml_element->ns = current_lxml_ns;
158
159 for (lxb_dom_node_t *child_node = element->node.last_child; child_node != NULL; child_node = child_node->prev) {
160 lexbor_libxml2_bridge_work_list_item_push(
161 &work_list,
162 child_node,
163 entering_namespace,
164 lxml_element,
165 current_lxml_ns
166 );
167 }
168
169 xmlAttrPtr last_added_attr = NULL;
170 for (lxb_dom_attr_t *attr = element->first_attr; attr != NULL; attr = attr->next) {
171 /* Same namespace remark as for elements */
172 size_t local_name_length, value_length;
173 const lxb_char_t *local_name = lxb_dom_attr_local_name(attr, &local_name_length);
174 const lxb_char_t *value = lxb_dom_attr_value(attr, &value_length);
175
176 if (UNEXPECTED(local_name_length >= INT_MAX || value_length >= INT_MAX)) {
177 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
178 break;
179 }
180
181 xmlAttrPtr lxml_attr = xmlMalloc(sizeof(xmlAttr));
182 if (UNEXPECTED(lxml_attr == NULL)) {
183 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
184 break;
185 }
186
187 memset(lxml_attr, 0, sizeof(xmlAttr));
188 lxml_attr->type = XML_ATTRIBUTE_NODE;
189 lxml_attr->parent = lxml_element;
190 lxml_attr->name = xmlDictLookup(lxml_doc->dict, local_name, local_name_length);
191 lxml_attr->doc = lxml_doc;
192 xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, value, value_length, true /* Always true for optimization purposes */);
193 if (UNEXPECTED(lxml_text == NULL)) {
194 xmlFreeProp(lxml_attr);
195 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
196 break;
197 }
198
199 lxml_attr->children = lxml_attr->last = lxml_text;
200 lxml_text->parent = (xmlNodePtr) lxml_attr;
201
202 if (attr->node.ns == LXB_NS_XMLNS) {
203 if (strcmp((const char *) local_name, "xmlns") != 0) {
204 if (prefixed_xmlns_ns == NULL) {
205 prefixed_xmlns_ns = php_dom_libxml_ns_mapper_get_ns_raw_strings_nullsafe(ns_mapper, "xmlns", DOM_XMLNS_NS_URI);
206 }
207 lxml_attr->ns = prefixed_xmlns_ns;
208 } else {
209 lxml_attr->ns = php_dom_libxml_ns_mapper_ensure_prefixless_xmlns_ns(ns_mapper);
210 }
211 lxml_attr->ns->_private = (void *) php_dom_ns_is_xmlns_magic_token;
212 } else if (attr->node.ns == LXB_NS_XLINK) {
213 if (xlink_ns == NULL) {
214 xlink_ns = php_dom_libxml_ns_mapper_get_ns_raw_strings_nullsafe(ns_mapper, "xlink", DOM_XLINK_NS_URI);
215 xlink_ns->_private = (void *) php_dom_ns_is_xlink_magic_token;
216 }
217 lxml_attr->ns = xlink_ns;
218 }
219
220 if (last_added_attr == NULL) {
221 lxml_element->properties = lxml_attr;
222 } else {
223 last_added_attr->next = lxml_attr;
224 lxml_attr->prev = last_added_attr;
225 }
226 last_added_attr = lxml_attr;
227
228 /* xmlIsID does some other stuff too that is irrelevant here. */
229 if (local_name_length == 2 && local_name[0] == 'i' && local_name[1] == 'd' && attr->node.ns == LXB_NS_HTML) {
230 xmlAddID(NULL, lxml_doc, value, lxml_attr);
231 }
232
233 /* libxml2 doesn't support line numbers on this anyway, it derives them instead, so don't bother */
234 }
235 } else if (node->type == LXB_DOM_NODE_TYPE_TEXT) {
236 lxb_dom_text_t *text = lxb_dom_interface_text(node);
237 const lxb_char_t *data = text->char_data.data.data;
238 size_t data_length = text->char_data.data.length;
239 if (UNEXPECTED(data_length >= INT_MAX)) {
240 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
241 break;
242 }
243 xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, data, data_length, compact_text_nodes);
244 if (UNEXPECTED(lxml_text == NULL)) {
245 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
246 break;
247 }
248 xmlAddChild(lxml_parent, lxml_text);
249 if (node->line >= USHRT_MAX) {
250 lxml_text->line = USHRT_MAX;
251 lxml_text->psvi = (void *) (ptrdiff_t) node->line;
252 } else {
253 lxml_text->line = (unsigned short) node->line;
254 }
255 } else if (node->type == LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
256 lxb_dom_document_type_t *doctype = lxb_dom_interface_document_type(node);
257 const lxb_char_t *name = lxb_dom_document_type_name(doctype, NULL);
258 size_t public_id_len, system_id_len;
259 const lxb_char_t *public_id = lxb_dom_document_type_public_id(doctype, &public_id_len);
260 const lxb_char_t *system_id = lxb_dom_document_type_system_id(doctype, &system_id_len);
261 xmlDtdPtr lxml_dtd = xmlCreateIntSubset(
262 lxml_doc,
263 name,
264 public_id_len ? public_id : NULL,
265 system_id_len ? system_id : NULL
266 );
267 if (UNEXPECTED(lxml_dtd == NULL)) {
268 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
269 break;
270 }
271 /* libxml2 doesn't support line numbers on this anyway, it returns -1 instead, so don't bother */
272 } else if (node->type == LXB_DOM_NODE_TYPE_COMMENT) {
273 lxb_dom_comment_t *comment = lxb_dom_interface_comment(node);
274 xmlNodePtr lxml_comment = xmlNewDocComment(lxml_doc, comment->char_data.data.data);
275 if (UNEXPECTED(lxml_comment == NULL)) {
276 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
277 break;
278 }
279 xmlAddChild(lxml_parent, lxml_comment);
280 lxml_comment->line = sanitize_line_nr(node->line);
281 }
282 }
283
284 lexbor_array_obj_destroy(&work_list, false);
285 return retval;
286 }
287
/* Resets a parse context by zero-filling the whole structure. */
void lexbor_libxml2_bridge_parse_context_init(lexbor_libxml2_bridge_parse_context *ctx)
{
	memset(ctx, 0, sizeof(lexbor_libxml2_bridge_parse_context));
}
292
/*
 * Stores the two error reporter callbacks in the parse context.
 *
 * ctx:                      parse context to update.
 * tokenizer_error_reporter: callback for tokenizer errors.
 * tree_error_reporter:      callback for tree construction errors.
 */
void lexbor_libxml2_bridge_parse_set_error_callbacks(
	lexbor_libxml2_bridge_parse_context *ctx,
	lexbor_libxml2_bridge_tokenizer_error_reporter tokenizer_error_reporter,
	lexbor_libxml2_bridge_tree_error_reporter tree_error_reporter
)
{
	ctx->tree_error_reporter = tree_error_reporter;
	ctx->tokenizer_error_reporter = tokenizer_error_reporter;
}
302
lexbor_libxml2_bridge_convert_document(lxb_html_document_t * document,xmlDocPtr * doc_out,bool compact_text_nodes,bool create_default_ns,php_dom_libxml_ns_mapper * ns_mapper)303 lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_document(
304 lxb_html_document_t *document,
305 xmlDocPtr *doc_out,
306 bool compact_text_nodes,
307 bool create_default_ns,
308 php_dom_libxml_ns_mapper *ns_mapper
309 )
310 {
311 xmlDocPtr lxml_doc = php_dom_create_html_doc();
312 if (UNEXPECTED(!lxml_doc)) {
313 return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
314 }
315 lexbor_libxml2_bridge_status status = lexbor_libxml2_bridge_convert(
316 lxb_dom_interface_node(document)->last_child,
317 lxml_doc,
318 compact_text_nodes,
319 create_default_ns,
320 ns_mapper
321 );
322 if (status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK) {
323 xmlFreeDoc(lxml_doc);
324 return status;
325 }
326 *doc_out = lxml_doc;
327 return LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
328 }
329
/*
 * Forwards parse errors that have not been reported yet to the callbacks
 * registered in the parse context.
 *
 * ctx:                          parse context holding the callbacks and application data.
 * parser:                       Lexbor parser whose tokenizer and tree error lists are read.
 * input_html:                   start of the current input chunk; tokenizer error
 *                               positions are turned into offsets relative to it.
 * chunk_offset:                 added to each tokenizer error offset.
 * error_index_offset_tokenizer: in: index of the first tokenizer error to report;
 *                               out: index just past the last one seen.
 * error_index_offset_tree:      same, for the tree error list.
 *
 * Both indices advance even when the corresponding callback is not set.
 */
void lexbor_libxml2_bridge_report_errors(
	const lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_parser_t *parser,
	const lxb_char_t *input_html,
	size_t chunk_offset,
	size_t *error_index_offset_tokenizer,
	size_t *error_index_offset_tree
)
{
	/* Tokenizer errors */
	/* See https://github.com/lexbor/lexbor/blob/master/source/lexbor/html/tokenizer/error.h */
	lexbor_array_obj_t *tokenizer_errors = lxb_html_parser_tokenizer(parser)->parse_errors;
	size_t tokenizer_index = *error_index_offset_tokenizer;
	for (lxb_html_tokenizer_error_t *token_error;
		(token_error = lexbor_array_obj_get(tokenizer_errors, tokenizer_index)) != NULL;
		tokenizer_index++) {
		if (!ctx->tokenizer_error_reporter) {
			continue;
		}
		ctx->tokenizer_error_reporter(
			ctx->application_data,
			token_error,
			token_error->pos - input_html + chunk_offset
		);
	}
	*error_index_offset_tokenizer = tokenizer_index;

	/* Tree parser errors */
	/* See https://github.com/lexbor/lexbor/blob/master/source/lexbor/html/tree/error.h */
	lexbor_array_obj_t *tree_errors = lxb_html_parser_tree(parser)->parse_errors;
	size_t tree_index = *error_index_offset_tree;
	for (lxb_html_tree_error_t *tree_error;
		(tree_error = lexbor_array_obj_get(tree_errors, tree_index)) != NULL;
		tree_index++) {
		if (!ctx->tree_error_reporter) {
			continue;
		}
		/* Line and column are passed on incremented by one. */
		ctx->tree_error_reporter(
			ctx->application_data,
			tree_error,
			tree_error->line + 1,
			tree_error->column + 1,
			tree_error->length
		);
	}
	*error_index_offset_tree = tree_index;
}
377
/* Copies the "explicit html/head/body tag seen" flags from the Lexbor tree
 * into the observations structure. */
void lexbor_libxml2_bridge_copy_observations(lxb_html_tree_t *tree, lexbor_libxml2_bridge_extracted_observations *observations)
{
	observations->has_explicit_body_tag = tree->has_explicit_body_tag;
	observations->has_explicit_head_tag = tree->has_explicit_head_tag;
	observations->has_explicit_html_tag = tree->has_explicit_html_tag;
}
384
385 #endif /* HAVE_LIBXML && HAVE_DOM */
386