1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Authors: Niels Dossche <nielsdos@php.net> |
14 +----------------------------------------------------------------------+
15 */
16
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20
21 #include "php.h"
22 #if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23 #include "html5_parser.h"
24 #include "namespace_compat.h"
25 #include <lexbor/html/parser.h>
26 #include <lexbor/html/interfaces/element.h>
27 #include <libxml/tree.h>
28 #include <libxml/parserInternals.h>
29 #include <libxml/HTMLtree.h>
30 #include <Zend/zend.h>
31
/* Initial size handed to lexbor_array_obj_init() for the conversion work list. */
#define WORK_LIST_INIT_SIZE 128
/* libxml2 reserves 2 pointer-sized words for interned strings.
 * Text that is strictly shorter than this many bytes can be stored inline in the text node itself. */
#define LXML_INTERNED_STRINGS_SIZE (2 * sizeof(void *))
35
36 typedef struct _work_list_item {
37 lxb_dom_node_t *node;
38 uintptr_t current_active_namespace;
39 xmlNodePtr lxml_parent;
40 xmlNsPtr lxml_ns;
41 } work_list_item;
42
/* Appends one work item to `array` and fills in all of its fields from the arguments.
 * Note: the slot returned by lexbor_array_obj_push_wo_cls() is used without a NULL check. */
static void lexbor_libxml2_bridge_work_list_item_push(
	lexbor_array_obj_t *array,
	lxb_dom_node_t *node,
	uintptr_t current_active_namespace,
	xmlNodePtr lxml_parent,
	xmlNsPtr lxml_ns
)
{
	work_list_item *slot = lexbor_array_obj_push_wo_cls(array);

	/* Every member is assigned, so nothing of the slot's previous contents survives. */
	*slot = (work_list_item) {
		.node = node,
		.current_active_namespace = current_active_namespace,
		.lxml_parent = lxml_parent,
		.lxml_ns = lxml_ns,
	};
}
57
/* Clamps a line number to the range of unsigned short: values above USHRT_MAX
 * saturate to USHRT_MAX, everything else is returned unchanged. */
static unsigned short sanitize_line_nr(size_t line)
{
	return (line <= USHRT_MAX) ? (unsigned short) line : USHRT_MAX;
}
65
get_libxml_namespace_href(uintptr_t lexbor_namespace)66 static const xmlChar *get_libxml_namespace_href(uintptr_t lexbor_namespace)
67 {
68 if (lexbor_namespace == LXB_NS_SVG) {
69 return (const xmlChar *) DOM_SVG_NS_URI;
70 } else if (lexbor_namespace == LXB_NS_MATH) {
71 return (const xmlChar *) DOM_MATHML_NS_URI;
72 } else {
73 return (const xmlChar *) DOM_XHTML_NS_URI;
74 }
75 }
76
lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc,const lxb_char_t * data,size_t data_length,bool compact_text_nodes)77 static xmlNodePtr lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc, const lxb_char_t *data, size_t data_length, bool compact_text_nodes)
78 {
79 if (compact_text_nodes && data_length < LXML_INTERNED_STRINGS_SIZE) {
80 /* See xmlSAX2TextNode() in libxml2 */
81 xmlNodePtr lxml_text = xmlMalloc(sizeof(*lxml_text));
82 if (UNEXPECTED(lxml_text == NULL)) {
83 return NULL;
84 }
85 memset(lxml_text, 0, sizeof(*lxml_text));
86 lxml_text->name = xmlStringText;
87 lxml_text->type = XML_TEXT_NODE;
88 lxml_text->doc = lxml_doc;
89 lxml_text->content = (xmlChar *) &lxml_text->properties;
90 memcpy(lxml_text->content, data, data_length);
91 return lxml_text;
92 } else {
93 return xmlNewDocTextLen(lxml_doc, (const xmlChar *) data, data_length);
94 }
95 }
96
lexbor_libxml2_bridge_convert(lxb_dom_node_t * start_node,xmlDocPtr lxml_doc,bool compact_text_nodes,bool create_default_ns)97 static lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(
98 lxb_dom_node_t *start_node,
99 xmlDocPtr lxml_doc,
100 bool compact_text_nodes,
101 bool create_default_ns
102 )
103 {
104 lexbor_libxml2_bridge_status retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
105
106 lexbor_array_obj_t work_list;
107 lexbor_array_obj_init(&work_list, WORK_LIST_INIT_SIZE, sizeof(work_list_item));
108
109 for (lxb_dom_node_t *node = start_node; node != NULL; node = node->prev) {
110 lexbor_libxml2_bridge_work_list_item_push(&work_list, node, LXB_NS__UNDEF, (xmlNodePtr) lxml_doc, NULL);
111 }
112
113 work_list_item *current_stack_item;
114 while ((current_stack_item = lexbor_array_obj_pop(&work_list)) != NULL) {
115 lxb_dom_node_t *node = current_stack_item->node;
116 xmlNodePtr lxml_parent = current_stack_item->lxml_parent;
117
118 /* CDATA section and processing instructions don't occur in parsed HTML documents.
119 * The historical types are not emitted by the parser either. */
120 if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
121 /* Note: HTML isn't exactly XML-namespace-aware; as this is an HTML parser we only care about the local name.
122 * If a prefix:name format is used, then the local name will be "prefix:name" and the prefix will be empty.
123 * There is however still somewhat of a concept of namespaces. There are three: HTML (the default), SVG, and MATHML. */
124 lxb_dom_element_t *element = lxb_dom_interface_element(node);
125 const lxb_char_t *name = lxb_dom_element_local_name(element, NULL);
126 xmlNodePtr lxml_element = xmlNewDocNode(lxml_doc, NULL, name, NULL);
127 if (UNEXPECTED(lxml_element == NULL)) {
128 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
129 goto out;
130 }
131 xmlAddChild(lxml_parent, lxml_element);
132 lxml_element->line = sanitize_line_nr(node->line);
133
134 /* Namespaces, note: namespace switches are uncommon */
135 uintptr_t entering_namespace = element->node.ns;
136 xmlNsPtr current_lxml_ns = current_stack_item->lxml_ns;
137 if (create_default_ns && UNEXPECTED(entering_namespace != current_stack_item->current_active_namespace)) {
138 current_lxml_ns = xmlNewNs(lxml_element, get_libxml_namespace_href(entering_namespace), NULL);
139 }
140 /* Instead of xmlSetNs() because we know the arguments are valid. Prevents overhead. */
141 lxml_element->ns = current_lxml_ns;
142
143 for (lxb_dom_node_t *child_node = element->node.last_child; child_node != NULL; child_node = child_node->prev) {
144 lexbor_libxml2_bridge_work_list_item_push(
145 &work_list,
146 child_node,
147 entering_namespace,
148 lxml_element,
149 current_lxml_ns
150 );
151 }
152
153 xmlAttrPtr last_added_attr = NULL;
154 for (lxb_dom_attr_t *attr = element->first_attr; attr != NULL; attr = attr->next) {
155 /* Same namespace remark as for elements */
156 size_t local_name_length, value_length;
157 const lxb_char_t *local_name = lxb_dom_attr_local_name(attr, &local_name_length);
158 const lxb_char_t *value = lxb_dom_attr_value(attr, &value_length);
159
160 if (UNEXPECTED(local_name_length >= INT_MAX || value_length >= INT_MAX)) {
161 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
162 goto out;
163 }
164
165 xmlAttrPtr lxml_attr = xmlMalloc(sizeof(xmlAttr));
166 if (UNEXPECTED(lxml_attr == NULL)) {
167 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
168 goto out;
169 }
170
171 memset(lxml_attr, 0, sizeof(xmlAttr));
172 lxml_attr->type = XML_ATTRIBUTE_NODE;
173 lxml_attr->parent = lxml_element;
174 lxml_attr->name = xmlDictLookup(lxml_doc->dict, local_name, local_name_length);
175 lxml_attr->doc = lxml_doc;
176 xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, value, value_length, true /* Always true for optimization purposes */);
177 if (UNEXPECTED(lxml_text == NULL)) {
178 xmlFreeProp(lxml_attr);
179 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
180 goto out;
181 }
182
183 lxml_attr->children = lxml_attr->last = lxml_text;
184
185 if (last_added_attr == NULL) {
186 lxml_element->properties = lxml_attr;
187 } else {
188 last_added_attr->next = lxml_attr;
189 lxml_attr->prev = last_added_attr;
190 }
191 last_added_attr = lxml_attr;
192
193 /* xmlIsID does some other stuff too that is irrelevant here. */
194 if (local_name_length == 2 && local_name[0] == 'i' && local_name[1] == 'd') {
195 xmlAddID(NULL, lxml_doc, value, lxml_attr);
196 }
197
198 /* libxml2 doesn't support line numbers on this anyway, it derives them instead, so don't bother */
199 }
200 } else if (node->type == LXB_DOM_NODE_TYPE_TEXT) {
201 lxb_dom_text_t *text = lxb_dom_interface_text(node);
202 const lxb_char_t *data = text->char_data.data.data;
203 size_t data_length = text->char_data.data.length;
204 if (UNEXPECTED(data_length >= INT_MAX)) {
205 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
206 goto out;
207 }
208 xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, data, data_length, compact_text_nodes);
209 if (UNEXPECTED(lxml_text == NULL)) {
210 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
211 goto out;
212 }
213 xmlAddChild(lxml_parent, lxml_text);
214 if (node->line >= USHRT_MAX) {
215 lxml_text->line = USHRT_MAX;
216 lxml_text->psvi = (void *) (ptrdiff_t) node->line;
217 } else {
218 lxml_text->line = (unsigned short) node->line;
219 }
220 } else if (node->type == LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
221 lxb_dom_document_type_t *doctype = lxb_dom_interface_document_type(node);
222 const lxb_char_t *name = lxb_dom_document_type_name(doctype, NULL);
223 size_t public_id_len, system_id_len;
224 const lxb_char_t *public_id = lxb_dom_document_type_public_id(doctype, &public_id_len);
225 const lxb_char_t *system_id = lxb_dom_document_type_system_id(doctype, &system_id_len);
226 xmlDtdPtr lxml_dtd = xmlCreateIntSubset(
227 lxml_doc,
228 name,
229 public_id_len ? public_id : NULL,
230 system_id_len ? system_id : NULL
231 );
232 if (UNEXPECTED(lxml_dtd == NULL)) {
233 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
234 goto out;
235 }
236 /* libxml2 doesn't support line numbers on this anyway, it returns -1 instead, so don't bother */
237 } else if (node->type == LXB_DOM_NODE_TYPE_COMMENT) {
238 lxb_dom_comment_t *comment = lxb_dom_interface_comment(node);
239 xmlNodePtr lxml_comment = xmlNewDocComment(lxml_doc, comment->char_data.data.data);
240 if (UNEXPECTED(lxml_comment == NULL)) {
241 retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
242 goto out;
243 }
244 xmlAddChild(lxml_parent, lxml_comment);
245 lxml_comment->line = sanitize_line_nr(node->line);
246 }
247 }
248
249 out:
250 lexbor_array_obj_destroy(&work_list, false);
251 return retval;
252 }
253
/* Resets a parse context by clearing every byte of it. Must be called before the context is used. */
void lexbor_libxml2_bridge_parse_context_init(lexbor_libxml2_bridge_parse_context *ctx)
{
	memset(ctx, 0, sizeof(lexbor_libxml2_bridge_parse_context));
}
258
/* Installs the callbacks that lexbor_libxml2_bridge_report_errors() invokes for
 * tokenizer errors and tree construction errors. Either callback may be NULL,
 * in which case errors of that kind are not reported. */
void lexbor_libxml2_bridge_parse_set_error_callbacks(
	lexbor_libxml2_bridge_parse_context *ctx,
	lexbor_libxml2_bridge_tokenizer_error_reporter tokenizer_error_reporter,
	lexbor_libxml2_bridge_tree_error_reporter tree_error_reporter
)
{
	ctx->tree_error_reporter = tree_error_reporter;
	ctx->tokenizer_error_reporter = tokenizer_error_reporter;
}
268
lexbor_libxml2_bridge_convert_document(lxb_html_document_t * document,xmlDocPtr * doc_out,bool compact_text_nodes,bool create_default_ns)269 lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_document(
270 lxb_html_document_t *document,
271 xmlDocPtr *doc_out,
272 bool compact_text_nodes,
273 bool create_default_ns
274 )
275 {
276 #ifdef LIBXML_HTML_ENABLED
277 xmlDocPtr lxml_doc = htmlNewDocNoDtD(NULL, NULL);
278 if (UNEXPECTED(!lxml_doc)) {
279 return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
280 }
281 #else
282 /* If HTML support is not enabled, then htmlNewDocNoDtD() is not available.
283 * This code mimics the behaviour. */
284 xmlDocPtr lxml_doc = xmlNewDoc((const xmlChar *) "1.0");
285 if (UNEXPECTED(!lxml_doc)) {
286 return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
287 }
288 lxml_doc->type = XML_HTML_DOCUMENT_NODE;
289 #endif
290 lxml_doc->dict = xmlDictCreate();
291 lexbor_libxml2_bridge_status status = lexbor_libxml2_bridge_convert(
292 lxb_dom_interface_node(document)->last_child,
293 lxml_doc,
294 compact_text_nodes,
295 create_default_ns
296 );
297 if (status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK) {
298 xmlFreeDoc(lxml_doc);
299 return status;
300 }
301 *doc_out = lxml_doc;
302 return LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
303 }
304
/*
 * Forwards the parse errors collected by the lexbor tokenizer and tree builder to the
 * callbacks stored in `ctx`.
 *
 * Reporting resumes at the indices stored in `*error_index_offset_tokenizer` and
 * `*error_index_offset_tree`; both are advanced past the last error seen, so calling
 * this function again only reports errors that were added in the meantime.
 * The indices advance even when the corresponding callback is NULL.
 *
 * Tokenizer errors are reported with an offset computed from the error position
 * relative to `input_html`, plus `chunk_offset`.
 * Tree errors are reported with their line and column incremented by one.
 */
void lexbor_libxml2_bridge_report_errors(
	const lexbor_libxml2_bridge_parse_context *ctx,
	lxb_html_parser_t *parser,
	const lxb_char_t *input_html,
	size_t chunk_offset,
	size_t *error_index_offset_tokenizer,
	size_t *error_index_offset_tree
)
{
	/* Tokenizer errors */
	lexbor_array_obj_t *tokenizer_errors = lxb_html_parser_tokenizer(parser)->parse_errors;
	size_t tokenizer_index = *error_index_offset_tokenizer;
	for (;; tokenizer_index++) {
		/* See https://github.com/lexbor/lexbor/blob/master/source/lexbor/html/tokenizer/error.h */
		lxb_html_tokenizer_error_t *token_error = lexbor_array_obj_get(tokenizer_errors, tokenizer_index);
		if (token_error == NULL) {
			break;
		}
		if (ctx->tokenizer_error_reporter) {
			ctx->tokenizer_error_reporter(
				ctx->application_data,
				token_error,
				token_error->pos - input_html + chunk_offset
			);
		}
	}
	*error_index_offset_tokenizer = tokenizer_index;

	/* Tree parser errors */
	lexbor_array_obj_t *tree_errors = lxb_html_parser_tree(parser)->parse_errors;
	size_t tree_index = *error_index_offset_tree;
	for (;; tree_index++) {
		/* See https://github.com/lexbor/lexbor/blob/master/source/lexbor/html/tree/error.h */
		lxb_html_tree_error_t *tree_error = lexbor_array_obj_get(tree_errors, tree_index);
		if (tree_error == NULL) {
			break;
		}
		if (ctx->tree_error_reporter) {
			ctx->tree_error_reporter(
				ctx->application_data,
				tree_error,
				tree_error->line + 1,
				tree_error->column + 1,
				tree_error->length
			);
		}
	}
	*error_index_offset_tree = tree_index;
}
352
/* Copies the "explicit <html>/<head>/<body> tag was present" flags from the lexbor tree
 * builder state into `observations`. */
void lexbor_libxml2_bridge_copy_observations(lxb_html_tree_t *tree, lexbor_libxml2_bridge_extracted_observations *observations)
{
	observations->has_explicit_body_tag = tree->has_explicit_body_tag;
	observations->has_explicit_head_tag = tree->has_explicit_head_tag;
	observations->has_explicit_html_tag = tree->has_explicit_html_tag;
}
359
360 #endif /* HAVE_LIBXML && HAVE_DOM */
361