1 /*
2  * Copyright (C) 2018-2020 Alexander Borisov
3  *
4  * Author: Alexander Borisov <borisov@lexbor.com>
5  */
6 
7 #include "lexbor/html/tree/insertion_mode.h"
8 #include "lexbor/dom/interfaces/document_type.h"
9 
10 
/*
 * One entry of the DOCTYPE identifier lookup tables below: a string literal
 * paired with its length (terminating NUL not counted).
 */
typedef struct lxb_html_tree_insertion_mode_initial_str {
    const char *data;   /* identifier text to compare against */
    size_t     len;     /* number of bytes in data, without the NUL */
} lxb_html_tree_insertion_mode_initial_str_t;
16 
17 
18 static lxb_html_tree_insertion_mode_initial_str_t
19 lxb_html_tree_insertion_mode_initial_doctype_public_is[] =
20 {
21     {"-//W3O//DTD W3 HTML Strict 3.0//EN//", 36},
22     {"-/W3C/DTD HTML 4.0 Transitional/EN", 34},
23     {"HTML", 4}
24 };
25 
26 static lxb_html_tree_insertion_mode_initial_str_t
27 lxb_html_tree_insertion_mode_initial_doctype_system_is[] =
28 {
29     {"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd", 58}
30 };
31 
32 static lxb_html_tree_insertion_mode_initial_str_t
33 lxb_html_tree_insertion_mode_initial_doctype_public_start[] =
34 {
35     {"+//Silmaril//dtd html Pro v0r11 19970101//", 42},
36     {"-//AS//DTD HTML 3.0 asWedit + extensions//", 42},
37     {"-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//", 52},
38     {"-//IETF//DTD HTML 2.0 Level 1//", 31},
39     {"-//IETF//DTD HTML 2.0 Level 2//", 31},
40     {"-//IETF//DTD HTML 2.0 Strict Level 1//", 38},
41     {"-//IETF//DTD HTML 2.0 Strict Level 2//", 38},
42     {"-//IETF//DTD HTML 2.0 Strict//", 30},
43     {"-//IETF//DTD HTML 2.0//", 23},
44     {"-//IETF//DTD HTML 2.1E//", 24},
45     {"-//IETF//DTD HTML 3.0//", 23},
46     {"-//IETF//DTD HTML 3.2 Final//", 29},
47     {"-//IETF//DTD HTML 3.2//", 23},
48     {"-//IETF//DTD HTML 3//", 21},
49     {"-//IETF//DTD HTML Level 0//", 27},
50     {"-//IETF//DTD HTML Level 1//", 27},
51     {"-//IETF//DTD HTML Level 2//", 27},
52     {"-//IETF//DTD HTML Level 3//", 27},
53     {"-//IETF//DTD HTML Strict Level 0//", 34},
54     {"-//IETF//DTD HTML Strict Level 1//", 34},
55     {"-//IETF//DTD HTML Strict Level 2//", 34},
56     {"-//IETF//DTD HTML Strict Level 3//", 34},
57     {"-//IETF//DTD HTML Strict//", 26},
58     {"-//IETF//DTD HTML//", 19},
59     {"-//Metrius//DTD Metrius Presentational//", 40},
60     {"-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//", 53},
61     {"-//Microsoft//DTD Internet Explorer 2.0 HTML//", 46},
62     {"-//Microsoft//DTD Internet Explorer 2.0 Tables//", 48},
63     {"-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//", 53},
64     {"-//Microsoft//DTD Internet Explorer 3.0 HTML//", 46},
65     {"-//Microsoft//DTD Internet Explorer 3.0 Tables//", 48},
66     {"-//Netscape Comm. Corp.//DTD HTML//", 35},
67     {"-//Netscape Comm. Corp.//DTD Strict HTML//", 42},
68     {"-//O'Reilly and Associates//DTD HTML 2.0//", 42},
69     {"-//O'Reilly and Associates//DTD HTML Extended 1.0//", 51},
70     {"-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//", 59},
71     {"-//SQ//DTD HTML 2.0 HoTMetaL + extensions//", 43},
72     {"-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//", 78},
73     {"-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//", 69},
74     {"-//Spyglass//DTD HTML 2.0 Extended//", 36},
75     {"-//Sun Microsystems Corp.//DTD HotJava HTML//", 45},
76     {"-//Sun Microsystems Corp.//DTD HotJava Strict HTML//", 52},
77     {"-//W3C//DTD HTML 3 1995-03-24//", 31},
78     {"-//W3C//DTD HTML 3.2 Draft//", 28},
79     {"-//W3C//DTD HTML 3.2 Final//", 28},
80     {"-//W3C//DTD HTML 3.2//", 22},
81     {"-//W3C//DTD HTML 3.2S Draft//", 29},
82     {"-//W3C//DTD HTML 4.0 Frameset//", 31},
83     {"-//W3C//DTD HTML 4.0 Transitional//", 35},
84     {"-//W3C//DTD HTML Experimental 19960712//", 40},
85     {"-//W3C//DTD HTML Experimental 970421//", 38},
86     {"-//W3C//DTD W3 HTML//", 21},
87     {"-//W3O//DTD W3 HTML 3.0//", 25},
88     {"-//WebTechs//DTD Mozilla HTML 2.0//", 35},
89     {"-//WebTechs//DTD Mozilla HTML//", 31}
90 };
91 
92 static lxb_html_tree_insertion_mode_initial_str_t
93 lxb_html_tree_insertion_mode_initial_doctype_sys_pub_start[] =
94 {
95     {"-//W3C//DTD HTML 4.01 Frameset//", 32},
96     {"-//W3C//DTD HTML 4.01 Transitional//", 36}
97 };
98 
99 static lxb_html_tree_insertion_mode_initial_str_t
100 lxb_html_tree_insertion_mode_initial_doctype_lim_pub_start[] =
101 {
102     {"-//W3C//DTD XHTML 1.0 Frameset//", 32},
103     {"-//W3C//DTD XHTML 1.0 Transitional//", 36}
104 };
105 
106 
107 static bool
108 lxb_html_tree_insertion_mode_initial_doctype(lxb_html_tree_t *tree,
109                                              lxb_html_token_t *token);
110 
111 static void
112 lxb_html_tree_insertion_mode_initial_doctype_ckeck(lxb_html_tree_t *tree,
113                                          lxb_dom_document_type_t *doc_type,
114                                          lxb_html_token_t *token, bool is_html);
115 
116 static bool
117 lxb_html_tree_insertion_mode_initial_doctype_ckeck_public(
118                                              lxb_dom_document_type_t *doc_type);
119 
120 static bool
121 lxb_html_tree_insertion_mode_initial_doctype_ckeck_system(
122                                              lxb_dom_document_type_t *doc_type);
123 
124 static bool
125 lxb_html_tree_insertion_mode_initial_doctype_ckeck_pubsys(
126                                              lxb_dom_document_type_t *doc_type);
127 
128 static bool
129 lxb_html_tree_insertion_mode_initial_doctype_check_limq(
130                                              lxb_dom_document_type_t *doc_type);
131 
132 
133 bool
lxb_html_tree_insertion_mode_initial(lxb_html_tree_t * tree,lxb_html_token_t * token)134 lxb_html_tree_insertion_mode_initial(lxb_html_tree_t *tree,
135                                      lxb_html_token_t *token)
136 {
137     switch (token->tag_id) {
138         case LXB_TAG__EM_COMMENT: {
139             lxb_dom_comment_t *comment;
140 
141             comment = lxb_html_tree_insert_comment(tree, token,
142                                         lxb_dom_interface_node(tree->document));
143             if (comment == NULL) {
144                 return lxb_html_tree_process_abort(tree);
145             }
146 
147             break;
148         }
149 
150         case LXB_TAG__EM_DOCTYPE:
151             tree->mode = lxb_html_tree_insertion_mode_before_html;
152 
153             return lxb_html_tree_insertion_mode_initial_doctype(tree, token);
154 
155         case LXB_TAG__TEXT:
156             tree->status = lxb_html_token_data_skip_ws_begin(token);
157             if (tree->status != LXB_STATUS_OK) {
158                 return lxb_html_tree_process_abort(tree);
159             }
160 
161             if (token->text_start == token->text_end) {
162                 return true;
163             }
164             /* fall through */
165 
166         default: {
167             lxb_dom_document_t *document = &tree->document->dom_document;
168 
169             if (tree->document->iframe_srcdoc == NULL) {
170                 lxb_html_tree_parse_error(tree, token,
171                                           LXB_HTML_RULES_ERROR_UNTOININMO);
172 
173                 document->compat_mode = LXB_DOM_DOCUMENT_CMODE_QUIRKS;
174             }
175 
176             tree->mode = lxb_html_tree_insertion_mode_before_html;
177 
178             return false;
179         }
180     }
181 
182     return true;
183 }
184 
185 static bool
lxb_html_tree_insertion_mode_initial_doctype(lxb_html_tree_t * tree,lxb_html_token_t * token)186 lxb_html_tree_insertion_mode_initial_doctype(lxb_html_tree_t *tree,
187                                              lxb_html_token_t *token)
188 {
189     lxb_dom_document_type_t *doc_type;
190 
191     /* Create */
192     doc_type = lxb_html_tree_create_document_type_from_token(tree, token);
193     if (doc_type == NULL) {
194         tree->status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
195 
196         return lxb_html_tree_process_abort(tree);
197     }
198 
199     /* Check */
200     bool is_html = (doc_type->name == LXB_DOM_ATTR_HTML);
201 
202     if (is_html == false
203         || doc_type->public_id.length != 0
204         || (doc_type->system_id.length == 19
205             && strncmp("about:legacy-compat",
206                        (const char *) doc_type->system_id.data, 19) != 0)
207         )
208     {
209         lxb_html_tree_parse_error(tree, token,
210                                   LXB_HTML_RULES_ERROR_BADOTOININMO);
211     }
212 
213     lxb_html_tree_insertion_mode_initial_doctype_ckeck(tree, doc_type,
214                                                        token, is_html);
215 
216     lxb_dom_node_insert_child_wo_events(&tree->document->dom_document.node,
217                                         lxb_dom_interface_node(doc_type));
218 
219     lxb_dom_document_attach_doctype(&tree->document->dom_document, doc_type);
220 
221     return true;
222 }
223 
224 static void
lxb_html_tree_insertion_mode_initial_doctype_ckeck(lxb_html_tree_t * tree,lxb_dom_document_type_t * doc_type,lxb_html_token_t * token,bool is_html)225 lxb_html_tree_insertion_mode_initial_doctype_ckeck(lxb_html_tree_t *tree,
226                                           lxb_dom_document_type_t *doc_type,
227                                           lxb_html_token_t *token, bool is_html)
228 {
229     if (tree->document->iframe_srcdoc != NULL) {
230         return;
231     }
232 
233     bool quirks;
234     lxb_dom_document_t *document = &tree->document->dom_document;
235 
236     if (token->type & LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS) {
237         goto set_quirks;
238     }
239 
240     if (is_html == false) {
241         goto set_quirks;
242     }
243 
244     if (doc_type->public_id.length != 0) {
245         quirks =
246             lxb_html_tree_insertion_mode_initial_doctype_ckeck_public(doc_type);
247 
248         if (quirks) {
249             goto set_quirks;
250         }
251     }
252 
253     if (doc_type->system_id.length != 0) {
254         quirks =
255             lxb_html_tree_insertion_mode_initial_doctype_ckeck_system(doc_type);
256 
257         if (quirks) {
258             goto set_quirks;
259         }
260     }
261 
262     if (doc_type->public_id.length != 0 && doc_type->system_id.length == 0) {
263         quirks =
264             lxb_html_tree_insertion_mode_initial_doctype_ckeck_pubsys(doc_type);
265 
266         if (quirks) {
267             goto set_quirks;
268         }
269     }
270 
271     if (doc_type->public_id.length != 0) {
272         quirks =
273             lxb_html_tree_insertion_mode_initial_doctype_check_limq(doc_type);
274 
275         if (quirks) {
276             document->compat_mode = LXB_DOM_DOCUMENT_CMODE_LIMITED_QUIRKS;
277             return;
278         }
279     }
280 
281     return;
282 
283 set_quirks:
284 
285     document->compat_mode = LXB_DOM_DOCUMENT_CMODE_QUIRKS;
286 }
287 
288 static bool
lxb_html_tree_insertion_mode_initial_doctype_ckeck_public(lxb_dom_document_type_t * doc_type)289 lxb_html_tree_insertion_mode_initial_doctype_ckeck_public(
290                                               lxb_dom_document_type_t *doc_type)
291 {
292     size_t size, i;
293     lxb_html_tree_insertion_mode_initial_str_t *str;
294 
295     /* The public identifier is set to */
296     size = sizeof(lxb_html_tree_insertion_mode_initial_doctype_public_is)
297         / sizeof(lxb_html_tree_insertion_mode_initial_str_t);
298 
299     for (i = 0; i < size; i++) {
300         str = &lxb_html_tree_insertion_mode_initial_doctype_public_is[i];
301 
302         if (str->len == doc_type->public_id.length
303             && lexbor_str_data_casecmp((const lxb_char_t *) str->data,
304                                        doc_type->public_id.data))
305         {
306             return true;
307         }
308     }
309 
310     /* The public identifier starts with */
311     size = sizeof(lxb_html_tree_insertion_mode_initial_doctype_public_start)
312         / sizeof(lxb_html_tree_insertion_mode_initial_str_t);
313 
314     for (i = 0; i < size; i++) {
315         str = &lxb_html_tree_insertion_mode_initial_doctype_public_start[i];
316 
317         if (str->len <= doc_type->public_id.length
318             && lexbor_str_data_ncasecmp((const lxb_char_t *) str->data,
319                                         doc_type->public_id.data, str->len))
320         {
321             return true;
322         }
323     }
324 
325     return false;
326 }
327 
328 static bool
lxb_html_tree_insertion_mode_initial_doctype_ckeck_system(lxb_dom_document_type_t * doc_type)329 lxb_html_tree_insertion_mode_initial_doctype_ckeck_system(
330                                               lxb_dom_document_type_t *doc_type)
331 {
332     size_t size;
333     lxb_html_tree_insertion_mode_initial_str_t *str;
334 
335     /* The system identifier is set to */
336     size = sizeof(lxb_html_tree_insertion_mode_initial_doctype_system_is)
337         / sizeof(lxb_html_tree_insertion_mode_initial_str_t);
338 
339     for (size_t i = 0; i < size; i++) {
340         str = &lxb_html_tree_insertion_mode_initial_doctype_system_is[i];
341 
342         if (str->len == doc_type->system_id.length
343             && lexbor_str_data_casecmp((const lxb_char_t *) str->data,
344                                        doc_type->system_id.data))
345         {
346             return true;
347         }
348     }
349 
350     return false;
351 }
352 
353 static bool
lxb_html_tree_insertion_mode_initial_doctype_ckeck_pubsys(lxb_dom_document_type_t * doc_type)354 lxb_html_tree_insertion_mode_initial_doctype_ckeck_pubsys(
355                                               lxb_dom_document_type_t *doc_type)
356 {
357     size_t size;
358     lxb_html_tree_insertion_mode_initial_str_t *str;
359 
360     /* The system identifier is missing and the public identifier starts with */
361     size = sizeof(lxb_html_tree_insertion_mode_initial_doctype_sys_pub_start)
362         / sizeof(lxb_html_tree_insertion_mode_initial_str_t);
363 
364     for (size_t i = 0; i < size; i++) {
365         str = &lxb_html_tree_insertion_mode_initial_doctype_sys_pub_start[i];
366 
367         if (str->len <= doc_type->public_id.length
368             && lexbor_str_data_ncasecmp((const lxb_char_t *) str->data,
369                                         doc_type->public_id.data, str->len))
370         {
371             return true;
372         }
373     }
374 
375     return false;
376 }
377 
378 static bool
lxb_html_tree_insertion_mode_initial_doctype_check_limq(lxb_dom_document_type_t * doc_type)379 lxb_html_tree_insertion_mode_initial_doctype_check_limq(
380                                               lxb_dom_document_type_t *doc_type)
381 {
382     bool quirks;
383     size_t size;
384     lxb_html_tree_insertion_mode_initial_str_t *str;
385 
386     if (doc_type->system_id.length != 0) {
387         quirks =
388             lxb_html_tree_insertion_mode_initial_doctype_ckeck_pubsys(doc_type);
389 
390         if (quirks) {
391             return true;
392         }
393     }
394 
395     /* The public identifier starts with */
396     size = sizeof(lxb_html_tree_insertion_mode_initial_doctype_lim_pub_start)
397         / sizeof(lxb_html_tree_insertion_mode_initial_str_t);
398 
399     for (size_t i = 0; i < size; i++) {
400         str = &lxb_html_tree_insertion_mode_initial_doctype_lim_pub_start[i];
401 
402         if (str->len <= doc_type->public_id.length
403             && lexbor_str_data_ncasecmp((const lxb_char_t *) str->data,
404                                         doc_type->public_id.data, str->len))
405         {
406             return true;
407         }
408     }
409 
410     return false;
411 }
412