1 /*
2 * Copyright (C) 2018-2020 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
7 #include "lexbor/html/tree/insertion_mode.h"
8 #include "lexbor/dom/interfaces/document_type.h"
9
10
/*
 * A string literal paired with its length in bytes (the terminating NUL is
 * not counted). Element type of the DOCTYPE identifier tables in this file.
 */
typedef struct {
    const char *data;   /* identifier text */
    size_t     len;     /* number of bytes in 'data' */
} lxb_html_tree_insertion_mode_initial_str_t;
16
17
18 static lxb_html_tree_insertion_mode_initial_str_t
19 lxb_html_tree_insertion_mode_initial_doctype_public_is[] =
20 {
21 {"-//W3O//DTD W3 HTML Strict 3.0//EN//", 36},
22 {"-/W3C/DTD HTML 4.0 Transitional/EN", 34},
23 {"HTML", 4}
24 };
25
26 static lxb_html_tree_insertion_mode_initial_str_t
27 lxb_html_tree_insertion_mode_initial_doctype_system_is[] =
28 {
29 {"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd", 58}
30 };
31
32 static lxb_html_tree_insertion_mode_initial_str_t
33 lxb_html_tree_insertion_mode_initial_doctype_public_start[] =
34 {
35 {"+//Silmaril//dtd html Pro v0r11 19970101//", 42},
36 {"-//AS//DTD HTML 3.0 asWedit + extensions//", 42},
37 {"-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//", 52},
38 {"-//IETF//DTD HTML 2.0 Level 1//", 31},
39 {"-//IETF//DTD HTML 2.0 Level 2//", 31},
40 {"-//IETF//DTD HTML 2.0 Strict Level 1//", 38},
41 {"-//IETF//DTD HTML 2.0 Strict Level 2//", 38},
42 {"-//IETF//DTD HTML 2.0 Strict//", 30},
43 {"-//IETF//DTD HTML 2.0//", 23},
44 {"-//IETF//DTD HTML 2.1E//", 24},
45 {"-//IETF//DTD HTML 3.0//", 23},
46 {"-//IETF//DTD HTML 3.2 Final//", 29},
47 {"-//IETF//DTD HTML 3.2//", 23},
48 {"-//IETF//DTD HTML 3//", 21},
49 {"-//IETF//DTD HTML Level 0//", 27},
50 {"-//IETF//DTD HTML Level 1//", 27},
51 {"-//IETF//DTD HTML Level 2//", 27},
52 {"-//IETF//DTD HTML Level 3//", 27},
53 {"-//IETF//DTD HTML Strict Level 0//", 34},
54 {"-//IETF//DTD HTML Strict Level 1//", 34},
55 {"-//IETF//DTD HTML Strict Level 2//", 34},
56 {"-//IETF//DTD HTML Strict Level 3//", 34},
57 {"-//IETF//DTD HTML Strict//", 26},
58 {"-//IETF//DTD HTML//", 19},
59 {"-//Metrius//DTD Metrius Presentational//", 40},
60 {"-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//", 53},
61 {"-//Microsoft//DTD Internet Explorer 2.0 HTML//", 46},
62 {"-//Microsoft//DTD Internet Explorer 2.0 Tables//", 48},
63 {"-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//", 53},
64 {"-//Microsoft//DTD Internet Explorer 3.0 HTML//", 46},
65 {"-//Microsoft//DTD Internet Explorer 3.0 Tables//", 48},
66 {"-//Netscape Comm. Corp.//DTD HTML//", 35},
67 {"-//Netscape Comm. Corp.//DTD Strict HTML//", 42},
68 {"-//O'Reilly and Associates//DTD HTML 2.0//", 42},
69 {"-//O'Reilly and Associates//DTD HTML Extended 1.0//", 51},
70 {"-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//", 59},
71 {"-//SQ//DTD HTML 2.0 HoTMetaL + extensions//", 43},
72 {"-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//", 78},
73 {"-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//", 69},
74 {"-//Spyglass//DTD HTML 2.0 Extended//", 36},
75 {"-//Sun Microsystems Corp.//DTD HotJava HTML//", 45},
76 {"-//Sun Microsystems Corp.//DTD HotJava Strict HTML//", 52},
77 {"-//W3C//DTD HTML 3 1995-03-24//", 31},
78 {"-//W3C//DTD HTML 3.2 Draft//", 28},
79 {"-//W3C//DTD HTML 3.2 Final//", 28},
80 {"-//W3C//DTD HTML 3.2//", 22},
81 {"-//W3C//DTD HTML 3.2S Draft//", 29},
82 {"-//W3C//DTD HTML 4.0 Frameset//", 31},
83 {"-//W3C//DTD HTML 4.0 Transitional//", 35},
84 {"-//W3C//DTD HTML Experimental 19960712//", 40},
85 {"-//W3C//DTD HTML Experimental 970421//", 38},
86 {"-//W3C//DTD W3 HTML//", 21},
87 {"-//W3O//DTD W3 HTML 3.0//", 25},
88 {"-//WebTechs//DTD Mozilla HTML 2.0//", 35},
89 {"-//WebTechs//DTD Mozilla HTML//", 31}
90 };
91
92 static lxb_html_tree_insertion_mode_initial_str_t
93 lxb_html_tree_insertion_mode_initial_doctype_sys_pub_start[] =
94 {
95 {"-//W3C//DTD HTML 4.01 Frameset//", 32},
96 {"-//W3C//DTD HTML 4.01 Transitional//", 36}
97 };
98
99 static lxb_html_tree_insertion_mode_initial_str_t
100 lxb_html_tree_insertion_mode_initial_doctype_lim_pub_start[] =
101 {
102 {"-//W3C//DTD XHTML 1.0 Frameset//", 32},
103 {"-//W3C//DTD XHTML 1.0 Transitional//", 36}
104 };
105
106
107 static bool
108 lxb_html_tree_insertion_mode_initial_doctype(lxb_html_tree_t *tree,
109 lxb_html_token_t *token);
110
111 static void
112 lxb_html_tree_insertion_mode_initial_doctype_ckeck(lxb_html_tree_t *tree,
113 lxb_dom_document_type_t *doc_type,
114 lxb_html_token_t *token, bool is_html);
115
116 static bool
117 lxb_html_tree_insertion_mode_initial_doctype_ckeck_public(
118 lxb_dom_document_type_t *doc_type);
119
120 static bool
121 lxb_html_tree_insertion_mode_initial_doctype_ckeck_system(
122 lxb_dom_document_type_t *doc_type);
123
124 static bool
125 lxb_html_tree_insertion_mode_initial_doctype_ckeck_pubsys(
126 lxb_dom_document_type_t *doc_type);
127
128 static bool
129 lxb_html_tree_insertion_mode_initial_doctype_check_limq(
130 lxb_dom_document_type_t *doc_type);
131
132
133 bool
lxb_html_tree_insertion_mode_initial(lxb_html_tree_t * tree,lxb_html_token_t * token)134 lxb_html_tree_insertion_mode_initial(lxb_html_tree_t *tree,
135 lxb_html_token_t *token)
136 {
137 switch (token->tag_id) {
138 case LXB_TAG__EM_COMMENT: {
139 lxb_dom_comment_t *comment;
140
141 comment = lxb_html_tree_insert_comment(tree, token,
142 lxb_dom_interface_node(tree->document));
143 if (comment == NULL) {
144 return lxb_html_tree_process_abort(tree);
145 }
146
147 break;
148 }
149
150 case LXB_TAG__EM_DOCTYPE:
151 tree->mode = lxb_html_tree_insertion_mode_before_html;
152
153 return lxb_html_tree_insertion_mode_initial_doctype(tree, token);
154
155 case LXB_TAG__TEXT:
156 tree->status = lxb_html_token_data_skip_ws_begin(token);
157 if (tree->status != LXB_STATUS_OK) {
158 return lxb_html_tree_process_abort(tree);
159 }
160
161 if (token->text_start == token->text_end) {
162 return true;
163 }
164 /* fall through */
165
166 default: {
167 lxb_dom_document_t *document = &tree->document->dom_document;
168
169 if (tree->document->iframe_srcdoc == NULL) {
170 lxb_html_tree_parse_error(tree, token,
171 LXB_HTML_RULES_ERROR_UNTOININMO);
172
173 document->compat_mode = LXB_DOM_DOCUMENT_CMODE_QUIRKS;
174 }
175
176 tree->mode = lxb_html_tree_insertion_mode_before_html;
177
178 return false;
179 }
180 }
181
182 return true;
183 }
184
185 static bool
lxb_html_tree_insertion_mode_initial_doctype(lxb_html_tree_t * tree,lxb_html_token_t * token)186 lxb_html_tree_insertion_mode_initial_doctype(lxb_html_tree_t *tree,
187 lxb_html_token_t *token)
188 {
189 lxb_dom_document_type_t *doc_type;
190
191 /* Create */
192 doc_type = lxb_html_tree_create_document_type_from_token(tree, token);
193 if (doc_type == NULL) {
194 tree->status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
195
196 return lxb_html_tree_process_abort(tree);
197 }
198
199 /* Check */
200 bool is_html = (doc_type->name == LXB_DOM_ATTR_HTML);
201
202 if (is_html == false
203 || doc_type->public_id.length != 0
204 || (doc_type->system_id.length == 19
205 && strncmp("about:legacy-compat",
206 (const char *) doc_type->system_id.data, 19) != 0)
207 )
208 {
209 lxb_html_tree_parse_error(tree, token,
210 LXB_HTML_RULES_ERROR_BADOTOININMO);
211 }
212
213 lxb_html_tree_insertion_mode_initial_doctype_ckeck(tree, doc_type,
214 token, is_html);
215
216 lxb_dom_node_insert_child_wo_events(&tree->document->dom_document.node,
217 lxb_dom_interface_node(doc_type));
218
219 lxb_dom_document_attach_doctype(&tree->document->dom_document, doc_type);
220
221 return true;
222 }
223
224 static void
lxb_html_tree_insertion_mode_initial_doctype_ckeck(lxb_html_tree_t * tree,lxb_dom_document_type_t * doc_type,lxb_html_token_t * token,bool is_html)225 lxb_html_tree_insertion_mode_initial_doctype_ckeck(lxb_html_tree_t *tree,
226 lxb_dom_document_type_t *doc_type,
227 lxb_html_token_t *token, bool is_html)
228 {
229 if (tree->document->iframe_srcdoc != NULL) {
230 return;
231 }
232
233 bool quirks;
234 lxb_dom_document_t *document = &tree->document->dom_document;
235
236 if (token->type & LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS) {
237 goto set_quirks;
238 }
239
240 if (is_html == false) {
241 goto set_quirks;
242 }
243
244 if (doc_type->public_id.length != 0) {
245 quirks =
246 lxb_html_tree_insertion_mode_initial_doctype_ckeck_public(doc_type);
247
248 if (quirks) {
249 goto set_quirks;
250 }
251 }
252
253 if (doc_type->system_id.length != 0) {
254 quirks =
255 lxb_html_tree_insertion_mode_initial_doctype_ckeck_system(doc_type);
256
257 if (quirks) {
258 goto set_quirks;
259 }
260 }
261
262 if (doc_type->public_id.length != 0 && doc_type->system_id.length == 0) {
263 quirks =
264 lxb_html_tree_insertion_mode_initial_doctype_ckeck_pubsys(doc_type);
265
266 if (quirks) {
267 goto set_quirks;
268 }
269 }
270
271 if (doc_type->public_id.length != 0) {
272 quirks =
273 lxb_html_tree_insertion_mode_initial_doctype_check_limq(doc_type);
274
275 if (quirks) {
276 document->compat_mode = LXB_DOM_DOCUMENT_CMODE_LIMITED_QUIRKS;
277 return;
278 }
279 }
280
281 return;
282
283 set_quirks:
284
285 document->compat_mode = LXB_DOM_DOCUMENT_CMODE_QUIRKS;
286 }
287
288 static bool
lxb_html_tree_insertion_mode_initial_doctype_ckeck_public(lxb_dom_document_type_t * doc_type)289 lxb_html_tree_insertion_mode_initial_doctype_ckeck_public(
290 lxb_dom_document_type_t *doc_type)
291 {
292 size_t size, i;
293 lxb_html_tree_insertion_mode_initial_str_t *str;
294
295 /* The public identifier is set to */
296 size = sizeof(lxb_html_tree_insertion_mode_initial_doctype_public_is)
297 / sizeof(lxb_html_tree_insertion_mode_initial_str_t);
298
299 for (i = 0; i < size; i++) {
300 str = &lxb_html_tree_insertion_mode_initial_doctype_public_is[i];
301
302 if (str->len == doc_type->public_id.length
303 && lexbor_str_data_casecmp((const lxb_char_t *) str->data,
304 doc_type->public_id.data))
305 {
306 return true;
307 }
308 }
309
310 /* The public identifier starts with */
311 size = sizeof(lxb_html_tree_insertion_mode_initial_doctype_public_start)
312 / sizeof(lxb_html_tree_insertion_mode_initial_str_t);
313
314 for (i = 0; i < size; i++) {
315 str = &lxb_html_tree_insertion_mode_initial_doctype_public_start[i];
316
317 if (str->len <= doc_type->public_id.length
318 && lexbor_str_data_ncasecmp((const lxb_char_t *) str->data,
319 doc_type->public_id.data, str->len))
320 {
321 return true;
322 }
323 }
324
325 return false;
326 }
327
328 static bool
lxb_html_tree_insertion_mode_initial_doctype_ckeck_system(lxb_dom_document_type_t * doc_type)329 lxb_html_tree_insertion_mode_initial_doctype_ckeck_system(
330 lxb_dom_document_type_t *doc_type)
331 {
332 size_t size;
333 lxb_html_tree_insertion_mode_initial_str_t *str;
334
335 /* The system identifier is set to */
336 size = sizeof(lxb_html_tree_insertion_mode_initial_doctype_system_is)
337 / sizeof(lxb_html_tree_insertion_mode_initial_str_t);
338
339 for (size_t i = 0; i < size; i++) {
340 str = &lxb_html_tree_insertion_mode_initial_doctype_system_is[i];
341
342 if (str->len == doc_type->system_id.length
343 && lexbor_str_data_casecmp((const lxb_char_t *) str->data,
344 doc_type->system_id.data))
345 {
346 return true;
347 }
348 }
349
350 return false;
351 }
352
353 static bool
lxb_html_tree_insertion_mode_initial_doctype_ckeck_pubsys(lxb_dom_document_type_t * doc_type)354 lxb_html_tree_insertion_mode_initial_doctype_ckeck_pubsys(
355 lxb_dom_document_type_t *doc_type)
356 {
357 size_t size;
358 lxb_html_tree_insertion_mode_initial_str_t *str;
359
360 /* The system identifier is missing and the public identifier starts with */
361 size = sizeof(lxb_html_tree_insertion_mode_initial_doctype_sys_pub_start)
362 / sizeof(lxb_html_tree_insertion_mode_initial_str_t);
363
364 for (size_t i = 0; i < size; i++) {
365 str = &lxb_html_tree_insertion_mode_initial_doctype_sys_pub_start[i];
366
367 if (str->len <= doc_type->public_id.length
368 && lexbor_str_data_ncasecmp((const lxb_char_t *) str->data,
369 doc_type->public_id.data, str->len))
370 {
371 return true;
372 }
373 }
374
375 return false;
376 }
377
378 static bool
lxb_html_tree_insertion_mode_initial_doctype_check_limq(lxb_dom_document_type_t * doc_type)379 lxb_html_tree_insertion_mode_initial_doctype_check_limq(
380 lxb_dom_document_type_t *doc_type)
381 {
382 bool quirks;
383 size_t size;
384 lxb_html_tree_insertion_mode_initial_str_t *str;
385
386 if (doc_type->system_id.length != 0) {
387 quirks =
388 lxb_html_tree_insertion_mode_initial_doctype_ckeck_pubsys(doc_type);
389
390 if (quirks) {
391 return true;
392 }
393 }
394
395 /* The public identifier starts with */
396 size = sizeof(lxb_html_tree_insertion_mode_initial_doctype_lim_pub_start)
397 / sizeof(lxb_html_tree_insertion_mode_initial_str_t);
398
399 for (size_t i = 0; i < size; i++) {
400 str = &lxb_html_tree_insertion_mode_initial_doctype_lim_pub_start[i];
401
402 if (str->len <= doc_type->public_id.length
403 && lexbor_str_data_ncasecmp((const lxb_char_t *) str->data,
404 doc_type->public_id.data, str->len))
405 {
406 return true;
407 }
408 }
409
410 return false;
411 }
412