1 /*
2 * Copyright (C) 2018-2020 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
7 #include "lexbor/html/tokenizer/state_doctype.h"
8 #include "lexbor/html/tokenizer/state.h"
9
10
11 #define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
12 #include "lexbor/core/str_res.h"
13
14
15 lxb_dom_attr_data_t *
16 lxb_dom_attr_local_name_append(lexbor_hash_t *hash,
17 const lxb_char_t *name, size_t length);
18
19
20 static const lxb_char_t *
21 lxb_html_tokenizer_state_doctype(lxb_html_tokenizer_t *tkz,
22 const lxb_char_t *data,
23 const lxb_char_t *end);
24
25 static const lxb_char_t *
26 lxb_html_tokenizer_state_doctype_before_name(lxb_html_tokenizer_t *tkz,
27 const lxb_char_t *data,
28 const lxb_char_t *end);
29
30 static const lxb_char_t *
31 lxb_html_tokenizer_state_doctype_name(lxb_html_tokenizer_t *tkz,
32 const lxb_char_t *data,
33 const lxb_char_t *end);
34
35 static const lxb_char_t *
36 lxb_html_tokenizer_state_doctype_after_name(lxb_html_tokenizer_t *tkz,
37 const lxb_char_t *data,
38 const lxb_char_t *end);
39
40 static const lxb_char_t *
41 lxb_html_tokenizer_state_doctype_after_name_public(lxb_html_tokenizer_t *tkz,
42 const lxb_char_t *data,
43 const lxb_char_t *end);
44
45 static const lxb_char_t *
46 lxb_html_tokenizer_state_doctype_after_name_system(lxb_html_tokenizer_t *tkz,
47 const lxb_char_t *data,
48 const lxb_char_t *end);
49
50 static const lxb_char_t *
51 lxb_html_tokenizer_state_doctype_after_public_keyword(lxb_html_tokenizer_t *tkz,
52 const lxb_char_t *data,
53 const lxb_char_t *end);
54
55 static const lxb_char_t *
56 lxb_html_tokenizer_state_doctype_before_public_identifier(
57 lxb_html_tokenizer_t *tkz,
58 const lxb_char_t *data,
59 const lxb_char_t *end);
60
61 static const lxb_char_t *
62 lxb_html_tokenizer_state_doctype_public_identifier_double_quoted(
63 lxb_html_tokenizer_t *tkz,
64 const lxb_char_t *data,
65 const lxb_char_t *end);
66
67 static const lxb_char_t *
68 lxb_html_tokenizer_state_doctype_public_identifier_single_quoted(
69 lxb_html_tokenizer_t *tkz,
70 const lxb_char_t *data,
71 const lxb_char_t *end);
72
73 static const lxb_char_t *
74 lxb_html_tokenizer_state_doctype_after_public_identifier(
75 lxb_html_tokenizer_t *tkz,
76 const lxb_char_t *data,
77 const lxb_char_t *end);
78
79 static const lxb_char_t *
80 lxb_html_tokenizer_state_doctype_between_public_and_system_identifiers(
81 lxb_html_tokenizer_t *tkz,
82 const lxb_char_t *data,
83 const lxb_char_t *end);
84
85 static const lxb_char_t *
86 lxb_html_tokenizer_state_doctype_after_system_keyword(lxb_html_tokenizer_t *tkz,
87 const lxb_char_t *data,
88 const lxb_char_t *end);
89
90 static const lxb_char_t *
91 lxb_html_tokenizer_state_doctype_before_system_identifier(
92 lxb_html_tokenizer_t *tkz,
93 const lxb_char_t *data,
94 const lxb_char_t *end);
95
96 static const lxb_char_t *
97 lxb_html_tokenizer_state_doctype_system_identifier_double_quoted(
98 lxb_html_tokenizer_t *tkz,
99 const lxb_char_t *data,
100 const lxb_char_t *end);
101
102 static const lxb_char_t *
103 lxb_html_tokenizer_state_doctype_system_identifier_single_quoted(
104 lxb_html_tokenizer_t *tkz,
105 const lxb_char_t *data,
106 const lxb_char_t *end);
107
108 static const lxb_char_t *
109 lxb_html_tokenizer_state_doctype_after_system_identifier(
110 lxb_html_tokenizer_t *tkz,
111 const lxb_char_t *data,
112 const lxb_char_t *end);
113
114 static const lxb_char_t *
115 lxb_html_tokenizer_state_doctype_bogus(lxb_html_tokenizer_t *tkz,
116 const lxb_char_t *data,
117 const lxb_char_t *end);
118
119
120 /*
121 * Helper function. No in the specification. For 12.2.5.53
122 */
123 const lxb_char_t *
lxb_html_tokenizer_state_doctype_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)124 lxb_html_tokenizer_state_doctype_before(lxb_html_tokenizer_t *tkz,
125 const lxb_char_t *data,
126 const lxb_char_t *end)
127 {
128 if (tkz->is_eof == false) {
129 lxb_html_tokenizer_state_token_set_end(tkz, data);
130 }
131 else {
132 lxb_html_tokenizer_state_token_set_end_oef(tkz);
133 }
134
135 tkz->token->tag_id = LXB_TAG__EM_DOCTYPE;
136
137 return lxb_html_tokenizer_state_doctype(tkz, data, end);
138 }
139
140 /*
141 * 12.2.5.53 DOCTYPE state
142 */
143 static const lxb_char_t *
lxb_html_tokenizer_state_doctype(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)144 lxb_html_tokenizer_state_doctype(lxb_html_tokenizer_t *tkz,
145 const lxb_char_t *data,
146 const lxb_char_t *end)
147 {
148 switch (*data) {
149 /*
150 * U+0009 CHARACTER TABULATION (tab)
151 * U+000A LINE FEED (LF)
152 * U+000C FORM FEED (FF)
153 * U+000D CARRIAGE RETURN (CR)
154 * U+0020 SPACE
155 */
156 case 0x09:
157 case 0x0A:
158 case 0x0C:
159 case 0x0D:
160 case 0x20:
161 data++;
162 break;
163
164 /* U+003E GREATER-THAN SIGN (>) */
165 case 0x3E:
166 break;
167
168 /* EOF */
169 case 0x00:
170 if (tkz->is_eof) {
171 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
172 LXB_HTML_TOKENIZER_ERROR_EOINDO);
173
174 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
175
176 lxb_html_tokenizer_state_token_done_m(tkz, end);
177
178 return end;
179 }
180 /* fall through */
181
182 default:
183 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
184 LXB_HTML_TOKENIZER_ERROR_MIWHBEDONA);
185 break;
186 }
187
188 tkz->state = lxb_html_tokenizer_state_doctype_before_name;
189
190 return data;
191 }
192
193 /*
194 * 12.2.5.54 Before DOCTYPE name state
195 */
196 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_before_name(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)197 lxb_html_tokenizer_state_doctype_before_name(lxb_html_tokenizer_t *tkz,
198 const lxb_char_t *data,
199 const lxb_char_t *end)
200 {
201 lxb_html_token_attr_t *attr;
202
203 while (data != end) {
204 switch (*data) {
205 /*
206 * U+0009 CHARACTER TABULATION (tab)
207 * U+000A LINE FEED (LF)
208 * U+000C FORM FEED (FF)
209 * U+000D CARRIAGE RETURN (CR)
210 * U+0020 SPACE
211 */
212 case 0x09:
213 case 0x0A:
214 case 0x0C:
215 case 0x0D:
216 case 0x20:
217 break;
218
219 /*
220 * U+0000 NULL
221 * EOF
222 */
223 case 0x00:
224 if (tkz->is_eof) {
225 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
226 LXB_HTML_TOKENIZER_ERROR_EOINDO);
227
228 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
229
230 lxb_html_tokenizer_state_token_done_m(tkz, end);
231
232 return end;
233 }
234
235 lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
236 lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);
237 lxb_html_tokenizer_state_append_replace_m(tkz);
238
239 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
240 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
241
242 tkz->token->attr_last->type
243 |= LXB_HTML_TOKEN_ATTR_TYPE_NAME_NULL;
244
245 tkz->state = lxb_html_tokenizer_state_doctype_name;
246
247 return (data + 1);
248
249 /* U+003E GREATER-THAN SIGN (>) */
250 case 0x3E:
251 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
252 tkz->state = lxb_html_tokenizer_state_data_before;
253
254 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
255 LXB_HTML_TOKENIZER_ERROR_MIDONA);
256
257 lxb_html_tokenizer_state_token_done_m(tkz, end);
258
259 return (data + 1);
260
261 /*
262 * ASCII upper alpha
263 * Anything else
264 */
265 default:
266 lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
267 lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);
268
269 tkz->state = lxb_html_tokenizer_state_doctype_name;
270
271 return data;
272 }
273
274 data++;
275 }
276
277 return data;
278 }
279
280 /*
281 * 12.2.5.55 DOCTYPE name state
282 */
283 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_name(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)284 lxb_html_tokenizer_state_doctype_name(lxb_html_tokenizer_t *tkz,
285 const lxb_char_t *data,
286 const lxb_char_t *end)
287 {
288 lxb_html_tokenizer_state_begin_set(tkz, data);
289
290 while (data != end) {
291 switch (*data) {
292 /*
293 * U+0009 CHARACTER TABULATION (tab)
294 * U+000A LINE FEED (LF)
295 * U+000C FORM FEED (FF)
296 * U+000D CARRIAGE RETURN (CR)
297 * U+0020 SPACE
298 */
299 case 0x09:
300 case 0x0A:
301 case 0x0C:
302 case 0x0D:
303 case 0x20:
304 lxb_html_tokenizer_state_append_data_m(tkz, data);
305 lxb_html_tokenizer_state_set_name_m(tkz);
306 lxb_html_tokenizer_state_token_attr_set_name_end(tkz, data);
307
308 tkz->state = lxb_html_tokenizer_state_doctype_after_name;
309
310 return (data + 1);
311
312 /* U+003E GREATER-THAN SIGN (>) */
313 case 0x3E:
314 tkz->state = lxb_html_tokenizer_state_data_before;
315
316 lxb_html_tokenizer_state_append_data_m(tkz, data);
317 lxb_html_tokenizer_state_set_name_m(tkz);
318 lxb_html_tokenizer_state_token_attr_set_name_end(tkz, data);
319 lxb_html_tokenizer_state_token_done_m(tkz, end);
320
321 return (data + 1);
322
323 /*
324 * U+0000 NULL
325 * EOF
326 */
327 case 0x00:
328 lxb_html_tokenizer_state_append_data_m(tkz, data);
329
330 if (tkz->is_eof) {
331 lxb_html_tokenizer_state_token_attr_set_name_end_oef(tkz);
332
333 lxb_html_tokenizer_error_add(tkz->parse_errors,
334 tkz->token->attr_last->name_end,
335 LXB_HTML_TOKENIZER_ERROR_EOINDO);
336
337 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
338
339 lxb_html_tokenizer_state_set_name_m(tkz);
340 lxb_html_tokenizer_state_token_done_m(tkz, end);
341
342 return end;
343 }
344
345 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
346 lxb_html_tokenizer_state_append_replace_m(tkz);
347
348 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
349 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
350
351 tkz->token->attr_last->type
352 |= LXB_HTML_TOKEN_ATTR_TYPE_NAME_NULL;
353
354 break;
355
356 /* Anything else */
357 default:
358 break;
359 }
360
361 data++;
362 }
363
364 lxb_html_tokenizer_state_append_data_m(tkz, data);
365
366 return data;
367 }
368
369 /*
370 * 12.2.5.56 After DOCTYPE name state
371 */
372 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_name(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)373 lxb_html_tokenizer_state_doctype_after_name(lxb_html_tokenizer_t *tkz,
374 const lxb_char_t *data,
375 const lxb_char_t *end)
376 {
377 lxb_html_token_attr_t *attr;
378 const lxb_dom_attr_data_t *attr_data;
379
380 while (data != end) {
381 switch (*data) {
382 /*
383 * U+0009 CHARACTER TABULATION (tab)
384 * U+000A LINE FEED (LF)
385 * U+000C FORM FEED (FF)
386 * U+000D CARRIAGE RETURN (CR)
387 * U+0020 SPACE
388 */
389 case 0x09:
390 case 0x0A:
391 case 0x0C:
392 case 0x0D:
393 case 0x20:
394 break;
395
396 /* U+003E GREATER-THAN SIGN (>) */
397 case 0x3E:
398 tkz->state = lxb_html_tokenizer_state_data_before;
399
400 lxb_html_tokenizer_state_token_done_m(tkz, end);
401
402 return (data + 1);
403
404 /* EOF */
405 case 0x00:
406 if (tkz->is_eof) {
407 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
408 LXB_HTML_TOKENIZER_ERROR_EOINDO);
409
410 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
411
412 lxb_html_tokenizer_state_token_done_m(tkz, end);
413
414 return end;
415 }
416 /* fall through */
417
418 /* Anything else */
419 default:
420 lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
421 lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);
422
423 if ((data + 6) > end) {
424 /*
425 * ASCII case-insensitive match for the word "PUBLIC"
426 * U+0044 character (P) or U+0050 character (p)
427 */
428 if (*data == 0x50 || *data == 0x70) {
429 tkz->markup = (lxb_char_t *) "PUBLIC";
430
431 tkz->state =
432 lxb_html_tokenizer_state_doctype_after_name_public;
433
434 return data;
435 }
436
437 /*
438 * ASCII case-insensitive match for the word "SYSTEM"
439 * U+0044 character (S) or U+0053 character (s)
440 */
441 if (*data == 0x53 || *data == 0x73) {
442 tkz->markup = (lxb_char_t *) "SYSTEM";
443
444 tkz->state =
445 lxb_html_tokenizer_state_doctype_after_name_system;
446
447 return data;
448 }
449 }
450 else if (lexbor_str_data_ncasecmp((lxb_char_t *) "PUBLIC",
451 data, 6))
452 {
453 lxb_html_tokenizer_state_token_attr_set_name_end(tkz,
454 (data + 6));
455
456 attr_data = lxb_dom_attr_data_by_id(tkz->attrs,
457 LXB_DOM_ATTR_PUBLIC);
458 if (attr_data == NULL) {
459 tkz->status = LXB_STATUS_ERROR;
460 return end;
461 }
462
463 tkz->token->attr_last->name = attr_data;
464
465 tkz->state =
466 lxb_html_tokenizer_state_doctype_after_public_keyword;
467
468 return (data + 6);
469 }
470 else if (lexbor_str_data_ncasecmp((lxb_char_t *) "SYSTEM",
471 data, 6))
472 {
473 lxb_html_tokenizer_state_token_attr_set_name_end(tkz,
474 (data + 6));
475
476 attr_data = lxb_dom_attr_data_by_id(tkz->attrs,
477 LXB_DOM_ATTR_SYSTEM);
478 if (attr_data == NULL) {
479 tkz->status = LXB_STATUS_ERROR;
480 return end;
481 }
482
483 tkz->token->attr_last->name = attr_data;
484
485 tkz->state =
486 lxb_html_tokenizer_state_doctype_after_system_keyword;
487
488 return (data + 6);
489 }
490
491 lxb_html_token_attr_delete(tkz->token, attr,
492 tkz->dobj_token_attr);
493
494 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
495 LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA);
496
497 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
498 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
499
500 return data;
501 }
502
503 data++;
504 }
505
506 return data;
507 }
508
509 /*
510 * Helper function. No in the specification. For 12.2.5.56
511 * For doctype PUBLIC
512 */
513 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_name_public(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)514 lxb_html_tokenizer_state_doctype_after_name_public(lxb_html_tokenizer_t *tkz,
515 const lxb_char_t *data,
516 const lxb_char_t *end)
517 {
518 const lxb_char_t *pos;
519 const lxb_dom_attr_data_t *attr_data;
520
521 pos = lexbor_str_data_ncasecmp_first(tkz->markup, data, (end - data));
522
523 if (pos == NULL) {
524 lxb_html_token_attr_delete(tkz->token, tkz->token->attr_last,
525 tkz->dobj_token_attr);
526
527 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
528 LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA);
529
530 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
531
532 return data;
533 }
534
535 if (*pos == '\0') {
536 pos = data + (pos - tkz->markup);
537
538 lxb_html_tokenizer_state_token_attr_set_name_end(tkz, pos);
539
540 attr_data = lxb_dom_attr_data_by_id(tkz->attrs,
541 LXB_DOM_ATTR_PUBLIC);
542 if (attr_data == NULL) {
543 tkz->status = LXB_STATUS_ERROR;
544 return end;
545 }
546
547 tkz->token->attr_last->name = attr_data;
548
549 tkz->state = lxb_html_tokenizer_state_doctype_after_public_keyword;
550
551 return (pos + 1);
552 }
553
554 tkz->markup = pos;
555
556 return end;
557 }
558
559 /*
560 * Helper function. No in the specification. For 12.2.5.56
561 * For doctype SYSTEM
562 */
563 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_name_system(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)564 lxb_html_tokenizer_state_doctype_after_name_system(lxb_html_tokenizer_t *tkz,
565 const lxb_char_t *data,
566 const lxb_char_t *end)
567 {
568 const lxb_char_t *pos;
569 const lxb_dom_attr_data_t *attr_data;
570
571 pos = lexbor_str_data_ncasecmp_first(tkz->markup, data, (end - data));
572
573 if (pos == NULL) {
574 lxb_html_token_attr_delete(tkz->token, tkz->token->attr_last,
575 tkz->dobj_token_attr);
576
577 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
578 LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA);
579
580 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
581
582 return data;
583 }
584
585 if (*pos == '\0') {
586 pos = data + (pos - tkz->markup);
587
588 lxb_html_tokenizer_state_token_attr_set_name_end(tkz, pos);
589
590 attr_data = lxb_dom_attr_data_by_id(tkz->attrs,
591 LXB_DOM_ATTR_SYSTEM);
592 if (attr_data == NULL) {
593 tkz->status = LXB_STATUS_ERROR;
594 return end;
595 }
596
597 tkz->token->attr_last->name = attr_data;
598
599 tkz->state = lxb_html_tokenizer_state_doctype_after_system_keyword;
600
601 return (pos + 1);
602 }
603
604 tkz->markup = pos;
605
606 return end;
607 }
608
609 /*
610 * 12.2.5.57 After DOCTYPE public keyword state
611 */
612 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_public_keyword(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)613 lxb_html_tokenizer_state_doctype_after_public_keyword(lxb_html_tokenizer_t *tkz,
614 const lxb_char_t *data,
615 const lxb_char_t *end)
616 {
617 switch (*data) {
618 /*
619 * U+0009 CHARACTER TABULATION (tab)
620 * U+000A LINE FEED (LF)
621 * U+000C FORM FEED (FF)
622 * U+000D CARRIAGE RETURN (CR)
623 * U+0020 SPACE
624 */
625 case 0x09:
626 case 0x0A:
627 case 0x0C:
628 case 0x0D:
629 case 0x20:
630 tkz->state =
631 lxb_html_tokenizer_state_doctype_before_public_identifier;
632
633 return (data + 1);
634
635 /* U+0022 QUOTATION MARK (") */
636 case 0x22:
637 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
638 LXB_HTML_TOKENIZER_ERROR_MIWHAFDOPUKE);
639
640 tkz->state =
641 lxb_html_tokenizer_state_doctype_public_identifier_double_quoted;
642
643 return (data + 1);
644
645 /* U+0027 APOSTROPHE (') */
646 case 0x27:
647 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
648 LXB_HTML_TOKENIZER_ERROR_MIWHAFDOPUKE);
649
650 tkz->state =
651 lxb_html_tokenizer_state_doctype_public_identifier_single_quoted;
652
653 return (data + 1);
654
655 /* U+003E GREATER-THAN SIGN (>) */
656 case 0x3E:
657 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
658 tkz->state = lxb_html_tokenizer_state_data_before;
659
660 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
661 LXB_HTML_TOKENIZER_ERROR_MIDOPUID);
662
663 lxb_html_tokenizer_state_token_done_m(tkz, end);
664
665 return (data + 1);
666
667 /* EOF */
668 case 0x00:
669 if (tkz->is_eof) {
670 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
671
672 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
673 LXB_HTML_TOKENIZER_ERROR_EOINDO);
674
675 lxb_html_tokenizer_state_token_done_m(tkz, end);
676
677 return end;
678 }
679 /* fall through */
680
681 default:
682 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
683 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
684
685 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
686 LXB_HTML_TOKENIZER_ERROR_MIQUBEDOPUID);
687
688 return data;
689 }
690
691 return data;
692 }
693
694 /*
695 * 12.2.5.58 Before DOCTYPE public identifier state
696 */
697 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_before_public_identifier(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)698 lxb_html_tokenizer_state_doctype_before_public_identifier(lxb_html_tokenizer_t *tkz,
699 const lxb_char_t *data,
700 const lxb_char_t *end)
701 {
702 switch (*data) {
703 /*
704 * U+0009 CHARACTER TABULATION (tab)
705 * U+000A LINE FEED (LF)
706 * U+000C FORM FEED (FF)
707 * U+000D CARRIAGE RETURN (CR)
708 * U+0020 SPACE
709 */
710 case 0x09:
711 case 0x0A:
712 case 0x0C:
713 case 0x0D:
714 case 0x20:
715 break;
716
717 /* U+0022 QUOTATION MARK (") */
718 case 0x22:
719 tkz->state =
720 lxb_html_tokenizer_state_doctype_public_identifier_double_quoted;
721
722 break;
723
724 /* U+0027 APOSTROPHE (') */
725 case 0x27:
726 tkz->state =
727 lxb_html_tokenizer_state_doctype_public_identifier_single_quoted;
728
729 break;
730
731 /* U+003E GREATER-THAN SIGN (>) */
732 case 0x3E:
733 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
734 tkz->state = lxb_html_tokenizer_state_data_before;
735
736 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
737 LXB_HTML_TOKENIZER_ERROR_MIDOPUID);
738
739 lxb_html_tokenizer_state_token_done_m(tkz, end);
740
741 break;
742
743 /* EOF */
744 case 0x00:
745 if (tkz->is_eof) {
746 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
747 LXB_HTML_TOKENIZER_ERROR_EOINDO);
748
749 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
750
751 lxb_html_tokenizer_state_token_done_m(tkz, end);
752
753 return end;
754 }
755 /* fall through */
756
757 default:
758 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
759 LXB_HTML_TOKENIZER_ERROR_MIQUBEDOPUID);
760
761 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
762 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
763
764 return data;
765 }
766
767 return (data + 1);
768 }
769
770 /*
771 * 12.2.5.59 DOCTYPE public identifier (double-quoted) state
772 */
773 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_public_identifier_double_quoted(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)774 lxb_html_tokenizer_state_doctype_public_identifier_double_quoted(lxb_html_tokenizer_t *tkz,
775 const lxb_char_t *data,
776 const lxb_char_t *end)
777 {
778 if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
779 lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
780 }
781
782 lxb_html_tokenizer_state_begin_set(tkz, data);
783
784 while (data != end) {
785 switch (*data) {
786 /* U+0022 QUOTATION MARK (") */
787 case 0x22:
788 lxb_html_tokenizer_state_append_data_m(tkz, data);
789 lxb_html_tokenizer_state_set_value_m(tkz);
790 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
791
792 tkz->state =
793 lxb_html_tokenizer_state_doctype_after_public_identifier;
794
795 return (data + 1);
796
797 /* U+003E GREATER-THAN SIGN (>) */
798 case 0x3E:
799 tkz->state = lxb_html_tokenizer_state_data_before;
800
801 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
802 LXB_HTML_TOKENIZER_ERROR_ABDOPUID);
803
804 lxb_html_tokenizer_state_append_data_m(tkz, data);
805 lxb_html_tokenizer_state_set_value_m(tkz);
806 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
807 lxb_html_tokenizer_state_token_done_m(tkz, end);
808
809 return (data + 1);
810
811 /* U+000D CARRIAGE RETURN (CR) */
812 case 0x0D:
813 if (++data >= end) {
814 lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
815
816 tkz->state = lxb_html_tokenizer_state_cr;
817 tkz->state_return = lxb_html_tokenizer_state_doctype_public_identifier_double_quoted;
818
819 return data;
820 }
821
822 lxb_html_tokenizer_state_append_data_m(tkz, data);
823 tkz->pos[-1] = 0x0A;
824
825 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
826
827 if (*data != 0x0A) {
828 lxb_html_tokenizer_state_begin_set(tkz, data);
829 data--;
830 }
831
832 break;
833
834 /*
835 * U+0000 NULL
836 * EOF
837 */
838 case 0x00:
839 lxb_html_tokenizer_state_append_data_m(tkz, data);
840
841 if (tkz->is_eof) {
842 lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
843
844 if (tkz->token->attr_last->value_begin == NULL) {
845 lxb_html_tokenizer_state_token_attr_set_value_begin(tkz,
846 tkz->token->attr_last->value_end);
847 }
848
849 lxb_html_tokenizer_error_add(tkz->parse_errors,
850 tkz->token->attr_last->value_end,
851 LXB_HTML_TOKENIZER_ERROR_EOINDO);
852
853 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
854
855 lxb_html_tokenizer_state_set_value_m(tkz);
856 lxb_html_tokenizer_state_token_done_m(tkz, end);
857
858 return end;
859 }
860
861 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
862 lxb_html_tokenizer_state_append_replace_m(tkz);
863
864 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
865 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
866
867 tkz->token->attr_last->type
868 |= LXB_HTML_TOKEN_ATTR_TYPE_VALUE_NULL;
869
870 break;
871
872 /* Anything else */
873 default:
874 break;
875 }
876
877 data++;
878 }
879
880 lxb_html_tokenizer_state_append_data_m(tkz, data);
881
882 return data;
883 }
884
885 /*
886 * 12.2.5.60 DOCTYPE public identifier (single-quoted) state
887 */
888 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_public_identifier_single_quoted(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)889 lxb_html_tokenizer_state_doctype_public_identifier_single_quoted(lxb_html_tokenizer_t *tkz,
890 const lxb_char_t *data,
891 const lxb_char_t *end)
892 {
893 if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
894 lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
895 }
896
897 lxb_html_tokenizer_state_begin_set(tkz, data);
898
899 while (data != end) {
900 switch (*data) {
901 /* U+0027 APOSTROPHE (') */
902 case 0x27:
903 lxb_html_tokenizer_state_append_data_m(tkz, data);
904 lxb_html_tokenizer_state_set_value_m(tkz);
905 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
906
907 tkz->state =
908 lxb_html_tokenizer_state_doctype_after_public_identifier;
909
910 return (data + 1);
911
912 /* U+003E GREATER-THAN SIGN (>) */
913 case 0x3E:
914 tkz->state = lxb_html_tokenizer_state_data_before;
915
916 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
917 LXB_HTML_TOKENIZER_ERROR_ABDOPUID);
918
919 lxb_html_tokenizer_state_append_data_m(tkz, data);
920 lxb_html_tokenizer_state_set_value_m(tkz);
921 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
922 lxb_html_tokenizer_state_token_done_m(tkz, end);
923
924 return (data + 1);
925
926 /* U+000D CARRIAGE RETURN (CR) */
927 case 0x0D:
928 if (++data >= end) {
929 lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
930
931 tkz->state = lxb_html_tokenizer_state_cr;
932 tkz->state_return = lxb_html_tokenizer_state_doctype_public_identifier_single_quoted;
933
934 return data;
935 }
936
937 lxb_html_tokenizer_state_append_data_m(tkz, data);
938 tkz->pos[-1] = 0x0A;
939
940 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
941
942 if (*data != 0x0A) {
943 lxb_html_tokenizer_state_begin_set(tkz, data);
944 data--;
945 }
946
947 break;
948
949 /*
950 * U+0000 NULL
951 * EOF
952 */
953 case 0x00:
954 lxb_html_tokenizer_state_append_data_m(tkz, data);
955
956 if (tkz->is_eof) {
957 lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
958
959 if (tkz->token->attr_last->value_begin == NULL) {
960 lxb_html_tokenizer_state_token_attr_set_value_begin(tkz,
961 tkz->token->attr_last->value_end);
962 }
963
964 lxb_html_tokenizer_error_add(tkz->parse_errors,
965 tkz->token->attr_last->value_end,
966 LXB_HTML_TOKENIZER_ERROR_EOINDO);
967
968 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
969
970 lxb_html_tokenizer_state_set_value_m(tkz);
971 lxb_html_tokenizer_state_token_done_m(tkz, end);
972
973 return end;
974 }
975
976 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
977 lxb_html_tokenizer_state_append_replace_m(tkz);
978
979 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
980 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
981
982 tkz->token->attr_last->type
983 |= LXB_HTML_TOKEN_ATTR_TYPE_VALUE_NULL;
984
985 break;
986
987 /* Anything else */
988 default:
989 break;
990 }
991
992 data++;
993 }
994
995 lxb_html_tokenizer_state_append_data_m(tkz, data);
996
997 return data;
998 }
999
1000 /*
1001 * 12.2.5.61 After DOCTYPE public identifier state
1002 */
1003 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_public_identifier(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1004 lxb_html_tokenizer_state_doctype_after_public_identifier(lxb_html_tokenizer_t *tkz,
1005 const lxb_char_t *data,
1006 const lxb_char_t *end)
1007 {
1008 lxb_html_token_attr_t *attr;
1009
1010 switch (*data) {
1011 /*
1012 * U+0009 CHARACTER TABULATION (tab)
1013 * U+000A LINE FEED (LF)
1014 * U+000C FORM FEED (FF)
1015 * U+000D CARRIAGE RETURN (CR)
1016 * U+0020 SPACE
1017 */
1018 case 0x09:
1019 case 0x0A:
1020 case 0x0C:
1021 case 0x0D:
1022 case 0x20:
1023 tkz->state =
1024 lxb_html_tokenizer_state_doctype_between_public_and_system_identifiers;
1025
1026 return (data + 1);
1027
1028 /* U+003E GREATER-THAN SIGN (>) */
1029 case 0x3E:
1030 tkz->state = lxb_html_tokenizer_state_data_before;
1031
1032 lxb_html_tokenizer_state_token_done_m(tkz, end);
1033
1034 return (data + 1);
1035
1036 /* U+0022 QUOTATION MARK (") */
1037 case 0x22:
1038 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1039 LXB_HTML_TOKENIZER_ERROR_MIWHBEDOPUANSYID);
1040
1041 lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
1042
1043 tkz->state =
1044 lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1045
1046 return (data + 1);
1047
1048 /* U+0027 APOSTROPHE (') */
1049 case 0x27:
1050 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1051 LXB_HTML_TOKENIZER_ERROR_MIWHBEDOPUANSYID);
1052
1053 lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
1054
1055 tkz->state =
1056 lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1057
1058 return (data + 1);
1059
1060 /* EOF */
1061 case 0x00:
1062 if (tkz->is_eof) {
1063 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1064 LXB_HTML_TOKENIZER_ERROR_EOINDO);
1065
1066 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1067 lxb_html_tokenizer_state_token_done_m(tkz, end);
1068
1069 return end;
1070 }
1071 /* fall through */
1072
1073 default:
1074 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1075 LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID);
1076
1077 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1078 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1079
1080 return data;
1081 }
1082
1083 return data;
1084 }
1085
1086 /*
1087 * 12.2.5.62 Between DOCTYPE public and system identifiers state
1088 */
1089 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_between_public_and_system_identifiers(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1090 lxb_html_tokenizer_state_doctype_between_public_and_system_identifiers(lxb_html_tokenizer_t *tkz,
1091 const lxb_char_t *data,
1092 const lxb_char_t *end)
1093 {
1094 lxb_html_token_attr_t *attr;
1095
1096 switch (*data) {
1097 /*
1098 * U+0009 CHARACTER TABULATION (tab)
1099 * U+000A LINE FEED (LF)
1100 * U+000C FORM FEED (FF)
1101 * U+000D CARRIAGE RETURN (CR)
1102 * U+0020 SPACE
1103 */
1104 case 0x09:
1105 case 0x0A:
1106 case 0x0C:
1107 case 0x0D:
1108 case 0x20:
1109 return (data + 1);
1110
1111 /* U+003E GREATER-THAN SIGN (>) */
1112 case 0x3E:
1113 tkz->state = lxb_html_tokenizer_state_data_before;
1114
1115 lxb_html_tokenizer_state_token_done_m(tkz, end);
1116
1117 return (data + 1);
1118
1119 /* U+0022 QUOTATION MARK (") */
1120 case 0x22:
1121 lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
1122
1123 tkz->state =
1124 lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1125
1126 return (data + 1);
1127
1128 /* U+0027 APOSTROPHE (') */
1129 case 0x27:
1130 lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
1131
1132 tkz->state =
1133 lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1134
1135 return (data + 1);
1136
1137 /* EOF */
1138 case 0x00:
1139 if (tkz->is_eof) {
1140 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1141 LXB_HTML_TOKENIZER_ERROR_EOINDO);
1142
1143 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1144 lxb_html_tokenizer_state_token_done_m(tkz, end);
1145
1146 return end;
1147 }
1148 /* fall through */
1149
1150 default:
1151 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1152 LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID);
1153
1154 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1155 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1156
1157 return data;
1158 }
1159
1160 return data;
1161 }
1162
1163 /*
1164 * 12.2.5.63 After DOCTYPE system keyword state
1165 */
1166 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_system_keyword(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1167 lxb_html_tokenizer_state_doctype_after_system_keyword(lxb_html_tokenizer_t *tkz,
1168 const lxb_char_t *data,
1169 const lxb_char_t *end)
1170 {
1171 switch (*data) {
1172 /*
1173 * U+0009 CHARACTER TABULATION (tab)
1174 * U+000A LINE FEED (LF)
1175 * U+000C FORM FEED (FF)
1176 * U+000D CARRIAGE RETURN (CR)
1177 * U+0020 SPACE
1178 */
1179 case 0x09:
1180 case 0x0A:
1181 case 0x0C:
1182 case 0x0D:
1183 case 0x20:
1184 tkz->state =
1185 lxb_html_tokenizer_state_doctype_before_system_identifier;
1186
1187 return (data + 1);
1188
1189 /* U+0022 QUOTATION MARK (") */
1190 case 0x22:
1191 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1192 LXB_HTML_TOKENIZER_ERROR_MIWHAFDOSYKE);
1193
1194 tkz->state =
1195 lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1196
1197 return (data + 1);
1198
1199 /* U+0027 APOSTROPHE (') */
1200 case 0x27:
1201 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1202 LXB_HTML_TOKENIZER_ERROR_MIWHAFDOSYKE);
1203
1204 tkz->state =
1205 lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1206
1207 return (data + 1);
1208
1209 /* U+003E GREATER-THAN SIGN (>) */
1210 case 0x3E:
1211 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1212 tkz->state = lxb_html_tokenizer_state_data_before;
1213
1214 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1215 LXB_HTML_TOKENIZER_ERROR_MIDOSYID);
1216
1217 lxb_html_tokenizer_state_token_done_m(tkz, end);
1218
1219 return (data + 1);
1220
1221 /* EOF */
1222 case 0x00:
1223 if (tkz->is_eof) {
1224 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1225 LXB_HTML_TOKENIZER_ERROR_EOINDO);
1226
1227 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1228 lxb_html_tokenizer_state_token_done_m(tkz, end);
1229
1230 return end;
1231 }
1232 /* fall through */
1233
1234 default:
1235 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1236 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1237
1238 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1239 LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID);
1240
1241 return data;
1242 }
1243
1244 return data;
1245 }
1246
1247 /*
1248 * 12.2.5.64 Before DOCTYPE system identifier state
1249 */
1250 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_before_system_identifier(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1251 lxb_html_tokenizer_state_doctype_before_system_identifier(lxb_html_tokenizer_t *tkz,
1252 const lxb_char_t *data,
1253 const lxb_char_t *end)
1254 {
1255 switch (*data) {
1256 /*
1257 * U+0009 CHARACTER TABULATION (tab)
1258 * U+000A LINE FEED (LF)
1259 * U+000C FORM FEED (FF)
1260 * U+000D CARRIAGE RETURN (CR)
1261 * U+0020 SPACE
1262 */
1263 case 0x09:
1264 case 0x0A:
1265 case 0x0C:
1266 case 0x0D:
1267 case 0x20:
1268 return (data + 1);
1269
1270 /* U+0022 QUOTATION MARK (") */
1271 case 0x22:
1272 tkz->state =
1273 lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1274
1275 return (data + 1);
1276
1277 /* U+0027 APOSTROPHE (') */
1278 case 0x27:
1279 tkz->state =
1280 lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1281
1282 return (data + 1);
1283
1284 /* U+003E GREATER-THAN SIGN (>) */
1285 case 0x3E:
1286 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1287 tkz->state = lxb_html_tokenizer_state_data_before;
1288
1289 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1290 LXB_HTML_TOKENIZER_ERROR_MIDOSYID);
1291
1292 lxb_html_tokenizer_state_token_done_m(tkz, end);
1293
1294 return (data + 1);
1295
1296 /* EOF */
1297 case 0x00:
1298 if (tkz->is_eof) {
1299 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1300 LXB_HTML_TOKENIZER_ERROR_EOINDO);
1301
1302 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1303
1304 lxb_html_tokenizer_state_token_done_m(tkz, end);
1305
1306 return end;
1307 }
1308 /* fall through */
1309
1310 default:
1311 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1312 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1313
1314 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1315 LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID);
1316
1317 return data;
1318 }
1319
1320 return data;
1321 }
1322
1323 /*
1324 * 12.2.5.65 DOCTYPE system identifier (double-quoted) state
1325 */
1326 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_system_identifier_double_quoted(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1327 lxb_html_tokenizer_state_doctype_system_identifier_double_quoted(lxb_html_tokenizer_t *tkz,
1328 const lxb_char_t *data,
1329 const lxb_char_t *end)
1330 {
1331 if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
1332 lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
1333 }
1334
1335 lxb_html_tokenizer_state_begin_set(tkz, data);
1336
1337 while (data != end) {
1338 switch (*data) {
1339 /* U+0022 QUOTATION MARK (") */
1340 case 0x22:
1341 lxb_html_tokenizer_state_append_data_m(tkz, data);
1342 lxb_html_tokenizer_state_set_value_m(tkz);
1343 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
1344
1345 tkz->state =
1346 lxb_html_tokenizer_state_doctype_after_system_identifier;
1347
1348 return (data + 1);
1349
1350 /* U+003E GREATER-THAN SIGN (>) */
1351 case 0x3E:
1352 tkz->state = lxb_html_tokenizer_state_data_before;
1353
1354 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1355 LXB_HTML_TOKENIZER_ERROR_ABDOSYID);
1356
1357 lxb_html_tokenizer_state_append_data_m(tkz, data);
1358 lxb_html_tokenizer_state_set_value_m(tkz);
1359 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
1360 lxb_html_tokenizer_state_token_done_m(tkz, end);
1361
1362 return (data + 1);
1363
1364 /* U+000D CARRIAGE RETURN (CR) */
1365 case 0x0D:
1366 if (++data >= end) {
1367 lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
1368
1369 tkz->state = lxb_html_tokenizer_state_cr;
1370 tkz->state_return = lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1371
1372 return data;
1373 }
1374
1375 lxb_html_tokenizer_state_append_data_m(tkz, data);
1376 tkz->pos[-1] = 0x0A;
1377
1378 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1379
1380 if (*data != 0x0A) {
1381 lxb_html_tokenizer_state_begin_set(tkz, data);
1382 data--;
1383 }
1384
1385 break;
1386
1387 /*
1388 * U+0000 NULL
1389 * EOF
1390 */
1391 case 0x00:
1392 lxb_html_tokenizer_state_append_data_m(tkz, data);
1393
1394 if (tkz->is_eof) {
1395 lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
1396
1397 if (tkz->token->attr_last->value_begin == NULL) {
1398 lxb_html_tokenizer_state_token_attr_set_value_begin(tkz,
1399 tkz->token->attr_last->value_end);
1400 }
1401
1402 lxb_html_tokenizer_error_add(tkz->parse_errors,
1403 tkz->token->attr_last->value_end,
1404 LXB_HTML_TOKENIZER_ERROR_EOINDO);
1405
1406 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1407
1408 lxb_html_tokenizer_state_set_value_m(tkz);
1409 lxb_html_tokenizer_state_token_done_m(tkz, end);
1410
1411 return end;
1412 }
1413
1414 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1415 lxb_html_tokenizer_state_append_replace_m(tkz);
1416
1417 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1418 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1419
1420 tkz->token->attr_last->type
1421 |= LXB_HTML_TOKEN_ATTR_TYPE_VALUE_NULL;
1422
1423 break;
1424
1425 /* Anything else */
1426 default:
1427 break;
1428 }
1429
1430 data++;
1431 }
1432
1433 lxb_html_tokenizer_state_append_data_m(tkz, data);
1434
1435 return data;
1436 }
1437
1438 /*
1439 * 12.2.5.66 DOCTYPE system identifier (single-quoted) state
1440 */
1441 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_system_identifier_single_quoted(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1442 lxb_html_tokenizer_state_doctype_system_identifier_single_quoted(lxb_html_tokenizer_t *tkz,
1443 const lxb_char_t *data,
1444 const lxb_char_t *end)
1445 {
1446 if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
1447 lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
1448 }
1449
1450 lxb_html_tokenizer_state_begin_set(tkz, data);
1451
1452 while (data != end) {
1453 switch (*data) {
1454 /* U+0027 APOSTROPHE (') */
1455 case 0x27:
1456 lxb_html_tokenizer_state_append_data_m(tkz, data);
1457 lxb_html_tokenizer_state_set_value_m(tkz);
1458 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
1459
1460 tkz->state =
1461 lxb_html_tokenizer_state_doctype_after_system_identifier;
1462
1463 return (data + 1);
1464
1465 /* U+003E GREATER-THAN SIGN (>) */
1466 case 0x3E:
1467 tkz->state = lxb_html_tokenizer_state_data_before;
1468
1469 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1470 LXB_HTML_TOKENIZER_ERROR_ABDOSYID);
1471
1472 lxb_html_tokenizer_state_append_data_m(tkz, data);
1473 lxb_html_tokenizer_state_set_value_m(tkz);
1474 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
1475 lxb_html_tokenizer_state_token_done_m(tkz, end);
1476
1477 return (data + 1);
1478
1479 /* U+000D CARRIAGE RETURN (CR) */
1480 case 0x0D:
1481 if (++data >= end) {
1482 lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
1483
1484 tkz->state = lxb_html_tokenizer_state_cr;
1485 tkz->state_return = lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1486
1487 return data;
1488 }
1489
1490 lxb_html_tokenizer_state_append_data_m(tkz, data);
1491 tkz->pos[-1] = 0x0A;
1492
1493 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1494
1495 if (*data != 0x0A) {
1496 lxb_html_tokenizer_state_begin_set(tkz, data);
1497 data--;
1498 }
1499
1500 break;
1501
1502 /*
1503 * U+0000 NULL
1504 * EOF
1505 */
1506 case 0x00:
1507 lxb_html_tokenizer_state_append_data_m(tkz, data);
1508
1509 if (tkz->is_eof) {
1510 lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
1511
1512 if (tkz->token->attr_last->value_begin == NULL) {
1513 lxb_html_tokenizer_state_token_attr_set_value_begin(tkz,
1514 tkz->token->attr_last->value_end);
1515 }
1516
1517 lxb_html_tokenizer_error_add(tkz->parse_errors,
1518 tkz->token->attr_last->value_end,
1519 LXB_HTML_TOKENIZER_ERROR_EOINDO);
1520
1521 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1522
1523 lxb_html_tokenizer_state_set_value_m(tkz);
1524 lxb_html_tokenizer_state_token_done_m(tkz, end);
1525
1526 return end;
1527 }
1528
1529 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1530 lxb_html_tokenizer_state_append_replace_m(tkz);
1531
1532 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1533 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1534
1535 tkz->token->attr_last->type
1536 |= LXB_HTML_TOKEN_ATTR_TYPE_VALUE_NULL;
1537
1538 break;
1539
1540 /* Anything else */
1541 default:
1542 break;
1543 }
1544
1545 data++;
1546 }
1547
1548 lxb_html_tokenizer_state_append_data_m(tkz, data);
1549
1550 return data;
1551 }
1552
1553 /*
1554 * 12.2.5.67 After DOCTYPE system identifier state
1555 */
1556 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_system_identifier(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1557 lxb_html_tokenizer_state_doctype_after_system_identifier(
1558 lxb_html_tokenizer_t *tkz,
1559 const lxb_char_t *data,
1560 const lxb_char_t *end)
1561 {
1562 switch (*data) {
1563 /*
1564 * U+0009 CHARACTER TABULATION (tab)
1565 * U+000A LINE FEED (LF)
1566 * U+000C FORM FEED (FF)
1567 * U+000D CARRIAGE RETURN (CR)
1568 * U+0020 SPACE
1569 */
1570 case 0x09:
1571 case 0x0A:
1572 case 0x0C:
1573 case 0x0D:
1574 case 0x20:
1575 return (data + 1);
1576
1577 /* U+003E GREATER-THAN SIGN (>) */
1578 case 0x3E:
1579 tkz->state = lxb_html_tokenizer_state_data_before;
1580
1581 lxb_html_tokenizer_state_token_done_m(tkz, end);
1582
1583 return (data + 1);
1584
1585 /* EOF */
1586 case 0x00:
1587 if (tkz->is_eof) {
1588 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1589 LXB_HTML_TOKENIZER_ERROR_EOINDO);
1590
1591 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1592
1593 lxb_html_tokenizer_state_token_done_m(tkz, end);
1594
1595 return end;
1596 }
1597 /* fall through */
1598
1599 default:
1600 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1601 LXB_HTML_TOKENIZER_ERROR_UNCHAFDOSYID);
1602
1603 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1604
1605 return data;
1606 }
1607
1608 return data;
1609 }
1610
1611 /*
1612 * 12.2.5.68 Bogus DOCTYPE state
1613 */
1614 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_bogus(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1615 lxb_html_tokenizer_state_doctype_bogus(lxb_html_tokenizer_t *tkz,
1616 const lxb_char_t *data,
1617 const lxb_char_t *end)
1618 {
1619 while (data != end) {
1620 switch (*data) {
1621 /* U+003E GREATER-THAN SIGN (>) */
1622 case 0x3E:
1623 tkz->state = lxb_html_tokenizer_state_data_before;
1624
1625 lxb_html_tokenizer_state_token_done_m(tkz, end);
1626
1627 return (data + 1);
1628
1629 /*
1630 * U+0000 NULL
1631 * EOF
1632 */
1633 case 0x00:
1634 if (tkz->is_eof) {
1635 lxb_html_tokenizer_state_token_done_m(tkz, end);
1636
1637 return end;
1638 }
1639
1640 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1641 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1642
1643 break;
1644
1645 /* Anything else */
1646 default:
1647 break;
1648 }
1649
1650 data++;
1651 }
1652
1653 return data;
1654 }
1655