1 /*
2  * Copyright (C) 2018-2020 Alexander Borisov
3  *
4  * Author: Alexander Borisov <borisov@lexbor.com>
5  */
6 
7 #include "lexbor/html/tokenizer/state_doctype.h"
8 #include "lexbor/html/tokenizer/state.h"
9 
10 
11 #define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
12 #include "lexbor/core/str_res.h"
13 
14 
/*
 * Prototype declared locally in this file instead of being pulled in through
 * a header.
 *
 * NOTE(review): the function is not referenced in the part of the file
 * reviewed here -- confirm it is still needed before keeping the declaration.
 */
lxb_dom_attr_data_t *
lxb_dom_attr_local_name_append(lexbor_hash_t *hash,
                               const lxb_char_t *name, size_t length);
18 
19 
/*
 * Forward declarations of the file-local DOCTYPE tokenizer state handlers.
 *
 * All handlers share one signature: they receive the tokenizer, a pointer to
 * the current input position (data) and the end of the current input buffer
 * (end), and return a pointer of the same type. The handlers switch between
 * each other by assigning to tkz->state, which is why they must be declared
 * before the first definition.
 */
static const lxb_char_t *
lxb_html_tokenizer_state_doctype(lxb_html_tokenizer_t *tkz,
                                 const lxb_char_t *data,
                                 const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_before_name(lxb_html_tokenizer_t *tkz,
                                             const lxb_char_t *data,
                                             const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_name(lxb_html_tokenizer_t *tkz,
                                      const lxb_char_t *data,
                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_name(lxb_html_tokenizer_t *tkz,
                                            const lxb_char_t *data,
                                            const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_name_public(lxb_html_tokenizer_t *tkz,
                                                   const lxb_char_t *data,
                                                   const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_name_system(lxb_html_tokenizer_t *tkz,
                                                   const lxb_char_t *data,
                                                   const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_public_keyword(lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_before_public_identifier(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_public_identifier_double_quoted(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_public_identifier_single_quoted(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_public_identifier(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_between_public_and_system_identifiers(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_system_keyword(lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_before_system_identifier(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_system_identifier_double_quoted(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_system_identifier_single_quoted(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_system_identifier(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_doctype_bogus(lxb_html_tokenizer_t *tkz,
                                       const lxb_char_t *data,
                                       const lxb_char_t *end);
118 
119 
120 /*
121  * Helper function. No in the specification. For 12.2.5.53
122  */
123 const lxb_char_t *
lxb_html_tokenizer_state_doctype_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)124 lxb_html_tokenizer_state_doctype_before(lxb_html_tokenizer_t *tkz,
125                                         const lxb_char_t *data,
126                                         const lxb_char_t *end)
127 {
128     if (tkz->is_eof == false) {
129         lxb_html_tokenizer_state_token_set_end(tkz, data);
130     }
131     else {
132         lxb_html_tokenizer_state_token_set_end_oef(tkz);
133     }
134 
135     tkz->token->tag_id = LXB_TAG__EM_DOCTYPE;
136 
137     return lxb_html_tokenizer_state_doctype(tkz, data, end);
138 }
139 
140 /*
141  * 12.2.5.53 DOCTYPE state
142  */
143 static const lxb_char_t *
lxb_html_tokenizer_state_doctype(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)144 lxb_html_tokenizer_state_doctype(lxb_html_tokenizer_t *tkz,
145                                  const lxb_char_t *data,
146                                  const lxb_char_t *end)
147 {
148     switch (*data) {
149         /*
150          * U+0009 CHARACTER TABULATION (tab)
151          * U+000A LINE FEED (LF)
152          * U+000C FORM FEED (FF)
153          * U+000D CARRIAGE RETURN (CR)
154          * U+0020 SPACE
155          */
156         case 0x09:
157         case 0x0A:
158         case 0x0C:
159         case 0x0D:
160         case 0x20:
161             data++;
162             break;
163 
164         /* U+003E GREATER-THAN SIGN (>) */
165         case 0x3E:
166             break;
167 
168         /* EOF */
169         case 0x00:
170             if (tkz->is_eof) {
171                 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
172                                              LXB_HTML_TOKENIZER_ERROR_EOINDO);
173 
174                 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
175 
176                 lxb_html_tokenizer_state_token_done_m(tkz, end);
177 
178                 return end;
179             }
180             /* fall through */
181 
182         default:
183             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
184                                          LXB_HTML_TOKENIZER_ERROR_MIWHBEDONA);
185             break;
186     }
187 
188     tkz->state = lxb_html_tokenizer_state_doctype_before_name;
189 
190     return data;
191 }
192 
193 /*
194  * 12.2.5.54 Before DOCTYPE name state
195  */
196 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_before_name(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)197 lxb_html_tokenizer_state_doctype_before_name(lxb_html_tokenizer_t *tkz,
198                                              const lxb_char_t *data,
199                                              const lxb_char_t *end)
200 {
201     lxb_html_token_attr_t *attr;
202 
203     while (data != end) {
204         switch (*data) {
205             /*
206              * U+0009 CHARACTER TABULATION (tab)
207              * U+000A LINE FEED (LF)
208              * U+000C FORM FEED (FF)
209              * U+000D CARRIAGE RETURN (CR)
210              * U+0020 SPACE
211              */
212             case 0x09:
213             case 0x0A:
214             case 0x0C:
215             case 0x0D:
216             case 0x20:
217                 break;
218 
219             /*
220              * U+0000 NULL
221              * EOF
222              */
223             case 0x00:
224                 if (tkz->is_eof) {
225                     lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
226                                                LXB_HTML_TOKENIZER_ERROR_EOINDO);
227 
228                     tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
229 
230                     lxb_html_tokenizer_state_token_done_m(tkz, end);
231 
232                     return end;
233                 }
234 
235                 lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
236                 lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);
237                 lxb_html_tokenizer_state_append_replace_m(tkz);
238 
239                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
240                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
241 
242                 tkz->token->attr_last->type
243                     |= LXB_HTML_TOKEN_ATTR_TYPE_NAME_NULL;
244 
245                 tkz->state = lxb_html_tokenizer_state_doctype_name;
246 
247                 return (data + 1);
248 
249             /* U+003E GREATER-THAN SIGN (>) */
250             case 0x3E:
251                 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
252                 tkz->state = lxb_html_tokenizer_state_data_before;
253 
254                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
255                                              LXB_HTML_TOKENIZER_ERROR_MIDONA);
256 
257                 lxb_html_tokenizer_state_token_done_m(tkz, end);
258 
259                 return (data + 1);
260 
261             /*
262              * ASCII upper alpha
263              * Anything else
264              */
265             default:
266                 lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
267                 lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);
268 
269                 tkz->state = lxb_html_tokenizer_state_doctype_name;
270 
271                 return data;
272         }
273 
274         data++;
275     }
276 
277     return data;
278 }
279 
280 /*
281  * 12.2.5.55 DOCTYPE name state
282  */
283 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_name(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)284 lxb_html_tokenizer_state_doctype_name(lxb_html_tokenizer_t *tkz,
285                                       const lxb_char_t *data,
286                                       const lxb_char_t *end)
287 {
288     lxb_html_tokenizer_state_begin_set(tkz, data);
289 
290     while (data != end) {
291         switch (*data) {
292             /*
293              * U+0009 CHARACTER TABULATION (tab)
294              * U+000A LINE FEED (LF)
295              * U+000C FORM FEED (FF)
296              * U+000D CARRIAGE RETURN (CR)
297              * U+0020 SPACE
298              */
299             case 0x09:
300             case 0x0A:
301             case 0x0C:
302             case 0x0D:
303             case 0x20:
304                 lxb_html_tokenizer_state_append_data_m(tkz, data);
305                 lxb_html_tokenizer_state_set_name_m(tkz);
306                 lxb_html_tokenizer_state_token_attr_set_name_end(tkz, data);
307 
308                 tkz->state = lxb_html_tokenizer_state_doctype_after_name;
309 
310                 return (data + 1);
311 
312             /* U+003E GREATER-THAN SIGN (>) */
313             case 0x3E:
314                 tkz->state = lxb_html_tokenizer_state_data_before;
315 
316                 lxb_html_tokenizer_state_append_data_m(tkz, data);
317                 lxb_html_tokenizer_state_set_name_m(tkz);
318                 lxb_html_tokenizer_state_token_attr_set_name_end(tkz, data);
319                 lxb_html_tokenizer_state_token_done_m(tkz, end);
320 
321                 return (data + 1);
322 
323             /*
324              * U+0000 NULL
325              * EOF
326              */
327             case 0x00:
328                 lxb_html_tokenizer_state_append_data_m(tkz, data);
329 
330                 if (tkz->is_eof) {
331                     lxb_html_tokenizer_state_token_attr_set_name_end_oef(tkz);
332 
333                     lxb_html_tokenizer_error_add(tkz->parse_errors,
334                                                tkz->token->attr_last->name_end,
335                                                LXB_HTML_TOKENIZER_ERROR_EOINDO);
336 
337                     tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
338 
339                     lxb_html_tokenizer_state_set_name_m(tkz);
340                     lxb_html_tokenizer_state_token_done_m(tkz, end);
341 
342                     return end;
343                 }
344 
345                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
346                 lxb_html_tokenizer_state_append_replace_m(tkz);
347 
348                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
349                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
350 
351                 tkz->token->attr_last->type
352                     |= LXB_HTML_TOKEN_ATTR_TYPE_NAME_NULL;
353 
354                 break;
355 
356             /* Anything else */
357             default:
358                 break;
359         }
360 
361         data++;
362     }
363 
364     lxb_html_tokenizer_state_append_data_m(tkz, data);
365 
366     return data;
367 }
368 
369 /*
370  * 12.2.5.56 After DOCTYPE name state
371  */
372 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_name(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)373 lxb_html_tokenizer_state_doctype_after_name(lxb_html_tokenizer_t *tkz,
374                                             const lxb_char_t *data,
375                                             const lxb_char_t *end)
376 {
377     lxb_html_token_attr_t *attr;
378     const lxb_dom_attr_data_t *attr_data;
379 
380     while (data != end) {
381         switch (*data) {
382             /*
383              * U+0009 CHARACTER TABULATION (tab)
384              * U+000A LINE FEED (LF)
385              * U+000C FORM FEED (FF)
386              * U+000D CARRIAGE RETURN (CR)
387              * U+0020 SPACE
388              */
389             case 0x09:
390             case 0x0A:
391             case 0x0C:
392             case 0x0D:
393             case 0x20:
394                 break;
395 
396             /* U+003E GREATER-THAN SIGN (>) */
397             case 0x3E:
398                 tkz->state = lxb_html_tokenizer_state_data_before;
399 
400                 lxb_html_tokenizer_state_token_done_m(tkz, end);
401 
402                 return (data + 1);
403 
404             /* EOF */
405             case 0x00:
406                 if (tkz->is_eof) {
407                     lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
408                                                LXB_HTML_TOKENIZER_ERROR_EOINDO);
409 
410                     tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
411 
412                     lxb_html_tokenizer_state_token_done_m(tkz, end);
413 
414                     return end;
415                 }
416                 /* fall through */
417 
418             /* Anything else */
419             default:
420                 lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
421                 lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);
422 
423                 if ((data + 6) > end) {
424                     /*
425                      * ASCII case-insensitive match for the word "PUBLIC"
426                      * U+0044 character (P) or U+0050 character (p)
427                      */
428                     if (*data == 0x50 || *data == 0x70) {
429                         tkz->markup = (lxb_char_t *) "PUBLIC";
430 
431                         tkz->state =
432                             lxb_html_tokenizer_state_doctype_after_name_public;
433 
434                         return data;
435                     }
436 
437                     /*
438                      * ASCII case-insensitive match for the word "SYSTEM"
439                      * U+0044 character (S) or U+0053 character (s)
440                      */
441                     if (*data == 0x53 || *data == 0x73) {
442                         tkz->markup = (lxb_char_t *) "SYSTEM";
443 
444                         tkz->state =
445                             lxb_html_tokenizer_state_doctype_after_name_system;
446 
447                         return data;
448                     }
449                 }
450                 else if (lexbor_str_data_ncasecmp((lxb_char_t *) "PUBLIC",
451                                                   data, 6))
452                 {
453                     lxb_html_tokenizer_state_token_attr_set_name_end(tkz,
454                                                                     (data + 6));
455 
456                     attr_data = lxb_dom_attr_data_by_id(tkz->attrs,
457                                                         LXB_DOM_ATTR_PUBLIC);
458                     if (attr_data == NULL) {
459                         tkz->status = LXB_STATUS_ERROR;
460                         return end;
461                     }
462 
463                     tkz->token->attr_last->name = attr_data;
464 
465                     tkz->state =
466                         lxb_html_tokenizer_state_doctype_after_public_keyword;
467 
468                     return (data + 6);
469                 }
470                 else if (lexbor_str_data_ncasecmp((lxb_char_t *) "SYSTEM",
471                                                   data, 6))
472                 {
473                     lxb_html_tokenizer_state_token_attr_set_name_end(tkz,
474                                                                     (data + 6));
475 
476                     attr_data = lxb_dom_attr_data_by_id(tkz->attrs,
477                                                         LXB_DOM_ATTR_SYSTEM);
478                     if (attr_data == NULL) {
479                         tkz->status = LXB_STATUS_ERROR;
480                         return end;
481                     }
482 
483                     tkz->token->attr_last->name = attr_data;
484 
485                     tkz->state =
486                         lxb_html_tokenizer_state_doctype_after_system_keyword;
487 
488                     return (data + 6);
489                 }
490 
491                 lxb_html_token_attr_delete(tkz->token, attr,
492                                            tkz->dobj_token_attr);
493 
494                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
495                                          LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA);
496 
497                 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
498                 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
499 
500                 return data;
501         }
502 
503         data++;
504     }
505 
506     return data;
507 }
508 
509 /*
510  * Helper function. No in the specification. For 12.2.5.56
511  * For doctype PUBLIC
512  */
513 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_name_public(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)514 lxb_html_tokenizer_state_doctype_after_name_public(lxb_html_tokenizer_t *tkz,
515                                                    const lxb_char_t *data,
516                                                    const lxb_char_t *end)
517 {
518     const lxb_char_t *pos;
519     const lxb_dom_attr_data_t *attr_data;
520 
521     pos = lexbor_str_data_ncasecmp_first(tkz->markup, data, (end - data));
522 
523     if (pos == NULL) {
524         lxb_html_token_attr_delete(tkz->token, tkz->token->attr_last,
525                                    tkz->dobj_token_attr);
526 
527         lxb_html_tokenizer_error_add(tkz->parse_errors, data,
528                                      LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA);
529 
530         tkz->state = lxb_html_tokenizer_state_doctype_bogus;
531 
532         return data;
533     }
534 
535     if (*pos == '\0') {
536         pos = data + (pos - tkz->markup);
537 
538         lxb_html_tokenizer_state_token_attr_set_name_end(tkz, pos);
539 
540         attr_data = lxb_dom_attr_data_by_id(tkz->attrs,
541                                             LXB_DOM_ATTR_PUBLIC);
542         if (attr_data == NULL) {
543             tkz->status = LXB_STATUS_ERROR;
544             return end;
545         }
546 
547         tkz->token->attr_last->name = attr_data;
548 
549         tkz->state = lxb_html_tokenizer_state_doctype_after_public_keyword;
550 
551         return (pos + 1);
552     }
553 
554     tkz->markup = pos;
555 
556     return end;
557 }
558 
559 /*
560  * Helper function. No in the specification. For 12.2.5.56
561  * For doctype SYSTEM
562  */
563 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_name_system(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)564 lxb_html_tokenizer_state_doctype_after_name_system(lxb_html_tokenizer_t *tkz,
565                                                    const lxb_char_t *data,
566                                                    const lxb_char_t *end)
567 {
568     const lxb_char_t *pos;
569     const lxb_dom_attr_data_t *attr_data;
570 
571     pos = lexbor_str_data_ncasecmp_first(tkz->markup, data, (end - data));
572 
573     if (pos == NULL) {
574         lxb_html_token_attr_delete(tkz->token, tkz->token->attr_last,
575                                    tkz->dobj_token_attr);
576 
577         lxb_html_tokenizer_error_add(tkz->parse_errors, data,
578                                      LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA);
579 
580         tkz->state = lxb_html_tokenizer_state_doctype_bogus;
581 
582         return data;
583     }
584 
585     if (*pos == '\0') {
586         pos = data + (pos - tkz->markup);
587 
588         lxb_html_tokenizer_state_token_attr_set_name_end(tkz, pos);
589 
590         attr_data = lxb_dom_attr_data_by_id(tkz->attrs,
591                                             LXB_DOM_ATTR_SYSTEM);
592         if (attr_data == NULL) {
593             tkz->status = LXB_STATUS_ERROR;
594             return end;
595         }
596 
597         tkz->token->attr_last->name = attr_data;
598 
599         tkz->state = lxb_html_tokenizer_state_doctype_after_system_keyword;
600 
601         return (pos + 1);
602     }
603 
604     tkz->markup = pos;
605 
606     return end;
607 }
608 
609 /*
610  * 12.2.5.57 After DOCTYPE public keyword state
611  */
612 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_public_keyword(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)613 lxb_html_tokenizer_state_doctype_after_public_keyword(lxb_html_tokenizer_t *tkz,
614                                                       const lxb_char_t *data,
615                                                       const lxb_char_t *end)
616 {
617     switch (*data) {
618         /*
619          * U+0009 CHARACTER TABULATION (tab)
620          * U+000A LINE FEED (LF)
621          * U+000C FORM FEED (FF)
622          * U+000D CARRIAGE RETURN (CR)
623          * U+0020 SPACE
624          */
625         case 0x09:
626         case 0x0A:
627         case 0x0C:
628         case 0x0D:
629         case 0x20:
630             tkz->state =
631                 lxb_html_tokenizer_state_doctype_before_public_identifier;
632 
633             return (data + 1);
634 
635         /* U+0022 QUOTATION MARK (") */
636         case 0x22:
637             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
638                                          LXB_HTML_TOKENIZER_ERROR_MIWHAFDOPUKE);
639 
640             tkz->state =
641                lxb_html_tokenizer_state_doctype_public_identifier_double_quoted;
642 
643             return (data + 1);
644 
645         /* U+0027 APOSTROPHE (') */
646         case 0x27:
647             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
648                                          LXB_HTML_TOKENIZER_ERROR_MIWHAFDOPUKE);
649 
650             tkz->state =
651                lxb_html_tokenizer_state_doctype_public_identifier_single_quoted;
652 
653             return (data + 1);
654 
655         /* U+003E GREATER-THAN SIGN (>) */
656         case 0x3E:
657             tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
658             tkz->state = lxb_html_tokenizer_state_data_before;
659 
660             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
661                                          LXB_HTML_TOKENIZER_ERROR_MIDOPUID);
662 
663             lxb_html_tokenizer_state_token_done_m(tkz, end);
664 
665             return (data + 1);
666 
667         /* EOF */
668         case 0x00:
669             if (tkz->is_eof) {
670                 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
671 
672                 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
673                                              LXB_HTML_TOKENIZER_ERROR_EOINDO);
674 
675                 lxb_html_tokenizer_state_token_done_m(tkz, end);
676 
677                 return end;
678             }
679             /* fall through */
680 
681         default:
682             tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
683             tkz->state = lxb_html_tokenizer_state_doctype_bogus;
684 
685             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
686                                          LXB_HTML_TOKENIZER_ERROR_MIQUBEDOPUID);
687 
688             return data;
689     }
690 
691     return data;
692 }
693 
694 /*
695  * 12.2.5.58 Before DOCTYPE public identifier state
696  */
697 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_before_public_identifier(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)698 lxb_html_tokenizer_state_doctype_before_public_identifier(lxb_html_tokenizer_t *tkz,
699                                                           const lxb_char_t *data,
700                                                           const lxb_char_t *end)
701 {
702     switch (*data) {
703         /*
704          * U+0009 CHARACTER TABULATION (tab)
705          * U+000A LINE FEED (LF)
706          * U+000C FORM FEED (FF)
707          * U+000D CARRIAGE RETURN (CR)
708          * U+0020 SPACE
709          */
710         case 0x09:
711         case 0x0A:
712         case 0x0C:
713         case 0x0D:
714         case 0x20:
715             break;
716 
717         /* U+0022 QUOTATION MARK (") */
718         case 0x22:
719             tkz->state =
720                lxb_html_tokenizer_state_doctype_public_identifier_double_quoted;
721 
722             break;
723 
724         /* U+0027 APOSTROPHE (') */
725         case 0x27:
726             tkz->state =
727                lxb_html_tokenizer_state_doctype_public_identifier_single_quoted;
728 
729             break;
730 
731         /* U+003E GREATER-THAN SIGN (>) */
732         case 0x3E:
733             tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
734             tkz->state = lxb_html_tokenizer_state_data_before;
735 
736             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
737                                          LXB_HTML_TOKENIZER_ERROR_MIDOPUID);
738 
739             lxb_html_tokenizer_state_token_done_m(tkz, end);
740 
741             break;
742 
743         /* EOF */
744         case 0x00:
745             if (tkz->is_eof) {
746                 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
747                                              LXB_HTML_TOKENIZER_ERROR_EOINDO);
748 
749                 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
750 
751                 lxb_html_tokenizer_state_token_done_m(tkz, end);
752 
753                 return end;
754             }
755             /* fall through */
756 
757         default:
758             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
759                                          LXB_HTML_TOKENIZER_ERROR_MIQUBEDOPUID);
760 
761             tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
762             tkz->state = lxb_html_tokenizer_state_doctype_bogus;
763 
764             return data;
765     }
766 
767     return (data + 1);
768 }
769 
770 /*
771  * 12.2.5.59 DOCTYPE public identifier (double-quoted) state
772  */
773 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_public_identifier_double_quoted(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)774 lxb_html_tokenizer_state_doctype_public_identifier_double_quoted(lxb_html_tokenizer_t *tkz,
775                                                                  const lxb_char_t *data,
776                                                                  const lxb_char_t *end)
777 {
778     if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
779         lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
780     }
781 
782     lxb_html_tokenizer_state_begin_set(tkz, data);
783 
784     while (data != end) {
785         switch (*data) {
786             /* U+0022 QUOTATION MARK (") */
787             case 0x22:
788                 lxb_html_tokenizer_state_append_data_m(tkz, data);
789                 lxb_html_tokenizer_state_set_value_m(tkz);
790                 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
791 
792                 tkz->state =
793                     lxb_html_tokenizer_state_doctype_after_public_identifier;
794 
795                 return (data + 1);
796 
797             /* U+003E GREATER-THAN SIGN (>) */
798             case 0x3E:
799                 tkz->state = lxb_html_tokenizer_state_data_before;
800 
801                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
802                                              LXB_HTML_TOKENIZER_ERROR_ABDOPUID);
803 
804                 lxb_html_tokenizer_state_append_data_m(tkz, data);
805                 lxb_html_tokenizer_state_set_value_m(tkz);
806                 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
807                 lxb_html_tokenizer_state_token_done_m(tkz, end);
808 
809                 return (data + 1);
810 
811             /* U+000D CARRIAGE RETURN (CR) */
812             case 0x0D:
813                 if (++data >= end) {
814                     lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
815 
816                     tkz->state = lxb_html_tokenizer_state_cr;
817                     tkz->state_return = lxb_html_tokenizer_state_doctype_public_identifier_double_quoted;
818 
819                     return data;
820                 }
821 
822                 lxb_html_tokenizer_state_append_data_m(tkz, data);
823                 tkz->pos[-1] = 0x0A;
824 
825                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
826 
827                 if (*data != 0x0A) {
828                     lxb_html_tokenizer_state_begin_set(tkz, data);
829                     data--;
830                 }
831 
832                 break;
833 
834             /*
835              * U+0000 NULL
836              * EOF
837              */
838             case 0x00:
839                 lxb_html_tokenizer_state_append_data_m(tkz, data);
840 
841                 if (tkz->is_eof) {
842                     lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
843 
844                     if (tkz->token->attr_last->value_begin == NULL) {
845                         lxb_html_tokenizer_state_token_attr_set_value_begin(tkz,
846                                             tkz->token->attr_last->value_end);
847                     }
848 
849                     lxb_html_tokenizer_error_add(tkz->parse_errors,
850                                                tkz->token->attr_last->value_end,
851                                                LXB_HTML_TOKENIZER_ERROR_EOINDO);
852 
853                     tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
854 
855                     lxb_html_tokenizer_state_set_value_m(tkz);
856                     lxb_html_tokenizer_state_token_done_m(tkz, end);
857 
858                     return end;
859                 }
860 
861                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
862                 lxb_html_tokenizer_state_append_replace_m(tkz);
863 
864                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
865                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
866 
867                 tkz->token->attr_last->type
868                     |= LXB_HTML_TOKEN_ATTR_TYPE_VALUE_NULL;
869 
870                 break;
871 
872             /* Anything else */
873             default:
874                 break;
875         }
876 
877         data++;
878     }
879 
880     lxb_html_tokenizer_state_append_data_m(tkz, data);
881 
882     return data;
883 }
884 
885 /*
886  * 12.2.5.60 DOCTYPE public identifier (single-quoted) state
887  */
888 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_public_identifier_single_quoted(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)889 lxb_html_tokenizer_state_doctype_public_identifier_single_quoted(lxb_html_tokenizer_t *tkz,
890                                                                  const lxb_char_t *data,
891                                                                  const lxb_char_t *end)
892 {
893     if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
894         lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
895     }
896 
897     lxb_html_tokenizer_state_begin_set(tkz, data);
898 
899     while (data != end) {
900         switch (*data) {
901             /* U+0027 APOSTROPHE (') */
902             case 0x27:
903                 lxb_html_tokenizer_state_append_data_m(tkz, data);
904                 lxb_html_tokenizer_state_set_value_m(tkz);
905                 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
906 
907                 tkz->state =
908                     lxb_html_tokenizer_state_doctype_after_public_identifier;
909 
910                 return (data + 1);
911 
912             /* U+003E GREATER-THAN SIGN (>) */
913             case 0x3E:
914                 tkz->state = lxb_html_tokenizer_state_data_before;
915 
916                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
917                                              LXB_HTML_TOKENIZER_ERROR_ABDOPUID);
918 
919                 lxb_html_tokenizer_state_append_data_m(tkz, data);
920                 lxb_html_tokenizer_state_set_value_m(tkz);
921                 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
922                 lxb_html_tokenizer_state_token_done_m(tkz, end);
923 
924                 return (data + 1);
925 
926             /* U+000D CARRIAGE RETURN (CR) */
927             case 0x0D:
928                 if (++data >= end) {
929                     lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
930 
931                     tkz->state = lxb_html_tokenizer_state_cr;
932                     tkz->state_return = lxb_html_tokenizer_state_doctype_public_identifier_single_quoted;
933 
934                     return data;
935                 }
936 
937                 lxb_html_tokenizer_state_append_data_m(tkz, data);
938                 tkz->pos[-1] = 0x0A;
939 
940                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
941 
942                 if (*data != 0x0A) {
943                     lxb_html_tokenizer_state_begin_set(tkz, data);
944                     data--;
945                 }
946 
947                 break;
948 
949             /*
950              * U+0000 NULL
951              * EOF
952              */
953             case 0x00:
954                 lxb_html_tokenizer_state_append_data_m(tkz, data);
955 
956                 if (tkz->is_eof) {
957                     lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
958 
959                     if (tkz->token->attr_last->value_begin == NULL) {
960                         lxb_html_tokenizer_state_token_attr_set_value_begin(tkz,
961                                               tkz->token->attr_last->value_end);
962                     }
963 
964                     lxb_html_tokenizer_error_add(tkz->parse_errors,
965                                                tkz->token->attr_last->value_end,
966                                                LXB_HTML_TOKENIZER_ERROR_EOINDO);
967 
968                     tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
969 
970                     lxb_html_tokenizer_state_set_value_m(tkz);
971                     lxb_html_tokenizer_state_token_done_m(tkz, end);
972 
973                     return end;
974                 }
975 
976                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
977                 lxb_html_tokenizer_state_append_replace_m(tkz);
978 
979                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
980                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
981 
982                 tkz->token->attr_last->type
983                     |= LXB_HTML_TOKEN_ATTR_TYPE_VALUE_NULL;
984 
985                 break;
986 
987             /* Anything else */
988             default:
989                 break;
990         }
991 
992         data++;
993     }
994 
995     lxb_html_tokenizer_state_append_data_m(tkz, data);
996 
997     return data;
998 }
999 
1000 /*
1001  * 12.2.5.61 After DOCTYPE public identifier state
1002  */
1003 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_public_identifier(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1004 lxb_html_tokenizer_state_doctype_after_public_identifier(lxb_html_tokenizer_t *tkz,
1005                                                          const lxb_char_t *data,
1006                                                          const lxb_char_t *end)
1007 {
1008     lxb_html_token_attr_t *attr;
1009 
1010     switch (*data) {
1011         /*
1012          * U+0009 CHARACTER TABULATION (tab)
1013          * U+000A LINE FEED (LF)
1014          * U+000C FORM FEED (FF)
1015          * U+000D CARRIAGE RETURN (CR)
1016          * U+0020 SPACE
1017          */
1018         case 0x09:
1019         case 0x0A:
1020         case 0x0C:
1021         case 0x0D:
1022         case 0x20:
1023             tkz->state =
1024          lxb_html_tokenizer_state_doctype_between_public_and_system_identifiers;
1025 
1026             return (data + 1);
1027 
1028         /* U+003E GREATER-THAN SIGN (>) */
1029         case 0x3E:
1030             tkz->state = lxb_html_tokenizer_state_data_before;
1031 
1032             lxb_html_tokenizer_state_token_done_m(tkz, end);
1033 
1034             return (data + 1);
1035 
1036         /* U+0022 QUOTATION MARK (") */
1037         case 0x22:
1038             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1039                                      LXB_HTML_TOKENIZER_ERROR_MIWHBEDOPUANSYID);
1040 
1041             lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
1042 
1043             tkz->state =
1044                lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1045 
1046             return (data + 1);
1047 
1048         /* U+0027 APOSTROPHE (') */
1049         case 0x27:
1050             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1051                                      LXB_HTML_TOKENIZER_ERROR_MIWHBEDOPUANSYID);
1052 
1053             lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
1054 
1055             tkz->state =
1056                lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1057 
1058             return (data + 1);
1059 
1060         /* EOF */
1061         case 0x00:
1062             if (tkz->is_eof) {
1063                 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1064                                              LXB_HTML_TOKENIZER_ERROR_EOINDO);
1065 
1066                 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1067                 lxb_html_tokenizer_state_token_done_m(tkz, end);
1068 
1069                 return end;
1070             }
1071             /* fall through */
1072 
1073         default:
1074             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1075                                          LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID);
1076 
1077             tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1078             tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1079 
1080             return data;
1081     }
1082 
1083     return data;
1084 }
1085 
1086 /*
1087  * 12.2.5.62 Between DOCTYPE public and system identifiers state
1088  */
1089 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_between_public_and_system_identifiers(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1090 lxb_html_tokenizer_state_doctype_between_public_and_system_identifiers(lxb_html_tokenizer_t *tkz,
1091                                                                        const lxb_char_t *data,
1092                                                                        const lxb_char_t *end)
1093 {
1094     lxb_html_token_attr_t *attr;
1095 
1096     switch (*data) {
1097         /*
1098          * U+0009 CHARACTER TABULATION (tab)
1099          * U+000A LINE FEED (LF)
1100          * U+000C FORM FEED (FF)
1101          * U+000D CARRIAGE RETURN (CR)
1102          * U+0020 SPACE
1103          */
1104         case 0x09:
1105         case 0x0A:
1106         case 0x0C:
1107         case 0x0D:
1108         case 0x20:
1109             return (data + 1);
1110 
1111         /* U+003E GREATER-THAN SIGN (>) */
1112         case 0x3E:
1113             tkz->state = lxb_html_tokenizer_state_data_before;
1114 
1115             lxb_html_tokenizer_state_token_done_m(tkz, end);
1116 
1117             return (data + 1);
1118 
1119         /* U+0022 QUOTATION MARK (") */
1120         case 0x22:
1121             lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
1122 
1123             tkz->state =
1124                lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1125 
1126             return (data + 1);
1127 
1128         /* U+0027 APOSTROPHE (') */
1129         case 0x27:
1130             lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
1131 
1132             tkz->state =
1133                lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1134 
1135             return (data + 1);
1136 
1137         /* EOF */
1138         case 0x00:
1139             if (tkz->is_eof) {
1140                 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1141                                              LXB_HTML_TOKENIZER_ERROR_EOINDO);
1142 
1143                 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1144                 lxb_html_tokenizer_state_token_done_m(tkz, end);
1145 
1146                 return end;
1147             }
1148             /* fall through */
1149 
1150         default:
1151             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1152                                          LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID);
1153 
1154             tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1155             tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1156 
1157             return data;
1158     }
1159 
1160     return data;
1161 }
1162 
1163 /*
1164  * 12.2.5.63 After DOCTYPE system keyword state
1165  */
1166 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_system_keyword(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1167 lxb_html_tokenizer_state_doctype_after_system_keyword(lxb_html_tokenizer_t *tkz,
1168                                                       const lxb_char_t *data,
1169                                                       const lxb_char_t *end)
1170 {
1171     switch (*data) {
1172         /*
1173          * U+0009 CHARACTER TABULATION (tab)
1174          * U+000A LINE FEED (LF)
1175          * U+000C FORM FEED (FF)
1176          * U+000D CARRIAGE RETURN (CR)
1177          * U+0020 SPACE
1178          */
1179         case 0x09:
1180         case 0x0A:
1181         case 0x0C:
1182         case 0x0D:
1183         case 0x20:
1184             tkz->state =
1185                 lxb_html_tokenizer_state_doctype_before_system_identifier;
1186 
1187             return (data + 1);
1188 
1189         /* U+0022 QUOTATION MARK (") */
1190         case 0x22:
1191             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1192                                          LXB_HTML_TOKENIZER_ERROR_MIWHAFDOSYKE);
1193 
1194             tkz->state =
1195                lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1196 
1197             return (data + 1);
1198 
1199         /* U+0027 APOSTROPHE (') */
1200         case 0x27:
1201             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1202                                          LXB_HTML_TOKENIZER_ERROR_MIWHAFDOSYKE);
1203 
1204             tkz->state =
1205                lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1206 
1207             return (data + 1);
1208 
1209         /* U+003E GREATER-THAN SIGN (>) */
1210         case 0x3E:
1211             tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1212             tkz->state = lxb_html_tokenizer_state_data_before;
1213 
1214             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1215                                          LXB_HTML_TOKENIZER_ERROR_MIDOSYID);
1216 
1217             lxb_html_tokenizer_state_token_done_m(tkz, end);
1218 
1219             return (data + 1);
1220 
1221         /* EOF */
1222         case 0x00:
1223             if (tkz->is_eof) {
1224                 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1225                                              LXB_HTML_TOKENIZER_ERROR_EOINDO);
1226 
1227                 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1228                 lxb_html_tokenizer_state_token_done_m(tkz, end);
1229 
1230                 return end;
1231             }
1232             /* fall through */
1233 
1234         default:
1235             tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1236             tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1237 
1238             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1239                                          LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID);
1240 
1241             return data;
1242     }
1243 
1244     return data;
1245 }
1246 
1247 /*
1248  * 12.2.5.64 Before DOCTYPE system identifier state
1249  */
1250 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_before_system_identifier(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1251 lxb_html_tokenizer_state_doctype_before_system_identifier(lxb_html_tokenizer_t *tkz,
1252                                                           const lxb_char_t *data,
1253                                                           const lxb_char_t *end)
1254 {
1255     switch (*data) {
1256         /*
1257          * U+0009 CHARACTER TABULATION (tab)
1258          * U+000A LINE FEED (LF)
1259          * U+000C FORM FEED (FF)
1260          * U+000D CARRIAGE RETURN (CR)
1261          * U+0020 SPACE
1262          */
1263         case 0x09:
1264         case 0x0A:
1265         case 0x0C:
1266         case 0x0D:
1267         case 0x20:
1268             return (data + 1);
1269 
1270         /* U+0022 QUOTATION MARK (") */
1271         case 0x22:
1272             tkz->state =
1273                lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1274 
1275             return (data + 1);
1276 
1277         /* U+0027 APOSTROPHE (') */
1278         case 0x27:
1279             tkz->state =
1280                lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1281 
1282             return (data + 1);
1283 
1284         /* U+003E GREATER-THAN SIGN (>) */
1285         case 0x3E:
1286             tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1287             tkz->state = lxb_html_tokenizer_state_data_before;
1288 
1289             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1290                                          LXB_HTML_TOKENIZER_ERROR_MIDOSYID);
1291 
1292             lxb_html_tokenizer_state_token_done_m(tkz, end);
1293 
1294             return (data + 1);
1295 
1296         /* EOF */
1297         case 0x00:
1298             if (tkz->is_eof) {
1299                 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1300                                              LXB_HTML_TOKENIZER_ERROR_EOINDO);
1301 
1302                 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1303 
1304                 lxb_html_tokenizer_state_token_done_m(tkz, end);
1305 
1306                 return end;
1307             }
1308             /* fall through */
1309 
1310         default:
1311             tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1312             tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1313 
1314             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1315                                          LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID);
1316 
1317             return data;
1318     }
1319 
1320     return data;
1321 }
1322 
1323 /*
1324  * 12.2.5.65 DOCTYPE system identifier (double-quoted) state
1325  */
1326 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_system_identifier_double_quoted(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1327 lxb_html_tokenizer_state_doctype_system_identifier_double_quoted(lxb_html_tokenizer_t *tkz,
1328                                                                  const lxb_char_t *data,
1329                                                                  const lxb_char_t *end)
1330 {
1331     if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
1332         lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
1333     }
1334 
1335     lxb_html_tokenizer_state_begin_set(tkz, data);
1336 
1337     while (data != end) {
1338         switch (*data) {
1339             /* U+0022 QUOTATION MARK (") */
1340             case 0x22:
1341                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1342                 lxb_html_tokenizer_state_set_value_m(tkz);
1343                 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
1344 
1345                 tkz->state =
1346                     lxb_html_tokenizer_state_doctype_after_system_identifier;
1347 
1348                 return (data + 1);
1349 
1350             /* U+003E GREATER-THAN SIGN (>) */
1351             case 0x3E:
1352                 tkz->state = lxb_html_tokenizer_state_data_before;
1353 
1354                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1355                                              LXB_HTML_TOKENIZER_ERROR_ABDOSYID);
1356 
1357                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1358                 lxb_html_tokenizer_state_set_value_m(tkz);
1359                 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
1360                 lxb_html_tokenizer_state_token_done_m(tkz, end);
1361 
1362                 return (data + 1);
1363 
1364             /* U+000D CARRIAGE RETURN (CR) */
1365             case 0x0D:
1366                 if (++data >= end) {
1367                     lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
1368 
1369                     tkz->state = lxb_html_tokenizer_state_cr;
1370                     tkz->state_return = lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1371 
1372                     return data;
1373                 }
1374 
1375                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1376                 tkz->pos[-1] = 0x0A;
1377 
1378                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1379 
1380                 if (*data != 0x0A) {
1381                     lxb_html_tokenizer_state_begin_set(tkz, data);
1382                     data--;
1383                 }
1384 
1385                 break;
1386 
1387             /*
1388              * U+0000 NULL
1389              * EOF
1390              */
1391             case 0x00:
1392                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1393 
1394                 if (tkz->is_eof) {
1395                     lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
1396 
1397                     if (tkz->token->attr_last->value_begin == NULL) {
1398                         lxb_html_tokenizer_state_token_attr_set_value_begin(tkz,
1399                                               tkz->token->attr_last->value_end);
1400                     }
1401 
1402                     lxb_html_tokenizer_error_add(tkz->parse_errors,
1403                                                tkz->token->attr_last->value_end,
1404                                                LXB_HTML_TOKENIZER_ERROR_EOINDO);
1405 
1406                     tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1407 
1408                     lxb_html_tokenizer_state_set_value_m(tkz);
1409                     lxb_html_tokenizer_state_token_done_m(tkz, end);
1410 
1411                     return end;
1412                 }
1413 
1414                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1415                 lxb_html_tokenizer_state_append_replace_m(tkz);
1416 
1417                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1418                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1419 
1420                 tkz->token->attr_last->type
1421                     |= LXB_HTML_TOKEN_ATTR_TYPE_VALUE_NULL;
1422 
1423                 break;
1424 
1425             /* Anything else */
1426             default:
1427                 break;
1428         }
1429 
1430         data++;
1431     }
1432 
1433     lxb_html_tokenizer_state_append_data_m(tkz, data);
1434 
1435     return data;
1436 }
1437 
1438 /*
1439  * 12.2.5.66 DOCTYPE system identifier (single-quoted) state
1440  */
1441 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_system_identifier_single_quoted(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1442 lxb_html_tokenizer_state_doctype_system_identifier_single_quoted(lxb_html_tokenizer_t *tkz,
1443                                                                  const lxb_char_t *data,
1444                                                                  const lxb_char_t *end)
1445 {
1446     if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
1447         lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
1448     }
1449 
1450     lxb_html_tokenizer_state_begin_set(tkz, data);
1451 
1452     while (data != end) {
1453         switch (*data) {
1454             /* U+0027 APOSTROPHE (') */
1455             case 0x27:
1456                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1457                 lxb_html_tokenizer_state_set_value_m(tkz);
1458                 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
1459 
1460                 tkz->state =
1461                     lxb_html_tokenizer_state_doctype_after_system_identifier;
1462 
1463                 return (data + 1);
1464 
1465             /* U+003E GREATER-THAN SIGN (>) */
1466             case 0x3E:
1467                 tkz->state = lxb_html_tokenizer_state_data_before;
1468 
1469                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1470                                              LXB_HTML_TOKENIZER_ERROR_ABDOSYID);
1471 
1472                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1473                 lxb_html_tokenizer_state_set_value_m(tkz);
1474                 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
1475                 lxb_html_tokenizer_state_token_done_m(tkz, end);
1476 
1477                 return (data + 1);
1478 
1479             /* U+000D CARRIAGE RETURN (CR) */
1480             case 0x0D:
1481                 if (++data >= end) {
1482                     lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
1483 
1484                     tkz->state = lxb_html_tokenizer_state_cr;
1485                     tkz->state_return = lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1486 
1487                     return data;
1488                 }
1489 
1490                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1491                 tkz->pos[-1] = 0x0A;
1492 
1493                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1494 
1495                 if (*data != 0x0A) {
1496                     lxb_html_tokenizer_state_begin_set(tkz, data);
1497                     data--;
1498                 }
1499 
1500                 break;
1501 
1502             /*
1503              * U+0000 NULL
1504              * EOF
1505              */
1506             case 0x00:
1507                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1508 
1509                 if (tkz->is_eof) {
1510                     lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
1511 
1512                     if (tkz->token->attr_last->value_begin == NULL) {
1513                         lxb_html_tokenizer_state_token_attr_set_value_begin(tkz,
1514                                               tkz->token->attr_last->value_end);
1515                     }
1516 
1517                     lxb_html_tokenizer_error_add(tkz->parse_errors,
1518                                                tkz->token->attr_last->value_end,
1519                                                LXB_HTML_TOKENIZER_ERROR_EOINDO);
1520 
1521                     tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1522 
1523                     lxb_html_tokenizer_state_set_value_m(tkz);
1524                     lxb_html_tokenizer_state_token_done_m(tkz, end);
1525 
1526                     return end;
1527                 }
1528 
1529                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1530                 lxb_html_tokenizer_state_append_replace_m(tkz);
1531 
1532                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1533                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1534 
1535                 tkz->token->attr_last->type
1536                     |= LXB_HTML_TOKEN_ATTR_TYPE_VALUE_NULL;
1537 
1538                 break;
1539 
1540             /* Anything else */
1541             default:
1542                 break;
1543         }
1544 
1545         data++;
1546     }
1547 
1548     lxb_html_tokenizer_state_append_data_m(tkz, data);
1549 
1550     return data;
1551 }
1552 
1553 /*
1554  * 12.2.5.67 After DOCTYPE system identifier state
1555  */
1556 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_after_system_identifier(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1557 lxb_html_tokenizer_state_doctype_after_system_identifier(
1558                                                       lxb_html_tokenizer_t *tkz,
1559                                                       const lxb_char_t *data,
1560                                                       const lxb_char_t *end)
1561 {
1562     switch (*data) {
1563         /*
1564          * U+0009 CHARACTER TABULATION (tab)
1565          * U+000A LINE FEED (LF)
1566          * U+000C FORM FEED (FF)
1567          * U+000D CARRIAGE RETURN (CR)
1568          * U+0020 SPACE
1569          */
1570         case 0x09:
1571         case 0x0A:
1572         case 0x0C:
1573         case 0x0D:
1574         case 0x20:
1575             return (data + 1);
1576 
1577         /* U+003E GREATER-THAN SIGN (>) */
1578         case 0x3E:
1579             tkz->state = lxb_html_tokenizer_state_data_before;
1580 
1581             lxb_html_tokenizer_state_token_done_m(tkz, end);
1582 
1583             return (data + 1);
1584 
1585         /* EOF */
1586         case 0x00:
1587             if (tkz->is_eof) {
1588                 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1589                                              LXB_HTML_TOKENIZER_ERROR_EOINDO);
1590 
1591                 tkz->token->type |= LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS;
1592 
1593                 lxb_html_tokenizer_state_token_done_m(tkz, end);
1594 
1595                 return end;
1596             }
1597             /* fall through */
1598 
1599         default:
1600             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1601                                          LXB_HTML_TOKENIZER_ERROR_UNCHAFDOSYID);
1602 
1603             tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1604 
1605             return data;
1606     }
1607 
1608     return data;
1609 }
1610 
1611 /*
1612  * 12.2.5.68 Bogus DOCTYPE state
1613  */
1614 static const lxb_char_t *
lxb_html_tokenizer_state_doctype_bogus(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1615 lxb_html_tokenizer_state_doctype_bogus(lxb_html_tokenizer_t *tkz,
1616                                        const lxb_char_t *data,
1617                                        const lxb_char_t *end)
1618 {
1619     while (data != end) {
1620         switch (*data) {
1621             /* U+003E GREATER-THAN SIGN (>) */
1622             case 0x3E:
1623                 tkz->state = lxb_html_tokenizer_state_data_before;
1624 
1625                 lxb_html_tokenizer_state_token_done_m(tkz, end);
1626 
1627                 return (data + 1);
1628 
1629             /*
1630              * U+0000 NULL
1631              * EOF
1632              */
1633             case 0x00:
1634                 if (tkz->is_eof) {
1635                     lxb_html_tokenizer_state_token_done_m(tkz, end);
1636 
1637                     return end;
1638                 }
1639 
1640                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1641                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1642 
1643                 break;
1644 
1645             /* Anything else */
1646             default:
1647                 break;
1648         }
1649 
1650         data++;
1651     }
1652 
1653     return data;
1654 }
1655