1 /*
2  * Copyright (C) 2018-2020 Alexander Borisov
3  *
4  * Author: Alexander Borisov <borisov@lexbor.com>
5  */
6 
7 #include "lexbor/html/tokenizer/state.h"
8 #include "lexbor/html/tokenizer/state_comment.h"
9 #include "lexbor/html/tokenizer/state_doctype.h"
10 
11 #define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
12 #define LEXBOR_STR_RES_ALPHANUMERIC_CHARACTER
13 #define LEXBOR_STR_RES_REPLACEMENT_CHARACTER
14 #define LEXBOR_STR_RES_ALPHA_CHARACTER
15 #define LEXBOR_STR_RES_MAP_HEX
16 #define LEXBOR_STR_RES_MAP_NUM
17 #include "lexbor/core/str_res.h"
18 
19 #define LXB_HTML_TOKENIZER_RES_ENTITIES_SBST
20 #include "lexbor/html/tokenizer/res.h"
21 
22 
23 const lxb_tag_data_t *
24 lxb_tag_append_lower(lexbor_hash_t *hash,
25                      const lxb_char_t *name, size_t length);
26 
27 lxb_dom_attr_data_t *
28 lxb_dom_attr_local_name_append(lexbor_hash_t *hash,
29                                const lxb_char_t *name, size_t length);
30 
31 
32 static const lxb_char_t *
33 lxb_html_tokenizer_state_data(lxb_html_tokenizer_t *tkz,
34                               const lxb_char_t *data, const lxb_char_t *end);
35 
36 static const lxb_char_t *
37 lxb_html_tokenizer_state_plaintext(lxb_html_tokenizer_t *tkz,
38                                    const lxb_char_t *data,
39                                    const lxb_char_t *end);
40 
41 /* Tag */
42 static const lxb_char_t *
43 lxb_html_tokenizer_state_tag_open(lxb_html_tokenizer_t *tkz,
44                                   const lxb_char_t *data,
45                                   const lxb_char_t *end);
46 
47 static const lxb_char_t *
48 lxb_html_tokenizer_state_end_tag_open(lxb_html_tokenizer_t *tkz,
49                                       const lxb_char_t *data,
50                                       const lxb_char_t *end);
51 
52 static const lxb_char_t *
53 lxb_html_tokenizer_state_tag_name(lxb_html_tokenizer_t *tkz,
54                                   const lxb_char_t *data,
55                                   const lxb_char_t *end);
56 
57 /* Attribute */
58 static const lxb_char_t *
59 lxb_html_tokenizer_state_attribute_name(lxb_html_tokenizer_t *tkz,
60                                         const lxb_char_t *data,
61                                         const lxb_char_t *end);
62 
63 static const lxb_char_t *
64 lxb_html_tokenizer_state_after_attribute_name(lxb_html_tokenizer_t *tkz,
65                                               const lxb_char_t *data,
66                                               const lxb_char_t *end);
67 
68 static const lxb_char_t *
69 lxb_html_tokenizer_state_before_attribute_value(lxb_html_tokenizer_t *tkz,
70                                                 const lxb_char_t *data,
71                                                 const lxb_char_t *end);
72 
73 static const lxb_char_t *
74 lxb_html_tokenizer_state_attribute_value_double_quoted(lxb_html_tokenizer_t *tkz,
75                                                        const lxb_char_t *data,
76                                                        const lxb_char_t *end);
77 
78 static const lxb_char_t *
79 lxb_html_tokenizer_state_attribute_value_single_quoted(lxb_html_tokenizer_t *tkz,
80                                                        const lxb_char_t *data,
81                                                        const lxb_char_t *end);
82 
83 static const lxb_char_t *
84 lxb_html_tokenizer_state_attribute_value_unquoted(lxb_html_tokenizer_t *tkz,
85                                                   const lxb_char_t *data,
86                                                   const lxb_char_t *end);
87 
88 static const lxb_char_t *
89 lxb_html_tokenizer_state_after_attribute_value_quoted(lxb_html_tokenizer_t *tkz,
90                                                       const lxb_char_t *data,
91                                                       const lxb_char_t *end);
92 
93 static const lxb_char_t *
94 lxb_html_tokenizer_state_bogus_comment_before(lxb_html_tokenizer_t *tkz,
95                                               const lxb_char_t *data,
96                                               const lxb_char_t *end);
97 
98 static const lxb_char_t *
99 lxb_html_tokenizer_state_bogus_comment(lxb_html_tokenizer_t *tkz,
100                                        const lxb_char_t *data,
101                                        const lxb_char_t *end);
102 
103 /* Markup declaration */
104 static const lxb_char_t *
105 lxb_html_tokenizer_state_markup_declaration_open(lxb_html_tokenizer_t *tkz,
106                                                  const lxb_char_t *data,
107                                                  const lxb_char_t *end);
108 
109 static const lxb_char_t *
110 lxb_html_tokenizer_state_markup_declaration_comment(lxb_html_tokenizer_t *tkz,
111                                                     const lxb_char_t *data,
112                                                     const lxb_char_t *end);
113 
114 static const lxb_char_t *
115 lxb_html_tokenizer_state_markup_declaration_doctype(lxb_html_tokenizer_t *tkz,
116                                                     const lxb_char_t *data,
117                                                     const lxb_char_t *end);
118 
119 static const lxb_char_t *
120 lxb_html_tokenizer_state_markup_declaration_cdata(lxb_html_tokenizer_t *tkz,
121                                                   const lxb_char_t *data,
122                                                   const lxb_char_t *end);
123 
124 /* CDATA Section */
125 static const lxb_char_t *
126 lxb_html_tokenizer_state_cdata_section_before(lxb_html_tokenizer_t *tkz,
127                                               const lxb_char_t *data,
128                                               const lxb_char_t *end);
129 
130 static const lxb_char_t *
131 lxb_html_tokenizer_state_cdata_section(lxb_html_tokenizer_t *tkz,
132                                        const lxb_char_t *data,
133                                        const lxb_char_t *end);
134 
135 static const lxb_char_t *
136 lxb_html_tokenizer_state_cdata_section_bracket(lxb_html_tokenizer_t *tkz,
137                                                const lxb_char_t *data,
138                                                const lxb_char_t *end);
139 
140 static const lxb_char_t *
141 lxb_html_tokenizer_state_cdata_section_end(lxb_html_tokenizer_t *tkz,
142                                            const lxb_char_t *data,
143                                            const lxb_char_t *end);
144 
145 static const lxb_char_t *
146 lxb_html_tokenizer_state_char_ref_attr(lxb_html_tokenizer_t *tkz,
147                                        const lxb_char_t *data,
148                                        const lxb_char_t *end);
149 
150 static const lxb_char_t *
151 _lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz,
152                                    const lxb_char_t *data,
153                                    const lxb_char_t *end);
154 
155 static const lxb_char_t *
156 lxb_html_tokenizer_state_char_ref_named(lxb_html_tokenizer_t *tkz,
157                                         const lxb_char_t *data,
158                                         const lxb_char_t *end);
159 
160 static const lxb_char_t *
161 lxb_html_tokenizer_state_char_ref_ambiguous_ampersand(lxb_html_tokenizer_t *tkz,
162                                                       const lxb_char_t *data,
163                                                       const lxb_char_t *end);
164 
165 static const lxb_char_t *
166 lxb_html_tokenizer_state_char_ref_numeric(lxb_html_tokenizer_t *tkz,
167                                           const lxb_char_t *data,
168                                           const lxb_char_t *end);
169 
170 static const lxb_char_t *
171 lxb_html_tokenizer_state_char_ref_hexademical_start(lxb_html_tokenizer_t *tkz,
172                                                     const lxb_char_t *data,
173                                                     const lxb_char_t *end);
174 
175 static const lxb_char_t *
176 lxb_html_tokenizer_state_char_ref_decimal_start(lxb_html_tokenizer_t *tkz,
177                                                 const lxb_char_t *data,
178                                                 const lxb_char_t *end);
179 
180 static const lxb_char_t *
181 lxb_html_tokenizer_state_char_ref_hexademical(lxb_html_tokenizer_t *tkz,
182                                               const lxb_char_t *data,
183                                               const lxb_char_t *end);
184 
185 static const lxb_char_t *
186 lxb_html_tokenizer_state_char_ref_decimal(lxb_html_tokenizer_t *tkz,
187                                           const lxb_char_t *data,
188                                           const lxb_char_t *end);
189 
190 static const lxb_char_t *
191 lxb_html_tokenizer_state_char_ref_numeric_end(lxb_html_tokenizer_t *tkz,
192                                               const lxb_char_t *data,
193                                               const lxb_char_t *end);
194 
195 static size_t
196 lxb_html_tokenizer_state_to_ascii_utf_8(size_t codepoint, lxb_char_t *data);
197 
198 
199 /*
200  * Helper function. No in the specification. For 12.2.5.1 Data state
201  */
202 const lxb_char_t *
lxb_html_tokenizer_state_data_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)203 lxb_html_tokenizer_state_data_before(lxb_html_tokenizer_t *tkz,
204                                      const lxb_char_t *data,
205                                      const lxb_char_t *end)
206 {
207     if (tkz->is_eof == false) {
208         lxb_html_tokenizer_state_token_set_begin(tkz, data);
209     }
210 
211     /*
212      * Text node init param sets before emit token.
213      */
214 
215     tkz->state = lxb_html_tokenizer_state_data;
216 
217     return data;
218 }
219 
/*
 * 12.2.5.1 Data state
 *
 * Consumes character data up to the next markup construct. Text bytes are
 * accumulated into the tokenizer buffer; "<" and "&" switch state, CR/CRLF
 * sequences are normalized to a single LF, and NUL bytes are counted and
 * reported as parse errors.
 */
static const lxb_char_t *
lxb_html_tokenizer_state_data(lxb_html_tokenizer_t *tkz,
                              const lxb_char_t *data, const lxb_char_t *end)
{
    lxb_html_tokenizer_state_begin_set(tkz, data);

    while (data != end) {
        switch (*data) {
            /* U+003C LESS-THAN SIGN (<) */
            case 0x3C:
                /* Flush pending text and hand off to the tag open state. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_token_set_end(tkz, data);

                tkz->state = lxb_html_tokenizer_state_tag_open;
                return (data + 1);

            /* U+0026 AMPERSAND (&) */
            case 0x26:
                /*
                 * The "&" itself is appended (note: data + 1); the character
                 * reference state takes over and returns here afterwards.
                 */
                lxb_html_tokenizer_state_append_data_m(tkz, data + 1);

                tkz->state = lxb_html_tokenizer_state_char_ref;
                tkz->state_return = lxb_html_tokenizer_state_data;

                return data + 1;

            /* U+000D CARRIAGE RETURN (CR) */
            case 0x0D:
                if (++data >= end) {
                    /*
                     * CR is the last byte of this chunk: flush text up to
                     * (excluding) the CR, and let the CR state decide whether
                     * the next chunk starts with an LF to be folded.
                     */
                    lxb_html_tokenizer_state_append_data_m(tkz, data - 1);

                    tkz->state = lxb_html_tokenizer_state_cr;
                    tkz->state_return = lxb_html_tokenizer_state_data;

                    return data;
                }

                /* Append through the CR, then rewrite it in-place as LF. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                tkz->pos[-1] = 0x0A;

                lxb_html_tokenizer_state_begin_set(tkz, data + 1);

                if (*data != 0x0A) {
                    /* Lone CR (no LF follows): re-process the current byte
                       on the next loop iteration. */
                    lxb_html_tokenizer_state_begin_set(tkz, data);
                    data--;
                }

                break;

            /*
             * U+0000 NULL
             * EOF
             */
            case 0x00:
                if (tkz->is_eof) {
                    /* Emit TEXT node if not empty */
                    if (tkz->token->begin != NULL) {
                        lxb_html_tokenizer_state_token_set_end_oef(tkz);
                    }

                    if (tkz->token->begin != tkz->token->end) {
                        tkz->token->tag_id = LXB_TAG__TEXT;

                        lxb_html_tokenizer_state_append_data_m(tkz, data);

                        lxb_html_tokenizer_state_set_text(tkz);
                        lxb_html_tokenizer_state_token_done_wo_check_m(tkz,end);
                    }

                    return end;
                }

                /* Guard the NUL counter against wrap-around. */
                if (SIZE_MAX - tkz->token->null_count < 1) {
                    tkz->status = LXB_STATUS_ERROR_OVERFLOW;
                    return end;
                }

                /*
                 * Genuine NUL byte in the stream: counted and reported as a
                 * parse error; the byte itself stays in the accumulated data
                 * here (presumably handled during later token processing --
                 * confirm against token handling code).
                 */
                tkz->token->null_count++;

                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                             LXB_HTML_TOKENIZER_ERROR_UNNUCH);
                break;
        }

        data++;
    }

    /* Chunk exhausted: flush whatever text was scanned. */
    lxb_html_tokenizer_state_append_data_m(tkz, data);

    return data;
}
313 
314 /*
315  * Helper function. No in the specification. For 12.2.5.5 PLAINTEXT state
316  */
317 const lxb_char_t *
lxb_html_tokenizer_state_plaintext_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)318 lxb_html_tokenizer_state_plaintext_before(lxb_html_tokenizer_t *tkz,
319                                           const lxb_char_t *data,
320                                           const lxb_char_t *end)
321 {
322     if (tkz->is_eof == false) {
323         lxb_html_tokenizer_state_token_set_begin(tkz, data);
324     }
325 
326     tkz->token->tag_id = LXB_TAG__TEXT;
327 
328     tkz->state = lxb_html_tokenizer_state_plaintext;
329 
330     return data;
331 }
332 
/*
 * 12.2.5.5 PLAINTEXT state
 *
 * Everything up to EOF is raw text: no tags or character references are
 * recognized. CR/CRLF is normalized to LF; NUL bytes are replaced with the
 * replacement character and reported as parse errors.
 */
static const lxb_char_t *
lxb_html_tokenizer_state_plaintext(lxb_html_tokenizer_t *tkz,
                                   const lxb_char_t *data,
                                   const lxb_char_t *end)
{
    lxb_html_tokenizer_state_begin_set(tkz, data);

    while (data != end) {
        switch (*data) {
            /* U+000D CARRIAGE RETURN (CR) */
            case 0x0D:
                if (++data >= end) {
                    /*
                     * CR is the last byte of this chunk: flush text up to
                     * (excluding) the CR and defer to the CR state.
                     */
                    lxb_html_tokenizer_state_append_data_m(tkz, data - 1);

                    tkz->state = lxb_html_tokenizer_state_cr;
                    tkz->state_return = lxb_html_tokenizer_state_plaintext;

                    return data;
                }

                /* Append through the CR, then rewrite it in-place as LF. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                tkz->pos[-1] = 0x0A;

                lxb_html_tokenizer_state_begin_set(tkz, data + 1);

                if (*data != 0x0A) {
                    /* Lone CR (no LF follows): re-process the current byte
                       on the next loop iteration. */
                    lxb_html_tokenizer_state_begin_set(tkz, data);
                    data--;
                }

                break;

            /*
             * U+0000 NULL
             * EOF
             */
            case 0x00:
                lxb_html_tokenizer_state_append_data_m(tkz, data);

                if (tkz->is_eof) {
                    /* End of input: close and emit the text token. */
                    if (tkz->token->begin != NULL) {
                        lxb_html_tokenizer_state_token_set_end_oef(tkz);
                    }

                    lxb_html_tokenizer_state_set_text(tkz);
                    lxb_html_tokenizer_state_token_done_m(tkz, end);

                    return end;
                }

                /* Genuine NUL byte: substitute the replacement character. */
                lxb_html_tokenizer_state_begin_set(tkz, data + 1);
                lxb_html_tokenizer_state_append_replace_m(tkz);

                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                             LXB_HTML_TOKENIZER_ERROR_UNNUCH);
                break;
        }

        data++;
    }

    /* Chunk exhausted: flush whatever text was scanned. */
    lxb_html_tokenizer_state_append_data_m(tkz, data);

    return data;
}
401 
/*
 * 12.2.5.6 Tag open state
 *
 * Decides what the "<" just consumed begins: a start tag name, an end tag
 * ("</"), a markup declaration ("<!"), a bogus comment ("<?"), or — for
 * anything else — plain text (the "<" is kept as character data).
 */
static const lxb_char_t *
lxb_html_tokenizer_state_tag_open(lxb_html_tokenizer_t *tkz,
                                  const lxb_char_t *data, const lxb_char_t *end)
{
    /* ASCII alpha */
    if (lexbor_str_res_alpha_character[ *data ] != LEXBOR_STR_RES_SLIP) {
        /* Start tag: emit any buffered text first, then begin a new token. */
        tkz->state = lxb_html_tokenizer_state_tag_name;

        lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
        lxb_html_tokenizer_state_token_set_begin(tkz, data);

        return data;
    }

    /* U+002F SOLIDUS (/) */
    else if (*data == 0x2F) {
        tkz->state = lxb_html_tokenizer_state_end_tag_open;

        return (data + 1);
    }

    /* U+0021 EXCLAMATION MARK (!) */
    else if (*data == 0x21) {
        /* "<!" — comment, DOCTYPE or CDATA section. */
        tkz->state = lxb_html_tokenizer_state_markup_declaration_open;

        lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);

        return (data + 1);
    }

    /* U+003F QUESTION MARK (?) */
    else if (*data == 0x3F) {
        /* "<?" — parse error; the "?" and what follows become a bogus
           comment (data is returned unconsumed so "?" is included). */
        tkz->state = lxb_html_tokenizer_state_bogus_comment_before;

        lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
        lxb_html_tokenizer_state_token_set_begin(tkz, data);

        lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                     LXB_HTML_TOKENIZER_ERROR_UNQUMAINOFTANA);

        return data;
    }

    /* EOF */
    else if (*data == 0x00) {
        if (tkz->is_eof) {
            /* Input ended right after "<": the "<" becomes text. */
            lxb_html_tokenizer_state_append_m(tkz, "<", 1);

            lxb_html_tokenizer_state_token_set_end_oef(tkz);
            lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);

            lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
                                         LXB_HTML_TOKENIZER_ERROR_EOBETANA);

            return end;
        }
        /* Non-EOF NUL falls through to the "anything else" handling. */
    }

    /* Anything else: keep the "<" as character data and reconsume the
       current byte in the data state. */
    lxb_html_tokenizer_state_append_m(tkz, "<", 1);

    lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                 LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA);

    tkz->state = lxb_html_tokenizer_state_data;

    return data;
}
472 
/*
 * 12.2.5.7 End tag open state
 *
 * Handles the byte after "</": an ASCII alpha starts an end tag name,
 * ">" makes the whole "</>" a no-op (missing-end-tag-name error), EOF turns
 * "</" into text, and anything else becomes a bogus comment.
 */
static const lxb_char_t *
lxb_html_tokenizer_state_end_tag_open(lxb_html_tokenizer_t *tkz,
                                      const lxb_char_t *data,
                                      const lxb_char_t *end)
{
    /* ASCII alpha */
    if (lexbor_str_res_alpha_character[ *data ] != LEXBOR_STR_RES_SLIP) {
        /* End tag: emit buffered text, start a new token marked CLOSE. */
        tkz->state = lxb_html_tokenizer_state_tag_name;

        lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
        lxb_html_tokenizer_state_token_set_begin(tkz, data);

        tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;

        return data;
    }

    /* U+003E GREATER-THAN SIGN (>) */
    else if (*data == 0x3E) {
        /* "</>" — parse error; no token is produced. */
        tkz->state = lxb_html_tokenizer_state_data;

        lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                     LXB_HTML_TOKENIZER_ERROR_MIENTANA);

        return (data + 1);
    }

    /* Fake EOF */
    else if (*data == 0x00) {
        if (tkz->is_eof) {
            /* Input ended right after "</": keep it as text. */
            lxb_html_tokenizer_state_append_m(tkz, "</", 2);

            lxb_html_tokenizer_state_token_set_end_oef(tkz);
            lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);

            lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
                                         LXB_HTML_TOKENIZER_ERROR_EOBETANA);

            return end;
        }
        /* Non-EOF NUL falls through to the bogus-comment handling. */
    }

    /* Anything else: parse error; reconsume as a bogus comment. */
    tkz->state = lxb_html_tokenizer_state_bogus_comment_before;

    lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                 LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA);

    lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
    lxb_html_tokenizer_state_token_set_begin(tkz, data);

    return data;
}
528 
/*
 * 12.2.5.8 Tag name state
 *
 * Accumulates the tag name. Whitespace ends the name and moves to attribute
 * parsing, "/" to self-closing handling, ">" finishes and emits the token.
 * NUL bytes inside the name are replaced with the replacement character.
 */
static const lxb_char_t *
lxb_html_tokenizer_state_tag_name(lxb_html_tokenizer_t *tkz,
                                  const lxb_char_t *data, const lxb_char_t *end)
{
    lxb_html_tokenizer_state_begin_set(tkz, data);

    while (data != end) {
        switch (*data) {
            /*
             * U+0009 CHARACTER TABULATION (tab)
             * U+000A LINE FEED (LF)
             * U+000C FORM FEED (FF)
             * U+000D CARRIAGE RETURN (CR)
             * U+0020 SPACE
             */
            case 0x09:
            case 0x0A:
            case 0x0C:
            case 0x0D:
            case 0x20:
                /* Name complete: store it (tkz->start..tkz->pos) on the
                   token and move on to attributes. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_set_tag_m(tkz, tkz->start, tkz->pos);
                lxb_html_tokenizer_state_token_set_end(tkz, data);

                tkz->state = lxb_html_tokenizer_state_before_attribute_name;
                return (data + 1);

            /* U+002F SOLIDUS (/) */
            case 0x2F:
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_set_tag_m(tkz, tkz->start, tkz->pos);
                lxb_html_tokenizer_state_token_set_end(tkz, data);

                tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
                return (data + 1);

            /* U+003E GREATER-THAN SIGN (>) */
            case 0x3E:
                /* Tag complete: store the name and emit the token. */
                tkz->state = lxb_html_tokenizer_state_data_before;

                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_set_tag_m(tkz, tkz->start, tkz->pos);
                lxb_html_tokenizer_state_token_set_end(tkz, data);
                lxb_html_tokenizer_state_token_done_m(tkz, end);

                return (data + 1);

            /* U+0000 NULL */
            case 0x00:
                if (tkz->is_eof) {
                    /* eof-in-tag: the unfinished token is dropped. */
                    lxb_html_tokenizer_state_token_set_end_oef(tkz);

                    lxb_html_tokenizer_error_add(tkz->parse_errors,
                                               tkz->token->end,
                                               LXB_HTML_TOKENIZER_ERROR_EOINTA);
                    return end;
                }

                /* Genuine NUL in the name: substitute the replacement
                   character and report the error. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_begin_set(tkz, data + 1);
                lxb_html_tokenizer_state_append_replace_m(tkz);

                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                             LXB_HTML_TOKENIZER_ERROR_UNNUCH);
                break;

            default:
                /* Any other byte is part of the tag name. */
                break;
        }

        data++;
    }

    /* Chunk exhausted mid-name: buffer what we have so far. */
    lxb_html_tokenizer_state_append_data_m(tkz, data);

    return data;
}
609 
/*
 * 12.2.5.32 Before attribute name state
 *
 * Skips whitespace between the tag name (or a previous attribute) and the
 * next attribute. "/" and ">" are reconsumed in the after-attribute-name
 * state; a leading "=" is a parse error but still starts an attribute whose
 * name begins with "=".
 */
const lxb_char_t *
lxb_html_tokenizer_state_before_attribute_name(lxb_html_tokenizer_t *tkz,
                                               const lxb_char_t *data,
                                               const lxb_char_t *end)
{
    /* Filled in by lxb_html_tokenizer_state_token_attr_add_m. */
    lxb_html_token_attr_t *attr;

    while (data != end) {
        switch (*data) {
            /*
             * U+0009 CHARACTER TABULATION (tab)
             * U+000A LINE FEED (LF)
             * U+000C FORM FEED (FF)
             * U+000D CARRIAGE RETURN (CR)
             * U+0020 SPACE
             */
            case 0x09:
            case 0x0A:
            case 0x0C:
            case 0x0D:
            case 0x20:
                /* Skip whitespace. */
                break;

            /* U+003D EQUALS SIGN (=) */
            case 0x3D:
                /* Parse error: "=" before any name; the "=" becomes the
                   first character of the attribute name. */
                lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
                lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);

                lxb_html_tokenizer_state_append_m(tkz, data, 1);

                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                         LXB_HTML_TOKENIZER_ERROR_UNEQSIBEATNA);

                tkz->state = lxb_html_tokenizer_state_attribute_name;
                return (data + 1);

            /*
             * U+002F SOLIDUS (/)
             * U+003E GREATER-THAN SIGN (>)
             */
            case 0x2F:
            case 0x3E:
                /* Reconsume in the after-attribute-name state. */
                tkz->state = lxb_html_tokenizer_state_after_attribute_name;
                return data;

            /* EOF */
            case 0x00:
                if (tkz->is_eof) {
                    tkz->state = lxb_html_tokenizer_state_after_attribute_name;
                    return data;
                }
                /* fall through */

            /* Anything else */
            default:
                /* Start a new attribute; its name begins here. */
                lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
                lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);

                tkz->state = lxb_html_tokenizer_state_attribute_name;
                return data;
        }

        data++;
    }

    return data;
}
680 
/*
 * 12.2.5.33 Attribute name state
 *
 * Accumulates the attribute name. Whitespace, "/" and ">" end the name and
 * are reconsumed in the after-attribute-name state; "=" ends the name and
 * moves to the value. NUL bytes are replaced; a quote or "<" inside a name
 * is reported but kept.
 */
static const lxb_char_t *
lxb_html_tokenizer_state_attribute_name(lxb_html_tokenizer_t *tkz,
                                        const lxb_char_t *data,
                                        const lxb_char_t *end)
{
    lxb_html_tokenizer_state_begin_set(tkz, data);

    while (data != end) {
        switch (*data) {
            /*
             * U+0009 CHARACTER TABULATION (tab)
             * U+000A LINE FEED (LF)
             * U+000C FORM FEED (FF)
             * U+000D CARRIAGE RETURN (CR)
             * U+0020 SPACE
             * U+002F SOLIDUS (/)
             * U+003E GREATER-THAN SIGN (>)
             */
            case 0x09:
            case 0x0A:
            case 0x0C:
            case 0x0D:
            case 0x20:
            case 0x2F:
            case 0x3E:
                /* Name complete: store it and reconsume this byte. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_set_name_m(tkz);
                lxb_html_tokenizer_state_token_attr_set_name_end(tkz, data);

                tkz->state = lxb_html_tokenizer_state_after_attribute_name;
                return data;

            /*
             * U+0000 NULL
             * EOF
             */
            case 0x00:
                if (tkz->is_eof) {
                    /* End of input: close the name at the EOF position. */
                    lxb_html_tokenizer_state_token_attr_set_name_end_oef(tkz);

                    tkz->state = lxb_html_tokenizer_state_after_attribute_name;
                    return data;
                }

                /* Genuine NUL: substitute the replacement character. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_begin_set(tkz, data + 1);
                lxb_html_tokenizer_state_append_replace_m(tkz);

                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                             LXB_HTML_TOKENIZER_ERROR_UNNUCH);
                break;

            /* U+003D EQUALS SIGN (=) */
            case 0x3D:
                /* Name complete, value follows. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_set_name_m(tkz);
                lxb_html_tokenizer_state_token_attr_set_name_end(tkz, data);

                tkz->state = lxb_html_tokenizer_state_before_attribute_value;
                return (data + 1);

            /*
             * U+0022 QUOTATION MARK (")
             * U+0027 APOSTROPHE (')
             * U+003C LESS-THAN SIGN (<)
             */
            case 0x22:
            case 0x27:
            case 0x3C:
                /* Parse error only; the byte stays part of the name. */
                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                           LXB_HTML_TOKENIZER_ERROR_UNCHINATNA);
                break;

            default:
                /* Any other byte is part of the attribute name. */
                break;
        }

        data++;
    }

    /* Chunk exhausted mid-name: buffer what we have so far. */
    lxb_html_tokenizer_state_append_data_m(tkz, data);

    return data;
}
768 
/*
 * 12.2.5.34 After attribute name state
 *
 * Decides what follows a completed attribute name: "=" starts a value,
 * "/" starts self-closing handling, ">" emits the token, and any other
 * non-whitespace byte begins the next (valueless) attribute.
 */
static const lxb_char_t *
lxb_html_tokenizer_state_after_attribute_name(lxb_html_tokenizer_t *tkz,
                                              const lxb_char_t *data,
                                              const lxb_char_t *end)
{
    /* Filled in by lxb_html_tokenizer_state_token_attr_add_m. */
    lxb_html_token_attr_t *attr;

    while (data != end) {
        switch (*data) {
            /*
             * U+0009 CHARACTER TABULATION (tab)
             * U+000A LINE FEED (LF)
             * U+000C FORM FEED (FF)
             * U+000D CARRIAGE RETURN (CR)
             * U+0020 SPACE
             */
            case 0x09:
            case 0x0A:
            case 0x0C:
            case 0x0D:
            case 0x20:
                /* Skip whitespace. */
                break;

            /* U+002F SOLIDUS (/) */
            case 0x2F:
                tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
                return (data + 1);

            /* U+003D EQUALS SIGN (=) */
            case 0x3D:
                tkz->state = lxb_html_tokenizer_state_before_attribute_value;
                return (data + 1);

            /* U+003E GREATER-THAN SIGN (>) */
            case 0x3E:
                /* Tag complete: emit the token. */
                tkz->state = lxb_html_tokenizer_state_data_before;

                lxb_html_tokenizer_state_token_done_m(tkz, end);

                return (data + 1);

            /* EOF (NUL only acts as EOF marker here when is_eof is set) */
            case 0x00:
                if (tkz->is_eof) {
                    /* eof-in-tag: the unfinished token is dropped. */
                    lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
                                               LXB_HTML_TOKENIZER_ERROR_EOINTA);
                    return end;
                }
                /* fall through */

            default:
                /* Start of the next attribute's name. */
                lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
                lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);

                tkz->state = lxb_html_tokenizer_state_attribute_name;
                return data;
        }

        data++;
    }

    return data;
}
834 
835 /*
836  * 12.2.5.35 Before attribute value state
837  */
838 static const lxb_char_t *
lxb_html_tokenizer_state_before_attribute_value(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)839 lxb_html_tokenizer_state_before_attribute_value(lxb_html_tokenizer_t *tkz,
840                                                 const lxb_char_t *data,
841                                                 const lxb_char_t *end)
842 {
843     while (data != end) {
844         switch (*data) {
845             /*
846              * U+0009 CHARACTER TABULATION (tab)
847              * U+000A LINE FEED (LF)
848              * U+000C FORM FEED (FF)
849              * U+000D CARRIAGE RETURN (CR)
850              * U+0020 SPACE
851              */
852             case 0x09:
853             case 0x0A:
854             case 0x0C:
855             case 0x0D:
856             case 0x20:
857                 break;
858 
859             /* U+0022 QUOTATION MARK (") */
860             case 0x22:
861                 tkz->state =
862                     lxb_html_tokenizer_state_attribute_value_double_quoted;
863 
864                 return (data + 1);
865 
866             /* U+0027 APOSTROPHE (') */
867             case 0x27:
868                 tkz->state =
869                     lxb_html_tokenizer_state_attribute_value_single_quoted;
870 
871                 return (data + 1);
872 
873             /* U+003E GREATER-THAN SIGN (>) */
874             case 0x3E:
875                 tkz->state = lxb_html_tokenizer_state_data_before;
876 
877                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
878                                              LXB_HTML_TOKENIZER_ERROR_MIATVA);
879 
880                 lxb_html_tokenizer_state_token_done_m(tkz, end);
881 
882                 return (data + 1);
883 
884             default:
885                 tkz->state = lxb_html_tokenizer_state_attribute_value_unquoted;
886                 return data;
887         }
888 
889         data++;
890     }
891 
892     return data;
893 }
894 
895 /*
896  * 12.2.5.36 Attribute value (double-quoted) state
897  */
898 static const lxb_char_t *
lxb_html_tokenizer_state_attribute_value_double_quoted(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)899 lxb_html_tokenizer_state_attribute_value_double_quoted(lxb_html_tokenizer_t *tkz,
900                                                        const lxb_char_t *data,
901                                                        const lxb_char_t *end)
902 {
903     if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
904         lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
905     }
906 
907     lxb_html_tokenizer_state_begin_set(tkz, data);
908 
909     while (data != end) {
910         switch (*data) {
911             /* U+0022 QUOTATION MARK (") */
912             case 0x22:
913                 lxb_html_tokenizer_state_append_data_m(tkz, data);
914                 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
915                 lxb_html_tokenizer_state_set_value_m(tkz);
916 
917                 tkz->state =
918                     lxb_html_tokenizer_state_after_attribute_value_quoted;
919 
920                 return (data + 1);
921 
922             /* U+0026 AMPERSAND (&) */
923             case 0x26:
924                 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
925 
926                 tkz->state = lxb_html_tokenizer_state_char_ref_attr;
927                 tkz->state_return = lxb_html_tokenizer_state_attribute_value_double_quoted;
928 
929                 return data + 1;
930 
931             /* U+000D CARRIAGE RETURN (CR) */
932             case 0x0D:
933                 if (++data >= end) {
934                     lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
935 
936                     tkz->state = lxb_html_tokenizer_state_cr;
937                     tkz->state_return = lxb_html_tokenizer_state_attribute_value_double_quoted;
938 
939                     return data;
940                 }
941 
942                 lxb_html_tokenizer_state_append_data_m(tkz, data);
943                 tkz->pos[-1] = 0x0A;
944 
945                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
946 
947                 if (*data != 0x0A) {
948                     lxb_html_tokenizer_state_begin_set(tkz, data);
949                     data--;
950                 }
951 
952                 break;
953 
954             /*
955              * U+0000 NULL
956              * EOF
957              */
958             case 0x00:
959                 if (tkz->is_eof) {
960                     if (tkz->token->attr_last->value_begin != NULL) {
961                      lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
962                     }
963 
964                     lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
965                                                LXB_HTML_TOKENIZER_ERROR_EOINTA);
966                     return end;
967                 }
968 
969                 lxb_html_tokenizer_state_append_data_m(tkz, data);
970                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
971                 lxb_html_tokenizer_state_append_replace_m(tkz);
972 
973                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
974                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
975                 break;
976 
977             default:
978                 break;
979         }
980 
981         data++;
982     }
983 
984     lxb_html_tokenizer_state_append_data_m(tkz, data);
985 
986     return data;
987 }
988 
989 /*
990  * 12.2.5.37 Attribute value (single-quoted) state
991  */
992 static const lxb_char_t *
lxb_html_tokenizer_state_attribute_value_single_quoted(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)993 lxb_html_tokenizer_state_attribute_value_single_quoted(lxb_html_tokenizer_t *tkz,
994                                                        const lxb_char_t *data,
995                                                        const lxb_char_t *end)
996 {
997     if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
998         lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
999     }
1000 
1001     lxb_html_tokenizer_state_begin_set(tkz, data);
1002 
1003     while (data != end) {
1004         switch (*data) {
1005             /* U+0027 APOSTROPHE (') */
1006             case 0x27:
1007                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1008                 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
1009                 lxb_html_tokenizer_state_set_value_m(tkz);
1010 
1011                 tkz->state =
1012                     lxb_html_tokenizer_state_after_attribute_value_quoted;
1013 
1014                 return (data + 1);
1015 
1016             /* U+0026 AMPERSAND (&) */
1017             case 0x26:
1018                 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
1019 
1020                 tkz->state = lxb_html_tokenizer_state_char_ref_attr;
1021                 tkz->state_return = lxb_html_tokenizer_state_attribute_value_single_quoted;
1022 
1023                 return data + 1;
1024 
1025             /* U+000D CARRIAGE RETURN (CR) */
1026             case 0x0D:
1027                 if (++data >= end) {
1028                     lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
1029 
1030                     tkz->state = lxb_html_tokenizer_state_cr;
1031                     tkz->state_return = lxb_html_tokenizer_state_attribute_value_single_quoted;
1032 
1033                     return data;
1034                 }
1035 
1036                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1037                 tkz->pos[-1] = 0x0A;
1038 
1039                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1040 
1041                 if (*data != 0x0A) {
1042                     lxb_html_tokenizer_state_begin_set(tkz, data);
1043                     data--;
1044                 }
1045 
1046                 break;
1047 
1048             /*
1049              * U+0000 NULL
1050              * EOF
1051              */
1052             case 0x00:
1053                 if (tkz->is_eof) {
1054                     if (tkz->token->attr_last->value_begin != NULL) {
1055                      lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
1056                     }
1057 
1058                     lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1059                                                LXB_HTML_TOKENIZER_ERROR_EOINTA);
1060                     return end;
1061                 }
1062 
1063                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1064                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1065                 lxb_html_tokenizer_state_append_replace_m(tkz);
1066 
1067                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1068                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1069                 break;
1070 
1071             default:
1072                 break;
1073         }
1074 
1075         data++;
1076     }
1077 
1078     lxb_html_tokenizer_state_append_data_m(tkz, data);
1079 
1080     return data;
1081 }
1082 
1083 /*
1084  * 12.2.5.38 Attribute value (unquoted) state
1085  */
1086 static const lxb_char_t *
lxb_html_tokenizer_state_attribute_value_unquoted(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1087 lxb_html_tokenizer_state_attribute_value_unquoted(lxb_html_tokenizer_t *tkz,
1088                                                   const lxb_char_t *data,
1089                                                   const lxb_char_t *end)
1090 {
1091     if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
1092         lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
1093     }
1094 
1095     lxb_html_tokenizer_state_begin_set(tkz, data);
1096 
1097     while (data != end) {
1098         switch (*data) {
1099            /*
1100             * U+0009 CHARACTER TABULATION (tab)
1101             * U+000A LINE FEED (LF)
1102             * U+000C FORM FEED (FF)
1103             * U+000D CARRIAGE RETURN (CR)
1104             * U+0020 SPACE
1105             */
1106             case 0x09:
1107             case 0x0A:
1108             case 0x0C:
1109             case 0x0D:
1110             case 0x20:
1111                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1112                 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
1113                 lxb_html_tokenizer_state_set_value_m(tkz);
1114 
1115                 tkz->state = lxb_html_tokenizer_state_before_attribute_name;
1116                 return (data + 1);
1117 
1118             /* U+0026 AMPERSAND (&) */
1119             case 0x26:
1120                 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
1121 
1122                 tkz->state = lxb_html_tokenizer_state_char_ref_attr;
1123                 tkz->state_return = lxb_html_tokenizer_state_attribute_value_unquoted;
1124 
1125                 return data + 1;
1126 
1127             /* U+003E GREATER-THAN SIGN (>) */
1128             case 0x3E:
1129                 tkz->state = lxb_html_tokenizer_state_data_before;
1130 
1131                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1132                 lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
1133                 lxb_html_tokenizer_state_set_value_m(tkz);
1134 
1135                 lxb_html_tokenizer_state_token_done_m(tkz, end);
1136 
1137                 return (data + 1);
1138 
1139             /*
1140              * U+0000 NULL
1141              * EOF
1142              */
1143             case 0x00:
1144                 if (tkz->is_eof) {
1145                     if (tkz->token->attr_last->value_begin != NULL) {
1146                      lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
1147                     }
1148 
1149                     lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1150                                                  LXB_HTML_TOKENIZER_ERROR_EOINTA);
1151                     return end;
1152                 }
1153 
1154                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1155                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1156                 lxb_html_tokenizer_state_append_replace_m(tkz);
1157 
1158                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1159                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1160                 break;
1161 
1162             /*
1163              * U+0022 QUOTATION MARK (")
1164              * U+0027 APOSTROPHE (')
1165              * U+003C LESS-THAN SIGN (<)
1166              * U+003D EQUALS SIGN (=)
1167              * U+0060 GRAVE ACCENT (`)
1168              */
1169             case 0x22:
1170             case 0x27:
1171             case 0x3C:
1172             case 0x3D:
1173             case 0x60:
1174                 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
1175                                              LXB_HTML_TOKENIZER_ERROR_UNCHINUNATVA);
1176                 break;
1177 
1178             default:
1179                 break;
1180         }
1181 
1182         data++;
1183     }
1184 
1185     lxb_html_tokenizer_state_append_data_m(tkz, data);
1186 
1187     return data;
1188 }
1189 
1190 /*
1191  * 12.2.5.39 After attribute value (quoted) state
1192  */
1193 static const lxb_char_t *
lxb_html_tokenizer_state_after_attribute_value_quoted(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1194 lxb_html_tokenizer_state_after_attribute_value_quoted(lxb_html_tokenizer_t *tkz,
1195                                                       const lxb_char_t *data,
1196                                                       const lxb_char_t *end)
1197 {
1198     switch (*data) {
1199         /*
1200          * U+0009 CHARACTER TABULATION (tab)
1201          * U+000A LINE FEED (LF)
1202          * U+000C FORM FEED (FF)
1203          * U+000D CARRIAGE RETURN (CR)
1204          * U+0020 SPACE
1205          */
1206         case 0x09:
1207         case 0x0A:
1208         case 0x0C:
1209         case 0x0D:
1210         case 0x20:
1211             tkz->state = lxb_html_tokenizer_state_before_attribute_name;
1212 
1213             return (data + 1);
1214 
1215         /* U+002F SOLIDUS (/) */
1216         case 0x2F:
1217             tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
1218 
1219             return (data + 1);
1220 
1221         /* U+003E GREATER-THAN SIGN (>) */
1222         case 0x3E:
1223             tkz->state = lxb_html_tokenizer_state_data_before;
1224 
1225             lxb_html_tokenizer_state_token_done_m(tkz, end);
1226 
1227             return (data + 1);
1228 
1229         /* EOF */
1230         case 0x00:
1231             if (tkz->is_eof) {
1232                 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1233                                              LXB_HTML_TOKENIZER_ERROR_EOINTA);
1234                 return end;
1235             }
1236             /* fall through */
1237 
1238         default:
1239             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1240                                          LXB_HTML_TOKENIZER_ERROR_MIWHBEAT);
1241 
1242             tkz->state = lxb_html_tokenizer_state_before_attribute_name;
1243 
1244             return data;
1245     }
1246 
1247     return data;
1248 }
1249 
1250 
1251 const lxb_char_t *
lxb_html_tokenizer_state_cr(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1252 lxb_html_tokenizer_state_cr(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
1253                             const lxb_char_t *end)
1254 {
1255     lxb_html_tokenizer_state_append_m(tkz, "\n", 1);
1256 
1257     if (*data == 0x0A) {
1258         data++;
1259     }
1260 
1261     tkz->state = tkz->state_return;
1262 
1263     return data;
1264 }
1265 
1266 /*
1267  * 12.2.5.40 Self-closing start tag state
1268  */
1269 const lxb_char_t *
lxb_html_tokenizer_state_self_closing_start_tag(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1270 lxb_html_tokenizer_state_self_closing_start_tag(lxb_html_tokenizer_t *tkz,
1271                                                 const lxb_char_t *data,
1272                                                 const lxb_char_t *end)
1273 {
1274     switch (*data) {
1275         /* U+003E GREATER-THAN SIGN (>) */
1276         case 0x3E:
1277             tkz->state = lxb_html_tokenizer_state_data_before;
1278             tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE_SELF;
1279 
1280             lxb_html_tokenizer_state_token_done_m(tkz, end);
1281 
1282             return (data + 1);
1283 
1284         /* EOF */
1285         case 0x00:
1286             if (tkz->is_eof) {
1287                 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
1288                                              LXB_HTML_TOKENIZER_ERROR_EOINTA);
1289                 return end;
1290             }
1291             /* fall through */
1292 
1293         default:
1294             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1295                                          LXB_HTML_TOKENIZER_ERROR_UNSOINTA);
1296 
1297             tkz->state = lxb_html_tokenizer_state_before_attribute_name;
1298 
1299             return data;
1300     }
1301 
1302     return data;
1303 }
1304 
1305 /*
1306  * Helper function. No in the specification. For 12.2.5.41 Bogus comment state
1307  */
1308 static const lxb_char_t *
lxb_html_tokenizer_state_bogus_comment_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1309 lxb_html_tokenizer_state_bogus_comment_before(lxb_html_tokenizer_t *tkz,
1310                                               const lxb_char_t *data,
1311                                               const lxb_char_t *end)
1312 {
1313     tkz->token->tag_id = LXB_TAG__EM_COMMENT;
1314 
1315     tkz->state = lxb_html_tokenizer_state_bogus_comment;
1316 
1317     return data;
1318 }
1319 
1320 /*
1321  * 12.2.5.41 Bogus comment state
1322  */
1323 static const lxb_char_t *
lxb_html_tokenizer_state_bogus_comment(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1324 lxb_html_tokenizer_state_bogus_comment(lxb_html_tokenizer_t *tkz,
1325                                        const lxb_char_t *data,
1326                                        const lxb_char_t *end)
1327 {
1328     lxb_html_tokenizer_state_begin_set(tkz, data);
1329 
1330     while (data != end) {
1331         switch (*data) {
1332             /* U+003E GREATER-THAN SIGN (>) */
1333             case 0x3E:
1334                 tkz->state = lxb_html_tokenizer_state_data_before;
1335 
1336                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1337                 lxb_html_tokenizer_state_token_set_end(tkz, data);
1338                 lxb_html_tokenizer_state_set_text(tkz);
1339                 lxb_html_tokenizer_state_token_done_wo_check_m(tkz, end);
1340 
1341                 return (data + 1);
1342 
1343             /* U+000D CARRIAGE RETURN (CR) */
1344             case 0x0D:
1345                 if (++data >= end) {
1346                     lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
1347 
1348                     tkz->state = lxb_html_tokenizer_state_cr;
1349                     tkz->state_return = lxb_html_tokenizer_state_bogus_comment;
1350 
1351                     return data;
1352                 }
1353 
1354                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1355                 tkz->pos[-1] = 0x0A;
1356 
1357                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1358 
1359                 if (*data != 0x0A) {
1360                     lxb_html_tokenizer_state_begin_set(tkz, data);
1361                     data--;
1362                 }
1363 
1364                 break;
1365 
1366             /*
1367              * EOF
1368              * U+0000 NULL
1369              */
1370             case 0x00:
1371                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1372 
1373                 if (tkz->is_eof) {
1374                     if (tkz->token->begin != NULL) {
1375                         lxb_html_tokenizer_state_token_set_end_oef(tkz);
1376                     }
1377 
1378                     lxb_html_tokenizer_state_set_text(tkz);
1379                     lxb_html_tokenizer_state_token_done_wo_check_m(tkz, end);
1380 
1381                     return end;
1382                 }
1383 
1384                 lxb_html_tokenizer_state_append_replace_m(tkz);
1385                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1386 
1387                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1388                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1389                 break;
1390         }
1391 
1392         data++;
1393     }
1394 
1395     lxb_html_tokenizer_state_append_data_m(tkz, data);
1396 
1397     return data;
1398 }
1399 
1400 /*
1401  * 12.2.5.42 Markup declaration open state
1402  */
1403 static const lxb_char_t *
lxb_html_tokenizer_state_markup_declaration_open(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1404 lxb_html_tokenizer_state_markup_declaration_open(lxb_html_tokenizer_t *tkz,
1405                                                  const lxb_char_t *data,
1406                                                  const lxb_char_t *end)
1407 {
1408     /* Check first char for change parse state */
1409     if (tkz->is_eof == false) {
1410         lxb_html_tokenizer_state_token_set_begin(tkz, data);
1411     }
1412 
1413     /* U+002D HYPHEN-MINUS characters (-) */
1414     if (*data == 0x2D) {
1415         if ((end - data) < 2) {
1416             tkz->state = lxb_html_tokenizer_state_markup_declaration_comment;
1417             return (data + 1);
1418         }
1419 
1420         if (data[1] == 0x2D) {
1421             tkz->state = lxb_html_tokenizer_state_comment_before_start;
1422             return (data + 2);
1423         }
1424     }
1425     /*
1426      * ASCII case-insensitive match for the word "DOCTYPE"
1427      * U+0044 character (D) or U+0064 character (d)
1428      */
1429     else if (*data == 0x44 || *data == 0x64) {
1430         if ((end - data) < 7) {
1431             tkz->markup = (lxb_char_t *) "doctype";
1432 
1433             tkz->state = lxb_html_tokenizer_state_markup_declaration_doctype;
1434             return data;
1435         }
1436 
1437         if (lexbor_str_data_ncasecmp((lxb_char_t *) "doctype", data, 7)) {
1438             tkz->state = lxb_html_tokenizer_state_doctype_before;
1439             return (data + 7);
1440         }
1441     }
1442     /* Case-sensitive match for the string "[CDATA["
1443      * (the five uppercase letters "CDATA" with a U+005B LEFT SQUARE BRACKET
1444      * character before and after)
1445      */
1446     else if (*data == 0x5B) {
1447         if ((end - data) < 7) {
1448             tkz->markup = (lxb_char_t *) "[CDATA[";
1449 
1450             tkz->state = lxb_html_tokenizer_state_markup_declaration_cdata;
1451             return data;
1452         }
1453 
1454         if (lexbor_str_data_ncmp((lxb_char_t *) "[CDATA[", data, 7)) {
1455             lxb_ns_id_t ns = lxb_html_tokenizer_current_namespace(tkz);
1456 
1457             if (ns != LXB_NS_HTML && ns != LXB_NS__UNDEF) {
1458                 data += 7;
1459 
1460                 lxb_html_tokenizer_state_token_set_begin(tkz, data);
1461 
1462                 tkz->state = lxb_html_tokenizer_state_cdata_section_before;
1463 
1464                 return data;
1465             }
1466 
1467             tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1468 
1469             return data;
1470         }
1471     }
1472 
1473     if (tkz->is_eof) {
1474         lxb_html_tokenizer_state_token_set_end_oef(tkz);
1475 
1476         tkz->token->begin = tkz->token->end;
1477     }
1478 
1479     lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1480                                  LXB_HTML_TOKENIZER_ERROR_INOPCO);
1481 
1482     tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1483 
1484     return data;
1485 }
1486 
1487 /*
1488  * Helper function. No in the specification. For 12.2.5.42
1489  * For a comment tag <!--
1490  */
1491 static const lxb_char_t *
lxb_html_tokenizer_state_markup_declaration_comment(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1492 lxb_html_tokenizer_state_markup_declaration_comment(lxb_html_tokenizer_t *tkz,
1493                                                     const lxb_char_t *data,
1494                                                     const lxb_char_t *end)
1495 {
1496     /* U+002D HYPHEN-MINUS characters (-) */
1497     if (*data == 0x2D) {
1498         tkz->state = lxb_html_tokenizer_state_comment_before_start;
1499         return (data + 1);
1500     }
1501 
1502     lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1503                                  LXB_HTML_TOKENIZER_ERROR_INOPCO);
1504 
1505     tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1506     return data;
1507 }
1508 
1509 /*
1510  * Helper function. No in the specification. For 12.2.5.42
1511  * For a DOCTYPE tag <!DOCTYPE
1512  */
1513 static const lxb_char_t *
lxb_html_tokenizer_state_markup_declaration_doctype(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1514 lxb_html_tokenizer_state_markup_declaration_doctype(lxb_html_tokenizer_t *tkz,
1515                                                     const lxb_char_t *data,
1516                                                     const lxb_char_t *end)
1517 {
1518     const lxb_char_t *pos;
1519     pos = lexbor_str_data_ncasecmp_first(tkz->markup, data, (end - data));
1520 
1521     if (pos == NULL) {
1522         lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1523                                      LXB_HTML_TOKENIZER_ERROR_INOPCO);
1524 
1525         tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1526         return data;
1527     }
1528 
1529     if (*pos == '\0') {
1530         data = (data + (pos - tkz->markup));
1531 
1532         tkz->state = lxb_html_tokenizer_state_doctype_before;
1533         return data;
1534     }
1535 
1536     tkz->markup = pos;
1537 
1538     return end;
1539 }
1540 
1541 /*
1542  * Helper function. No in the specification. For 12.2.5.42
1543  * For a CDATA tag <![CDATA[
1544  */
1545 static const lxb_char_t *
lxb_html_tokenizer_state_markup_declaration_cdata(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1546 lxb_html_tokenizer_state_markup_declaration_cdata(lxb_html_tokenizer_t *tkz,
1547                                                   const lxb_char_t *data,
1548                                                   const lxb_char_t *end)
1549 {
1550     const lxb_char_t *pos;
1551     pos = lexbor_str_data_ncasecmp_first(tkz->markup, data, (end - data));
1552 
1553     if (pos == NULL) {
1554         lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1555                                      LXB_HTML_TOKENIZER_ERROR_INOPCO);
1556 
1557         tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1558         return data;
1559     }
1560 
1561     if (*pos == '\0') {
1562         lxb_ns_id_t ns = lxb_html_tokenizer_current_namespace(tkz);
1563 
1564         if (ns != LXB_NS_HTML && ns != LXB_NS__UNDEF) {
1565             data = (data + (pos - tkz->markup));
1566 
1567             tkz->state = lxb_html_tokenizer_state_cdata_section_before;
1568             return data;
1569         }
1570 
1571         lxb_html_tokenizer_state_append_m(tkz, "[CDATA", 6);
1572 
1573         tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1574         return data;
1575     }
1576 
1577     tkz->markup = pos;
1578 
1579     return end;
1580 }
1581 
1582 /*
1583  * Helper function. No in the specification. For 12.2.5.69
1584  */
1585 static const lxb_char_t *
lxb_html_tokenizer_state_cdata_section_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1586 lxb_html_tokenizer_state_cdata_section_before(lxb_html_tokenizer_t *tkz,
1587                                               const lxb_char_t *data,
1588                                               const lxb_char_t *end)
1589 {
1590     if (tkz->is_eof == false) {
1591         lxb_html_tokenizer_state_token_set_begin(tkz, data);
1592     }
1593     else {
1594         lxb_html_tokenizer_state_token_set_begin(tkz, tkz->last);
1595     }
1596 
1597     tkz->token->tag_id = LXB_TAG__TEXT;
1598 
1599     tkz->state = lxb_html_tokenizer_state_cdata_section;
1600 
1601     return data;
1602 }
1603 
1604 /*
1605  * 12.2.5.69 CDATA section state
1606  */
1607 static const lxb_char_t *
lxb_html_tokenizer_state_cdata_section(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1608 lxb_html_tokenizer_state_cdata_section(lxb_html_tokenizer_t *tkz,
1609                                        const lxb_char_t *data,
1610                                        const lxb_char_t *end)
1611 {
1612     lxb_html_tokenizer_state_begin_set(tkz, data);
1613 
1614     while (data != end) {
1615         switch (*data) {
1616             /* U+005D RIGHT SQUARE BRACKET (]) */
1617             case 0x5D:
1618                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1619                 lxb_html_tokenizer_state_token_set_end(tkz, data);
1620 
1621                 tkz->state = lxb_html_tokenizer_state_cdata_section_bracket;
1622                 return (data + 1);
1623 
1624             /* U+000D CARRIAGE RETURN (CR) */
1625             case 0x0D:
1626                 if (++data >= end) {
1627                     lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
1628 
1629                     tkz->state = lxb_html_tokenizer_state_cr;
1630                     tkz->state_return = lxb_html_tokenizer_state_cdata_section;
1631 
1632                     return data;
1633                 }
1634 
1635                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1636                 tkz->pos[-1] = 0x0A;
1637 
1638                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1639 
1640                 if (*data != 0x0A) {
1641                     lxb_html_tokenizer_state_begin_set(tkz, data);
1642                     data--;
1643                 }
1644 
1645                 break;
1646 
1647             /* EOF */
1648             case 0x00:
1649                 if (tkz->is_eof) {
1650                     lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1651                                                  LXB_HTML_TOKENIZER_ERROR_EOINCD);
1652 
1653                     if (tkz->token->begin != NULL) {
1654                         lxb_html_tokenizer_state_append_data_m(tkz, data);
1655                         lxb_html_tokenizer_state_token_set_end_oef(tkz);
1656                     }
1657 
1658                     lxb_html_tokenizer_state_set_text(tkz);
1659                     lxb_html_tokenizer_state_token_done_m(tkz, end);
1660 
1661                     return end;
1662                 }
1663 
1664                 if (SIZE_MAX - tkz->token->null_count < 1) {
1665                     tkz->status = LXB_STATUS_ERROR_OVERFLOW;
1666                     return end;
1667                 }
1668 
1669                 tkz->token->null_count++;
1670 
1671                 break;
1672 
1673             default:
1674                 break;
1675         }
1676 
1677         data++;
1678     }
1679 
1680     lxb_html_tokenizer_state_append_data_m(tkz, data);
1681 
1682     return data;
1683 }
1684 
1685 /*
1686  * 12.2.5.70 CDATA section bracket state
1687  */
1688 static const lxb_char_t *
lxb_html_tokenizer_state_cdata_section_bracket(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1689 lxb_html_tokenizer_state_cdata_section_bracket(lxb_html_tokenizer_t *tkz,
1690                                                const lxb_char_t *data,
1691                                                const lxb_char_t *end)
1692 {
1693     /* U+005D RIGHT SQUARE BRACKET (]) */
1694     if (*data == 0x5D) {
1695         tkz->state = lxb_html_tokenizer_state_cdata_section_end;
1696         return (data + 1);
1697     }
1698 
1699     lxb_html_tokenizer_state_append_m(tkz, "]", 1);
1700 
1701     tkz->state = lxb_html_tokenizer_state_cdata_section;
1702 
1703     return data;
1704 }
1705 
1706 /*
1707  * 12.2.5.71 CDATA section end state
1708  */
1709 static const lxb_char_t *
lxb_html_tokenizer_state_cdata_section_end(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1710 lxb_html_tokenizer_state_cdata_section_end(lxb_html_tokenizer_t *tkz,
1711                                            const lxb_char_t *data,
1712                                            const lxb_char_t *end)
1713 {
1714     /* U+005D RIGHT SQUARE BRACKET (]) */
1715     if (*data == 0x5D) {
1716         lxb_html_tokenizer_state_append_m(tkz, data, 1);
1717         return (data + 1);
1718     }
1719     /* U+003E GREATER-THAN SIGN character */
1720     else if (*data == 0x3E) {
1721         tkz->state = lxb_html_tokenizer_state_data_before;
1722 
1723         lxb_html_tokenizer_state_set_text(tkz);
1724         lxb_html_tokenizer_state_token_done_m(tkz, end);
1725 
1726         return (data + 1);
1727     }
1728 
1729     lxb_html_tokenizer_state_append_m(tkz, "]]", 2);
1730 
1731     tkz->state = lxb_html_tokenizer_state_cdata_section;
1732 
1733     return data;
1734 }
1735 
1736 /*
1737  * 12.2.5.72 Character reference state
1738  */
1739 const lxb_char_t *
lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1740 lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz,
1741                                   const lxb_char_t *data, const lxb_char_t *end)
1742 {
1743     tkz->is_attribute = false;
1744 
1745     return _lxb_html_tokenizer_state_char_ref(tkz, data, end);
1746 }
1747 
1748 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_attr(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1749 lxb_html_tokenizer_state_char_ref_attr(lxb_html_tokenizer_t *tkz,
1750                                        const lxb_char_t *data,
1751                                        const lxb_char_t *end)
1752 {
1753     tkz->is_attribute = true;
1754 
1755     return _lxb_html_tokenizer_state_char_ref(tkz, data, end);
1756 }
1757 
static const lxb_char_t *
_lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz,
                                   const lxb_char_t *data,
                                   const lxb_char_t *end)
{
    /*
     * Shared dispatcher for "&..." after the ampersand itself has been
     * appended to the temporary buffer (hence the "pos - 1" below).
     */

    /* U+0023 NUMBER SIGN (#): numeric character reference. */
    if (*data == 0x23) {
        tkz->markup = data;
        tkz->entity_start = (tkz->pos - 1) - tkz->start;

        lxb_html_tokenizer_state_append_m(tkz, data, 1);

        tkz->state = lxb_html_tokenizer_state_char_ref_numeric;

        return data + 1;
    }

    /* ASCII alphanumeric: begin matching a named reference. */
    if (lexbor_str_res_alphanumeric_character[ *data ] != LEXBOR_STR_RES_SLIP) {
        tkz->entity = &lxb_html_tokenizer_res_entities_sbst[1];
        tkz->entity_match = NULL;
        tkz->entity_start = (tkz->pos - 1) - tkz->start;

        tkz->state = lxb_html_tokenizer_state_char_ref_named;
    }
    else {
        /* A lone "&": treat as data and resume the return state. */
        tkz->state = tkz->state_return;
    }

    return data;
}
1790 
1791 /*
1792  * 12.2.5.73 Named character reference state
1793  *
1794  * The slowest part in HTML parsing!!!
1795  *
1796  * This option works correctly and passes all tests (stream parsing too).
1797  * We must seriously think about how to accelerate this part.
1798  */
static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_named(lxb_html_tokenizer_t *tkz,
                                        const lxb_char_t *data,
                                        const lxb_char_t *end)
{
    size_t size, tail_size;
    lxb_char_t *start;
    /* Resume from the SBST node saved when the previous chunk ran out. */
    const lexbor_sbst_entry_static_t *entry = tkz->entity;

    const lxb_char_t *begin = data;

    while (data < end) {
        /* Advance through the static binary search tree by one input byte. */
        entry = lexbor_sbst_entry_static_find(lxb_html_tokenizer_res_entities_sbst,
                                              entry, *data);
        if (entry == NULL) {
            /* No longer a prefix of any entity; flush the scanned bytes. */
            lxb_html_tokenizer_state_append_m(tkz, begin, (data - begin));
            goto done;
        }

        /* A non-empty value marks a complete entity; keep the longest match. */
        if (entry->value[0] != 0) {
            tkz->entity_end = (tkz->pos + (data - begin)) - tkz->start;
            tkz->entity_match = entry;
        }

        entry = &lxb_html_tokenizer_res_entities_sbst[ entry->next ];

        data++;
    }

    /* If entry not NULL and buffer empty, then wait next buffer. */
    tkz->entity = entry;

    lxb_html_tokenizer_state_append_m(tkz, begin, (end - begin));
    return data;

done:

    /* If we have bad entity */
    if (tkz->entity_match == NULL) {
        tkz->state = lxb_html_tokenizer_state_char_ref_ambiguous_ampersand;

        return data;
    }

    tkz->state = tkz->state_return;

    /*
     * If the character reference was consumed as part of an attribute,
     * and the last character matched is not a U+003B SEMICOLON character (;),
     * and the next input character is either a U+003D EQUALS SIGN character (=)
     * or an ASCII alphanumeric, then, for historical reasons,
     * flush code points consumed as a character reference
     * and switch to the return state.
     */
    /* U+003B SEMICOLON character (;) */
    if (tkz->is_attribute && tkz->entity_match->key != 0x3B) {
        /* U+003D EQUALS SIGN character (=) or ASCII alphanumeric */
        if (*data == 0x3D
            || lexbor_str_res_alphanumeric_character[*data] != LEXBOR_STR_RES_SLIP)
        {
            return data;
        }
    }

    /* A match that does not end in ";" is a recoverable parse error. */
    if (tkz->entity_match->key != 0x3B) {
        lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                     LXB_HTML_TOKENIZER_ERROR_MISEAFCHRE);
    }

    /*
     * Rewrite the raw "&name..." bytes in the temp buffer with the entity's
     * replacement value, preserving any bytes appended after the match.
     */
    start = &tkz->start[tkz->entity_start];

    size = tkz->pos - start;
    /* Bytes past the longest match that must survive the substitution. */
    tail_size = tkz->pos - &tkz->start[tkz->entity_end] - 1;

    if (tail_size != 0) {
        if ((size + tail_size) + start > tkz->end) {
            if (lxb_html_tokenizer_temp_realloc(tkz, size) != LXB_STATUS_OK) {
                return end;
            }
            /* Realloc may have moved the buffer; recompute the pointer. */
            start = &tkz->start[tkz->entity_start];
        }

        /* Regions may overlap, hence memmove rather than memcpy. */
        memmove(start + tkz->entity_match->value_len,
                tkz->pos - tail_size, tail_size);
    }

    memcpy(start, tkz->entity_match->value, tkz->entity_match->value_len);

    tkz->pos = start + (tkz->entity_match->value_len + tail_size);

    return data;
}
1891 
1892 /*
1893  * 12.2.5.74 Ambiguous ampersand state
1894  */
1895 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_ambiguous_ampersand(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1896 lxb_html_tokenizer_state_char_ref_ambiguous_ampersand(lxb_html_tokenizer_t *tkz,
1897                                                       const lxb_char_t *data,
1898                                                       const lxb_char_t *end)
1899 {
1900     /* ASCII alphanumeric */
1901     /* Skipped, not need */
1902 
1903     /* U+003B SEMICOLON (;) */
1904     if (*data == 0x3B) {
1905         lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1906                                      LXB_HTML_TOKENIZER_ERROR_UNNACHRE);
1907     }
1908 
1909     tkz->state = tkz->state_return;
1910 
1911     return data;
1912 }
1913 
1914 /*
1915  * 12.2.5.75 Numeric character reference state
1916  */
1917 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_numeric(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1918 lxb_html_tokenizer_state_char_ref_numeric(lxb_html_tokenizer_t *tkz,
1919                                           const lxb_char_t *data,
1920                                           const lxb_char_t *end)
1921 {
1922     tkz->entity_number = 0;
1923 
1924     /*
1925      * U+0078 LATIN SMALL LETTER X
1926      * U+0058 LATIN CAPITAL LETTER X
1927      */
1928     if (*data == 0x78 || *data == 0x58) {
1929         lxb_html_tokenizer_state_append_m(tkz, data, 1);
1930 
1931         tkz->state = lxb_html_tokenizer_state_char_ref_hexademical_start;
1932 
1933         return (data + 1);
1934     }
1935 
1936     tkz->state = lxb_html_tokenizer_state_char_ref_decimal_start;
1937 
1938     return data;
1939 }
1940 
1941 /*
1942  * 12.2.5.76 Hexademical character reference start state
1943  */
1944 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_hexademical_start(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1945 lxb_html_tokenizer_state_char_ref_hexademical_start(lxb_html_tokenizer_t *tkz,
1946                                                     const lxb_char_t *data,
1947                                                     const lxb_char_t *end)
1948 {
1949     /* ASCII hex digit */
1950     if (lexbor_str_res_map_hex[ *data ] != LEXBOR_STR_RES_SLIP) {
1951         tkz->state = lxb_html_tokenizer_state_char_ref_hexademical;
1952     }
1953     else {
1954         lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1955                                      LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE);
1956 
1957         tkz->state = tkz->state_return;
1958     }
1959 
1960     return data;
1961 }
1962 
1963 /*
1964  * 12.2.5.77 Decimal character reference start state
1965  */
1966 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_decimal_start(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1967 lxb_html_tokenizer_state_char_ref_decimal_start(lxb_html_tokenizer_t *tkz,
1968                                                 const lxb_char_t *data,
1969                                                 const lxb_char_t *end)
1970 {
1971     /* ASCII digit */
1972     if (lexbor_str_res_map_num[ *data ] != LEXBOR_STR_RES_SLIP) {
1973         tkz->state = lxb_html_tokenizer_state_char_ref_decimal;
1974     }
1975     else {
1976         lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1977                                      LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE);
1978 
1979         tkz->state = tkz->state_return;
1980     }
1981 
1982     return data;
1983 }
1984 
1985 /*
1986  * 12.2.5.78 Hexademical character reference state
1987  */
1988 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_hexademical(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1989 lxb_html_tokenizer_state_char_ref_hexademical(lxb_html_tokenizer_t *tkz,
1990                                               const lxb_char_t *data,
1991                                               const lxb_char_t *end)
1992 {
1993     while (data != end) {
1994         if (lexbor_str_res_map_hex[ *data ] == LEXBOR_STR_RES_SLIP) {
1995             tkz->state = tkz->state_return;
1996 
1997             if (*data == ';') {
1998                 data++;
1999             }
2000 
2001             return lxb_html_tokenizer_state_char_ref_numeric_end(tkz, data, end);
2002         }
2003 
2004         if (tkz->entity_number <= 0x10FFFF) {
2005             tkz->entity_number <<= 4;
2006             tkz->entity_number |= lexbor_str_res_map_hex[ *data ];
2007         }
2008 
2009         data++;
2010     }
2011 
2012     return data;
2013 }
2014 
2015 /*
2016  * 12.2.5.79 Decimal character reference state
2017  */
2018 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_decimal(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)2019 lxb_html_tokenizer_state_char_ref_decimal(lxb_html_tokenizer_t *tkz,
2020                                           const lxb_char_t *data,
2021                                           const lxb_char_t *end)
2022 {
2023     while (data != end) {
2024         if (lexbor_str_res_map_num[ *data ] == LEXBOR_STR_RES_SLIP) {
2025             tkz->state = tkz->state_return;
2026 
2027             if (*data == ';') {
2028                 data++;
2029             }
2030 
2031             return lxb_html_tokenizer_state_char_ref_numeric_end(tkz, data, end);
2032         }
2033 
2034         if (tkz->entity_number <= 0x10FFFF) {
2035             tkz->entity_number = lexbor_str_res_map_num[ *data ]
2036                                  + tkz->entity_number * 10;
2037         }
2038 
2039         data++;
2040     }
2041 
2042     return data;
2043 }
2044 
2045 /*
2046  * 12.2.5.80 Numeric character reference end state
2047  */
2048 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_numeric_end(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)2049 lxb_html_tokenizer_state_char_ref_numeric_end(lxb_html_tokenizer_t *tkz,
2050                                               const lxb_char_t *data,
2051                                               const lxb_char_t *end)
2052 {
2053     lxb_char_t *start = &tkz->start[tkz->entity_start];
2054 
2055     if ((start + 4) > tkz->end) {
2056         if(lxb_html_tokenizer_temp_realloc(tkz, 4)) {
2057             return end;
2058         }
2059 
2060         start = &tkz->start[tkz->entity_start];
2061     }
2062 
2063     if (tkz->entity_number == 0x00) {
2064         lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2065                                      LXB_HTML_TOKENIZER_ERROR_NUCHRE);
2066 
2067         goto xFFFD;
2068     }
2069     else if (tkz->entity_number > 0x10FFFF) {
2070         lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2071                                      LXB_HTML_TOKENIZER_ERROR_CHREOUUNRA);
2072 
2073         goto xFFFD;
2074     }
2075     else if (tkz->entity_number >= 0xD800 && tkz->entity_number <= 0xDFFF) {
2076         lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2077                                      LXB_HTML_TOKENIZER_ERROR_SUCHRE);
2078 
2079         goto xFFFD;
2080     }
2081     else if (tkz->entity_number >= 0xFDD0 && tkz->entity_number <= 0xFDEF) {
2082         lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2083                                      LXB_HTML_TOKENIZER_ERROR_NOCHRE);
2084     }
2085 
2086     switch (tkz->entity_number) {
2087         case 0xFFFE:  case 0xFFFF:  case 0x1FFFE: case 0x1FFFF: case 0x2FFFE:
2088         case 0x2FFFF: case 0x3FFFE: case 0x3FFFF: case 0x4FFFE: case 0x4FFFF:
2089         case 0x5FFFE: case 0x5FFFF: case 0x6FFFE: case 0x6FFFF: case 0x7FFFE:
2090         case 0x7FFFF: case 0x8FFFE: case 0x8FFFF: case 0x9FFFE: case 0x9FFFF:
2091         case 0xAFFFE: case 0xAFFFF: case 0xBFFFE: case 0xBFFFF: case 0xCFFFE:
2092         case 0xCFFFF: case 0xDFFFE: case 0xDFFFF: case 0xEFFFE: case 0xEFFFF:
2093         case 0xFFFFE: case 0xFFFFF:
2094         case 0x10FFFE:
2095         case 0x10FFFF:
2096             lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2097                                          LXB_HTML_TOKENIZER_ERROR_NOCHRE);
2098             break;
2099 
2100         default:
2101             break;
2102     }
2103 
2104     if (tkz->entity_number <= 0x1F
2105         || (tkz->entity_number >= 0x7F && tkz->entity_number <= 0x9F))
2106     {
2107         lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2108                                      LXB_HTML_TOKENIZER_ERROR_COCHRE);
2109     }
2110 
2111     if (tkz->entity_number <= 0x9F) {
2112         tkz->entity_number = (uint32_t) lexbor_str_res_replacement_character[tkz->entity_number];
2113     }
2114 
2115     start += lxb_html_tokenizer_state_to_ascii_utf_8(tkz->entity_number, start);
2116 
2117     tkz->pos = start;
2118 
2119     return data;
2120 
2121 xFFFD:
2122 
2123     memcpy(start, lexbor_str_res_ansi_replacement_character,
2124            sizeof(lexbor_str_res_ansi_replacement_character) - 1);
2125 
2126     tkz->pos = start + sizeof(lexbor_str_res_ansi_replacement_character) - 1;
2127 
2128     return data;
2129 }
2130 
/*
 * Encode a Unicode code point as UTF-8 into `data`.
 * Returns the number of bytes written (1..4), or 0 for values above
 * 0x1FFFFF, which callers never pass.
 */
static size_t
lxb_html_tokenizer_state_to_ascii_utf_8(size_t codepoint, lxb_char_t *data)
{
    if (codepoint < 0x80) {
        /* 0xxxxxxx */
        data[0] = (char) codepoint;

        return 1;
    }

    if (codepoint < 0x800) {
        /* 110xxxxx 10xxxxxx */
        data[0] = (char) (0xC0 | (codepoint >> 6));
        data[1] = (char) (0x80 | (codepoint & 0x3F));

        return 2;
    }

    if (codepoint < 0x10000) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        data[0] = (char) (0xE0 | (codepoint >> 12));
        data[1] = (char) (0x80 | ((codepoint >> 6) & 0x3F));
        data[2] = (char) (0x80 | (codepoint & 0x3F));

        return 3;
    }

    if (codepoint < 0x200000) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        data[0] = (char) (0xF0 | (codepoint >> 18));
        data[1] = (char) (0x80 | ((codepoint >> 12) & 0x3F));
        data[2] = (char) (0x80 | ((codepoint >> 6) & 0x3F));
        data[3] = (char) (0x80 | (codepoint & 0x3F));

        return 4;
    }

    return 0;
}
2172