1 /*
2 * Copyright (C) 2018-2020 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
7 #include "lexbor/html/tokenizer/state.h"
8 #include "lexbor/html/tokenizer/state_comment.h"
9 #include "lexbor/html/tokenizer/state_doctype.h"
10
11 #define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
12 #define LEXBOR_STR_RES_ALPHANUMERIC_CHARACTER
13 #define LEXBOR_STR_RES_REPLACEMENT_CHARACTER
14 #define LEXBOR_STR_RES_ALPHA_CHARACTER
15 #define LEXBOR_STR_RES_MAP_HEX
16 #define LEXBOR_STR_RES_MAP_NUM
17 #include "lexbor/core/str_res.h"
18
19 #define LXB_HTML_TOKENIZER_RES_ENTITIES_SBST
20 #include "lexbor/html/tokenizer/res.h"
21
22
23 const lxb_tag_data_t *
24 lxb_tag_append_lower(lexbor_hash_t *hash,
25 const lxb_char_t *name, size_t length);
26
27 lxb_dom_attr_data_t *
28 lxb_dom_attr_local_name_append(lexbor_hash_t *hash,
29 const lxb_char_t *name, size_t length);
30
31
32 static const lxb_char_t *
33 lxb_html_tokenizer_state_data(lxb_html_tokenizer_t *tkz,
34 const lxb_char_t *data, const lxb_char_t *end);
35
36 static const lxb_char_t *
37 lxb_html_tokenizer_state_plaintext(lxb_html_tokenizer_t *tkz,
38 const lxb_char_t *data,
39 const lxb_char_t *end);
40
41 /* Tag */
42 static const lxb_char_t *
43 lxb_html_tokenizer_state_tag_open(lxb_html_tokenizer_t *tkz,
44 const lxb_char_t *data,
45 const lxb_char_t *end);
46
47 static const lxb_char_t *
48 lxb_html_tokenizer_state_end_tag_open(lxb_html_tokenizer_t *tkz,
49 const lxb_char_t *data,
50 const lxb_char_t *end);
51
52 static const lxb_char_t *
53 lxb_html_tokenizer_state_tag_name(lxb_html_tokenizer_t *tkz,
54 const lxb_char_t *data,
55 const lxb_char_t *end);
56
57 /* Attribute */
58 static const lxb_char_t *
59 lxb_html_tokenizer_state_attribute_name(lxb_html_tokenizer_t *tkz,
60 const lxb_char_t *data,
61 const lxb_char_t *end);
62
63 static const lxb_char_t *
64 lxb_html_tokenizer_state_after_attribute_name(lxb_html_tokenizer_t *tkz,
65 const lxb_char_t *data,
66 const lxb_char_t *end);
67
68 static const lxb_char_t *
69 lxb_html_tokenizer_state_before_attribute_value(lxb_html_tokenizer_t *tkz,
70 const lxb_char_t *data,
71 const lxb_char_t *end);
72
73 static const lxb_char_t *
74 lxb_html_tokenizer_state_attribute_value_double_quoted(lxb_html_tokenizer_t *tkz,
75 const lxb_char_t *data,
76 const lxb_char_t *end);
77
78 static const lxb_char_t *
79 lxb_html_tokenizer_state_attribute_value_single_quoted(lxb_html_tokenizer_t *tkz,
80 const lxb_char_t *data,
81 const lxb_char_t *end);
82
83 static const lxb_char_t *
84 lxb_html_tokenizer_state_attribute_value_unquoted(lxb_html_tokenizer_t *tkz,
85 const lxb_char_t *data,
86 const lxb_char_t *end);
87
88 static const lxb_char_t *
89 lxb_html_tokenizer_state_after_attribute_value_quoted(lxb_html_tokenizer_t *tkz,
90 const lxb_char_t *data,
91 const lxb_char_t *end);
92
93 static const lxb_char_t *
94 lxb_html_tokenizer_state_bogus_comment_before(lxb_html_tokenizer_t *tkz,
95 const lxb_char_t *data,
96 const lxb_char_t *end);
97
98 static const lxb_char_t *
99 lxb_html_tokenizer_state_bogus_comment(lxb_html_tokenizer_t *tkz,
100 const lxb_char_t *data,
101 const lxb_char_t *end);
102
103 /* Markup declaration */
104 static const lxb_char_t *
105 lxb_html_tokenizer_state_markup_declaration_open(lxb_html_tokenizer_t *tkz,
106 const lxb_char_t *data,
107 const lxb_char_t *end);
108
109 static const lxb_char_t *
110 lxb_html_tokenizer_state_markup_declaration_comment(lxb_html_tokenizer_t *tkz,
111 const lxb_char_t *data,
112 const lxb_char_t *end);
113
114 static const lxb_char_t *
115 lxb_html_tokenizer_state_markup_declaration_doctype(lxb_html_tokenizer_t *tkz,
116 const lxb_char_t *data,
117 const lxb_char_t *end);
118
119 static const lxb_char_t *
120 lxb_html_tokenizer_state_markup_declaration_cdata(lxb_html_tokenizer_t *tkz,
121 const lxb_char_t *data,
122 const lxb_char_t *end);
123
124 /* CDATA Section */
125 static const lxb_char_t *
126 lxb_html_tokenizer_state_cdata_section_before(lxb_html_tokenizer_t *tkz,
127 const lxb_char_t *data,
128 const lxb_char_t *end);
129
130 static const lxb_char_t *
131 lxb_html_tokenizer_state_cdata_section(lxb_html_tokenizer_t *tkz,
132 const lxb_char_t *data,
133 const lxb_char_t *end);
134
135 static const lxb_char_t *
136 lxb_html_tokenizer_state_cdata_section_bracket(lxb_html_tokenizer_t *tkz,
137 const lxb_char_t *data,
138 const lxb_char_t *end);
139
140 static const lxb_char_t *
141 lxb_html_tokenizer_state_cdata_section_end(lxb_html_tokenizer_t *tkz,
142 const lxb_char_t *data,
143 const lxb_char_t *end);
144
145 static const lxb_char_t *
146 lxb_html_tokenizer_state_char_ref_attr(lxb_html_tokenizer_t *tkz,
147 const lxb_char_t *data,
148 const lxb_char_t *end);
149
150 static const lxb_char_t *
151 _lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz,
152 const lxb_char_t *data,
153 const lxb_char_t *end);
154
155 static const lxb_char_t *
156 lxb_html_tokenizer_state_char_ref_named(lxb_html_tokenizer_t *tkz,
157 const lxb_char_t *data,
158 const lxb_char_t *end);
159
160 static const lxb_char_t *
161 lxb_html_tokenizer_state_char_ref_ambiguous_ampersand(lxb_html_tokenizer_t *tkz,
162 const lxb_char_t *data,
163 const lxb_char_t *end);
164
165 static const lxb_char_t *
166 lxb_html_tokenizer_state_char_ref_numeric(lxb_html_tokenizer_t *tkz,
167 const lxb_char_t *data,
168 const lxb_char_t *end);
169
170 static const lxb_char_t *
171 lxb_html_tokenizer_state_char_ref_hexademical_start(lxb_html_tokenizer_t *tkz,
172 const lxb_char_t *data,
173 const lxb_char_t *end);
174
175 static const lxb_char_t *
176 lxb_html_tokenizer_state_char_ref_decimal_start(lxb_html_tokenizer_t *tkz,
177 const lxb_char_t *data,
178 const lxb_char_t *end);
179
180 static const lxb_char_t *
181 lxb_html_tokenizer_state_char_ref_hexademical(lxb_html_tokenizer_t *tkz,
182 const lxb_char_t *data,
183 const lxb_char_t *end);
184
185 static const lxb_char_t *
186 lxb_html_tokenizer_state_char_ref_decimal(lxb_html_tokenizer_t *tkz,
187 const lxb_char_t *data,
188 const lxb_char_t *end);
189
190 static const lxb_char_t *
191 lxb_html_tokenizer_state_char_ref_numeric_end(lxb_html_tokenizer_t *tkz,
192 const lxb_char_t *data,
193 const lxb_char_t *end);
194
195 static size_t
196 lxb_html_tokenizer_state_to_ascii_utf_8(size_t codepoint, lxb_char_t *data);
197
198
199 /*
200 * Helper function. No in the specification. For 12.2.5.1 Data state
201 */
202 const lxb_char_t *
lxb_html_tokenizer_state_data_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)203 lxb_html_tokenizer_state_data_before(lxb_html_tokenizer_t *tkz,
204 const lxb_char_t *data,
205 const lxb_char_t *end)
206 {
207 if (tkz->is_eof == false) {
208 lxb_html_tokenizer_state_token_set_begin(tkz, data);
209 }
210
211 /*
212 * Text node init param sets before emit token.
213 */
214
215 tkz->state = lxb_html_tokenizer_state_data;
216
217 return data;
218 }
219
220 /*
221 * 12.2.5.1 Data state
222 */
static const lxb_char_t *
lxb_html_tokenizer_state_data(lxb_html_tokenizer_t *tkz,
                              const lxb_char_t *data, const lxb_char_t *end)
{
    /* Accumulates raw character data until '<', '&', CR, NULL or EOF. */
    lxb_html_tokenizer_state_begin_set(tkz, data);

    while (data != end) {
        switch (*data) {
            /* U+003C LESS-THAN SIGN (<) */
            case 0x3C:
                /* Flush pending text, then hand off to the tag-open state. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_token_set_end(tkz, data);

                tkz->state = lxb_html_tokenizer_state_tag_open;
                return (data + 1);

            /* U+0026 AMPERSAND (&) */
            case 0x26:
                /*
                 * Append up to and including the '&'; if the character
                 * reference turns out invalid the '&' stays literal text.
                 */
                lxb_html_tokenizer_state_append_data_m(tkz, data + 1);

                tkz->state = lxb_html_tokenizer_state_char_ref;
                tkz->state_return = lxb_html_tokenizer_state_data;

                return data + 1;

            /* U+000D CARRIAGE RETURN (CR) */
            case 0x0D:
                if (++data >= end) {
                    /*
                     * Chunk ends right after CR: defer the CR vs CRLF
                     * decision to the dedicated CR state.
                     */
                    lxb_html_tokenizer_state_append_data_m(tkz, data - 1);

                    tkz->state = lxb_html_tokenizer_state_cr;
                    tkz->state_return = lxb_html_tokenizer_state_data;

                    return data;
                }

                /* Normalize CR (and CRLF) to a single LF in the buffer. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                tkz->pos[-1] = 0x0A;

                lxb_html_tokenizer_state_begin_set(tkz, data + 1);

                if (*data != 0x0A) {
                    /* Lone CR: step back so this character is reprocessed. */
                    lxb_html_tokenizer_state_begin_set(tkz, data);
                    data--;
                }

                break;

            /*
             * U+0000 NULL
             * EOF
             */
            case 0x00:
                if (tkz->is_eof) {
                    /* Emit TEXT node if not empty */
                    if (tkz->token->begin != NULL) {
                        lxb_html_tokenizer_state_token_set_end_oef(tkz);
                    }

                    if (tkz->token->begin != tkz->token->end) {
                        tkz->token->tag_id = LXB_TAG__TEXT;

                        lxb_html_tokenizer_state_append_data_m(tkz, data);

                        lxb_html_tokenizer_state_set_text(tkz);
                        lxb_html_tokenizer_state_token_done_wo_check_m(tkz,end);
                    }

                    return end;
                }

                /* Guard the per-token NULL counter against overflow. */
                if (SIZE_MAX - tkz->token->null_count < 1) {
                    tkz->status = LXB_STATUS_ERROR_OVERFLOW;
                    return end;
                }

                /*
                 * NULL bytes are kept in the data here (parse error only);
                 * null_count lets later stages deal with them.
                 */
                tkz->token->null_count++;

                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                             LXB_HTML_TOKENIZER_ERROR_UNNUCH);
                break;
        }

        data++;
    }

    lxb_html_tokenizer_state_append_data_m(tkz, data);

    return data;
}
313
314 /*
315 * Helper function. No in the specification. For 12.2.5.5 PLAINTEXT state
316 */
317 const lxb_char_t *
lxb_html_tokenizer_state_plaintext_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)318 lxb_html_tokenizer_state_plaintext_before(lxb_html_tokenizer_t *tkz,
319 const lxb_char_t *data,
320 const lxb_char_t *end)
321 {
322 if (tkz->is_eof == false) {
323 lxb_html_tokenizer_state_token_set_begin(tkz, data);
324 }
325
326 tkz->token->tag_id = LXB_TAG__TEXT;
327
328 tkz->state = lxb_html_tokenizer_state_plaintext;
329
330 return data;
331 }
332
333 /*
334 * 12.2.5.5 PLAINTEXT state
335 */
static const lxb_char_t *
lxb_html_tokenizer_state_plaintext(lxb_html_tokenizer_t *tkz,
                                   const lxb_char_t *data,
                                   const lxb_char_t *end)
{
    /*
     * Everything is raw text; only CR normalization and NULL replacement
     * are performed until EOF ends the single text token.
     */
    lxb_html_tokenizer_state_begin_set(tkz, data);

    while (data != end) {
        switch (*data) {
            /* U+000D CARRIAGE RETURN (CR) */
            case 0x0D:
                if (++data >= end) {
                    /* Chunk ends on CR: finish CR/LF handling in the
                       dedicated CR state. */
                    lxb_html_tokenizer_state_append_data_m(tkz, data - 1);

                    tkz->state = lxb_html_tokenizer_state_cr;
                    tkz->state_return = lxb_html_tokenizer_state_plaintext;

                    return data;
                }

                /* Normalize CR (and CRLF) to a single LF in the buffer. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                tkz->pos[-1] = 0x0A;

                lxb_html_tokenizer_state_begin_set(tkz, data + 1);

                if (*data != 0x0A) {
                    /* Lone CR: step back so this character is reprocessed. */
                    lxb_html_tokenizer_state_begin_set(tkz, data);
                    data--;
                }

                break;

            /*
             * U+0000 NULL
             * EOF
             */
            case 0x00:
                lxb_html_tokenizer_state_append_data_m(tkz, data);

                if (tkz->is_eof) {
                    /* End of stream: emit the accumulated text token. */
                    if (tkz->token->begin != NULL) {
                        lxb_html_tokenizer_state_token_set_end_oef(tkz);
                    }

                    lxb_html_tokenizer_state_set_text(tkz);
                    lxb_html_tokenizer_state_token_done_m(tkz, end);

                    return end;
                }

                /* NULL becomes U+FFFD REPLACEMENT CHARACTER (parse error). */
                lxb_html_tokenizer_state_begin_set(tkz, data + 1);
                lxb_html_tokenizer_state_append_replace_m(tkz);

                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                             LXB_HTML_TOKENIZER_ERROR_UNNUCH);
                break;
        }

        data++;
    }

    lxb_html_tokenizer_state_append_data_m(tkz, data);

    return data;
}
401
402 /*
403 * 12.2.5.6 Tag open state
404 */
static const lxb_char_t *
lxb_html_tokenizer_state_tag_open(lxb_html_tokenizer_t *tkz,
                                  const lxb_char_t *data, const lxb_char_t *end)
{
    /* Dispatches on the character following '<'. */

    /* ASCII alpha */
    if (lexbor_str_res_alpha_character[ *data ] != LEXBOR_STR_RES_SLIP) {
        /* Start tag: flush pending text, then read the tag name. */
        tkz->state = lxb_html_tokenizer_state_tag_name;

        lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
        lxb_html_tokenizer_state_token_set_begin(tkz, data);

        return data;
    }

    /* U+002F SOLIDUS (/) */
    else if (*data == 0x2F) {
        tkz->state = lxb_html_tokenizer_state_end_tag_open;

        return (data + 1);
    }

    /* U+0021 EXCLAMATION MARK (!) */
    else if (*data == 0x21) {
        /* "<!" starts a comment, DOCTYPE or CDATA section. */
        tkz->state = lxb_html_tokenizer_state_markup_declaration_open;

        lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);

        return (data + 1);
    }

    /* U+003F QUESTION MARK (?) */
    else if (*data == 0x3F) {
        /* "<?" is a parse error; reparse it as a bogus comment. */
        tkz->state = lxb_html_tokenizer_state_bogus_comment_before;

        lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
        lxb_html_tokenizer_state_token_set_begin(tkz, data);

        lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                     LXB_HTML_TOKENIZER_ERROR_UNQUMAINOFTANA);

        return data;
    }

    /* EOF */
    else if (*data == 0x00) {
        if (tkz->is_eof) {
            /* EOF right after '<': emit the '<' as literal text. */
            lxb_html_tokenizer_state_append_m(tkz, "<", 1);

            lxb_html_tokenizer_state_token_set_end_oef(tkz);
            lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);

            lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
                                         LXB_HTML_TOKENIZER_ERROR_EOBETANA);

            return end;
        }
    }

    /* Anything else: '<' is literal text; reprocess in the Data state. */
    lxb_html_tokenizer_state_append_m(tkz, "<", 1);

    lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                 LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA);

    tkz->state = lxb_html_tokenizer_state_data;

    return data;
}
472
473 /*
474 * 12.2.5.7 End tag open state
475 */
static const lxb_char_t *
lxb_html_tokenizer_state_end_tag_open(lxb_html_tokenizer_t *tkz,
                                      const lxb_char_t *data,
                                      const lxb_char_t *end)
{
    /* Dispatches on the character following "</". */

    /* ASCII alpha */
    if (lexbor_str_res_alpha_character[ *data ] != LEXBOR_STR_RES_SLIP) {
        /* End tag: flush pending text, mark token CLOSE, read the name. */
        tkz->state = lxb_html_tokenizer_state_tag_name;

        lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
        lxb_html_tokenizer_state_token_set_begin(tkz, data);

        tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;

        return data;
    }

    /* U+003E GREATER-THAN SIGN (>) */
    else if (*data == 0x3E) {
        /* "</>": missing end tag name; the whole sequence is dropped. */
        tkz->state = lxb_html_tokenizer_state_data;

        lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                     LXB_HTML_TOKENIZER_ERROR_MIENTANA);

        return (data + 1);
    }

    /* Fake EOF */
    else if (*data == 0x00) {
        if (tkz->is_eof) {
            /* EOF right after "</": emit it as literal text. */
            lxb_html_tokenizer_state_append_m(tkz, "</", 2);

            lxb_html_tokenizer_state_token_set_end_oef(tkz);
            lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);

            lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
                                         LXB_HTML_TOKENIZER_ERROR_EOBETANA);

            return end;
        }
    }

    /* Anything else: parse error; reparse as a bogus comment. */
    tkz->state = lxb_html_tokenizer_state_bogus_comment_before;

    lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                 LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA);

    lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
    lxb_html_tokenizer_state_token_set_begin(tkz, data);

    return data;
}
528
529 /*
530 * 12.2.5.8 Tag name state
531 */
static const lxb_char_t *
lxb_html_tokenizer_state_tag_name(lxb_html_tokenizer_t *tkz,
                                  const lxb_char_t *data, const lxb_char_t *end)
{
    /* Accumulates the tag name until whitespace, '/', '>' or NULL/EOF. */
    lxb_html_tokenizer_state_begin_set(tkz, data);

    while (data != end) {
        switch (*data) {
            /*
             * U+0009 CHARACTER TABULATION (tab)
             * U+000A LINE FEED (LF)
             * U+000C FORM FEED (FF)
             * U+000D CARRIAGE RETURN (CR)
             * U+0020 SPACE
             */
            case 0x09:
            case 0x0A:
            case 0x0C:
            case 0x0D:
            case 0x20:
                /* Name is complete; resolve it and move to attributes. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_set_tag_m(tkz, tkz->start, tkz->pos);
                lxb_html_tokenizer_state_token_set_end(tkz, data);

                tkz->state = lxb_html_tokenizer_state_before_attribute_name;
                return (data + 1);

            /* U+002F SOLIDUS (/) */
            case 0x2F:
                /* Name is complete; possible self-closing tag. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_set_tag_m(tkz, tkz->start, tkz->pos);
                lxb_html_tokenizer_state_token_set_end(tkz, data);

                tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
                return (data + 1);

            /* U+003E GREATER-THAN SIGN (>) */
            case 0x3E:
                /* Tag is complete; emit the token. */
                tkz->state = lxb_html_tokenizer_state_data_before;

                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_set_tag_m(tkz, tkz->start, tkz->pos);
                lxb_html_tokenizer_state_token_set_end(tkz, data);
                lxb_html_tokenizer_state_token_done_m(tkz, end);

                return (data + 1);

            /* U+0000 NULL */
            case 0x00:
                if (tkz->is_eof) {
                    /* EOF inside a tag: parse error, token is dropped. */
                    lxb_html_tokenizer_state_token_set_end_oef(tkz);

                    lxb_html_tokenizer_error_add(tkz->parse_errors,
                                                 tkz->token->end,
                                                 LXB_HTML_TOKENIZER_ERROR_EOINTA);
                    return end;
                }

                /* NULL becomes U+FFFD REPLACEMENT CHARACTER (parse error). */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_begin_set(tkz, data + 1);
                lxb_html_tokenizer_state_append_replace_m(tkz);

                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                             LXB_HTML_TOKENIZER_ERROR_UNNUCH);
                break;

            default:
                break;
        }

        data++;
    }

    lxb_html_tokenizer_state_append_data_m(tkz, data);

    return data;
}
609
610 /*
611 * 12.2.5.32 Before attribute name state
612 */
const lxb_char_t *
lxb_html_tokenizer_state_before_attribute_name(lxb_html_tokenizer_t *tkz,
                                               const lxb_char_t *data,
                                               const lxb_char_t *end)
{
    /* Set by the token_attr_add_m macro when a new attribute is created. */
    lxb_html_token_attr_t *attr;

    while (data != end) {
        switch (*data) {
            /*
             * U+0009 CHARACTER TABULATION (tab)
             * U+000A LINE FEED (LF)
             * U+000C FORM FEED (FF)
             * U+000D CARRIAGE RETURN (CR)
             * U+0020 SPACE
             */
            case 0x09:
            case 0x0A:
            case 0x0C:
            case 0x0D:
            case 0x20:
                /* Skip whitespace before the attribute name. */
                break;

            /* U+003D EQUALS SIGN (=) */
            case 0x3D:
                /*
                 * Parse error: '=' before a name. Per spec the '=' starts
                 * the attribute name itself.
                 */
                lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
                lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);

                lxb_html_tokenizer_state_append_m(tkz, data, 1);

                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                             LXB_HTML_TOKENIZER_ERROR_UNEQSIBEATNA);

                tkz->state = lxb_html_tokenizer_state_attribute_name;
                return (data + 1);

            /*
             * U+002F SOLIDUS (/)
             * U+003E GREATER-THAN SIGN (>)
             */
            case 0x2F:
            case 0x3E:
                /* No attribute; let the after-name state finish the tag. */
                tkz->state = lxb_html_tokenizer_state_after_attribute_name;
                return data;

            /* EOF */
            case 0x00:
                if (tkz->is_eof) {
                    tkz->state = lxb_html_tokenizer_state_after_attribute_name;
                    return data;
                }
                /* fall through */

            /* Anything else */
            default:
                /* Begin a new attribute; its name starts here. */
                lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
                lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);

                tkz->state = lxb_html_tokenizer_state_attribute_name;
                return data;
        }

        data++;
    }

    return data;
}
680
681 /*
682 * 12.2.5.33 Attribute name state
683 */
static const lxb_char_t *
lxb_html_tokenizer_state_attribute_name(lxb_html_tokenizer_t *tkz,
                                        const lxb_char_t *data,
                                        const lxb_char_t *end)
{
    /* Accumulates the attribute name until whitespace, '/', '>', '=',
       NULL or EOF. */
    lxb_html_tokenizer_state_begin_set(tkz, data);

    while (data != end) {
        switch (*data) {
            /*
             * U+0009 CHARACTER TABULATION (tab)
             * U+000A LINE FEED (LF)
             * U+000C FORM FEED (FF)
             * U+000D CARRIAGE RETURN (CR)
             * U+0020 SPACE
             * U+002F SOLIDUS (/)
             * U+003E GREATER-THAN SIGN (>)
             */
            case 0x09:
            case 0x0A:
            case 0x0C:
            case 0x0D:
            case 0x20:
            case 0x2F:
            case 0x3E:
                /* Name is complete; reprocess this char after the name. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_set_name_m(tkz);
                lxb_html_tokenizer_state_token_set_end(tkz, data);

                tkz->state = lxb_html_tokenizer_state_after_attribute_name;
                return data;

            /*
             * U+0000 NULL
             * EOF
             */
            case 0x00:
                if (tkz->is_eof) {
                    /* EOF inside the name: close it and let the
                       after-name state report the error. */
                    lxb_html_tokenizer_state_token_attr_set_name_end_oef(tkz);

                    tkz->state = lxb_html_tokenizer_state_after_attribute_name;
                    return data;
                }

                /* NULL becomes U+FFFD REPLACEMENT CHARACTER (parse error). */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_begin_set(tkz, data + 1);
                lxb_html_tokenizer_state_append_replace_m(tkz);

                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                             LXB_HTML_TOKENIZER_ERROR_UNNUCH);
                break;

            /* U+003D EQUALS SIGN (=) */
            case 0x3D:
                /* Name is complete; a value follows. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_set_name_m(tkz);
                lxb_html_tokenizer_state_token_attr_set_name_end(tkz, data);

                tkz->state = lxb_html_tokenizer_state_before_attribute_value;
                return (data + 1);

            /*
             * U+0022 QUOTATION MARK (")
             * U+0027 APOSTROPHE (')
             * U+003C LESS-THAN SIGN (<)
             */
            case 0x22:
            case 0x27:
            case 0x3C:
                /* Parse error only; the character stays in the name. */
                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                             LXB_HTML_TOKENIZER_ERROR_UNCHINATNA);
                break;

            default:
                break;
        }

        data++;
    }

    lxb_html_tokenizer_state_append_data_m(tkz, data);

    return data;
}
768
769 /*
770 * 12.2.5.34 After attribute name state
771 */
static const lxb_char_t *
lxb_html_tokenizer_state_after_attribute_name(lxb_html_tokenizer_t *tkz,
                                              const lxb_char_t *data,
                                              const lxb_char_t *end)
{
    /* Set by the token_attr_add_m macro when a new attribute is created. */
    lxb_html_token_attr_t *attr;

    while (data != end) {
        switch (*data) {
            /*
             * U+0009 CHARACTER TABULATION (tab)
             * U+000A LINE FEED (LF)
             * U+000C FORM FEED (FF)
             * U+000D CARRIAGE RETURN (CR)
             * U+0020 SPACE
             */
            case 0x09:
            case 0x0A:
            case 0x0C:
            case 0x0D:
            case 0x20:
                /* Skip whitespace after the attribute name. */
                break;

            /* U+002F SOLIDUS (/) */
            case 0x2F:
                tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
                return (data + 1);

            /* U+003D EQUALS SIGN (=) */
            case 0x3D:
                tkz->state = lxb_html_tokenizer_state_before_attribute_value;
                return (data + 1);

            /* U+003E GREATER-THAN SIGN (>) */
            case 0x3E:
                /* Tag is complete; emit the token. */
                tkz->state = lxb_html_tokenizer_state_data_before;

                lxb_html_tokenizer_state_token_done_m(tkz, end);

                return (data + 1);

            /* EOF */
            case 0x00:
                if (tkz->is_eof) {
                    /* EOF inside a tag: parse error, token is dropped. */
                    lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
                                                 LXB_HTML_TOKENIZER_ERROR_EOINTA);
                    return end;
                }
                /* fall through */

            default:
                /* Anything else starts a new attribute name. */
                lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
                lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);

                tkz->state = lxb_html_tokenizer_state_attribute_name;
                return data;
        }

        data++;
    }

    return data;
}
834
835 /*
836 * 12.2.5.35 Before attribute value state
837 */
838 static const lxb_char_t *
lxb_html_tokenizer_state_before_attribute_value(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)839 lxb_html_tokenizer_state_before_attribute_value(lxb_html_tokenizer_t *tkz,
840 const lxb_char_t *data,
841 const lxb_char_t *end)
842 {
843 while (data != end) {
844 switch (*data) {
845 /*
846 * U+0009 CHARACTER TABULATION (tab)
847 * U+000A LINE FEED (LF)
848 * U+000C FORM FEED (FF)
849 * U+000D CARRIAGE RETURN (CR)
850 * U+0020 SPACE
851 */
852 case 0x09:
853 case 0x0A:
854 case 0x0C:
855 case 0x0D:
856 case 0x20:
857 break;
858
859 /* U+0022 QUOTATION MARK (") */
860 case 0x22:
861 tkz->state =
862 lxb_html_tokenizer_state_attribute_value_double_quoted;
863
864 return (data + 1);
865
866 /* U+0027 APOSTROPHE (') */
867 case 0x27:
868 tkz->state =
869 lxb_html_tokenizer_state_attribute_value_single_quoted;
870
871 return (data + 1);
872
873 /* U+003E GREATER-THAN SIGN (>) */
874 case 0x3E:
875 tkz->state = lxb_html_tokenizer_state_data_before;
876
877 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
878 LXB_HTML_TOKENIZER_ERROR_MIATVA);
879
880 lxb_html_tokenizer_state_token_done_m(tkz, end);
881
882 return (data + 1);
883
884 default:
885 tkz->state = lxb_html_tokenizer_state_attribute_value_unquoted;
886 return data;
887 }
888
889 data++;
890 }
891
892 return data;
893 }
894
895 /*
896 * 12.2.5.36 Attribute value (double-quoted) state
897 */
static const lxb_char_t *
lxb_html_tokenizer_state_attribute_value_double_quoted(lxb_html_tokenizer_t *tkz,
                                                       const lxb_char_t *data,
                                                       const lxb_char_t *end)
{
    /*
     * Record where the value starts, on the first entry only (this state
     * is re-entered after char refs, CR handling and chunk boundaries).
     */
    if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
        lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
    }

    lxb_html_tokenizer_state_begin_set(tkz, data);

    while (data != end) {
        switch (*data) {
            /* U+0022 QUOTATION MARK (") */
            case 0x22:
                /* Closing quote: finalize the attribute value. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
                lxb_html_tokenizer_state_set_value_m(tkz);

                tkz->state =
                    lxb_html_tokenizer_state_after_attribute_value_quoted;

                return (data + 1);

            /* U+0026 AMPERSAND (&) */
            case 0x26:
                /*
                 * Append up to and including '&'; if the reference is
                 * invalid the '&' stays literal.
                 */
                lxb_html_tokenizer_state_append_data_m(tkz, data + 1);

                tkz->state = lxb_html_tokenizer_state_char_ref_attr;
                tkz->state_return = lxb_html_tokenizer_state_attribute_value_double_quoted;

                return data + 1;

            /* U+000D CARRIAGE RETURN (CR) */
            case 0x0D:
                if (++data >= end) {
                    /* Chunk ends on CR: defer to the CR state. */
                    lxb_html_tokenizer_state_append_data_m(tkz, data - 1);

                    tkz->state = lxb_html_tokenizer_state_cr;
                    tkz->state_return = lxb_html_tokenizer_state_attribute_value_double_quoted;

                    return data;
                }

                /* Normalize CR (and CRLF) to a single LF. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                tkz->pos[-1] = 0x0A;

                lxb_html_tokenizer_state_begin_set(tkz, data + 1);

                if (*data != 0x0A) {
                    /* Lone CR: reprocess the current character. */
                    lxb_html_tokenizer_state_begin_set(tkz, data);
                    data--;
                }

                break;

            /*
             * U+0000 NULL
             * EOF
             */
            case 0x00:
                if (tkz->is_eof) {
                    /* EOF inside the value: parse error, token dropped. */
                    if (tkz->token->attr_last->value_begin != NULL) {
                        lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
                    }

                    lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
                                                 LXB_HTML_TOKENIZER_ERROR_EOINTA);
                    return end;
                }

                /* NULL becomes U+FFFD REPLACEMENT CHARACTER (parse error). */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_begin_set(tkz, data + 1);
                lxb_html_tokenizer_state_append_replace_m(tkz);

                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                             LXB_HTML_TOKENIZER_ERROR_UNNUCH);
                break;

            default:
                break;
        }

        data++;
    }

    lxb_html_tokenizer_state_append_data_m(tkz, data);

    return data;
}
988
989 /*
990 * 12.2.5.37 Attribute value (single-quoted) state
991 */
static const lxb_char_t *
lxb_html_tokenizer_state_attribute_value_single_quoted(lxb_html_tokenizer_t *tkz,
                                                       const lxb_char_t *data,
                                                       const lxb_char_t *end)
{
    /*
     * Same as the double-quoted state with ' as the terminator. Record
     * where the value starts, on the first entry only (this state is
     * re-entered after char refs, CR handling and chunk boundaries).
     */
    if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
        lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
    }

    lxb_html_tokenizer_state_begin_set(tkz, data);

    while (data != end) {
        switch (*data) {
            /* U+0027 APOSTROPHE (') */
            case 0x27:
                /* Closing quote: finalize the attribute value. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
                lxb_html_tokenizer_state_set_value_m(tkz);

                tkz->state =
                    lxb_html_tokenizer_state_after_attribute_value_quoted;

                return (data + 1);

            /* U+0026 AMPERSAND (&) */
            case 0x26:
                /*
                 * Append up to and including '&'; if the reference is
                 * invalid the '&' stays literal.
                 */
                lxb_html_tokenizer_state_append_data_m(tkz, data + 1);

                tkz->state = lxb_html_tokenizer_state_char_ref_attr;
                tkz->state_return = lxb_html_tokenizer_state_attribute_value_single_quoted;

                return data + 1;

            /* U+000D CARRIAGE RETURN (CR) */
            case 0x0D:
                if (++data >= end) {
                    /* Chunk ends on CR: defer to the CR state. */
                    lxb_html_tokenizer_state_append_data_m(tkz, data - 1);

                    tkz->state = lxb_html_tokenizer_state_cr;
                    tkz->state_return = lxb_html_tokenizer_state_attribute_value_single_quoted;

                    return data;
                }

                /* Normalize CR (and CRLF) to a single LF. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                tkz->pos[-1] = 0x0A;

                lxb_html_tokenizer_state_begin_set(tkz, data + 1);

                if (*data != 0x0A) {
                    /* Lone CR: reprocess the current character. */
                    lxb_html_tokenizer_state_begin_set(tkz, data);
                    data--;
                }

                break;

            /*
             * U+0000 NULL
             * EOF
             */
            case 0x00:
                if (tkz->is_eof) {
                    /* EOF inside the value: parse error, token dropped. */
                    if (tkz->token->attr_last->value_begin != NULL) {
                        lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
                    }

                    lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
                                                 LXB_HTML_TOKENIZER_ERROR_EOINTA);
                    return end;
                }

                /* NULL becomes U+FFFD REPLACEMENT CHARACTER (parse error). */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_begin_set(tkz, data + 1);
                lxb_html_tokenizer_state_append_replace_m(tkz);

                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                             LXB_HTML_TOKENIZER_ERROR_UNNUCH);
                break;

            default:
                break;
        }

        data++;
    }

    lxb_html_tokenizer_state_append_data_m(tkz, data);

    return data;
}
1082
1083 /*
1084 * 12.2.5.38 Attribute value (unquoted) state
1085 */
static const lxb_char_t *
lxb_html_tokenizer_state_attribute_value_unquoted(lxb_html_tokenizer_t *tkz,
                                                  const lxb_char_t *data,
                                                  const lxb_char_t *end)
{
    /*
     * Record where the value starts, on the first entry only (this state
     * is re-entered after char refs, NULL handling and chunk boundaries).
     */
    if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
        lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
    }

    lxb_html_tokenizer_state_begin_set(tkz, data);

    while (data != end) {
        switch (*data) {
            /*
             * U+0009 CHARACTER TABULATION (tab)
             * U+000A LINE FEED (LF)
             * U+000C FORM FEED (FF)
             * U+000D CARRIAGE RETURN (CR)
             * U+0020 SPACE
             */
            case 0x09:
            case 0x0A:
            case 0x0C:
            case 0x0D:
            case 0x20:
                /* Whitespace ends an unquoted value. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
                lxb_html_tokenizer_state_set_value_m(tkz);

                tkz->state = lxb_html_tokenizer_state_before_attribute_name;
                return (data + 1);

            /* U+0026 AMPERSAND (&) */
            case 0x26:
                /*
                 * Append up to and including '&'; if the reference is
                 * invalid the '&' stays literal.
                 */
                lxb_html_tokenizer_state_append_data_m(tkz, data + 1);

                tkz->state = lxb_html_tokenizer_state_char_ref_attr;
                tkz->state_return = lxb_html_tokenizer_state_attribute_value_unquoted;

                return data + 1;

            /* U+003E GREATER-THAN SIGN (>) */
            case 0x3E:
                /* '>' ends both the value and the tag; emit the token. */
                tkz->state = lxb_html_tokenizer_state_data_before;

                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
                lxb_html_tokenizer_state_set_value_m(tkz);

                lxb_html_tokenizer_state_token_done_m(tkz, end);

                return (data + 1);

            /*
             * U+0000 NULL
             * EOF
             */
            case 0x00:
                if (tkz->is_eof) {
                    /* EOF inside the value: parse error, token dropped. */
                    if (tkz->token->attr_last->value_begin != NULL) {
                        lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
                    }

                    lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
                                                 LXB_HTML_TOKENIZER_ERROR_EOINTA);
                    return end;
                }

                /* NULL becomes U+FFFD REPLACEMENT CHARACTER (parse error). */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_begin_set(tkz, data + 1);
                lxb_html_tokenizer_state_append_replace_m(tkz);

                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                             LXB_HTML_TOKENIZER_ERROR_UNNUCH);
                break;

            /*
             * U+0022 QUOTATION MARK (")
             * U+0027 APOSTROPHE (')
             * U+003C LESS-THAN SIGN (<)
             * U+003D EQUALS SIGN (=)
             * U+0060 GRAVE ACCENT (`)
             */
            case 0x22:
            case 0x27:
            case 0x3C:
            case 0x3D:
            case 0x60:
                /*
                 * Parse error only; the character stays in the value.
                 * NOTE(review): this reports at tkz->token->end while the
                 * sibling branches report at `data` — confirm intentional.
                 */
                lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
                                             LXB_HTML_TOKENIZER_ERROR_UNCHINUNATVA);
                break;

            default:
                break;
        }

        data++;
    }

    lxb_html_tokenizer_state_append_data_m(tkz, data);

    return data;
}
1189
1190 /*
1191 * 12.2.5.39 After attribute value (quoted) state
1192 */
1193 static const lxb_char_t *
lxb_html_tokenizer_state_after_attribute_value_quoted(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1194 lxb_html_tokenizer_state_after_attribute_value_quoted(lxb_html_tokenizer_t *tkz,
1195 const lxb_char_t *data,
1196 const lxb_char_t *end)
1197 {
1198 switch (*data) {
1199 /*
1200 * U+0009 CHARACTER TABULATION (tab)
1201 * U+000A LINE FEED (LF)
1202 * U+000C FORM FEED (FF)
1203 * U+000D CARRIAGE RETURN (CR)
1204 * U+0020 SPACE
1205 */
1206 case 0x09:
1207 case 0x0A:
1208 case 0x0C:
1209 case 0x0D:
1210 case 0x20:
1211 tkz->state = lxb_html_tokenizer_state_before_attribute_name;
1212
1213 return (data + 1);
1214
1215 /* U+002F SOLIDUS (/) */
1216 case 0x2F:
1217 tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
1218
1219 return (data + 1);
1220
1221 /* U+003E GREATER-THAN SIGN (>) */
1222 case 0x3E:
1223 tkz->state = lxb_html_tokenizer_state_data_before;
1224
1225 lxb_html_tokenizer_state_token_done_m(tkz, end);
1226
1227 return (data + 1);
1228
1229 /* EOF */
1230 case 0x00:
1231 if (tkz->is_eof) {
1232 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1233 LXB_HTML_TOKENIZER_ERROR_EOINTA);
1234 return end;
1235 }
1236 /* fall through */
1237
1238 default:
1239 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1240 LXB_HTML_TOKENIZER_ERROR_MIWHBEAT);
1241
1242 tkz->state = lxb_html_tokenizer_state_before_attribute_name;
1243
1244 return data;
1245 }
1246
1247 return data;
1248 }
1249
1250
1251 const lxb_char_t *
lxb_html_tokenizer_state_cr(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1252 lxb_html_tokenizer_state_cr(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
1253 const lxb_char_t *end)
1254 {
1255 lxb_html_tokenizer_state_append_m(tkz, "\n", 1);
1256
1257 if (*data == 0x0A) {
1258 data++;
1259 }
1260
1261 tkz->state = tkz->state_return;
1262
1263 return data;
1264 }
1265
1266 /*
1267 * 12.2.5.40 Self-closing start tag state
1268 */
1269 const lxb_char_t *
lxb_html_tokenizer_state_self_closing_start_tag(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1270 lxb_html_tokenizer_state_self_closing_start_tag(lxb_html_tokenizer_t *tkz,
1271 const lxb_char_t *data,
1272 const lxb_char_t *end)
1273 {
1274 switch (*data) {
1275 /* U+003E GREATER-THAN SIGN (>) */
1276 case 0x3E:
1277 tkz->state = lxb_html_tokenizer_state_data_before;
1278 tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE_SELF;
1279
1280 lxb_html_tokenizer_state_token_done_m(tkz, end);
1281
1282 return (data + 1);
1283
1284 /* EOF */
1285 case 0x00:
1286 if (tkz->is_eof) {
1287 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
1288 LXB_HTML_TOKENIZER_ERROR_EOINTA);
1289 return end;
1290 }
1291 /* fall through */
1292
1293 default:
1294 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1295 LXB_HTML_TOKENIZER_ERROR_UNSOINTA);
1296
1297 tkz->state = lxb_html_tokenizer_state_before_attribute_name;
1298
1299 return data;
1300 }
1301
1302 return data;
1303 }
1304
1305 /*
1306 * Helper function. No in the specification. For 12.2.5.41 Bogus comment state
1307 */
1308 static const lxb_char_t *
lxb_html_tokenizer_state_bogus_comment_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1309 lxb_html_tokenizer_state_bogus_comment_before(lxb_html_tokenizer_t *tkz,
1310 const lxb_char_t *data,
1311 const lxb_char_t *end)
1312 {
1313 tkz->token->tag_id = LXB_TAG__EM_COMMENT;
1314
1315 tkz->state = lxb_html_tokenizer_state_bogus_comment;
1316
1317 return data;
1318 }
1319
1320 /*
1321 * 12.2.5.41 Bogus comment state
1322 */
/*
 * 12.2.5.41 Bogus comment state
 *
 * Consumes everything up to the next '>' (or EOF) as comment text.
 * CR and CRLF are normalized to a single LF; a NUL byte becomes
 * U+FFFD and is reported as a parse error.
 */
static const lxb_char_t *
lxb_html_tokenizer_state_bogus_comment(lxb_html_tokenizer_t *tkz,
                                       const lxb_char_t *data,
                                       const lxb_char_t *end)
{
    lxb_html_tokenizer_state_begin_set(tkz, data);

    while (data != end) {
        switch (*data) {
            /* U+003E GREATER-THAN SIGN (>) */
            case 0x3E:
                tkz->state = lxb_html_tokenizer_state_data_before;

                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_token_set_end(tkz, data);
                lxb_html_tokenizer_state_set_text(tkz);
                lxb_html_tokenizer_state_token_done_wo_check_m(tkz, end);

                return (data + 1);

            /* U+000D CARRIAGE RETURN (CR) */
            case 0x0D:
                if (++data >= end) {
                    /* CR is the last byte of this buffer: flush what we
                       have and let the CR helper state finish the newline
                       when the next buffer arrives. */
                    lxb_html_tokenizer_state_append_data_m(tkz, data - 1);

                    tkz->state = lxb_html_tokenizer_state_cr;
                    tkz->state_return = lxb_html_tokenizer_state_bogus_comment;

                    return data;
                }

                /* Flush through the CR, then rewrite it as LF in place. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                tkz->pos[-1] = 0x0A;

                lxb_html_tokenizer_state_begin_set(tkz, data + 1);

                if (*data != 0x0A) {
                    /* Lone CR (no LF follows): step back so the next
                       character is processed normally. */
                    lxb_html_tokenizer_state_begin_set(tkz, data);
                    data--;
                }

                break;

            /*
             * EOF
             * U+0000 NULL
             */
            case 0x00:
                lxb_html_tokenizer_state_append_data_m(tkz, data);

                if (tkz->is_eof) {
                    /* End of input: close the comment token as-is. */
                    if (tkz->token->begin != NULL) {
                        lxb_html_tokenizer_state_token_set_end_oef(tkz);
                    }

                    lxb_html_tokenizer_state_set_text(tkz);
                    lxb_html_tokenizer_state_token_done_wo_check_m(tkz, end);

                    return end;
                }

                /* A real NUL byte: emit U+FFFD and report it. */
                lxb_html_tokenizer_state_append_replace_m(tkz);
                lxb_html_tokenizer_state_begin_set(tkz, data + 1);

                lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                             LXB_HTML_TOKENIZER_ERROR_UNNUCH);
                break;
        }

        data++;
    }

    lxb_html_tokenizer_state_append_data_m(tkz, data);

    return data;
}
1399
1400 /*
1401 * 12.2.5.42 Markup declaration open state
1402 */
/*
 * 12.2.5.42 Markup declaration open state
 *
 * Called right after "<!". Dispatches to a comment ("--"), a DOCTYPE
 * ("doctype", ASCII case-insensitive) or a CDATA section ("[CDATA[",
 * case-sensitive, honored only in foreign content). Anything else
 * becomes a bogus comment.
 */
static const lxb_char_t *
lxb_html_tokenizer_state_markup_declaration_open(lxb_html_tokenizer_t *tkz,
                                                 const lxb_char_t *data,
                                                 const lxb_char_t *end)
{
    /* Check first char for change parse state */
    if (tkz->is_eof == false) {
        lxb_html_tokenizer_state_token_set_begin(tkz, data);
    }

    /* U+002D HYPHEN-MINUS characters (-) */
    if (*data == 0x2D) {
        if ((end - data) < 2) {
            /* "--" may be split across buffers: defer to helper state. */
            tkz->state = lxb_html_tokenizer_state_markup_declaration_comment;
            return (data + 1);
        }

        if (data[1] == 0x2D) {
            tkz->state = lxb_html_tokenizer_state_comment_before_start;
            return (data + 2);
        }
    }
    /*
     * ASCII case-insensitive match for the word "DOCTYPE"
     * U+0044 character (D) or U+0064 character (d)
     */
    else if (*data == 0x44 || *data == 0x64) {
        if ((end - data) < 7) {
            /* Keyword may be split across buffers: stream-match it
               via tkz->markup in the doctype helper state. */
            tkz->markup = (lxb_char_t *) "doctype";

            tkz->state = lxb_html_tokenizer_state_markup_declaration_doctype;
            return data;
        }

        if (lexbor_str_data_ncasecmp((lxb_char_t *) "doctype", data, 7)) {
            tkz->state = lxb_html_tokenizer_state_doctype_before;
            return (data + 7);
        }
    }
    /* Case-sensitive match for the string "[CDATA["
     * (the five uppercase letters "CDATA" with a U+005B LEFT SQUARE BRACKET
     * character before and after)
     */
    else if (*data == 0x5B) {
        if ((end - data) < 7) {
            /* "[CDATA[" may be split across buffers: stream-match it. */
            tkz->markup = (lxb_char_t *) "[CDATA[";

            tkz->state = lxb_html_tokenizer_state_markup_declaration_cdata;
            return data;
        }

        if (lexbor_str_data_ncmp((lxb_char_t *) "[CDATA[", data, 7)) {
            lxb_ns_id_t ns = lxb_html_tokenizer_current_namespace(tkz);

            /* Real CDATA only inside foreign (non-HTML) content. */
            if (ns != LXB_NS_HTML && ns != LXB_NS__UNDEF) {
                data += 7;

                lxb_html_tokenizer_state_token_set_begin(tkz, data);

                tkz->state = lxb_html_tokenizer_state_cdata_section_before;

                return data;
            }

            /* In HTML content "<![CDATA[" is just a bogus comment. */
            tkz->state = lxb_html_tokenizer_state_bogus_comment_before;

            return data;
        }
    }

    if (tkz->is_eof) {
        /* "<!" at end of input: give the token a zero-length range. */
        lxb_html_tokenizer_state_token_set_end_oef(tkz);

        tkz->token->begin = tkz->token->end;
    }

    lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                 LXB_HTML_TOKENIZER_ERROR_INOPCO);

    tkz->state = lxb_html_tokenizer_state_bogus_comment_before;

    return data;
}
1486
1487 /*
1488 * Helper function. No in the specification. For 12.2.5.42
1489 * For a comment tag <!--
1490 */
1491 static const lxb_char_t *
lxb_html_tokenizer_state_markup_declaration_comment(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1492 lxb_html_tokenizer_state_markup_declaration_comment(lxb_html_tokenizer_t *tkz,
1493 const lxb_char_t *data,
1494 const lxb_char_t *end)
1495 {
1496 /* U+002D HYPHEN-MINUS characters (-) */
1497 if (*data == 0x2D) {
1498 tkz->state = lxb_html_tokenizer_state_comment_before_start;
1499 return (data + 1);
1500 }
1501
1502 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1503 LXB_HTML_TOKENIZER_ERROR_INOPCO);
1504
1505 tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1506 return data;
1507 }
1508
1509 /*
1510 * Helper function. No in the specification. For 12.2.5.42
1511 * For a DOCTYPE tag <!DOCTYPE
1512 */
1513 static const lxb_char_t *
lxb_html_tokenizer_state_markup_declaration_doctype(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1514 lxb_html_tokenizer_state_markup_declaration_doctype(lxb_html_tokenizer_t *tkz,
1515 const lxb_char_t *data,
1516 const lxb_char_t *end)
1517 {
1518 const lxb_char_t *pos;
1519 pos = lexbor_str_data_ncasecmp_first(tkz->markup, data, (end - data));
1520
1521 if (pos == NULL) {
1522 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1523 LXB_HTML_TOKENIZER_ERROR_INOPCO);
1524
1525 tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1526 return data;
1527 }
1528
1529 if (*pos == '\0') {
1530 data = (data + (pos - tkz->markup));
1531
1532 tkz->state = lxb_html_tokenizer_state_doctype_before;
1533 return data;
1534 }
1535
1536 tkz->markup = pos;
1537
1538 return end;
1539 }
1540
1541 /*
1542 * Helper function. No in the specification. For 12.2.5.42
1543 * For a CDATA tag <![CDATA[
1544 */
/*
 * Helper function. Not in the specification. For 12.2.5.42.
 * For a CDATA tag <![CDATA[
 *
 * Streaming matcher used when "[CDATA[" may be split across input
 * buffers; tkz->markup tracks the yet-unmatched suffix.
 */
static const lxb_char_t *
lxb_html_tokenizer_state_markup_declaration_cdata(lxb_html_tokenizer_t *tkz,
                                                  const lxb_char_t *data,
                                                  const lxb_char_t *end)
{
    const lxb_char_t *pos;
    /* NOTE(review): this compare is case-INsensitive, while the
       non-streaming path in ..._markup_declaration_open matches
       "[CDATA[" case-sensitively with lexbor_str_data_ncmp — confirm
       whether the streaming path is meant to differ. */
    pos = lexbor_str_data_ncasecmp_first(tkz->markup, data, (end - data));

    if (pos == NULL) {
        /* Mismatch: incorrectly opened comment. */
        lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                     LXB_HTML_TOKENIZER_ERROR_INOPCO);

        tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
        return data;
    }

    if (*pos == '\0') {
        lxb_ns_id_t ns = lxb_html_tokenizer_current_namespace(tkz);

        /* Real CDATA only inside foreign (non-HTML) content. */
        if (ns != LXB_NS_HTML && ns != LXB_NS__UNDEF) {
            data = (data + (pos - tkz->markup));

            tkz->state = lxb_html_tokenizer_state_cdata_section_before;
            return data;
        }

        /* NOTE(review): a fixed 6-byte "[CDATA" is re-appended here,
           but the number of bytes consumed from earlier buffers is
           7 - (pos - tkz->markup); this looks correct only when exactly
           one byte of "[CDATA[" arrived in the current buffer — verify
           with a split-buffer test. */
        lxb_html_tokenizer_state_append_m(tkz, "[CDATA", 6);

        tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
        return data;
    }

    /* Buffer exhausted mid-word; remember progress, wait for more. */
    tkz->markup = pos;

    return end;
}
1581
1582 /*
1583 * Helper function. No in the specification. For 12.2.5.69
1584 */
1585 static const lxb_char_t *
lxb_html_tokenizer_state_cdata_section_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1586 lxb_html_tokenizer_state_cdata_section_before(lxb_html_tokenizer_t *tkz,
1587 const lxb_char_t *data,
1588 const lxb_char_t *end)
1589 {
1590 if (tkz->is_eof == false) {
1591 lxb_html_tokenizer_state_token_set_begin(tkz, data);
1592 }
1593 else {
1594 lxb_html_tokenizer_state_token_set_begin(tkz, tkz->last);
1595 }
1596
1597 tkz->token->tag_id = LXB_TAG__TEXT;
1598
1599 tkz->state = lxb_html_tokenizer_state_cdata_section;
1600
1601 return data;
1602 }
1603
1604 /*
1605 * 12.2.5.69 CDATA section state
1606 */
/*
 * 12.2.5.69 CDATA section state
 *
 * Accumulates raw text until "]]>" (completed by the bracket/end
 * states). CR and CRLF are normalized to LF. NUL bytes pass through
 * but are counted in token->null_count.
 */
static const lxb_char_t *
lxb_html_tokenizer_state_cdata_section(lxb_html_tokenizer_t *tkz,
                                       const lxb_char_t *data,
                                       const lxb_char_t *end)
{
    lxb_html_tokenizer_state_begin_set(tkz, data);

    while (data != end) {
        switch (*data) {
            /* U+005D RIGHT SQUARE BRACKET (]) */
            case 0x5D:
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                lxb_html_tokenizer_state_token_set_end(tkz, data);

                tkz->state = lxb_html_tokenizer_state_cdata_section_bracket;
                return (data + 1);

            /* U+000D CARRIAGE RETURN (CR) */
            case 0x0D:
                if (++data >= end) {
                    /* CR at the buffer edge: flush and finish the newline
                       in the CR helper state with the next buffer. */
                    lxb_html_tokenizer_state_append_data_m(tkz, data - 1);

                    tkz->state = lxb_html_tokenizer_state_cr;
                    tkz->state_return = lxb_html_tokenizer_state_cdata_section;

                    return data;
                }

                /* Flush through the CR, then rewrite it as LF in place. */
                lxb_html_tokenizer_state_append_data_m(tkz, data);
                tkz->pos[-1] = 0x0A;

                lxb_html_tokenizer_state_begin_set(tkz, data + 1);

                if (*data != 0x0A) {
                    /* Lone CR: reprocess the following character. */
                    lxb_html_tokenizer_state_begin_set(tkz, data);
                    data--;
                }

                break;

            /* EOF */
            case 0x00:
                if (tkz->is_eof) {
                    /* EOF inside CDATA: parse error; emit what we have. */
                    lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
                                                 LXB_HTML_TOKENIZER_ERROR_EOINCD);

                    if (tkz->token->begin != NULL) {
                        lxb_html_tokenizer_state_append_data_m(tkz, data);
                        lxb_html_tokenizer_state_token_set_end_oef(tkz);
                    }

                    lxb_html_tokenizer_state_set_text(tkz);
                    lxb_html_tokenizer_state_token_done_m(tkz, end);

                    return end;
                }

                /* Guard the NUL counter against wrap-around. */
                if (SIZE_MAX - tkz->token->null_count < 1) {
                    tkz->status = LXB_STATUS_ERROR_OVERFLOW;
                    return end;
                }

                tkz->token->null_count++;

                break;

            default:
                break;
        }

        data++;
    }

    lxb_html_tokenizer_state_append_data_m(tkz, data);

    return data;
}
1684
1685 /*
1686 * 12.2.5.70 CDATA section bracket state
1687 */
1688 static const lxb_char_t *
lxb_html_tokenizer_state_cdata_section_bracket(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1689 lxb_html_tokenizer_state_cdata_section_bracket(lxb_html_tokenizer_t *tkz,
1690 const lxb_char_t *data,
1691 const lxb_char_t *end)
1692 {
1693 /* U+005D RIGHT SQUARE BRACKET (]) */
1694 if (*data == 0x5D) {
1695 tkz->state = lxb_html_tokenizer_state_cdata_section_end;
1696 return (data + 1);
1697 }
1698
1699 lxb_html_tokenizer_state_append_m(tkz, "]", 1);
1700
1701 tkz->state = lxb_html_tokenizer_state_cdata_section;
1702
1703 return data;
1704 }
1705
1706 /*
1707 * 12.2.5.71 CDATA section end state
1708 */
1709 static const lxb_char_t *
lxb_html_tokenizer_state_cdata_section_end(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1710 lxb_html_tokenizer_state_cdata_section_end(lxb_html_tokenizer_t *tkz,
1711 const lxb_char_t *data,
1712 const lxb_char_t *end)
1713 {
1714 /* U+005D RIGHT SQUARE BRACKET (]) */
1715 if (*data == 0x5D) {
1716 lxb_html_tokenizer_state_append_m(tkz, data, 1);
1717 return (data + 1);
1718 }
1719 /* U+003E GREATER-THAN SIGN character */
1720 else if (*data == 0x3E) {
1721 tkz->state = lxb_html_tokenizer_state_data_before;
1722
1723 lxb_html_tokenizer_state_set_text(tkz);
1724 lxb_html_tokenizer_state_token_done_m(tkz, end);
1725
1726 return (data + 1);
1727 }
1728
1729 lxb_html_tokenizer_state_append_m(tkz, "]]", 2);
1730
1731 tkz->state = lxb_html_tokenizer_state_cdata_section;
1732
1733 return data;
1734 }
1735
1736 /*
1737 * 12.2.5.72 Character reference state
1738 */
/*
 * 12.2.5.72 Character reference state
 *
 * Entry point for "&..." outside of an attribute value: the
 * attribute-only "historical" flushing rules do not apply.
 */
const lxb_char_t *
lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz,
                                  const lxb_char_t *data, const lxb_char_t *end)
{
    tkz->is_attribute = false;

    return _lxb_html_tokenizer_state_char_ref(tkz, data, end);
}
1747
/*
 * Entry point for "&..." inside an attribute value: enables the
 * attribute-only "historical" flushing rules in the named-reference
 * state (see 12.2.5.73).
 */
static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_attr(lxb_html_tokenizer_t *tkz,
                                       const lxb_char_t *data,
                                       const lxb_char_t *end)
{
    tkz->is_attribute = true;

    return _lxb_html_tokenizer_state_char_ref(tkz, data, end);
}
1757
1758 static const lxb_char_t *
_lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1759 _lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz,
1760 const lxb_char_t *data,
1761 const lxb_char_t *end)
1762 {
1763 /* ASCII alphanumeric */
1764 if (lexbor_str_res_alphanumeric_character[ *data ] != LEXBOR_STR_RES_SLIP) {
1765 tkz->entity = &lxb_html_tokenizer_res_entities_sbst[1];
1766 tkz->entity_match = NULL;
1767 tkz->entity_start = (tkz->pos - 1) - tkz->start;
1768
1769 tkz->state = lxb_html_tokenizer_state_char_ref_named;
1770
1771 return data;
1772 }
1773 /* U+0023 NUMBER SIGN (#) */
1774 else if (*data == 0x23) {
1775 tkz->markup = data;
1776 tkz->entity_start = (tkz->pos - 1) - tkz->start;
1777
1778 lxb_html_tokenizer_state_append_m(tkz, data, 1);
1779
1780 tkz->state = lxb_html_tokenizer_state_char_ref_numeric;
1781
1782 return (data + 1);
1783 }
1784 else {
1785 tkz->state = tkz->state_return;
1786 }
1787
1788 return data;
1789 }
1790
1791 /*
1792 * 12.2.5.73 Named character reference state
1793 *
1794 * The slowest part in HTML parsing!!!
1795 *
1796 * This option works correctly and passes all tests (stream parsing too).
1797 * We must seriously think about how to accelerate this part.
1798 */
/*
 * 12.2.5.73 Named character reference state
 *
 * The slowest part in HTML parsing!!!
 *
 * This option works correctly and passes all tests (stream parsing too).
 * We must seriously think about how to accelerate this part.
 */
static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_named(lxb_html_tokenizer_t *tkz,
                                        const lxb_char_t *data,
                                        const lxb_char_t *end)
{
    size_t size, tail_size;
    lxb_char_t *start;
    const lexbor_sbst_entry_static_t *entry = tkz->entity;

    const lxb_char_t *begin = data;

    /* Walk the static binary search tree one input byte at a time. */
    while (data < end) {
        entry = lexbor_sbst_entry_static_find(lxb_html_tokenizer_res_entities_sbst,
                                              entry, *data);
        if (entry == NULL) {
            /* No further match possible: flush consumed bytes, finish. */
            lxb_html_tokenizer_state_append_m(tkz, begin, (data - begin));
            goto done;
        }

        if (entry->value[0] != 0) {
            /* This node terminates a known entity; remember the longest
               match so far (entities may be prefixes of one another). */
            tkz->entity_end = (tkz->pos + (data - begin)) - tkz->start;
            tkz->entity_match = entry;
        }

        entry = &lxb_html_tokenizer_res_entities_sbst[ entry->next ];

        data++;
    }

    /* If entry not NULL and buffer empty, then wait next buffer. */
    tkz->entity = entry;

    lxb_html_tokenizer_state_append_m(tkz, begin, (end - begin));
    return data;

done:

    /* If we have bad entity */
    if (tkz->entity_match == NULL) {
        tkz->state = lxb_html_tokenizer_state_char_ref_ambiguous_ampersand;

        return data;
    }

    tkz->state = tkz->state_return;

    /*
     * If the character reference was consumed as part of an attribute,
     * and the last character matched is not a U+003B SEMICOLON character (;),
     * and the next input character is either a U+003D EQUALS SIGN character (=)
     * or an ASCII alphanumeric, then, for historical reasons,
     * flush code points consumed as a character reference
     * and switch to the return state.
     */
    /* U+003B SEMICOLON character (;) */
    if (tkz->is_attribute && tkz->entity_match->key != 0x3B) {
        /* U+003D EQUALS SIGN character (=) or ASCII alphanumeric */
        if (*data == 0x3D
            || lexbor_str_res_alphanumeric_character[*data] != LEXBOR_STR_RES_SLIP)
        {
            /* Keep the raw "&name" text untouched. */
            return data;
        }
    }

    if (tkz->entity_match->key != 0x3B) {
        lxb_html_tokenizer_error_add(tkz->parse_errors, data,
                                     LXB_HTML_TOKENIZER_ERROR_MISEAFCHRE);
    }

    /* Splice the entity's value over the buffered "&name" bytes,
       preserving any bytes consumed past the match (the tail). */
    start = &tkz->start[tkz->entity_start];

    size = tkz->pos - start;
    tail_size = tkz->pos - &tkz->start[tkz->entity_end] - 1;

    if (tail_size != 0) {
        if ((size + tail_size) + start > tkz->end) {
            if (lxb_html_tokenizer_temp_realloc(tkz, size) != LXB_STATUS_OK) {
                return end;
            }
            /* The buffer may have moved; recompute the position. */
            start = &tkz->start[tkz->entity_start];
        }

        memmove(start + tkz->entity_match->value_len,
                tkz->pos - tail_size, tail_size);
    }

    memcpy(start, tkz->entity_match->value, tkz->entity_match->value_len);

    tkz->pos = start + (tkz->entity_match->value_len + tail_size);

    return data;
}
1891
1892 /*
1893 * 12.2.5.74 Ambiguous ampersand state
1894 */
1895 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_ambiguous_ampersand(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1896 lxb_html_tokenizer_state_char_ref_ambiguous_ampersand(lxb_html_tokenizer_t *tkz,
1897 const lxb_char_t *data,
1898 const lxb_char_t *end)
1899 {
1900 /* ASCII alphanumeric */
1901 /* Skipped, not need */
1902
1903 /* U+003B SEMICOLON (;) */
1904 if (*data == 0x3B) {
1905 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1906 LXB_HTML_TOKENIZER_ERROR_UNNACHRE);
1907 }
1908
1909 tkz->state = tkz->state_return;
1910
1911 return data;
1912 }
1913
1914 /*
1915 * 12.2.5.75 Numeric character reference state
1916 */
1917 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_numeric(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1918 lxb_html_tokenizer_state_char_ref_numeric(lxb_html_tokenizer_t *tkz,
1919 const lxb_char_t *data,
1920 const lxb_char_t *end)
1921 {
1922 tkz->entity_number = 0;
1923
1924 /*
1925 * U+0078 LATIN SMALL LETTER X
1926 * U+0058 LATIN CAPITAL LETTER X
1927 */
1928 if (*data == 0x78 || *data == 0x58) {
1929 lxb_html_tokenizer_state_append_m(tkz, data, 1);
1930
1931 tkz->state = lxb_html_tokenizer_state_char_ref_hexademical_start;
1932
1933 return (data + 1);
1934 }
1935
1936 tkz->state = lxb_html_tokenizer_state_char_ref_decimal_start;
1937
1938 return data;
1939 }
1940
1941 /*
1942 * 12.2.5.76 Hexademical character reference start state
1943 */
1944 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_hexademical_start(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1945 lxb_html_tokenizer_state_char_ref_hexademical_start(lxb_html_tokenizer_t *tkz,
1946 const lxb_char_t *data,
1947 const lxb_char_t *end)
1948 {
1949 /* ASCII hex digit */
1950 if (lexbor_str_res_map_hex[ *data ] != LEXBOR_STR_RES_SLIP) {
1951 tkz->state = lxb_html_tokenizer_state_char_ref_hexademical;
1952 }
1953 else {
1954 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1955 LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE);
1956
1957 tkz->state = tkz->state_return;
1958 }
1959
1960 return data;
1961 }
1962
1963 /*
1964 * 12.2.5.77 Decimal character reference start state
1965 */
1966 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_decimal_start(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1967 lxb_html_tokenizer_state_char_ref_decimal_start(lxb_html_tokenizer_t *tkz,
1968 const lxb_char_t *data,
1969 const lxb_char_t *end)
1970 {
1971 /* ASCII digit */
1972 if (lexbor_str_res_map_num[ *data ] != LEXBOR_STR_RES_SLIP) {
1973 tkz->state = lxb_html_tokenizer_state_char_ref_decimal;
1974 }
1975 else {
1976 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1977 LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE);
1978
1979 tkz->state = tkz->state_return;
1980 }
1981
1982 return data;
1983 }
1984
1985 /*
1986 * 12.2.5.78 Hexademical character reference state
1987 */
1988 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_hexademical(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1989 lxb_html_tokenizer_state_char_ref_hexademical(lxb_html_tokenizer_t *tkz,
1990 const lxb_char_t *data,
1991 const lxb_char_t *end)
1992 {
1993 while (data != end) {
1994 if (lexbor_str_res_map_hex[ *data ] == LEXBOR_STR_RES_SLIP) {
1995 tkz->state = tkz->state_return;
1996
1997 if (*data == ';') {
1998 data++;
1999 }
2000
2001 return lxb_html_tokenizer_state_char_ref_numeric_end(tkz, data, end);
2002 }
2003
2004 if (tkz->entity_number <= 0x10FFFF) {
2005 tkz->entity_number <<= 4;
2006 tkz->entity_number |= lexbor_str_res_map_hex[ *data ];
2007 }
2008
2009 data++;
2010 }
2011
2012 return data;
2013 }
2014
2015 /*
2016 * 12.2.5.79 Decimal character reference state
2017 */
2018 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_decimal(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)2019 lxb_html_tokenizer_state_char_ref_decimal(lxb_html_tokenizer_t *tkz,
2020 const lxb_char_t *data,
2021 const lxb_char_t *end)
2022 {
2023 while (data != end) {
2024 if (lexbor_str_res_map_num[ *data ] == LEXBOR_STR_RES_SLIP) {
2025 tkz->state = tkz->state_return;
2026
2027 if (*data == ';') {
2028 data++;
2029 }
2030
2031 return lxb_html_tokenizer_state_char_ref_numeric_end(tkz, data, end);
2032 }
2033
2034 if (tkz->entity_number <= 0x10FFFF) {
2035 tkz->entity_number = lexbor_str_res_map_num[ *data ]
2036 + tkz->entity_number * 10;
2037 }
2038
2039 data++;
2040 }
2041
2042 return data;
2043 }
2044
2045 /*
2046 * 12.2.5.80 Numeric character reference end state
2047 */
2048 static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_numeric_end(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)2049 lxb_html_tokenizer_state_char_ref_numeric_end(lxb_html_tokenizer_t *tkz,
2050 const lxb_char_t *data,
2051 const lxb_char_t *end)
2052 {
2053 lxb_char_t *start = &tkz->start[tkz->entity_start];
2054
2055 if ((start + 4) > tkz->end) {
2056 if(lxb_html_tokenizer_temp_realloc(tkz, 4)) {
2057 return end;
2058 }
2059
2060 start = &tkz->start[tkz->entity_start];
2061 }
2062
2063 if (tkz->entity_number == 0x00) {
2064 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2065 LXB_HTML_TOKENIZER_ERROR_NUCHRE);
2066
2067 goto xFFFD;
2068 }
2069 else if (tkz->entity_number > 0x10FFFF) {
2070 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2071 LXB_HTML_TOKENIZER_ERROR_CHREOUUNRA);
2072
2073 goto xFFFD;
2074 }
2075 else if (tkz->entity_number >= 0xD800 && tkz->entity_number <= 0xDFFF) {
2076 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2077 LXB_HTML_TOKENIZER_ERROR_SUCHRE);
2078
2079 goto xFFFD;
2080 }
2081 else if (tkz->entity_number >= 0xFDD0 && tkz->entity_number <= 0xFDEF) {
2082 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2083 LXB_HTML_TOKENIZER_ERROR_NOCHRE);
2084 }
2085
2086 switch (tkz->entity_number) {
2087 case 0xFFFE: case 0xFFFF: case 0x1FFFE: case 0x1FFFF: case 0x2FFFE:
2088 case 0x2FFFF: case 0x3FFFE: case 0x3FFFF: case 0x4FFFE: case 0x4FFFF:
2089 case 0x5FFFE: case 0x5FFFF: case 0x6FFFE: case 0x6FFFF: case 0x7FFFE:
2090 case 0x7FFFF: case 0x8FFFE: case 0x8FFFF: case 0x9FFFE: case 0x9FFFF:
2091 case 0xAFFFE: case 0xAFFFF: case 0xBFFFE: case 0xBFFFF: case 0xCFFFE:
2092 case 0xCFFFF: case 0xDFFFE: case 0xDFFFF: case 0xEFFFE: case 0xEFFFF:
2093 case 0xFFFFE: case 0xFFFFF:
2094 case 0x10FFFE:
2095 case 0x10FFFF:
2096 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2097 LXB_HTML_TOKENIZER_ERROR_NOCHRE);
2098 break;
2099
2100 default:
2101 break;
2102 }
2103
2104 if (tkz->entity_number <= 0x1F
2105 || (tkz->entity_number >= 0x7F && tkz->entity_number <= 0x9F))
2106 {
2107 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2108 LXB_HTML_TOKENIZER_ERROR_COCHRE);
2109 }
2110
2111 if (tkz->entity_number <= 0x9F) {
2112 tkz->entity_number = (uint32_t) lexbor_str_res_replacement_character[tkz->entity_number];
2113 }
2114
2115 start += lxb_html_tokenizer_state_to_ascii_utf_8(tkz->entity_number, start);
2116
2117 tkz->pos = start;
2118
2119 return data;
2120
2121 xFFFD:
2122
2123 memcpy(start, lexbor_str_res_ansi_replacement_character,
2124 sizeof(lexbor_str_res_ansi_replacement_character) - 1);
2125
2126 tkz->pos = start + sizeof(lexbor_str_res_ansi_replacement_character) - 1;
2127
2128 return data;
2129 }
2130
2131 static size_t
lxb_html_tokenizer_state_to_ascii_utf_8(size_t codepoint,lxb_char_t * data)2132 lxb_html_tokenizer_state_to_ascii_utf_8(size_t codepoint, lxb_char_t *data)
2133 {
2134 /* 0x80 -- 10xxxxxx */
2135 /* 0xC0 -- 110xxxxx */
2136 /* 0xE0 -- 1110xxxx */
2137 /* 0xF0 -- 11110xxx */
2138
2139 if (codepoint <= 0x0000007F) {
2140 /* 0xxxxxxx */
2141 data[0] = (char) codepoint;
2142
2143 return 1;
2144 }
2145 else if (codepoint <= 0x000007FF) {
2146 /* 110xxxxx 10xxxxxx */
2147 data[0] = (char) (0xC0 | (codepoint >> 6 ));
2148 data[1] = (char) (0x80 | (codepoint & 0x3F));
2149
2150 return 2;
2151 }
2152 else if (codepoint <= 0x0000FFFF) {
2153 /* 1110xxxx 10xxxxxx 10xxxxxx */
2154 data[0] = (char) (0xE0 | ((codepoint >> 12)));
2155 data[1] = (char) (0x80 | ((codepoint >> 6 ) & 0x3F));
2156 data[2] = (char) (0x80 | ( codepoint & 0x3F));
2157
2158 return 3;
2159 }
2160 else if (codepoint <= 0x001FFFFF) {
2161 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
2162 data[0] = (char) (0xF0 | ( codepoint >> 18));
2163 data[1] = (char) (0x80 | ((codepoint >> 12) & 0x3F));
2164 data[2] = (char) (0x80 | ((codepoint >> 6 ) & 0x3F));
2165 data[3] = (char) (0x80 | ( codepoint & 0x3F));
2166
2167 return 4;
2168 }
2169
2170 return 0;
2171 }
2172