1 /*
2 * Copyright (C) 2018-2020 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
7 #include "lexbor/html/tokenizer/state_script.h"
8 #include "lexbor/html/tokenizer/state.h"
9
10 #define LEXBOR_STR_RES_ALPHA_CHARACTER
11 #define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
12 #include "lexbor/core/str_res.h"
13
14 #include "lexbor/core/str_res.h"
15
16
17 const lxb_tag_data_t *
18 lxb_tag_append_lower(lexbor_hash_t *hash,
19 const lxb_char_t *name, size_t length);
20
21
22 static const lxb_char_t *
23 lxb_html_tokenizer_state_script_data(lxb_html_tokenizer_t *tkz,
24 const lxb_char_t *data,
25 const lxb_char_t *end);
26
27 static const lxb_char_t *
28 lxb_html_tokenizer_state_script_data_less_than_sign(lxb_html_tokenizer_t *tkz,
29 const lxb_char_t *data,
30 const lxb_char_t *end);
31
32 static const lxb_char_t *
33 lxb_html_tokenizer_state_script_data_end_tag_open(lxb_html_tokenizer_t *tkz,
34 const lxb_char_t *data,
35 const lxb_char_t *end);
36
37 static const lxb_char_t *
38 lxb_html_tokenizer_state_script_data_end_tag_name(lxb_html_tokenizer_t *tkz,
39 const lxb_char_t *data,
40 const lxb_char_t *end);
41
42 static const lxb_char_t *
43 lxb_html_tokenizer_state_script_data_escape_start(lxb_html_tokenizer_t *tkz,
44 const lxb_char_t *data,
45 const lxb_char_t *end);
46
47 static const lxb_char_t *
48 lxb_html_tokenizer_state_script_data_escape_start_dash(
49 lxb_html_tokenizer_t *tkz,
50 const lxb_char_t *data,
51 const lxb_char_t *end);
52
53 static const lxb_char_t *
54 lxb_html_tokenizer_state_script_data_escaped(lxb_html_tokenizer_t *tkz,
55 const lxb_char_t *data,
56 const lxb_char_t *end);
57
58 static const lxb_char_t *
59 lxb_html_tokenizer_state_script_data_escaped_dash(lxb_html_tokenizer_t *tkz,
60 const lxb_char_t *data,
61 const lxb_char_t *end);
62
63 static const lxb_char_t *
64 lxb_html_tokenizer_state_script_data_escaped_dash_dash(
65 lxb_html_tokenizer_t *tkz,
66 const lxb_char_t *data,
67 const lxb_char_t *end);
68
69 static const lxb_char_t *
70 lxb_html_tokenizer_state_script_data_escaped_less_than_sign(
71 lxb_html_tokenizer_t *tkz,
72 const lxb_char_t *data,
73 const lxb_char_t *end);
74
75 static const lxb_char_t *
76 lxb_html_tokenizer_state_script_data_escaped_end_tag_open(
77 lxb_html_tokenizer_t *tkz,
78 const lxb_char_t *data,
79 const lxb_char_t *end);
80
81 static const lxb_char_t *
82 lxb_html_tokenizer_state_script_data_escaped_end_tag_name(
83 lxb_html_tokenizer_t *tkz,
84 const lxb_char_t *data,
85 const lxb_char_t *end);
86
87 static const lxb_char_t *
88 lxb_html_tokenizer_state_script_data_double_escape_start(
89 lxb_html_tokenizer_t *tkz,
90 const lxb_char_t *data,
91 const lxb_char_t *end);
92
93 static const lxb_char_t *
94 lxb_html_tokenizer_state_script_data_double_escaped(lxb_html_tokenizer_t *tkz,
95 const lxb_char_t *data,
96 const lxb_char_t *end);
97
98 static const lxb_char_t *
99 lxb_html_tokenizer_state_script_data_double_escaped_dash(
100 lxb_html_tokenizer_t *tkz,
101 const lxb_char_t *data,
102 const lxb_char_t *end);
103
104 static const lxb_char_t *
105 lxb_html_tokenizer_state_script_data_double_escaped_dash_dash(
106 lxb_html_tokenizer_t *tkz,
107 const lxb_char_t *data,
108 const lxb_char_t *end);
109
110 static const lxb_char_t *
111 lxb_html_tokenizer_state_script_data_double_escaped_less_than_sign(
112 lxb_html_tokenizer_t *tkz,
113 const lxb_char_t *data,
114 const lxb_char_t *end);
115
116 static const lxb_char_t *
117 lxb_html_tokenizer_state_script_data_double_escaped_end_tag_open(
118 lxb_html_tokenizer_t *tkz,
119 const lxb_char_t *data,
120 const lxb_char_t *end);
121
122 static const lxb_char_t *
123 lxb_html_tokenizer_state_script_data_double_escape_end(
124 lxb_html_tokenizer_t *tkz,
125 const lxb_char_t *data,
126 const lxb_char_t *end);
127
128
129 /*
130 * Helper function. No in the specification. For 12.2.5.4 Script data state
131 */
132 const lxb_char_t *
lxb_html_tokenizer_state_script_data_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)133 lxb_html_tokenizer_state_script_data_before(lxb_html_tokenizer_t *tkz,
134 const lxb_char_t *data,
135 const lxb_char_t *end)
136 {
137 if (tkz->is_eof == false) {
138 lxb_html_tokenizer_state_token_set_begin(tkz, data);
139 }
140
141 tkz->state = lxb_html_tokenizer_state_script_data;
142
143 return data;
144 }
145
146 /*
147 * 12.2.5.4 Script data state
148 */
149 static const lxb_char_t *
lxb_html_tokenizer_state_script_data(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)150 lxb_html_tokenizer_state_script_data(lxb_html_tokenizer_t *tkz,
151 const lxb_char_t *data,
152 const lxb_char_t *end)
153 {
154 lxb_html_tokenizer_state_begin_set(tkz, data);
155
156 while (data != end) {
157 switch (*data) {
158 /* U+003C LESS-THAN SIGN (<) */
159 case 0x3C:
160 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
161 lxb_html_tokenizer_state_token_set_end(tkz, data);
162
163 tkz->state =
164 lxb_html_tokenizer_state_script_data_less_than_sign;
165
166 return (data + 1);
167
168 /* U+000D CARRIAGE RETURN (CR) */
169 case 0x0D:
170 if (++data >= end) {
171 lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
172
173 tkz->state = lxb_html_tokenizer_state_cr;
174 tkz->state_return = lxb_html_tokenizer_state_script_data;
175
176 return data;
177 }
178
179 lxb_html_tokenizer_state_append_data_m(tkz, data);
180 tkz->pos[-1] = 0x0A;
181
182 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
183
184 if (*data != 0x0A) {
185 lxb_html_tokenizer_state_begin_set(tkz, data);
186 data--;
187 }
188
189 break;
190
191 /*
192 * U+0000 NULL
193 * EOF
194 */
195 case 0x00:
196 lxb_html_tokenizer_state_append_data_m(tkz, data);
197
198 if (tkz->is_eof) {
199 if (tkz->token->begin != NULL) {
200 lxb_html_tokenizer_state_token_set_end_oef(tkz);
201 }
202
203 tkz->token->tag_id = LXB_TAG__TEXT;
204
205 lxb_html_tokenizer_state_set_text(tkz);
206 lxb_html_tokenizer_state_token_done_m(tkz, end);
207
208 return end;
209 }
210
211 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
212 lxb_html_tokenizer_state_append_replace_m(tkz);
213
214 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
215 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
216 break;
217
218 default:
219 break;
220 }
221
222 data++;
223 }
224
225 lxb_html_tokenizer_state_append_data_m(tkz, data);
226
227 return data;
228 }
229
230 /*
231 * 12.2.5.15 Script data less-than sign state
232 */
233 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_less_than_sign(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)234 lxb_html_tokenizer_state_script_data_less_than_sign(lxb_html_tokenizer_t *tkz,
235 const lxb_char_t *data,
236 const lxb_char_t *end)
237 {
238 switch (*data) {
239 /* U+002F SOLIDUS (/) */
240 case 0x2F:
241 tkz->state = lxb_html_tokenizer_state_script_data_end_tag_open;
242
243 return (data + 1);
244
245 /* U+0021 EXCLAMATION MARK (!) */
246 case 0x21:
247 tkz->state = lxb_html_tokenizer_state_script_data_escape_start;
248
249 return (data + 1);
250
251 default:
252 tkz->state = lxb_html_tokenizer_state_script_data;
253
254 break;
255 }
256
257 return data;
258 }
259
260 /*
261 * 12.2.5.16 Script data end tag open state
262 */
263 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_end_tag_open(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)264 lxb_html_tokenizer_state_script_data_end_tag_open(lxb_html_tokenizer_t *tkz,
265 const lxb_char_t *data,
266 const lxb_char_t *end)
267 {
268 if (lexbor_str_res_alpha_character[*data] != LEXBOR_STR_RES_SLIP) {
269 tkz->entity_start = (tkz->pos - 1) - tkz->start;
270 tkz->temp = data;
271
272 tkz->state = lxb_html_tokenizer_state_script_data_end_tag_name;
273 }
274 else {
275 tkz->state = lxb_html_tokenizer_state_script_data;
276 }
277
278 lxb_html_tokenizer_state_append_m(tkz, "/", 1);
279
280 return data;
281 }
282
283 /*
284 * 12.2.5.17 Script data end tag name state
285 */
286 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_end_tag_name(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)287 lxb_html_tokenizer_state_script_data_end_tag_name(lxb_html_tokenizer_t *tkz,
288 const lxb_char_t *data,
289 const lxb_char_t *end)
290 {
291 lxb_html_tokenizer_state_begin_set(tkz, data);
292
293 while (data != end) {
294 switch (*data) {
295 /*
296 * U+0009 CHARACTER TABULATION (tab)
297 * U+000A LINE FEED (LF)
298 * U+000C FORM FEED (FF)
299 * U+000D CARRIAGE RETURN (CR)
300 * U+0020 SPACE
301 */
302 case 0x09:
303 case 0x0A:
304 case 0x0C:
305 case 0x0D:
306 case 0x20:
307 lxb_html_tokenizer_state_append_data_m(tkz, data);
308 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
309 tkz->pos);
310
311 if (tkz->tmp_tag_id != tkz->token->tag_id) {
312 goto anything_else;
313 }
314
315 tkz->state = lxb_html_tokenizer_state_before_attribute_name;
316
317 goto done;
318
319 /* U+002F SOLIDUS (/) */
320 case 0x2F:
321 lxb_html_tokenizer_state_append_data_m(tkz, data);
322 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
323 tkz->pos);
324
325 if (tkz->tmp_tag_id != tkz->token->tag_id) {
326 goto anything_else;
327 }
328
329 tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
330
331 goto done;
332
333 /* U+003E GREATER-THAN SIGN (>) */
334 case 0x3E:
335 lxb_html_tokenizer_state_append_data_m(tkz, data);
336 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
337 tkz->pos);
338
339 if (tkz->tmp_tag_id != tkz->token->tag_id) {
340 goto anything_else;
341 }
342
343 tkz->state = lxb_html_tokenizer_state_data_before;
344
345 /* Emit text token */
346 tkz->token->tag_id = LXB_TAG__TEXT;
347 tkz->pos = &tkz->start[tkz->entity_start];
348
349 lxb_html_tokenizer_state_set_text(tkz);
350 lxb_html_tokenizer_state_token_done_m(tkz, end);
351
352 /* Init close token */
353 tkz->token->tag_id = tkz->tmp_tag_id;
354 tkz->token->begin = tkz->temp;
355 tkz->token->end = data;
356 tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
357
358 /* Emit close token */
359 lxb_html_tokenizer_state_token_done_m(tkz, end);
360
361 return (data + 1);
362
363 default:
364 if (lexbor_str_res_alpha_character[*data]
365 == LEXBOR_STR_RES_SLIP)
366 {
367 goto anything_else;
368 }
369
370 break;
371 }
372
373 data++;
374 }
375
376 lxb_html_tokenizer_state_append_data_m(tkz, data);
377
378 return data;
379
380 anything_else:
381
382 tkz->state = lxb_html_tokenizer_state_script_data;
383
384 return data;
385
386 done:
387
388 /* Emit text token */
389 tkz->token->tag_id = LXB_TAG__TEXT;
390 tkz->pos = &tkz->start[tkz->entity_start];
391
392 lxb_html_tokenizer_state_set_text(tkz);
393 lxb_html_tokenizer_state_token_done_m(tkz, end);
394
395 /* Init close token */
396 tkz->token->tag_id = tkz->tmp_tag_id;
397 tkz->token->begin = tkz->temp;
398 tkz->token->end = data;
399 tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
400
401 return (data + 1);
402 }
403
404 /*
405 * 12.2.5.18 Script data escape start state
406 */
407 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escape_start(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)408 lxb_html_tokenizer_state_script_data_escape_start(lxb_html_tokenizer_t *tkz,
409 const lxb_char_t *data,
410 const lxb_char_t *end)
411 {
412 /* U+002D HYPHEN-MINUS (-) */
413 if (*data == 0x2D) {
414 tkz->state = lxb_html_tokenizer_state_script_data_escape_start_dash;
415
416 return (data + 1);
417 }
418
419 lxb_html_tokenizer_state_append_m(tkz, "!", 1);
420
421 tkz->state = lxb_html_tokenizer_state_script_data;
422
423 return data;
424 }
425
426 /*
427 * 12.2.5.19 Script data escape start dash state
428 */
429 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escape_start_dash(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)430 lxb_html_tokenizer_state_script_data_escape_start_dash(
431 lxb_html_tokenizer_t *tkz,
432 const lxb_char_t *data,
433 const lxb_char_t *end)
434 {
435 /* U+002D HYPHEN-MINUS (-) */
436 if (*data == 0x2D) {
437 lxb_html_tokenizer_state_append_m(tkz, "!--", 3);
438
439 tkz->state = lxb_html_tokenizer_state_script_data_escaped_dash_dash;
440
441 return (data + 1);
442 }
443
444 lxb_html_tokenizer_state_append_m(tkz, "!-", 2);
445
446 tkz->state = lxb_html_tokenizer_state_script_data;
447
448 return data;
449 }
450
451 /*
452 * 12.2.5.20 Script data escaped state
453 */
454 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)455 lxb_html_tokenizer_state_script_data_escaped(lxb_html_tokenizer_t *tkz,
456 const lxb_char_t *data,
457 const lxb_char_t *end)
458 {
459 lxb_html_tokenizer_state_begin_set(tkz, data);
460
461 while (data != end) {
462 switch (*data) {
463 /* U+002D HYPHEN-MINUS (-) */
464 case 0x2D:
465 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
466
467 tkz->state = lxb_html_tokenizer_state_script_data_escaped_dash;
468
469 return (data + 1);
470
471 /* U+003C LESS-THAN SIGN (<) */
472 case 0x3C:
473 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
474 lxb_html_tokenizer_state_token_set_end(tkz, data);
475
476 tkz->state =
477 lxb_html_tokenizer_state_script_data_escaped_less_than_sign;
478
479 return (data + 1);
480
481 /* U+000D CARRIAGE RETURN (CR) */
482 case 0x0D:
483 if (++data >= end) {
484 lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
485
486 tkz->state = lxb_html_tokenizer_state_cr;
487 tkz->state_return = lxb_html_tokenizer_state_script_data_escaped;
488
489 return data;
490 }
491
492 lxb_html_tokenizer_state_append_data_m(tkz, data);
493 tkz->pos[-1] = 0x0A;
494
495 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
496
497 if (*data != 0x0A) {
498 lxb_html_tokenizer_state_begin_set(tkz, data);
499 data--;
500 }
501
502 break;
503
504 /*
505 * U+0000 NULL
506 * EOF
507 */
508 case 0x00:
509 lxb_html_tokenizer_state_append_data_m(tkz, data);
510
511 if (tkz->is_eof) {
512 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
513 LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE);
514
515 tkz->token->tag_id = LXB_TAG__TEXT;
516
517 lxb_html_tokenizer_state_set_text(tkz);
518 lxb_html_tokenizer_state_token_set_end_oef(tkz);
519 lxb_html_tokenizer_state_token_done_m(tkz, end);
520
521 return end;
522 }
523
524 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
525 lxb_html_tokenizer_state_append_replace_m(tkz);
526
527 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
528 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
529 break;
530
531 default:
532 break;
533 }
534
535 data++;
536 }
537
538 lxb_html_tokenizer_state_append_data_m(tkz, data);
539
540 return data;
541 }
542
543 /*
544 * 12.2.5.21 Script data escaped dash state
545 */
546 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_dash(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)547 lxb_html_tokenizer_state_script_data_escaped_dash(lxb_html_tokenizer_t *tkz,
548 const lxb_char_t *data,
549 const lxb_char_t *end)
550 {
551 switch (*data) {
552 /* U+002D HYPHEN-MINUS (-) */
553 case 0x2D:
554 lxb_html_tokenizer_state_append_m(tkz, data, 1);
555
556 tkz->state = lxb_html_tokenizer_state_script_data_escaped_dash_dash;
557
558 return (data + 1);
559
560 /* U+003C LESS-THAN SIGN (<) */
561 case 0x3C:
562 lxb_html_tokenizer_state_append_m(tkz, data, 1);
563 lxb_html_tokenizer_state_token_set_end(tkz, data);
564
565 tkz->state =
566 lxb_html_tokenizer_state_script_data_escaped_less_than_sign;
567
568 return (data + 1);
569
570 /*
571 * U+0000 NULL
572 * EOF
573 */
574 case 0x00:
575 if (tkz->is_eof) {
576 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
577 LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE);
578
579 tkz->token->tag_id = LXB_TAG__TEXT;
580
581 lxb_html_tokenizer_state_set_text(tkz);
582 lxb_html_tokenizer_state_token_set_end_oef(tkz);
583 lxb_html_tokenizer_state_token_done_m(tkz, end);
584
585 return end;
586 }
587
588 lxb_html_tokenizer_state_append_replace_m(tkz);
589
590 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
591 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
592
593 tkz->state = lxb_html_tokenizer_state_script_data_escaped;
594
595 return (data + 1);
596
597 default:
598 tkz->state = lxb_html_tokenizer_state_script_data_escaped;
599
600 return data;
601 }
602 }
603
604 /*
605 * 12.2.5.22 Script data escaped dash dash state
606 */
607 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_dash_dash(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)608 lxb_html_tokenizer_state_script_data_escaped_dash_dash(lxb_html_tokenizer_t *tkz,
609 const lxb_char_t *data,
610 const lxb_char_t *end)
611 {
612 switch (*data) {
613 /* U+002D HYPHEN-MINUS (-) */
614 case 0x2D:
615 lxb_html_tokenizer_state_append_m(tkz, "-", 1);
616 return (data + 1);
617
618 /* U+003C LESS-THAN SIGN (<) */
619 case 0x3C:
620 lxb_html_tokenizer_state_append_m(tkz, "<", 1);
621 lxb_html_tokenizer_state_token_set_end(tkz, data);
622
623 tkz->state =
624 lxb_html_tokenizer_state_script_data_escaped_less_than_sign;
625
626 return (data + 1);
627
628 /* U+003E GREATER-THAN SIGN (>) */
629 case 0x3E:
630 tkz->state = lxb_html_tokenizer_state_script_data;
631 return data;
632
633 default:
634 tkz->state = lxb_html_tokenizer_state_script_data_escaped;
635 return data;
636 }
637 }
638
639 /*
640 * 12.2.5.23 Script data escaped less-than sign state
641 */
642 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_less_than_sign(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)643 lxb_html_tokenizer_state_script_data_escaped_less_than_sign(
644 lxb_html_tokenizer_t *tkz,
645 const lxb_char_t *data,
646 const lxb_char_t *end)
647 {
648 /* U+002F SOLIDUS (/) */
649 if (*data == 0x2F) {
650 tkz->state = lxb_html_tokenizer_state_script_data_escaped_end_tag_open;
651
652 return (data + 1);
653 }
654
655 /* ASCII alpha */
656 if (lexbor_str_res_alpha_character[*data] != LEXBOR_STR_RES_SLIP) {
657 tkz->entity_start = tkz->pos - tkz->start;
658
659 tkz->state = lxb_html_tokenizer_state_script_data_double_escape_start;
660
661 return data;
662 }
663
664 tkz->state = lxb_html_tokenizer_state_script_data_escaped;
665
666 return data;
667 }
668
669 /*
670 * 12.2.5.24 Script data escaped end tag open state
671 */
672 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_end_tag_open(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)673 lxb_html_tokenizer_state_script_data_escaped_end_tag_open(lxb_html_tokenizer_t *tkz,
674 const lxb_char_t *data,
675 const lxb_char_t *end)
676 {
677 if (lexbor_str_res_alpha_character[*data] != LEXBOR_STR_RES_SLIP) {
678 tkz->temp = data;
679 tkz->entity_start = (tkz->pos - 1) - tkz->start;
680
681 tkz->state = lxb_html_tokenizer_state_script_data_escaped_end_tag_name;
682 }
683 else {
684 tkz->state = lxb_html_tokenizer_state_script_data_escaped;
685 }
686
687 lxb_html_tokenizer_state_append_m(tkz, "/", 1);
688
689 return data;
690 }
691
692 /*
693 * 12.2.5.25 Script data escaped end tag name state
694 */
695 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_end_tag_name(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)696 lxb_html_tokenizer_state_script_data_escaped_end_tag_name(
697 lxb_html_tokenizer_t *tkz,
698 const lxb_char_t *data,
699 const lxb_char_t *end)
700 {
701 lxb_html_tokenizer_state_begin_set(tkz, data);
702
703 while (data != end) {
704 switch (*data) {
705 /*
706 * U+0009 CHARACTER TABULATION (tab)
707 * U+000A LINE FEED (LF)
708 * U+000C FORM FEED (FF)
709 * U+000D CARRIAGE RETURN (CR)
710 * U+0020 SPACE
711 */
712 case 0x09:
713 case 0x0A:
714 case 0x0C:
715 case 0x0D:
716 case 0x20:
717 lxb_html_tokenizer_state_append_data_m(tkz, data);
718 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
719 tkz->pos);
720
721 if (tkz->tmp_tag_id != tkz->token->tag_id) {
722 goto anything_else;
723 }
724
725 tkz->state = lxb_html_tokenizer_state_before_attribute_name;
726
727 goto done;
728
729 /* U+002F SOLIDUS (/) */
730 case 0x2F:
731 lxb_html_tokenizer_state_append_data_m(tkz, data);
732 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
733 tkz->pos);
734
735 if (tkz->tmp_tag_id != tkz->token->tag_id) {
736 goto anything_else;
737 }
738
739 tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
740
741 goto done;
742
743 /* U+003E GREATER-THAN SIGN (>) */
744 case 0x3E:
745 lxb_html_tokenizer_state_append_data_m(tkz, data);
746 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
747 tkz->pos);
748
749 if (tkz->tmp_tag_id != tkz->token->tag_id) {
750 goto anything_else;
751 }
752
753 tkz->state = lxb_html_tokenizer_state_data_before;
754
755 /* Emit text token */
756 tkz->token->tag_id = LXB_TAG__TEXT;
757 tkz->pos = &tkz->start[tkz->entity_start];
758
759 lxb_html_tokenizer_state_set_text(tkz);
760 lxb_html_tokenizer_state_token_done_m(tkz, end);
761
762 /* Init close token */
763 tkz->token->tag_id = tkz->tmp_tag_id;
764 tkz->token->begin = tkz->temp;
765 tkz->token->end = data;
766 tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
767
768 /* Emit close token */
769 lxb_html_tokenizer_state_token_done_m(tkz, end);
770
771 return (data + 1);
772
773 default:
774 if (lexbor_str_res_alpha_character[*data]
775 == LEXBOR_STR_RES_SLIP)
776 {
777 lxb_html_tokenizer_state_append_data_m(tkz, data);
778 goto anything_else;
779 }
780
781 break;
782 }
783
784 data++;
785 }
786
787 lxb_html_tokenizer_state_append_data_m(tkz, data);
788
789 return data;
790
791 anything_else:
792
793 tkz->state = lxb_html_tokenizer_state_script_data_escaped;
794
795 return data;
796
797 done:
798
799 /* Emit text token */
800 tkz->token->tag_id = LXB_TAG__TEXT;
801 tkz->pos = &tkz->start[tkz->entity_start];
802
803 lxb_html_tokenizer_state_set_text(tkz);
804 lxb_html_tokenizer_state_token_done_m(tkz, end);
805
806 /* Init close token */
807 tkz->token->tag_id = tkz->tmp_tag_id;
808 tkz->token->begin = tkz->temp;
809 tkz->token->end = data;
810 tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
811
812 return (data + 1);
813 }
814
815 /*
816 * 12.2.5.26 Script data double escape start state
817 */
818 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escape_start(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)819 lxb_html_tokenizer_state_script_data_double_escape_start(lxb_html_tokenizer_t *tkz,
820 const lxb_char_t *data,
821 const lxb_char_t *end)
822 {
823 lxb_html_tokenizer_state_begin_set(tkz, data);
824
825 while (data != end) {
826 switch (*data) {
827 /*
828 * U+0009 CHARACTER TABULATION (tab)
829 * U+000A LINE FEED (LF)
830 * U+000C FORM FEED (FF)
831 * U+000D CARRIAGE RETURN (CR)
832 * U+0020 SPACE
833 * U+002F SOLIDUS (/)
834 * U+003E GREATER-THAN SIGN (>)
835 */
836 case 0x09:
837 case 0x0A:
838 case 0x0C:
839 case 0x0D:
840 case 0x20:
841 case 0x2F:
842 case 0x3E:
843 lxb_html_tokenizer_state_append_data_m(tkz, data);
844
845 if ((tkz->pos - &tkz->start[tkz->entity_start]) == 6
846 && lexbor_str_data_ncasecmp(&tkz->start[tkz->entity_start],
847 (const lxb_char_t *) "script", 6))
848 {
849 tkz->state =
850 lxb_html_tokenizer_state_script_data_double_escaped;
851
852 return data;
853 }
854
855 tkz->state = lxb_html_tokenizer_state_script_data_escaped;
856
857 return data;
858
859 default:
860 if (lexbor_str_res_alpha_character[*data]
861 == LEXBOR_STR_RES_SLIP)
862 {
863 lxb_html_tokenizer_state_append_data_m(tkz, data);
864
865 tkz->state = lxb_html_tokenizer_state_script_data_escaped;
866
867 return data;
868 }
869
870 break;
871 }
872
873 data++;
874 }
875
876 lxb_html_tokenizer_state_append_data_m(tkz, data);
877
878 return data;
879 }
880
881 /*
882 * 12.2.5.27 Script data double escaped state
883 */
884 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)885 lxb_html_tokenizer_state_script_data_double_escaped(lxb_html_tokenizer_t *tkz,
886 const lxb_char_t *data,
887 const lxb_char_t *end)
888 {
889 lxb_html_tokenizer_state_begin_set(tkz, data);
890
891 while (data != end) {
892 switch (*data) {
893 /* U+002D HYPHEN-MINUS (-) */
894 case 0x2D:
895 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
896
897 tkz->state =
898 lxb_html_tokenizer_state_script_data_double_escaped_dash;
899
900 return (data + 1);
901
902 /* U+003C LESS-THAN SIGN (<) */
903 case 0x3C:
904 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
905
906 tkz->state =
907 lxb_html_tokenizer_state_script_data_double_escaped_less_than_sign;
908
909 return (data + 1);
910
911 /* U+000D CARRIAGE RETURN (CR) */
912 case 0x0D:
913 if (++data >= end) {
914 lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
915
916 tkz->state = lxb_html_tokenizer_state_cr;
917 tkz->state_return = lxb_html_tokenizer_state_script_data_double_escaped;
918
919 return data;
920 }
921
922 lxb_html_tokenizer_state_append_data_m(tkz, data);
923 tkz->pos[-1] = 0x0A;
924
925 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
926
927 if (*data != 0x0A) {
928 lxb_html_tokenizer_state_begin_set(tkz, data);
929 data--;
930 }
931
932 break;
933
934 /*
935 * U+0000 NULL
936 * EOF
937 */
938 case 0x00:
939 lxb_html_tokenizer_state_append_data_m(tkz, data);
940
941 if (tkz->is_eof) {
942 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
943 LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE);
944
945 tkz->token->tag_id = LXB_TAG__TEXT;
946
947 lxb_html_tokenizer_state_set_text(tkz);
948 lxb_html_tokenizer_state_token_set_end_oef(tkz);
949 lxb_html_tokenizer_state_token_done_m(tkz, end);
950
951 return end;
952 }
953
954 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
955 lxb_html_tokenizer_state_append_replace_m(tkz);
956
957 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
958 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
959 break;
960
961 default:
962 break;
963 }
964
965 data++;
966 }
967
968 lxb_html_tokenizer_state_append_data_m(tkz, data);
969
970 return data;
971 }
972
973 /*
974 * 12.2.5.28 Script data double escaped dash state
975 */
976 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped_dash(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)977 lxb_html_tokenizer_state_script_data_double_escaped_dash(lxb_html_tokenizer_t *tkz,
978 const lxb_char_t *data,
979 const lxb_char_t *end)
980 {
981 switch (*data) {
982 /* U+002D HYPHEN-MINUS (-) */
983 case 0x2D:
984 lxb_html_tokenizer_state_append_m(tkz, data, 1);
985
986 tkz->state =
987 lxb_html_tokenizer_state_script_data_double_escaped_dash_dash;
988
989 return (data + 1);
990
991 /* U+003C LESS-THAN SIGN (<) */
992 case 0x3C:
993 lxb_html_tokenizer_state_append_m(tkz, data, 1);
994
995 tkz->state =
996 lxb_html_tokenizer_state_script_data_double_escaped_less_than_sign;
997
998 return (data + 1);
999
1000 /*
1001 * U+0000 NULL
1002 * EOF
1003 */
1004 case 0x00:
1005 if (tkz->is_eof) {
1006 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1007 LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE);
1008
1009 tkz->token->tag_id = LXB_TAG__TEXT;
1010
1011 lxb_html_tokenizer_state_set_text(tkz);
1012 lxb_html_tokenizer_state_token_set_end_oef(tkz);
1013 lxb_html_tokenizer_state_token_done_m(tkz, end);
1014
1015 return end;
1016 }
1017
1018 lxb_html_tokenizer_state_append_replace_m(tkz);
1019
1020 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1021 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1022
1023 tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1024
1025 return (data + 1);
1026
1027 default:
1028 tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1029
1030 return data;
1031 }
1032 }
1033
1034 /*
1035 * 12.2.5.29 Script data double escaped dash dash state
1036 */
1037 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped_dash_dash(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1038 lxb_html_tokenizer_state_script_data_double_escaped_dash_dash(
1039 lxb_html_tokenizer_t *tkz,
1040 const lxb_char_t *data,
1041 const lxb_char_t *end)
1042 {
1043 switch (*data) {
1044 /* U+002D HYPHEN-MINUS (-) */
1045 case 0x2D:
1046 lxb_html_tokenizer_state_append_m(tkz, data, 1);
1047 return (data + 1);
1048
1049 /* U+003C LESS-THAN SIGN (<) */
1050 case 0x3C:
1051 lxb_html_tokenizer_state_append_m(tkz, data, 1);
1052
1053 tkz->state =
1054 lxb_html_tokenizer_state_script_data_double_escaped_less_than_sign;
1055
1056 return (data + 1);
1057
1058 /* U+003E GREATER-THAN SIGN (>) */
1059 case 0x3E:
1060 lxb_html_tokenizer_state_append_m(tkz, data, 1);
1061
1062 tkz->state = lxb_html_tokenizer_state_script_data;
1063
1064 return (data + 1);
1065
1066 /*
1067 * U+0000 NULL
1068 * EOF
1069 */
1070 case 0x00:
1071 if (tkz->is_eof) {
1072 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1073 LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE);
1074
1075 tkz->token->tag_id = LXB_TAG__TEXT;
1076
1077 lxb_html_tokenizer_state_set_text(tkz);
1078 lxb_html_tokenizer_state_token_set_end_oef(tkz);
1079 lxb_html_tokenizer_state_token_done_m(tkz, end);
1080
1081 return end;
1082 }
1083
1084 lxb_html_tokenizer_state_append_replace_m(tkz);
1085
1086 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1087 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1088
1089 tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1090
1091 return (data + 1);
1092
1093 default:
1094 tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1095
1096 return data;
1097 }
1098
1099 return data;
1100 }
1101
1102 /*
1103 * 12.2.5.30 Script data double escaped less-than sign state
1104 */
1105 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped_less_than_sign(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1106 lxb_html_tokenizer_state_script_data_double_escaped_less_than_sign(
1107 lxb_html_tokenizer_t *tkz,
1108 const lxb_char_t *data,
1109 const lxb_char_t *end)
1110 {
1111 /* U+002F SOLIDUS (/) */
1112 if (*data == 0x2F) {
1113 tkz->state =
1114 lxb_html_tokenizer_state_script_data_double_escaped_end_tag_open;
1115
1116 return (data + 1);
1117 }
1118
1119 tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1120
1121 return data;
1122 }
1123
1124 /*
1125 * 12.2.5.30.5 Helper function. No in the specification.
1126 */
1127 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped_end_tag_open(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1128 lxb_html_tokenizer_state_script_data_double_escaped_end_tag_open(
1129 lxb_html_tokenizer_t *tkz,
1130 const lxb_char_t *data,
1131 const lxb_char_t *end)
1132 {
1133 if (lexbor_str_res_alpha_character[*data] != LEXBOR_STR_RES_SLIP) {
1134 tkz->entity_start = (tkz->pos + 1) - tkz->start;
1135
1136 tkz->state = lxb_html_tokenizer_state_script_data_double_escape_end;
1137 }
1138 else {
1139 tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1140 }
1141
1142 lxb_html_tokenizer_state_append_m(tkz, "/", 1);
1143
1144 return data;
1145 }
1146
1147 /*
1148 * 12.2.5.31 Script data double escape end state
1149 */
1150 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escape_end(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1151 lxb_html_tokenizer_state_script_data_double_escape_end(
1152 lxb_html_tokenizer_t *tkz,
1153 const lxb_char_t *data,
1154 const lxb_char_t *end)
1155 {
1156 lxb_html_tokenizer_state_begin_set(tkz, data);
1157
1158 while (data != end) {
1159 switch (*data) {
1160 /*
1161 * U+0009 CHARACTER TABULATION (tab)
1162 * U+000A LINE FEED (LF)
1163 * U+000C FORM FEED (FF)
1164 * U+000D CARRIAGE RETURN (CR)
1165 * U+0020 SPACE
1166 * U+002F SOLIDUS (/)
1167 * U+003E GREATER-THAN SIGN (>)
1168 */
1169 case 0x09:
1170 case 0x0A:
1171 case 0x0C:
1172 case 0x0D:
1173 case 0x20:
1174 case 0x2F:
1175 case 0x3E:
1176 lxb_html_tokenizer_state_append_data_m(tkz, data);
1177
1178 if ((tkz->pos - &tkz->start[tkz->entity_start]) == 6
1179 && lexbor_str_data_ncasecmp(&tkz->start[tkz->entity_start],
1180 (const lxb_char_t *) "script", 6))
1181 {
1182 tkz->state = lxb_html_tokenizer_state_script_data_escaped;
1183 return data;
1184 }
1185
1186 tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1187
1188 return data;
1189
1190 default:
1191 if (lexbor_str_res_alpha_character[*data]
1192 == LEXBOR_STR_RES_SLIP)
1193 {
1194 lxb_html_tokenizer_state_append_data_m(tkz, data);
1195
1196 tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1197 return data;
1198 }
1199
1200 break;
1201 }
1202
1203 data++;
1204 }
1205
1206 lxb_html_tokenizer_state_append_data_m(tkz, data);
1207
1208 return data;
1209 }
1210