1 /*
2  * Copyright (C) 2018-2020 Alexander Borisov
3  *
4  * Author: Alexander Borisov <borisov@lexbor.com>
5  */
6 
7 #include "lexbor/html/tokenizer/state_script.h"
8 #include "lexbor/html/tokenizer/state.h"
9 
10 #define LEXBOR_STR_RES_ALPHA_CHARACTER
11 #define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
12 #include "lexbor/core/str_res.h"
13 
14 #include "lexbor/core/str_res.h"
15 
16 
/*
 * Prototype of lxb_tag_append_lower(), declared here directly; its
 * definition is not part of this file.
 * NOTE(review): nothing in the visible part of this file calls it by name;
 * presumably it is reached through lxb_html_tokenizer_state_set_tag_m() --
 * confirm against the macro definition.
 */
const lxb_tag_data_t *
lxb_tag_append_lower(lexbor_hash_t *hash,
                     const lxb_char_t *name, size_t length);
20 
21 
/*
 * Forward declarations of the file-local script data state handlers.
 *
 * Every handler has the same shape: it receives the tokenizer, the current
 * input position (`data`) and the end of the input being scanned (`end`),
 * and returns the position from which the next handler continues.
 */
static const lxb_char_t *
lxb_html_tokenizer_state_script_data(lxb_html_tokenizer_t *tkz,
                                     const lxb_char_t *data,
                                     const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_less_than_sign(lxb_html_tokenizer_t *tkz,
                                                    const lxb_char_t *data,
                                                    const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_end_tag_open(lxb_html_tokenizer_t *tkz,
                                                  const lxb_char_t *data,
                                                  const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_end_tag_name(lxb_html_tokenizer_t *tkz,
                                                  const lxb_char_t *data,
                                                  const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escape_start(lxb_html_tokenizer_t *tkz,
                                                  const lxb_char_t *data,
                                                  const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escape_start_dash(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped(lxb_html_tokenizer_t *tkz,
                                             const lxb_char_t *data,
                                             const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_dash(lxb_html_tokenizer_t *tkz,
                                                  const lxb_char_t *data,
                                                  const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_dash_dash(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_less_than_sign(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_end_tag_open(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_end_tag_name(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escape_start(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped(lxb_html_tokenizer_t *tkz,
                                                    const lxb_char_t *data,
                                                    const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped_dash(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped_dash_dash(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped_less_than_sign(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped_end_tag_open(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escape_end(
                                                      lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);
127 
128 
129 /*
130  * Helper function. No in the specification. For 12.2.5.4 Script data state
131  */
132 const lxb_char_t *
lxb_html_tokenizer_state_script_data_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)133 lxb_html_tokenizer_state_script_data_before(lxb_html_tokenizer_t *tkz,
134                                             const lxb_char_t *data,
135                                             const lxb_char_t *end)
136 {
137     if (tkz->is_eof == false) {
138         lxb_html_tokenizer_state_token_set_begin(tkz, data);
139     }
140 
141     tkz->state = lxb_html_tokenizer_state_script_data;
142 
143     return data;
144 }
145 
146 /*
147  * 12.2.5.4 Script data state
148  */
149 static const lxb_char_t *
lxb_html_tokenizer_state_script_data(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)150 lxb_html_tokenizer_state_script_data(lxb_html_tokenizer_t *tkz,
151                                      const lxb_char_t *data,
152                                      const lxb_char_t *end)
153 {
154     lxb_html_tokenizer_state_begin_set(tkz, data);
155 
156     while (data != end) {
157         switch (*data) {
158             /* U+003C LESS-THAN SIGN (<) */
159             case 0x3C:
160                 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
161                 lxb_html_tokenizer_state_token_set_end(tkz, data);
162 
163                 tkz->state =
164                     lxb_html_tokenizer_state_script_data_less_than_sign;
165 
166                 return (data + 1);
167 
168             /* U+000D CARRIAGE RETURN (CR) */
169             case 0x0D:
170                 if (++data >= end) {
171                     lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
172 
173                     tkz->state = lxb_html_tokenizer_state_cr;
174                     tkz->state_return = lxb_html_tokenizer_state_script_data;
175 
176                     return data;
177                 }
178 
179                 lxb_html_tokenizer_state_append_data_m(tkz, data);
180                 tkz->pos[-1] = 0x0A;
181 
182                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
183 
184                 if (*data != 0x0A) {
185                     lxb_html_tokenizer_state_begin_set(tkz, data);
186                     data--;
187                 }
188 
189                 break;
190 
191             /*
192              * U+0000 NULL
193              * EOF
194              */
195             case 0x00:
196                 lxb_html_tokenizer_state_append_data_m(tkz, data);
197 
198                 if (tkz->is_eof) {
199                     if (tkz->token->begin != NULL) {
200                         lxb_html_tokenizer_state_token_set_end_oef(tkz);
201                     }
202 
203                     tkz->token->tag_id = LXB_TAG__TEXT;
204 
205                     lxb_html_tokenizer_state_set_text(tkz);
206                     lxb_html_tokenizer_state_token_done_m(tkz, end);
207 
208                     return end;
209                 }
210 
211                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
212                 lxb_html_tokenizer_state_append_replace_m(tkz);
213 
214                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
215                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
216                 break;
217 
218             default:
219                 break;
220         }
221 
222         data++;
223     }
224 
225     lxb_html_tokenizer_state_append_data_m(tkz, data);
226 
227     return data;
228 }
229 
230 /*
231  * 12.2.5.15 Script data less-than sign state
232  */
233 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_less_than_sign(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)234 lxb_html_tokenizer_state_script_data_less_than_sign(lxb_html_tokenizer_t *tkz,
235                                                     const lxb_char_t *data,
236                                                     const lxb_char_t *end)
237 {
238     switch (*data) {
239         /* U+002F SOLIDUS (/) */
240         case 0x2F:
241             tkz->state = lxb_html_tokenizer_state_script_data_end_tag_open;
242 
243             return (data + 1);
244 
245         /* U+0021 EXCLAMATION MARK (!) */
246         case 0x21:
247             tkz->state = lxb_html_tokenizer_state_script_data_escape_start;
248 
249             return (data + 1);
250 
251         default:
252             tkz->state = lxb_html_tokenizer_state_script_data;
253 
254             break;
255     }
256 
257     return data;
258 }
259 
260 /*
261  * 12.2.5.16 Script data end tag open state
262  */
263 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_end_tag_open(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)264 lxb_html_tokenizer_state_script_data_end_tag_open(lxb_html_tokenizer_t *tkz,
265                                                   const lxb_char_t *data,
266                                                   const lxb_char_t *end)
267 {
268     if (lexbor_str_res_alpha_character[*data] != LEXBOR_STR_RES_SLIP) {
269         tkz->entity_start = (tkz->pos - 1) - tkz->start;
270         tkz->temp = data;
271 
272         tkz->state = lxb_html_tokenizer_state_script_data_end_tag_name;
273     }
274     else {
275         tkz->state = lxb_html_tokenizer_state_script_data;
276     }
277 
278     lxb_html_tokenizer_state_append_m(tkz, "/", 1);
279 
280     return data;
281 }
282 
283 /*
284  * 12.2.5.17 Script data end tag name state
285  */
286 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_end_tag_name(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)287 lxb_html_tokenizer_state_script_data_end_tag_name(lxb_html_tokenizer_t *tkz,
288                                                   const lxb_char_t *data,
289                                                   const lxb_char_t *end)
290 {
291     lxb_html_tokenizer_state_begin_set(tkz, data);
292 
293     while (data != end) {
294         switch (*data) {
295             /*
296              * U+0009 CHARACTER TABULATION (tab)
297              * U+000A LINE FEED (LF)
298              * U+000C FORM FEED (FF)
299              * U+000D CARRIAGE RETURN (CR)
300              * U+0020 SPACE
301              */
302             case 0x09:
303             case 0x0A:
304             case 0x0C:
305             case 0x0D:
306             case 0x20:
307                 lxb_html_tokenizer_state_append_data_m(tkz, data);
308                 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
309                                                    tkz->pos);
310 
311                 if (tkz->tmp_tag_id != tkz->token->tag_id) {
312                     goto anything_else;
313                 }
314 
315                 tkz->state = lxb_html_tokenizer_state_before_attribute_name;
316 
317                 goto done;
318 
319             /* U+002F SOLIDUS (/) */
320             case 0x2F:
321                 lxb_html_tokenizer_state_append_data_m(tkz, data);
322                 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
323                                                    tkz->pos);
324 
325                 if (tkz->tmp_tag_id != tkz->token->tag_id) {
326                     goto anything_else;
327                 }
328 
329                 tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
330 
331                 goto done;
332 
333             /* U+003E GREATER-THAN SIGN (>) */
334             case 0x3E:
335                 lxb_html_tokenizer_state_append_data_m(tkz, data);
336                 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
337                                                    tkz->pos);
338 
339                 if (tkz->tmp_tag_id != tkz->token->tag_id) {
340                     goto anything_else;
341                 }
342 
343                 tkz->state = lxb_html_tokenizer_state_data_before;
344 
345                 /* Emit text token */
346                 tkz->token->tag_id = LXB_TAG__TEXT;
347                 tkz->pos = &tkz->start[tkz->entity_start];
348 
349                 lxb_html_tokenizer_state_set_text(tkz);
350                 lxb_html_tokenizer_state_token_done_m(tkz, end);
351 
352                 /* Init close token */
353                 tkz->token->tag_id = tkz->tmp_tag_id;
354                 tkz->token->begin = tkz->temp;
355                 tkz->token->end = data;
356                 tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
357 
358                 /* Emit close token */
359                 lxb_html_tokenizer_state_token_done_m(tkz, end);
360 
361                 return (data + 1);
362 
363             default:
364                 if (lexbor_str_res_alpha_character[*data]
365                     == LEXBOR_STR_RES_SLIP)
366                 {
367                     goto anything_else;
368                 }
369 
370                 break;
371         }
372 
373         data++;
374     }
375 
376     lxb_html_tokenizer_state_append_data_m(tkz, data);
377 
378     return data;
379 
380 anything_else:
381 
382     tkz->state = lxb_html_tokenizer_state_script_data;
383 
384     return data;
385 
386 done:
387 
388     /* Emit text token */
389     tkz->token->tag_id = LXB_TAG__TEXT;
390     tkz->pos = &tkz->start[tkz->entity_start];
391 
392     lxb_html_tokenizer_state_set_text(tkz);
393     lxb_html_tokenizer_state_token_done_m(tkz, end);
394 
395     /* Init close token */
396     tkz->token->tag_id = tkz->tmp_tag_id;
397     tkz->token->begin = tkz->temp;
398     tkz->token->end = data;
399     tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
400 
401     return (data + 1);
402 }
403 
404 /*
405  * 12.2.5.18 Script data escape start state
406  */
407 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escape_start(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)408 lxb_html_tokenizer_state_script_data_escape_start(lxb_html_tokenizer_t *tkz,
409                                                   const lxb_char_t *data,
410                                                   const lxb_char_t *end)
411 {
412     /* U+002D HYPHEN-MINUS (-) */
413     if (*data == 0x2D) {
414         tkz->state = lxb_html_tokenizer_state_script_data_escape_start_dash;
415 
416         return (data + 1);
417     }
418 
419     lxb_html_tokenizer_state_append_m(tkz, "!", 1);
420 
421     tkz->state = lxb_html_tokenizer_state_script_data;
422 
423     return data;
424 }
425 
426 /*
427  * 12.2.5.19 Script data escape start dash state
428  */
429 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escape_start_dash(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)430 lxb_html_tokenizer_state_script_data_escape_start_dash(
431                                                       lxb_html_tokenizer_t *tkz,
432                                                       const lxb_char_t *data,
433                                                       const lxb_char_t *end)
434 {
435     /* U+002D HYPHEN-MINUS (-) */
436     if (*data == 0x2D) {
437         lxb_html_tokenizer_state_append_m(tkz, "!--", 3);
438 
439         tkz->state = lxb_html_tokenizer_state_script_data_escaped_dash_dash;
440 
441         return (data + 1);
442     }
443 
444     lxb_html_tokenizer_state_append_m(tkz, "!-", 2);
445 
446     tkz->state = lxb_html_tokenizer_state_script_data;
447 
448     return data;
449 }
450 
451 /*
452  * 12.2.5.20 Script data escaped state
453  */
454 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)455 lxb_html_tokenizer_state_script_data_escaped(lxb_html_tokenizer_t *tkz,
456                                              const lxb_char_t *data,
457                                              const lxb_char_t *end)
458 {
459     lxb_html_tokenizer_state_begin_set(tkz, data);
460 
461     while (data != end) {
462         switch (*data) {
463             /* U+002D HYPHEN-MINUS (-) */
464             case 0x2D:
465                 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
466 
467                 tkz->state = lxb_html_tokenizer_state_script_data_escaped_dash;
468 
469                 return (data + 1);
470 
471             /* U+003C LESS-THAN SIGN (<) */
472             case 0x3C:
473                 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
474                 lxb_html_tokenizer_state_token_set_end(tkz, data);
475 
476                 tkz->state =
477                     lxb_html_tokenizer_state_script_data_escaped_less_than_sign;
478 
479                 return (data + 1);
480 
481             /* U+000D CARRIAGE RETURN (CR) */
482             case 0x0D:
483                 if (++data >= end) {
484                     lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
485 
486                     tkz->state = lxb_html_tokenizer_state_cr;
487                     tkz->state_return = lxb_html_tokenizer_state_script_data_escaped;
488 
489                     return data;
490                 }
491 
492                 lxb_html_tokenizer_state_append_data_m(tkz, data);
493                 tkz->pos[-1] = 0x0A;
494 
495                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
496 
497                 if (*data != 0x0A) {
498                     lxb_html_tokenizer_state_begin_set(tkz, data);
499                     data--;
500                 }
501 
502                 break;
503 
504             /*
505              * U+0000 NULL
506              * EOF
507              */
508             case 0x00:
509                 lxb_html_tokenizer_state_append_data_m(tkz, data);
510 
511                 if (tkz->is_eof) {
512                     lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
513                                        LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE);
514 
515                     tkz->token->tag_id = LXB_TAG__TEXT;
516 
517                     lxb_html_tokenizer_state_set_text(tkz);
518                     lxb_html_tokenizer_state_token_set_end_oef(tkz);
519                     lxb_html_tokenizer_state_token_done_m(tkz, end);
520 
521                     return end;
522                 }
523 
524                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
525                 lxb_html_tokenizer_state_append_replace_m(tkz);
526 
527                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
528                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
529                 break;
530 
531             default:
532                 break;
533         }
534 
535         data++;
536     }
537 
538     lxb_html_tokenizer_state_append_data_m(tkz, data);
539 
540     return data;
541 }
542 
543 /*
544  * 12.2.5.21 Script data escaped dash state
545  */
546 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_dash(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)547 lxb_html_tokenizer_state_script_data_escaped_dash(lxb_html_tokenizer_t *tkz,
548                                                   const lxb_char_t *data,
549                                                   const lxb_char_t *end)
550 {
551     switch (*data) {
552         /* U+002D HYPHEN-MINUS (-) */
553         case 0x2D:
554             lxb_html_tokenizer_state_append_m(tkz, data, 1);
555 
556             tkz->state = lxb_html_tokenizer_state_script_data_escaped_dash_dash;
557 
558             return (data + 1);
559 
560         /* U+003C LESS-THAN SIGN (<) */
561         case 0x3C:
562             lxb_html_tokenizer_state_append_m(tkz, data, 1);
563             lxb_html_tokenizer_state_token_set_end(tkz, data);
564 
565             tkz->state =
566                 lxb_html_tokenizer_state_script_data_escaped_less_than_sign;
567 
568             return (data + 1);
569 
570         /*
571          * U+0000 NULL
572          * EOF
573          */
574         case 0x00:
575             if (tkz->is_eof) {
576                 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
577                                        LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE);
578 
579                 tkz->token->tag_id = LXB_TAG__TEXT;
580 
581                 lxb_html_tokenizer_state_set_text(tkz);
582                 lxb_html_tokenizer_state_token_set_end_oef(tkz);
583                 lxb_html_tokenizer_state_token_done_m(tkz, end);
584 
585                 return end;
586             }
587 
588             lxb_html_tokenizer_state_append_replace_m(tkz);
589 
590             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
591                                          LXB_HTML_TOKENIZER_ERROR_UNNUCH);
592 
593             tkz->state = lxb_html_tokenizer_state_script_data_escaped;
594 
595             return (data + 1);
596 
597         default:
598             tkz->state = lxb_html_tokenizer_state_script_data_escaped;
599 
600             return data;
601     }
602 }
603 
604 /*
605  * 12.2.5.22 Script data escaped dash dash state
606  */
607 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_dash_dash(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)608 lxb_html_tokenizer_state_script_data_escaped_dash_dash(lxb_html_tokenizer_t *tkz,
609                                                        const lxb_char_t *data,
610                                                        const lxb_char_t *end)
611 {
612     switch (*data) {
613         /* U+002D HYPHEN-MINUS (-) */
614         case 0x2D:
615             lxb_html_tokenizer_state_append_m(tkz, "-", 1);
616             return (data + 1);
617 
618         /* U+003C LESS-THAN SIGN (<) */
619         case 0x3C:
620             lxb_html_tokenizer_state_append_m(tkz, "<", 1);
621             lxb_html_tokenizer_state_token_set_end(tkz, data);
622 
623             tkz->state =
624                 lxb_html_tokenizer_state_script_data_escaped_less_than_sign;
625 
626             return (data + 1);
627 
628         /* U+003E GREATER-THAN SIGN (>) */
629         case 0x3E:
630             tkz->state = lxb_html_tokenizer_state_script_data;
631             return data;
632 
633         default:
634             tkz->state = lxb_html_tokenizer_state_script_data_escaped;
635             return data;
636     }
637 }
638 
639 /*
640  * 12.2.5.23 Script data escaped less-than sign state
641  */
642 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_less_than_sign(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)643 lxb_html_tokenizer_state_script_data_escaped_less_than_sign(
644                                                       lxb_html_tokenizer_t *tkz,
645                                                       const lxb_char_t *data,
646                                                       const lxb_char_t *end)
647 {
648     /* U+002F SOLIDUS (/) */
649     if (*data == 0x2F) {
650         tkz->state = lxb_html_tokenizer_state_script_data_escaped_end_tag_open;
651 
652         return (data + 1);
653     }
654 
655     /* ASCII alpha */
656     if (lexbor_str_res_alpha_character[*data] != LEXBOR_STR_RES_SLIP) {
657         tkz->entity_start = tkz->pos - tkz->start;
658 
659         tkz->state = lxb_html_tokenizer_state_script_data_double_escape_start;
660 
661         return data;
662     }
663 
664     tkz->state = lxb_html_tokenizer_state_script_data_escaped;
665 
666     return data;
667 }
668 
669 /*
670  * 12.2.5.24 Script data escaped end tag open state
671  */
672 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_end_tag_open(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)673 lxb_html_tokenizer_state_script_data_escaped_end_tag_open(lxb_html_tokenizer_t *tkz,
674                                                           const lxb_char_t *data,
675                                                           const lxb_char_t *end)
676 {
677     if (lexbor_str_res_alpha_character[*data] != LEXBOR_STR_RES_SLIP) {
678         tkz->temp = data;
679         tkz->entity_start = (tkz->pos - 1) - tkz->start;
680 
681         tkz->state = lxb_html_tokenizer_state_script_data_escaped_end_tag_name;
682     }
683     else {
684         tkz->state = lxb_html_tokenizer_state_script_data_escaped;
685     }
686 
687     lxb_html_tokenizer_state_append_m(tkz, "/", 1);
688 
689     return data;
690 }
691 
692 /*
693  * 12.2.5.25 Script data escaped end tag name state
694  */
695 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_escaped_end_tag_name(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)696 lxb_html_tokenizer_state_script_data_escaped_end_tag_name(
697                                                       lxb_html_tokenizer_t *tkz,
698                                                       const lxb_char_t *data,
699                                                       const lxb_char_t *end)
700 {
701     lxb_html_tokenizer_state_begin_set(tkz, data);
702 
703     while (data != end) {
704         switch (*data) {
705            /*
706             * U+0009 CHARACTER TABULATION (tab)
707             * U+000A LINE FEED (LF)
708             * U+000C FORM FEED (FF)
709             * U+000D CARRIAGE RETURN (CR)
710             * U+0020 SPACE
711             */
712             case 0x09:
713             case 0x0A:
714             case 0x0C:
715             case 0x0D:
716             case 0x20:
717                 lxb_html_tokenizer_state_append_data_m(tkz, data);
718                 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
719                                                    tkz->pos);
720 
721                 if (tkz->tmp_tag_id != tkz->token->tag_id) {
722                     goto anything_else;
723                 }
724 
725                 tkz->state = lxb_html_tokenizer_state_before_attribute_name;
726 
727                 goto done;
728 
729             /* U+002F SOLIDUS (/) */
730             case 0x2F:
731                 lxb_html_tokenizer_state_append_data_m(tkz, data);
732                 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
733                                                    tkz->pos);
734 
735                 if (tkz->tmp_tag_id != tkz->token->tag_id) {
736                     goto anything_else;
737                 }
738 
739                 tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
740 
741                 goto done;
742 
743             /* U+003E GREATER-THAN SIGN (>) */
744             case 0x3E:
745                 lxb_html_tokenizer_state_append_data_m(tkz, data);
746                 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
747                                                    tkz->pos);
748 
749                 if (tkz->tmp_tag_id != tkz->token->tag_id) {
750                     goto anything_else;
751                 }
752 
753                 tkz->state = lxb_html_tokenizer_state_data_before;
754 
755                 /* Emit text token */
756                 tkz->token->tag_id = LXB_TAG__TEXT;
757                 tkz->pos = &tkz->start[tkz->entity_start];
758 
759                 lxb_html_tokenizer_state_set_text(tkz);
760                 lxb_html_tokenizer_state_token_done_m(tkz, end);
761 
762                 /* Init close token */
763                 tkz->token->tag_id = tkz->tmp_tag_id;
764                 tkz->token->begin = tkz->temp;
765                 tkz->token->end = data;
766                 tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
767 
768                 /* Emit close token */
769                 lxb_html_tokenizer_state_token_done_m(tkz, end);
770 
771                 return (data + 1);
772 
773             default:
774                 if (lexbor_str_res_alpha_character[*data]
775                     == LEXBOR_STR_RES_SLIP)
776                 {
777                     lxb_html_tokenizer_state_append_data_m(tkz, data);
778                     goto anything_else;
779                 }
780 
781                 break;
782         }
783 
784         data++;
785     }
786 
787     lxb_html_tokenizer_state_append_data_m(tkz, data);
788 
789     return data;
790 
791 anything_else:
792 
793     tkz->state = lxb_html_tokenizer_state_script_data_escaped;
794 
795     return data;
796 
797 done:
798 
799     /* Emit text token */
800     tkz->token->tag_id = LXB_TAG__TEXT;
801     tkz->pos = &tkz->start[tkz->entity_start];
802 
803     lxb_html_tokenizer_state_set_text(tkz);
804     lxb_html_tokenizer_state_token_done_m(tkz, end);
805 
806     /* Init close token */
807     tkz->token->tag_id = tkz->tmp_tag_id;
808     tkz->token->begin = tkz->temp;
809     tkz->token->end = data;
810     tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
811 
812     return (data + 1);
813 }
814 
815 /*
816  * 12.2.5.26 Script data double escape start state
817  */
818 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escape_start(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)819 lxb_html_tokenizer_state_script_data_double_escape_start(lxb_html_tokenizer_t *tkz,
820                                                          const lxb_char_t *data,
821                                                          const lxb_char_t *end)
822 {
823     lxb_html_tokenizer_state_begin_set(tkz, data);
824 
825     while (data != end) {
826         switch (*data) {
827             /*
828              * U+0009 CHARACTER TABULATION (tab)
829              * U+000A LINE FEED (LF)
830              * U+000C FORM FEED (FF)
831              * U+000D CARRIAGE RETURN (CR)
832              * U+0020 SPACE
833              * U+002F SOLIDUS (/)
834              * U+003E GREATER-THAN SIGN (>)
835              */
836             case 0x09:
837             case 0x0A:
838             case 0x0C:
839             case 0x0D:
840             case 0x20:
841             case 0x2F:
842             case 0x3E:
843                 lxb_html_tokenizer_state_append_data_m(tkz, data);
844 
845                 if ((tkz->pos - &tkz->start[tkz->entity_start]) == 6
846                     && lexbor_str_data_ncasecmp(&tkz->start[tkz->entity_start],
847                                                 (const lxb_char_t *) "script", 6))
848                 {
849                     tkz->state =
850                         lxb_html_tokenizer_state_script_data_double_escaped;
851 
852                     return data;
853                 }
854 
855                 tkz->state = lxb_html_tokenizer_state_script_data_escaped;
856 
857                 return data;
858 
859             default:
860                 if (lexbor_str_res_alpha_character[*data]
861                     == LEXBOR_STR_RES_SLIP)
862                 {
863                     lxb_html_tokenizer_state_append_data_m(tkz, data);
864 
865                     tkz->state = lxb_html_tokenizer_state_script_data_escaped;
866 
867                     return data;
868                 }
869 
870                 break;
871         }
872 
873         data++;
874     }
875 
876     lxb_html_tokenizer_state_append_data_m(tkz, data);
877 
878     return data;
879 }
880 
881 /*
882  * 12.2.5.27 Script data double escaped state
883  */
884 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)885 lxb_html_tokenizer_state_script_data_double_escaped(lxb_html_tokenizer_t *tkz,
886                                                     const lxb_char_t *data,
887                                                     const lxb_char_t *end)
888 {
889     lxb_html_tokenizer_state_begin_set(tkz, data);
890 
891     while (data != end) {
892         switch (*data) {
893             /* U+002D HYPHEN-MINUS (-) */
894             case 0x2D:
895                 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
896 
897                 tkz->state =
898                     lxb_html_tokenizer_state_script_data_double_escaped_dash;
899 
900                 return (data + 1);
901 
902             /* U+003C LESS-THAN SIGN (<) */
903             case 0x3C:
904                 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
905 
906                 tkz->state =
907              lxb_html_tokenizer_state_script_data_double_escaped_less_than_sign;
908 
909                 return (data + 1);
910 
911             /* U+000D CARRIAGE RETURN (CR) */
912             case 0x0D:
913                 if (++data >= end) {
914                     lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
915 
916                     tkz->state = lxb_html_tokenizer_state_cr;
917                     tkz->state_return = lxb_html_tokenizer_state_script_data_double_escaped;
918 
919                     return data;
920                 }
921 
922                 lxb_html_tokenizer_state_append_data_m(tkz, data);
923                 tkz->pos[-1] = 0x0A;
924 
925                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
926 
927                 if (*data != 0x0A) {
928                     lxb_html_tokenizer_state_begin_set(tkz, data);
929                     data--;
930                 }
931 
932                 break;
933 
934             /*
935              * U+0000 NULL
936              * EOF
937              */
938             case 0x00:
939                 lxb_html_tokenizer_state_append_data_m(tkz, data);
940 
941                 if (tkz->is_eof) {
942                     lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
943                                        LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE);
944 
945                     tkz->token->tag_id = LXB_TAG__TEXT;
946 
947                     lxb_html_tokenizer_state_set_text(tkz);
948                     lxb_html_tokenizer_state_token_set_end_oef(tkz);
949                     lxb_html_tokenizer_state_token_done_m(tkz, end);
950 
951                     return end;
952                 }
953 
954                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
955                 lxb_html_tokenizer_state_append_replace_m(tkz);
956 
957                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
958                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
959                 break;
960 
961             default:
962                 break;
963         }
964 
965         data++;
966     }
967 
968     lxb_html_tokenizer_state_append_data_m(tkz, data);
969 
970     return data;
971 }
972 
973 /*
974  * 12.2.5.28 Script data double escaped dash state
975  */
976 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped_dash(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)977 lxb_html_tokenizer_state_script_data_double_escaped_dash(lxb_html_tokenizer_t *tkz,
978                                                          const lxb_char_t *data,
979                                                          const lxb_char_t *end)
980 {
981     switch (*data) {
982         /* U+002D HYPHEN-MINUS (-) */
983         case 0x2D:
984             lxb_html_tokenizer_state_append_m(tkz, data, 1);
985 
986             tkz->state =
987                 lxb_html_tokenizer_state_script_data_double_escaped_dash_dash;
988 
989             return (data + 1);
990 
991         /* U+003C LESS-THAN SIGN (<) */
992         case 0x3C:
993             lxb_html_tokenizer_state_append_m(tkz, data, 1);
994 
995             tkz->state =
996              lxb_html_tokenizer_state_script_data_double_escaped_less_than_sign;
997 
998             return (data + 1);
999 
1000         /*
1001          * U+0000 NULL
1002          * EOF
1003          */
1004         case 0x00:
1005             if (tkz->is_eof) {
1006                 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1007                                        LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE);
1008 
1009                 tkz->token->tag_id = LXB_TAG__TEXT;
1010 
1011                 lxb_html_tokenizer_state_set_text(tkz);
1012                 lxb_html_tokenizer_state_token_set_end_oef(tkz);
1013                 lxb_html_tokenizer_state_token_done_m(tkz, end);
1014 
1015                 return end;
1016             }
1017 
1018             lxb_html_tokenizer_state_append_replace_m(tkz);
1019 
1020             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1021                                          LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1022 
1023             tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1024 
1025             return (data + 1);
1026 
1027         default:
1028             tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1029 
1030             return data;
1031     }
1032 }
1033 
1034 /*
1035  * 12.2.5.29 Script data double escaped dash dash state
1036  */
1037 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped_dash_dash(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1038 lxb_html_tokenizer_state_script_data_double_escaped_dash_dash(
1039                                                       lxb_html_tokenizer_t *tkz,
1040                                                       const lxb_char_t *data,
1041                                                       const lxb_char_t *end)
1042 {
1043     switch (*data) {
1044         /* U+002D HYPHEN-MINUS (-) */
1045         case 0x2D:
1046             lxb_html_tokenizer_state_append_m(tkz, data, 1);
1047             return (data + 1);
1048 
1049         /* U+003C LESS-THAN SIGN (<) */
1050         case 0x3C:
1051             lxb_html_tokenizer_state_append_m(tkz, data, 1);
1052 
1053             tkz->state =
1054              lxb_html_tokenizer_state_script_data_double_escaped_less_than_sign;
1055 
1056             return (data + 1);
1057 
1058         /* U+003E GREATER-THAN SIGN (>) */
1059         case 0x3E:
1060             lxb_html_tokenizer_state_append_m(tkz, data, 1);
1061 
1062             tkz->state = lxb_html_tokenizer_state_script_data;
1063 
1064             return (data + 1);
1065 
1066         /*
1067          * U+0000 NULL
1068          * EOF
1069          */
1070         case 0x00:
1071             if (tkz->is_eof) {
1072                 lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1073                                        LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE);
1074 
1075                 tkz->token->tag_id = LXB_TAG__TEXT;
1076 
1077                 lxb_html_tokenizer_state_set_text(tkz);
1078                 lxb_html_tokenizer_state_token_set_end_oef(tkz);
1079                 lxb_html_tokenizer_state_token_done_m(tkz, end);
1080 
1081                 return end;
1082             }
1083 
1084             lxb_html_tokenizer_state_append_replace_m(tkz);
1085 
1086             lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1087                                          LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1088 
1089             tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1090 
1091             return (data + 1);
1092 
1093         default:
1094             tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1095 
1096             return data;
1097     }
1098 
1099     return data;
1100 }
1101 
1102 /*
1103  * 12.2.5.30 Script data double escaped less-than sign state
1104  */
1105 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped_less_than_sign(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1106 lxb_html_tokenizer_state_script_data_double_escaped_less_than_sign(
1107                                                       lxb_html_tokenizer_t *tkz,
1108                                                       const lxb_char_t *data,
1109                                                       const lxb_char_t *end)
1110 {
1111     /* U+002F SOLIDUS (/) */
1112     if (*data == 0x2F) {
1113         tkz->state =
1114             lxb_html_tokenizer_state_script_data_double_escaped_end_tag_open;
1115 
1116         return (data + 1);
1117     }
1118 
1119     tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1120 
1121     return data;
1122 }
1123 
1124 /*
1125  * 12.2.5.30.5 Helper function. No in the specification.
1126  */
1127 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escaped_end_tag_open(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1128 lxb_html_tokenizer_state_script_data_double_escaped_end_tag_open(
1129                                                       lxb_html_tokenizer_t *tkz,
1130                                                       const lxb_char_t *data,
1131                                                       const lxb_char_t *end)
1132 {
1133     if (lexbor_str_res_alpha_character[*data] != LEXBOR_STR_RES_SLIP) {
1134         tkz->entity_start = (tkz->pos + 1) - tkz->start;
1135 
1136         tkz->state = lxb_html_tokenizer_state_script_data_double_escape_end;
1137     }
1138     else {
1139         tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1140     }
1141 
1142     lxb_html_tokenizer_state_append_m(tkz, "/", 1);
1143 
1144     return data;
1145 }
1146 
1147 /*
1148  * 12.2.5.31 Script data double escape end state
1149  */
1150 static const lxb_char_t *
lxb_html_tokenizer_state_script_data_double_escape_end(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)1151 lxb_html_tokenizer_state_script_data_double_escape_end(
1152                                                       lxb_html_tokenizer_t *tkz,
1153                                                       const lxb_char_t *data,
1154                                                       const lxb_char_t *end)
1155 {
1156     lxb_html_tokenizer_state_begin_set(tkz, data);
1157 
1158     while (data != end) {
1159         switch (*data) {
1160             /*
1161              * U+0009 CHARACTER TABULATION (tab)
1162              * U+000A LINE FEED (LF)
1163              * U+000C FORM FEED (FF)
1164              * U+000D CARRIAGE RETURN (CR)
1165              * U+0020 SPACE
1166              * U+002F SOLIDUS (/)
1167              * U+003E GREATER-THAN SIGN (>)
1168              */
1169             case 0x09:
1170             case 0x0A:
1171             case 0x0C:
1172             case 0x0D:
1173             case 0x20:
1174             case 0x2F:
1175             case 0x3E:
1176                 lxb_html_tokenizer_state_append_data_m(tkz, data);
1177 
1178                 if ((tkz->pos - &tkz->start[tkz->entity_start]) == 6
1179                     && lexbor_str_data_ncasecmp(&tkz->start[tkz->entity_start],
1180                                                 (const lxb_char_t *) "script", 6))
1181                 {
1182                     tkz->state = lxb_html_tokenizer_state_script_data_escaped;
1183                     return data;
1184                 }
1185 
1186                 tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1187 
1188                 return data;
1189 
1190             default:
1191                 if (lexbor_str_res_alpha_character[*data]
1192                     == LEXBOR_STR_RES_SLIP)
1193                 {
1194                     lxb_html_tokenizer_state_append_data_m(tkz, data);
1195 
1196                     tkz->state = lxb_html_tokenizer_state_script_data_double_escaped;
1197                     return data;
1198                 }
1199 
1200                 break;
1201         }
1202 
1203         data++;
1204     }
1205 
1206     lxb_html_tokenizer_state_append_data_m(tkz, data);
1207 
1208     return data;
1209 }
1210