1 /*
2  * Copyright (C) 2018-2020 Alexander Borisov
3  *
4  * Author: Alexander Borisov <borisov@lexbor.com>
5  */
6 
7 #include "lexbor/html/tokenizer/state_rawtext.h"
8 #include "lexbor/html/tokenizer/state.h"
9 
10 #define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
11 #define LEXBOR_STR_RES_ALPHA_CHARACTER
12 #include "lexbor/core/str_res.h"
13 
14 
15 const lxb_tag_data_t *
16 lxb_tag_append_lower(lexbor_hash_t *hash,
17                      const lxb_char_t *name, size_t length);
18 
19 
20 static const lxb_char_t *
21 lxb_html_tokenizer_state_rawtext(lxb_html_tokenizer_t *tkz,
22                                 const lxb_char_t *data,
23                                 const lxb_char_t *end);
24 
25 static const lxb_char_t *
26 lxb_html_tokenizer_state_rawtext_less_than_sign(lxb_html_tokenizer_t *tkz,
27                                                const lxb_char_t *data,
28                                                const lxb_char_t *end);
29 
30 static const lxb_char_t *
31 lxb_html_tokenizer_state_rawtext_end_tag_open(lxb_html_tokenizer_t *tkz,
32                                              const lxb_char_t *data,
33                                              const lxb_char_t *end);
34 
35 static const lxb_char_t *
36 lxb_html_tokenizer_state_rawtext_end_tag_name(lxb_html_tokenizer_t *tkz,
37                                              const lxb_char_t *data,
38                                              const lxb_char_t *end);
39 
40 
41 /*
42  * Helper function. No in the specification. For 12.2.5.3 RAWTEXT state
43  */
44 const lxb_char_t *
lxb_html_tokenizer_state_rawtext_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)45 lxb_html_tokenizer_state_rawtext_before(lxb_html_tokenizer_t *tkz,
46                                         const lxb_char_t *data,
47                                         const lxb_char_t *end)
48 {
49     if (tkz->is_eof == false) {
50         lxb_html_tokenizer_state_token_set_begin(tkz, data);
51     }
52 
53     tkz->state = lxb_html_tokenizer_state_rawtext;
54 
55     return data;
56 }
57 
58 /*
59  * 12.2.5.3 RAWTEXT state
60  */
61 static const lxb_char_t *
lxb_html_tokenizer_state_rawtext(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)62 lxb_html_tokenizer_state_rawtext(lxb_html_tokenizer_t *tkz,
63                                  const lxb_char_t *data,
64                                  const lxb_char_t *end)
65 {
66     lxb_html_tokenizer_state_begin_set(tkz, data);
67 
68     while (data != end) {
69         switch (*data) {
70             /* U+003C LESS-THAN SIGN (<) */
71             case 0x3C:
72                 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
73                 lxb_html_tokenizer_state_token_set_end(tkz, data);
74 
75                 tkz->state = lxb_html_tokenizer_state_rawtext_less_than_sign;
76 
77                 return (data + 1);
78 
79             /* U+000D CARRIAGE RETURN (CR) */
80             case 0x0D:
81                 if (++data >= end) {
82                     lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
83 
84                     tkz->state = lxb_html_tokenizer_state_cr;
85                     tkz->state_return = lxb_html_tokenizer_state_rawtext;
86 
87                     return data;
88                 }
89 
90                 lxb_html_tokenizer_state_append_data_m(tkz, data);
91                 tkz->pos[-1] = 0x0A;
92 
93                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
94 
95                 if (*data != 0x0A) {
96                     lxb_html_tokenizer_state_begin_set(tkz, data);
97                     data--;
98                 }
99 
100                 break;
101 
102             /*
103              * U+0000 NULL
104              * EOF
105              */
106             case 0x00:
107                 lxb_html_tokenizer_state_append_data_m(tkz, data);
108 
109                 if (tkz->is_eof) {
110                     if (tkz->token->begin != NULL) {
111                         lxb_html_tokenizer_state_token_set_end_oef(tkz);
112                     }
113 
114                     tkz->token->tag_id = LXB_TAG__TEXT;
115 
116                     lxb_html_tokenizer_state_set_text(tkz);
117                     lxb_html_tokenizer_state_token_done_m(tkz, end);
118 
119                     return end;
120                 }
121 
122                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
123                 lxb_html_tokenizer_state_append_replace_m(tkz);
124 
125                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
126                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
127                 break;
128 
129             default:
130                 break;
131         }
132 
133         data++;
134     }
135 
136     lxb_html_tokenizer_state_append_data_m(tkz, data);
137 
138     return data;
139 }
140 
141 /*
142  * 12.2.5.12 RAWTEXT less-than sign state
143  */
144 static const lxb_char_t *
lxb_html_tokenizer_state_rawtext_less_than_sign(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)145 lxb_html_tokenizer_state_rawtext_less_than_sign(lxb_html_tokenizer_t *tkz,
146                                                 const lxb_char_t *data,
147                                                 const lxb_char_t *end)
148 {
149     /* U+002F SOLIDUS (/) */
150     if (*data == 0x2F) {
151         tkz->state = lxb_html_tokenizer_state_rawtext_end_tag_open;
152 
153         return (data + 1);
154     }
155 
156     tkz->state = lxb_html_tokenizer_state_rawtext;
157 
158     return data;
159 }
160 
161 /*
162  * 12.2.5.13 RAWTEXT end tag open state
163  */
164 static const lxb_char_t *
lxb_html_tokenizer_state_rawtext_end_tag_open(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)165 lxb_html_tokenizer_state_rawtext_end_tag_open(lxb_html_tokenizer_t *tkz,
166                                               const lxb_char_t *data,
167                                               const lxb_char_t *end)
168 {
169     if (lexbor_str_res_alpha_character[*data] != LEXBOR_STR_RES_SLIP) {
170         tkz->temp = data;
171         tkz->entity_start = (tkz->pos - 1) - tkz->start;
172 
173         tkz->state = lxb_html_tokenizer_state_rawtext_end_tag_name;
174     }
175     else {
176         tkz->state = lxb_html_tokenizer_state_rawtext;
177     }
178 
179     lxb_html_tokenizer_state_append_m(tkz, "/", 1);
180 
181     return data;
182 }
183 
184 /*
185  * 12.2.5.14 RAWTEXT end tag name state
186  */
187 static const lxb_char_t *
lxb_html_tokenizer_state_rawtext_end_tag_name(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)188 lxb_html_tokenizer_state_rawtext_end_tag_name(lxb_html_tokenizer_t *tkz,
189                                               const lxb_char_t *data,
190                                               const lxb_char_t *end)
191 {
192     lxb_html_tokenizer_state_begin_set(tkz, data);
193 
194     while (data != end) {
195         switch (*data) {
196             /*
197              * U+0009 CHARACTER TABULATION (tab)
198              * U+000A LINE FEED (LF)
199              * U+000C FORM FEED (FF)
200              * U+000D CARRIAGE RETURN (CR)
201              * U+0020 SPACE
202              */
203             case 0x09:
204             case 0x0A:
205             case 0x0C:
206             case 0x0D:
207             case 0x20:
208                 lxb_html_tokenizer_state_append_data_m(tkz, data);
209                 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
210                                                    tkz->pos);
211 
212                 if (tkz->tmp_tag_id != tkz->token->tag_id) {
213                     goto anything_else;
214                 }
215 
216                 tkz->state = lxb_html_tokenizer_state_before_attribute_name;
217                 goto done;
218 
219             /* U+002F SOLIDUS (/) */
220             case 0x2F:
221                 lxb_html_tokenizer_state_append_data_m(tkz, data);
222                 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
223                                                    tkz->pos);
224 
225                 if (tkz->tmp_tag_id != tkz->token->tag_id) {
226                     goto anything_else;
227                 }
228 
229                 tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
230                 goto done;
231 
232             /* U+003E GREATER-THAN SIGN (>) */
233             case 0x3E:
234                 lxb_html_tokenizer_state_append_data_m(tkz, data);
235                 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
236                                                    tkz->pos);
237 
238                 if (tkz->tmp_tag_id != tkz->token->tag_id) {
239                     goto anything_else;
240                 }
241 
242                 tkz->state = lxb_html_tokenizer_state_data_before;
243 
244                 /* Emit text token */
245                 tkz->token->tag_id = LXB_TAG__TEXT;
246                 tkz->pos = &tkz->start[tkz->entity_start];
247 
248                 lxb_html_tokenizer_state_set_text(tkz);
249                 lxb_html_tokenizer_state_token_done_m(tkz, end);
250 
251                 /* Init close token */
252                 tkz->token->tag_id = tkz->tmp_tag_id;
253                 tkz->token->begin = tkz->temp;
254                 tkz->token->end = data;
255                 tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
256 
257                 /* Emit close token */
258                 lxb_html_tokenizer_state_token_done_m(tkz, end);
259 
260                 return (data + 1);
261 
262             default:
263                 if (lexbor_str_res_alpha_character[*data]
264                     == LEXBOR_STR_RES_SLIP)
265                 {
266                     lxb_html_tokenizer_state_append_data_m(tkz, data);
267 
268                     goto anything_else;
269                 }
270 
271                 break;
272         }
273 
274         data++;
275     }
276 
277     lxb_html_tokenizer_state_append_data_m(tkz, data);
278 
279     return data;
280 
281 anything_else:
282 
283     tkz->state = lxb_html_tokenizer_state_rawtext;
284 
285     return data;
286 
287 done:
288 
289     /* Emit text token */
290     tkz->token->tag_id = LXB_TAG__TEXT;
291     tkz->pos = &tkz->start[tkz->entity_start];
292 
293     lxb_html_tokenizer_state_set_text(tkz);
294     lxb_html_tokenizer_state_token_done_m(tkz, end);
295 
296     /* Init close token */
297     tkz->token->tag_id = tkz->tmp_tag_id;
298     tkz->token->begin = tkz->temp;
299     tkz->token->end = data;
300     tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
301 
302     return (data + 1);
303 }
304