1 /*
2 * Copyright (C) 2018-2020 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
7 #include "lexbor/html/tokenizer/state_rawtext.h"
8 #include "lexbor/html/tokenizer/state.h"
9
10 #define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
11 #define LEXBOR_STR_RES_ALPHA_CHARACTER
12 #include "lexbor/core/str_res.h"
13
14
15 const lxb_tag_data_t *
16 lxb_tag_append_lower(lexbor_hash_t *hash,
17 const lxb_char_t *name, size_t length);
18
19
20 static const lxb_char_t *
21 lxb_html_tokenizer_state_rawtext(lxb_html_tokenizer_t *tkz,
22 const lxb_char_t *data,
23 const lxb_char_t *end);
24
25 static const lxb_char_t *
26 lxb_html_tokenizer_state_rawtext_less_than_sign(lxb_html_tokenizer_t *tkz,
27 const lxb_char_t *data,
28 const lxb_char_t *end);
29
30 static const lxb_char_t *
31 lxb_html_tokenizer_state_rawtext_end_tag_open(lxb_html_tokenizer_t *tkz,
32 const lxb_char_t *data,
33 const lxb_char_t *end);
34
35 static const lxb_char_t *
36 lxb_html_tokenizer_state_rawtext_end_tag_name(lxb_html_tokenizer_t *tkz,
37 const lxb_char_t *data,
38 const lxb_char_t *end);
39
40
41 /*
42 * Helper function. No in the specification. For 12.2.5.3 RAWTEXT state
43 */
44 const lxb_char_t *
lxb_html_tokenizer_state_rawtext_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)45 lxb_html_tokenizer_state_rawtext_before(lxb_html_tokenizer_t *tkz,
46 const lxb_char_t *data,
47 const lxb_char_t *end)
48 {
49 if (tkz->is_eof == false) {
50 lxb_html_tokenizer_state_token_set_begin(tkz, data);
51 }
52
53 tkz->state = lxb_html_tokenizer_state_rawtext;
54
55 return data;
56 }
57
58 /*
59 * 12.2.5.3 RAWTEXT state
60 */
61 static const lxb_char_t *
lxb_html_tokenizer_state_rawtext(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)62 lxb_html_tokenizer_state_rawtext(lxb_html_tokenizer_t *tkz,
63 const lxb_char_t *data,
64 const lxb_char_t *end)
65 {
66 lxb_html_tokenizer_state_begin_set(tkz, data);
67
68 while (data != end) {
69 switch (*data) {
70 /* U+003C LESS-THAN SIGN (<) */
71 case 0x3C:
72 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
73 lxb_html_tokenizer_state_token_set_end(tkz, data);
74
75 tkz->state = lxb_html_tokenizer_state_rawtext_less_than_sign;
76
77 return (data + 1);
78
79 /* U+000D CARRIAGE RETURN (CR) */
80 case 0x0D:
81 if (++data >= end) {
82 lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
83
84 tkz->state = lxb_html_tokenizer_state_cr;
85 tkz->state_return = lxb_html_tokenizer_state_rawtext;
86
87 return data;
88 }
89
90 lxb_html_tokenizer_state_append_data_m(tkz, data);
91 tkz->pos[-1] = 0x0A;
92
93 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
94
95 if (*data != 0x0A) {
96 lxb_html_tokenizer_state_begin_set(tkz, data);
97 data--;
98 }
99
100 break;
101
102 /*
103 * U+0000 NULL
104 * EOF
105 */
106 case 0x00:
107 lxb_html_tokenizer_state_append_data_m(tkz, data);
108
109 if (tkz->is_eof) {
110 if (tkz->token->begin != NULL) {
111 lxb_html_tokenizer_state_token_set_end_oef(tkz);
112 }
113
114 tkz->token->tag_id = LXB_TAG__TEXT;
115
116 lxb_html_tokenizer_state_set_text(tkz);
117 lxb_html_tokenizer_state_token_done_m(tkz, end);
118
119 return end;
120 }
121
122 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
123 lxb_html_tokenizer_state_append_replace_m(tkz);
124
125 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
126 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
127 break;
128
129 default:
130 break;
131 }
132
133 data++;
134 }
135
136 lxb_html_tokenizer_state_append_data_m(tkz, data);
137
138 return data;
139 }
140
141 /*
142 * 12.2.5.12 RAWTEXT less-than sign state
143 */
144 static const lxb_char_t *
lxb_html_tokenizer_state_rawtext_less_than_sign(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)145 lxb_html_tokenizer_state_rawtext_less_than_sign(lxb_html_tokenizer_t *tkz,
146 const lxb_char_t *data,
147 const lxb_char_t *end)
148 {
149 /* U+002F SOLIDUS (/) */
150 if (*data == 0x2F) {
151 tkz->state = lxb_html_tokenizer_state_rawtext_end_tag_open;
152
153 return (data + 1);
154 }
155
156 tkz->state = lxb_html_tokenizer_state_rawtext;
157
158 return data;
159 }
160
161 /*
162 * 12.2.5.13 RAWTEXT end tag open state
163 */
164 static const lxb_char_t *
lxb_html_tokenizer_state_rawtext_end_tag_open(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)165 lxb_html_tokenizer_state_rawtext_end_tag_open(lxb_html_tokenizer_t *tkz,
166 const lxb_char_t *data,
167 const lxb_char_t *end)
168 {
169 if (lexbor_str_res_alpha_character[*data] != LEXBOR_STR_RES_SLIP) {
170 tkz->temp = data;
171 tkz->entity_start = (tkz->pos - 1) - tkz->start;
172
173 tkz->state = lxb_html_tokenizer_state_rawtext_end_tag_name;
174 }
175 else {
176 tkz->state = lxb_html_tokenizer_state_rawtext;
177 }
178
179 lxb_html_tokenizer_state_append_m(tkz, "/", 1);
180
181 return data;
182 }
183
184 /*
185 * 12.2.5.14 RAWTEXT end tag name state
186 */
187 static const lxb_char_t *
lxb_html_tokenizer_state_rawtext_end_tag_name(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)188 lxb_html_tokenizer_state_rawtext_end_tag_name(lxb_html_tokenizer_t *tkz,
189 const lxb_char_t *data,
190 const lxb_char_t *end)
191 {
192 lxb_html_tokenizer_state_begin_set(tkz, data);
193
194 while (data != end) {
195 switch (*data) {
196 /*
197 * U+0009 CHARACTER TABULATION (tab)
198 * U+000A LINE FEED (LF)
199 * U+000C FORM FEED (FF)
200 * U+000D CARRIAGE RETURN (CR)
201 * U+0020 SPACE
202 */
203 case 0x09:
204 case 0x0A:
205 case 0x0C:
206 case 0x0D:
207 case 0x20:
208 lxb_html_tokenizer_state_append_data_m(tkz, data);
209 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
210 tkz->pos);
211
212 if (tkz->tmp_tag_id != tkz->token->tag_id) {
213 goto anything_else;
214 }
215
216 tkz->state = lxb_html_tokenizer_state_before_attribute_name;
217 goto done;
218
219 /* U+002F SOLIDUS (/) */
220 case 0x2F:
221 lxb_html_tokenizer_state_append_data_m(tkz, data);
222 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
223 tkz->pos);
224
225 if (tkz->tmp_tag_id != tkz->token->tag_id) {
226 goto anything_else;
227 }
228
229 tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
230 goto done;
231
232 /* U+003E GREATER-THAN SIGN (>) */
233 case 0x3E:
234 lxb_html_tokenizer_state_append_data_m(tkz, data);
235 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
236 tkz->pos);
237
238 if (tkz->tmp_tag_id != tkz->token->tag_id) {
239 goto anything_else;
240 }
241
242 tkz->state = lxb_html_tokenizer_state_data_before;
243
244 /* Emit text token */
245 tkz->token->tag_id = LXB_TAG__TEXT;
246 tkz->pos = &tkz->start[tkz->entity_start];
247
248 lxb_html_tokenizer_state_set_text(tkz);
249 lxb_html_tokenizer_state_token_done_m(tkz, end);
250
251 /* Init close token */
252 tkz->token->tag_id = tkz->tmp_tag_id;
253 tkz->token->begin = tkz->temp;
254 tkz->token->end = data;
255 tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
256
257 /* Emit close token */
258 lxb_html_tokenizer_state_token_done_m(tkz, end);
259
260 return (data + 1);
261
262 default:
263 if (lexbor_str_res_alpha_character[*data]
264 == LEXBOR_STR_RES_SLIP)
265 {
266 lxb_html_tokenizer_state_append_data_m(tkz, data);
267
268 goto anything_else;
269 }
270
271 break;
272 }
273
274 data++;
275 }
276
277 lxb_html_tokenizer_state_append_data_m(tkz, data);
278
279 return data;
280
281 anything_else:
282
283 tkz->state = lxb_html_tokenizer_state_rawtext;
284
285 return data;
286
287 done:
288
289 /* Emit text token */
290 tkz->token->tag_id = LXB_TAG__TEXT;
291 tkz->pos = &tkz->start[tkz->entity_start];
292
293 lxb_html_tokenizer_state_set_text(tkz);
294 lxb_html_tokenizer_state_token_done_m(tkz, end);
295
296 /* Init close token */
297 tkz->token->tag_id = tkz->tmp_tag_id;
298 tkz->token->begin = tkz->temp;
299 tkz->token->end = data;
300 tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
301
302 return (data + 1);
303 }
304