1 /*
2 * Copyright (C) 2018-2020 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
7 #include "lexbor/html/tokenizer/state_rcdata.h"
8 #include "lexbor/html/tokenizer/state.h"
9
10 #define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
11 #define LEXBOR_STR_RES_ALPHA_CHARACTER
12 #include "lexbor/core/str_res.h"
13
14
15 const lxb_tag_data_t *
16 lxb_tag_append_lower(lexbor_hash_t *hash,
17 const lxb_char_t *name, size_t length);
18
19
20 static const lxb_char_t *
21 lxb_html_tokenizer_state_rcdata(lxb_html_tokenizer_t *tkz,
22 const lxb_char_t *data,
23 const lxb_char_t *end);
24
25 static const lxb_char_t *
26 lxb_html_tokenizer_state_rcdata_less_than_sign(lxb_html_tokenizer_t *tkz,
27 const lxb_char_t *data,
28 const lxb_char_t *end);
29
30 static const lxb_char_t *
31 lxb_html_tokenizer_state_rcdata_end_tag_open(lxb_html_tokenizer_t *tkz,
32 const lxb_char_t *data,
33 const lxb_char_t *end);
34
35 static const lxb_char_t *
36 lxb_html_tokenizer_state_rcdata_end_tag_name(lxb_html_tokenizer_t *tkz,
37 const lxb_char_t *data,
38 const lxb_char_t *end);
39
40
41 /*
42 * Helper function. No in the specification. For 12.2.5.2 RCDATA state
43 */
44 const lxb_char_t *
lxb_html_tokenizer_state_rcdata_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)45 lxb_html_tokenizer_state_rcdata_before(lxb_html_tokenizer_t *tkz,
46 const lxb_char_t *data,
47 const lxb_char_t *end)
48 {
49 if (tkz->is_eof == false) {
50 lxb_html_tokenizer_state_token_set_begin(tkz, data);
51 }
52
53 tkz->state = lxb_html_tokenizer_state_rcdata;
54
55 return data;
56 }
57
58 /*
59 * 12.2.5.2 RCDATA state
60 */
61 static const lxb_char_t *
lxb_html_tokenizer_state_rcdata(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)62 lxb_html_tokenizer_state_rcdata(lxb_html_tokenizer_t *tkz,
63 const lxb_char_t *data,
64 const lxb_char_t *end)
65 {
66 lxb_html_tokenizer_state_begin_set(tkz, data);
67
68 while (data != end) {
69 switch (*data) {
70 /* U+003C LESS-THAN SIGN (<) */
71 case 0x3C:
72 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
73 lxb_html_tokenizer_state_token_set_end(tkz, data);
74
75 tkz->state = lxb_html_tokenizer_state_rcdata_less_than_sign;
76
77 return (data + 1);
78
79 /* U+0026 AMPERSAND (&) */
80 case 0x26:
81 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
82
83 tkz->state = lxb_html_tokenizer_state_char_ref;
84 tkz->state_return = lxb_html_tokenizer_state_rcdata;
85
86 return data + 1;
87
88 /* U+000D CARRIAGE RETURN (CR) */
89 case 0x0D:
90 if (++data >= end) {
91 lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
92
93 tkz->state = lxb_html_tokenizer_state_cr;
94 tkz->state_return = lxb_html_tokenizer_state_rcdata;
95
96 return data;
97 }
98
99 lxb_html_tokenizer_state_append_data_m(tkz, data);
100 tkz->pos[-1] = 0x0A;
101
102 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
103
104 if (*data != 0x0A) {
105 lxb_html_tokenizer_state_begin_set(tkz, data);
106 data--;
107 }
108
109 break;
110
111 /*
112 * U+0000 NULL
113 * EOF
114 */
115 case 0x00:
116 lxb_html_tokenizer_state_append_data_m(tkz, data);
117
118 if (tkz->is_eof) {
119 if (tkz->token->begin != NULL) {
120 lxb_html_tokenizer_state_token_set_end_oef(tkz);
121 }
122
123 tkz->token->tag_id = LXB_TAG__TEXT;
124
125 lxb_html_tokenizer_state_set_text(tkz);
126 lxb_html_tokenizer_state_token_done_m(tkz, end);
127
128 return end;
129 }
130
131 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
132 lxb_html_tokenizer_state_append_replace_m(tkz);
133
134 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
135 LXB_HTML_TOKENIZER_ERROR_UNNUCH);
136 break;
137
138 default:
139 break;
140 }
141
142 data++;
143 }
144
145 lxb_html_tokenizer_state_append_data_m(tkz, data);
146
147 return data;
148 }
149
150 /*
151 * 12.2.5.9 RCDATA less-than sign state
152 */
153 static const lxb_char_t *
lxb_html_tokenizer_state_rcdata_less_than_sign(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)154 lxb_html_tokenizer_state_rcdata_less_than_sign(lxb_html_tokenizer_t *tkz,
155 const lxb_char_t *data,
156 const lxb_char_t *end)
157 {
158 /* U+002F SOLIDUS (/) */
159 if (*data == 0x2F) {
160 tkz->state = lxb_html_tokenizer_state_rcdata_end_tag_open;
161
162 return (data + 1);
163 }
164
165 tkz->state = lxb_html_tokenizer_state_rcdata;
166
167 return data;
168 }
169
170 /*
171 * 12.2.5.10 RCDATA end tag open state
172 */
173 static const lxb_char_t *
lxb_html_tokenizer_state_rcdata_end_tag_open(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)174 lxb_html_tokenizer_state_rcdata_end_tag_open(lxb_html_tokenizer_t *tkz,
175 const lxb_char_t *data,
176 const lxb_char_t *end)
177 {
178 if (lexbor_str_res_alpha_character[*data] != LEXBOR_STR_RES_SLIP) {
179 tkz->temp = data;
180 tkz->entity_start = (tkz->pos - 1) - tkz->start;
181
182 tkz->state = lxb_html_tokenizer_state_rcdata_end_tag_name;
183 }
184 else {
185 tkz->state = lxb_html_tokenizer_state_rcdata;
186 }
187
188 lxb_html_tokenizer_state_append_m(tkz, "/", 1);
189
190 return data;
191 }
192
193 /*
194 * 12.2.5.11 RCDATA end tag name state
195 */
196 static const lxb_char_t *
lxb_html_tokenizer_state_rcdata_end_tag_name(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)197 lxb_html_tokenizer_state_rcdata_end_tag_name(lxb_html_tokenizer_t *tkz,
198 const lxb_char_t *data,
199 const lxb_char_t *end)
200 {
201 lxb_html_tokenizer_state_begin_set(tkz, data);
202
203 while (data != end) {
204 switch (*data) {
205 /*
206 * U+0009 CHARACTER TABULATION (tab)
207 * U+000A LINE FEED (LF)
208 * U+000C FORM FEED (FF)
209 * U+000D CARRIAGE RETURN (CR)
210 * U+0020 SPACE
211 */
212 case 0x09:
213 case 0x0A:
214 case 0x0C:
215 case 0x0D:
216 case 0x20:
217 lxb_html_tokenizer_state_append_data_m(tkz, data);
218 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
219 tkz->pos);
220
221 if (tkz->tmp_tag_id != tkz->token->tag_id) {
222 goto anything_else;
223 }
224
225 tkz->state = lxb_html_tokenizer_state_before_attribute_name;
226 goto done;
227
228 /* U+002F SOLIDUS (/) */
229 case 0x2F:
230 lxb_html_tokenizer_state_append_data_m(tkz, data);
231 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
232 tkz->pos);
233
234 if (tkz->tmp_tag_id != tkz->token->tag_id) {
235 goto anything_else;
236 }
237
238 tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
239 goto done;
240
241 /* U+003E GREATER-THAN SIGN (>) */
242 case 0x3E:
243 lxb_html_tokenizer_state_append_data_m(tkz, data);
244 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
245 tkz->pos);
246
247 if (tkz->tmp_tag_id != tkz->token->tag_id) {
248 goto anything_else;
249 }
250
251 tkz->state = lxb_html_tokenizer_state_data_before;
252
253 /* Emit text token */
254 tkz->token->tag_id = LXB_TAG__TEXT;
255 tkz->pos = &tkz->start[tkz->entity_start];
256
257 lxb_html_tokenizer_state_set_text(tkz);
258 lxb_html_tokenizer_state_token_done_m(tkz, end);
259
260 /* Init close token */
261 tkz->token->tag_id = tkz->tmp_tag_id;
262 tkz->token->begin = tkz->temp;
263 tkz->token->end = data;
264 tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
265
266 /* Emit close token */
267 lxb_html_tokenizer_state_token_done_m(tkz, end);
268
269 return (data + 1);
270
271 default:
272 if (lexbor_str_res_alpha_character[*data]
273 == LEXBOR_STR_RES_SLIP)
274 {
275 lxb_html_tokenizer_state_append_data_m(tkz, data);
276
277 goto anything_else;
278 }
279
280 break;
281 }
282
283 data++;
284 }
285
286 lxb_html_tokenizer_state_append_data_m(tkz, data);
287
288 return data;
289
290 anything_else:
291
292 tkz->state = lxb_html_tokenizer_state_rcdata;
293
294 return data;
295
296 done:
297
298 /* Emit text token */
299 tkz->token->tag_id = LXB_TAG__TEXT;
300 tkz->pos = &tkz->start[tkz->entity_start];
301
302 lxb_html_tokenizer_state_set_text(tkz);
303 lxb_html_tokenizer_state_token_done_m(tkz, end);
304
305 /* Init close token */
306 tkz->token->tag_id = tkz->tmp_tag_id;
307 tkz->token->end = data;
308 tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
309
310 return (data + 1);
311 }
312