1 /*
2  * Copyright (C) 2018-2020 Alexander Borisov
3  *
4  * Author: Alexander Borisov <borisov@lexbor.com>
5  */
6 
7 #include "lexbor/html/tokenizer/state_rcdata.h"
8 #include "lexbor/html/tokenizer/state.h"
9 
10 #define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
11 #define LEXBOR_STR_RES_ALPHA_CHARACTER
12 #include "lexbor/core/str_res.h"
13 
14 
15 const lxb_tag_data_t *
16 lxb_tag_append_lower(lexbor_hash_t *hash,
17                      const lxb_char_t *name, size_t length);
18 
19 
20 static const lxb_char_t *
21 lxb_html_tokenizer_state_rcdata(lxb_html_tokenizer_t *tkz,
22                                 const lxb_char_t *data,
23                                 const lxb_char_t *end);
24 
25 static const lxb_char_t *
26 lxb_html_tokenizer_state_rcdata_less_than_sign(lxb_html_tokenizer_t *tkz,
27                                                const lxb_char_t *data,
28                                                const lxb_char_t *end);
29 
30 static const lxb_char_t *
31 lxb_html_tokenizer_state_rcdata_end_tag_open(lxb_html_tokenizer_t *tkz,
32                                              const lxb_char_t *data,
33                                              const lxb_char_t *end);
34 
35 static const lxb_char_t *
36 lxb_html_tokenizer_state_rcdata_end_tag_name(lxb_html_tokenizer_t *tkz,
37                                              const lxb_char_t *data,
38                                              const lxb_char_t *end);
39 
40 
41 /*
42  * Helper function. No in the specification. For 12.2.5.2 RCDATA state
43  */
44 const lxb_char_t *
lxb_html_tokenizer_state_rcdata_before(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)45 lxb_html_tokenizer_state_rcdata_before(lxb_html_tokenizer_t *tkz,
46                                        const lxb_char_t *data,
47                                        const lxb_char_t *end)
48 {
49     if (tkz->is_eof == false) {
50         lxb_html_tokenizer_state_token_set_begin(tkz, data);
51     }
52 
53     tkz->state = lxb_html_tokenizer_state_rcdata;
54 
55     return data;
56 }
57 
58 /*
59  * 12.2.5.2 RCDATA state
60  */
61 static const lxb_char_t *
lxb_html_tokenizer_state_rcdata(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)62 lxb_html_tokenizer_state_rcdata(lxb_html_tokenizer_t *tkz,
63                                 const lxb_char_t *data,
64                                 const lxb_char_t *end)
65 {
66     lxb_html_tokenizer_state_begin_set(tkz, data);
67 
68     while (data != end) {
69         switch (*data) {
70             /* U+003C LESS-THAN SIGN (<) */
71             case 0x3C:
72                 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
73                 lxb_html_tokenizer_state_token_set_end(tkz, data);
74 
75                 tkz->state = lxb_html_tokenizer_state_rcdata_less_than_sign;
76 
77                 return (data + 1);
78 
79             /* U+0026 AMPERSAND (&) */
80             case 0x26:
81                 lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
82 
83                 tkz->state = lxb_html_tokenizer_state_char_ref;
84                 tkz->state_return = lxb_html_tokenizer_state_rcdata;
85 
86                 return data + 1;
87 
88             /* U+000D CARRIAGE RETURN (CR) */
89             case 0x0D:
90                 if (++data >= end) {
91                     lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
92 
93                     tkz->state = lxb_html_tokenizer_state_cr;
94                     tkz->state_return = lxb_html_tokenizer_state_rcdata;
95 
96                     return data;
97                 }
98 
99                 lxb_html_tokenizer_state_append_data_m(tkz, data);
100                 tkz->pos[-1] = 0x0A;
101 
102                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
103 
104                 if (*data != 0x0A) {
105                     lxb_html_tokenizer_state_begin_set(tkz, data);
106                     data--;
107                 }
108 
109                 break;
110 
111             /*
112              * U+0000 NULL
113              * EOF
114              */
115             case 0x00:
116                 lxb_html_tokenizer_state_append_data_m(tkz, data);
117 
118                 if (tkz->is_eof) {
119                     if (tkz->token->begin != NULL) {
120                         lxb_html_tokenizer_state_token_set_end_oef(tkz);
121                     }
122 
123                     tkz->token->tag_id = LXB_TAG__TEXT;
124 
125                     lxb_html_tokenizer_state_set_text(tkz);
126                     lxb_html_tokenizer_state_token_done_m(tkz, end);
127 
128                     return end;
129                 }
130 
131                 lxb_html_tokenizer_state_begin_set(tkz, data + 1);
132                 lxb_html_tokenizer_state_append_replace_m(tkz);
133 
134                 lxb_html_tokenizer_error_add(tkz->parse_errors, data,
135                                              LXB_HTML_TOKENIZER_ERROR_UNNUCH);
136                 break;
137 
138             default:
139                 break;
140         }
141 
142         data++;
143     }
144 
145     lxb_html_tokenizer_state_append_data_m(tkz, data);
146 
147     return data;
148 }
149 
150 /*
151  * 12.2.5.9 RCDATA less-than sign state
152  */
153 static const lxb_char_t *
lxb_html_tokenizer_state_rcdata_less_than_sign(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)154 lxb_html_tokenizer_state_rcdata_less_than_sign(lxb_html_tokenizer_t *tkz,
155                                                const lxb_char_t *data,
156                                                const lxb_char_t *end)
157 {
158     /* U+002F SOLIDUS (/) */
159     if (*data == 0x2F) {
160         tkz->state = lxb_html_tokenizer_state_rcdata_end_tag_open;
161 
162         return (data + 1);
163     }
164 
165     tkz->state = lxb_html_tokenizer_state_rcdata;
166 
167     return data;
168 }
169 
170 /*
171  * 12.2.5.10 RCDATA end tag open state
172  */
173 static const lxb_char_t *
lxb_html_tokenizer_state_rcdata_end_tag_open(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)174 lxb_html_tokenizer_state_rcdata_end_tag_open(lxb_html_tokenizer_t *tkz,
175                                              const lxb_char_t *data,
176                                              const lxb_char_t *end)
177 {
178     if (lexbor_str_res_alpha_character[*data] != LEXBOR_STR_RES_SLIP) {
179         tkz->temp = data;
180         tkz->entity_start = (tkz->pos - 1) - tkz->start;
181 
182         tkz->state = lxb_html_tokenizer_state_rcdata_end_tag_name;
183     }
184     else {
185         tkz->state = lxb_html_tokenizer_state_rcdata;
186     }
187 
188     lxb_html_tokenizer_state_append_m(tkz, "/", 1);
189 
190     return data;
191 }
192 
193 /*
194  * 12.2.5.11 RCDATA end tag name state
195  */
196 static const lxb_char_t *
lxb_html_tokenizer_state_rcdata_end_tag_name(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,const lxb_char_t * end)197 lxb_html_tokenizer_state_rcdata_end_tag_name(lxb_html_tokenizer_t *tkz,
198                                              const lxb_char_t *data,
199                                              const lxb_char_t *end)
200 {
201     lxb_html_tokenizer_state_begin_set(tkz, data);
202 
203     while (data != end) {
204         switch (*data) {
205             /*
206              * U+0009 CHARACTER TABULATION (tab)
207              * U+000A LINE FEED (LF)
208              * U+000C FORM FEED (FF)
209              * U+000D CARRIAGE RETURN (CR)
210              * U+0020 SPACE
211              */
212             case 0x09:
213             case 0x0A:
214             case 0x0C:
215             case 0x0D:
216             case 0x20:
217                 lxb_html_tokenizer_state_append_data_m(tkz, data);
218                 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
219                                                    tkz->pos);
220 
221                 if (tkz->tmp_tag_id != tkz->token->tag_id) {
222                     goto anything_else;
223                 }
224 
225                 tkz->state = lxb_html_tokenizer_state_before_attribute_name;
226                 goto done;
227 
228             /* U+002F SOLIDUS (/) */
229             case 0x2F:
230                 lxb_html_tokenizer_state_append_data_m(tkz, data);
231                 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
232                                                    tkz->pos);
233 
234                 if (tkz->tmp_tag_id != tkz->token->tag_id) {
235                     goto anything_else;
236                 }
237 
238                 tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
239                 goto done;
240 
241             /* U+003E GREATER-THAN SIGN (>) */
242             case 0x3E:
243                 lxb_html_tokenizer_state_append_data_m(tkz, data);
244                 lxb_html_tokenizer_state_set_tag_m(tkz, &tkz->start[tkz->entity_start] + 2,
245                                                    tkz->pos);
246 
247                 if (tkz->tmp_tag_id != tkz->token->tag_id) {
248                     goto anything_else;
249                 }
250 
251                 tkz->state = lxb_html_tokenizer_state_data_before;
252 
253                 /* Emit text token */
254                 tkz->token->tag_id = LXB_TAG__TEXT;
255                 tkz->pos = &tkz->start[tkz->entity_start];
256 
257                 lxb_html_tokenizer_state_set_text(tkz);
258                 lxb_html_tokenizer_state_token_done_m(tkz, end);
259 
260                 /* Init close token */
261                 tkz->token->tag_id = tkz->tmp_tag_id;
262                 tkz->token->begin = tkz->temp;
263                 tkz->token->end = data;
264                 tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
265 
266                 /* Emit close token */
267                 lxb_html_tokenizer_state_token_done_m(tkz, end);
268 
269                 return (data + 1);
270 
271             default:
272                 if (lexbor_str_res_alpha_character[*data]
273                     == LEXBOR_STR_RES_SLIP)
274                 {
275                     lxb_html_tokenizer_state_append_data_m(tkz, data);
276 
277                     goto anything_else;
278                 }
279 
280                 break;
281         }
282 
283         data++;
284     }
285 
286     lxb_html_tokenizer_state_append_data_m(tkz, data);
287 
288     return data;
289 
290 anything_else:
291 
292     tkz->state = lxb_html_tokenizer_state_rcdata;
293 
294     return data;
295 
296 done:
297 
298     /* Emit text token */
299     tkz->token->tag_id = LXB_TAG__TEXT;
300     tkz->pos = &tkz->start[tkz->entity_start];
301 
302     lxb_html_tokenizer_state_set_text(tkz);
303     lxb_html_tokenizer_state_token_done_m(tkz, end);
304 
305     /* Init close token */
306     tkz->token->tag_id = tkz->tmp_tag_id;
307     tkz->token->end = data;
308     tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
309 
310     return (data + 1);
311 }
312