1 /*
2 * Copyright (C) 2018-2020 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
7 #include "lexbor/html/token.h"
8 #include "lexbor/html/tokenizer.h"
9
10 #define LEXBOR_STR_RES_MAP_LOWERCASE
11 #define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
12 #define LEXBOR_STR_RES_MAP_HEX
13 #define LEXBOR_STR_RES_MAP_NUM
14 #include "lexbor/core/str_res.h"
15
16 #include "lexbor/dom/interfaces/document_type.h"
17
18
19 const lxb_tag_data_t *
20 lxb_tag_append_lower(lexbor_hash_t *hash,
21 const lxb_char_t *name, size_t length);
22
23
24 lxb_html_token_t *
lxb_html_token_create(lexbor_dobject_t * dobj)25 lxb_html_token_create(lexbor_dobject_t *dobj)
26 {
27 return lexbor_dobject_calloc(dobj);
28 }
29
30 lxb_html_token_t *
lxb_html_token_destroy(lxb_html_token_t * token,lexbor_dobject_t * dobj)31 lxb_html_token_destroy(lxb_html_token_t *token, lexbor_dobject_t *dobj)
32 {
33 return lexbor_dobject_free(dobj, token);
34 }
35
36 lxb_html_token_attr_t *
lxb_html_token_attr_append(lxb_html_token_t * token,lexbor_dobject_t * dobj)37 lxb_html_token_attr_append(lxb_html_token_t *token, lexbor_dobject_t *dobj)
38 {
39 lxb_html_token_attr_t *attr = lxb_html_token_attr_create(dobj);
40 if (attr == NULL) {
41 return NULL;
42 }
43
44 if (token->attr_last == NULL) {
45 token->attr_first = attr;
46 token->attr_last = attr;
47
48 return attr;
49 }
50
51 token->attr_last->next = attr;
52 attr->prev = token->attr_last;
53
54 token->attr_last = attr;
55
56 return attr;
57 }
58
59 void
lxb_html_token_attr_remove(lxb_html_token_t * token,lxb_html_token_attr_t * attr)60 lxb_html_token_attr_remove(lxb_html_token_t *token, lxb_html_token_attr_t *attr)
61 {
62 if (token->attr_first == attr) {
63 token->attr_first = attr->next;
64 }
65
66 if (token->attr_last == attr) {
67 token->attr_last = attr->prev;
68 }
69
70 if (attr->next != NULL) {
71 attr->next->prev = attr->prev;
72 }
73
74 if (attr->prev != NULL) {
75 attr->prev->next = attr->next;
76 }
77
78 attr->next = NULL;
79 attr->prev = NULL;
80 }
81
82 void
lxb_html_token_attr_delete(lxb_html_token_t * token,lxb_html_token_attr_t * attr,lexbor_dobject_t * dobj)83 lxb_html_token_attr_delete(lxb_html_token_t *token,
84 lxb_html_token_attr_t *attr, lexbor_dobject_t *dobj)
85 {
86 lxb_html_token_attr_remove(token, attr);
87 lxb_html_token_attr_destroy(attr, dobj);
88 }
89
90 lxb_status_t
lxb_html_token_make_text(lxb_html_token_t * token,lexbor_str_t * str,lexbor_mraw_t * mraw)91 lxb_html_token_make_text(lxb_html_token_t *token, lexbor_str_t *str,
92 lexbor_mraw_t *mraw)
93 {
94 size_t len = token->text_end - token->text_start;
95
96 (void) lexbor_str_init(str, mraw, len);
97 if (str->data == NULL) {
98 return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
99 }
100
101 memcpy(str->data, token->text_start, len);
102
103 str->data[len] = 0x00;
104 str->length = len;
105
106 return LXB_STATUS_OK;
107 }
108
109 lxb_status_t
lxb_html_token_make_text_drop_null(lxb_html_token_t * token,lexbor_str_t * str,lexbor_mraw_t * mraw)110 lxb_html_token_make_text_drop_null(lxb_html_token_t *token, lexbor_str_t *str,
111 lexbor_mraw_t *mraw)
112 {
113 lxb_char_t *p, c;
114 const lxb_char_t *data = token->text_start;
115 const lxb_char_t *end = token->text_end;
116
117 size_t len = (end - data) - token->null_count;
118
119 (void) lexbor_str_init(str, mraw, len);
120 if (str->data == NULL) {
121 return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
122 }
123
124 p = str->data;
125
126 while (data < end) {
127 c = *data++;
128
129 if (c != 0x00) {
130 *p++ = c;
131 }
132 }
133
134 str->data[len] = 0x00;
135 str->length = len;
136
137 return LXB_STATUS_OK;
138 }
139
140 lxb_status_t
lxb_html_token_make_text_replace_null(lxb_html_token_t * token,lexbor_str_t * str,lexbor_mraw_t * mraw)141 lxb_html_token_make_text_replace_null(lxb_html_token_t *token,
142 lexbor_str_t *str, lexbor_mraw_t *mraw)
143 {
144 lxb_char_t *p, c;
145 const lxb_char_t *data = token->text_start;
146 const lxb_char_t *end = token->text_end;
147
148 static const unsigned rep_len = sizeof(lexbor_str_res_ansi_replacement_character) - 1;
149
150 size_t len = (end - data) + (token->null_count * rep_len) - token->null_count;
151
152 (void) lexbor_str_init(str, mraw, len);
153 if (str->data == NULL) {
154 return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
155 }
156
157 p = str->data;
158
159 while (data < end) {
160 c = *data++;
161
162 if (c == 0x00) {
163 memcpy(p, lexbor_str_res_ansi_replacement_character, rep_len);
164 p += rep_len;
165
166 continue;
167 }
168
169 *p++ = c;
170 }
171
172 str->data[len] = 0x00;
173 str->length = len;
174
175 return LXB_STATUS_OK;
176 }
177
178 lxb_status_t
lxb_html_token_data_skip_ws_begin(lxb_html_token_t * token)179 lxb_html_token_data_skip_ws_begin(lxb_html_token_t *token)
180 {
181 const lxb_char_t *data = token->text_start;
182 const lxb_char_t *end = token->text_end;
183
184 while (data < end) {
185 switch (*data) {
186 /*
187 * U+0009 CHARACTER TABULATION (tab)
188 * U+000A LINE FEED (LF)
189 * U+000C FORM FEED (FF)
190 * U+0020 SPACE
191 */
192 case 0x09:
193 case 0x0A:
194 case 0x0D:
195 case 0x20:
196 break;
197
198 default:
199 token->begin += data - token->text_start;
200 token->text_start = data;
201
202 return LXB_STATUS_OK;
203 }
204
205 data++;
206 }
207
208 token->begin += data - token->text_start;
209 token->text_start = data;
210
211 return LXB_STATUS_OK;
212 }
213
214 lxb_status_t
lxb_html_token_data_skip_one_newline_begin(lxb_html_token_t * token)215 lxb_html_token_data_skip_one_newline_begin(lxb_html_token_t *token)
216 {
217 const lxb_char_t *data = token->text_start;
218 const lxb_char_t *end = token->text_end;
219
220 if (data < end) {
221 /* U+000A LINE FEED (LF) */
222 if (*data == 0x0A) {
223 token->begin++;
224 token->text_start++;
225 }
226 }
227
228 return LXB_STATUS_OK;
229 }
230
231 lxb_status_t
lxb_html_token_data_split_ws_begin(lxb_html_token_t * token,lxb_html_token_t * ws_token)232 lxb_html_token_data_split_ws_begin(lxb_html_token_t *token,
233 lxb_html_token_t *ws_token)
234 {
235 *ws_token = *token;
236
237 lxb_status_t status = lxb_html_token_data_skip_ws_begin(token);
238 if (status != LXB_STATUS_OK) {
239 return status;
240 }
241
242 if (token->text_start == token->text_end) {
243 return LXB_STATUS_OK;
244 }
245
246 if (token->text_start == ws_token->text_start) {
247 memset(ws_token, 0, sizeof(lxb_html_token_t));
248
249 return LXB_STATUS_OK;
250 }
251
252 ws_token->end = token->begin;
253 ws_token->text_end = token->text_start;
254
255 return LXB_STATUS_OK;
256 }
257
258 lxb_status_t
lxb_html_token_doctype_parse(lxb_html_token_t * token,lxb_dom_document_type_t * doc_type)259 lxb_html_token_doctype_parse(lxb_html_token_t *token,
260 lxb_dom_document_type_t *doc_type)
261 {
262 lxb_html_token_attr_t *attr;
263 lexbor_mraw_t *mraw = doc_type->node.owner_document->mraw;
264
265 /* Set all to empty string if attr not exist */
266 if (token->attr_first == NULL) {
267 goto set_name_pub_sys_empty;
268 }
269
270 /* Name */
271 attr = token->attr_first;
272
273 doc_type->name = attr->name->attr_id;
274
275 /* PUBLIC or SYSTEM */
276 attr = attr->next;
277 if (attr == NULL) {
278 goto set_pub_sys_empty;
279 }
280
281 if (attr->name->attr_id == LXB_DOM_ATTR_PUBLIC) {
282 (void) lexbor_str_init(&doc_type->public_id, mraw, attr->value_size);
283 if (doc_type->public_id.data == NULL) {
284 return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
285 }
286
287 if (attr->value_begin == NULL) {
288 return LXB_STATUS_OK;
289 }
290
291 (void) lexbor_str_append(&doc_type->public_id, mraw, attr->value,
292 attr->value_size);
293 }
294 else if (attr->name->attr_id == LXB_DOM_ATTR_SYSTEM) {
295 (void) lexbor_str_init(&doc_type->system_id, mraw, attr->value_size);
296 if (doc_type->system_id.data == NULL) {
297 return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
298 }
299
300 if (attr->value_begin == NULL) {
301 return LXB_STATUS_OK;
302 }
303
304 (void) lexbor_str_append(&doc_type->system_id, mraw, attr->value,
305 attr->value_size);
306
307 return LXB_STATUS_OK;
308 }
309 else {
310 goto set_pub_sys_empty;
311 }
312
313 /* SUSTEM */
314 attr = attr->next;
315 if (attr == NULL) {
316 goto set_sys_empty;
317 }
318
319 (void) lexbor_str_init(&doc_type->system_id, mraw, attr->value_size);
320 if (doc_type->system_id.data == NULL) {
321 return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
322 }
323
324 (void) lexbor_str_append(&doc_type->system_id, mraw, attr->value,
325 attr->value_size);
326
327 return LXB_STATUS_OK;
328
329 set_name_pub_sys_empty:
330
331 doc_type->name = LXB_DOM_ATTR__UNDEF;
332
333 set_pub_sys_empty:
334
335 (void) lexbor_str_init(&doc_type->public_id, mraw, 0);
336 if (doc_type->public_id.data == NULL) {
337 return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
338 }
339
340 set_sys_empty:
341
342 (void) lexbor_str_init(&doc_type->system_id, mraw, 0);
343 if (doc_type->system_id.data == NULL) {
344 return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
345 }
346
347 return LXB_HTML_STATUS_OK;
348 }
349
350 lxb_html_token_attr_t *
lxb_html_token_find_attr(lxb_html_tokenizer_t * tkz,lxb_html_token_t * token,const lxb_char_t * name,size_t name_len)351 lxb_html_token_find_attr(lxb_html_tokenizer_t *tkz, lxb_html_token_t *token,
352 const lxb_char_t *name, size_t name_len)
353 {
354 const lxb_dom_attr_data_t *data;
355 lxb_html_token_attr_t *attr = token->attr_first;
356
357 data = lxb_dom_attr_data_by_local_name(tkz->attrs, name, name_len);
358 if (data == NULL) {
359 return NULL;
360 }
361
362 while (attr != NULL) {
363 if (attr->name->attr_id == data->attr_id) {
364 return attr;
365 }
366
367 attr = attr->next;
368 }
369
370 return NULL;
371 }
372
373 /*
374 * No inline functions for ABI.
375 */
376 void
lxb_html_token_clean_noi(lxb_html_token_t * token)377 lxb_html_token_clean_noi(lxb_html_token_t *token)
378 {
379 lxb_html_token_clean(token);
380 }
381
382 lxb_html_token_t *
lxb_html_token_create_eof_noi(lexbor_dobject_t * dobj)383 lxb_html_token_create_eof_noi(lexbor_dobject_t *dobj)
384 {
385 return lxb_html_token_create_eof(dobj);
386 }
387