1 /*
2 * Copyright (C) 2018-2020 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
7 #include "lexbor/html/tokenizer.h"
8 #include "lexbor/html/tokenizer/state.h"
9 #include "lexbor/html/tokenizer/state_rcdata.h"
10 #include "lexbor/html/tokenizer/state_rawtext.h"
11 #include "lexbor/html/tokenizer/state_script.h"
12 #include "lexbor/html/tree.h"
13
14 #define LXB_HTML_TAG_RES_DATA
15 #define LXB_HTML_TAG_RES_SHS_DATA
16 #include "lexbor/html/tag_res.h"
17
18
/* Element count of the temporary tag-name/attribute buffer (16 KiB). */
#define LXB_HTML_TKZ_TEMP_SIZE (4 * 4096)
20
21
/*
 * Bit flags stored in the tokenizer's `opt` field.  Each *_SELF flag is set
 * by lxb_html_tokenizer_begin() when the tokenizer had to supply the
 * corresponding object itself.
 */
enum {
    LXB_HTML_TOKENIZER_OPT_UNDEF           = 0,
    LXB_HTML_TOKENIZER_OPT_TAGS_SELF       = 1 << 0,
    LXB_HTML_TOKENIZER_OPT_ATTRS_SELF      = 1 << 1,
    LXB_HTML_TOKENIZER_OPT_ATTRS_MRAW_SELF = 1 << 2
};
28
29
30 const lxb_char_t *lxb_html_tokenizer_eof = (const lxb_char_t *) "\x00";
31
32
33 static lxb_html_token_t *
34 lxb_html_tokenizer_token_done(lxb_html_tokenizer_t *tkz,
35 lxb_html_token_t *token, void *ctx);
36
37
38 lxb_html_tokenizer_t *
lxb_html_tokenizer_create(void)39 lxb_html_tokenizer_create(void)
40 {
41 return lexbor_calloc(1, sizeof(lxb_html_tokenizer_t));
42 }
43
44 lxb_status_t
lxb_html_tokenizer_init(lxb_html_tokenizer_t * tkz)45 lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)
46 {
47 lxb_status_t status;
48
49 if (tkz == NULL) {
50 return LXB_STATUS_ERROR_OBJECT_IS_NULL;
51 }
52
53 /* mraw for templary strings or structures */
54 tkz->mraw = lexbor_mraw_create();
55 status = lexbor_mraw_init(tkz->mraw, 1024);
56 if (status != LXB_STATUS_OK) {
57 return status;
58 }
59
60 /* Init Token */
61 tkz->token = NULL;
62
63 tkz->dobj_token = lexbor_dobject_create();
64 status = lexbor_dobject_init(tkz->dobj_token,
65 4096, sizeof(lxb_html_token_t));
66 if (status != LXB_STATUS_OK) {
67 return status;
68 }
69
70 /* Init Token Attributes */
71 tkz->dobj_token_attr = lexbor_dobject_create();
72 status = lexbor_dobject_init(tkz->dobj_token_attr, 4096,
73 sizeof(lxb_html_token_attr_t));
74 if (status != LXB_STATUS_OK) {
75 return status;
76 }
77
78 /* Parse errors */
79 tkz->parse_errors = lexbor_array_obj_create();
80 status = lexbor_array_obj_init(tkz->parse_errors, 16,
81 sizeof(lxb_html_tokenizer_error_t));
82 if (status != LXB_STATUS_OK) {
83 return status;
84 }
85
86 /* Temporary memory for tag name and attributes. */
87 tkz->start = lexbor_malloc(LXB_HTML_TKZ_TEMP_SIZE * sizeof(lxb_char_t));
88 if (tkz->start == NULL) {
89 return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
90 }
91
92 tkz->pos = tkz->start;
93 tkz->end = tkz->start + LXB_HTML_TKZ_TEMP_SIZE;
94 /* current_line & current_column already initialized by calloc (zero-based) */
95
96 tkz->tree = NULL;
97 tkz->tags = NULL;
98 tkz->attrs = NULL;
99 tkz->attrs_mraw = NULL;
100
101 tkz->state = lxb_html_tokenizer_state_data_before;
102 tkz->state_return = NULL;
103
104 tkz->callback_token_done = lxb_html_tokenizer_token_done;
105 tkz->callback_token_ctx = NULL;
106
107 tkz->is_eof = false;
108 tkz->status = LXB_STATUS_OK;
109
110 tkz->base = NULL;
111 tkz->ref_count = 1;
112
113 return LXB_STATUS_OK;
114 }
115
116 lxb_status_t
lxb_html_tokenizer_inherit(lxb_html_tokenizer_t * tkz_to,lxb_html_tokenizer_t * tkz_from)117 lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to,
118 lxb_html_tokenizer_t *tkz_from)
119 {
120 lxb_status_t status;
121
122 tkz_to->tags = tkz_from->tags;
123 tkz_to->attrs = tkz_from->attrs;
124 tkz_to->attrs_mraw = tkz_from->attrs_mraw;
125 tkz_to->mraw = tkz_from->mraw;
126
127 /* Token and Attributes */
128 tkz_to->token = NULL;
129
130 tkz_to->dobj_token = tkz_from->dobj_token;
131 tkz_to->dobj_token_attr = tkz_from->dobj_token_attr;
132
133 /* Parse errors */
134 tkz_to->parse_errors = lexbor_array_obj_create();
135 status = lexbor_array_obj_init(tkz_to->parse_errors, 16,
136 sizeof(lxb_html_tokenizer_error_t));
137 if (status != LXB_STATUS_OK) {
138 return status;
139 }
140
141 tkz_to->state = lxb_html_tokenizer_state_data_before;
142 tkz_to->state_return = NULL;
143
144 tkz_to->callback_token_done = lxb_html_tokenizer_token_done;
145 tkz_to->callback_token_ctx = NULL;
146
147 tkz_to->is_eof = false;
148 tkz_to->status = LXB_STATUS_OK;
149
150 tkz_to->base = tkz_from;
151 tkz_to->ref_count = 1;
152
153 tkz_to->start = tkz_from->start;
154 tkz_to->end = tkz_from->end;
155 tkz_to->pos = tkz_to->start;
156 tkz_to->current_line = tkz_from->current_line;
157 tkz_to->current_column = tkz_from->current_column;
158
159 return LXB_STATUS_OK;
160 }
161
162 lxb_html_tokenizer_t *
lxb_html_tokenizer_ref(lxb_html_tokenizer_t * tkz)163 lxb_html_tokenizer_ref(lxb_html_tokenizer_t *tkz)
164 {
165 if (tkz == NULL) {
166 return NULL;
167 }
168
169 if (tkz->base != NULL) {
170 return lxb_html_tokenizer_ref(tkz->base);
171 }
172
173 tkz->ref_count++;
174
175 return tkz;
176 }
177
178 lxb_html_tokenizer_t *
lxb_html_tokenizer_unref(lxb_html_tokenizer_t * tkz)179 lxb_html_tokenizer_unref(lxb_html_tokenizer_t *tkz)
180 {
181 if (tkz == NULL || tkz->ref_count == 0) {
182 return NULL;
183 }
184
185 if (tkz->base != NULL) {
186 tkz->base = lxb_html_tokenizer_unref(tkz->base);
187 }
188
189 tkz->ref_count--;
190
191 if (tkz->ref_count == 0) {
192 lxb_html_tokenizer_destroy(tkz);
193 }
194
195 return NULL;
196 }
197
198 void
lxb_html_tokenizer_clean(lxb_html_tokenizer_t * tkz)199 lxb_html_tokenizer_clean(lxb_html_tokenizer_t *tkz)
200 {
201 tkz->tree = NULL;
202
203 tkz->state = lxb_html_tokenizer_state_data_before;
204 tkz->state_return = NULL;
205
206 tkz->is_eof = false;
207 tkz->status = LXB_STATUS_OK;
208
209 tkz->pos = tkz->start;
210
211 lexbor_mraw_clean(tkz->mraw);
212 lexbor_dobject_clean(tkz->dobj_token);
213 lexbor_dobject_clean(tkz->dobj_token_attr);
214
215 lexbor_array_obj_clean(tkz->parse_errors);
216 }
217
218 lxb_html_tokenizer_t *
lxb_html_tokenizer_destroy(lxb_html_tokenizer_t * tkz)219 lxb_html_tokenizer_destroy(lxb_html_tokenizer_t *tkz)
220 {
221 if (tkz == NULL) {
222 return NULL;
223 }
224
225 if (tkz->base == NULL) {
226 if (tkz->opt & LXB_HTML_TOKENIZER_OPT_TAGS_SELF) {
227 lxb_html_tokenizer_tags_destroy(tkz);
228 }
229
230 if (tkz->opt & LXB_HTML_TOKENIZER_OPT_ATTRS_SELF) {
231 lxb_html_tokenizer_attrs_destroy(tkz);
232 }
233
234 lexbor_mraw_destroy(tkz->mraw, true);
235 lexbor_dobject_destroy(tkz->dobj_token, true);
236 lexbor_dobject_destroy(tkz->dobj_token_attr, true);
237 lexbor_free(tkz->start);
238 }
239
240 tkz->parse_errors = lexbor_array_obj_destroy(tkz->parse_errors, true);
241
242 return lexbor_free(tkz);
243 }
244
245 lxb_status_t
lxb_html_tokenizer_tags_make(lxb_html_tokenizer_t * tkz,size_t table_size)246 lxb_html_tokenizer_tags_make(lxb_html_tokenizer_t *tkz, size_t table_size)
247 {
248 tkz->tags = lexbor_hash_create();
249 return lexbor_hash_init(tkz->tags, table_size, sizeof(lxb_tag_data_t));
250 }
251
252 void
lxb_html_tokenizer_tags_destroy(lxb_html_tokenizer_t * tkz)253 lxb_html_tokenizer_tags_destroy(lxb_html_tokenizer_t *tkz)
254 {
255 tkz->tags = lexbor_hash_destroy(tkz->tags, true);
256 }
257
258 lxb_status_t
lxb_html_tokenizer_attrs_make(lxb_html_tokenizer_t * tkz,size_t table_size)259 lxb_html_tokenizer_attrs_make(lxb_html_tokenizer_t *tkz, size_t table_size)
260 {
261 tkz->attrs = lexbor_hash_create();
262 return lexbor_hash_init(tkz->attrs, table_size,
263 sizeof(lxb_dom_attr_data_t));
264 }
265
266 void
lxb_html_tokenizer_attrs_destroy(lxb_html_tokenizer_t * tkz)267 lxb_html_tokenizer_attrs_destroy(lxb_html_tokenizer_t *tkz)
268 {
269 tkz->attrs = lexbor_hash_destroy(tkz->attrs, true);
270 }
271
272 lxb_status_t
lxb_html_tokenizer_begin(lxb_html_tokenizer_t * tkz)273 lxb_html_tokenizer_begin(lxb_html_tokenizer_t *tkz)
274 {
275 if (tkz->tags == NULL) {
276 tkz->status = lxb_html_tokenizer_tags_make(tkz, 256);
277 if (tkz->status != LXB_STATUS_OK) {
278 return tkz->status;
279 }
280
281 tkz->opt |= LXB_HTML_TOKENIZER_OPT_TAGS_SELF;
282 }
283
284 if (tkz->attrs == NULL) {
285 tkz->status = lxb_html_tokenizer_attrs_make(tkz, 256);
286 if (tkz->status != LXB_STATUS_OK) {
287 return tkz->status;
288 }
289
290 tkz->opt |= LXB_HTML_TOKENIZER_OPT_ATTRS_SELF;
291 }
292
293 if (tkz->attrs_mraw == NULL) {
294 tkz->attrs_mraw = tkz->mraw;
295
296 tkz->opt |= LXB_HTML_TOKENIZER_OPT_ATTRS_MRAW_SELF;
297 }
298
299 tkz->token = lxb_html_token_create(tkz->dobj_token);
300 if (tkz->token == NULL) {
301 return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
302 }
303
304 return LXB_STATUS_OK;
305 }
306
307 lxb_status_t
lxb_html_tokenizer_chunk(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,size_t size)308 lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
309 size_t size)
310 {
311 const lxb_char_t *end = data + size;
312
313 tkz->is_eof = false;
314 tkz->status = LXB_STATUS_OK;
315 tkz->last = end;
316
317 while (data < end) {
318 size_t current_column = tkz->current_column;
319 const lxb_char_t *new_data = tkz->state(tkz, data, end);
320 while (data < new_data) {
321 /* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
322 if (*data == '\n') {
323 tkz->current_line++;
324 current_column = 0;
325 } else {
326 /* Other characters can be mapped back to the unicode codepoint offset because UTF-8 is a prefix code.
327 * Continuation bytes start with 0b10XXXXXX so we can skip those to only get the start of an encoded code point. */
328 if ((*data & 0b11000000) == 0b10000000) {
329 /* Continuation byte, do nothing */
330 } else {
331 /* First byte for a codepoint */
332 current_column++;
333 }
334 }
335 data++;
336 }
337 tkz->current_column = current_column;
338 }
339
340 return tkz->status;
341 }
342
343 lxb_status_t
lxb_html_tokenizer_end(lxb_html_tokenizer_t * tkz)344 lxb_html_tokenizer_end(lxb_html_tokenizer_t *tkz)
345 {
346 const lxb_char_t *data, *end;
347
348 tkz->status = LXB_STATUS_OK;
349
350 /* Send a fake EOF data. */
351 data = lxb_html_tokenizer_eof;
352 end = lxb_html_tokenizer_eof + 1UL;
353
354 tkz->is_eof = true;
355
356 while (tkz->state(tkz, data, end) < end) {
357 /* empty loop */
358 }
359
360 tkz->is_eof = false;
361
362 if (tkz->status != LXB_STATUS_OK) {
363 return tkz->status;
364 }
365
366 /* Emit fake token: END OF FILE */
367 lxb_html_token_clean(tkz->token);
368
369 tkz->token->tag_id = LXB_TAG__END_OF_FILE;
370
371 tkz->token = tkz->callback_token_done(tkz, tkz->token,
372 tkz->callback_token_ctx);
373
374 if (tkz->token == NULL && tkz->status == LXB_STATUS_OK) {
375 tkz->status = LXB_STATUS_ERROR;
376 }
377
378 return tkz->status;
379 }
380
381 static lxb_html_token_t *
lxb_html_tokenizer_token_done(lxb_html_tokenizer_t * tkz,lxb_html_token_t * token,void * ctx)382 lxb_html_tokenizer_token_done(lxb_html_tokenizer_t *tkz,
383 lxb_html_token_t *token, void *ctx)
384 {
385 return token;
386 }
387
388 lxb_ns_id_t
lxb_html_tokenizer_current_namespace(lxb_html_tokenizer_t * tkz)389 lxb_html_tokenizer_current_namespace(lxb_html_tokenizer_t *tkz)
390 {
391 if (tkz->tree == NULL) {
392 return LXB_NS__UNDEF;
393 }
394
395 lxb_dom_node_t *node = lxb_html_tree_adjusted_current_node(tkz->tree);
396
397 if (node == NULL) {
398 return LXB_NS__UNDEF;
399 }
400
401 return node->ns;
402 }
403
404 void
lxb_html_tokenizer_set_state_by_tag(lxb_html_tokenizer_t * tkz,bool scripting,lxb_tag_id_t tag_id,lxb_ns_id_t ns)405 lxb_html_tokenizer_set_state_by_tag(lxb_html_tokenizer_t *tkz, bool scripting,
406 lxb_tag_id_t tag_id, lxb_ns_id_t ns)
407 {
408 if (ns != LXB_NS_HTML) {
409 tkz->state = lxb_html_tokenizer_state_data_before;
410
411 return;
412 }
413
414 switch (tag_id) {
415 case LXB_TAG_TITLE:
416 case LXB_TAG_TEXTAREA:
417 tkz->tmp_tag_id = tag_id;
418 tkz->state = lxb_html_tokenizer_state_rcdata_before;
419
420 break;
421
422 case LXB_TAG_STYLE:
423 case LXB_TAG_XMP:
424 case LXB_TAG_IFRAME:
425 case LXB_TAG_NOEMBED:
426 case LXB_TAG_NOFRAMES:
427 tkz->tmp_tag_id = tag_id;
428 tkz->state = lxb_html_tokenizer_state_rawtext_before;
429
430 break;
431
432 case LXB_TAG_SCRIPT:
433 tkz->tmp_tag_id = tag_id;
434 tkz->state = lxb_html_tokenizer_state_script_data_before;
435
436 break;
437
438 case LXB_TAG_NOSCRIPT:
439 if (scripting) {
440 tkz->tmp_tag_id = tag_id;
441 tkz->state = lxb_html_tokenizer_state_rawtext_before;
442
443 return;
444 }
445
446 tkz->state = lxb_html_tokenizer_state_data_before;
447
448 break;
449
450 case LXB_TAG_PLAINTEXT:
451 tkz->state = lxb_html_tokenizer_state_plaintext_before;
452
453 break;
454
455 default:
456 break;
457 }
458 }
459
460 /*
461 * No inline functions for ABI.
462 */
463 void
lxb_html_tokenizer_status_set_noi(lxb_html_tokenizer_t * tkz,lxb_status_t status)464 lxb_html_tokenizer_status_set_noi(lxb_html_tokenizer_t *tkz,
465 lxb_status_t status)
466 {
467 lxb_html_tokenizer_status_set(tkz, status);
468 }
469
470 void
lxb_html_tokenizer_callback_token_done_set_noi(lxb_html_tokenizer_t * tkz,lxb_html_tokenizer_token_f call_func,void * ctx)471 lxb_html_tokenizer_callback_token_done_set_noi(lxb_html_tokenizer_t *tkz,
472 lxb_html_tokenizer_token_f call_func,
473 void *ctx)
474 {
475 lxb_html_tokenizer_callback_token_done_set(tkz, call_func, ctx);
476 }
477
478 void *
lxb_html_tokenizer_callback_token_done_ctx_noi(lxb_html_tokenizer_t * tkz)479 lxb_html_tokenizer_callback_token_done_ctx_noi(lxb_html_tokenizer_t *tkz)
480 {
481 return lxb_html_tokenizer_callback_token_done_ctx(tkz);
482 }
483
484 void
lxb_html_tokenizer_state_set_noi(lxb_html_tokenizer_t * tkz,lxb_html_tokenizer_state_f state)485 lxb_html_tokenizer_state_set_noi(lxb_html_tokenizer_t *tkz,
486 lxb_html_tokenizer_state_f state)
487 {
488 lxb_html_tokenizer_state_set(tkz, state);
489 }
490
491 void
lxb_html_tokenizer_tmp_tag_id_set_noi(lxb_html_tokenizer_t * tkz,lxb_tag_id_t tag_id)492 lxb_html_tokenizer_tmp_tag_id_set_noi(lxb_html_tokenizer_t *tkz,
493 lxb_tag_id_t tag_id)
494 {
495 lxb_html_tokenizer_tmp_tag_id_set(tkz, tag_id);
496 }
497
498 lxb_html_tree_t *
lxb_html_tokenizer_tree_noi(lxb_html_tokenizer_t * tkz)499 lxb_html_tokenizer_tree_noi(lxb_html_tokenizer_t *tkz)
500 {
501 return lxb_html_tokenizer_tree(tkz);
502 }
503
504 void
lxb_html_tokenizer_tree_set_noi(lxb_html_tokenizer_t * tkz,lxb_html_tree_t * tree)505 lxb_html_tokenizer_tree_set_noi(lxb_html_tokenizer_t *tkz,
506 lxb_html_tree_t *tree)
507 {
508 lxb_html_tokenizer_tree_set(tkz, tree);
509 }
510
511 lexbor_mraw_t *
lxb_html_tokenizer_mraw_noi(lxb_html_tokenizer_t * tkz)512 lxb_html_tokenizer_mraw_noi(lxb_html_tokenizer_t *tkz)
513 {
514 return lxb_html_tokenizer_mraw(tkz);
515 }
516
517 lexbor_hash_t *
lxb_html_tokenizer_tags_noi(lxb_html_tokenizer_t * tkz)518 lxb_html_tokenizer_tags_noi(lxb_html_tokenizer_t *tkz)
519 {
520 return lxb_html_tokenizer_tags(tkz);
521 }
522