xref: /php-src/ext/dom/lexbor/lexbor/html/tokenizer.c (revision bffab33a)
1 /*
2  * Copyright (C) 2018-2020 Alexander Borisov
3  *
4  * Author: Alexander Borisov <borisov@lexbor.com>
5  */
6 
7 #include "lexbor/html/tokenizer.h"
8 #include "lexbor/html/tokenizer/state.h"
9 #include "lexbor/html/tokenizer/state_rcdata.h"
10 #include "lexbor/html/tokenizer/state_rawtext.h"
11 #include "lexbor/html/tokenizer/state_script.h"
12 #include "lexbor/html/tree.h"
13 
14 #define LXB_HTML_TAG_RES_DATA
15 #define LXB_HTML_TAG_RES_SHS_DATA
16 #include "lexbor/html/tag_res.h"
17 
18 
/* Size, in lxb_char_t units, of the tokenizer's temporary buffer (16 KiB). */
#define LXB_HTML_TKZ_TEMP_SIZE (4096 * 4)
20 
21 
/*
 * Bit flags stored in tkz->opt. Each flag is set by
 * lxb_html_tokenizer_begin() when the tokenizer had to provide the
 * corresponding resource itself.
 */
enum {
    LXB_HTML_TOKENIZER_OPT_UNDEF           = 0,
    LXB_HTML_TOKENIZER_OPT_TAGS_SELF       = 1 << 0,
    LXB_HTML_TOKENIZER_OPT_ATTRS_SELF      = 1 << 1,
    LXB_HTML_TOKENIZER_OPT_ATTRS_MRAW_SELF = 1 << 2
};
28 
29 
30 const lxb_char_t *lxb_html_tokenizer_eof = (const lxb_char_t *) "\x00";
31 
32 
33 static lxb_html_token_t *
34 lxb_html_tokenizer_token_done(lxb_html_tokenizer_t *tkz,
35                               lxb_html_token_t *token, void *ctx);
36 
37 
38 lxb_html_tokenizer_t *
lxb_html_tokenizer_create(void)39 lxb_html_tokenizer_create(void)
40 {
41     return lexbor_calloc(1, sizeof(lxb_html_tokenizer_t));
42 }
43 
44 lxb_status_t
lxb_html_tokenizer_init(lxb_html_tokenizer_t * tkz)45 lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)
46 {
47     lxb_status_t status;
48 
49     if (tkz == NULL) {
50         return LXB_STATUS_ERROR_OBJECT_IS_NULL;
51     }
52 
53     /* mraw for templary strings or structures */
54     tkz->mraw = lexbor_mraw_create();
55     status = lexbor_mraw_init(tkz->mraw, 1024);
56     if (status != LXB_STATUS_OK) {
57         return status;
58     }
59 
60     /* Init Token */
61     tkz->token = NULL;
62 
63     tkz->dobj_token = lexbor_dobject_create();
64     status = lexbor_dobject_init(tkz->dobj_token,
65                                  4096, sizeof(lxb_html_token_t));
66     if (status != LXB_STATUS_OK) {
67         return status;
68     }
69 
70     /* Init Token Attributes */
71     tkz->dobj_token_attr = lexbor_dobject_create();
72     status = lexbor_dobject_init(tkz->dobj_token_attr, 4096,
73                                  sizeof(lxb_html_token_attr_t));
74     if (status != LXB_STATUS_OK) {
75         return status;
76     }
77 
78     /* Parse errors */
79     tkz->parse_errors = lexbor_array_obj_create();
80     status = lexbor_array_obj_init(tkz->parse_errors, 16,
81                                    sizeof(lxb_html_tokenizer_error_t));
82     if (status != LXB_STATUS_OK) {
83         return status;
84     }
85 
86     /* Temporary memory for tag name and attributes. */
87     tkz->start = lexbor_malloc(LXB_HTML_TKZ_TEMP_SIZE * sizeof(lxb_char_t));
88     if (tkz->start == NULL) {
89         return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
90     }
91 
92     tkz->pos = tkz->start;
93     tkz->end = tkz->start + LXB_HTML_TKZ_TEMP_SIZE;
94     /* current_line & current_column already initialized by calloc (zero-based) */
95 
96     tkz->tree = NULL;
97     tkz->tags = NULL;
98     tkz->attrs = NULL;
99     tkz->attrs_mraw = NULL;
100 
101     tkz->state = lxb_html_tokenizer_state_data_before;
102     tkz->state_return = NULL;
103 
104     tkz->callback_token_done = lxb_html_tokenizer_token_done;
105     tkz->callback_token_ctx = NULL;
106 
107     tkz->is_eof = false;
108     tkz->status = LXB_STATUS_OK;
109 
110     tkz->base = NULL;
111     tkz->ref_count = 1;
112 
113     return LXB_STATUS_OK;
114 }
115 
116 lxb_status_t
lxb_html_tokenizer_inherit(lxb_html_tokenizer_t * tkz_to,lxb_html_tokenizer_t * tkz_from)117 lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to,
118                            lxb_html_tokenizer_t *tkz_from)
119 {
120     lxb_status_t status;
121 
122     tkz_to->tags = tkz_from->tags;
123     tkz_to->attrs = tkz_from->attrs;
124     tkz_to->attrs_mraw = tkz_from->attrs_mraw;
125     tkz_to->mraw = tkz_from->mraw;
126 
127     /* Token and Attributes */
128     tkz_to->token = NULL;
129 
130     tkz_to->dobj_token = tkz_from->dobj_token;
131     tkz_to->dobj_token_attr = tkz_from->dobj_token_attr;
132 
133     /* Parse errors */
134     tkz_to->parse_errors = lexbor_array_obj_create();
135     status = lexbor_array_obj_init(tkz_to->parse_errors, 16,
136                                    sizeof(lxb_html_tokenizer_error_t));
137     if (status != LXB_STATUS_OK) {
138         return status;
139     }
140 
141     tkz_to->state = lxb_html_tokenizer_state_data_before;
142     tkz_to->state_return = NULL;
143 
144     tkz_to->callback_token_done = lxb_html_tokenizer_token_done;
145     tkz_to->callback_token_ctx = NULL;
146 
147     tkz_to->is_eof = false;
148     tkz_to->status = LXB_STATUS_OK;
149 
150     tkz_to->base = tkz_from;
151     tkz_to->ref_count = 1;
152 
153     tkz_to->start = tkz_from->start;
154     tkz_to->end = tkz_from->end;
155     tkz_to->pos = tkz_to->start;
156     tkz_to->current_line = tkz_from->current_line;
157     tkz_to->current_column = tkz_from->current_column;
158 
159     return LXB_STATUS_OK;
160 }
161 
162 lxb_html_tokenizer_t *
lxb_html_tokenizer_ref(lxb_html_tokenizer_t * tkz)163 lxb_html_tokenizer_ref(lxb_html_tokenizer_t *tkz)
164 {
165     if (tkz == NULL) {
166         return NULL;
167     }
168 
169     if (tkz->base != NULL) {
170         return lxb_html_tokenizer_ref(tkz->base);
171     }
172 
173     tkz->ref_count++;
174 
175     return tkz;
176 }
177 
178 lxb_html_tokenizer_t *
lxb_html_tokenizer_unref(lxb_html_tokenizer_t * tkz)179 lxb_html_tokenizer_unref(lxb_html_tokenizer_t *tkz)
180 {
181     if (tkz == NULL || tkz->ref_count == 0) {
182         return NULL;
183     }
184 
185     if (tkz->base != NULL) {
186         tkz->base = lxb_html_tokenizer_unref(tkz->base);
187     }
188 
189     tkz->ref_count--;
190 
191     if (tkz->ref_count == 0) {
192         lxb_html_tokenizer_destroy(tkz);
193     }
194 
195     return NULL;
196 }
197 
198 void
lxb_html_tokenizer_clean(lxb_html_tokenizer_t * tkz)199 lxb_html_tokenizer_clean(lxb_html_tokenizer_t *tkz)
200 {
201     tkz->tree = NULL;
202 
203     tkz->state = lxb_html_tokenizer_state_data_before;
204     tkz->state_return = NULL;
205 
206     tkz->is_eof = false;
207     tkz->status = LXB_STATUS_OK;
208 
209     tkz->pos = tkz->start;
210 
211     lexbor_mraw_clean(tkz->mraw);
212     lexbor_dobject_clean(tkz->dobj_token);
213     lexbor_dobject_clean(tkz->dobj_token_attr);
214 
215     lexbor_array_obj_clean(tkz->parse_errors);
216 }
217 
218 lxb_html_tokenizer_t *
lxb_html_tokenizer_destroy(lxb_html_tokenizer_t * tkz)219 lxb_html_tokenizer_destroy(lxb_html_tokenizer_t *tkz)
220 {
221     if (tkz == NULL) {
222         return NULL;
223     }
224 
225     if (tkz->base == NULL) {
226         if (tkz->opt & LXB_HTML_TOKENIZER_OPT_TAGS_SELF) {
227             lxb_html_tokenizer_tags_destroy(tkz);
228         }
229 
230         if (tkz->opt & LXB_HTML_TOKENIZER_OPT_ATTRS_SELF) {
231             lxb_html_tokenizer_attrs_destroy(tkz);
232         }
233 
234         lexbor_mraw_destroy(tkz->mraw, true);
235         lexbor_dobject_destroy(tkz->dobj_token, true);
236         lexbor_dobject_destroy(tkz->dobj_token_attr, true);
237         lexbor_free(tkz->start);
238     }
239 
240     tkz->parse_errors = lexbor_array_obj_destroy(tkz->parse_errors, true);
241 
242     return lexbor_free(tkz);
243 }
244 
245 lxb_status_t
lxb_html_tokenizer_tags_make(lxb_html_tokenizer_t * tkz,size_t table_size)246 lxb_html_tokenizer_tags_make(lxb_html_tokenizer_t *tkz, size_t table_size)
247 {
248     tkz->tags = lexbor_hash_create();
249     return lexbor_hash_init(tkz->tags, table_size, sizeof(lxb_tag_data_t));
250 }
251 
252 void
lxb_html_tokenizer_tags_destroy(lxb_html_tokenizer_t * tkz)253 lxb_html_tokenizer_tags_destroy(lxb_html_tokenizer_t *tkz)
254 {
255     tkz->tags = lexbor_hash_destroy(tkz->tags, true);
256 }
257 
258 lxb_status_t
lxb_html_tokenizer_attrs_make(lxb_html_tokenizer_t * tkz,size_t table_size)259 lxb_html_tokenizer_attrs_make(lxb_html_tokenizer_t *tkz, size_t table_size)
260 {
261     tkz->attrs = lexbor_hash_create();
262     return lexbor_hash_init(tkz->attrs, table_size,
263                             sizeof(lxb_dom_attr_data_t));
264 }
265 
266 void
lxb_html_tokenizer_attrs_destroy(lxb_html_tokenizer_t * tkz)267 lxb_html_tokenizer_attrs_destroy(lxb_html_tokenizer_t *tkz)
268 {
269     tkz->attrs = lexbor_hash_destroy(tkz->attrs, true);
270 }
271 
272 lxb_status_t
lxb_html_tokenizer_begin(lxb_html_tokenizer_t * tkz)273 lxb_html_tokenizer_begin(lxb_html_tokenizer_t *tkz)
274 {
275     if (tkz->tags == NULL) {
276         tkz->status = lxb_html_tokenizer_tags_make(tkz, 256);
277         if (tkz->status != LXB_STATUS_OK) {
278             return tkz->status;
279         }
280 
281         tkz->opt |= LXB_HTML_TOKENIZER_OPT_TAGS_SELF;
282     }
283 
284     if (tkz->attrs == NULL) {
285         tkz->status = lxb_html_tokenizer_attrs_make(tkz, 256);
286         if (tkz->status != LXB_STATUS_OK) {
287             return tkz->status;
288         }
289 
290         tkz->opt |= LXB_HTML_TOKENIZER_OPT_ATTRS_SELF;
291     }
292 
293     if (tkz->attrs_mraw == NULL) {
294         tkz->attrs_mraw = tkz->mraw;
295 
296         tkz->opt |= LXB_HTML_TOKENIZER_OPT_ATTRS_MRAW_SELF;
297     }
298 
299     tkz->token = lxb_html_token_create(tkz->dobj_token);
300     if (tkz->token == NULL) {
301         return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
302     }
303 
304     return LXB_STATUS_OK;
305 }
306 
307 lxb_status_t
lxb_html_tokenizer_chunk(lxb_html_tokenizer_t * tkz,const lxb_char_t * data,size_t size)308 lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
309                          size_t size)
310 {
311     const lxb_char_t *end = data + size;
312 
313     tkz->is_eof = false;
314     tkz->status = LXB_STATUS_OK;
315     tkz->last = end;
316 
317     while (data < end) {
318         size_t current_column = tkz->current_column;
319         const lxb_char_t *new_data = tkz->state(tkz, data, end);
320         while (data < new_data) {
321             /* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
322             if (*data == '\n') {
323                 tkz->current_line++;
324                 current_column = 0;
325             } else {
326                 /* Other characters can be mapped back to the unicode codepoint offset because UTF-8 is a prefix code.
327                  * Continuation bytes start with 0b10XXXXXX so we can skip those to only get the start of an encoded code point. */
328                 if ((*data & 0b11000000) == 0b10000000) {
329                     /* Continuation byte, do nothing */
330                 } else {
331                     /* First byte for a codepoint */
332                     current_column++;
333                 }
334             }
335             data++;
336         }
337         tkz->current_column = current_column;
338     }
339 
340     return tkz->status;
341 }
342 
343 lxb_status_t
lxb_html_tokenizer_end(lxb_html_tokenizer_t * tkz)344 lxb_html_tokenizer_end(lxb_html_tokenizer_t *tkz)
345 {
346     const lxb_char_t *data, *end;
347 
348     tkz->status = LXB_STATUS_OK;
349 
350     /* Send a fake EOF data. */
351     data = lxb_html_tokenizer_eof;
352     end = lxb_html_tokenizer_eof + 1UL;
353 
354     tkz->is_eof = true;
355 
356     while (tkz->state(tkz, data, end) < end) {
357         /* empty loop */
358     }
359 
360     tkz->is_eof = false;
361 
362     if (tkz->status != LXB_STATUS_OK) {
363         return tkz->status;
364     }
365 
366     /* Emit fake token: END OF FILE */
367     lxb_html_token_clean(tkz->token);
368 
369     tkz->token->tag_id = LXB_TAG__END_OF_FILE;
370 
371     tkz->token = tkz->callback_token_done(tkz, tkz->token,
372                                           tkz->callback_token_ctx);
373 
374     if (tkz->token == NULL && tkz->status == LXB_STATUS_OK) {
375         tkz->status = LXB_STATUS_ERROR;
376     }
377 
378     return tkz->status;
379 }
380 
381 static lxb_html_token_t *
lxb_html_tokenizer_token_done(lxb_html_tokenizer_t * tkz,lxb_html_token_t * token,void * ctx)382 lxb_html_tokenizer_token_done(lxb_html_tokenizer_t *tkz,
383                               lxb_html_token_t *token, void *ctx)
384 {
385     return token;
386 }
387 
388 lxb_ns_id_t
lxb_html_tokenizer_current_namespace(lxb_html_tokenizer_t * tkz)389 lxb_html_tokenizer_current_namespace(lxb_html_tokenizer_t *tkz)
390 {
391     if (tkz->tree == NULL) {
392         return LXB_NS__UNDEF;
393     }
394 
395     lxb_dom_node_t *node = lxb_html_tree_adjusted_current_node(tkz->tree);
396 
397     if (node == NULL) {
398         return LXB_NS__UNDEF;
399     }
400 
401     return node->ns;
402 }
403 
404 void
lxb_html_tokenizer_set_state_by_tag(lxb_html_tokenizer_t * tkz,bool scripting,lxb_tag_id_t tag_id,lxb_ns_id_t ns)405 lxb_html_tokenizer_set_state_by_tag(lxb_html_tokenizer_t *tkz, bool scripting,
406                                     lxb_tag_id_t tag_id, lxb_ns_id_t ns)
407 {
408     if (ns != LXB_NS_HTML) {
409         tkz->state = lxb_html_tokenizer_state_data_before;
410 
411         return;
412     }
413 
414     switch (tag_id) {
415         case LXB_TAG_TITLE:
416         case LXB_TAG_TEXTAREA:
417             tkz->tmp_tag_id = tag_id;
418             tkz->state = lxb_html_tokenizer_state_rcdata_before;
419 
420             break;
421 
422         case LXB_TAG_STYLE:
423         case LXB_TAG_XMP:
424         case LXB_TAG_IFRAME:
425         case LXB_TAG_NOEMBED:
426         case LXB_TAG_NOFRAMES:
427             tkz->tmp_tag_id = tag_id;
428             tkz->state = lxb_html_tokenizer_state_rawtext_before;
429 
430             break;
431 
432         case LXB_TAG_SCRIPT:
433             tkz->tmp_tag_id = tag_id;
434             tkz->state = lxb_html_tokenizer_state_script_data_before;
435 
436             break;
437 
438         case LXB_TAG_NOSCRIPT:
439             if (scripting) {
440                 tkz->tmp_tag_id = tag_id;
441                 tkz->state = lxb_html_tokenizer_state_rawtext_before;
442 
443                 return;
444             }
445 
446             tkz->state = lxb_html_tokenizer_state_data_before;
447 
448             break;
449 
450         case LXB_TAG_PLAINTEXT:
451             tkz->state = lxb_html_tokenizer_state_plaintext_before;
452 
453             break;
454 
455         default:
456             break;
457     }
458 }
459 
/*
 * Non-inline ("_noi") counterparts of the inline tokenizer accessors,
 * provided so that these functions exist as real symbols for the ABI.
 */
463 void
lxb_html_tokenizer_status_set_noi(lxb_html_tokenizer_t * tkz,lxb_status_t status)464 lxb_html_tokenizer_status_set_noi(lxb_html_tokenizer_t *tkz,
465                                   lxb_status_t status)
466 {
467     lxb_html_tokenizer_status_set(tkz, status);
468 }
469 
470 void
lxb_html_tokenizer_callback_token_done_set_noi(lxb_html_tokenizer_t * tkz,lxb_html_tokenizer_token_f call_func,void * ctx)471 lxb_html_tokenizer_callback_token_done_set_noi(lxb_html_tokenizer_t *tkz,
472                                                lxb_html_tokenizer_token_f call_func,
473                                                void *ctx)
474 {
475     lxb_html_tokenizer_callback_token_done_set(tkz, call_func, ctx);
476 }
477 
478 void *
lxb_html_tokenizer_callback_token_done_ctx_noi(lxb_html_tokenizer_t * tkz)479 lxb_html_tokenizer_callback_token_done_ctx_noi(lxb_html_tokenizer_t *tkz)
480 {
481     return lxb_html_tokenizer_callback_token_done_ctx(tkz);
482 }
483 
484 void
lxb_html_tokenizer_state_set_noi(lxb_html_tokenizer_t * tkz,lxb_html_tokenizer_state_f state)485 lxb_html_tokenizer_state_set_noi(lxb_html_tokenizer_t *tkz,
486                                  lxb_html_tokenizer_state_f state)
487 {
488     lxb_html_tokenizer_state_set(tkz, state);
489 }
490 
491 void
lxb_html_tokenizer_tmp_tag_id_set_noi(lxb_html_tokenizer_t * tkz,lxb_tag_id_t tag_id)492 lxb_html_tokenizer_tmp_tag_id_set_noi(lxb_html_tokenizer_t *tkz,
493                                       lxb_tag_id_t tag_id)
494 {
495     lxb_html_tokenizer_tmp_tag_id_set(tkz, tag_id);
496 }
497 
498 lxb_html_tree_t *
lxb_html_tokenizer_tree_noi(lxb_html_tokenizer_t * tkz)499 lxb_html_tokenizer_tree_noi(lxb_html_tokenizer_t *tkz)
500 {
501     return lxb_html_tokenizer_tree(tkz);
502 }
503 
504 void
lxb_html_tokenizer_tree_set_noi(lxb_html_tokenizer_t * tkz,lxb_html_tree_t * tree)505 lxb_html_tokenizer_tree_set_noi(lxb_html_tokenizer_t *tkz,
506                                 lxb_html_tree_t *tree)
507 {
508     lxb_html_tokenizer_tree_set(tkz, tree);
509 }
510 
511 lexbor_mraw_t *
lxb_html_tokenizer_mraw_noi(lxb_html_tokenizer_t * tkz)512 lxb_html_tokenizer_mraw_noi(lxb_html_tokenizer_t *tkz)
513 {
514     return lxb_html_tokenizer_mraw(tkz);
515 }
516 
517 lexbor_hash_t *
lxb_html_tokenizer_tags_noi(lxb_html_tokenizer_t * tkz)518 lxb_html_tokenizer_tags_noi(lxb_html_tokenizer_t *tkz)
519 {
520     return lxb_html_tokenizer_tags(tkz);
521 }
522