xref: /php-src/ext/dom/lexbor/lexbor/html/token.c (revision bffab33a)
1 /*
2  * Copyright (C) 2018-2020 Alexander Borisov
3  *
4  * Author: Alexander Borisov <borisov@lexbor.com>
5  */
6 
7 #include "lexbor/html/token.h"
8 #include "lexbor/html/tokenizer.h"
9 
10 #define LEXBOR_STR_RES_MAP_LOWERCASE
11 #define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
12 #define LEXBOR_STR_RES_MAP_HEX
13 #define LEXBOR_STR_RES_MAP_NUM
14 #include "lexbor/core/str_res.h"
15 
16 #include "lexbor/dom/interfaces/document_type.h"
17 
18 
19 const lxb_tag_data_t *
20 lxb_tag_append_lower(lexbor_hash_t *hash,
21                      const lxb_char_t *name, size_t length);
22 
23 
24 lxb_html_token_t *
lxb_html_token_create(lexbor_dobject_t * dobj)25 lxb_html_token_create(lexbor_dobject_t *dobj)
26 {
27     return lexbor_dobject_calloc(dobj);
28 }
29 
30 lxb_html_token_t *
lxb_html_token_destroy(lxb_html_token_t * token,lexbor_dobject_t * dobj)31 lxb_html_token_destroy(lxb_html_token_t *token, lexbor_dobject_t *dobj)
32 {
33     return lexbor_dobject_free(dobj, token);
34 }
35 
36 lxb_html_token_attr_t *
lxb_html_token_attr_append(lxb_html_token_t * token,lexbor_dobject_t * dobj)37 lxb_html_token_attr_append(lxb_html_token_t *token, lexbor_dobject_t *dobj)
38 {
39     lxb_html_token_attr_t *attr = lxb_html_token_attr_create(dobj);
40     if (attr == NULL) {
41         return NULL;
42     }
43 
44     if (token->attr_last == NULL) {
45         token->attr_first = attr;
46         token->attr_last = attr;
47 
48         return attr;
49     }
50 
51     token->attr_last->next = attr;
52     attr->prev = token->attr_last;
53 
54     token->attr_last = attr;
55 
56     return attr;
57 }
58 
59 void
lxb_html_token_attr_remove(lxb_html_token_t * token,lxb_html_token_attr_t * attr)60 lxb_html_token_attr_remove(lxb_html_token_t *token, lxb_html_token_attr_t *attr)
61 {
62     if (token->attr_first == attr) {
63         token->attr_first = attr->next;
64     }
65 
66     if (token->attr_last == attr) {
67         token->attr_last = attr->prev;
68     }
69 
70     if (attr->next != NULL) {
71         attr->next->prev = attr->prev;
72     }
73 
74     if (attr->prev != NULL) {
75         attr->prev->next = attr->next;
76     }
77 
78     attr->next = NULL;
79     attr->prev = NULL;
80 }
81 
82 void
lxb_html_token_attr_delete(lxb_html_token_t * token,lxb_html_token_attr_t * attr,lexbor_dobject_t * dobj)83 lxb_html_token_attr_delete(lxb_html_token_t *token,
84                            lxb_html_token_attr_t *attr, lexbor_dobject_t *dobj)
85 {
86     lxb_html_token_attr_remove(token, attr);
87     lxb_html_token_attr_destroy(attr, dobj);
88 }
89 
90 lxb_status_t
lxb_html_token_make_text(lxb_html_token_t * token,lexbor_str_t * str,lexbor_mraw_t * mraw)91 lxb_html_token_make_text(lxb_html_token_t *token, lexbor_str_t *str,
92                          lexbor_mraw_t *mraw)
93 {
94     size_t len = token->text_end - token->text_start;
95 
96     (void) lexbor_str_init(str, mraw, len);
97     if (str->data == NULL) {
98         return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
99     }
100 
101     memcpy(str->data, token->text_start, len);
102 
103     str->data[len] = 0x00;
104     str->length = len;
105 
106     return LXB_STATUS_OK;
107 }
108 
109 lxb_status_t
lxb_html_token_make_text_drop_null(lxb_html_token_t * token,lexbor_str_t * str,lexbor_mraw_t * mraw)110 lxb_html_token_make_text_drop_null(lxb_html_token_t *token, lexbor_str_t *str,
111                                    lexbor_mraw_t *mraw)
112 {
113     lxb_char_t *p, c;
114     const lxb_char_t *data = token->text_start;
115     const lxb_char_t *end = token->text_end;
116 
117     size_t len = (end - data) - token->null_count;
118 
119     (void) lexbor_str_init(str, mraw, len);
120     if (str->data == NULL) {
121         return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
122     }
123 
124     p = str->data;
125 
126     while (data < end) {
127         c = *data++;
128 
129         if (c != 0x00) {
130             *p++ = c;
131         }
132     }
133 
134     str->data[len] = 0x00;
135     str->length = len;
136 
137     return LXB_STATUS_OK;
138 }
139 
140 lxb_status_t
lxb_html_token_make_text_replace_null(lxb_html_token_t * token,lexbor_str_t * str,lexbor_mraw_t * mraw)141 lxb_html_token_make_text_replace_null(lxb_html_token_t *token,
142                                       lexbor_str_t *str, lexbor_mraw_t *mraw)
143 {
144     lxb_char_t *p, c;
145     const lxb_char_t *data = token->text_start;
146     const lxb_char_t *end = token->text_end;
147 
148     static const unsigned rep_len = sizeof(lexbor_str_res_ansi_replacement_character) - 1;
149 
150     size_t len = (end - data) + (token->null_count * rep_len) - token->null_count;
151 
152     (void) lexbor_str_init(str, mraw, len);
153     if (str->data == NULL) {
154         return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
155     }
156 
157     p = str->data;
158 
159     while (data < end) {
160         c = *data++;
161 
162         if (c == 0x00) {
163             memcpy(p, lexbor_str_res_ansi_replacement_character, rep_len);
164             p += rep_len;
165 
166             continue;
167         }
168 
169         *p++ = c;
170     }
171 
172     str->data[len] = 0x00;
173     str->length = len;
174 
175     return LXB_STATUS_OK;
176 }
177 
178 lxb_status_t
lxb_html_token_data_skip_ws_begin(lxb_html_token_t * token)179 lxb_html_token_data_skip_ws_begin(lxb_html_token_t *token)
180 {
181     const lxb_char_t *data = token->text_start;
182     const lxb_char_t *end = token->text_end;
183 
184     while (data < end) {
185         switch (*data) {
186             /*
187              * U+0009 CHARACTER TABULATION (tab)
188              * U+000A LINE FEED (LF)
189              * U+000C FORM FEED (FF)
190              * U+0020 SPACE
191              */
192             case 0x09:
193             case 0x0A:
194             case 0x0D:
195             case 0x20:
196                 break;
197 
198             default:
199                 token->begin += data - token->text_start;
200                 token->text_start = data;
201 
202                 return LXB_STATUS_OK;
203         }
204 
205         data++;
206     }
207 
208     token->begin += data - token->text_start;
209     token->text_start = data;
210 
211     return LXB_STATUS_OK;
212 }
213 
214 lxb_status_t
lxb_html_token_data_skip_one_newline_begin(lxb_html_token_t * token)215 lxb_html_token_data_skip_one_newline_begin(lxb_html_token_t *token)
216 {
217     const lxb_char_t *data = token->text_start;
218     const lxb_char_t *end = token->text_end;
219 
220     if (data < end) {
221         /* U+000A LINE FEED (LF) */
222         if (*data == 0x0A) {
223             token->begin++;
224             token->text_start++;
225         }
226     }
227 
228     return LXB_STATUS_OK;
229 }
230 
231 lxb_status_t
lxb_html_token_data_split_ws_begin(lxb_html_token_t * token,lxb_html_token_t * ws_token)232 lxb_html_token_data_split_ws_begin(lxb_html_token_t *token,
233                                    lxb_html_token_t *ws_token)
234 {
235     *ws_token = *token;
236 
237     lxb_status_t status = lxb_html_token_data_skip_ws_begin(token);
238     if (status != LXB_STATUS_OK) {
239         return status;
240     }
241 
242     if (token->text_start == token->text_end) {
243         return LXB_STATUS_OK;
244     }
245 
246     if (token->text_start == ws_token->text_start) {
247         memset(ws_token, 0, sizeof(lxb_html_token_t));
248 
249         return LXB_STATUS_OK;
250     }
251 
252     ws_token->end = token->begin;
253     ws_token->text_end = token->text_start;
254 
255     return LXB_STATUS_OK;
256 }
257 
258 lxb_status_t
lxb_html_token_doctype_parse(lxb_html_token_t * token,lxb_dom_document_type_t * doc_type)259 lxb_html_token_doctype_parse(lxb_html_token_t *token,
260                              lxb_dom_document_type_t *doc_type)
261 {
262     lxb_html_token_attr_t *attr;
263     lexbor_mraw_t *mraw = doc_type->node.owner_document->mraw;
264 
265     /* Set all to empty string if attr not exist */
266     if (token->attr_first == NULL) {
267         goto set_name_pub_sys_empty;
268     }
269 
270     /* Name */
271     attr = token->attr_first;
272 
273     doc_type->name = attr->name->attr_id;
274 
275     /* PUBLIC or SYSTEM */
276     attr = attr->next;
277     if (attr == NULL) {
278         goto set_pub_sys_empty;
279     }
280 
281     if (attr->name->attr_id == LXB_DOM_ATTR_PUBLIC) {
282         (void) lexbor_str_init(&doc_type->public_id, mraw, attr->value_size);
283         if (doc_type->public_id.data == NULL) {
284             return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
285         }
286 
287         if (attr->value_begin == NULL) {
288             return LXB_STATUS_OK;
289         }
290 
291         (void) lexbor_str_append(&doc_type->public_id, mraw, attr->value,
292                                  attr->value_size);
293     }
294     else if (attr->name->attr_id == LXB_DOM_ATTR_SYSTEM) {
295         (void) lexbor_str_init(&doc_type->system_id, mraw, attr->value_size);
296         if (doc_type->system_id.data == NULL) {
297             return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
298         }
299 
300         if (attr->value_begin == NULL) {
301             return LXB_STATUS_OK;
302         }
303 
304         (void) lexbor_str_append(&doc_type->system_id, mraw, attr->value,
305                                  attr->value_size);
306 
307         return LXB_STATUS_OK;
308     }
309     else {
310         goto set_pub_sys_empty;
311     }
312 
313     /* SUSTEM */
314     attr = attr->next;
315     if (attr == NULL) {
316         goto set_sys_empty;
317     }
318 
319     (void) lexbor_str_init(&doc_type->system_id, mraw, attr->value_size);
320     if (doc_type->system_id.data == NULL) {
321         return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
322     }
323 
324     (void) lexbor_str_append(&doc_type->system_id, mraw, attr->value,
325                              attr->value_size);
326 
327     return LXB_STATUS_OK;
328 
329 set_name_pub_sys_empty:
330 
331     doc_type->name = LXB_DOM_ATTR__UNDEF;
332 
333 set_pub_sys_empty:
334 
335     (void) lexbor_str_init(&doc_type->public_id, mraw, 0);
336     if (doc_type->public_id.data == NULL) {
337         return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
338     }
339 
340 set_sys_empty:
341 
342     (void) lexbor_str_init(&doc_type->system_id, mraw, 0);
343     if (doc_type->system_id.data == NULL) {
344         return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
345     }
346 
347     return LXB_HTML_STATUS_OK;
348 }
349 
350 lxb_html_token_attr_t *
lxb_html_token_find_attr(lxb_html_tokenizer_t * tkz,lxb_html_token_t * token,const lxb_char_t * name,size_t name_len)351 lxb_html_token_find_attr(lxb_html_tokenizer_t *tkz, lxb_html_token_t *token,
352                          const lxb_char_t *name, size_t name_len)
353 {
354     const lxb_dom_attr_data_t *data;
355     lxb_html_token_attr_t *attr = token->attr_first;
356 
357     data = lxb_dom_attr_data_by_local_name(tkz->attrs, name, name_len);
358     if (data == NULL) {
359         return NULL;
360     }
361 
362     while (attr != NULL) {
363         if (attr->name->attr_id == data->attr_id) {
364             return attr;
365         }
366 
367         attr = attr->next;
368     }
369 
370     return NULL;
371 }
372 
373 /*
374  * No inline functions for ABI.
375  */
376 void
lxb_html_token_clean_noi(lxb_html_token_t * token)377 lxb_html_token_clean_noi(lxb_html_token_t *token)
378 {
379     lxb_html_token_clean(token);
380 }
381 
382 lxb_html_token_t *
lxb_html_token_create_eof_noi(lexbor_dobject_t * dobj)383 lxb_html_token_create_eof_noi(lexbor_dobject_t *dobj)
384 {
385     return lxb_html_token_create_eof(dobj);
386 }
387