1 /*
2 * Copyright (C) 2019 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
7 #include "lexbor/html/encoding.h"
8
9 #include "lexbor/core/str.h"
10
11
12 static const lxb_char_t *
13 lxb_html_encoding_meta(lxb_html_encoding_t *em,
14 const lxb_char_t *data, const lxb_char_t *end);
15
16 static const lxb_char_t *
17 lxb_html_get_attribute(const lxb_char_t *data, const lxb_char_t *end,
18 const lxb_char_t **name, const lxb_char_t **name_end,
19 const lxb_char_t **value, const lxb_char_t **value_end);
20
21
22 lxb_inline const lxb_char_t *
lxb_html_encoding_skip_spaces(const lxb_char_t * data,const lxb_char_t * end)23 lxb_html_encoding_skip_spaces(const lxb_char_t *data, const lxb_char_t *end)
24 {
25 for (; data < end; data++) {
26 switch (*data) {
27 case 0x09: case 0x0A:
28 case 0x0C: case 0x0D:
29 case 0x20:
30 break;
31
32 default:
33 return data;
34 }
35 }
36
37 return end;
38 }
39
40 lxb_inline const lxb_char_t *
lxb_html_encoding_skip_name(const lxb_char_t * data,const lxb_char_t * end)41 lxb_html_encoding_skip_name(const lxb_char_t *data, const lxb_char_t *end)
42 {
43 for (; data < end; data++) {
44 switch (*data) {
45 case 0x09: case 0x0A:
46 case 0x0C: case 0x0D:
47 case 0x20: case 0x3E:
48 return data;
49 }
50 }
51
52 return end;
53 }
54
55 lxb_inline const lxb_char_t *
lxb_html_encoding_tag_end(const lxb_char_t * data,const lxb_char_t * end)56 lxb_html_encoding_tag_end(const lxb_char_t *data, const lxb_char_t *end)
57 {
58 data = memchr(data, '>', (end - data));
59 if (data == NULL) {
60 return end;
61 }
62
63 return data + 1;
64 }
65
66 lxb_status_t
lxb_html_encoding_init(lxb_html_encoding_t * em)67 lxb_html_encoding_init(lxb_html_encoding_t *em)
68 {
69 lxb_status_t status;
70
71 if (em == NULL) {
72 return LXB_STATUS_ERROR_WRONG_ARGS;
73 }
74
75 status = lexbor_array_obj_init(&em->cache, 12,
76 sizeof(lxb_html_encoding_entry_t));
77 if (status != LXB_STATUS_OK) {
78 return status;
79 }
80
81 return lexbor_array_obj_init(&em->result, 12,
82 sizeof(lxb_html_encoding_entry_t));
83 }
84
85 lxb_html_encoding_t *
lxb_html_encoding_destroy(lxb_html_encoding_t * em,bool self_destroy)86 lxb_html_encoding_destroy(lxb_html_encoding_t *em, bool self_destroy)
87 {
88 if (em == NULL) {
89 return NULL;
90 }
91
92 lexbor_array_obj_destroy(&em->cache, false);
93 lexbor_array_obj_destroy(&em->result, false);
94
95 if (self_destroy) {
96 return lexbor_free(em);
97 }
98
99 return em;
100 }
101
102 lxb_status_t
lxb_html_encoding_determine(lxb_html_encoding_t * em,const lxb_char_t * data,const lxb_char_t * end)103 lxb_html_encoding_determine(lxb_html_encoding_t *em,
104 const lxb_char_t *data, const lxb_char_t *end)
105 {
106 const lxb_char_t *name, *name_end;
107 const lxb_char_t *value, *value_end;
108
109 while (data < end) {
110 /* Find tag beginning */
111 data = memchr(data, '<', (end - data));
112 if (data == NULL) {
113 return LXB_STATUS_OK;
114 }
115
116 if (++data == end) {
117 return LXB_STATUS_OK;
118 }
119
120 switch (*data) {
121 /* Comment or broken tag */
122 case '!':
123 if ((data + 5) > end) {
124 return LXB_STATUS_OK;
125 }
126
127 if (data[1] != '-' || data[2] != '-') {
128 data = lxb_html_encoding_tag_end(data, end);
129 continue;
130 }
131
132 while (data < end) {
133 data = lxb_html_encoding_tag_end(data, end);
134
135 if (data[-3] == '-' && data[-2] == '-') {
136 break;
137 }
138 }
139
140 break;
141
142 case '?':
143 data = lxb_html_encoding_tag_end(data, end);
144 break;
145
146 case '/':
147 data++;
148
149 if ((data + 3) > end) {
150 return LXB_STATUS_OK;
151 }
152
153 if ((unsigned) (*data - 0x41) <= (0x5A - 0x41)
154 || (unsigned) (*data - 0x61) <= (0x7A - 0x61))
155 {
156 goto skip_attributes;
157 }
158
159 data = lxb_html_encoding_tag_end(data, end);
160 break;
161
162 default:
163
164 if ((unsigned) (*data - 0x41) > (0x5A - 0x41)
165 && (unsigned) (*data - 0x61) > (0x7A - 0x61))
166 {
167 break;
168 }
169
170 if ((data + 6) > end) {
171 return LXB_STATUS_OK;
172 }
173
174 if (!lexbor_str_data_ncasecmp(data, (lxb_char_t *) "meta", 4)) {
175 goto skip_attributes;
176 }
177
178 data += 4;
179
180 switch (*data++) {
181 case 0x09: case 0x0A: case 0x0C:
182 case 0x0D: case 0x20: case 0x2F:
183 break;
184
185 default:
186 goto skip_attributes;
187 }
188
189 data = lxb_html_encoding_meta(em, data, end);
190 if (data == NULL) {
191 return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
192 }
193
194 break;
195
196 skip_attributes:
197
198 data = lxb_html_encoding_skip_name(data, end);
199 if (data >= end) {
200 return LXB_STATUS_OK;
201 }
202
203 if (*data == '>') {
204 data++;
205 continue;
206 }
207
208 /* Skip attributes */
209 while (data < end) {
210 data = lxb_html_get_attribute(data, end, &name, &name_end,
211 &value, &value_end);
212 if (name == NULL) {
213 break;
214 }
215 }
216
217 break;
218 }
219 }
220
221 return LXB_STATUS_OK;
222 }
223
224 static const lxb_char_t *
lxb_html_encoding_meta(lxb_html_encoding_t * em,const lxb_char_t * data,const lxb_char_t * end)225 lxb_html_encoding_meta(lxb_html_encoding_t *em,
226 const lxb_char_t *data, const lxb_char_t *end)
227 {
228 size_t i, len, cur;
229 bool got_pragma, have_content;
230 uint8_t need_pragma;
231 const lxb_char_t *name, *name_end;
232 const lxb_char_t *value, *value_end;
233 lxb_html_encoding_entry_t *attr;
234
235 got_pragma = false;
236 have_content = false;
237 need_pragma = 0x00;
238 cur = lexbor_array_obj_length(&em->result);
239
240 lexbor_array_obj_clean(&em->cache);
241
242 while (data < end) {
243
244 find_attr:
245
246 data = lxb_html_get_attribute(data, end, &name, &name_end,
247 &value, &value_end);
248 if (name == NULL) {
249 break;
250 }
251
252 len = name_end - name;
253
254 if (len < 7) {
255 continue;
256 }
257
258 /* Exists check */
259 for (i = 0; i < lexbor_array_obj_length(&em->cache); i++) {
260 attr = lexbor_array_obj_get(&em->cache, i);
261
262 if ((size_t) (attr->end - attr->name) == len
263 && lexbor_str_data_ncasecmp(attr->name, name, len))
264 {
265 goto find_attr;
266 }
267 }
268
269 /* Append attribute to cache */
270 attr = lexbor_array_obj_push(&em->cache);
271 if (attr == NULL) {
272 return NULL;
273 }
274
275 attr->name = name;
276 attr->end = name_end;
277
278 if (value == NULL) {
279 continue;
280 }
281
282 /* http-equiv check */
283 if (len == (sizeof("http-equiv") - 1)) {
284 if (!lexbor_str_data_ncasecmp((lxb_char_t *) "http-equiv", name, len)) {
285 continue;
286 }
287
288 if ((value_end - value) == (sizeof("content-type") - 1)
289 && lexbor_str_data_ncasecmp((lxb_char_t *) "content-type",
290 value, (sizeof("content-type") - 1)))
291 {
292 got_pragma = true;
293 }
294
295 continue;
296 }
297
298 if (lexbor_str_data_ncasecmp((lxb_char_t *) "content", name, 7)) {
299 if (have_content == false) {
300
301 name = lxb_html_encoding_content(value, value_end, &name_end);
302 if (name == NULL) {
303 continue;
304 }
305
306 attr = lexbor_array_obj_push(&em->result);
307 if (attr == NULL) {
308 return NULL;
309 }
310
311 attr->name = name;
312 attr->end = name_end;
313
314 need_pragma = 0x02;
315 have_content = true;
316 }
317
318 continue;
319 }
320
321 if (lexbor_str_data_ncasecmp((lxb_char_t *) "charset", name, 7)) {
322 attr = lexbor_array_obj_push(&em->result);
323 if (attr == NULL) {
324 return NULL;
325 }
326
327 attr->name = value;
328 attr->end = value_end;
329
330 need_pragma = 0x01;
331 }
332 }
333
334 if (need_pragma == 0x00 || (need_pragma == 0x02 && got_pragma == false)) {
335 if (cur != lexbor_array_obj_length(&em->result)) {
336 lexbor_array_obj_pop(&em->result);
337 }
338 }
339
340 return data;
341 }
342
343 const lxb_char_t *
lxb_html_encoding_content(const lxb_char_t * data,const lxb_char_t * end,const lxb_char_t ** name_end)344 lxb_html_encoding_content(const lxb_char_t *data, const lxb_char_t *end,
345 const lxb_char_t **name_end)
346 {
347 const lxb_char_t *name;
348
349 do {
350 for (; (data + 7) < end; data++) {
351 if (lexbor_str_data_ncasecmp((lxb_char_t *) "charset", data, 7)) {
352 goto found;
353 }
354 }
355
356 return NULL;
357
358 found:
359
360 data = lxb_html_encoding_skip_spaces((data + 7), end);
361 if (data >= end) {
362 return NULL;
363 }
364
365 if (*data != '=') {
366 continue;
367 }
368
369 data = lxb_html_encoding_skip_spaces((data + 1), end);
370 if (data >= end) {
371 return NULL;
372 }
373
374 break;
375 }
376 while (true);
377
378 if (*data == '\'' || *data == '"') {
379 *name_end = data++;
380 name = data;
381
382 for (; data < end; data++) {
383 if (*data == **name_end) {
384 break;
385 }
386 }
387
388 *name_end = data;
389 goto done;
390 }
391
392 name = data;
393 *name_end = data;
394
395 for (; data < end; data++) {
396 switch (*data) {
397 case ';':
398 goto done;
399
400 case 0x09: case 0x0A:
401 case 0x0C: case 0x0D:
402 case 0x20:
403 goto done;
404
405 case '"':
406 case '\'':
407 return NULL;
408 }
409 }
410
411 if (data == name) {
412 return NULL;
413 }
414
415 done:
416
417 *name_end = data;
418
419 return name;
420 }
421
422 static const lxb_char_t *
lxb_html_get_attribute(const lxb_char_t * data,const lxb_char_t * end,const lxb_char_t ** name,const lxb_char_t ** name_end,const lxb_char_t ** value,const lxb_char_t ** value_end)423 lxb_html_get_attribute(const lxb_char_t *data, const lxb_char_t *end,
424 const lxb_char_t **name, const lxb_char_t **name_end,
425 const lxb_char_t **value, const lxb_char_t **value_end)
426 {
427 lxb_char_t ch;
428
429 *name = NULL;
430 *value = NULL;
431
432 for (; data < end; data++) {
433 switch (*data) {
434 case 0x09: case 0x0A:
435 case 0x0C: case 0x0D:
436 case 0x20: case 0x2F:
437 break;
438
439 case 0x3E:
440 return (data + 1);
441
442 default:
443 goto name_state;
444 }
445 }
446
447 if (data == end) {
448 return data;
449 }
450
451 name_state:
452
453 /* Attribute name */
454 *name = data;
455
456 while (data < end) {
457 switch (*data) {
458 case 0x09: case 0x0A:
459 case 0x0C: case 0x0D:
460 case 0x20:
461 *name_end = data;
462
463 data++;
464 goto spaces_state;
465
466 case '/': case '>':
467 *name_end = data;
468 return data;
469
470 case '=':
471 if (*name != NULL) {
472 *name_end = data++;
473 goto value_state;
474 }
475 }
476
477 data++;
478 }
479
480 *name_end = data;
481
482 spaces_state:
483
484 data = lxb_html_encoding_skip_spaces(data, end);
485 if (data == end) {
486 return data;
487 }
488
489 if (*data != '=') {
490 return data;
491 }
492
493 data += 1;
494
495 value_state:
496
497 data = lxb_html_encoding_skip_spaces(data, end);
498 if (data == end) {
499 return data;
500 }
501
502 switch (*data) {
503 case '"':
504 case '\'':
505 ch = *data++;
506 if (data == end) {
507 return data;
508 }
509
510 *value = data;
511
512 do {
513 if (*data == ch) {
514 *value_end = data;
515 return data + 1;
516 }
517 }
518 while (++data < end);
519
520 *value = NULL;
521
522 return data;
523
524 case '>':
525 return data;
526
527 default:
528 *value = data++;
529 break;
530 }
531
532 for (; data < end; data++) {
533 switch (*data) {
534 case 0x09: case 0x0A:
535 case 0x0C: case 0x0D:
536 case 0x20: case 0x3E:
537 *value_end = data;
538 return data;
539 }
540 }
541
542 *value = NULL;
543
544 return data;
545 }
546
547 /*
548 * No inline functions for ABI.
549 */
550 lxb_html_encoding_t *
lxb_html_encoding_create_noi(void)551 lxb_html_encoding_create_noi(void)
552 {
553 return lxb_html_encoding_create();
554 }
555
556 void
lxb_html_encoding_clean_noi(lxb_html_encoding_t * em)557 lxb_html_encoding_clean_noi(lxb_html_encoding_t *em)
558 {
559 lxb_html_encoding_clean(em);
560 }
561
562 lxb_html_encoding_entry_t *
lxb_html_encoding_meta_entry_noi(lxb_html_encoding_t * em,size_t idx)563 lxb_html_encoding_meta_entry_noi(lxb_html_encoding_t *em, size_t idx)
564 {
565 return lxb_html_encoding_meta_entry(em, idx);
566 }
567
568 size_t
lxb_html_encoding_meta_length_noi(lxb_html_encoding_t * em)569 lxb_html_encoding_meta_length_noi(lxb_html_encoding_t *em)
570 {
571 return lxb_html_encoding_meta_length(em);
572 }
573
574 lexbor_array_obj_t *
lxb_html_encoding_meta_result_noi(lxb_html_encoding_t * em)575 lxb_html_encoding_meta_result_noi(lxb_html_encoding_t *em)
576 {
577 return lxb_html_encoding_meta_result(em);
578 }
579