1 /*
2 * Copyright (C) 2019 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
7 #include "lexbor/encoding/decode.h"
8 #include "lexbor/encoding/single.h"
9 #include "lexbor/encoding/multi.h"
10 #include "lexbor/encoding/range.h"
11
12
/*
 * Consume one UTF-8 continuation byte located at `p`.
 *
 * Expects `ch`, `p`, `need`, `data` and `ctx` to be in scope where the macro
 * is expanded.
 *
 * If the byte lies outside [_lower, _upper], ctx->u.utf_8.lower and
 * ctx->u.utf_8.need are reset, the replacement sequence is written through
 * LXB_ENCODING_DECODE_ERROR_BEGIN/END and `_cont` is executed. When the
 * replacement does not fit into the output buffer, *data is set to `p`,
 * ctx->have_error is raised and the enclosing function returns
 * LXB_STATUS_SMALL_BUFFER (or LXB_STATUS_ERROR if no replacement is set).
 *
 * Otherwise the byte is accepted: `p` advances, `need` is decremented and
 * the low six bits of the byte are shifted into ctx->codepoint.
 *
 * NOTE(review): the body is a bare block rather than do { } while (0),
 * presumably so that `_cont` can be a `continue`/`break` aimed at the loop of
 * the caller; confirm against the call sites.
 */
#define LXB_ENCODING_DECODE_UTF_8_BOUNDARY(_lower, _upper, _cont)              \
    {                                                                          \
        ch = *p;                                                               \
                                                                               \
        if (ch < _lower || ch > _upper) {                                      \
            ctx->u.utf_8.lower = 0x00;                                         \
            ctx->u.utf_8.need = 0;                                             \
                                                                               \
            LXB_ENCODING_DECODE_ERROR_BEGIN {                                  \
                *data = p;                                                     \
                ctx->have_error = true;                                        \
            }                                                                  \
            LXB_ENCODING_DECODE_ERROR_END();                                   \
                                                                               \
            _cont;                                                             \
        }                                                                      \
        else {                                                                 \
            p++;                                                               \
            need--;                                                            \
            ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);              \
        }                                                                      \
    }
35
/*
 * Select the permitted range of the next continuation byte from the lead
 * byte held in `ch` (`ch` and `ctx` must be in scope).
 *
 * ch == first: lower bound becomes f_lower, upper bound becomes 0xBF.
 * ch == two:   lower bound becomes 0x80, upper bound becomes s_upper.
 * Any other value leaves ctx->u.utf_8.lower/upper untouched.
 */
#define LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET(first, two, f_lower, s_upper)   \
    do {                                                                       \
        if (ch == first) {                                                     \
            ctx->u.utf_8.lower = f_lower;                                      \
            ctx->u.utf_8.upper = 0xBF;                                         \
        }                                                                      \
        else if (ch == two) {                                                  \
            ctx->u.utf_8.lower = 0x80;                                         \
            ctx->u.utf_8.upper = s_upper;                                      \
        }                                                                      \
    }                                                                          \
    while (0)
48
/*
 * Store the code point `cp` at the current write position of
 * (ctx)->buffer_out and advance (ctx)->buffer_used.
 *
 * No capacity check is made here; the expansion site is responsible for
 * making sure a free slot exists before using this macro.
 */
#define LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, cp)                           \
    do {                                                                       \
        (ctx)->buffer_out[(ctx)->buffer_used++] = (cp);                        \
    }                                                                          \
    while (0)
54
/*
 * Append the code point `cp` to the output buffer of `ctx`.
 *
 * If the buffer is already full, the enclosing function returns
 * LXB_STATUS_SMALL_BUFFER and `cp` is not stored.
 */
#define LXB_ENCODING_DECODE_APPEND(ctx, cp)                                    \
    do {                                                                       \
        if ((ctx)->buffer_used >= (ctx)->buffer_length) {                      \
            return LXB_STATUS_SMALL_BUFFER;                                    \
        }                                                                      \
                                                                               \
        (ctx)->buffer_out[(ctx)->buffer_used++] = (cp);                        \
    }                                                                          \
    while (0)
64
/*
 * Append the code point `cp` to the output buffer of `ctx`, for decoders that
 * walk the input with a local cursor `p` (`p` and `data` must be in scope).
 *
 * If the buffer is already full, the cursor is published through *data and
 * the enclosing function returns LXB_STATUS_SMALL_BUFFER. The capacity test
 * runs before `cp` is evaluated, so a side effect inside `cp` (such as
 * `*p++`) does not happen on that early return.
 */
#define LXB_ENCODING_DECODE_APPEND_P(ctx, cp)                                  \
    do {                                                                       \
        if ((ctx)->buffer_used >= (ctx)->buffer_length) {                      \
            *data = p;                                                         \
            return LXB_STATUS_SMALL_BUFFER;                                    \
        }                                                                      \
                                                                               \
        (ctx)->buffer_out[(ctx)->buffer_used++] = (cp);                        \
    }                                                                          \
    while (0)
75
/*
 * Make the enclosing function return LXB_STATUS_SMALL_BUFFER when the output
 * buffer of `ctx` has no free slot left. Nothing is written.
 */
#define LXB_ENCODING_DECODE_CHECK_OUT(ctx)                                     \
    do {                                                                       \
        if ((ctx)->buffer_used >= (ctx)->buffer_length) {                      \
            return LXB_STATUS_SMALL_BUFFER;                                    \
        }                                                                      \
    }                                                                          \
    while (0)
83
/*
 * Opening half of the "emit replacement" construct; it must be closed by
 * LXB_ENCODING_DECODE_ERROR_END(). Usage:
 *
 *     LXB_ENCODING_DECODE_ERROR_BEGIN {
 *         ... runs only when the replacement does NOT fit ...
 *     }
 *     LXB_ENCODING_DECODE_ERROR_END();
 *
 * Expects `ctx` in scope. If ctx->replace_to is NULL the enclosing function
 * returns LXB_STATUS_ERROR at once. If ctx->replace_len more code points do
 * not fit into the output buffer, the user block between the two macros is
 * executed (it is the body of the trailing `do`), giving the caller a chance
 * to save state before LXB_ENCODING_DECODE_ERROR_END() returns
 * LXB_STATUS_SMALL_BUFFER.
 */
#define LXB_ENCODING_DECODE_ERROR_BEGIN                                        \
    do {                                                                       \
        if (ctx->replace_to == NULL) {                                         \
            return LXB_STATUS_ERROR;                                           \
        }                                                                      \
                                                                               \
        if ((ctx->buffer_used + ctx->replace_len) > ctx->buffer_length) {      \
            do
92
/*
 * Closing half of the construct opened by LXB_ENCODING_DECODE_ERROR_BEGIN.
 *
 * Terminates the user block, then returns LXB_STATUS_SMALL_BUFFER from the
 * enclosing function when the replacement did not fit. Otherwise copies
 * ctx->replace_len code points from ctx->replace_to into the output buffer
 * and advances ctx->buffer_used accordingly.
 */
#define LXB_ENCODING_DECODE_ERROR_END()                                        \
            while (0);                                                         \
                                                                               \
            return LXB_STATUS_SMALL_BUFFER;                                    \
        }                                                                      \
                                                                               \
        memcpy(&ctx->buffer_out[ctx->buffer_used], ctx->replace_to,            \
               sizeof(lxb_codepoint_t) * ctx->replace_len);                    \
                                                                               \
        ctx->buffer_used += ctx->replace_len;                                  \
    }                                                                          \
    while (0)
105
/*
 * Emit the replacement sequence with no extra bookkeeping: an
 * LXB_ENCODING_DECODE_ERROR_BEGIN/END pair with an empty user block.
 *
 * The `ctx` argument is not used by the expansion; the nested macros refer
 * to a variable literally named `ctx` at the expansion site.
 */
#define LXB_ENCODING_DECODE_ERROR(ctx)                                         \
    do {                                                                       \
        LXB_ENCODING_DECODE_ERROR_BEGIN {                                      \
        } LXB_ENCODING_DECODE_ERROR_END();                                     \
    }                                                                          \
    while (0)
112
/*
 * Report an invalid two-byte sequence in a multi-byte decoder.
 *
 * Expects `byte`, `data` and `ctx` in scope. When the offending trail byte is
 * below 0x80, *data is stepped back by one so that byte is decoded again on
 * its own. The replacement sequence is then emitted; if it does not fit,
 * ctx->have_error is raised and `ident` (the decoder's pending-lead field) is
 * set to 0x01 before the enclosing function returns LXB_STATUS_SMALL_BUFFER,
 * so the next call can tell that a replacement is still owed.
 */
#define LXB_ENCODING_DECODE_FAILED(ident)                                      \
    do {                                                                       \
        if ((byte) < (0x80)) {                                                 \
            (*data)--;                                                         \
        }                                                                      \
                                                                               \
        LXB_ENCODING_DECODE_ERROR_BEGIN {                                      \
            ctx->have_error = true;                                            \
            (ident) = 0x01;                                                    \
        }                                                                      \
        LXB_ENCODING_DECODE_ERROR_END();                                       \
    }                                                                          \
    while (0)
126
/*
 * Body shared by every single-byte (8-bit) decoder.
 *
 * Walks the input from *data up to `end`. Bytes below 0x80 are copied to the
 * output unchanged; bytes 0x80..0xFF are translated through `decode_map`,
 * indexed by (byte - 0x80). A byte whose table entry is
 * LXB_ENCODING_ERROR_CODEPOINT is replaced via
 * LXB_ENCODING_DECODE_ERROR_BEGIN/END.
 *
 * Expects `ctx`, `data` and `end` in scope. May leave the enclosing function
 * early with LXB_STATUS_SMALL_BUFFER (output buffer full) or LXB_STATUS_ERROR
 * (invalid byte and no replacement configured).
 *
 * An input byte is consumed only after its output has been stored. Hence on
 * LXB_STATUS_SMALL_BUFFER *data always points at the first byte that has not
 * been emitted yet (nothing is dropped), and *data is also kept current after
 * a byte that was turned into the replacement sequence.
 */
#define LXB_ENCODING_DECODE_SINGLE(decode_map)                                 \
    do {                                                                       \
        const lxb_char_t *p = *data;                                           \
                                                                               \
        while (p < end) {                                                      \
            if (*p < 0x80) {                                                   \
                LXB_ENCODING_DECODE_APPEND_P(ctx, *p);                         \
            }                                                                  \
            else {                                                             \
                ctx->codepoint = decode_map[*p - 0x80].codepoint;              \
                                                                               \
                if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {          \
                    LXB_ENCODING_DECODE_ERROR_BEGIN {                          \
                        *data = p;                                             \
                    }                                                          \
                    LXB_ENCODING_DECODE_ERROR_END();                           \
                }                                                              \
                else {                                                         \
                    LXB_ENCODING_DECODE_APPEND_P(ctx, ctx->codepoint);         \
                }                                                              \
            }                                                                  \
                                                                               \
            /* The byte has been emitted: consume it and publish progress. */  \
            *data = ++p;                                                       \
        }                                                                      \
    }                                                                          \
    while (0)
152
/*
 * Consume one UTF-8 continuation byte directly from **data.
 *
 * Expects `ch`, `data`, `needed` and `ctx` in scope, plus a `failed:` label
 * in the enclosing function. A byte outside [lower, upper] jumps to `failed`
 * without consuming it. Otherwise *data advances, `needed` is decremented and
 * the low six bits of the byte are shifted into ctx->codepoint.
 */
#define LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(lower, upper)                \
    do {                                                                       \
        ch = **data;                                                           \
                                                                               \
        if (ch < lower || ch > upper) {                                        \
            goto failed;                                                       \
        }                                                                      \
                                                                               \
        (*data)++;                                                             \
        needed--;                                                              \
        ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);                  \
    }                                                                          \
    while (0)
166
/*
 * Select the permitted range of the next continuation byte from the lead
 * byte held in `ch` (`ch` and `ctx` must be in scope).
 *
 * ch == first: lower bound becomes f_lower, upper bound becomes 0xBF.
 * ch == two:   lower bound becomes 0x80, upper bound becomes s_upper.
 * Any other value leaves ctx->u.utf_8.lower/upper untouched.
 *
 * The expansion is the same as LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET.
 */
#define LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET_SINGLE(first, two, f_lower,     \
                                                      s_upper)                 \
    do {                                                                       \
        if (ch == first) {                                                     \
            ctx->u.utf_8.lower = f_lower;                                      \
            ctx->u.utf_8.upper = 0xBF;                                         \
        }                                                                      \
        else if (ch == two) {                                                  \
            ctx->u.utf_8.lower = 0x80;                                         \
            ctx->u.utf_8.upper = s_upper;                                      \
        }                                                                      \
    }                                                                          \
    while (0)
180
181
182 lxb_status_t
lxb_encoding_decode_default(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)183 lxb_encoding_decode_default(lxb_encoding_decode_t *ctx,
184 const lxb_char_t **data, const lxb_char_t *end)
185 {
186 return lxb_encoding_decode_utf_8(ctx, data, end);
187 }
188
189 lxb_status_t
lxb_encoding_decode_auto(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)190 lxb_encoding_decode_auto(lxb_encoding_decode_t *ctx,
191 const lxb_char_t **data, const lxb_char_t *end)
192 {
193 *data = end;
194 return LXB_STATUS_ERROR;
195 }
196
197 lxb_status_t
lxb_encoding_decode_undefined(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)198 lxb_encoding_decode_undefined(lxb_encoding_decode_t *ctx,
199 const lxb_char_t **data, const lxb_char_t *end)
200 {
201 *data = end;
202 return LXB_STATUS_ERROR;
203 }
204
/*
 * Streaming Big5 decoder.
 *
 * Decodes the bytes in [*data, end) into ctx->buffer_out and advances *data
 * as input is consumed. State that has to survive between calls lives in ctx:
 *
 *   - ctx->u.lead holds the lead byte when the input ended in the middle of
 *     a two-byte sequence;
 *   - ctx->u.lead == 0x01 together with ctx->have_error means a replacement
 *     sequence is still owed to the output;
 *   - ctx->u.lead / ctx->second_codepoint hold the two code points of a
 *     double-code-point character that did not fit into the output buffer.
 *
 * @return LXB_STATUS_OK           all input consumed;
 *         LXB_STATUS_CONTINUE     more input is needed to finish a sequence
 *                                 (also stored in ctx->status);
 *         LXB_STATUS_SMALL_BUFFER the output buffer is full;
 *         LXB_STATUS_ERROR        invalid input and ctx->replace_to is NULL.
 */
lxb_status_t
lxb_encoding_decode_big5(lxb_encoding_decode_t *ctx,
                         const lxb_char_t **data, const lxb_char_t *end)
{
    uint32_t index;
    lxb_char_t lead, byte;

    ctx->status = LXB_STATUS_OK;

    /* Finish whatever the previous call left pending. */
    if (ctx->u.lead != 0x00) {
        if (ctx->have_error) {
            /* A replacement sequence is still owed. */
            ctx->u.lead = 0x00;
            ctx->have_error = false;

            LXB_ENCODING_DECODE_ERROR_BEGIN {
                /* Still no room: re-arm the pending-error state. */
                ctx->u.lead = 0x01;
                ctx->have_error = true;
            } LXB_ENCODING_DECODE_ERROR_END();
        }
        else if (ctx->second_codepoint != 0x0000) {
            /* A two-code-point character is waiting to be written. */
            if ((ctx->buffer_used + 2) > ctx->buffer_length) {
                return LXB_STATUS_SMALL_BUFFER;
            }

            LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->u.lead);
            LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->second_codepoint);

            ctx->u.lead = 0x00;
            ctx->second_codepoint = 0x0000;
        }
        else {
            /* A lead byte is waiting for its trail byte. */
            if (*data >= end) {
                ctx->status = LXB_STATUS_CONTINUE;

                return LXB_STATUS_CONTINUE;
            }

            LXB_ENCODING_DECODE_CHECK_OUT(ctx);

            lead = (lxb_char_t) ctx->u.lead;
            ctx->u.lead = 0x00;

            goto lead_state;
        }
    }

    while (*data < end) {
        /* Guarantees one free slot for the unchecked appends below. */
        LXB_ENCODING_DECODE_CHECK_OUT(ctx);

        lead = *(*data)++;

        if (lead < 0x80) {
            LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
            continue;
        }

        /* Valid lead bytes are 0x81..0xFE. */
        if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
            LXB_ENCODING_DECODE_ERROR_BEGIN {
                /* No room for the replacement: re-read this byte later. */
                (*data)--;
            }
            LXB_ENCODING_DECODE_ERROR_END();

            continue;
        }

        /* Input ended between lead and trail byte: remember the lead. */
        if (*data >= end) {
            ctx->u.lead = lead;
            ctx->status = LXB_STATUS_CONTINUE;

            return LXB_STATUS_CONTINUE;
        }

    lead_state:

        /* index stays 0 when the trail byte is out of range. */
        index = 0;
        byte = *(*data)++;

        if ((unsigned) (byte - 0x40) <= (0x7E - 0x40)
            || (unsigned) (byte - 0xA1) <= (0xFE - 0xA1))
        {
            if (byte < 0x7F) {
                /* Max index == (0xFE - 0x81) * 157 + (0x7E - 0x40) == 19687 */
                index = (lead - 0x81) * 157 + (byte - 0x40);
            }
            else {
                /* Max index == (0xFE - 0x81) * 157 + (0xFE - 0x62) == 19781 */
                index = (lead - 0x81) * 157 + (byte - 0x62);
            }
        }

        /*
         * Indexes that decode to two code points:
         *
         * 1133 U+00CA U+0304 (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND MACRON)
         * 1135 U+00CA U+030C (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND CARON)
         * 1164 U+00EA U+0304 (LATIN SMALL LETTER E WITH CIRCUMFLEX AND MACRON)
         * 1166 U+00EA U+030C (LATIN SMALL LETTER E WITH CIRCUMFLEX AND CARON)
         *
         * When both do not fit, they are parked in ctx->u.lead and
         * ctx->second_codepoint and written at the start of the next call.
         */
        switch (index) {
            case 1133:
                if ((ctx->buffer_used + 2) > ctx->buffer_length) {
                    ctx->u.lead = 0x00CA;
                    ctx->second_codepoint = 0x0304;

                    return LXB_STATUS_SMALL_BUFFER;
                }

                LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00CA);
                LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x0304);

                continue;

            case 1135:
                if ((ctx->buffer_used + 2) > ctx->buffer_length) {
                    ctx->u.lead = 0x00CA;
                    ctx->second_codepoint = 0x030C;

                    return LXB_STATUS_SMALL_BUFFER;
                }

                LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00CA);
                LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x030C);

                continue;

            case 1164:
                if ((ctx->buffer_used + 2) > ctx->buffer_length) {
                    ctx->u.lead = 0x00EA;
                    ctx->second_codepoint = 0x0304;

                    return LXB_STATUS_SMALL_BUFFER;
                }

                LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00EA);
                LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x0304);

                continue;

            case 1166:
                if ((ctx->buffer_used + 2) > ctx->buffer_length) {
                    ctx->u.lead = 0x00EA;
                    ctx->second_codepoint = 0x030C;

                    return LXB_STATUS_SMALL_BUFFER;
                }

                LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00EA);
                LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x030C);

                continue;

            case 0:
                /* Trail byte out of range (or index 0): invalid sequence. */
                LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
                continue;
        }

        ctx->codepoint = lxb_encoding_multi_index_big5[index].codepoint;
        if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
            LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
            continue;
        }

        LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
    }

    return LXB_STATUS_OK;
}
370
/*
 * Streaming EUC-JP decoder.
 *
 * Decodes the bytes in [*data, end) into ctx->buffer_out and advances *data
 * as input is consumed. State kept in ctx->u.euc_jp between calls:
 *
 *   - lead:       pending lead byte when the input ended inside a sequence,
 *                 or 0x01 together with ctx->have_error when a replacement
 *                 sequence is still owed to the output;
 *   - is_jis0212: the pending lead is the second byte of a three-byte
 *                 sequence introduced by 0x8F (looked up in the jis0212
 *                 table instead of jis0208).
 *
 * @return LXB_STATUS_OK           all input consumed;
 *         LXB_STATUS_CONTINUE     more input is needed to finish a sequence
 *                                 (also stored in ctx->status);
 *         LXB_STATUS_SMALL_BUFFER the output buffer is full;
 *         LXB_STATUS_ERROR        invalid input and ctx->replace_to is NULL.
 */
lxb_status_t
lxb_encoding_decode_euc_jp(lxb_encoding_decode_t *ctx,
                           const lxb_char_t **data, const lxb_char_t *end)
{
    bool is_jis0212;
    lxb_char_t byte, lead;

    ctx->status = LXB_STATUS_OK;

    /* Finish whatever the previous call left pending. */
    if (ctx->u.euc_jp.lead != 0x00) {
        if (ctx->have_error) {
            /* A replacement sequence is still owed. */
            ctx->have_error = false;
            ctx->u.euc_jp.lead = 0x00;

            LXB_ENCODING_DECODE_ERROR_BEGIN {
                /* Still no room: re-arm the pending-error state. */
                ctx->have_error = true;
                ctx->u.euc_jp.lead = 0x01;
            } LXB_ENCODING_DECODE_ERROR_END();
        }
        else {
            if (*data >= end) {
                ctx->status = LXB_STATUS_CONTINUE;

                return LXB_STATUS_CONTINUE;
            }

            LXB_ENCODING_DECODE_CHECK_OUT(ctx);

            lead = ctx->u.euc_jp.lead;
            byte = *(*data)++;

            ctx->u.euc_jp.lead = 0x00;

            /* Resume in the middle of a 0x8F (JIS X 0212) sequence. */
            if (ctx->u.euc_jp.is_jis0212) {
                is_jis0212 = true;
                ctx->u.euc_jp.is_jis0212 = false;

                goto lead_jis_state;
            }

            goto lead_state;
        }
    }

    while (*data < end) {
        /* Guarantees one free slot for the unchecked appends below. */
        LXB_ENCODING_DECODE_CHECK_OUT(ctx);

        lead = *(*data)++;

        if (lead < 0x80) {
            LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
            continue;
        }

        /* Valid lead bytes are 0x8E, 0x8F and 0xA1..0xFE. */
        if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
            && (lead != 0x8E && lead != 0x8F))
        {
            LXB_ENCODING_DECODE_ERROR_BEGIN {
                /* No room for the replacement: re-read this byte later. */
                (*data)--;
            }
            LXB_ENCODING_DECODE_ERROR_END();

            continue;
        }

        /* Input ended after the lead byte: remember it. */
        if (*data >= end) {
            ctx->u.euc_jp.lead = lead;
            ctx->status = LXB_STATUS_CONTINUE;

            return LXB_STATUS_CONTINUE;
        }

        byte = *(*data)++;

    lead_state:

        /* 0x8E + 0xA1..0xDF maps linearly onto U+FF61..U+FF9F. */
        if (lead == 0x8E && (unsigned) (byte - 0xA1) <= (0xDF - 0xA1)) {
            LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0xFF61 - 0xA1 + byte);
            continue;
        }

        is_jis0212 = false;

        /* 0x8F introduces a three-byte sequence: shift one byte forward. */
        if (lead == 0x8F && (unsigned) (byte - 0xA1) <= (0xFE - 0xA1)) {
            if (*data >= end) {
                ctx->u.euc_jp.lead = byte;
                ctx->u.euc_jp.is_jis0212 = true;

                ctx->status = LXB_STATUS_CONTINUE;

                return LXB_STATUS_CONTINUE;
            }

            lead = byte;
            byte = *(*data)++;
            is_jis0212 = true;
        }

    lead_jis_state:

        /* Both bytes of the pair must lie in 0xA1..0xFE. */
        if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
            || (unsigned) (byte - 0xA1) > (0xFE - 0xA1))
        {
            LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
            continue;
        }

        /* Max index == (0xFE - 0xA1) * 94 + 0xFE - 0xA1 == 8835 */
        ctx->codepoint = (lead - 0xA1) * 94 + byte - 0xA1;

        /* ctx->codepoint holds the table index until the lookup is done. */
        if (is_jis0212) {
            if ((sizeof(lxb_encoding_multi_index_jis0212)
                 / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
            {
                LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
                continue;
            }

            ctx->codepoint = lxb_encoding_multi_index_jis0212[ctx->codepoint].codepoint;
        }
        else {
            if ((sizeof(lxb_encoding_multi_index_jis0208)
                 / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
            {
                LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
                continue;
            }

            ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
        }

        if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
            LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
            continue;
        }

        LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
    }

    return LXB_STATUS_OK;
}
512
/*
 * Streaming EUC-KR decoder.
 *
 * Decodes the bytes in [*data, end) into ctx->buffer_out and advances *data
 * as input is consumed. ctx->u.lead carries a pending lead byte between calls
 * when the input ended inside a two-byte sequence; the value 0x01 together
 * with ctx->have_error means a replacement sequence is still owed.
 *
 * @return LXB_STATUS_OK           all input consumed;
 *         LXB_STATUS_CONTINUE     more input is needed to finish a sequence
 *                                 (also stored in ctx->status);
 *         LXB_STATUS_SMALL_BUFFER the output buffer is full;
 *         LXB_STATUS_ERROR        invalid input and ctx->replace_to is NULL.
 */
lxb_status_t
lxb_encoding_decode_euc_kr(lxb_encoding_decode_t *ctx,
                           const lxb_char_t **data, const lxb_char_t *end)
{
    lxb_char_t lead, byte;

    ctx->status = LXB_STATUS_OK;

    /* Finish whatever the previous call left pending. */
    if (ctx->u.lead != 0x00) {
        if (ctx->have_error) {
            /* A replacement sequence is still owed. */
            ctx->have_error = false;
            ctx->u.lead = 0x00;

            LXB_ENCODING_DECODE_ERROR_BEGIN {
                /* Still no room: re-arm the pending-error state. */
                ctx->have_error = true;
                ctx->u.lead = 0x01;
            } LXB_ENCODING_DECODE_ERROR_END();
        }
        else {
            /* A lead byte is waiting for its trail byte. */
            if (*data >= end) {
                ctx->status = LXB_STATUS_CONTINUE;

                return LXB_STATUS_CONTINUE;
            }

            LXB_ENCODING_DECODE_CHECK_OUT(ctx);

            lead = (lxb_char_t) ctx->u.lead;
            ctx->u.lead = 0x00;

            goto lead_state;
        }
    }

    while (*data < end) {
        /* Guarantees one free slot for the unchecked appends below. */
        LXB_ENCODING_DECODE_CHECK_OUT(ctx);

        lead = *(*data)++;

        if (lead < 0x80) {
            LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
            continue;
        }

        /* Valid lead bytes are 0x81..0xFE. */
        if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
            LXB_ENCODING_DECODE_ERROR_BEGIN {
                /* No room for the replacement: re-read this byte later. */
                (*data)--;
            }
            LXB_ENCODING_DECODE_ERROR_END();

            continue;
        }

        /* Input ended between lead and trail byte: remember the lead. */
        if (*data == end) {
            ctx->u.lead = lead;
            ctx->status = LXB_STATUS_CONTINUE;

            return LXB_STATUS_CONTINUE;
        }

    lead_state:

        byte = *(*data)++;

        /* Valid trail bytes are 0x41..0xFE. */
        if ((unsigned) (byte - 0x41) > (0xFE - 0x41)) {
            LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
            continue;
        }

        /* Max index == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
        ctx->codepoint = (lead - 0x81) * 190 + (byte - 0x41);

        /* ctx->codepoint holds the table index until the lookup is done. */
        if (ctx->codepoint >= sizeof(lxb_encoding_multi_index_euc_kr)
                              / sizeof(lxb_encoding_multi_index_t))
        {
            LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
            continue;
        }

        ctx->codepoint = lxb_encoding_multi_index_euc_kr[ctx->codepoint].codepoint;
        if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
            LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
            continue;
        }

        LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
    }

    return LXB_STATUS_OK;
}
603
604 lxb_status_t
lxb_encoding_decode_gbk(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)605 lxb_encoding_decode_gbk(lxb_encoding_decode_t *ctx,
606 const lxb_char_t **data, const lxb_char_t *end)
607 {
608 return lxb_encoding_decode_gb18030(ctx, data, end);
609 }
610
611 lxb_status_t
lxb_encoding_decode_ibm866(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)612 lxb_encoding_decode_ibm866(lxb_encoding_decode_t *ctx,
613 const lxb_char_t **data, const lxb_char_t *end)
614 {
615 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_ibm866);
616
617 return LXB_STATUS_OK;
618 }
619
620 lxb_status_t
lxb_encoding_decode_iso_2022_jp(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)621 lxb_encoding_decode_iso_2022_jp(lxb_encoding_decode_t *ctx,
622 const lxb_char_t **data, const lxb_char_t *end)
623 {
624 #define LXB_ENCODING_DECODE_ISO_2022_JP_OK() \
625 do { \
626 if (*data >= end) { \
627 return LXB_STATUS_OK; \
628 } \
629 } \
630 while (0)
631
632 #define LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE() \
633 do { \
634 if (*data >= end) { \
635 ctx->status = LXB_STATUS_CONTINUE; \
636 return LXB_STATUS_CONTINUE; \
637 } \
638 } \
639 while (0)
640
641
642 lxb_char_t byte;
643 lxb_encoding_ctx_2022_jp_t *iso = &ctx->u.iso_2022_jp;
644
645 ctx->status = LXB_STATUS_OK;
646
647 if (ctx->have_error) {
648 ctx->have_error = false;
649
650 LXB_ENCODING_DECODE_ERROR_BEGIN {
651 ctx->have_error = true;
652 }
653 LXB_ENCODING_DECODE_ERROR_END();
654 }
655
656 if (iso->prepand != 0x00) {
657 if (*data >= end) {
658 ctx->status = LXB_STATUS_CONTINUE;
659
660 return LXB_STATUS_CONTINUE;
661 }
662
663 LXB_ENCODING_DECODE_CHECK_OUT(ctx);
664
665 byte = iso->prepand;
666 iso->prepand = 0x00;
667
668 goto prepand;
669 }
670
671 if (*data >= end) {
672 return LXB_STATUS_OK;
673 }
674
675 do {
676 LXB_ENCODING_DECODE_CHECK_OUT(ctx);
677
678 byte = *(*data)++;
679
680 prepand:
681
682 switch (iso->state) {
683 case LXB_ENCODING_DECODE_2022_JP_ASCII:
684 if (byte == 0x1B) {
685 iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
686
687 LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
688 break;
689 }
690
691 /* 0x00 to 0x7F, excluding 0x0E, 0x0F, and 0x1B */
692 if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)
693 && byte != 0x0E && byte != 0x0F)
694 {
695 iso->out_flag = false;
696
697 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, byte);
698 LXB_ENCODING_DECODE_ISO_2022_JP_OK();
699 break;
700 }
701
702 iso->out_flag = false;
703
704 LXB_ENCODING_DECODE_ERROR_BEGIN {
705 ctx->have_error = true;
706 }
707 LXB_ENCODING_DECODE_ERROR_END();
708
709 LXB_ENCODING_DECODE_ISO_2022_JP_OK();
710 break;
711
712 case LXB_ENCODING_DECODE_2022_JP_ROMAN:
713 switch (byte) {
714 case 0x1B:
715 iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
716
717 LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
718 continue;
719
720 case 0x5C:
721 iso->out_flag = false;
722
723 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00A5);
724 LXB_ENCODING_DECODE_ISO_2022_JP_OK();
725
726 continue;
727
728 case 0x7E:
729 iso->out_flag = false;
730
731 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x203E);
732 LXB_ENCODING_DECODE_ISO_2022_JP_OK();
733
734 continue;
735
736 case 0x0E:
737 case 0x0F:
738 break;
739
740 default:
741 /* 0x00 to 0x7F */
742 if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)) {
743 iso->out_flag = false;
744
745 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, byte);
746 LXB_ENCODING_DECODE_ISO_2022_JP_OK();
747
748 continue;
749 }
750
751 break;
752 }
753
754 iso->out_flag = false;
755
756 LXB_ENCODING_DECODE_ERROR_BEGIN {
757 ctx->have_error = true;
758 }
759 LXB_ENCODING_DECODE_ERROR_END();
760
761 LXB_ENCODING_DECODE_ISO_2022_JP_OK();
762 break;
763
764 case LXB_ENCODING_DECODE_2022_JP_KATAKANA:
765 if (byte == 0x1B) {
766 iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
767
768 LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
769 break;
770 }
771
772 /* 0x21 to 0x5F */
773 if ((unsigned) (byte - 0x21) <= (0x5F - 0x21)) {
774 iso->out_flag = false;
775
776 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx,
777 0xFF61 - 0x21 + byte);
778 LXB_ENCODING_DECODE_ISO_2022_JP_OK();
779 break;
780 }
781
782 iso->out_flag = false;
783
784 LXB_ENCODING_DECODE_ERROR_BEGIN {
785 ctx->have_error = true;
786 }
787 LXB_ENCODING_DECODE_ERROR_END();
788
789 LXB_ENCODING_DECODE_ISO_2022_JP_OK();
790 break;
791
792 case LXB_ENCODING_DECODE_2022_JP_LEAD:
793 if (byte == 0x1B) {
794 iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
795
796 LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
797 break;
798 }
799
800 /* 0x21 to 0x7E */
801 if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
802 iso->out_flag = false;
803 iso->lead = byte;
804 iso->state = LXB_ENCODING_DECODE_2022_JP_TRAIL;
805
806 LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
807 break;
808 }
809
810 iso->out_flag = false;
811
812 LXB_ENCODING_DECODE_ERROR_BEGIN {
813 ctx->have_error = true;
814 }
815 LXB_ENCODING_DECODE_ERROR_END();
816
817 LXB_ENCODING_DECODE_ISO_2022_JP_OK();
818 break;
819
820 case LXB_ENCODING_DECODE_2022_JP_TRAIL:
821 if (byte == 0x1B) {
822 iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
823
824 LXB_ENCODING_DECODE_ERROR_BEGIN {
825 ctx->have_error = true;
826 }
827 LXB_ENCODING_DECODE_ERROR_END();
828
829 LXB_ENCODING_DECODE_ISO_2022_JP_OK();
830 break;
831 }
832
833 iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
834
835 /* 0x21 to 0x7E */
836 if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
837 /* Max index == (0x7E - 0x21) * 94 + 0x7E - 0x21 == 8835 */
838 ctx->codepoint = (iso->lead - 0x21) * 94 + byte - 0x21;
839
840 ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
841
842 if (ctx->codepoint != LXB_ENCODING_ERROR_CODEPOINT) {
843 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
844 LXB_ENCODING_DECODE_ISO_2022_JP_OK();
845
846 break;
847 }
848 }
849
850 LXB_ENCODING_DECODE_ERROR_BEGIN {
851 iso->prepand = 0x01;
852 ctx->have_error = true;
853 }
854 LXB_ENCODING_DECODE_ERROR_END();
855
856 LXB_ENCODING_DECODE_ISO_2022_JP_OK();
857 break;
858
859 case LXB_ENCODING_DECODE_2022_JP_ESCAPE_START:
860 if (byte == 0x24 || byte == 0x28) {
861 iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE;
862 iso->lead = byte;
863
864 LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
865 break;
866 }
867
868 (*data)--;
869
870 iso->out_flag = false;
871 iso->state = ctx->u.iso_2022_jp.out_state;
872
873 LXB_ENCODING_DECODE_ERROR_BEGIN {
874 iso->prepand = 0x01;
875 ctx->have_error = true;
876 }
877 LXB_ENCODING_DECODE_ERROR_END();
878
879 break;
880
881 case LXB_ENCODING_DECODE_2022_JP_ESCAPE:
882 iso->state = LXB_ENCODING_DECODE_2022_JP_UNSET;
883
884 if (iso->lead == 0x28) {
885 if (byte == 0x42) {
886 iso->state = LXB_ENCODING_DECODE_2022_JP_ASCII;
887 }
888 else if (byte == 0x4A) {
889 iso->state = LXB_ENCODING_DECODE_2022_JP_ROMAN;
890 }
891 else if (byte == 0x49) {
892 iso->state = LXB_ENCODING_DECODE_2022_JP_KATAKANA;
893 }
894 }
895 else if (iso->lead == 0x24) {
896 if (byte == 0x40 || byte == 0x42) {
897 iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
898 }
899 }
900
901 if (iso->state == LXB_ENCODING_DECODE_2022_JP_UNSET) {
902 (*data)--;
903
904 iso->out_flag = false;
905 iso->state = iso->out_state;
906
907 LXB_ENCODING_DECODE_ERROR_BEGIN {
908 iso->prepand = iso->lead;
909 iso->lead = 0x00;
910
911 ctx->have_error = true;
912 }
913 LXB_ENCODING_DECODE_ERROR_END();
914
915 byte = iso->lead;
916 iso->lead = 0x00;
917
918 goto prepand;
919 }
920
921 iso->lead = 0x00;
922 iso->out_state = iso->state;
923
924 if (iso->out_flag) {
925 LXB_ENCODING_DECODE_ERROR_BEGIN {
926 ctx->have_error = true;
927 }
928 LXB_ENCODING_DECODE_ERROR_END();
929
930 LXB_ENCODING_DECODE_ISO_2022_JP_OK();
931 break;
932 }
933
934 iso->out_flag = true;
935
936 LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
937 break;
938 }
939 }
940 while (true);
941
942 return LXB_STATUS_OK;
943
944 #undef LXB_ENCODING_DECODE_ISO_2022_JP_OK
945 #undef LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE
946 }
947
948 lxb_status_t
lxb_encoding_decode_iso_8859_10(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)949 lxb_encoding_decode_iso_8859_10(lxb_encoding_decode_t *ctx,
950 const lxb_char_t **data, const lxb_char_t *end)
951 {
952 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_10);
953
954 return LXB_STATUS_OK;
955 }
956
957 lxb_status_t
lxb_encoding_decode_iso_8859_13(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)958 lxb_encoding_decode_iso_8859_13(lxb_encoding_decode_t *ctx,
959 const lxb_char_t **data, const lxb_char_t *end)
960 {
961 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_13);
962
963 return LXB_STATUS_OK;
964 }
965
966 lxb_status_t
lxb_encoding_decode_iso_8859_14(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)967 lxb_encoding_decode_iso_8859_14(lxb_encoding_decode_t *ctx,
968 const lxb_char_t **data, const lxb_char_t *end)
969 {
970 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_14);
971
972 return LXB_STATUS_OK;
973 }
974
975 lxb_status_t
lxb_encoding_decode_iso_8859_15(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)976 lxb_encoding_decode_iso_8859_15(lxb_encoding_decode_t *ctx,
977 const lxb_char_t **data, const lxb_char_t *end)
978 {
979 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_15);
980
981 return LXB_STATUS_OK;
982 }
983
984 lxb_status_t
lxb_encoding_decode_iso_8859_16(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)985 lxb_encoding_decode_iso_8859_16(lxb_encoding_decode_t *ctx,
986 const lxb_char_t **data, const lxb_char_t *end)
987 {
988 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_16);
989
990 return LXB_STATUS_OK;
991 }
992
993 lxb_status_t
lxb_encoding_decode_iso_8859_2(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)994 lxb_encoding_decode_iso_8859_2(lxb_encoding_decode_t *ctx,
995 const lxb_char_t **data, const lxb_char_t *end)
996 {
997 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_2);
998
999 return LXB_STATUS_OK;
1000 }
1001
1002 lxb_status_t
lxb_encoding_decode_iso_8859_3(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1003 lxb_encoding_decode_iso_8859_3(lxb_encoding_decode_t *ctx,
1004 const lxb_char_t **data, const lxb_char_t *end)
1005 {
1006 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_3);
1007
1008 return LXB_STATUS_OK;
1009 }
1010
1011 lxb_status_t
lxb_encoding_decode_iso_8859_4(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1012 lxb_encoding_decode_iso_8859_4(lxb_encoding_decode_t *ctx,
1013 const lxb_char_t **data, const lxb_char_t *end)
1014 {
1015 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_4);
1016
1017 return LXB_STATUS_OK;
1018 }
1019
1020 lxb_status_t
lxb_encoding_decode_iso_8859_5(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1021 lxb_encoding_decode_iso_8859_5(lxb_encoding_decode_t *ctx,
1022 const lxb_char_t **data, const lxb_char_t *end)
1023 {
1024 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_5);
1025
1026 return LXB_STATUS_OK;
1027 }
1028
1029 lxb_status_t
lxb_encoding_decode_iso_8859_6(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1030 lxb_encoding_decode_iso_8859_6(lxb_encoding_decode_t *ctx,
1031 const lxb_char_t **data, const lxb_char_t *end)
1032 {
1033 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_6);
1034
1035 return LXB_STATUS_OK;
1036 }
1037
1038 lxb_status_t
lxb_encoding_decode_iso_8859_7(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1039 lxb_encoding_decode_iso_8859_7(lxb_encoding_decode_t *ctx,
1040 const lxb_char_t **data, const lxb_char_t *end)
1041 {
1042 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_7);
1043
1044 return LXB_STATUS_OK;
1045 }
1046
1047 lxb_status_t
lxb_encoding_decode_iso_8859_8(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1048 lxb_encoding_decode_iso_8859_8(lxb_encoding_decode_t *ctx,
1049 const lxb_char_t **data, const lxb_char_t *end)
1050 {
1051 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_8);
1052
1053 return LXB_STATUS_OK;
1054 }
1055
1056 lxb_status_t
lxb_encoding_decode_iso_8859_8_i(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1057 lxb_encoding_decode_iso_8859_8_i(lxb_encoding_decode_t *ctx,
1058 const lxb_char_t **data, const lxb_char_t *end)
1059 {
1060 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_8);
1061
1062 return LXB_STATUS_OK;
1063 }
1064
1065 lxb_status_t
lxb_encoding_decode_koi8_r(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1066 lxb_encoding_decode_koi8_r(lxb_encoding_decode_t *ctx,
1067 const lxb_char_t **data, const lxb_char_t *end)
1068 {
1069 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_koi8_r);
1070
1071 return LXB_STATUS_OK;
1072 }
1073
1074 lxb_status_t
lxb_encoding_decode_koi8_u(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1075 lxb_encoding_decode_koi8_u(lxb_encoding_decode_t *ctx,
1076 const lxb_char_t **data, const lxb_char_t *end)
1077 {
1078 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_koi8_u);
1079
1080 return LXB_STATUS_OK;
1081 }
1082
1083 lxb_status_t
lxb_encoding_decode_shift_jis(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1084 lxb_encoding_decode_shift_jis(lxb_encoding_decode_t *ctx,
1085 const lxb_char_t **data, const lxb_char_t *end)
1086 {
1087 lxb_char_t byte, lead;
1088
1089 ctx->status = LXB_STATUS_OK;
1090
1091 if (ctx->u.lead != 0x00) {
1092 if (ctx->have_error) {
1093 ctx->have_error = false;
1094 ctx->u.lead = 0x00;
1095
1096 LXB_ENCODING_DECODE_ERROR_BEGIN {
1097 ctx->have_error = true;
1098 ctx->u.lead = 0x01;
1099 } LXB_ENCODING_DECODE_ERROR_END();
1100 }
1101 else {
1102 if (*data >= end) {
1103 ctx->status = LXB_STATUS_CONTINUE;
1104
1105 return LXB_STATUS_CONTINUE;
1106 }
1107
1108 LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1109
1110 lead = (lxb_char_t) ctx->u.lead;
1111 ctx->u.lead = 0x00;
1112
1113 goto lead_state;
1114 }
1115 }
1116
1117 while (*data < end) {
1118 LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1119
1120 lead = *(*data)++;
1121
1122 if (lead <= 0x80) {
1123 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
1124 continue;
1125 }
1126
1127 if ((unsigned) (lead - 0xA1) <= (0xDF - 0xA1)) {
1128 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0xFF61 - 0xA1 + lead);
1129 continue;
1130 }
1131
1132 if ((unsigned) (lead - 0x81) > (0x9F - 0x81)
1133 && lead != 0xE0 && lead != 0xFC)
1134 {
1135 LXB_ENCODING_DECODE_ERROR_BEGIN {
1136 (*data)--;
1137 }
1138 LXB_ENCODING_DECODE_ERROR_END();
1139
1140 continue;
1141 }
1142
1143 if (*data >= end) {
1144 ctx->u.lead = lead;
1145 ctx->status = LXB_STATUS_CONTINUE;
1146
1147 return LXB_STATUS_CONTINUE;
1148 }
1149
1150 lead_state:
1151
1152 byte = *(*data)++;
1153
1154 if (byte < 0x7F) {
1155 ctx->codepoint = 0x40;
1156 }
1157 else {
1158 ctx->codepoint = 0x41;
1159 }
1160
1161 if (lead < 0xA0) {
1162 ctx->second_codepoint = 0x81;
1163 }
1164 else {
1165 ctx->second_codepoint = 0xC1;
1166 }
1167
1168 if ((unsigned) (byte - 0x40) > (0x7E - 0x40)
1169 && (unsigned) (byte - 0x80) > (0xFC - 0x80))
1170 {
1171 LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
1172 continue;
1173 }
1174
1175 /* Max index == (0xFC - 0xC1) * 188 + 0xFC - 0x41 = 11279 */
1176 ctx->codepoint = (lead - ctx->second_codepoint) * 188
1177 + byte - ctx->codepoint;
1178
1179 if (ctx->codepoint >= (sizeof(lxb_encoding_multi_index_jis0208)
1180 / sizeof(lxb_encoding_multi_index_t)))
1181 {
1182 LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
1183 continue;
1184 }
1185
1186 if ((unsigned) (ctx->codepoint - 8836) <= (10715 - 8836)) {
1187 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0xE000 - 8836 + ctx->codepoint);
1188 continue;
1189 }
1190
1191 ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
1192 if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
1193 LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
1194 continue;
1195 }
1196
1197 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1198 }
1199
1200 return LXB_STATUS_OK;
1201 }
1202
1203 lxb_inline lxb_status_t
lxb_encoding_decode_utf_16(lxb_encoding_decode_t * ctx,bool is_be,const lxb_char_t ** data,const lxb_char_t * end)1204 lxb_encoding_decode_utf_16(lxb_encoding_decode_t *ctx, bool is_be,
1205 const lxb_char_t **data, const lxb_char_t *end)
1206 {
1207 unsigned lead;
1208 lxb_codepoint_t unit;
1209
1210 ctx->status = LXB_STATUS_OK;
1211
1212 if (ctx->have_error) {
1213 ctx->have_error = false;
1214
1215 LXB_ENCODING_DECODE_ERROR_BEGIN {
1216 ctx->have_error = true;
1217 }
1218 LXB_ENCODING_DECODE_ERROR_END();
1219 }
1220
1221 if (ctx->u.lead != 0x00) {
1222 if (*data >= end) {
1223 ctx->status = LXB_STATUS_CONTINUE;
1224
1225 return LXB_STATUS_CONTINUE;
1226 }
1227
1228 LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1229
1230 lead = ctx->u.lead - 0x01;
1231 ctx->u.lead = 0x00;
1232
1233 goto lead_state;
1234 }
1235
1236 while (*data < end) {
1237 LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1238
1239 pair_state:
1240
1241 lead = *(*data)++;
1242
1243 if (*data >= end) {
1244 ctx->u.lead = lead + 0x01;
1245 ctx->status = LXB_STATUS_CONTINUE;
1246
1247 return LXB_STATUS_CONTINUE;
1248 }
1249
1250 lead_state:
1251
1252 /* For UTF-16BE or UTF-16LE */
1253 if (is_be) {
1254 unit = (lead << 8) + *(*data)++;
1255 }
1256 else {
1257 unit = (*(*data)++ << 8) + lead;
1258 }
1259
1260 if (ctx->second_codepoint != 0x00) {
1261 if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
1262 ctx->codepoint = 0x10000 + ((ctx->second_codepoint - 0xD800) << 10)
1263 + (unit - 0xDC00);
1264
1265 ctx->second_codepoint = 0x00;
1266
1267 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1268 continue;
1269 }
1270
1271 (*data)--;
1272
1273 ctx->second_codepoint = 0x00;
1274
1275 LXB_ENCODING_DECODE_ERROR_BEGIN {
1276 ctx->have_error = true;
1277
1278 ctx->u.lead = lead + 0x01;
1279 }
1280 LXB_ENCODING_DECODE_ERROR_END();
1281
1282 goto lead_state;
1283 }
1284
1285 /* Surrogate pair */
1286 if ((unsigned) (unit - 0xD800) <= (0xDFFF - 0xD800)) {
1287 if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
1288 LXB_ENCODING_DECODE_ERROR_BEGIN {
1289 ctx->have_error = true;
1290 }
1291 LXB_ENCODING_DECODE_ERROR_END();
1292
1293 continue;
1294 }
1295
1296 ctx->second_codepoint = unit;
1297
1298 if (*data >= end) {
1299 ctx->status = LXB_STATUS_CONTINUE;
1300
1301 return LXB_STATUS_CONTINUE;
1302 }
1303
1304 goto pair_state;
1305 }
1306
1307 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, unit);
1308 }
1309
1310 return LXB_STATUS_OK;
1311 }
1312
1313 lxb_status_t
lxb_encoding_decode_utf_16be(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1314 lxb_encoding_decode_utf_16be(lxb_encoding_decode_t *ctx,
1315 const lxb_char_t **data, const lxb_char_t *end)
1316 {
1317 return lxb_encoding_decode_utf_16(ctx, true, data, end);
1318 }
1319
1320 lxb_status_t
lxb_encoding_decode_utf_16le(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1321 lxb_encoding_decode_utf_16le(lxb_encoding_decode_t *ctx,
1322 const lxb_char_t **data, const lxb_char_t *end)
1323 {
1324 return lxb_encoding_decode_utf_16(ctx, false, data, end);
1325 }
1326
1327 lxb_status_t
lxb_encoding_decode_utf_8(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1328 lxb_encoding_decode_utf_8(lxb_encoding_decode_t *ctx,
1329 const lxb_char_t **data, const lxb_char_t *end)
1330 {
1331 unsigned need;
1332 lxb_char_t ch;
1333 const lxb_char_t *p = *data;
1334
1335 ctx->status = LXB_STATUS_OK;
1336
1337 if (ctx->have_error) {
1338 ctx->have_error = false;
1339
1340 LXB_ENCODING_DECODE_ERROR_BEGIN {
1341 ctx->have_error = true;
1342 }
1343 LXB_ENCODING_DECODE_ERROR_END();
1344 }
1345
1346 if (ctx->u.utf_8.need != 0) {
1347 if (p >= end) {
1348 ctx->status = LXB_STATUS_CONTINUE;
1349
1350 return LXB_STATUS_CONTINUE;
1351 }
1352
1353 LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1354
1355 need = ctx->u.utf_8.need;
1356 ctx->u.utf_8.need = 0;
1357
1358 if (ctx->u.utf_8.lower != 0x00) {
1359 LXB_ENCODING_DECODE_UTF_8_BOUNDARY(ctx->u.utf_8.lower,
1360 ctx->u.utf_8.upper, goto begin);
1361 ctx->u.utf_8.lower = 0x00;
1362 }
1363
1364 goto decode;
1365 }
1366
1367 begin:
1368
1369 while (p < end) {
1370 if (ctx->buffer_used >= ctx->buffer_length) {
1371 *data = p;
1372
1373 return LXB_STATUS_SMALL_BUFFER;
1374 }
1375
1376 ch = *p++;
1377
1378 if (ch < 0x80) {
1379 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ch);
1380 continue;
1381 }
1382 else if (ch <= 0xDF) {
1383 if (ch < 0xC2) {
1384 LXB_ENCODING_DECODE_ERROR_BEGIN {
1385 *data = p - 1;
1386 }
1387 LXB_ENCODING_DECODE_ERROR_END();
1388
1389 continue;
1390 }
1391
1392 need = 1;
1393 ctx->codepoint = ch & 0x1F;
1394 }
1395 else if (ch < 0xF0) {
1396 need = 2;
1397 ctx->codepoint = ch & 0x0F;
1398
1399 if (p == end) {
1400 LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET(0xE0, 0xED, 0xA0, 0x9F);
1401
1402 *data = p;
1403
1404 ctx->u.utf_8.need = need;
1405 ctx->status = LXB_STATUS_CONTINUE;
1406
1407 return LXB_STATUS_CONTINUE;
1408 }
1409
1410 if (ch == 0xE0) {
1411 LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0xA0, 0xBF, continue);
1412 }
1413 else if (ch == 0xED) {
1414 LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0x80, 0x9F, continue);
1415 }
1416 }
1417 else if (ch < 0xF5) {
1418 need = 3;
1419 ctx->codepoint = ch & 0x07;
1420
1421 if (p == end) {
1422 LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET(0xF0, 0xF4, 0x90, 0x8F);
1423
1424 *data = p;
1425
1426 ctx->u.utf_8.need = need;
1427 ctx->status = LXB_STATUS_CONTINUE;
1428
1429 return LXB_STATUS_CONTINUE;
1430 }
1431
1432 if (ch == 0xF0) {
1433 LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0x90, 0xBF, continue);
1434 }
1435 else if (ch == 0xF4) {
1436 LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0x80, 0x8F, continue);
1437 }
1438 }
1439 else {
1440 LXB_ENCODING_DECODE_ERROR_BEGIN {
1441 *data = p - 1;
1442 }
1443 LXB_ENCODING_DECODE_ERROR_END();
1444
1445 continue;
1446 }
1447
1448 decode:
1449
1450 do {
1451 if (p >= end) {
1452 *data = p;
1453
1454 ctx->u.utf_8.need = need;
1455 ctx->status = LXB_STATUS_CONTINUE;
1456
1457 return LXB_STATUS_CONTINUE;
1458 }
1459
1460 ch = *p++;
1461
1462 if (ch < 0x80 || ch > 0xBF) {
1463 p--;
1464
1465 ctx->u.utf_8.need = 0;
1466
1467 LXB_ENCODING_DECODE_ERROR_BEGIN {
1468 *data = p;
1469 ctx->have_error = true;
1470 }
1471 LXB_ENCODING_DECODE_ERROR_END();
1472
1473 break;
1474 }
1475
1476 ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);
1477
1478 if (--need == 0) {
1479 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1480
1481 break;
1482 }
1483 }
1484 while (true);
1485 }
1486
1487 *data = p;
1488
1489 return LXB_STATUS_OK;
1490 }
1491
1492 lxb_inline lxb_codepoint_t
lxb_encoding_decode_gb18030_range(uint32_t index)1493 lxb_encoding_decode_gb18030_range(uint32_t index)
1494 {
1495 size_t mid, left, right;
1496 const lxb_encoding_range_index_t *range;
1497
1498 /*
1499 * Pointer greater than 39419 and less than 189000,
1500 * or pointer is greater than 1237575
1501 */
1502 if ((unsigned) (index - 39419) < (189000 - 39419)
1503 || index > 1237575)
1504 {
1505 return LXB_ENCODING_ERROR_CODEPOINT;
1506 }
1507
1508 if (index == 7457) {
1509 return 0xE7C7;
1510 }
1511
1512 left = 0;
1513 right = LXB_ENCODING_RANGE_INDEX_GB18030_SIZE;
1514 range = lxb_encoding_range_index_gb18030;
1515
1516 /* Some compilers say about uninitialized mid */
1517 mid = 0;
1518
1519 while (left < right) {
1520 mid = left + (right - left) / 2;
1521
1522 if (range[mid].index < index) {
1523 left = mid + 1;
1524
1525 if (left < right && range[ left ].index > index) {
1526 break;
1527 }
1528 }
1529 else if (range[mid].index > index) {
1530 right = mid - 1;
1531
1532 if (right > 0 && range[right].index <= index) {
1533 mid = right;
1534 break;
1535 }
1536 }
1537 else {
1538 break;
1539 }
1540 }
1541
1542 return range[mid].codepoint + index - range[mid].index;
1543 }
1544
1545 lxb_status_t
lxb_encoding_decode_gb18030(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1546 lxb_encoding_decode_gb18030(lxb_encoding_decode_t *ctx,
1547 const lxb_char_t **data, const lxb_char_t *end)
1548 {
1549 uint32_t pointer;
1550 lxb_char_t first, second, third, offset;
1551
1552 /* Make compiler happy */
1553 second = 0x00;
1554
1555 ctx->status = LXB_STATUS_OK;
1556
1557 if (ctx->have_error) {
1558 ctx->have_error = false;
1559
1560 LXB_ENCODING_DECODE_ERROR_BEGIN {
1561 ctx->have_error = true;
1562 }
1563 LXB_ENCODING_DECODE_ERROR_END();
1564 }
1565
1566 if (ctx->u.gb18030.first != 0) {
1567 if (*data >= end) {
1568 ctx->status = LXB_STATUS_CONTINUE;
1569
1570 return LXB_STATUS_CONTINUE;
1571 }
1572
1573 LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1574
1575 if (ctx->u.gb18030.third != 0x00) {
1576 first = ctx->u.gb18030.first;
1577 second = ctx->u.gb18030.second;
1578 third = ctx->u.gb18030.third;
1579
1580 memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));
1581
1582 if (ctx->prepend) {
1583 /* The first is always < 0x80 */
1584 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, second);
1585
1586 if (ctx->buffer_used == ctx->buffer_length) {
1587 ctx->u.gb18030.first = third;
1588
1589 return LXB_STATUS_SMALL_BUFFER;
1590 }
1591
1592 first = third;
1593 ctx->prepend = false;
1594
1595 goto prepend_first;
1596 }
1597
1598 goto third_state;
1599 }
1600 else if (ctx->u.gb18030.second != 0x00) {
1601 first = ctx->u.gb18030.first;
1602 second = ctx->u.gb18030.second;
1603
1604 memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));
1605
1606 goto second_state;
1607 }
1608
1609 first = ctx->u.gb18030.first;
1610 ctx->u.gb18030.first = 0x00;
1611
1612 if (ctx->prepend) {
1613 ctx->prepend = false;
1614 goto prepend_first;
1615 }
1616
1617 goto first_state;
1618 }
1619
1620 while (*data < end) {
1621 LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1622
1623 first = *(*data)++;
1624
1625 prepend_first:
1626
1627 if (first < 0x80) {
1628 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, first);
1629 continue;
1630 }
1631
1632 if (first == 0x80) {
1633 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x20AC);
1634 continue;
1635 }
1636
1637 /* Range 0x81 to 0xFE, inclusive */
1638 if ((unsigned) (first - 0x81) > (0xFE - 0x81)) {
1639 LXB_ENCODING_DECODE_ERROR_BEGIN {
1640 (*data)--;
1641 }
1642 LXB_ENCODING_DECODE_ERROR_END();
1643
1644 continue;
1645 }
1646
1647 if (*data == end) {
1648 ctx->u.gb18030.first = first;
1649 ctx->status = LXB_STATUS_CONTINUE;
1650
1651 return LXB_STATUS_CONTINUE;
1652 }
1653
1654 /* First */
1655 first_state:
1656
1657 second = *(*data)++;
1658
1659 /* Range 0x30 to 0x39, inclusive */
1660 if ((unsigned) (second - 0x30) > (0x39 - 0x30)) {
1661 offset = (second < 0x7F) ? 0x40 : 0x41;
1662
1663 /* Range 0x40 to 0x7E, inclusive, or 0x80 to 0xFE, inclusive */
1664 if ((unsigned) (second - 0x40) <= (0x7E - 0x40)
1665 || (unsigned) (second - 0x80) <= (0xFE - 0x80))
1666 {
1667 pointer = (first - 0x81) * 190 + (second - offset);
1668 }
1669 else {
1670 if (second < 0x80) {
1671 (*data)--;
1672 }
1673
1674 LXB_ENCODING_DECODE_ERROR_BEGIN {
1675 ctx->have_error = true;
1676 }
1677 LXB_ENCODING_DECODE_ERROR_END();
1678
1679 continue;
1680 }
1681
1682 /* Max pointer value == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
1683 ctx->codepoint = lxb_encoding_multi_index_gb18030[pointer].codepoint;
1684 if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
1685 if (second < 0x80) {
1686 (*data)--;
1687 }
1688
1689 LXB_ENCODING_DECODE_ERROR_BEGIN {
1690 ctx->have_error = true;
1691 }
1692 LXB_ENCODING_DECODE_ERROR_END();
1693
1694 continue;
1695 }
1696
1697 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1698 continue;
1699 }
1700
1701 if (*data == end) {
1702 ctx->u.gb18030.first = first;
1703 ctx->u.gb18030.second = second;
1704
1705 ctx->status = LXB_STATUS_CONTINUE;
1706
1707 return LXB_STATUS_CONTINUE;
1708 }
1709
1710 /* Second */
1711 second_state:
1712
1713 third = *(*data)++;
1714
1715 /* Range 0x81 to 0xFE, inclusive */
1716 if ((unsigned) (third - 0x81) > (0xFE - 0x81)) {
1717 (*data)--;
1718
1719 LXB_ENCODING_DECODE_ERROR_BEGIN {
1720 ctx->prepend = true;
1721 ctx->have_error = true;
1722 ctx->u.gb18030.first = second;
1723 }
1724 LXB_ENCODING_DECODE_ERROR_END();
1725
1726 first = second;
1727
1728 goto prepend_first;
1729 }
1730
1731 if (*data == end) {
1732 ctx->u.gb18030.first = first;
1733 ctx->u.gb18030.second = second;
1734 ctx->u.gb18030.third = third;
1735
1736 ctx->status = LXB_STATUS_CONTINUE;
1737
1738 return LXB_STATUS_CONTINUE;
1739 }
1740
1741 /* Third */
1742 third_state:
1743
1744 /* Range 0x30 to 0x39, inclusive */
1745 if ((unsigned) (**data - 0x30) > (0x39 - 0x30)) {
1746 ctx->prepend = true;
1747
1748 LXB_ENCODING_DECODE_ERROR_BEGIN {
1749 ctx->prepend = true;
1750 ctx->have_error = true;
1751
1752 /* First is a fake for trigger */
1753 ctx->u.gb18030.first = 0x01;
1754 ctx->u.gb18030.second = second;
1755 ctx->u.gb18030.third = third;
1756 }
1757 LXB_ENCODING_DECODE_ERROR_END();
1758
1759 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, second);
1760
1761 if (ctx->buffer_used == ctx->buffer_length) {
1762 ctx->prepend = true;
1763 ctx->have_error = true;
1764
1765 /* First is a fake for trigger */
1766 ctx->u.gb18030.first = 0x01;
1767 ctx->u.gb18030.second = second;
1768 ctx->u.gb18030.third = third;
1769
1770 return LXB_STATUS_SMALL_BUFFER;
1771 }
1772
1773 first = third;
1774
1775 goto prepend_first;
1776 }
1777
1778 pointer = ((first - 0x81) * (10 * 126 * 10))
1779 + ((second - 0x30) * (10 * 126))
1780 + ((third - 0x81) * 10) + (*(*data)++) - 0x30;
1781
1782 ctx->codepoint = lxb_encoding_decode_gb18030_range(pointer);
1783
1784 if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
1785 LXB_ENCODING_DECODE_ERROR_BEGIN {}
1786 LXB_ENCODING_DECODE_ERROR_END();
1787
1788 continue;
1789 }
1790
1791 LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1792 }
1793
1794 return LXB_STATUS_OK;
1795 }
1796
1797 lxb_status_t
lxb_encoding_decode_macintosh(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1798 lxb_encoding_decode_macintosh(lxb_encoding_decode_t *ctx,
1799 const lxb_char_t **data, const lxb_char_t *end)
1800 {
1801 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_macintosh);
1802
1803 return LXB_STATUS_OK;
1804 }
1805
1806 lxb_status_t
lxb_encoding_decode_replacement(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1807 lxb_encoding_decode_replacement(lxb_encoding_decode_t *ctx,
1808 const lxb_char_t **data, const lxb_char_t *end)
1809 {
1810 *data = end;
1811 return LXB_STATUS_ERROR;
1812 }
1813
1814 lxb_status_t
lxb_encoding_decode_windows_1250(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1815 lxb_encoding_decode_windows_1250(lxb_encoding_decode_t *ctx,
1816 const lxb_char_t **data, const lxb_char_t *end)
1817 {
1818 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1250);
1819
1820 return LXB_STATUS_OK;
1821 }
1822
1823 lxb_status_t
lxb_encoding_decode_windows_1251(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1824 lxb_encoding_decode_windows_1251(lxb_encoding_decode_t *ctx,
1825 const lxb_char_t **data, const lxb_char_t *end)
1826 {
1827 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1251);
1828
1829 return LXB_STATUS_OK;
1830 }
1831
1832 lxb_status_t
lxb_encoding_decode_windows_1252(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1833 lxb_encoding_decode_windows_1252(lxb_encoding_decode_t *ctx,
1834 const lxb_char_t **data, const lxb_char_t *end)
1835 {
1836 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1252);
1837
1838 return LXB_STATUS_OK;
1839 }
1840
1841 lxb_status_t
lxb_encoding_decode_windows_1253(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1842 lxb_encoding_decode_windows_1253(lxb_encoding_decode_t *ctx,
1843 const lxb_char_t **data, const lxb_char_t *end)
1844 {
1845 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1253);
1846
1847 return LXB_STATUS_OK;
1848 }
1849
1850 lxb_status_t
lxb_encoding_decode_windows_1254(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1851 lxb_encoding_decode_windows_1254(lxb_encoding_decode_t *ctx,
1852 const lxb_char_t **data, const lxb_char_t *end)
1853 {
1854 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1254);
1855
1856 return LXB_STATUS_OK;
1857 }
1858
1859 lxb_status_t
lxb_encoding_decode_windows_1255(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1860 lxb_encoding_decode_windows_1255(lxb_encoding_decode_t *ctx,
1861 const lxb_char_t **data, const lxb_char_t *end)
1862 {
1863 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1255);
1864
1865 return LXB_STATUS_OK;
1866 }
1867
1868 lxb_status_t
lxb_encoding_decode_windows_1256(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1869 lxb_encoding_decode_windows_1256(lxb_encoding_decode_t *ctx,
1870 const lxb_char_t **data, const lxb_char_t *end)
1871 {
1872 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1256);
1873
1874 return LXB_STATUS_OK;
1875 }
1876
1877 lxb_status_t
lxb_encoding_decode_windows_1257(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1878 lxb_encoding_decode_windows_1257(lxb_encoding_decode_t *ctx,
1879 const lxb_char_t **data, const lxb_char_t *end)
1880 {
1881 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1257);
1882
1883 return LXB_STATUS_OK;
1884 }
1885
1886 lxb_status_t
lxb_encoding_decode_windows_1258(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1887 lxb_encoding_decode_windows_1258(lxb_encoding_decode_t *ctx,
1888 const lxb_char_t **data, const lxb_char_t *end)
1889 {
1890 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1258);
1891
1892 return LXB_STATUS_OK;
1893 }
1894
1895 lxb_status_t
lxb_encoding_decode_windows_874(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1896 lxb_encoding_decode_windows_874(lxb_encoding_decode_t *ctx,
1897 const lxb_char_t **data, const lxb_char_t *end)
1898 {
1899 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_874);
1900
1901 return LXB_STATUS_OK;
1902 }
1903
1904 lxb_status_t
lxb_encoding_decode_x_mac_cyrillic(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1905 lxb_encoding_decode_x_mac_cyrillic(lxb_encoding_decode_t *ctx,
1906 const lxb_char_t **data, const lxb_char_t *end)
1907 {
1908 LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_x_mac_cyrillic);
1909
1910 return LXB_STATUS_OK;
1911 }
1912
1913 lxb_status_t
lxb_encoding_decode_x_user_defined(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1914 lxb_encoding_decode_x_user_defined(lxb_encoding_decode_t *ctx,
1915 const lxb_char_t **data, const lxb_char_t *end)
1916 {
1917 while (*data < end) {
1918 if (**data < 0x80) {
1919 LXB_ENCODING_DECODE_APPEND(ctx, *(*data)++);
1920 }
1921 else {
1922 LXB_ENCODING_DECODE_APPEND(ctx, 0xF780 + (*(*data)++) - 0x80);
1923 }
1924 }
1925
1926 return LXB_STATUS_OK;
1927 }
1928
1929 /*
1930 * Single
1931 */
1932 lxb_codepoint_t
lxb_encoding_decode_default_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1933 lxb_encoding_decode_default_single(lxb_encoding_decode_t *ctx,
1934 const lxb_char_t **data, const lxb_char_t *end)
1935 {
1936 return lxb_encoding_decode_utf_8_single(ctx, data, end);
1937 }
1938
1939 lxb_codepoint_t
lxb_encoding_decode_auto_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1940 lxb_encoding_decode_auto_single(lxb_encoding_decode_t *ctx,
1941 const lxb_char_t **data, const lxb_char_t *end)
1942 {
1943 return LXB_ENCODING_DECODE_ERROR;
1944 }
1945
1946 lxb_codepoint_t
lxb_encoding_decode_undefined_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1947 lxb_encoding_decode_undefined_single(lxb_encoding_decode_t *ctx,
1948 const lxb_char_t **data, const lxb_char_t *end)
1949 {
1950 return LXB_ENCODING_DECODE_ERROR;
1951 }
1952
1953 lxb_codepoint_t
lxb_encoding_decode_big5_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)1954 lxb_encoding_decode_big5_single(lxb_encoding_decode_t *ctx,
1955 const lxb_char_t **data, const lxb_char_t *end)
1956 {
1957 uint32_t index;
1958 lxb_char_t lead, byte;
1959
1960 if (ctx->u.lead != 0x00) {
1961 if (ctx->second_codepoint != 0x00) {
1962 (*data)++;
1963
1964 ctx->u.lead = 0x00;
1965
1966 ctx->codepoint = ctx->second_codepoint;
1967 ctx->second_codepoint = 0x00;
1968
1969 return ctx->codepoint;
1970 }
1971
1972 lead = (lxb_char_t) ctx->u.lead;
1973 ctx->u.lead = 0x00;
1974
1975 goto lead_state;
1976 }
1977
1978 lead = *(*data)++;
1979
1980 if (lead < 0x80) {
1981 return lead;
1982 }
1983
1984 if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
1985 return LXB_ENCODING_DECODE_ERROR;
1986 }
1987
1988 if (*data >= end) {
1989 ctx->u.lead = lead;
1990
1991 return LXB_ENCODING_DECODE_CONTINUE;
1992 }
1993
1994 lead_state:
1995
1996 index = 0;
1997 byte = **data;
1998
1999 if ((unsigned) (byte - 0x40) <= (0x7E - 0x40)
2000 || (unsigned) (byte - 0xA1) <= (0xFE - 0xA1))
2001 {
2002 if (byte < 0x7F) {
2003 /* Max index == (0xFE - 0x81) * 157 + (0x7E - 0x62) == 19653 */
2004 index = (lead - 0x81) * 157 + (byte - 0x40);
2005 }
2006 else {
2007 /* Max index == (0xFE - 0x81) * 157 + (0xFE - 0x62) == 19781 */
2008 index = (lead - 0x81) * 157 + (byte - 0x62);
2009 }
2010 }
2011
2012 /*
2013 * 1133 U+00CA U+0304 Ê̄ (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND MACRON)
2014 * 1135 U+00CA U+030C Ê̌ (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND CARON)
2015 * 1164 U+00EA U+0304 ê̄ (LATIN SMALL LETTER E WITH CIRCUMFLEX AND MACRON)
2016 * 1166 U+00EA U+030C ê̌ (LATIN SMALL LETTER E WITH CIRCUMFLEX AND CARON)
2017 */
2018 switch (index) {
2019 case 1133:
2020 ctx->u.lead = lead;
2021 ctx->second_codepoint = 0x0304;
2022 return 0x00CA;
2023
2024 case 1135:
2025 ctx->u.lead = lead;
2026 ctx->second_codepoint = 0x030C;
2027 return 0x00CA;
2028
2029 case 1164:
2030 ctx->u.lead = lead;
2031 ctx->second_codepoint = 0x0304;
2032 return 0x00EA;
2033
2034 case 1166:
2035 ctx->u.lead = lead;
2036 ctx->second_codepoint = 0x030C;
2037 return 0x00EA;
2038
2039 case 0:
2040 goto failed;
2041 }
2042
2043 ctx->codepoint = lxb_encoding_multi_index_big5[index].codepoint;
2044 if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2045 goto failed;
2046 }
2047
2048 (*data)++;
2049
2050 return ctx->codepoint;
2051
2052 failed:
2053
2054 if (byte >= 0x80) {
2055 (*data)++;
2056 }
2057
2058 return LXB_ENCODING_DECODE_ERROR;
2059 }
2060
2061 lxb_codepoint_t
lxb_encoding_decode_euc_jp_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2062 lxb_encoding_decode_euc_jp_single(lxb_encoding_decode_t *ctx,
2063 const lxb_char_t **data, const lxb_char_t *end)
2064 {
2065 bool is_jis0212;
2066 lxb_char_t byte, lead;
2067
2068 if (ctx->u.euc_jp.lead != 0x00) {
2069 lead = ctx->u.euc_jp.lead;
2070 byte = *(*data)++;
2071
2072 ctx->u.euc_jp.lead = 0x00;
2073
2074 if (ctx->u.euc_jp.is_jis0212) {
2075 is_jis0212 = true;
2076 ctx->u.euc_jp.is_jis0212 = false;
2077
2078 goto lead_jis_state;
2079 }
2080
2081 goto lead_state;
2082 }
2083
2084 lead = *(*data)++;
2085
2086 if (lead < 0x80) {
2087 return lead;
2088 }
2089
2090 if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
2091 && (lead != 0x8E && lead != 0x8F))
2092 {
2093 return LXB_ENCODING_DECODE_ERROR;
2094 }
2095
2096 if (*data >= end) {
2097 ctx->u.euc_jp.lead = lead;
2098 return LXB_ENCODING_DECODE_CONTINUE;
2099 }
2100
2101 byte = *(*data)++;
2102
2103 lead_state:
2104
2105 if (lead == 0x8E && (unsigned) (byte - 0xA1) <= (0xDF - 0xA1)) {
2106 return 0xFF61 - 0xA1 + byte;
2107 }
2108
2109 is_jis0212 = false;
2110
2111 if (lead == 0x8F && (unsigned) (byte - 0xA1) <= (0xFE - 0xA1)) {
2112 if (*data >= end) {
2113 ctx->u.euc_jp.lead = byte;
2114 ctx->u.euc_jp.is_jis0212 = true;
2115
2116 return LXB_ENCODING_DECODE_CONTINUE;
2117 }
2118
2119 lead = byte;
2120 byte = *(*data)++;
2121 is_jis0212 = true;
2122 }
2123
2124 lead_jis_state:
2125
2126 if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
2127 || (unsigned) (byte - 0xA1) > (0xFE - 0xA1))
2128 {
2129 goto failed;
2130 }
2131
2132 /* Max index == (0xFE - 0xA1) * 94 + 0xFE - 0xA1 == 8835 */
2133 ctx->codepoint = (lead - 0xA1) * 94 + byte - 0xA1;
2134
2135 if (is_jis0212) {
2136 if ((sizeof(lxb_encoding_multi_index_jis0212)
2137 / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
2138 {
2139 goto failed;
2140 }
2141
2142 ctx->codepoint = lxb_encoding_multi_index_jis0212[ctx->codepoint].codepoint;
2143 }
2144 else {
2145 if ((sizeof(lxb_encoding_multi_index_jis0208)
2146 / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
2147 {
2148 goto failed;
2149 }
2150
2151 ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
2152 }
2153
2154 if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2155 goto failed;
2156 }
2157
2158 return ctx->codepoint;
2159
2160 failed:
2161
2162 if (byte < 0x80) {
2163 (*data)--;
2164 }
2165
2166 return LXB_ENCODING_DECODE_ERROR;
2167 }
2168
2169 lxb_codepoint_t
lxb_encoding_decode_euc_kr_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2170 lxb_encoding_decode_euc_kr_single(lxb_encoding_decode_t *ctx,
2171 const lxb_char_t **data, const lxb_char_t *end)
2172 {
2173 lxb_char_t lead, byte;
2174
2175 if (ctx->u.lead != 0x00) {
2176 lead = (lxb_char_t) ctx->u.lead;
2177 ctx->u.lead = 0x00;
2178
2179 goto lead_state;
2180 }
2181
2182 lead = *(*data)++;
2183
2184 if (lead < 0x80) {
2185 return lead;
2186 }
2187
2188 if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
2189 return LXB_ENCODING_DECODE_ERROR;
2190 }
2191
2192 if (*data == end) {
2193 ctx->u.lead = lead;
2194 return LXB_ENCODING_DECODE_CONTINUE;
2195 }
2196
2197 lead_state:
2198
2199 byte = *(*data)++;
2200
2201 if ((unsigned) (byte - 0x41) > (0xFE - 0x41)) {
2202 goto failed;
2203 }
2204
2205 /* Max index == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
2206 ctx->codepoint = (lead - 0x81) * 190 + (byte - 0x41);
2207
2208 if (ctx->codepoint >= sizeof(lxb_encoding_multi_index_euc_kr)
2209 / sizeof(lxb_encoding_multi_index_t))
2210 {
2211 goto failed;
2212 }
2213
2214 ctx->codepoint = lxb_encoding_multi_index_euc_kr[ctx->codepoint].codepoint;
2215 if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2216 goto failed;
2217 }
2218
2219 return ctx->codepoint;
2220
2221 failed:
2222
2223 if (byte < 0x80) {
2224 (*data)--;
2225 }
2226
2227 return LXB_ENCODING_DECODE_ERROR;
2228 }
2229
2230 lxb_codepoint_t
lxb_encoding_decode_gbk_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2231 lxb_encoding_decode_gbk_single(lxb_encoding_decode_t *ctx,
2232 const lxb_char_t **data, const lxb_char_t *end)
2233 {
2234 return lxb_encoding_decode_gb18030_single(ctx, data, end);
2235 }
2236
2237 lxb_codepoint_t
lxb_encoding_decode_ibm866_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2238 lxb_encoding_decode_ibm866_single(lxb_encoding_decode_t *ctx,
2239 const lxb_char_t **data, const lxb_char_t *end)
2240 {
2241 if (**data < 0x80) {
2242 return *(*data)++;
2243 }
2244
2245 return lxb_encoding_single_index_ibm866[*(*data)++ - 0x80].codepoint;
2246 }
2247
2248 lxb_codepoint_t
lxb_encoding_decode_iso_2022_jp_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2249 lxb_encoding_decode_iso_2022_jp_single(lxb_encoding_decode_t *ctx,
2250 const lxb_char_t **data, const lxb_char_t *end)
2251 {
2252 lxb_char_t byte;
2253 lxb_encoding_ctx_2022_jp_t *iso = &ctx->u.iso_2022_jp;
2254
2255 if (iso->prepand != 0x00) {
2256 byte = iso->prepand;
2257 iso->prepand = 0x00;
2258
2259 goto prepand;
2260 }
2261
2262 do {
2263 byte = *(*data)++;
2264
2265 prepand:
2266
2267 switch (iso->state) {
2268 case LXB_ENCODING_DECODE_2022_JP_ASCII:
2269 if (byte == 0x1B) {
2270 iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2271
2272 break;
2273 }
2274
2275 /* 0x00 to 0x7F, excluding 0x0E, 0x0F, and 0x1B */
2276 if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)
2277 && byte != 0x0E && byte != 0x0F)
2278 {
2279 iso->out_flag = false;
2280
2281 return byte;
2282 }
2283
2284 iso->out_flag = false;
2285
2286 return LXB_ENCODING_DECODE_ERROR;
2287
2288 case LXB_ENCODING_DECODE_2022_JP_ROMAN:
2289 switch (byte) {
2290 case 0x1B:
2291 iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2292
2293 continue;
2294
2295 case 0x5C:
2296 iso->out_flag = false;
2297
2298 return 0x00A5;
2299
2300 case 0x7E:
2301 iso->out_flag = false;
2302
2303 return 0x203E;
2304
2305 case 0x0E:
2306 case 0x0F:
2307 break;
2308
2309 default:
2310 /* 0x00 to 0x7F */
2311 if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)) {
2312 iso->out_flag = false;
2313
2314 return byte;
2315 }
2316
2317 break;
2318 }
2319
2320 iso->out_flag = false;
2321
2322 return LXB_ENCODING_DECODE_ERROR;
2323
2324 case LXB_ENCODING_DECODE_2022_JP_KATAKANA:
2325 if (byte == 0x1B) {
2326 iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2327
2328 break;
2329 }
2330
2331 /* 0x21 to 0x5F */
2332 if ((unsigned) (byte - 0x21) <= (0x5F - 0x21)) {
2333 iso->out_flag = false;
2334
2335 return 0xFF61 - 0x21 + byte;
2336 }
2337
2338 iso->out_flag = false;
2339
2340 return LXB_ENCODING_DECODE_ERROR;
2341
2342 case LXB_ENCODING_DECODE_2022_JP_LEAD:
2343 if (byte == 0x1B) {
2344 iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2345
2346 break;
2347 }
2348
2349 /* 0x21 to 0x7E */
2350 if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
2351 iso->out_flag = false;
2352 iso->lead = byte;
2353 iso->state = LXB_ENCODING_DECODE_2022_JP_TRAIL;
2354
2355 break;
2356 }
2357
2358 iso->out_flag = false;
2359
2360 return LXB_ENCODING_DECODE_ERROR;
2361
2362 case LXB_ENCODING_DECODE_2022_JP_TRAIL:
2363 if (byte == 0x1B) {
2364 iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2365
2366 return LXB_ENCODING_DECODE_ERROR;
2367 }
2368
2369 iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
2370
2371 /* 0x21 to 0x7E */
2372 if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
2373 /* Max index == (0x7E - 0x21) * 94 + 0x7E - 0x21 == 8835 */
2374 ctx->codepoint = (iso->lead - 0x21) * 94 + byte - 0x21;
2375
2376 return lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
2377 }
2378
2379 return LXB_ENCODING_DECODE_ERROR;
2380
2381 case LXB_ENCODING_DECODE_2022_JP_ESCAPE_START:
2382 if (byte == 0x24 || byte == 0x28) {
2383 iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE;
2384 iso->lead = byte;
2385
2386 break;
2387 }
2388
2389 (*data)--;
2390
2391 iso->out_flag = false;
2392 iso->state = ctx->u.iso_2022_jp.out_state;
2393
2394 return LXB_ENCODING_DECODE_ERROR;
2395
2396 case LXB_ENCODING_DECODE_2022_JP_ESCAPE:
2397 iso->state = LXB_ENCODING_DECODE_2022_JP_UNSET;
2398
2399 if (iso->lead == 0x28) {
2400 if (byte == 0x42) {
2401 iso->state = LXB_ENCODING_DECODE_2022_JP_ASCII;
2402 }
2403 else if (byte == 0x4A) {
2404 iso->state = LXB_ENCODING_DECODE_2022_JP_ROMAN;
2405 }
2406 else if (byte == 0x49) {
2407 iso->state = LXB_ENCODING_DECODE_2022_JP_KATAKANA;
2408 }
2409 }
2410 else if (iso->lead == 0x24) {
2411 if (byte == 0x40 || byte == 0x42) {
2412 iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
2413 }
2414 }
2415
2416 if (iso->state == LXB_ENCODING_DECODE_2022_JP_UNSET) {
2417 iso->prepand = iso->lead;
2418 iso->lead = 0x00;
2419
2420 (*data)--;
2421
2422 iso->out_flag = false;
2423 iso->state = iso->out_state;
2424
2425 return LXB_ENCODING_DECODE_ERROR;
2426 }
2427
2428 iso->lead = 0x00;
2429 iso->out_state = iso->state;
2430
2431 if (iso->out_flag) {
2432 return LXB_ENCODING_DECODE_ERROR;
2433 }
2434
2435 iso->out_flag = true;
2436
2437 break;
2438 }
2439 }
2440 while (*data < end);
2441
2442 return LXB_ENCODING_DECODE_CONTINUE;
2443 }
2444
2445 lxb_codepoint_t
lxb_encoding_decode_iso_8859_10_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2446 lxb_encoding_decode_iso_8859_10_single(lxb_encoding_decode_t *ctx,
2447 const lxb_char_t **data, const lxb_char_t *end)
2448 {
2449 if (**data < 0x80) {
2450 return *(*data)++;
2451 }
2452
2453 return lxb_encoding_single_index_iso_8859_10[*(*data)++ - 0x80].codepoint;
2454 }
2455
2456 lxb_codepoint_t
lxb_encoding_decode_iso_8859_13_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2457 lxb_encoding_decode_iso_8859_13_single(lxb_encoding_decode_t *ctx,
2458 const lxb_char_t **data, const lxb_char_t *end)
2459 {
2460 if (**data < 0x80) {
2461 return *(*data)++;
2462 }
2463
2464 return lxb_encoding_single_index_iso_8859_13[*(*data)++ - 0x80].codepoint;
2465 }
2466
2467 lxb_codepoint_t
lxb_encoding_decode_iso_8859_14_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2468 lxb_encoding_decode_iso_8859_14_single(lxb_encoding_decode_t *ctx,
2469 const lxb_char_t **data, const lxb_char_t *end)
2470 {
2471 if (**data < 0x80) {
2472 return *(*data)++;
2473 }
2474
2475 return lxb_encoding_single_index_iso_8859_14[*(*data)++ - 0x80].codepoint;
2476 }
2477
2478 lxb_codepoint_t
lxb_encoding_decode_iso_8859_15_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2479 lxb_encoding_decode_iso_8859_15_single(lxb_encoding_decode_t *ctx,
2480 const lxb_char_t **data, const lxb_char_t *end)
2481 {
2482 if (**data < 0x80) {
2483 return *(*data)++;
2484 }
2485
2486 return lxb_encoding_single_index_iso_8859_15[*(*data)++ - 0x80].codepoint;
2487 }
2488
2489 lxb_codepoint_t
lxb_encoding_decode_iso_8859_16_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2490 lxb_encoding_decode_iso_8859_16_single(lxb_encoding_decode_t *ctx,
2491 const lxb_char_t **data, const lxb_char_t *end)
2492 {
2493 if (**data < 0x80) {
2494 return *(*data)++;
2495 }
2496
2497 return lxb_encoding_single_index_iso_8859_16[*(*data)++ - 0x80].codepoint;
2498 }
2499
2500 lxb_codepoint_t
lxb_encoding_decode_iso_8859_2_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2501 lxb_encoding_decode_iso_8859_2_single(lxb_encoding_decode_t *ctx,
2502 const lxb_char_t **data, const lxb_char_t *end)
2503 {
2504 if (**data < 0x80) {
2505 return *(*data)++;
2506 }
2507
2508 return lxb_encoding_single_index_iso_8859_2[*(*data)++ - 0x80].codepoint;
2509 }
2510
2511 lxb_codepoint_t
lxb_encoding_decode_iso_8859_3_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2512 lxb_encoding_decode_iso_8859_3_single(lxb_encoding_decode_t *ctx,
2513 const lxb_char_t **data, const lxb_char_t *end)
2514 {
2515 if (**data < 0x80) {
2516 return *(*data)++;
2517 }
2518
2519 return lxb_encoding_single_index_iso_8859_3[*(*data)++ - 0x80].codepoint;
2520 }
2521
2522 lxb_codepoint_t
lxb_encoding_decode_iso_8859_4_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2523 lxb_encoding_decode_iso_8859_4_single(lxb_encoding_decode_t *ctx,
2524 const lxb_char_t **data, const lxb_char_t *end)
2525 {
2526 if (**data < 0x80) {
2527 return *(*data)++;
2528 }
2529
2530 return lxb_encoding_single_index_iso_8859_4[*(*data)++ - 0x80].codepoint;
2531 }
2532
2533 lxb_codepoint_t
lxb_encoding_decode_iso_8859_5_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2534 lxb_encoding_decode_iso_8859_5_single(lxb_encoding_decode_t *ctx,
2535 const lxb_char_t **data, const lxb_char_t *end)
2536 {
2537 if (**data < 0x80) {
2538 return *(*data)++;
2539 }
2540
2541 return lxb_encoding_single_index_iso_8859_5[*(*data)++ - 0x80].codepoint;
2542 }
2543
2544 lxb_codepoint_t
lxb_encoding_decode_iso_8859_6_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2545 lxb_encoding_decode_iso_8859_6_single(lxb_encoding_decode_t *ctx,
2546 const lxb_char_t **data, const lxb_char_t *end)
2547 {
2548 if (**data < 0x80) {
2549 return *(*data)++;
2550 }
2551
2552 return lxb_encoding_single_index_iso_8859_6[*(*data)++ - 0x80].codepoint;
2553 }
2554
2555 lxb_codepoint_t
lxb_encoding_decode_iso_8859_7_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2556 lxb_encoding_decode_iso_8859_7_single(lxb_encoding_decode_t *ctx,
2557 const lxb_char_t **data, const lxb_char_t *end)
2558 {
2559 if (**data < 0x80) {
2560 return *(*data)++;
2561 }
2562
2563 return lxb_encoding_single_index_iso_8859_7[*(*data)++ - 0x80].codepoint;
2564 }
2565
2566 lxb_codepoint_t
lxb_encoding_decode_iso_8859_8_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2567 lxb_encoding_decode_iso_8859_8_single(lxb_encoding_decode_t *ctx,
2568 const lxb_char_t **data, const lxb_char_t *end)
2569 {
2570 if (**data < 0x80) {
2571 return *(*data)++;
2572 }
2573
2574 return lxb_encoding_single_index_iso_8859_8[*(*data)++ - 0x80].codepoint;
2575 }
2576
2577 lxb_codepoint_t
lxb_encoding_decode_iso_8859_8_i_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2578 lxb_encoding_decode_iso_8859_8_i_single(lxb_encoding_decode_t *ctx,
2579 const lxb_char_t **data, const lxb_char_t *end)
2580 {
2581 if (**data < 0x80) {
2582 return *(*data)++;
2583 }
2584
2585 return lxb_encoding_single_index_iso_8859_8[*(*data)++ - 0x80].codepoint;
2586 }
2587
2588 lxb_codepoint_t
lxb_encoding_decode_koi8_r_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2589 lxb_encoding_decode_koi8_r_single(lxb_encoding_decode_t *ctx,
2590 const lxb_char_t **data, const lxb_char_t *end)
2591 {
2592 if (**data < 0x80) {
2593 return *(*data)++;
2594 }
2595
2596 return lxb_encoding_single_index_koi8_r[*(*data)++ - 0x80].codepoint;
2597 }
2598
2599 lxb_codepoint_t
lxb_encoding_decode_koi8_u_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2600 lxb_encoding_decode_koi8_u_single(lxb_encoding_decode_t *ctx,
2601 const lxb_char_t **data, const lxb_char_t *end)
2602 {
2603 if (**data < 0x80) {
2604 return *(*data)++;
2605 }
2606
2607 return lxb_encoding_single_index_koi8_u[*(*data)++ - 0x80].codepoint;
2608 }
2609
2610 lxb_codepoint_t
lxb_encoding_decode_shift_jis_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)2611 lxb_encoding_decode_shift_jis_single(lxb_encoding_decode_t *ctx,
2612 const lxb_char_t **data, const lxb_char_t *end)
2613 {
2614 lxb_char_t byte, lead;
2615
2616 if (ctx->u.lead != 0x00) {
2617 lead = (lxb_char_t) ctx->u.lead;
2618 ctx->u.lead = 0x00;
2619
2620 goto lead_state;
2621 }
2622
2623 lead = *(*data)++;
2624
2625 if (lead <= 0x80) {
2626 return lead;
2627 }
2628
2629 if ((unsigned) (lead - 0xA1) <= (0xDF - 0xA1)) {
2630 return 0xFF61 - 0xA1 + lead;
2631 }
2632
2633 if ((unsigned) (lead - 0x81) > (0x9F - 0x81)
2634 && lead != 0xE0 && lead != 0xFC)
2635 {
2636 return LXB_ENCODING_DECODE_ERROR;
2637 }
2638
2639 if (*data >= end) {
2640 ctx->u.lead = lead;
2641
2642 return LXB_ENCODING_DECODE_CONTINUE;
2643 }
2644
2645 lead_state:
2646
2647 byte = *(*data)++;
2648
2649 if (byte < 0x7F) {
2650 ctx->codepoint = 0x40;
2651 }
2652 else {
2653 ctx->codepoint = 0x41;
2654 }
2655
2656 if (lead < 0xA0) {
2657 ctx->second_codepoint = 0x81;
2658 }
2659 else {
2660 ctx->second_codepoint = 0xC1;
2661 }
2662
2663 if ((unsigned) (byte - 0x40) <= (0x7E - 0x40)
2664 || (unsigned) (byte - 0x80) <= (0xFC - 0x80))
2665 {
2666 /* Max index == (0xFC - 0xC1) * 188 + 0xFC - 0x41 = 11279 */
2667 ctx->codepoint = (lead - ctx->second_codepoint) * 188
2668 + byte - ctx->codepoint;
2669
2670 if (ctx->codepoint >= (sizeof(lxb_encoding_multi_index_jis0208)
2671 / sizeof(lxb_encoding_multi_index_t)))
2672 {
2673 goto failed;
2674 }
2675
2676 if ((unsigned) (ctx->codepoint - 8836) <= (10715 - 8836)) {
2677 return 0xE000 - 8836 + ctx->codepoint;
2678 }
2679
2680 ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
2681 if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2682 goto failed;
2683 }
2684
2685 return ctx->codepoint;
2686 }
2687
2688 failed:
2689
2690 if (byte < 0x80) {
2691 (*data)--;
2692 }
2693
2694 return LXB_ENCODING_DECODE_ERROR;
2695 }
2696
/*
 * Decode one code point from UTF-16 input, resumable across buffers.
 *
 * ctx    decoder state. ctx->u.lead holds the first byte of a split code
 *        unit, stored as (byte + 1) so that a pending 0x00 byte differs from
 *        "nothing pending". ctx->second_codepoint holds a pending lead
 *        surrogate (0xD800..0xDBFF) awaiting its trail surrogate.
 * is_be  true: the first byte of a unit is the high byte (big endian);
 *        false: the first byte is the low byte (little endian).
 * data   in/out cursor; advanced past consumed bytes.
 * end    one past the last readable byte.
 *
 * Returns the code point, LXB_ENCODING_DECODE_CONTINUE when more input is
 * needed, or LXB_ENCODING_DECODE_ERROR for an unpaired surrogate.
 *
 * The first byte is read without comparing *data to end.
 */
lxb_inline lxb_codepoint_t
lxb_encoding_decode_utf_16_single(lxb_encoding_decode_t *ctx, bool is_be,
                                  const lxb_char_t **data, const lxb_char_t *end)
{
    unsigned lead;
    lxb_codepoint_t unit;

    /* A first byte was saved earlier; undo the +1 bias and read its pair. */
    if (ctx->u.lead != 0x00) {
        lead = ctx->u.lead - 0x01;
        ctx->u.lead = 0x00;

        goto lead_state;
    }

pair_state:

    lead = *(*data)++;

    /* Buffer ended in the middle of a code unit: remember the first byte. */
    if (*data >= end) {
        ctx->u.lead = lead + 0x01;
        return LXB_ENCODING_DECODE_CONTINUE;
    }

lead_state:

    /* For UTF-16BE or UTF-16LE */
    if (is_be) {
        unit = (lead << 8) + *(*data)++;
    }
    else {
        unit = (*(*data)++ << 8) + lead;
    }

    /* A lead surrogate is pending: this unit must be a trail surrogate. */
    if (ctx->second_codepoint != 0x00) {
        if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
            ctx->codepoint = 0x10000 + ((ctx->second_codepoint - 0xD800) << 10)
                             + (unit - 0xDC00);

            ctx->second_codepoint = 0x00;
            return ctx->codepoint;
        }

        /*
         * Not a trail surrogate: report the lone lead surrogate as an error
         * and replay this unit on the next call. Its second byte is handed
         * back through *data and its first byte through ctx->u.lead.
         */
        (*data)--;

        ctx->u.lead = lead + 0x01;
        ctx->second_codepoint = 0x00;

        return LXB_ENCODING_DECODE_ERROR;
    }

    /* Surrogate pair */
    if ((unsigned) (unit - 0xD800) <= (0xDFFF - 0xD800)) {
        /* A trail surrogate with no lead surrogate before it is an error. */
        if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
            return LXB_ENCODING_DECODE_ERROR;
        }

        ctx->second_codepoint = unit;

        if (*data >= end) {
            return LXB_ENCODING_DECODE_CONTINUE;
        }

        /* More bytes are available: read the expected trail surrogate. */
        goto pair_state;
    }

    return unit;
}
2764
/*
 * Big-endian entry point: forwards to lxb_encoding_decode_utf_16_single()
 * with is_be set to true and returns its result unchanged.
 */
lxb_codepoint_t
lxb_encoding_decode_utf_16be_single(lxb_encoding_decode_t *ctx,
                                    const lxb_char_t **data, const lxb_char_t *end)
{
    return lxb_encoding_decode_utf_16_single(ctx, true, data, end);
}
2771
/*
 * Little-endian entry point: forwards to lxb_encoding_decode_utf_16_single()
 * with is_be set to false and returns its result unchanged.
 */
lxb_codepoint_t
lxb_encoding_decode_utf_16le_single(lxb_encoding_decode_t *ctx,
                                    const lxb_char_t **data, const lxb_char_t *end)
{
    return lxb_encoding_decode_utf_16_single(ctx, false, data, end);
}
2778
/*
 * Decode one code point from UTF-8 input, resumable across buffers.
 *
 * ctx   decoder state. ctx->u.utf_8.need is the number of continuation bytes
 *       still missing from a sequence split by the end of a buffer;
 *       ctx->u.utf_8.lower / upper are the bounds to apply to the next
 *       continuation byte when the lead byte restricts it; ctx->codepoint
 *       accumulates the value.
 * data  in/out cursor; advanced past consumed bytes. A byte that fails the
 *       continuation check in the loop below is left unconsumed.
 * end   one past the last readable byte.
 *
 * Returns the code point, LXB_ENCODING_DECODE_CONTINUE when the sequence is
 * incomplete, or LXB_ENCODING_DECODE_ERROR for an invalid byte.
 *
 * NOTE(review): LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE and
 * LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET_SINGLE are defined outside this
 * function. Presumably the former checks the next byte against
 * [lower, upper], consumes it, decrements `needed` and jumps to `failed` on
 * mismatch, and the latter stores bounds in ctx->u.utf_8 -- confirm against
 * the macro definitions.
 */
lxb_codepoint_t
lxb_encoding_decode_utf_8_single(lxb_encoding_decode_t *ctx,
                                 const lxb_char_t **data, const lxb_char_t *end)
{
    unsigned needed;
    lxb_char_t ch;
    const lxb_char_t *p;

    /* Continue a sequence that was cut off by the end of the last buffer. */
    if (ctx->u.utf_8.need != 0) {
        needed = ctx->u.utf_8.need;
        ctx->u.utf_8.need = 0;

        /* Saved bounds apply to the first continuation byte only. */
        if (ctx->u.utf_8.lower != 0x00) {
            LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(ctx->u.utf_8.lower,
                                                      ctx->u.utf_8.upper);
            ctx->u.utf_8.lower = 0x00;
        }

        goto decode;
    }

    ch = *(*data)++;

    if (ch < 0x80) {
        /* Single-byte sequence. */
        return ch;
    }
    else if (ch <= 0xDF) {
        /* 0x80..0xC1 cannot start a sequence. */
        if (ch < 0xC2) {
            return LXB_ENCODING_DECODE_ERROR;
        }

        needed = 1;
        ctx->codepoint = ch & 0x1F;
    }
    else if (ch < 0xF0) {
        needed = 2;
        ctx->codepoint = ch & 0x0F;

        /* No byte to check yet: save the bounds for leads 0xE0 / 0xED. */
        if (*data == end) {
            LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET_SINGLE(0xE0, 0xED,
                                                          0xA0, 0x9F);
            goto next;
        }

        /* Leads 0xE0 and 0xED narrow the range of the second byte. */
        if (ch == 0xE0) {
            LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0xA0, 0xBF);
        }
        else if (ch == 0xED) {
            LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0x80, 0x9F);
        }
    }
    else if (ch < 0xF5) {
        needed = 3;
        ctx->codepoint = ch & 0x07;

        /* No byte to check yet: save the bounds for leads 0xF0 / 0xF4. */
        if (*data == end) {
            LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET_SINGLE(0xF0, 0xF4,
                                                          0x90, 0x8F);

            goto next;
        }

        /* Leads 0xF0 and 0xF4 narrow the range of the second byte. */
        if (ch == 0xF0) {
            LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0x90, 0xBF);
        }
        else if (ch == 0xF4) {
            LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0x80, 0x8F);
        }
    }
    else {
        /* 0xF5..0xFF never start a sequence. */
        return LXB_ENCODING_DECODE_ERROR;
    }

decode:

    /* Consume the remaining continuation bytes (0x80..0xBF). */
    for (p = *data; p < end; p++) {
        ch = *p;

        if (ch < 0x80 || ch > 0xBF) {
            /* Leave the offending byte in place for the next call. */
            *data = p;

            goto failed;
        }

        ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);

        if (--needed == 0) {
            *data = p + 1;

            return ctx->codepoint;
        }
    }

    /* Input ran out before the sequence was complete. */
    *data = p;

next:

    ctx->u.utf_8.need = needed;

    return LXB_ENCODING_DECODE_CONTINUE;

failed:

    /* Drop any partial-sequence state before reporting the error. */
    ctx->u.utf_8.lower = 0x00;
    ctx->u.utf_8.need = 0;

    return LXB_ENCODING_DECODE_ERROR;
}
2887
2888 lxb_codepoint_t
lxb_encoding_decode_valid_utf_8_single(const lxb_char_t ** data,const lxb_char_t * end)2889 lxb_encoding_decode_valid_utf_8_single(const lxb_char_t **data,
2890 const lxb_char_t *end)
2891 {
2892 lxb_codepoint_t cp;
2893 const lxb_char_t *p = *data;
2894
2895 if (*p < 0x80){
2896 /* 0xxxxxxx */
2897
2898 if (end - p < 1) {
2899 *data = end;
2900 return LXB_ENCODING_DECODE_ERROR;
2901 }
2902
2903 cp = (lxb_codepoint_t) *p;
2904
2905 (*data) += 1;
2906 }
2907 else if ((*p & 0xe0) == 0xc0) {
2908 /* 110xxxxx 10xxxxxx */
2909
2910 if (end - p < 2) {
2911 *data = end;
2912 return LXB_ENCODING_DECODE_ERROR;
2913 }
2914
2915 cp = (p[0] ^ (0xC0 & p[0])) << 6;
2916 cp |= (p[1] ^ (0x80 & p[1]));
2917
2918 (*data) += 2;
2919 }
2920 else if ((*p & 0xf0) == 0xe0) {
2921 /* 1110xxxx 10xxxxxx 10xxxxxx */
2922
2923 if (end - p < 3) {
2924 *data = end;
2925 return LXB_ENCODING_DECODE_ERROR;
2926 }
2927
2928 cp = (p[0] ^ (0xE0 & p[0])) << 12;
2929 cp |= (p[1] ^ (0x80 & p[1])) << 6;
2930 cp |= (p[2] ^ (0x80 & p[2]));
2931
2932 (*data) += 3;
2933 }
2934 else if ((*p & 0xf8) == 0xf0) {
2935 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
2936
2937 if (end - p < 4) {
2938 *data = end;
2939 return LXB_ENCODING_DECODE_ERROR;
2940 }
2941
2942 cp = (p[0] ^ (0xF0 & p[0])) << 18;
2943 cp |= (p[1] ^ (0x80 & p[1])) << 12;
2944 cp |= (p[2] ^ (0x80 & p[2])) << 6;
2945 cp |= (p[3] ^ (0x80 & p[3]));
2946
2947 (*data) += 4;
2948 }
2949 else {
2950 (*data)++;
2951
2952 return LXB_ENCODING_DECODE_ERROR;
2953 }
2954
2955 return cp;
2956 }
2957
2958 uint8_t
lxb_encoding_decode_utf_8_length(lxb_char_t data)2959 lxb_encoding_decode_utf_8_length(lxb_char_t data)
2960 {
2961
2962 if (data < 0x80){
2963 return 1;
2964 }
2965 else if ((data & 0xe0) == 0xc0) {
2966 return 2;
2967 }
2968 else if ((data & 0xf0) == 0xe0) {
2969 return 3;
2970 }
2971 else if ((data & 0xf8) == 0xf0) {
2972 return 4;
2973 }
2974
2975 return 0;
2976 }
2977
/*
 * Decode one code point from gb18030 input, resumable across buffers.
 *
 * Sequences are one, two or four bytes long. When *data reaches end in the
 * middle of a sequence, the bytes read so far are stored in ctx->u.gb18030
 * (first / second / third) and LXB_ENCODING_DECODE_CONTINUE is returned; the
 * next call resumes from those bytes.
 *
 * ctx->prepend marks stored bytes that belong to a failed four-byte sequence
 * and must be decoded again as fresh input on the following calls.
 *
 * Returns the code point, LXB_ENCODING_DECODE_CONTINUE, or
 * LXB_ENCODING_DECODE_ERROR.
 *
 * Bytes are read without comparing *data to end first, except where an
 * explicit check appears below.
 */
lxb_codepoint_t
lxb_encoding_decode_gb18030_single(lxb_encoding_decode_t *ctx,
                                   const lxb_char_t **data, const lxb_char_t *end)
{
    uint32_t pointer;
    lxb_char_t first, second, third, offset;

    /* Make compiler happy */
    second = 0x00;

    /* State left over from a previous call. */
    if (ctx->u.gb18030.first != 0) {
        if (ctx->u.gb18030.third != 0x00) {
            first = ctx->u.gb18030.first;
            second = ctx->u.gb18030.second;
            third = ctx->u.gb18030.third;

            memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));

            /*
             * Replay after a failed fourth byte: emit the saved second byte
             * now and queue the third byte as the next "first". ctx->prepend
             * stays set so the next call takes the prepend_first path.
             */
            if (ctx->prepend) {
                /* The first is always < 0x80 */
                ctx->u.gb18030.first = third;

                return second;
            }

            /* Three bytes were stored at a buffer boundary. */
            goto third_state;
        }
        else if (ctx->u.gb18030.second != 0x00) {
            /* Two bytes were stored at a buffer boundary. */
            first = ctx->u.gb18030.first;
            second = ctx->u.gb18030.second;

            memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));

            goto second_state;
        }

        first = ctx->u.gb18030.first;
        ctx->u.gb18030.first = 0x00;

        /* A replayed byte is classified from scratch, like a fresh byte. */
        if (ctx->prepend) {
            ctx->prepend = false;
            goto prepend_first;
        }

        /* A real lead byte was stored at a buffer boundary. */
        goto first_state;
    }

    first = *(*data)++;

prepend_first:

    if (first < 0x80) {
        return first;
    }

    /* Byte 0x80 maps directly to U+20AC. */
    if (first == 0x80) {
        return 0x20AC;
    }

    /* Range 0x81 to 0xFE, inclusive */
    if ((unsigned) (first - 0x81) > (0xFE - 0x81)) {
        return LXB_ENCODING_DECODE_ERROR;
    }

    if (*data == end) {
        ctx->u.gb18030.first = first;
        return LXB_ENCODING_DECODE_CONTINUE;
    }

    /* First */
first_state:

    second = *(*data)++;

    /* Range 0x30 to 0x39, inclusive */
    /* Outside that range this is a two-byte sequence; inside it, four. */
    if ((unsigned) (second - 0x30) > (0x39 - 0x30)) {
        offset = (second < 0x7F) ? 0x40 : 0x41;

        /* Range 0x40 to 0x7E, inclusive, or 0x80 to 0xFE, inclusive */
        if ((unsigned) (second - 0x40) <= (0x7E - 0x40)
            || (unsigned) (second - 0x80) <= (0xFE - 0x80))
        {
            pointer = (first - 0x81) * 190 + (second - offset);
        }
        else {
            goto failed;
        }

        /* Max pointer value == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
        ctx->codepoint = lxb_encoding_multi_index_gb18030[pointer].codepoint;
        if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
            goto failed;
        }

        return ctx->codepoint;
    }

    if (*data == end) {
        ctx->u.gb18030.first = first;
        ctx->u.gb18030.second = second;

        return LXB_ENCODING_DECODE_CONTINUE;
    }

    /* Second */
second_state:

    third = *(*data)++;

    /* Range 0x81 to 0xFE, inclusive */
    if ((unsigned) (third - 0x81) > (0xFE - 0x81)) {
        /*
         * Bad third byte: hand it back through *data and queue the second
         * byte (a digit 0x30..0x39) for replay on the next call.
         */
        (*data)--;

        ctx->prepend = true;
        ctx->u.gb18030.first = second;

        return LXB_ENCODING_DECODE_ERROR;
    }

    if (*data == end) {
        ctx->u.gb18030.first = first;
        ctx->u.gb18030.second = second;
        ctx->u.gb18030.third = third;

        return LXB_ENCODING_DECODE_CONTINUE;
    }

    /* Third */
third_state:

    /* Range 0x30 to 0x39, inclusive */
    /* The fourth byte is only peeked at here; it is consumed further down. */
    if ((unsigned) (**data - 0x30) > (0x39 - 0x30)) {
        ctx->prepend = true;

        /* First is a fake for trigger */
        ctx->u.gb18030.first = 0x01;
        ctx->u.gb18030.second = second;
        ctx->u.gb18030.third = third;

        return LXB_ENCODING_DECODE_ERROR;
    }

    /* Four-byte pointer; the fourth byte is consumed by this expression. */
    pointer = ((first  - 0x81) * (10 * 126 * 10))
            + ((second - 0x30) * (10 * 126))
            + ((third  - 0x81) * 10) + (*(*data)++) - 0x30;

    return lxb_encoding_decode_gb18030_range(pointer);

failed:

    /* Give an ASCII second byte back so the next call decodes it alone. */
    if (second < 0x80) {
        (*data)--;
    }

    return LXB_ENCODING_DECODE_ERROR;
}
3134
3135 lxb_codepoint_t
lxb_encoding_decode_macintosh_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)3136 lxb_encoding_decode_macintosh_single(lxb_encoding_decode_t *ctx,
3137 const lxb_char_t **data, const lxb_char_t *end)
3138 {
3139 if (**data < 0x80) {
3140 return *(*data)++;
3141 }
3142
3143 return lxb_encoding_single_index_macintosh[*(*data)++ - 0x80].codepoint;
3144 }
3145
/*
 * Entry point for the "replacement" decoder: always returns
 * LXB_ENCODING_DECODE_ERROR. None of the arguments are read and *data is
 * not advanced.
 */
lxb_codepoint_t
lxb_encoding_decode_replacement_single(lxb_encoding_decode_t *ctx,
                                       const lxb_char_t **data, const lxb_char_t *end)
{
    return LXB_ENCODING_DECODE_ERROR;
}
3152
3153 lxb_codepoint_t
lxb_encoding_decode_windows_1250_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)3154 lxb_encoding_decode_windows_1250_single(lxb_encoding_decode_t *ctx,
3155 const lxb_char_t **data, const lxb_char_t *end)
3156 {
3157 if (**data < 0x80) {
3158 return *(*data)++;
3159 }
3160
3161 return lxb_encoding_single_index_windows_1250[*(*data)++ - 0x80].codepoint;
3162 }
3163
3164 lxb_codepoint_t
lxb_encoding_decode_windows_1251_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)3165 lxb_encoding_decode_windows_1251_single(lxb_encoding_decode_t *ctx,
3166 const lxb_char_t **data, const lxb_char_t *end)
3167 {
3168 if (**data < 0x80) {
3169 return *(*data)++;
3170 }
3171
3172 return lxb_encoding_single_index_windows_1251[*(*data)++ - 0x80].codepoint;
3173 }
3174
3175 lxb_codepoint_t
lxb_encoding_decode_windows_1252_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)3176 lxb_encoding_decode_windows_1252_single(lxb_encoding_decode_t *ctx,
3177 const lxb_char_t **data, const lxb_char_t *end)
3178 {
3179 if (**data < 0x80) {
3180 return *(*data)++;
3181 }
3182
3183 return lxb_encoding_single_index_windows_1252[*(*data)++ - 0x80].codepoint;
3184 }
3185
3186 lxb_codepoint_t
lxb_encoding_decode_windows_1253_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)3187 lxb_encoding_decode_windows_1253_single(lxb_encoding_decode_t *ctx,
3188 const lxb_char_t **data, const lxb_char_t *end)
3189 {
3190 if (**data < 0x80) {
3191 return *(*data)++;
3192 }
3193
3194 return lxb_encoding_single_index_windows_1253[*(*data)++ - 0x80].codepoint;
3195 }
3196
3197 lxb_codepoint_t
lxb_encoding_decode_windows_1254_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)3198 lxb_encoding_decode_windows_1254_single(lxb_encoding_decode_t *ctx,
3199 const lxb_char_t **data, const lxb_char_t *end)
3200 {
3201 if (**data < 0x80) {
3202 return *(*data)++;
3203 }
3204
3205 return lxb_encoding_single_index_windows_1254[*(*data)++ - 0x80].codepoint;
3206 }
3207
3208 lxb_codepoint_t
lxb_encoding_decode_windows_1255_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)3209 lxb_encoding_decode_windows_1255_single(lxb_encoding_decode_t *ctx,
3210 const lxb_char_t **data, const lxb_char_t *end)
3211 {
3212 if (**data < 0x80) {
3213 return *(*data)++;
3214 }
3215
3216 return lxb_encoding_single_index_windows_1255[*(*data)++ - 0x80].codepoint;
3217 }
3218
3219 lxb_codepoint_t
lxb_encoding_decode_windows_1256_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)3220 lxb_encoding_decode_windows_1256_single(lxb_encoding_decode_t *ctx,
3221 const lxb_char_t **data, const lxb_char_t *end)
3222 {
3223 if (**data < 0x80) {
3224 return *(*data)++;
3225 }
3226
3227 return lxb_encoding_single_index_windows_1256[*(*data)++ - 0x80].codepoint;
3228 }
3229
3230 lxb_codepoint_t
lxb_encoding_decode_windows_1257_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)3231 lxb_encoding_decode_windows_1257_single(lxb_encoding_decode_t *ctx,
3232 const lxb_char_t **data, const lxb_char_t *end)
3233 {
3234 if (**data < 0x80) {
3235 return *(*data)++;
3236 }
3237
3238 return lxb_encoding_single_index_windows_1257[*(*data)++ - 0x80].codepoint;
3239 }
3240
3241 lxb_codepoint_t
lxb_encoding_decode_windows_1258_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)3242 lxb_encoding_decode_windows_1258_single(lxb_encoding_decode_t *ctx,
3243 const lxb_char_t **data, const lxb_char_t *end)
3244 {
3245 if (**data < 0x80) {
3246 return *(*data)++;
3247 }
3248
3249 return lxb_encoding_single_index_windows_1258[*(*data)++ - 0x80].codepoint;
3250 }
3251
3252 lxb_codepoint_t
lxb_encoding_decode_windows_874_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)3253 lxb_encoding_decode_windows_874_single(lxb_encoding_decode_t *ctx,
3254 const lxb_char_t **data, const lxb_char_t *end)
3255 {
3256 if (**data < 0x80) {
3257 return *(*data)++;
3258 }
3259
3260 return lxb_encoding_single_index_windows_874[*(*data)++ - 0x80].codepoint;
3261 }
3262
3263 lxb_codepoint_t
lxb_encoding_decode_x_mac_cyrillic_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)3264 lxb_encoding_decode_x_mac_cyrillic_single(lxb_encoding_decode_t *ctx,
3265 const lxb_char_t **data, const lxb_char_t *end)
3266 {
3267 if (**data < 0x80) {
3268 return *(*data)++;
3269 }
3270
3271 return lxb_encoding_single_index_x_mac_cyrillic[*(*data)++ - 0x80].codepoint;
3272 }
3273
3274 lxb_codepoint_t
lxb_encoding_decode_x_user_defined_single(lxb_encoding_decode_t * ctx,const lxb_char_t ** data,const lxb_char_t * end)3275 lxb_encoding_decode_x_user_defined_single(lxb_encoding_decode_t *ctx,
3276 const lxb_char_t **data, const lxb_char_t *end)
3277 {
3278 if (**data < 0x80) {
3279 return *(*data)++;
3280 }
3281
3282 return 0xF780 + (*(*data)++) - 0x80;
3283 }
3284