xref: /php-src/ext/dom/lexbor/lexbor/encoding/encode.c (revision f0934090)
1 /*
2  * Copyright (C) 2019 Alexander Borisov
3  *
4  * Author: Alexander Borisov <borisov@lexbor.com>
5  */
6 
7 #include "lexbor/encoding/encode.h"
8 #include "lexbor/encoding/single.h"
9 #include "lexbor/encoding/multi.h"
10 #include "lexbor/encoding/range.h"
11 
12 
/*
 * Append a single byte to the output buffer of `ctx`.
 *
 * When the buffer is already full the macro makes the *calling function*
 * return LXB_STATUS_SMALL_BUFFER. The `cp` argument is parenthesized before
 * it is narrowed so that an expression argument is cast as a whole.
 */
#define LXB_ENCODING_ENCODE_APPEND(ctx, cp)                                    \
    do {                                                                       \
        if ((ctx)->buffer_used == (ctx)->buffer_length) {                      \
            return LXB_STATUS_SMALL_BUFFER;                                    \
        }                                                                      \
                                                                               \
        (ctx)->buffer_out[(ctx)->buffer_used++] = (lxb_char_t) (cp);           \
    }                                                                          \
    while (0)
22 
/*
 * Append a single byte to the output buffer of `ctx`, for callers that walk
 * the input with a local cursor.
 *
 * Relies on two names from the expansion site: `p` (the local cursor) and
 * `cps` (the caller-visible position). When the buffer is full, `p` is stored
 * to `*cps` and the *calling function* returns LXB_STATUS_SMALL_BUFFER.
 * The `cp` argument is parenthesized so an expression is cast as a whole.
 */
#define LXB_ENCODING_ENCODE_APPEND_P(ctx, cp)                                  \
    do {                                                                       \
        if ((ctx)->buffer_used == (ctx)->buffer_length) {                      \
            *cps = p;                                                          \
            return LXB_STATUS_SMALL_BUFFER;                                    \
        }                                                                      \
                                                                               \
        (ctx)->buffer_out[(ctx)->buffer_used++] = (lxb_char_t) (cp);           \
    }                                                                          \
    while (0)
33 
/*
 * Handle a code point that cannot be encoded.
 *
 * If `ctx` has no replacement sequence (replace_to == NULL) the *calling
 * function* returns LXB_STATUS_ERROR. If the replacement does not fit in the
 * remaining output space it returns LXB_STATUS_SMALL_BUFFER. Otherwise the
 * replace_len bytes at replace_to are copied to the output.
 *
 * `ctx` is parenthesized everywhere so any pointer expression may be passed.
 */
#define LXB_ENCODING_ENCODE_ERROR(ctx)                                         \
    do {                                                                       \
        if ((ctx)->replace_to == NULL) {                                       \
            return LXB_STATUS_ERROR;                                           \
        }                                                                      \
                                                                               \
        if (((ctx)->buffer_used + (ctx)->replace_len)                          \
            > (ctx)->buffer_length)                                            \
        {                                                                      \
            return LXB_STATUS_SMALL_BUFFER;                                    \
        }                                                                      \
                                                                               \
        memcpy(&(ctx)->buffer_out[(ctx)->buffer_used], (ctx)->replace_to,      \
               (ctx)->replace_len);                                            \
                                                                               \
        (ctx)->buffer_used += (ctx)->replace_len;                              \
    }                                                                          \
    while (0)
50 
/*
 * Handle a code point that cannot be encoded, for callers that walk the input
 * with a local cursor.
 *
 * Relies on `p` (local cursor) and `cps` (caller-visible position) from the
 * expansion site; `p` is stored to `*cps` before either early return.
 * Returns LXB_STATUS_ERROR from the *calling function* when `ctx` has no
 * replacement sequence, LXB_STATUS_SMALL_BUFFER when the replacement does not
 * fit, and otherwise copies the replacement bytes to the output.
 *
 * `ctx` is parenthesized everywhere so any pointer expression may be passed.
 */
#define LXB_ENCODING_ENCODE_ERROR_P(ctx)                                       \
    do {                                                                       \
        if ((ctx)->replace_to == NULL) {                                       \
            *cps = p;                                                          \
            return LXB_STATUS_ERROR;                                           \
        }                                                                      \
                                                                               \
        if (((ctx)->buffer_used + (ctx)->replace_len)                          \
            > (ctx)->buffer_length)                                            \
        {                                                                      \
            *cps = p;                                                          \
            return LXB_STATUS_SMALL_BUFFER;                                    \
        }                                                                      \
                                                                               \
        memcpy(&(ctx)->buffer_out[(ctx)->buffer_used], (ctx)->replace_to,      \
               (ctx)->replace_len);                                            \
                                                                               \
        (ctx)->buffer_used += (ctx)->replace_len;                              \
    }                                                                          \
    while (0)
69 
/*
 * Complete body of a single-byte encoder function.
 *
 * Expects `ctx`, `cps` and `end` to be in scope at the expansion site.
 * Code points below 0x80 are written as-is; every other code point is looked
 * up in the static hash `table` and the stored value is written as one byte.
 * A code point missing from the table goes through
 * LXB_ENCODING_ENCODE_ERROR_P.
 *
 * The input is walked with a local cursor `p`. On every exit, including
 * success, `p` is written back to `*cps` so the caller sees how far the input
 * was consumed, in the same way as the multi-byte encoders in this file.
 */
#define LXB_ENCODING_ENCODE_SINGLE_BYTE(table, table_size)                     \
    do {                                                                       \
        lxb_codepoint_t cp;                                                    \
        const lxb_codepoint_t *p = *cps;                                       \
        const lexbor_shs_hash_t *hash;                                         \
                                                                               \
        for (; p < end; p++) {                                                 \
            cp = *p;                                                           \
                                                                               \
            if (cp < 0x80) {                                                   \
                LXB_ENCODING_ENCODE_APPEND_P(ctx, cp);                         \
                continue;                                                      \
            }                                                                  \
                                                                               \
            hash = lexbor_shs_hash_get_static(table, table_size, cp);          \
            if (hash == NULL) {                                                \
                LXB_ENCODING_ENCODE_ERROR_P(ctx);                              \
                continue;                                                      \
            }                                                                  \
                                                                               \
            LXB_ENCODING_ENCODE_APPEND_P(ctx, (uintptr_t) hash->value);        \
        }                                                                      \
                                                                               \
        /* Publish the final position; the early returns already do. */       \
        *cps = p;                                                              \
                                                                               \
        return LXB_STATUS_OK;                                                  \
    }                                                                          \
    while (0)
96 
/*
 * Complete body of a "one code point" single-byte encoder function.
 *
 * Expects `cp` (the code point) and `data` (pointer to the output cursor) in
 * scope at the expansion site. Writes exactly one byte through `*data`,
 * advances the cursor and returns 1; returns LXB_ENCODING_ENCODE_ERROR when
 * a code point >= 0x80 is absent from `table`.
 *
 * The expansion deliberately ends without a semicolon; the user supplies it.
 */
#define LXB_ENCODING_ENCODE_BYTE_SINGLE(table, table_size)                     \
    const lexbor_shs_hash_t *hash;                                             \
                                                                               \
    if (cp >= 0x80) {                                                          \
        hash = lexbor_shs_hash_get_static(table, table_size, cp);              \
        if (hash == NULL) {                                                    \
            return LXB_ENCODING_ENCODE_ERROR;                                  \
        }                                                                      \
                                                                               \
        *(*data)++ = (lxb_char_t) (uintptr_t) hash->value;                     \
        return 1;                                                              \
    }                                                                          \
                                                                               \
    *(*data)++ = (lxb_char_t) cp;                                              \
    return 1
112 
113 
114 lxb_status_t
lxb_encoding_encode_default(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)115 lxb_encoding_encode_default(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
116                             const lxb_codepoint_t *end)
117 {
118     return lxb_encoding_encode_utf_8(ctx, cps, end);
119 }
120 
121 lxb_status_t
lxb_encoding_encode_auto(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)122 lxb_encoding_encode_auto(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
123                          const lxb_codepoint_t *end)
124 {
125     *cps = end;
126     return LXB_STATUS_ERROR;
127 }
128 
129 lxb_status_t
lxb_encoding_encode_undefined(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)130 lxb_encoding_encode_undefined(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
131                               const lxb_codepoint_t *end)
132 {
133     *cps = end;
134     return LXB_STATUS_ERROR;
135 }
136 
137 lxb_status_t
lxb_encoding_encode_big5(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)138 lxb_encoding_encode_big5(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
139                          const lxb_codepoint_t *end)
140 {
141     lxb_codepoint_t cp;
142     const lexbor_shs_hash_t *hash;
143 
144     for (; *cps < end; (*cps)++) {
145         cp = **cps;
146 
147         if (cp < 0x80) {
148             LXB_ENCODING_ENCODE_APPEND(ctx, cp);
149             continue;
150         }
151 
152         hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_big5,
153                                           LXB_ENCODING_MULTI_HASH_BIG5_SIZE, cp);
154         if (hash == NULL) {
155             LXB_ENCODING_ENCODE_ERROR(ctx);
156             continue;
157         }
158 
159         if ((ctx->buffer_used + 2) > ctx->buffer_length) {
160             return LXB_STATUS_SMALL_BUFFER;
161         }
162 
163         ctx->buffer_out[ ctx->buffer_used++ ] = ((uint32_t) (uintptr_t) hash->value) / 157 + 0x81;
164 
165         if ((((uint32_t) (uintptr_t) hash->value) % 157) < 0x3F) {
166             ctx->buffer_out[ ctx->buffer_used++ ] = (((uint32_t) (uintptr_t) hash->value) % 157) + 0x40;
167         }
168         else {
169             ctx->buffer_out[ ctx->buffer_used++ ] = (((uint32_t) (uintptr_t) hash->value) % 157) + 0x62;
170         }
171     }
172 
173     return LXB_STATUS_OK;
174 }
175 
176 lxb_status_t
lxb_encoding_encode_euc_jp(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)177 lxb_encoding_encode_euc_jp(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
178                            const lxb_codepoint_t *end)
179 {
180     lxb_codepoint_t cp;
181     const lexbor_shs_hash_t *hash;
182 
183     for (; *cps < end; (*cps)++) {
184         cp = **cps;
185 
186         if (cp < 0x80) {
187             LXB_ENCODING_ENCODE_APPEND(ctx, cp);
188             continue;
189         }
190 
191         if (cp == 0x00A5) {
192             LXB_ENCODING_ENCODE_APPEND(ctx, 0x5C);
193             continue;
194         }
195 
196         if (cp == 0x203E) {
197             LXB_ENCODING_ENCODE_APPEND(ctx, 0x7E);
198             continue;
199         }
200 
201         if ((unsigned) (cp - 0xFF61) <= (0xFF9F - 0xFF61)) {
202             if ((ctx->buffer_used + 2) > ctx->buffer_length) {
203                 return LXB_STATUS_SMALL_BUFFER;
204             }
205 
206             ctx->buffer_out[ ctx->buffer_used++ ] = 0x8E;
207             ctx->buffer_out[ ctx->buffer_used++ ] = cp - 0xFF61 + 0xA1;
208 
209             continue;
210         }
211 
212         if (cp == 0x2212) {
213             cp = 0xFF0D;
214         }
215 
216         hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_jis0208,
217                                           LXB_ENCODING_MULTI_HASH_JIS0208_SIZE, cp);
218         if (hash == NULL) {
219             LXB_ENCODING_ENCODE_ERROR(ctx);
220             continue;
221         }
222 
223         if ((ctx->buffer_used + 2) > ctx->buffer_length) {
224             return LXB_STATUS_SMALL_BUFFER;
225         }
226 
227         ctx->buffer_out[ ctx->buffer_used++ ] = (uint32_t) (uintptr_t) hash->value / 94 + 0xA1;
228         ctx->buffer_out[ ctx->buffer_used++ ] = (uint32_t) (uintptr_t) hash->value % 94 + 0xA1;
229     }
230 
231     return LXB_STATUS_OK;
232 }
233 
234 lxb_status_t
lxb_encoding_encode_euc_kr(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)235 lxb_encoding_encode_euc_kr(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
236                            const lxb_codepoint_t *end)
237 {
238     lxb_codepoint_t cp;
239     const lexbor_shs_hash_t *hash;
240 
241     for (; *cps < end; (*cps)++) {
242         cp = **cps;
243 
244         if (cp < 0x80) {
245             LXB_ENCODING_ENCODE_APPEND(ctx, cp);
246             continue;
247         }
248 
249         hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_euc_kr,
250                                           LXB_ENCODING_MULTI_HASH_EUC_KR_SIZE, cp);
251         if (hash == NULL) {
252             LXB_ENCODING_ENCODE_ERROR(ctx);
253             continue;
254         }
255 
256         if ((ctx->buffer_used + 2) > ctx->buffer_length) {
257             return LXB_STATUS_SMALL_BUFFER;
258         }
259 
260         ctx->buffer_out[ ctx->buffer_used++ ] = (uint32_t) (uintptr_t) hash->value / 190 + 0x81;
261         ctx->buffer_out[ ctx->buffer_used++ ] = (uint32_t) (uintptr_t) hash->value % 190 + 0x41;
262     }
263 
264     return LXB_STATUS_OK;
265 }
266 
267 lxb_status_t
lxb_encoding_encode_gbk(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)268 lxb_encoding_encode_gbk(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
269                         const lxb_codepoint_t *end)
270 {
271     lxb_codepoint_t cp;
272     const lexbor_shs_hash_t *hash;
273 
274     for (; *cps < end; (*cps)++) {
275         cp = **cps;
276 
277         if (cp < 0x80) {
278             LXB_ENCODING_ENCODE_APPEND(ctx, cp);
279             continue;
280         }
281 
282         if (cp == 0xE5E5) {
283             LXB_ENCODING_ENCODE_ERROR(ctx);
284             continue;
285         }
286 
287         if (cp == 0x20AC) {
288             LXB_ENCODING_ENCODE_APPEND(ctx, 0x80);
289             continue;
290         }
291 
292         hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_gb18030,
293                                           LXB_ENCODING_MULTI_HASH_GB18030_SIZE, cp);
294         if (hash == NULL) {
295             LXB_ENCODING_ENCODE_ERROR(ctx);
296             continue;
297         }
298 
299         if ((ctx->buffer_used + 2) > ctx->buffer_length) {
300             return LXB_STATUS_SMALL_BUFFER;
301         }
302 
303         ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (uintptr_t) hash->value / 190 + 0x81;
304 
305         if (((lxb_char_t) (uintptr_t) hash->value % 190) < 0x3F) {
306             ctx->buffer_out[ ctx->buffer_used++ ] = ((lxb_char_t) (uintptr_t) hash->value % 190) + 0x40;
307         }
308         else {
309             ctx->buffer_out[ ctx->buffer_used++ ] = ((lxb_char_t) (uintptr_t) hash->value % 190) + 0x41;
310         }
311     }
312 
313     return LXB_STATUS_OK;
314 }
315 
316 lxb_status_t
lxb_encoding_encode_ibm866(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)317 lxb_encoding_encode_ibm866(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
318                            const lxb_codepoint_t *end)
319 {
320 
321     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_ibm866,
322                                     LXB_ENCODING_SINGLE_HASH_IBM866_SIZE);
323 }
324 
325 lxb_status_t
lxb_encoding_encode_iso_2022_jp(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)326 lxb_encoding_encode_iso_2022_jp(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
327                                 const lxb_codepoint_t *end)
328 {
329     int8_t size;
330     unsigned state;
331     lxb_codepoint_t cp;
332     const lexbor_shs_hash_t *hash;
333 
334     size = 0;
335     state = ctx->state;
336 
337     for (; *cps < end; (*cps)++) {
338         cp = **cps;
339 
340     begin:
341 
342         switch (ctx->state) {
343             case LXB_ENCODING_ENCODE_2022_JP_ASCII:
344                 if (cp == 0x000E || cp == 0x000F || cp == 0x001B) {
345                     goto failed;
346                 }
347 
348                 if (cp < 0x80) {
349                     LXB_ENCODING_ENCODE_APPEND(ctx, cp);
350                     continue;
351                 }
352 
353                 if (cp == 0x00A5 || cp == 0x203E) {
354                     /*
355                      * Do not switch to the ROMAN stage with prepend code point
356                      * to stream, add it immediately.
357                      */
358                     if ((ctx->buffer_used + 4) > ctx->buffer_length) {
359                         goto small_buffer;
360                     }
361 
362                     ctx->state = LXB_ENCODING_ENCODE_2022_JP_ROMAN;
363 
364                     if (cp == 0x00A5) {
365                         memcpy(&ctx->buffer_out[ctx->buffer_used],
366                                "\x1B\x28\x4A\x5C", 4);
367                         ctx->buffer_used += 4;
368 
369                         continue;
370                     }
371 
372                     memcpy(&ctx->buffer_out[ctx->buffer_used],
373                            "\x1B\x28\x4A\x7E", 4);
374                     ctx->buffer_used += 4;
375 
376                     continue;
377                 }
378 
379                 break;
380 
381             case LXB_ENCODING_ENCODE_2022_JP_ROMAN:
382                 if (cp == 0x000E || cp == 0x000F || cp == 0x001B) {
383                     goto failed;
384                 }
385 
386                 if (cp < 0x80) {
387                     switch (cp) {
388                         case 0x005C:
389                         case 0x007E:
390                             break;
391 
392                         case 0x00A5:
393                             LXB_ENCODING_ENCODE_APPEND(ctx, 0x5C);
394                             continue;
395 
396                         case 0x203E:
397                             LXB_ENCODING_ENCODE_APPEND(ctx, 0x7E);
398                             continue;
399 
400                         default:
401                             LXB_ENCODING_ENCODE_APPEND(ctx, cp);
402                             continue;
403                     }
404 
405                     /*
406                      * Do not switch to the ANSI stage with prepend code point
407                      * to stream, add it immediately.
408                      */
409                     if ((ctx->buffer_used + 4) > ctx->buffer_length) {
410                         goto small_buffer;
411                     }
412 
413                     ctx->state = LXB_ENCODING_ENCODE_2022_JP_ASCII;
414 
415                     memcpy(&ctx->buffer_out[ctx->buffer_used], "\x1B\x28\x42", 3);
416                     ctx->buffer_used += 3;
417 
418                     ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) cp;
419                     continue;
420                 }
421 
422                 break;
423 
424             case LXB_ENCODING_ENCODE_2022_JP_JIS0208:
425                 if (cp < 0x80) {
426                     if ((ctx->buffer_used + 4) > ctx->buffer_length) {
427                         goto small_buffer;
428                     }
429 
430                     ctx->state = LXB_ENCODING_ENCODE_2022_JP_ASCII;
431 
432                     memcpy(&ctx->buffer_out[ctx->buffer_used], "\x1B\x28\x42", 3);
433                     ctx->buffer_used += 3;
434 
435                     ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) cp;
436                     continue;
437                 }
438 
439                 if (cp == 0x00A5 || cp == 0x203E) {
440                     if ((ctx->buffer_used + 4) > ctx->buffer_length) {
441                         goto small_buffer;
442                     }
443 
444                     ctx->state = LXB_ENCODING_ENCODE_2022_JP_ROMAN;
445 
446                     if (cp == 0x00A5) {
447                         memcpy(&ctx->buffer_out[ctx->buffer_used],
448                                "\x1B\x28\x4A\x5C", 4);
449                         ctx->buffer_used += 4;
450 
451                         continue;
452                     }
453 
454                     memcpy(&ctx->buffer_out[ctx->buffer_used],
455                            "\x1B\x28\x4A\x7E", 4);
456                     ctx->buffer_used += 4;
457 
458                     continue;
459                 }
460 
461                 break;
462         }
463 
464         if ((ctx->buffer_used + 2) > ctx->buffer_length) {
465             goto small_buffer;
466         }
467 
468         if (cp == 0x2212) {
469             cp = 0xFF0D;
470         }
471 
472         if ((unsigned) (cp - 0xFF61) <= (0xFF9F - 0xFF61)) {
473             cp = lxb_encoding_multi_index_iso_2022_jp_katakana[cp - 0xFF61].codepoint;
474         }
475 
476         hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_jis0208,
477                                           LXB_ENCODING_MULTI_HASH_JIS0208_SIZE, cp);
478         if (hash == NULL) {
479             goto failed;
480         }
481 
482         if (ctx->state != LXB_ENCODING_ENCODE_2022_JP_JIS0208) {
483             if ((ctx->buffer_used + 3) > ctx->buffer_length) {
484                 goto small_buffer;
485             }
486 
487             memcpy(&ctx->buffer_out[ctx->buffer_used], "\x1B\x24\x42", 3);
488             ctx->buffer_used += 3;
489 
490             ctx->state = LXB_ENCODING_ENCODE_2022_JP_JIS0208;
491             size += 3;
492 
493             goto begin;
494         }
495 
496         ctx->buffer_out[ ctx->buffer_used++ ] = (uint32_t) (uintptr_t) hash->value / 94 + 0x21;
497         ctx->buffer_out[ ctx->buffer_used++ ] = (uint32_t) (uintptr_t) hash->value % 94 + 0x21;
498 
499         continue;
500 
501     small_buffer:
502 
503         ctx->state = state;
504         ctx->buffer_used -= size;
505 
506         return LXB_STATUS_SMALL_BUFFER;
507 
508     failed:
509 
510         ctx->buffer_used -= size;
511         LXB_ENCODING_ENCODE_ERROR(ctx);
512     }
513 
514     return LXB_STATUS_OK;
515 }
516 
517 lxb_status_t
lxb_encoding_encode_iso_2022_jp_eof(lxb_encoding_encode_t * ctx)518 lxb_encoding_encode_iso_2022_jp_eof(lxb_encoding_encode_t *ctx)
519 {
520     if (ctx->state != LXB_ENCODING_ENCODE_2022_JP_ASCII) {
521         if ((ctx->buffer_used + 3) > ctx->buffer_length) {
522             return LXB_STATUS_SMALL_BUFFER;
523         }
524 
525         memcpy(&ctx->buffer_out[ctx->buffer_used], "\x1B\x28\x42", 3);
526         ctx->buffer_used += 3;
527     }
528 
529     return LXB_STATUS_OK;
530 }
531 
532 lxb_status_t
lxb_encoding_encode_iso_8859_10(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)533 lxb_encoding_encode_iso_8859_10(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
534                                 const lxb_codepoint_t *end)
535 {
536     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_10,
537                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_10_SIZE);
538 }
539 
540 lxb_status_t
lxb_encoding_encode_iso_8859_13(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)541 lxb_encoding_encode_iso_8859_13(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
542                                 const lxb_codepoint_t *end)
543 {
544     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_13,
545                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_13_SIZE);
546 }
547 
548 lxb_status_t
lxb_encoding_encode_iso_8859_14(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)549 lxb_encoding_encode_iso_8859_14(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
550                                 const lxb_codepoint_t *end)
551 {
552     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_14,
553                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_14_SIZE);
554 }
555 
556 lxb_status_t
lxb_encoding_encode_iso_8859_15(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)557 lxb_encoding_encode_iso_8859_15(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
558                                 const lxb_codepoint_t *end)
559 {
560     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_15,
561                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_15_SIZE);
562 }
563 
564 lxb_status_t
lxb_encoding_encode_iso_8859_16(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)565 lxb_encoding_encode_iso_8859_16(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
566                                 const lxb_codepoint_t *end)
567 {
568     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_16,
569                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_16_SIZE);
570 }
571 
572 lxb_status_t
lxb_encoding_encode_iso_8859_2(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)573 lxb_encoding_encode_iso_8859_2(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
574                                const lxb_codepoint_t *end)
575 {
576     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_2,
577                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_2_SIZE);
578 }
579 
580 lxb_status_t
lxb_encoding_encode_iso_8859_3(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)581 lxb_encoding_encode_iso_8859_3(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
582                                const lxb_codepoint_t *end)
583 {
584     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_3,
585                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_3_SIZE);
586 }
587 
588 lxb_status_t
lxb_encoding_encode_iso_8859_4(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)589 lxb_encoding_encode_iso_8859_4(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
590                                const lxb_codepoint_t *end)
591 {
592     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_4,
593                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_4_SIZE);
594 }
595 
596 lxb_status_t
lxb_encoding_encode_iso_8859_5(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)597 lxb_encoding_encode_iso_8859_5(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
598                                const lxb_codepoint_t *end)
599 {
600     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_5,
601                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_5_SIZE);
602 }
603 
604 lxb_status_t
lxb_encoding_encode_iso_8859_6(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)605 lxb_encoding_encode_iso_8859_6(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
606                                const lxb_codepoint_t *end)
607 {
608     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_6,
609                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_6_SIZE);
610 }
611 
612 lxb_status_t
lxb_encoding_encode_iso_8859_7(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)613 lxb_encoding_encode_iso_8859_7(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
614                                const lxb_codepoint_t *end)
615 {
616     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_7,
617                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_7_SIZE);
618 }
619 
620 lxb_status_t
lxb_encoding_encode_iso_8859_8(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)621 lxb_encoding_encode_iso_8859_8(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
622                                const lxb_codepoint_t *end)
623 {
624     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_8,
625                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_8_SIZE);
626 }
627 
628 lxb_status_t
lxb_encoding_encode_iso_8859_8_i(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)629 lxb_encoding_encode_iso_8859_8_i(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
630                                  const lxb_codepoint_t *end)
631 {
632     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_8,
633                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_8_SIZE);
634 }
635 
636 lxb_status_t
lxb_encoding_encode_koi8_r(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)637 lxb_encoding_encode_koi8_r(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
638                            const lxb_codepoint_t *end)
639 {
640     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_koi8_r,
641                                     LXB_ENCODING_SINGLE_HASH_KOI8_R_SIZE);
642 }
643 
644 lxb_status_t
lxb_encoding_encode_koi8_u(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)645 lxb_encoding_encode_koi8_u(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
646                            const lxb_codepoint_t *end)
647 {
648     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_koi8_u,
649                                     LXB_ENCODING_SINGLE_HASH_KOI8_U_SIZE);
650 }
651 
652 lxb_inline const lexbor_shs_hash_t *
lxb_encoding_encode_shift_jis_index(lxb_codepoint_t cp)653 lxb_encoding_encode_shift_jis_index(lxb_codepoint_t cp)
654 {
655     const lexbor_shs_hash_t *entry;
656 
657     entry = &lxb_encoding_multi_hash_jis0208[ (cp % LXB_ENCODING_MULTI_HASH_JIS0208_SIZE) + 1 ];
658 
659     do {
660         if (entry->key == cp) {
661             if ((unsigned) ((uint32_t) (uintptr_t) entry->value - 8272) > (8835 - 8272)) {
662                 return entry;
663             }
664         }
665 
666         entry = &lxb_encoding_multi_hash_jis0208[entry->next];
667     }
668     while (entry != lxb_encoding_multi_hash_jis0208);
669 
670     return NULL;
671 }
672 
673 lxb_status_t
lxb_encoding_encode_shift_jis(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)674 lxb_encoding_encode_shift_jis(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
675                               const lxb_codepoint_t *end)
676 {
677     uint32_t lead, trail;
678     lxb_codepoint_t cp;
679     const lexbor_shs_hash_t *hash;
680 
681     for (; *cps < end; (*cps)++) {
682         cp = **cps;
683 
684         if (cp <= 0x80) {
685             LXB_ENCODING_ENCODE_APPEND(ctx, cp);
686             continue;
687         }
688 
689         if ((unsigned) (cp - 0xFF61) <= (0xFF9F - 0xFF61)) {
690             LXB_ENCODING_ENCODE_APPEND(ctx, cp - 0xFF61 + 0xA1);
691             continue;
692         }
693 
694         switch (cp) {
695             case 0x00A5:
696                 LXB_ENCODING_ENCODE_APPEND(ctx, 0x5C);
697                 continue;
698 
699             case 0x203E:
700                 LXB_ENCODING_ENCODE_APPEND(ctx, 0x7E);
701                 continue;
702 
703             case 0x2212:
704                 cp = 0xFF0D;
705                 break;
706         }
707 
708         hash = lxb_encoding_encode_shift_jis_index(cp);
709         if (hash == NULL) {
710             LXB_ENCODING_ENCODE_ERROR(ctx);
711             continue;
712         }
713 
714         if ((ctx->buffer_used + 2) > ctx->buffer_length) {
715             return LXB_STATUS_SMALL_BUFFER;
716         }
717 
718         lead = (uint32_t) (uintptr_t) hash->value / 188;
719         trail = (uint32_t) (uintptr_t) hash->value % 188;
720 
721         ctx->buffer_out[ctx->buffer_used++ ] = lead + ((lead < 0x1F) ? 0x81 : 0xC1);
722         ctx->buffer_out[ctx->buffer_used++ ] = trail + ((trail < 0x3F) ? 0x40 : 0x41);
723     }
724 
725     return LXB_STATUS_OK;
726 }
727 
728 lxb_inline void
lxb_encoding_encode_utf_16_write(lxb_encoding_encode_t * ctx,bool is_be,lxb_codepoint_t cp)729 lxb_encoding_encode_utf_16_write(lxb_encoding_encode_t *ctx, bool is_be,
730                                  lxb_codepoint_t cp)
731 {
732     if (is_be) {
733         ctx->buffer_out[ctx->buffer_used++] = cp >> 8;
734         ctx->buffer_out[ctx->buffer_used++] = cp & 0x00FF;
735 
736         return;
737     }
738 
739     ctx->buffer_out[ctx->buffer_used++] = cp & 0x00FF;
740     ctx->buffer_out[ctx->buffer_used++] = cp >> 8;
741 }
742 
743 lxb_inline int8_t
lxb_encoding_encode_utf_16(lxb_encoding_encode_t * ctx,bool is_be,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)744 lxb_encoding_encode_utf_16(lxb_encoding_encode_t *ctx, bool is_be,
745                         const lxb_codepoint_t **cps, const lxb_codepoint_t *end)
746 {
747     lxb_codepoint_t cp;
748 
749     for (; *cps < end; (*cps)++) {
750         cp = **cps;
751 
752         if (cp < 0x10000) {
753             if ((ctx->buffer_used + 2) > ctx->buffer_length) {
754                 return LXB_STATUS_SMALL_BUFFER;
755             }
756 
757             lxb_encoding_encode_utf_16_write(ctx, is_be, cp);
758 
759             continue;
760         }
761 
762         if ((ctx->buffer_used + 4) > ctx->buffer_length) {
763             return LXB_STATUS_SMALL_BUFFER;
764         }
765 
766         cp -= 0x10000;
767 
768         lxb_encoding_encode_utf_16_write(ctx, is_be, (0xD800 | (cp >> 0x0A)));
769         lxb_encoding_encode_utf_16_write(ctx, is_be, (0xDC00 | (cp & 0x03FF)));
770     }
771 
772     return LXB_STATUS_OK;
773 }
774 
775 lxb_status_t
lxb_encoding_encode_utf_16be(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)776 lxb_encoding_encode_utf_16be(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
777                              const lxb_codepoint_t *end)
778 {
779     return lxb_encoding_encode_utf_16(ctx, true, cps, end);
780 }
781 
782 lxb_status_t
lxb_encoding_encode_utf_16le(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)783 lxb_encoding_encode_utf_16le(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
784                              const lxb_codepoint_t *end)
785 {
786     return lxb_encoding_encode_utf_16(ctx, false, cps, end);
787 }
788 
789 lxb_status_t
lxb_encoding_encode_utf_8(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)790 lxb_encoding_encode_utf_8(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
791                           const lxb_codepoint_t *end)
792 {
793     lxb_codepoint_t cp;
794     const lxb_codepoint_t *p = *cps;
795 
796     for (; p < end; p++) {
797         cp = *p;
798 
799         if (cp < 0x80) {
800             if ((ctx->buffer_used + 1) > ctx->buffer_length) {
801                 *cps = p;
802 
803                 return LXB_STATUS_SMALL_BUFFER;
804             }
805 
806             /* 0xxxxxxx */
807             ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) cp;
808         }
809         else if (cp < 0x800) {
810             if ((ctx->buffer_used + 2) > ctx->buffer_length) {
811                 *cps = p;
812 
813                 return LXB_STATUS_SMALL_BUFFER;
814             }
815 
816             /* 110xxxxx 10xxxxxx */
817             ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0xC0 | (cp >> 6  ));
818             ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0x80 | (cp & 0x3F));
819         }
820         else if (cp < 0x10000) {
821             if ((ctx->buffer_used + 3) > ctx->buffer_length) {
822                 *cps = p;
823 
824                 return LXB_STATUS_SMALL_BUFFER;
825             }
826 
827             /* 1110xxxx 10xxxxxx 10xxxxxx */
828             ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0xE0 | ((cp >> 12)));
829             ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0x80 | ((cp >> 6 ) & 0x3F));
830             ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0x80 | ( cp        & 0x3F));
831         }
832         else if (cp < 0x110000) {
833             if ((ctx->buffer_used + 4) > ctx->buffer_length) {
834                 *cps = p;
835 
836                 return LXB_STATUS_SMALL_BUFFER;
837             }
838 
839             /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
840             ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0xF0 | ( cp >> 18));
841             ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0x80 | ((cp >> 12) & 0x3F));
842             ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0x80 | ((cp >> 6 ) & 0x3F));
843             ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0x80 | ( cp        & 0x3F));
844         }
845         else {
846             *cps = p;
847             LXB_ENCODING_ENCODE_ERROR(ctx);
848         }
849     }
850 
851     *cps = p;
852 
853     return LXB_STATUS_OK;
854 }
855 
856 lxb_inline uint32_t
lxb_encoding_encode_gb18030_range(lxb_codepoint_t cp)857 lxb_encoding_encode_gb18030_range(lxb_codepoint_t cp)
858 {
859     size_t mid, left, right;
860     const lxb_encoding_range_index_t *range;
861 
862     if (cp == 0xE7C7) {
863         return 7457;
864     }
865 
866     left = 0;
867     right = LXB_ENCODING_RANGE_INDEX_GB18030_SIZE;
868     range = lxb_encoding_range_index_gb18030;
869 
870     /* Some compilers say about uninitialized mid */
871     mid = 0;
872 
873     while (left < right) {
874         mid = left + (right - left) / 2;
875 
876         if (range[mid].codepoint < cp) {
877             left = mid + 1;
878 
879             if (left < right && range[left].codepoint > cp) {
880                 break;
881             }
882         }
883         else if (range[mid].codepoint > cp) {
884             right = mid - 1;
885 
886             if (right > 0 && range[right].codepoint <= cp) {
887                 mid = right;
888                 break;
889             }
890         }
891         else {
892             break;
893         }
894     }
895 
896     return range[mid].index + cp - range[mid].codepoint;
897 }
898 
899 lxb_status_t
lxb_encoding_encode_gb18030(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)900 lxb_encoding_encode_gb18030(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
901                             const lxb_codepoint_t *end)
902 {
903     uint32_t index;
904     lxb_codepoint_t cp;
905     const lexbor_shs_hash_t *hash;
906 
907     for (; *cps < end; (*cps)++) {
908         cp = **cps;
909 
910         if (cp < 0x80) {
911             LXB_ENCODING_ENCODE_APPEND(ctx, cp);
912             continue;
913         }
914 
915         if (cp == 0xE5E5) {
916             LXB_ENCODING_ENCODE_ERROR(ctx);
917             continue;
918         }
919 
920         hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_gb18030,
921                                           LXB_ENCODING_MULTI_HASH_GB18030_SIZE, cp);
922         if (hash != NULL) {
923             if ((ctx->buffer_used + 2) > ctx->buffer_length) {
924                 return LXB_STATUS_SMALL_BUFFER;
925             }
926 
927             ctx->buffer_out[ ctx->buffer_used++ ] = (uint32_t) (uintptr_t) hash->value / 190 + 0x81;
928 
929             if (((uint32_t) (uintptr_t) hash->value % 190) < 0x3F) {
930                 ctx->buffer_out[ ctx->buffer_used++ ] = ((uint32_t) (uintptr_t) hash->value % 190) + 0x40;
931             }
932             else {
933                 ctx->buffer_out[ ctx->buffer_used++ ] = ((uint32_t) (uintptr_t) hash->value % 190) + 0x41;
934             }
935 
936             continue;
937         }
938 
939         if ((ctx->buffer_used + 4) > ctx->buffer_length) {
940             return LXB_STATUS_SMALL_BUFFER;
941         }
942 
943         index = lxb_encoding_encode_gb18030_range(cp);
944 
945         ctx->buffer_out[ ctx->buffer_used++ ] = (index / (10 * 126 * 10)) + 0x81;
946         ctx->buffer_out[ ctx->buffer_used++ ] = ((index % (10 * 126 * 10)) / (10 * 126)) + 0x30;
947 
948         index = (index % (10 * 126 * 10)) % (10 * 126);
949 
950         ctx->buffer_out[ ctx->buffer_used++ ] = (index / 10) + 0x81;
951         ctx->buffer_out[ ctx->buffer_used++ ] = (index % 10) + 0x30;
952     }
953 
954     return LXB_STATUS_OK;
955 }
956 
957 lxb_status_t
lxb_encoding_encode_macintosh(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)958 lxb_encoding_encode_macintosh(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
959                               const lxb_codepoint_t *end)
960 {
961     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_macintosh,
962                                     LXB_ENCODING_SINGLE_HASH_MACINTOSH_SIZE);
963 }
964 
965 lxb_status_t
lxb_encoding_encode_replacement(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)966 lxb_encoding_encode_replacement(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
967                                 const lxb_codepoint_t *end)
968 {
969     *cps = end;
970     return LXB_STATUS_ERROR;
971 }
972 
973 lxb_status_t
lxb_encoding_encode_windows_1250(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)974 lxb_encoding_encode_windows_1250(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
975                                  const lxb_codepoint_t *end)
976 {
977     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1250,
978                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1250_SIZE);
979 }
980 
981 lxb_status_t
lxb_encoding_encode_windows_1251(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)982 lxb_encoding_encode_windows_1251(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
983                                  const lxb_codepoint_t *end)
984 {
985     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1251,
986                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1251_SIZE);
987 }
988 
989 lxb_status_t
lxb_encoding_encode_windows_1252(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)990 lxb_encoding_encode_windows_1252(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
991                                  const lxb_codepoint_t *end)
992 {
993     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1252,
994                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1252_SIZE);
995 }
996 
997 lxb_status_t
lxb_encoding_encode_windows_1253(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)998 lxb_encoding_encode_windows_1253(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
999                                  const lxb_codepoint_t *end)
1000 {
1001     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1253,
1002                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1253_SIZE);
1003 }
1004 
1005 lxb_status_t
lxb_encoding_encode_windows_1254(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1006 lxb_encoding_encode_windows_1254(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1007                                  const lxb_codepoint_t *end)
1008 {
1009     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1254,
1010                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1254_SIZE);
1011 }
1012 
1013 lxb_status_t
lxb_encoding_encode_windows_1255(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1014 lxb_encoding_encode_windows_1255(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1015                                  const lxb_codepoint_t *end)
1016 {
1017     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1255,
1018                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1255_SIZE);
1019 }
1020 
1021 lxb_status_t
lxb_encoding_encode_windows_1256(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1022 lxb_encoding_encode_windows_1256(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1023                                  const lxb_codepoint_t *end)
1024 {
1025     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1256,
1026                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1256_SIZE);
1027 }
1028 
1029 lxb_status_t
lxb_encoding_encode_windows_1257(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1030 lxb_encoding_encode_windows_1257(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1031                                  const lxb_codepoint_t *end)
1032 {
1033     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1257,
1034                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1257_SIZE);
1035 }
1036 
1037 lxb_status_t
lxb_encoding_encode_windows_1258(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1038 lxb_encoding_encode_windows_1258(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1039                                  const lxb_codepoint_t *end)
1040 {
1041     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1258,
1042                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1258_SIZE);
1043 }
1044 
1045 lxb_status_t
lxb_encoding_encode_windows_874(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1046 lxb_encoding_encode_windows_874(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1047                                 const lxb_codepoint_t *end)
1048 {
1049     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_874,
1050                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_874_SIZE);
1051 }
1052 
1053 lxb_status_t
lxb_encoding_encode_x_mac_cyrillic(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1054 lxb_encoding_encode_x_mac_cyrillic(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1055                                    const lxb_codepoint_t *end)
1056 {
1057     LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_x_mac_cyrillic,
1058                                   LXB_ENCODING_SINGLE_HASH_X_MAC_CYRILLIC_SIZE);
1059 }
1060 
1061 lxb_status_t
lxb_encoding_encode_x_user_defined(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1062 lxb_encoding_encode_x_user_defined(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1063                                    const lxb_codepoint_t *end)
1064 {
1065     lxb_codepoint_t cp;
1066 
1067     for (; *cps < end; (*cps)++) {
1068         cp = **cps;
1069 
1070         if (cp < 0x80) {
1071             LXB_ENCODING_ENCODE_APPEND(ctx, cp);
1072         }
1073         else if (cp >= 0xF780 && cp <= 0xF7FF) {
1074             LXB_ENCODING_ENCODE_APPEND(ctx, (cp - 0xF780 + 0x80));
1075         }
1076         else {
1077             LXB_ENCODING_ENCODE_ERROR(ctx);
1078         }
1079     }
1080 
1081     return LXB_STATUS_OK;
1082 }
1083 
1084 /*
1085  * Single
1086  */
1087 int8_t
lxb_encoding_encode_default_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1088 lxb_encoding_encode_default_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1089                                    const lxb_char_t *end, lxb_codepoint_t cp)
1090 {
1091     return lxb_encoding_encode_utf_8_single(ctx, data, end, cp);
1092 }
1093 
1094 int8_t
lxb_encoding_encode_auto_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1095 lxb_encoding_encode_auto_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1096                                 const lxb_char_t *end, lxb_codepoint_t cp)
1097 {
1098     return LXB_ENCODING_ENCODE_ERROR;
1099 }
1100 
1101 int8_t
lxb_encoding_encode_undefined_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1102 lxb_encoding_encode_undefined_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1103                                      const lxb_char_t *end, lxb_codepoint_t cp)
1104 {
1105     return LXB_ENCODING_ENCODE_ERROR;
1106 }
1107 
1108 int8_t
lxb_encoding_encode_big5_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1109 lxb_encoding_encode_big5_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1110                                 const lxb_char_t *end, lxb_codepoint_t cp)
1111 {
1112     const lexbor_shs_hash_t *hash;
1113 
1114     if (cp < 0x80) {
1115         *(*data)++ = (lxb_char_t) cp;
1116 
1117         return 1;
1118     }
1119 
1120     hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_big5,
1121                                       LXB_ENCODING_MULTI_HASH_BIG5_SIZE, cp);
1122     if (hash == NULL) {
1123         return LXB_ENCODING_ENCODE_ERROR;
1124     }
1125 
1126     if ((*data + 2) > end) {
1127         return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1128     }
1129 
1130     *(*data)++ = ((uint32_t) (uintptr_t) hash->value) / 157 + 0x81;
1131 
1132     if ((((uint32_t) (uintptr_t) hash->value) % 157) < 0x3F) {
1133         *(*data)++ = (((uint32_t) (uintptr_t) hash->value) % 157) + 0x40;
1134     }
1135     else {
1136         *(*data)++ = (((uint32_t) (uintptr_t) hash->value) % 157) + 0x62;
1137     }
1138 
1139     return 2;
1140 }
1141 
1142 int8_t
lxb_encoding_encode_euc_jp_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1143 lxb_encoding_encode_euc_jp_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1144                                   const lxb_char_t *end, lxb_codepoint_t cp)
1145 {
1146     const lexbor_shs_hash_t *hash;
1147 
1148     if (cp < 0x80) {
1149         *(*data)++ = (lxb_char_t) cp;
1150 
1151         return 1;
1152     }
1153 
1154     if (cp == 0x00A5) {
1155         *(*data)++ = 0x5C;
1156 
1157         return 1;
1158     }
1159 
1160     if (cp == 0x203E) {
1161         *(*data)++ = 0x7E;
1162 
1163         return 1;
1164     }
1165 
1166     if ((*data + 2) > end) {
1167         return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1168     }
1169 
1170     if ((unsigned) (cp - 0xFF61) <= (0xFF9F - 0xFF61)) {
1171         *(*data)++ = 0x8E;
1172         *(*data)++ = cp - 0xFF61 + 0xA1;
1173 
1174         return 2;
1175     }
1176 
1177     if (cp == 0x2212) {
1178         cp = 0xFF0D;
1179     }
1180 
1181     hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_jis0208,
1182                                       LXB_ENCODING_MULTI_HASH_JIS0208_SIZE, cp);
1183     if (hash == NULL) {
1184         return LXB_ENCODING_ENCODE_ERROR;
1185     }
1186 
1187     *(*data)++ = (uint32_t) (uintptr_t) hash->value / 94 + 0xA1;
1188     *(*data)++ = (uint32_t) (uintptr_t) hash->value % 94 + 0xA1;
1189 
1190     return 2;
1191 }
1192 
1193 int8_t
lxb_encoding_encode_euc_kr_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1194 lxb_encoding_encode_euc_kr_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1195                                   const lxb_char_t *end, lxb_codepoint_t cp)
1196 {
1197     const lexbor_shs_hash_t *hash;
1198 
1199     if (cp < 0x80) {
1200         *(*data)++ = (lxb_char_t) cp;
1201 
1202         return 1;
1203     }
1204 
1205     if ((*data + 2) > end) {
1206         return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1207     }
1208 
1209     hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_euc_kr,
1210                                       LXB_ENCODING_MULTI_HASH_EUC_KR_SIZE, cp);
1211     if (hash == NULL) {
1212         return LXB_ENCODING_ENCODE_ERROR;
1213     }
1214 
1215     *(*data)++ = (uint32_t) (uintptr_t) hash->value / 190 + 0x81;
1216     *(*data)++ = (uint32_t) (uintptr_t) hash->value % 190 + 0x41;
1217 
1218     return 2;
1219 }
1220 
1221 int8_t
lxb_encoding_encode_gbk_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1222 lxb_encoding_encode_gbk_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1223                                const lxb_char_t *end, lxb_codepoint_t cp)
1224 {
1225     const lexbor_shs_hash_t *hash;
1226 
1227     if (cp < 0x80) {
1228         *(*data)++ = (lxb_char_t) cp;
1229 
1230         return 1;
1231     }
1232 
1233     if (cp == 0xE5E5) {
1234         return LXB_ENCODING_ENCODE_ERROR;
1235     }
1236 
1237     if (cp == 0x20AC) {
1238         *(*data)++ = 0x80;
1239 
1240         return 1;
1241     }
1242 
1243     hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_gb18030,
1244                                       LXB_ENCODING_MULTI_HASH_GB18030_SIZE, cp);
1245     if (hash != NULL) {
1246         if ((*data + 2) > end) {
1247             return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1248         }
1249 
1250         *(*data)++ = (lxb_char_t) (uintptr_t) hash->value / 190 + 0x81;
1251 
1252         if (((lxb_char_t) (uintptr_t) hash->value % 190) < 0x3F) {
1253             *(*data)++ = ((lxb_char_t) (uintptr_t) hash->value % 190) + 0x40;
1254         }
1255         else {
1256             *(*data)++ = ((lxb_char_t) (uintptr_t) hash->value % 190) + 0x41;
1257         }
1258 
1259         return 2;
1260     }
1261 
1262     return LXB_ENCODING_ENCODE_ERROR;
1263 }
1264 
1265 int8_t
lxb_encoding_encode_ibm866_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1266 lxb_encoding_encode_ibm866_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1267                                   const lxb_char_t *end, lxb_codepoint_t cp)
1268 {
1269     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_ibm866,
1270                                     LXB_ENCODING_SINGLE_HASH_IBM866_SIZE);
1271 }
1272 
1273 int8_t
lxb_encoding_encode_iso_2022_jp_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1274 lxb_encoding_encode_iso_2022_jp_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1275                                        const lxb_char_t *end, lxb_codepoint_t cp)
1276 {
1277     int8_t size;
1278     unsigned state;
1279     const lexbor_shs_hash_t *hash;
1280 
1281     size = 0;
1282     state = ctx->state;
1283 
1284 begin:
1285 
1286     switch (ctx->state) {
1287         case LXB_ENCODING_ENCODE_2022_JP_ASCII:
1288             if (cp == 0x000E || cp == 0x000F || cp == 0x001B) {
1289                 goto failed;
1290             }
1291 
1292             if (cp < 0x80) {
1293                 *(*data)++ = (lxb_char_t) cp;
1294 
1295                 return size + 1;
1296             }
1297 
1298             if (cp == 0x00A5 || cp == 0x203E) {
1299                 /*
1300                  * Do not switch to the ROMAN stage with prepend code point
1301                  * to stream, add it immediately.
1302                  */
1303                 if ((*data + 4) > end) {
1304                     goto small_buffer;
1305                 }
1306 
1307                 ctx->state = LXB_ENCODING_ENCODE_2022_JP_ROMAN;
1308 
1309                 if (cp == 0x00A5) {
1310                     memcpy(*data, "\x1B\x28\x4A\x5C", 4);
1311                     *data = *data + 4;
1312 
1313                     return size + 4;
1314                 }
1315 
1316                 memcpy(*data, "\x1B\x28\x4A\x7E", 4);
1317                 *data = *data + 4;
1318 
1319                 return size + 4;
1320             }
1321 
1322             break;
1323 
1324         case LXB_ENCODING_ENCODE_2022_JP_ROMAN:
1325             if (cp == 0x000E || cp == 0x000F || cp == 0x001B) {
1326                 goto failed;
1327             }
1328 
1329             if (cp < 0x80) {
1330                 switch (cp) {
1331                     case 0x005C:
1332                     case 0x007E:
1333                         break;
1334 
1335                     case 0x00A5:
1336                         *(*data)++ = 0x5C;
1337                         return size + 1;
1338 
1339                     case 0x203E:
1340                         *(*data)++ = 0x7E;
1341                         return size + 1;
1342 
1343                     default:
1344                         *(*data)++ = (lxb_char_t) cp;
1345                         return size + 1;
1346                 }
1347 
1348                 /*
1349                  * Do not switch to the ANSI stage with prepend code point
1350                  * to stream, add it immediately.
1351                  */
1352                 if ((*data + 4) > end) {
1353                     goto small_buffer;
1354                 }
1355 
1356                 ctx->state = LXB_ENCODING_ENCODE_2022_JP_ASCII;
1357 
1358                 memcpy(*data, "\x1B\x28\x42", 3);
1359                 *data = *data + 3;
1360 
1361                 *(*data)++ = (lxb_char_t) cp;
1362 
1363                 return size + 4;
1364             }
1365 
1366             break;
1367 
1368         case LXB_ENCODING_ENCODE_2022_JP_JIS0208:
1369             if (cp < 0x80) {
1370                 if ((*data + 4) > end) {
1371                     goto small_buffer;
1372                 }
1373 
1374                 ctx->state = LXB_ENCODING_ENCODE_2022_JP_ASCII;
1375 
1376                 memcpy(*data, "\x1B\x28\x42", 3);
1377                 *data = *data + 3;
1378 
1379                 *(*data)++ = (lxb_char_t) cp;
1380 
1381                 return size + 4;
1382             }
1383 
1384             if (cp == 0x00A5 || cp == 0x203E) {
1385                 if ((*data + 4) > end) {
1386                     goto small_buffer;
1387                 }
1388 
1389                 ctx->state = LXB_ENCODING_ENCODE_2022_JP_ROMAN;
1390 
1391                 if (cp == 0x00A5) {
1392                     memcpy(*data, "\x1B\x28\x4A\x5C", 4);
1393                     *data = *data + 4;
1394 
1395                     return size + 4;
1396                 }
1397 
1398                 memcpy(*data, "\x1B\x28\x4A\x7E", 4);
1399                 *data = *data + 4;
1400 
1401                 return size + 4;
1402             }
1403 
1404             break;
1405     }
1406 
1407     if ((*data + 2) > end) {
1408         goto small_buffer;
1409     }
1410 
1411     if (cp == 0x2212) {
1412         cp = 0xFF0D;
1413     }
1414 
1415     if ((unsigned) (cp - 0xFF61) <= (0xFF9F - 0xFF61)) {
1416         cp = lxb_encoding_multi_index_iso_2022_jp_katakana[cp - 0xFF61].codepoint;
1417     }
1418 
1419     hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_jis0208,
1420                                       LXB_ENCODING_MULTI_HASH_JIS0208_SIZE, cp);
1421     if (hash == NULL) {
1422         goto failed;
1423     }
1424 
1425     if (ctx->state != LXB_ENCODING_ENCODE_2022_JP_JIS0208) {
1426         if ((*data + 3) > end) {
1427             goto small_buffer;
1428         }
1429 
1430         memcpy(*data, "\x1B\x24\x42", 3);
1431         *data = *data + 3;
1432 
1433         ctx->state = LXB_ENCODING_ENCODE_2022_JP_JIS0208;
1434         size += 3;
1435 
1436         goto begin;
1437     }
1438 
1439     *(*data)++ = (uint32_t) (uintptr_t) hash->value / 94 + 0x21;
1440     *(*data)++ = (uint32_t) (uintptr_t) hash->value % 94 + 0x21;
1441 
1442     return size + 2;
1443 
1444 small_buffer:
1445 
1446     ctx->state = state;
1447     *data = *data - size;
1448 
1449     return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1450 
1451 failed:
1452 
1453     *data = *data - size;
1454 
1455     return LXB_ENCODING_ENCODE_ERROR;
1456 }
1457 
1458 int8_t
lxb_encoding_encode_iso_2022_jp_eof_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end)1459 lxb_encoding_encode_iso_2022_jp_eof_single(lxb_encoding_encode_t *ctx,
1460                                        lxb_char_t **data, const lxb_char_t *end)
1461 {
1462     if (ctx->state != LXB_ENCODING_ENCODE_2022_JP_ASCII) {
1463         if ((*data + 3) > end) {
1464             return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1465         }
1466 
1467         memcpy(*data, "\x1B\x28\x42", 3);
1468         *data = *data + 3;
1469 
1470         ctx->state = LXB_ENCODING_ENCODE_2022_JP_ASCII;
1471 
1472         return 3;
1473     }
1474 
1475     return 0;
1476 }
1477 
1478 int8_t
lxb_encoding_encode_iso_8859_10_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1479 lxb_encoding_encode_iso_8859_10_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1480                                        const lxb_char_t *end, lxb_codepoint_t cp)
1481 {
1482     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_10,
1483                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_10_SIZE);
1484 }
1485 
1486 int8_t
lxb_encoding_encode_iso_8859_13_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1487 lxb_encoding_encode_iso_8859_13_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1488                                        const lxb_char_t *end, lxb_codepoint_t cp)
1489 {
1490     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_13,
1491                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_13_SIZE);
1492 }
1493 
1494 int8_t
lxb_encoding_encode_iso_8859_14_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1495 lxb_encoding_encode_iso_8859_14_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1496                                        const lxb_char_t *end, lxb_codepoint_t cp)
1497 {
1498     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_14,
1499                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_14_SIZE);
1500 }
1501 
1502 int8_t
lxb_encoding_encode_iso_8859_15_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1503 lxb_encoding_encode_iso_8859_15_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1504                                        const lxb_char_t *end, lxb_codepoint_t cp)
1505 {
1506     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_15,
1507                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_15_SIZE);
1508 }
1509 
1510 int8_t
lxb_encoding_encode_iso_8859_16_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1511 lxb_encoding_encode_iso_8859_16_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1512                                        const lxb_char_t *end, lxb_codepoint_t cp)
1513 {
1514     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_16,
1515                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_16_SIZE);
1516 }
1517 
1518 int8_t
lxb_encoding_encode_iso_8859_2_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1519 lxb_encoding_encode_iso_8859_2_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1520                                       const lxb_char_t *end, lxb_codepoint_t cp)
1521 {
1522     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_2,
1523                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_2_SIZE);
1524 }
1525 
1526 int8_t
lxb_encoding_encode_iso_8859_3_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1527 lxb_encoding_encode_iso_8859_3_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1528                                       const lxb_char_t *end, lxb_codepoint_t cp)
1529 {
1530     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_3,
1531                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_3_SIZE);
1532 }
1533 
1534 int8_t
lxb_encoding_encode_iso_8859_4_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1535 lxb_encoding_encode_iso_8859_4_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1536                                       const lxb_char_t *end, lxb_codepoint_t cp)
1537 {
1538     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_4,
1539                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_4_SIZE);
1540 }
1541 
1542 int8_t
lxb_encoding_encode_iso_8859_5_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1543 lxb_encoding_encode_iso_8859_5_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1544                                       const lxb_char_t *end, lxb_codepoint_t cp)
1545 {
1546     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_5,
1547                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_5_SIZE);
1548 }
1549 
1550 int8_t
lxb_encoding_encode_iso_8859_6_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1551 lxb_encoding_encode_iso_8859_6_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1552                                       const lxb_char_t *end, lxb_codepoint_t cp)
1553 {
1554     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_6,
1555                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_6_SIZE);
1556 }
1557 
1558 int8_t
lxb_encoding_encode_iso_8859_7_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1559 lxb_encoding_encode_iso_8859_7_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1560                                       const lxb_char_t *end, lxb_codepoint_t cp)
1561 {
1562     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_7,
1563                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_7_SIZE);
1564 }
1565 
1566 int8_t
lxb_encoding_encode_iso_8859_8_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1567 lxb_encoding_encode_iso_8859_8_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1568                                       const lxb_char_t *end, lxb_codepoint_t cp)
1569 {
1570     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_8,
1571                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_8_SIZE);
1572 }
1573 
1574 int8_t
lxb_encoding_encode_iso_8859_8_i_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1575 lxb_encoding_encode_iso_8859_8_i_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1576                                         const lxb_char_t *end, lxb_codepoint_t cp)
1577 {
1578     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_8,
1579                                     LXB_ENCODING_SINGLE_HASH_ISO_8859_8_SIZE);
1580 }
1581 
1582 int8_t
lxb_encoding_encode_koi8_r_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1583 lxb_encoding_encode_koi8_r_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1584                                   const lxb_char_t *end, lxb_codepoint_t cp)
1585 {
1586     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_koi8_r,
1587                                     LXB_ENCODING_SINGLE_HASH_KOI8_R_SIZE);
1588 }
1589 
1590 int8_t
lxb_encoding_encode_koi8_u_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1591 lxb_encoding_encode_koi8_u_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1592                                   const lxb_char_t *end, lxb_codepoint_t cp)
1593 {
1594     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_koi8_u,
1595                                     LXB_ENCODING_SINGLE_HASH_KOI8_U_SIZE);
1596 }
1597 
1598 int8_t
lxb_encoding_encode_shift_jis_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1599 lxb_encoding_encode_shift_jis_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1600                                      const lxb_char_t *end, lxb_codepoint_t cp)
1601 {
1602     uint32_t lead, trail;
1603     const lexbor_shs_hash_t *hash;
1604 
1605     if (cp <= 0x80) {
1606         *(*data)++ = (lxb_char_t) cp;
1607 
1608         return 1;
1609     }
1610 
1611     if ((unsigned) (cp - 0xFF61) <= (0xFF9F - 0xFF61)) {
1612         *(*data)++ = cp - 0xFF61 + 0xA1;
1613 
1614         return 1;
1615     }
1616 
1617     switch (cp) {
1618         case 0x00A5:
1619             *(*data)++ = 0x5C;
1620             return 1;
1621 
1622         case 0x203E:
1623             *(*data)++ = 0x7E;
1624             return 1;
1625 
1626         case 0x2212:
1627             cp = 0xFF0D;
1628             break;
1629     }
1630 
1631     hash = lxb_encoding_encode_shift_jis_index(cp);
1632     if (hash == NULL) {
1633         return LXB_ENCODING_ENCODE_ERROR;
1634     }
1635 
1636     if ((*data + 2) > end) {
1637         return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1638     }
1639 
1640     lead = (uint32_t) (uintptr_t) hash->value / 188;
1641     trail = (uint32_t) (uintptr_t) hash->value % 188;
1642 
1643     *(*data)++ = lead + ((lead < 0x1F) ? 0x81 : 0xC1);
1644     *(*data)++ = trail + ((trail < 0x3F) ? 0x40 : 0x41);
1645 
1646     return 2;
1647 }
1648 
1649 lxb_inline void
lxb_encoding_encode_utf_16_write_single(bool is_be,lxb_char_t ** data,lxb_codepoint_t cp)1650 lxb_encoding_encode_utf_16_write_single(bool is_be, lxb_char_t **data,
1651                                         lxb_codepoint_t cp)
1652 {
1653     if (is_be) {
1654         *(*data)++ = cp >> 8;
1655         *(*data)++ = cp & 0x00FF;
1656 
1657         return;
1658     }
1659 
1660     *(*data)++ = cp & 0x00FF;
1661     *(*data)++ = cp >> 8;
1662 }
1663 
1664 lxb_inline int8_t
lxb_encoding_encode_utf_16_single(lxb_encoding_encode_t * ctx,bool is_be,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1665 lxb_encoding_encode_utf_16_single(lxb_encoding_encode_t *ctx, bool is_be,
1666                    lxb_char_t **data, const lxb_char_t *end, lxb_codepoint_t cp)
1667 {
1668     if ((*data + 2) > end) {
1669         return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1670     }
1671 
1672     if (cp < 0x10000) {
1673         lxb_encoding_encode_utf_16_write_single(is_be, data, cp);
1674 
1675         return 2;
1676     }
1677 
1678     if ((*data + 4) > end) {
1679         return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1680     }
1681 
1682     cp -= 0x10000;
1683 
1684     lxb_encoding_encode_utf_16_write_single(is_be, data, (0xD800 | (cp >> 0x0A)));
1685     lxb_encoding_encode_utf_16_write_single(is_be, data, (0xDC00 | (cp & 0x03FF)));
1686 
1687     return 4;
1688 }
1689 
1690 int8_t
lxb_encoding_encode_utf_16be_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1691 lxb_encoding_encode_utf_16be_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1692                                     const lxb_char_t *end, lxb_codepoint_t cp)
1693 {
1694     return lxb_encoding_encode_utf_16_single(ctx, true, data, end, cp);
1695 }
1696 
1697 int8_t
lxb_encoding_encode_utf_16le_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1698 lxb_encoding_encode_utf_16le_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1699                                     const lxb_char_t *end, lxb_codepoint_t cp)
1700 {
1701     return lxb_encoding_encode_utf_16_single(ctx, false, data, end, cp);
1702 }
1703 
1704 int8_t
lxb_encoding_encode_utf_8_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1705 lxb_encoding_encode_utf_8_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1706                                  const lxb_char_t *end, lxb_codepoint_t cp)
1707 {
1708     if (cp < 0x80) {
1709         /* 0xxxxxxx */
1710         *(*data)++ = (lxb_char_t) cp;
1711 
1712         return 1;
1713     }
1714 
1715     if (cp < 0x800) {
1716         if ((*data + 2) > end) {
1717             return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1718         }
1719 
1720         /* 110xxxxx 10xxxxxx */
1721         *(*data)++ = (lxb_char_t) (0xC0 | (cp >> 6  ));
1722         *(*data)++ = (lxb_char_t) (0x80 | (cp & 0x3F));
1723 
1724         return 2;
1725     }
1726 
1727     if (cp < 0x10000) {
1728         if ((*data + 3) > end) {
1729             return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1730         }
1731 
1732         /* 1110xxxx 10xxxxxx 10xxxxxx */
1733         *(*data)++ = (lxb_char_t) (0xE0 | ((cp >> 12)));
1734         *(*data)++ = (lxb_char_t) (0x80 | ((cp >> 6 ) & 0x3F));
1735         *(*data)++ = (lxb_char_t) (0x80 | ( cp        & 0x3F));
1736 
1737         return 3;
1738     }
1739 
1740     if (cp < 0x110000) {
1741         if ((*data + 4) > end) {
1742             return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1743         }
1744 
1745         /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
1746         *(*data)++ = (lxb_char_t) (0xF0 | ( cp >> 18));
1747         *(*data)++ = (lxb_char_t) (0x80 | ((cp >> 12) & 0x3F));
1748         *(*data)++ = (lxb_char_t) (0x80 | ((cp >> 6 ) & 0x3F));
1749         *(*data)++ = (lxb_char_t) (0x80 | ( cp        & 0x3F));
1750 
1751         return 4;
1752     }
1753 
1754     return LXB_ENCODING_ENCODE_ERROR;
1755 }
1756 
1757 int8_t
lxb_encoding_encode_utf_8_length(lxb_codepoint_t cp)1758 lxb_encoding_encode_utf_8_length(lxb_codepoint_t cp)
1759 {
1760     if (cp < 0x80) {
1761         return 1;
1762     }
1763     else if (cp < 0x800) {
1764         return 2;
1765     }
1766     else if (cp < 0x10000) {
1767         return 3;
1768     }
1769     else if (cp < 0x110000) {
1770         return 4;
1771     }
1772 
1773     return 0;
1774 }
1775 
1776 int8_t
lxb_encoding_encode_gb18030_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1777 lxb_encoding_encode_gb18030_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1778                                    const lxb_char_t *end, lxb_codepoint_t cp)
1779 {
1780     uint32_t index;
1781     const lexbor_shs_hash_t *hash;
1782 
1783     if (cp < 0x80) {
1784         *(*data)++ = (lxb_char_t) cp;
1785 
1786         return 1;
1787     }
1788 
1789     if (cp == 0xE5E5) {
1790         return LXB_ENCODING_ENCODE_ERROR;
1791     }
1792 
1793     hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_gb18030,
1794                                       LXB_ENCODING_MULTI_HASH_GB18030_SIZE, cp);
1795     if (hash != NULL) {
1796         if ((*data + 2) > end) {
1797             return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1798         }
1799 
1800         *(*data)++ = (uint32_t) (uintptr_t) hash->value / 190 + 0x81;
1801 
1802         if (((uint32_t) (uintptr_t) hash->value % 190) < 0x3F) {
1803             *(*data)++ = ((uint32_t) (uintptr_t) hash->value % 190) + 0x40;
1804         }
1805         else {
1806             *(*data)++ = ((uint32_t) (uintptr_t) hash->value % 190) + 0x41;
1807         }
1808 
1809         return 2;
1810     }
1811 
1812     if ((*data + 4) > end) {
1813         return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1814     }
1815 
1816     index = lxb_encoding_encode_gb18030_range(cp);
1817 
1818     *(*data)++ = (index / (10 * 126 * 10)) + 0x81;
1819     *(*data)++ = ((index % (10 * 126 * 10)) / (10 * 126)) + 0x30;
1820 
1821     index = (index % (10 * 126 * 10)) % (10 * 126);
1822 
1823     *(*data)++ = (index / 10) + 0x81;
1824     *(*data)++ = (index % 10) + 0x30;
1825 
1826     return 4;
1827 }
1828 
1829 int8_t
lxb_encoding_encode_macintosh_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1830 lxb_encoding_encode_macintosh_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1831                                      const lxb_char_t *end, lxb_codepoint_t cp)
1832 {
1833     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_macintosh,
1834                                     LXB_ENCODING_SINGLE_HASH_MACINTOSH_SIZE);
1835 }
1836 
1837 int8_t
lxb_encoding_encode_replacement_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1838 lxb_encoding_encode_replacement_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1839                                        const lxb_char_t *end, lxb_codepoint_t cp)
1840 {
1841     (*data)++;
1842     return LXB_ENCODING_ENCODE_ERROR;
1843 }
1844 
1845 int8_t
lxb_encoding_encode_windows_1250_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1846 lxb_encoding_encode_windows_1250_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1847                                         const lxb_char_t *end, lxb_codepoint_t cp)
1848 {
1849     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1250,
1850                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1250_SIZE);
1851 }
1852 
1853 int8_t
lxb_encoding_encode_windows_1251_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1854 lxb_encoding_encode_windows_1251_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1855                                         const lxb_char_t *end, lxb_codepoint_t cp)
1856 {
1857     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1251,
1858                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1251_SIZE);
1859 }
1860 
1861 int8_t
lxb_encoding_encode_windows_1252_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1862 lxb_encoding_encode_windows_1252_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1863                                         const lxb_char_t *end, lxb_codepoint_t cp)
1864 {
1865     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1252,
1866                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1252_SIZE);
1867 }
1868 
1869 int8_t
lxb_encoding_encode_windows_1253_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1870 lxb_encoding_encode_windows_1253_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1871                                         const lxb_char_t *end, lxb_codepoint_t cp)
1872 {
1873     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1253,
1874                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1253_SIZE);
1875 }
1876 
1877 int8_t
lxb_encoding_encode_windows_1254_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1878 lxb_encoding_encode_windows_1254_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1879                                         const lxb_char_t *end, lxb_codepoint_t cp)
1880 {
1881     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1254,
1882                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1254_SIZE);
1883 }
1884 
1885 int8_t
lxb_encoding_encode_windows_1255_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1886 lxb_encoding_encode_windows_1255_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1887                                         const lxb_char_t *end, lxb_codepoint_t cp)
1888 {
1889     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1255,
1890                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1255_SIZE);
1891 }
1892 
1893 int8_t
lxb_encoding_encode_windows_1256_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1894 lxb_encoding_encode_windows_1256_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1895                                         const lxb_char_t *end, lxb_codepoint_t cp)
1896 {
1897     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1256,
1898                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1256_SIZE);
1899 }
1900 
1901 int8_t
lxb_encoding_encode_windows_1257_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1902 lxb_encoding_encode_windows_1257_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1903                                         const lxb_char_t *end, lxb_codepoint_t cp)
1904 {
1905     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1257,
1906                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1257_SIZE);
1907 }
1908 
1909 int8_t
lxb_encoding_encode_windows_1258_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1910 lxb_encoding_encode_windows_1258_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1911                                         const lxb_char_t *end, lxb_codepoint_t cp)
1912 {
1913     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1258,
1914                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_1258_SIZE);
1915 }
1916 
1917 int8_t
lxb_encoding_encode_windows_874_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1918 lxb_encoding_encode_windows_874_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1919                                        const lxb_char_t *end, lxb_codepoint_t cp)
1920 {
1921     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_874,
1922                                     LXB_ENCODING_SINGLE_HASH_WINDOWS_874_SIZE);
1923 }
1924 
1925 int8_t
lxb_encoding_encode_x_mac_cyrillic_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1926 lxb_encoding_encode_x_mac_cyrillic_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1927                                           const lxb_char_t *end, lxb_codepoint_t cp)
1928 {
1929     LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_x_mac_cyrillic,
1930                                   LXB_ENCODING_SINGLE_HASH_X_MAC_CYRILLIC_SIZE);
1931 }
1932 
1933 int8_t
lxb_encoding_encode_x_user_defined_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1934 lxb_encoding_encode_x_user_defined_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1935                                           const lxb_char_t *end, lxb_codepoint_t cp)
1936 {
1937     if (cp < 0x80) {
1938         *(*data)++ = (lxb_char_t) cp;
1939 
1940         return 1;
1941     }
1942 
1943     if (cp >= 0xF780 && cp <= 0xF7FF) {
1944         *(*data)++ = (lxb_char_t) (cp - 0xF780 + 0x80);
1945 
1946         return 1;
1947     }
1948 
1949     return LXB_ENCODING_ENCODE_ERROR;
1950 }
1951