1 /*
2 * Copyright (C) 2019-2024 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
7 #include "lexbor/encoding/encode.h"
8 #include "lexbor/encoding/single.h"
9 #include "lexbor/encoding/multi.h"
10 #include "lexbor/encoding/range.h"
11
12
/*
 * Append one byte to the output buffer of `ctx`.
 *
 * If the buffer is already full the ENCLOSING FUNCTION returns
 * LXB_STATUS_SMALL_BUFFER; nothing is written in that case.
 *
 * `cp` may be an arbitrary expression: it is parenthesized before being
 * narrowed to lxb_char_t so the cast applies to the whole expression.
 */
#define LXB_ENCODING_ENCODE_APPEND(ctx, cp) \
    do { \
        if ((ctx)->buffer_used == (ctx)->buffer_length) { \
            return LXB_STATUS_SMALL_BUFFER; \
        } \
 \
        (ctx)->buffer_out[(ctx)->buffer_used++] = (lxb_char_t) (cp); \
    } \
    while (0)
22
/*
 * Append one byte to the output buffer of `ctx`, for encoders that walk the
 * input through a local cursor.
 *
 * Requires `cps` (pointer to the caller's input cursor) and `p` (the local
 * cursor) to be in scope at the expansion site. If the buffer is full, `p`
 * is stored back through `cps` and the ENCLOSING FUNCTION returns
 * LXB_STATUS_SMALL_BUFFER.
 *
 * `cp` may be an arbitrary expression: it is parenthesized before being
 * narrowed to lxb_char_t.
 */
#define LXB_ENCODING_ENCODE_APPEND_P(ctx, cp) \
    do { \
        if ((ctx)->buffer_used == (ctx)->buffer_length) { \
            *cps = p; \
            return LXB_STATUS_SMALL_BUFFER; \
        } \
 \
        (ctx)->buffer_out[(ctx)->buffer_used++] = (lxb_char_t) (cp); \
    } \
    while (0)
33
/*
 * Handle a code point that cannot be encoded.
 *
 * - No replacement configured (`replace_to` is NULL): the ENCLOSING FUNCTION
 *   returns LXB_STATUS_ERROR.
 * - Replacement does not fit in the remaining output space: the ENCLOSING
 *   FUNCTION returns LXB_STATUS_SMALL_BUFFER.
 * - Otherwise the `replace_len` replacement bytes are copied to the output.
 *
 * The `ctx` argument is parenthesized on every use so any pointer expression
 * (for example `&obj`) can be passed.
 */
#define LXB_ENCODING_ENCODE_ERROR(ctx) \
    do { \
        if ((ctx)->replace_to == NULL) { \
            return LXB_STATUS_ERROR; \
        } \
 \
        if (((ctx)->buffer_used + (ctx)->replace_len) \
            > (ctx)->buffer_length) \
        { \
            return LXB_STATUS_SMALL_BUFFER; \
        } \
 \
        memcpy(&(ctx)->buffer_out[(ctx)->buffer_used], (ctx)->replace_to, \
               (ctx)->replace_len); \
 \
        (ctx)->buffer_used += (ctx)->replace_len; \
    } \
    while (0)
50
/*
 * Handle a code point that cannot be encoded, for encoders that walk the
 * input through a local cursor.
 *
 * Requires `cps` (pointer to the caller's input cursor) and `p` (the local
 * cursor) to be in scope at the expansion site. On both early-return paths
 * `p` is stored back through `cps` first:
 *
 * - `replace_to` is NULL: ENCLOSING FUNCTION returns LXB_STATUS_ERROR.
 * - Replacement does not fit: ENCLOSING FUNCTION returns
 *   LXB_STATUS_SMALL_BUFFER.
 *
 * Otherwise the `replace_len` replacement bytes are copied to the output.
 * The `ctx` argument is parenthesized on every use so any pointer expression
 * can be passed.
 */
#define LXB_ENCODING_ENCODE_ERROR_P(ctx) \
    do { \
        if ((ctx)->replace_to == NULL) { \
            *cps = p; \
            return LXB_STATUS_ERROR; \
        } \
 \
        if (((ctx)->buffer_used + (ctx)->replace_len) \
            > (ctx)->buffer_length) \
        { \
            *cps = p; \
            return LXB_STATUS_SMALL_BUFFER; \
        } \
 \
        memcpy(&(ctx)->buffer_out[(ctx)->buffer_used], (ctx)->replace_to, \
               (ctx)->replace_len); \
 \
        (ctx)->buffer_used += (ctx)->replace_len; \
    } \
    while (0)
69
/*
 * Complete body of a single-byte encoder function.
 *
 * Expects `ctx`, `cps` and `end` to be the parameters of the enclosing
 * function. Code points below 0x80 are written unchanged; every other code
 * point is looked up in the static hash `table` (of `table_size` entries)
 * and the stored value is written as one byte. Code points missing from the
 * table go through LXB_ENCODING_ENCODE_ERROR_P.
 *
 * The input is walked with a local cursor `p`. The helper macros store `p`
 * back through `cps` on their early returns; the success path does the same
 * so the caller's cursor always reflects what was consumed, as it does for
 * the other encoders in this file.
 */
#define LXB_ENCODING_ENCODE_SINGLE_BYTE(table, table_size) \
    do { \
        lxb_codepoint_t cp; \
        const lxb_codepoint_t *p = *cps; \
        const lexbor_shs_hash_t *hash; \
 \
        for (; p < end; p++) { \
            cp = *p; \
 \
            if (cp < 0x80) { \
                LXB_ENCODING_ENCODE_APPEND_P(ctx, cp); \
                continue; \
            } \
 \
            hash = lexbor_shs_hash_get_static(table, table_size, cp); \
            if (hash == NULL) { \
                LXB_ENCODING_ENCODE_ERROR_P(ctx); \
                continue; \
            } \
 \
            LXB_ENCODING_ENCODE_APPEND_P(ctx, (uintptr_t) hash->value); \
        } \
 \
        /* Everything was consumed: publish the final cursor position. */ \
        *cps = p; \
 \
        return LXB_STATUS_OK; \
    } \
    while (0)
96
/*
 * Complete body of a "one code point in, one byte out" encoder function.
 *
 * Expects `cp` (the code point) and `data` (pointer to the output cursor) to
 * be in scope at the expansion site. The macro declares a local and ends in
 * an unterminated `return 1`, so it must be used as the whole function body
 * followed by a semicolon.
 *
 * Note: the bare identifier LXB_ENCODING_ENCODE_ERROR below is not followed
 * by '(' and therefore is NOT an invocation of the function-like macro of
 * the same name defined in this file; presumably it names a constant
 * declared in the encoding headers -- confirm there.
 */
#define LXB_ENCODING_ENCODE_BYTE_SINGLE(table, table_size) \
    const lexbor_shs_hash_t *hash; \
 \
    if (cp >= 0x80) { \
        hash = lexbor_shs_hash_get_static(table, table_size, cp); \
        if (hash == NULL) { \
            return LXB_ENCODING_ENCODE_ERROR; \
        } \
 \
        *(*data)++ = (lxb_char_t) (uintptr_t) hash->value; \
        return 1; \
    } \
 \
    *(*data)++ = (lxb_char_t) cp; \
    return 1
112
113
114 lxb_status_t
lxb_encoding_encode_default(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)115 lxb_encoding_encode_default(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
116 const lxb_codepoint_t *end)
117 {
118 return lxb_encoding_encode_utf_8(ctx, cps, end);
119 }
120
121 lxb_status_t
lxb_encoding_encode_auto(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)122 lxb_encoding_encode_auto(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
123 const lxb_codepoint_t *end)
124 {
125 *cps = end;
126 return LXB_STATUS_ERROR;
127 }
128
129 lxb_status_t
lxb_encoding_encode_undefined(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)130 lxb_encoding_encode_undefined(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
131 const lxb_codepoint_t *end)
132 {
133 *cps = end;
134 return LXB_STATUS_ERROR;
135 }
136
137 lxb_status_t
lxb_encoding_encode_big5(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)138 lxb_encoding_encode_big5(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
139 const lxb_codepoint_t *end)
140 {
141 lxb_codepoint_t cp;
142 const lexbor_shs_hash_t *hash;
143
144 for (; *cps < end; (*cps)++) {
145 cp = **cps;
146
147 if (cp < 0x80) {
148 LXB_ENCODING_ENCODE_APPEND(ctx, cp);
149 continue;
150 }
151
152 hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_big5,
153 LXB_ENCODING_MULTI_HASH_BIG5_SIZE, cp);
154 if (hash == NULL) {
155 LXB_ENCODING_ENCODE_ERROR(ctx);
156 continue;
157 }
158
159 if ((ctx->buffer_used + 2) > ctx->buffer_length) {
160 return LXB_STATUS_SMALL_BUFFER;
161 }
162
163 ctx->buffer_out[ ctx->buffer_used++ ] = ((uint32_t) (uintptr_t) hash->value) / 157 + 0x81;
164
165 if ((((uint32_t) (uintptr_t) hash->value) % 157) < 0x3F) {
166 ctx->buffer_out[ ctx->buffer_used++ ] = (((uint32_t) (uintptr_t) hash->value) % 157) + 0x40;
167 }
168 else {
169 ctx->buffer_out[ ctx->buffer_used++ ] = (((uint32_t) (uintptr_t) hash->value) % 157) + 0x62;
170 }
171 }
172
173 return LXB_STATUS_OK;
174 }
175
176 lxb_status_t
lxb_encoding_encode_euc_jp(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)177 lxb_encoding_encode_euc_jp(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
178 const lxb_codepoint_t *end)
179 {
180 lxb_codepoint_t cp;
181 const lexbor_shs_hash_t *hash;
182
183 for (; *cps < end; (*cps)++) {
184 cp = **cps;
185
186 if (cp < 0x80) {
187 LXB_ENCODING_ENCODE_APPEND(ctx, cp);
188 continue;
189 }
190
191 if (cp == 0x00A5) {
192 LXB_ENCODING_ENCODE_APPEND(ctx, 0x5C);
193 continue;
194 }
195
196 if (cp == 0x203E) {
197 LXB_ENCODING_ENCODE_APPEND(ctx, 0x7E);
198 continue;
199 }
200
201 if ((unsigned) (cp - 0xFF61) <= (0xFF9F - 0xFF61)) {
202 if ((ctx->buffer_used + 2) > ctx->buffer_length) {
203 return LXB_STATUS_SMALL_BUFFER;
204 }
205
206 ctx->buffer_out[ ctx->buffer_used++ ] = 0x8E;
207 ctx->buffer_out[ ctx->buffer_used++ ] = cp - 0xFF61 + 0xA1;
208
209 continue;
210 }
211
212 if (cp == 0x2212) {
213 cp = 0xFF0D;
214 }
215
216 hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_jis0208,
217 LXB_ENCODING_MULTI_HASH_JIS0208_SIZE, cp);
218 if (hash == NULL) {
219 LXB_ENCODING_ENCODE_ERROR(ctx);
220 continue;
221 }
222
223 if ((ctx->buffer_used + 2) > ctx->buffer_length) {
224 return LXB_STATUS_SMALL_BUFFER;
225 }
226
227 ctx->buffer_out[ ctx->buffer_used++ ] = (uint32_t) (uintptr_t) hash->value / 94 + 0xA1;
228 ctx->buffer_out[ ctx->buffer_used++ ] = (uint32_t) (uintptr_t) hash->value % 94 + 0xA1;
229 }
230
231 return LXB_STATUS_OK;
232 }
233
234 lxb_status_t
lxb_encoding_encode_euc_kr(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)235 lxb_encoding_encode_euc_kr(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
236 const lxb_codepoint_t *end)
237 {
238 lxb_codepoint_t cp;
239 const lexbor_shs_hash_t *hash;
240
241 for (; *cps < end; (*cps)++) {
242 cp = **cps;
243
244 if (cp < 0x80) {
245 LXB_ENCODING_ENCODE_APPEND(ctx, cp);
246 continue;
247 }
248
249 hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_euc_kr,
250 LXB_ENCODING_MULTI_HASH_EUC_KR_SIZE, cp);
251 if (hash == NULL) {
252 LXB_ENCODING_ENCODE_ERROR(ctx);
253 continue;
254 }
255
256 if ((ctx->buffer_used + 2) > ctx->buffer_length) {
257 return LXB_STATUS_SMALL_BUFFER;
258 }
259
260 ctx->buffer_out[ ctx->buffer_used++ ] = (uint32_t) (uintptr_t) hash->value / 190 + 0x81;
261 ctx->buffer_out[ ctx->buffer_used++ ] = (uint32_t) (uintptr_t) hash->value % 190 + 0x41;
262 }
263
264 return LXB_STATUS_OK;
265 }
266
267 lxb_status_t
lxb_encoding_encode_gbk(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)268 lxb_encoding_encode_gbk(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
269 const lxb_codepoint_t *end)
270 {
271 lxb_codepoint_t cp;
272 const lexbor_shs_hash_t *hash;
273
274 for (; *cps < end; (*cps)++) {
275 cp = **cps;
276
277 if (cp < 0x80) {
278 LXB_ENCODING_ENCODE_APPEND(ctx, cp);
279 continue;
280 }
281
282 if (cp == 0xE5E5) {
283 LXB_ENCODING_ENCODE_ERROR(ctx);
284 continue;
285 }
286
287 if (cp == 0x20AC) {
288 LXB_ENCODING_ENCODE_APPEND(ctx, 0x80);
289 continue;
290 }
291
292 hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_gb18030,
293 LXB_ENCODING_MULTI_HASH_GB18030_SIZE, cp);
294 if (hash == NULL) {
295 LXB_ENCODING_ENCODE_ERROR(ctx);
296 continue;
297 }
298
299 if ((ctx->buffer_used + 2) > ctx->buffer_length) {
300 return LXB_STATUS_SMALL_BUFFER;
301 }
302
303 ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (uintptr_t) hash->value / 190 + 0x81;
304
305 if (((lxb_char_t) (uintptr_t) hash->value % 190) < 0x3F) {
306 ctx->buffer_out[ ctx->buffer_used++ ] = ((lxb_char_t) (uintptr_t) hash->value % 190) + 0x40;
307 }
308 else {
309 ctx->buffer_out[ ctx->buffer_used++ ] = ((lxb_char_t) (uintptr_t) hash->value % 190) + 0x41;
310 }
311 }
312
313 return LXB_STATUS_OK;
314 }
315
316 lxb_status_t
lxb_encoding_encode_ibm866(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)317 lxb_encoding_encode_ibm866(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
318 const lxb_codepoint_t *end)
319 {
320
321 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_ibm866,
322 LXB_ENCODING_SINGLE_HASH_IBM866_SIZE);
323 }
324
325 lxb_status_t
lxb_encoding_encode_iso_2022_jp(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)326 lxb_encoding_encode_iso_2022_jp(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
327 const lxb_codepoint_t *end)
328 {
329 int8_t size;
330 unsigned state;
331 lxb_codepoint_t cp;
332 const lexbor_shs_hash_t *hash;
333
334 size = 0;
335 state = ctx->state;
336
337 for (; *cps < end; (*cps)++) {
338 cp = **cps;
339
340 begin:
341
342 switch (ctx->state) {
343 case LXB_ENCODING_ENCODE_2022_JP_ASCII:
344 if (cp == 0x000E || cp == 0x000F || cp == 0x001B) {
345 goto failed;
346 }
347
348 if (cp < 0x80) {
349 LXB_ENCODING_ENCODE_APPEND(ctx, cp);
350 continue;
351 }
352
353 if (cp == 0x00A5 || cp == 0x203E) {
354 /*
355 * Do not switch to the ROMAN stage with prepend code point
356 * to stream, add it immediately.
357 */
358 if ((ctx->buffer_used + 4) > ctx->buffer_length) {
359 goto small_buffer;
360 }
361
362 ctx->state = LXB_ENCODING_ENCODE_2022_JP_ROMAN;
363
364 if (cp == 0x00A5) {
365 memcpy(&ctx->buffer_out[ctx->buffer_used],
366 "\x1B\x28\x4A\x5C", 4);
367 ctx->buffer_used += 4;
368
369 continue;
370 }
371
372 memcpy(&ctx->buffer_out[ctx->buffer_used],
373 "\x1B\x28\x4A\x7E", 4);
374 ctx->buffer_used += 4;
375
376 continue;
377 }
378
379 break;
380
381 case LXB_ENCODING_ENCODE_2022_JP_ROMAN:
382 if (cp == 0x000E || cp == 0x000F || cp == 0x001B) {
383 goto failed;
384 }
385
386 if (cp < 0x80) {
387 switch (cp) {
388 case 0x005C:
389 case 0x007E:
390 break;
391
392 default:
393 LXB_ENCODING_ENCODE_APPEND(ctx, cp);
394 continue;
395 }
396
397 /*
398 * Do not switch to the ANSI stage with prepend code point
399 * to stream, add it immediately.
400 */
401 if ((ctx->buffer_used + 4) > ctx->buffer_length) {
402 goto small_buffer;
403 }
404
405 ctx->state = LXB_ENCODING_ENCODE_2022_JP_ASCII;
406
407 memcpy(&ctx->buffer_out[ctx->buffer_used], "\x1B\x28\x42", 3);
408 ctx->buffer_used += 3;
409
410 ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) cp;
411 continue;
412 }
413 else if (cp == 0x00A5) {
414 LXB_ENCODING_ENCODE_APPEND(ctx, 0x5C);
415 continue;
416 }
417 else if (cp == 0x203E) {
418 LXB_ENCODING_ENCODE_APPEND(ctx, 0x7E);
419 continue;
420 }
421
422 break;
423
424 case LXB_ENCODING_ENCODE_2022_JP_JIS0208:
425 if (cp < 0x80) {
426 if ((ctx->buffer_used + 4) > ctx->buffer_length) {
427 goto small_buffer;
428 }
429
430 ctx->state = LXB_ENCODING_ENCODE_2022_JP_ASCII;
431
432 memcpy(&ctx->buffer_out[ctx->buffer_used], "\x1B\x28\x42", 3);
433 ctx->buffer_used += 3;
434
435 ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) cp;
436 continue;
437 }
438
439 if (cp == 0x00A5 || cp == 0x203E) {
440 if ((ctx->buffer_used + 4) > ctx->buffer_length) {
441 goto small_buffer;
442 }
443
444 ctx->state = LXB_ENCODING_ENCODE_2022_JP_ROMAN;
445
446 if (cp == 0x00A5) {
447 memcpy(&ctx->buffer_out[ctx->buffer_used],
448 "\x1B\x28\x4A\x5C", 4);
449 ctx->buffer_used += 4;
450
451 continue;
452 }
453
454 memcpy(&ctx->buffer_out[ctx->buffer_used],
455 "\x1B\x28\x4A\x7E", 4);
456 ctx->buffer_used += 4;
457
458 continue;
459 }
460
461 break;
462 }
463
464 if ((ctx->buffer_used + 2) > ctx->buffer_length) {
465 goto small_buffer;
466 }
467
468 if (cp == 0x2212) {
469 cp = 0xFF0D;
470 }
471
472 if ((unsigned) (cp - 0xFF61) <= (0xFF9F - 0xFF61)) {
473 cp = lxb_encoding_multi_index_iso_2022_jp_katakana[cp - 0xFF61].codepoint;
474 }
475
476 hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_jis0208,
477 LXB_ENCODING_MULTI_HASH_JIS0208_SIZE, cp);
478 if (hash == NULL) {
479 goto failed;
480 }
481
482 if (ctx->state != LXB_ENCODING_ENCODE_2022_JP_JIS0208) {
483 if ((ctx->buffer_used + 3) > ctx->buffer_length) {
484 goto small_buffer;
485 }
486
487 memcpy(&ctx->buffer_out[ctx->buffer_used], "\x1B\x24\x42", 3);
488 ctx->buffer_used += 3;
489
490 ctx->state = LXB_ENCODING_ENCODE_2022_JP_JIS0208;
491 size += 3;
492
493 goto begin;
494 }
495
496 ctx->buffer_out[ ctx->buffer_used++ ] = (uint32_t) (uintptr_t) hash->value / 94 + 0x21;
497 ctx->buffer_out[ ctx->buffer_used++ ] = (uint32_t) (uintptr_t) hash->value % 94 + 0x21;
498
499 continue;
500
501 small_buffer:
502
503 ctx->state = state;
504 ctx->buffer_used -= size;
505
506 return LXB_STATUS_SMALL_BUFFER;
507
508 failed:
509
510 ctx->buffer_used -= size;
511 LXB_ENCODING_ENCODE_ERROR(ctx);
512 }
513
514 return LXB_STATUS_OK;
515 }
516
517 lxb_status_t
lxb_encoding_encode_iso_2022_jp_eof(lxb_encoding_encode_t * ctx)518 lxb_encoding_encode_iso_2022_jp_eof(lxb_encoding_encode_t *ctx)
519 {
520 if (ctx->state != LXB_ENCODING_ENCODE_2022_JP_ASCII) {
521 if ((ctx->buffer_used + 3) > ctx->buffer_length) {
522 return LXB_STATUS_SMALL_BUFFER;
523 }
524
525 memcpy(&ctx->buffer_out[ctx->buffer_used], "\x1B\x28\x42", 3);
526 ctx->buffer_used += 3;
527 }
528
529 return LXB_STATUS_OK;
530 }
531
532 lxb_status_t
lxb_encoding_encode_iso_8859_10(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)533 lxb_encoding_encode_iso_8859_10(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
534 const lxb_codepoint_t *end)
535 {
536 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_10,
537 LXB_ENCODING_SINGLE_HASH_ISO_8859_10_SIZE);
538 }
539
540 lxb_status_t
lxb_encoding_encode_iso_8859_13(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)541 lxb_encoding_encode_iso_8859_13(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
542 const lxb_codepoint_t *end)
543 {
544 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_13,
545 LXB_ENCODING_SINGLE_HASH_ISO_8859_13_SIZE);
546 }
547
548 lxb_status_t
lxb_encoding_encode_iso_8859_14(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)549 lxb_encoding_encode_iso_8859_14(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
550 const lxb_codepoint_t *end)
551 {
552 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_14,
553 LXB_ENCODING_SINGLE_HASH_ISO_8859_14_SIZE);
554 }
555
556 lxb_status_t
lxb_encoding_encode_iso_8859_15(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)557 lxb_encoding_encode_iso_8859_15(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
558 const lxb_codepoint_t *end)
559 {
560 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_15,
561 LXB_ENCODING_SINGLE_HASH_ISO_8859_15_SIZE);
562 }
563
564 lxb_status_t
lxb_encoding_encode_iso_8859_16(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)565 lxb_encoding_encode_iso_8859_16(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
566 const lxb_codepoint_t *end)
567 {
568 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_16,
569 LXB_ENCODING_SINGLE_HASH_ISO_8859_16_SIZE);
570 }
571
572 lxb_status_t
lxb_encoding_encode_iso_8859_2(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)573 lxb_encoding_encode_iso_8859_2(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
574 const lxb_codepoint_t *end)
575 {
576 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_2,
577 LXB_ENCODING_SINGLE_HASH_ISO_8859_2_SIZE);
578 }
579
580 lxb_status_t
lxb_encoding_encode_iso_8859_3(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)581 lxb_encoding_encode_iso_8859_3(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
582 const lxb_codepoint_t *end)
583 {
584 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_3,
585 LXB_ENCODING_SINGLE_HASH_ISO_8859_3_SIZE);
586 }
587
588 lxb_status_t
lxb_encoding_encode_iso_8859_4(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)589 lxb_encoding_encode_iso_8859_4(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
590 const lxb_codepoint_t *end)
591 {
592 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_4,
593 LXB_ENCODING_SINGLE_HASH_ISO_8859_4_SIZE);
594 }
595
596 lxb_status_t
lxb_encoding_encode_iso_8859_5(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)597 lxb_encoding_encode_iso_8859_5(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
598 const lxb_codepoint_t *end)
599 {
600 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_5,
601 LXB_ENCODING_SINGLE_HASH_ISO_8859_5_SIZE);
602 }
603
604 lxb_status_t
lxb_encoding_encode_iso_8859_6(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)605 lxb_encoding_encode_iso_8859_6(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
606 const lxb_codepoint_t *end)
607 {
608 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_6,
609 LXB_ENCODING_SINGLE_HASH_ISO_8859_6_SIZE);
610 }
611
612 lxb_status_t
lxb_encoding_encode_iso_8859_7(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)613 lxb_encoding_encode_iso_8859_7(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
614 const lxb_codepoint_t *end)
615 {
616 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_7,
617 LXB_ENCODING_SINGLE_HASH_ISO_8859_7_SIZE);
618 }
619
620 lxb_status_t
lxb_encoding_encode_iso_8859_8(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)621 lxb_encoding_encode_iso_8859_8(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
622 const lxb_codepoint_t *end)
623 {
624 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_8,
625 LXB_ENCODING_SINGLE_HASH_ISO_8859_8_SIZE);
626 }
627
628 lxb_status_t
lxb_encoding_encode_iso_8859_8_i(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)629 lxb_encoding_encode_iso_8859_8_i(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
630 const lxb_codepoint_t *end)
631 {
632 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_iso_8859_8,
633 LXB_ENCODING_SINGLE_HASH_ISO_8859_8_SIZE);
634 }
635
636 lxb_status_t
lxb_encoding_encode_koi8_r(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)637 lxb_encoding_encode_koi8_r(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
638 const lxb_codepoint_t *end)
639 {
640 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_koi8_r,
641 LXB_ENCODING_SINGLE_HASH_KOI8_R_SIZE);
642 }
643
644 lxb_status_t
lxb_encoding_encode_koi8_u(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)645 lxb_encoding_encode_koi8_u(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
646 const lxb_codepoint_t *end)
647 {
648 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_koi8_u,
649 LXB_ENCODING_SINGLE_HASH_KOI8_U_SIZE);
650 }
651
652 lxb_inline const lexbor_shs_hash_t *
lxb_encoding_encode_shift_jis_index(lxb_codepoint_t cp)653 lxb_encoding_encode_shift_jis_index(lxb_codepoint_t cp)
654 {
655 const lexbor_shs_hash_t *entry;
656
657 entry = &lxb_encoding_multi_hash_jis0208[ (cp % LXB_ENCODING_MULTI_HASH_JIS0208_SIZE) + 1 ];
658
659 do {
660 if (entry->key == cp) {
661 if ((unsigned) ((uint32_t) (uintptr_t) entry->value - 8272) > (8835 - 8272)) {
662 return entry;
663 }
664 }
665
666 entry = &lxb_encoding_multi_hash_jis0208[entry->next];
667 }
668 while (entry != lxb_encoding_multi_hash_jis0208);
669
670 return NULL;
671 }
672
673 lxb_status_t
lxb_encoding_encode_shift_jis(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)674 lxb_encoding_encode_shift_jis(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
675 const lxb_codepoint_t *end)
676 {
677 uint32_t lead, trail;
678 lxb_codepoint_t cp;
679 const lexbor_shs_hash_t *hash;
680
681 for (; *cps < end; (*cps)++) {
682 cp = **cps;
683
684 if (cp <= 0x80) {
685 LXB_ENCODING_ENCODE_APPEND(ctx, cp);
686 continue;
687 }
688
689 if ((unsigned) (cp - 0xFF61) <= (0xFF9F - 0xFF61)) {
690 LXB_ENCODING_ENCODE_APPEND(ctx, cp - 0xFF61 + 0xA1);
691 continue;
692 }
693
694 switch (cp) {
695 case 0x00A5:
696 LXB_ENCODING_ENCODE_APPEND(ctx, 0x5C);
697 continue;
698
699 case 0x203E:
700 LXB_ENCODING_ENCODE_APPEND(ctx, 0x7E);
701 continue;
702
703 case 0x2212:
704 cp = 0xFF0D;
705 break;
706 }
707
708 hash = lxb_encoding_encode_shift_jis_index(cp);
709 if (hash == NULL) {
710 LXB_ENCODING_ENCODE_ERROR(ctx);
711 continue;
712 }
713
714 if ((ctx->buffer_used + 2) > ctx->buffer_length) {
715 return LXB_STATUS_SMALL_BUFFER;
716 }
717
718 lead = (uint32_t) (uintptr_t) hash->value / 188;
719 trail = (uint32_t) (uintptr_t) hash->value % 188;
720
721 ctx->buffer_out[ctx->buffer_used++ ] = lead + ((lead < 0x1F) ? 0x81 : 0xC1);
722 ctx->buffer_out[ctx->buffer_used++ ] = trail + ((trail < 0x3F) ? 0x40 : 0x41);
723 }
724
725 return LXB_STATUS_OK;
726 }
727
728 lxb_inline void
lxb_encoding_encode_utf_16_write(lxb_encoding_encode_t * ctx,bool is_be,lxb_codepoint_t cp)729 lxb_encoding_encode_utf_16_write(lxb_encoding_encode_t *ctx, bool is_be,
730 lxb_codepoint_t cp)
731 {
732 if (is_be) {
733 ctx->buffer_out[ctx->buffer_used++] = cp >> 8;
734 ctx->buffer_out[ctx->buffer_used++] = cp & 0x00FF;
735
736 return;
737 }
738
739 ctx->buffer_out[ctx->buffer_used++] = cp & 0x00FF;
740 ctx->buffer_out[ctx->buffer_used++] = cp >> 8;
741 }
742
743 lxb_inline int8_t
lxb_encoding_encode_utf_16(lxb_encoding_encode_t * ctx,bool is_be,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)744 lxb_encoding_encode_utf_16(lxb_encoding_encode_t *ctx, bool is_be,
745 const lxb_codepoint_t **cps, const lxb_codepoint_t *end)
746 {
747 lxb_codepoint_t cp;
748
749 for (; *cps < end; (*cps)++) {
750 cp = **cps;
751
752 if (cp < 0x10000) {
753 if ((ctx->buffer_used + 2) > ctx->buffer_length) {
754 return LXB_STATUS_SMALL_BUFFER;
755 }
756
757 lxb_encoding_encode_utf_16_write(ctx, is_be, cp);
758
759 continue;
760 }
761
762 if ((ctx->buffer_used + 4) > ctx->buffer_length) {
763 return LXB_STATUS_SMALL_BUFFER;
764 }
765
766 cp -= 0x10000;
767
768 lxb_encoding_encode_utf_16_write(ctx, is_be, (0xD800 | (cp >> 0x0A)));
769 lxb_encoding_encode_utf_16_write(ctx, is_be, (0xDC00 | (cp & 0x03FF)));
770 }
771
772 return LXB_STATUS_OK;
773 }
774
775 lxb_status_t
lxb_encoding_encode_utf_16be(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)776 lxb_encoding_encode_utf_16be(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
777 const lxb_codepoint_t *end)
778 {
779 return lxb_encoding_encode_utf_16(ctx, true, cps, end);
780 }
781
782 lxb_status_t
lxb_encoding_encode_utf_16le(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)783 lxb_encoding_encode_utf_16le(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
784 const lxb_codepoint_t *end)
785 {
786 return lxb_encoding_encode_utf_16(ctx, false, cps, end);
787 }
788
789 lxb_status_t
lxb_encoding_encode_utf_8(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)790 lxb_encoding_encode_utf_8(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
791 const lxb_codepoint_t *end)
792 {
793 lxb_codepoint_t cp;
794 const lxb_codepoint_t *p = *cps;
795
796 for (; p < end; p++) {
797 cp = *p;
798
799 if (cp < 0x80) {
800 if ((ctx->buffer_used + 1) > ctx->buffer_length) {
801 *cps = p;
802
803 return LXB_STATUS_SMALL_BUFFER;
804 }
805
806 /* 0xxxxxxx */
807 ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) cp;
808 }
809 else if (cp < 0x800) {
810 if ((ctx->buffer_used + 2) > ctx->buffer_length) {
811 *cps = p;
812
813 return LXB_STATUS_SMALL_BUFFER;
814 }
815
816 /* 110xxxxx 10xxxxxx */
817 ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0xC0 | (cp >> 6 ));
818 ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0x80 | (cp & 0x3F));
819 }
820 else if (cp < 0x10000) {
821 if ((ctx->buffer_used + 3) > ctx->buffer_length) {
822 *cps = p;
823
824 return LXB_STATUS_SMALL_BUFFER;
825 }
826
827 /* 1110xxxx 10xxxxxx 10xxxxxx */
828 ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0xE0 | ((cp >> 12)));
829 ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0x80 | ((cp >> 6 ) & 0x3F));
830 ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0x80 | ( cp & 0x3F));
831 }
832 else if (cp < 0x110000) {
833 if ((ctx->buffer_used + 4) > ctx->buffer_length) {
834 *cps = p;
835
836 return LXB_STATUS_SMALL_BUFFER;
837 }
838
839 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
840 ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0xF0 | ( cp >> 18));
841 ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0x80 | ((cp >> 12) & 0x3F));
842 ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0x80 | ((cp >> 6 ) & 0x3F));
843 ctx->buffer_out[ ctx->buffer_used++ ] = (lxb_char_t) (0x80 | ( cp & 0x3F));
844 }
845 else {
846 *cps = p;
847 LXB_ENCODING_ENCODE_ERROR(ctx);
848 }
849 }
850
851 *cps = p;
852
853 return LXB_STATUS_OK;
854 }
855
856 lxb_inline uint32_t
lxb_encoding_encode_gb18030_range(lxb_codepoint_t cp)857 lxb_encoding_encode_gb18030_range(lxb_codepoint_t cp)
858 {
859 size_t mid, left, right;
860 const lxb_encoding_range_index_t *range;
861
862 if (cp == 0xE7C7) {
863 return 7457;
864 }
865
866 left = 0;
867 right = LXB_ENCODING_RANGE_INDEX_GB18030_SIZE;
868 range = lxb_encoding_range_index_gb18030;
869
870 /* Some compilers say about uninitialized mid */
871 mid = 0;
872
873 while (left < right) {
874 mid = left + (right - left) / 2;
875
876 if (range[mid].codepoint < cp) {
877 left = mid + 1;
878
879 if (left < right && range[left].codepoint > cp) {
880 break;
881 }
882 }
883 else if (range[mid].codepoint > cp) {
884 right = mid - 1;
885
886 if (right > 0 && range[right].codepoint <= cp) {
887 mid = right;
888 break;
889 }
890 }
891 else {
892 break;
893 }
894 }
895
896 return range[mid].index + cp - range[mid].codepoint;
897 }
898
899 lxb_status_t
lxb_encoding_encode_gb18030(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)900 lxb_encoding_encode_gb18030(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
901 const lxb_codepoint_t *end)
902 {
903 uint32_t index;
904 lxb_codepoint_t cp;
905 const lexbor_shs_hash_t *hash;
906
907 for (; *cps < end; (*cps)++) {
908 cp = **cps;
909
910 if (cp < 0x80) {
911 LXB_ENCODING_ENCODE_APPEND(ctx, cp);
912 continue;
913 }
914
915 if (cp == 0xE5E5) {
916 LXB_ENCODING_ENCODE_ERROR(ctx);
917 continue;
918 }
919
920 hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_gb18030,
921 LXB_ENCODING_MULTI_HASH_GB18030_SIZE, cp);
922 if (hash != NULL) {
923 if ((ctx->buffer_used + 2) > ctx->buffer_length) {
924 return LXB_STATUS_SMALL_BUFFER;
925 }
926
927 ctx->buffer_out[ ctx->buffer_used++ ] = (uint32_t) (uintptr_t) hash->value / 190 + 0x81;
928
929 if (((uint32_t) (uintptr_t) hash->value % 190) < 0x3F) {
930 ctx->buffer_out[ ctx->buffer_used++ ] = ((uint32_t) (uintptr_t) hash->value % 190) + 0x40;
931 }
932 else {
933 ctx->buffer_out[ ctx->buffer_used++ ] = ((uint32_t) (uintptr_t) hash->value % 190) + 0x41;
934 }
935
936 continue;
937 }
938
939 if ((ctx->buffer_used + 4) > ctx->buffer_length) {
940 return LXB_STATUS_SMALL_BUFFER;
941 }
942
943 index = lxb_encoding_encode_gb18030_range(cp);
944
945 ctx->buffer_out[ ctx->buffer_used++ ] = (index / (10 * 126 * 10)) + 0x81;
946 ctx->buffer_out[ ctx->buffer_used++ ] = ((index % (10 * 126 * 10)) / (10 * 126)) + 0x30;
947
948 index = (index % (10 * 126 * 10)) % (10 * 126);
949
950 ctx->buffer_out[ ctx->buffer_used++ ] = (index / 10) + 0x81;
951 ctx->buffer_out[ ctx->buffer_used++ ] = (index % 10) + 0x30;
952 }
953
954 return LXB_STATUS_OK;
955 }
956
957 lxb_status_t
lxb_encoding_encode_macintosh(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)958 lxb_encoding_encode_macintosh(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
959 const lxb_codepoint_t *end)
960 {
961 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_macintosh,
962 LXB_ENCODING_SINGLE_HASH_MACINTOSH_SIZE);
963 }
964
965 lxb_status_t
lxb_encoding_encode_replacement(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)966 lxb_encoding_encode_replacement(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
967 const lxb_codepoint_t *end)
968 {
969 *cps = end;
970 return LXB_STATUS_ERROR;
971 }
972
973 lxb_status_t
lxb_encoding_encode_windows_1250(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)974 lxb_encoding_encode_windows_1250(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
975 const lxb_codepoint_t *end)
976 {
977 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1250,
978 LXB_ENCODING_SINGLE_HASH_WINDOWS_1250_SIZE);
979 }
980
981 lxb_status_t
lxb_encoding_encode_windows_1251(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)982 lxb_encoding_encode_windows_1251(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
983 const lxb_codepoint_t *end)
984 {
985 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1251,
986 LXB_ENCODING_SINGLE_HASH_WINDOWS_1251_SIZE);
987 }
988
989 lxb_status_t
lxb_encoding_encode_windows_1252(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)990 lxb_encoding_encode_windows_1252(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
991 const lxb_codepoint_t *end)
992 {
993 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1252,
994 LXB_ENCODING_SINGLE_HASH_WINDOWS_1252_SIZE);
995 }
996
997 lxb_status_t
lxb_encoding_encode_windows_1253(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)998 lxb_encoding_encode_windows_1253(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
999 const lxb_codepoint_t *end)
1000 {
1001 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1253,
1002 LXB_ENCODING_SINGLE_HASH_WINDOWS_1253_SIZE);
1003 }
1004
1005 lxb_status_t
lxb_encoding_encode_windows_1254(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1006 lxb_encoding_encode_windows_1254(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1007 const lxb_codepoint_t *end)
1008 {
1009 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1254,
1010 LXB_ENCODING_SINGLE_HASH_WINDOWS_1254_SIZE);
1011 }
1012
1013 lxb_status_t
lxb_encoding_encode_windows_1255(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1014 lxb_encoding_encode_windows_1255(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1015 const lxb_codepoint_t *end)
1016 {
1017 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1255,
1018 LXB_ENCODING_SINGLE_HASH_WINDOWS_1255_SIZE);
1019 }
1020
1021 lxb_status_t
lxb_encoding_encode_windows_1256(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1022 lxb_encoding_encode_windows_1256(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1023 const lxb_codepoint_t *end)
1024 {
1025 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1256,
1026 LXB_ENCODING_SINGLE_HASH_WINDOWS_1256_SIZE);
1027 }
1028
1029 lxb_status_t
lxb_encoding_encode_windows_1257(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1030 lxb_encoding_encode_windows_1257(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1031 const lxb_codepoint_t *end)
1032 {
1033 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1257,
1034 LXB_ENCODING_SINGLE_HASH_WINDOWS_1257_SIZE);
1035 }
1036
1037 lxb_status_t
lxb_encoding_encode_windows_1258(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1038 lxb_encoding_encode_windows_1258(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1039 const lxb_codepoint_t *end)
1040 {
1041 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_1258,
1042 LXB_ENCODING_SINGLE_HASH_WINDOWS_1258_SIZE);
1043 }
1044
1045 lxb_status_t
lxb_encoding_encode_windows_874(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1046 lxb_encoding_encode_windows_874(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1047 const lxb_codepoint_t *end)
1048 {
1049 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_windows_874,
1050 LXB_ENCODING_SINGLE_HASH_WINDOWS_874_SIZE);
1051 }
1052
1053 lxb_status_t
lxb_encoding_encode_x_mac_cyrillic(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1054 lxb_encoding_encode_x_mac_cyrillic(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1055 const lxb_codepoint_t *end)
1056 {
1057 LXB_ENCODING_ENCODE_SINGLE_BYTE(lxb_encoding_single_hash_x_mac_cyrillic,
1058 LXB_ENCODING_SINGLE_HASH_X_MAC_CYRILLIC_SIZE);
1059 }
1060
1061 lxb_status_t
lxb_encoding_encode_x_user_defined(lxb_encoding_encode_t * ctx,const lxb_codepoint_t ** cps,const lxb_codepoint_t * end)1062 lxb_encoding_encode_x_user_defined(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cps,
1063 const lxb_codepoint_t *end)
1064 {
1065 lxb_codepoint_t cp;
1066
1067 for (; *cps < end; (*cps)++) {
1068 cp = **cps;
1069
1070 if (cp < 0x80) {
1071 LXB_ENCODING_ENCODE_APPEND(ctx, cp);
1072 }
1073 else if (cp >= 0xF780 && cp <= 0xF7FF) {
1074 LXB_ENCODING_ENCODE_APPEND(ctx, (cp - 0xF780 + 0x80));
1075 }
1076 else {
1077 LXB_ENCODING_ENCODE_ERROR(ctx);
1078 }
1079 }
1080
1081 return LXB_STATUS_OK;
1082 }
1083
/*
 * Single code point encoders.
 */
1087 int8_t
lxb_encoding_encode_default_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1088 lxb_encoding_encode_default_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1089 const lxb_char_t *end, lxb_codepoint_t cp)
1090 {
1091 return lxb_encoding_encode_utf_8_single(ctx, data, end, cp);
1092 }
1093
1094 int8_t
lxb_encoding_encode_auto_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1095 lxb_encoding_encode_auto_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1096 const lxb_char_t *end, lxb_codepoint_t cp)
1097 {
1098 return LXB_ENCODING_ENCODE_ERROR;
1099 }
1100
1101 int8_t
lxb_encoding_encode_undefined_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1102 lxb_encoding_encode_undefined_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1103 const lxb_char_t *end, lxb_codepoint_t cp)
1104 {
1105 return LXB_ENCODING_ENCODE_ERROR;
1106 }
1107
1108 int8_t
lxb_encoding_encode_big5_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1109 lxb_encoding_encode_big5_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1110 const lxb_char_t *end, lxb_codepoint_t cp)
1111 {
1112 const lexbor_shs_hash_t *hash;
1113
1114 if (cp < 0x80) {
1115 *(*data)++ = (lxb_char_t) cp;
1116
1117 return 1;
1118 }
1119
1120 hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_big5,
1121 LXB_ENCODING_MULTI_HASH_BIG5_SIZE, cp);
1122 if (hash == NULL) {
1123 return LXB_ENCODING_ENCODE_ERROR;
1124 }
1125
1126 if ((*data + 2) > end) {
1127 return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1128 }
1129
1130 *(*data)++ = ((uint32_t) (uintptr_t) hash->value) / 157 + 0x81;
1131
1132 if ((((uint32_t) (uintptr_t) hash->value) % 157) < 0x3F) {
1133 *(*data)++ = (((uint32_t) (uintptr_t) hash->value) % 157) + 0x40;
1134 }
1135 else {
1136 *(*data)++ = (((uint32_t) (uintptr_t) hash->value) % 157) + 0x62;
1137 }
1138
1139 return 2;
1140 }
1141
1142 int8_t
lxb_encoding_encode_euc_jp_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1143 lxb_encoding_encode_euc_jp_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1144 const lxb_char_t *end, lxb_codepoint_t cp)
1145 {
1146 const lexbor_shs_hash_t *hash;
1147
1148 if (cp < 0x80) {
1149 *(*data)++ = (lxb_char_t) cp;
1150
1151 return 1;
1152 }
1153
1154 if (cp == 0x00A5) {
1155 *(*data)++ = 0x5C;
1156
1157 return 1;
1158 }
1159
1160 if (cp == 0x203E) {
1161 *(*data)++ = 0x7E;
1162
1163 return 1;
1164 }
1165
1166 if ((*data + 2) > end) {
1167 return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1168 }
1169
1170 if ((unsigned) (cp - 0xFF61) <= (0xFF9F - 0xFF61)) {
1171 *(*data)++ = 0x8E;
1172 *(*data)++ = cp - 0xFF61 + 0xA1;
1173
1174 return 2;
1175 }
1176
1177 if (cp == 0x2212) {
1178 cp = 0xFF0D;
1179 }
1180
1181 hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_jis0208,
1182 LXB_ENCODING_MULTI_HASH_JIS0208_SIZE, cp);
1183 if (hash == NULL) {
1184 return LXB_ENCODING_ENCODE_ERROR;
1185 }
1186
1187 *(*data)++ = (uint32_t) (uintptr_t) hash->value / 94 + 0xA1;
1188 *(*data)++ = (uint32_t) (uintptr_t) hash->value % 94 + 0xA1;
1189
1190 return 2;
1191 }
1192
1193 int8_t
lxb_encoding_encode_euc_kr_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1194 lxb_encoding_encode_euc_kr_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1195 const lxb_char_t *end, lxb_codepoint_t cp)
1196 {
1197 const lexbor_shs_hash_t *hash;
1198
1199 if (cp < 0x80) {
1200 *(*data)++ = (lxb_char_t) cp;
1201
1202 return 1;
1203 }
1204
1205 if ((*data + 2) > end) {
1206 return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1207 }
1208
1209 hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_euc_kr,
1210 LXB_ENCODING_MULTI_HASH_EUC_KR_SIZE, cp);
1211 if (hash == NULL) {
1212 return LXB_ENCODING_ENCODE_ERROR;
1213 }
1214
1215 *(*data)++ = (uint32_t) (uintptr_t) hash->value / 190 + 0x81;
1216 *(*data)++ = (uint32_t) (uintptr_t) hash->value % 190 + 0x41;
1217
1218 return 2;
1219 }
1220
1221 int8_t
lxb_encoding_encode_gbk_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1222 lxb_encoding_encode_gbk_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1223 const lxb_char_t *end, lxb_codepoint_t cp)
1224 {
1225 const lexbor_shs_hash_t *hash;
1226
1227 if (cp < 0x80) {
1228 *(*data)++ = (lxb_char_t) cp;
1229
1230 return 1;
1231 }
1232
1233 if (cp == 0xE5E5) {
1234 return LXB_ENCODING_ENCODE_ERROR;
1235 }
1236
1237 if (cp == 0x20AC) {
1238 *(*data)++ = 0x80;
1239
1240 return 1;
1241 }
1242
1243 hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_gb18030,
1244 LXB_ENCODING_MULTI_HASH_GB18030_SIZE, cp);
1245 if (hash != NULL) {
1246 if ((*data + 2) > end) {
1247 return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1248 }
1249
1250 *(*data)++ = (lxb_char_t) (uintptr_t) hash->value / 190 + 0x81;
1251
1252 if (((lxb_char_t) (uintptr_t) hash->value % 190) < 0x3F) {
1253 *(*data)++ = ((lxb_char_t) (uintptr_t) hash->value % 190) + 0x40;
1254 }
1255 else {
1256 *(*data)++ = ((lxb_char_t) (uintptr_t) hash->value % 190) + 0x41;
1257 }
1258
1259 return 2;
1260 }
1261
1262 return LXB_ENCODING_ENCODE_ERROR;
1263 }
1264
1265 int8_t
lxb_encoding_encode_ibm866_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1266 lxb_encoding_encode_ibm866_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1267 const lxb_char_t *end, lxb_codepoint_t cp)
1268 {
1269 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_ibm866,
1270 LXB_ENCODING_SINGLE_HASH_IBM866_SIZE);
1271 }
1272
1273 int8_t
lxb_encoding_encode_iso_2022_jp_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1274 lxb_encoding_encode_iso_2022_jp_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1275 const lxb_char_t *end, lxb_codepoint_t cp)
1276 {
1277 int8_t size;
1278 unsigned state;
1279 const lexbor_shs_hash_t *hash;
1280
1281 size = 0;
1282 state = ctx->state;
1283
1284 begin:
1285
1286 switch (ctx->state) {
1287 case LXB_ENCODING_ENCODE_2022_JP_ASCII:
1288 if (cp == 0x000E || cp == 0x000F || cp == 0x001B) {
1289 goto failed;
1290 }
1291
1292 if (cp < 0x80) {
1293 *(*data)++ = (lxb_char_t) cp;
1294
1295 return size + 1;
1296 }
1297
1298 if (cp == 0x00A5 || cp == 0x203E) {
1299 /*
1300 * Do not switch to the ROMAN stage with prepend code point
1301 * to stream, add it immediately.
1302 */
1303 if ((*data + 4) > end) {
1304 goto small_buffer;
1305 }
1306
1307 ctx->state = LXB_ENCODING_ENCODE_2022_JP_ROMAN;
1308
1309 if (cp == 0x00A5) {
1310 memcpy(*data, "\x1B\x28\x4A\x5C", 4);
1311 *data = *data + 4;
1312
1313 return size + 4;
1314 }
1315
1316 memcpy(*data, "\x1B\x28\x4A\x7E", 4);
1317 *data = *data + 4;
1318
1319 return size + 4;
1320 }
1321
1322 break;
1323
1324 case LXB_ENCODING_ENCODE_2022_JP_ROMAN:
1325 if (cp == 0x000E || cp == 0x000F || cp == 0x001B) {
1326 goto failed;
1327 }
1328
1329 if (cp < 0x80) {
1330 switch (cp) {
1331 case 0x005C:
1332 case 0x007E:
1333 break;
1334
1335 default:
1336 *(*data)++ = (lxb_char_t) cp;
1337 return size + 1;
1338 }
1339
1340 /*
1341 * Do not switch to the ANSI stage with prepend code point
1342 * to stream, add it immediately.
1343 */
1344 if ((*data + 4) > end) {
1345 goto small_buffer;
1346 }
1347
1348 ctx->state = LXB_ENCODING_ENCODE_2022_JP_ASCII;
1349
1350 memcpy(*data, "\x1B\x28\x42", 3);
1351 *data = *data + 3;
1352
1353 *(*data)++ = (lxb_char_t) cp;
1354
1355 return size + 4;
1356 }
1357 else if (cp == 0x00A5) {
1358 *(*data)++ = 0x5C;
1359 return size + 1;
1360 }
1361 else if (cp == 0x203E) {
1362 *(*data)++ = 0x7E;
1363 return size + 1;
1364 }
1365
1366 break;
1367
1368 case LXB_ENCODING_ENCODE_2022_JP_JIS0208:
1369 if (cp < 0x80) {
1370 if ((*data + 4) > end) {
1371 goto small_buffer;
1372 }
1373
1374 ctx->state = LXB_ENCODING_ENCODE_2022_JP_ASCII;
1375
1376 memcpy(*data, "\x1B\x28\x42", 3);
1377 *data = *data + 3;
1378
1379 *(*data)++ = (lxb_char_t) cp;
1380
1381 return size + 4;
1382 }
1383
1384 if (cp == 0x00A5 || cp == 0x203E) {
1385 if ((*data + 4) > end) {
1386 goto small_buffer;
1387 }
1388
1389 ctx->state = LXB_ENCODING_ENCODE_2022_JP_ROMAN;
1390
1391 if (cp == 0x00A5) {
1392 memcpy(*data, "\x1B\x28\x4A\x5C", 4);
1393 *data = *data + 4;
1394
1395 return size + 4;
1396 }
1397
1398 memcpy(*data, "\x1B\x28\x4A\x7E", 4);
1399 *data = *data + 4;
1400
1401 return size + 4;
1402 }
1403
1404 break;
1405 }
1406
1407 if ((*data + 2) > end) {
1408 goto small_buffer;
1409 }
1410
1411 if (cp == 0x2212) {
1412 cp = 0xFF0D;
1413 }
1414
1415 if ((unsigned) (cp - 0xFF61) <= (0xFF9F - 0xFF61)) {
1416 cp = lxb_encoding_multi_index_iso_2022_jp_katakana[cp - 0xFF61].codepoint;
1417 }
1418
1419 hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_jis0208,
1420 LXB_ENCODING_MULTI_HASH_JIS0208_SIZE, cp);
1421 if (hash == NULL) {
1422 goto failed;
1423 }
1424
1425 if (ctx->state != LXB_ENCODING_ENCODE_2022_JP_JIS0208) {
1426 if ((*data + 3) > end) {
1427 goto small_buffer;
1428 }
1429
1430 memcpy(*data, "\x1B\x24\x42", 3);
1431 *data = *data + 3;
1432
1433 ctx->state = LXB_ENCODING_ENCODE_2022_JP_JIS0208;
1434 size += 3;
1435
1436 goto begin;
1437 }
1438
1439 *(*data)++ = (uint32_t) (uintptr_t) hash->value / 94 + 0x21;
1440 *(*data)++ = (uint32_t) (uintptr_t) hash->value % 94 + 0x21;
1441
1442 return size + 2;
1443
1444 small_buffer:
1445
1446 ctx->state = state;
1447 *data = *data - size;
1448
1449 return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1450
1451 failed:
1452
1453 *data = *data - size;
1454
1455 return LXB_ENCODING_ENCODE_ERROR;
1456 }
1457
1458 int8_t
lxb_encoding_encode_iso_2022_jp_eof_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end)1459 lxb_encoding_encode_iso_2022_jp_eof_single(lxb_encoding_encode_t *ctx,
1460 lxb_char_t **data, const lxb_char_t *end)
1461 {
1462 if (ctx->state != LXB_ENCODING_ENCODE_2022_JP_ASCII) {
1463 if ((*data + 3) > end) {
1464 return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1465 }
1466
1467 memcpy(*data, "\x1B\x28\x42", 3);
1468 *data = *data + 3;
1469
1470 ctx->state = LXB_ENCODING_ENCODE_2022_JP_ASCII;
1471
1472 return 3;
1473 }
1474
1475 return 0;
1476 }
1477
1478 int8_t
lxb_encoding_encode_iso_8859_10_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1479 lxb_encoding_encode_iso_8859_10_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1480 const lxb_char_t *end, lxb_codepoint_t cp)
1481 {
1482 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_10,
1483 LXB_ENCODING_SINGLE_HASH_ISO_8859_10_SIZE);
1484 }
1485
1486 int8_t
lxb_encoding_encode_iso_8859_13_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1487 lxb_encoding_encode_iso_8859_13_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1488 const lxb_char_t *end, lxb_codepoint_t cp)
1489 {
1490 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_13,
1491 LXB_ENCODING_SINGLE_HASH_ISO_8859_13_SIZE);
1492 }
1493
1494 int8_t
lxb_encoding_encode_iso_8859_14_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1495 lxb_encoding_encode_iso_8859_14_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1496 const lxb_char_t *end, lxb_codepoint_t cp)
1497 {
1498 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_14,
1499 LXB_ENCODING_SINGLE_HASH_ISO_8859_14_SIZE);
1500 }
1501
1502 int8_t
lxb_encoding_encode_iso_8859_15_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1503 lxb_encoding_encode_iso_8859_15_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1504 const lxb_char_t *end, lxb_codepoint_t cp)
1505 {
1506 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_15,
1507 LXB_ENCODING_SINGLE_HASH_ISO_8859_15_SIZE);
1508 }
1509
1510 int8_t
lxb_encoding_encode_iso_8859_16_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1511 lxb_encoding_encode_iso_8859_16_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1512 const lxb_char_t *end, lxb_codepoint_t cp)
1513 {
1514 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_16,
1515 LXB_ENCODING_SINGLE_HASH_ISO_8859_16_SIZE);
1516 }
1517
1518 int8_t
lxb_encoding_encode_iso_8859_2_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1519 lxb_encoding_encode_iso_8859_2_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1520 const lxb_char_t *end, lxb_codepoint_t cp)
1521 {
1522 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_2,
1523 LXB_ENCODING_SINGLE_HASH_ISO_8859_2_SIZE);
1524 }
1525
1526 int8_t
lxb_encoding_encode_iso_8859_3_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1527 lxb_encoding_encode_iso_8859_3_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1528 const lxb_char_t *end, lxb_codepoint_t cp)
1529 {
1530 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_3,
1531 LXB_ENCODING_SINGLE_HASH_ISO_8859_3_SIZE);
1532 }
1533
1534 int8_t
lxb_encoding_encode_iso_8859_4_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1535 lxb_encoding_encode_iso_8859_4_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1536 const lxb_char_t *end, lxb_codepoint_t cp)
1537 {
1538 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_4,
1539 LXB_ENCODING_SINGLE_HASH_ISO_8859_4_SIZE);
1540 }
1541
1542 int8_t
lxb_encoding_encode_iso_8859_5_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1543 lxb_encoding_encode_iso_8859_5_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1544 const lxb_char_t *end, lxb_codepoint_t cp)
1545 {
1546 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_5,
1547 LXB_ENCODING_SINGLE_HASH_ISO_8859_5_SIZE);
1548 }
1549
1550 int8_t
lxb_encoding_encode_iso_8859_6_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1551 lxb_encoding_encode_iso_8859_6_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1552 const lxb_char_t *end, lxb_codepoint_t cp)
1553 {
1554 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_6,
1555 LXB_ENCODING_SINGLE_HASH_ISO_8859_6_SIZE);
1556 }
1557
1558 int8_t
lxb_encoding_encode_iso_8859_7_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1559 lxb_encoding_encode_iso_8859_7_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1560 const lxb_char_t *end, lxb_codepoint_t cp)
1561 {
1562 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_7,
1563 LXB_ENCODING_SINGLE_HASH_ISO_8859_7_SIZE);
1564 }
1565
1566 int8_t
lxb_encoding_encode_iso_8859_8_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1567 lxb_encoding_encode_iso_8859_8_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1568 const lxb_char_t *end, lxb_codepoint_t cp)
1569 {
1570 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_8,
1571 LXB_ENCODING_SINGLE_HASH_ISO_8859_8_SIZE);
1572 }
1573
1574 int8_t
lxb_encoding_encode_iso_8859_8_i_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1575 lxb_encoding_encode_iso_8859_8_i_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1576 const lxb_char_t *end, lxb_codepoint_t cp)
1577 {
1578 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_iso_8859_8,
1579 LXB_ENCODING_SINGLE_HASH_ISO_8859_8_SIZE);
1580 }
1581
1582 int8_t
lxb_encoding_encode_koi8_r_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1583 lxb_encoding_encode_koi8_r_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1584 const lxb_char_t *end, lxb_codepoint_t cp)
1585 {
1586 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_koi8_r,
1587 LXB_ENCODING_SINGLE_HASH_KOI8_R_SIZE);
1588 }
1589
1590 int8_t
lxb_encoding_encode_koi8_u_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1591 lxb_encoding_encode_koi8_u_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1592 const lxb_char_t *end, lxb_codepoint_t cp)
1593 {
1594 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_koi8_u,
1595 LXB_ENCODING_SINGLE_HASH_KOI8_U_SIZE);
1596 }
1597
1598 int8_t
lxb_encoding_encode_shift_jis_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1599 lxb_encoding_encode_shift_jis_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1600 const lxb_char_t *end, lxb_codepoint_t cp)
1601 {
1602 uint32_t lead, trail;
1603 const lexbor_shs_hash_t *hash;
1604
1605 if (cp <= 0x80) {
1606 *(*data)++ = (lxb_char_t) cp;
1607
1608 return 1;
1609 }
1610
1611 if ((unsigned) (cp - 0xFF61) <= (0xFF9F - 0xFF61)) {
1612 *(*data)++ = cp - 0xFF61 + 0xA1;
1613
1614 return 1;
1615 }
1616
1617 switch (cp) {
1618 case 0x00A5:
1619 *(*data)++ = 0x5C;
1620 return 1;
1621
1622 case 0x203E:
1623 *(*data)++ = 0x7E;
1624 return 1;
1625
1626 case 0x2212:
1627 cp = 0xFF0D;
1628 break;
1629 }
1630
1631 hash = lxb_encoding_encode_shift_jis_index(cp);
1632 if (hash == NULL) {
1633 return LXB_ENCODING_ENCODE_ERROR;
1634 }
1635
1636 if ((*data + 2) > end) {
1637 return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1638 }
1639
1640 lead = (uint32_t) (uintptr_t) hash->value / 188;
1641 trail = (uint32_t) (uintptr_t) hash->value % 188;
1642
1643 *(*data)++ = lead + ((lead < 0x1F) ? 0x81 : 0xC1);
1644 *(*data)++ = trail + ((trail < 0x3F) ? 0x40 : 0x41);
1645
1646 return 2;
1647 }
1648
1649 lxb_inline void
lxb_encoding_encode_utf_16_write_single(bool is_be,lxb_char_t ** data,lxb_codepoint_t cp)1650 lxb_encoding_encode_utf_16_write_single(bool is_be, lxb_char_t **data,
1651 lxb_codepoint_t cp)
1652 {
1653 if (is_be) {
1654 *(*data)++ = cp >> 8;
1655 *(*data)++ = cp & 0x00FF;
1656
1657 return;
1658 }
1659
1660 *(*data)++ = cp & 0x00FF;
1661 *(*data)++ = cp >> 8;
1662 }
1663
1664 lxb_inline int8_t
lxb_encoding_encode_utf_16_single(lxb_encoding_encode_t * ctx,bool is_be,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1665 lxb_encoding_encode_utf_16_single(lxb_encoding_encode_t *ctx, bool is_be,
1666 lxb_char_t **data, const lxb_char_t *end, lxb_codepoint_t cp)
1667 {
1668 if ((*data + 2) > end) {
1669 return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1670 }
1671
1672 if (cp < 0x10000) {
1673 lxb_encoding_encode_utf_16_write_single(is_be, data, cp);
1674
1675 return 2;
1676 }
1677
1678 if ((*data + 4) > end) {
1679 return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1680 }
1681
1682 cp -= 0x10000;
1683
1684 lxb_encoding_encode_utf_16_write_single(is_be, data, (0xD800 | (cp >> 0x0A)));
1685 lxb_encoding_encode_utf_16_write_single(is_be, data, (0xDC00 | (cp & 0x03FF)));
1686
1687 return 4;
1688 }
1689
1690 int8_t
lxb_encoding_encode_utf_16be_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1691 lxb_encoding_encode_utf_16be_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1692 const lxb_char_t *end, lxb_codepoint_t cp)
1693 {
1694 return lxb_encoding_encode_utf_16_single(ctx, true, data, end, cp);
1695 }
1696
1697 int8_t
lxb_encoding_encode_utf_16le_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1698 lxb_encoding_encode_utf_16le_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1699 const lxb_char_t *end, lxb_codepoint_t cp)
1700 {
1701 return lxb_encoding_encode_utf_16_single(ctx, false, data, end, cp);
1702 }
1703
1704 int8_t
lxb_encoding_encode_utf_8_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1705 lxb_encoding_encode_utf_8_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1706 const lxb_char_t *end, lxb_codepoint_t cp)
1707 {
1708 if (cp < 0x80) {
1709 /* 0xxxxxxx */
1710 *(*data)++ = (lxb_char_t) cp;
1711
1712 return 1;
1713 }
1714
1715 if (cp < 0x800) {
1716 if ((*data + 2) > end) {
1717 return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1718 }
1719
1720 /* 110xxxxx 10xxxxxx */
1721 *(*data)++ = (lxb_char_t) (0xC0 | (cp >> 6 ));
1722 *(*data)++ = (lxb_char_t) (0x80 | (cp & 0x3F));
1723
1724 return 2;
1725 }
1726
1727 if (cp < 0x10000) {
1728 if ((*data + 3) > end) {
1729 return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1730 }
1731
1732 /* 1110xxxx 10xxxxxx 10xxxxxx */
1733 *(*data)++ = (lxb_char_t) (0xE0 | ((cp >> 12)));
1734 *(*data)++ = (lxb_char_t) (0x80 | ((cp >> 6 ) & 0x3F));
1735 *(*data)++ = (lxb_char_t) (0x80 | ( cp & 0x3F));
1736
1737 return 3;
1738 }
1739
1740 if (cp < 0x110000) {
1741 if ((*data + 4) > end) {
1742 return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1743 }
1744
1745 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
1746 *(*data)++ = (lxb_char_t) (0xF0 | ( cp >> 18));
1747 *(*data)++ = (lxb_char_t) (0x80 | ((cp >> 12) & 0x3F));
1748 *(*data)++ = (lxb_char_t) (0x80 | ((cp >> 6 ) & 0x3F));
1749 *(*data)++ = (lxb_char_t) (0x80 | ( cp & 0x3F));
1750
1751 return 4;
1752 }
1753
1754 return LXB_ENCODING_ENCODE_ERROR;
1755 }
1756
1757 int8_t
lxb_encoding_encode_utf_8_length(lxb_codepoint_t cp)1758 lxb_encoding_encode_utf_8_length(lxb_codepoint_t cp)
1759 {
1760 if (cp < 0x80) {
1761 return 1;
1762 }
1763 else if (cp < 0x800) {
1764 return 2;
1765 }
1766 else if (cp < 0x10000) {
1767 return 3;
1768 }
1769 else if (cp < 0x110000) {
1770 return 4;
1771 }
1772
1773 return 0;
1774 }
1775
1776 int8_t
lxb_encoding_encode_gb18030_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1777 lxb_encoding_encode_gb18030_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1778 const lxb_char_t *end, lxb_codepoint_t cp)
1779 {
1780 uint32_t index;
1781 const lexbor_shs_hash_t *hash;
1782
1783 if (cp < 0x80) {
1784 *(*data)++ = (lxb_char_t) cp;
1785
1786 return 1;
1787 }
1788
1789 if (cp == 0xE5E5) {
1790 return LXB_ENCODING_ENCODE_ERROR;
1791 }
1792
1793 hash = lexbor_shs_hash_get_static(lxb_encoding_multi_hash_gb18030,
1794 LXB_ENCODING_MULTI_HASH_GB18030_SIZE, cp);
1795 if (hash != NULL) {
1796 if ((*data + 2) > end) {
1797 return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1798 }
1799
1800 *(*data)++ = (uint32_t) (uintptr_t) hash->value / 190 + 0x81;
1801
1802 if (((uint32_t) (uintptr_t) hash->value % 190) < 0x3F) {
1803 *(*data)++ = ((uint32_t) (uintptr_t) hash->value % 190) + 0x40;
1804 }
1805 else {
1806 *(*data)++ = ((uint32_t) (uintptr_t) hash->value % 190) + 0x41;
1807 }
1808
1809 return 2;
1810 }
1811
1812 if ((*data + 4) > end) {
1813 return LXB_ENCODING_ENCODE_SMALL_BUFFER;
1814 }
1815
1816 index = lxb_encoding_encode_gb18030_range(cp);
1817
1818 *(*data)++ = (index / (10 * 126 * 10)) + 0x81;
1819 *(*data)++ = ((index % (10 * 126 * 10)) / (10 * 126)) + 0x30;
1820
1821 index = (index % (10 * 126 * 10)) % (10 * 126);
1822
1823 *(*data)++ = (index / 10) + 0x81;
1824 *(*data)++ = (index % 10) + 0x30;
1825
1826 return 4;
1827 }
1828
1829 int8_t
lxb_encoding_encode_macintosh_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1830 lxb_encoding_encode_macintosh_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1831 const lxb_char_t *end, lxb_codepoint_t cp)
1832 {
1833 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_macintosh,
1834 LXB_ENCODING_SINGLE_HASH_MACINTOSH_SIZE);
1835 }
1836
1837 int8_t
lxb_encoding_encode_replacement_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1838 lxb_encoding_encode_replacement_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1839 const lxb_char_t *end, lxb_codepoint_t cp)
1840 {
1841 (*data)++;
1842 return LXB_ENCODING_ENCODE_ERROR;
1843 }
1844
1845 int8_t
lxb_encoding_encode_windows_1250_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1846 lxb_encoding_encode_windows_1250_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1847 const lxb_char_t *end, lxb_codepoint_t cp)
1848 {
1849 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1250,
1850 LXB_ENCODING_SINGLE_HASH_WINDOWS_1250_SIZE);
1851 }
1852
1853 int8_t
lxb_encoding_encode_windows_1251_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1854 lxb_encoding_encode_windows_1251_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1855 const lxb_char_t *end, lxb_codepoint_t cp)
1856 {
1857 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1251,
1858 LXB_ENCODING_SINGLE_HASH_WINDOWS_1251_SIZE);
1859 }
1860
1861 int8_t
lxb_encoding_encode_windows_1252_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1862 lxb_encoding_encode_windows_1252_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1863 const lxb_char_t *end, lxb_codepoint_t cp)
1864 {
1865 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1252,
1866 LXB_ENCODING_SINGLE_HASH_WINDOWS_1252_SIZE);
1867 }
1868
1869 int8_t
lxb_encoding_encode_windows_1253_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1870 lxb_encoding_encode_windows_1253_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1871 const lxb_char_t *end, lxb_codepoint_t cp)
1872 {
1873 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1253,
1874 LXB_ENCODING_SINGLE_HASH_WINDOWS_1253_SIZE);
1875 }
1876
1877 int8_t
lxb_encoding_encode_windows_1254_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1878 lxb_encoding_encode_windows_1254_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1879 const lxb_char_t *end, lxb_codepoint_t cp)
1880 {
1881 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1254,
1882 LXB_ENCODING_SINGLE_HASH_WINDOWS_1254_SIZE);
1883 }
1884
1885 int8_t
lxb_encoding_encode_windows_1255_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1886 lxb_encoding_encode_windows_1255_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1887 const lxb_char_t *end, lxb_codepoint_t cp)
1888 {
1889 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1255,
1890 LXB_ENCODING_SINGLE_HASH_WINDOWS_1255_SIZE);
1891 }
1892
1893 int8_t
lxb_encoding_encode_windows_1256_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1894 lxb_encoding_encode_windows_1256_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1895 const lxb_char_t *end, lxb_codepoint_t cp)
1896 {
1897 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1256,
1898 LXB_ENCODING_SINGLE_HASH_WINDOWS_1256_SIZE);
1899 }
1900
1901 int8_t
lxb_encoding_encode_windows_1257_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1902 lxb_encoding_encode_windows_1257_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1903 const lxb_char_t *end, lxb_codepoint_t cp)
1904 {
1905 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1257,
1906 LXB_ENCODING_SINGLE_HASH_WINDOWS_1257_SIZE);
1907 }
1908
1909 int8_t
lxb_encoding_encode_windows_1258_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1910 lxb_encoding_encode_windows_1258_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1911 const lxb_char_t *end, lxb_codepoint_t cp)
1912 {
1913 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_1258,
1914 LXB_ENCODING_SINGLE_HASH_WINDOWS_1258_SIZE);
1915 }
1916
1917 int8_t
lxb_encoding_encode_windows_874_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1918 lxb_encoding_encode_windows_874_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1919 const lxb_char_t *end, lxb_codepoint_t cp)
1920 {
1921 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_windows_874,
1922 LXB_ENCODING_SINGLE_HASH_WINDOWS_874_SIZE);
1923 }
1924
1925 int8_t
lxb_encoding_encode_x_mac_cyrillic_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1926 lxb_encoding_encode_x_mac_cyrillic_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1927 const lxb_char_t *end, lxb_codepoint_t cp)
1928 {
1929 LXB_ENCODING_ENCODE_BYTE_SINGLE(lxb_encoding_single_hash_x_mac_cyrillic,
1930 LXB_ENCODING_SINGLE_HASH_X_MAC_CYRILLIC_SIZE);
1931 }
1932
1933 int8_t
lxb_encoding_encode_x_user_defined_single(lxb_encoding_encode_t * ctx,lxb_char_t ** data,const lxb_char_t * end,lxb_codepoint_t cp)1934 lxb_encoding_encode_x_user_defined_single(lxb_encoding_encode_t *ctx, lxb_char_t **data,
1935 const lxb_char_t *end, lxb_codepoint_t cp)
1936 {
1937 if (cp < 0x80) {
1938 *(*data)++ = (lxb_char_t) cp;
1939
1940 return 1;
1941 }
1942
1943 if (cp >= 0xF780 && cp <= 0xF7FF) {
1944 *(*data)++ = (lxb_char_t) (cp - 0xF780 + 0x80);
1945
1946 return 1;
1947 }
1948
1949 return LXB_ENCODING_ENCODE_ERROR;
1950 }
1951