1 #include "mbfilter_cjk.h"
2
3 #include "unicode_table_jis.h"
4 #include "unicode_table_jis2004.h"
5 #include "unicode_table_big5.h"
6 #include "unicode_table_cns11643.h"
7 #include "unicode_table_cp932_ext.h"
8 #include "unicode_table_cp936.h"
9 #include "unicode_table_gb18030.h"
10 #include "unicode_table_gb2312.h"
11 #include "unicode_table_uhc.h"
12 #include "cp932_table.h"
13 #include "sjis_mac2uni.h"
14 #include "translit_kana_jisx0201_jisx0208.h"
15 #include "emoji2uni.h"
16
/* A flag emoji is written as two Regional Indicator Symbols, one for each
 * letter of the country's two-letter code. The Regional Indicator codepoints
 * occupy U+1F1E6 (for 'A') through U+1F1FF (for 'Z').
 *
 * NFLAGS turns an uppercase ASCII letter into its Regional Indicator codepoint */
#define NFLAGS(c) (0x1F1E6 + ((unsigned int)(c) - 0x41 /* 'A' */))
23
/* Two-letter country codes, in alphabetical order, for the ten flags that have
 * an entry in the code tables below. Each entry is exactly two chars wide, so
 * the entries are NOT NUL-terminated. */
static const char nflags_s[10][2] = {
	{'C', 'N'}, {'D', 'E'}, {'E', 'S'}, {'F', 'R'}, {'G', 'B'},
	{'I', 'T'}, {'J', 'P'}, {'K', 'R'}, {'R', 'U'}, {'U', 'S'}
};

/* KDDI code of each flag, in the same order as nflags_s */
static const int nflags_code_kddi[10] = {
	0x2549, /* CN */
	0x2546, /* DE */
	0x24C0, /* ES */
	0x2545, /* FR */
	0x2548, /* GB */
	0x2547, /* IT */
	0x2750, /* JP */
	0x254A, /* KR */
	0x24C1, /* RU */
	0x27F7  /* US */
};

/* Code of each flag in the `sb` (presumably SoftBank) emoji set, in the same
 * order as nflags_s */
static const int nflags_code_sb[10] = {
	0x2B0A, /* CN */
	0x2B05, /* DE */
	0x2B08, /* ES */
	0x2B04, /* FR */
	0x2B07, /* GB */
	0x2B06, /* IT */
	0x2B02, /* JP */
	0x2B0B, /* KR */
	0x2B09, /* RU */
	0x2B03  /* US */
};
27
/* Both macros below must be used inside a function which has an `int *snd`
 * in scope and returns an int. They store the first codepoint of a
 * two-codepoint emoji in `*snd` and return the second codepoint from the
 * enclosing function. */

/* Keycap emoji: the key character `c` followed by U+20E3 */
#define EMIT_KEYPAD_EMOJI(c) \
	do { \
		*snd = (c); \
		return 0x20E3; \
	} while (0)

/* Flag emoji: the Regional Indicators for the two letters of `country` */
#define EMIT_FLAG_EMOJI(country) \
	do { \
		*snd = NFLAGS((country)[0]); \
		return NFLAGS((country)[1]); \
	} while (0)
30
/* Country codes for the KDDI flags with codes 0x2545..0x254A; indexed by
 * (code - 0x2545). Entries are NOT NUL-terminated. */
static const char nflags_kddi[6][2] = {
	{'F', 'R'}, /* 0x2545 */
	{'D', 'E'}, /* 0x2546 */
	{'I', 'T'}, /* 0x2547 */
	{'G', 'B'}, /* 0x2548 */
	{'C', 'N'}, /* 0x2549 */
	{'K', 'R'}  /* 0x254A */
};

/* Country codes for the `sb` flags; the order matches ascending codes
 * 0x2B02..0x2B0B in nflags_code_sb. Entries are NOT NUL-terminated. */
static const char nflags_sb[10][2] = {
	{'J', 'P'}, {'U', 'S'}, {'F', 'R'}, {'D', 'E'}, {'I', 'T'},
	{'G', 'B'}, {'E', 'S'}, {'R', 'U'}, {'C', 'N'}, {'K', 'R'}
};
33
/* Telephone keypad characters, expressed as (ku*94)+ten values.
 * Digits 1-9 sit consecutively at 0x2966..0x296E, with 0 after them at 0x296F;
 * the '#' key comes just before, at 0x2964. */
#define DOCOMO_KEYPAD(n) ((n) != 0 ? (0x2965 + (n)) : 0x296F)
#define DOCOMO_KEYPAD_HASH 0x2964
37
/* Binary search over a table of inclusive ranges.
 *
 * `tbl` holds `n` ranges, each stored as two consecutive unsigned shorts
 * (low bound, then high bound); the ranges must be sorted in ascending order.
 * Returns the index of the range which contains `w`, or -1 if there is none. */
static int mbfl_bisec_srch(int w, const unsigned short *tbl, int n)
{
	int first = 0;
	int last = n - 1;

	while (first <= last) {
		const int mid = (first + last) >> 1;
		const unsigned short *range = tbl + (2 * mid);
		const unsigned short low = range[0];
		const unsigned short high = range[1];

		if (w < low) {
			last = mid - 1;
			continue;
		}
		if (w > high) {
			first = mid + 1;
			continue;
		}
		return mid;
	}

	return -1;
}
55
/* Binary search over a table of single values (not ranges).
 *
 * `tbl` holds `n` unsigned shorts sorted in ascending order.
 * Returns the index at which `w` was found, or -1 if it is not in the table. */
int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n)
{
	int first = 0;
	int last = n - 1;

	while (first <= last) {
		const int mid = (first + last) >> 1;
		const unsigned short entry = tbl[mid];

		if (w < entry) {
			last = mid - 1;
			continue;
		}
		if (w > entry) {
			first = mid + 1;
			continue;
		}
		return mid;
	}

	return -1;
}
73
/* Convert a two-byte JIS code (c1 = first byte, c2 = second byte) into the
 * corresponding pair of Shift-JIS bytes, stored in the lvalues s1 and s2.
 *
 * Two rows of JIS are packed into each Shift-JIS lead byte: an odd c1 selects
 * the lower half of the trail byte range (which skips 0x7F), an even c1 the
 * upper half.
 *
 * Every argument is parenthesized so that expressions such as `x << 1` or
 * `a | b` can be passed safely. Arguments are still evaluated more than once,
 * so they must not have side effects. */
#define SJIS_ENCODE(c1,c2,s1,s2) \
	do { \
		(s1) = (((c1) - 1) >> 1) + ((c1) < 0x5F ? 0x71 : 0xB1); \
		(s2) = (c2); \
		if ((c1) & 1) { \
			if ((c2) < 0x60) { \
				(s2)--; \
			} \
			(s2) += 0x20; \
		} else { \
			(s2) += 0x7e; \
		} \
	} while (0)
87
/* Inverse of SJIS_ENCODE: convert a pair of Shift-JIS bytes (c1 = lead byte,
 * c2 = trail byte) into a two-byte JIS code, stored in the lvalues s1 and s2.
 *
 * Lead bytes below 0xA0 and lead bytes from 0xC1 upwards use different
 * offsets. Trail bytes from 0x9F upwards select the second (even) row of the
 * pair packed into the lead byte.
 *
 * Every argument is parenthesized so that expressions such as `a | b` can be
 * passed safely. Arguments are still evaluated more than once, so they must
 * not have side effects. */
#define SJIS_DECODE(c1,c2,s1,s2) \
	do { \
		if ((c1) < 0xa0) { \
			(s1) = (((c1) - 0x81) << 1) + 0x21; \
		} else { \
			(s1) = (((c1) - 0xc1) << 1) + 0x21; \
		} \
		(s2) = (c2); \
		if ((c2) < 0x9f) { \
			if ((c2) < 0x7f) { \
				(s2)++; \
			} \
			(s2) -= 0x20; \
		} else { \
			(s1)++; \
			(s2) -= 0x7e; \
		} \
	} while (0)
106
/* Evaluate `statement` once; if its value is negative, return -1 from the
 * enclosing function. Used to propagate failure of an output call. */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
108
/*
 * ISO-2022 variants
 */

/* Character-set selectors stored in `filter->status` by the
 * ISO-2022-JP-MOBILE decoder. The low four bits of `status` are left free for
 * tracking progress through an escape sequence or double-byte character. */
#define ASCII          0
#define JISX0201_KANA  0x20
#define JISX0208_KANJI 0x80
116
/* Byte-at-a-time decoder: JIS -> Unicode codepoints.
 *
 * `filter->status` holds two fields:
 *   - bits 4-7 select the active character set:
 *       0x00 ASCII, 0x10 JIS X 0201 Latin, 0x20 JIS X 0201 Kana,
 *       0x80 JIS X 0208, 0x90 JIS X 0212
 *   - bits 0-3 track progress through a multi-byte unit:
 *       0 idle, 1 second byte of a double-byte character expected,
 *       2 after ESC, 3 after ESC $, 4 after ESC $ (, 5 after ESC (
 *
 * Each decoded codepoint (or MBFL_BAD_INPUT for invalid input) is handed to
 * `filter->output_function`. Returns 0, or -1 as soon as the output function
 * returns a negative value. */
static int mbfl_filt_conv_jis_wchar(int c, mbfl_convert_filter *filter)
{
	/* A `continue` inside the switch replays `c` from the idle state; this is
	 * done after an escape sequence turns out to be invalid */
	for (;;) {
		switch (filter->status & 0xf) {
		case 0: {
			const int charset = filter->status; /* low nibble is zero here */
			int wc;

			if (c == 0x1b) { /* ESC: start of an escape sequence */
				filter->status += 2;
				break;
			}
			if (c == 0x0e) { /* "kana in" */
				filter->status = 0x20;
				break;
			}
			if (c == 0x0f) { /* "kana out" */
				filter->status = 0;
				break;
			}
			if ((charset == 0x80 || charset == 0x90) && c > 0x20 && c < 0x7f) {
				/* First byte of a double-byte character; wait for the second */
				filter->cache = c;
				filter->status += 1;
				break;
			}

			if (charset == 0x10 && c == 0x5c) {
				wc = 0xa5; /* YEN SIGN */
			} else if (charset == 0x10 && c == 0x7e) {
				wc = 0x203e; /* OVER LINE */
			} else if (charset == 0x20 && c > 0x20 && c < 0x60) {
				wc = 0xff40 + c; /* 7-bit kana */
			} else if (c >= 0 && c < 0x80) {
				wc = c; /* latin, CTLs */
			} else if (c > 0xa0 && c < 0xe0) {
				wc = 0xfec0 + c; /* GR kana */
			} else {
				wc = MBFL_BAD_INPUT;
			}
			CK((*filter->output_function)(wc, filter->data));
			break;
		}

		/* Second byte of a JIS X 0208 (0x81) or JIS X 0212 (0x91) character */
		case 1: {
			int w = 0;

			filter->status &= ~0xf;
			if (c > 0x20 && c < 0x7f) {
				const int s = (filter->cache - 0x21)*94 + c - 0x21;

				if (filter->status == 0x80) {
					if (s >= 0 && s < jisx0208_ucs_table_size) {
						w = jisx0208_ucs_table[s];
					}
				} else {
					if (s >= 0 && s < jisx0212_ucs_table_size) {
						w = jisx0212_ucs_table[s];
					}
				}
			}
			/* Invalid second byte, out-of-table index and unmapped table
			 * entry are all reported the same way */
			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}
			CK((*filter->output_function)(w, filter->data));
			break;
		}

		/* ESC */
		case 2:
			if (c == 0x24) { /* '$' */
				filter->status++;
				break;
			}
			if (c == 0x28) { /* '(' */
				filter->status += 3;
				break;
			}
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			continue;

		/* ESC $ */
		case 3:
			if (c == 0x40 || c == 0x42) { /* '@' or 'B' */
				filter->status = 0x80;
				break;
			}
			if (c == 0x28) { /* '(' */
				filter->status++;
				break;
			}
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x24, filter->data));
			continue;

		/* ESC $ ( */
		case 4:
			if (c == 0x40 || c == 0x42) { /* '@' or 'B' */
				filter->status = 0x80;
				break;
			}
			if (c == 0x44) { /* 'D' */
				filter->status = 0x90;
				break;
			}
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x24, filter->data));
			CK((*filter->output_function)(0x28, filter->data));
			continue;

		/* ESC ( */
		case 5:
			if (c == 0x42 || c == 0x48) { /* 'B' or 'H' */
				filter->status = 0;
				break;
			}
			if (c == 0x4a) { /* 'J' */
				filter->status = 0x10;
				break;
			}
			if (c == 0x49) { /* 'I' */
				filter->status = 0x20;
				break;
			}
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x28, filter->data));
			continue;

		EMPTY_SWITCH_DEFAULT_CASE();
		}

		return 0;
	}
}
270
/* End-of-input handler for the JIS decoder.
 *
 * If input stopped in the middle of a double-byte character or an escape
 * sequence (low nibble of `status` non-zero), one MBFL_BAD_INPUT is emitted.
 * The decoder state is then reset and the next filter in the chain, if any,
 * is flushed. Returns 0, or -1 if emitting the error marker fails. */
static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter)
{
	const int truncated = (filter->status & 0xF) != 0;

	if (truncated) {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	filter->status = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
286
/* Codepoint-at-a-time encoder: Unicode -> JIS.
 *
 * The codepoint is first mapped to an internal code `s`:
 *   s < 0x80          ASCII byte
 *   0xA1 <= s <= 0xDF JIS X 0201 Kana (emitted as the 7-bit byte s & 0x7f)
 *   s < 0x8080        JIS X 0208 double-byte character
 *   s < 0x10000       JIS X 0212 double-byte character
 *   s >= 0x10000      JIS X 0201 Latin (low 7 bits are the byte)
 *   s < 0             no mapping; reported via mbfl_filt_conv_illegal_output
 *
 * `filter->status & 0xff00` remembers the character set designated last
 * (0 ASCII, 0x100 Kana, 0x200 JIS X 0208, 0x300 JIS X 0212, 0x400 Latin), so
 * that an escape sequence is emitted only when the character set changes.
 *
 * Returns 0, or -1 as soon as an output call returns a negative value. */
static int mbfl_filt_conv_wchar_jis(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) { /* OVERLINE */
		s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	/* Fallback mappings for codepoints which the tables do not cover */
	if (s <= 0) {
		if (c == 0xa5) { /* YEN SIGN */
			s = 0x1005c;
		} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
			s = 0x2140;
		} else if (c == 0x2225) { /* PARALLEL TO */
			s = 0x2142;
		} else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
			s = 0x215d;
		} else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
			s = 0x2171;
		} else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
			s = 0x2172;
		} else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
			s = 0x224c;
		}
		if (c == 0) {
			s = 0; /* NUL is encoded as itself */
		} else if (s <= 0) {
			s = -1;
		}
	}

	if (s >= 0) {
		if (s < 0x80) { /* ASCII */
			if ((filter->status & 0xff00) != 0) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x28, filter->data)); /* '(' */
				CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
			}
			filter->status = 0;
			CK((*filter->output_function)(s, filter->data));
		} else if (s >= 0xA1 && s <= 0xDF) { /* X 0201 kana */
			/* Without this branch, kana would fall into the X 0208 branch
			 * below and be written with a 0x00 first byte. The same range is
			 * handled with ESC ( I by mb_wchar_to_jis.
			 * NOTE(review): confirm that the flush function paired with this
			 * filter treats any non-zero `status & 0xff00` as "not ASCII". */
			if ((filter->status & 0xff00) != 0x100) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x28, filter->data)); /* '(' */
				CK((*filter->output_function)(0x49, filter->data)); /* 'I' */
			}
			filter->status = 0x100;
			CK((*filter->output_function)(s & 0x7f, filter->data));
		} else if (s < 0x8080) { /* X 0208 */
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x24, filter->data)); /* '$' */
				CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
			}
			filter->status = 0x200;
			CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
			CK((*filter->output_function)(s & 0x7f, filter->data));
		} else if (s < 0x10000) { /* X 0212 */
			if ((filter->status & 0xff00) != 0x300) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x24, filter->data)); /* '$' */
				CK((*filter->output_function)(0x28, filter->data)); /* '(' */
				CK((*filter->output_function)(0x44, filter->data)); /* 'D' */
			}
			filter->status = 0x300;
			CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
			CK((*filter->output_function)(s & 0x7f, filter->data));
		} else { /* X 0201 latin */
			if ((filter->status & 0xff00) != 0x400) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x28, filter->data)); /* '(' */
				CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
			}
			filter->status = 0x400;
			CK((*filter->output_function)(s & 0x7f, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
367
/* Codepoint-at-a-time encoder: Unicode -> ISO-2022-JP.
 *
 * Works like mbfl_filt_conv_wchar_jis, but only ASCII, JIS X 0208 and
 * JIS X 0201 Latin are emitted. Table results in 0x80..0x2120 or above 0x8080
 * are treated as having no mapping.
 *
 * `filter->status & 0xff00` remembers the character set designated last
 * (0 ASCII, 0x200 JIS X 0208, 0x400 JIS X 0201 Latin).
 *
 * A codepoint with no mapping is handed to mbfl_filt_conv_illegal_output, as
 * mbfl_filt_conv_wchar_jis does, instead of being silently dropped.
 *
 * Returns 0, or -1 as soon as an output call returns a negative value. */
static int mbfl_filt_conv_wchar_2022jp(int c, mbfl_convert_filter *filter)
{
	int s;

	s = 0;
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (s <= 0) {
		/* Fallback mappings for codepoints which the tables do not cover */
		if (c == 0xa5) { /* YEN SIGN */
			s = 0x1005c;
		} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
			s = 0x2140;
		} else if (c == 0x2225) { /* PARALLEL TO */
			s = 0x2142;
		} else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
			s = 0x215d;
		} else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
			s = 0x2171;
		} else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
			s = 0x2172;
		} else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
			s = 0x224c;
		}
		if (c == 0) {
			s = 0; /* NUL is encoded as itself */
		} else if (s <= 0) {
			s = -1;
		}
	} else if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
		/* Mapped, but to something this encoding does not carry */
		s = -1;
	}

	if (s >= 0) {
		if (s < 0x80) { /* ASCII */
			if ((filter->status & 0xff00) != 0) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x28, filter->data)); /* '(' */
				CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
			}
			filter->status = 0;
			CK((*filter->output_function)(s, filter->data));
		} else if (s < 0x10000) { /* X 0208 */
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x24, filter->data)); /* '$' */
				CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
			}
			filter->status = 0x200;
			CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
			CK((*filter->output_function)(s & 0x7f, filter->data));
		} else { /* X 0201 latin */
			if ((filter->status & 0xff00) != 0x400) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x28, filter->data)); /* '(' */
				CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
			}
			filter->status = 0x400;
			CK((*filter->output_function)(s & 0x7f, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
438
/* Character sets tracked by the buffer-based ISO-2022-JP / JIS converters and
 * validators, in `*state`, `buf->state` or a local `state` variable.
 *
 * The numeric order matters: the decoder tests `*state >= JISX_0208` to
 * recognize the double-byte character sets. */
#define ASCII           0
#define JISX_0201_LATIN 1
#define JISX_0201_KANA  2
#define JISX_0208       3
#define JISX_0212       4
444
mb_iso2022jp_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)445 static size_t mb_iso2022jp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
446 {
447 ZEND_ASSERT(bufsize >= 3);
448
449 unsigned char *p = *in, *e = p + *in_len;
450 uint32_t *out = buf, *limit = buf + bufsize;
451
452 while (p < e && out < limit) {
453 unsigned char c = *p++;
454
455 if (c == 0x1B) {
456 /* ESC seen; this is an escape sequence */
457 if ((e - p) < 2) {
458 *out++ = MBFL_BAD_INPUT;
459 if (p != e && (*p == '$' || *p == '('))
460 p++;
461 continue;
462 }
463
464 unsigned char c2 = *p++;
465 if (c2 == '$') {
466 unsigned char c3 = *p++;
467 if (c3 == '@' || c3 == 'B') {
468 *state = JISX_0208;
469 } else if (c3 == '(') {
470 if (p == e) {
471 *out++ = MBFL_BAD_INPUT;
472 break;
473 }
474 unsigned char c4 = *p++;
475 if (c4 == '@' || c4 == 'B') {
476 *state = JISX_0208;
477 } else if (c4 == 'D') {
478 *state = JISX_0212;
479 } else {
480 if ((limit - out) < 3) {
481 p -= 4;
482 break;
483 }
484 *out++ = MBFL_BAD_INPUT;
485 *out++ = '$';
486 *out++ = '(';
487 p--;
488 }
489 } else {
490 if ((limit - out) < 2) {
491 p -= 3;
492 break;
493 }
494 *out++ = MBFL_BAD_INPUT;
495 *out++ = '$';
496 p--;
497 }
498 } else if (c2 == '(') {
499 unsigned char c3 = *p++;
500 if (c3 == 'B' || c3 == 'H') {
501 *state = ASCII;
502 } else if (c3 == 'J') {
503 *state = JISX_0201_LATIN;
504 } else if (c3 == 'I') {
505 *state = JISX_0201_KANA;
506 } else {
507 if ((limit - out) < 2) {
508 p -= 3;
509 break;
510 }
511 *out++ = MBFL_BAD_INPUT;
512 *out++ = '(';
513 p--;
514 }
515 } else {
516 *out++ = MBFL_BAD_INPUT;
517 p--;
518 }
519 } else if (c == 0xE) {
520 /* "Kana In" marker; this is just for JIS-7/8, but we also accept it for ISO-2022-JP */
521 *state = JISX_0201_KANA;
522 } else if (c == 0xF) {
523 /* "Kana Out" marker */
524 *state = ASCII;
525 } else if (*state == JISX_0201_LATIN && c == 0x5C) { /* YEN SIGN */
526 *out++ = 0xA5;
527 } else if (*state == JISX_0201_LATIN && c == 0x7E) { /* OVER LINE */
528 *out++ = 0x203E;
529 } else if (*state == JISX_0201_KANA && c > 0x20 && c < 0x60) {
530 *out++ = 0xFF40 + c;
531 } else if (*state >= JISX_0208 && c > 0x20 && c < 0x7F) {
532 if (p == e) {
533 *out++ = MBFL_BAD_INPUT;
534 break;
535 }
536 unsigned char c2 = *p++;
537 if (c2 > 0x20 && c2 < 0x7F) {
538 unsigned int s = (c - 0x21)*94 + c2 - 0x21;
539 uint32_t w = 0;
540 if (*state == JISX_0208) {
541 if (s < jisx0208_ucs_table_size) {
542 w = jisx0208_ucs_table[s];
543 }
544 if (!w) {
545 w = MBFL_BAD_INPUT;
546 }
547 } else {
548 if (s < jisx0212_ucs_table_size) {
549 w = jisx0212_ucs_table[s];
550 }
551 if (!w) {
552 w = MBFL_BAD_INPUT;
553 }
554 }
555 *out++ = w;
556 } else {
557 *out++ = MBFL_BAD_INPUT;
558 }
559 } else if (c < 0x80) {
560 *out++ = c;
561 } else if (c >= 0xA1 && c <= 0xDF) {
562 /* GR-invoked Kana; "GR" stands for "graphics right" and refers to bytes
563 * with the MSB bit (in the context of ISO-2022 encoding).
564 *
565 * In this regard, Wikipedia states:
566 * "Other, older variants known as JIS7 and JIS8 build directly on the 7-bit and 8-bit
567 * encodings defined by JIS X 0201 and allow use of JIS X 0201 kana from G1 without
568 * escape sequences, using Shift Out and Shift In or setting the eighth bit
569 * (GR-invoked), respectively."
570 *
571 * Note that we support both the 'JIS7' use of 0xE/0xF Shift In/Shift Out codes
572 * and the 'JIS8' use of GR-invoked Kana */
573 *out++ = 0xFEC0 + c;
574 } else {
575 *out++ = MBFL_BAD_INPUT;
576 }
577 }
578
579 *in_len = e - p;
580 *in = p;
581 return out - buf;
582 }
583
/* Buffer-based encoder: Unicode codepoints -> ISO-2022-JP.
 *
 * Encodes `len` codepoints from `in` into `buf`. `buf->state` remembers the
 * character set designated last, so an escape sequence is written only when
 * the character set changes. Codepoints which cannot be encoded are reported
 * through MB_CONVERT_ERROR. When `end` is true and the output is not in ASCII
 * mode, a final ESC ( B is appended. */
static void mb_wchar_to_iso2022jp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		const uint32_t w = *in++;
		unsigned int s = 0;
		bool unencodable = false;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (s == 0) {
			/* No table entry; try the hard-coded fallbacks */
			switch (w) {
			case 0xA5: /* YEN SIGN */
				s = 0x1005C;
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				s = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				s = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				s = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				s = 0x224C;
				break;
			case 0: /* NUL is encoded as itself */
				break;
			default:
				unencodable = true;
				break;
			}
		} else if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
			/* Mapped, but to something this encoding does not carry */
			unencodable = true;
		}

		if (unencodable) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			continue;
		}

		/* The extra room requested before each escape sequence covers the
		 * sequence itself plus the character which follows it */
		if (s < 0x80) { /* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s < 0x8080) { /* JIS X 0208 */
			if (buf->state != JISX_0208) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else if (s < 0x10000) { /* JIS X 0212 */
			if (buf->state != JISX_0212) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'D');
				buf->state = JISX_0212;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else { /* X 0201 Latin */
			if (buf->state != JISX_0201_LATIN) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		}
	}

	if (end && buf->state != ASCII) {
		/* Leave the output stream in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
668
/* Buffer-based encoder: Unicode codepoints -> JIS.
 *
 * Encodes `len` codepoints from `in` into `buf`. Unlike the ISO-2022-JP
 * encoder, this one also writes JIS X 0201 Kana (ESC ( I) and JIS X 0212
 * (ESC $ ( D), and maps U+203E to the JIS X 0201 Latin overline.
 * `buf->state` remembers the character set designated last, so an escape
 * sequence is written only when the character set changes. When `end` is true
 * and the output is not in ASCII mode, a final ESC ( B is appended.
 *
 * Codepoints which cannot be encoded are reported through MB_CONVERT_ERROR,
 * naming this function as the converter. The ISO-2022-JP encoder was named
 * there before, so whatever the error handler writes through that callback
 * was encoded with the wrong converter. */
static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w == 0x203E) { /* OVERLINE */
			s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (s == 0) {
			/* No table entry; try the hard-coded fallbacks */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x1005C;
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			} else if (w != 0) {
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_jis);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
		}

		/* The extra room requested before each escape sequence covers the
		 * sequence itself plus the character which follows it */
		if (s < 0x80) { /* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA1 && s <= 0xDF) { /* X 0201 Kana, written as a 7-bit byte */
			if (buf->state != JISX_0201_KANA) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else if (s < 0x8080) { /* JIS X 0208 */
			if (buf->state != JISX_0208) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else if (s < 0x10000) { /* JIS X 0212 */
			if (buf->state != JISX_0212) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'D');
				buf->state = JISX_0212;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else { /* X 0201 Latin */
			if (buf->state != JISX_0201_LATIN) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		}
	}

	if (end && buf->state != ASCII) {
		/* Leave the output stream in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
758
/* Extra validator-only state: JIS X 0201 Kana entered via the "Kana In" byte
 * (0xE) rather than via an escape sequence. It can only be left again via the
 * "Kana Out" byte (0xF). */
#define JISX_0201_KANA_SO 5
760
mb_check_jis(unsigned char * in,size_t in_len)761 static bool mb_check_jis(unsigned char *in, size_t in_len)
762 {
763 unsigned char *p = in, *e = p + in_len;
764 unsigned int state = ASCII;
765
766 while (p < e) {
767 unsigned char c = *p++;
768 if (c == 0x1B) {
769 /* ESC seen; this is an escape sequence */
770 if (state == JISX_0201_KANA_SO) {
771 return false;
772 }
773 if ((e - p) < 2) {
774 return false;
775 }
776 unsigned char c2 = *p++;
777 if (c2 == '$') {
778 unsigned char c3 = *p++;
779 if (c3 == '@' || c3 == 'B') {
780 state = JISX_0208;
781 } else if (c3 == '(') {
782 if (p == e) {
783 return false;
784 }
785 unsigned char c4 = *p++;
786 if (c4 == '@' || c4 == 'B') {
787 state = JISX_0208;
788 } else if (c4 == 'D') {
789 state = JISX_0212;
790 } else {
791 return false;
792 }
793 } else {
794 return false;
795 }
796 } else if (c2 == '(') {
797 unsigned char c3 = *p++;
798 /* ESC ( H is treated as a sequence transitioning to ASCII for historical reasons.
799 * see https://github.com/php/php-src/pull/10828#issuecomment-1478342432. */
800 if (c3 == 'B' || c3 == 'H') {
801 state = ASCII;
802 } else if (c3 == 'J') {
803 state = JISX_0201_LATIN;
804 } else if (c3 == 'I') {
805 state = JISX_0201_KANA;
806 } else {
807 return false;
808 }
809 } else {
810 return false;
811 }
812 } else if (c == 0xE) {
813 /* "Kana In" marker */
814 if (state != ASCII) {
815 return false;
816 }
817 state = JISX_0201_KANA_SO;
818 } else if (c == 0xF) {
819 /* "Kana Out" marker */
820 if (state != JISX_0201_KANA_SO) {
821 return false;
822 }
823 state = ASCII;
824 } else if ((state == JISX_0208 || state == JISX_0212) && (c > 0x20 && c < 0x7F)) {
825 if (p == e) {
826 return false;
827 }
828 unsigned char c2 = *p++;
829 if (c2 > 0x20 && c2 < 0x7F) {
830 unsigned int s = (c - 0x21)*94 + c2 - 0x21;
831 if (state == JISX_0208) {
832 if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
833 continue;
834 }
835 } else {
836 if (s < jisx0212_ucs_table_size && jisx0212_ucs_table[s]) {
837 continue;
838 }
839 }
840 return false;
841 } else {
842 return false;
843 }
844 } else if (c < 0x80) {
845 continue;
846 } else if (c >= 0xA1 && c <= 0xDF) {
847 /* GR-invoked Kana */
848 continue;
849 } else {
850 return false;
851 }
852 }
853
854 return state == ASCII;
855 }
856
mb_check_iso2022jp(unsigned char * in,size_t in_len)857 static bool mb_check_iso2022jp(unsigned char *in, size_t in_len)
858 {
859 unsigned char *p = in, *e = p + in_len;
860 unsigned int state = ASCII;
861
862 while (p < e) {
863 unsigned char c = *p++;
864 if (c == 0x1B) {
865 /* ESC seen; this is an escape sequence */
866 if ((e - p) < 2) {
867 return false;
868 }
869 unsigned char c2 = *p++;
870 if (c2 == '$') {
871 unsigned char c3 = *p++;
872 if (c3 == '@' || c3 == 'B') {
873 state = JISX_0208;
874 } else {
875 return false;
876 }
877 } else if (c2 == '(') {
878 unsigned char c3 = *p++;
879 if (c3 == 'B') {
880 state = ASCII;
881 } else if (c3 == 'J') {
882 state = JISX_0201_LATIN;
883 } else {
884 return false;
885 }
886 } else {
887 return false;
888 }
889 } else if (c == 0xE || c == 0xF) {
890 /* "Kana In" or "Kana Out" marker; ISO-2022-JP is not accepted. */
891 return false;
892 } else if (state == JISX_0208 && (c > 0x20 && c < 0x7F)) {
893 if (p == e) {
894 return false;
895 }
896 unsigned char c2 = *p++;
897 if (c2 > 0x20 && c2 < 0x7F) {
898 unsigned int s = (c - 0x21)*94 + c2 - 0x21;
899 if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
900 continue;
901 }
902 return false;
903 } else {
904 return false;
905 }
906 } else if (c < 0x80) {
907 continue;
908 } else {
909 return false;
910 }
911 }
912
913 return state == ASCII;
914 }
915
/* The emoji tables store only 16 bits per entry, although the real codepoints
 * are larger. This function recovers the real codepoint from a table entry:
 *
 *  - entries above 0xF000 stand for Unicode emoji above U+1F000;
 *    0x10000 is added
 *  - entries above 0xE000 (up to 0xF000) stand for emoji which Unicode does
 *    not support, placed in the Private Use Area above U+FE000;
 *    0xF0000 is added
 *  - all other entries already are the real codepoint */
static inline int convert_emoji_cp(int cp)
{
	int offset = 0;

	if (cp > 0xF000) {
		offset = 0x10000;
	} else if (cp > 0xE000) {
		offset = 0xF0000;
	}

	return cp + offset;
}
930
mbfilter_sjis_emoji_kddi2unicode(int s,int * snd)931 int mbfilter_sjis_emoji_kddi2unicode(int s, int *snd)
932 {
933 if (s >= mb_tbl_code2uni_kddi1_min && s <= mb_tbl_code2uni_kddi1_max) {
934 if (s == 0x24C0) { /* Spain */
935 EMIT_FLAG_EMOJI("ES");
936 } else if (s == 0x24C1) { /* Russia */
937 EMIT_FLAG_EMOJI("RU");
938 } else if (s >= 0x2545 && s <= 0x254A) {
939 EMIT_FLAG_EMOJI(nflags_kddi[s - 0x2545]);
940 } else if (s == 0x25BC) {
941 EMIT_KEYPAD_EMOJI('#');
942 } else {
943 *snd = 0;
944 return convert_emoji_cp(mb_tbl_code2uni_kddi1[s - mb_tbl_code2uni_kddi1_min]);
945 }
946 } else if (s >= mb_tbl_code2uni_kddi2_min && s <= mb_tbl_code2uni_kddi2_max) {
947 if (s == 0x2750) { /* Japan */
948 EMIT_FLAG_EMOJI("JP");
949 } else if (s >= 0x27A6 && s <= 0x27AE) {
950 EMIT_KEYPAD_EMOJI(s - 0x27A6 + '1');
951 } else if (s == 0x27F7) { /* United States */
952 EMIT_FLAG_EMOJI("US");
953 } else if (s == 0x2830) {
954 EMIT_KEYPAD_EMOJI('0');
955 } else {
956 *snd = 0;
957 return convert_emoji_cp(mb_tbl_code2uni_kddi2[s - mb_tbl_code2uni_kddi2_min]);
958 }
959 }
960 return 0;
961 }
962
/* Byte-at-a-time decoder: ISO-2022-JP with KDDI emoji -> Unicode codepoints.
 *
 * `filter->status` holds the active character set (ASCII, JISX0201_KANA or
 * JISX0208_KANJI) in its upper bits, and the progress through a multi-byte
 * unit in its low four bits:
 *   0 idle, 1 second byte of a double-byte character expected,
 *   2 after ESC, 3 after ESC $, 4 after ESC $ (, 5 after ESC (
 *
 * Each decoded codepoint (or MBFL_BAD_INPUT for invalid input) is handed to
 * `filter->output_function`. An emoji made of two codepoints results in two
 * output calls.
 *
 * Returns 0, or -1 as soon as the output function returns a negative value.
 * This includes the output of the first codepoint of a two-codepoint emoji,
 * whose result used to be ignored. */
static int mbfl_filt_conv_2022jp_mobile_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, w, snd = 0;

	switch (filter->status & 0xF) {
	case 0:
		if (c == 0x1B) {
			filter->status += 2;
		} else if (filter->status == JISX0201_KANA && c > 0x20 && c < 0x60) {
			CK((*filter->output_function)(0xFF40 + c, filter->data));
		} else if (filter->status == JISX0208_KANJI && c > 0x20 && c < 0x80) {
			/* First byte of a double-byte character; wait for the second */
			filter->cache = c;
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) { /* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xA0 && c < 0xE0) { /* Kana */
			CK((*filter->output_function)(0xFEC0 + c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* JISX 0208, second byte */
	case 1:
		w = 0;
		filter->status &= ~0xF;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7F) {
			s = ((c1 - 0x21) * 94) + c - 0x21;

			/* A few characters in the first two rows are mapped to their
			 * fullwidth forms, taking precedence over the tables */
			if (s <= 137) {
				if (s == 31) {
					w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
				} else if (s == 32) {
					w = 0xFF5E; /* FULLWIDTH TILDE */
				} else if (s == 33) {
					w = 0x2225; /* PARALLEL TO */
				} else if (s == 60) {
					w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
				} else if (s == 80) {
					w = 0xFFE0; /* FULLWIDTH CENT SIGN */
				} else if (s == 81) {
					w = 0xFFE1; /* FULLWIDTH POUND SIGN */
				} else if (s == 137) {
					w = 0xFFE2; /* FULLWIDTH NOT SIGN */
				}
			}

			/* Seven rows starting at (zero-based) row 84 hold emoji; shifted
			 * up by 22 rows they become codes for the KDDI emoji tables */
			if (s >= (84 * 94) && s < (91 * 94)) {
				s += 22 * 94;
				w = mbfilter_sjis_emoji_kddi2unicode(s, &snd);
				if (w > 0 && snd > 0) {
					/* Two-codepoint emoji: the first codepoint goes out here */
					CK((*filter->output_function)(snd, filter->data));
				}
			}

			if (w == 0) {
				if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
					w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
				} else if (s >= 0 && s < jisx0208_ucs_table_size) {
					w = jisx0208_ucs_table[s];
				}
			}

			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC */
	case 2:
		if (c == '$') {
			filter->status++;
		} else if (c == '(') {
			filter->status += 3;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ */
	case 3:
		if (c == '@' || c == 'B') {
			filter->status = JISX0208_KANJI;
		} else if (c == '(') {
			filter->status++;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ ( */
	case 4:
		if (c == '@' || c == 'B') {
			filter->status = JISX0208_KANJI;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC ( */
	case 5:
		if (c == 'B' || c == 'J') {
			filter->status = 0; /* ASCII mode */
		} else if (c == 'I') {
			filter->status = JISX0201_KANA;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
	}

	return 0;
}
1084
/* End-of-input handler for the 2022jp_mobile -> wchar legacy filter.
 *
 * The low nibble of filter->status is nonzero while the decoder is waiting
 * for the second byte of a two-byte character or is inside an escape
 * sequence. If input stops in such a state, a single MBFL_BAD_INPUT marker is
 * passed downstream. The status is then reset and the downstream flush
 * function, if one is set, is called.
 *
 * Return values of the downstream calls are not inspected; always returns 0. */
static int mbfl_filt_conv_2022jp_mobile_wchar_flush(mbfl_convert_filter *filter)
{
	const int mid_sequence = filter->status & 0xF;

	if (mid_sequence != 0) {
		filter->output_function(MBFL_BAD_INPUT, filter->data);
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}

	filter->flush_function(filter->data);
	return 0;
}
1098
/* Try to map Unicode codepoint `c` to a KDDI emoji code.
 *
 * '#' and '0'-'9' are not emitted immediately: they are stored in
 * filter->cache (with the low status bit set) because they form a keypad
 * emoji when followed by U+20E3. On the next call the held character is either
 * merged with U+20E3 into one emoji code, or written out as plain ASCII
 * (preceded by ESC ( B if the filter is not in ASCII mode).
 *
 * Returns 1 when an emoji code was stored in *s1, 0 when no emoji was found
 * (including when `c` was merely held back). CK is defined elsewhere and
 * presumably returns early with an error value if an output call fails. */
static int mbfilter_unicode2sjis_emoji_kddi(int c, int *s1, mbfl_convert_filter *filter)
{
	if ((filter->status & 0xF) == 1) {
		/* The previous call held back '#' or a digit */
		int held = filter->cache;
		filter->cache = 0;
		filter->status &= ~0xFF;

		if (c == 0x20E3) {
			/* Held character + COMBINING ENCLOSING KEYCAP = keypad emoji */
			switch (held) {
			case '#':
				*s1 = 0x25BC;
				break;
			case '0':
				*s1 = 0x2830;
				break;
			default: /* held character was '1'-'9' */
				*s1 = 0x27A6 + (held - '1');
				break;
			}
			return 1;
		}

		/* No keycap followed: emit the held character as ordinary ASCII */
		if (filter->status & 0xFF00) {
			CK(filter->output_function(0x1B, filter->data)); /* ESC */
			CK(filter->output_function('(', filter->data));
			CK(filter->output_function('B', filter->data));
		}
		CK(filter->output_function(held, filter->data));
		filter->status = 0;
	}

	if (c == '#' || (c >= '0' && c <= '9')) {
		/* Possible start of a keypad emoji; wait for the next codepoint */
		filter->status |= 1;
		filter->cache = c;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x27DC;
		return 1;
	}
	if (c == 0xAE) { /* Registered sign */
		*s1 = 0x27DD;
		return 1;
	}

	/* Only the first range containing `c` is searched */
	if (c >= mb_tbl_uni_kddi2code2_min && c <= mb_tbl_uni_kddi2code2_max) {
		int pos = mbfl_bisec_srch2(c, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_kddi2code2_value[pos];
			return 1;
		}
	} else if (c >= mb_tbl_uni_kddi2code3_min && c <= mb_tbl_uni_kddi2code3_max) {
		int pos = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_kddi2code3_value[pos];
			return 1;
		}
	} else if (c >= mb_tbl_uni_kddi2code5_min && c <= mb_tbl_uni_kddi2code5_max) {
		int pos = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_kddi2code5_val[pos];
			return 1;
		}
	}

	return 0;
}
1158
/* (ku*94)+ten value -> packed two-byte code
 *
 * Splits s1 into its row (s1 / 94) and cell (remainder), adds 0x21 to each,
 * stores them in c1 and c2, then overwrites s1 with (c1 << 8) | c2 and sets
 * s2 to 1.
 *
 * NOTE(review): no Shift-JIS byte folding is applied here; the result is the
 * plain 0x21-biased row/cell pair.
 *
 * The expansion is four bare statements (no do/while wrapper) and evaluates
 * its arguments more than once, so it must be used as a full statement inside
 * braces with side-effect-free lvalue arguments. */
#define CODE2JIS(c1,c2,s1,s2) \
	c1 = (s1)/94+0x21; \
	c2 = (s1)-94*((c1)-0x21)+0x21; \
	s1 = ((c1) << 8) | (c2); \
	s2 = 1
1165
/* Legacy per-codepoint encoder: Unicode -> ISO-2022-JP with KDDI emoji.
 *
 * filter->status layout used here:
 *   0xFF00 bits : current output character set
 *                 (0 = ASCII, 0x100 = JIS X 0201 kana, 0x200 = JIS X 0208)
 *   low bits    : 1 while a '#' or digit is held in filter->cache, waiting to
 *                 see whether U+20E3 follows (keypad emoji)
 *
 * An escape sequence is written whenever the required character set differs
 * from the current one. Codepoints with no mapping are handed to
 * mbfl_filt_conv_illegal_output(). Returns 0. */
static int mbfl_filt_conv_wchar_2022jp_mobile(int c, mbfl_convert_filter *filter)
{
	int c1, c2, s1 = 0, s2 = 0;

	/* 1. Generic Unicode -> JIS lookup tables */
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	/* 2. Hard-coded mappings for codepoints the tables did not resolve */
	if (s1 <= 0) {
		switch (c) {
		case 0xA5: /* YEN SIGN */
			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
			break;
		case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			s1 = 0x2142;
			break;
		case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215d;
			break;
		case 0xFFE0: /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
			break;
		case 0xFFE1: /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
			break;
		case 0xFFE2: /* FULLWIDTH NOT SIGN */
			s1 = 0x224c;
			break;
		default:
			break;
		}
	}

	/* 3. KDDI emoji (this call may also write out a previously held character) */
	int emoji_found = mbfilter_unicode2sjis_emoji_kddi(c, &s1, filter);
	if (emoji_found > 0) {
		/* s1 now holds a KDDI emoji code; pack it and shift the row byte down by 0x16 */
		CODE2JIS(c1,c2,s1,s2);
		s1 -= 0x1600;
	} else if ((filter->status & 0xFF) == 1 && filter->cache) {
		/* `c` was held back as the possible start of a keypad emoji; nothing to emit yet */
		return 0;
	}

	/* 4. Not found or X 0212: fall back to a linear scan of the CP932 extension table */
	if (s1 <= 0 || (s1 >= 0xa1a1 && s2 == 0)) {
		s1 = -1;
		for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
			if (c == cp932ext1_ucs_table[i]) {
				s1 = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
				break;
			}
		}

		if (c == 0) {
			s1 = 0;
		}
	}

	/* 5. Emit */
	if (s1 < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (s1 < 0x80) { /* ASCII */
		if (filter->status & 0xFF00) {
			CK(filter->output_function(0x1B, filter->data)); /* ESC */
			CK(filter->output_function('(', filter->data));
			CK(filter->output_function('B', filter->data));
		}
		CK(filter->output_function(s1, filter->data));
		filter->status = 0;
	} else if (s1 > 0xA0 && s1 < 0xE0) { /* Kana */
		if ((filter->status & 0xFF00) != 0x100) {
			CK(filter->output_function(0x1B, filter->data)); /* ESC */
			CK(filter->output_function('(', filter->data));
			CK(filter->output_function('I', filter->data));
		}
		filter->status = 0x100;
		CK(filter->output_function(s1 & 0x7F, filter->data));
	} else if (s1 < 0x7E7F) { /* JIS X 0208 */
		if ((filter->status & 0xFF00) != 0x200) {
			CK(filter->output_function(0x1B, filter->data)); /* ESC */
			CK(filter->output_function('$', filter->data));
			CK(filter->output_function('B', filter->data));
		}
		filter->status = 0x200;
		CK(filter->output_function((s1 >> 8) & 0xFF, filter->data));
		CK(filter->output_function(s1 & 0x7F, filter->data));
	}

	return 0;
}
1254
/* End-of-input handler for the wchar -> 2022jp_mobile legacy filter.
 *
 * Writes ESC ( B if the output is not in ASCII mode, then writes out any '#'
 * or digit that was still being held back as a possible keypad-emoji start.
 * Finally resets status/cache and calls the downstream flush function, if set.
 * Return values of the downstream calls are not inspected; always returns 0. */
static int mbfl_filt_conv_wchar_2022jp_mobile_flush(mbfl_convert_filter *filter)
{
	/* Go back to ASCII mode (so strings can be safely concatenated) */
	if ((filter->status & 0xFF00) != 0) {
		filter->output_function(0x1B, filter->data); /* ESC */
		filter->output_function('(', filter->data);
		filter->output_function('B', filter->data);
	}

	int held = filter->cache;
	if ((filter->status & 0xFF) == 1) {
		if (held == '#' || (held >= '0' && held <= '9')) {
			filter->output_function(held, filter->data);
		}
	}
	filter->cache = 0;
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}

	filter->flush_function(filter->data);
	return 0;
}
1276
/* Decode a chunk of ISO-2022-JP (KDDI variant) bytes into codepoints.
 *
 * in, in_len : input cursor and remaining byte count; both are updated on
 *              return to reflect what was consumed
 * buf, bufsize : output array and its capacity in codepoints
 * state      : currently selected character set (ASCII, JISX0201_KANA or
 *              JISX0208_KANJI); updated when an escape sequence is decoded
 *
 * Returns the number of codepoints written to buf. Invalid or truncated
 * input produces MBFL_BAD_INPUT markers in the output. */
static size_t mb_iso2022jp_kddi_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	unsigned char *p = *in, *e = p + *in_len;
	/* One slot is held in reserve because a single iteration can write two
	 * codepoints (an emoji made of `snd` followed by `w`) */
	uint32_t *out = buf, *limit = buf + bufsize - 1;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c == 0x1B) {
			/* Escape sequence: at least two more bytes are needed */
			if ((e - p) < 2) {
				/* Truncated escape: consume the rest and report one error */
				p = e;
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			unsigned char c3 = *p++;

			if (c2 == '$') {
				if (c3 == '@' || c3 == 'B') {
					*state = JISX0208_KANJI;
				} else if (c3 == '(') {
					/* Four-byte form: ESC $ ( @ or ESC $ ( B */
					if (p == e) {
						*out++ = MBFL_BAD_INPUT;
						break;
					}
					unsigned char c4 = *p++;

					if (c4 == '@' || c4 == 'B') {
						*state = JISX0208_KANJI;
					} else {
						*out++ = MBFL_BAD_INPUT;
					}
				} else {
					*out++ = MBFL_BAD_INPUT;
				}
			} else if (c2 == '(') {
				if (c3 == 'B' || c3 == 'J') {
					*state = ASCII;
				} else if (c3 == 'I') {
					*state = JISX0201_KANA;
				} else {
					*out++ = MBFL_BAD_INPUT;
				}
			} else {
				/* Unrecognized byte after ESC: step back over c3 so it is
				 * decoded again as an ordinary byte */
				p--;
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (*state == JISX0201_KANA && c >= 0x21 && c <= 0x5F) {
			/* 7-bit kana: 0x21..0x5F maps to U+FF61..U+FF9F */
			*out++ = 0xFF40 + c;
		} else if (*state == JISX0208_KANJI && c >= 0x21 && c <= 0x7F) {
			/* First byte of a two-byte character */
			if (p == e) {
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;

			if (c2 >= 0x21 && c2 <= 0x7E) {
				/* s = (row * 94) + cell, both zero-based */
				unsigned int s = ((c - 0x21) * 94) + c2 - 0x21;
				uint32_t w = 0;

				/* Hard-coded mappings that take priority over the tables */
				if (s <= 137) {
					if (s == 31) {
						w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
					} else if (s == 32) {
						w = 0xFF5E; /* FULLWIDTH TILDE */
					} else if (s == 33) {
						w = 0x2225; /* PARALLEL TO */
					} else if (s == 60) {
						w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
					} else if (s == 80) {
						w = 0xFFE0; /* FULLWIDTH CENT SIGN */
					} else if (s == 81) {
						w = 0xFFE1; /* FULLWIDTH POUND SIGN */
					} else if (s == 137) {
						w = 0xFFE2; /* FULLWIDTH NOT SIGN */
					}
				}

				/* Rows 84..90 (zero-based) are looked up as KDDI emoji after
				 * shifting the code up by 22 rows */
				if (s >= (84 * 94) && s < (91 * 94)) {
					int snd = 0;
					s += 22 * 94;
					w = mbfilter_sjis_emoji_kddi2unicode(s, &snd);
					if (w && snd) {
						/* Two-codepoint emoji: `snd` is written before `w` */
						*out++ = snd;
					}
				}

				if (!w) {
					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
					} else if (s < jisx0208_ucs_table_size) {
						w = jisx0208_ucs_table[s];
					}
				}

				*out++ = w ? w : MBFL_BAD_INPUT;
			} else {
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (c <= 0x7F) {
			/* Any other 7-bit byte is passed through unchanged */
			*out++ = c;
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* 8-bit kana: 0xA1..0xDF maps to U+FF61..U+FF9F */
			*out++ = 0xFEC0 + c;
		} else {
			*out++ = MBFL_BAD_INPUT;
		}
	}

	*in_len = e - p;
	*in = p;
	return out - buf;
}
1389
/* Encode `len` codepoints from `in` as ISO-2022-JP (KDDI variant) into `buf`.
 *
 * buf->state holds the currently selected output character set (ASCII,
 * JISX0201_KANA or JISX0208_KANJI); an escape sequence is written whenever
 * the next character needs a different one. When `end` is true and the
 * output is not in ASCII mode, a final ESC ( B is appended.
 *
 * The MB_CONVERT_BUF_* and MB_CONVERT_ERROR macros are defined elsewhere;
 * presumably they manage the output cursor/capacity and report unencodable
 * codepoints -- TODO confirm against their definitions. */
static void mb_wchar_to_iso2022jp_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		/* Generic Unicode -> JIS lookup tables */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		/* Hard-coded mappings for codepoints the tables did not resolve */
		if (!s) {
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* Two-codepoint emoji need one codepoint of lookahead, which is only
		 * possible when more input remains in this chunk (`len` nonzero) */
		if ((w == '#' || (w >= '0' && w <= '9')) && len) {
			uint32_t w2 = *in++; len--;

			if (w2 == 0x20E3) {
				/* '#' or digit followed by U+20E3: keypad emoji */
				unsigned int s1 = 0;
				if (w == '#') {
					s1 = 0x25BC;
				} else if (w == '0') {
					s1 = 0x2830;
				} else { /* Previous character was '1'-'9' */
					s1 = 0x27A6 + (w - '1');
				}
				/* Pack row/cell (each biased by 0x21) into two bytes, then
				 * lower the row byte by 0x16; the decoder in this file adds
				 * 22 rows back before its emoji lookup */
				s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
			} else {
				/* Not a keypad emoji: put the lookahead codepoint back */
				in--; len++;
			}
		} else if (w >= NFLAGS('C') && w <= NFLAGS('U') && len) { /* C for CN, U for US */
			uint32_t w2 = *in++; len--;

			if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */
				/* Look for a supported pair of Regional Indicator codepoints */
				for (int i = 0; i < 10; i++) {
					if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) {
						unsigned int s1 = nflags_code_kddi[i];
						s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
						/* Skip the put-back below: both codepoints were consumed */
						goto found_flag_emoji;
					}
				}
			}

			/* No flag matched: put the lookahead codepoint back */
			in--; len++;
found_flag_emoji: ;
		}

		/* Single-codepoint KDDI emoji; only the first range containing `w`
		 * is searched */
		if (w == 0xA9) { /* Copyright sign */
			unsigned int s1 = 0x27DC;
			s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
		} else if (w == 0xAE) { /* Registered sign */
			unsigned int s1 = 0x27DD;
			s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
		} else if (w >= mb_tbl_uni_kddi2code2_min && w <= mb_tbl_uni_kddi2code2_max) {
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
			if (i >= 0) {
				unsigned int s1 = mb_tbl_uni_kddi2code2_value[i];
				s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
			}
		} else if (w >= mb_tbl_uni_kddi2code3_min && w <= mb_tbl_uni_kddi2code3_max) {
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
			if (i >= 0) {
				unsigned int s1 = mb_tbl_uni_kddi2code3_value[i];
				s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
			}
		} else if (w >= mb_tbl_uni_kddi2code5_min && w <= mb_tbl_uni_kddi2code5_max) {
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
			if (i >= 0) {
				unsigned int s1 = mb_tbl_uni_kddi2code5_val[i];
				s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
			}
		}

		/* Nothing found, or a code at/above 0xA1A1 (the legacy filter's
		 * matching branch labels these "X 0212"): fall back to a linear scan
		 * of the CP932 extension table */
		if (!s || s >= 0xA1A1) {
			s = 0;
			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					break;
				}
			}
			/* U+0000 always encodes as a zero byte, whatever the scan found */
			if (w == 0)
				s = 0;
		}

		if (!s && w) {
			/* No mapping exists for this codepoint */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0x7F) {
			/* Single ASCII byte */
			if (buf->state != ASCII) {
				/* 3 escape bytes + 1 data byte, plus room for the remaining input */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA1 && s <= 0xDF) {
			/* Kana, written as a single 7-bit byte */
			if (buf->state != JISX0201_KANA) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX0201_KANA;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else if (s <= 0x7E7E) {
			/* Two-byte character */
			if (buf->state != JISX0208_KANJI) {
				/* 3 escape bytes + 2 data bytes, plus room for the remaining input */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX0208_KANJI;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			/* Code is outside every range that can be written */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	/* At the very end of the string, return to ASCII mode */
	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1538
/* Legacy per-byte decoder shared by EUC-JP-2004, SJIS-2004 and
 * ISO-2022-JP-2004; the source encoding is read from
 * filter->from->no_encoding.
 *
 * filter->status layout used here:
 *   low nibble  : decoder state (the `case` labels of the switch below);
 *                 0 means "between characters"
 *   upper bits  : ISO-2022-JP-2004 only; 0x80 = JIS X 0208,
 *                 0x90 = JIS X 0213 plane 1, 0xa0 = JIS X 0213 plane 2,
 *                 0 = ASCII
 * filter->cache holds the first byte of a multi-byte character.
 *
 * Decoded codepoints (or MBFL_BAD_INPUT for invalid input) are passed to
 * filter->output_function. Returns 0. */
static int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter)
{
	int k;
	int c1, c2, s, s1 = 0, s2 = 0, w = 0, w1;

	switch (filter->status & 0xf) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
				CK((*filter->output_function)(c, filter->data));
			} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
				/* In SJIS-2004, 0x5C and 0x7E decode to U+00A5 and U+203E */
				if (c == 0x5c) {
					CK((*filter->output_function)(0x00a5, filter->data));
				} else if (c == 0x7e) {
					CK((*filter->output_function)(0x203e, filter->data));
				} else {
					CK((*filter->output_function)(c, filter->data));
				}
			} else { /* ISO-2022-JP-2004 */
				if (c == 0x1b) {
					filter->status += 6;
				} else if ((filter->status == 0x80 || filter->status == 0x90 || filter->status == 0xa0)
						&& c > 0x20 && c < 0x7f) { /* kanji first char */
					filter->cache = c;
					/* The increment selects which state handles the second byte */
					if (filter->status == 0x90) {
						filter->status += 1; /* JIS X 0213 plane 1 */
					} else if (filter->status == 0xa0) {
						filter->status += 4; /* JIS X 0213 plane 2 */
					} else {
						filter->status += 5; /* JIS X 0208 */
					}
				} else {
					CK((*filter->output_function)(c, filter->data));
				}
			}
		} else {
			if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
				if (c > 0xa0 && c < 0xff) { /* X 0213 plane 1 first char */
					filter->status = 1;
					filter->cache = c;
				} else if (c == 0x8e) { /* kana first char */
					filter->cache = 0x8E; /* So error will be reported if input is truncated right here */
					filter->status = 2;
				} else if (c == 0x8f) { /* X 0213 plane 2 first char */
					filter->status = 3;
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				}
			} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
				if (c > 0xa0 && c < 0xe0) { /* kana */
					CK((*filter->output_function)(0xfec0 + c, filter->data));
				} else if (c > 0x80 && c < 0xfd && c != 0xa0) { /* kanji first char */
					filter->status = 1;
					filter->cache = c;
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				}
			} else {
				/* ISO-2022-JP-2004 is a 7-bit encoding; bytes >= 0x80 are invalid */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;

	case 1: /* kanji second char */
		filter->status &= ~0xf;
		c1 = filter->cache;

		/* Normalize both bytes to the 7-bit row/cell form (s1, s2) */
		if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
			if (c > 0xa0 && c < 0xff) {
				s1 = c1 - 0x80;
				s2 = c - 0x80;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
			if (c >= 0x40 && c <= 0xfc && c != 0x7f) {
				SJIS_DECODE(c1, c, s1, s2);
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		} else { /* ISO-2022-JP-2004 */
			if (c >= 0x21 && c <= 0x7E) {
				s1 = c1;
				s2 = c;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		}
		w1 = (s1 << 8) | s2;

		/* conversion for combining characters */
		if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) ||
			(w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 ||
			(w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
			k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
			if (k >= 0) {
				/* One code maps to two codepoints: the first is written here,
				 * the second by the common output call at the end of this case */
				w = jisx0213_u2_tbl[2*k];
				CK((*filter->output_function)(w, filter->data));
				w = jisx0213_u2_tbl[2*k+1];
			}
		}

		/* conversion for BMP */
		if (w <= 0) {
			/* From here on w1 is the linear index (row * 94) + cell */
			w1 = (s1 - 0x21)*94 + s2 - 0x21;
			if (w1 >= 0 && w1 < jisx0213_ucs_table_size) {
				w = jisx0213_ucs_table[w1];
			}
		}

		/* conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
		if (w <= 0) {
			k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				w = jisx0213_jis_u5_tbl[k] + 0x20000;
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2: /* got 0x8e: EUC-JP-2004 kana */
		filter->status = 0;
		if (c > 0xa0 && c < 0xe0) {
			w = 0xfec0 + c;
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 3: /* X 0213 plane 2 first char: EUC-JP-2004 (0x8f) */
		/* Only these first bytes are accepted after 0x8F */
		if (c == 0xA1 || (c >= 0xA3 && c <= 0xA5) || c == 0xA8 || (c >= 0xAC && c <= 0xAF) || (c >= 0xEE && c <= 0xFE)) {
			filter->cache = c - 0x80;
			filter->status++;
		} else {
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 4: /* X 0213 plane 2 second char: EUC-JP-2004, ISO-2022-JP-2004 */
		filter->status &= ~0xF;
		c1 = filter->cache;
		if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
			c2 = c - 0x80;
		} else {
			c2 = c;
		}

		if (c2 < 0x21 || c2 > 0x7E) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		s1 = c1 - 0x21;
		s2 = c2 - 0x21;

		/* Same set of rows as accepted in case 3, expressed zero-based */
		if (((s1 >= 0 && s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) ||
			(s1 >= 77 && s1 < 94)) && s2 >= 0 && s2 < 94) {
			/* calc offset from ku */
			for (k = 0; k < jisx0213_p2_ofst_len; k++) {
				if (s1 == jisx0213_p2_ofst[k]) {
					break;
				}
			}
			/* NOTE(review): this reads jisx0213_p2_ofst[k] without re-checking
			 * k; it relies on every row admitted above being present in
			 * jisx0213_p2_ofst -- confirm against the table */
			k -= jisx0213_p2_ofst[k];

			/* check for japanese chars in BMP */
			s = (s1 + 94 + k)*94 + s2;
			ZEND_ASSERT(s < jisx0213_ucs_table_size);
			w = jisx0213_ucs_table[s];

			/* check for japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
			if (w <= 0) {
				k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
				if (k >= 0) {
					w = jisx0213_jis_u5_tbl[k] + 0x20000;
				}
			}

			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}

			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 5: /* X 0208: ISO-2022-JP-2004 */
		filter->status &= ~0xf;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7f) {
			s = (c1 - 0x21)*94 + c - 0x21;
			if (s >= 0 && s < jisx0208_ucs_table_size) {
				w = jisx0208_ucs_table[s];
			}
		}

		/* An out-of-range second byte leaves w at 0 and is reported here too */
		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}

		CK((*filter->output_function)(w, filter->data));
		break;

	/* ESC: ISO-2022-JP-2004 */
	/* case 0x06: */
	/* case 0x16: */
	/* case 0x26: */
	/* case 0x86: */
	/* case 0x96: */
	/* case 0xa6: */
	case 6:
		if (c == '$') {
			filter->status++;
		} else if (c == '(') {
			filter->status += 3;
		} else {
			/* Invalid escape: drop back to state 0, keeping the charset bits */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $: ISO-2022-JP-2004 */
	/* case 0x07: */
	/* case 0x17: */
	/* case 0x27: */
	/* case 0x87: */
	/* case 0x97: */
	/* case 0xa7: */
	case 7:
		if (c == 'B') { /* JIS X 0208-1983 */
			filter->status = 0x80;
		} else if (c == '(') {
			filter->status++;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ (: ISO-2022-JP-2004 */
	/* case 0x08: */
	/* case 0x18: */
	/* case 0x28: */
	/* case 0x88: */
	/* case 0x98: */
	/* case 0xa8: */
	case 8:
		if (c == 'Q') { /* JIS X 0213 plane 1 */
			filter->status = 0x90;
		} else if (c == 'P') { /* JIS X 0213 plane 2 */
			filter->status = 0xa0;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC (: ISO-2022-JP-2004 */
	/* case 0x09: */
	/* case 0x19: */
	/* case 0x29: */
	/* case 0x89: */
	/* case 0x99: */
	case 9:
		if (c == 'B') {
			/* Back to ASCII */
			filter->status = 0;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
1827
/* End-of-input handler for the JIS-2004 family -> wchar legacy filter.
 *
 * A nonzero low nibble in filter->status means input ended in the middle of
 * a multi-byte character or escape sequence; one MBFL_BAD_INPUT marker is
 * passed downstream in that case. The status is then cleared.
 *
 * Returns the result of the downstream flush function when one is set,
 * otherwise 0. */
static int mbfl_filt_conv_jis2004_wchar_flush(mbfl_convert_filter *filter)
{
	const int mid_sequence = filter->status & 0xF;

	if (mid_sequence != 0) {
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}

	return filter->flush_function(filter->data);
}
1841
/* Legacy per-codepoint encoder shared by SJIS-2004, EUC-JP-2004 and
 * ISO-2022-JP-2004; the target encoding is read from filter->to->no_encoding.
 *
 * filter->status layout used here:
 *   low nibble  : 1 while the first codepoint of a possible two-codepoint
 *                 (combining) sequence is being held back; filter->cache then
 *                 holds its index into jisx0213_u2_tbl
 *   0xff00 bits : ISO-2022-JP-2004 only; currently designated character set
 *                 (0 = ASCII, 0x200 = JIS X 0213 plane 1,
 *                  0x400 = JIS X 0213 plane 2)
 *
 * Plane 1 and plane 2 must be tracked as distinct modes: they are selected by
 * different escape sequences (ESC $ ( Q vs. ESC $ ( P), so switching from one
 * to the other requires a new escape sequence to be written.
 *
 * Codepoints with no mapping are handed to mbfl_filt_conv_illegal_output().
 * Returns 0. */
static int mbfl_filt_conv_wchar_jis2004(int c, mbfl_convert_filter *filter)
{
	int k;
	int c1, c2, s1, s2;

retry:
	s1 = 0;
	/* check for 1st char of combining characters */
	if ((filter->status & 0xf) == 0 && (
			c == 0x00E6 ||
			(c >= 0x0254 && c <= 0x02E9) ||
			(c >= 0x304B && c <= 0x3053) ||
			(c >= 0x30AB && c <= 0x30C8) ||
			c == 0x31F7)) {
		for (k = 0; k < jisx0213_u2_tbl_len; k++) {
			if (c == jisx0213_u2_tbl[2*k]) {
				/* Hold this codepoint back until we see what follows it */
				filter->status++;
				filter->cache = k;
				return 0;
			}
		}
	}

	/* check for 2nd char of combining characters */
	if ((filter->status & 0xf) == 1 && filter->cache >= 0 && filter->cache < jisx0213_u2_tbl_len) {
		k = filter->cache;
		filter->status &= ~0xf;
		filter->cache = 0;

		c1 = jisx0213_u2_tbl[2*k];
		/* These four first codepoints have two table entries each; U+0301
		 * as the second codepoint selects the following entry */
		if ((c1 == 0x0254 || c1 == 0x028C || c1 == 0x0259 || c1 == 0x025A) && c == 0x0301) {
			k++;
		}
		if (c == jisx0213_u2_tbl[2*k+1]) {
			/* The pair maps to a single code */
			s1 = jisx0213_u2_key[k];
		} else { /* fallback */
			/* The held codepoint stands alone: write it out by itself, then
			 * process `c` from scratch */
			s1 = jisx0213_u2_fb_tbl[k];

			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
			} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
				s2 = (s1 & 0xff) + 0x80;
				s1 = ((s1 >> 8) & 0xff) + 0x80;
			} else {
				if (filter->status != 0x200) {
					CK((*filter->output_function)(0x1b, filter->data));
					CK((*filter->output_function)('$', filter->data));
					CK((*filter->output_function)('(', filter->data));
					CK((*filter->output_function)('Q', filter->data));
				}
				filter->status = 0x200;

				s2 = s1 & 0x7f;
				s1 = (s1 >> 8) & 0x7f;
			}

			/* Flush out cached data */
			CK((*filter->output_function)(s1, filter->data));
			CK((*filter->output_function)(s2, filter->data));
			goto retry;
		}
	}

	/* check for major japanese chars: U+4E00 - U+9FFF */
	if (s1 <= 0) {
		for (k = 0; k < uni2jis_tbl_len; k++) {
			if (c >= uni2jis_tbl_range[k][0] && c <= uni2jis_tbl_range[k][1]) {
				s1 = uni2jis_tbl[k][c-uni2jis_tbl_range[k][0]];
				break;
			}
		}
	}

	/* check for japanese chars in compressed mapping area: U+1E00 - U+4DBF */
	if (s1 <= 0 && c >= ucs_c1_jisx0213_min && c <= ucs_c1_jisx0213_max) {
		k = mbfl_bisec_srch(c, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
		if (k >= 0) {
			s1 = ucs_c1_jisx0213_ofst[k] + c - ucs_c1_jisx0213_tbl[2*k];
		}
	}

	/* check for japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
	if (s1 <= 0 && c >= jisx0213_u5_tbl_min && c <= jisx0213_u5_tbl_max) {
		k = mbfl_bisec_srch2(c - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
		if (k >= 0) {
			s1 = jisx0213_u5_jis_tbl[k];
		}
	}

	if (s1 <= 0) {
		/* CJK Compatibility Forms: U+FE30 - U+FE4F */
		if (c == 0xfe45) {
			s1 = 0x233e;
		} else if (c == 0xfe46) {
			s1 = 0x233d;
		} else if (c >= 0xf91d && c <= 0xf9dc) {
			/* CJK Compatibility Ideographs: U+F900 - U+F92A */
			k = mbfl_bisec_srch2(c, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
			if (k >= 0) {
				s1 = ucs_r2b_jisx0213_cmap_val[k];
			}
		}
	}

	/* Only U+0000 may legitimately map to code 0; anything else is unmapped */
	if (s1 <= 0) {
		if (c == 0) {
			s1 = 0;
		} else {
			s1 = -1;
		}
	}

	if (s1 >= 0) {
		if (s1 < 0x80) { /* ASCII */
			if (filter->to->no_encoding == mbfl_no_encoding_2022jp_2004 && (filter->status & 0xff00)) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('B', filter->data));
			}
			filter->status = 0;
			CK((*filter->output_function)(s1, filter->data));
		} else if (s1 < 0x100) { /* latin or kana */
			if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
				CK((*filter->output_function)(0x8e, filter->data));
				CK((*filter->output_function)(s1, filter->data));
			} else if (filter->to->no_encoding == mbfl_no_encoding_sjis2004 && (s1 >= 0xA1 && s1 <= 0xDF)) {
				CK((*filter->output_function)(s1, filter->data));
			} else {
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		} else if (s1 < 0x7f00) { /* X 0213 plane 1 */
			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
			} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
				s2 = (s1 & 0xff) + 0x80;
				s1 = ((s1 >> 8) & 0xff) + 0x80;
			} else {
				if ((filter->status & 0xff00) != 0x200) {
					CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
					CK((*filter->output_function)('$', filter->data));
					CK((*filter->output_function)('(', filter->data));
					CK((*filter->output_function)('Q', filter->data));
				}
				filter->status = 0x200;
				s2 = s1 & 0xff;
				s1 = (s1 >> 8) & 0xff;
			}
			CK((*filter->output_function)(s1, filter->data));
			CK((*filter->output_function)(s2, filter->data));
		} else { /* X 0213 plane 2 */
			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
			} else {
				s2 = s1 & 0xff;
				/* The high byte (from 0x7f upward) indexes the table of
				 * plane 2 row numbers */
				k = ((s1 >> 8) & 0xff) - 0x7f;
				if (k >= 0 && k < jisx0213_p2_ofst_len) {
					s1 = jisx0213_p2_ofst[k] + 0x21;
				}
				if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
					s2 |= 0x80;
					s1 |= 0x80;
					CK((*filter->output_function)(0x8f, filter->data));
				} else {
					/* Plane 2 has its own mode value (0x400), distinct from
					 * plane 1's 0x200, so that moving between the two planes
					 * always writes the designating escape sequence */
					if ((filter->status & 0xff00) != 0x400) {
						CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
						CK((*filter->output_function)('$', filter->data));
						CK((*filter->output_function)('(', filter->data));
						CK((*filter->output_function)('P', filter->data));
					}
					filter->status = 0x400;
				}
			}

			CK((*filter->output_function)(s1, filter->data));
			CK((*filter->output_function)(s2, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
2030
/* End-of-input handler for the wchar -> JIS-2004 family legacy filter.
 *
 * If the encoder was still holding back the first codepoint of a possible
 * two-codepoint sequence (low nibble of filter->status == 1, table index in
 * filter->cache), that codepoint is written out on its own using the fallback
 * table. For ISO-2022-JP-2004 the output is then returned to ASCII mode.
 *
 * Returns the result of the downstream flush function when one is set,
 * otherwise 0. */
static int mbfl_filt_conv_wchar_jis2004_flush(mbfl_convert_filter *filter)
{
	int k, c1, c2, s1, s2;

	k = filter->cache;
	filter->cache = 0;

	/* Only the low nibble says whether a codepoint is pending; the upper bits
	 * carry the ISO-2022-JP-2004 character-set mode and must be masked off,
	 * otherwise a pending codepoint is lost whenever the mode is not ASCII.
	 * The index bound matches the one applied by the encoder before it reads
	 * the same tables. */
	if ((filter->status & 0xf) == 1 && k >= 0 && k < jisx0213_u2_tbl_len) {
		s1 = jisx0213_u2_fb_tbl[k];

		if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
			c1 = (s1 >> 8) & 0xff;
			c2 = s1 & 0xff;
			SJIS_ENCODE(c1, c2, s1, s2);
		} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
			s2 = (s1 & 0xff) | 0x80;
			s1 = ((s1 >> 8) & 0xff) | 0x80;
		} else {
			s2 = s1 & 0x7f;
			s1 = (s1 >> 8) & 0x7f;
			/* The fallback code is written in JIS X 0213 plane 1 */
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)('$', filter->data));
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('Q', filter->data));
			}
			filter->status = 0x200;
		}

		CK((*filter->output_function)(s1, filter->data));
		CK((*filter->output_function)(s2, filter->data));
	}

	/* If we had switched to a different charset, go back to ASCII mode
	 * This makes it possible to concatenate arbitrary valid strings
	 * together and get a valid string */
	if (filter->status & 0xff00) {
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}

	filter->status = 0;

	if (filter->flush_function) {
		return (*filter->flush_function)(filter->data);
	}

	return 0;
}
2081
2082 #define ASCII 0
2083 #define JISX0208 1
2084 #define JISX0213_PLANE1 2
2085 #define JISX0213_PLANE2 3
2086
mb_iso2022jp2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)2087 static size_t mb_iso2022jp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
2088 {
2089 unsigned char *p = *in, *e = p + *in_len;
2090 uint32_t *out = buf, *limit = buf + bufsize - 1;
2091
2092 while (p < e && out < limit) {
2093 unsigned char c = *p++;
2094
2095 if (c <= 0x7F) {
2096 if (c == 0x1B) {
2097 if ((e - p) < 2) {
2098 *out++ = MBFL_BAD_INPUT;
2099 p = e;
2100 break;
2101 }
2102 unsigned char c2 = *p++;
2103 unsigned char c3 = *p++;
2104 if (c2 == '$') {
2105 if (c3 == 'B') {
2106 *state = JISX0208;
2107 } else if (c3 == '(') {
2108 if (p == e) {
2109 *out++ = MBFL_BAD_INPUT;
2110 break;
2111 }
2112 unsigned char c4 = *p++;
2113 if (c4 == 'Q') {
2114 *state = JISX0213_PLANE1;
2115 } else if (c4 == 'P') {
2116 *state = JISX0213_PLANE2;
2117 } else {
2118 *out++ = MBFL_BAD_INPUT;
2119 }
2120 } else {
2121 *out++ = MBFL_BAD_INPUT;
2122 }
2123 } else if (c2 == '(') {
2124 if (c3 == 'B') {
2125 *state = ASCII;
2126 } else {
2127 *out++ = MBFL_BAD_INPUT;
2128 }
2129 } else {
2130 p--;
2131 *out++ = MBFL_BAD_INPUT;
2132 }
2133 } else if (*state >= JISX0208 && c > 0x20 && c < 0x7F) {
2134 if (p == e) {
2135 *out++ = MBFL_BAD_INPUT;
2136 break;
2137 }
2138 unsigned char c2 = *p++;
2139 if (c2 < 0x21 || c2 > 0x7E) {
2140 *out++ = MBFL_BAD_INPUT;
2141 continue;
2142 }
2143
2144 if (*state == JISX0213_PLANE1) {
2145 unsigned int w1 = (c << 8) | c2;
2146
2147 /* Conversion for combining characters */
2148 if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
2149 int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
2150 if (k >= 0) {
2151 *out++ = jisx0213_u2_tbl[2*k];
2152 *out++ = jisx0213_u2_tbl[2*k+1];
2153 continue;
2154 }
2155 }
2156
2157 /* Conversion for BMP */
2158 uint32_t w = 0;
2159 w1 = (c - 0x21)*94 + c2 - 0x21;
2160 if (w1 < jisx0213_ucs_table_size) {
2161 w = jisx0213_ucs_table[w1];
2162 }
2163
2164 /* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
2165 if (!w) {
2166 int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
2167 if (k >= 0) {
2168 w = jisx0213_jis_u5_tbl[k] + 0x20000;
2169 }
2170 }
2171
2172 *out++ = w ? w : MBFL_BAD_INPUT;
2173 } else if (*state == JISX0213_PLANE2) {
2174
2175 unsigned int s1 = c - 0x21, s2 = c2 - 0x21;
2176
2177 if (((s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) || (s1 >= 77 && s1 < 94)) && s2 < 94) {
2178 int k;
2179 for (k = 0; k < jisx0213_p2_ofst_len; k++) {
2180 if (s1 == jisx0213_p2_ofst[k]) {
2181 break;
2182 }
2183 }
2184 k -= jisx0213_p2_ofst[k];
2185
2186 /* Check for Japanese chars in BMP */
2187 unsigned int s = (s1 + 94 + k)*94 + s2;
2188 ZEND_ASSERT(s < jisx0213_ucs_table_size);
2189 uint32_t w = jisx0213_ucs_table[s];
2190
2191 /* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
2192 if (!w) {
2193 k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
2194 if (k >= 0) {
2195 w = jisx0213_jis_u5_tbl[k] + 0x20000;
2196 }
2197 }
2198
2199 *out++ = w ? w : MBFL_BAD_INPUT;
2200 } else {
2201 *out++ = MBFL_BAD_INPUT;
2202 }
2203 } else { /* state == JISX0208 */
2204 unsigned int s = (c - 0x21)*94 + c2 - 0x21;
2205 uint32_t w = 0;
2206 if (s < jisx0208_ucs_table_size) {
2207 w = jisx0208_ucs_table[s];
2208 }
2209 *out++ = w ? w : MBFL_BAD_INPUT;
2210 }
2211 } else {
2212 *out++ = c;
2213 }
2214 } else {
2215 *out++ = MBFL_BAD_INPUT;
2216 }
2217 }
2218
2219 *in_len = e - p;
2220 *in = p;
2221 return out - buf;
2222 }
2223
/* Encode Unicode codepoints as ISO-2022-JP-2004, appending bytes to `buf`.
 *
 * `in`/`len` give the input codepoints. `end` tells whether this is the last
 * batch of input: if so, and the output is not in ASCII mode, ESC ( B is
 * appended at the end.
 *
 * The low byte of `buf->state` is the charset currently designated in the
 * output (ASCII, JISX0213_PLANE1 or JISX0213_PLANE2). When the final
 * codepoint of a non-final batch could be the first half of a two-codepoint
 * sequence, its table index plus one is parked in bits 8-15 of `buf->state`
 * and processing resumes with it on the next call. */
static void mb_wchar_to_iso2022jp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t cp;
	if (buf->state & 0xFF00) {
		/* Pick up the codepoint which was held back by the previous call */
		int held = (buf->state >> 8) - 1;
		cp = jisx0213_u2_tbl[2*held];
		buf->state &= 0xFF;
		goto process_codepoint;
	}

	while (len--) {
		cp = *in++;
process_codepoint: ;
		unsigned int jis = 0;

		/* Codepoints which may combine with the one following them */
		if (cp == 0xE6 || (cp >= 0x254 && cp <= 0x2E9) || (cp >= 0x304B && cp <= 0x3053) || (cp >= 0x30AB && cp <= 0x30C8) || cp == 0x31F7) {
			for (int k = 0; k < jisx0213_u2_tbl_len; k++) {
				if (cp != jisx0213_u2_tbl[2*k]) {
					continue;
				}

				if (len) {
					/* Peek at the following codepoint */
					uint32_t following = *in;
					if ((cp == 0x254 || cp == 0x28C || cp == 0x259 || cp == 0x25A) && following == 0x301) {
						k++;
					}
					if (following == jisx0213_u2_tbl[2*k+1]) {
						/* The pair maps to a single character; consume both */
						in++; len--;
						jis = jisx0213_u2_key[k];
						break;
					}
				} else if (!end) {
					/* More input may follow; remember this codepoint and
					 * decide once the next one is available */
					buf->state |= (k+1) << 8;
					MB_CONVERT_BUF_STORE(buf, out, limit);
					return;
				}

				/* No combination; use the code for the codepoint on its own */
				jis = jisx0213_u2_fb_tbl[k];
				break;
			}
		}

		/* Range-based tables (major Japanese chars: U+4E00-U+9FFF) */
		if (!jis) {
			for (int k = 0; k < uni2jis_tbl_len; k++) {
				if (cp >= uni2jis_tbl_range[k][0] && cp <= uni2jis_tbl_range[k][1]) {
					jis = uni2jis_tbl[k][cp - uni2jis_tbl_range[k][0]];
					break;
				}
			}
		}

		/* Compressed mapping area: U+1E00-U+4DBF */
		if (!jis && cp >= ucs_c1_jisx0213_min && cp <= ucs_c1_jisx0213_max) {
			int k = mbfl_bisec_srch(cp, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (k >= 0) {
				jis = ucs_c1_jisx0213_ofst[k] + cp - ucs_c1_jisx0213_tbl[2*k];
			}
		}

		/* CJK Unified Ideographs ext. B (U+2XXXX) */
		if (!jis && cp >= jisx0213_u5_tbl_min && cp <= jisx0213_u5_tbl_max) {
			int k = mbfl_bisec_srch2(cp - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				jis = jisx0213_u5_jis_tbl[k];
			}
		}

		if (!jis) {
			if (cp == 0xFE45) {
				/* CJK Compatibility Forms */
				jis = 0x233E;
			} else if (cp == 0xFE46) {
				jis = 0x233D;
			} else if (cp >= 0xF91D && cp <= 0xF9DC) {
				/* CJK Compatibility Ideographs */
				int k = mbfl_bisec_srch2(cp, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (k >= 0) {
					jis = ucs_r2b_jisx0213_cmap_val[k];
				}
			}
		}

		if ((!jis && cp) || (jis > 0x7F && jis <= 0xFF)) {
			/* Either nothing was found for a non-zero codepoint, or the
			 * result is a single byte >= 0x80, which cannot be output */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_iso2022jp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (jis <= 0x7F) {
			/* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, jis);
		} else {
			/* Two-byte character in JIS X 0213 plane 1 or plane 2 */
			bool plane1 = (jis <= 0x7EFF);
			unsigned int charset = plane1 ? JISX0213_PLANE1 : JISX0213_PLANE2;

			if (buf->state != charset) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', plane1 ? 'Q' : 'P');
				buf->state = charset;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}

			if (plane1) {
				out = mb_convert_buf_add2(out, (jis >> 8) & 0xFF, jis & 0xFF);
			} else {
				/* The high byte selects an entry of `jisx0213_p2_ofst`,
				 * which supplies the actual row of the character */
				unsigned int low = jis & 0xFF;
				int k = ((jis >> 8) & 0xFF) - 0x7F;
				ZEND_ASSERT(k < jisx0213_p2_ofst_len);
				jis = jisx0213_p2_ofst[k] + 0x21;
				out = mb_convert_buf_add2(out, jis, low);
			}
		}
	}

	/* Leave the output in ASCII mode when the input is finished */
	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
2356
/* Byte-at-a-time decoder for the CP5022x encodings (legacy filter API).
 * Receives one input byte in `c` and passes decoded codepoints (or
 * MBFL_BAD_INPUT) to `filter->output_function`.
 *
 * `filter->status` packs two pieces of information:
 *   - high nibble of the low byte: active charset
 *       0x00 ASCII, 0x10 JIS X 0201 Latin, 0x20 JIS X 0201 Kana,
 *       0x80 JIS X 0208, 0x90 JIS X 0212
 *   - low nibble: position inside a multi-byte unit
 *       0 none, 1 second byte of a 2-byte character, 2 after ESC,
 *       3 after ESC $, 4 after ESC $ (, 5 after ESC (
 *
 * When an escape sequence turns out to be invalid, MBFL_BAD_INPUT is emitted,
 * followed by the already-consumed bytes of the sequence ('$' and/or '('),
 * and the current byte is then decoded again as ordinary input. */
static int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter)
{
retry:
	switch (filter->status & 0xf) {
	/* Not inside any multi-byte unit */
	case 0: {
		const int charset = filter->status;

		if (c == 0x1b) {
			filter->status += 2;
		} else if (c == 0x0e) { /* "kana in" */
			filter->status = 0x20;
		} else if (c == 0x0f) { /* "kana out" */
			filter->status = 0;
		} else if (charset == 0x10 && c == '\\') { /* JIS X 0201 Latin: YEN SIGN */
			CK((*filter->output_function)(0xa5, filter->data));
		} else if (charset == 0x10 && c == '~') { /* JIS X 0201 Latin: OVERLINE */
			CK((*filter->output_function)(0x203e, filter->data));
		} else if (charset == 0x20 && c > 0x20 && c < 0x60) { /* 7-bit kana */
			CK((*filter->output_function)(0xff40 + c, filter->data));
		} else if ((charset == 0x80 || charset == 0x90) && c > 0x20 && c <= 0x97) {
			/* First byte of a 2-byte character; wait for the second one */
			filter->cache = c;
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) { /* Latin letters and control characters */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xa0 && c < 0xe0) { /* 8-bit (GR) kana */
			CK((*filter->output_function)(0xfec0 + c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

	/* Second byte of a JIS X 0208 or JIS X 0212 character */
	case 1: {
		filter->status &= ~0xf;
		const int first = filter->cache;

		if (c <= 0x20 || c >= 0x7f) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		/* Table position: (row * 94) + column */
		const int s = (first - 0x21)*94 + c - 0x21;
		int w = 0;

		if (filter->status == 0x80) {
			if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
				w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
			} else if (s >= 0 && s < jisx0208_ucs_table_size) {
				w = jisx0208_ucs_table[s];
			} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
				w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
			} else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
				w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
			} else if (s >= 94 * 94 && s < 114 * 94) {
				/* User-defined area maps onto the Private Use Area (Microsoft extension) */
				w = s - 94*94 + 0xe000;
			}
		} else if (s >= 0 && s < jisx0212_ucs_table_size) {
			w = jisx0212_ucs_table[s];
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;
	}

	/* After ESC */
	case 2:
		if (c == '$') {
			filter->status += 1;
		} else if (c == '(') {
			filter->status += 3;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			goto retry;
		}
		break;

	/* After ESC $ */
	case 3:
		if (c == '@' || c == 'B') {
			filter->status = 0x80;
		} else if (c == '(') {
			filter->status += 1;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)('$', filter->data));
			goto retry;
		}
		break;

	/* After ESC $ ( */
	case 4:
		if (c == '@' || c == 'B') {
			filter->status = 0x80;
		} else if (c == 'D') {
			filter->status = 0x90;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)('$', filter->data));
			CK((*filter->output_function)('(', filter->data));
			goto retry;
		}
		break;

	/* After ESC ( */
	case 5:
		if (c == 'B' || c == 'H') {
			filter->status = 0;
		} else if (c == 'J') {
			filter->status = 0x10;
		} else if (c == 'I') {
			filter->status = 0x20;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)('(', filter->data));
			goto retry;
		}
		break;

	EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
2519
/* Flush function for the CP5022x decoder.
 * A non-zero low nibble in `filter->status` means the input ended in the
 * middle of a 2-byte character or an escape sequence; that is reported as
 * MBFL_BAD_INPUT. The status is then reset and the downstream flush function
 * (if any) is invoked; its result is not propagated. */
static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter)
{
	const int unfinished = filter->status & 0xF;

	if (unfinished != 0) {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
2535
/* Halfwidth kana -> fullwidth katakana.
 * Indexed by (codepoint - 0xFF60) for codepoints U+FF61..U+FF9F; each entry
 * is the low byte of the result, to which 0x3000 is added by the caller
 * (0x3001 or 0x3002 when a following voiced / semi-voiced mark is merged). */
static const unsigned char hankana2zenkana_table[64] = {
	/*  0 */ 0x00, 0x02, 0x0C, 0x0D, 0x01, 0xFB, 0xF2, 0xA1,
	/*  8 */ 0xA3, 0xA5, 0xA7, 0xA9, 0xE3, 0xE5, 0xE7, 0xC3,
	/* 16 */ 0xFC, 0xA2, 0xA4, 0xA6, 0xA8, 0xAA, 0xAB, 0xAD,
	/* 24 */ 0xAF, 0xB1, 0xB3, 0xB5, 0xB7, 0xB9, 0xBB, 0xBD,
	/* 32 */ 0xBF, 0xC1, 0xC4, 0xC6, 0xC8, 0xCA, 0xCB, 0xCC,
	/* 40 */ 0xCD, 0xCE, 0xCF, 0xD2, 0xD5, 0xD8, 0xDB, 0xDE,
	/* 48 */ 0xDF, 0xE0, 0xE1, 0xE2, 0xE4, 0xE6, 0xE8, 0xE9,
	/* 56 */ 0xEA, 0xEB, 0xEC, 0xED, 0xEF, 0xF3, 0x9B, 0x9C
};
2545
/* Halfwidth kana -> fullwidth hiragana.
 * Indexed by (codepoint - 0xFF60) for codepoints U+FF61..U+FF9F; each entry
 * is the low byte of the result, to which 0x3000 is added by the caller
 * (0x3001 or 0x3002 when a following voiced / semi-voiced mark is merged). */
static const unsigned char hankana2zenhira_table[64] = {
	/*  0 */ 0x00, 0x02, 0x0C, 0x0D, 0x01, 0xFB, 0x92, 0x41,
	/*  8 */ 0x43, 0x45, 0x47, 0x49, 0x83, 0x85, 0x87, 0x63,
	/* 16 */ 0xFC, 0x42, 0x44, 0x46, 0x48, 0x4A, 0x4B, 0x4D,
	/* 24 */ 0x4F, 0x51, 0x53, 0x55, 0x57, 0x59, 0x5B, 0x5D,
	/* 32 */ 0x5F, 0x61, 0x64, 0x66, 0x68, 0x6A, 0x6B, 0x6C,
	/* 40 */ 0x6D, 0x6E, 0x6F, 0x72, 0x75, 0x78, 0x7B, 0x7E,
	/* 48 */ 0x7F, 0x80, 0x81, 0x82, 0x84, 0x86, 0x88, 0x89,
	/* 56 */ 0x8A, 0x8B, 0x8C, 0x8D, 0x8F, 0x93, 0x9B, 0x9C
};
2555
/* Fullwidth kana -> halfwidth kana.
 * Indexed by (codepoint - 0x30A1) for katakana U+30A1..U+30F4, or by
 * (codepoint - 0x3041) for hiragana U+3041..U+3093. Each entry holds the low
 * bytes of up to two output codepoints (0xFF00 is added to each): the first
 * is always present, the second is zero when there is no second codepoint. */
static const unsigned char zenkana2hankana_table[84][2] = {
	/*  0 */ {0x67,0x00}, {0x71,0x00}, {0x68,0x00}, {0x72,0x00}, {0x69,0x00}, {0x73,0x00},
	/*  6 */ {0x6A,0x00}, {0x74,0x00}, {0x6B,0x00}, {0x75,0x00}, {0x76,0x00}, {0x76,0x9E},
	/* 12 */ {0x77,0x00}, {0x77,0x9E}, {0x78,0x00}, {0x78,0x9E}, {0x79,0x00}, {0x79,0x9E},
	/* 18 */ {0x7A,0x00}, {0x7A,0x9E}, {0x7B,0x00}, {0x7B,0x9E}, {0x7C,0x00}, {0x7C,0x9E},
	/* 24 */ {0x7D,0x00}, {0x7D,0x9E}, {0x7E,0x00}, {0x7E,0x9E}, {0x7F,0x00}, {0x7F,0x9E},
	/* 30 */ {0x80,0x00}, {0x80,0x9E}, {0x81,0x00}, {0x81,0x9E}, {0x6F,0x00}, {0x82,0x00},
	/* 36 */ {0x82,0x9E}, {0x83,0x00}, {0x83,0x9E}, {0x84,0x00}, {0x84,0x9E}, {0x85,0x00},
	/* 42 */ {0x86,0x00}, {0x87,0x00}, {0x88,0x00}, {0x89,0x00}, {0x8A,0x00}, {0x8A,0x9E},
	/* 48 */ {0x8A,0x9F}, {0x8B,0x00}, {0x8B,0x9E}, {0x8B,0x9F}, {0x8C,0x00}, {0x8C,0x9E},
	/* 54 */ {0x8C,0x9F}, {0x8D,0x00}, {0x8D,0x9E}, {0x8D,0x9F}, {0x8E,0x00}, {0x8E,0x9E},
	/* 60 */ {0x8E,0x9F}, {0x8F,0x00}, {0x90,0x00}, {0x91,0x00}, {0x92,0x00}, {0x93,0x00},
	/* 66 */ {0x6C,0x00}, {0x94,0x00}, {0x6D,0x00}, {0x95,0x00}, {0x6E,0x00}, {0x96,0x00},
	/* 72 */ {0x97,0x00}, {0x98,0x00}, {0x99,0x00}, {0x9A,0x00}, {0x9B,0x00}, {0x9C,0x00},
	/* 78 */ {0x9C,0x00}, {0x72,0x00}, {0x74,0x00}, {0x66,0x00}, {0x9D,0x00}, {0x73,0x9E}
};
2575
2576 /* Apply various transforms to input codepoint, such as converting halfwidth katakana
2577 * to fullwidth katakana. `mode` is a bitfield which controls which transforms are
2578 * actually performed. The bit values are defined in translit_kana_jisx0201_jisx0208.h.
2579 * `mode` must not call for transforms which are inverses (i.e. which would cancel
2580 * each other out).
2581 *
2582 * In some cases, successive input codepoints may be merged into one output codepoint.
2583 * (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed
2584 * and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed`
2585 * will not be modified. If there is no following codepoint, `next` should be zero.
2586 *
2587 * Again, in some cases, one input codepoint may convert to two output codepoints.
2588 * If so, the second output codepoint will be stored in `*second`.
2589 *
2590 * Return the resulting codepoint. If none of the requested transforms apply, return
2591 * the input codepoint unchanged.
2592 */
mb_convert_kana_codepoint(uint32_t c,uint32_t next,bool * consumed,uint32_t * second,unsigned int mode)2593 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode)
2594 {
2595 if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7D && c != '"' && c != '\'' && c != '\\') {
2596 return c + 0xFEE0;
2597 }
2598 if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2599 return c + 0xFEE0;
2600 }
2601 if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') {
2602 return c + 0xFEE0;
2603 }
2604 if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') {
2605 return 0x3000;
2606 }
2607
2608 if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) {
2609 /* Convert Hankaku kana to Zenkaku kana
2610 * Either all Hankaku kana (including katakana and hiragana) will be converted
2611 * to Zenkaku katakana, or to Zenkaku hiragana */
2612 if ((mode & MBFL_HAN2ZEN_KATAKANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
2613 if (c >= 0xFF61 && c <= 0xFF9F) {
2614 int n = c - 0xFF60;
2615
2616 if (next >= 0xFF61 && next <= 0xFF9F) {
2617 if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
2618 *consumed = true;
2619 return 0x3001 + hankana2zenkana_table[n];
2620 }
2621 if (next == 0xFF9E && n == 19) {
2622 *consumed = true;
2623 return 0x30F4;
2624 }
2625 if (next == 0xFF9F && n >= 42 && n <= 46) {
2626 *consumed = true;
2627 return 0x3002 + hankana2zenkana_table[n];
2628 }
2629 }
2630
2631 return 0x3000 + hankana2zenkana_table[n];
2632 }
2633 }
2634 if ((mode & MBFL_HAN2ZEN_HIRAGANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
2635 if (c >= 0xFF61 && c <= 0xFF9F) {
2636 int n = c - 0xFF60;
2637
2638 if (next >= 0xFF61 && next <= 0xFF9F) {
2639 if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
2640 *consumed = true;
2641 return 0x3001 + hankana2zenhira_table[n];
2642 }
2643 if (next == 0xFF9F && n >= 42 && n <= 46) {
2644 *consumed = true;
2645 return 0x3002 + hankana2zenhira_table[n];
2646 }
2647 }
2648
2649 return 0x3000 + hankana2zenhira_table[n];
2650 }
2651 }
2652 if ((mode & MBFL_HAN2ZEN_KATAKANA) && c >= 0xFF61 && c <= 0xFF9F) {
2653 return 0x3000 + hankana2zenkana_table[c - 0xFF60];
2654 }
2655 if ((mode & MBFL_HAN2ZEN_HIRAGANA) && c >= 0xFF61 && c <= 0xFF9F) {
2656 return 0x3000 + hankana2zenhira_table[c - 0xFF60];
2657 }
2658 }
2659
2660 if (mode & MBFL_HAN2ZEN_SPECIAL) { /* special ascii to symbol */
2661 if (c == '\\' || c == 0xA5) { /* YEN SIGN */
2662 return 0xFFE5; /* FULLWIDTH YEN SIGN */
2663 }
2664 if (c == 0x7E || c == 0x203E) {
2665 return 0xFFE3; /* FULLWIDTH MACRON */
2666 }
2667 if (c == '\'') {
2668 return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
2669 }
2670 if (c == '"') {
2671 return 0x201D; /* RIGHT DOUBLE QUOTATION MARK */
2672 }
2673 }
2674
2675 if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE)) {
2676 /* Zenkaku to Hankaku */
2677 if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xFF01 && c <= 0xFF5D && c != 0xFF02 && c != 0xFF07 && c != 0xFF3C) {
2678 /* all except " ' \ ~ */
2679 return c - 0xFEE0;
2680 }
2681 if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xFF21 && c <= 0xFF3A) || (c >= 0xFF41 && c <= 0xFF5A))) {
2682 return c - 0xFEE0;
2683 }
2684 if ((mode & MBFL_ZEN2HAN_NUMERIC) && (c >= 0xFF10 && c <= 0xFF19)) {
2685 return c - 0xFEE0;
2686 }
2687 if ((mode & MBFL_ZEN2HAN_SPACE) && (c == 0x3000)) {
2688 return ' ';
2689 }
2690 if ((mode & MBFL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
2691 return '-';
2692 }
2693 }
2694
2695 if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) {
2696 /* Zenkaku kana to hankaku kana */
2697 if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30A1 && c <= 0x30F4) {
2698 /* Zenkaku katakana to hankaku kana */
2699 int n = c - 0x30A1;
2700 if (zenkana2hankana_table[n][1]) {
2701 *second = 0xFF00 + zenkana2hankana_table[n][1];
2702 }
2703 return 0xFF00 + zenkana2hankana_table[n][0];
2704 }
2705 if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
2706 /* Zenkaku hiragana to hankaku kana */
2707 int n = c - 0x3041;
2708 if (zenkana2hankana_table[n][1]) {
2709 *second = 0xFF00 + zenkana2hankana_table[n][1];
2710 }
2711 return 0xFF00 + zenkana2hankana_table[n][0];
2712 }
2713 if (c == 0x3001) {
2714 return 0xFF64; /* HALFWIDTH IDEOGRAPHIC COMMA */
2715 }
2716 if (c == 0x3002) {
2717 return 0xFF61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
2718 }
2719 if (c == 0x300C) {
2720 return 0xFF62; /* HALFWIDTH LEFT CORNER BRACKET */
2721 }
2722 if (c == 0x300D) {
2723 return 0xFF63; /* HALFWIDTH RIGHT CORNER BRACKET */
2724 }
2725 if (c == 0x309B) {
2726 return 0xFF9E; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
2727 }
2728 if (c == 0x309C) {
2729 return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
2730 }
2731 if (c == 0x30FC) {
2732 return 0xFF70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
2733 }
2734 if (c == 0x30FB) {
2735 return 0xFF65; /* HALFWIDTH KATAKANA MIDDLE DOT */
2736 }
2737 }
2738
2739 if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) {
2740 if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309D || c == 0x309E)) {
2741 /* Zenkaku hiragana to Zenkaku katakana */
2742 return c + 0x60;
2743 }
2744 if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30A1 && c <= 0x30F3) || c == 0x30FD || c == 0x30FE)) {
2745 /* Zenkaku katakana to Zenkaku hiragana */
2746 return c - 0x60;
2747 }
2748 }
2749
2750 if (mode & MBFL_ZEN2HAN_SPECIAL) { /* special symbol to ascii */
2751 if (c == 0xFFE5 || c == 0xFF3C) { /* FULLWIDTH YEN SIGN/FULLWIDTH REVERSE SOLIDUS */
2752 return '\\';
2753 }
2754 if (c == 0xFFE3 || c == 0x203E) { /* FULLWIDTH MACRON/OVERLINE */
2755 return '~';
2756 }
2757 if (c == 0x2018 || c == 0x2019) { /* LEFT/RIGHT SINGLE QUOTATION MARK*/
2758 return '\'';
2759 }
2760 if (c == 0x201C || c == 0x201D) { /* LEFT/RIGHT DOUBLE QUOTATION MARK */
2761 return '"';
2762 }
2763 }
2764
2765 return c;
2766 }
2767
2768 static int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter);
2769
/* wchar -> CP50220 filter function.
 * Works like the CP50221 filter, except that each codepoint is first passed
 * through mb_convert_kana_codepoint with MBFL_HAN2ZEN_KATAKANA and
 * MBFL_HAN2ZEN_GLUE. Since that transform needs to see the following
 * codepoint, every codepoint is held in `filter->cache` until the next one
 * arrives (or until the filter is flushed). */
static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
{
	if (!filter->cache) {
		if (c == 0) {
			/* `filter->cache == 0` means that no codepoint is cached, so a
			 * zero codepoint cannot be held back; output it directly */
			(*filter->output_function)(0, filter->data);
		} else {
			filter->cache = c;
		}
		return 0;
	}

	bool merged = false;
	int converted = mb_convert_kana_codepoint(filter->cache, c, &merged, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
	/* If `c` was merged into the cached codepoint, nothing is left to hold back */
	filter->cache = merged ? 0 : c;

	/* Terrible hack to get CP50220 to emit error markers in the proper
	 * position, not reordering them with subsequent characters */
	filter->filter_function = mbfl_filt_conv_wchar_cp50221;
	mbfl_filt_conv_wchar_cp50221(converted, filter);
	filter->filter_function = mbfl_filt_conv_wchar_cp50220;

	if (c == 0 && !merged) {
		/* A zero codepoint was just stored in the cache, where it looks
		 * like "nothing cached"; output it now */
		(*filter->output_function)(0, filter->data);
	}

	return 0;
}
2796
/* Common flush function for the escape-sequence based JIS output filters.
 * If the status word records a non-ASCII charset (any of bits 8-15 set),
 * ESC ( B is emitted to return to ASCII. The status is then cleared and the
 * downstream flush function, if there is one, is called; its result is
 * returned (0 if there is none). */
static int mbfl_filt_conv_any_jis_flush(mbfl_convert_filter *filter)
{
	if (filter->status & 0xff00) {
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}
	filter->status = 0;

	return filter->flush_function ? (*filter->flush_function)(filter->data) : 0;
}
2813
/* Flush function for the wchar -> CP50220 filter.
 * Converts and outputs the codepoint still held in `filter->cache` (if any),
 * with no following codepoint to merge with, and then performs the common
 * JIS flush. */
static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter)
{
	const int pending = filter->cache;

	if (pending != 0) {
		/* `next` is zero, so nothing can be merged and the NULL `consumed`
		 * pointer is never written through */
		int converted = mb_convert_kana_codepoint(pending, 0, NULL, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);

		/* Same function pointer swap as in the filter function itself */
		filter->filter_function = mbfl_filt_conv_wchar_cp50221;
		mbfl_filt_conv_wchar_cp50221(converted, filter);
		filter->filter_function = mbfl_filt_conv_wchar_cp50220;
		filter->cache = 0;
	}

	return mbfl_filt_conv_any_jis_flush(filter);
}
2828
/* wchar -> CP50221 filter function.
 * Maps codepoint `c` to a JIS code and outputs it, preceded by an escape
 * sequence whenever the charset differs from the one recorded in bits 8-15
 * of `filter->status`:
 *   0x000 ASCII (ESC ( B), 0x200 JIS X 0208 (ESC $ B),
 *   0x400 JIS X 0201 Latin (ESC ( J), 0x500 JIS X 0201 Kana (ESC ( I)
 * Codepoints which cannot be represented are handed to
 * mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	/* Table lookups. Results of 0x10000 and above denote JIS X 0201 Latin. */
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) { /* OVERLINE */
		s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c <= 0xE757) {
		/* 'private'/'user' codepoints */
		s = c - 0xE000;
		s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
	}

	/* Individual codepoints for which the tables gave nothing */
	if (s <= 0) {
		switch (c) {
		case 0xa5: /* YEN SIGN */
			s = 0x1005c;
			break;
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			s = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			s = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			s = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			s = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			s = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			s = 0x224c;
			break;
		default:
			break;
		}
	}

	/* A result of zero means the codepoint is not in JIS X 0208. A result
	 * with the high bit set in both its upper and lower byte is not a
	 * JIS X 0208 code either, but one from JIS X 0213. In both cases, check
	 * whether the codepoint is one of the extensions which Microsoft added
	 * to JIS X 0208 (to make CP932). */
	if (s == 0 || ((s & 0x8000) && (s & 0x80))) {
		const int ext1_len = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
		const int ext1_row = cp932ext1_ucs_table_min / 94;
		const int ext2_len = cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
		const int ext2_row = cp932ext2_ucs_table_min / 94;
		int i;

		s = -1;

		/* Linear search of the first extension table... */
		for (i = 0; i < ext1_len; i++) {
			if (c == cp932ext1_ucs_table[i]) {
				s = ((i / 94 + ext1_row + 0x21) << 8) + (i % 94 + 0x21);
				break;
			}
		}

		/* ...and of the second one, if still not found */
		for (i = 0; s < 0 && i < ext2_len; i++) {
			if (c == cp932ext2_ucs_table[i]) {
				s = ((i / 94 + ext2_row + 0x21) << 8) + (i % 94 + 0x21);
			}
		}

		/* Codepoint zero is always output as a zero byte */
		if (c == 0) {
			s = 0;
		} else if (s <= 0) {
			s = -1;
		}
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (s < 0x80) {
		/* ASCII */
		if ((filter->status & 0xff00) != 0) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('B', filter->data));
			filter->status = 0;
		}
		CK((*filter->output_function)(s, filter->data));
	} else if (s >= 0xa0 && s < 0xe0) {
		/* JIS X 0201 Kana */
		if ((filter->status & 0xff00) != 0x500) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('I', filter->data));
			filter->status = 0x500;
		}
		CK((*filter->output_function)(s - 0x80, filter->data));
	} else if (s <= 0x927E) {
		/* JIS X 0208 + extensions */
		if ((filter->status & 0xff00) != 0x200) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('$', filter->data));
			CK((*filter->output_function)('B', filter->data));
			filter->status = 0x200;
		}
		CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(s & 0xff, filter->data));
	} else if (s < 0x10000) {
		/* JIS X 0212; not supported for output */
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else {
		/* JIS X 0201 Latin */
		if ((filter->status & 0xff00) != 0x400) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('J', filter->data));
		}
		filter->status = 0x400;
		CK((*filter->output_function)(s & 0x7f, filter->data));
	}

	return 0;
}
2952
mbfl_filt_conv_wchar_cp50222(int c,mbfl_convert_filter * filter)2953 static int mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter)
2954 {
2955 int s = 0;
2956
2957 if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
2958 s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
2959 } else if (c == 0x203E) { /* OVERLINE */
2960 s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
2961 } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
2962 s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
2963 } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
2964 s = ucs_i_jis_table[c - ucs_i_jis_table_min];
2965 } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
2966 s = ucs_r_jis_table[c - ucs_r_jis_table_min];
2967 } else if (c >= 0xE000 && c <= 0xE757) {
2968 /* 'private'/'user' codepoints */
2969 s = c - 0xE000;
2970 s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
2971 }
2972
2973 if (s <= 0) {
2974 if (c == 0xa5) { /* YEN SIGN */
2975 s = 0x1005c;
2976 } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
2977 s = 0x2140;
2978 } else if (c == 0x2225) { /* PARALLEL TO */
2979 s = 0x2142;
2980 } else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
2981 s = 0x215d;
2982 } else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
2983 s = 0x2171;
2984 } else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
2985 s = 0x2172;
2986 } else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
2987 s = 0x224c;
2988 }
2989 }
2990 if (s == 0 || ((s & 0x8000) && (s & 0x80))) {
2991 int i;
2992 s = -1;
2993
2994 for (i = 0;
2995 i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
2996 const int oh = cp932ext1_ucs_table_min / 94;
2997
2998 if (c == cp932ext1_ucs_table[i]) {
2999 s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
3000 break;
3001 }
3002 }
3003
3004 if (s <= 0) {
3005 const int oh = cp932ext2_ucs_table_min / 94;
3006 const int cp932ext2_ucs_table_size =
3007 cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
3008 for (i = 0; i < cp932ext2_ucs_table_size; i++) {
3009 if (c == cp932ext2_ucs_table[i]) {
3010 s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
3011 break;
3012 }
3013 }
3014 }
3015
3016 if (c == 0) {
3017 s = 0;
3018 } else if (s <= 0) {
3019 s = -1;
3020 }
3021 }
3022
3023 if (s >= 0) {
3024 if (s < 0x80) { /* ASCII */
3025 if ((filter->status & 0xff00) == 0x500) {
3026 CK((*filter->output_function)(0x0f, filter->data)); /* SO */
3027 filter->status = 0;
3028 } else if ((filter->status & 0xff00) != 0) {
3029 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
3030 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
3031 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
3032 filter->status = 0;
3033 }
3034 CK((*filter->output_function)(s, filter->data));
3035 } else if (s >= 0xa0 && s < 0xe0) { /* X 0201 kana */
3036 if ((filter->status & 0xff00) != 0x500) {
3037 CK((*filter->output_function)(0x0e, filter->data)); /* SI */
3038 filter->status = 0x500;
3039 }
3040 CK((*filter->output_function)(s - 0x80, filter->data));
3041 } else if (s <= 0x927E) { /* X 0208 */
3042 if ((filter->status & 0xff00) == 0x500) {
3043 CK((*filter->output_function)(0x0f, filter->data)); /* SO */
3044 filter->status = 0;
3045 }
3046 if ((filter->status & 0xff00) != 0x200) {
3047 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
3048 CK((*filter->output_function)(0x24, filter->data)); /* '$' */
3049 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
3050 filter->status = 0x200;
3051 }
3052 CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
3053 CK((*filter->output_function)(s & 0xff, filter->data));
3054 } else if (s < 0x10000) { /* X0212 */
3055 CK(mbfl_filt_conv_illegal_output(c, filter));
3056 } else { /* X 0201 latin */
3057 if ((filter->status & 0xff00) == 0x500) {
3058 CK((*filter->output_function)(0x0f, filter->data)); /* SO */
3059 filter->status = 0;
3060 }
3061 if ((filter->status & 0xff00) != 0x400) {
3062 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
3063 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
3064 CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
3065 }
3066 filter->status = 0x400;
3067 CK((*filter->output_function)(s & 0x7f, filter->data));
3068 }
3069 } else {
3070 CK(mbfl_filt_conv_illegal_output(c, filter));
3071 }
3072
3073 return 0;
3074 }
3075
/* Flush handler for the wchar -> CP50222 filter.
 *
 * Brings the output stream back to ASCII so that nothing is left shifted or
 * escaped, clears the filter status, and then forwards the flush request to
 * the next stage if one is registered. Returns 0 when it runs to completion. */
static int mbfl_filt_conv_wchar_cp50222_flush(mbfl_convert_filter *filter)
{
	/* Only the second status byte records which character set is active */
	const int active_charset = filter->status & 0xff00;

	if (active_charset == 0x500) {
		/* 0x500 is the status used while JIS X 0201 kana is shifted in by a
		 * 0x0E byte; a single 0x0F byte (ASCII SI) undoes that shift */
		CK((*filter->output_function)(0x0f, filter->data));
	} else if (active_charset != 0) {
		/* Every other non-ASCII set is left with the sequence ESC ( B */
		CK((*filter->output_function)(0x1b, filter->data));
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}
	filter->status = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
3094
/* Character set selectors stored in `*state` (decoder) and `buf->state`
 * (encoders) by the CP50220/CP50221/CP50222 converters below */
#define ASCII 0 /* ESC ( B, ESC ( H, or the 0x0F byte */
#define JISX_0201_LATIN 1 /* ESC ( J */
#define JISX_0201_KANA 2 /* ESC ( I, or the 0x0E byte */
#define JISX_0208 3 /* ESC $ @, ESC $ B, ESC $ ( @ or ESC $ ( B */
#define JISX_0212 4 /* ESC $ ( D */
3100
/* Decode CP50220/CP50221/CP50222 text into Unicode codepoints.
 *
 * in, in_len   cursor into the input and number of bytes remaining; both are
 *              updated before returning to reflect what was consumed
 * buf, bufsize output array of codepoints; must hold at least 3 entries
 * state        currently selected character set (ASCII, JISX_0201_LATIN,
 *              JISX_0201_KANA, JISX_0208 or JISX_0212); updated when an escape
 *              sequence or a 0x0E/0x0F byte is seen
 *
 * Returns the number of codepoints stored in `buf`. Input which cannot be
 * decoded is reported in-band as MBFL_BAD_INPUT. */
static size_t mb_cp5022x_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	/* A bad escape sequence emits up to 3 codepoints at once (see below) */
	ZEND_ASSERT(bufsize >= 3);

	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c == 0x1B) {
			/* Escape sequence */
			if ((e - p) < 2) {
				/* Fewer than two bytes follow ESC, so the sequence is truncated */
				*out++ = MBFL_BAD_INPUT;
				/* Duplicate error-handling behavior of legacy code */
				if (p < e && (*p == '(' || *p == '$'))
					p++;
				continue;
			}
			unsigned char c2 = *p++;
			if (c2 == '$') {
				unsigned char c3 = *p++;
				if (c3 == '@' || c3 == 'B') {
					*state = JISX_0208;
				} else if (c3 == '(') {
					/* Four-byte form: ESC $ ( F */
					if (p == e) {
						*out++ = MBFL_BAD_INPUT;
						break;
					}
					unsigned char c4 = *p++;
					if (c4 == '@' || c4 == 'B') {
						*state = JISX_0208;
					} else if (c4 == 'D') {
						*state = JISX_0212;
					} else {
						if ((limit - out) < 3) {
							/* Not enough room for the 3 codepoints emitted below;
							 * rewind to the ESC byte so it is seen again on the next call */
							p -= 4;
							break;
						}
						*out++ = MBFL_BAD_INPUT;
						*out++ = '$';
						*out++ = '(';
						/* Step back so the unrecognized final byte is decoded as ordinary input */
						p--;
					}
				} else {
					if ((limit - out) < 2) {
						/* Rewind to the ESC byte; it is retried on the next call */
						p -= 3;
						break;
					}
					*out++ = MBFL_BAD_INPUT;
					*out++ = '$';
					/* The unrecognized third byte is decoded as ordinary input */
					p--;
				}
			} else if (c2 == '(') {
				unsigned char c3 = *p++;
				if (c3 == 'B' || c3 == 'H') {
					*state = ASCII;
				} else if (c3 == 'J') {
					*state = JISX_0201_LATIN;
				} else if (c3 == 'I') {
					*state = JISX_0201_KANA;
				} else {
					if ((limit - out) < 2) {
						/* Rewind to the ESC byte; it is retried on the next call */
						p -= 3;
						break;
					}
					*out++ = MBFL_BAD_INPUT;
					*out++ = '(';
					/* The unrecognized third byte is decoded as ordinary input */
					p--;
				}
			} else {
				/* ESC followed by something other than '$' or '(': report the ESC
				 * and decode the following byte as ordinary input */
				*out++ = MBFL_BAD_INPUT;
				p--;
			}
		} else if (c == 0xE) {
			/* 0x0E switches to JIS X 0201 kana */
			*state = JISX_0201_KANA;
		} else if (c == 0xF) {
			/* 0x0F switches back to ASCII */
			*state = ASCII;
		} else if (*state == JISX_0201_LATIN && c == 0x5C) { /* YEN SIGN */
			*out++ = 0xA5;
		} else if (*state == JISX_0201_LATIN && c == 0x7E) { /* OVER LINE */
			*out++ = 0x203E;
		} else if (*state == JISX_0201_KANA && c > 0x20 && c < 0x60) {
			/* 7-bit kana byte 0x21..0x5F maps to U+FF61..U+FF9F */
			*out++ = 0xFF40 + c;
		} else if (*state >= JISX_0208 && c > 0x20 && c <= 0x97) {
			/* First byte of a 2-byte character (JIS X 0208 or JIS X 0212) */
			if (p == e) {
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			if (c2 > 0x20 && c2 < 0x7F) {
				/* Linear index: 94 cells per row, both bytes based at 0x21 */
				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
				uint32_t w = 0;
				if (*state == JISX_0208) {
					/* The cp932ext1 range is tested before the plain JIS X 0208 table */
					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
					} else if (s < jisx0208_ucs_table_size) {
						w = jisx0208_ucs_table[s];
					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
					} else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
						w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
					} else if (s >= 94*94 && s < 114*94) {
						/* MicroSoft extension */
						/* Indexes 94*94 .. 114*94-1 map linearly onto U+E000 and up */
						w = s - 94*94 + 0xE000;
					}
					/* A zero result means no mapping was found */
					if (!w)
						w = MBFL_BAD_INPUT;
				} else {
					if (s < jisx0212_ucs_table_size) {
						w = jisx0212_ucs_table[s];
					}
					if (!w)
						w = MBFL_BAD_INPUT;
				}
				*out++ = w;
			} else {
				/* Second byte is outside 0x21..0x7E */
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (c < 0x80) {
			*out++ = c;
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* 8-bit kana byte 0xA1..0xDF maps to U+FF61..U+FF9F */
			*out++ = 0xFEC0 + c;
		} else {
			*out++ = MBFL_BAD_INPUT;
		}
	}

	*in_len = e - p;
	*in = p;
	return out - buf;
}
3233
lookup_wchar(uint32_t w)3234 static unsigned int lookup_wchar(uint32_t w)
3235 {
3236 unsigned int s = 0;
3237
3238 if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
3239 s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
3240 } else if (w == 0x203E) { /* OVERLINE */
3241 s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
3242 } else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
3243 s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
3244 } else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
3245 s = ucs_i_jis_table[w - ucs_i_jis_table_min];
3246 } else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
3247 s = ucs_r_jis_table[w - ucs_r_jis_table_min];
3248 } else if (w >= 0xE000 && w <= 0xE757) {
3249 /* Private Use Area codepoints */
3250 s = w - 0xE000;
3251 s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
3252 }
3253
3254 if (!s) {
3255 if (w == 0xA5) { /* YEN SIGN */
3256 s = 0x1005C;
3257 } else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
3258 s = 0x2140;
3259 } else if (w == 0x2225) { /* PARALLEL TO */
3260 s = 0x2142;
3261 } else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
3262 s = 0x215D;
3263 } else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
3264 s = 0x2171;
3265 } else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
3266 s = 0x2172;
3267 } else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
3268 s = 0x224C;
3269 } else if (w == 0) {
3270 return 0;
3271 }
3272 }
3273
3274 /* Above, we do a series of lookups in `ucs_*_jis_table` to find a
3275 * corresponding kuten code for this Unicode codepoint
3276 * If we get zero, that means the codepoint is not in JIS X 0208
3277 * On the other hand, if we get a result with the high bits set on both
3278 * upper and lower bytes, that is not a code in JIS X 0208 but rather
3279 * in JIS X 0213
3280 * In either case, check if this codepoint is one of the extensions added
3281 * to JIS X 0208 by MicroSoft (to make CP932) */
3282 if (!s || s >= 0x8080) {
3283 for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
3284 if (w == cp932ext1_ucs_table[i]) {
3285 return (((i / 94) + (cp932ext1_ucs_table_min / 94) + 0x21) << 8) + (i % 94) + 0x21;
3286 }
3287 }
3288
3289 for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
3290 if (w == cp932ext2_ucs_table[i]) {
3291 return (((i / 94) + (cp932ext2_ucs_table_min / 94) + 0x21) << 8) + (i % 94) + 0x21;
3292 }
3293 }
3294 }
3295
3296 return s;
3297 }
3298
/* Declared early because mb_wchar_to_cp50220 passes it to MB_CONVERT_ERROR */
static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);

/* Encode `len` codepoints from `in` as CP50220 and append them to `buf`.
 *
 * Unlike the CP50221 encoder, every codepoint first goes through
 * mb_convert_kana_codepoint() with MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE,
 * which may merge it with the following codepoint (presumably folding
 * halfwidth katakana to fullwidth -- confirm against that helper).
 *
 * `buf->state` holds the active character set in its low 8 bits. If the last
 * codepoint of a non-final buffer is in U+FF61..U+FF9F, it is parked in the
 * bits above that and reprocessed at the start of the next call.
 *
 * When `end` is true and a non-ASCII set is active, ESC ( B is appended. */
static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;

	if (buf->state & 0xFFFF00) {
		/* Reprocess cached codepoint */
		w = buf->state >> 8;
		buf->state &= 0xFF;
		/* Jumps into the loop body; `len` still counts only the codepoints in `in` */
		goto reprocess_codepoint;
	}

	while (len--) {
		w = *in++;
reprocess_codepoint:

		if (w >= 0xFF61 && w <= 0xFF9F && !len && !end) {
			/* This codepoint may need to combine with the next one,
			 * but the 'next one' will come in a separate buffer */
			buf->state |= w << 8;
			break;
		}

		bool consumed = false;
		/* The following codepoint (0 when there is none) is passed as lookahead */
		w = mb_convert_kana_codepoint(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
		if (consumed) {
			/* Two successive codepoints were converted into one */
			in++; len--; consumed = false;
		}

		unsigned int s = lookup_wchar(w);

		if (!s && w) {
			/* No mapping. NOTE(review): the CP50221 encoder is what gets handed to
			 * MB_CONVERT_ERROR here, not this function -- presumably intentional, confirm */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp50221);
		} else if (s < 0x80) {
			/* ASCII */
			/* Up to 3 escape bytes + 1 data byte, plus `len` for the remaining input */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA0 && s < 0xE0) {
			/* JISX 0201 Kana */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			/* Emitted in 7-bit form */
			out = mb_convert_buf_add(out, s - 0x80);
		} else if (s <= 0x927E) {
			/* JISX 0208 Kanji */
			/* Up to 3 escape bytes + 2 data bytes, plus `len` for the remaining input */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else if (s >= 0x10000) {
			/* JISX 0201 Latin; we 'tag' these by adding 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else {
			/* Code above 0x927E but below 0x10000: not representable here */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp50221);
		}
	}

	if (end && buf->state != ASCII) {
		/* Finish in ASCII */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
3382
/* Encode `len` codepoints from `in` as CP50221 and append them to `buf`.
 *
 * Every character set, including JIS X 0201 kana, is selected with a
 * three-byte escape sequence; `buf->state` remembers which set is active so
 * that a sequence is only emitted when the set changes.
 *
 * When `end` is true and a non-ASCII set is active, ESC ( B is appended. */
static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t codepoint = *in++;
		unsigned int jis = lookup_wchar(codepoint);

		if (jis == 0 && codepoint != 0) {
			/* No mapping for this codepoint */
			MB_CONVERT_ERROR(buf, out, limit, codepoint, mb_wchar_to_cp50221);
			continue;
		}

		if (jis < 0x80) {
			/* ASCII: at most 3 escape bytes + 1 data byte, plus the remaining input */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, jis);
			continue;
		}

		if (jis >= 0xA0 && jis < 0xE0) {
			/* JIS X 0201 kana, written in 7-bit form */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, jis - 0x80);
			continue;
		}

		if (jis <= 0x927E) {
			/* Two-byte JIS X 0208 code: at most 3 escape bytes + 2 data bytes */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (jis >> 8) & 0xFF, jis & 0xFF);
			continue;
		}

		if (jis >= 0x10000) {
			/* JIS X 0201 Latin; lookup_wchar marks these by adding 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, jis & 0x7F);
		} else {
			/* Above 0x927E but below 0x10000: cannot be encoded */
			MB_CONVERT_ERROR(buf, out, limit, codepoint, mb_wchar_to_cp50221);
		}
	}

	if (end && buf->state != ASCII) {
		/* Leave the output in ASCII */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
3439
/* Encode `len` codepoints from `in` as CP50222 and append them to `buf`.
 *
 * JIS X 0201 kana is entered with a single 0x0E byte and left with a single
 * 0x0F byte; the other character sets are selected with three-byte escape
 * sequences. `buf->state` remembers which set is active.
 *
 * When `end` is true the output is brought back to ASCII: with 0x0F if kana
 * is active, otherwise with ESC ( B if any other non-ASCII set is active. */
static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t codepoint = *in++;
		unsigned int jis = lookup_wchar(codepoint);

		if (jis == 0 && codepoint != 0) {
			/* No mapping for this codepoint */
			MB_CONVERT_ERROR(buf, out, limit, codepoint, mb_wchar_to_cp50222);
			continue;
		}

		if (jis < 0x80) {
			/* ASCII */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != ASCII) {
				if (buf->state == JISX_0201_KANA) {
					/* Kana is left with one byte, no escape sequence needed */
					out = mb_convert_buf_add(out, 0xF);
				} else {
					out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				}
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, jis);
			continue;
		}

		if (jis >= 0xA0 && jis < 0xE0) {
			/* JIS X 0201 kana: at most 1 shift byte + 1 data byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add(out, 0xE);
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, jis - 0x80);
			continue;
		}

		if (jis <= 0x927E) {
			/* Two-byte JIS X 0208 code: 1 shift byte + 3 escape bytes + 2 data bytes at most */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
			if (buf->state == JISX_0201_KANA) {
				/* End the kana shift before the escape sequence */
				out = mb_convert_buf_add(out, 0xF);
			}
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (jis >> 8) & 0xFF, jis & 0xFF);
			continue;
		}

		if (jis >= 0x10000) {
			/* JIS X 0201 Latin; lookup_wchar marks these by adding 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state == JISX_0201_KANA) {
				out = mb_convert_buf_add(out, 0xF);
			}
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, jis & 0x7F);
		} else {
			/* Above 0x927E but below 0x10000: cannot be encoded */
			MB_CONVERT_ERROR(buf, out, limit, codepoint, mb_wchar_to_cp50222);
		}
	}

	if (end && buf->state == JISX_0201_KANA) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
		out = mb_convert_buf_add(out, 0xF);
	} else if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
3510
/* Character set selectors used by the ISO-2022-JP-MS decoders below.
 * All of them leave the low 4 bits clear; the legacy decoder keeps its
 * position inside an escape sequence or 2-byte character in those bits.
 * Note these names have no underscore after "JISX", unlike the CP5022x
 * constants (JISX_0201_KANA etc.) defined earlier, which have different values. */
#define ASCII 0 /* same value as the earlier definition of ASCII */
#define JISX0201_KANA 0x20 /* selected by ESC ( I */
#define JISX0208_KANJI 0x80 /* selected by ESC $ @, ESC $ B, ESC $ ( @ or ESC $ ( B */
#define UDC 0xA0 /* selected by ESC $ ( ? ; decoded to U+E000 and up */
3515
/* Legacy byte-at-a-time decoder: ISO-2022-JP-MS -> Unicode.
 *
 * `c` is the next input byte. Decoded codepoints (or MBFL_BAD_INPUT) are sent
 * to filter->output_function.
 *
 * filter->status is split in two: the bits above the low nibble hold the
 * selected character set (0, JISX0201_KANA, JISX0208_KANJI or UDC), and the
 * low nibble holds the parsing position:
 *   0 = between characters       1 = expecting 2nd byte of a 2-byte character
 *   2 = after ESC                3 = after ESC $
 *   4 = after ESC $ (            5 = after ESC (
 *
 * Returns 0 when the byte was processed. */
static int mbfl_filt_conv_2022jpms_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, w;

	switch (filter->status & 0xF) {
	case 0:
		if (c == 0x1B) {
			/* Start of an escape sequence; move to position 2 */
			filter->status += 2;
		} else if (filter->status == JISX0201_KANA && c > 0x20 && c < 0x60) {
			/* 7-bit kana byte 0x21..0x5F maps to U+FF61..U+FF9F */
			CK((*filter->output_function)(0xFF40 + c, filter->data));
		} else if ((filter->status == JISX0208_KANJI || filter->status == UDC) && c > 0x20 && c < 0x80) {
			/* First byte of a 2-byte character; remember it and wait for the second */
			filter->cache = c;
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) { /* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xA0 && c < 0xE0) { /* Kana */
			/* 8-bit kana byte 0xA1..0xDF maps to U+FF61..U+FF9F */
			CK((*filter->output_function)(0xFEC0 + c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* Kanji, second byte */
	case 1:
		w = 0;
		/* Back to position 0, keeping the selected character set */
		filter->status &= ~0xF;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7F) {
			/* Linear index: 94 cells per row, both bytes based at 0x21 */
			s = ((c1 - 0x21) * 94) + c - 0x21;
			if (filter->status == JISX0208_KANJI) {
				/* A few cells are mapped to fixed codepoints without consulting the tables */
				if (s <= 137) {
					if (s == 31) {
						w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
					} else if (s == 32) {
						w = 0xFF5E; /* FULLWIDTH TILDE */
					} else if (s == 33) {
						w = 0x2225; /* PARALLEL TO */
					} else if (s == 60) {
						w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
					} else if (s == 80) {
						w = 0xFFE0; /* FULLWIDTH CENT SIGN */
					} else if (s == 81) {
						w = 0xFFE1; /* FULLWIDTH POUND SIGN */
					} else if (s == 137) {
						w = 0xFFE2; /* FULLWIDTH NOT SIGN */
					}
				}

				if (w == 0) {
					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { /* vendor ext1 (13ku) */
						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
					} else if (s >= 0 && s < jisx0208_ucs_table_size) {
						w = jisx0208_ucs_table[s];
					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) { /* vendor ext2 (89ku - 92ku) */
						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
					}
				}

				/* Still nothing: there is no mapping for this cell */
				if (w <= 0) {
					w = MBFL_BAD_INPUT;
				}
			} else {
				/* UDC: first bytes 0x21..0x34 (20 rows) map linearly onto U+E000 and up */
				if (c1 > 0x20 && c1 < 0x35) {
					w = 0xE000 + ((c1 - 0x21) * 94) + c - 0x21;
				} else {
					w = MBFL_BAD_INPUT;
				}
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			/* Second byte is outside 0x21..0x7E */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC */
	case 2:
		if (c == '$') {
			/* Position 3 */
			filter->status++;
		} else if (c == '(') {
			/* Position 5 */
			filter->status += 3;
		} else {
			/* Not a recognized sequence; abandon it, keeping the selected character set */
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ */
	case 3:
		if (c == '@' || c == 'B') {
			filter->status = JISX0208_KANJI;
		} else if (c == '(') {
			/* Position 4 */
			filter->status++;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ ( */
	case 4:
		if (c == '@' || c == 'B') {
			filter->status = JISX0208_KANJI;
		} else if (c == '?') {
			filter->status = UDC;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC ( */
	case 5:
		if (c == 'B' || c == 'J') {
			/* Both finals select status 0, which is decoded as ASCII */
			filter->status = 0;
		} else if (c == 'I') {
			filter->status = JISX0201_KANA;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
	}

	return 0;
}
3640
/* Flush handler for the ISO-2022-JP-MS -> wchar filter.
 *
 * If the input stopped part-way through an escape sequence or a 2-byte
 * character, one MBFL_BAD_INPUT marker is emitted for the unfinished item.
 * The filter status is then cleared and the flush is forwarded to the next
 * stage, if one is registered. Always returns 0. */
static int mbfl_filt_conv_2022jpms_wchar_flush(mbfl_convert_filter *filter)
{
	/* The low nibble is non-zero only while the decoder is in the middle of something */
	const int parse_position = filter->status & 0xF;

	if (parse_position != 0) {
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}
	filter->status = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
3654
/* Linear index of the Shift-JIS byte pair (c1, c2): 188 cells per lead byte.
 * Lead bytes above 0x9F are counted from 0xC1, the others from 0x81; trail
 * bytes above 0x7E are shifted down by one extra because 0x7F is not used. */
#define sjistoidx(c1, c2) \
	(((c1) > 0x9f) ? (((c1) - 0xc1) * 188 + (c2) - (((c2) > 0x7e) ? 0x41 : 0x40)) : (((c1) - 0x81) * 188 + (c2) - (((c2) > 0x7e) ? 0x41 : 0x40)))
/* First and second byte of the JIS code for a linear index (94 cells per row) */
#define idxtojis1(c) (((c) / 94) + 0x21)
#define idxtojis2(c) (((c) % 94) + 0x21)

/* Convert `c`, an offset counted from Shift-JIS 0xFA40, into a two-byte JIS
 * code (first byte in bits 8 and up, second byte in the low 8 bits).
 *
 * The offsets fall into three consecutive segments, each of which is moved to
 * a different Shift-JIS position before the JIS code is derived:
 *   starting at 0xFA40 -> relocated to start at 0xEEEF
 *   starting at 0xFA55 -> relocated to start at 0xEEFA
 *   starting at 0xFA5C -> relocated to start at 0xED40 */
static int cp932ext3_cp932ext2_jis(int c)
{
	const int first_segment = sjistoidx(0xfa, 0x40);
	const int second_segment = sjistoidx(0xfa, 0x55);
	const int third_segment = sjistoidx(0xfa, 0x5c);

	int idx = first_segment + c;
	int displacement = 0;

	/* Test the highest segment first, since each bound includes those after it */
	if (idx >= third_segment) {
		displacement = third_segment - sjistoidx(0xed, 0x40);
	} else if (idx >= second_segment) {
		displacement = second_segment - sjistoidx(0xee, 0xfa);
	} else if (idx >= first_segment) {
		displacement = first_segment - sjistoidx(0xee, 0xef);
	}
	idx -= displacement;

	return (idxtojis1(idx) << 8) | idxtojis2(idx);
}
3673
/* Legacy codepoint-at-a-time encoder: Unicode -> ISO-2022-JP-MS.
 *
 * `c` is the codepoint to encode. The resulting bytes, preceded by an escape
 * sequence whenever the character set changes, are sent to
 * filter->output_function. Bits 8-15 of filter->status record the character
 * set currently selected in the output:
 *   0      ASCII                       (ESC ( B)
 *   0x100  JIS X 0201 kana             (ESC ( I)
 *   0x200  JIS X 0208                  (ESC $ B)
 *   0x800  user-defined characters     (ESC $ ( ?)
 *
 * Codepoints which cannot be encoded are handed to
 * mbfl_filt_conv_illegal_output(). Returns 0 when the codepoint was processed. */
static int mbfl_filt_conv_wchar_2022jpms(int c, mbfl_convert_filter *filter)
{
	int c1, c2, s1 = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c < (0xE000 + 20*94)) {
		/* Private User Area (95ku - 114ku) */
		s1 = c - 0xE000;
		c1 = (s1 / 94) + 0x7f;
		c2 = (s1 % 94) + 0x21;
		s1 = (c1 << 8) | c2;
	}

	if (s1 <= 0) {
		/* Individual codepoints which the tables above did not resolve */
		if (c == 0xA5) { /* YEN SIGN */
			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
		} else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
		} else if (c == 0x2225) { /* PARALLEL TO */
			s1 = 0x2142;
		} else if (c == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215d;
		} else if (c == 0xFFE0) { /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
		} else if (c == 0xFFE1) { /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
		} else if (c == 0xFFE2) { /* FULLWIDTH NOT SIGN */
			s1 = 0x224C;
		}
	}

	if ((s1 <= 0) || (s1 >= 0xa1a1)) { /* not found or X 0212 */
		/* Fall back to a linear search of the CP932 extension tables */
		s1 = -1;
		for (c1 = 0; c1 < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; c1++) {
			if (c == cp932ext1_ucs_table[c1]) {
				s1 = (((c1 / 94) + 0x2D) << 8) + (c1 % 94) + 0x21;
				break;
			}
		}

		if (s1 <= 0) {
			for (c1 = 0; c1 < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; c1++) {
				if (c == cp932ext3_ucs_table[c1]) {
					/* Position in the ext3 table is converted to a JIS code */
					s1 = cp932ext3_cp932ext2_jis(c1);
					break;
				}
			}
		}

		/* U+0000 is always encodable, as a NUL byte */
		if (c == 0) {
			s1 = 0;
		}
	}

	if (s1 >= 0) {
		if (s1 < 0x80) { /* latin */
			if (filter->status & 0xFF00) {
				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('B', filter->data));
			}
			CK((*filter->output_function)(s1, filter->data));
			filter->status = 0;
		} else if (s1 > 0xA0 && s1 < 0xE0) { /* kana */
			if ((filter->status & 0xFF00) != 0x100) {
				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('I', filter->data));
			}
			filter->status = 0x100;
			/* Emitted in 7-bit form */
			CK((*filter->output_function)(s1 & 0x7F, filter->data));
		} else if (s1 < 0x7E7F) { /* X 0208 */
			if ((filter->status & 0xFF00) != 0x200) {
				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
				CK((*filter->output_function)('$', filter->data));
				CK((*filter->output_function)('B', filter->data));
			}
			filter->status = 0x200;
			CK((*filter->output_function)((s1 >> 8) & 0xFF, filter->data));
			CK((*filter->output_function)(s1 & 0x7F, filter->data));
		} else if (s1 < 0x927F) { /* UDC */
			if ((filter->status & 0xFF00) != 0x800) {
				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
				CK((*filter->output_function)('$', filter->data));
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('?', filter->data));
			}
			filter->status = 0x800;
			/* High bytes 0x7F..0x92 are rebased to 0x21..0x34 for the UDC set */
			CK((*filter->output_function)(((s1 >> 8) - 0x5E) & 0x7F, filter->data));
			CK((*filter->output_function)(s1 & 0x7F, filter->data));
		} else {
			/* Code outside every supported range: report it instead of
			 * silently dropping the character (mb_wchar_to_iso2022jpms
			 * treats this case as an error too) */
			CK(mbfl_filt_conv_illegal_output(c, filter));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
3778
/* Flush handler for the wchar -> ISO-2022-JP-MS filter.
 *
 * Ends the output in ASCII, so that encoded strings can be concatenated
 * safely, clears the filter status and forwards the flush to the next stage
 * if one is registered. Returns 0 when it runs to completion. */
static int mbfl_filt_conv_any_2022jpms_flush(mbfl_convert_filter *filter)
{
	/* Bits 8-15 are non-zero whenever a character set other than ASCII is selected */
	const int selected_charset = filter->status & 0xFF00;

	if (selected_charset) {
		CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}
	filter->status = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
3795
mb_iso2022jpms_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)3796 static size_t mb_iso2022jpms_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
3797 {
3798 unsigned char *p = *in, *e = p + *in_len;
3799 uint32_t *out = buf, *limit = buf + bufsize;
3800
3801 while (p < e && out < limit) {
3802 unsigned char c = *p++;
3803
3804 if (c == 0x1B) {
3805 if ((e - p) < 2) {
3806 *out++ = MBFL_BAD_INPUT;
3807 p = e;
3808 break;
3809 }
3810 unsigned char c2 = *p++;
3811 unsigned char c3 = *p++;
3812
3813 if (c2 == '$') {
3814 if (c3 == '@' || c3 == 'B') {
3815 *state = JISX0208_KANJI;
3816 } else if (c3 == '(' && p < e) {
3817 unsigned char c4 = *p++;
3818
3819 if (c4 == '@' || c4 == 'B') {
3820 *state = JISX0208_KANJI;
3821 } else if (c4 == '?') {
3822 *state = UDC;
3823 } else {
3824 *out++ = MBFL_BAD_INPUT;
3825 }
3826 } else {
3827 *out++ = MBFL_BAD_INPUT;
3828 }
3829 } else if (c2 == '(') {
3830 if (c3 == 'B' || c3 == 'J') {
3831 *state = ASCII;
3832 } else if (c3 == 'I') {
3833 *state = JISX0201_KANA;
3834 } else {
3835 *out++ = MBFL_BAD_INPUT;
3836 }
3837 } else {
3838 p--;
3839 *out++ = MBFL_BAD_INPUT;
3840 }
3841 } else if (*state == JISX0201_KANA && c >= 0x21 && c <= 0x5F) {
3842 *out++ = 0xFF40 + c;
3843 } else if ((*state == JISX0208_KANJI || *state == UDC) && c >= 0x21 && c <= 0x7F) {
3844 if (p == e) {
3845 *out++ = MBFL_BAD_INPUT;
3846 break;
3847 }
3848 unsigned char c2 = *p++;
3849 unsigned int w = 0;
3850
3851 if (c2 >= 0x21 && c2 <= 0x7E) {
3852 unsigned int s = ((c - 0x21) * 94) + c2 - 0x21;
3853 if (*state == JISX0208_KANJI) {
3854 if (s <= 137) {
3855 if (s == 31) {
3856 w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
3857 } else if (s == 32) {
3858 w = 0xFF5E; /* FULLWIDTH TILDE */
3859 } else if (s == 33) {
3860 w = 0x2225; /* PARALLEL TO */
3861 } else if (s == 60) {
3862 w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
3863 } else if (s == 80) {
3864 w = 0xFFE0; /* FULLWIDTH CENT SIGN */
3865 } else if (s == 81) {
3866 w = 0xFFE1; /* FULLWIDTH POUND SIGN */
3867 } else if (s == 137) {
3868 w = 0xFFE2; /* FULLWIDTH NOT SIGN */
3869 }
3870 }
3871
3872 if (!w) {
3873 if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
3874 w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
3875 } else if (s < jisx0208_ucs_table_size) {
3876 w = jisx0208_ucs_table[s];
3877 } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
3878 w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
3879 }
3880 }
3881 } else if (c >= 0x21 && c <= 0x34) {
3882 w = 0xE000 + ((c - 0x21) * 94) + c2 - 0x21;
3883 }
3884
3885 *out++ = w ? w : MBFL_BAD_INPUT;
3886 } else {
3887 *out++ = MBFL_BAD_INPUT;
3888 }
3889 } else if (c <= 0x7F) {
3890 *out++ = c;
3891 } else if (c >= 0xA1 && c <= 0xDF) {
3892 *out++ = 0xFEC0 + c;
3893 } else {
3894 *out++ = MBFL_BAD_INPUT;
3895 }
3896 }
3897
3898 *in_len = e - p;
3899 *in = p;
3900 return out - buf;
3901 }
3902
/* Encode `len` codepoints from `in` as ISO-2022-JP-MS and append them to `buf`.
 *
 * `buf->state` remembers the character set currently selected in the output
 * (ASCII, JISX0201_KANA, JISX0208_KANJI or UDC); an escape sequence is only
 * emitted when the set changes. Codepoints without a mapping are reported
 * through MB_CONVERT_ERROR.
 *
 * When `end` is true and a non-ASCII set is selected, ESC ( B is appended. */
static void mb_wchar_to_iso2022jpms(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t codepoint = *in++;
		unsigned int jis = 0;

		if (codepoint >= ucs_a1_jis_table_min && codepoint < ucs_a1_jis_table_max) {
			jis = ucs_a1_jis_table[codepoint - ucs_a1_jis_table_min];
		} else if (codepoint >= ucs_a2_jis_table_min && codepoint < ucs_a2_jis_table_max) {
			jis = ucs_a2_jis_table[codepoint - ucs_a2_jis_table_min];
		} else if (codepoint >= ucs_i_jis_table_min && codepoint < ucs_i_jis_table_max) {
			jis = ucs_i_jis_table[codepoint - ucs_i_jis_table_min];
		} else if (codepoint >= ucs_r_jis_table_min && codepoint < ucs_r_jis_table_max) {
			jis = ucs_r_jis_table[codepoint - ucs_r_jis_table_min];
		} else if (codepoint >= 0xE000 && codepoint < (0xE000 + 20*94)) {
			/* Private User Area (95ku - 114ku): 94 codepoints per row, first row at 0x7F */
			unsigned int pua_offset = codepoint - 0xE000;
			jis = (((pua_offset / 94) + 0x7F) << 8) | ((pua_offset % 94) + 0x21);
		}

		if (jis == 0) {
			/* Individual codepoints which the tables above did not resolve */
			switch (codepoint) {
			case 0xA5: /* YEN SIGN */
				jis = 0x216F; /* FULLWIDTH YEN SIGN */
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				jis = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				jis = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				jis = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				jis = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				jis = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				jis = 0x224C;
				break;
			default:
				break;
			}
		}

		/* Codes from 0xA1A1 up are JIS X 0212, which this encoding cannot carry */
		if (jis >= 0xA1A1) {
			jis = 0;
		}

		if (jis == 0 && codepoint != 0) {
			/* Last resort: linear search of the CP932 extension tables */
			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (cp932ext1_ucs_table[i] == codepoint) {
					jis = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					break;
				}
			}

			if (jis == 0) {
				for (int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
					if (cp932ext3_ucs_table[i] == codepoint) {
						jis = cp932ext3_cp932ext2_jis(i);
						break;
					}
				}
			}
		}

		if (jis == 0 && codepoint != 0) {
			/* No mapping; re-establish room for the rest of the input afterwards */
			MB_CONVERT_ERROR(buf, out, limit, codepoint, mb_wchar_to_iso2022jpms);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		}

		if (jis <= 0x7F) {
			/* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, jis);
			continue;
		}

		if (jis >= 0xA1 && jis <= 0xDF) {
			/* JIS X 0201 kana, written in 7-bit form */
			if (buf->state != JISX0201_KANA) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX0201_KANA;
			}
			out = mb_convert_buf_add(out, jis & 0x7F);
			continue;
		}

		if (jis <= 0x7E7E) {
			/* Two-byte JIS X 0208 code */
			if (buf->state == JISX0208_KANJI) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX0208_KANJI;
			}
			out = mb_convert_buf_add2(out, (jis >> 8) & 0xFF, jis & 0x7F);
			continue;
		}

		if (jis < 0x927F) {
			/* User-defined character; high bytes 0x7F..0x92 are rebased to 0x21..0x34 */
			if (buf->state == UDC) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', '?');
				buf->state = UDC;
			}
			out = mb_convert_buf_add2(out, ((jis >> 8) - 0x5E) & 0x7F, jis & 0x7F);
		} else {
			/* Code is outside every range this encoding supports */
			MB_CONVERT_ERROR(buf, out, limit, codepoint, mb_wchar_to_iso2022jpms);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	if (end && buf->state != ASCII) {
		/* Leave the output in ASCII */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
4013
/* Streaming ISO-2022-KR -> wchar filter; invoked once per input byte `c`.
 *
 * filter->status is used as follows:
 *   - bit 0x10 is set while in KS C 5601 (shifted-out) mode, clear in ASCII mode
 *   - the low nibble is the parser state:
 *       0  expecting the start of a character or control sequence
 *       1  KS C 5601 lead byte is held in filter->cache; expecting the trail byte
 *       2  ESC seen;  3  ESC $ seen;  4  ESC $ ) seen
 *
 * Decoded code points, or MBFL_BAD_INPUT for anything invalid, are handed to
 * filter->output_function. Returns 0; CK() is defined elsewhere and presumably
 * returns early when the output function reports failure. */
static int mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status & 0xf) {
	case 0: {
		int ksc_mode = filter->status & 0x10;

		switch (c) {
		case 0x1b: /* ESC: keep the mode bit, move the low nibble to state 2 */
			filter->status += 2;
			break;
		case 0x0f: /* Shift In: back to ASCII */
			filter->status = 0;
			break;
		case 0x0e: /* Shift Out: switch to KS C 5601 */
			filter->status = 0x10;
			break;
		default:
			if (ksc_mode) {
				if (c > 0x20 && c < 0x7f) {
					/* Lead byte of a 2-byte character; wait for the trail byte */
					filter->cache = c;
					filter->status = 0x11;
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				}
			} else if (c >= 0 && c < 0x80) {
				/* Plain ASCII, including control characters */
				CK((*filter->output_function)(c, filter->data));
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
			break;
		}
		break;
	}

	case 1: { /* trail byte of a KS C 5601 character */
		int lead = filter->cache;
		int w = 0;
		/* Lead bytes 0x21..0x46 are looked up in uhc1_ucs_table, 0x47..0x7E
		 * (except 0x49) in uhc3_ucs_table; any other lead byte is rejected */
		int use_uhc1 = (lead > 0x20 && lead < 0x47);
		int use_uhc3 = (lead >= 0x47 && lead <= 0x7e && lead != 0x49);

		filter->status = 0x10;

		if ((!use_uhc1 && !use_uhc3) || c <= 0x20 || c >= 0x7f) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		if (use_uhc1) {
			/* For lead byte 0x22, trail bytes above 0x65 are left unmapped (w stays 0) */
			if (lead != 0x22 || c <= 0x65) {
				w = (lead - 1)*190 + (c - 0x41) + 0x80;
				ZEND_ASSERT(w < uhc1_ucs_table_size);
				w = uhc1_ucs_table[w];
			}
		} else {
			w = (lead - 0x47)*94 + c - 0x21;
			if (w < uhc3_ucs_table_size) {
				w = uhc3_ucs_table[w];
			} else {
				w = MBFL_BAD_INPUT;
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;
	}

	case 2:   /* ESC    -> expect '$' */
	case 3: { /* ESC $  -> expect ')' */
		int expected = ((filter->status & 0xf) == 2) ? '$' : ')';

		if (c == expected) {
			filter->status++;
		} else {
			/* Abandon the escape sequence but stay in the current shift mode */
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

	case 4: /* ESC $ ) -> expect 'C'; status is cleared to 0 either way */
		filter->status = 0;
		if (c != 'C') {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
4106
/* End-of-input hook for the ISO-2022-KR -> wchar filter.
 *
 * A non-zero low nibble in filter->status means the input stopped in the middle
 * of an escape sequence or a 2-byte character; that is reported as one
 * MBFL_BAD_INPUT. The state is then reset and the downstream flush function,
 * if any, is invoked (its result is ignored). Returns 0. */
static int mbfl_filt_conv_2022kr_wchar_flush(mbfl_convert_filter *filter)
{
	if ((filter->status & 0xF) != 0) {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
4121
/* Streaming wchar -> ISO-2022-KR filter; invoked once per code point `c`.
 *
 * filter->status bits used here:
 *   0x100  the ESC $ ) C designator has already been written
 *   0x10   output is currently shifted out into KS C 5601 mode
 *
 * Code points that cannot be represented go to mbfl_filt_conv_illegal_output().
 * Returns 0. */
static int mbfl_filt_conv_wchar_2022kr(int c, mbfl_convert_filter *filter)
{
	int code = 0;
	int hi, lo;

	/* Write the designator once, ahead of the first converted character */
	if (!(filter->status & 0x100)) {
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('$', filter->data));
		CK((*filter->output_function)(')', filter->data));
		CK((*filter->output_function)('C', filter->data));
		filter->status |= 0x100;
	}

	/* Find the UHC code for this code point, if any table covers it */
	if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
		code = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
	} else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) {
		code = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min];
	} else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) {
		code = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min];
	} else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) {
		code = ucs_i_uhc_table[c - ucs_i_uhc_table_min];
	} else if (c >= ucs_s_uhc_table_min && c < ucs_s_uhc_table_max) {
		code = ucs_s_uhc_table[c - ucs_s_uhc_table_min];
	} else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) {
		code = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min];
	} else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) {
		code = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
	}

	/* Only codes with both bytes >= 0xA1 are usable (this excludes the UHC
	 * extension area); those are converted to 7-bit form. Otherwise fall back
	 * to the raw code point, which is then accepted only if it is ASCII. */
	hi = (code >> 8) & 0xff;
	lo = code & 0xff;
	if (hi >= 0xa1 && lo >= 0xa1) {
		if (code & 0x8000) {
			code -= 0x8080;
		}
	} else {
		code = c;
	}

	if (code <= 0) {
		code = (c == 0) ? 0 : -1;
	} else if (code > 0x8080 || (code >= 0x80 && code < 0x2121)) {
		code = -1;
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x80) {
		/* ASCII: shift in first if we are in KS C 5601 mode */
		if (filter->status & 0x10) {
			CK((*filter->output_function)(0x0f, filter->data)); /* shift in */
			filter->status &= ~0x10;
		}
		CK((*filter->output_function)(code, filter->data));
		return 0;
	}

	/* 2-byte KS C 5601 character: shift out first if we are in ASCII mode */
	if (!(filter->status & 0x10)) {
		CK((*filter->output_function)(0x0e, filter->data)); /* shift out */
		filter->status |= 0x10;
	}
	CK((*filter->output_function)((code >> 8) & 0xff, filter->data));
	CK((*filter->output_function)(code & 0xff, filter->data));

	return 0;
}
4190
/* End-of-output hook for the wchar -> ISO-2022-KR filter.
 *
 * If the low nibble of filter->status is set, MBFL_BAD_INPUT is passed to the
 * output function (that call's result is deliberately not checked here). If the
 * output is still shifted out (bit 0x10), a Shift In byte is written so the
 * stream ends in ASCII mode. Status and cache are then cleared.
 *
 * Returns the result of the downstream flush function when one is set,
 * otherwise 0. */
static int mbfl_filt_conv_any_2022kr_flush(mbfl_convert_filter *filter)
{
	if ((filter->status & 0xF) != 0) {
		/* Escape sequence or 2-byte character was truncated */
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}

	if ((filter->status & 0x10) != 0) {
		/* Return to ASCII */
		CK((*filter->output_function)(0x0f, filter->data)); /* shift in */
	}

	filter->cache = 0;
	filter->status = 0;

	return filter->flush_function ? (*filter->flush_function)(filter->data) : 0;
}
4210
4211 #define ASCII 0
4212 #define KSC5601 1
4213
mb_iso2022kr_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)4214 static size_t mb_iso2022kr_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
4215 {
4216 unsigned char *p = *in, *e = p + *in_len;
4217 uint32_t *out = buf, *limit = buf + bufsize;
4218
4219 while (p < e && out < limit) {
4220 unsigned char c = *p++;
4221
4222 if (c == 0x1B) {
4223 if ((e - p) < 3) {
4224 *out++ = MBFL_BAD_INPUT;
4225 if (p < e && *p++ == '$') {
4226 if (p < e) {
4227 p++;
4228 }
4229 }
4230 continue;
4231 }
4232 unsigned char c2 = *p++;
4233 unsigned char c3 = *p++;
4234 unsigned char c4 = *p++;
4235 if (c2 == '$' && c3 == ')' && c4 == 'C') {
4236 *state = ASCII;
4237 } else {
4238 if (c3 != ')') {
4239 p--;
4240 if (c2 != '$')
4241 p--;
4242 }
4243 *out++ = MBFL_BAD_INPUT;
4244 }
4245 } else if (c == 0xF) {
4246 *state = ASCII;
4247 } else if (c == 0xE) {
4248 *state = KSC5601;
4249 } else if (c >= 0x21 && c <= 0x7E && *state == KSC5601) {
4250 if (p == e) {
4251 *out++ = MBFL_BAD_INPUT;
4252 break;
4253 }
4254 unsigned char c2 = *p++;
4255 unsigned int w = 0;
4256
4257 if (c2 < 0x21 || c2 > 0x7E) {
4258 *out++ = MBFL_BAD_INPUT;
4259 continue;
4260 }
4261
4262 if (c < 0x47) {
4263 if (c != 0x22 || c2 <= 0x65) {
4264 w = (c - 1)*190 + c2 - 0x41 + 0x80;
4265 ZEND_ASSERT(w < uhc1_ucs_table_size);
4266 w = uhc1_ucs_table[w];
4267 }
4268 } else if (c != 0x49 && c <= 0x7D) {
4269 w = (c - 0x47)*94 + c2 - 0x21;
4270 ZEND_ASSERT(w < uhc3_ucs_table_size);
4271 w = uhc3_ucs_table[w];
4272 }
4273
4274 if (!w)
4275 w = MBFL_BAD_INPUT;
4276 *out++ = w;
4277 } else if (c < 0x80 && *state == ASCII) {
4278 *out++ = c;
4279 } else {
4280 *out++ = MBFL_BAD_INPUT;
4281 }
4282 }
4283
4284 *in_len = e - p;
4285 *in = p;
4286 return out - buf;
4287 }
4288
4289 #define EMITTED_ESC_SEQUENCE 0x10
4290
mb_wchar_to_iso2022kr(uint32_t * in,size_t len,mb_convert_buf * buf,bool end)4291 static void mb_wchar_to_iso2022kr(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
4292 {
4293 unsigned char *out, *limit;
4294 MB_CONVERT_BUF_LOAD(buf, out, limit);
4295
4296 /* This escape sequence needs to come *somewhere* at the beginning of a line before
4297 * we can use the Shift In/Shift Out bytes, but it only needs to come once in a string
4298 * Rather than tracking newlines, we can just emit the sequence once at the beginning
4299 * of the output string... since that will always be "the beginning of a line" */
4300 if (len && !(buf->state & EMITTED_ESC_SEQUENCE)) {
4301 MB_CONVERT_BUF_ENSURE(buf, out, limit, 4 + len);
4302 out = mb_convert_buf_add4(out, 0x1B, '$', ')', 'C');
4303 buf->state |= EMITTED_ESC_SEQUENCE;
4304 } else {
4305 MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
4306 }
4307
4308 while (len--) {
4309 uint32_t w = *in++;
4310 unsigned int s = 0;
4311
4312 if (w >= ucs_a1_uhc_table_min && w < ucs_a1_uhc_table_max) {
4313 s = ucs_a1_uhc_table[w - ucs_a1_uhc_table_min];
4314 } else if (w >= ucs_a2_uhc_table_min && w < ucs_a2_uhc_table_max) {
4315 s = ucs_a2_uhc_table[w - ucs_a2_uhc_table_min];
4316 } else if (w >= ucs_a3_uhc_table_min && w < ucs_a3_uhc_table_max) {
4317 s = ucs_a3_uhc_table[w - ucs_a3_uhc_table_min];
4318 } else if (w >= ucs_i_uhc_table_min && w < ucs_i_uhc_table_max) {
4319 s = ucs_i_uhc_table[w - ucs_i_uhc_table_min];
4320 } else if (w >= ucs_s_uhc_table_min && w < ucs_s_uhc_table_max) {
4321 s = ucs_s_uhc_table[w - ucs_s_uhc_table_min];
4322 } else if (w >= ucs_r1_uhc_table_min && w < ucs_r1_uhc_table_max) {
4323 s = ucs_r1_uhc_table[w - ucs_r1_uhc_table_min];
4324 } else if (w >= ucs_r2_uhc_table_min && w < ucs_r2_uhc_table_max) {
4325 s = ucs_r2_uhc_table[w - ucs_r2_uhc_table_min];
4326 }
4327
4328 if (((s >> 8) & 0xFF) < 0xA1 || (s & 0xFF) < 0xA1) {
4329 s = w;
4330 } else {
4331 s -= 0x8080;
4332 }
4333
4334 if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
4335 MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022kr);
4336 MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
4337 } else if (s < 0x80) {
4338 if ((buf->state & 1) != ASCII) {
4339 MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
4340 out = mb_convert_buf_add(out, 0xF);
4341 buf->state &= ~KSC5601;
4342 }
4343 out = mb_convert_buf_add(out, s);
4344 } else {
4345 if ((buf->state & 1) != KSC5601) {
4346 MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
4347 out = mb_convert_buf_add(out, 0xE);
4348 buf->state |= KSC5601;
4349 } else {
4350 MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
4351 }
4352 out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
4353 }
4354 }
4355
4356 if (end && (buf->state & 1) != ASCII) {
4357 MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
4358 out = mb_convert_buf_add(out, 0xF);
4359 }
4360
4361 MB_CONVERT_BUF_STORE(buf, out, limit);
4362 }
4363
4364 static const struct mbfl_convert_vtbl vtbl_jis_wchar = {
4365 mbfl_no_encoding_jis,
4366 mbfl_no_encoding_wchar,
4367 mbfl_filt_conv_common_ctor,
4368 NULL,
4369 mbfl_filt_conv_jis_wchar,
4370 mbfl_filt_conv_jis_wchar_flush,
4371 NULL,
4372 };
4373
4374 static const struct mbfl_convert_vtbl vtbl_wchar_jis = {
4375 mbfl_no_encoding_wchar,
4376 mbfl_no_encoding_jis,
4377 mbfl_filt_conv_common_ctor,
4378 NULL,
4379 mbfl_filt_conv_wchar_jis,
4380 mbfl_filt_conv_any_jis_flush,
4381 NULL,
4382 };
4383
4384 const mbfl_encoding mbfl_encoding_jis = {
4385 mbfl_no_encoding_jis,
4386 "JIS",
4387 "ISO-2022-JP",
4388 NULL,
4389 NULL,
4390 MBFL_ENCTYPE_GL_UNSAFE,
4391 &vtbl_jis_wchar,
4392 &vtbl_wchar_jis,
4393 mb_iso2022jp_to_wchar,
4394 mb_wchar_to_jis,
4395 mb_check_jis
4396 };
4397
4398 static const struct mbfl_convert_vtbl vtbl_2022jp_wchar = {
4399 mbfl_no_encoding_2022jp,
4400 mbfl_no_encoding_wchar,
4401 mbfl_filt_conv_common_ctor,
4402 NULL,
4403 mbfl_filt_conv_jis_wchar,
4404 mbfl_filt_conv_jis_wchar_flush,
4405 NULL,
4406 };
4407
4408 static const struct mbfl_convert_vtbl vtbl_wchar_2022jp = {
4409 mbfl_no_encoding_wchar,
4410 mbfl_no_encoding_2022jp,
4411 mbfl_filt_conv_common_ctor,
4412 NULL,
4413 mbfl_filt_conv_wchar_2022jp,
4414 mbfl_filt_conv_any_jis_flush,
4415 NULL,
4416 };
4417
4418 const mbfl_encoding mbfl_encoding_2022jp = {
4419 mbfl_no_encoding_2022jp,
4420 "ISO-2022-JP",
4421 "ISO-2022-JP",
4422 NULL,
4423 NULL,
4424 MBFL_ENCTYPE_GL_UNSAFE,
4425 &vtbl_2022jp_wchar,
4426 &vtbl_wchar_2022jp,
4427 mb_iso2022jp_to_wchar,
4428 mb_wchar_to_iso2022jp,
4429 mb_check_iso2022jp
4430 };
4431
4432 static const char *mbfl_encoding_2022jp_kddi_aliases[] = {"ISO-2022-JP-KDDI", NULL};
4433
4434 static const struct mbfl_convert_vtbl vtbl_2022jp_kddi_wchar = {
4435 mbfl_no_encoding_2022jp_kddi,
4436 mbfl_no_encoding_wchar,
4437 mbfl_filt_conv_common_ctor,
4438 NULL,
4439 mbfl_filt_conv_2022jp_mobile_wchar,
4440 mbfl_filt_conv_2022jp_mobile_wchar_flush,
4441 NULL,
4442 };
4443
4444 static const struct mbfl_convert_vtbl vtbl_wchar_2022jp_kddi = {
4445 mbfl_no_encoding_wchar,
4446 mbfl_no_encoding_2022jp_kddi,
4447 mbfl_filt_conv_common_ctor,
4448 NULL,
4449 mbfl_filt_conv_wchar_2022jp_mobile,
4450 mbfl_filt_conv_wchar_2022jp_mobile_flush,
4451 NULL,
4452 };
4453
4454 const mbfl_encoding mbfl_encoding_2022jp_kddi = {
4455 mbfl_no_encoding_2022jp_kddi,
4456 "ISO-2022-JP-MOBILE#KDDI",
4457 "ISO-2022-JP",
4458 mbfl_encoding_2022jp_kddi_aliases,
4459 NULL,
4460 MBFL_ENCTYPE_GL_UNSAFE,
4461 &vtbl_2022jp_kddi_wchar,
4462 &vtbl_wchar_2022jp_kddi,
4463 mb_iso2022jp_kddi_to_wchar,
4464 mb_wchar_to_iso2022jp_kddi,
4465 NULL
4466 };
4467
4468 static const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = {
4469 mbfl_no_encoding_2022jp_2004,
4470 mbfl_no_encoding_wchar,
4471 mbfl_filt_conv_common_ctor,
4472 NULL,
4473 mbfl_filt_conv_jis2004_wchar,
4474 mbfl_filt_conv_jis2004_wchar_flush,
4475 NULL,
4476 };
4477
4478 static const struct mbfl_convert_vtbl vtbl_wchar_2022jp_2004 = {
4479 mbfl_no_encoding_wchar,
4480 mbfl_no_encoding_2022jp_2004,
4481 mbfl_filt_conv_common_ctor,
4482 NULL,
4483 mbfl_filt_conv_wchar_jis2004,
4484 mbfl_filt_conv_wchar_jis2004_flush,
4485 NULL,
4486 };
4487
4488 const mbfl_encoding mbfl_encoding_2022jp_2004 = {
4489 mbfl_no_encoding_2022jp_2004,
4490 "ISO-2022-JP-2004",
4491 "ISO-2022-JP-2004",
4492 NULL,
4493 NULL,
4494 MBFL_ENCTYPE_GL_UNSAFE,
4495 &vtbl_2022jp_2004_wchar,
4496 &vtbl_wchar_2022jp_2004,
4497 mb_iso2022jp2004_to_wchar,
4498 mb_wchar_to_iso2022jp2004,
4499 NULL
4500 };
4501
/* Historical aliases that now resolve to CP50220.
 *
 * mbstring used to offer a dubious 'encoding' named 'cp50220raw'. It was plain
 * CP50220 with a less strict implementation that silently passed some invalid
 * characters through. It only ever existed in mbstring; in case anybody still
 * relies on it, minimal support is kept by aliasing it to CP50220.
 *
 * mbstring also had a made-up encoding called "JIS-ms". It was the same as
 * CP5022{0,1,2}, minus their special handling of Unicode half-width katakana
 * during conversion. */
static const char *cp50220_aliases[] = {
	"cp50220raw",
	"cp50220-raw",
	"JIS-ms",
	NULL
};
4512
4513 static const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {
4514 mbfl_no_encoding_cp50220,
4515 mbfl_no_encoding_wchar,
4516 mbfl_filt_conv_common_ctor,
4517 NULL,
4518 mbfl_filt_conv_cp5022x_wchar,
4519 mbfl_filt_conv_cp5022x_wchar_flush,
4520 NULL,
4521 };
4522
4523 static const struct mbfl_convert_vtbl vtbl_wchar_cp50220 = {
4524 mbfl_no_encoding_wchar,
4525 mbfl_no_encoding_cp50220,
4526 mbfl_filt_conv_common_ctor,
4527 NULL,
4528 mbfl_filt_conv_wchar_cp50220,
4529 mbfl_filt_conv_wchar_cp50220_flush,
4530 NULL,
4531 };
4532
4533 static const struct mbfl_convert_vtbl vtbl_cp50221_wchar = {
4534 mbfl_no_encoding_cp50221,
4535 mbfl_no_encoding_wchar,
4536 mbfl_filt_conv_common_ctor,
4537 NULL,
4538 mbfl_filt_conv_cp5022x_wchar,
4539 mbfl_filt_conv_cp5022x_wchar_flush,
4540 NULL,
4541 };
4542
4543 static const struct mbfl_convert_vtbl vtbl_wchar_cp50221 = {
4544 mbfl_no_encoding_wchar,
4545 mbfl_no_encoding_cp50221,
4546 mbfl_filt_conv_common_ctor,
4547 NULL,
4548 mbfl_filt_conv_wchar_cp50221,
4549 mbfl_filt_conv_any_jis_flush,
4550 NULL,
4551 };
4552
4553 static const struct mbfl_convert_vtbl vtbl_cp50222_wchar = {
4554 mbfl_no_encoding_cp50222,
4555 mbfl_no_encoding_wchar,
4556 mbfl_filt_conv_common_ctor,
4557 NULL,
4558 mbfl_filt_conv_cp5022x_wchar,
4559 mbfl_filt_conv_cp5022x_wchar_flush,
4560 NULL,
4561 };
4562
4563 static const struct mbfl_convert_vtbl vtbl_wchar_cp50222 = {
4564 mbfl_no_encoding_wchar,
4565 mbfl_no_encoding_cp50222,
4566 mbfl_filt_conv_common_ctor,
4567 NULL,
4568 mbfl_filt_conv_wchar_cp50222,
4569 mbfl_filt_conv_wchar_cp50222_flush,
4570 NULL,
4571 };
4572
4573 const mbfl_encoding mbfl_encoding_cp50220 = {
4574 mbfl_no_encoding_cp50220,
4575 "CP50220",
4576 "ISO-2022-JP",
4577 cp50220_aliases,
4578 NULL,
4579 MBFL_ENCTYPE_GL_UNSAFE,
4580 &vtbl_cp50220_wchar,
4581 &vtbl_wchar_cp50220,
4582 mb_cp5022x_to_wchar,
4583 mb_wchar_to_cp50220,
4584 NULL
4585 };
4586
4587 const mbfl_encoding mbfl_encoding_cp50221 = {
4588 mbfl_no_encoding_cp50221,
4589 "CP50221",
4590 "ISO-2022-JP",
4591 NULL,
4592 NULL,
4593 MBFL_ENCTYPE_GL_UNSAFE,
4594 &vtbl_cp50221_wchar,
4595 &vtbl_wchar_cp50221,
4596 mb_cp5022x_to_wchar,
4597 mb_wchar_to_cp50221,
4598 NULL
4599 };
4600
4601 const mbfl_encoding mbfl_encoding_cp50222 = {
4602 mbfl_no_encoding_cp50222,
4603 "CP50222",
4604 "ISO-2022-JP",
4605 NULL,
4606 NULL,
4607 MBFL_ENCTYPE_GL_UNSAFE,
4608 &vtbl_cp50222_wchar,
4609 &vtbl_wchar_cp50222,
4610 mb_cp5022x_to_wchar,
4611 mb_wchar_to_cp50222,
4612 NULL
4613 };
4614
4615 static const char *mbfl_encoding_2022jpms_aliases[] = {"ISO2022JPMS", NULL};
4616
4617 static const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {
4618 mbfl_no_encoding_2022jpms,
4619 mbfl_no_encoding_wchar,
4620 mbfl_filt_conv_common_ctor,
4621 NULL,
4622 mbfl_filt_conv_2022jpms_wchar,
4623 mbfl_filt_conv_2022jpms_wchar_flush,
4624 NULL,
4625 };
4626
4627 static const struct mbfl_convert_vtbl vtbl_wchar_2022jpms = {
4628 mbfl_no_encoding_wchar,
4629 mbfl_no_encoding_2022jpms,
4630 mbfl_filt_conv_common_ctor,
4631 NULL,
4632 mbfl_filt_conv_wchar_2022jpms,
4633 mbfl_filt_conv_any_2022jpms_flush,
4634 NULL,
4635 };
4636
4637 const mbfl_encoding mbfl_encoding_2022jpms = {
4638 mbfl_no_encoding_2022jpms,
4639 "ISO-2022-JP-MS",
4640 "ISO-2022-JP",
4641 mbfl_encoding_2022jpms_aliases,
4642 NULL,
4643 MBFL_ENCTYPE_GL_UNSAFE,
4644 &vtbl_2022jpms_wchar,
4645 &vtbl_wchar_2022jpms,
4646 mb_iso2022jpms_to_wchar,
4647 mb_wchar_to_iso2022jpms,
4648 NULL
4649 };
4650
4651 /* ISO-2022-KR is defined in RFC 1557
4652 *
4653 * The RFC says that ESC $ ) C must appear once in a ISO-2022-KR string,
4654 * at the beginning of a line, before any instances of the Shift In or
4655 * Shift Out bytes which are used to switch between ASCII/KSC 5601 modes
4656 *
4657 * We don't enforce that for ISO-2022-KR input */
4658
4659 static const struct mbfl_convert_vtbl vtbl_wchar_2022kr = {
4660 mbfl_no_encoding_wchar,
4661 mbfl_no_encoding_2022kr,
4662 mbfl_filt_conv_common_ctor,
4663 NULL,
4664 mbfl_filt_conv_wchar_2022kr,
4665 mbfl_filt_conv_any_2022kr_flush,
4666 NULL,
4667 };
4668
4669 static const struct mbfl_convert_vtbl vtbl_2022kr_wchar = {
4670 mbfl_no_encoding_2022kr,
4671 mbfl_no_encoding_wchar,
4672 mbfl_filt_conv_common_ctor,
4673 NULL,
4674 mbfl_filt_conv_2022kr_wchar,
4675 mbfl_filt_conv_2022kr_wchar_flush,
4676 NULL,
4677 };
4678
4679 const mbfl_encoding mbfl_encoding_2022kr = {
4680 mbfl_no_encoding_2022kr,
4681 "ISO-2022-KR",
4682 "ISO-2022-KR",
4683 NULL,
4684 NULL,
4685 MBFL_ENCTYPE_GL_UNSAFE,
4686 &vtbl_2022kr_wchar,
4687 &vtbl_wchar_2022kr,
4688 mb_iso2022kr_to_wchar,
4689 mb_wchar_to_iso2022kr,
4690 NULL
4691 };
4692
4693 /*
4694 * SJIS variants
4695 */
4696
/* Streaming Shift-JIS -> wchar filter; invoked once per input byte `c`.
 *
 * filter->status is 0 between characters and 1 while the first byte of a
 * 2-byte character is held in filter->cache. Any other status value is left
 * untouched and produces no output. Decoded code points, or MBFL_BAD_INPUT
 * for invalid input, go to filter->output_function. Returns 0. */
static int mbfl_filt_conv_sjis_wchar(int c, mbfl_convert_filter *filter)
{
	if (filter->status == 0) {
		if (c >= 0 && c < 0x80) {
			/* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xA0 && c < 0xE0) {
			/* Single-byte kana: 0xA1..0xDF is emitted as 0xFEC0 + c */
			CK((*filter->output_function)(0xFEC0 + c, filter->data));
		} else if (c > 0x80 && c < 0xF0 && c != 0xA0) {
			/* First byte of a 2-byte character */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
	} else if (filter->status == 1) {
		/* Second byte of a 2-byte character */
		int lead = filter->cache;
		int row, col, w;

		filter->status = 0;

		if (c < 0x40 || c > 0xFC || c == 0x7F) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			SJIS_DECODE(lead, c, row, col);
			w = (row - 0x21)*94 + col - 0x21;
			if (w < 0 || w >= jisx0208_ucs_table_size) {
				w = MBFL_BAD_INPUT;
			} else {
				w = jisx0208_ucs_table[w];
				if (!w) {
					/* Unassigned slot in the table */
					w = MBFL_BAD_INPUT;
				}
			}
			CK((*filter->output_function)(w, filter->data));
		}
	}

	return 0;
}
4736
/* End-of-input hook for the Shift-JIS -> wchar filter.
 *
 * A pending state (any status other than 0 or 4) is reported as one
 * MBFL_BAD_INPUT; neither this call's result nor the downstream flush result
 * is checked. Status 4 is treated as a clean state here; nothing in
 * mbfl_filt_conv_sjis_wchar sets it, so presumably another filter shares this
 * flush function (TODO confirm). Returns 0. */
static int mbfl_filt_conv_sjis_wchar_flush(mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
	case 4:
		break;
	default:
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
		break;
	}
	filter->status = 0;

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
4750
/* Streaming wchar -> Shift-JIS filter; invoked once per code point `c`.
 *
 * The code point is looked up in the Unicode -> JIS tables. A handful of code
 * points with no table entry are mapped to fixed JIS codes, and JIS X 0212
 * results (>= 0x8080) are rejected. Results below 0x100 are written as one
 * byte; everything else is converted with SJIS_ENCODE and written as two.
 * Unencodable code points go to mbfl_filt_conv_illegal_output(). Returns 0. */
static int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
{
	int jis = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		jis = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		jis = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		jis = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		jis = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (jis <= 0) {
		/* No table entry: try the fixed fallbacks */
		switch (c) {
		case 0xA5:   /* YEN SIGN -> FULLWIDTH YEN SIGN */
			jis = 0x216F;
			break;
		case 0xAF:   /* MACRON */
		case 0x203E: /* OVERLINE -> FULLWIDTH MACRON */
			jis = 0x2131;
			break;
		case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
			jis = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			jis = 0x2142;
			break;
		case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
			jis = 0x215D;
			break;
		case 0xFFE0: /* FULLWIDTH CENT SIGN */
			jis = 0x2171;
			break;
		case 0xFFE1: /* FULLWIDTH POUND SIGN */
			jis = 0x2172;
			break;
		case 0xFFE2: /* FULLWIDTH NOT SIGN */
			jis = 0x224C;
			break;
		case 0:      /* NUL passes through as a single zero byte */
			jis = 0;
			break;
		default:
			jis = -1;
			break;
		}
	} else if (jis >= 0x8080) {
		/* JIS X 0212; not supported */
		jis = -1;
	}

	if (jis < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (jis < 0x100) {
		/* Latin/Kana: single byte */
		CK((*filter->output_function)(jis, filter->data));
	} else {
		/* Kanji: two bytes */
		int j1 = (jis >> 8) & 0xFF;
		int j2 = jis & 0xFF;
		int b1, b2;

		SJIS_ENCODE(j1, j2, b1, b2);
		CK((*filter->output_function)(b1, filter->data));
		CK((*filter->output_function)(b2, filter->data));
	}

	return 0;
}
4806
/* Lead-byte half of the Shift-JIS decoding lookup, indexed by byte value.
 *
 * mb_sjis_to_wchar adds the entry for the lead byte to the sjis_decode_tbl2
 * entry for the trail byte and uses the sum as an index into
 * jisx0208_ucs_table. Consecutive lead bytes are 188 apart. 0xFFFF marks
 * bytes 0x80, 0xA0 and 0xF0..0xFF. The negative initialisers wrap when stored
 * in an unsigned short (e.g. -6016 is stored as 59520).
 *
 * mb_sjis_to_wchar never indexes 0x00..0x7F or 0xA1..0xDF, since it handles
 * those bytes before the lookup; other users of this table, if any, are
 * outside this view. */
static const unsigned short sjis_decode_tbl1[] = {
	/* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 */ 0xFFFF, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632,
	/* 0x90 */ 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640,
	/* 0xA0 */ 0xFFFF, -6016, -5828, -5640, -5452, -5264, -5076, -4888, -4700, -4512, -4324, -4136, -3948, -3760, -3572, -3384,
	/* 0xB0 */ -3196, -3008, -2820, -2632, -2444, -2256, -2068, -1880, -1692, -1504, -1316, -1128, -940, -752, -564, -376,
	/* 0xC0 */ -188, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632,
	/* 0xD0 */ 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640,
	/* 0xE0 */ 5828, 6016, 6204, 6392, 6580, 6768, 6956, 7144, 7332, 7520, 7708, 7896, 8084, 8272, 8460, 8648,
	/* 0xF0 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF
};
4810
/* Trail-byte half of the Shift-JIS decoding lookup, indexed by byte value.
 *
 * Trail bytes 0x40..0x7E map to 0..62 and 0x80..0xFC map to 63..187. Every
 * other byte value (0x00..0x3F, 0x7F, 0xFD..0xFF) is marked with 0xFFFF.
 * mb_sjis_to_wchar adds this entry to the sjis_decode_tbl1 entry for the lead
 * byte and rejects the pair when the sum is not a valid jisx0208_ucs_table
 * index. */
static const unsigned short sjis_decode_tbl2[] = {
	/* 0x00 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0x10 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0x20 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0x30 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0x40 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
	/* 0x50 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
	/* 0x60 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
	/* 0x70 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 0xFFFF,
	/* 0x80 */ 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
	/* 0x90 */ 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
	/* 0xA0 */ 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
	/* 0xB0 */ 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
	/* 0xC0 */ 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
	/* 0xD0 */ 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
	/* 0xE0 */ 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174,
	/* 0xF0 */ 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 0xFFFF, 0xFFFF, 0xFFFF
};
4814
mb_sjis_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)4815 static size_t mb_sjis_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
4816 {
4817 unsigned char *p = *in, *e = p + *in_len;
4818 uint32_t *out = buf, *limit = buf + bufsize;
4819
4820 e--; /* Stop the main loop 1 byte short of the end of the input */
4821
4822 while (p < e && out < limit) {
4823 unsigned char c = *p++;
4824
4825 if (c <= 0x7F) {
4826 *out++ = c;
4827 } else if (c >= 0xA1 && c <= 0xDF) { /* Kana */
4828 *out++ = 0xFEC0 + c;
4829 } else {
4830 /* Don't need to check p < e; it's not possible to go out of bounds here, due to e-- above */
4831 unsigned char c2 = *p++;
4832 /* This is only legal if c2 >= 0x40 && c2 <= 0xFC && c2 != 0x7F
4833 * But the values in the above conversion tables have been chosen such that
4834 * illegal values of c2 will always result in w > jisx0208_ucs_table_size,
4835 * so we don't need to do a separate bounds check on c2
4836 * Likewise, the values in the conversion tables are such that illegal values
4837 * for c will always result in w > jisx0208_ucs_table_size */
4838 uint32_t w = sjis_decode_tbl1[c] + sjis_decode_tbl2[c2];
4839 if (w < jisx0208_ucs_table_size) {
4840 w = jisx0208_ucs_table[w];
4841 if (!w)
4842 w = MBFL_BAD_INPUT;
4843 *out++ = w;
4844 } else {
4845 if (c == 0x80 || c == 0xA0 || c > 0xEF) {
4846 p--;
4847 }
4848 *out++ = MBFL_BAD_INPUT;
4849 }
4850 }
4851 }
4852
4853 /* Finish up last byte of input string if there is one */
4854 if (p == e && out < limit) {
4855 unsigned char c = *p++;
4856 if (c <= 0x7F) {
4857 *out++ = c;
4858 } else if (c >= 0xA1 && c <= 0xDF) {
4859 *out++ = 0xFEC0 + c;
4860 } else {
4861 *out++ = MBFL_BAD_INPUT;
4862 }
4863 }
4864
4865 *in_len = e - p + 1;
4866 *in = p;
4867 return out - buf;
4868 }
4869
/* Bulk wchar -> Shift-JIS encoder.
 *
 * Encodes `len` code points from `in` into `buf`, reserving two output bytes
 * per remaining code point. Code points with no table entry fall back to a
 * small set of fixed JIS codes; anything still unmapped, and any JIS X 0212
 * result (>= 0x8080), is reported through MB_CONVERT_ERROR. `end` is not used. */
static void mb_wchar_to_sjis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len > 0) {
		uint32_t cp = *in++;
		unsigned int jis = 0;
		len--; /* from here on, `len` counts the code points still to come */

		if (cp >= ucs_a1_jis_table_min && cp < ucs_a1_jis_table_max) {
			jis = ucs_a1_jis_table[cp - ucs_a1_jis_table_min];
		} else if (cp >= ucs_a2_jis_table_min && cp < ucs_a2_jis_table_max) {
			jis = ucs_a2_jis_table[cp - ucs_a2_jis_table_min];
		} else if (cp >= ucs_i_jis_table_min && cp < ucs_i_jis_table_max) {
			jis = ucs_i_jis_table[cp - ucs_i_jis_table_min];
		} else if (cp >= ucs_r_jis_table_min && cp < ucs_r_jis_table_max) {
			jis = ucs_r_jis_table[cp - ucs_r_jis_table_min];
		}

		if (jis >= 0x8080) {
			/* JIS X 0212; not supported */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_sjis);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			continue;
		}

		if (jis == 0) {
			/* No table entry: try the fixed fallbacks */
			switch (cp) {
			case 0xA5:   /* YEN SIGN -> FULLWIDTH YEN SIGN */
				jis = 0x216F;
				break;
			case 0xAF:
			case 0x203E: /* -> FULLWIDTH MACRON */
				jis = 0x2131;
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				jis = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				jis = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				jis = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				jis = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				jis = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				jis = 0x224C;
				break;
			default:
				/* Only U+0000 may legitimately encode to a zero byte */
				if (cp != 0) {
					MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_sjis);
					MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
					continue;
				}
				break;
			}
		}

		if (jis <= 0xFF) {
			/* Latin/Kana */
			out = mb_convert_buf_add(out, jis);
		} else {
			/* Kanji */
			unsigned int j1 = (jis >> 8) & 0xFF, j2 = jis & 0xFF;
			unsigned int b1, b2;

			SJIS_ENCODE(j1, j2, b1, b2);
			out = mb_convert_buf_add2(out, b1, b2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
4931
/* Legacy byte-at-a-time decoder: MacJapanese (SJIS-mac) -> Unicode.
 *
 * `c` is the next input byte. filter->status is 0 when a new character is
 * expected, and 1 when the lead byte of a double-byte character is being held
 * in filter->cache. Each decoded codepoint is handed to
 * filter->output_function; one input character may produce several
 * codepoints. Input which cannot be decoded produces MBFL_BAD_INPUT.
 *
 * Returns 0 on every path visible here. CK() is defined elsewhere; presumably
 * it returns early when the wrapped call reports failure. */
static int mbfl_filt_conv_sjis_mac_wchar(int c, mbfl_convert_filter *filter)
{
	int row, col, width;
	int lead, kuten, jis1, jis2, cp, mark;

	switch (filter->status) {
	case 0:
		switch (c) {
		/* Single bytes which MacJapanese maps differently from plain Shift-JIS */
		case 0x5c:
			CK((*filter->output_function)(0x00a5, filter->data));
			break;
		case 0x80:
			CK((*filter->output_function)(0x005c, filter->data));
			break;
		case 0xa0:
			CK((*filter->output_function)(0x00a0, filter->data));
			break;
		case 0xfd:
			CK((*filter->output_function)(0x00a9, filter->data));
			break;
		case 0xfe:
			CK((*filter->output_function)(0x2122, filter->data));
			break;
		case 0xff:
			/* One byte, two codepoints */
			CK((*filter->output_function)(0x2026, filter->data));
			CK((*filter->output_function)(0xf87f, filter->data));
			break;
		default:
			if (c >= 0 && c < 0x80) { /* latin */
				CK((*filter->output_function)(c, filter->data));
			} else if (c > 0xa0 && c < 0xe0) { /* kana */
				CK((*filter->output_function)(0xfec0 + c, filter->data));
			} else if (c > 0x80 && c <= 0xed) { /* kanji first char */
				filter->status = 1;
				filter->cache = c;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
			break;
		}
		break;

	case 1: /* kanji second char */
		filter->status = 0;
		lead = filter->cache;

		if (c < 0x40 || c > 0xfc || c == 0x7f) {
			/* Not a valid trail byte */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		cp = 0;
		SJIS_DECODE(lead, c, jis1, jis2);
		kuten = (jis1 - 0x21)*94 + jis2 - 0x21;

		/* A few kuten codes in the first rows are mapped directly */
		switch (kuten) {
		case 0x1c:
			cp = 0x2014; /* EM DASH */
			break;
		case 0x1f:
			cp = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
			break;
		case 0x20:
			cp = 0x301c; /* WAVE DASH */
			break;
		case 0x21:
			cp = 0x2016; /* DOUBLE VERTICAL LINE */
			break;
		case 0x3c:
			cp = 0x2212; /* MINUS SIGN */
			break;
		case 0x50:
			cp = 0x00a2; /* CENT SIGN */
			break;
		case 0x51:
			cp = 0x00a3; /* POUND SIGN */
			break;
		case 0x89:
			cp = 0x00ac; /* NOT SIGN */
			break;
		default:
			break;
		}

		/* apple gaiji area 0x8540 - 0x886d: contiguous ranges with a fixed offset */
		if (cp == 0) {
			for (row = 0; row < 7; row++) {
				if (kuten >= code_tbl[row][0] && kuten <= code_tbl[row][1]) {
					cp = kuten - code_tbl[row][0] + code_tbl[row][2];
					break;
				}
			}
		}

		/* Codes which expand to a transcoding hint followed by several codepoints */
		if (cp == 0) {
			for (row = 0; row < code_tbl_m_len; row++) {
				if (kuten != code_tbl_m[row][0]) {
					continue;
				}
				/* `width` is one past the last table column used for this hint */
				switch (code_tbl_m[row][1]) {
				case 0xf860:
					width = 4;
					break;
				case 0xf861:
					width = 5;
					break;
				default:
					width = 6;
					break;
				}
				/* Emit all but the final codepoint now; the final one is emitted below */
				for (col = 1; col < width - 1; col++) {
					CK((*filter->output_function)(code_tbl_m[row][col], filter->data));
				}
				cp = code_tbl_m[row][width - 1];
				break;
			}
		}

		/* Codes looked up in per-range maps; some get a trailing variant marker */
		if (cp == 0) {
			for (row = 0; row < 8; row++) {
				if (kuten < code_ofst_tbl[row][0] || kuten > code_ofst_tbl[row][1]) {
					continue;
				}
				cp = code_map[row][kuten - code_ofst_tbl[row][0]];
				if (cp == 0) {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
					return 0;
				}

				mark = 0;
				if (kuten >= 0x043e && kuten <= 0x0441) {
					mark = 0xf87a;
				} else if (kuten == 0x03b1 || kuten == 0x03b7) {
					mark = 0xf87f;
				} else if (kuten == 0x04b8 || kuten == 0x04b9 || kuten == 0x04c4) {
					mark = 0x20dd;
				} else if (kuten == 0x1ed9 || kuten == 0x1eda || kuten == 0x1ee8 || kuten == 0x1ef3 ||
						(kuten >= 0x1ef5 && kuten <= 0x1efb) || kuten == 0x1f05 || kuten == 0x1f06 ||
						kuten == 0x1f18 || (kuten >= 0x1ff2 && kuten <= 0x20a5)) {
					mark = 0xf87e;
				}

				if (mark > 0) {
					/* Base codepoint first; the marker becomes the final output */
					CK((*filter->output_function)(cp, filter->data));
					cp = mark;
				}
				break;
			}
		}

		if (cp == 0 && kuten >= 0 && kuten < jisx0208_ucs_table_size) { /* X 0208 */
			cp = jisx0208_ucs_table[kuten];
		}

		if (cp <= 0) {
			cp = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(cp, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
5068
/* Legacy (one codepoint per call) encoder: Unicode -> MacJapanese (SJIS-mac).
 *
 * c      - next Unicode codepoint
 * filter - conversion state; output bytes are passed to filter->output_function
 *
 * filter->status values used here:
 *   0       - nothing pending
 *   1       - filter->cache holds a codepoint which appears in s_form_tbl; the
 *             next codepoint decides whether the pair becomes a single code
 *             from s_form_sjis_tbl, or the cached codepoint is emitted alone
 *             using s_form_sjis_fallback_tbl
 *   2       - filter->cache holds a transcoding hint (0xf860, 0xf861 or 0xf862)
 *   3, 4, 5 - one, two or three codepoints after the hint have matched a row of
 *             code_tbl_m; filter->cache holds the last matched codepoint in its
 *             low 16 bits and a mode flag in bits 16-19 (0x1, 0x2, 0x4 for
 *             hints 0xf860, 0xf861, 0xf862 respectively)
 *
 * Returns 0 on the paths visible here. CK() is defined elsewhere; presumably it
 * returns early when the wrapped call reports failure. */
static int mbfl_filt_conv_wchar_sjis_mac(int c, mbfl_convert_filter *filter)
{
	int i, c1, c2, s1 = 0, s2 = 0, mode;

	// a1: U+0000 -> U+046F
	// a2: U+2000 -> U+30FF
	// i: U+4E00 -> U+9FFF
	// r: U+FF00 -> U+FFFF

	switch (filter->status) {
	case 1:
		/* c1 is the previously cached codepoint (one which appears in s_form_tbl) */
		c1 = filter->cache;
		filter->cache = filter->status = 0;

		/* Each modifier codepoint applies to its own slice of s_form_tbl:
		 * entries 0-33 pair with 0xf87e, 34-36 with 0xf87f, 37-39 with 0x20dd
		 * and 40-43 with 0xf87a. In every branch, a non-zero s2 records that c1
		 * could not be combined with c. */
		if (c == 0xf87a) {
			for (i = 0; i < 4; i++) {
				if (c1 == s_form_tbl[i+34+3+3]) {
					s1 = s_form_sjis_tbl[i+34+3+3];
					break;
				}
			}
			if (s1 <= 0) {
				s2 = c1;
			}
		} else if (c == 0x20dd) {
			for (i = 0; i < 3; i++) {
				if (c1 == s_form_tbl[i+34+3]) {
					s1 = s_form_sjis_tbl[i+34+3];
					break;
				}
			}
			if (s1 <= 0) {
				s2 = c1;
			}
		} else if (c == 0xf87f) {
			for (i = 0; i < 3; i++) {
				if (c1 == s_form_tbl[i+34]) {
					s1 = s_form_sjis_tbl[i+34];
					break;
				}
			}
			if (s1 <= 0) {
				s2 = c1;
				s1 = -1;
			}
		} else if (c == 0xf87e) {
			for (i = 0; i < 34; i++) {
				if (c1 == s_form_tbl[i]) {
					s1 = s_form_sjis_tbl[i];
					break;
				}
			}
			if (s1 <= 0) {
				s2 = c1;
				s1 = -1;
			}
		} else {
			/* c is not one of the modifier codepoints */
			s2 = c1;
			s1 = c;
		}

		/* c1 was not combined: replace s1 with the standalone fallback code for c1 */
		if (s2 > 0) {
			for (i = 0; i < s_form_tbl_len; i++) {
				if (c1 == s_form_tbl[i]) {
					s1 = s_form_sjis_fallback_tbl[i];
					break;
				}
			}
		}

		if (s1 >= 0) {
			/* The table value is emitted as-is, without SJIS_ENCODE; presumably
			 * the s_form tables already hold Shift-JIS byte pairs -- TODO confirm */
			if (s1 < 0x100) {
				CK((*filter->output_function)(s1, filter->data));
			} else {
				CK((*filter->output_function)((s1 >> 8) & 0xff, filter->data));
				CK((*filter->output_function)(s1 & 0xff, filter->data));
			}
		} else {
			CK(mbfl_filt_conv_illegal_output(c, filter));
		}

		/* Stop here if c was consumed as part of the pair, or if s1 is still -1.
		 * Otherwise fall through and convert c as if no codepoint had been cached. */
		if (s2 <= 0 || s1 == -1) {
			break;
		}
		s1 = s2 = 0;
		ZEND_FALLTHROUGH;

	case 0:
		/* Look c up in whichever ucs_*_jis_table covers it; some codepoints are
		 * overridden after the lookup */
		if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
			s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
			if (c == 0x5c) {
				s1 = 0x80;
			} else if (c == 0xa9) {
				s1 = 0xfd;
			}
		} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
			s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
			if (c == 0x2122) {
				s1 = 0xfe;
			} else if (c == 0x2014) {
				s1 = 0x213d;
			} else if (c == 0x2116) {
				s1 = 0x2c1d;
			}
		} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
			s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
		} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
			s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
		}

		if (c >= 0x2000) {
			/* Codepoints listed in s_form_tbl are held back until the following
			 * codepoint is known; any s1 found above is discarded by returning here */
			for (i = 0; i < s_form_tbl_len; i++) {
				if (c == s_form_tbl[i]) {
					filter->status = 1;
					filter->cache = c;
					return 0;
				}
			}

			if (c == 0xf860 || c == 0xf861 || c == 0xf862) {
				/* Apple 'transcoding hint' codepoints (from private use area) */
				filter->status = 2;
				filter->cache = c;
				return 0;
			}
		}

		if (s1 <= 0) {
			if (c == 0xa0) {
				s1 = 0x00a0;
			} else if (c == 0xa5) { /* YEN SIGN */
				/* Unicode has codepoint 0xFFE5 for a fullwidth Yen sign;
				 * convert codepoint 0xA5 to halfwidth Yen sign */
				s1 = 0x5c; /* HALFWIDTH YEN SIGN */
			} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
				s1 = 0x2140;
			}
		}

		if (s1 <= 0) {
			/* Mac-specific tables; these yield a linear index which is converted
			 * to a two-byte (row << 8 | cell) code below */
			for (i=0; i<wchar2sjis_mac_r_tbl_len; i++) {
				if (c >= wchar2sjis_mac_r_tbl[i][0] && c <= wchar2sjis_mac_r_tbl[i][1]) {
					s1 = c - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2];
					break;
				}
			}

			if (s1 <= 0) {
				for (i=0; i<wchar2sjis_mac_r_map_len; i++) {
					if (c >= wchar2sjis_mac_r_map[i][0] && c <= wchar2sjis_mac_r_map[i][1]) {
						s1 = wchar2sjis_mac_code_map[i][c-wchar2sjis_mac_r_map[i][0]];
						break;
					}
				}
			}

			if (s1 <= 0) {
				for (i=0; i<wchar2sjis_mac_wchar_tbl_len ; i++) {
					if ( c == wchar2sjis_mac_wchar_tbl[i][0]) {
						s1 = wchar2sjis_mac_wchar_tbl[i][1] & 0xffff;
						break;
					}
				}
			}

			if (s1 > 0) {
				c1 = s1/94+0x21;
				c2 = s1-94*(c1-0x21)+0x21;
				s1 = (c1 << 8) | c2;
				/* s2 = 1 exempts this code from the ">= 0x8080" rejection below */
				s2 = 1;
			}
		}

		if ((s1 <= 0) || (s1 >= 0x8080 && s2 == 0)) { /* not found or X 0212 */
			s1 = -1;
			c1 = 0;

			/* Codepoint zero is passed through as a zero byte */
			if (c == 0) {
				s1 = 0;
			} else if (s1 <= 0) {
				s1 = -1;
			}
		}

		if (s1 >= 0) {
			if (s1 < 0x100) { /* latin or kana */
				CK((*filter->output_function)(s1, filter->data));
			} else { /* kanji */
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
				CK((*filter->output_function)(s1, filter->data));
				CK((*filter->output_function)(s2, filter->data));
			}
		} else {
			CK(mbfl_filt_conv_illegal_output(c, filter));
		}
		break;

	case 2:
		/* c1 is the cached transcoding hint; c must equal column 2 of one of the
		 * code_tbl_m rows for that hint (rows 0-4 for 0xf860, 5-7 for 0xf861,
		 * 8-11 for 0xf862) */
		c1 = filter->cache;
		filter->cache = 0;
		filter->status = 0;
		if (c1 == 0xf860) {
			for (i = 0; i < 5; i++) {
				if (c == code_tbl_m[i][2]) {
					filter->cache = c | 0x10000;
					filter->status = 3;
					break;
				}
			}
		} else if (c1 == 0xf861) {
			for (i = 0; i < 3; i++) {
				if (c == code_tbl_m[i+5][2]) {
					filter->cache = c | 0x20000;
					filter->status = 3;
					break;
				}
			}
		} else if (c1 == 0xf862) {
			for (i = 0; i < 4; i++) {
				if (c == code_tbl_m[i+5+3][2]) {
					filter->cache = c | 0x40000;
					filter->status = 3;
					break;
				}
			}
		}

		if (filter->status == 0) {
			/* Didn't find any of expected codepoints after Apple transcoding hint */
			CK(mbfl_filt_conv_illegal_output(c1, filter));
			/* Status is 0 again, so this re-processes c from scratch */
			return mbfl_filt_conv_wchar_sjis_mac(c, filter);
		}
		break;

	case 3:
		/* c1 is the codepoint matched against column 2; c is checked against column 3 */
		s1 = 0;
		c1 = filter->cache & 0xffff;
		mode = (filter->cache & 0xf0000) >> 16;

		filter->cache = filter->status = 0;

		if (mode == 0x1) {
			/* Hint 0xf860: the sequence is complete after this codepoint */
			for (i = 0; i < 5; i++) {
				if (c1 == code_tbl_m[i][2] && c == code_tbl_m[i][3]) {
					s1 = code_tbl_m[i][0];
					break;
				}
			}

			if (s1 > 0) {
				c1 = s1/94+0x21;
				c2 = s1-94*(c1-0x21)+0x21;
				SJIS_ENCODE(c1, c2, s1, s2);
				CK((*filter->output_function)(s1, filter->data));
				CK((*filter->output_function)(s2, filter->data));
			} else {
				/* Report every codepoint of the failed sequence */
				CK(mbfl_filt_conv_illegal_output(0xf860, filter));
				CK(mbfl_filt_conv_illegal_output(c1, filter));
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		} else if (mode == 0x2) {
			/* NOTE(review): if no row matches in this branch or the next, the
			 * status stays 0 and nothing is output for the buffered codepoints
			 * or for c */
			for (i = 0; i < 3; i++) {
				if (c1 == code_tbl_m[i+5][2] && c == code_tbl_m[i+5][3]) {
					filter->cache = c | 0x20000;
					filter->status = 4;
					break;
				}
			}
		} else if (mode == 0x4) {
			for (i = 0; i < 4; i++) {
				if (c1 == code_tbl_m[i+8][2] && c == code_tbl_m[i+8][3]) {
					filter->cache = c | 0x40000;
					filter->status = 4;
					break;
				}
			}
		}
		break;

	case 4:
		/* c1 is the codepoint matched against column 3; c is checked against column 4 */
		s1 = 0;
		c1 = filter->cache & 0xffff;
		mode = (filter->cache & 0xf0000) >> 16;

		filter->cache = 0;
		filter->status = 0;

		if (mode == 0x2) {
			/* Hint 0xf861: the sequence is complete after this codepoint */
			for (i = 0; i < 3; i++) {
				if (c1 == code_tbl_m[i+5][3] && c == code_tbl_m[i+5][4]) {
					s1 = code_tbl_m[i+5][0];
					break;
				}
			}

			if (s1 > 0) {
				c1 = s1/94+0x21;
				c2 = s1-94*(c1-0x21)+0x21;
				SJIS_ENCODE(c1, c2, s1, s2);
				CK((*filter->output_function)(s1, filter->data));
				CK((*filter->output_function)(s2, filter->data));
			} else {
				CK(mbfl_filt_conv_illegal_output(0xf861, filter));
				/* The column 2 codepoint is no longer cached; recover it from the
				 * first row whose column 3 equals c1 */
				for (i = 0; i < 3; i++) {
					if (c1 == code_tbl_m[i+5][3]) {
						CK(mbfl_filt_conv_illegal_output(code_tbl_m[i+5][2], filter));
						break;
					}
				}
				CK(mbfl_filt_conv_illegal_output(c1, filter));
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		} else if (mode == 0x4) {
			/* NOTE(review): no output is produced here if no row matches */
			for (i = 0; i < 4; i++) {
				if (c1 == code_tbl_m[i+8][3] && c == code_tbl_m[i+8][4]) {
					filter->cache = c | 0x40000;
					filter->status = 5;
					break;
				}
			}
		}
		break;

	case 5:
		/* c1 is the codepoint matched against column 4; c is checked against column 5 */
		s1 = 0;
		c1 = filter->cache & 0xffff;
		mode = (filter->cache & 0xf0000) >> 16;

		filter->cache = filter->status = 0;

		if (mode == 0x4) {
			/* Hint 0xf862: the sequence is complete after this codepoint */
			for (i = 0; i < 4; i++) {
				if (c1 == code_tbl_m[i+8][4] && c == code_tbl_m[i+8][5]) {
					s1 = code_tbl_m[i+8][0];
					break;
				}
			}

			if (s1 > 0) {
				c1 = s1/94+0x21;
				c2 = s1-94*(c1-0x21)+0x21;
				SJIS_ENCODE(c1, c2, s1, s2);
				CK((*filter->output_function)(s1, filter->data));
				CK((*filter->output_function)(s2, filter->data));
			} else {
				CK(mbfl_filt_conv_illegal_output(0xf862, filter));
				/* Recover the column 2 and 3 codepoints from the first row whose
				 * column 4 equals c1 */
				for (i = 0; i < 4; i++) {
					if (c1 == code_tbl_m[i+8][4]) {
						CK(mbfl_filt_conv_illegal_output( code_tbl_m[i+8][2], filter));
						CK(mbfl_filt_conv_illegal_output( code_tbl_m[i+8][3], filter));
						break;
					}
				}
				CK(mbfl_filt_conv_illegal_output(c1, filter));
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
5435
/* Flush handler for the legacy Unicode -> SJIS-mac filter.
 *
 * If a codepoint from s_form_tbl is still being held back (status 1), its
 * standalone fallback code is emitted as two bytes. The filter state is then
 * cleared and the downstream flush function, if any, is invoked; its result
 * is returned. Returns 0 when there is no downstream flush function. */
static int mbfl_filt_conv_wchar_sjis_mac_flush(mbfl_convert_filter *filter)
{
	if (filter->status == 1 && filter->cache > 0) {
		int pending = filter->cache;
		int fallback = 0;

		for (int k = 0; k < s_form_tbl_len; k++) {
			if (pending == s_form_tbl[k]) {
				fallback = s_form_sjis_fallback_tbl[k];
				break;
			}
		}

		/* Nothing is emitted when the table gives no positive fallback code */
		if (fallback > 0) {
			CK((*filter->output_function)((fallback >> 8) & 0xff, filter->data));
			CK((*filter->output_function)(fallback & 0xff, filter->data));
		}
	}

	filter->status = 0;
	filter->cache = 0;

	if (filter->flush_function == NULL) {
		return 0;
	}
	return (*filter->flush_function)(filter->data);
}
5461
mb_sjismac_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)5462 static size_t mb_sjismac_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
5463 {
5464 /* A single SJIS-Mac kuten code can convert to up to 5 Unicode codepoints, oh my! */
5465 ZEND_ASSERT(bufsize >= 5);
5466
5467 unsigned char *p = *in, *e = p + *in_len;
5468 uint32_t *out = buf, *limit = buf + bufsize;
5469
5470 while (p < e && out < limit) {
5471 unsigned char c = *p++;
5472
5473 if (c <= 0x80 || c == 0xA0) {
5474 if (c == 0x5C) {
5475 *out++ = 0xA5;
5476 } else if (c == 0x80) {
5477 *out++ = 0x5C;
5478 } else {
5479 *out++ = c;
5480 }
5481 } else if (c >= 0xA1 && c <= 0xDF) {
5482 *out++ = 0xFEC0 + c;
5483 } else if (c <= 0xED) {
5484 if (p == e) {
5485 *out++ = MBFL_BAD_INPUT;
5486 break;
5487 }
5488 unsigned char c2 = *p++;
5489 uint32_t w = sjis_decode_tbl1[c] + sjis_decode_tbl2[c2];
5490
5491 if (w <= 0x89) {
5492 if (w == 0x1C) {
5493 *out++ = 0x2014; /* EM DASH */
5494 continue;
5495 } else if (w == 0x1F) {
5496 *out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
5497 continue;
5498 } else if (w == 0x20) {
5499 *out++ = 0x301C; /* FULLWIDTH TILDE */
5500 continue;
5501 } else if (w == 0x21) {
5502 *out++ = 0x2016; /* PARALLEL TO */
5503 continue;
5504 } else if (w == 0x3C) {
5505 *out++ = 0x2212; /* FULLWIDTH HYPHEN-MINUS */
5506 continue;
5507 } else if (w == 0x50) {
5508 *out++ = 0xA2; /* FULLWIDTH CENT SIGN */
5509 continue;
5510 } else if (w == 0x51) {
5511 *out++ = 0xA3; /* FULLWIDTH POUND SIGN */
5512 continue;
5513 } else if (w == 0x89) {
5514 *out++ = 0xAC; /* FULLWIDTH NOT SIGN */
5515 continue;
5516 }
5517 } else {
5518 if (w >= 0x2F0 && w <= 0x3A3) {
5519 for (int i = 0; i < 7; i++) {
5520 if (w >= code_tbl[i][0] && w <= code_tbl[i][1]) {
5521 *out++ = w - code_tbl[i][0] + code_tbl[i][2];
5522 goto next_iteration;
5523 }
5524 }
5525 }
5526
5527 if (w >= 0x340 && w <= 0x523) {
5528 for (int i = 0; i < code_tbl_m_len; i++) {
5529 if (w == code_tbl_m[i][0]) {
5530 int n = 5;
5531 if (code_tbl_m[i][1] == 0xF860) {
5532 n = 3;
5533 } else if (code_tbl_m[i][1] == 0xF861) {
5534 n = 4;
5535 }
5536 if ((limit - out) < n) {
5537 p -= 2;
5538 goto finished;
5539 }
5540 for (int j = 1; j <= n; j++) {
5541 *out++ = code_tbl_m[i][j];
5542 }
5543 goto next_iteration;
5544 }
5545 }
5546 }
5547
5548 if (w >= 0x3AC && w <= 0x20A5) {
5549 for (int i = 0; i < 8; i++) {
5550 if (w >= code_ofst_tbl[i][0] && w <= code_ofst_tbl[i][1]) {
5551 uint32_t w2 = code_map[i][w - code_ofst_tbl[i][0]];
5552 if (!w2) {
5553 *out++ = MBFL_BAD_INPUT;
5554 goto next_iteration;
5555 }
5556 if ((limit - out) < 2) {
5557 p -= 2;
5558 goto finished;
5559 }
5560 *out++ = w2;
5561 if (w >= 0x43E && w <= 0x441) {
5562 *out++ = 0xF87A;
5563 } else if (w == 0x3B1 || w == 0x3B7) {
5564 *out++ = 0xF87F;
5565 } else if (w == 0x4B8 || w == 0x4B9 || w == 0x4C4) {
5566 *out++ = 0x20DD;
5567 } else if (w == 0x1ED9 || w == 0x1EDA || w == 0x1EE8 || w == 0x1EF3 || (w >= 0x1EF5 && w <= 0x1EFB) || w == 0x1F05 || w == 0x1F06 || w == 0x1F18 || (w >= 0x1FF2 && w <= 0x20A5)) {
5568 *out++ = 0xF87E;
5569 }
5570 goto next_iteration;
5571 }
5572 }
5573 }
5574 }
5575
5576 if (w < jisx0208_ucs_table_size) {
5577 w = jisx0208_ucs_table[w];
5578 if (!w)
5579 w = MBFL_BAD_INPUT;
5580 *out++ = w;
5581 } else {
5582 *out++ = MBFL_BAD_INPUT;
5583 }
5584 } else if (c == 0xFD) {
5585 *out++ = 0xA9;
5586 } else if (c == 0xFE) {
5587 *out++ = 0x2122;
5588 } else if (c == 0xFF) {
5589 if ((limit - out) < 2) {
5590 p--;
5591 break;
5592 }
5593 *out++ = 0x2026;
5594 *out++ = 0xF87F;
5595 } else {
5596 *out++ = MBFL_BAD_INPUT;
5597 }
5598 next_iteration: ;
5599 }
5600
5601 finished:
5602 *in_len = e - p;
5603 *in = p;
5604 return out - buf;
5605 }
5606
process_s_form(uint32_t w,uint32_t w2,unsigned int * s)5607 static bool process_s_form(uint32_t w, uint32_t w2, unsigned int *s)
5608 {
5609 if (w2 == 0xF87A) {
5610 for (int i = 0; i < 4; i++) {
5611 if (w == s_form_tbl[i+34+3+3]) {
5612 *s = s_form_sjis_tbl[i+34+3+3];
5613 return true;
5614 }
5615 }
5616 } else if (w2 == 0x20DD) {
5617 for (int i = 0; i < 3; i++) {
5618 if (w == s_form_tbl[i+34+3]) {
5619 *s = s_form_sjis_tbl[i+34+3];
5620 return true;
5621 }
5622 }
5623 } else if (w2 == 0xF87F) {
5624 for (int i = 0; i < 3; i++) {
5625 if (w == s_form_tbl[i+34]) {
5626 *s = s_form_sjis_tbl[i+34];
5627 return true;
5628 }
5629 }
5630 } else if (w2 == 0xF87E) {
5631 for (int i = 0; i < 34; i++) {
5632 if (w == s_form_tbl[i]) {
5633 *s = s_form_sjis_tbl[i];
5634 return true;
5635 }
5636 }
5637 }
5638
5639 return false;
5640 }
5641
/* For codepoints F860-F862, which are treated specially in MacJapanese
 *
 * Indexed by (hint codepoint - 0xF860). The value is the index of the last
 * code_tbl_m column which belongs to a sequence introduced by that hint; since
 * column 1 holds the hint itself, it is also the total number of codepoints in
 * the sequence. */
static int transcoding_hint_cp_width[3] = { 3, 4, 5 };
5644
/* Buffer-based encoder: Unicode codepoints -> MacJapanese (SJIS-mac).
 *
 * in, len - input codepoints
 * buf     - output buffer; buf->state carries unfinished work between calls
 * end     - true if this is the last chunk of input
 *
 * Some codepoints can only be converted once the following codepoint(s) are
 * known. If the input chunk ends in the middle of such a sequence and `end`
 * is false, the pending work is saved in buf->state:
 *
 *   bits  0-15  pending codepoint (an s_form_tbl entry or a transcoding hint)
 *   bits 16-23  for a partly matched transcoding hint sequence, the next
 *               code_tbl_m column to compare (always >= 3)
 *   bits 24-31  for a partly matched transcoding hint sequence, the
 *               code_tbl_m row being matched
 *
 * When only a single codepoint is pending, bits 16-31 are zero. */
static void mb_wchar_to_sjismac(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;

	if (buf->state) {
		w = buf->state & 0xFFFF;
		/* The row number alone cannot tell the two kinds of saved state apart,
		 * since row 0 of code_tbl_m is a valid row; the column index is always
		 * non-zero for a partly matched sequence, so test bits 16-31 together */
		if (buf->state & 0xFFFF0000L) {
			goto resume_transcoding_hint;
		} else {
			buf->state = 0;
			goto process_codepoint;
		}
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		unsigned int s = 0;

		/* Generic table lookup, with a few MacJapanese-specific overrides */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			if (w == 0x5C) {
				s = 0x80;
			} else if (w == 0xA9) {
				s = 0xFD;
			} else {
				s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
			}
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			if (w == 0x2122) {
				s = 0xFE;
			} else if (w == 0x2014) {
				s = 0x213D;
			} else if (w == 0x2116) {
				s = 0x2C1D;
			} else {
				s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
			}
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (w >= 0x2000) {
			/* Codepoints in s_form_tbl may combine with a following modifier */
			for (int i = 0; i < s_form_tbl_len; i++) {
				if (w == s_form_tbl[i]) {
					if (!len) {
						/* No following codepoint available in this chunk */
						if (end) {
							s = s_form_sjis_fallback_tbl[i];
							if (s) {
								MB_CONVERT_BUF_ENSURE(buf, out, limit, 2);
								out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
							} else {
								MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
							}
						} else {
							buf->state = w;
						}
						MB_CONVERT_BUF_STORE(buf, out, limit);
						return;
					}
					uint32_t w2 = *in++;
					len--;

					if (!process_s_form(w, w2, &s)) {
						/* w2 is not a matching modifier; put it back and emit
						 * the standalone fallback code for w */
						in--; len++;

						for (int k = 0; k < s_form_tbl_len; k++) {
							if (w == s_form_tbl[k]) {
								s = s_form_sjis_fallback_tbl[k];
								break;
							}
						}
					}

					/* The table value is written as-is, without SJIS_ENCODE */
					if (s <= 0xFF) {
						out = mb_convert_buf_add(out, s);
					} else {
						MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
						out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
					}

					goto next_iteration;
				}
			}

			if (w == 0xF860 || w == 0xF861 || w == 0xF862) {
				/* Apple 'transcoding hint' codepoints (from private use area) */
				if (!len) {
					if (end) {
						MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
					} else {
						buf->state = w;
					}
					MB_CONVERT_BUF_STORE(buf, out, limit);
					return;
				}

				uint32_t w2 = *in++;
				len--;

				for (int i = 0; i < code_tbl_m_len; i++) {
					if (w == code_tbl_m[i][1] && w2 == code_tbl_m[i][2]) {
						/* This might be a valid transcoding hint sequence */
						int index = 3;

						if (buf->state) {
resume_transcoding_hint:
							/* Entered by `goto` at the start of a call to pick up
							 * a sequence which was cut off by the end of the
							 * previous chunk */
							i = buf->state >> 24;
							index = (buf->state >> 16) & 0xFF;
							buf->state = 0;
						}

						int expected = transcoding_hint_cp_width[w - 0xF860];

						while (index <= expected) {
							if (!len) {
								if (end) {
									/* Input ended mid-sequence; report what was consumed */
									for (int j = 1; j < index; j++) {
										MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac);
									}
								} else {
									buf->state = (i << 24) | (index << 16) | (w & 0xFFFF);
								}
								MB_CONVERT_BUF_STORE(buf, out, limit);
								return;
							}

							w2 = *in++;
							len--;

							if (w2 != code_tbl_m[i][index]) {
								/* Didn't match */
								for (int j = 1; j < index; j++) {
									MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac);
								}
								MB_CONVERT_ERROR(buf, out, limit, w2, mb_wchar_to_sjismac);
								MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
								goto next_iteration;
							}

							index++;
						}

						/* Successful match, emit SJIS-mac bytes */
						s = code_tbl_m[i][0];
						unsigned int c1 = (s / 94) + 0x21, c2 = (s % 94) + 0x21, s1, s2;
						SJIS_ENCODE(c1, c2, s1, s2);
						MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
						out = mb_convert_buf_add2(out, s1, s2);
						goto next_iteration;
					}
				}

				/* No valid transcoding hint sequence found */
				in--; len++;
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
				continue;
			}
		}

		if (!s) {
			if (w == 0xA0) {
				s = 0xA0;
			} else if (w == 0xA5) { /* YEN SIGN */
				/* Unicode has codepoint 0xFFE5 for a fullwidth Yen sign;
				 * convert codepoint 0xA5 to halfwidth Yen sign */
				s = 0x5C; /* HALFWIDTH YEN SIGN */
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else {
				/* Mac-specific tables; each yields a linear index which is
				 * converted to a (row << 8 | cell) code */
				for (int i = 0; i < wchar2sjis_mac_r_tbl_len; i++) {
					if (w >= wchar2sjis_mac_r_tbl[i][0] && w <= wchar2sjis_mac_r_tbl[i][1]) {
						s = w - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto found_kuten_code;
					}
				}

				for (int i = 0; i < wchar2sjis_mac_r_map_len; i++) {
					if (w >= wchar2sjis_mac_r_map[i][0] && w <= wchar2sjis_mac_r_map[i][1]) {
						s = wchar2sjis_mac_code_map[i][w - wchar2sjis_mac_r_map[i][0]];
						if (s) {
							s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
							goto found_kuten_code;
						}
					}
				}

				for (int i = 0; i < wchar2sjis_mac_wchar_tbl_len; i++) {
					if (w == wchar2sjis_mac_wchar_tbl[i][0]) {
						s = wchar2sjis_mac_wchar_tbl[i][1];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto found_kuten_code;
					}
				}
			}
		}

found_kuten_code:
		/* Codepoint zero converts to a zero byte; any other codepoint with no
		 * mapping, or a code of 0x8080 or above, is reported as an error */
		if ((!s && w) || s >= 0x8080) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			out = mb_convert_buf_add(out, s);
		} else {
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}

next_iteration: ;
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
5867
mbfilter_sjis_emoji_docomo2unicode(int s,int * snd)5868 int mbfilter_sjis_emoji_docomo2unicode(int s, int *snd)
5869 {
5870 /* All three mobile vendors had emoji for numbers on a telephone keypad
5871 * Unicode doesn't have those, but it has a combining character which puts
5872 * a 'keypad button' around the following character, making it look like
5873 * a key on a telephone or keyboard. That combining char is codepoint 0x20E3. */
5874 if (s >= mb_tbl_code2uni_docomo1_min && s <= mb_tbl_code2uni_docomo1_max) {
5875 if ((s >= DOCOMO_KEYPAD(1) && s <= DOCOMO_KEYPAD(9)) || s == DOCOMO_KEYPAD(0) || s == DOCOMO_KEYPAD_HASH) {
5876 EMIT_KEYPAD_EMOJI(convert_emoji_cp(mb_tbl_code2uni_docomo1[s - mb_tbl_code2uni_docomo1_min]));
5877 } else {
5878 *snd = 0;
5879 return convert_emoji_cp(mb_tbl_code2uni_docomo1[s - mb_tbl_code2uni_docomo1_min]);
5880 }
5881 }
5882 return 0;
5883 }
5884
mbfilter_sjis_emoji_sb2unicode(int s,int * snd)5885 int mbfilter_sjis_emoji_sb2unicode(int s, int *snd)
5886 {
5887 if (s >= mb_tbl_code2uni_sb1_min && s <= mb_tbl_code2uni_sb1_max) {
5888 if (s == 0x2817 || (s >= 0x2823 && s <= 0x282C)) {
5889 EMIT_KEYPAD_EMOJI(mb_tbl_code2uni_sb1[s - mb_tbl_code2uni_sb1_min]);
5890 } else {
5891 *snd = 0;
5892 return convert_emoji_cp(mb_tbl_code2uni_sb1[s - mb_tbl_code2uni_sb1_min]);
5893 }
5894 } else if (s >= mb_tbl_code2uni_sb2_min && s <= mb_tbl_code2uni_sb2_max) {
5895 *snd = 0;
5896 return convert_emoji_cp(mb_tbl_code2uni_sb2[s - mb_tbl_code2uni_sb2_min]);
5897 } else if (s >= mb_tbl_code2uni_sb3_min && s <= mb_tbl_code2uni_sb3_max) {
5898 if (s >= 0x2B02 && s <= 0x2B0B) {
5899 EMIT_FLAG_EMOJI(nflags_sb[s - 0x2B02]);
5900 } else {
5901 *snd = 0;
5902 return convert_emoji_cp(mb_tbl_code2uni_sb3[s - mb_tbl_code2uni_sb3_min]);
5903 }
5904 }
5905 return 0;
5906 }
5907
/* Try to convert Unicode codepoint `c` to a DoCoMo emoji code.
 *
 * Returns 1 with the emoji code stored in `*s1` on success, and 0 if `c` has
 * no DoCoMo emoji equivalent or has been held back in the filter (see below).
 * CK() is defined elsewhere; presumably it returns early when the wrapped
 * call reports failure.
 *
 * When converting SJIS-Mobile to Unicode, keypad symbol emoji become a
 * sequence of 2 codepoints: '#' or a digit, followed by combining character
 * 0x20E3 which draws the 'key' around it. In this direction such sequences are
 * turned back into a single emoji, so '#' and digits are cached (status 1)
 * until the following codepoint has been seen. */
int mbfilter_unicode2sjis_emoji_docomo(int c, int *s1, mbfl_convert_filter *filter)
{
	if (filter->status == 1) {
		int prev = filter->cache;
		filter->cache = filter->status = 0;

		if (c != 0x20E3) {
			/* Not a keypad sequence after all; pass the cached character
			 * through (single-byte ASCII characters are valid in Shift-JIS)
			 * and carry on processing `c` as usual */
			CK((*filter->output_function)(prev, filter->data));
		} else {
			switch (prev) {
			case '#':
				*s1 = 0x2964;
				break;
			case '0':
				*s1 = 0x296F;
				break;
			default: /* Previous character was '1'-'9' */
				*s1 = 0x2966 + (prev - '1');
				break;
			}
			return 1;
		}
	}

	if (c == '#' || (c >= '0' && c <= '9')) {
		filter->status = 1;
		filter->cache = c;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x29B5;
		return 1;
	}
	if (c == 0x00AE) { /* Registered sign */
		*s1 = 0x29BA;
		return 1;
	}

	/* Remaining emoji are found by binary search in one of three tables */
	if (c >= mb_tbl_uni_docomo2code2_min && c <= mb_tbl_uni_docomo2code2_max) {
		int pos = mbfl_bisec_srch2(c, mb_tbl_uni_docomo2code2_key, mb_tbl_uni_docomo2code2_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_docomo2code2_value[pos];
			return 1;
		}
	} else if (c >= mb_tbl_uni_docomo2code3_min && c <= mb_tbl_uni_docomo2code3_max) {
		int pos = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_docomo2code3_key, mb_tbl_uni_docomo2code3_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_docomo2code3_value[pos];
			return 1;
		}
	} else if (c >= mb_tbl_uni_docomo2code5_min && c <= mb_tbl_uni_docomo2code5_max) {
		int pos = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_docomo2code5_key, mb_tbl_uni_docomo2code5_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_docomo2code5_val[pos];
			return 1;
		}
	}

	return 0;
}
5970
/* Try to convert Unicode codepoint `c` to a KDDI emoji code.
 *
 * Returns 1 with the emoji code stored in `*s1` on success, and 0 if `c` has
 * no KDDI emoji equivalent or has been held back in the filter. CK() is
 * defined elsewhere; presumably it returns early when the wrapped call
 * reports failure.
 *
 * Two kinds of 2-codepoint sequences are recognised, so the first codepoint
 * of each is cached until the second has been seen:
 *   status 1 - '#' or a digit, which forms a keypad emoji if 0x20E3 follows
 *   status 2 - a Regional Indicator, which forms a national flag emoji if the
 *              matching second Regional Indicator follows */
int mbfilter_unicode2sjis_emoji_kddi_sjis(int c, int *s1, mbfl_convert_filter *filter)
{
	if (filter->status == 1) {
		int c1 = filter->cache;
		filter->cache = filter->status = 0;
		if (c == 0x20E3) {
			if (c1 == '#') {
				*s1 = 0x25BC;
			} else if (c1 == '0') {
				*s1 = 0x2830;
			} else { /* Previous character was '1'-'9' */
				*s1 = 0x27a6 + (c1 - '1');
			}
			return 1;
		} else {
			/* Not a keypad sequence; pass the cached character through and
			 * carry on processing `c` as usual */
			CK((*filter->output_function)(c1, filter->data));
		}
	} else if (filter->status == 2) {
		int c1 = filter->cache;
		filter->cache = filter->status = 0;
		if (c >= NFLAGS('B') && c <= NFLAGS('U')) { /* B for GB, U for RU */
			for (int i = 0; i < 10; i++) {
				if (c1 == NFLAGS(nflags_s[i][0]) && c == NFLAGS(nflags_s[i][1])) {
					*s1 = nflags_code_kddi[i];
					return 1;
				}
			}
		}

		/* If none of the KDDI national flag emoji matched, then we have no way
		 * to convert the previous codepoint...
		 * The result is checked, like the output call in the branch above */
		CK(mbfl_filt_conv_illegal_output(c1, filter));
	}

	if (c == '#' || (c >= '0' && c <= '9')) {
		filter->status = 1;
		filter->cache = c;
		return 0;
	} else if (c >= NFLAGS('C') && c <= NFLAGS('U')) { /* C for CN, U for US */
		filter->status = 2;
		filter->cache = c;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x27DC;
		return 1;
	} else if (c == 0xAE) { /* Registered sign */
		*s1 = 0x27DD;
		return 1;
	} else if (c >= mb_tbl_uni_kddi2code2_min && c <= mb_tbl_uni_kddi2code2_max) {
		/* Remaining emoji are found by binary search in one of three tables */
		int i = mbfl_bisec_srch2(c, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_kddi2code2_value[i];
			return 1;
		}
	} else if (c >= mb_tbl_uni_kddi2code3_min && c <= mb_tbl_uni_kddi2code3_max) {
		int i = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_kddi2code3_value[i];
			return 1;
		}
	} else if (c >= mb_tbl_uni_kddi2code5_min && c <= mb_tbl_uni_kddi2code5_max) {
		int i = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_kddi2code5_val[i];
			return 1;
		}
	}
	return 0;
}
6042
/* Try to convert Unicode codepoint `c` to a SoftBank emoji code.
 *
 * Returns 1 with the emoji code stored in `*s1` on success, and 0 if `c` has
 * no SoftBank emoji equivalent or has been held back in the filter. CK() is
 * defined elsewhere; presumably it returns early when the wrapped call
 * reports failure.
 *
 * Two kinds of 2-codepoint sequences are recognised, so the first codepoint
 * of each is cached until the second has been seen:
 *   status 1 - '#' or a digit, which forms a keypad emoji if 0x20E3 follows
 *   status 2 - a Regional Indicator, which forms a national flag emoji if the
 *              matching second Regional Indicator follows */
int mbfilter_unicode2sjis_emoji_sb(int c, int *s1, mbfl_convert_filter *filter)
{
	if (filter->status == 1) {
		int c1 = filter->cache;
		filter->cache = filter->status = 0;
		if (c == 0x20E3) {
			if (c1 == '#') {
				*s1 = 0x2817;
			} else if (c1 == '0') {
				*s1 = 0x282c;
			} else { /* Previous character was '1'-'9' */
				*s1 = 0x2823 + (c1 - '1');
			}
			return 1;
		} else {
			/* Not a keypad sequence; pass the cached character through and
			 * carry on processing `c` as usual. The result is checked, as in
			 * the DoCoMo and KDDI versions of this function */
			CK((*filter->output_function)(c1, filter->data));
		}
	} else if (filter->status == 2) {
		int c1 = filter->cache;
		filter->cache = filter->status = 0;
		if (c >= NFLAGS('B') && c <= NFLAGS('U')) { /* B for GB, U for RU */
			for (int i = 0; i < 10; i++) {
				if (c1 == NFLAGS(nflags_s[i][0]) && c == NFLAGS(nflags_s[i][1])) {
					*s1 = nflags_code_sb[i];
					return 1;
				}
			}
		}

		/* If none of the SoftBank national flag emoji matched, then we have no way
		 * to convert the previous codepoint... */
		CK(mbfl_filt_conv_illegal_output(c1, filter));
	}

	if (c == '#' || (c >= '0' && c <= '9')) {
		filter->status = 1;
		filter->cache = c;
		return 0;
	} else if (c >= NFLAGS('C') && c <= NFLAGS('U')) { /* C for CN, U for US */
		filter->status = 2;
		filter->cache = c;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x2855;
		return 1;
	} else if (c == 0xAE) { /* Registered sign */
		*s1 = 0x2856;
		return 1;
	} else if (c >= mb_tbl_uni_sb2code2_min && c <= mb_tbl_uni_sb2code2_max) {
		/* Remaining emoji are found by binary search in one of three tables */
		int i = mbfl_bisec_srch2(c, mb_tbl_uni_sb2code2_key, mb_tbl_uni_sb2code2_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_sb2code2_value[i];
			return 1;
		}
	} else if (c >= mb_tbl_uni_sb2code3_min && c <= mb_tbl_uni_sb2code3_max) {
		int i = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_sb2code3_key, mb_tbl_uni_sb2code3_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_sb2code3_value[i];
			return 1;
		}
	} else if (c >= mb_tbl_uni_sb2code5_min && c <= mb_tbl_uni_sb2code5_max) {
		int i = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_sb2code5_key, mb_tbl_uni_sb2code5_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_sb2code5_val[i];
			return 1;
		}
	}
	return 0;
}
6114
/*
 * Streaming decoder shared by the DoCoMo, KDDI and SoftBank variants of
 * Shift-JIS: consumes one input byte `c` per call and hands the resulting
 * codepoint(s) to filter->output_function. The carrier is identified by
 * comparing filter->from with the three mobile encodings.
 *
 * filter->status values used here:
 *   0 - at a character boundary
 *   1 - lead byte of a double-byte character is stored in filter->cache
 *   2 - ESC received (SoftBank only)
 *   3 - ESC $ received
 *   4 - ESC $ <E|F|G|O|P|Q> received; the letter is stored in filter->cache
 *       and each following byte encodes one emoji until 0xF ends the run
 *
 * Every output call is wrapped in CK() (defined elsewhere; presumably it
 * returns early when the callee reports failure). Otherwise returns 0.
 */
static int mbfl_filt_conv_sjis_mobile_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, row, cell, pos, wc, extra = 0;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* ASCII */
			if (c == 0x1B && filter->from == &mbfl_encoding_sjis_sb) {
				/* Older SoftBank handsets introduced emoji with an escape sequence */
				filter->cache = c;
				filter->status = 2;
			} else {
				CK((*filter->output_function)(c, filter->data));
			}
		} else if (c > 0xA0 && c < 0xE0) { /* Single-byte kana */
			CK((*filter->output_function)(0xFEC0 + c, filter->data));
		} else if (c > 0x80 && c < 0xFD && c != 0xA0) { /* Lead byte of a double-byte character */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* Trail byte of a double-byte character */
		filter->status = 0;
		lead = filter->cache;

		if (c < 0x40 || c > 0xFC || c == 0x7F) {
			/* Not a legal trail byte */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		SJIS_DECODE(lead, c, row, cell);
		/* Linear (ku * 94) + ten index used by all the lookup tables below */
		pos = ((row - 0x21) * 94) + cell - 0x21;

		/* A handful of positions are mapped to fixed codepoints, bypassing the tables */
		switch (pos) {
		case 31:
			wc = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
			break;
		case 32:
			wc = 0xFF5E; /* FULLWIDTH TILDE */
			break;
		case 33:
			wc = 0x2225; /* PARALLEL TO */
			break;
		case 60:
			wc = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
			break;
		case 80:
			wc = 0xFFE0; /* FULLWIDTH CENT SIGN */
			break;
		case 81:
			wc = 0xFFE1; /* FULLWIDTH POUND SIGN */
			break;
		case 137:
			wc = 0xFFE2; /* FULLWIDTH NOT SIGN */
			break;
		default:
			wc = 0;
			break;
		}

		if (wc == 0) {
			if (pos >= cp932ext1_ucs_table_min && pos < cp932ext1_ucs_table_max) { /* vendor ext1 (13ku) */
				wc = cp932ext1_ucs_table[pos - cp932ext1_ucs_table_min];
			} else if (pos >= 0 && pos < jisx0208_ucs_table_size) { /* X 0208 */
				wc = jisx0208_ucs_table[pos];
			} else if (pos >= cp932ext2_ucs_table_min && pos < cp932ext2_ucs_table_max) { /* vendor ext2 (89ku - 92ku) */
				wc = cp932ext2_ucs_table[pos - cp932ext2_ucs_table_min];
			}

			/* Carrier-specific emoji; the result replaces whatever the tables above produced */
			if (filter->from == &mbfl_encoding_sjis_docomo && pos >= mb_tbl_code2uni_docomo1_min && pos <= mb_tbl_code2uni_docomo1_max) {
				wc = mbfilter_sjis_emoji_docomo2unicode(pos, &extra);
			} else if (filter->from == &mbfl_encoding_sjis_kddi && pos >= mb_tbl_code2uni_kddi1_min && pos <= mb_tbl_code2uni_kddi2_max) {
				wc = mbfilter_sjis_emoji_kddi2unicode(pos, &extra);
			} else if (filter->from == &mbfl_encoding_sjis_sb && pos >= mb_tbl_code2uni_sb1_min && pos <= mb_tbl_code2uni_sb3_max) {
				wc = mbfilter_sjis_emoji_sb2unicode(pos, &extra);
			}

			/* `extra` stays 0 unless an emoji lookup above produced a leading
			 * codepoint for a two-codepoint sequence; it is emitted first */
			if (extra > 0) {
				CK((*filter->output_function)(extra, filter->data));
			}

			if (wc == 0) {
				if (pos >= cp932ext3_ucs_table_min && pos < cp932ext3_ucs_table_max) { /* vendor ext3 (115ku - 119ku) */
					wc = cp932ext3_ucs_table[pos - cp932ext3_ucs_table_min];
				} else if (pos >= (94*94) && pos < (114*94)) { /* user area (95ku - 114ku) */
					wc = pos - (94*94) + 0xe000;
				}
			}
		}

		if (wc <= 0) {
			wc = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(wc, filter->data));
		break;

	/* ESC: Softbank Emoji */
	case 2:
		if (c != '$') {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			filter->status = filter->cache = 0;
		} else {
			filter->cache = c;
			filter->status++;
		}
		break;

	/* ESC $: Softbank Emoji */
	case 3:
		if ((c >= 'E' && c <= 'G') || (c >= 'O' && c <= 'Q')) {
			filter->cache = c;
			filter->status++;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			filter->status = filter->cache = 0;
		}
		break;

	/* ESC $ [GEFOPQ]: Softbank Emoji */
	case 4: {
		int base, last;

		if (c == 0xF) { /* Terminate sequence of emoji */
			filter->status = filter->cache = 0;
			return 0;
		}

		/* The cached letter selects the emoji page and the highest byte valid on it */
		switch (filter->cache) {
		case 'G':
			base = (0x91 - 0x21) * 94;
			last = 0x7A;
			break;
		case 'E':
			base = (0x8D - 0x21) * 94;
			last = 0x7A;
			break;
		case 'F':
			base = (0x8E - 0x21) * 94;
			last = 0x7A;
			break;
		case 'O':
			base = (0x92 - 0x21) * 94;
			last = 0x6D;
			break;
		case 'P':
			base = (0x95 - 0x21) * 94;
			last = 0x6C;
			break;
		case 'Q':
			base = (0x96 - 0x21) * 94;
			last = 0x5E;
			break;
		default:
			/* Unknown page letter: with last == 0 the range test below always rejects */
			base = 0;
			last = 0;
			break;
		}

		if (c < 0x21 || c > last) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			filter->status = filter->cache = 0;
			return 0;
		}

		wc = mbfilter_sjis_emoji_sb2unicode(base + c - 0x21, &extra);
		if (wc > 0) {
			if (extra > 0) {
				CK((*filter->output_function)(extra, filter->data));
			}
			CK((*filter->output_function)(wc, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			filter->status = filter->cache = 0;
		}
		break;
	}

	default:
		break;
	}

	return 0;
}
6269
/*
 * Streaming encoder shared by the DoCoMo, KDDI and SoftBank variants of
 * Shift-JIS: takes one Unicode codepoint `c` per call and writes the
 * corresponding byte(s) through filter->output_function. The target carrier
 * is identified by comparing filter->to with the three mobile encodings.
 *
 * The carrier emoji helpers may leave filter->status non-zero to signal that
 * `c` has been buffered pending the next codepoint; in that case nothing is
 * written by this call.
 *
 * Output calls are wrapped in CK() (defined elsewhere; presumably it returns
 * early when the callee reports failure). Otherwise returns 0.
 */
static int mbfl_filt_conv_wchar_sjis_mobile(int c, mbfl_convert_filter *filter)
{
	int idx, code = 0, from_user_area = 0, is_emoji = 0;

	/* Primary Unicode -> JIS lookup tables, plus the private use area */
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c < (0xE000 + 20*94)) {
		/* Private User Area (95ku - 114ku) */
		int offset = c - 0xE000;
		code = (((offset / 94) + 0x7F) << 8) | ((offset % 94) + 0x21);
		/* Marks the value as legitimate even though it is >= 0x8080 */
		from_user_area = 1;
	}

	if (code <= 0) {
		/* Codepoints which the tables do not cover but which have a fixed mapping */
		switch (c) {
		case 0xA5: /* YEN SIGN */
			code = 0x216F; /* FULLWIDTH YEN SIGN */
			break;
		case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
			code = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			code = 0x2142;
			break;
		case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
			code = 0x215D;
			break;
		case 0xFFE0: /* FULLWIDTH CENT SIGN */
			code = 0x2171;
			break;
		case 0xFFE1: /* FULLWIDTH POUND SIGN */
			code = 0x2172;
			break;
		case 0xFFE2: /* FULLWIDTH NOT SIGN */
			code = 0x224C;
			break;
		default:
			break;
		}
	}

	if (code <= 0 || (code >= 0x8080 && !from_user_area)) { /* not found or X 0212 */
		code = -1;

		/* Reverse search of the CP932 vendor ext1 table (13ku) */
		for (idx = 0; idx < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; idx++) {
			if (c == cp932ext1_ucs_table[idx]) {
				code = (((idx / 94) + 0x2D) << 8) + (idx % 94) + 0x21;
				break;
			}
		}

		if (code <= 0) {
			/* Reverse search of the CP932 vendor ext2 table (row byte starts at 0x79) */
			for (idx = 0; idx < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; idx++) {
				if (c == cp932ext2_ucs_table[idx]) {
					code = (((idx / 94) + 0x79) << 8) + (idx % 94) + 0x21;
					break;
				}
			}
		}

		/* NUL is always encodable as itself */
		if (c == 0) {
			code = 0;
		}
	}

	/* Give the carrier-specific emoji encoder a chance; only one of these can
	 * run since filter->to matches at most one encoding */
	if (filter->to == &mbfl_encoding_sjis_docomo) {
		is_emoji = mbfilter_unicode2sjis_emoji_docomo(c, &code, filter);
	} else if (filter->to == &mbfl_encoding_sjis_kddi) {
		is_emoji = mbfilter_unicode2sjis_emoji_kddi_sjis(c, &code, filter);
	} else if (filter->to == &mbfl_encoding_sjis_sb) {
		is_emoji = mbfilter_unicode2sjis_emoji_sb(c, &code, filter);
	}
	if (is_emoji) {
		/* Emoji helpers hand back a linear (ku * 94) + ten index; repack as two JIS bytes */
		code = (((code / 94) + 0x21) << 8) | ((code % 94) + 0x21);
	}

	/* The emoji helper buffered `c`; wait for the next codepoint */
	if (filter->status) {
		return 0;
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (code < 0x100) { /* Latin/Kana */
		CK((*filter->output_function)(code, filter->data));
	} else { /* Kanji */
		int row = (code >> 8) & 0xff;
		int cell = code & 0xff;
		int byte1, byte2;

		SJIS_ENCODE(row, cell, byte1, byte2);
		CK((*filter->output_function)(byte1, filter->data));
		CK((*filter->output_function)(byte2, filter->data));
	}

	return 0;
}
6361
/*
 * End-of-input handler: deals with a codepoint that was buffered in
 * filter->cache while waiting for a possible second half of an emoji
 * sequence, then forwards the flush to filter->flush_function if one is set.
 *
 *   status 1 - a buffered '#' or '0'-'9' is written out unchanged
 *   status 2 - a buffered lone Regional Indicator is reported as illegal
 *
 * Returns 0 (the keypad output goes through CK(), defined elsewhere).
 */
int mbfl_filt_conv_sjis_mobile_flush(mbfl_convert_filter *filter)
{
	int pending = filter->cache;

	switch (filter->status) {
	case 1:
		if (pending == '#' || (pending >= '0' && pending <= '9')) {
			filter->cache = filter->status = 0;
			CK((*filter->output_function)(pending, filter->data));
		}
		break;

	case 2:
		/* First of a pair of Regional Indicator codepoints came at the end of a string */
		filter->cache = filter->status = 0;
		mbfl_filt_conv_illegal_output(pending, filter);
		break;

	default:
		break;
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
6380
/*
 * Lead-byte half of the SJIS-Mobile decode lookup, indexed by the raw lead
 * byte (256 entries). The decoders add this value to sjis_decode_tbl2[trail]
 * to obtain a linear (ku * 94) + ten index. Each successive valid lead byte
 * advances by 188 (two rows of 94 cells).
 *
 * 0xFFFF marks lead bytes which can never start a double-byte character
 * (0x80, 0xA0, 0xFD-0xFF). Entries for 0x00-0x7F and 0xA1-0xDF are never
 * consulted by the decoders in this file, because ASCII and single-byte kana
 * are handled before the table lookup. Negative initializers wrap modulo
 * 65536 when stored as unsigned short.
 */
static const unsigned short sjis_mobile_decode_tbl1[] = {
	/* 0x00-0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10-0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20-0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30-0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x40-0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50-0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60-0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70-0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80-0x8F */ 0xFFFF, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632,
	/* 0x90-0x9F */ 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640,
	/* 0xA0-0xAF */ 0xFFFF, -6016, -5828, -5640, -5452, -5264, -5076, -4888, -4700, -4512, -4324, -4136, -3948, -3760, -3572, -3384,
	/* 0xB0-0xBF */ -3196, -3008, -2820, -2632, -2444, -2256, -2068, -1880, -1692, -1504, -1316, -1128, -940, -752, -564, -376,
	/* 0xC0-0xCF */ -188, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632,
	/* 0xD0-0xDF */ 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640,
	/* 0xE0-0xEF */ 5828, 6016, 6204, 6392, 6580, 6768, 6956, 7144, 7332, 7520, 7708, 7896, 8084, 8272, 8460, 8648,
	/* 0xF0-0xFF */ 8836, 9024, 9212, 9400, 9588, 9776, 9964, 10152, 10340, 10528, 10716, 10904, 11092, 0xFFFF, 0xFFFF, 0xFFFF
};
6384
mb_sjis_docomo_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)6385 static size_t mb_sjis_docomo_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
6386 {
6387 unsigned char *p = *in, *e = p + *in_len;
6388 /* Leave one extra space available in output buffer, since some iterations of
6389 * main loop (below) may emit two wchars */
6390 uint32_t *out = buf, *limit = buf + bufsize - 1;
6391
6392 while (p < e && out < limit) {
6393 unsigned char c = *p++;
6394
6395 if (c <= 0x7F) {
6396 *out++ = c;
6397 } else if (c >= 0xA1 && c <= 0xDF) {
6398 /* Kana */
6399 *out++ = 0xFEC0 + c;
6400 } else {
6401 /* Kanji */
6402 if (p == e) {
6403 *out++ = MBFL_BAD_INPUT;
6404 break;
6405 }
6406 unsigned char c2 = *p++;
6407 uint32_t w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
6408
6409 if (w <= 137) {
6410 if (w == 31) {
6411 *out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
6412 continue;
6413 } else if (w == 32) {
6414 *out++ = 0xFF5E; /* FULLWIDTH TILDE */
6415 continue;
6416 } else if (w == 33) {
6417 *out++ = 0x2225; /* PARALLEL TO */
6418 continue;
6419 } else if (w == 60) {
6420 *out++ = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
6421 continue;
6422 } else if (w == 80) {
6423 *out++ = 0xFFE0; /* FULLWIDTH CENT SIGN */
6424 continue;
6425 } else if (w == 81) {
6426 *out++ = 0xFFE1; /* FULLWIDTH POUND SIGN */
6427 continue;
6428 } else if (w == 137) {
6429 *out++ = 0xFFE2; /* FULLWIDTH NOT SIGN */
6430 continue;
6431 }
6432 }
6433
6434 if (w >= mb_tbl_code2uni_docomo1_min && w <= mb_tbl_code2uni_docomo1_max) {
6435 int snd = 0;
6436 w = mbfilter_sjis_emoji_docomo2unicode(w, &snd);
6437 if (snd) {
6438 *out++ = snd;
6439 }
6440 } else if (w >= cp932ext1_ucs_table_min && w < cp932ext1_ucs_table_max) {
6441 w = cp932ext1_ucs_table[w - cp932ext1_ucs_table_min];
6442 } else if (w < jisx0208_ucs_table_size) {
6443 w = jisx0208_ucs_table[w];
6444 } else if (w >= cp932ext2_ucs_table_min && w < cp932ext2_ucs_table_max) {
6445 w = cp932ext2_ucs_table[w - cp932ext2_ucs_table_min];
6446 } else if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
6447 w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
6448 } else if (w >= (94*94) && w < (114*94)) {
6449 w = w - (94*94) + 0xE000;
6450 } else {
6451 if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
6452 p--;
6453 }
6454 *out++ = MBFL_BAD_INPUT;
6455 continue;
6456 }
6457
6458 *out++ = w ? w : MBFL_BAD_INPUT;
6459 }
6460 }
6461
6462 *in_len = e - p;
6463 *in = p;
6464 return out - buf;
6465 }
6466
/*
 * Buffer-based encoder: Unicode codepoints -> SJIS-Mobile (DoCoMo) bytes.
 *
 * in/len  - codepoints to convert
 * buf     - output buffer; buf->state carries a keypad character ('#' or
 *           '0'-'9') over to the next call when it was the last codepoint of
 *           a non-final chunk
 * end     - true when this is the final chunk of input
 *
 * The MB_CONVERT_BUF_* / MB_CONVERT_ERROR macros are defined elsewhere;
 * presumably ENSURE guarantees the given number of free output bytes and
 * ERROR emits the replacement for an unconvertible codepoint -- confirm
 * against their definitions.
 */
static void mb_wchar_to_sjis_docomo(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* At least one byte per input codepoint, plus one for a carried-over codepoint */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len + (buf->state ? 1 : 0));

	uint32_t w;
	unsigned int s = 0;

	if (buf->state) {
		/* Continue what we were doing on the previous call */
		w = buf->state;
		buf->state = 0;
		/* Jumps into the loop body without consuming an entry of `in` */
		goto reprocess_wchar;
	}

	while (len--) {
		w = *in++;
reprocess_wchar:
		s = 0;

		/* Primary Unicode -> JIS lookup tables */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* Private User Area (95ku - 114ku) */
			s = w - 0xE000;
			s = (((s / 94) + 0x7F) << 8) | ((s % 94) + 0x21);
			goto process_emoji;
		}

		if (!s) {
			/* Codepoints which the tables do not cover but which have a fixed mapping */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* Nothing found, or the table value is outside what this encoding
		 * accepts (>= 0x8080): fall back to a reverse search of the CP932
		 * vendor extension tables */
		if (w && (!s || s >= 0x8080)) {
			s = 0;

			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}

			for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
				if (w == cp932ext2_ucs_table[i]) {
					s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}
		}

process_emoji:
		/* When converting SJIS-Mobile to Unicode, we convert keypad symbol emoji
		 * to a sequence of 2 codepoints, one of which is a combining character which
		 * adds the 'key' image around the other
		 *
		 * In the other direction, look for such sequences and convert them to a
		 * single emoji */
		if (w == '#' || (w >= '0' && w <= '9')) {
			if (!len) {
				if (end) {
					/* No following codepoint will ever arrive; emit the plain character */
					goto emit_output;
				} else {
					/* If we are at the end of the current buffer of codepoints, but another
					 * buffer is coming, then remember that we have to reprocess `w` */
					buf->state = w;
					break;
				}
			}
			/* Peek at the next codepoint */
			uint32_t w2 = *in++; len--;
			if (w2 == 0x20E3) {
				/* Keycap sequence: replace `s` with the DoCoMo keypad emoji index */
				if (w == '#') {
					s = 0x2964;
				} else if (w == '0') {
					s = 0x296F;
				} else { /* Previous character was '1'-'9' */
					s = 0x2966 + (w - '1');
				}
				/* Repack the linear (ku * 94) + ten index as two JIS bytes */
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			} else {
				/* Not a keycap sequence; un-read the peeked codepoint */
				in--; len++;
			}
		} else if (w == 0xA9) { /* Copyright sign */
			s = (((0x29B5 / 94) + 0x21) << 8) | ((0x29B5 % 94) + 0x21);
		} else if (w == 0xAE) { /* Registered sign */
			s = (((0x29BA / 94) + 0x21) << 8) | ((0x29BA % 94) + 0x21);
		} else if (w >= mb_tbl_uni_docomo2code2_min && w <= mb_tbl_uni_docomo2code2_max) {
			/* Emoji lookup tables; a hit overrides any value found above */
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_docomo2code2_key, mb_tbl_uni_docomo2code2_len);
			if (i >= 0) {
				s = mb_tbl_uni_docomo2code2_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_docomo2code3_min && w <= mb_tbl_uni_docomo2code3_max) {
			/* Keys of this table are stored with 0x10000 subtracted */
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_docomo2code3_key, mb_tbl_uni_docomo2code3_len);
			if (i >= 0) {
				s = mb_tbl_uni_docomo2code3_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_docomo2code5_min && w <= mb_tbl_uni_docomo2code5_max) {
			/* Keys of this table are stored with 0xF0000 subtracted */
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_docomo2code5_key, mb_tbl_uni_docomo2code5_len);
			if (i >= 0) {
				s = mb_tbl_uni_docomo2code5_val[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		}

emit_output:
		if (!s && w) {
			/* No mapping for a non-NUL codepoint */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_docomo);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			/* Single-byte output */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Double-byte output: `s` holds two JIS bytes, convert them to Shift-JIS */
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
6610
mb_sjis_kddi_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)6611 static size_t mb_sjis_kddi_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
6612 {
6613 unsigned char *p = *in, *e = p + *in_len;
6614 uint32_t *out = buf, *limit = buf + bufsize - 1;
6615
6616 while (p < e && out < limit) {
6617 unsigned char c = *p++;
6618
6619 if (c <= 0x7F) {
6620 *out++ = c;
6621 } else if (c >= 0xA1 && c <= 0xDF) {
6622 /* Kana */
6623 *out++ = 0xFEC0 + c;
6624 } else {
6625 /* Kanji */
6626 if (p == e) {
6627 *out++ = MBFL_BAD_INPUT;
6628 break;
6629 }
6630 unsigned char c2 = *p++;
6631 uint32_t w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
6632
6633 if (w <= 137) {
6634 if (w == 31) {
6635 *out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
6636 continue;
6637 } else if (w == 32) {
6638 *out++ = 0xFF5E; /* FULLWIDTH TILDE */
6639 continue;
6640 } else if (w == 33) {
6641 *out++ = 0x2225; /* PARALLEL TO */
6642 continue;
6643 } else if (w == 60) {
6644 *out++ = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
6645 continue;
6646 } else if (w == 80) {
6647 *out++ = 0xFFE0; /* FULLWIDTH CENT SIGN */
6648 continue;
6649 } else if (w == 81) {
6650 *out++ = 0xFFE1; /* FULLWIDTH POUND SIGN */
6651 continue;
6652 } else if (w == 137) {
6653 *out++ = 0xFFE2; /* FULLWIDTH NOT SIGN */
6654 continue;
6655 }
6656 }
6657
6658 if (w >= mb_tbl_code2uni_kddi1_min && w <= mb_tbl_code2uni_kddi2_max) {
6659 int snd = 0;
6660 w = mbfilter_sjis_emoji_kddi2unicode(w, &snd);
6661 if (!w) {
6662 w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
6663 if (w >= (94*94) && w < (114*94)) {
6664 w = w - (94*94) + 0xE000;
6665 }
6666 } else if (snd) {
6667 *out++ = snd;
6668 }
6669 } else if (w >= cp932ext1_ucs_table_min && w < cp932ext1_ucs_table_max) {
6670 w = cp932ext1_ucs_table[w - cp932ext1_ucs_table_min];
6671 } else if (w < jisx0208_ucs_table_size) {
6672 w = jisx0208_ucs_table[w];
6673 } else if (w >= cp932ext2_ucs_table_min && w < cp932ext2_ucs_table_max) {
6674 w = cp932ext2_ucs_table[w - cp932ext2_ucs_table_min];
6675 } else if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
6676 w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
6677 } else if (w >= (94*94) && w < (114*94)) {
6678 w = w - (94*94) + 0xE000;
6679 } else {
6680 if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
6681 p--;
6682 }
6683 *out++ = MBFL_BAD_INPUT;
6684 continue;
6685 }
6686
6687 *out++ = w ? w : MBFL_BAD_INPUT;
6688 }
6689 }
6690
6691 *in_len = e - p;
6692 *in = p;
6693 return out - buf;
6694 }
6695
/*
 * Buffer-based encoder: Unicode codepoints -> SJIS-Mobile (KDDI) bytes.
 *
 * in/len  - codepoints to convert
 * buf     - output buffer; buf->state carries a codepoint (a keypad character
 *           or the first Regional Indicator of a flag) over to the next call
 *           when it was the last codepoint of a non-final chunk
 * end     - true when this is the final chunk of input
 *
 * The MB_CONVERT_BUF_* / MB_CONVERT_ERROR macros are defined elsewhere;
 * presumably ENSURE guarantees the given number of free output bytes and
 * ERROR emits the replacement for an unconvertible codepoint -- confirm
 * against their definitions.
 */
static void mb_wchar_to_sjis_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* At least one byte per input codepoint, plus one for a carried-over codepoint */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len + (buf->state ? 1 : 0));

	uint32_t w;
	unsigned int s = 0;

	if (buf->state) {
		/* Resume with the codepoint saved by the previous call; this jumps into
		 * the loop body without consuming an entry of `in` */
		w = buf->state;
		buf->state = 0;
		goto reprocess_wchar;
	}

	while (len--) {
		w = *in++;
reprocess_wchar:
		s = 0;

		/* Primary Unicode -> JIS lookup tables */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* Private User Area (95ku - 114ku) */
			s = w - 0xE000;
			s = (((s / 94) + 0x7F) << 8) | ((s % 94) + 0x21);
			goto process_emoji;
		}

		if (!s) {
			/* Codepoints which the tables do not cover but which have a fixed mapping */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3c) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* Nothing found, or the table value is outside what this encoding
		 * accepts (>= 0x8080): fall back to a reverse search of the CP932
		 * vendor extension tables */
		if (w && (!s || s >= 0x8080)) {
			s = 0;

			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}

			for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
				if (w == cp932ext2_ucs_table[i]) {
					s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}
		}

process_emoji:
		/* Keypad emoji: '#' or a digit followed by U+20E3 becomes a single
		 * KDDI emoji; otherwise the character is emitted as found above */
		if (w == '#' || (w >= '0' && w <= '9')) {
			if (!len) {
				if (end) {
					/* No following codepoint will ever arrive; emit the plain character */
					goto emit_output;
				} else {
					/* If we are at the end of the current buffer of codepoints, but another
					 * buffer is coming, then remember that we have to reprocess `w` */
					buf->state = w;
					break;
				}
			}
			/* Peek at the next codepoint */
			uint32_t w2 = *in++; len--;
			if (w2 == 0x20E3) {
				if (w == '#') {
					s = 0x25BC;
				} else if (w == '0') {
					s = 0x2830;
				} else { /* Previous character was '1'-'9' */
					s = 0x27A6 + (w - '1');
				}
				/* Repack the linear (ku * 94) + ten index as two JIS bytes */
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			} else {
				/* Not a keycap sequence; un-read the peeked codepoint */
				in--; len++;
			}
		} else if (w >= NFLAGS('C') && w <= NFLAGS('U')) { /* C for CN, U for US */
			/* Possible first half of a national flag (pair of Regional Indicators) */
			if (!len) {
				if (end) {
					/* A lone Regional Indicator at the very end cannot be converted */
					MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi);
				} else {
					/* Reprocess `w` when this function is called again with another buffer
					 * of wchars */
					buf->state = w;
				}
				break;
			}
			/* Peek at the next codepoint */
			uint32_t w2 = *in++; len--;
			if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */
				/* Only the ten country codes listed in nflags_s have an emoji */
				for (int i = 0; i < 10; i++) {
					if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) {
						s = nflags_code_kddi[i];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto emit_output;
					}
				}
			}
			/* No flag matched: un-read the peeked codepoint and report `w` alone */
			in--; len++;
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		} else if (w == 0xA9) { /* Copyright sign */
			s = (((0x27DC / 94) + 0x21) << 8) | ((0x27DC % 94) + 0x21);
		} else if (w == 0xAE) { /* Registered sign */
			s = (((0x27DD / 94) + 0x21) << 8) | ((0x27DD % 94) + 0x21);
		} else if (w >= mb_tbl_uni_kddi2code2_min && w <= mb_tbl_uni_kddi2code2_max) {
			/* Emoji lookup tables; a hit overrides any value found above */
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
			if (i >= 0) {
				s = mb_tbl_uni_kddi2code2_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_kddi2code3_min && w <= mb_tbl_uni_kddi2code3_max) {
			/* Keys of this table are stored with 0x10000 subtracted */
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
			if (i >= 0) {
				s = mb_tbl_uni_kddi2code3_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_kddi2code5_min && w <= mb_tbl_uni_kddi2code5_max) {
			/* Keys of this table are stored with 0xF0000 subtracted */
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
			if (i >= 0) {
				s = mb_tbl_uni_kddi2code5_val[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		}

emit_output:
		if (!s && w) {
			/* No mapping for a non-NUL codepoint */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			/* Single-byte output */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Double-byte output: `s` holds two JIS bytes, convert them to Shift-JIS */
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
6857
/*
 * Buffer-based decoder: SJIS-Mobile (SoftBank) bytes -> Unicode codepoints.
 *
 * Reads from *in (at most *in_len bytes) and writes codepoints to `buf`
 * (capacity `bufsize`). On return *in and *in_len describe the unconsumed
 * remainder of the input; the return value is the number of codepoints
 * written.
 *
 * Besides ordinary double-byte emoji, SoftBank text may contain emoji runs
 * introduced by ESC $ <E|F|G|O|P|Q> and terminated by 0xF. While inside such
 * a run, *state holds the page letter so that decoding can resume in the
 * middle of a run on the next call; *state is 0 otherwise.
 */
static size_t mb_sjis_sb_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	unsigned char *p = *in, *e = p + *in_len;
	/* One slot is held in reserve because a single iteration may write two
	 * codepoints (an emoji made of a pair) */
	uint32_t *out = buf, *limit = buf + bufsize - 1;

	if (*state) {
		/* The previous call stopped in the middle of an escape-introduced emoji run */
		goto softbank_emoji_escapes;
	}

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c == 0x1B) {
			/* Escape sequence */
			/* Must be followed by '$' and at least one more byte */
			if (p == e || *p++ != '$' || p == e) {
				*out++ = MBFL_BAD_INPUT;
				continue;
			}
			unsigned char c2 = *p++;
			if ((c2 < 'E' || c2 > 'G') && (c2 < 'O' || c2 > 'Q')) {
				*out++ = MBFL_BAD_INPUT;
				continue;
			}
			/* Escape sequence was valid, next should be a series of specially
			 * encoded Softbank emoji */
			*state = c2;

softbank_emoji_escapes:
			while (p < e && out < limit) {
				c = *p++;
				if (c == 0xF) {
					/* End of the emoji run */
					*state = 0;
					break;
				}
				/* The page letter selects the base index and the highest byte valid on that page */
				unsigned int s = 0;
				if (*state == 'G' && c >= 0x21 && c <= 0x7A) {
					s = (0x91 - 0x21) * 94;
				} else if (*state == 'E' && c >= 0x21 && c <= 0x7A) {
					s = (0x8D - 0x21) * 94;
				} else if (*state == 'F' && c >= 0x21 && c <= 0x7A) {
					s = (0x8E - 0x21) * 94;
				} else if (*state == 'O' && c >= 0x21 && c <= 0x6D) {
					s = (0x92 - 0x21) * 94;
				} else if (*state == 'P' && c >= 0x21 && c <= 0x6C) {
					s = (0x95 - 0x21) * 94;
				} else if (*state == 'Q' && c >= 0x21 && c <= 0x5E) {
					s = (0x96 - 0x21) * 94;
				} else {
					/* Byte is out of range for this page; abandon the run */
					*out++ = MBFL_BAD_INPUT;
					*state = 0;
					break;
				}

				int snd = 0;
				uint32_t w = mbfilter_sjis_emoji_sb2unicode(s + c - 0x21, &snd);
				if (w) {
					/* `snd`, when set, is the first codepoint of a two-codepoint emoji */
					if (snd) {
						*out++ = snd;
					}
					*out++ = w;
				} else {
					/* No emoji at this position; abandon the run */
					*out++ = MBFL_BAD_INPUT;
					*state = 0;
					break;
				}
			}
		} else if (c <= 0x7F) {
			/* ASCII */
			*out++ = c;
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* Kana */
			*out++ = 0xFEC0 + c;
		} else {
			/* Kanji */
			if (p == e) {
				/* Input ends after the lead byte */
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			/* Linear (ku * 94) + ten index of the character */
			uint32_t w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];

			/* A handful of positions are mapped to fixed codepoints, bypassing the tables */
			if (w <= 137) {
				if (w == 31) {
					*out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
					continue;
				} else if (w == 32) {
					*out++ = 0xFF5E; /* FULLWIDTH TILDE */
					continue;
				} else if (w == 33) {
					*out++ = 0x2225; /* PARALLEL TO */
					continue;
				} else if (w == 60) {
					*out++ = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
					continue;
				} else if (w == 80) {
					*out++ = 0xFFE0; /* FULLWIDTH CENT SIGN */
					continue;
				} else if (w == 81) {
					*out++ = 0xFFE1; /* FULLWIDTH POUND SIGN */
					continue;
				} else if (w == 137) {
					*out++ = 0xFFE2; /* FULLWIDTH NOT SIGN */
					continue;
				}
			}

			if (w >= mb_tbl_code2uni_sb1_min && w <= mb_tbl_code2uni_sb3_max) {
				/* Possibly a SoftBank emoji */
				int snd = 0;
				w = mbfilter_sjis_emoji_sb2unicode(w, &snd);
				if (!w) {
					/* Not an emoji after all: recompute the raw index and try
					 * vendor ext3 and the user-defined area */
					w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
					if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
						w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
					} else if (w >= (94*94) && w < (114*94)) {
						w = w - (94*94) + 0xE000;
					}
				} else if (snd) {
					*out++ = snd;
				}
			} else if (w >= cp932ext1_ucs_table_min && w < cp932ext1_ucs_table_max) {
				w = cp932ext1_ucs_table[w - cp932ext1_ucs_table_min];
			} else if (w < jisx0208_ucs_table_size) {
				w = jisx0208_ucs_table[w];
			} else if (w >= cp932ext2_ucs_table_min && w < cp932ext2_ucs_table_max) {
				w = cp932ext2_ucs_table[w - cp932ext2_ucs_table_min];
			} else if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
				w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
			} else if (w >= (94*94) && w < (114*94)) {
				/* User-defined area maps onto the Unicode private use area */
				w = w - (94*94) + 0xE000;
			} else {
				/* The lead byte itself was invalid, so the following byte was not
				 * part of this character; push it back to be decoded on its own */
				if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
					p--;
				}
				*out++ = MBFL_BAD_INPUT;
				continue;
			}

			/* A zero result means the position is unassigned */
			*out++ = w ? w : MBFL_BAD_INPUT;
		}
	}

	*in_len = e - p;
	*in = p;
	return out - buf;
}
7002
/* Encode a buffer of Unicode codepoints as the SoftBank variant of mobile Shift-JIS.
 *
 * in, len: the codepoints to convert
 * buf:     output buffer; `buf->state` is used to hold back one codepoint between
 *          calls when it may be the first half of a two-codepoint emoji sequence
 * end:     non-zero if no further input will follow this call
 *
 * Codepoints for which no mapping is found are reported via MB_CONVERT_ERROR. */
static void mb_wchar_to_sjis_sb(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Ask for one extra slot if a codepoint was held back by the previous call */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len + (buf->state ? 1 : 0));

	uint32_t w;
	unsigned int s = 0;

	if (buf->state) {
		/* Resume with the held-back codepoint; it goes through the same steps
		 * as a codepoint freshly read from `in` */
		w = buf->state;
		buf->state = 0;
		goto reprocess_wchar;
	}

	while (len--) {
		w = *in++;
reprocess_wchar:
		s = 0;

		/* Look `w` up in whichever Unicode -> JIS table covers its range */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* Private User Area (95ku - 114ku) */
			s = w - 0xE000;
			s = (((s / 94) + 0x7F) << 8) | ((s % 94) + 0x21);
			goto process_emoji;
		}

		/* Codepoints which the tables left unmapped but which have a fixed code */
		if (!s) {
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* Nothing found, or a table value >= 0x8080 (which is discarded here):
		 * fall back to a linear search of the CP932 vendor extension tables */
		if (w && (!s || s >= 0x8080)) {
			s = 0;

			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}

			for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
				if (w == cp932ext2_ucs_table[i]) {
					s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}
		}

process_emoji:
		/* Emoji handling. A keypad character ('#', '0'-'9') followed by U+20E3, or
		 * a pair of Regional Indicator codepoints, is folded into a single code.
		 * Emoji table values are (ku*94)+ten style indices, which the repeated
		 * "/ 94 ... % 94" expression turns into a two-byte JIS code. */
		if (w == '#' || (w >= '0' && w <= '9')) {
			if (!len) {
				if (end) {
					/* No following codepoint can arrive; emit the plain character */
					goto emit_output;
				} else {
					/* If we are at the end of the current buffer of codepoints, but another
					 * buffer is coming, then remember that we have to reprocess `w` */
					buf->state = w;
					break;
				}
			}
			uint32_t w2 = *in++; len--;
			if (w2 == 0x20E3) {
				if (w == '#') {
					s = 0x2817;
				} else if (w == '0') {
					s = 0x282c;
				} else { /* Previous character was '1'-'9' */
					s = 0x2823 + (w - '1');
				}
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			} else {
				/* Not a keypad sequence; un-read `w2` and emit `w` on its own */
				in--; len++;
			}
		} else if (w >= NFLAGS('C') && w <= NFLAGS('U')) { /* C for CN, U for US */
			if (!len) {
				if (end) {
					/* A lone Regional Indicator at the very end of input is an error */
					MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb);
				} else {
					/* Reprocess `w` when this function is called again with
					 * another buffer of wchars */
					buf->state = w;
				}
				break;
			}
			uint32_t w2 = *in++; len--;
			if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */
				/* nflags_s[i] and nflags_code_sb[i] are parallel: country code -> emoji index */
				for (int i = 0; i < 10; i++) {
					if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) {
						s = nflags_code_sb[i];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto emit_output;
					}
				}
			}
			/* Not one of the 10 known flags: un-read `w2` and report `w` as an error */
			in--; len++;
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		} else if (w == 0xA9) { /* Copyright sign */
			s = (((0x2855 / 94) + 0x21) << 8) | ((0x2855 % 94) + 0x21);
		} else if (w == 0xAE) { /* Registered sign */
			s = (((0x2856 / 94) + 0x21) << 8) | ((0x2856 % 94) + 0x21);
		} else if (w >= mb_tbl_uni_sb2code2_min && w <= mb_tbl_uni_sb2code2_max) {
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_sb2code2_key, mb_tbl_uni_sb2code2_len);
			if (i >= 0) {
				s = mb_tbl_uni_sb2code2_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_sb2code3_min && w <= mb_tbl_uni_sb2code3_max) {
			/* Keys of this table are stored with 0x10000 subtracted */
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_sb2code3_key, mb_tbl_uni_sb2code3_len);
			if (i >= 0) {
				s = mb_tbl_uni_sb2code3_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_sb2code5_min && w <= mb_tbl_uni_sb2code5_max) {
			/* Keys of this table are stored with 0xF0000 subtracted */
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_sb2code5_key, mb_tbl_uni_sb2code5_len);
			if (i >= 0) {
				s = mb_tbl_uni_sb2code5_val[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		}

emit_output:
		if (!s && w) {
			/* No mapping was found (U+0000 itself legitimately encodes as 0) */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			/* Single-byte character */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Two-byte character: split the JIS code and apply the Shift-JIS transform */
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7164
mb_sjis2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)7165 static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
7166 {
7167 unsigned char *p = *in, *e = p + *in_len;
7168 uint32_t *out = buf, *limit = buf + bufsize - 1;
7169
7170 while (p < e && out < limit) {
7171 unsigned char c = *p++;
7172
7173 if (c <= 0x7F) {
7174 if (c == 0x5C) {
7175 *out++ = 0xA5;
7176 } else if (c == 0x7E) {
7177 *out++ = 0x203E;
7178 } else {
7179 *out++ = c;
7180 }
7181 } else if (c >= 0xA1 && c <= 0xDF) {
7182 *out++ = 0xFEC0 + c;
7183 } else {
7184 if (p == e) {
7185 *out++ = MBFL_BAD_INPUT;
7186 break;
7187 }
7188 unsigned char c2 = *p++;
7189 uint32_t w1 = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
7190
7191 /* Conversion for combining characters */
7192 if (w1 >= 0x0170 && w1 <= 0x03F1) {
7193 int k = mbfl_bisec_srch2(w1, jisx0213_u2_key_b, jisx0213_u2_tbl_len);
7194 if (k >= 0) {
7195 *out++ = jisx0213_u2_tbl[2*k];
7196 *out++ = jisx0213_u2_tbl[2*k+1];
7197 continue;
7198 }
7199 }
7200
7201 /* Conversion for BMP */
7202 if (w1 < jisx0213_ucs_table_size) {
7203 uint32_t w = jisx0213_ucs_table[w1];
7204 if (w) {
7205 *out++ = w;
7206 continue;
7207 }
7208 }
7209
7210 /* Conversion for CJK Unified Ideographs extension B (U+2XXXX) */
7211 int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
7212 if (k >= 0) {
7213 *out++ = jisx0213_jis_u5_tbl[k] + 0x20000;
7214 } else {
7215 if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
7216 p--;
7217 }
7218 *out++ = MBFL_BAD_INPUT;
7219 }
7220 }
7221 }
7222
7223 *in_len = e - p;
7224 *in = p;
7225 return out - buf;
7226 }
7227
/* Encode a buffer of Unicode codepoints as Shift_JIS-2004.
 *
 * in, len: the codepoints to convert
 * buf:     output buffer; `buf->state` holds back one codepoint between calls
 *          when it may be the base of a base + combining mark pair
 * end:     non-zero if no further input will follow this call
 *
 * Codepoints for which no mapping is found are reported via MB_CONVERT_ERROR. */
static void mb_wchar_to_sjis2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;
	if (buf->state) {
		/* Resume with the codepoint which the previous call held back */
		w = buf->state;
		buf->state = 0;
		goto process_codepoint;
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		unsigned int s = 0;

		/* Codepoints which may start a two-codepoint sequence.
		 * jisx0213_u2_tbl holds (first, second) codepoint pairs; jisx0213_u2_key[k]
		 * is the code for pair k and jisx0213_u2_fb_tbl[k] is used when the
		 * second codepoint does not match. */
		if (w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7) {
			for (int k = 0; k < jisx0213_u2_tbl_len; k++) {
				if (w == jisx0213_u2_tbl[2*k]) {
					if (!len) {
						if (!end) {
							/* The next codepoint has not arrived yet; hold `w`
							 * back until the following call */
							buf->state = w;
							MB_CONVERT_BUF_STORE(buf, out, limit);
							return;
						}
						/* At the true end of input, fall through to the fallback */
					} else {
						uint32_t w2 = *in++; len--;
						/* NOTE(review): presumably these four first codepoints have two
						 * consecutive table entries and the second one is the pairing
						 * with U+0301 -- confirm against jisx0213_u2_tbl */
						if ((w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A) && w2 == 0x301) {
							k++;
						}
						if (w2 == jisx0213_u2_tbl[2*k+1]) {
							s = jisx0213_u2_key[k];
							break;
						}
						/* Not a recognized pair; un-read `w2` */
						in--; len++;
					}

					/* Fallback */
					s = jisx0213_u2_fb_tbl[k];
					break;
				}
			}
		}

		/* Check for major Japanese chars: U+4E00-U+9FFF */
		if (!s) {
			for (int k = 0; k < uni2jis_tbl_len; k++) {
				if (w >= uni2jis_tbl_range[k][0] && w <= uni2jis_tbl_range[k][1]) {
					s = uni2jis_tbl[k][w - uni2jis_tbl_range[k][0]];
					break;
				}
			}
		}

		/* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */
		if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) {
			int k = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (k >= 0) {
				/* Range k starts at ucs_c1_jisx0213_tbl[2*k]; add the offset within the range */
				s = ucs_c1_jisx0213_ofst[k] + w - ucs_c1_jisx0213_tbl[2*k];
			}
		}

		/* Check for Japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
		if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) {
			int k = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				s = jisx0213_u5_jis_tbl[k];
			}
		}

		if (!s) {
			/* CJK Compatibility Forms: U+FE30-U+FE4F */
			if (w == 0xFE45) {
				s = 0x233E;
			} else if (w == 0xFE46) {
				s = 0x233D;
			} else if (w >= 0xF91D && w <= 0xF9DC) {
				/* CJK Compatibility Ideographs: U+F900-U+F92A */
				int k = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (k >= 0) {
					s = ucs_r2b_jisx0213_cmap_val[k];
				}
			}
		}

		if (!s && w) {
			/* No mapping was found (U+0000 itself legitimately encodes as 0) */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			/* Single-byte character */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Two-byte character: split the code and apply the Shift-JIS transform */
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7330
/* Byte-at-a-time decoder, CP932 -> Unicode.
 *
 * `c` is the next input byte. `filter->status` is 0 when a new character is
 * expected and 1 when the first byte of a two-byte character is waiting in
 * `filter->cache`. Decoded codepoints (or MBFL_BAD_INPUT) are handed to
 * `filter->output_function`. Returns 0, unless CK() bails out early. */
static int mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, row, col, idx, ucs;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* Latin: passed through unchanged */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xa0 && c < 0xe0) {
			/* Single-byte kana */
			CK((*filter->output_function)(0xfec0 + c, filter->data));
		} else if (c > 0x80 && c < 0xfd && c != 0xa0) {
			/* First byte of a two-byte character: wait for the second one */
			filter->status = 1;
			filter->cache = c;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1:
		/* Second byte of a two-byte character */
		filter->status = 0;
		lead = filter->cache;

		if (c < 0x40 || c > 0xfc || c == 0x7f) {
			/* Not a valid second byte */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		SJIS_DECODE(lead, c, row, col);
		idx = (row - 0x21)*94 + col - 0x21;

		/* A handful of positions are mapped differently from the JIS X 0208 table */
		switch (idx) {
		case 31:
			ucs = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
			break;
		case 32:
			ucs = 0xff5e; /* FULLWIDTH TILDE */
			break;
		case 33:
			ucs = 0x2225; /* PARALLEL TO */
			break;
		case 60:
			ucs = 0xff0d; /* FULLWIDTH HYPHEN-MINUS */
			break;
		case 80:
			ucs = 0xffe0; /* FULLWIDTH CENT SIGN */
			break;
		case 81:
			ucs = 0xffe1; /* FULLWIDTH POUND SIGN */
			break;
		case 137:
			ucs = 0xffe2; /* FULLWIDTH NOT SIGN */
			break;
		default:
			ucs = 0;
			break;
		}

		if (ucs == 0) {
			if (idx >= cp932ext1_ucs_table_min && idx < cp932ext1_ucs_table_max) {
				/* vendor ext1 (13ku) */
				ucs = cp932ext1_ucs_table[idx - cp932ext1_ucs_table_min];
			} else if (idx >= 0 && idx < jisx0208_ucs_table_size) {
				/* X 0208 */
				ucs = jisx0208_ucs_table[idx];
			} else if (idx >= cp932ext2_ucs_table_min && idx < cp932ext2_ucs_table_max) {
				/* vendor ext2 (89ku - 92ku) */
				ucs = cp932ext2_ucs_table[idx - cp932ext2_ucs_table_min];
			} else if (idx >= cp932ext3_ucs_table_min && idx < cp932ext3_ucs_table_max) {
				/* vendor ext3 (115ku - 119ku) */
				ucs = cp932ext3_ucs_table[idx - cp932ext3_ucs_table_min];
			} else if (idx >= (94*94) && idx < (114*94)) {
				/* user (95ku - 114ku): mapped linearly from U+E000 */
				ucs = idx - (94*94) + 0xe000;
			}
		}

		if (ucs <= 0) {
			ucs = MBFL_BAD_INPUT;
		}

		CK((*filter->output_function)(ucs, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
7402
/* Flush handler for the CP932 -> Unicode filter.
 *
 * If the input ended while the filter was still waiting for the second byte of
 * a character (non-zero status), MBFL_BAD_INPUT is emitted and the status is
 * reset. The downstream flush function, if one is set, is then invoked.
 * Always returns 0. */
static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* Truncated two-byte character */
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
7416
/* Codepoint-at-a-time encoder, Unicode -> CP932.
 *
 * `c` is the codepoint to encode. Output bytes are handed one at a time to
 * `filter->output_function`; codepoints with no mapping go to
 * mbfl_filt_conv_illegal_output. Returns 0, unless CK() bails out early. */
static int mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter)
{
	int code = 0;          /* <= 0 while nothing has been found */
	int in_user_area = 0;  /* set when `code` came from the U+E000 user area */

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) {
		code = 0x7E;
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xe000 && c < (0xe000 + 20*94)) {
		/* user (95ku - 114ku) */
		int ofs = c - 0xe000;
		code = ((ofs/94 + 0x7f) << 8) | (ofs%94 + 0x21);
		in_user_area = 1;
	}

	if (code <= 0) {
		/* Codepoints the tables left unmapped but which have a fixed code */
		switch (c) {
		case 0xa5: /* YEN SIGN */
			code = 0x5C;
			break;
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			code = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			code = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			code = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			code = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			code = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			code = 0x224c;
			break;
		default:
			break;
		}
	}

	if (code <= 0 || (code >= 0x8080 && in_user_area == 0)) {
		/* not found or X 0212: search the vendor extension tables linearly */
		int i, count;

		code = -1;

		/* CP932 vendor ext1 (13ku) */
		count = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
		for (i = 0; i < count; i++) {
			if (c == cp932ext1_ucs_table[i]) {
				code = ((i/94 + 0x2d) << 8) + (i%94 + 0x21);
				break;
			}
		}

		if (code <= 0) {
			/* CP932 vendor ext3 (115ku - 119ku) */
			count = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
			for (i = 0; i < count; i++) {
				if (c == cp932ext3_ucs_table[i]) {
					code = ((i/94 + 0x93) << 8) + (i%94 + 0x21);
					break;
				}
			}
		}

		if (c == 0) {
			/* U+0000 always encodes as a zero byte */
			code = 0;
		} else if (code <= 0) {
			code = -1;
		}
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (code < 0x100) {
		/* latin or kana */
		CK((*filter->output_function)(code, filter->data));
	} else {
		/* kanji: split the code and apply the Shift-JIS transform */
		int hi = (code >> 8) & 0xff;
		int lo = code & 0xff;
		int b1, b2;
		SJIS_ENCODE(hi, lo, b1, b2);
		CK((*filter->output_function)(b1, filter->data));
		CK((*filter->output_function)(b2, filter->data));
	}

	return 0;
}
7501
/* Codepoint-at-a-time encoder, Unicode -> "SJIS-win".
 *
 * Identical to the CP932 encoder except for two codepoints, which are emitted
 * here as fixed two-byte sequences: U+00A5 -> 0x81 0x8F, U+203E -> 0x81 0x50.
 * Everything else is delegated to mbfl_filt_conv_wchar_cp932. */
static int mbfl_filt_conv_wchar_sjiswin(int c, mbfl_convert_filter *filter)
{
	switch (c) {
	case 0xA5:
		CK((*filter->output_function)(0x81, filter->data));
		CK((*filter->output_function)(0x8F, filter->data));
		return 0;

	case 0x203E:
		CK((*filter->output_function)(0x81, filter->data));
		CK((*filter->output_function)(0x50, filter->data));
		return 0;

	default:
		return mbfl_filt_conv_wchar_cp932(c, filter);
	}
}
7515
mb_cp932_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)7516 static size_t mb_cp932_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
7517 {
7518 unsigned char *p = *in, *e = p + *in_len;
7519 uint32_t *out = buf, *limit = buf + bufsize;
7520
7521 while (p < e && out < limit) {
7522 unsigned char c = *p++;
7523
7524 if (c < 0x80) {
7525 *out++ = c;
7526 } else if (c > 0xA0 && c < 0xE0) {
7527 /* Kana */
7528 *out++ = 0xFEC0 + c;
7529 } else {
7530 if (p == e) {
7531 *out++ = MBFL_BAD_INPUT;
7532 break;
7533 }
7534 unsigned char c2 = *p++;
7535 unsigned int w = 0;
7536 unsigned int s = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
7537
7538 if (s <= 137) {
7539 if (s == 31) {
7540 w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
7541 } else if (s == 32) {
7542 w = 0xFF5E; /* FULLWIDTH TILDE */
7543 } else if (s == 33) {
7544 w = 0x2225; /* PARALLEL TO */
7545 } else if (s == 60) {
7546 w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
7547 } else if (s == 80) {
7548 w = 0xFFE0; /* FULLWIDTH CENT SIGN */
7549 } else if (s == 81) {
7550 w = 0xFFE1; /* FULLWIDTH POUND SIGN */
7551 } else if (s == 137) {
7552 w = 0xFFE2; /* FULLWIDTH NOT SIGN */
7553 }
7554 }
7555
7556 if (w == 0) {
7557 if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
7558 w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
7559 } else if (s < jisx0208_ucs_table_size) {
7560 w = jisx0208_ucs_table[s];
7561 } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
7562 w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
7563 } else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
7564 w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
7565 } else if (s >= (94*94) && s < (114*94)) {
7566 w = s - (94*94) + 0xE000;
7567 }
7568 }
7569
7570 if (!w) {
7571 if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
7572 p--;
7573 }
7574 w = MBFL_BAD_INPUT;
7575 }
7576 *out++ = w;
7577 }
7578 }
7579
7580 *in_len = e - p;
7581 *in = p;
7582 return out - buf;
7583 }
7584
/* Encode a buffer of Unicode codepoints as CP932.
 *
 * in, len: the codepoints to convert
 * buf:     output buffer
 * end:     not used by this encoder
 *
 * Each codepoint produces at most two bytes. Codepoints for which no mapping
 * is found are reported via MB_CONVERT_ERROR. */
static void mb_wchar_to_cp932(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int code = 0;      /* 0 while nothing has been found */
		unsigned int user_area = 0; /* set when `code` came from the U+E000 user area */

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			code = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w == 0x203E) {
			code = 0x7E;
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			code = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			code = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			code = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			unsigned int ofs = w - 0xE000;
			code = ((ofs/94 + 0x7F) << 8) | (ofs%94 + 0x21);
			user_area = 1;
		}

		/* These codepoints take a fixed code, overriding any table result */
		switch (w) {
		case 0xA5: /* YEN SIGN */
			code = 0x5C;
			break;
		case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
			code = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			code = 0x2142;
			break;
		case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
			code = 0x215D;
			break;
		case 0xFFE0: /* FULLWIDTH CENT SIGN */
			code = 0x2171;
			break;
		case 0xFFE1: /* FULLWIDTH POUND SIGN */
			code = 0x2172;
			break;
		case 0xFFE2: /* FULLWIDTH NOT SIGN */
			code = 0x224C;
			break;
		case 0:
			/* U+0000 always encodes as a zero byte */
			out = mb_convert_buf_add(out, 0);
			continue;
		default:
			break;
		}

		if (!code || (code >= 0x8080 && !user_area)) {
			/* not found or X 0212: search the vendor extension tables linearly */
			bool found = false;

			for (unsigned int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (cp932ext1_ucs_table[i] == w) {
					code = ((i/94 + 0x2D) << 8) + (i%94 + 0x21);
					found = true;
					break;
				}
			}

			if (!found) {
				for (unsigned int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
					if (cp932ext3_ucs_table[i] == w) {
						code = ((i/94 + 0x93) << 8) + (i%94 + 0x21);
						found = true;
						break;
					}
				}
			}

			if (!found) {
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp932);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
		}

		if (code < 0x100) {
			/* Single-byte character */
			out = mb_convert_buf_add(out, code);
		} else {
			/* Two-byte character: split the code and apply the Shift-JIS transform */
			unsigned int hi = (code >> 8) & 0xFF;
			unsigned int lo = code & 0xFF;
			unsigned int b1, b2;
			SJIS_ENCODE(hi, lo, b1, b2);
			out = mb_convert_buf_add2(out, b1, b2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7665
/* Encode a buffer of Unicode codepoints as "SJIS-win".
 *
 * in, len: the codepoints to convert
 * buf:     output buffer
 * end:     not used by this encoder
 *
 * Same structure as the CP932 encoder, but U+00A5 is encoded as the two-byte
 * code 0x216F rather than 0x5C, and there is no special case sending U+203E to
 * 0x7E (NOTE(review): U+203E is therefore taken from ucs_a2_jis_table, assuming
 * it falls in that table's range; confirm the table yields the intended
 * JIS X 0208 code).
 *
 * Codepoints for which no mapping is found are reported via MB_CONVERT_ERROR.
 * This function itself is passed as the callback, as the other encoders in
 * this file do, presumably so that any substitute output is encoded with the
 * SJIS-win mappings rather than the CP932 ones. */
static void mb_wchar_to_sjiswin(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		/* s1: code found so far (0 = none); s2: set when s1 came from the user area */
		unsigned int s1 = 0, s2 = 0, c1, c2;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s1 = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s1 = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s1 = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s1 = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* User-defined area, mapped linearly from U+E000 */
			s1 = w - 0xE000;
			c1 = s1/94 + 0x7F;
			c2 = s1%94 + 0x21;
			s1 = (c1 << 8) | c2;
			s2 = 1;
		}

		/* These codepoints take a fixed code, overriding any table result */
		if (w == 0xA5) { /* YEN SIGN */
			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
		} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
		} else if (w == 0x2225) { /* PARALLEL TO */
			s1 = 0x2142;
		} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215D;
		} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
		} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
		} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
			s1 = 0x224C;
		} else if (w == 0) {
			/* U+0000 always encodes as a zero byte */
			out = mb_convert_buf_add(out, 0);
			continue;
		}

		if (!s1 || (s1 >= 0x8080 && !s2)) { /* not found or X 0212 */
			/* Search the vendor extension tables linearly */
			for (unsigned int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (cp932ext1_ucs_table[i] == w) {
					s1 = ((i/94 + 0x2D) << 8) + (i%94 + 0x21);
					goto emit_output;
				}
			}

			for (unsigned int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
				if (cp932ext3_ucs_table[i] == w) {
					s1 = ((i/94 + 0x93) << 8) + (i%94 + 0x21);
					goto emit_output;
				}
			}

			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjiswin);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			continue;
		}

emit_output:
		if (s1 < 0x100) {
			/* Single-byte character */
			out = mb_convert_buf_add(out, s1);
		} else {
			/* Two-byte character: split the code and apply the Shift-JIS transform */
			c1 = (s1 >> 8) & 0xFF;
			c2 = s1 & 0xFF;
			SJIS_ENCODE(c1, c2, s1, s2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7744
/* 256-entry table, one entry per byte value (16 per row, so row N covers 0xN0-0xNF).
 * Entries are 2 for the byte ranges named in the trailing comment and 1 for
 * all other bytes. Installed in mbfl_encoding_sjis. Presumably the length in
 * bytes of a character starting with that byte -- confirm against the users of
 * the table. */
static const unsigned char mblen_table_sjis[] = { /* 0x81-0x9F,0xE0-0xEF */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
7763
/* 256-entry table, one entry per byte value (16 per row, so row N covers 0xN0-0xNF).
 * Same layout as mblen_table_sjis, except that the upper range of 2-valued
 * entries stops at 0xED. Installed in mbfl_encoding_sjis_mac. */
static const unsigned char mblen_table_sjismac[] = { /* 0x81-0x9F,0xE0-0xED */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
7782
/* 256-entry table, one entry per byte value (16 per row, so row N covers 0xN0-0xNF).
 * Same layout as mblen_table_sjis, except that the upper range of 2-valued
 * entries extends to 0xFC. Shared by the three mobile encodings (DoCoMo, KDDI,
 * SoftBank) and by SJIS-2004. */
static const unsigned char mblen_table_sjis_mobile[] = { /* 0x81-0x9F,0xE0-0xFC */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
7801
/* NULL-terminated list of alias names for the "SJIS" encoding */
static const char *mbfl_encoding_sjis_aliases[] = {"x-sjis", "SHIFT-JIS", NULL};

/* Filter vtable for SJIS -> wchar: installs mbfl_filt_conv_sjis_wchar together
 * with its dedicated flush handler.
 * NOTE(review): fields are positional; see struct mbfl_convert_vtbl for their meaning. */
static const struct mbfl_convert_vtbl vtbl_sjis_wchar = {
	mbfl_no_encoding_sjis,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL
};

/* Filter vtable for wchar -> SJIS: installs mbfl_filt_conv_wchar_sjis with the
 * common flush handler */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis,
	mbfl_filt_conv_common_flush,
	NULL
};

/* Encoding descriptor for "SJIS": ties together the name strings, the alias
 * list, the byte-length table, both filter vtables, and the buffer-based
 * converters mb_sjis_to_wchar / mb_wchar_to_sjis */
const mbfl_encoding mbfl_encoding_sjis = {
	mbfl_no_encoding_sjis,
	"SJIS",
	"Shift_JIS",
	mbfl_encoding_sjis_aliases,
	mblen_table_sjis,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_wchar,
	&vtbl_wchar_sjis,
	mb_sjis_to_wchar,
	mb_wchar_to_sjis,
	NULL
};
7837
/* NULL-terminated list of alias names for the "SJIS-mac" encoding */
static const char *mbfl_encoding_sjis_mac_aliases[] = {"MacJapanese", "x-Mac-Japanese", NULL};

/* Filter vtable for SJIS-mac -> wchar: installs mbfl_filt_conv_sjis_mac_wchar;
 * the flush handler is the one also used by plain SJIS */
static const struct mbfl_convert_vtbl vtbl_sjis_mac_wchar = {
	mbfl_no_encoding_sjis_mac,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mac_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Filter vtable for wchar -> SJIS-mac: installs mbfl_filt_conv_wchar_sjis_mac
 * with its own flush handler */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_mac = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_mac,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mac,
	mbfl_filt_conv_wchar_sjis_mac_flush,
	NULL,
};

/* Encoding descriptor for "SJIS-mac": ties together the name strings, the
 * alias list, the byte-length table, both filter vtables, and the buffer-based
 * converters mb_sjismac_to_wchar / mb_wchar_to_sjismac */
const mbfl_encoding mbfl_encoding_sjis_mac = {
	mbfl_no_encoding_sjis_mac,
	"SJIS-mac",
	"Shift_JIS",
	mbfl_encoding_sjis_mac_aliases,
	mblen_table_sjismac,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_mac_wchar,
	&vtbl_wchar_sjis_mac,
	mb_sjismac_to_wchar,
	mb_wchar_to_sjismac,
	NULL
};
7873
/* NULL-terminated lists of alias names for the three mobile Shift-JIS variants */
static const char *mbfl_encoding_sjis_docomo_aliases[] = {"SJIS-DOCOMO", "shift_jis-imode", "x-sjis-emoji-docomo", NULL};
static const char *mbfl_encoding_sjis_kddi_aliases[] = {"SJIS-KDDI", "shift_jis-kddi", "x-sjis-emoji-kddi", NULL};
static const char *mbfl_encoding_sjis_sb_aliases[] = {"SJIS-SOFTBANK", "shift_jis-softbank", "x-sjis-emoji-softbank", NULL};

/* Filter vtable for SJIS-Mobile#DOCOMO -> wchar. The filter function
 * (mbfl_filt_conv_sjis_mobile_wchar) is shared by all three mobile variants;
 * only the source encoding ID differs. */
static const struct mbfl_convert_vtbl vtbl_sjis_docomo_wchar = {
	mbfl_no_encoding_sjis_docomo,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mobile_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Filter vtable for wchar -> SJIS-Mobile#DOCOMO; the filter and flush
 * functions are shared by all three mobile variants */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_docomo = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_docomo,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};

/* Encoding descriptor for "SJIS-Mobile#DOCOMO": ties together the name
 * strings, the alias list, the shared mobile byte-length table, both filter
 * vtables, and the DoCoMo-specific buffer-based converters */
const mbfl_encoding mbfl_encoding_sjis_docomo = {
	mbfl_no_encoding_sjis_docomo,
	"SJIS-Mobile#DOCOMO",
	"Shift_JIS",
	mbfl_encoding_sjis_docomo_aliases,
	mblen_table_sjis_mobile,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_docomo_wchar,
	&vtbl_wchar_sjis_docomo,
	mb_sjis_docomo_to_wchar,
	mb_wchar_to_sjis_docomo,
	NULL
};
7911
/* Filter vtable for SJIS-Mobile#KDDI -> wchar. The filter function
 * (mbfl_filt_conv_sjis_mobile_wchar) is shared by all three mobile variants;
 * only the source encoding ID differs. */
static const struct mbfl_convert_vtbl vtbl_sjis_kddi_wchar = {
	mbfl_no_encoding_sjis_kddi,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mobile_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Filter vtable for wchar -> SJIS-Mobile#KDDI; the filter and flush
 * functions are shared by all three mobile variants */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_kddi = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_kddi,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};

/* Encoding descriptor for "SJIS-Mobile#KDDI": ties together the name strings,
 * the alias list, the shared mobile byte-length table, both filter vtables,
 * and the KDDI-specific buffer-based converters */
const mbfl_encoding mbfl_encoding_sjis_kddi = {
	mbfl_no_encoding_sjis_kddi,
	"SJIS-Mobile#KDDI",
	"Shift_JIS",
	mbfl_encoding_sjis_kddi_aliases,
	mblen_table_sjis_mobile,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_kddi_wchar,
	&vtbl_wchar_sjis_kddi,
	mb_sjis_kddi_to_wchar,
	mb_wchar_to_sjis_kddi,
	NULL
};
7945
/* Filter vtable for SJIS-Mobile#SOFTBANK -> wchar. The filter function
 * (mbfl_filt_conv_sjis_mobile_wchar) is shared by all three mobile variants;
 * only the source encoding ID differs. */
static const struct mbfl_convert_vtbl vtbl_sjis_sb_wchar = {
	mbfl_no_encoding_sjis_sb,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mobile_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Filter vtable for wchar -> SJIS-Mobile#SOFTBANK; the filter and flush
 * functions are shared by all three mobile variants */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_sb = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_sb,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};

/* Encoding descriptor for "SJIS-Mobile#SOFTBANK": ties together the name
 * strings, the alias list, the shared mobile byte-length table, both filter
 * vtables, and the SoftBank-specific buffer-based converters */
const mbfl_encoding mbfl_encoding_sjis_sb = {
	mbfl_no_encoding_sjis_sb,
	"SJIS-Mobile#SOFTBANK",
	"Shift_JIS",
	mbfl_encoding_sjis_sb_aliases,
	mblen_table_sjis_mobile,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_sb_wchar,
	&vtbl_wchar_sjis_sb,
	mb_sjis_sb_to_wchar,
	mb_wchar_to_sjis_sb,
	NULL
};
7979
7980 /* Although the specification for Shift-JIS-2004 indicates that 0x5C and
7981 * 0x7E should (respectively) represent a Yen sign and an overbar, feedback
7982 * from Japanese PHP users indicates that they prefer 0x5C and 0x7E to be
7983 * treated as equivalent to U+005C and U+007E. This is the historical
7984 * behavior of mbstring, and promotes compatibility with other software
7985 * which handles Shift-JIS and Shift-JIS-2004 text in this way. */
7986
/* NULL-terminated list of alias names for the "SJIS-2004" encoding */
static const char *mbfl_encoding_sjis2004_aliases[] = {"SJIS2004","Shift_JIS-2004", NULL};

/* Filter vtable for SJIS-2004 -> wchar. The filter and flush functions carry
 * "jis2004" (not "sjis2004") in their names; presumably they are shared with
 * the other JIS X 0213 based encodings -- confirm at their definitions. */
static const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = {
	mbfl_no_encoding_sjis2004,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis2004_wchar,
	mbfl_filt_conv_jis2004_wchar_flush,
	NULL,
};

/* Filter vtable for wchar -> SJIS-2004, using the "jis2004" filter and flush functions */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis2004 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis2004,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_jis2004,
	mbfl_filt_conv_wchar_jis2004_flush,
	NULL,
};

/* Encoding descriptor for "SJIS-2004": ties together the name strings, the
 * alias list, the byte-length table, both filter vtables, and the buffer-based
 * converters mb_sjis2004_to_wchar / mb_wchar_to_sjis2004 */
const mbfl_encoding mbfl_encoding_sjis2004 = {
	mbfl_no_encoding_sjis2004,
	"SJIS-2004",
	"Shift_JIS",
	mbfl_encoding_sjis2004_aliases,
	mblen_table_sjis_mobile, /* Leading byte values used for SJIS-2004 are the same as mobile SJIS variants */
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis2004_wchar,
	&vtbl_wchar_sjis2004,
	mb_sjis2004_to_wchar,
	mb_wchar_to_sjis2004,
	NULL
};
8022
8023 /* CP932 is Microsoft's version of Shift-JIS.
8024 *
8025 * What we call "SJIS-win" is a variant of CP932 which maps U+00A5
8026 * and U+203E the same way as eucJP-win; namely, instead of mapping
8027 * U+00A5 (YEN SIGN) to 0x5C and U+203E (OVERLINE) to 0x7E,
8028 * these codepoints are mapped to appropriate JIS X 0208 characters.
8029 *
8030 * When converting from Shift-JIS to Unicode, there is no difference
8031 * between CP932 and "SJIS-win".
8032 *
8033 * Additional facts:
8034 *
8035 * • In the libmbfl library which formed the base for mbstring, "CP932" and
8036 * "SJIS-win" were originally aliases. The differing mappings were added in
8037 * December 2002. The libmbfl author later stated that this was done so that
8038 * "CP932" would comply with a certain specification, while "SJIS-win" would
8039 * maintain the existing mappings. He does not remember which specification
8040 * it was.
8041 * • The WHATWG specification for "Shift_JIS" (followed by web browsers)
8042 * agrees with our mappings for "CP932".
8043 * • Microsoft Windows' "best-fit" mappings for CP932 (via the
8044 * WideCharToMultiByte API) convert U+00A5 to 0x5C, which also agrees with
8045 * our mappings for "CP932".
8046 * • glibc's iconv converts U+203E to CP932 0x7E, which again agrees with
8047 * our mappings for "CP932".
8048 * • When converting Shift-JIS to CP932, the conversion goes through Unicode.
8049 * Shift-JIS 0x7E converts to U+203E, so mapping U+203E to 0x7E means that
8050 * 0x7E will go to 0x7E when converting Shift-JIS to CP932.
8051 */
8052
/* Table with one entry per possible byte value (16 rows of 16 entries).
 * Entries for 0x81-0x9F and 0xE0-0xFF hold 2; every other entry holds 1.
 * It is used as the length table of both the CP932 and SJIS-win encoding
 * descriptors below, so presumably the value is the character length in
 * bytes implied by a given leading byte. */
static const unsigned char mblen_table_sjiswin[] = { /* 0x81-0x9F,0xE0-0xFF */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
8071
/* NULL-terminated alias lists for the two Microsoft Shift-JIS flavours:
 * one for "CP932" and one for "SJIS-win". */
static const char *mbfl_encoding_cp932_aliases[] = {"MS932", "Windows-31J", "MS_Kanji", NULL};
static const char *mbfl_encoding_sjiswin_aliases[] = {"SJIS-ms", "SJIS-open", NULL};
8074
/* Legacy filter table for the CP932 -> wchar direction.
 * NOTE(review): the per-slot descriptions are inferred from the initializer
 * values; confirm against the declaration of struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_cp932_wchar = {
	mbfl_no_encoding_cp932, /* encoding converted from */
	mbfl_no_encoding_wchar, /* encoding converted to */
	mbfl_filt_conv_common_ctor, /* shared constructor */
	NULL, /* presumably the destructor slot; unused here */
	mbfl_filt_conv_cp932_wchar, /* conversion routine */
	mbfl_filt_conv_cp932_wchar_flush, /* end-of-input flush routine */
	NULL, /* last slot unused; purpose not visible from here */
};
8084
/* Legacy filter table for the wchar -> CP932 direction.
 * NOTE(review): the per-slot descriptions are inferred from the initializer
 * values; confirm against the declaration of struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_wchar_cp932 = {
	mbfl_no_encoding_wchar, /* encoding converted from */
	mbfl_no_encoding_cp932, /* encoding converted to */
	mbfl_filt_conv_common_ctor, /* shared constructor */
	NULL, /* presumably the destructor slot; unused here */
	mbfl_filt_conv_wchar_cp932, /* conversion routine */
	mbfl_filt_conv_common_flush, /* generic flush routine rather than an encoding-specific one */
	NULL, /* last slot unused; purpose not visible from here */
};
8094
/* Public descriptor for the CP932 encoding. Decoding to wchar uses the same
 * function as SJIS-win (mb_cp932_to_wchar); only the encoding direction
 * differs between the two descriptors.
 * NOTE(review): field meanings are inferred from the values supplied;
 * confirm against the declaration of mbfl_encoding. */
const mbfl_encoding mbfl_encoding_cp932 = {
	mbfl_no_encoding_cp932,
	"CP932", /* primary name */
	"Shift_JIS", /* presumably the MIME name */
	mbfl_encoding_cp932_aliases,
	mblen_table_sjiswin,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp932_wchar, /* filter table: CP932 -> wchar */
	&vtbl_wchar_cp932, /* filter table: wchar -> CP932 */
	mb_cp932_to_wchar,
	mb_wchar_to_cp932,
	NULL
};
8108
/* Legacy filter table for the SJIS-win -> wchar direction. It reuses the
 * CP932 decoding routines, since the two encodings decode identically.
 * NOTE(review): the per-slot descriptions are inferred from the initializer
 * values; confirm against the declaration of struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = {
	mbfl_no_encoding_sjiswin, /* encoding converted from */
	mbfl_no_encoding_wchar, /* encoding converted to */
	mbfl_filt_conv_common_ctor, /* shared constructor */
	NULL, /* presumably the destructor slot; unused here */
	mbfl_filt_conv_cp932_wchar, /* conversion routine (same as CP932) */
	mbfl_filt_conv_cp932_wchar_flush, /* end-of-input flush routine (same as CP932) */
	NULL, /* last slot unused; purpose not visible from here */
};
8118
/* Legacy filter table for the wchar -> SJIS-win direction. Unlike decoding,
 * encoding has its own SJIS-win routine.
 * NOTE(review): the per-slot descriptions are inferred from the initializer
 * values; confirm against the declaration of struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_wchar_sjiswin = {
	mbfl_no_encoding_wchar, /* encoding converted from */
	mbfl_no_encoding_sjiswin, /* encoding converted to */
	mbfl_filt_conv_common_ctor, /* shared constructor */
	NULL, /* presumably the destructor slot; unused here */
	mbfl_filt_conv_wchar_sjiswin, /* conversion routine */
	mbfl_filt_conv_common_flush, /* generic flush routine rather than an encoding-specific one */
	NULL, /* last slot unused; purpose not visible from here */
};
8128
/* Public descriptor for the SJIS-win encoding. It shares the length table
 * and the to-wchar decoder with CP932 but has its own from-wchar encoder.
 * NOTE(review): field meanings are inferred from the values supplied;
 * confirm against the declaration of mbfl_encoding. */
const mbfl_encoding mbfl_encoding_sjiswin = {
	mbfl_no_encoding_sjiswin,
	"SJIS-win", /* primary name */
	"Shift_JIS", /* presumably the MIME name */
	mbfl_encoding_sjiswin_aliases,
	mblen_table_sjiswin,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjiswin_wchar, /* filter table: SJIS-win -> wchar */
	&vtbl_wchar_sjiswin, /* filter table: wchar -> SJIS-win */
	mb_cp932_to_wchar, /* decoding is identical to CP932 */
	mb_wchar_to_sjiswin,
	NULL
};
8142
8143 /*
8144 * EUC variants
8145 */
8146
/* Legacy byte-at-a-time decoder: EUC-JP -> wchar.
 *
 * filter->status records how far into a multi-byte sequence we are:
 *   0  between characters
 *   1  lead byte of a JIS X 0208 pair is held in filter->cache
 *   2  0x8E seen; one JIS X 0201 kana byte expected
 *   3  0x8F seen; first JIS X 0212 byte expected
 *   4  first JIS X 0212 byte is held in filter->cache
 *
 * Every invalid or unmapped sequence is passed downstream as MBFL_BAD_INPUT.
 * Returns 0 on the normal path; CK() is expected to bail out early when the
 * output callback reports failure. */
static int mbfl_filt_conv_eucjp_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, idx, w;

	switch (filter->status) {
	case 0:
		if (c == 0x8e) {
			/* Single-shift 2: half-width kana follows */
			filter->status = 2;
		} else if (c == 0x8f) {
			/* Single-shift 3: a JIS X 0212 pair follows */
			filter->status = 3;
		} else if (c > 0xa0 && c < 0xff) {
			/* First byte of a JIS X 0208 pair */
			filter->cache = c;
			filter->status = 1;
		} else if (c >= 0 && c < 0x80) {
			/* ASCII range passes straight through */
			CK((*filter->output_function)(c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* second byte of a JIS X 0208 pair */
		filter->status = 0;
		lead = filter->cache;
		w = MBFL_BAD_INPUT;
		if (c > 0xa0 && c < 0xff) {
			idx = (lead - 0xa1) * 94 + (c - 0xa1);
			/* A zero table entry marks an unassigned code */
			if (idx >= 0 && idx < jisx0208_ucs_table_size && jisx0208_ucs_table[idx]) {
				w = jisx0208_ucs_table[idx];
			}
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2: /* kana byte after 0x8E */
		filter->status = 0;
		w = MBFL_BAD_INPUT;
		if (c > 0xa0 && c < 0xe0) {
			w = 0xfec0 + c;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 3: /* first JIS X 0212 byte after 0x8F; validated together with the next one */
		filter->cache = c;
		filter->status = 4;
		break;

	case 4: /* second JIS X 0212 byte */
		filter->status = 0;
		lead = filter->cache;
		w = MBFL_BAD_INPUT;
		if (lead > 0xA0 && lead < 0xFF && c > 0xA0 && c < 0xFF) {
			idx = (lead - 0xa1) * 94 + (c - 0xa1);
			if (idx >= 0 && idx < jisx0212_ucs_table_size && jisx0212_ucs_table[idx]) {
				w = jisx0212_ucs_table[idx];
			}
		}
		CK((*filter->output_function)(w, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
8225
/* End-of-input hook for the EUC-JP -> wchar filter.
 *
 * A nonzero status means input stopped in the middle of a multi-byte
 * sequence; one MBFL_BAD_INPUT marker is emitted for it and the state is
 * reset. The downstream flush callback, if any, is then invoked.
 * Always returns 0. */
static int mbfl_filt_conv_eucjp_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
8239
/* Legacy encoder: one wchar codepoint -> EUC-JP bytes.
 *
 * The codepoint is first mapped to an internal JIS code via the ucs_*_jis
 * tables (with a few hard-coded overrides), then serialized by range:
 *   < 0x80     one byte, as is
 *   < 0x100    0x8E + byte (half-width kana)
 *   < 0x8080   two bytes with the high bit set (JIS X 0208)
 *   otherwise  0x8F + two bytes with the high bit set (JIS X 0212)
 * Codepoints with no mapping go to mbfl_filt_conv_illegal_output().
 * Returns 0 on the normal path; CK() is expected to bail out early when a
 * callback reports failure. */
static int mbfl_filt_conv_wchar_eucjp(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	if (c == 0xAF) {
		/* U+00AF MACRON is sent to the JIS X 0212 overline */
		code = 0xA2B4;
	} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (code <= 0) {
		/* Nothing in the tables: try the fixed fallbacks */
		switch (c) {
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			code = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			code = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			code = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			code = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			code = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			code = 0x224c;
			break;
		case 0: /* NUL encodes as a zero byte */
			code = 0;
			break;
		default:
			code = -1;
			break;
		}
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x80) { /* latin */
		CK((*filter->output_function)(code, filter->data));
	} else if (code < 0x100) { /* kana */
		CK((*filter->output_function)(0x8e, filter->data));
		CK((*filter->output_function)(code, filter->data));
	} else {
		if (code >= 0x8080) { /* X 0212 gets a single-shift prefix */
			CK((*filter->output_function)(0x8f, filter->data));
		}
		/* X 0208 and X 0212 share the same two trailing bytes */
		CK((*filter->output_function)(((code >> 8) & 0xff) | 0x80, filter->data));
		CK((*filter->output_function)((code & 0xff) | 0x80, filter->data));
	}

	return 0;
}
8294
mb_eucjp_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)8295 static size_t mb_eucjp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
8296 {
8297 unsigned char *p = *in, *e = p + *in_len;
8298 uint32_t *out = buf, *limit = buf + bufsize;
8299
8300 while (p < e && out < limit) {
8301 unsigned char c = *p++;
8302
8303 if (c < 0x80) {
8304 *out++ = c;
8305 } else if (c >= 0xA1 && c <= 0xFE && p < e) {
8306 /* JISX 0208 */
8307 unsigned char c2 = *p++;
8308 if (c2 >= 0xA1 && c2 <= 0xFE) {
8309 unsigned int s = (c - 0xA1)*94 + c2 - 0xA1;
8310 if (s < jisx0208_ucs_table_size) {
8311 uint32_t w = jisx0208_ucs_table[s];
8312 if (!w)
8313 w = MBFL_BAD_INPUT;
8314 *out++ = w;
8315 } else {
8316 *out++ = MBFL_BAD_INPUT;
8317 }
8318 } else {
8319 *out++ = MBFL_BAD_INPUT;
8320 }
8321 } else if (c == 0x8E && p < e) {
8322 /* Kana */
8323 unsigned char c2 = *p++;
8324 *out++ = (c2 >= 0xA1 && c2 <= 0xDF) ? 0xFEC0 + c2 : MBFL_BAD_INPUT;
8325 } else if (c == 0x8F) {
8326 /* JISX 0212 */
8327 if ((e - p) >= 2) {
8328 unsigned char c2 = *p++;
8329 unsigned char c3 = *p++;
8330 if (c3 >= 0xA1 && c3 <= 0xFE && c2 >= 0xA1 && c2 <= 0xFE) {
8331 unsigned int s = (c2 - 0xA1)*94 + c3 - 0xA1;
8332 if (s < jisx0212_ucs_table_size) {
8333 uint32_t w = jisx0212_ucs_table[s];
8334 if (!w)
8335 w = MBFL_BAD_INPUT;
8336 *out++ = w;
8337 } else {
8338 *out++ = MBFL_BAD_INPUT;
8339 }
8340 } else {
8341 *out++ = MBFL_BAD_INPUT;
8342 }
8343 } else {
8344 *out++ = MBFL_BAD_INPUT;
8345 p = e; /* Jump to end of string */
8346 }
8347 } else {
8348 *out++ = MBFL_BAD_INPUT;
8349 }
8350 }
8351
8352 *in_len = e - p;
8353 *in = p;
8354 return out - buf;
8355 }
8356
/* Buffer-based encoder: wchar codepoints -> EUC-JP bytes appended to buf.
 *
 * Room for two bytes per remaining codepoint is reserved up front; the
 * three-byte JIS X 0212 form reserves its extra space just before writing.
 * Unmappable codepoints are handed to MB_CONVERT_ERROR, after which the
 * reservation is renewed. `end` is not used. */
static void mb_wchar_to_eucjp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int code = 0;

		if (w == 0xAF) { /* U+00AF is MACRON */
			code = 0xA2B4; /* Use JIS X 0212 overline */
		} else if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			code = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			code = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			code = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			code = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (code == 0) {
			/* Fixed fallbacks for codepoints the tables leave unmapped */
			switch (w) {
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				code = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				code = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				code = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				code = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				code = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				code = 0x224C;
				break;
			default:
				break;
			}
		}

		if (code == 0) {
			if (w == 0) {
				/* NUL is the only codepoint that legitimately encodes as zero */
				out = mb_convert_buf_add(out, 0);
			} else {
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjp);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			}
			continue;
		}

		if (code >= 0x8080) {
			/* JIS X 0212: 0x8F prefix plus two bytes, one more than was reserved */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 3);
			out = mb_convert_buf_add3(out, 0x8F, ((code >> 8) & 0xFF) | 0x80, (code & 0xFF) | 0x80);
		} else if (code >= 0x100) {
			/* JIS X 0208 */
			out = mb_convert_buf_add2(out, ((code >> 8) & 0xFF) | 0x80, (code & 0xFF) | 0x80);
		} else if (code >= 0x80) {
			/* Half-width kana behind the 0x8E prefix */
			out = mb_convert_buf_add2(out, 0x8E, code);
		} else {
			out = mb_convert_buf_add(out, code);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
8416
/* Legacy byte-at-a-time decoder: eucJP-win -> wchar.
 *
 * Uses the same five-state machine as plain EUC-JP (0 idle, 1 after a
 * two-byte lead, 2 after 0x8E, 3 after 0x8F, 4 after the first byte that
 * follows 0x8F), but adds CP932-style overrides, the vendor extension
 * tables and the user-defined rows, which land in U+E000 onward.
 *
 * Invalid or unmapped input is passed downstream as MBFL_BAD_INPUT.
 * Returns 0 on the normal path; CK() is expected to bail out early when the
 * output callback reports failure. */
static int mbfl_filt_conv_eucjpwin_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, idx, packed, w, n;

	switch (filter->status) {
	case 0:
		if (c == 0x8e) { /* kana first char */
			filter->status = 2;
		} else if (c == 0x8f) { /* X 0212 first char */
			filter->status = 3;
		} else if (c >= 0xa1 && c <= 0xfe) { /* CP932 first char */
			filter->cache = c;
			filter->status = 1;
		} else if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* second byte of a two-byte character */
		filter->status = 0;
		lead = filter->cache;
		if (!(c > 0xa0 && c < 0xff)) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		idx = (lead - 0xa1) * 94 + (c - 0xa1);

		/* Positions whose mapping differs from the plain JIS X 0208 table */
		switch (idx) {
		case 31:
			w = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
			break;
		case 32:
			w = 0xff5e; /* FULLWIDTH TILDE */
			break;
		case 33:
			w = 0x2225; /* PARALLEL TO */
			break;
		case 60:
			w = 0xff0d; /* FULLWIDTH HYPHEN-MINUS */
			break;
		case 80:
			w = 0xffe0; /* FULLWIDTH CENT SIGN */
			break;
		case 81:
			w = 0xffe1; /* FULLWIDTH POUND SIGN */
			break;
		case 137:
			w = 0xffe2; /* FULLWIDTH NOT SIGN */
			break;
		default:
			w = 0;
			break;
		}

		if (w == 0) {
			if (idx >= cp932ext1_ucs_table_min && idx < cp932ext1_ucs_table_max) {
				/* vendor ext1 (13ku) */
				w = cp932ext1_ucs_table[idx - cp932ext1_ucs_table_min];
			} else if (idx >= 0 && idx < jisx0208_ucs_table_size) {
				/* X 0208 */
				w = jisx0208_ucs_table[idx];
			} else if (idx >= (84 * 94)) {
				/* user (85ku - 94ku) */
				w = idx - (84 * 94) + 0xe000;
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2: /* got 0x8e, X0201 kana */
		filter->status = 0;
		w = MBFL_BAD_INPUT;
		if (c > 0xa0 && c < 0xe0) {
			w = 0xfec0 + c;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 3: /* got 0x8f; keep the first byte and validate it with the next one */
		filter->cache = c;
		filter->status = 4;
		break;

	case 4: /* got 0x8f and the first byte; this is the second byte */
		filter->status = 0;
		lead = filter->cache;
		if (!(lead > 0xa0 && lead < 0xff && c > 0xa0 && c < 0xff)) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		idx = (lead - 0xa1) * 94 + (c - 0xa1);
		w = 0;

		if (idx >= 0 && idx < jisx0212_ucs_table_size) {
			w = jisx0212_ucs_table[idx];
			if (w == 0x007e) {
				w = 0xff5e; /* FULLWIDTH TILDE */
			}
		} else if (idx >= (82*94) && idx < (84*94)) {
			/* vendor ext3 (83ku - 84ku) <-> CP932 (115ku - 120ku):
			 * search the EUC-JP code list and take the UCS value at the
			 * same position */
			packed = (lead << 8) | c;
			for (n = 0; n < cp932ext3_eucjp_table_size; n++) {
				if (packed == cp932ext3_eucjp_table[n]) {
					if (n < (cp932ext3_ucs_table_max - cp932ext3_ucs_table_min)) {
						w = cp932ext3_ucs_table[n];
					}
					break;
				}
			}
		} else if (idx >= (84*94)) {
			/* user (85ku - 94ku); placed after the 10 rows used by the two-byte user area */
			w = idx - (84*94) + (0xe000 + (94*10));
		}

		if (w == 0x00A6) {
			w = 0xFFE4; /* FULLWIDTH BROKEN BAR */
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
8544
/* End-of-input hook for the eucJP-win -> wchar filter.
 *
 * If the filter is still waiting for the rest of a multi-byte sequence
 * (status is nonzero), one MBFL_BAD_INPUT marker is emitted and the state is
 * cleared. The downstream flush callback is then called when one is set.
 * Always returns 0. */
static int mbfl_filt_conv_eucjpwin_wchar_flush(mbfl_convert_filter *filter)
{
	const int mid_sequence = (filter->status != 0);

	if (mid_sequence) {
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
8558
/* Legacy encoder: one wchar codepoint -> eucJP-win bytes.
 *
 * Lookup order:
 *   1. hard-coded overrides (U+00AF, U+203E)
 *   2. the ucs_*_jis tables
 *   3. the user-defined areas starting at U+E000 (10 rows of two-byte
 *      codes, then 10 rows of three-byte codes)
 *   4. fixed fallbacks, then linear searches of the CP932 vendor ext1 and
 *      ext3 tables
 * The resulting internal code is serialized exactly as in plain EUC-JP.
 * Codepoints with no mapping go to mbfl_filt_conv_illegal_output().
 * Returns 0 on the normal path; CK() is expected to bail out early when a
 * callback reports failure. */
static int mbfl_filt_conv_wchar_eucjpwin(int c, mbfl_convert_filter *filter)
{
	int i, count, off, code = 0;

	if (c == 0xAF) { /* U+00AF is MACRON */
		code = 0xA2B4; /* Use JIS X 0212 overline */
	} else if (c == 0x203E) { /* OVERLINE goes to the single byte 0x7E */
		code = 0x7E;
	} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xe000 && c < (0xe000 + 10*94)) {
		/* user (X0208 85ku - 94ku) */
		off = c - 0xe000;
		code = ((off / 94 + 0x75) << 8) | (off % 94 + 0x21);
	} else if (c >= (0xe000 + 10*94) && c < (0xe000 + 20*94)) {
		/* user (X0212 85ku - 94ku) */
		off = c - (0xe000 + 10*94);
		code = ((off / 94 + 0xf5) << 8) | (off % 94 + 0xa1);
	}

	if (code == 0xa2f1) {
		code = 0x2d62; /* NUMERO SIGN */
	}

	if (code <= 0) {
		switch (c) {
		case 0xa5: /* YEN SIGN */
			code = 0x5C;
			break;
		case 0x2014: /* EM DASH */
			code = 0x213D;
			break;
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			code = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			code = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			code = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			code = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			code = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			code = 0x224c;
			break;
		default:
			code = -1;

			/* CP932 vendor ext1 (13ku): position in the table gives row and cell */
			count = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
			for (i = 0; i < count; i++) {
				if (c == cp932ext1_ucs_table[i]) {
					code = ((i / 94 + cp932ext1_ucs_table_min / 94 + 0x21) << 8) + (i % 94 + 0x21);
					break;
				}
			}

			if (code < 0) {
				/* CP932 vendor ext3 (115ku - 119ku): the EUC-JP code sits at
				 * the same position in a parallel table */
				count = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
				for (i = 0; i < count; i++) {
					if (c == cp932ext3_ucs_table[i]) {
						if (i < cp932ext3_eucjp_table_size) {
							code = cp932ext3_eucjp_table[i];
						}
						break;
					}
				}
			}
			break;
		}

		/* NUL always encodes as a zero byte; any other zero result is a miss */
		if (c == 0) {
			code = 0;
		} else if (code <= 0) {
			code = -1;
		}
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x80) { /* latin */
		CK((*filter->output_function)(code, filter->data));
	} else if (code < 0x100) { /* kana */
		CK((*filter->output_function)(0x8e, filter->data));
		CK((*filter->output_function)(code, filter->data));
	} else {
		if (code >= 0x8080) { /* X 0212 gets a single-shift prefix */
			CK((*filter->output_function)(0x8f, filter->data));
		}
		/* X 0208 and X 0212 share the same two trailing bytes */
		CK((*filter->output_function)(((code >> 8) & 0xff) | 0x80, filter->data));
		CK((*filter->output_function)((code & 0xff) | 0x80, filter->data));
	}

	return 0;
}
8663
mb_eucjpwin_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)8664 static size_t mb_eucjpwin_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
8665 {
8666 unsigned char *p = *in, *e = p + *in_len;
8667 uint32_t *out = buf, *limit = buf + bufsize;
8668
8669 while (p < e && out < limit) {
8670 unsigned char c = *p++;
8671
8672 if (c < 0x80) {
8673 *out++ = c;
8674 } else if (c >= 0xA1 && c <= 0xFE && p < e) {
8675 unsigned char c2 = *p++;
8676
8677 if (c2 >= 0xA1 && c2 <= 0xFE) {
8678 unsigned int s = (c - 0xA1)*94 + c2 - 0xA1, w = 0;
8679
8680 if (s <= 137) {
8681 if (s == 31) {
8682 w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
8683 } else if (s == 32) {
8684 w = 0xFF5E; /* FULLWIDTH TILDE */
8685 } else if (s == 33) {
8686 w = 0x2225; /* PARALLEL TO */
8687 } else if (s == 60) {
8688 w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
8689 } else if (s == 80) {
8690 w = 0xFFE0; /* FULLWIDTH CENT SIGN */
8691 } else if (s == 81) {
8692 w = 0xFFE1; /* FULLWIDTH POUND SIGN */
8693 } else if (s == 137) {
8694 w = 0xFFE2; /* FULLWIDTH NOT SIGN */
8695 }
8696 }
8697
8698 if (w == 0) {
8699 if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
8700 w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
8701 } else if (s < jisx0208_ucs_table_size) {
8702 w = jisx0208_ucs_table[s];
8703 } else if (s >= (84 * 94)) {
8704 w = s - (84 * 94) + 0xE000;
8705 }
8706 }
8707
8708 if (!w)
8709 w = MBFL_BAD_INPUT;
8710 *out++ = w;
8711 } else {
8712 *out++ = MBFL_BAD_INPUT;
8713 }
8714 } else if (c == 0x8E && p < e) {
8715 unsigned char c2 = *p++;
8716 if (c2 >= 0xA1 && c2 <= 0xDF) {
8717 *out++ = 0xFEC0 + c2;
8718 } else {
8719 *out++ = MBFL_BAD_INPUT;
8720 }
8721 } else if (c == 0x8F && p < e) {
8722 unsigned char c2 = *p++;
8723 if (p == e) {
8724 *out++ = MBFL_BAD_INPUT;
8725 continue;
8726 }
8727 unsigned char c3 = *p++;
8728
8729 if (c2 >= 0xA1 && c2 <= 0xFE && c3 >= 0xA1 && c3 <= 0xFE) {
8730 unsigned int s = (c2 - 0xA1)*94 + c3 - 0xA1, w = 0;
8731
8732 if (s < jisx0212_ucs_table_size) {
8733 w = jisx0212_ucs_table[s];
8734 if (w == 0x7E)
8735 w = 0xFF5E; /* FULLWIDTH TILDE */
8736 } else if (s >= (82*94) && s < (84*94)) {
8737 s = (c2 << 8) | c3;
8738 for (int i = 0; i < cp932ext3_eucjp_table_size; i++) {
8739 if (cp932ext3_eucjp_table[i] == s) {
8740 w = cp932ext3_ucs_table[i];
8741 break;
8742 }
8743 }
8744 } else if (s >= (84*94)) {
8745 w = s - (84*94) + 0xE000 + (94*10);
8746 }
8747
8748 if (w == 0xA6)
8749 w = 0xFFE4; /* FULLWIDTH BROKEN BAR */
8750
8751 if (!w)
8752 w = MBFL_BAD_INPUT;
8753 *out++ = w;
8754 } else {
8755 *out++ = MBFL_BAD_INPUT;
8756 }
8757 } else {
8758 *out++ = MBFL_BAD_INPUT;
8759 }
8760 }
8761
8762 *in_len = e - p;
8763 *in = p;
8764 return out - buf;
8765 }
8766
/* Buffer-based encoder: wchar codepoints -> eucJP-win bytes appended to buf.
 *
 * Room for two bytes per remaining codepoint is reserved up front; the
 * three-byte form reserves its extra space just before writing. Unmappable
 * codepoints are handed to MB_CONVERT_ERROR, after which the reservation is
 * renewed. `end` is not used. */
static void mb_wchar_to_eucjpwin(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w == 0) {
			/* NUL is written directly so that s == 0 can mean "no mapping" below */
			out = mb_convert_buf_add(out, 0);
			continue;
		} else if (w == 0xAF) { /* U+00AF is MACRON */
			s = 0xA2B4; /* Use JIS X 0212 overline */
		} else if (w == 0x203E) { /* OVERLINE goes to the single byte 0x7E */
			s = 0x7E;
		} else if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 10*94)) {
			/* user-defined area, two-byte form */
			s = w - 0xE000;
			s = ((s/94 + 0x75) << 8) + (s%94) + 0x21;
		} else if (w >= (0xE000 + 10*94) && w < (0xE000 + 20*94)) {
			/* user-defined area, three-byte form */
			s = w - (0xE000 + 10*94);
			s = ((s/94 + 0xF5) << 8) + (s%94) + 0xA1;
		}

		if (s == 0xA2F1)
			s = 0x2D62; /* NUMERO SIGN */

		if (s == 0) {
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x5C;
			} else if (w == 0x2014) { /* EM DASH */
				s = 0x213D;
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			} else {
				/* CP932 vendor ext1: position in the table gives row and cell */
				for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
					if (cp932ext1_ucs_table[i] == w) {
						s = (((i/94) + (cp932ext1_ucs_table_min/94) + 0x21) << 8) + (i%94) + 0x21;
						break;
					}
				}

				if (!s) {
					/* CP932 vendor ext3: the EUC-JP code sits at the same
					 * position in a parallel table */
					for (int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
						if (cp932ext3_ucs_table[i] == w) {
							/* Never index past the end of the EUC-JP table,
							 * matching the check done by the legacy filter */
							if (i < cp932ext3_eucjp_table_size) {
								s = cp932ext3_eucjp_table[i];
							}
							break;
						}
					}
				}
			}
		}

		if (!s) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjpwin);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		} else if (s < 0x80) {
			out = mb_convert_buf_add(out, s);
		} else if (s < 0x100) {
			/* Half-width kana behind the 0x8E prefix */
			out = mb_convert_buf_add2(out, 0x8E, s);
		} else if (s < 0x8080) {
			out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
		} else {
			/* Three bytes: one more than was reserved for this codepoint */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 3);
			out = mb_convert_buf_add3(out, 0x8F, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
8856
/* Legacy byte-at-a-time decoder: CP51932 -> wchar.
 *
 * filter->status: 0 idle, 1 lead byte of a two-byte character held in
 * filter->cache, 2 after 0x8E (half-width kana follows). There is no 0x8F
 * state: that byte is rejected like any other unrecognized byte.
 *
 * Invalid or unmapped input is passed downstream as MBFL_BAD_INPUT.
 * Returns 0 on the normal path; CK() is expected to bail out early when the
 * output callback reports failure. */
static int mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, idx, w;

	switch (filter->status) {
	case 0:
		if (c == 0x8e) { /* kana first char */
			filter->status = 2;
		} else if (c >= 0xA1 && c <= 0xFE) { /* CP932, first byte */
			filter->cache = c;
			filter->status = 1;
		} else if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* second byte of a two-byte character */
		filter->status = 0;
		lead = filter->cache;
		if (!(c > 0xa0 && c < 0xff)) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		idx = (lead - 0xa1) * 94 + (c - 0xa1);

		/* Positions whose mapping differs from the plain JIS X 0208 table */
		switch (idx) {
		case 31:
			w = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
			break;
		case 32:
			w = 0xff5e; /* FULLWIDTH TILDE */
			break;
		case 33:
			w = 0x2225; /* PARALLEL TO */
			break;
		case 60:
			w = 0xff0d; /* FULLWIDTH HYPHEN-MINUS */
			break;
		case 80:
			w = 0xffe0; /* FULLWIDTH CENT SIGN */
			break;
		case 81:
			w = 0xffe1; /* FULLWIDTH POUND SIGN */
			break;
		case 137:
			w = 0xffe2; /* FULLWIDTH NOT SIGN */
			break;
		default:
			w = 0;
			break;
		}

		if (w == 0) {
			if (idx >= cp932ext1_ucs_table_min && idx < cp932ext1_ucs_table_max) {
				/* vendor ext1 (13ku) */
				w = cp932ext1_ucs_table[idx - cp932ext1_ucs_table_min];
			} else if (idx >= 0 && idx < jisx0208_ucs_table_size) {
				/* X 0208 */
				w = jisx0208_ucs_table[idx];
			} else if (idx >= cp932ext2_ucs_table_min && idx < cp932ext2_ucs_table_max) {
				/* vendor ext2 (89ku - 92ku) */
				w = cp932ext2_ucs_table[idx - cp932ext2_ucs_table_min];
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2: /* got 0x8e, X0201 kana */
		filter->status = 0;
		w = MBFL_BAD_INPUT;
		if (c > 0xa0 && c < 0xe0) {
			w = 0xfec0 + c;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
8931
/* End-of-input hook for the CP51932 -> wchar filter.
 *
 * A nonzero status means the input string was truncated inside a multi-byte
 * sequence: one MBFL_BAD_INPUT marker is emitted and the state is reset.
 * The downstream flush callback, when set, is invoked afterwards.
 * Always returns 0. */
static int mbfl_filt_conv_cp51932_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* Leftover partial character */
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function == NULL) {
		return 0;
	}

	filter->flush_function(filter->data);
	return 0;
}
8946
/* Legacy encoder: one wchar codepoint -> CP51932 bytes.
 *
 * The codepoint is looked up in the ucs_*_jis tables; results of 0x8080 or
 * above are discarded because this encoding has no three-byte form. Misses
 * fall back to a few fixed mappings and then to linear searches of the
 * CP932 vendor ext1 and ext2 tables. Codepoints with no mapping go to
 * mbfl_filt_conv_illegal_output().
 * Returns 0 on the normal path; CK() is expected to bail out early when a
 * callback reports failure. */
static int mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter)
{
	int i, count, code = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	/* Table values of 0x8080 and up are what the EUC-JP encoders in this file
	 * emit as three-byte JIS X 0212; they are not representable here */
	if (code >= 0x8080) {
		code = -1;
	}

	if (code <= 0) {
		switch (c) {
		case 0xa5: /* YEN SIGN */
			code = 0x216F; /* FULLWIDTH YEN SIGN */
			break;
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			code = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			code = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			code = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			code = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			code = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			code = 0x224c;
			break;
		default:
			code = -1;

			/* CP932 vendor ext1 (13ku): rows start at 0x2d */
			count = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
			for (i = 0; i < count; i++) {
				if (c == cp932ext1_ucs_table[i]) {
					code = ((i/94 + 0x2d) << 8) + (i%94 + 0x21);
					break;
				}
			}

			if (code < 0) {
				/* CP932 vendor ext2: rows start at 0x79 (89ku) */
				count = cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
				for (i = 0; i < count; i++) {
					if (c == cp932ext2_ucs_table[i]) {
						code = ((i/94 + 0x79) << 8) + (i%94 + 0x21);
						break;
					}
				}
			}
			break;
		}

		/* NUL always encodes as a zero byte; any other zero result is a miss */
		if (c == 0) {
			code = 0;
		} else if (code <= 0) {
			code = -1;
		}
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x80) { /* latin */
		CK((*filter->output_function)(code, filter->data));
	} else if (code < 0x100) { /* kana */
		CK((*filter->output_function)(0x8e, filter->data));
		CK((*filter->output_function)(code, filter->data));
	} else if (code < 0x8080) { /* X 0208 */
		CK((*filter->output_function)(((code >> 8) & 0xff) | 0x80, filter->data));
		CK((*filter->output_function)((code & 0xff) | 0x80, filter->data));
	} else {
		/* Defensive: anything larger has no CP51932 representation */
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
9025
mb_cp51932_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9026 static size_t mb_cp51932_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9027 {
9028 unsigned char *p = *in, *e = p + *in_len;
9029 uint32_t *out = buf, *limit = buf + bufsize;
9030
9031 while (p < e && out < limit) {
9032 unsigned char c = *p++;
9033
9034 if (c < 0x80) {
9035 *out++ = c;
9036 } else if (c >= 0xA1 && c <= 0xFE && p < e) {
9037 unsigned char c2 = *p++;
9038 if (c2 >= 0xA1 && c2 <= 0xFE) {
9039 unsigned int s = (c - 0xA1)*94 + c2 - 0xA1, w = 0;
9040
9041 if (s <= 137) {
9042 if (s == 31) {
9043 w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
9044 } else if (s == 32) {
9045 w = 0xFF5E; /* FULLWIDTH TILDE */
9046 } else if (s == 33) {
9047 w = 0x2225; /* PARALLEL TO */
9048 } else if (s == 60) {
9049 w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
9050 } else if (s == 80) {
9051 w = 0xFFE0; /* FULLWIDTH CENT SIGN */
9052 } else if (s == 81) {
9053 w = 0xFFE1; /* FULLWIDTH POUND SIGN */
9054 } else if (s == 137) {
9055 w = 0xFFE2; /* FULLWIDTH NOT SIGN */
9056 }
9057 }
9058
9059 if (w == 0) {
9060 if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
9061 w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
9062 } else if (s < jisx0208_ucs_table_size) {
9063 w = jisx0208_ucs_table[s];
9064 } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
9065 w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
9066 }
9067 }
9068
9069 if (!w)
9070 w = MBFL_BAD_INPUT;
9071 *out++ = w;
9072 } else {
9073 *out++ = MBFL_BAD_INPUT;
9074 }
9075 } else if (c == 0x8E && p < e) {
9076 unsigned char c2 = *p++;
9077 if (c2 >= 0xA1 && c2 <= 0xDF) {
9078 *out++ = 0xFEC0 + c2;
9079 } else {
9080 *out++ = MBFL_BAD_INPUT;
9081 }
9082 } else {
9083 *out++ = MBFL_BAD_INPUT;
9084 }
9085 }
9086
9087 *in_len = e - p;
9088 *in = p;
9089 return out - buf;
9090 }
9091
/* Encode `len` Unicode codepoints from `in` as CP51932 and append them to `buf`.
 *
 * Codepoints are looked up in the JIS tables first; a handful of special
 * cases and the two CP932 extension tables are tried afterwards. Codepoints
 * with no mapping are reported through MB_CONVERT_ERROR. `end` is not used. */
static void mb_wchar_to_cp51932(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (!w) {
			/* NUL is passed through as a single zero byte */
			out = mb_convert_buf_add(out, 0);
			continue;
		}

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		/* We don't support JIS X0213 */
		if (s >= 0x8080) {
			s = 0;
		}

		if (!s) {
			switch (w) {
			case 0xA5: /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				s = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				s = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				s = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				s = 0x224C;
				break;
			default:
				/* Reverse lookup by linear scan of the first extension table... */
				for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
					if (cp932ext1_ucs_table[i] == w) {
						s = ((i/94 + 0x2D) << 8) + (i%94) + 0x21;
						break;
					}
				}

				/* ...and of the second one only if the first had no match
				 * (a match always produces a non-zero `s`) */
				if (!s) {
					for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
						if (cp932ext2_ucs_table[i] == w) {
							s = ((i/94 + 0x79) << 8) + (i%94) + 0x21;
							break;
						}
					}
				}
				break;
			}
		}

		if (!s || s >= 0x8080) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp51932);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		} else if (s < 0x80) {
			/* ASCII */
			out = mb_convert_buf_add(out, s);
		} else if (s < 0x100) {
			/* Halfwidth kana, prefixed by 0x8E */
			out = mb_convert_buf_add2(out, 0x8E, s);
		} else {
			/* Two-byte character; set the high bit of both bytes */
			out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9164
mb_eucjp2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9165 static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9166 {
9167 unsigned char *p = *in, *e = p + *in_len;
9168 uint32_t *out = buf, *limit = buf + bufsize - 1;
9169
9170 while (p < e && out < limit) {
9171 unsigned char c = *p++;
9172
9173 if (c <= 0x7F) {
9174 *out++ = c;
9175 } else if (c >= 0xA1 && c <= 0xFE) {
9176 /* Kanji */
9177 if (p == e) {
9178 *out++ = MBFL_BAD_INPUT;
9179 break;
9180 }
9181 unsigned char c2 = *p++;
9182 if (c2 <= 0xA0 || c2 == 0xFF) {
9183 *out++ = MBFL_BAD_INPUT;
9184 continue;
9185 }
9186
9187 unsigned int s1 = c - 0x80, s2 = c2 - 0x80;
9188 unsigned int w1 = (s1 << 8) | s2, w = 0;
9189
9190 /* Conversion for combining characters */
9191 if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
9192 int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
9193 if (k >= 0) {
9194 *out++ = jisx0213_u2_tbl[2*k];
9195 *out++ = jisx0213_u2_tbl[2*k+1];
9196 continue;
9197 }
9198 }
9199
9200 /* Conversion for BMP */
9201 w1 = (s1 - 0x21)*94 + s2 - 0x21;
9202 if (w1 < jisx0213_ucs_table_size) {
9203 w = jisx0213_ucs_table[w1];
9204 }
9205
9206 /* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
9207 if (!w) {
9208 int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
9209 if (k >= 0) {
9210 w = jisx0213_jis_u5_tbl[k] + 0x20000;
9211 }
9212 }
9213
9214 *out++ = w ? w : MBFL_BAD_INPUT;
9215 } else if (c == 0x8E && p < e) {
9216 /* Kana */
9217 unsigned char c2 = *p++;
9218 if (c2 >= 0xA1 && c2 <= 0xDF) {
9219 *out++ = 0xFEC0 + c2;
9220 } else {
9221 *out++ = MBFL_BAD_INPUT;
9222 }
9223 } else if (c == 0x8F && p < e) {
9224 unsigned char c2 = *p++;
9225 if ((c2 == 0xA1 || (c2 >= 0xA3 && c2 <= 0xA5) || c2 == 0xA8 || (c2 >= 0xAC && c2 <= 0xAF) || (c2 >= 0xEE && c2 <= 0xFE)) && p < e) {
9226 unsigned char c3 = *p++;
9227
9228 if (c3 < 0xA1 || c3 == 0xFF) {
9229 *out++ = MBFL_BAD_INPUT;
9230 continue;
9231 }
9232
9233 unsigned int s1 = c2 - 0xA1, s2 = c3 - 0xA1;
9234
9235 if (((s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) || (s1 >= 77 && s1 < 94)) && s2 < 94) {
9236 int k;
9237 for (k = 0; k < jisx0213_p2_ofst_len; k++) {
9238 if (s1 == jisx0213_p2_ofst[k]) {
9239 break;
9240 }
9241 }
9242 k -= jisx0213_p2_ofst[k];
9243
9244 /* Check for Japanese chars in BMP */
9245 unsigned int s = (s1 + 94 + k)*94 + s2;
9246 ZEND_ASSERT(s < jisx0213_ucs_table_size);
9247 unsigned int w = jisx0213_ucs_table[s];
9248
9249 /* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
9250 if (!w) {
9251 k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
9252 if (k >= 0) {
9253 w = jisx0213_jis_u5_tbl[k] + 0x20000;
9254 }
9255 }
9256
9257 *out++ = w ? w : MBFL_BAD_INPUT;
9258 } else {
9259 *out++ = MBFL_BAD_INPUT;
9260 }
9261 } else {
9262 *out++ = MBFL_BAD_INPUT;
9263 }
9264 } else {
9265 *out++ = MBFL_BAD_INPUT;
9266 }
9267 }
9268
9269 *in_len = e - p;
9270 *in = p;
9271 return out - buf;
9272 }
9273
/* Encode `len` Unicode codepoints from `in` as EUC-JP-2004 and append them to `buf`.
 *
 * Some output characters are represented in Unicode by a pair of codepoints.
 * If the first codepoint of such a pair is the very last one in `in` and `end`
 * is false, it is stashed in `buf->state` and picked up again at the start of
 * the next call. Codepoints which cannot be mapped are reported through
 * MB_CONVERT_ERROR. */
static void mb_wchar_to_eucjp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;
	if (buf->state) {
		/* A codepoint was held over from the previous call; resume with it
		 * by jumping straight into the loop body */
		w = buf->state;
		buf->state = 0;
		goto process_codepoint;
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		unsigned int s = 0;

		/* Check for 1st char of combining characters */
		if (w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7) {
			for (int k = 0; k < jisx0213_u2_tbl_len; k++) {
				if (w == jisx0213_u2_tbl[2*k]) {
					if (!len) {
						if (!end) {
							/* Can't tell yet whether a 2nd codepoint follows;
							 * remember this one and wait for more input */
							buf->state = w;
							MB_CONVERT_BUF_STORE(buf, out, limit);
							return;
						}
					} else {
						/* Peek at the following codepoint */
						uint32_t w2 = *in++; len--;
						/* For these four 1st codepoints, a following U+0301
						 * selects the next table entry rather than this one */
						if ((w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A) && w2 == 0x301) {
							k++;
						}
						if (w2 == jisx0213_u2_tbl[2*k+1]) {
							s = jisx0213_u2_key[k];
							break;
						}
						/* Not a matching pair; un-read the 2nd codepoint */
						in--; len++;
					}

					/* Fallback */
					s = jisx0213_u2_fb_tbl[k];
					break;
				}
			}
		}

		/* Check for major Japanese chars: U+4E00-U+9FFF */
		if (!s) {
			for (int k = 0; k < uni2jis_tbl_len; k++) {
				if (w >= uni2jis_tbl_range[k][0] && w <= uni2jis_tbl_range[k][1]) {
					s = uni2jis_tbl[k][w - uni2jis_tbl_range[k][0]];
					break;
				}
			}
		}

		/* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */
		if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) {
			int k = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (k >= 0) {
				/* Offset of `w` from the start of the matched range, added to
				 * the output value recorded for that range */
				s = ucs_c1_jisx0213_ofst[k] + w - ucs_c1_jisx0213_tbl[2*k];
			}
		}

		/* Check for Japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
		if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) {
			int k = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				s = jisx0213_u5_jis_tbl[k];
			}
		}

		if (!s) {
			/* CJK Compatibility Forms: U+FE30-U+FE4F */
			if (w == 0xFE45) {
				s = 0x233E;
			} else if (w == 0xFE46) {
				s = 0x233D;
			} else if (w >= 0xF91D && w <= 0xF9DC) {
				/* CJK Compatibility Ideographs: U+F900-U+F92A */
				int k = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (k >= 0) {
					s = ucs_r2b_jisx0213_cmap_val[k];
				}
			}
		}

		/* `s == 0` is only acceptable for U+0000, which is emitted as a zero
		 * byte by the single-byte branch below */
		if (!s && w) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0x7F) {
			out = mb_convert_buf_add(out, s);
		} else if (s <= 0xFF) {
			/* Single byte value above 0x7F: emitted with a 0x8E prefix */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, 0x8E, s);
		} else if (s <= 0x7EFF) {
			/* Two-byte character: both bytes are offset by 0x80 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) + 0x80, (s & 0xFF) + 0x80);
		} else {
			/* Three-byte character with 0x8F prefix; the high byte of `s`
			 * is translated to a row number via jisx0213_p2_ofst */
			unsigned int s2 = s & 0xFF;
			int k = ((s >> 8) & 0xFF) - 0x7F;
			ZEND_ASSERT(k < jisx0213_p2_ofst_len);
			s = jisx0213_p2_ofst[k] + 0x21;
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
			out = mb_convert_buf_add3(out, 0x8F, s | 0x80, s2 | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9385
/* Byte-at-a-time EUC-CN decoder.
 *
 * `c` is the next input byte. `filter->status` is 0 when expecting the start of
 * a character and 1 when a lead byte (saved in `filter->cache`) has been seen.
 * Decoded codepoints, or MBFL_BAD_INPUT for invalid input, are passed to
 * `filter->output_function`. Returns 0, or -1 via CK if the output function fails. */
static int mbfl_filt_conv_euccn_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if ((c >= 0xA1 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7)) { /* dbcs lead byte */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: { /* dbcs second byte */
		int lead = filter->cache;
		int w = 0;

		filter->status = 0;

		if (c > 0xA0 && c < 0xFF) {
			/* The CP936 table is indexed with 192 entries per lead byte */
			int idx = (lead - 0x81)*192 + c - 0x40;
			ZEND_ASSERT(idx < cp936_ucs_table_size);

			switch (idx) {
			case 0x1864:
				w = 0x30FB;
				break;
			case 0x186A:
				w = 0x2015;
				break;
			default:
				/* These index ranges are rejected rather than looked up */
				if (!((idx >= 0x1921 && idx <= 0x192A) || idx == 0x1963 || (idx >= 0x1C59 && idx <= 0x1C7E) || (idx >= 0x1DBB && idx <= 0x1DC4))) {
					w = cp936_ucs_table[idx];
				}
				break;
			}
		}

		/* Both an invalid second byte and an unmapped index end up here with w == 0 */
		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;
	}

	EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
9433
/* Codepoint-at-a-time EUC-CN encoder.
 *
 * Looks `c` up in the CP936 reverse tables, discards results outside the
 * EUC-CN byte range (both bytes must be >= 0xA1), and sends the resulting one
 * or two bytes to `filter->output_function`. Unmappable codepoints go to
 * mbfl_filt_conv_illegal_output. Returns 0, or -1 via CK on output failure. */
static int mbfl_filt_conv_wchar_euccn(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		switch (c) {
		case 0xB7:
		case 0x144:
		case 0x148:
		case 0x251:
		case 0x261:
			/* Deliberately left unmapped */
			break;
		default:
			s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
			break;
		}
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		if (c == 0x2015) {
			s = 0xA1AA;
		} else if (c != 0x2014 && (c < 0x2170 || c > 0x2179)) {
			s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		if (c == 0x30FB) {
			s = 0xA1A4;
		} else {
			s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
		}
	} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
		s = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		if (c == 0xFF04) {
			s = 0xA1E7;
		} else if (c == 0xFF5E) {
			s = 0xA1AB;
		} else if (c >= 0xFF01 && c <= 0xFF5D) {
			s = c - 0xFF01 + 0xA3A1;
		} else if (c >= 0xFFE0 && c <= 0xFFE5) {
			s = ucs_hff_s_cp936_table[c - 0xFFE0];
		}
	}

	/* exclude CP936 extensions */
	if (((s >> 8) & 0xFF) < 0xA1 || (s & 0xFF) < 0xA1) {
		s = 0;
	}

	/* Without a table mapping, only values below 0x80 can be passed through */
	if (s <= 0) {
		s = (c < 0x80) ? c : -1;
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (s < 0x80) { /* latin */
		CK((*filter->output_function)(s, filter->data));
	} else {
		CK((*filter->output_function)((s >> 8) & 0xFF, filter->data));
		CK((*filter->output_function)(s & 0xFF, filter->data));
	}

	return 0;
}
9498
/* Finish EUC-CN decoding: if the input ended after a lead byte, report the
 * incomplete character as MBFL_BAD_INPUT, then invoke the filter's own flush
 * function, if it has one. */
static int mbfl_filt_conv_euccn_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status == 1) {
		/* Input stopped in the middle of a 2-byte character */
		filter->status = 0;
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return 0;
}
9513
mb_euccn_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9514 static size_t mb_euccn_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9515 {
9516 unsigned char *p = *in, *e = p + *in_len;
9517 uint32_t *out = buf, *limit = buf + bufsize;
9518
9519 while (p < e && out < limit) {
9520 unsigned char c = *p++;
9521
9522 if (c < 0x80) {
9523 *out++ = c;
9524 } else if (((c >= 0xA1 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7)) && p < e) {
9525 unsigned char c2 = *p++;
9526
9527 if (c2 >= 0xA1 && c2 <= 0xFE) {
9528 unsigned int w = (c - 0x81)*192 + c2 - 0x40;
9529 ZEND_ASSERT(w < cp936_ucs_table_size);
9530 if (w == 0x1864) {
9531 w = 0x30FB;
9532 } else if (w == 0x186A) {
9533 w = 0x2015;
9534 } else if ((w >= 0x1921 && w <= 0x192A) || w == 0x1963 || (w >= 0x1C59 && w <= 0x1C7E) || (w >= 0x1DBB && w <= 0x1DC4)) {
9535 w = 0;
9536 } else {
9537 w = cp936_ucs_table[w];
9538 }
9539
9540 if (!w)
9541 w = MBFL_BAD_INPUT;
9542 *out++ = w;
9543 } else {
9544 *out++ = MBFL_BAD_INPUT;
9545 }
9546 } else {
9547 *out++ = MBFL_BAD_INPUT;
9548 }
9549 }
9550
9551 *in_len = e - p;
9552 *in = p;
9553 return out - buf;
9554 }
9555
/* Encode `len` Unicode codepoints from `in` as EUC-CN and append them to `buf`.
 *
 * Uses the CP936 reverse tables but rejects any result outside the EUC-CN byte
 * range (both bytes must be >= 0xA1). Codepoints below 0x80 without a mapping
 * are emitted as-is; other unmappable codepoints are reported through
 * MB_CONVERT_ERROR. `end` is not used. */
static void mb_wchar_to_euccn(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			switch (w) {
			case 0xB7:
			case 0x144:
			case 0x148:
			case 0x251:
			case 0x261:
				/* Deliberately left unmapped */
				break;
			default:
				s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
				break;
			}
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			if (w == 0x2015) {
				s = 0xA1AA;
			} else if (!(w == 0x2014 || (w >= 0x2170 && w <= 0x2179))) {
				s = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			if (w == 0x30FB) {
				s = 0xA1A4;
			} else {
				s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
			}
		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
			s = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			if (w == 0xFF04) {
				s = 0xA1E7;
			} else if (w == 0xFF5E) {
				s = 0xA1AB;
			} else if (w >= 0xFF01 && w <= 0xFF5D) {
				s = w - 0xFF01 + 0xA3A1;
			} else if (w >= 0xFFE0 && w <= 0xFFE5) {
				s = ucs_hff_s_cp936_table[w - 0xFFE0];
			}
		}

		/* Exclude CP936 extensions */
		if (((s >> 8) & 0xFF) < 0xA1 || (s & 0xFF) < 0xA1) {
			s = 0;
		}

		if (s) {
			if (s < 0x80) {
				out = mb_convert_buf_add(out, s);
			} else {
				out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
			}
		} else if (w < 0x80) {
			/* No table mapping, but the codepoint fits in a single byte */
			out = mb_convert_buf_add(out, w);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_euccn);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9617
/* Byte-at-a-time EUC-TW decoder.
 *
 * `c` is the next input byte. `filter->status` tracks the position within a
 * character:
 *   0 = expecting the start of a character
 *   1 = got the first byte of a 2-byte character (kept in `filter->cache`)
 *   2 = got 0x8E, expecting the plane byte
 *   3 = got the plane byte, expecting the third byte
 *   4 = expecting the fourth byte
 * Decoded codepoints, or MBFL_BAD_INPUT for invalid input, are passed to
 * `filter->output_function`. Returns 0, or -1 via CK if the output function fails. */
static int mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD)) && c != 0xC3) { /* 2-byte character, first byte */
			filter->cache = c;
			filter->status = 1;
		} else if (c == 0x8E) { /* 4-byte character, first byte */
			filter->status = 2;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: { /* 2-byte character, second byte */
		int lead = filter->cache;
		filter->status = 0;

		if (c <= 0xA0 || c >= 0xFF) {
			filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		int idx = (lead - 0xA1)*94 + (c - 0xA1);
		int w = 0;
		if (idx >= 0 && idx < cns11643_1_ucs_table_size) {
			w = cns11643_1_ucs_table[idx];
		}
		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}

		CK((*filter->output_function)(w, filter->data));
		break;
	}

	case 2: /* got 0x8e, second byte */
		if (c == 0xA1 || c == 0xA2 || c == 0xAE) {
			/* Remember the plane (minus one) for the following bytes */
			filter->cache = c - 0xA1;
			filter->status = 3;
		} else {
			filter->cache = 0;
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 3: { /* got 0x8e, third byte */
		int plane = filter->cache;
		int valid = 0;

		filter->status = 0;

		/* The range of acceptable third bytes depends on the plane */
		if (c >= 0xA1) {
			if (plane == 0) {
				valid = ((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD)) && c != 0xC3;
			} else if (plane == 1) {
				valid = (c <= 0xF2);
			} else if (plane == 13) {
				valid = (c <= 0xE7);
			}
		}

		if (valid) {
			filter->status = 4;
			filter->cache = (plane << 8) + c - 0xA1;
		} else {
			filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

	case 4: { /* multi-byte character, fourth byte */
		int saved = filter->cache;
		filter->status = 0;

		if (!(saved <= 0xDFF && c > 0xA0 && c < 0xFF)) {
			filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		int plane = (saved & 0xF00) >> 8; /* This is actually the CNS-11643 plane minus one */
		int s = (saved & 0xFF)*94 + c - 0xA1;
		int w = 0;

		if (s >= 0) {
			/* A later version of CNS-11643 moved all the characters in "plane 14" to "plane 3",
			 * and added tens of thousands more characters in planes 4, 5, 6, and 7
			 * We only support the older version of CNS-11643
			 * This is the same as iconv from glibc 2.2 */
			if (plane == 0 && s < cns11643_1_ucs_table_size) {
				w = cns11643_1_ucs_table[s];
			} else if (plane == 1 && s < cns11643_2_ucs_table_size) {
				w = cns11643_2_ucs_table[s];
			} else if (plane == 13 && s < cns11643_14_ucs_table_size) {
				w = cns11643_14_ucs_table[s];
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}

		CK((*filter->output_function)(w, filter->data));
		break;
	}

	EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
9718
/* Codepoint-at-a-time EUC-TW encoder.
 *
 * Looks `c` up in the CNS-11643 reverse tables. The table value carries the
 * plane number in bits 16 and up and the 2-byte code in the low 16 bits.
 * Plane 0/1 results are emitted as one byte (below 0x80) or two bytes; other
 * planes are emitted as the 4-byte form 0x8E, 0xA0+plane, byte, byte.
 * Unmappable codepoints go to mbfl_filt_conv_illegal_output.
 * Returns 0, or -1 via CK if the output function fails. */
static int mbfl_filt_conv_wchar_euctw(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_cns11643_table_min && c < ucs_a1_cns11643_table_max) {
		s = ucs_a1_cns11643_table[c - ucs_a1_cns11643_table_min];
	} else if (c >= ucs_a2_cns11643_table_min && c < ucs_a2_cns11643_table_max) {
		s = ucs_a2_cns11643_table[c - ucs_a2_cns11643_table_min];
	} else if (c >= ucs_a3_cns11643_table_min && c < ucs_a3_cns11643_table_max) {
		s = ucs_a3_cns11643_table[c - ucs_a3_cns11643_table_min];
	} else if (c >= ucs_i_cns11643_table_min && c < ucs_i_cns11643_table_max) {
		s = ucs_i_cns11643_table[c - ucs_i_cns11643_table_min];
	} else if (c >= ucs_r_cns11643_table_min && c < ucs_r_cns11643_table_max) {
		s = ucs_r_cns11643_table[c - ucs_r_cns11643_table_min];
	}

	/* No mapping found: only U+0000 may pass (as a zero byte) */
	if (s <= 0) {
		s = (c == 0) ? 0 : -1;
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	int plane = (s & 0x1F0000) >> 16;
	if (plane <= 1) {
		if (s < 0x80) { /* latin */
			CK((*filter->output_function)(s, filter->data));
		} else {
			CK((*filter->output_function)(((s >> 8) & 0xFF) | 0x80, filter->data));
			CK((*filter->output_function)((s & 0xFF) | 0x80, filter->data));
		}
	} else {
		/* Emit each byte directly instead of first packing all four bytes
		 * into a signed int: the packed value (0x8EA0xxxx) does not fit in
		 * an int, so storing and right-shifting it relied on
		 * implementation-defined behavior */
		CK((*filter->output_function)(0x8E, filter->data));
		CK((*filter->output_function)(0xA0 + plane, filter->data));
		CK((*filter->output_function)(((s >> 8) & 0xFF) | 0x80, filter->data));
		CK((*filter->output_function)((s & 0xFF) | 0x80, filter->data));
	}

	return 0;
}
9765
/* Finish EUC-TW decoding: if the input ended partway through a character
 * (any non-zero status), report it as MBFL_BAD_INPUT, then invoke the
 * filter's own flush function, if it has one. */
static int mbfl_filt_conv_euctw_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* Input stopped in the middle of a 2-byte or 4-byte character */
		filter->status = 0;
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return 0;
}
9780
mb_euctw_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9781 static size_t mb_euctw_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9782 {
9783 unsigned char *p = *in, *e = p + *in_len;
9784 uint32_t *out = buf, *limit = buf + bufsize;
9785
9786 while (p < e && out < limit) {
9787 unsigned char c = *p++;
9788
9789 if (c < 0x80) {
9790 *out++ = c;
9791 } else if (((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD)) && c != 0xC3 && p < e) {
9792 unsigned char c2 = *p++;
9793
9794 if (c2 >= 0xA1 && c2 <= 0xFE) {
9795 unsigned int w = (c - 0xA1)*94 + (c2 - 0xA1);
9796 if (w < cns11643_1_ucs_table_size) {
9797 w = cns11643_1_ucs_table[w];
9798 } else {
9799 w = 0;
9800 }
9801 if (!w)
9802 w = MBFL_BAD_INPUT;
9803 *out++ = w;
9804 } else {
9805 *out++ = MBFL_BAD_INPUT;
9806 }
9807 } else if (c == 0x8E && p < e) {
9808 unsigned char c2 = *p++;
9809
9810 if ((c2 == 0xA1 || c2 == 0xA2 || c2 == 0xAE) && p < e) {
9811 unsigned int plane = c2 - 0xA1; /* This is actually the CNS-11643 plane minus one */
9812 unsigned char c3 = *p++;
9813
9814 if (c3 >= 0xA1 && ((plane == 0 && ((c3 >= 0xA1 && c3 <= 0xA6) || (c3 >= 0xC2 && c3 <= 0xFD)) && c3 != 0xC3) || (plane == 1 && c3 <= 0xF2) || (plane == 13 && c3 <= 0xE7)) && p < e) {
9815 unsigned char c4 = *p++;
9816
9817 if (c2 <= 0xAE && c4 > 0xA0 && c4 < 0xFF) {
9818 unsigned int s = (c3 - 0xA1)*94 + c4 - 0xA1, w = 0;
9819
9820 /* A later version of CNS-11643 moved all the characters in "plane 14" to "plane 3",
9821 * and added tens of thousands more characters in planes 4, 5, 6, and 7
9822 * We only support the older version of CNS-11643
9823 * This is the same as iconv from glibc 2.2 */
9824 if (plane == 0 && s < cns11643_1_ucs_table_size) {
9825 w = cns11643_1_ucs_table[s];
9826 } else if (plane == 1 && s < cns11643_2_ucs_table_size) {
9827 w = cns11643_2_ucs_table[s];
9828 } else if (plane == 13 && s < cns11643_14_ucs_table_size) {
9829 w = cns11643_14_ucs_table[s];
9830 }
9831
9832 if (!w)
9833 w = MBFL_BAD_INPUT;
9834 *out++ = w;
9835 continue;
9836 }
9837 }
9838 }
9839
9840 *out++ = MBFL_BAD_INPUT;
9841 } else {
9842 *out++ = MBFL_BAD_INPUT;
9843 }
9844 }
9845
9846 *in_len = e - p;
9847 *in = p;
9848 return out - buf;
9849 }
9850
/* Encode `len` Unicode codepoints from `in` as EUC-TW and append them to `buf`.
 *
 * The CNS-11643 reverse tables yield a value whose bits 16 and up hold the
 * plane number and whose low 16 bits hold the 2-byte code. Plane 0/1 results
 * are written as one or two bytes; other planes use the 4-byte form starting
 * with 0x8E. Unmappable codepoints (other than U+0000) are reported through
 * MB_CONVERT_ERROR. `end` is not used. */
static void mb_wchar_to_euctw(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_cns11643_table_min && w < ucs_a1_cns11643_table_max) {
			s = ucs_a1_cns11643_table[w - ucs_a1_cns11643_table_min];
		} else if (w >= ucs_a2_cns11643_table_min && w < ucs_a2_cns11643_table_max) {
			s = ucs_a2_cns11643_table[w - ucs_a2_cns11643_table_min];
		} else if (w >= ucs_a3_cns11643_table_min && w < ucs_a3_cns11643_table_max) {
			s = ucs_a3_cns11643_table[w - ucs_a3_cns11643_table_min];
		} else if (w >= ucs_i_cns11643_table_min && w < ucs_i_cns11643_table_max) {
			s = ucs_i_cns11643_table[w - ucs_i_cns11643_table_min];
		} else if (w >= ucs_r_cns11643_table_min && w < ucs_r_cns11643_table_max) {
			s = ucs_r_cns11643_table[w - ucs_r_cns11643_table_min];
		}

		if (s) {
			unsigned int plane = s >> 16;
			unsigned int hi = ((s >> 8) & 0xFF) | 0x80;
			unsigned int lo = (s & 0xFF) | 0x80;

			if (plane > 1) {
				/* The initial reservation only covers 2 bytes per codepoint */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add4(out, 0x8E, 0xA0 + plane, hi, lo);
			} else if (s < 0x80) {
				out = mb_convert_buf_add(out, s);
			} else {
				out = mb_convert_buf_add2(out, hi, lo);
			}
		} else if (w == 0) {
			out = mb_convert_buf_add(out, 0);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_euctw);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9897
/* Byte-at-a-time EUC-KR decoder.
 *
 * `c` is the next input byte. `filter->status` is 0 when expecting the start of
 * a character and 1 when a lead byte (saved in `filter->cache`) has been seen.
 * Decoded codepoints, or MBFL_BAD_INPUT for invalid input, are passed to
 * `filter->output_function`. Returns 0, or -1 via CK if the output function fails. */
static int mbfl_filt_conv_euckr_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (((c >= 0xA1 && c <= 0xAC) || (c >= 0xB0 && c <= 0xFD)) && c != 0xC9) { /* dbcs lead byte */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: { /* dbcs second byte */
		int lead = filter->cache;
		int w;

		filter->status = 0;

		if (c < 0xa1 || c > 0xfe) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		/* Which UHC table is used depends on the lead byte */
		if (lead >= 0xa1 && lead <= 0xc6) {
			/* 1st: 0xa1..0xc6, 2nd: 0x41..0x7a, 0x81..0xfe */
			w = (lead - 0x81)*190 + c - 0x41;
			ZEND_ASSERT(w < uhc1_ucs_table_size);
			w = uhc1_ucs_table[w];
		} else if (lead >= 0xc7 && lead <= 0xfe && lead != 0xc9) {
			/* 1st: 0xc7..0xc8,0xca..0xfe, 2nd: 0xa1..0xfe */
			w = (lead - 0xc7)*94 + c - 0xa1;
			ZEND_ASSERT(w < uhc3_ucs_table_size);
			w = uhc3_ucs_table[w];
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;
	}

	EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
9948
/* Codepoint-at-a-time EUC-KR encoder.
 *
 * Looks `c` up in the UHC reverse tables, discards results outside the EUC-KR
 * byte range (both bytes must be >= 0xA1), and sends the resulting one or two
 * bytes to `filter->output_function`. Unmappable codepoints go to
 * mbfl_filt_conv_illegal_output. Returns 0, or -1 via CK on output failure. */
static int mbfl_filt_conv_wchar_euckr(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
		s = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
	} else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) {
		s = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min];
	} else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) {
		s = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min];
	} else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) {
		s = ucs_i_uhc_table[c - ucs_i_uhc_table_min];
	} else if (c >= ucs_s_uhc_table_min && c < ucs_s_uhc_table_max) {
		s = ucs_s_uhc_table[c - ucs_s_uhc_table_min];
	} else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) {
		s = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min];
	} else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) {
		s = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
	}

	/* exclude UHC extension area (although we are using the UHC conversion tables) */
	if (((s >> 8) & 0xFF) < 0xA1 || (s & 0xFF) < 0xA1) {
		s = 0;
	}

	/* Without a table mapping, only values below 0x80 can be passed through */
	if (s <= 0) {
		s = (c < 0x80) ? c : -1;
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (s < 0x80) { /* latin */
		CK((*filter->output_function)(s, filter->data));
	} else {
		CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(s & 0xff, filter->data));
	}

	return 0;
}
9995
/* Finish EUC-KR decoding: if the input ended after a lead byte, report the
 * incomplete character as MBFL_BAD_INPUT, then invoke the filter's own flush
 * function, if it has one. */
static int mbfl_filt_conv_euckr_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status == 1) {
		/* Input stopped in the middle of a 2-byte character */
		filter->status = 0;
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return 0;
}
10010
mb_euckr_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)10011 static size_t mb_euckr_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
10012 {
10013 unsigned char *p = *in, *e = p + *in_len;
10014 uint32_t *out = buf, *limit = buf + bufsize;
10015
10016 while (p < e && out < limit) {
10017 unsigned char c = *p++;
10018
10019 if (c < 0x80) {
10020 *out++ = c;
10021 } else if (((c >= 0xA1 && c <= 0xAC) || (c >= 0xB0 && c <= 0xFD)) && c != 0xC9 && p < e) {
10022 unsigned char c2 = *p++;
10023 if (c2 < 0xA1 || c2 == 0xFF) {
10024 *out++ = MBFL_BAD_INPUT;
10025 continue;
10026 }
10027
10028 if (c <= 0xC6) {
10029 unsigned int w = (c - 0x81)*190 + c2 - 0x41;
10030 ZEND_ASSERT(w < uhc1_ucs_table_size);
10031 w = uhc1_ucs_table[w];
10032 if (!w)
10033 w = MBFL_BAD_INPUT;
10034 *out++ = w;
10035 } else {
10036 unsigned int w = (c - 0xC7)*94 + c2 - 0xA1;
10037 ZEND_ASSERT(w < uhc3_ucs_table_size);
10038 w = uhc3_ucs_table[w];
10039 if (!w)
10040 w = MBFL_BAD_INPUT;
10041 *out++ = w;
10042 }
10043 } else {
10044 *out++ = MBFL_BAD_INPUT;
10045 }
10046 }
10047
10048 *in_len = e - p;
10049 *in = p;
10050 return out - buf;
10051 }
10052
/* Encode `len` Unicode codepoints from `in` as EUC-KR and append them to `buf`.
 *
 * Uses the UHC reverse tables but rejects any result outside the EUC-KR byte
 * range (both bytes must be >= 0xA1). Codepoints below 0x80 without a mapping
 * are emitted as-is; other unmappable codepoints are reported through
 * MB_CONVERT_ERROR. `end` is not used. */
static void mb_wchar_to_euckr(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_uhc_table_min && w < ucs_a1_uhc_table_max) {
			s = ucs_a1_uhc_table[w - ucs_a1_uhc_table_min];
		} else if (w >= ucs_a2_uhc_table_min && w < ucs_a2_uhc_table_max) {
			s = ucs_a2_uhc_table[w - ucs_a2_uhc_table_min];
		} else if (w >= ucs_a3_uhc_table_min && w < ucs_a3_uhc_table_max) {
			s = ucs_a3_uhc_table[w - ucs_a3_uhc_table_min];
		} else if (w >= ucs_i_uhc_table_min && w < ucs_i_uhc_table_max) {
			s = ucs_i_uhc_table[w - ucs_i_uhc_table_min];
		} else if (w >= ucs_s_uhc_table_min && w < ucs_s_uhc_table_max) {
			s = ucs_s_uhc_table[w - ucs_s_uhc_table_min];
		} else if (w >= ucs_r1_uhc_table_min && w < ucs_r1_uhc_table_max) {
			s = ucs_r1_uhc_table[w - ucs_r1_uhc_table_min];
		} else if (w >= ucs_r2_uhc_table_min && w < ucs_r2_uhc_table_max) {
			s = ucs_r2_uhc_table[w - ucs_r2_uhc_table_min];
		}

		/* Exclude UHC extension area (although we are using the UHC conversion tables) */
		if (((s >> 8) & 0xFF) < 0xA1 || (s & 0xFF) < 0xA1) {
			s = 0;
		}

		if (s) {
			if (s < 0x80) {
				out = mb_convert_buf_add(out, s);
			} else {
				/* The initial reservation only covers 1 byte per codepoint */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
				out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
			}
		} else if (w < 0x80) {
			/* No table mapping, but the codepoint fits in a single byte */
			out = mb_convert_buf_add(out, w);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_euckr);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
10101
/* Byte-at-a-time UHC decoder. filter->status is 0 when a new character is
 * expected and 1 when a lead byte is waiting in filter->cache. Each decoded
 * codepoint (or MBFL_BAD_INPUT) is passed to filter->output_function. */
static int mbfl_filt_conv_uhc_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* Single-byte character */
			CK((*filter->output_function)(c, filter->data));
		} else if (c <= 0x80 || c >= 0xfe || c == 0xc9) {
			/* Not usable as a lead byte */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			/* Lead byte of a 2-byte character; wait for the trail byte */
			filter->cache = c;
			filter->status = 1;
		}
		break;

	case 1: {
		/* Trail byte of a 2-byte character */
		int lead = filter->cache;
		int wc = 0;

		filter->status = 0;

		if (lead >= 0x81 && lead <= 0xc6 && c >= 0x41 && c <= 0xfe) {
			int idx = (lead - 0x81)*190 + (c - 0x41);
			wc = (idx >= 0 && idx < uhc1_ucs_table_size) ? uhc1_ucs_table[idx] : idx;
		} else if (lead >= 0xc7 && lead < 0xfe && c >= 0xa1 && c <= 0xfe) {
			int idx = (lead - 0xc7)*94 + (c - 0xa1);
			wc = (idx >= 0 && idx < uhc3_ucs_table_size) ? uhc3_ucs_table[idx] : idx;
		}

		/* Zero covers both an out-of-range byte pair and an empty table entry */
		if (wc == 0) {
			wc = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(wc, filter->data));
		break;
	}

	EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
10143
/* End-of-input handler for the UHC decoder: reports a dangling lead byte as
 * MBFL_BAD_INPUT, then calls the filter's flush_function if one is set. */
static int mbfl_filt_conv_uhc_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status == 1) {
		/* Input ended between the two bytes of a character */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
10158
/* Encode one codepoint as UHC, sending the resulting byte(s) to
 * filter->output_function. Codepoints with no mapping are passed to
 * mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_uhc(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	/* Look the codepoint up in whichever reverse table covers it */
	if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
		code = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
	} else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) {
		code = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min];
	} else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) {
		code = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min];
	} else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) {
		code = ucs_i_uhc_table[c - ucs_i_uhc_table_min];
	} else if (c >= ucs_s_uhc_table_min && c < ucs_s_uhc_table_max) {
		code = ucs_s_uhc_table[c - ucs_s_uhc_table_min];
	} else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) {
		code = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min];
	} else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) {
		code = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
	}

	if (code < 0 || (code == 0 && c != 0)) {
		/* A zero result is only acceptable for codepoint zero itself */
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (code < 0x80) {
		/* Single byte */
		CK((*filter->output_function)(code, filter->data));
	} else {
		/* Two bytes, high byte first */
		CK((*filter->output_function)((code >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(code & 0xff, filter->data));
	}

	return 0;
}
10196
mb_uhc_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)10197 static size_t mb_uhc_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
10198 {
10199 unsigned char *p = *in, *e = p + *in_len;
10200 uint32_t *out = buf, *limit = buf + bufsize;
10201
10202 e--; /* Stop the main loop 1 byte short of the end of the input */
10203
10204 while (p < e && out < limit) {
10205 unsigned char c = *p++;
10206
10207 if (c < 0x80) {
10208 *out++ = c;
10209 } else if (c > 0x80 && c < 0xFE) {
10210 /* We don't need to check p < e here; it's not possible that this pointer dereference
10211 * will be outside the input string, because of e-- above */
10212 unsigned char c2 = *p++;
10213 if (c2 < 0x41 || c2 == 0xFF) {
10214 *out++ = MBFL_BAD_INPUT;
10215 continue;
10216 }
10217 unsigned int w = 0;
10218
10219 if (c <= 0xC6) {
10220 w = (c - 0x81)*190 + c2 - 0x41;
10221 ZEND_ASSERT(w < uhc1_ucs_table_size);
10222 w = uhc1_ucs_table[w];
10223 } else if (c2 >= 0xA1) {
10224 w = (c - 0xC7)*94 + c2 - 0xA1;
10225 ZEND_ASSERT(w < uhc3_ucs_table_size);
10226 w = uhc3_ucs_table[w];
10227 }
10228 if (!w) {
10229 /* If c == 0xC9, we shouldn't have tried to read a 2-byte char at all... but it is faster
10230 * to fix up that rare case here rather than include an extra check in the hot path */
10231 if (c == 0xC9) {
10232 p--;
10233 }
10234 w = MBFL_BAD_INPUT;
10235 }
10236 *out++ = w;
10237 } else {
10238 *out++ = MBFL_BAD_INPUT;
10239 }
10240 }
10241
10242 /* Finish up last byte of input string if there is one */
10243 if (p == e && out < limit) {
10244 unsigned char c = *p++;
10245 *out++ = (c < 0x80) ? c : MBFL_BAD_INPUT;
10246 }
10247
10248 *in_len = e - p + 1;
10249 *in = p;
10250 return out - buf;
10251 }
10252
/* Encode `len` codepoints from `in` as UHC and append them to `buf`.
 * Codepoints without a mapping are handed to MB_CONVERT_ERROR.
 * `end` is not used by this encoder. */
static void mb_wchar_to_uhc(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int code = 0;

		/* Look the codepoint up in whichever reverse table covers it */
		if (cp >= ucs_a1_uhc_table_min && cp < ucs_a1_uhc_table_max) {
			code = ucs_a1_uhc_table[cp - ucs_a1_uhc_table_min];
		} else if (cp >= ucs_a2_uhc_table_min && cp < ucs_a2_uhc_table_max) {
			code = ucs_a2_uhc_table[cp - ucs_a2_uhc_table_min];
		} else if (cp >= ucs_a3_uhc_table_min && cp < ucs_a3_uhc_table_max) {
			code = ucs_a3_uhc_table[cp - ucs_a3_uhc_table_min];
		} else if (cp >= ucs_i_uhc_table_min && cp < ucs_i_uhc_table_max) {
			code = ucs_i_uhc_table[cp - ucs_i_uhc_table_min];
		} else if (cp >= ucs_s_uhc_table_min && cp < ucs_s_uhc_table_max) {
			code = ucs_s_uhc_table[cp - ucs_s_uhc_table_min];
		} else if (cp >= ucs_r1_uhc_table_min && cp < ucs_r1_uhc_table_max) {
			code = ucs_r1_uhc_table[cp - ucs_r1_uhc_table_min];
		} else if (cp >= ucs_r2_uhc_table_min && cp < ucs_r2_uhc_table_max) {
			code = ucs_r2_uhc_table[cp - ucs_r2_uhc_table_min];
		}

		if (code >= 0x80) {
			/* Two bytes, high byte first */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
		} else if (code) {
			/* Single byte */
			out = mb_convert_buf_add(out, code);
		} else if (cp == 0) {
			/* A zero result is only acceptable for codepoint zero itself */
			out = mb_convert_buf_add(out, 0);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_uhc);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
10296
/* 256-entry table indexed by byte value; each row below holds 16 entries, so
 * a row corresponds to one value of the high nibble. Entries are 2 for 0x8E
 * and for 0xA1-0xFE, 3 for 0x8F, and 1 for every other byte.
 * NOTE(review): going by the `mblen` name this is presumably the byte length
 * of a character that starts with the given byte -- confirm against the code
 * that consumes the table. */
static const unsigned char mblen_table_eucjp[] = { /* 0xA1-0xFE */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80-0x8F: 0x8E -> 2, 0x8F -> 3 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xA0 -> 1, 0xA1 onwards -> 2 */
  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xF0-0xFE -> 2, 0xFF -> 1 */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
10315
/* Alias strings for EUC-JP (NULL-terminated list) */
static const char *mbfl_encoding_euc_jp_aliases[] = {"EUC", "EUC_JP", "eucJP", "x-euc-jp", NULL};

/* Filter table tagged with the (EUC-JP, wchar) encoding pair: the common
 * constructor, mbfl_filt_conv_eucjp_wchar and its flush counterpart. The two
 * remaining slots are left NULL. */
static const struct mbfl_convert_vtbl vtbl_eucjp_wchar = {
	mbfl_no_encoding_euc_jp,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_eucjp_wchar,
	mbfl_filt_conv_eucjp_wchar_flush,
	NULL,
};

/* Filter table tagged with the (wchar, EUC-JP) encoding pair; uses the common
 * constructor and the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_eucjp = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_euc_jp,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_eucjp,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor for EUC-JP: identifier, two name strings, alias list,
 * mblen table, a zero field, the two filter tables above, the two buffer-based
 * converters, and a trailing NULL.
 * NOTE(review): the struct declaration is not in this part of the file, so the
 * meaning of each positional field should be confirmed there. */
const mbfl_encoding mbfl_encoding_euc_jp = {
	mbfl_no_encoding_euc_jp,
	"EUC-JP",
	"EUC-JP",
	mbfl_encoding_euc_jp_aliases,
	mblen_table_eucjp,
	0,
	&vtbl_eucjp_wchar,
	&vtbl_wchar_eucjp,
	mb_eucjp_to_wchar,
	mb_wchar_to_eucjp,
	NULL
};
10351
/* Alias strings for EUC-JP-2004 (NULL-terminated list) */
static const char *mbfl_encoding_eucjp2004_aliases[] = {"EUC_JP-2004", NULL};

/* Filter table tagged with the (EUC-JP-2004, wchar) encoding pair. Note that the
 * filter and flush functions are the `jis2004` ones rather than functions
 * named after this encoding. */
static const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = {
	mbfl_no_encoding_eucjp2004,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis2004_wchar,
	mbfl_filt_conv_jis2004_wchar_flush,
	NULL,
};

/* Filter table tagged with the (wchar, EUC-JP-2004) encoding pair; unlike the
 * other EUC-JP variants it has its own flush function
 * (mbfl_filt_conv_wchar_jis2004_flush) instead of the common one */
static const struct mbfl_convert_vtbl vtbl_wchar_eucjp2004 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_eucjp2004,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_jis2004,
	mbfl_filt_conv_wchar_jis2004_flush,
	NULL,
};

/* Encoding descriptor for EUC-JP-2004. The second name string is "EUC-JP",
 * and the mblen table is the one shared by the EUC-JP family.
 * NOTE(review): the struct declaration is not in this part of the file, so the
 * meaning of each positional field should be confirmed there. */
const mbfl_encoding mbfl_encoding_eucjp2004 = {
	mbfl_no_encoding_eucjp2004,
	"EUC-JP-2004",
	"EUC-JP",
	mbfl_encoding_eucjp2004_aliases,
	mblen_table_eucjp,
	0,
	&vtbl_eucjp2004_wchar,
	&vtbl_wchar_eucjp2004,
	mb_eucjp2004_to_wchar,
	mb_wchar_to_eucjp2004,
	NULL
};
10387
/* Alias strings for eucJP-win (NULL-terminated list) */
static const char *mbfl_encoding_eucjp_win_aliases[] = {"eucJP-open", "eucJP-ms", NULL};

/* Filter table tagged with the (eucJP-win, wchar) encoding pair: the common
 * constructor, mbfl_filt_conv_eucjpwin_wchar and its flush counterpart */
static const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = {
	mbfl_no_encoding_eucjp_win,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_eucjpwin_wchar,
	mbfl_filt_conv_eucjpwin_wchar_flush,
	NULL,
};

/* Filter table tagged with the (wchar, eucJP-win) encoding pair; uses the common
 * constructor and the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_eucjpwin = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_eucjp_win,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_eucjpwin,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor for eucJP-win. The second name string is "EUC-JP",
 * and the mblen table is the one shared by the EUC-JP family.
 * NOTE(review): the struct declaration is not in this part of the file, so the
 * meaning of each positional field should be confirmed there. */
const mbfl_encoding mbfl_encoding_eucjp_win = {
	mbfl_no_encoding_eucjp_win,
	"eucJP-win",
	"EUC-JP",
	mbfl_encoding_eucjp_win_aliases,
	mblen_table_eucjp,
	0,
	&vtbl_eucjpwin_wchar,
	&vtbl_wchar_eucjpwin,
	mb_eucjpwin_to_wchar,
	mb_wchar_to_eucjpwin,
	NULL
};
10423
/* Alias strings for CP51932 (NULL-terminated list); the only alias is the
 * lower-case spelling of the name */
static const char *mbfl_encoding_cp51932_aliases[] = {"cp51932", NULL};

/* Filter table tagged with the (CP51932, wchar) encoding pair: the common
 * constructor, mbfl_filt_conv_cp51932_wchar and its flush counterpart */
static const struct mbfl_convert_vtbl vtbl_cp51932_wchar = {
	mbfl_no_encoding_cp51932,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp51932_wchar,
	mbfl_filt_conv_cp51932_wchar_flush,
	NULL,
};

/* Filter table tagged with the (wchar, CP51932) encoding pair; uses the common
 * constructor and the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_cp51932 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp51932,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp51932,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor for CP51932; it uses the mblen table shared by the
 * EUC-JP family.
 * NOTE(review): the struct declaration is not in this part of the file, so the
 * meaning of each positional field should be confirmed there. */
const mbfl_encoding mbfl_encoding_cp51932 = {
	mbfl_no_encoding_cp51932,
	"CP51932",
	"CP51932",
	mbfl_encoding_cp51932_aliases,
	mblen_table_eucjp,
	0,
	&vtbl_cp51932_wchar,
	&vtbl_wchar_cp51932,
	mb_cp51932_to_wchar,
	mb_wchar_to_cp51932,
	NULL
};
10459
/* 256-entry table indexed by byte value; each row below holds 16 entries, so
 * a row corresponds to one value of the high nibble. Entries are 2 for
 * 0xA1-0xFE and 1 for every other byte. In this part of the file the table is
 * referenced by the EUC-CN, EUC-TW and EUC-KR encoding descriptors.
 * NOTE(review): going by the `mblen` name this is presumably the byte length
 * of a character that starts with the given byte -- confirm against the code
 * that consumes the table. */
static const unsigned char mblen_table_euccn[] = { /* 0xA1-0xFE */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xA0 -> 1, 0xA1 onwards -> 2 */
  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xF0-0xFE -> 2, 0xFF -> 1 */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
10478
/* Alias strings for EUC-CN (NULL-terminated list) */
static const char *mbfl_encoding_euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", "gb2312", NULL};

/* Filter table tagged with the (EUC-CN, wchar) encoding pair: the common
 * constructor, mbfl_filt_conv_euccn_wchar and its flush counterpart */
static const struct mbfl_convert_vtbl vtbl_euccn_wchar = {
	mbfl_no_encoding_euc_cn,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_euccn_wchar,
	mbfl_filt_conv_euccn_wchar_flush,
	NULL,
};

/* Filter table tagged with the (wchar, EUC-CN) encoding pair; uses the common
 * constructor and the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_euccn = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_euc_cn,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_euccn,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor for EUC-CN. The second name string is "CN-GB", which
 * also appears in the alias list.
 * NOTE(review): the struct declaration is not in this part of the file, so the
 * meaning of each positional field should be confirmed there. */
const mbfl_encoding mbfl_encoding_euc_cn = {
	mbfl_no_encoding_euc_cn,
	"EUC-CN",
	"CN-GB",
	mbfl_encoding_euc_cn_aliases,
	mblen_table_euccn,
	0,
	&vtbl_euccn_wchar,
	&vtbl_wchar_euccn,
	mb_euccn_to_wchar,
	mb_wchar_to_euccn,
	NULL
};
10514
/* Alias strings for EUC-TW (NULL-terminated list) */
static const char *mbfl_encoding_euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL};

/* Filter table tagged with the (EUC-TW, wchar) encoding pair: the common
 * constructor, mbfl_filt_conv_euctw_wchar and its flush counterpart */
static const struct mbfl_convert_vtbl vtbl_euctw_wchar = {
	mbfl_no_encoding_euc_tw,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_euctw_wchar,
	mbfl_filt_conv_euctw_wchar_flush,
	NULL,
};

/* Filter table tagged with the (wchar, EUC-TW) encoding pair; uses the common
 * constructor and the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_euctw = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_euc_tw,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_euctw,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor for EUC-TW.
 * NOTE(review): this reuses the EUC-CN mblen table, in which byte 0x8E maps
 * to 1. EUC-TW is generally described as having 4-byte sequences introduced
 * by 0x8E, so confirm that sharing the EUC-CN table is intended here.
 * NOTE(review): the struct declaration is not in this part of the file, so the
 * meaning of each positional field should be confirmed there. */
const mbfl_encoding mbfl_encoding_euc_tw = {
	mbfl_no_encoding_euc_tw,
	"EUC-TW",
	"EUC-TW",
	mbfl_encoding_euc_tw_aliases,
	mblen_table_euccn,
	0,
	&vtbl_euctw_wchar,
	&vtbl_wchar_euctw,
	mb_euctw_to_wchar,
	mb_wchar_to_euctw,
	NULL
};
10550
/* Alias strings for EUC-KR (NULL-terminated list) */
static const char *mbfl_encoding_euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL};

/* Filter table tagged with the (EUC-KR, wchar) encoding pair: the common
 * constructor, mbfl_filt_conv_euckr_wchar and its flush counterpart */
static const struct mbfl_convert_vtbl vtbl_euckr_wchar = {
	mbfl_no_encoding_euc_kr,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_euckr_wchar,
	mbfl_filt_conv_euckr_wchar_flush,
	NULL,
};

/* Filter table tagged with the (wchar, EUC-KR) encoding pair; uses the common
 * constructor and the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_euckr = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_euc_kr,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_euckr,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor for EUC-KR; it shares the EUC-CN mblen table and uses
 * mb_euckr_to_wchar / mb_wchar_to_euckr as its buffer-based converters.
 * NOTE(review): the struct declaration is not in this part of the file, so the
 * meaning of each positional field should be confirmed there. */
const mbfl_encoding mbfl_encoding_euc_kr = {
	mbfl_no_encoding_euc_kr,
	"EUC-KR",
	"EUC-KR",
	mbfl_encoding_euc_kr_aliases,
	mblen_table_euccn,
	0,
	&vtbl_euckr_wchar,
	&vtbl_wchar_euckr,
	mb_euckr_to_wchar,
	mb_wchar_to_euckr,
	NULL
};
10586
/* UHC was introduced by Microsoft in Windows 95, and is also known as CP949.
 * It is the same as EUC-KR, but with 8,822 additional characters added to
 * complete all the characters in the Johab charset. */
10590
/* 256-entry table indexed by byte value; each row below holds 16 entries, so
 * a row corresponds to one value of the high nibble. Entries are 2 for
 * 0x81-0xFE and 1 for every other byte.
 * NOTE(review): going by the `mblen` name this is presumably the byte length
 * of a character that starts with the given byte -- confirm against the code
 * that consumes the table. */
static const unsigned char mblen_table_81_to_fe[] = { /* 0x81-0xFE */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 -> 1, 0x81 onwards -> 2 */
  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xF0-0xFE -> 2, 0xFF -> 1 */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
10609
/* Alias strings for UHC (NULL-terminated list) */
static const char *mbfl_encoding_uhc_aliases[] = {"CP949", NULL};

/* Filter table tagged with the (UHC, wchar) encoding pair: the common
 * constructor, mbfl_filt_conv_uhc_wchar and its flush counterpart */
static const struct mbfl_convert_vtbl vtbl_uhc_wchar = {
	mbfl_no_encoding_uhc,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_uhc_wchar,
	mbfl_filt_conv_uhc_wchar_flush,
	NULL,
};

/* Filter table tagged with the (wchar, UHC) encoding pair; uses the common
 * constructor and the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_uhc = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_uhc,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_uhc,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor for UHC; it uses the 0x81-0xFE mblen table and
 * mb_uhc_to_wchar / mb_wchar_to_uhc as its buffer-based converters.
 * NOTE(review): the struct declaration is not in this part of the file, so the
 * meaning of each positional field should be confirmed there. */
const mbfl_encoding mbfl_encoding_uhc = {
	mbfl_no_encoding_uhc,
	"UHC",
	"UHC",
	mbfl_encoding_uhc_aliases,
	mblen_table_81_to_fe,
	0,
	&vtbl_uhc_wchar,
	&vtbl_wchar_uhc,
	mb_uhc_to_wchar,
	mb_wchar_to_uhc,
	NULL
};
10645
10646 /*
10647 * GB18030/CP936
10648 */
10649
/* Byte-at-a-time GB18030 decoder. filter->status counts how many bytes of the
 * current character have been collected (0 to 3) and filter->cache holds those
 * bytes packed big-endian. Every decoded codepoint (or MBFL_BAD_INPUT) is
 * passed to filter->output_function. */
static int mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* Single-byte character */
			CK((*filter->output_function)(c, filter->data));
		} else if (c <= 0x80 || c >= 0xff) {
			/* Not usable as a lead byte */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			/* Lead byte of a 2- or 4-byte character */
			filter->cache = c;
			filter->status = 1;
		}
		break;

	case 1: {
		/* Second byte: decides between the 2-byte and 4-byte forms */
		int b1 = filter->cache;
		int w = -1;

		filter->status = 0;

		if (((b1 >= 0x81 && b1 <= 0x84) || (b1 >= 0x90 && b1 <= 0xe3)) && c >= 0x30 && c <= 0x39) {
			/* 4-byte form: lead 0x81-0x84 (BMP) or 0x90-0xE3 (other planes)
			 * followed by an ASCII digit; two more bytes are needed */
			filter->status = 2;
			filter->cache = (b1 << 8) | c;
			return 0;
		}

		if (((b1 >= 0xaa && b1 <= 0xaf) || (b1 >= 0xf8 && b1 <= 0xfe)) && (c >= 0xa1 && c <= 0xfe)) {
			/* UDA part 1,2: U+E000-U+E4C5 */
			w = 94*(b1 >= 0xf8 ? b1 - 0xf2 : b1 - 0xaa) + (c - 0xa1) + 0xe000;
			CK((*filter->output_function)(w, filter->data));
			break;
		}
		if (b1 >= 0xa1 && b1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) {
			/* UDA part 3: U+E4C6-U+E765 */
			w = 96*(b1 - 0xa1) + c - (c >= 0x80 ? 0x41 : 0x40) + 0xe4c6;
			CK((*filter->output_function)(w, filter->data));
			break;
		}

		int pair = (b1 << 8) | c;

		/* Scan the PUA table only when the byte pair lies in one of the three
		 * windows which that table can cover */
		if ((pair >= 0xa2ab && pair <= 0xa9f0 + (0xe80f-0xe801)) ||
			(pair >= 0xd7fa && pair <= 0xd7fa + (0xe814-0xe810)) ||
			(pair >= 0xfe50 && pair <= 0xfe80 + (0xe864-0xe844))) {
			for (int i = 0; i < mbfl_gb18030_pua_tbl_max; i++) {
				if (pair >= mbfl_gb18030_pua_tbl[i][2] && pair <= mbfl_gb18030_pua_tbl[i][2] + mbfl_gb18030_pua_tbl[i][1] - mbfl_gb18030_pua_tbl[i][0]) {
					w = pair - mbfl_gb18030_pua_tbl[i][2] + mbfl_gb18030_pua_tbl[i][0];
					CK((*filter->output_function)(w, filter->data));
					break;
				}
			}
		}

		if (w <= 0) {
			/* Nothing emitted so far: fall back to the CP936 table */
			int in_cp936 =
				(b1 >= 0xa1 && b1 <= 0xa9 && c >= 0xa1 && c <= 0xfe) ||
				(b1 >= 0xb0 && b1 <= 0xf7 && c >= 0xa1 && c <= 0xfe) ||
				(b1 >= 0x81 && b1 <= 0xa0 && c >= 0x40 && c <= 0xfe && c != 0x7f) ||
				(b1 >= 0xaa && b1 <= 0xfe && c >= 0x40 && c <= 0xa0 && c != 0x7f) ||
				(b1 >= 0xa8 && b1 <= 0xa9 && c >= 0x40 && c <= 0xa0 && c != 0x7f);

			if (in_cp936) {
				w = (b1 - 0x81)*192 + c - 0x40;
				ZEND_ASSERT(w < cp936_ucs_table_size);
				CK((*filter->output_function)(cp936_ucs_table[w], filter->data));
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;
	}

	case 2: {
		/* Third byte of a 4-byte character */
		int b1 = (filter->cache >> 8) & 0xff;
		int b2 = filter->cache & 0xff;

		filter->status = filter->cache = 0;

		if (((b1 >= 0x81 && b1 <= 0x84) || (b1 >= 0x90 && b1 <= 0xe3)) && b2 >= 0x30 && b2 <= 0x39 && c >= 0x81 && c <= 0xfe) {
			filter->cache = (b1 << 16) | (b2 << 8) | c;
			filter->status = 3;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

	case 3: {
		/* Fourth byte of a 4-byte character */
		int b1 = (filter->cache >> 16) & 0xff;
		int b2 = (filter->cache >> 8) & 0xff;
		int b3 = filter->cache & 0xff;
		int w;

		filter->status = filter->cache = 0;

		if (!(((b1 >= 0x81 && b1 <= 0x84) || (b1 >= 0x90 && b1 <= 0xe3)) && b2 >= 0x30 && b2 <= 0x39 && b3 >= 0x81 && b3 <= 0xfe && c >= 0x30 && c <= 0x39)) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		if (b1 >= 0x90 && b1 <= 0xe3) {
			/* Linear index into U+10000 and above */
			w = ((((b1 - 0x90)*10 + (b2 - 0x30))*126 + (b3 - 0x81)))*10 + (c - 0x30) + 0x10000;
			if (w > 0x10FFFF) {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				return 0;
			}
		} else {
			/* Unicode BMP: linear index, shifted by a per-range offset */
			w = (((b1 - 0x81)*10 + (b2 - 0x30))*126 + (b3 - 0x81))*10 + (c - 0x30);
			if (w < 0 || w > 39419) {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				return 0;
			}
			int range = mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max);
			w += mbfl_gb_uni_ofst[range];
		}
		CK((*filter->output_function)(w, filter->data));
		break;
	}

	EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
10766
/* End-of-input handler for the GB18030 decoder: a character cut off after 1, 2
 * or 3 bytes is reported as MBFL_BAD_INPUT, then the filter's flush_function
 * is called if one is set. */
static int mbfl_filt_conv_gb18030_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status) {
		/* Input ended in the middle of a multi-byte character */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
10781
/* Encode one codepoint `c` as GB18030, sending each output byte to
 * filter->output_function. Codepoints for which no code is found are passed to
 * mbfl_filt_conv_illegal_output.
 *
 * `s` holds the code found so far: a 1- or 2-byte code, or the last three
 * bytes of a 4-byte code, in which case `s1` (> 0) holds the first byte.
 * The stages below run in a fixed order and later stages may overwrite `s`. */
static int mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
{
	int k, k1, k2;
	int c1, s = 0, s1 = 0;

	/* Stage 1: CP936 reverse tables, with a few hard-coded overrides */
	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		if (c == 0x01f9) {
			s = 0xa8bf;
		} else {
			s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
		}
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		if (c == 0x20ac) { /* euro-sign */
			s = 0xa2e3;
		} else {
			s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
	} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
		s = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
	} else if (c >= ucs_ci_cp936_table_min && c < ucs_ci_cp936_table_max) {
		/* U+F900-FA2F CJK Compatibility Ideographs */
		if (c == 0xf92c) {
			s = 0xfd9c;
		} else if (c == 0xf979) {
			s = 0xfd9d;
		} else if (c == 0xf995) {
			s = 0xfd9e;
		} else if (c == 0xf9e7) {
			s = 0xfd9f;
		} else if (c == 0xf9f1) {
			s = 0xfda0;
		} else if (c >= 0xfa0c && c <= 0xfa29) {
			s = ucs_ci_s_cp936_table[c - 0xfa0c];
		}
	} else if (c >= ucs_cf_cp936_table_min && c < ucs_cf_cp936_table_max) {
		/* FE30h CJK Compatibility Forms */
		s = ucs_cf_cp936_table[c - ucs_cf_cp936_table_min];
	} else if (c >= ucs_sfv_cp936_table_min && c < ucs_sfv_cp936_table_max) {
		/* U+FE50-FE6F Small Form Variants */
		s = ucs_sfv_cp936_table[c - ucs_sfv_cp936_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		/* U+FF00-FFFF HW/FW Forms */
		if (c == 0xff04) {
			s = 0xa1e7;
		} else if (c == 0xff5e) {
			s = 0xa1ab;
		} else if (c >= 0xff01 && c <= 0xff5d) {
			/* Contiguous run: U+FF01 maps to 0xA3A1 and so on */
			s = c - 0xff01 + 0xa3a1;
		} else if (c >= 0xffe0 && c <= 0xffe5) {
			s = ucs_hff_s_cp936_table[c-0xffe0];
		}
	}

	/* Stage 2: While GB18030 and CP936 are very similar, some mappings are different between these encodings;
	 * do a binary search in a table of differing codepoints to see if we have one */
	if (s <= 0 && c >= mbfl_gb18030_c_tbl_key[0] && c <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
		k1 = mbfl_bisec_srch2(c, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
		if (k1 >= 0) {
			s = mbfl_gb18030_c_tbl_val[k1];
		}
	}

	/* Stage 3: private use area. Below U+E766 the code is computed arithmetically
	 * and replaces whatever the earlier stages found. */
	if (c >= 0xe000 && c <= 0xe864) { /* PUA */
		if (c < 0xe766) {
			if (c < 0xe4c6) {
				/* Rows of 94 cells with trail bytes from 0xA1; rows 0-5 use lead
				 * bytes 0xAA-0xAF, later rows use lead bytes from 0xF8 */
				c1 = c - 0xe000;
				s = (c1 % 94) + 0xa1;
				c1 /= 94;
				s |= (c1 < 0x06 ? c1 + 0xaa : c1 + 0xf2) << 8;
			} else {
				/* Rows of 96 cells with lead bytes from 0xA1 and trail bytes from
				 * 0x40; the extra +1 from cell 0x3F onwards skips trail byte 0x7F */
				c1 = c - 0xe4c6;
				s = ((c1 / 96) + 0xa1) << 8;
				c1 %= 96;
				s |= c1 + (c1 >= 0x3f ? 0x41 : 0x40);
			}
		} else {
			/* U+E766..U+E864 */
			/* Binary search over table rows; columns 0 and 1 of a row are compared
			 * against the codepoint as range bounds and column 2 is the code for
			 * the range start. `s` is left untouched when no row matches. */
			k1 = 0;
			k2 = mbfl_gb18030_pua_tbl_max;
			while (k1 < k2) {
				k = (k1 + k2) >> 1;
				if (c < mbfl_gb18030_pua_tbl[k][0]) {
					k2 = k;
				} else if (c > mbfl_gb18030_pua_tbl[k][1]) {
					k1 = k + 1;
				} else {
					s = c - mbfl_gb18030_pua_tbl[k][0] + mbfl_gb18030_pua_tbl[k][2];
					break;
				}
			}
		}
	}

	/* Stage 4: If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
	if (s <= 0 && c >= 0x0080 && c <= 0xffff) {
		/* BMP */
		/* `s` temporarily holds the search result; a negative value skips the
		 * block below and later sends the codepoint to the illegal-output path */
		s = mbfl_bisec_srch(c, mbfl_uni2gb_tbl, mbfl_gb_uni_max);
		if (s >= 0) {
			/* Split the linear index into digit (base 10), byte (base 126),
			 * digit (base 10) and lead byte, least significant first */
			c1 = c - mbfl_gb_uni_ofst[s];
			s = (c1 % 10) + 0x30;
			c1 /= 10;
			s |= ((c1 % 126) + 0x81) << 8;
			c1 /= 126;
			s |= ((c1 % 10) + 0x30) << 16;
			c1 /= 10;
			s1 = c1 + 0x81;
		}
	} else if (c >= 0x10000 && c <= 0x10ffff) {
		/* Code set 3: Unicode U+10000..U+10FFFF */
		/* Same digit/byte split as above, with lead bytes starting at 0x90 */
		c1 = c - 0x10000;
		s = (c1 % 10) + 0x30;
		c1 /= 10;
		s |= ((c1 % 126) + 0x81) << 8;
		c1 /= 126;
		s |= ((c1 % 10) + 0x30) << 16;
		c1 /= 10;
		s1 = c1 + 0x90;
	}

	/* Codepoint zero always encodes as a zero byte; for any other codepoint a
	 * zero code means that nothing was found */
	if (c == 0) {
		s = 0;
	} else if (s == 0) {
		s = -1;
	}

	if (s >= 0) {
		if (s <= 0x80) { /* latin */
			CK((*filter->output_function)(s, filter->data));
		} else if (s1 > 0) { /* qbcs */
			/* Four bytes: s1 first, then the three bytes packed in s */
			CK((*filter->output_function)(s1 & 0xff, filter->data));
			CK((*filter->output_function)((s >> 16) & 0xff, filter->data));
			CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
			CK((*filter->output_function)(s & 0xff, filter->data));
		} else { /* dbcs */
			CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
			CK((*filter->output_function)(s & 0xff, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
10927
/* 81-entry lookup table for the 2-byte codes 0xFE50..0xFEA0, one entry per
 * trail byte. mb_gb18030_to_wchar indexes it with `w - 0x5DD0` for w in
 * 0x5DD0..0x5E20 and treats a zero entry as "no entry here", falling through
 * to its regular table lookup. */
static const unsigned short gb18030_pua_tbl3[] = {
/* 0xFE50 */
0x0000,0xE816,0xE817,0xE818,0x0000,0x0000,0x0000,0x0000,
0x0000,0xE81E,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0xE826,0x0000,0x0000,0x0000,0x0000,0xE82B,0xE82C,
0x0000,0x0000,0x0000,0x0000,0xE831,0xE832,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xE83B,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xE843,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0xE854,0xE855,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
/* 0xFEA0 */
0xE864
};
10943
mb_gb18030_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)10944 static size_t mb_gb18030_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
10945 {
10946 unsigned char *p = *in, *e = p + *in_len;
10947 uint32_t *out = buf, *limit = buf + bufsize;
10948
10949 while (p < e && out < limit) {
10950 unsigned char c = *p++;
10951
10952 if (c < 0x80) {
10953 *out++ = c;
10954 } else if (c == 0x80 || c == 0xFF) {
10955 *out++ = MBFL_BAD_INPUT;
10956 } else {
10957 if (p == e) {
10958 *out++ = MBFL_BAD_INPUT;
10959 break;
10960 }
10961 unsigned char c2 = *p++;
10962
10963 if (((c >= 0x81 && c <= 0x84) || (c >= 0x90 && c <= 0xE3)) && c2 >= 0x30 && c2 <= 0x39) {
10964 if (p >= e) {
10965 *out++ = MBFL_BAD_INPUT;
10966 break;
10967 }
10968 unsigned char c3 = *p++;
10969
10970 if (c3 >= 0x81 && c3 <= 0xFE && p < e) {
10971 unsigned char c4 = *p++;
10972
10973 if (c4 >= 0x30 && c4 <= 0x39) {
10974 if (c >= 0x90 && c <= 0xE3) {
10975 unsigned int w = ((((c - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c4 - 0x30) + 0x10000;
10976 *out++ = (w > 0x10FFFF) ? MBFL_BAD_INPUT : w;
10977 } else {
10978 /* Unicode BMP */
10979 unsigned int w = (((c - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c4 - 0x30);
10980 if (w <= 39419) {
10981 *out++ = w + mbfl_gb_uni_ofst[mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max)];
10982 } else {
10983 *out++ = MBFL_BAD_INPUT;
10984 }
10985 }
10986 } else {
10987 *out++ = MBFL_BAD_INPUT;
10988 }
10989 } else {
10990 *out++ = MBFL_BAD_INPUT;
10991 }
10992 } else if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && (c2 >= 0xA1 && c2 <= 0xFE)) {
10993 /* UDA part 1, 2: U+E000-U+E4C5 */
10994 *out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
10995 } else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F) {
10996 /* UDA part 3: U+E4C6-U+E765 */
10997 *out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
10998 } else if (c2 >= 0x40 && c2 != 0x7F && c2 != 0xFF) {
10999 unsigned int w = (c - 0x81)*192 + c2 - 0x40;
11000
11001 if (w >= 0x192B) {
11002 if (w <= 0x1EBE) {
11003 if (w != 0x1963 && w != 0x1DBF && (w < 0x1E49 || w > 0x1E55) && w != 0x1E7F) {
11004 *out++ = cp936_pua_tbl1[w - 0x192B];
11005 continue;
11006 }
11007 } else if (w >= 0x413A) {
11008 if (w <= 0x413E) {
11009 *out++ = cp936_pua_tbl2[w - 0x413A];
11010 continue;
11011 } else if (w >= 0x5DD0 && w <= 0x5E20) {
11012 unsigned int c = gb18030_pua_tbl3[w - 0x5DD0];
11013 if (c) {
11014 *out++ = c;
11015 continue;
11016 }
11017 }
11018 }
11019 }
11020
11021 if ((c >= 0x81 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7 && c2 >= 0xA1) || (c >= 0xAA && c <= 0xFE && c2 <= 0xA0)) {
11022 ZEND_ASSERT(w < cp936_ucs_table_size);
11023 *out++ = cp936_ucs_table[w];
11024 } else {
11025 *out++ = MBFL_BAD_INPUT;
11026 }
11027 } else {
11028 *out++ = MBFL_BAD_INPUT;
11029 }
11030 }
11031 }
11032
11033 *in_len = e - p;
11034 *in = p;
11035 return out - buf;
11036 }
11037
/* Encode `len` Unicode codepoints read from `in` as GB18030 and append the
 * resulting bytes to `buf`.
 *
 * Each codepoint is first looked up in the CP936 tables (with a handful of
 * GB18030-specific overrides), then in the table of mappings which differ
 * between CP936 and GB18030, and finally falls back to a computed 4-byte code.
 * Codepoints for which nothing is found are reported via MB_CONVERT_ERROR.
 * `end` is not used by this function. */
static void mb_wchar_to_gb18030(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len > 0) {
		len--; /* From here on, `len` is the number of codepoints still to be read */
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w == 0) {
			out = mb_convert_buf_add(out, 0);
			continue;
		}

		if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			/* U+01F9 is mapped differently from the CP936 table */
			s = (w == 0x1F9) ? 0xA8BF : ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			/* Euro sign is mapped differently from the CP936 table */
			s = (w == 0x20AC) ? 0xA2E3 : ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
			s = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
		} else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max) {
			/* U+F900-U+FA2F CJK Compatibility Ideographs */
			switch (w) {
			case 0xF92C: s = 0xFD9C; break;
			case 0xF979: s = 0xFD9D; break;
			case 0xF995: s = 0xFD9E; break;
			case 0xF9E7: s = 0xFD9F; break;
			case 0xF9F1: s = 0xFDA0; break;
			default:
				if (w >= 0xFA0C && w <= 0xFA29) {
					s = ucs_ci_s_cp936_table[w - 0xFA0C];
				}
				break;
			}
		} else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max) {
			/* CJK Compatibility Forms */
			s = ucs_cf_cp936_table[w - ucs_cf_cp936_table_min];
		} else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max) {
			/* U+FE50-U+FE6F Small Form Variants */
			s = ucs_sfv_cp936_table[w - ucs_sfv_cp936_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			/* U+FF00-U+FFFF HW/FW Forms */
			switch (w) {
			case 0xFF04: s = 0xA1E7; break;
			case 0xFF5E: s = 0xA1AB; break;
			default:
				if (w >= 0xFF01 && w <= 0xFF5D) {
					s = w - 0xFF01 + 0xA3A1;
				} else if (w >= 0xFFE0 && w <= 0xFFE5) {
					s = ucs_hff_s_cp936_table[w - 0xFFE0];
				}
				break;
			}
		} else if (w >= 0xE000 && w <= 0xE864) {
			/* PUA */
			if (w < 0xE4C6) {
				/* U+E000-U+E4C5: rows of 94 trail bytes starting at 0xA1;
				 * the first 6 rows use lead bytes from 0xAA, the rest from 0xF8 */
				unsigned int offset = w - 0xE000;
				unsigned int row = offset / 94;
				unsigned int lead = row + (row < 0x06 ? 0xAA : 0xF2);
				s = (lead << 8) | ((offset % 94) + 0xA1);
			} else if (w < 0xE766) {
				/* U+E4C6-U+E765: rows of 96 trail bytes starting at 0x40, skipping 0x7F */
				unsigned int offset = w - 0xE4C6;
				unsigned int col = offset % 96;
				s = (((offset / 96) + 0xA1) << 8) | (col + (col >= 0x3F ? 0x41 : 0x40));
			} else {
				/* U+E766-U+E864: binary search over {first, last, first code} ranges */
				unsigned int lo = 0, hi = mbfl_gb18030_pua_tbl_max;
				while (lo < hi) {
					unsigned int mid = (lo + hi) >> 1;
					if (w < mbfl_gb18030_pua_tbl[mid][0]) {
						hi = mid;
					} else if (w > mbfl_gb18030_pua_tbl[mid][1]) {
						lo = mid + 1;
					} else {
						s = w - mbfl_gb18030_pua_tbl[mid][0] + mbfl_gb18030_pua_tbl[mid][2];
						break;
					}
				}
			}
		}

		/* GB18030 and CP936 are nearly the same, but a few codepoints map differently;
		 * those are kept in a separate sorted key/value table */
		if (!s && w >= mbfl_gb18030_c_tbl_key[0] && w <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
			int idx = mbfl_bisec_srch2(w, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
			if (idx >= 0) {
				s = mbfl_gb18030_c_tbl_val[idx];
			}
		}

		/* Anything else needs a 4-byte code, which is a mixed-radix (10, 126, 10)
		 * representation of a linear index; `lead_base` is the value added to the
		 * most significant digit to form the first byte */
		unsigned int linear = 0, lead_base = 0;
		if (w >= 0x10000 && w <= 0x10FFFF) {
			/* Code set 3: Unicode U+10000-U+10FFFF */
			linear = w - 0x10000;
			lead_base = 0x90;
		} else if (!s && w >= 0x80 && w <= 0xFFFF) {
			/* BMP */
			int idx = mbfl_bisec_srch(w, mbfl_uni2gb_tbl, mbfl_gb_uni_max);
			if (idx >= 0) {
				linear = w - mbfl_gb_uni_ofst[idx];
				lead_base = 0x81;
			}
		}
		if (lead_base) {
			unsigned int byte4 = (linear % 10) + 0x30;
			linear /= 10;
			unsigned int byte3 = (linear % 126) + 0x81;
			linear /= 126;
			unsigned int byte2 = (linear % 10) + 0x30;
			linear /= 10;
			s = ((linear + lead_base) << 24) | (byte2 << 16) | (byte3 << 8) | byte4;
		}

		if (!s) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_gb18030);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s < 0x80) {
			out = mb_convert_buf_add(out, s);
		} else if (s > 0xFFFFFF) {
			/* 4-byte code */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			out = mb_convert_buf_add4(out, (s >> 24) & 0xFF, (s >> 16) & 0xFF, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			/* 2-byte code */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
11181
/* Streaming CP936 -> wchar filter: receives one input byte `c` per call and
 * passes decoded codepoints to filter->output_function.
 *
 * filter->status is 0 when expecting the start of a character and 1 when a
 * lead byte (saved in filter->cache) is waiting for its trail byte. */
static int mbfl_filt_conv_cp936_wchar(int c, mbfl_convert_filter *filter)
{
	int k;
	int lead, pair, w = -1;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c == 0x80) { /* euro sign */
			CK((*filter->output_function)(0x20ac, filter->data));
		} else if (c < 0xff) { /* dbcs lead byte */
			filter->cache = c;
			filter->status = 1;
		} else { /* 0xff */
			CK((*filter->output_function)(0xf8f5, filter->data));
		}
		break;

	case 1: /* dbcs second byte */
		lead = filter->cache;
		filter->status = 0;

		if (c >= 0xa1 && c <= 0xfe &&
			((lead >= 0xaa && lead <= 0xaf) || (lead >= 0xf8 && lead <= 0xfe))) {
			/* UDA part1,2: U+E000-U+E4C5 */
			w = 94*(lead >= 0xf8 ? lead - 0xf2 : lead - 0xaa) + (c - 0xa1) + 0xe000;
			CK((*filter->output_function)(w, filter->data));
			break;
		}
		if (lead >= 0xa1 && lead <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) {
			/* UDA part3 : U+E4C6-U+E765 */
			w = 96*(lead - 0xa1) + c - (c >= 0x80 ? 0x41 : 0x40) + 0xe4c6;
			CK((*filter->output_function)(w, filter->data));
			break;
		}

		pair = (lead << 8) | c;

		/* Only byte pairs inside these windows can appear in the PUA range table,
		 * so the linear scan is skipped for everything else */
		if ((pair >= 0xa2ab && pair <= 0xa9f0 + (0xe80f-0xe801)) ||
			(pair >= 0xd7fa && pair <= 0xd7fa + (0xe814-0xe810)) ||
			(pair >= 0xfe50 && pair <= 0xfe80 + (0xe864-0xe844))) {
			for (k = 0; k < mbfl_cp936_pua_tbl_max; k++) {
				if (pair >= mbfl_cp936_pua_tbl[k][2] &&
					pair <= mbfl_cp936_pua_tbl[k][2] +
					mbfl_cp936_pua_tbl[k][1] - mbfl_cp936_pua_tbl[k][0]) {
					w = pair - mbfl_cp936_pua_tbl[k][2] + mbfl_cp936_pua_tbl[k][0];
					CK((*filter->output_function)(w, filter->data));
					break;
				}
			}
		}

		/* Nothing emitted yet: fall back to the main lookup table */
		if (w <= 0) {
			if (lead > 0x80 && lead < 0xff && c >= 0x40 && c < 0xff && c != 0x7f) {
				w = (lead - 0x81)*192 + c - 0x40;
				ZEND_ASSERT(w < cp936_ucs_table_size);
				CK((*filter->output_function)(cp936_ucs_table[w], filter->data));
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
11249
/* Flush handler for the CP936 -> wchar filter.
 *
 * A non-zero status means input ended after a lead byte; in that case the
 * state is reset and MBFL_BAD_INPUT is emitted. Afterwards the filter's own
 * flush_function, if one is set, is invoked on filter->data. */
static int mbfl_filt_conv_cp936_wchar_flush(mbfl_convert_filter *filter)
{
	int lead_byte_pending = (filter->status != 0);

	if (lead_byte_pending) {
		/* 2-byte character was truncated */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
11264
/* Streaming wchar -> CP936 filter: encodes the single codepoint `c` and sends
 * the resulting one or two bytes to filter->output_function.
 *
 * Codepoints without a mapping are handed to mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_cp936(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		/* U+0000 - U+0451 */
		s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		/* U+2000 - U+26FF */
		switch (c) {
		case 0x203e: s = 0xa3fe; break;
		case 0x2218: s = 0xa1e3; break;
		case 0x223c: s = 0xa1ab; break;
		default:
			s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
			break;
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		/* U+2F00 - U+33FF */
		s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
	} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
		/* U+4D00-9FFF CJK Unified Ideographs (+ Extension A) */
		s = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
	} else if (c >= 0xe000 && c <= 0xe864) { /* PUA */
		if (c < 0xe4c6) {
			/* U+E000..U+E4C5: rows of 94 trail bytes starting at 0xa1 */
			int offset = c - 0xe000;
			int row = offset / 94;
			s = ((row < 0x06 ? row + 0xaa : row + 0xf2) << 8) | ((offset % 94) + 0xa1);
		} else if (c < 0xe766) {
			/* U+E4C6..U+E765: rows of 96 trail bytes starting at 0x40, skipping 0x7f */
			int offset = c - 0xe4c6;
			int col = offset % 96;
			s = (((offset / 96) + 0xa1) << 8) | (col + (col >= 0x3f ? 0x41 : 0x40));
		} else {
			/* U+E766..U+E864: binary search over {first, last, first code} ranges */
			int lo = 0, hi = mbfl_cp936_pua_tbl_max;
			while (lo < hi) {
				int mid = (lo + hi) >> 1;
				if (c < mbfl_cp936_pua_tbl[mid][0]) {
					hi = mid;
				} else if (c > mbfl_cp936_pua_tbl[mid][1]) {
					lo = mid + 1;
				} else {
					s = c - mbfl_cp936_pua_tbl[mid][0] + mbfl_cp936_pua_tbl[mid][2];
					break;
				}
			}
		}
	} else if (c == 0xf8f5) {
		/* Counterpart of the single byte 0xff */
		s = 0xff;
	} else if (c >= ucs_ci_cp936_table_min && c < ucs_ci_cp936_table_max) {
		/* U+F900-FA2F CJK Compatibility Ideographs */
		s = ucs_ci_cp936_table[c - ucs_ci_cp936_table_min];
	} else if (c >= ucs_cf_cp936_table_min && c < ucs_cf_cp936_table_max) {
		s = ucs_cf_cp936_table[c - ucs_cf_cp936_table_min];
	} else if (c >= ucs_sfv_cp936_table_min && c < ucs_sfv_cp936_table_max) {
		/* U+FE50-FE6F Small Form Variants */
		s = ucs_sfv_cp936_table[c - ucs_sfv_cp936_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		/* U+FF00-FFFF HW/FW Forms */
		switch (c) {
		case 0xff04: s = 0xa1e7; break;
		case 0xff5e: s = 0xa1ab; break;
		default:
			if (c >= 0xff01 && c <= 0xff5d) {
				s = c - 0xff01 + 0xa3a1;
			} else if (c >= 0xffe0 && c <= 0xffe5) {
				s = ucs_hff_s_cp936_table[c-0xffe0];
			}
			break;
		}
	}

	/* No mapping found: only U+0000 may legitimately encode to zero */
	if (s <= 0) {
		s = (c == 0) ? 0 : -1;
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (s <= 0x80 || s == 0xff) { /* latin */
		CK((*filter->output_function)(s, filter->data));
	} else {
		CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(s & 0xff, filter->data));
	}

	return 0;
}
11359
mb_cp936_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)11360 static size_t mb_cp936_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
11361 {
11362 unsigned char *p = *in, *e = p + *in_len;
11363 uint32_t *out = buf, *limit = buf + bufsize;
11364
11365 while (p < e && out < limit) {
11366 unsigned char c = *p++;
11367
11368 if (c < 0x80) {
11369 *out++ = c;
11370 } else if (c == 0x80) {
11371 *out++ = 0x20AC; /* Euro sign */
11372 } else if (c < 0xFF) {
11373 if (p >= e) {
11374 *out++ = MBFL_BAD_INPUT;
11375 continue;
11376 }
11377
11378 unsigned char c2 = *p++;
11379 if (c2 < 0x40 || c2 == 0x7F || c2 == 0xFF) {
11380 *out++ = MBFL_BAD_INPUT;
11381 continue;
11382 }
11383
11384 if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && c2 >= 0xA1) {
11385 /* UDA part 1, 2: U+E000-U+E4C5 */
11386 *out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
11387 } else if (c >= 0xA1 && c <= 0xA7 && c2 < 0xA1) {
11388 /* UDA part 3: U+E4C6-U+E765*/
11389 *out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
11390 } else {
11391 unsigned int w = (c - 0x81)*192 + c2 - 0x40; /* Convert c, c2 into GB 2312 table lookup index */
11392
11393 /* For CP936 and GB18030, certain GB 2312 byte combinations are mapped to PUA codepoints,
11394 * whereas the same combinations aren't mapped to any codepoint for HZ and EUC-CN
11395 * To avoid duplicating the entire GB 2312 -> Unicode lookup table, we have three
11396 * auxiliary tables which are consulted instead for specific ranges of lookup indices */
11397 if (w >= 0x192B) {
11398 if (w <= 0x1EBE) {
11399 *out++ = cp936_pua_tbl1[w - 0x192B];
11400 continue;
11401 } else if (w >= 0x413A) {
11402 if (w <= 0x413E) {
11403 *out++ = cp936_pua_tbl2[w - 0x413A];
11404 continue;
11405 } else if (w >= 0x5DD0 && w <= 0x5E20) {
11406 *out++ = cp936_pua_tbl3[w - 0x5DD0];
11407 continue;
11408 }
11409 }
11410 }
11411
11412 ZEND_ASSERT(w < cp936_ucs_table_size);
11413 *out++ = cp936_ucs_table[w];
11414 }
11415 } else {
11416 *out++ = 0xF8F5;
11417 }
11418 }
11419
11420 *in_len = e - p;
11421 *in = p;
11422 return out - buf;
11423 }
11424
/* Encode `len` Unicode codepoints read from `in` as CP936 and append the
 * resulting bytes to `buf`.
 *
 * Space for two bytes per codepoint is reserved up front. Codepoints with no
 * mapping are reported via MB_CONVERT_ERROR. `end` is not used by this function. */
static void mb_wchar_to_cp936(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len > 0) {
		len--; /* From here on, `len` is the number of codepoints still to be read */
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			/* U+0000-U+0451 */
			s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			/* U+2000-U+26FF */
			switch (w) {
			case 0x203E: s = 0xA3FE; break;
			case 0x2218: s = 0xA1E3; break;
			case 0x223C: s = 0xA1AB; break;
			default:
				s = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
				break;
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			/* U+2F00-U+33FF */
			s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
			/* U+4D00-9FFF CJK Unified Ideographs (+ Extension A) */
			s = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
		} else if (w >= 0xE000 && w <= 0xE864) {
			/* PUA */
			if (w < 0xE4C6) {
				/* U+E000-U+E4C5: rows of 94 trail bytes starting at 0xA1 */
				unsigned int offset = w - 0xE000;
				unsigned int row = offset / 94;
				s = ((row < 0x6 ? row + 0xAA : row + 0xF2) << 8) | ((offset % 94) + 0xA1);
			} else if (w < 0xE766) {
				/* U+E4C6-U+E765: rows of 96 trail bytes starting at 0x40, skipping 0x7F */
				unsigned int offset = w - 0xE4C6;
				unsigned int col = offset % 96;
				s = (((offset / 96) + 0xA1) << 8) | (col + (col >= 0x3F ? 0x41 : 0x40));
			} else {
				/* U+E766-U+E864: binary search over {first, last, first code} ranges */
				unsigned int lo = 0;
				unsigned int hi = mbfl_cp936_pua_tbl_max;
				while (lo < hi) {
					int mid = (lo + hi) >> 1;
					if (w < mbfl_cp936_pua_tbl[mid][0]) {
						hi = mid;
					} else if (w > mbfl_cp936_pua_tbl[mid][1]) {
						lo = mid + 1;
					} else {
						s = w - mbfl_cp936_pua_tbl[mid][0] + mbfl_cp936_pua_tbl[mid][2];
						break;
					}
				}
			}
		} else if (w == 0xF8F5) {
			/* Counterpart of the single byte 0xFF */
			s = 0xFF;
		} else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max) {
			/* U+F900-U+FA2F CJK Compatibility Ideographs */
			s = ucs_ci_cp936_table[w - ucs_ci_cp936_table_min];
		} else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max) {
			s = ucs_cf_cp936_table[w - ucs_cf_cp936_table_min];
		} else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max) {
			/* U+FE50-U+FE6F Small Form Variants */
			s = ucs_sfv_cp936_table[w - ucs_sfv_cp936_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			/* U+FF00-U+FFFF HW/FW Forms */
			switch (w) {
			case 0xFF04: s = 0xA1E7; break;
			case 0xFF5E: s = 0xA1AB; break;
			default:
				if (w >= 0xFF01 && w <= 0xFF5D) {
					s = w - 0xFF01 + 0xA3A1;
				} else if (w >= 0xFFE0 && w <= 0xFFE5) {
					s = ucs_hff_s_cp936_table[w - 0xFFE0];
				}
				break;
			}
		}

		if (s > 0x80 && s != 0xFF) {
			/* 2-byte code */
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else if (s) {
			/* Single byte */
			out = mb_convert_buf_add(out, s);
		} else if (w == 0) {
			out = mb_convert_buf_add(out, 0);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp936);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
11524
/* Alternative names for GB18030 (NULL-terminated list) */
static const char *mbfl_encoding_gb18030_aliases[] = {"gb-18030", "gb-18030-2000", NULL};

/* Conversion table for the GB18030 -> wchar direction, built around the
 * per-byte filter mbfl_filt_conv_gb18030_wchar and its flush function.
 * NOTE(review): the meaning of each positional slot (including the two NULLs)
 * is inferred from the values placed in it; confirm against the definition of
 * struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
	mbfl_no_encoding_gb18030,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_gb18030_wchar,
	mbfl_filt_conv_gb18030_wchar_flush,
	NULL,
};

/* Conversion table for the wchar -> GB18030 direction; it uses the common
 * flush function rather than an encoding-specific one. */
static const struct mbfl_convert_vtbl vtbl_wchar_gb18030 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_gb18030,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_gb18030,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Descriptor for the GB18030 encoding, tying together its names, aliases,
 * both filter vtables and the two buffer-based conversion functions.
 * NOTE(review): the NULL where the CP936 descriptor supplies
 * mblen_table_81_to_fe presumably means no byte-length table is provided;
 * confirm against the definition of mbfl_encoding. */
const mbfl_encoding mbfl_encoding_gb18030 = {
	mbfl_no_encoding_gb18030,
	"GB18030",
	"GB18030",
	mbfl_encoding_gb18030_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_gb18030_wchar,
	&vtbl_wchar_gb18030,
	mb_gb18030_to_wchar,
	mb_wchar_to_gb18030,
	NULL
};
11560
/* Alternative names for CP936 (NULL-terminated list) */
static const char *mbfl_encoding_cp936_aliases[] = {"CP-936", "GBK", NULL};

/* Conversion table for the CP936 -> wchar direction, built around the
 * per-byte filter mbfl_filt_conv_cp936_wchar and its flush function.
 * NOTE(review): the meaning of each positional slot (including the two NULLs)
 * is inferred from the values placed in it; confirm against the definition of
 * struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_cp936_wchar = {
	mbfl_no_encoding_cp936,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp936_wchar,
	mbfl_filt_conv_cp936_wchar_flush,
	NULL,
};

/* Conversion table for the wchar -> CP936 direction; it uses the common
 * flush function rather than an encoding-specific one. */
static const struct mbfl_convert_vtbl vtbl_wchar_cp936 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp936,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp936,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Descriptor for the CP936 encoding, tying together its names, aliases,
 * both filter vtables and the two buffer-based conversion functions.
 * NOTE(review): mblen_table_81_to_fe is presumably a per-lead-byte length
 * table for lead bytes 0x81-0xFE; confirm against its definition. */
const mbfl_encoding mbfl_encoding_cp936 = {
	mbfl_no_encoding_cp936,
	"CP936",
	"CP936",
	mbfl_encoding_cp936_aliases,
	mblen_table_81_to_fe,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp936_wchar,
	&vtbl_wchar_cp936,
	mb_cp936_to_wchar,
	mb_wchar_to_cp936,
	NULL
};
11596
11597 /*
11598 * BIG5/CP950
11599 */
11600
/* Private Use Area ranges for CP950.
 *
 * Each row is {first PUA codepoint, last PUA codepoint, first CP950 byte pair,
 * last CP950 byte pair}. Rows are sorted by codepoint and are contiguous.
 *
 * A row whose first byte pair ends in 0x40 uses 157 trail bytes per lead byte
 * (63 in 0x40-0x7E plus 94 in 0xA1-0xFE); the row starting at 0xC6A1 uses only
 * the 94 trail bytes in 0xA1-0xFE.
 *
 * The table is only ever read, so it is declared const. */
static const unsigned short cp950_pua_tbl[][4] = {
	{0xe000, 0xe310, 0xfa40, 0xfefe},
	{0xe311, 0xeeb7, 0x8e40, 0xa0fe},
	{0xeeb8, 0xf6b0, 0x8140, 0x8dfe},
	{0xf6b1, 0xf70e, 0xc6a1, 0xc6fe},
	{0xf70f, 0xf848, 0xc740, 0xc8fe},
};
11609
/* Return 1 if the byte pair (lead byte `c1`, trail byte `c`) lies in one of the
 * CP950 regions treated as Private Use Area, 0 otherwise.
 *
 * Lead byte 0xC6 only counts with a trail byte in 0xA1-0xFE; the other PUA lead
 * bytes (0x81-0xA0, 0xC7-0xC8, 0xFA-0xFE) also accept trail bytes 0x40-0x7E. */
static inline int is_in_cp950_pua(int c1, int c)
{
	int upper_trail = (c >= 0xa1 && c <= 0xfe);

	if (c1 == 0xc6) {
		return upper_trail;
	}

	if ((c1 >= 0x81 && c1 <= 0xa0) || c1 == 0xc7 || c1 == 0xc8 || (c1 >= 0xfa && c1 <= 0xfe)) {
		return upper_trail || (c >= 0x40 && c <= 0x7e);
	}

	return 0;
}
11619
/* Streaming Big5 / CP950 -> wchar filter: receives one input byte `c` per call
 * and passes decoded codepoints to filter->output_function.
 *
 * The same function serves both encodings; filter->from->no_encoding selects
 * the CP950 behaviour (wider lead byte range, PUA mappings and a handful of
 * overrides of the Big5 table). filter->status is 1 while a lead byte (saved in
 * filter->cache) is waiting for its trail byte. */
static int mbfl_filt_conv_big5_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, w;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else {
			int is_lead_byte;
			if (filter->from->no_encoding != mbfl_no_encoding_cp950) {
				is_lead_byte = (c > 0xA0 && c <= 0xF9 && c != 0xC8);
			} else {
				is_lead_byte = (c > 0x80 && c <= 0xFE);
			}

			if (is_lead_byte) {
				filter->cache = c;
				filter->status = 1;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;

	case 1: /* dbcs second byte */
		lead = filter->cache;
		filter->status = 0;

		if (!((c > 0x3f && c < 0x7f) || (c > 0xa0 && c < 0xff))) {
			/* Not a valid trail byte */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		/* 157 trail bytes per lead byte: 0x40-0x7E first, then 0xA1-0xFE */
		w = (lead - 0xa1)*157 + ((c < 0x7f) ? (c - 0x40) : (c - 0xa1) + 0x3f);
		if (w >= 0 && w < big5_ucs_table_size) {
			w = big5_ucs_table[w];
		} else {
			w = 0;
		}

		if (filter->from->no_encoding == mbfl_no_encoding_cp950) {
			if (is_in_cp950_pua(lead, c)) {
				/* PUA for CP950 */
				int pair = (lead << 8) | c;
				int k = 0;

				/* Find the table row whose byte pair range holds `pair` */
				while (k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4) &&
						!(pair >= cp950_pua_tbl[k][2] && pair <= cp950_pua_tbl[k][3])) {
					k++;
				}

				if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) {
					w = 157*(lead - (cp950_pua_tbl[k][2]>>8)) + c - (c >= 0xa1 ? 0x62 : 0x40) + cp950_pua_tbl[k][0];
				} else {
					w = pair - cp950_pua_tbl[k][2] + cp950_pua_tbl[k][0];
				}
			} else if (lead == 0xA1) {
				/* Byte pairs which CP950 maps differently from the Big5 table */
				switch (c) {
				case 0x45: w = 0x2027; break;
				case 0x4E: w = 0xFE51; break;
				case 0x5A: w = 0x2574; break;
				case 0xC2: w = 0x00AF; break;
				case 0xC3: w = 0xFFE3; break;
				case 0xC5: w = 0x02CD; break;
				case 0xE3: w = 0xFF5E; break;
				case 0xF2: w = 0x2295; break;
				case 0xF3: w = 0x2299; break;
				case 0xFE: w = 0xFF0F; break;
				default: break;
				}
			} else if (lead == 0xA2) {
				switch (c) {
				case 0x40: w = 0xFF3C; break;
				case 0x41: w = 0x2215; break;
				case 0x42: w = 0xFE68; break;
				case 0x46: w = 0xFFE0; break;
				case 0x47: w = 0xFFE1; break;
				case 0xCC: w = 0x5341; break;
				case 0xCE: w = 0x5345; break;
				default: break;
				}
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
11725
/* Flush handler for the Big5 / CP950 -> wchar filter.
 *
 * A status of 1 means input ended after a lead byte; in that case the state is
 * reset and MBFL_BAD_INPUT is emitted. Afterwards the filter's own
 * flush_function, if one is set, is invoked on filter->data. */
static int mbfl_filt_conv_big5_wchar_flush(mbfl_convert_filter *filter)
{
	int lead_byte_pending = (filter->status == 1);

	if (lead_byte_pending) {
		/* 2-byte character was truncated */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
11740
/* Streaming wchar -> Big5 / CP950 filter: encodes the single codepoint `c` and
 * sends the resulting one or two bytes to filter->output_function.
 *
 * The Big5 tables are consulted first. When the target (filter->to) is CP950,
 * PUA codepoints are then mapped arithmetically and a list of codepoints is
 * either remapped or marked unmappable (s = 0). Codepoints without a mapping
 * are handed to mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_big5(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_big5_table_min && c < ucs_a1_big5_table_max) {
		s = ucs_a1_big5_table[c - ucs_a1_big5_table_min];
	} else if (c >= ucs_a2_big5_table_min && c < ucs_a2_big5_table_max) {
		s = ucs_a2_big5_table[c - ucs_a2_big5_table_min];
	} else if (c >= ucs_a3_big5_table_min && c < ucs_a3_big5_table_max) {
		s = ucs_a3_big5_table[c - ucs_a3_big5_table_min];
	} else if (c >= ucs_i_big5_table_min && c < ucs_i_big5_table_max) {
		s = ucs_i_big5_table[c - ucs_i_big5_table_min];
	} else if (c >= ucs_r1_big5_table_min && c < ucs_r1_big5_table_max) {
		s = ucs_r1_big5_table[c - ucs_r1_big5_table_min];
	} else if (c >= ucs_r2_big5_table_min && c < ucs_r2_big5_table_max) {
		s = ucs_r2_big5_table[c - ucs_r2_big5_table_min];
	}

	if (filter->to->no_encoding == mbfl_no_encoding_cp950) {
		if (c >= 0xe000 && c <= 0xf848) { /* PUA for CP950 */
			int k = 0;

			/* Rows are sorted; stop at the first one whose last codepoint is >= c */
			while (k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4) && c > cp950_pua_tbl[k][1]) {
				k++;
			}

			int offset = c - cp950_pua_tbl[k][0];
			if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) {
				/* 157 trail bytes per lead byte: 0x40-0x7E, then 0xA1-0xFE */
				int first_lead = cp950_pua_tbl[k][2] >> 8;
				int col = offset % 157;
				s = (((offset / 157) + first_lead) << 8) | (col + (col >= 0x3f ? 0x62 : 0x40));
			} else {
				s = offset + cp950_pua_tbl[k][2];
			}
		} else {
			switch (c) {
			/* Codepoints which the Big5 tables may map, but CP950 does not */
			case 0x00A2:
			case 0x00A3:
			case 0x0401:
			case 0x0451:
			case 0x2022:
			case 0x203E:
			case 0x223C:
			case 0x2609:
			case 0x2641:
			case 0x3005:
			case 0xFF64:
				s = 0;
				break;

			/* Codepoints which CP950 maps to a specific byte pair */
			case 0x00AF: s = 0xA1C2; break;
			case 0x02CD: s = 0xA1C5; break;
			case 0x2027: s = 0xA145; break;
			case 0x2215: s = 0xA241; break;
			case 0x2295: s = 0xA1F2; break;
			case 0x2299: s = 0xA1F3; break;
			case 0x2574: s = 0xA15A; break;
			case 0xFE51: s = 0xA14E; break;
			case 0xFE68: s = 0xA242; break;
			case 0xFF3C: s = 0xA240; break;
			case 0xFF5E: s = 0xA1E3; break;
			case 0xFFE0: s = 0xA246; break;
			case 0xFFE1: s = 0xA247; break;
			case 0xFFE3: s = 0xA1C3; break;
			case 0xFF0F: s = 0xA1FE; break;

			default:
				/* Whole ranges with no CP950 mapping */
				if ((c >= 0x0414 && c <= 0x041C) ||
					(c >= 0x0423 && c <= 0x044F) ||
					(c >= 0x2460 && c <= 0x247D) ||
					(c >= 0x302A && c <= 0x30FF)) {
					s = 0;
				}
				break;
			}
		}
	}

	/* No mapping found: only U+0000 may legitimately encode to zero */
	if (s <= 0) {
		s = (c == 0) ? 0 : -1;
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (s <= 0x80) { /* latin */
		CK((*filter->output_function)(s, filter->data));
	} else {
		CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(s & 0xff, filter->data));
	}

	return 0;
}
11858
mb_big5_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)11859 static size_t mb_big5_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
11860 {
11861 unsigned char *p = *in, *e = p + *in_len;
11862 uint32_t *out = buf, *limit = buf + bufsize;
11863
11864 e--; /* Stop the main loop 1 byte short of the end of the input */
11865
11866 while (p < e && out < limit) {
11867 unsigned char c = *p++;
11868
11869 if (c <= 0x7F) {
11870 *out++ = c;
11871 } else if (c > 0xA0 && c <= 0xF9) {
11872 /* We don't need to check p < e here; it's not possible that this pointer dereference
11873 * will be outside the input string, because of e-- above */
11874 unsigned char c2 = *p++;
11875
11876 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE)) {
11877 unsigned int w = (c - 0xA1)*157 + c2 - ((c2 <= 0x7E) ? 0x40 : 0xA1 - 0x3F);
11878 ZEND_ASSERT(w < big5_ucs_table_size);
11879 w = big5_ucs_table[w];
11880 if (!w) {
11881 if (c == 0xC8) {
11882 p--;
11883 }
11884 w = MBFL_BAD_INPUT;
11885 }
11886 *out++ = w;
11887 } else {
11888 *out++ = MBFL_BAD_INPUT;
11889 }
11890 } else {
11891 *out++ = MBFL_BAD_INPUT;
11892 }
11893 }
11894
11895 /* Finish up last byte of input string if there is one */
11896 if (p == e && out < limit) {
11897 unsigned char c = *p++;
11898 *out++ = (c <= 0x7F) ? c : MBFL_BAD_INPUT;
11899 }
11900
11901 *in_len = e - p + 1;
11902 *in = p;
11903 return out - buf;
11904 }
11905
/* Encode `len` Unicode codepoints read from `in` as Big5 and append the
 * resulting bytes to `buf`.
 *
 * One byte per codepoint is reserved up front and more is requested whenever a
 * 2-byte code is about to be written. Codepoints with no mapping are reported
 * via MB_CONVERT_ERROR. `end` is not used by this function. */
static void mb_wchar_to_big5(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len > 0) {
		len--; /* From here on, `len` is the number of codepoints still to be read */
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_big5_table_min && w < ucs_a1_big5_table_max) {
			s = ucs_a1_big5_table[w - ucs_a1_big5_table_min];
		} else if (w >= ucs_a2_big5_table_min && w < ucs_a2_big5_table_max) {
			s = ucs_a2_big5_table[w - ucs_a2_big5_table_min];
		} else if (w >= ucs_a3_big5_table_min && w < ucs_a3_big5_table_max) {
			s = ucs_a3_big5_table[w - ucs_a3_big5_table_min];
		} else if (w >= ucs_i_big5_table_min && w < ucs_i_big5_table_max) {
			s = ucs_i_big5_table[w - ucs_i_big5_table_min];
		} else if (w >= ucs_r1_big5_table_min && w < ucs_r1_big5_table_max) {
			s = ucs_r1_big5_table[w - ucs_r1_big5_table_min];
		} else if (w >= ucs_r2_big5_table_min && w < ucs_r2_big5_table_max) {
			s = ucs_r2_big5_table[w - ucs_r2_big5_table_min];
		}

		if (s > 0x80) {
			/* 2-byte code */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else if (s) {
			/* Single byte */
			out = mb_convert_buf_add(out, s);
		} else if (w == 0) {
			out = mb_convert_buf_add(out, 0);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_big5);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
11947
mb_cp950_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)11948 static size_t mb_cp950_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
11949 {
11950 unsigned char *p = *in, *e = p + *in_len;
11951 uint32_t *out = buf, *limit = buf + bufsize;
11952
11953 while (p < e && out < limit) {
11954 unsigned char c = *p++;
11955
11956 if (c <= 0x7F) {
11957 *out++ = c;
11958 } else if (c > 0x80 && c <= 0xFE && p < e) {
11959 unsigned char c2 = *p++;
11960
11961 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE)) {
11962 unsigned int w = ((c - 0xA1)*157) + c2 - ((c2 <= 0x7E) ? 0x40 : 0xA1 - 0x3F);
11963 w = (w < big5_ucs_table_size) ? big5_ucs_table[w] : 0;
11964
11965 /* PUA for CP950 */
11966 if (is_in_cp950_pua(c, c2)) {
11967 unsigned int s = (c << 8) | c2;
11968
11969 int k;
11970 for (k = 0; k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); k++) {
11971 if (s >= cp950_pua_tbl[k][2] && s <= cp950_pua_tbl[k][3]) {
11972 break;
11973 }
11974 }
11975
11976 if ((cp950_pua_tbl[k][2] & 0xFF) == 0x40) {
11977 w = 157*(c - (cp950_pua_tbl[k][2] >> 8)) + c2 - (c2 >= 0xA1 ? 0x62 : 0x40) + cp950_pua_tbl[k][0];
11978 } else {
11979 w = s - cp950_pua_tbl[k][2] + cp950_pua_tbl[k][0];
11980 }
11981 } else if (c == 0xA1) {
11982 if (c2 == 0x45) {
11983 w = 0x2027;
11984 } else if (c2 == 0x4E) {
11985 w = 0xFE51;
11986 } else if (c2 == 0x5A) {
11987 w = 0x2574;
11988 } else if (c2 == 0xC2) {
11989 w = 0x00AF;
11990 } else if (c2 == 0xC3) {
11991 w = 0xFFE3;
11992 } else if (c2 == 0xC5) {
11993 w = 0x02CD;
11994 } else if (c2 == 0xE3) {
11995 w = 0xFF5E;
11996 } else if (c2 == 0xF2) {
11997 w = 0x2295;
11998 } else if (c2 == 0xF3) {
11999 w = 0x2299;
12000 } else if (c2 == 0xFE) {
12001 w = 0xFF0F;
12002 }
12003 } else if (c == 0xA2) {
12004 if (c2 == 0x40) {
12005 w = 0xFF3C;
12006 } else if (c2 == 0x41) {
12007 w = 0x2215;
12008 } else if (c2 == 0x42) {
12009 w = 0xFE68;
12010 } else if (c2 == 0x46) {
12011 w = 0xFFE0;
12012 } else if (c2 == 0x47) {
12013 w = 0xFFE1;
12014 } else if (c2 == 0xCC) {
12015 w = 0x5341;
12016 } else if (c2 == 0xCE) {
12017 w = 0x5345;
12018 }
12019 }
12020
12021 if (!w)
12022 w = MBFL_BAD_INPUT;
12023 *out++ = w;
12024 } else {
12025 *out++ = MBFL_BAD_INPUT;
12026 }
12027 } else {
12028 *out++ = MBFL_BAD_INPUT;
12029 }
12030 }
12031
12032 *in_len = e - p;
12033 *in = p;
12034 return out - buf;
12035 }
12036
/* Encode `len` Unicode codepoints from `in` as CP950, appending the bytes to `buf`.
 *
 * The code for each codepoint is determined in layers:
 *   1. the Unicode -> Big5 tables provide a baseline value
 *   2. codepoints in U+E000..U+F848 are computed from cp950_pua_tbl instead
 *   3. otherwise, a fixed list of codepoints is either rejected (code forced
 *      to 0) or given a specific code, overriding the table value
 *
 * A codepoint which ends up with no code (other than U+0000 itself) is
 * reported through MB_CONVERT_ERROR. `end` is not referenced by this function. */
static void mb_wchar_to_cp950(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Reserve one byte per input codepoint up front; 2-byte codes reserve more below */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		/* Baseline: Unicode -> Big5 tables, each covering one codepoint range */
		if (w >= ucs_a1_big5_table_min && w < ucs_a1_big5_table_max) {
			s = ucs_a1_big5_table[w - ucs_a1_big5_table_min];
		} else if (w >= ucs_a2_big5_table_min && w < ucs_a2_big5_table_max) {
			s = ucs_a2_big5_table[w - ucs_a2_big5_table_min];
		} else if (w >= ucs_a3_big5_table_min && w < ucs_a3_big5_table_max) {
			s = ucs_a3_big5_table[w - ucs_a3_big5_table_min];
		} else if (w >= ucs_i_big5_table_min && w < ucs_i_big5_table_max) {
			s = ucs_i_big5_table[w - ucs_i_big5_table_min];
		} else if (w >= ucs_r1_big5_table_min && w < ucs_r1_big5_table_max) {
			s = ucs_r1_big5_table[w - ucs_r1_big5_table_min];
		} else if (w >= ucs_r2_big5_table_min && w < ucs_r2_big5_table_max) {
			s = ucs_r2_big5_table[w - ucs_r2_big5_table_min];
		}

		if (w >= 0xE000 && w <= 0xF848) {
			/* PUA for CP950: pick the first table row whose upper codepoint
			 * bound [1] is not below `w` */
			int k;
			for (k = 0; k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); k++) {
				if (w <= cp950_pua_tbl[k][1]) {
					break;
				}
			}

			/* Offset of `w` from the first codepoint of the row */
			int c1 = w - cp950_pua_tbl[k][0];
			if ((cp950_pua_tbl[k][2] & 0xFF) == 0x40) {
				/* Byte range starts at trail byte 0x40: 157 codes per lead byte,
				 * trail bytes 0x40..0x7E followed by 0xA1..0xFE */
				int c2 = cp950_pua_tbl[k][2] >> 8;
				s = ((c1 / 157) + c2) << 8;
				c1 %= 157;
				s |= c1 + (c1 >= 0x3F ? 0x62 : 0x40);
			} else {
				s = c1 + cp950_pua_tbl[k][2];
			}
		} else if (w == 0xA2 || w == 0xA3 || w == 0x401 || (w >= 0x414 && w <= 0x41C) || (w >= 0x423 && w <= 0x44F) || w == 0x451 || w == 0x2022 || w == 0x203E || w == 0x223C || (w >= 0x2460 && w <= 0x247D) || w == 0x2609 || w == 0x2641 || w == 0x3005 || (w >= 0x302A && w <= 0x30FF) || w == 0xFF64) {
			/* These are rejected for CP950 regardless of what the Big5 tables hold */
			s = 0;
		} else if (w == 0xAF) {
			s = 0xA1C2;
		} else if (w == 0x2CD) {
			s = 0xA1C5;
		} else if (w == 0x2027) {
			s = 0xA145;
		} else if (w == 0x2215) {
			s = 0xA241;
		} else if (w == 0x2295) {
			s = 0xA1F2;
		} else if (w == 0x2299) {
			s = 0xA1F3;
		} else if (w == 0x2574) {
			s = 0xA15A;
		} else if (w == 0xFE51) {
			s = 0xA14E;
		} else if (w == 0xFE68) {
			s = 0xA242;
		} else if (w == 0xFF3C) {
			s = 0xA240;
		} else if (w == 0xFF5E) {
			s = 0xA1E3;
		} else if (w == 0xFFE0) {
			s = 0xA246;
		} else if (w == 0xFFE1) {
			s = 0xA247;
		} else if (w == 0xFFE3) {
			s = 0xA1C3;
		} else if (w == 0xFF0F) {
			s = 0xA1FE;
		}

		if (!s) {
			if (w == 0) {
				/* U+0000 legitimately encodes as a zero byte */
				out = mb_convert_buf_add(out, 0);
			} else {
				/* Pass this encoder itself as the callback, as mb_wchar_to_hz
				 * does; handing over mb_wchar_to_big5 would presumably make
				 * the error output follow plain Big5 rules rather than CP950
				 * ones (MB_CONVERT_ERROR is defined outside this chunk) */
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp950);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			}
		} else if (s <= 0x80) {
			/* Single-byte code */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Two-byte code: lead byte first, then trail byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
12129
/* Alias names for Big5, referenced from mbfl_encoding_big5; the list is NULL-terminated */
static const char *mbfl_encoding_big5_aliases[] = {"CN-BIG5", "BIG-FIVE", "BIGFIVE", NULL};
12131
/* Filter table for Big5 -> wchar, using mbfl_filt_conv_big5_wchar and its
 * dedicated flush function.
 * NOTE(review): the initializer is positional; the role of each slot is
 * inferred from the names used here -- confirm against the declaration of
 * struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_big5_wchar = {
	mbfl_no_encoding_big5,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_big5_wchar,
	mbfl_filt_conv_big5_wchar_flush,
	NULL,
};
12141
/* Filter table for wchar -> Big5, using mbfl_filt_conv_wchar_big5 together
 * with the common constructor and common flush function.
 * NOTE(review): the initializer is positional; the role of each slot is
 * inferred from the names used here -- confirm against the declaration of
 * struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_wchar_big5 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_big5,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_big5,
	mbfl_filt_conv_common_flush,
	NULL
};
12151
/* Encoding descriptor for Big5. It ties together the name strings and alias
 * list, the mblen_table_81_to_fe table, both filter tables (vtbl_big5_wchar /
 * vtbl_wchar_big5) and the buffer converters mb_big5_to_wchar /
 * mb_wchar_to_big5.
 * NOTE(review): the initializer is positional; which string is the canonical
 * name and which the MIME name, and the meaning of the trailing NULL, should
 * be confirmed against the declaration of mbfl_encoding. */
const mbfl_encoding mbfl_encoding_big5 = {
	mbfl_no_encoding_big5,
	"BIG-5",
	"BIG5",
	mbfl_encoding_big5_aliases,
	mblen_table_81_to_fe,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_big5_wchar,
	&vtbl_wchar_big5,
	mb_big5_to_wchar,
	mb_wchar_to_big5,
	NULL
};
12165
/* Filter table for CP950 -> wchar. Note that it reuses the Big5 filter and
 * flush functions (mbfl_filt_conv_big5_wchar / mbfl_filt_conv_big5_wchar_flush);
 * only the encoding id differs from vtbl_big5_wchar.
 * NOTE(review): the initializer is positional; the role of each slot is
 * inferred from the names used here -- confirm against the declaration of
 * struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_cp950_wchar = {
	mbfl_no_encoding_cp950,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_big5_wchar,
	mbfl_filt_conv_big5_wchar_flush,
	NULL,
};
12175
/* Filter table for wchar -> CP950. Note that it reuses the Big5 filter
 * function mbfl_filt_conv_wchar_big5; only the encoding id differs from
 * vtbl_wchar_big5.
 * NOTE(review): the initializer is positional; the role of each slot is
 * inferred from the names used here -- confirm against the declaration of
 * struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_wchar_cp950 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp950,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_big5,
	mbfl_filt_conv_common_flush,
	NULL,
};
12185
/* Encoding descriptor for CP950. It shares the "BIG5" string and the
 * mblen_table_81_to_fe table with mbfl_encoding_big5, has no alias list
 * (NULL), and uses the CP950-specific buffer converters mb_cp950_to_wchar /
 * mb_wchar_to_cp950.
 * NOTE(review): the initializer is positional; which string is the canonical
 * name and which the MIME name, and the meaning of the trailing NULL, should
 * be confirmed against the declaration of mbfl_encoding. */
const mbfl_encoding mbfl_encoding_cp950 = {
	mbfl_no_encoding_cp950,
	"CP950",
	"BIG5",
	NULL,
	mblen_table_81_to_fe,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp950_wchar,
	&vtbl_wchar_cp950,
	mb_cp950_to_wchar,
	mb_wchar_to_cp950,
	NULL
};
12199
12200 /*
12201 * HZ
12202 */
12203
/* Byte-at-a-time HZ decoder: consumes one input byte `c` per call and passes
 * decoded codepoints (or MBFL_BAD_INPUT) to filter->output_function.
 *
 * filter->status is used as follows:
 *   low nibble 0  - between characters (0x00: ASCII mode, 0x10: GB2312 mode)
 *   low nibble 1  - first byte of a GB2312 pair is held in filter->cache
 *   low nibble 2  - a '~' has been seen and the next byte completes the escape
 *
 * Returns 0 when it runs to completion; every output call is wrapped in CK(),
 * which is defined outside this chunk. */
static int mbfl_filt_conv_hz_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status & 0xf) {
	case 0:
		if (c == '~') {
			/* Start of an escape sequence; the mode bits are kept */
			filter->status += 2;
		} else if (filter->status == 0x10) {
			if ((c > 0x20 && c <= 0x29) || (c >= 0x30 && c <= 0x77)) {
				/* DBCS first char: remember it and wait for the second one */
				filter->cache = c;
				filter->status += 1;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		} else if (filter->status == 0 && c >= 0 && c < 0x80) {
			/* latin, CTLs */
			CK((*filter->output_function)(c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: {
		/* GB2312 second char; whatever happens, go back to "between characters" */
		int lead = filter->cache;
		filter->status &= ~0xf;

		if (lead <= 0x20 || lead >= 0x7F || c <= 0x20 || c >= 0x7F) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		/* Index into the CP936 table for this GB2312 byte pair */
		int idx = (lead - 1)*192 + c + 0x40;
		int w;
		ZEND_ASSERT(idx < cp936_ucs_table_size);

		switch (idx) {
		/* Three positions decode to a fixed codepoint instead of the table value */
		case 0x1864:
			w = 0x30FB;
			break;
		case 0x186A:
			w = 0x2015;
			break;
		case 0x186C:
			w = 0x2225;
			break;
		default:
			if ((idx >= 0x1920 && idx <= 0x192A) || idx == 0x1963 ||
					(idx >= 0x1C60 && idx <= 0x1C7F) || (idx >= 0x1DBB && idx <= 0x1DC4)) {
				/* Table positions which are rejected here */
				w = 0;
			} else {
				w = cp936_ucs_table[idx];
			}
			break;
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;
	}

	case 2:
		/* Byte following '~' */
		if (c == '}' && filter->status == 0x12) {
			/* "~}" while in GB2312 mode: switch to ASCII mode */
			filter->status = 0;
		} else if (c == '{' && filter->status == 2) {
			/* "~{" while in ASCII mode: switch to GB2312 mode */
			filter->status = 0x10;
		} else if (c == '~' && filter->status == 2) {
			/* "~~" while in ASCII mode: a literal tilde */
			CK((*filter->output_function)('~', filter->data));
			filter->status -= 2;
		} else {
			filter->status -= 2;
			/* "~\n" is a line continuation; no output is needed, nor should we
			 * shift modes. Anything else after '~' is invalid. */
			if (c != '\n') {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;

	EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
12278
/* End-of-input handling for the HZ decoder: reports a dangling first byte of
 * a GB2312 pair, resets the decoder state, and then invokes the filter's
 * flush_function if one is set. */
static int mbfl_filt_conv_hz_wchar_flush(mbfl_convert_filter *filter)
{
	const int pair_truncated = (filter->status == 0x11);

	if (pair_truncated) {
		/* Input ended after the first byte of a 2-byte character */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}
	(*filter->flush_function)(filter->data);

	return 0;
}
12294
/* Codepoint-at-a-time HZ encoder: converts codepoint `c` and writes the
 * resulting bytes through filter->output_function, inserting "~{" / "~}"
 * whenever the output has to change between GB2312 and ASCII mode.
 *
 * The CP936 tables supply the codes, with per-range exception lists for
 * codepoints that are rejected or given a fixed code here. The high byte of
 * filter->status records the current output mode (0x200 while in GB2312
 * mode). Codepoints without a usable code go to mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_hz(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		int rejected = (c == 0xB7 || c == 0x144 || c == 0x148 || c == 0x251 ||
			c == 0x261 || c == 0x2CA || c == 0x2CB || c == 0x2D9);

		if (!rejected) {
			code = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
		}
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		if (c == 0x2015) {
			code = 0xA1AA;
		} else {
			int rejected = (c == 0x2010 || c == 0x2013 || c == 0x2014 || c == 0x2016 ||
				c == 0x2025 || c == 0x2035 || c == 0x2105 || c == 0x2109 || c == 0x2121 ||
				(c >= 0x2170 && c <= 0x2179) || (c >= 0x2196 && c <= 0x2199) ||
				c == 0x2215 || c == 0x221F || c == 0x2223 || c == 0x2252 ||
				c == 0x2266 || c == 0x2267 || c == 0x2295 ||
				(c >= 0x2550 && c <= 0x2573) || c == 0x22BF || c == 0x2609 ||
				(c >= 0x2581 && c <= 0x258F) || (c >= 0x2593 && c <= 0x2595) ||
				c == 0x25BC || c == 0x25BD || (c >= 0x25E2 && c <= 0x25E5));

			if (!rejected) {
				code = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
			}
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		if (c == 0x30FB) {
			code = 0xA1A4;
		} else {
			int rejected = (c == 0x3006 || c == 0x3007 || c == 0x3012 || c == 0x3231 ||
				c == 0x32A3 || c >= 0x3300 ||
				(c >= 0x3018 && c <= 0x3040) || (c >= 0x309B && c <= 0x309E) ||
				(c >= 0x30FC && c <= 0x30FE));

			if (!rejected) {
				code = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
			}
		}
	} else if (c >= ucs_i_gb2312_table_min && c < ucs_i_gb2312_table_max) {
		code = ucs_i_gb2312_table[c - ucs_i_gb2312_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		if (c == 0xFF04) {
			code = 0xA1E7;
		} else if (c == 0xFF5E) {
			code = 0xA1AB;
		} else if (c >= 0xFF01 && c <= 0xFF5D) {
			/* Contiguous run: offset from U+FF01 added to 0xA3A1 */
			code = c - 0xFF01 + 0xA3A1;
		} else if (c == 0xFFE0 || c == 0xFFE1 || c == 0xFFE3 || c == 0xFFE5) {
			code = ucs_hff_s_cp936_table[c - 0xFFE0];
		}
	}

	/* Strip the high bit from both bytes of a 2-byte code */
	if (code & 0x8000) {
		code -= 0x8080;
	}

	if (code <= 0) {
		/* Only U+0000 may legitimately produce a zero code */
		code = (c == 0) ? 0 : -1;
	} else if ((code >= 0x80 && code < 0x2121) || code > 0x8080) {
		/* Outside the range which can be emitted */
		code = -1;
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x80) {
		/* ASCII: leave GB2312 mode first if necessary */
		if ((filter->status & 0xff00) != 0) {
			CK((*filter->output_function)('~', filter->data));
			CK((*filter->output_function)('}', filter->data));
		}
		filter->status = 0;
		if (code == 0x7E) {
			/* A literal tilde is written doubled */
			CK((*filter->output_function)('~', filter->data));
		}
		CK((*filter->output_function)(code, filter->data));
	} else {
		/* GB 2312-80: enter GB2312 mode first if necessary */
		if ((filter->status & 0xFF00) != 0x200) {
			CK((*filter->output_function)('~', filter->data));
			CK((*filter->output_function)('{', filter->data));
		}
		filter->status = 0x200;
		CK((*filter->output_function)((code >> 8) & 0x7F, filter->data));
		CK((*filter->output_function)(code & 0x7F, filter->data));
	}

	return 0;
}
12376
/* End-of-output handling for the HZ encoder: if the high byte of the status
 * shows that output is not in ASCII mode, write "~}" to return to it, then
 * clear the status.
 * NOTE(review): unlike mbfl_filt_conv_hz_wchar_flush, this does not invoke
 * filter->flush_function -- confirm whether that is intended. */
static int mbfl_filt_conv_any_hz_flush(mbfl_convert_filter *filter)
{
	const int mode_bits = filter->status & 0xFF00;

	if (mode_bits != 0) {
		/* back to latin */
		CK((*filter->output_function)('~', filter->data));
		CK((*filter->output_function)('}', filter->data));
	}

	filter->status = 0;
	return 0;
}
12387
/* Mode values used by the buffer-based HZ converters: kept in `*state` by
 * mb_hz_to_wchar and in `buf->state` by mb_wchar_to_hz */
#define ASCII 0
#define GB2312 1
12390
mb_hz_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)12391 static size_t mb_hz_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
12392 {
12393 unsigned char *p = *in, *e = p + *in_len;
12394 uint32_t *out = buf, *limit = buf + bufsize;
12395
12396 while (p < e && out < limit) {
12397 unsigned char c = *p++;
12398
12399 if (c == '~') {
12400 if (p == e) {
12401 break;
12402 }
12403 unsigned char c2 = *p++;
12404
12405 if (c2 == '}' && *state == GB2312) {
12406 *state = ASCII;
12407 } else if (c2 == '{' && *state == ASCII) {
12408 *state = GB2312;
12409 } else if (c2 == '~' && *state == ASCII) {
12410 *out++ = '~';
12411 } else if (c2 == '\n') {
12412 /* "~\n" is a line continuation; no output is needed, nor should we shift modes */
12413 } else {
12414 /* Invalid character after ~ */
12415 *out++ = MBFL_BAD_INPUT;
12416 }
12417 } else if (((c > 0x20 && c <= 0x29) || (c >= 0x30 && c <= 0x77)) && p < e && *state == GB2312) {
12418 unsigned char c2 = *p++;
12419
12420 if (c > 0x20 && c < 0x7F && c2 > 0x20 && c2 < 0x7F) {
12421 unsigned int s = (c - 1)*192 + c2 + 0x40;
12422 ZEND_ASSERT(s < cp936_ucs_table_size);
12423
12424 if (s == 0x1864) {
12425 s = 0x30FB;
12426 } else if (s == 0x186A) {
12427 s = 0x2015;
12428 } else if (s == 0x186C) {
12429 s = 0x2225;
12430 } else if ((s >= 0x1920 && s <= 0x192A) || s == 0x1963 || (s >= 0x1C60 && s <= 0x1C7F) || (s >= 0x1DBB && s <= 0x1DC4)) {
12431 s = 0;
12432 } else {
12433 s = cp936_ucs_table[s];
12434 }
12435 if (!s)
12436 s = MBFL_BAD_INPUT;
12437 *out++ = s;
12438 } else {
12439 *out++ = MBFL_BAD_INPUT;
12440 }
12441 } else if (c < 0x80 && *state == ASCII) {
12442 *out++ = c;
12443 } else {
12444 *out++ = MBFL_BAD_INPUT;
12445 }
12446 }
12447
12448 *in_len = e - p;
12449 *in = p;
12450 return out - buf;
12451 }
12452
/* Encode `len` Unicode codepoints from `in` as HZ, appending the bytes to `buf`.
 *
 * buf->state tracks the current output mode (ASCII or GB2312); "~{" / "~}"
 * are written whenever the mode has to change, and a literal '~' is written
 * as "~~". When `end` is set and the output is not in ASCII mode after the
 * last codepoint, a closing "~}" is appended. Codepoints without a usable
 * code are reported through MB_CONVERT_ERROR. */
static void mb_wchar_to_hz(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Reserve one byte per input codepoint; longer outputs reserve more below */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int code = 0;

		if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			int rejected = (w == 0xB7 || w == 0x144 || w == 0x148 || w == 0x251 ||
				w == 0x261 || w == 0x2CA || w == 0x2CB || w == 0x2D9);

			if (!rejected) {
				code = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
			}
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			if (w == 0x2015) {
				code = 0xA1AA;
			} else {
				int rejected = (w == 0x2010 || w == 0x2013 || w == 0x2014 || w == 0x2016 ||
					w == 0x2025 || w == 0x2035 || w == 0x2105 || w == 0x2109 || w == 0x2121 ||
					(w >= 0x2170 && w <= 0x2179) || (w >= 0x2196 && w <= 0x2199) ||
					w == 0x2215 || w == 0x221F || w == 0x2223 || w == 0x2252 ||
					w == 0x2266 || w == 0x2267 || w == 0x2295 ||
					(w >= 0x2550 && w <= 0x2573) || w == 0x22BF || w == 0x2609 ||
					(w >= 0x2581 && w <= 0x258F) || (w >= 0x2593 && w <= 0x2595) ||
					w == 0x25BC || w == 0x25BD || (w >= 0x25E2 && w <= 0x25E5));

				if (!rejected) {
					code = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
				}
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			if (w == 0x30FB) {
				code = 0xA1A4;
			} else {
				int rejected = (w == 0x3006 || w == 0x3007 || w == 0x3012 || w == 0x3231 ||
					w == 0x32A3 || w >= 0x3300 ||
					(w >= 0x3018 && w <= 0x3040) || (w >= 0x309B && w <= 0x309E) ||
					(w >= 0x30FC && w <= 0x30FE));

				if (!rejected) {
					code = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
				}
			}
		} else if (w >= ucs_i_gb2312_table_min && w < ucs_i_gb2312_table_max) {
			code = ucs_i_gb2312_table[w - ucs_i_gb2312_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			if (w == 0xFF04) {
				code = 0xA1E7;
			} else if (w == 0xFF5E) {
				code = 0xA1AB;
			} else if (w >= 0xFF01 && w <= 0xFF5D) {
				/* Contiguous run: offset from U+FF01 added to 0xA3A1 */
				code = w - 0xFF01 + 0xA3A1;
			} else if (w == 0xFFE0 || w == 0xFFE1 || w == 0xFFE3 || w == 0xFFE5) {
				code = ucs_hff_s_cp936_table[w - 0xFFE0];
			}
		}

		/* Clear the high bit of both bytes */
		code &= ~0x8080;

		if ((!code && w) || (code >= 0x80 && code < 0x2121)) {
			/* No usable code for this codepoint */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_hz);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		}

		if (code < 0x80) {
			/* ASCII */
			if (buf->state != ASCII) {
				/* 2 bytes for "~}" plus 1 for the character itself */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
				out = mb_convert_buf_add2(out, '~', '}');
				buf->state = ASCII;
			}
			if (code == '~') {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
				out = mb_convert_buf_add2(out, '~', '~');
			} else {
				out = mb_convert_buf_add(out, code);
			}
			continue;
		}

		/* GB 2312-80 */
		if (buf->state == GB2312) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
		} else {
			/* 2 bytes for "~{" plus 2 for the character itself */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			out = mb_convert_buf_add2(out, '~', '{');
			buf->state = GB2312;
		}
		out = mb_convert_buf_add2(out, (code >> 8) & 0x7F, code & 0x7F);
	}

	if (end && buf->state != ASCII) {
		/* If not in ASCII state, need to emit closing control chars */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 2);
		out = mb_convert_buf_add2(out, '~', '}');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
12538
/* Filter table for HZ -> wchar, using mbfl_filt_conv_hz_wchar and
 * mbfl_filt_conv_hz_wchar_flush.
 * NOTE(review): the initializer is positional; the role of each slot is
 * inferred from the names used here -- confirm against the declaration of
 * struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_hz_wchar = {
	mbfl_no_encoding_hz,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_hz_wchar,
	mbfl_filt_conv_hz_wchar_flush,
	NULL,
};
12548
/* Filter table for wchar -> HZ, using mbfl_filt_conv_wchar_hz and
 * mbfl_filt_conv_any_hz_flush (which writes the closing "~}" when needed).
 * NOTE(review): the initializer is positional; the role of each slot is
 * inferred from the names used here -- confirm against the declaration of
 * struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_wchar_hz = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_hz,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_hz,
	mbfl_filt_conv_any_hz_flush,
	NULL,
};
12558
/* Encoding descriptor for HZ. It ties together the name strings, both filter
 * tables (vtbl_hz_wchar / vtbl_wchar_hz) and the buffer converters
 * mb_hz_to_wchar / mb_wchar_to_hz. The two slots which hold an alias list and
 * a byte-length table in the Big5 and CP950 descriptors are NULL here.
 * NOTE(review): the initializer is positional; which string is the canonical
 * name and which the MIME name, and the meaning of the trailing NULL, should
 * be confirmed against the declaration of mbfl_encoding. */
const mbfl_encoding mbfl_encoding_hz = {
	mbfl_no_encoding_hz,
	"HZ",
	"HZ-GB-2312",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_hz_wchar,
	&vtbl_wchar_hz,
	mb_hz_to_wchar,
	mb_wchar_to_hz,
	NULL
};
12572