1 #include "mbfilter_cjk.h"
2
3 #include "unicode_table_jis.h"
4 #include "unicode_table_jis2004.h"
5 #include "unicode_table_big5.h"
6 #include "unicode_table_cns11643.h"
7 #include "unicode_table_cp932_ext.h"
8 #include "unicode_table_cp936.h"
9 #include "unicode_table_gb18030.h"
10 #include "unicode_table_gb2312.h"
11 #include "unicode_table_uhc.h"
12 #include "cp932_table.h"
13 #include "sjis_mac2uni.h"
14 #include "translit_kana_jisx0201_jisx0208.h"
15 #include "emoji2uni.h"
16
/* Regional Indicator Symbols occupy the Unicode codepoints 0x1F1E6-0x1F1FF,
 * one for each of the letters A-Z.
 * The flag emoji for a country is written as two of these codepoints, which
 * spell out the two-letter code for that country.
 * This macro maps an uppercase ASCII letter to its Regional Indicator codepoint */
#define NFLAGS(c) (0x1F1E6 + ((unsigned int)(c) - 'A'))
23
/* Two-letter country codes of the flags which have carrier emoji
 * (each entry is exactly two characters, with no terminating NUL) */
static const char nflags_s[10][2] = {
	"CN", "DE", "ES", "FR", "GB",
	"IT", "JP", "KR", "RU", "US"
};

/* KDDI emoji code for the country at the same index of `nflags_s` */
static const int nflags_code_kddi[10] = {
	0x2549, 0x2546, 0x24C0, 0x2545, 0x2548,
	0x2547, 0x2750, 0x254A, 0x24C1, 0x27F7
};

/* SoftBank emoji codes; presumably also parallel to `nflags_s`
 * (the code which uses this table is not visible here -- TODO confirm) */
static const int nflags_code_sb[10] = {
	0x2B0A, 0x2B05, 0x2B08, 0x2B04, 0x2B07,
	0x2B06, 0x2B02, 0x2B0B, 0x2B09, 0x2B03
};
27
/* Helpers for emoji which are made up of two Unicode codepoints.
 * Both store the first codepoint in `*snd` (a variable named `snd` must be in
 * scope) and then return the second codepoint from the enclosing function. */

/* Keypad emoji: the character `c` followed by U+20E3 */
#define EMIT_KEYPAD_EMOJI(c) \
	do { \
		*snd = (c); \
		return 0x20E3; \
	} while (0)

/* Flag emoji: the Regional Indicators for the two letters of `country` */
#define EMIT_FLAG_EMOJI(country) \
	do { \
		*snd = NFLAGS((country)[0]); \
		return NFLAGS((country)[1]); \
	} while (0)
30
/* Country codes for the KDDI flag emoji 0x2545-0x254A; indexed by (code - 0x2545) */
static const char nflags_kddi[6][2] = {
	"FR", "DE", "IT", "GB", "CN", "KR"
};

/* Country codes for SoftBank flag emoji
 * (the indexing scheme is not visible in this part of the file) */
static const char nflags_sb[10][2] = {
	"JP", "US", "FR", "DE", "IT",
	"GB", "ES", "RU", "CN", "KR"
};
33
/* Maps a keypad digit `n` to the (ku*94)+ten value of the corresponding
 * telephone keypad character; zero has its own value, separate from 1-9 */
#define DOCOMO_KEYPAD(n) (((n) != 0) ? (0x2965 + (n)) : 0x296F)

/* (ku*94)+ten value of the '#' telephone keypad character */
#define DOCOMO_KEYPAD_HASH 0x2964
37
/* Binary search over a table of inclusive ranges.
 * `tbl` holds `n` ranges, each stored as a (low, high) pair of unsigned shorts;
 * for the search to work, the ranges must be sorted in ascending order.
 * Returns the index of the range which contains `w`, or -1 if there is none. */
static int mbfl_bisec_srch(int w, const unsigned short *tbl, int n)
{
	int first = 0;
	int last = n - 1;

	while (first <= last) {
		int mid = (first + last) >> 1;
		const unsigned short *range = tbl + (2 * mid);

		if (w < range[0]) {
			/* `w` lies below this range; keep looking in the lower half */
			last = mid - 1;
			continue;
		}
		if (w > range[1]) {
			/* `w` lies above this range; keep looking in the upper half */
			first = mid + 1;
			continue;
		}
		return mid;
	}

	return -1;
}
55
/* Binary search over a table of single values (not ranges).
 * `tbl` holds `n` unsigned shorts, which must be sorted in ascending order.
 * Returns the index of the entry equal to `w`, or -1 if there is none. */
int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n)
{
	int first = 0;
	int last = n - 1;

	while (first <= last) {
		int mid = (first + last) >> 1;
		unsigned short entry = tbl[mid];

		if (w < entry) {
			last = mid - 1;
			continue;
		}
		if (w > entry) {
			first = mid + 1;
			continue;
		}
		return mid;
	}

	return -1;
}
73
/* Binary search over a table of (key, value) pairs.
 * `tbl` holds `n` pairs, which must be sorted by key in ascending order.
 * Returns a pointer to the value paired with the key `w`, or NULL if `w` is
 * not one of the keys. */
static const unsigned short *mbfl_binary_search_paired_sorted_table(uint32_t w, const unsigned short tbl[][2], int n)
{
	/* The search window is the half-open interval [lo, hi) */
	int lo = 0;
	int hi = n;

	while (lo < hi) {
		int mid = (lo + hi) >> 1;

		if (w < tbl[mid][0]) {
			hi = mid;
			continue;
		}
		if (w > tbl[mid][0]) {
			lo = mid + 1;
			continue;
		}
		return &tbl[mid][1];
	}

	return NULL;
}
90
/* Convert the JIS X 0208 byte pair (c1, c2) to the corresponding Shift-JIS
 * byte pair, which is stored in the lvalues `s1` and `s2`.
 *
 * Every use of a macro argument is parenthesized, so that an expression such
 * as `x & 0x7F` can safely be passed as an argument. The arguments are still
 * evaluated more than once, so they must not have side effects. */
#define SJIS_ENCODE(c1,c2,s1,s2) \
	do { \
		(s1) = (((c1) - 1) >> 1) + ((c1) < 0x5F ? 0x71 : 0xB1); \
		(s2) = (c2); \
		if ((c1) & 1) { \
			if ((c2) < 0x60) { \
				(s2)--; \
			} \
			(s2) += 0x20; \
		} else { \
			(s2) += 0x7e; \
		} \
	} while (0)
104
/* Convert the Shift-JIS byte pair (c1, c2) to the corresponding JIS X 0208
 * byte pair, which is stored in the lvalues `s1` and `s2`.
 *
 * Every use of a macro argument is parenthesized, so that an expression such
 * as `w & 0xFF` can safely be passed as an argument. The arguments are still
 * evaluated more than once, so they must not have side effects. */
#define SJIS_DECODE(c1,c2,s1,s2) \
	do { \
		if ((c1) < 0xa0) { \
			(s1) = (((c1) - 0x81) << 1) + 0x21; \
		} else { \
			(s1) = (((c1) - 0xc1) << 1) + 0x21; \
		} \
		(s2) = (c2); \
		if ((c2) < 0x9f) { \
			if ((c2) < 0x7f) { \
				(s2)++; \
			} \
			(s2) -= 0x20; \
		} else { \
			(s1)++; \
			(s2) -= 0x7e; \
		} \
	} while (0)
123
/* Evaluate `statement` once; if the result is negative, return -1 from the
 * enclosing function (used to propagate errors from output functions) */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
125
/*
 * ISO-2022 variants
 */

/* Character set selectors as stored in `filter->status` by the byte-at-a-time
 * decoders: JISX0201_KANA and JISX0208_KANJI are compared against the status
 * by mbfl_filt_conv_2022jp_mobile_wchar, and mbfl_filt_conv_jis_wchar uses the
 * same values (0x20, 0x80) as literals. The low 4 bits are left free, since
 * those decoders use them to track progress through escape sequences and
 * 2-byte characters. */
#define ASCII 0
#define JISX0201_KANA 0x20
#define JISX0208_KANJI 0x80
133
/* Byte-at-a-time decoder for JIS (ISO-2022 style) text. Consumes one input
 * byte `c` and passes zero or more Unicode codepoints (or MBFL_BAD_INPUT for
 * invalid input) to `filter->output_function`.
 *
 * `filter->status` holds the decoder state. The upper bits select the active
 * character set:
 *   0x00 ASCII, 0x10 JIS X 0201 Latin, 0x20 JIS X 0201 Kana,
 *   0x80 JIS X 0208, 0x90 JIS X 0212
 * and the low 4 bits track progress through a multi-byte unit:
 *   0 = idle, 1 = second byte of a 2-byte character expected,
 *   2 = after ESC, 3 = after ESC $, 4 = after ESC $ (, 5 = after ESC (
 *
 * Returns 0, or -1 as soon as the output function returns a negative value. */
static int mbfl_filt_conv_jis_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, pos, wc;

retry:
	switch (filter->status & 0xf) {
	case 0: {
		/* Low 4 bits are zero here, so the whole status is the character set */
		int charset = filter->status;

		if (c == 0x1b) { /* ESC; an escape sequence follows */
			filter->status = charset + 2;
		} else if (c == 0x0e) { /* "kana in" */
			filter->status = 0x20;
		} else if (c == 0x0f) { /* "kana out" */
			filter->status = 0;
		} else if (charset == 0x10 && (c == 0x5c || c == 0x7e)) {
			/* JIS X 0201 Latin: 0x5C is YEN SIGN and 0x7E is OVER LINE */
			CK((*filter->output_function)((c == 0x5c) ? 0xa5 : 0x203e, filter->data));
		} else if (charset == 0x20 && c > 0x20 && c < 0x60) { /* kana */
			CK((*filter->output_function)(c + 0xff40, filter->data));
		} else if ((charset == 0x80 || charset == 0x90) && c > 0x20 && c < 0x7f) {
			/* First byte of a 2-byte character; remember it and wait for the second */
			filter->cache = c;
			filter->status = charset + 1;
		} else if (c >= 0 && c < 0x80) { /* latin, CTLs */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xa0 && c < 0xe0) { /* GR kana */
			CK((*filter->output_function)(c + 0xfec0, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

	/* Second byte of a JIS X 0208 or JIS X 0212 character */
	case 1:
		filter->status &= ~0xf;
		lead = filter->cache;

		if (c <= 0x20 || c >= 0x7f) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		/* Position in the 94x94 grid */
		pos = (lead - 0x21)*94 + c - 0x21;
		wc = 0;
		if (filter->status == 0x80) {
			if (pos >= 0 && pos < jisx0208_ucs_table_size) {
				wc = jisx0208_ucs_table[pos];
			}
		} else {
			if (pos >= 0 && pos < jisx0212_ucs_table_size) {
				wc = jisx0212_ucs_table[pos];
			}
		}
		if (wc <= 0) {
			/* Out of range, or no character is assigned to this position */
			wc = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(wc, filter->data));
		break;

	/* After ESC */
	case 2:
		if (c == '$') {
			filter->status += 1;
			break;
		}
		if (c == '(') {
			filter->status += 3;
			break;
		}
		/* Not an escape sequence we know; report the ESC as an error, then
		 * process this byte again as ordinary input */
		filter->status &= ~0xf;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		goto retry;

	/* After ESC $ */
	case 3:
		if (c == '@' || c == 'B') {
			filter->status = 0x80;
			break;
		}
		if (c == '(') {
			filter->status += 1;
			break;
		}
		/* Report the ESC as an error, pass the '$' through, then process
		 * this byte again as ordinary input */
		filter->status &= ~0xf;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		CK((*filter->output_function)('$', filter->data));
		goto retry;

	/* After ESC $ ( */
	case 4:
		if (c == '@' || c == 'B') {
			filter->status = 0x80;
			break;
		}
		if (c == 'D') {
			filter->status = 0x90;
			break;
		}
		filter->status &= ~0xf;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		CK((*filter->output_function)('$', filter->data));
		CK((*filter->output_function)('(', filter->data));
		goto retry;

	/* After ESC ( */
	case 5:
		if (c == 'B' || c == 'H') {
			filter->status = 0;
			break;
		}
		if (c == 'J') {
			filter->status = 0x10;
			break;
		}
		if (c == 'I') {
			filter->status = 0x20;
			break;
		}
		filter->status &= ~0xf;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		CK((*filter->output_function)('(', filter->data));
		goto retry;

	EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
287
/* Called at the end of input for the JIS decoder. If the input stopped in the
 * middle of a 2-byte character or an escape sequence, one MBFL_BAD_INPUT is
 * emitted. The decoder state is then reset, and the flush is passed on via
 * `filter->flush_function` if one is set.
 * Returns 0, or -1 if the output function returns a negative value. */
static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter)
{
	int unfinished = filter->status & 0xF;

	if (unfinished != 0) {
		/* Truncated 2-byte (JIS X 0208 or 0212) character or escape sequence */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}
	(*filter->flush_function)(filter->data);
	return 0;
}
303
/* Byte-at-a-time encoder: converts the Unicode codepoint `c` to JIS and passes
 * the resulting bytes to `filter->output_function`, preceded by an escape
 * sequence whenever the character set has to be switched.
 *
 * The upper byte of `filter->status` (mask 0xff00) records the character set
 * which is currently designated in the output:
 *   0x000 ASCII, 0x100 JIS X 0201 Kana, 0x200 JIS X 0208,
 *   0x300 JIS X 0212, 0x400 JIS X 0201 Latin
 *
 * Codepoints which cannot be encoded are handed to
 * mbfl_filt_conv_illegal_output.
 * Returns 0, or -1 as soon as an output call returns a negative value. */
static int mbfl_filt_conv_wchar_jis(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) { /* OVERLINE */
		s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (s <= 0) {
		/* Not found in the tables; try a few hard-coded mappings */
		if (c == 0xa5) { /* YEN SIGN */
			s = 0x1005c;
		} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
			s = 0x2140;
		} else if (c == 0x2225) { /* PARALLEL TO */
			s = 0x2142;
		} else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
			s = 0x215d;
		} else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
			s = 0x2171;
		} else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
			s = 0x2172;
		} else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
			s = 0x224c;
		}
		if (c == 0) {
			s = 0;
		} else if (s <= 0) {
			s = -1; /* Cannot be encoded */
		}
	}

	if (s >= 0) {
		if (s < 0x80) { /* ASCII */
			if ((filter->status & 0xff00) != 0) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x28, filter->data)); /* '(' */
				CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
			}
			filter->status = 0;
			CK((*filter->output_function)(s, filter->data));
		} else if (s >= 0xA1 && s <= 0xDF) { /* X 0201 kana */
			/* Single-byte kana must not fall through to the 2-byte JIS X 0208
			 * branch below; designate JIS X 0201 Kana and emit one 7-bit byte,
			 * as mb_wchar_to_jis does */
			if ((filter->status & 0xff00) != 0x100) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x28, filter->data)); /* '(' */
				CK((*filter->output_function)(0x49, filter->data)); /* 'I' */
			}
			filter->status = 0x100;
			CK((*filter->output_function)(s & 0x7f, filter->data));
		} else if (s < 0x8080) { /* X 0208 */
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x24, filter->data)); /* '$' */
				CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
			}
			filter->status = 0x200;
			CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
			CK((*filter->output_function)(s & 0x7f, filter->data));
		} else if (s < 0x10000) { /* X 0212 */
			if ((filter->status & 0xff00) != 0x300) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x24, filter->data)); /* '$' */
				CK((*filter->output_function)(0x28, filter->data)); /* '(' */
				CK((*filter->output_function)(0x44, filter->data)); /* 'D' */
			}
			filter->status = 0x300;
			CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
			CK((*filter->output_function)(s & 0x7f, filter->data));
		} else { /* X 0201 latin */
			if ((filter->status & 0xff00) != 0x400) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x28, filter->data)); /* '(' */
				CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
			}
			filter->status = 0x400;
			CK((*filter->output_function)(s & 0x7f, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
384
/* Byte-at-a-time encoder: converts the Unicode codepoint `c` to ISO-2022-JP
 * and passes the resulting bytes to `filter->output_function`, preceded by an
 * escape sequence whenever the character set has to be switched.
 *
 * Only ASCII, JIS X 0201 Latin and JIS X 0208 are emitted; table values in
 * the ranges 0x80-0x2120 and above 0x8080 are treated as unencodable.
 *
 * The upper byte of `filter->status` (mask 0xff00) records the character set
 * which is currently designated in the output:
 *   0x000 ASCII, 0x200 JIS X 0208, 0x400 JIS X 0201 Latin
 *
 * Codepoints which cannot be encoded are handed to
 * mbfl_filt_conv_illegal_output rather than being silently dropped.
 * Returns 0, or -1 as soon as an output call returns a negative value. */
static int mbfl_filt_conv_wchar_2022jp(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (s <= 0) {
		/* Not found in the tables; try a few hard-coded mappings */
		if (c == 0xa5) { /* YEN SIGN */
			s = 0x1005c;
		} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
			s = 0x2140;
		} else if (c == 0x2225) { /* PARALLEL TO */
			s = 0x2142;
		} else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
			s = 0x215d;
		} else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
			s = 0x2171;
		} else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
			s = 0x2172;
		} else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
			s = 0x224c;
		}
		if (c == 0) {
			s = 0;
		} else if (s <= 0) {
			s = -1; /* Cannot be encoded */
		}
	} else if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
		/* The table has a mapping, but not one which this encoder emits */
		s = -1;
	}

	if (s >= 0) {
		if (s < 0x80) { /* ASCII */
			if ((filter->status & 0xff00) != 0) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x28, filter->data)); /* '(' */
				CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
			}
			filter->status = 0;
			CK((*filter->output_function)(s, filter->data));
		} else if (s < 0x10000) { /* X 0208 */
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x24, filter->data)); /* '$' */
				CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
			}
			filter->status = 0x200;
			CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
			CK((*filter->output_function)(s & 0x7f, filter->data));
		} else { /* X 0201 latin */
			if ((filter->status & 0xff00) != 0x400) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)(0x28, filter->data)); /* '(' */
				CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
			}
			filter->status = 0x400;
			CK((*filter->output_function)(s & 0x7f, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
455
/* Character set states for the buffer-based ISO-2022-JP/JIS converters and
 * checkers: stored in `*state` while decoding and in `buf->state` while
 * encoding. The decoder relies on the ordering (`*state >= JISX_0208` selects
 * the two 2-byte character sets). */
#define ASCII 0
#define JISX_0201_LATIN 1
#define JISX_0201_KANA 2
#define JISX_0208 3
#define JISX_0212 4
461
mb_iso2022jp_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)462 static size_t mb_iso2022jp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
463 {
464 ZEND_ASSERT(bufsize >= 3);
465
466 unsigned char *p = *in, *e = p + *in_len;
467 uint32_t *out = buf, *limit = buf + bufsize;
468
469 while (p < e && out < limit) {
470 unsigned char c = *p++;
471
472 if (c == 0x1B) {
473 /* ESC seen; this is an escape sequence */
474 if ((e - p) < 2) {
475 *out++ = MBFL_BAD_INPUT;
476 if (p != e && (*p == '$' || *p == '('))
477 p++;
478 continue;
479 }
480
481 unsigned char c2 = *p++;
482 if (c2 == '$') {
483 unsigned char c3 = *p++;
484 if (c3 == '@' || c3 == 'B') {
485 *state = JISX_0208;
486 } else if (c3 == '(') {
487 if (p == e) {
488 *out++ = MBFL_BAD_INPUT;
489 break;
490 }
491 unsigned char c4 = *p++;
492 if (c4 == '@' || c4 == 'B') {
493 *state = JISX_0208;
494 } else if (c4 == 'D') {
495 *state = JISX_0212;
496 } else {
497 if ((limit - out) < 3) {
498 p -= 4;
499 break;
500 }
501 *out++ = MBFL_BAD_INPUT;
502 *out++ = '$';
503 *out++ = '(';
504 p--;
505 }
506 } else {
507 if ((limit - out) < 2) {
508 p -= 3;
509 break;
510 }
511 *out++ = MBFL_BAD_INPUT;
512 *out++ = '$';
513 p--;
514 }
515 } else if (c2 == '(') {
516 unsigned char c3 = *p++;
517 if (c3 == 'B' || c3 == 'H') {
518 *state = ASCII;
519 } else if (c3 == 'J') {
520 *state = JISX_0201_LATIN;
521 } else if (c3 == 'I') {
522 *state = JISX_0201_KANA;
523 } else {
524 if ((limit - out) < 2) {
525 p -= 3;
526 break;
527 }
528 *out++ = MBFL_BAD_INPUT;
529 *out++ = '(';
530 p--;
531 }
532 } else {
533 *out++ = MBFL_BAD_INPUT;
534 p--;
535 }
536 } else if (c == 0xE) {
537 /* "Kana In" marker; this is just for JIS-7/8, but we also accept it for ISO-2022-JP */
538 *state = JISX_0201_KANA;
539 } else if (c == 0xF) {
540 /* "Kana Out" marker */
541 *state = ASCII;
542 } else if (*state == JISX_0201_LATIN && c == 0x5C) { /* YEN SIGN */
543 *out++ = 0xA5;
544 } else if (*state == JISX_0201_LATIN && c == 0x7E) { /* OVER LINE */
545 *out++ = 0x203E;
546 } else if (*state == JISX_0201_KANA && c > 0x20 && c < 0x60) {
547 *out++ = 0xFF40 + c;
548 } else if (*state >= JISX_0208 && c > 0x20 && c < 0x7F) {
549 if (p == e) {
550 *out++ = MBFL_BAD_INPUT;
551 break;
552 }
553 unsigned char c2 = *p++;
554 if (c2 > 0x20 && c2 < 0x7F) {
555 unsigned int s = (c - 0x21)*94 + c2 - 0x21;
556 uint32_t w = 0;
557 if (*state == JISX_0208) {
558 if (s < jisx0208_ucs_table_size) {
559 w = jisx0208_ucs_table[s];
560 }
561 if (!w) {
562 w = MBFL_BAD_INPUT;
563 }
564 } else {
565 if (s < jisx0212_ucs_table_size) {
566 w = jisx0212_ucs_table[s];
567 }
568 if (!w) {
569 w = MBFL_BAD_INPUT;
570 }
571 }
572 *out++ = w;
573 } else {
574 *out++ = MBFL_BAD_INPUT;
575 }
576 } else if (c < 0x80) {
577 *out++ = c;
578 } else if (c >= 0xA1 && c <= 0xDF) {
579 /* GR-invoked Kana; "GR" stands for "graphics right" and refers to bytes
580 * with the MSB bit (in the context of ISO-2022 encoding).
581 *
582 * In this regard, Wikipedia states:
583 * "Other, older variants known as JIS7 and JIS8 build directly on the 7-bit and 8-bit
584 * encodings defined by JIS X 0201 and allow use of JIS X 0201 kana from G1 without
585 * escape sequences, using Shift Out and Shift In or setting the eighth bit
586 * (GR-invoked), respectively."
587 *
588 * Note that we support both the 'JIS7' use of 0xE/0xF Shift In/Shift Out codes
589 * and the 'JIS8' use of GR-invoked Kana */
590 *out++ = 0xFEC0 + c;
591 } else {
592 *out++ = MBFL_BAD_INPUT;
593 }
594 }
595
596 *in_len = e - p;
597 *in = p;
598 return out - buf;
599 }
600
/* Encode `len` Unicode codepoints from `in` as ISO-2022-JP, appending the
 * bytes to `buf`. Only ASCII, JIS X 0201 Latin and JIS X 0208 are emitted
 * (plus the single table value 0x8080, which takes the JIS X 0212 path);
 * anything else is reported via MB_CONVERT_ERROR.
 *
 * `buf->state` records the character set which is currently designated in the
 * output. If `end` is true and the output is not in ASCII mode after the last
 * codepoint, a final ESC ( B is appended. */
static void mb_wchar_to_iso2022jp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len != 0) {
		/* From here on, `len` counts the codepoints which follow this one */
		len--;

		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (s != 0) {
			if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
				/* The table has a mapping, but not one which this encoder emits */
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
		} else {
			/* Not found in the tables; try a few hard-coded mappings */
			switch (w) {
			case 0xA5: /* YEN SIGN */
				s = 0x1005C;
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				s = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				s = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				s = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				s = 0x224C;
				break;
			default:
				break;
			}

			/* U+0000 is the only codepoint which legitimately encodes as zero */
			if (s == 0 && w != 0) {
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
		}

		if (s < 0x80) { /* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s < 0x8080) { /* JIS X 0208 */
			if (buf->state != JISX_0208) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else if (s < 0x10000) { /* JIS X 0212 */
			if (buf->state != JISX_0212) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'D');
				buf->state = JISX_0212;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else { /* X 0201 Latin */
			if (buf->state != JISX_0201_LATIN) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		}
	}

	if (end && buf->state != ASCII) {
		/* Leave the output in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
685
/* Encode `len` Unicode codepoints from `in` as JIS, appending the bytes to
 * `buf`. ASCII, JIS X 0201 Latin, JIS X 0201 Kana, JIS X 0208 and JIS X 0212
 * are all emitted, each introduced by its escape sequence whenever the
 * character set changes. Codepoints which cannot be encoded are reported via
 * MB_CONVERT_ERROR.
 *
 * `buf->state` records the character set which is currently designated in the
 * output. If `end` is true and the output is not in ASCII mode after the last
 * codepoint, a final ESC ( B is appended. */
static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		/* Inside the loop, `len` counts the codepoints which follow this one */
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w == 0x203E) { /* OVERLINE */
			s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (s == 0) {
			/* Not found in the tables; try a few hard-coded mappings */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x1005C;
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			} else if (w != 0) {
				/* Pass this encoder itself (not the ISO-2022-JP one), as
				 * mb_wchar_to_iso2022jp does with its own name.
				 * NOTE(review): assumes MB_CONVERT_ERROR uses this argument to
				 * encode the error output -- confirm against its definition. */
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_jis);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
		}

		if (s < 0x80) { /* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA1 && s <= 0xDF) { /* X 0201 Kana */
			if (buf->state != JISX_0201_KANA) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else if (s < 0x8080) { /* JIS X 0208 */
			if (buf->state != JISX_0208) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else if (s < 0x10000) { /* JIS X 0212 */
			if (buf->state != JISX_0212) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'D');
				buf->state = JISX_0212;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else { /* X 0201 Latin */
			if (buf->state != JISX_0201_LATIN) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		}
	}

	if (end && buf->state != ASCII) {
		/* Leave the output in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
775
/* Additional state used by mb_check_jis: JIS X 0201 Kana which was entered via
 * the 0xE "Kana In" byte (and must be left again via 0xF "Kana Out"), as
 * opposed to JISX_0201_KANA, which is entered via the ESC ( I escape sequence */
#define JISX_0201_KANA_SO 5
777
mb_check_jis(unsigned char * in,size_t in_len)778 static bool mb_check_jis(unsigned char *in, size_t in_len)
779 {
780 unsigned char *p = in, *e = p + in_len;
781 unsigned int state = ASCII;
782
783 while (p < e) {
784 unsigned char c = *p++;
785 if (c == 0x1B) {
786 /* ESC seen; this is an escape sequence */
787 if (state == JISX_0201_KANA_SO) {
788 return false;
789 }
790 if ((e - p) < 2) {
791 return false;
792 }
793 unsigned char c2 = *p++;
794 if (c2 == '$') {
795 unsigned char c3 = *p++;
796 if (c3 == '@' || c3 == 'B') {
797 state = JISX_0208;
798 } else if (c3 == '(') {
799 if (p == e) {
800 return false;
801 }
802 unsigned char c4 = *p++;
803 if (c4 == '@' || c4 == 'B') {
804 state = JISX_0208;
805 } else if (c4 == 'D') {
806 state = JISX_0212;
807 } else {
808 return false;
809 }
810 } else {
811 return false;
812 }
813 } else if (c2 == '(') {
814 unsigned char c3 = *p++;
815 /* ESC ( H is treated as a sequence transitioning to ASCII for historical reasons.
816 * see https://github.com/php/php-src/pull/10828#issuecomment-1478342432. */
817 if (c3 == 'B' || c3 == 'H') {
818 state = ASCII;
819 } else if (c3 == 'J') {
820 state = JISX_0201_LATIN;
821 } else if (c3 == 'I') {
822 state = JISX_0201_KANA;
823 } else {
824 return false;
825 }
826 } else {
827 return false;
828 }
829 } else if (c == 0xE) {
830 /* "Kana In" marker */
831 if (state != ASCII) {
832 return false;
833 }
834 state = JISX_0201_KANA_SO;
835 } else if (c == 0xF) {
836 /* "Kana Out" marker */
837 if (state != JISX_0201_KANA_SO) {
838 return false;
839 }
840 state = ASCII;
841 } else if ((state == JISX_0208 || state == JISX_0212) && (c > 0x20 && c < 0x7F)) {
842 if (p == e) {
843 return false;
844 }
845 unsigned char c2 = *p++;
846 if (c2 > 0x20 && c2 < 0x7F) {
847 unsigned int s = (c - 0x21)*94 + c2 - 0x21;
848 if (state == JISX_0208) {
849 if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
850 continue;
851 }
852 } else {
853 if (s < jisx0212_ucs_table_size && jisx0212_ucs_table[s]) {
854 continue;
855 }
856 }
857 return false;
858 } else {
859 return false;
860 }
861 } else if (c < 0x80) {
862 continue;
863 } else if (c >= 0xA1 && c <= 0xDF) {
864 /* GR-invoked Kana */
865 continue;
866 } else {
867 return false;
868 }
869 }
870
871 return state == ASCII;
872 }
873
mb_check_iso2022jp(unsigned char * in,size_t in_len)874 static bool mb_check_iso2022jp(unsigned char *in, size_t in_len)
875 {
876 unsigned char *p = in, *e = p + in_len;
877 unsigned int state = ASCII;
878
879 while (p < e) {
880 unsigned char c = *p++;
881 if (c == 0x1B) {
882 /* ESC seen; this is an escape sequence */
883 if ((e - p) < 2) {
884 return false;
885 }
886 unsigned char c2 = *p++;
887 if (c2 == '$') {
888 unsigned char c3 = *p++;
889 if (c3 == '@' || c3 == 'B') {
890 state = JISX_0208;
891 } else {
892 return false;
893 }
894 } else if (c2 == '(') {
895 unsigned char c3 = *p++;
896 if (c3 == 'B') {
897 state = ASCII;
898 } else if (c3 == 'J') {
899 state = JISX_0201_LATIN;
900 } else {
901 return false;
902 }
903 } else {
904 return false;
905 }
906 } else if (c == 0xE || c == 0xF) {
907 /* "Kana In" or "Kana Out" marker; ISO-2022-JP is not accepted. */
908 return false;
909 } else if (state == JISX_0208 && (c > 0x20 && c < 0x7F)) {
910 if (p == e) {
911 return false;
912 }
913 unsigned char c2 = *p++;
914 if (c2 > 0x20 && c2 < 0x7F) {
915 unsigned int s = (c - 0x21)*94 + c2 - 0x21;
916 if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
917 continue;
918 }
919 return false;
920 } else {
921 return false;
922 }
923 } else if (c < 0x80) {
924 continue;
925 } else {
926 return false;
927 }
928 }
929
930 return state == ASCII;
931 }
932
/* The emoji tables only store 16 bits per entry, so the true codepoint has to
 * be recovered from the stored value:
 *
 * - Stored values above 0xF000 stand for Unicode emoji codepoints above
 *   0x1F000; add 0x10000.
 * - Stored values above 0xE000 (up to 0xF000) stand for emoji which Unicode
 *   does not support, for which codepoints in the Private Use Area above
 *   0xFE000 are used; add 0xF0000.
 * - Any other value is already the true codepoint. */
static inline int convert_emoji_cp(int cp)
{
	int offset = 0;

	if (cp > 0xF000) {
		offset = 0x10000;
	} else if (cp > 0xE000) {
		offset = 0xF0000;
	}

	return cp + offset;
}
947
mbfilter_sjis_emoji_kddi2unicode(int s,int * snd)948 int mbfilter_sjis_emoji_kddi2unicode(int s, int *snd)
949 {
950 if (s >= mb_tbl_code2uni_kddi1_min && s <= mb_tbl_code2uni_kddi1_max) {
951 if (s == 0x24C0) { /* Spain */
952 EMIT_FLAG_EMOJI("ES");
953 } else if (s == 0x24C1) { /* Russia */
954 EMIT_FLAG_EMOJI("RU");
955 } else if (s >= 0x2545 && s <= 0x254A) {
956 EMIT_FLAG_EMOJI(nflags_kddi[s - 0x2545]);
957 } else if (s == 0x25BC) {
958 EMIT_KEYPAD_EMOJI('#');
959 } else {
960 *snd = 0;
961 return convert_emoji_cp(mb_tbl_code2uni_kddi1[s - mb_tbl_code2uni_kddi1_min]);
962 }
963 } else if (s >= mb_tbl_code2uni_kddi2_min && s <= mb_tbl_code2uni_kddi2_max) {
964 if (s == 0x2750) { /* Japan */
965 EMIT_FLAG_EMOJI("JP");
966 } else if (s >= 0x27A6 && s <= 0x27AE) {
967 EMIT_KEYPAD_EMOJI(s - 0x27A6 + '1');
968 } else if (s == 0x27F7) { /* United States */
969 EMIT_FLAG_EMOJI("US");
970 } else if (s == 0x2830) {
971 EMIT_KEYPAD_EMOJI('0');
972 } else {
973 *snd = 0;
974 return convert_emoji_cp(mb_tbl_code2uni_kddi2[s - mb_tbl_code2uni_kddi2_min]);
975 }
976 }
977 return 0;
978 }
979
/* Byte-at-a-time decoder for the mobile variant of ISO-2022-JP (with KDDI
 * emoji). Consumes one input byte `c` and passes zero or more Unicode
 * codepoints (or MBFL_BAD_INPUT for invalid input) to
 * `filter->output_function`.
 *
 * `filter->status` holds the decoder state. The upper bits select the active
 * character set (0 for ASCII, JISX0201_KANA or JISX0208_KANJI), and the low
 * 4 bits track progress through a multi-byte unit:
 *   0 = idle, 1 = second byte of a 2-byte character expected,
 *   2 = after ESC, 3 = after ESC $, 4 = after ESC $ (, 5 = after ESC (
 *
 * Returns 0, or -1 as soon as the output function returns a negative value. */
static int mbfl_filt_conv_2022jp_mobile_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, w, snd = 0;

	switch (filter->status & 0xF) {
	case 0:
		if (c == 0x1B) {
			filter->status += 2;
		} else if (filter->status == JISX0201_KANA && c > 0x20 && c < 0x60) {
			CK((*filter->output_function)(0xFF40 + c, filter->data));
		} else if (filter->status == JISX0208_KANJI && c > 0x20 && c < 0x80) {
			/* First byte of a 2-byte character; remember it and wait for the second */
			filter->cache = c;
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) { /* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xA0 && c < 0xE0) { /* Kana */
			CK((*filter->output_function)(0xFEC0 + c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* JISX 0208, second byte */
	case 1:
		w = 0;
		filter->status &= ~0xF;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7F) {
			/* Position in the 94x94 grid */
			s = ((c1 - 0x21) * 94) + c - 0x21;

			/* A few positions are mapped to fullwidth forms, overriding the tables */
			if (s <= 137) {
				if (s == 31) {
					w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
				} else if (s == 32) {
					w = 0xFF5E; /* FULLWIDTH TILDE */
				} else if (s == 33) {
					w = 0x2225; /* PARALLEL TO */
				} else if (s == 60) {
					w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
				} else if (s == 80) {
					w = 0xFFE0; /* FULLWIDTH CENT SIGN */
				} else if (s == 81) {
					w = 0xFFE1; /* FULLWIDTH POUND SIGN */
				} else if (s == 137) {
					w = 0xFFE2; /* FULLWIDTH NOT SIGN */
				}
			}

			/* Rows 84-90 (counting from zero) hold emoji; they are shifted up
			 * by 22 rows to obtain the code used by the KDDI emoji tables */
			if (s >= (84 * 94) && s < (91 * 94)) {
				s += 22 * 94;
				w = mbfilter_sjis_emoji_kddi2unicode(s, &snd);
				if (w > 0 && snd > 0) {
					/* Two-codepoint emoji; `snd` is the first codepoint.
					 * Propagate output errors like every other output call. */
					CK((*filter->output_function)(snd, filter->data));
				}
			}

			if (w == 0) {
				if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
					w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
				} else if (s >= 0 && s < jisx0208_ucs_table_size) {
					w = jisx0208_ucs_table[s];
				}
			}

			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC */
	case 2:
		if (c == '$') {
			filter->status++;
		} else if (c == '(') {
			filter->status += 3;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ */
	case 3:
		if (c == '@' || c == 'B') {
			filter->status = JISX0208_KANJI;
		} else if (c == '(') {
			filter->status++;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ ( */
	case 4:
		if (c == '@' || c == 'B') {
			filter->status = JISX0208_KANJI;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC ( */
	case 5:
		if (c == 'B' || c == 'J') {
			filter->status = 0; /* ASCII mode */
		} else if (c == 'I') {
			filter->status = JISX0201_KANA;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

	return 0;
}
1101
/* Flush handler for the 2022jp_mobile -> wchar legacy filter.
 *
 * A nonzero low nibble in `status` means the input stopped while the decoder
 * was still inside a two-byte character or an escape sequence; in that case
 * a single MBFL_BAD_INPUT marker is emitted. The state is then cleared and
 * the downstream flush function (if one is set) is called.
 *
 * Return values of the output and flush callbacks are not inspected;
 * this function always returns 0. */
static int mbfl_filt_conv_2022jp_mobile_wchar_flush(mbfl_convert_filter *filter)
{
	const int truncated = (filter->status & 0xF) != 0;

	if (truncated) {
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}

	(*filter->flush_function)(filter->data);
	return 0;
}
1115
/* Try to map Unicode codepoint `c` to a KDDI emoji code.
 *
 * Returns 1 when an emoji code has been stored in `*s1`, otherwise 0.
 * (CK() is defined elsewhere in the project; presumably it returns early
 * when an output call reports failure.)
 *
 * Keypad emoji are two codepoints long ('#' or a digit followed by U+20E3),
 * so a '#' or digit is never converted immediately: it is parked in
 * filter->cache with the low bits of filter->status set to 1, and 0 is
 * returned. On the next call the parked character is either combined with
 * U+20E3 into an emoji, or written out as plain ASCII (preceded by
 * ESC ( B if another charset is currently selected). */
static int mbfilter_unicode2sjis_emoji_kddi(int c, int *s1, mbfl_convert_filter *filter)
{
	if ((filter->status & 0xF) == 1) {
		/* A '#' or digit was held back by the previous call */
		int held = filter->cache;
		filter->cache = 0;
		filter->status &= ~0xFF;

		if (c == 0x20E3) {
			switch (held) {
			case '#':
				*s1 = 0x25BC;
				break;
			case '0':
				*s1 = 0x2830;
				break;
			default: /* Previous character was '1'-'9' */
				*s1 = 0x27A6 + (held - '1');
				break;
			}
			return 1;
		}

		/* Not a keypad emoji after all: emit the held character as ASCII */
		if (filter->status & 0xFF00) {
			CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('B', filter->data));
		}
		CK((*filter->output_function)(held, filter->data));
		filter->status = 0;
	}

	/* Possible first half of a keypad emoji; wait for the next codepoint */
	if (c == '#' || (c >= '0' && c <= '9')) {
		filter->status |= 1;
		filter->cache = c;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x27DC;
		return 1;
	}

	if (c == 0xAE) { /* Registered sign */
		*s1 = 0x27DD;
		return 1;
	}

	/* Only the first table whose range contains `c` is searched */
	if (c >= mb_tbl_uni_kddi2code2_min && c <= mb_tbl_uni_kddi2code2_max) {
		int idx = mbfl_bisec_srch2(c, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
		if (idx < 0) {
			return 0;
		}
		*s1 = mb_tbl_uni_kddi2code2_value[idx];
		return 1;
	}

	if (c >= mb_tbl_uni_kddi2code3_min && c <= mb_tbl_uni_kddi2code3_max) {
		int idx = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
		if (idx < 0) {
			return 0;
		}
		*s1 = mb_tbl_uni_kddi2code3_value[idx];
		return 1;
	}

	if (c >= mb_tbl_uni_kddi2code5_min && c <= mb_tbl_uni_kddi2code5_max) {
		int idx = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
		if (idx < 0) {
			return 0;
		}
		*s1 = mb_tbl_uni_kddi2code5_val[idx];
		return 1;
	}

	return 0;
}
1175
/* (ku*94)+ten value -> two-byte JIS code (each byte is offset by 0x21).
 *
 * c1 and c2 receive the first and second byte, s1 is overwritten with
 * (c1 << 8) | c2, and s2 is set to 1. All four arguments must be
 * assignable lvalues; s1 is read before it is overwritten.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a single
 * statement and stays correct under an unbraced if/else. */
#define CODE2JIS(c1,c2,s1,s2) \
	do { \
		c1 = (s1)/94+0x21; \
		c2 = (s1)-94*((c1)-0x21)+0x21; \
		s1 = ((c1) << 8) | (c2); \
		s2 = 1; \
	} while (0)
1182
/* Legacy (one codepoint per call) wchar -> ISO-2022-JP mobile (KDDI) filter.
 *
 * Lookup order for codepoint `c`:
 *   1. the ucs_*_jis tables,
 *   2. a handful of hard-coded special cases,
 *   3. KDDI emoji via mbfilter_unicode2sjis_emoji_kddi(),
 *   4. cp932ext1_ucs_table (linear search).
 *
 * filter->status & 0xFF00 records which charset was last selected in the
 * output: 0 = ASCII, 0x100 = kana (ESC ( I), 0x200 = ESC $ B. An escape
 * sequence is written only when the required charset differs.
 * The low byte of status together with filter->cache is used by the emoji
 * helper to hold back '#'/digits that may start a keypad emoji.
 *
 * Codepoints with no usable mapping are passed to
 * mbfl_filt_conv_illegal_output(). Returns 0 on the normal path; CK() is
 * defined elsewhere and presumably returns early on output failure. */
static int mbfl_filt_conv_wchar_2022jp_mobile(int c, mbfl_convert_filter *filter)
{
	int c1, c2, s1 = 0, s2 = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (s1 <= 0) {
		if (c == 0xA5) { /* YEN SIGN */
			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
		} else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
		} else if (c == 0x2225) { /* PARALLEL TO */
			s1 = 0x2142;
		} else if (c == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215d;
		} else if (c == 0xFFE0) { /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
		} else if (c == 0xFFE1) { /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
		} else if (c == 0xFFE2) { /* FULLWIDTH NOT SIGN */
			s1 = 0x224c;
		}
	}

	if (mbfilter_unicode2sjis_emoji_kddi(c, &s1, filter) > 0) {
		/* A KDDI emoji was detected and stored in s1 */
		CODE2JIS(c1,c2,s1,s2);
		s1 -= 0x1600;
	} else if ((filter->status & 0xFF) == 1 && filter->cache) {
		/* We are just processing one of KDDI's special emoji for a phone keypad button */
		return 0;
	}

	/* s2 != 0 marks an emoji code, which must not be mistaken for X 0212 */
	if ((s1 <= 0) || (s1 >= 0xa1a1 && s2 == 0)) { /* not found or X 0212 */
		s1 = -1;
		for (c1 = 0; c1 < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; c1++) {
			if (c == cp932ext1_ucs_table[c1]) {
				s1 = (((c1 / 94) + 0x2D) << 8) + (c1 % 94) + 0x21;
				break;
			}
		}

		/* U+0000 maps to a NUL byte rather than being an error */
		if (c == 0) {
			s1 = 0;
		}
	}

	if (s1 >= 0) {
		if (s1 < 0x80) { /* ASCII */
			if (filter->status & 0xFF00) {
				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('B', filter->data));
			}
			CK((*filter->output_function)(s1, filter->data));
			filter->status = 0;
		} else if (s1 > 0xA0 && s1 < 0xE0) { /* Kana */
			if ((filter->status & 0xFF00) != 0x100) {
				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('I', filter->data));
			}
			filter->status = 0x100;
			CK((*filter->output_function)(s1 & 0x7F, filter->data));
		} else if (s1 < 0x7E7F) { /* JIS X 0208 */
			if ((filter->status & 0xFF00) != 0x200) {
				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
				CK((*filter->output_function)('$', filter->data));
				CK((*filter->output_function)('B', filter->data));
			}
			filter->status = 0x200;
			CK((*filter->output_function)((s1 >> 8) & 0xFF, filter->data));
			CK((*filter->output_function)(s1 & 0x7F, filter->data));
		} else {
			/* Code lies beyond the two-byte range this encoding can carry:
			 * report it instead of silently dropping the character */
			CK(mbfl_filt_conv_illegal_output(c, filter));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
1271
/* Flush handler for the wchar -> 2022jp_mobile legacy filter.
 *
 * Writes ESC ( B if a non-ASCII charset is currently selected, then emits
 * a '#' or digit that was being held back as a possible keypad-emoji
 * prefix, resets the filter state and calls the downstream flush function
 * (if any). Callback return values are not inspected; always returns 0. */
static int mbfl_filt_conv_wchar_2022jp_mobile_flush(mbfl_convert_filter *filter)
{
	/* Go back to ASCII mode (so strings can be safely concatenated) */
	if ((filter->status & 0xFF00) != 0) {
		(*filter->output_function)(0x1B, filter->data); /* ESC */
		(*filter->output_function)('(', filter->data);
		(*filter->output_function)('B', filter->data);
	}

	int held = filter->cache;
	int held_is_keypad_char = (held == '#') || (held >= '0' && held <= '9');

	if ((filter->status & 0xFF) == 1 && held_is_keypad_char) {
		/* The expected U+20E3 never arrived; output the plain character */
		(*filter->output_function)(held, filter->data);
	}
	filter->cache = 0;
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}

	(*filter->flush_function)(filter->data);
	return 0;
}
1293
mb_iso2022jp_kddi_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)1294 static size_t mb_iso2022jp_kddi_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
1295 {
1296 unsigned char *p = *in, *e = p + *in_len;
1297 uint32_t *out = buf, *limit = buf + bufsize - 1;
1298
1299 while (p < e && out < limit) {
1300 unsigned char c = *p++;
1301
1302 if (c == 0x1B) {
1303 if ((e - p) < 2) {
1304 p = e;
1305 *out++ = MBFL_BAD_INPUT;
1306 break;
1307 }
1308 unsigned char c2 = *p++;
1309 unsigned char c3 = *p++;
1310
1311 if (c2 == '$') {
1312 if (c3 == '@' || c3 == 'B') {
1313 *state = JISX0208_KANJI;
1314 } else if (c3 == '(') {
1315 if (p == e) {
1316 *out++ = MBFL_BAD_INPUT;
1317 break;
1318 }
1319 unsigned char c4 = *p++;
1320
1321 if (c4 == '@' || c4 == 'B') {
1322 *state = JISX0208_KANJI;
1323 } else {
1324 *out++ = MBFL_BAD_INPUT;
1325 }
1326 } else {
1327 *out++ = MBFL_BAD_INPUT;
1328 }
1329 } else if (c2 == '(') {
1330 if (c3 == 'B' || c3 == 'J') {
1331 *state = ASCII;
1332 } else if (c3 == 'I') {
1333 *state = JISX0201_KANA;
1334 } else {
1335 *out++ = MBFL_BAD_INPUT;
1336 }
1337 } else {
1338 p--;
1339 *out++ = MBFL_BAD_INPUT;
1340 }
1341 } else if (*state == JISX0201_KANA && c >= 0x21 && c <= 0x5F) {
1342 *out++ = 0xFF40 + c;
1343 } else if (*state == JISX0208_KANJI && c >= 0x21 && c <= 0x7F) {
1344 if (p == e) {
1345 *out++ = MBFL_BAD_INPUT;
1346 break;
1347 }
1348 unsigned char c2 = *p++;
1349
1350 if (c2 >= 0x21 && c2 <= 0x7E) {
1351 unsigned int s = ((c - 0x21) * 94) + c2 - 0x21;
1352 uint32_t w = 0;
1353
1354 if (s <= 137) {
1355 if (s == 31) {
1356 w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
1357 } else if (s == 32) {
1358 w = 0xFF5E; /* FULLWIDTH TILDE */
1359 } else if (s == 33) {
1360 w = 0x2225; /* PARALLEL TO */
1361 } else if (s == 60) {
1362 w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
1363 } else if (s == 80) {
1364 w = 0xFFE0; /* FULLWIDTH CENT SIGN */
1365 } else if (s == 81) {
1366 w = 0xFFE1; /* FULLWIDTH POUND SIGN */
1367 } else if (s == 137) {
1368 w = 0xFFE2; /* FULLWIDTH NOT SIGN */
1369 }
1370 }
1371
1372 if (s >= (84 * 94) && s < (91 * 94)) {
1373 int snd = 0;
1374 s += 22 * 94;
1375 w = mbfilter_sjis_emoji_kddi2unicode(s, &snd);
1376 if (w && snd) {
1377 *out++ = snd;
1378 }
1379 }
1380
1381 if (!w) {
1382 if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
1383 w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
1384 } else if (s < jisx0208_ucs_table_size) {
1385 w = jisx0208_ucs_table[s];
1386 }
1387 }
1388
1389 *out++ = w ? w : MBFL_BAD_INPUT;
1390 } else {
1391 *out++ = MBFL_BAD_INPUT;
1392 }
1393 } else if (c <= 0x7F) {
1394 *out++ = c;
1395 } else if (c >= 0xA1 && c <= 0xDF) {
1396 *out++ = 0xFEC0 + c;
1397 } else {
1398 *out++ = MBFL_BAD_INPUT;
1399 }
1400 }
1401
1402 *in_len = e - p;
1403 *in = p;
1404 return out - buf;
1405 }
1406
/* Encode `len` codepoints from `in` as ISO-2022-JP (KDDI mobile variant),
 * appending the bytes to `buf`.
 *
 * buf->state tracks the charset last selected in the output (ASCII,
 * JISX0201_KANA or JISX0208_KANJI); an escape sequence is written only when
 * the charset has to change. When `end` is true and the output is not in
 * ASCII mode, a final ESC ( B is appended.
 *
 * Codepoints with no mapping are handed to MB_CONVERT_ERROR.
 * NOTE(review): the MB_CONVERT_* macros are defined elsewhere; they
 * presumably manage buffer growth and error reporting - confirm there. */
static void mb_wchar_to_iso2022jp_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		/* Generic Unicode -> JIS tables */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		/* Hard-coded mappings for codepoints the tables did not resolve */
		if (!s) {
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* Two-codepoint emoji: peek at the next codepoint, and un-read it
		 * (in--; len++) if the pair turns out not to be an emoji */
		if ((w == '#' || (w >= '0' && w <= '9')) && len) {
			uint32_t w2 = *in++; len--;

			if (w2 == 0x20E3) {
				/* '#' or digit followed by U+20E3: keypad emoji */
				unsigned int s1 = 0;
				if (w == '#') {
					s1 = 0x25BC;
				} else if (w == '0') {
					s1 = 0x2830;
				} else { /* Previous character was '1'-'9' */
					s1 = 0x27A6 + (w - '1');
				}
				/* Same arithmetic as CODE2JIS followed by the 0x1600 adjustment */
				s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
			} else {
				in--; len++;
			}
		} else if (w >= NFLAGS('C') && w <= NFLAGS('U') && len) { /* C for CN, U for US */
			uint32_t w2 = *in++; len--;

			if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */
				/* Look for the pair among the 10 supported country codes */
				for (int i = 0; i < 10; i++) {
					if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) {
						unsigned int s1 = nflags_code_kddi[i];
						s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
						/* Skip the un-read below: both codepoints are consumed */
						goto found_flag_emoji;
					}
				}
			}

			in--; len++;
found_flag_emoji: ;
		}

		/* Single-codepoint KDDI emoji */
		if (w == 0xA9) { /* Copyright sign */
			unsigned int s1 = 0x27DC;
			s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
		} else if (w == 0xAE) { /* Registered sign */
			unsigned int s1 = 0x27DD;
			s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
		} else if (w >= mb_tbl_uni_kddi2code2_min && w <= mb_tbl_uni_kddi2code2_max) {
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
			if (i >= 0) {
				unsigned int s1 = mb_tbl_uni_kddi2code2_value[i];
				s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
			}
		} else if (w >= mb_tbl_uni_kddi2code3_min && w <= mb_tbl_uni_kddi2code3_max) {
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
			if (i >= 0) {
				unsigned int s1 = mb_tbl_uni_kddi2code3_value[i];
				s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
			}
		} else if (w >= mb_tbl_uni_kddi2code5_min && w <= mb_tbl_uni_kddi2code5_max) {
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
			if (i >= 0) {
				unsigned int s1 = mb_tbl_uni_kddi2code5_val[i];
				s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
			}
		}

		/* Nothing found, or the code is >= 0xA1A1 and therefore not usable
		 * here: fall back to a linear search of the CP932 extension table */
		if (!s || s >= 0xA1A1) {
			s = 0;
			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					break;
				}
			}
			if (w == 0)
				s = 0;
		}

		if (!s && w) {
			/* No mapping at all (s == 0 is only valid for U+0000) */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0x7F) {
			/* Single ASCII byte */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA1 && s <= 0xDF) {
			/* Kana: written as a 7-bit byte after ESC ( I */
			if (buf->state != JISX0201_KANA) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX0201_KANA;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else if (s <= 0x7E7E) {
			/* Two-byte code after ESC $ B */
			if (buf->state != JISX0208_KANJI) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX0208_KANJI;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			/* Code is outside every range this encoding can represent */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	/* Finish in ASCII mode at the end of the input */
	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1555
/* Legacy byte-at-a-time decoder shared by EUC-JP-2004, Shift_JIS-2004 and
 * ISO-2022-JP-2004; the variant is chosen by filter->from->no_encoding.
 *
 * `c` is the next input byte. The low nibble of filter->status is the
 * position inside a multi-byte character or escape sequence (the `case`
 * labels below). For ISO-2022-JP-2004 the remaining bits hold the selected
 * charset: 0x80 = JIS X 0208, 0x90 = JIS X 0213 plane 1,
 * 0xa0 = JIS X 0213 plane 2, 0 = ASCII. filter->cache holds the first byte
 * of a two-byte character.
 *
 * Decoded codepoints (or MBFL_BAD_INPUT) are passed to
 * filter->output_function. Returns 0 on the normal path; CK() is defined
 * elsewhere and presumably returns early on output failure. */
static int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter)
{
	int k;
	int c1, c2, s, s1 = 0, s2 = 0, w = 0, w1;

	switch (filter->status & 0xf) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
				CK((*filter->output_function)(c, filter->data));
			} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
				/* In Shift_JIS-2004, 0x5c and 0x7e decode to yen sign and overline */
				if (c == 0x5c) {
					CK((*filter->output_function)(0x00a5, filter->data));
				} else if (c == 0x7e) {
					CK((*filter->output_function)(0x203e, filter->data));
				} else {
					CK((*filter->output_function)(c, filter->data));
				}
			} else { /* ISO-2022-JP-2004 */
				if (c == 0x1b) {
					filter->status += 6;
				} else if ((filter->status == 0x80 || filter->status == 0x90 || filter->status == 0xa0)
						&& c > 0x20 && c < 0x7f) { /* kanji first char */
					filter->cache = c;
					if (filter->status == 0x90) {
						filter->status += 1; /* JIS X 0213 plane 1 */
					} else if (filter->status == 0xa0) {
						filter->status += 4; /* JIS X 0213 plane 2 */
					} else {
						filter->status += 5; /* JIS X 0208 */
					}
				} else {
					CK((*filter->output_function)(c, filter->data));
				}
			}
		} else {
			if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
				if (c > 0xa0 && c < 0xff) { /* X 0213 plane 1 first char */
					filter->status = 1;
					filter->cache = c;
				} else if (c == 0x8e) { /* kana first char */
					filter->cache = 0x8E; /* So error will be reported if input is truncated right here */
					filter->status = 2;
				} else if (c == 0x8f) { /* X 0213 plane 2 first char */
					filter->status = 3;
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				}
			} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
				if (c > 0xa0 && c < 0xe0) { /* kana */
					CK((*filter->output_function)(0xfec0 + c, filter->data));
				} else if (c > 0x80 && c < 0xfd && c != 0xa0) { /* kanji first char */
					filter->status = 1;
					filter->cache = c;
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				}
			} else {
				/* ISO-2022-JP-2004 is a 7-bit encoding: bytes >= 0x80 are invalid */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;

	case 1: /* kanji second char */
		filter->status &= ~0xf;
		c1 = filter->cache;

		/* Normalize both bytes to 7-bit values in s1/s2 */
		if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
			if (c > 0xa0 && c < 0xff) {
				s1 = c1 - 0x80;
				s2 = c - 0x80;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
			if (c >= 0x40 && c <= 0xfc && c != 0x7f) {
				SJIS_DECODE(c1, c, s1, s2);
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		} else { /* ISO-2022-JP-2004 */
			if (c >= 0x21 && c <= 0x7E) {
				s1 = c1;
				s2 = c;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		}
		w1 = (s1 << 8) | s2;

		/* conversion for combining characters */
		/* These codes decode to two codepoints: the first is output here,
		 * the second is left in `w` and output at the end of this case */
		if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) ||
			(w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 ||
			(w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
			k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
			if (k >= 0) {
				w = jisx0213_u2_tbl[2*k];
				CK((*filter->output_function)(w, filter->data));
				w = jisx0213_u2_tbl[2*k+1];
			}
		}

		/* conversion for BMP */
		if (w <= 0) {
			/* From here on, w1 is the linear (row*94 + cell) table index */
			w1 = (s1 - 0x21)*94 + s2 - 0x21;
			if (w1 >= 0 && w1 < jisx0213_ucs_table_size) {
				w = jisx0213_ucs_table[w1];
			}
		}

		/* conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
		if (w <= 0) {
			k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				w = jisx0213_jis_u5_tbl[k] + 0x20000;
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2: /* got 0x8e: EUC-JP-2004 kana */
		filter->status = 0;
		if (c > 0xa0 && c < 0xe0) {
			w = 0xfec0 + c;
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 3: /* X 0213 plane 2 first char: EUC-JP-2004 (0x8f) */
		/* Only these first bytes are accepted for plane 2 */
		if (c == 0xA1 || (c >= 0xA3 && c <= 0xA5) || c == 0xA8 || (c >= 0xAC && c <= 0xAF) || (c >= 0xEE && c <= 0xFE)) {
			filter->cache = c - 0x80;
			filter->status++;
		} else {
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 4: /* X 0213 plane 2 second char: EUC-JP-2004, ISO-2022-JP-2004 */
		filter->status &= ~0xF;
		c1 = filter->cache;
		if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
			c2 = c - 0x80;
		} else {
			c2 = c;
		}

		if (c2 < 0x21 || c2 > 0x7E) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		/* Zero-based row and cell */
		s1 = c1 - 0x21;
		s2 = c2 - 0x21;

		/* Same set of rows as accepted in case 3, expressed zero-based */
		if (((s1 >= 0 && s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) ||
			(s1 >= 77 && s1 < 94)) && s2 >= 0 && s2 < 94) {
			/* calc offset from ku */
			for (k = 0; k < jisx0213_p2_ofst_len; k++) {
				if (s1 == jisx0213_p2_ofst[k]) {
					break;
				}
			}
			/* NOTE(review): assumes the loop above always finds s1 in
			 * jisx0213_p2_ofst; otherwise k == jisx0213_p2_ofst_len here */
			k -= jisx0213_p2_ofst[k];

			/* check for japanese chars in BMP */
			s = (s1 + 94 + k)*94 + s2;
			ZEND_ASSERT(s < jisx0213_ucs_table_size);
			w = jisx0213_ucs_table[s];

			/* check for japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
			if (w <= 0) {
				k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
				if (k >= 0) {
					w = jisx0213_jis_u5_tbl[k] + 0x20000;
				}
			}

			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}

			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 5: /* X 0208: ISO-2022-JP-2004 */
		filter->status &= ~0xf;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7f) {
			s = (c1 - 0x21)*94 + c - 0x21;
			if (s >= 0 && s < jisx0208_ucs_table_size) {
				w = jisx0208_ucs_table[s];
			}
		}

		/* w is still 0 if the second byte was out of range or unmapped */
		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}

		CK((*filter->output_function)(w, filter->data));
		break;

	/* ESC: ISO-2022-JP-2004 */
	/* case 0x06: */
	/* case 0x16: */
	/* case 0x26: */
	/* case 0x86: */
	/* case 0x96: */
	/* case 0xa6: */
	case 6:
		if (c == '$') {
			filter->status++;
		} else if (c == '(') {
			filter->status += 3;
		} else {
			/* Invalid escape: drop back to the previously selected charset */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $: ISO-2022-JP-2004 */
	/* case 0x07: */
	/* case 0x17: */
	/* case 0x27: */
	/* case 0x87: */
	/* case 0x97: */
	/* case 0xa7: */
	case 7:
		if (c == 'B') { /* JIS X 0208-1983 */
			filter->status = 0x80;
		} else if (c == '(') {
			filter->status++;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ (: ISO-2022-JP-2004 */
	/* case 0x08: */
	/* case 0x18: */
	/* case 0x28: */
	/* case 0x88: */
	/* case 0x98: */
	/* case 0xa8: */
	case 8:
		if (c == 'Q') { /* JIS X 0213 plane 1 */
			filter->status = 0x90;
		} else if (c == 'P') { /* JIS X 0213 plane 2 */
			filter->status = 0xa0;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC (: ISO-2022-JP-2004 */
	/* case 0x09: */
	/* case 0x19: */
	/* case 0x29: */
	/* case 0x89: */
	/* case 0x99: */
	case 9:
		if (c == 'B') {
			/* ESC ( B: back to ASCII */
			filter->status = 0;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
1844
/* Flush handler for the JIS-2004 family -> wchar legacy decoder.
 *
 * If the low nibble of `status` is nonzero, the input ended inside a
 * multi-byte character or escape sequence, so MBFL_BAD_INPUT is emitted.
 * The state is then reset. Returns the result of the downstream flush
 * function when one is set, otherwise 0. */
static int mbfl_filt_conv_jis2004_wchar_flush(mbfl_convert_filter *filter)
{
	if ((filter->status & 0xF) != 0) {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	return filter->flush_function ? (*filter->flush_function)(filter->data) : 0;
}
1858
/* Legacy (one codepoint per call) encoder shared by EUC-JP-2004,
 * Shift_JIS-2004 and ISO-2022-JP-2004; the variant is chosen by
 * filter->to->no_encoding.
 *
 * Some characters are encoded from a two-codepoint sequence. When `c` may be
 * the first half of such a pair, its table index is parked in filter->cache
 * with the low nibble of filter->status set to 1, and nothing is output
 * until the next call. If the next codepoint does not complete the pair,
 * the fallback encoding of the parked codepoint is output and `c` is then
 * processed normally (`goto retry`).
 *
 * For ISO-2022-JP-2004, filter->status & 0xff00 records the charset last
 * designated in the output:
 *   0      ASCII
 *   0x200  JIS X 0213 plane 1 (ESC $ ( Q)
 *   0x300  JIS X 0213 plane 2 (ESC $ ( P)
 * The two planes need distinct values: with a shared value, switching from
 * one plane to the other would not emit the new designation and the bytes
 * would be decoded in the wrong plane.
 *
 * Unmappable codepoints go to mbfl_filt_conv_illegal_output(). Returns 0 on
 * the normal path; CK() is defined elsewhere and presumably returns early on
 * output failure. */
static int mbfl_filt_conv_wchar_jis2004(int c, mbfl_convert_filter *filter)
{
	int k;
	int c1, c2, s1, s2;

retry:
	s1 = 0;
	/* check for 1st char of combining characters */
	if ((filter->status & 0xf) == 0 && (
			c == 0x00E6 ||
			(c >= 0x0254 && c <= 0x02E9) ||
			(c >= 0x304B && c <= 0x3053) ||
			(c >= 0x30AB && c <= 0x30C8) ||
			c == 0x31F7)) {
		for (k = 0; k < jisx0213_u2_tbl_len; k++) {
			if (c == jisx0213_u2_tbl[2*k]) {
				filter->status++;
				filter->cache = k;
				return 0;
			}
		}
	}

	/* check for 2nd char of combining characters */
	if ((filter->status & 0xf) == 1 && filter->cache >= 0 && filter->cache < jisx0213_u2_tbl_len) {
		k = filter->cache;
		filter->status &= ~0xf;
		filter->cache = 0;

		c1 = jisx0213_u2_tbl[2*k];
		/* These first codepoints have two table entries; the entry for
		 * a following U+0301 is the next one */
		if ((c1 == 0x0254 || c1 == 0x028C || c1 == 0x0259 || c1 == 0x025A) && c == 0x0301) {
			k++;
		}
		if (c == jisx0213_u2_tbl[2*k+1]) {
			s1 = jisx0213_u2_key[k];
		} else { /* fallback */
			s1 = jisx0213_u2_fb_tbl[k];

			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
			} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
				s2 = (s1 & 0xff) + 0x80;
				s1 = ((s1 >> 8) & 0xff) + 0x80;
			} else {
				/* The fallback code is written in plane 1 */
				if (filter->status != 0x200) {
					CK((*filter->output_function)(0x1b, filter->data));
					CK((*filter->output_function)('$', filter->data));
					CK((*filter->output_function)('(', filter->data));
					CK((*filter->output_function)('Q', filter->data));
				}
				filter->status = 0x200;

				s2 = s1 & 0x7f;
				s1 = (s1 >> 8) & 0x7f;
			}

			/* Flush out cached data */
			CK((*filter->output_function)(s1, filter->data));
			CK((*filter->output_function)(s2, filter->data));
			goto retry;
		}
	}

	/* check for major japanese chars: U+4E00 - U+9FFF */
	if (s1 <= 0) {
		for (k = 0; k < uni2jis_tbl_len; k++) {
			if (c >= uni2jis_tbl_range[k][0] && c <= uni2jis_tbl_range[k][1]) {
				s1 = uni2jis_tbl[k][c-uni2jis_tbl_range[k][0]];
				break;
			}
		}
	}

	/* check for japanese chars in compressed mapping area: U+1E00 - U+4DBF */
	if (s1 <= 0 && c >= ucs_c1_jisx0213_min && c <= ucs_c1_jisx0213_max) {
		k = mbfl_bisec_srch(c, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
		if (k >= 0) {
			s1 = ucs_c1_jisx0213_ofst[k] + c - ucs_c1_jisx0213_tbl[2*k];
		}
	}

	/* check for japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
	if (s1 <= 0 && c >= jisx0213_u5_tbl_min && c <= jisx0213_u5_tbl_max) {
		k = mbfl_bisec_srch2(c - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
		if (k >= 0) {
			s1 = jisx0213_u5_jis_tbl[k];
		}
	}

	if (s1 <= 0) {
		/* CJK Compatibility Forms: U+FE30 - U+FE4F */
		if (c == 0xfe45) {
			s1 = 0x233e;
		} else if (c == 0xfe46) {
			s1 = 0x233d;
		} else if (c >= 0xf91d && c <= 0xf9dc) {
			/* CJK Compatibility Ideographs: U+F900 - U+F92A */
			k = mbfl_bisec_srch2(c, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
			if (k >= 0) {
				s1 = ucs_r2b_jisx0213_cmap_val[k];
			}
		}
	}

	/* Only U+0000 may legitimately encode as 0; anything else is unmapped */
	if (s1 <= 0) {
		if (c == 0) {
			s1 = 0;
		} else {
			s1 = -1;
		}
	}

	if (s1 >= 0) {
		if (s1 < 0x80) { /* ASCII */
			if (filter->to->no_encoding == mbfl_no_encoding_2022jp_2004 && (filter->status & 0xff00)) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('B', filter->data));
			}
			filter->status = 0;
			CK((*filter->output_function)(s1, filter->data));
		} else if (s1 < 0x100) { /* latin or kana */
			if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
				CK((*filter->output_function)(0x8e, filter->data));
				CK((*filter->output_function)(s1, filter->data));
			} else if (filter->to->no_encoding == mbfl_no_encoding_sjis2004 && (s1 >= 0xA1 && s1 <= 0xDF)) {
				CK((*filter->output_function)(s1, filter->data));
			} else {
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		} else if (s1 < 0x7f00) { /* X 0213 plane 1 */
			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
			} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
				s2 = (s1 & 0xff) + 0x80;
				s1 = ((s1 >> 8) & 0xff) + 0x80;
			} else {
				if ((filter->status & 0xff00) != 0x200) {
					CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
					CK((*filter->output_function)('$', filter->data));
					CK((*filter->output_function)('(', filter->data));
					CK((*filter->output_function)('Q', filter->data));
				}
				filter->status = 0x200;
				s2 = s1 & 0xff;
				s1 = (s1 >> 8) & 0xff;
			}
			CK((*filter->output_function)(s1, filter->data));
			CK((*filter->output_function)(s2, filter->data));
		} else { /* X 0213 plane 2 */
			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
			} else {
				s2 = s1 & 0xff;
				/* Translate the internal row number back to the real row */
				k = ((s1 >> 8) & 0xff) - 0x7f;
				if (k >= 0 && k < jisx0213_p2_ofst_len) {
					s1 = jisx0213_p2_ofst[k] + 0x21;
				}
				if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
					s2 |= 0x80;
					s1 |= 0x80;
					CK((*filter->output_function)(0x8f, filter->data));
				} else {
					/* Plane 2 has its own state value so that a later
					 * plane 1 character re-designates with ESC $ ( Q */
					if ((filter->status & 0xff00) != 0x300) {
						CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
						CK((*filter->output_function)('$', filter->data));
						CK((*filter->output_function)('(', filter->data));
						CK((*filter->output_function)('P', filter->data));
					}
					filter->status = 0x300;
				}
			}

			CK((*filter->output_function)(s1, filter->data));
			CK((*filter->output_function)(s2, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
2047
/* Flush handler for the wchar -> JIS-2004 family legacy encoder.
 *
 * If a codepoint is still parked in filter->cache as the possible first
 * half of a two-codepoint character (low nibble of status == 1), its
 * fallback encoding is written out. Then, if any charset other than ASCII is
 * selected, ESC ( B is emitted so that converted strings can be
 * concatenated safely. Finally the state is reset.
 *
 * Returns the result of the downstream flush function when one is set,
 * otherwise 0. CK() is defined elsewhere and presumably returns early on
 * output failure. */
static int mbfl_filt_conv_wchar_jis2004_flush(mbfl_convert_filter *filter)
{
	int k, c1, c2, s1, s2;

	k = filter->cache;
	filter->cache = 0;

	/* Test only the low nibble: for ISO-2022-JP-2004 the upper bits hold the
	 * active charset, so a pending character can appear as e.g. 0x201.
	 * `k` must be a valid index, hence `<` (same bound as in the encoder). */
	if ((filter->status & 0xf) == 1 && k >= 0 && k < jisx0213_u2_tbl_len) {
		s1 = jisx0213_u2_fb_tbl[k];

		if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
			c1 = (s1 >> 8) & 0xff;
			c2 = s1 & 0xff;
			SJIS_ENCODE(c1, c2, s1, s2);
		} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
			s2 = (s1 & 0xff) | 0x80;
			s1 = ((s1 >> 8) & 0xff) | 0x80;
		} else {
			s2 = s1 & 0x7f;
			s1 = (s1 >> 8) & 0x7f;
			/* The fallback code is written in plane 1; designate it
			 * unless it is already the active charset */
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)('$', filter->data));
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('Q', filter->data));
			}
			filter->status = 0x200;
		}

		CK((*filter->output_function)(s1, filter->data));
		CK((*filter->output_function)(s2, filter->data));
	}

	/* If we had switched to a different charset, go back to ASCII mode
	 * This makes it possible to concatenate arbitrary valid strings
	 * together and get a valid string */
	if (filter->status & 0xff00) {
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}

	filter->status = 0;

	if (filter->flush_function) {
		return (*filter->flush_function)(filter->data);
	}

	return 0;
}
2098
/* Character set selectors stored in `*state` / the low byte of `buf->state`
 * by the ISO-2022-JP-2004 conversion functions in this file */
#define ASCII 0
#define JISX0208 1
#define JISX0213_PLANE1 2
#define JISX0213_PLANE2 3
2103
/* Decode ISO-2022-JP-2004 bytes into Unicode codepoints.
 *
 * in      - in/out: pointer to the next unread input byte; advanced past
 *           everything consumed
 * in_len  - in/out: number of input bytes remaining
 * buf     - output array for codepoints
 * bufsize - capacity of `buf` in codepoints
 * state   - in/out: currently selected character set (ASCII, JISX0208,
 *           JISX0213_PLANE1 or JISX0213_PLANE2)
 *
 * Invalid or truncated sequences produce MBFL_BAD_INPUT in the output.
 * Returns the number of codepoints written to `buf`. */
static size_t mb_iso2022jp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	unsigned char *p = *in, *e = p + *in_len;
	/* A single iteration can store two codepoints (see the combining character
	 * case below), so the loop stops while one slot is still free */
	uint32_t *out = buf, *limit = buf + bufsize - 1;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c <= 0x7F) {
			if (c == 0x1B) {
				/* Escape sequence; at least two more bytes are required */
				if ((e - p) < 2) {
					*out++ = MBFL_BAD_INPUT;
					p = e;
					break;
				}
				unsigned char c2 = *p++;
				unsigned char c3 = *p++;
				if (c2 == '$') {
					if (c3 == 'B') {
						/* ESC $ B */
						*state = JISX0208;
					} else if (c3 == '(') {
						/* ESC $ ( needs one more byte to pick the plane */
						if (p == e) {
							*out++ = MBFL_BAD_INPUT;
							break;
						}
						unsigned char c4 = *p++;
						if (c4 == 'Q') {
							*state = JISX0213_PLANE1;
						} else if (c4 == 'P') {
							*state = JISX0213_PLANE2;
						} else {
							*out++ = MBFL_BAD_INPUT;
						}
					} else {
						*out++ = MBFL_BAD_INPUT;
					}
				} else if (c2 == '(') {
					if (c3 == 'B') {
						/* ESC ( B */
						*state = ASCII;
					} else {
						*out++ = MBFL_BAD_INPUT;
					}
				} else {
					/* Unknown escape: un-read the third byte so that it is
					 * processed again on the next iteration */
					p--;
					*out++ = MBFL_BAD_INPUT;
				}
			} else if (*state >= JISX0208 && c > 0x20 && c < 0x7F) {
				/* First byte of a double-byte character */
				if (p == e) {
					*out++ = MBFL_BAD_INPUT;
					break;
				}
				unsigned char c2 = *p++;
				if (c2 < 0x21 || c2 > 0x7E) {
					*out++ = MBFL_BAD_INPUT;
					continue;
				}

				if (*state == JISX0213_PLANE1) {
					unsigned int w1 = (c << 8) | c2;

					/* Conversion for combining characters */
					if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
						int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
						if (k >= 0) {
							/* One JIS code expands to a pair of codepoints */
							*out++ = jisx0213_u2_tbl[2*k];
							*out++ = jisx0213_u2_tbl[2*k+1];
							continue;
						}
					}

					/* Conversion for BMP */
					uint32_t w = 0;
					/* From here on `w1` is the linear index (row*94 + column) */
					w1 = (c - 0x21)*94 + c2 - 0x21;
					if (w1 < jisx0213_ucs_table_size) {
						w = jisx0213_ucs_table[w1];
					}

					/* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
					if (!w) {
						int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
						if (k >= 0) {
							w = jisx0213_jis_u5_tbl[k] + 0x20000;
						}
					}

					*out++ = w ? w : MBFL_BAD_INPUT;
				} else if (*state == JISX0213_PLANE2) {

					unsigned int s1 = c - 0x21, s2 = c2 - 0x21;

					/* Only these (zero-based) rows are accepted for plane 2 */
					if (((s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) || (s1 >= 77 && s1 < 94)) && s2 < 94) {
						int k;
						/* Locate the row in `jisx0213_p2_ofst` */
						for (k = 0; k < jisx0213_p2_ofst_len; k++) {
							if (s1 == jisx0213_p2_ofst[k]) {
								break;
							}
						}
						/* `k` becomes (position in offset table) - (row number), which
						 * added to `s1` below yields the position in the offset table */
						k -= jisx0213_p2_ofst[k];

						/* Check for Japanese chars in BMP */
						unsigned int s = (s1 + 94 + k)*94 + s2;
						ZEND_ASSERT(s < jisx0213_ucs_table_size);
						uint32_t w = jisx0213_ucs_table[s];

						/* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
						if (!w) {
							k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
							if (k >= 0) {
								w = jisx0213_jis_u5_tbl[k] + 0x20000;
							}
						}

						*out++ = w ? w : MBFL_BAD_INPUT;
					} else {
						*out++ = MBFL_BAD_INPUT;
					}
				} else { /* state == JISX0208 */
					unsigned int s = (c - 0x21)*94 + c2 - 0x21;
					uint32_t w = 0;
					if (s < jisx0208_ucs_table_size) {
						w = jisx0208_ucs_table[s];
					}
					*out++ = w ? w : MBFL_BAD_INPUT;
				}
			} else {
				/* ASCII state, or a byte outside 0x21-0x7E: passed through as-is */
				*out++ = c;
			}
		} else {
			/* Bytes with the high bit set are never valid here */
			*out++ = MBFL_BAD_INPUT;
		}
	}

	*in_len = e - p;
	*in = p;
	return out - buf;
}
2240
/* Encode Unicode codepoints as ISO-2022-JP-2004.
 *
 * in  - input codepoints
 * len - number of input codepoints
 * buf - output buffer; `buf->state` carries the currently selected character
 *       set in its low byte (ASCII, JISX0213_PLANE1, JISX0213_PLANE2) and, in
 *       the bits above that, (index + 1) of a codepoint from `jisx0213_u2_tbl`
 *       which was held back at the end of the previous call
 * end - true if this is the last chunk of input
 *
 * Codepoints with no mapping are reported through MB_CONVERT_ERROR.
 * NOTE(review): the MB_CONVERT_BUF_* macros are defined elsewhere; they
 * presumably load/store the write position and grow the buffer -- confirm. */
static void mb_wchar_to_iso2022jp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;
	if (buf->state & 0xFF00) {
		/* Resume with the codepoint which was held back by the previous call */
		int k = (buf->state >> 8) - 1;
		w = jisx0213_u2_tbl[2*k];
		buf->state &= 0xFF;
		goto process_codepoint;
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		unsigned int s = 0;

		/* Codepoints which may be the first half of a two-codepoint sequence
		 * that maps to a single JIS code */
		if (w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7) {
			for (int k = 0; k < jisx0213_u2_tbl_len; k++) {
				if (w == jisx0213_u2_tbl[2*k]) {
					if (!len) {
						if (!end) {
							/* The next codepoint is not available yet; remember this
							 * one in `buf->state` and wait for more input */
							buf->state |= (k+1) << 8;
							MB_CONVERT_BUF_STORE(buf, out, limit);
							return;
						}
					} else {
						uint32_t w2 = *in++; len--;
						/* These base codepoints have two table entries; with U+0301
						 * following, the second entry is the one to test */
						if ((w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A) && w2 == 0x301) {
							k++;
						}
						if (w2 == jisx0213_u2_tbl[2*k+1]) {
							s = jisx0213_u2_key[k];
							break;
						}
						/* No match: un-read the second codepoint */
						in--; len++;
					}

					/* Use the code for the first codepoint on its own */
					s = jisx0213_u2_fb_tbl[k];
					break;
				}
			}
		}

		/* Check for major Japanese chars: U+4E00-U+9FFF */
		if (!s) {
			for (int k = 0; k < uni2jis_tbl_len; k++) {
				if (w >= uni2jis_tbl_range[k][0] && w <= uni2jis_tbl_range[k][1]) {
					s = uni2jis_tbl[k][w - uni2jis_tbl_range[k][0]];
					break;
				}
			}
		}

		/* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */
		if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) {
			int k = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (k >= 0) {
				s = ucs_c1_jisx0213_ofst[k] + w - ucs_c1_jisx0213_tbl[2*k];
			}
		}

		/* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
		if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) {
			int k = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				s = jisx0213_u5_jis_tbl[k];
			}
		}

		if (!s) {
			/* CJK Compatibility Forms: U+FE30-U+FE4F */
			if (w == 0xFE45) {
				s = 0x233E;
			} else if (w == 0xFE46) {
				s = 0x233D;
			} else if (w >= 0xF91D && w <= 0xF9DC) {
				/* CJK Compatibility Ideographs: U+F900-U+F92A */
				int k = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (k >= 0) {
					s = ucs_r2b_jisx0213_cmap_val[k];
				}
			}
		}

		if (!s && w) {
			/* No mapping found (U+0000 legitimately maps to zero) */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0x7F) {
			/* Single byte, emitted in ASCII mode */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s <= 0xFF) {
			/* 0x80-0xFF cannot be emitted; treated as a conversion error */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0x7EFF) {
			/* Two bytes, emitted after ESC $ ( Q */
			if (buf->state != JISX0213_PLANE1) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'Q');
				buf->state = JISX0213_PLANE1;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			/* Two bytes, emitted after ESC $ ( P */
			if (buf->state != JISX0213_PLANE2) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'P');
				buf->state = JISX0213_PLANE2;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			unsigned int s2 = s & 0xFF;
			/* The high byte (0x7F and up) is an index into `jisx0213_p2_ofst`,
			 * which gives the zero-based row actually written to the output */
			int k = ((s >> 8) & 0xFF) - 0x7F;
			ZEND_ASSERT(k < jisx0213_p2_ofst_len);
			s = jisx0213_p2_ofst[k] + 0x21;
			out = mb_convert_buf_add2(out, s, s2);
		}
	}

	/* At the very end of the input, return to ASCII mode */
	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
2373
/* Byte-at-a-time decoder for CP5022x input, emitting Unicode codepoints via
 * `filter->output_function`.
 *
 * `filter->status` is a two-part state: the high nibble selects the current
 * character set (0x00 ASCII, 0x10 JIS X 0201 latin, 0x20 JIS X 0201 kana,
 * 0x80 JIS X 0208, 0x90 JIS X 0212) and the low nibble is the position inside
 * a double-byte character (1) or an escape sequence (2-5).
 * `filter->cache` holds the first byte of a double-byte character.
 *
 * Invalid input produces MBFL_BAD_INPUT. Returns 0, or a negative value
 * (via CK) if the output function fails. */
static int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, w;

retry:
	switch (filter->status & 0xf) {
/*	case 0x00:	 ASCII */
/*	case 0x10:	 X 0201 latin */
/*	case 0x20:	 X 0201 kana */
/*	case 0x80:	 X 0208 */
/*	case 0x90:	 X 0212 */
	case 0:
		if (c == 0x1b) {
			/* ESC: start of an escape sequence */
			filter->status += 2;
		} else if (c == 0x0e) { /* "kana in" */
			filter->status = 0x20;
		} else if (c == 0x0f) { /* "kana out" */
			filter->status = 0;
		} else if (filter->status == 0x10 && c == 0x5c) { /* YEN SIGN */
			CK((*filter->output_function)(0xa5, filter->data));
		} else if (filter->status == 0x10 && c == 0x7e) { /* OVER LINE */
			CK((*filter->output_function)(0x203e, filter->data));
		} else if (filter->status == 0x20 && c > 0x20 && c < 0x60) { /* kana */
			CK((*filter->output_function)(0xff40 + c, filter->data));
		} else if ((filter->status == 0x80 || filter->status == 0x90) && c > 0x20 && c <= 0x97) { /* kanji first char */
			filter->cache = c;
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) { /* latin, CTLs */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xa0 && c < 0xe0) { /* GR kana */
			CK((*filter->output_function)(0xfec0 + c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

/*	case 0x81:	 X 0208 second char */
/*	case 0x91:	 X 0212 second char */
	case 1:
		filter->status &= ~0xf;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7f) {
			/* Linear index: row*94 + column */
			s = (c1 - 0x21)*94 + c - 0x21;
			if (filter->status == 0x80) {
				/* The first CP932 extension table is consulted before the
				 * plain JIS X 0208 table */
				if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
					w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
				} else if (s >= 0 && s < jisx0208_ucs_table_size) {
					w = jisx0208_ucs_table[s];
				} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
					w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
				} else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
					w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
				} else if (s >= 94 * 94 && s < 114 * 94) {
					/* user-defined => PUA (Microsoft extended) */
					w = s - 94*94 + 0xe000;
				} else {
					w = 0;
				}

				if (w <= 0) {
					w = MBFL_BAD_INPUT;
				}
			} else {
				/* JIS X 0212 */
				if (s >= 0 && s < jisx0212_ucs_table_size) {
					w = jisx0212_ucs_table[s];
				} else {
					w = 0;
				}

				if (w <= 0) {
					w = MBFL_BAD_INPUT;
				}
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC */
/*	case 0x02: */
/*	case 0x12: */
/*	case 0x22: */
/*	case 0x82: */
/*	case 0x92: */
	case 2:
		if (c == 0x24) { /* '$' */
			filter->status++;
		} else if (c == 0x28) { /* '(' */
			filter->status += 3;
		} else {
			/* Not a recognized escape: report it, then process `c` again as
			 * an ordinary byte in the current character set */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			goto retry;
		}
		break;

	/* ESC $ */
/*	case 0x03: */
/*	case 0x13: */
/*	case 0x23: */
/*	case 0x83: */
/*	case 0x93: */
	case 3:
		if (c == 0x40 || c == 0x42) { /* '@' or 'B' */
			filter->status = 0x80;
		} else if (c == 0x28) { /* '(' */
			filter->status++;
		} else {
			/* Report the bad escape, emit the '$' which was swallowed, then
			 * process `c` again as an ordinary byte */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x24, filter->data));
			goto retry;
		}
		break;

	/* ESC $ ( */
/*	case 0x04: */
/*	case 0x14: */
/*	case 0x24: */
/*	case 0x84: */
/*	case 0x94: */
	case 4:
		if (c == 0x40 || c == 0x42) { /* '@' or 'B' */
			filter->status = 0x80;
		} else if (c == 0x44) { /* 'D' */
			filter->status = 0x90;
		} else {
			/* Report the bad escape, emit the swallowed '$' and '(', then
			 * process `c` again as an ordinary byte */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x24, filter->data));
			CK((*filter->output_function)(0x28, filter->data));
			goto retry;
		}
		break;

	/* ESC ( */
/*	case 0x05: */
/*	case 0x15: */
/*	case 0x25: */
/*	case 0x85: */
/*	case 0x95: */
	case 5:
		if (c == 0x42 || c == 0x48) { /* 'B' or 'H' */
			filter->status = 0;
		} else if (c == 0x4a) { /* 'J' */
			filter->status = 0x10;
		} else if (c == 0x49) { /* 'I' */
			filter->status = 0x20;
		} else {
			/* Report the bad escape, emit the swallowed '(', then process
			 * `c` again as an ordinary byte */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x28, filter->data));
			goto retry;
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
2536
/* Flush handler for the CP5022x -> Unicode decoder.
 *
 * A non-zero low nibble in `filter->status` means the input ended in the
 * middle of a double-byte character or an escape sequence; this is reported
 * with MBFL_BAD_INPUT. The status is then reset and the flush is forwarded
 * to the next filter in the chain.
 *
 * Returns the downstream flush result (as the other flush handlers in this
 * file do), 0 if there is no downstream flush function, or a negative value
 * (via CK) if writing the error marker failed. */
static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status & 0xF) {
		/* 2-byte (JIS X 0208 or 0212) character was truncated, or else
		 * escape sequence was truncated */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (filter->flush_function) {
		/* Propagate the downstream result instead of discarding it */
		return (*filter->flush_function)(filter->data);
	}

	return 0;
}
2552
/* Halfwidth kana -> fullwidth katakana.
 * Indexed by (codepoint - 0xFF60) for codepoints U+FF61..U+FF9F; the value is
 * added to 0x3000 by mb_convert_kana_codepoint to form the result. */
static const unsigned char hankana2zenkana_table[64] = {
	0x00,0x02,0x0C,0x0D,0x01,0xFB,0xF2,0xA1,0xA3,0xA5,
	0xA7,0xA9,0xE3,0xE5,0xE7,0xC3,0xFC,0xA2,0xA4,0xA6,
	0xA8,0xAA,0xAB,0xAD,0xAF,0xB1,0xB3,0xB5,0xB7,0xB9,
	0xBB,0xBD,0xBF,0xC1,0xC4,0xC6,0xC8,0xCA,0xCB,0xCC,
	0xCD,0xCE,0xCF,0xD2,0xD5,0xD8,0xDB,0xDE,0xDF,0xE0,
	0xE1,0xE2,0xE4,0xE6,0xE8,0xE9,0xEA,0xEB,0xEC,0xED,
	0xEF,0xF3,0x9B,0x9C
};
2562
/* Halfwidth kana -> fullwidth hiragana.
 * Indexed by (codepoint - 0xFF60) for codepoints U+FF61..U+FF9F; the value is
 * added to 0x3000 by mb_convert_kana_codepoint to form the result. */
static const unsigned char hankana2zenhira_table[64] = {
	0x00,0x02,0x0C,0x0D,0x01,0xFB,0x92,0x41,0x43,0x45,
	0x47,0x49,0x83,0x85,0x87,0x63,0xFC,0x42,0x44,0x46,
	0x48,0x4A,0x4B,0x4D,0x4F,0x51,0x53,0x55,0x57,0x59,
	0x5B,0x5D,0x5F,0x61,0x64,0x66,0x68,0x6A,0x6B,0x6C,
	0x6D,0x6E,0x6F,0x72,0x75,0x78,0x7B,0x7E,0x7F,0x80,
	0x81,0x82,0x84,0x86,0x88,0x89,0x8A,0x8B,0x8C,0x8D,
	0x8F,0x93,0x9B,0x9C
};
2572
/* Fullwidth kana -> halfwidth kana.
 * Indexed by (codepoint - 0x30A1) for katakana U+30A1..U+30F4, or by
 * (codepoint - 0x3041) for hiragana U+3041..U+3093.
 * Element [0] is added to 0xFF00 to form the output codepoint. If element [1]
 * is non-zero, it is added to 0xFF00 to form a second output codepoint
 * (0x9E / 0x9F give U+FF9E / U+FF9F, the halfwidth voiced and semi-voiced
 * sound marks). */
static const unsigned char zenkana2hankana_table[84][2] = {
	{0x67,0x00},{0x71,0x00},{0x68,0x00},{0x72,0x00},{0x69,0x00},
	{0x73,0x00},{0x6A,0x00},{0x74,0x00},{0x6B,0x00},{0x75,0x00},
	{0x76,0x00},{0x76,0x9E},{0x77,0x00},{0x77,0x9E},{0x78,0x00},
	{0x78,0x9E},{0x79,0x00},{0x79,0x9E},{0x7A,0x00},{0x7A,0x9E},
	{0x7B,0x00},{0x7B,0x9E},{0x7C,0x00},{0x7C,0x9E},{0x7D,0x00},
	{0x7D,0x9E},{0x7E,0x00},{0x7E,0x9E},{0x7F,0x00},{0x7F,0x9E},
	{0x80,0x00},{0x80,0x9E},{0x81,0x00},{0x81,0x9E},{0x6F,0x00},
	{0x82,0x00},{0x82,0x9E},{0x83,0x00},{0x83,0x9E},{0x84,0x00},
	{0x84,0x9E},{0x85,0x00},{0x86,0x00},{0x87,0x00},{0x88,0x00},
	{0x89,0x00},{0x8A,0x00},{0x8A,0x9E},{0x8A,0x9F},{0x8B,0x00},
	{0x8B,0x9E},{0x8B,0x9F},{0x8C,0x00},{0x8C,0x9E},{0x8C,0x9F},
	{0x8D,0x00},{0x8D,0x9E},{0x8D,0x9F},{0x8E,0x00},{0x8E,0x9E},
	{0x8E,0x9F},{0x8F,0x00},{0x90,0x00},{0x91,0x00},{0x92,0x00},
	{0x93,0x00},{0x6C,0x00},{0x94,0x00},{0x6D,0x00},{0x95,0x00},
	{0x6E,0x00},{0x96,0x00},{0x97,0x00},{0x98,0x00},{0x99,0x00},
	{0x9A,0x00},{0x9B,0x00},{0x9C,0x00},{0x9C,0x00},{0x72,0x00},
	{0x74,0x00},{0x66,0x00},{0x9D,0x00},{0x73,0x9E}
};
2592
2593 /* Apply various transforms to input codepoint, such as converting halfwidth katakana
2594 * to fullwidth katakana. `mode` is a bitfield which controls which transforms are
2595 * actually performed. The bit values are defined in translit_kana_jisx0201_jisx0208.h.
2596 * `mode` must not call for transforms which are inverses (i.e. which would cancel
2597 * each other out).
2598 *
2599 * In some cases, successive input codepoints may be merged into one output codepoint.
2600 * (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed
2601 * and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed`
2602 * will not be modified. If there is no following codepoint, `next` should be zero.
2603 *
2604 * Again, in some cases, one input codepoint may convert to two output codepoints.
2605 * If so, the second output codepoint will be stored in `*second`.
2606 *
2607 * Return the resulting codepoint. If none of the requested transforms apply, return
2608 * the input codepoint unchanged.
2609 */
mb_convert_kana_codepoint(uint32_t c,uint32_t next,bool * consumed,uint32_t * second,unsigned int mode)2610 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode)
2611 {
2612 if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7D && c != '"' && c != '\'' && c != '\\') {
2613 return c + 0xFEE0;
2614 }
2615 if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2616 return c + 0xFEE0;
2617 }
2618 if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') {
2619 return c + 0xFEE0;
2620 }
2621 if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') {
2622 return 0x3000;
2623 }
2624
2625 if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) {
2626 /* Convert Hankaku kana to Zenkaku kana
2627 * Either all Hankaku kana (including katakana and hiragana) will be converted
2628 * to Zenkaku katakana, or to Zenkaku hiragana */
2629 if ((mode & MBFL_HAN2ZEN_KATAKANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
2630 if (c >= 0xFF61 && c <= 0xFF9F) {
2631 int n = c - 0xFF60;
2632
2633 if (next >= 0xFF61 && next <= 0xFF9F) {
2634 if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
2635 *consumed = true;
2636 return 0x3001 + hankana2zenkana_table[n];
2637 }
2638 if (next == 0xFF9E && n == 19) {
2639 *consumed = true;
2640 return 0x30F4;
2641 }
2642 if (next == 0xFF9F && n >= 42 && n <= 46) {
2643 *consumed = true;
2644 return 0x3002 + hankana2zenkana_table[n];
2645 }
2646 }
2647
2648 return 0x3000 + hankana2zenkana_table[n];
2649 }
2650 }
2651 if ((mode & MBFL_HAN2ZEN_HIRAGANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
2652 if (c >= 0xFF61 && c <= 0xFF9F) {
2653 int n = c - 0xFF60;
2654
2655 if (next >= 0xFF61 && next <= 0xFF9F) {
2656 if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
2657 *consumed = true;
2658 return 0x3001 + hankana2zenhira_table[n];
2659 }
2660 if (next == 0xFF9F && n >= 42 && n <= 46) {
2661 *consumed = true;
2662 return 0x3002 + hankana2zenhira_table[n];
2663 }
2664 }
2665
2666 return 0x3000 + hankana2zenhira_table[n];
2667 }
2668 }
2669 if ((mode & MBFL_HAN2ZEN_KATAKANA) && c >= 0xFF61 && c <= 0xFF9F) {
2670 return 0x3000 + hankana2zenkana_table[c - 0xFF60];
2671 }
2672 if ((mode & MBFL_HAN2ZEN_HIRAGANA) && c >= 0xFF61 && c <= 0xFF9F) {
2673 return 0x3000 + hankana2zenhira_table[c - 0xFF60];
2674 }
2675 }
2676
2677 if (mode & MBFL_HAN2ZEN_SPECIAL) { /* special ascii to symbol */
2678 if (c == '\\' || c == 0xA5) { /* YEN SIGN */
2679 return 0xFFE5; /* FULLWIDTH YEN SIGN */
2680 }
2681 if (c == 0x7E || c == 0x203E) {
2682 return 0xFFE3; /* FULLWIDTH MACRON */
2683 }
2684 if (c == '\'') {
2685 return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
2686 }
2687 if (c == '"') {
2688 return 0x201D; /* RIGHT DOUBLE QUOTATION MARK */
2689 }
2690 }
2691
2692 if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE)) {
2693 /* Zenkaku to Hankaku */
2694 if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xFF01 && c <= 0xFF5D && c != 0xFF02 && c != 0xFF07 && c != 0xFF3C) {
2695 /* all except " ' \ ~ */
2696 return c - 0xFEE0;
2697 }
2698 if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xFF21 && c <= 0xFF3A) || (c >= 0xFF41 && c <= 0xFF5A))) {
2699 return c - 0xFEE0;
2700 }
2701 if ((mode & MBFL_ZEN2HAN_NUMERIC) && (c >= 0xFF10 && c <= 0xFF19)) {
2702 return c - 0xFEE0;
2703 }
2704 if ((mode & MBFL_ZEN2HAN_SPACE) && (c == 0x3000)) {
2705 return ' ';
2706 }
2707 if ((mode & MBFL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
2708 return '-';
2709 }
2710 }
2711
2712 if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) {
2713 /* Zenkaku kana to hankaku kana */
2714 if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30A1 && c <= 0x30F4) {
2715 /* Zenkaku katakana to hankaku kana */
2716 int n = c - 0x30A1;
2717 if (zenkana2hankana_table[n][1]) {
2718 *second = 0xFF00 + zenkana2hankana_table[n][1];
2719 }
2720 return 0xFF00 + zenkana2hankana_table[n][0];
2721 }
2722 if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
2723 /* Zenkaku hiragana to hankaku kana */
2724 int n = c - 0x3041;
2725 if (zenkana2hankana_table[n][1]) {
2726 *second = 0xFF00 + zenkana2hankana_table[n][1];
2727 }
2728 return 0xFF00 + zenkana2hankana_table[n][0];
2729 }
2730 if (c == 0x3001) {
2731 return 0xFF64; /* HALFWIDTH IDEOGRAPHIC COMMA */
2732 }
2733 if (c == 0x3002) {
2734 return 0xFF61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
2735 }
2736 if (c == 0x300C) {
2737 return 0xFF62; /* HALFWIDTH LEFT CORNER BRACKET */
2738 }
2739 if (c == 0x300D) {
2740 return 0xFF63; /* HALFWIDTH RIGHT CORNER BRACKET */
2741 }
2742 if (c == 0x309B) {
2743 return 0xFF9E; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
2744 }
2745 if (c == 0x309C) {
2746 return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
2747 }
2748 if (c == 0x30FC) {
2749 return 0xFF70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
2750 }
2751 if (c == 0x30FB) {
2752 return 0xFF65; /* HALFWIDTH KATAKANA MIDDLE DOT */
2753 }
2754 }
2755
2756 if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) {
2757 if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309D || c == 0x309E)) {
2758 /* Zenkaku hiragana to Zenkaku katakana */
2759 return c + 0x60;
2760 }
2761 if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30A1 && c <= 0x30F3) || c == 0x30FD || c == 0x30FE)) {
2762 /* Zenkaku katakana to Zenkaku hiragana */
2763 return c - 0x60;
2764 }
2765 }
2766
2767 if (mode & MBFL_ZEN2HAN_SPECIAL) { /* special symbol to ascii */
2768 if (c == 0xFFE5 || c == 0xFF3C) { /* FULLWIDTH YEN SIGN/FULLWIDTH REVERSE SOLIDUS */
2769 return '\\';
2770 }
2771 if (c == 0xFFE3 || c == 0x203E) { /* FULLWIDTH MACRON/OVERLINE */
2772 return '~';
2773 }
2774 if (c == 0x2018 || c == 0x2019) { /* LEFT/RIGHT SINGLE QUOTATION MARK*/
2775 return '\'';
2776 }
2777 if (c == 0x201C || c == 0x201D) { /* LEFT/RIGHT DOUBLE QUOTATION MARK */
2778 return '"';
2779 }
2780 }
2781
2782 return c;
2783 }
2784
2785 static int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter);
2786
mbfl_filt_conv_wchar_cp50220(int c,mbfl_convert_filter * filter)2787 static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
2788 {
2789 int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
2790 bool consumed = false;
2791
2792 if (filter->cache) {
2793 int s = mb_convert_kana_codepoint(filter->cache, c, &consumed, NULL, mode);
2794 filter->cache = consumed ? 0 : c;
2795 /* Terrible hack to get CP50220 to emit error markers in the proper
2796 * position, not reordering them with subsequent characters */
2797 filter->filter_function = mbfl_filt_conv_wchar_cp50221;
2798 mbfl_filt_conv_wchar_cp50221(s, filter);
2799 filter->filter_function = mbfl_filt_conv_wchar_cp50220;
2800 if (c == 0 && !consumed) {
2801 (*filter->output_function)(0, filter->data);
2802 }
2803 } else if (c == 0) {
2804 /* This case has to be handled separately, since `filter->cache == 0` means
2805 * no codepoint is cached */
2806 (*filter->output_function)(0, filter->data);
2807 } else {
2808 filter->cache = c;
2809 }
2810
2811 return 0;
2812 }
2813
/* Shared flush handler for the escape-sequence based JIS output filters.
 *
 * If the upper status byte is non-zero, ESC ( B is emitted to switch the
 * output back to ASCII. The status is then cleared and the flush is forwarded
 * to the next filter in the chain.
 *
 * Returns the downstream flush result, 0 if there is no downstream flush
 * function, or a negative value (via CK) if writing output failed. */
static int mbfl_filt_conv_any_jis_flush(mbfl_convert_filter *filter)
{
	const int charset = filter->status & 0xff00;

	if (charset) {
		/* back to latin */
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}
	return (*filter->flush_function)(filter->data);
}
2830
/* Flush handler for the Unicode -> CP50220 filter.
 *
 * A codepoint still waiting in `filter->cache` is converted (with no following
 * codepoint) and passed to the CP50221 encoder, after which the common JIS
 * flush routine runs. Returns the result of that routine. */
static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter)
{
	if (filter->cache != 0) {
		int converted = mb_convert_kana_codepoint(filter->cache, 0, NULL, NULL,
			MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);

		/* Temporarily switch the filter function while the CP50221 encoder runs */
		filter->filter_function = mbfl_filt_conv_wchar_cp50221;
		mbfl_filt_conv_wchar_cp50221(converted, filter);
		filter->filter_function = mbfl_filt_conv_wchar_cp50220;

		filter->cache = 0;
	}

	return mbfl_filt_conv_any_jis_flush(filter);
}
2845
/* Unicode -> CP50221 filter function.
 *
 * Looks up the JIS code for codepoint `c` and writes it to the output,
 * preceded by an escape sequence whenever the character set has to change.
 * The upper byte of `filter->status` records the character set currently
 * selected in the output: 0 ASCII, 0x200 JIS X 0208, 0x400 JIS X 0201 latin,
 * 0x500 JIS X 0201 kana.
 *
 * Codepoints which cannot be encoded are passed to
 * mbfl_filt_conv_illegal_output. Returns 0, or a negative value (via CK) if
 * writing output failed. */
static int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter)
{
	int jis = 0;

	/* Table lookups. Values of 0x10000 and above mark JIS X 0201 latin */
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		jis = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) { /* OVERLINE */
		jis = 0x1007E; /* Convert to JISX 0201 OVERLINE */
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		jis = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		jis = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		jis = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c <= 0xE757) {
		/* 'private'/'user' codepoints */
		const int pua = c - 0xE000;
		jis = (((pua / 94) + 0x7F) << 8) | ((pua % 94) + 0x21);
	}

	/* A few codepoints get fixed mappings when the tables have nothing */
	if (jis <= 0) {
		switch (c) {
		case 0xa5: /* YEN SIGN */
			jis = 0x1005c;
			break;
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			jis = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			jis = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			jis = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			jis = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			jis = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			jis = 0x224c;
			break;
		default:
			break;
		}
	}

	/* Zero means the codepoint is not in JIS X 0208. A result with the high
	 * bit set on both the upper and the lower byte is not a JIS X 0208 code
	 * either, but a JIS X 0213 one. In both cases, see whether the codepoint
	 * is one of the extensions which Microsoft added to JIS X 0208 (to make
	 * CP932) */
	if (jis == 0 || ((jis & 0x8000) && (jis & 0x80))) {
		jis = -1;

		const int ext1_len = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
		const int ext1_row = cp932ext1_ucs_table_min / 94;
		for (int idx = 0; idx < ext1_len; idx++) {
			if (c == cp932ext1_ucs_table[idx]) {
				jis = ((idx / 94 + ext1_row + 0x21) << 8) + (idx % 94 + 0x21);
				break;
			}
		}

		if (jis < 0) {
			const int ext2_len = cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
			const int ext2_row = cp932ext2_ucs_table_min / 94;
			for (int idx = 0; idx < ext2_len; idx++) {
				if (c == cp932ext2_ucs_table[idx]) {
					jis = ((idx / 94 + ext2_row + 0x21) << 8) + (idx % 94 + 0x21);
					break;
				}
			}
		}

		/* U+0000 is the only codepoint which legitimately encodes as zero */
		if (c == 0) {
			jis = 0;
		} else if (jis <= 0) {
			jis = -1;
		}
	}

	if (jis < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	const int charset = filter->status & 0xff00;

	if (jis < 0x80) { /* ASCII */
		if (charset != 0) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('B', filter->data));
			filter->status = 0;
		}
		CK((*filter->output_function)(jis, filter->data));
	} else if (jis >= 0xa0 && jis < 0xe0) { /* X 0201 kana */
		if (charset != 0x500) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('I', filter->data));
			filter->status = 0x500;
		}
		CK((*filter->output_function)(jis - 0x80, filter->data));
	} else if (jis <= 0x927E) { /* X 0208 + extensions */
		if (charset != 0x200) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('$', filter->data));
			CK((*filter->output_function)('B', filter->data));
			filter->status = 0x200;
		}
		CK((*filter->output_function)((jis >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(jis & 0xff, filter->data));
	} else if (jis < 0x10000) { /* X0212 */
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else { /* X 0201 latin */
		if (charset != 0x400) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('J', filter->data));
		}
		filter->status = 0x400;
		CK((*filter->output_function)(jis & 0x7f, filter->data));
	}

	return 0;
}
2969
/* Legacy (byte-at-a-time) encoder: convert Unicode codepoint `c` to CP50222
 *
 * The codepoint is first translated to an intermediate code, which is then
 * classified by range:
 *   below 0x80        ASCII
 *   0xA0 - 0xDF       JIS X 0201 kana (sent as 7-bit bytes after 0x0E)
 *   up to 0x927E      two-byte code (JIS X 0208, vendor extensions, user area)
 *   up to 0xFFFF      JIS X 0212; reported as unencodable
 *   0x10000 and up    JIS X 0201 Latin (tagged by adding 0x10000)
 *
 * The high byte of `filter->status` remembers the current output mode:
 * 0 = ASCII, 0x200 = JIS X 0208, 0x400 = JIS X 0201 Latin, 0x500 = kana
 *
 * Returns 0; the CK() wrappers return early if an output call fails */
static int mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	/* Primary lookup in the Unicode -> JIS tables */
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) { /* OVERLINE */
		code = 0x1007E; /* JIS X 0201 Latin 0x7E */
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c <= 0xE757) {
		/* Private Use Area; 94 codepoints per row, first row has lead byte 0x7F */
		int pua_index = c - 0xE000;
		code = ((pua_index / 94) + 0x7F) << 8 | ((pua_index % 94) + 0x21);
	}

	/* A few codepoints which the tables do not map get fixed codes */
	if (code <= 0) {
		switch (c) {
		case 0xa5: /* YEN SIGN */
			code = 0x1005c;
			break;
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			code = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			code = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			code = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			code = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			code = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			code = 0x224c;
			break;
		default:
			break;
		}
	}

	/* Nothing found, or a code with the high bit set in both bytes:
	 * try the CP932 extension tables instead */
	if (code == 0 || ((code & 0x8000) && (code & 0x80))) {
		const int ext1_row = cp932ext1_ucs_table_min / 94;
		int i;

		code = -1;
		for (i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
			if (c == cp932ext1_ucs_table[i]) {
				code = ((i / 94 + ext1_row + 0x21) << 8) + (i % 94 + 0x21);
				break;
			}
		}

		if (code <= 0) {
			const int ext2_row = cp932ext2_ucs_table_min / 94;
			const int ext2_count = cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;

			for (i = 0; i < ext2_count; i++) {
				if (c == cp932ext2_ucs_table[i]) {
					code = ((i / 94 + ext2_row + 0x21) << 8) + (i % 94 + 0x21);
					break;
				}
			}
		}

		if (c == 0) {
			/* NUL is always encodable */
			code = 0;
		} else if (code <= 0) {
			code = -1;
		}
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x80) {
		/* ASCII */
		if ((filter->status & 0xff00) == 0x500) {
			CK((*filter->output_function)(0x0f, filter->data)); /* SI: leave kana */
			filter->status = 0;
		} else if ((filter->status & 0xff00) != 0) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)(0x28, filter->data)); /* '(' */
			CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
			filter->status = 0;
		}
		CK((*filter->output_function)(code, filter->data));
	} else if (code >= 0xa0 && code < 0xe0) {
		/* JIS X 0201 kana */
		if ((filter->status & 0xff00) != 0x500) {
			CK((*filter->output_function)(0x0e, filter->data)); /* SO: enter kana */
			filter->status = 0x500;
		}
		CK((*filter->output_function)(code - 0x80, filter->data));
	} else if (code <= 0x927E) {
		/* JIS X 0208 (two bytes) */
		if ((filter->status & 0xff00) == 0x500) {
			CK((*filter->output_function)(0x0f, filter->data)); /* SI: leave kana */
			filter->status = 0;
		}
		if ((filter->status & 0xff00) != 0x200) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)(0x24, filter->data)); /* '$' */
			CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
			filter->status = 0x200;
		}
		CK((*filter->output_function)((code >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(code & 0xff, filter->data));
	} else if (code < 0x10000) {
		/* JIS X 0212 is not representable in this encoding */
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else {
		/* JIS X 0201 Latin */
		if ((filter->status & 0xff00) == 0x500) {
			CK((*filter->output_function)(0x0f, filter->data)); /* SI: leave kana */
			filter->status = 0;
		}
		if ((filter->status & 0xff00) != 0x400) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)(0x28, filter->data)); /* '(' */
			CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
		}
		filter->status = 0x400;
		CK((*filter->output_function)(code & 0x7f, filter->data));
	}

	return 0;
}
3092
/* Finish a CP50222 output stream: return to ASCII mode, reset the filter
 * state and pass the flush on to the next stage (if it has a flush function) */
static int mbfl_filt_conv_wchar_cp50222_flush(mbfl_convert_filter *filter)
{
	const int mode = filter->status & 0xff00;

	if (mode == 0x500) {
		/* Kana mode was entered with 0x0E; a single 0x0F (SI) leaves it */
		CK((*filter->output_function)(0x0f, filter->data));
	} else if (mode != 0) {
		/* Any other non-ASCII mode needs the escape sequence ESC ( B */
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)(0x28, filter->data)); /* '(' */
		CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
	}
	filter->status = 0;

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
3111
/* Conversion state values for the CP5022x converters below; stored in `*state`
 * by mb_cp5022x_to_wchar and in `buf->state` by the mb_wchar_to_cp5022x encoders */
#define ASCII 0
#define JISX_0201_LATIN 1
#define JISX_0201_KANA 2
#define JISX_0208 3
#define JISX_0212 4
3117
/* Decode CP5022x input bytes into Unicode codepoints
 *
 * in, in_len  Pointer to / remaining length of the input; both are updated
 *             on return to reflect the bytes which were consumed
 * buf         Output buffer for codepoints, with room for `bufsize` entries
 *             (asserted to be at least 3)
 * state       Current character set (ASCII, JISX_0201_LATIN, JISX_0201_KANA,
 *             JISX_0208 or JISX_0212); updated when escape sequences or the
 *             0x0E / 0x0F shift bytes are seen
 *
 * Returns the number of codepoints written to `buf`
 * Invalid input is reported by emitting MBFL_BAD_INPUT */
static size_t mb_cp5022x_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	ZEND_ASSERT(bufsize >= 3);

	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c == 0x1B) {
			/* Escape sequence */
			if ((e - p) < 2) {
				/* Fewer than two bytes follow ESC; the sequence is truncated */
				*out++ = MBFL_BAD_INPUT;
				/* Duplicate error-handling behavior of legacy code */
				if (p < e && (*p == '(' || *p == '$'))
					p++;
				continue;
			}
			unsigned char c2 = *p++;
			if (c2 == '$') {
				unsigned char c3 = *p++;
				if (c3 == '@' || c3 == 'B') {
					*state = JISX_0208;
				} else if (c3 == '(') {
					/* Four-byte sequence: ESC $ ( <final byte> */
					if (p == e) {
						*out++ = MBFL_BAD_INPUT;
						break;
					}
					unsigned char c4 = *p++;
					if (c4 == '@' || c4 == 'B') {
						*state = JISX_0208;
					} else if (c4 == 'D') {
						*state = JISX_0212;
					} else {
						if ((limit - out) < 3) {
							/* Not enough room for the 3 values emitted below;
							 * rewind to the ESC byte and stop, so this sequence
							 * is read again on the next call */
							p -= 4;
							break;
						}
						*out++ = MBFL_BAD_INPUT;
						*out++ = '$';
						*out++ = '(';
						/* Step back so the unrecognized final byte is processed again */
						p--;
					}
				} else {
					if ((limit - out) < 2) {
						/* Rewind to the ESC byte; need room for 2 output values */
						p -= 3;
						break;
					}
					*out++ = MBFL_BAD_INPUT;
					*out++ = '$';
					/* Step back so the unrecognized third byte is processed again */
					p--;
				}
			} else if (c2 == '(') {
				unsigned char c3 = *p++;
				if (c3 == 'B' || c3 == 'H') {
					*state = ASCII;
				} else if (c3 == 'J') {
					*state = JISX_0201_LATIN;
				} else if (c3 == 'I') {
					*state = JISX_0201_KANA;
				} else {
					if ((limit - out) < 2) {
						/* Rewind to the ESC byte; need room for 2 output values */
						p -= 3;
						break;
					}
					*out++ = MBFL_BAD_INPUT;
					*out++ = '(';
					/* Step back so the unrecognized third byte is processed again */
					p--;
				}
			} else {
				*out++ = MBFL_BAD_INPUT;
				/* Step back so the byte after ESC is processed again */
				p--;
			}
		} else if (c == 0xE) {
			/* 0x0E (SO) selects JIS X 0201 kana */
			*state = JISX_0201_KANA;
		} else if (c == 0xF) {
			/* 0x0F (SI) returns to ASCII */
			*state = ASCII;
		} else if (*state == JISX_0201_LATIN && c == 0x5C) { /* YEN SIGN */
			*out++ = 0xA5;
		} else if (*state == JISX_0201_LATIN && c == 0x7E) { /* OVER LINE */
			*out++ = 0x203E;
		} else if (*state == JISX_0201_KANA && c > 0x20 && c < 0x60) {
			/* 7-bit kana: 0x21..0x5F maps onto U+FF61..U+FF9F */
			*out++ = 0xFF40 + c;
		} else if (*state >= JISX_0208 && c > 0x20 && c <= 0x97) {
			/* First byte of a two-byte character (JISX_0208 or JISX_0212 state) */
			if (p == e) {
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			if (c2 > 0x20 && c2 < 0x7F) {
				/* Linear table index: 94 cells per row */
				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
				uint32_t w = 0;
				if (*state == JISX_0208) {
					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
					} else if (s < jisx0208_ucs_table_size) {
						w = jisx0208_ucs_table[s];
					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
					} else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
						w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
					} else if (s >= 94*94 && s < 114*94) {
						/* MicroSoft extension */
						/* Rows 95 to 114 map linearly onto codepoints from U+E000 */
						w = s - 94*94 + 0xE000;
					}
					if (!w)
						w = MBFL_BAD_INPUT;
				} else {
					if (s < jisx0212_ucs_table_size) {
						w = jisx0212_ucs_table[s];
					}
					if (!w)
						w = MBFL_BAD_INPUT;
				}
				*out++ = w;
			} else {
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (c < 0x80) {
			*out++ = c;
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* 8-bit kana: 0xA1..0xDF maps onto U+FF61..U+FF9F */
			*out++ = 0xFEC0 + c;
		} else {
			*out++ = MBFL_BAD_INPUT;
		}
	}

	*in_len = e - p;
	*in = p;
	return out - buf;
}
3250
lookup_wchar(uint32_t w)3251 static unsigned int lookup_wchar(uint32_t w)
3252 {
3253 unsigned int s = 0;
3254
3255 if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
3256 s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
3257 } else if (w == 0x203E) { /* OVERLINE */
3258 s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
3259 } else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
3260 s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
3261 } else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
3262 s = ucs_i_jis_table[w - ucs_i_jis_table_min];
3263 } else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
3264 s = ucs_r_jis_table[w - ucs_r_jis_table_min];
3265 } else if (w >= 0xE000 && w <= 0xE757) {
3266 /* Private Use Area codepoints */
3267 s = w - 0xE000;
3268 s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
3269 }
3270
3271 if (!s) {
3272 if (w == 0xA5) { /* YEN SIGN */
3273 s = 0x1005C;
3274 } else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
3275 s = 0x2140;
3276 } else if (w == 0x2225) { /* PARALLEL TO */
3277 s = 0x2142;
3278 } else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
3279 s = 0x215D;
3280 } else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
3281 s = 0x2171;
3282 } else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
3283 s = 0x2172;
3284 } else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
3285 s = 0x224C;
3286 } else if (w == 0) {
3287 return 0;
3288 }
3289 }
3290
3291 /* Above, we do a series of lookups in `ucs_*_jis_table` to find a
3292 * corresponding kuten code for this Unicode codepoint
3293 * If we get zero, that means the codepoint is not in JIS X 0208
3294 * On the other hand, if we get a result with the high bits set on both
3295 * upper and lower bytes, that is not a code in JIS X 0208 but rather
3296 * in JIS X 0213
3297 * In either case, check if this codepoint is one of the extensions added
3298 * to JIS X 0208 by MicroSoft (to make CP932) */
3299 if (!s || s >= 0x8080) {
3300 for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
3301 if (w == cp932ext1_ucs_table[i]) {
3302 return (((i / 94) + (cp932ext1_ucs_table_min / 94) + 0x21) << 8) + (i % 94) + 0x21;
3303 }
3304 }
3305
3306 for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
3307 if (w == cp932ext2_ucs_table[i]) {
3308 return (((i / 94) + (cp932ext2_ucs_table_min / 94) + 0x21) << 8) + (i % 94) + 0x21;
3309 }
3310 }
3311 }
3312
3313 return s;
3314 }
3315
3316 static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
3317
/* Encode `len` Unicode codepoints from `in` as CP50220, appending to `buf`
 *
 * Before lookup, every codepoint is passed through mb_convert_kana_codepoint
 * with MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE, which may merge it with the
 * following codepoint
 *
 * The low 8 bits of `buf->state` hold the current character set; when a
 * codepoint in U+FF61..U+FF9F is the last one in a non-final buffer, it is
 * stashed in the bits above that and reprocessed at the start of the next call
 *
 * If `end` is true, the output is returned to ASCII mode at the end */
static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;

	if (buf->state & 0xFFFF00) {
		/* Reprocess cached codepoint */
		w = buf->state >> 8;
		buf->state &= 0xFF;
		/* Jumps into the loop body below; `len` has not been decremented
		 * for this codepoint, since it did not come from `in` */
		goto reprocess_codepoint;
	}

	while (len--) {
		w = *in++;
reprocess_codepoint:

		if (w >= 0xFF61 && w <= 0xFF9F && !len && !end) {
			/* This codepoint may need to combine with the next one,
			 * but the 'next one' will come in a separate buffer */
			buf->state |= w << 8;
			break;
		}

		bool consumed = false;
		/* Pass the following codepoint (or 0 if there is none) as lookahead */
		w = mb_convert_kana_codepoint(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
		if (consumed) {
			/* Two successive codepoints were converted into one */
			in++; len--; consumed = false;
		}

		unsigned int s = lookup_wchar(w);

		if (!s && w) {
			/* Note that the function handed to MB_CONVERT_ERROR here is
			 * mb_wchar_to_cp50221, not this function */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp50221);
		} else if (s < 0x80) {
			/* ASCII */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA0 && s < 0xE0) {
			/* JISX 0201 Kana */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, s - 0x80);
		} else if (s <= 0x927E) {
			/* JISX 0208 Kanji */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else if (s >= 0x10000) {
			/* JISX 0201 Latin; we 'tag' these by adding 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else {
			/* Codes above 0x927E but below 0x10000 cannot be encoded */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp50221);
		}
	}

	if (end && buf->state != ASCII) {
		/* Finish in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
3399
/* Encode `len` Unicode codepoints from `in` as CP50221, appending to `buf`
 *
 * Every character set, including JIS X 0201 kana, is selected with a 3-byte
 * escape sequence; `buf->state` records which one is currently active
 * If `end` is true, the output is returned to ASCII mode at the end */
static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t codepoint = *in++;
		unsigned int code = lookup_wchar(codepoint);

		if (code == 0 && codepoint != 0) {
			/* No mapping for this codepoint */
			MB_CONVERT_ERROR(buf, out, limit, codepoint, mb_wchar_to_cp50221);
			continue;
		}

		if (code < 0x80) {
			/* ASCII */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, code);
			continue;
		}

		if (code >= 0xA0 && code < 0xE0) {
			/* JIS X 0201 kana, written as a 7-bit byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, code - 0x80);
			continue;
		}

		if (code <= 0x927E) {
			/* JIS X 0208 kanji (two bytes) */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
			continue;
		}

		if (code >= 0x10000) {
			/* JIS X 0201 Latin; lookup_wchar tags these by adding 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, code & 0x7F);
			continue;
		}

		/* Anything left (above 0x927E, below 0x10000) cannot be encoded */
		MB_CONVERT_ERROR(buf, out, limit, codepoint, mb_wchar_to_cp50221);
	}

	if (end && buf->state != ASCII) {
		/* Finish in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
3456
/* Encode `len` Unicode codepoints from `in` as CP50222, appending to `buf`
 *
 * JIS X 0201 kana is entered with the single byte 0x0E and left with 0x0F;
 * the other character sets are selected with 3-byte escape sequences
 * `buf->state` records which character set is currently active
 * If `end` is true, the output is returned to ASCII mode at the end */
static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t codepoint = *in++;
		unsigned int code = lookup_wchar(codepoint);

		if (code == 0 && codepoint != 0) {
			/* No mapping for this codepoint */
			MB_CONVERT_ERROR(buf, out, limit, codepoint, mb_wchar_to_cp50222);
			continue;
		}

		if (code < 0x80) {
			/* ASCII */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state == JISX_0201_KANA) {
				out = mb_convert_buf_add(out, 0xF);
				buf->state = ASCII;
			} else if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, code);
			continue;
		}

		if (code >= 0xA0 && code < 0xE0) {
			/* JIS X 0201 kana, written as a 7-bit byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add(out, 0xE);
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, code - 0x80);
			continue;
		}

		if (code <= 0x927E) {
			/* JIS X 0208 kanji: up to 1 + 3 + 2 bytes */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
			if (buf->state == JISX_0201_KANA) {
				/* Leave kana mode first; the escape sequence follows */
				out = mb_convert_buf_add(out, 0xF);
			}
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
			continue;
		}

		if (code >= 0x10000) {
			/* JIS X 0201 Latin (tagged with 0x10000): up to 1 + 3 + 1 bytes */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state == JISX_0201_KANA) {
				/* Leave kana mode first; the escape sequence follows */
				out = mb_convert_buf_add(out, 0xF);
			}
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, code & 0x7F);
			continue;
		}

		/* Anything left (above 0x927E, below 0x10000) cannot be encoded */
		MB_CONVERT_ERROR(buf, out, limit, codepoint, mb_wchar_to_cp50222);
	}

	if (end) {
		/* Finish in ASCII mode */
		if (buf->state == JISX_0201_KANA) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
			out = mb_convert_buf_add(out, 0xF);
		} else if (buf->state != ASCII) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
			out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
3527
/* State values for the ISO-2022-JP-MS converters below
 * The legacy decoder keeps one of these in `filter->status` and uses the low
 * 4 bits for its position within an escape sequence or two-byte character
 * ASCII repeats, with the same value, the definition made earlier in this file */
#define ASCII 0
#define JISX0201_KANA 0x20
#define JISX0208_KANJI 0x80
#define UDC 0xA0
3532
/* Legacy (byte-at-a-time) decoder: ISO-2022-JP-MS -> Unicode
 *
 * `c` is the next input byte. `filter->status` holds the active character set
 * (0, JISX0201_KANA, JISX0208_KANJI or UDC) with the low 4 bits used as a
 * sub-state:
 *   0  not inside a multi-byte unit
 *   1  first byte of a two-byte character is saved in `filter->cache`
 *   2  ESC seen
 *   3  ESC $ seen
 *   4  ESC $ ( seen
 *   5  ESC ( seen
 * Decoded codepoints (or MBFL_BAD_INPUT) go to `filter->output_function`
 * Returns 0; the CK() wrappers return early if an output call fails */
static int mbfl_filt_conv_2022jpms_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, w;

	switch (filter->status & 0xF) {
	case 0:
		if (c == 0x1B) {
			/* Start of an escape sequence; move to sub-state 2 */
			filter->status += 2;
		} else if (filter->status == JISX0201_KANA && c > 0x20 && c < 0x60) {
			/* 7-bit kana: 0x21..0x5F maps onto U+FF61..U+FF9F */
			CK((*filter->output_function)(0xFF40 + c, filter->data));
		} else if ((filter->status == JISX0208_KANJI || filter->status == UDC) && c > 0x20 && c < 0x80) {
			/* First byte of a two-byte character; remember it and wait for the second */
			filter->cache = c;
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) { /* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xA0 && c < 0xE0) { /* Kana */
			/* 8-bit kana: 0xA1..0xDF maps onto U+FF61..U+FF9F */
			CK((*filter->output_function)(0xFEC0 + c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* Kanji, second byte */
	case 1:
		w = 0;
		/* Drop the sub-state but keep the character set */
		filter->status &= ~0xF;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7F) {
			/* Linear table index: 94 cells per row */
			s = ((c1 - 0x21) * 94) + c - 0x21;
			if (filter->status == JISX0208_KANJI) {
				/* These positions are given fixed codepoints, taking
				 * precedence over the table lookups below */
				if (s <= 137) {
					if (s == 31) {
						w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
					} else if (s == 32) {
						w = 0xFF5E; /* FULLWIDTH TILDE */
					} else if (s == 33) {
						w = 0x2225; /* PARALLEL TO */
					} else if (s == 60) {
						w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
					} else if (s == 80) {
						w = 0xFFE0; /* FULLWIDTH CENT SIGN */
					} else if (s == 81) {
						w = 0xFFE1; /* FULLWIDTH POUND SIGN */
					} else if (s == 137) {
						w = 0xFFE2; /* FULLWIDTH NOT SIGN */
					}
				}

				if (w == 0) {
					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { /* vendor ext1 (13ku) */
						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
					} else if (s >= 0 && s < jisx0208_ucs_table_size) {
						w = jisx0208_ucs_table[s];
					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) { /* vendor ext2 (89ku - 92ku) */
						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
					}
				}

				if (w <= 0) {
					w = MBFL_BAD_INPUT;
				}
			} else {
				/* UDC: first bytes 0x21..0x34 map linearly onto codepoints from U+E000 */
				if (c1 > 0x20 && c1 < 0x35) {
					w = 0xE000 + ((c1 - 0x21) * 94) + c - 0x21;
				} else {
					w = MBFL_BAD_INPUT;
				}
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC */
	case 2:
		if (c == '$') {
			filter->status++;
		} else if (c == '(') {
			/* Skip ahead to sub-state 5 (ESC '(') */
			filter->status += 3;
		} else {
			/* Not a recognized sequence; go back to sub-state 0 in the same character set */
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ */
	case 3:
		if (c == '@' || c == 'B') {
			filter->status = JISX0208_KANJI;
		} else if (c == '(') {
			filter->status++;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ ( */
	case 4:
		if (c == '@' || c == 'B') {
			filter->status = JISX0208_KANJI;
		} else if (c == '?') {
			filter->status = UDC;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC ( */
	case 5:
		if (c == 'B' || c == 'J') {
			/* Both final bytes select state 0 here */
			filter->status = 0;
		} else if (c == 'I') {
			filter->status = JISX0201_KANA;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
	}

	return 0;
}
3657
/* End of input for the legacy ISO-2022-JP-MS decoder
 *
 * A nonzero sub-state (low 4 bits of `filter->status`) means the input stopped
 * inside an escape sequence or a two-byte character, so an error marker is
 * emitted. The state is then reset and the flush is passed on to the next
 * stage, if it has a flush function
 * NOTE(review): the result of the output call is not checked here, unlike the
 * CK()-wrapped calls in the neighbouring flush functions; confirm that this is
 * intended */
static int mbfl_filt_conv_2022jpms_wchar_flush(mbfl_convert_filter *filter)
{
	const int substate = filter->status & 0xF;

	if (substate != 0) {
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}
	filter->status = 0;

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
3671
/* sjistoidx: linear index of a two-byte Shift-JIS style code (188 cells per
 * lead byte); lead bytes above 0x9f are counted from 0xc1, others from 0x81
 * idxtojis1 / idxtojis2: first and second JIS byte for a linear index which
 * uses 94 cells per row */
#define sjistoidx(c1, c2) \
	(((c1) > 0x9f) ? (((c1) - 0xc1) * 188 + (c2) - (((c2) > 0x7e) ? 0x41 : 0x40)) : (((c1) - 0x81) * 188 + (c2) - (((c2) > 0x7e) ? 0x41 : 0x40)))
#define idxtojis1(c) (((c) / 94) + 0x21)
#define idxtojis2(c) (((c) % 94) + 0x21)

/* Convert an offset `c` into the cp932ext3 table to a two-byte JIS code
 *
 * The offset is taken relative to Shift-JIS position FA40. Three consecutive
 * stretches starting at FA40, FA55 and FA5C are moved so that they start at
 * EEEF, EEFA and ED40 respectively; the resulting index is then split into
 * a row byte (bits 8-15) and a cell byte (bits 0-7) */
static int cp932ext3_cp932ext2_jis(int c)
{
	const int src_fa40 = sjistoidx(0xfa, 0x40);
	const int src_fa55 = sjistoidx(0xfa, 0x55);
	const int src_fa5c = sjistoidx(0xfa, 0x5c);
	const int dst_ed40 = sjistoidx(0xed, 0x40);
	const int dst_eefa = sjistoidx(0xee, 0xfa);
	const int dst_eeef = sjistoidx(0xee, 0xef);

	int idx = src_fa40 + c;

	if (idx >= src_fa5c) {
		idx -= src_fa5c - dst_ed40;
	} else if (idx >= src_fa55) {
		idx -= src_fa55 - dst_eefa;
	} else if (idx >= src_fa40) {
		idx -= src_fa40 - dst_eeef;
	}

	return (idxtojis1(idx) << 8) | idxtojis2(idx);
}
3690
/* Legacy (byte-at-a-time) encoder: convert Unicode codepoint `c` to
 * ISO-2022-JP-MS
 *
 * The codepoint is first translated to an intermediate code `s1`, which is
 * then classified by range:
 *   below 0x80          ASCII
 *   0xA1 - 0xDF         JIS X 0201 kana
 *   below 0x7E7F        two-byte JIS X 0208 code (incl. vendor extensions)
 *   below 0x927F        user-defined character (from U+E000 onward)
 * Anything else is reported through mbfl_filt_conv_illegal_output
 *
 * The high byte of `filter->status` remembers the current output mode:
 * 0 = ASCII, 0x100 = kana, 0x200 = JIS X 0208, 0x800 = user-defined
 *
 * Returns 0; the CK() wrappers return early if an output call fails */
static int mbfl_filt_conv_wchar_2022jpms(int c, mbfl_convert_filter *filter)
{
	int c1, c2, s1 = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c < (0xE000 + 20*94)) {
		/* Private User Area (95ku - 114ku) */
		s1 = c - 0xE000;
		c1 = (s1 / 94) + 0x7f;
		c2 = (s1 % 94) + 0x21;
		s1 = (c1 << 8) | c2;
	}

	if (s1 <= 0) {
		if (c == 0xA5) { /* YEN SIGN */
			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
		} else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
		} else if (c == 0x2225) { /* PARALLEL TO */
			s1 = 0x2142;
		} else if (c == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215d;
		} else if (c == 0xFFE0) { /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
		} else if (c == 0xFFE1) { /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
		} else if (c == 0xFFE2) { /* FULLWIDTH NOT SIGN */
			s1 = 0x224C;
		}
	}

	if (s1 <= 0 || s1 >= 0xa1a1) { /* not found or X 0212 */
		/* Fall back to the CP932 extension tables */
		s1 = -1;
		for (c1 = 0; c1 < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; c1++) {
			if (c == cp932ext1_ucs_table[c1]) {
				s1 = (((c1 / 94) + 0x2D) << 8) + (c1 % 94) + 0x21;
				break;
			}
		}

		if (s1 <= 0) {
			for (c1 = 0; c1 < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; c1++) {
				if (c == cp932ext3_ucs_table[c1]) {
					s1 = cp932ext3_cp932ext2_jis(c1);
					break;
				}
			}
		}

		if (c == 0) {
			/* NUL is always encodable */
			s1 = 0;
		}
	}

	if (s1 >= 0) {
		if (s1 < 0x80) { /* latin */
			if (filter->status & 0xFF00) {
				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('B', filter->data));
			}
			CK((*filter->output_function)(s1, filter->data));
			filter->status = 0;
		} else if (s1 > 0xA0 && s1 < 0xE0) { /* kana */
			if ((filter->status & 0xFF00) != 0x100) {
				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('I', filter->data));
			}
			filter->status = 0x100;
			CK((*filter->output_function)(s1 & 0x7F, filter->data));
		} else if (s1 < 0x7E7F) { /* X 0208 */
			if ((filter->status & 0xFF00) != 0x200) {
				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
				CK((*filter->output_function)('$', filter->data));
				CK((*filter->output_function)('B', filter->data));
			}
			filter->status = 0x200;
			CK((*filter->output_function)((s1 >> 8) & 0xFF, filter->data));
			CK((*filter->output_function)(s1 & 0x7F, filter->data));
		} else if (s1 < 0x927F) { /* UDC */
			if ((filter->status & 0xFF00) != 0x800) {
				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
				CK((*filter->output_function)('$', filter->data));
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('?', filter->data));
			}
			filter->status = 0x800;
			/* Row bytes 0x7F..0x92 are sent as 0x21..0x34 */
			CK((*filter->output_function)(((s1 >> 8) - 0x5E) & 0x7F, filter->data));
			CK((*filter->output_function)(s1 & 0x7F, filter->data));
		} else {
			/* A code outside every encodable range is reported rather than
			 * silently dropped, as mb_wchar_to_iso2022jpms does */
			CK(mbfl_filt_conv_illegal_output(c, filter));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
3795
/* Finish an ISO-2022-JP-MS output stream
 *
 * If any non-ASCII character set is active, emit ESC ( B so that the output
 * ends in ASCII mode and strings can be safely concatenated; then reset the
 * state and pass the flush on to the next stage, if it has a flush function */
static int mbfl_filt_conv_any_2022jpms_flush(mbfl_convert_filter *filter)
{
	const int mode = filter->status & 0xFF00;

	if (mode != 0) {
		CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}
	filter->status = 0;

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
3812
mb_iso2022jpms_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)3813 static size_t mb_iso2022jpms_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
3814 {
3815 unsigned char *p = *in, *e = p + *in_len;
3816 uint32_t *out = buf, *limit = buf + bufsize;
3817
3818 while (p < e && out < limit) {
3819 unsigned char c = *p++;
3820
3821 if (c == 0x1B) {
3822 if ((e - p) < 2) {
3823 *out++ = MBFL_BAD_INPUT;
3824 p = e;
3825 break;
3826 }
3827 unsigned char c2 = *p++;
3828 unsigned char c3 = *p++;
3829
3830 if (c2 == '$') {
3831 if (c3 == '@' || c3 == 'B') {
3832 *state = JISX0208_KANJI;
3833 } else if (c3 == '(' && p < e) {
3834 unsigned char c4 = *p++;
3835
3836 if (c4 == '@' || c4 == 'B') {
3837 *state = JISX0208_KANJI;
3838 } else if (c4 == '?') {
3839 *state = UDC;
3840 } else {
3841 *out++ = MBFL_BAD_INPUT;
3842 }
3843 } else {
3844 *out++ = MBFL_BAD_INPUT;
3845 }
3846 } else if (c2 == '(') {
3847 if (c3 == 'B' || c3 == 'J') {
3848 *state = ASCII;
3849 } else if (c3 == 'I') {
3850 *state = JISX0201_KANA;
3851 } else {
3852 *out++ = MBFL_BAD_INPUT;
3853 }
3854 } else {
3855 p--;
3856 *out++ = MBFL_BAD_INPUT;
3857 }
3858 } else if (*state == JISX0201_KANA && c >= 0x21 && c <= 0x5F) {
3859 *out++ = 0xFF40 + c;
3860 } else if ((*state == JISX0208_KANJI || *state == UDC) && c >= 0x21 && c <= 0x7F) {
3861 if (p == e) {
3862 *out++ = MBFL_BAD_INPUT;
3863 break;
3864 }
3865 unsigned char c2 = *p++;
3866 unsigned int w = 0;
3867
3868 if (c2 >= 0x21 && c2 <= 0x7E) {
3869 unsigned int s = ((c - 0x21) * 94) + c2 - 0x21;
3870 if (*state == JISX0208_KANJI) {
3871 if (s <= 137) {
3872 if (s == 31) {
3873 w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
3874 } else if (s == 32) {
3875 w = 0xFF5E; /* FULLWIDTH TILDE */
3876 } else if (s == 33) {
3877 w = 0x2225; /* PARALLEL TO */
3878 } else if (s == 60) {
3879 w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
3880 } else if (s == 80) {
3881 w = 0xFFE0; /* FULLWIDTH CENT SIGN */
3882 } else if (s == 81) {
3883 w = 0xFFE1; /* FULLWIDTH POUND SIGN */
3884 } else if (s == 137) {
3885 w = 0xFFE2; /* FULLWIDTH NOT SIGN */
3886 }
3887 }
3888
3889 if (!w) {
3890 if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
3891 w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
3892 } else if (s < jisx0208_ucs_table_size) {
3893 w = jisx0208_ucs_table[s];
3894 } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
3895 w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
3896 }
3897 }
3898 } else if (c >= 0x21 && c <= 0x34) {
3899 w = 0xE000 + ((c - 0x21) * 94) + c2 - 0x21;
3900 }
3901
3902 *out++ = w ? w : MBFL_BAD_INPUT;
3903 } else {
3904 *out++ = MBFL_BAD_INPUT;
3905 }
3906 } else if (c <= 0x7F) {
3907 *out++ = c;
3908 } else if (c >= 0xA1 && c <= 0xDF) {
3909 *out++ = 0xFEC0 + c;
3910 } else {
3911 *out++ = MBFL_BAD_INPUT;
3912 }
3913 }
3914
3915 *in_len = e - p;
3916 *in = p;
3917 return out - buf;
3918 }
3919
/* Encode `len` Unicode codepoints from `in` as ISO-2022-JP-MS, appending to `buf`
 *
 * Each codepoint is translated to an intermediate code, classified by range:
 *   up to 0x7F          ASCII
 *   0xA1 - 0xDF         JIS X 0201 kana
 *   up to 0x7E7E        two-byte JIS X 0208 code (incl. vendor extensions)
 *   below 0x927F        user-defined character (from U+E000 onward)
 * `buf->state` records which character set is currently selected
 * If `end` is true, the output is returned to ASCII mode at the end */
static void mb_wchar_to_iso2022jpms(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int code = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			code = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			code = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			code = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			code = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* Private User Area (95ku - 114ku) */
			unsigned int pua_index = w - 0xE000;
			code = (((pua_index / 94) + 0x7F) << 8) | ((pua_index % 94) + 0x21);
		}

		if (code == 0) {
			/* Codepoints which the tables leave unmapped but which have a fixed code */
			switch (w) {
			case 0xA5: /* YEN SIGN */
				code = 0x216F; /* FULLWIDTH YEN SIGN */
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				code = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				code = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				code = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				code = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				code = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				code = 0x224C;
				break;
			default:
				break;
			}
		}

		if (code >= 0xA1A1) {
			/* JIS X 0212 codes are not usable here; treat as 'not found' */
			code = 0;
		}

		if (code == 0 && w != 0) {
			/* Fall back to the CP932 extension tables */
			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					code = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					break;
				}
			}

			if (code == 0) {
				for (int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
					if (w == cp932ext3_ucs_table[i]) {
						code = cp932ext3_cp932ext2_jis(i);
						break;
					}
				}
			}
		}

		if (code == 0 && w != 0) {
			/* No mapping for this codepoint */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jpms);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		}

		if (code <= 0x7F) {
			/* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, code);
			continue;
		}

		if (code >= 0xA1 && code <= 0xDF) {
			/* JIS X 0201 kana, written as a 7-bit byte */
			if (buf->state != JISX0201_KANA) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX0201_KANA;
			}
			out = mb_convert_buf_add(out, code & 0x7F);
			continue;
		}

		if (code <= 0x7E7E) {
			/* JIS X 0208 (two bytes) */
			if (buf->state != JISX0208_KANJI) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX0208_KANJI;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0x7F);
			continue;
		}

		if (code < 0x927F) {
			/* User-defined character; row bytes 0x7F..0x92 are sent as 0x21..0x34 */
			if (buf->state != UDC) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', '?');
				buf->state = UDC;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, ((code >> 8) - 0x5E) & 0x7F, code & 0x7F);
			continue;
		}

		/* Code is outside every encodable range */
		MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jpms);
		MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
	}

	if (end && buf->state != ASCII) {
		/* Finish in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
4030
/* Byte-at-a-time decoder: ISO-2022-KR -> wchar
 *
 * Layout of filter->status as used below:
 *   bit 0x10    set while KSC5601 (Shift Out) mode is active, clear for ASCII
 *   low nibble  0 = idle
 *               1 = KSC5601 lead byte received (kept in filter->cache)
 *               2 = ESC received, 3 = ESC $ received, 4 = ESC $ ) received
 *
 * Each decoded codepoint (or MBFL_BAD_INPUT) is passed to filter->output_function
 * Always returns 0 when control reaches the end of the function */
static int mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status & 0xf) {
	case 0: {
		const int ksc_mode = (filter->status & 0x10) != 0;

		switch (c) {
		case 0x1b: /* ESC; keeps the mode bit, moves low nibble to 2 */
			filter->status += 2;
			break;

		case 0x0f: /* shift in (ASCII) */
			filter->status = 0;
			break;

		case 0x0e: /* shift out (KSC5601) */
			filter->status = 0x10;
			break;

		default:
			if (ksc_mode && c > 0x20 && c < 0x7f) {
				/* KSC5601 lead byte; wait for the trail byte */
				filter->cache = c;
				filter->status = 0x11;
			} else if (!ksc_mode && c >= 0 && c < 0x80) {
				/* latin, CTLs */
				CK((*filter->output_function)(c, filter->data));
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
			break;
		}
		break;
	}

	case 1: { /* dbcs second byte */
		const int lead = filter->cache;
		const int lower_rows = (lead > 0x20 && lead < 0x47);
		const int upper_rows = (lead >= 0x47 && lead <= 0x7e && lead != 0x49);
		int w = 0;

		/* Whatever happens, we are back in KSC5601 mode with nothing pending */
		filter->status = 0x10;

		if ((!lower_rows && !upper_rows) || c <= 0x20 || c >= 0x7f) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		if (lower_rows) {
			/* For lead byte 0x22, trail bytes above 0x65 are not looked up */
			if (lead != 0x22 || c <= 0x65) {
				int idx = (lead - 1)*190 + (c - 0x41) + 0x80;
				ZEND_ASSERT(idx < uhc1_ucs_table_size);
				w = uhc1_ucs_table[idx];
			}
		} else {
			int idx = (lead - 0x47)*94 + c - 0x21;
			if (idx < uhc3_ucs_table_size) {
				w = uhc3_ucs_table[idx];
			} else {
				w = MBFL_BAD_INPUT;
			}
		}

		/* A zero table entry means "no mapping" */
		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;
	}

	case 2: /* ESC */
		if (c != '$') {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			filter->status++;
		}
		break;

	case 3: /* ESC $ */
		if (c != ')') {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			filter->status++;
		}
		break;

	case 4: /* ESC $ ) */
		/* The sequence ends here either way, and the mode goes back to ASCII */
		filter->status = 0;
		if (c != 'C') {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
4123
/* Flush for the ISO-2022-KR -> wchar filter
 *
 * A non-zero low nibble in filter->status means input stopped in the middle of
 * an escape sequence or a 2-byte character; report that as one MBFL_BAD_INPUT
 * The status is then cleared and the downstream flush function (if any) is run */
static int mbfl_filt_conv_2022kr_wchar_flush(mbfl_convert_filter *filter)
{
	const int pending = filter->status & 0xF;

	if (pending != 0) {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}

	/* The downstream result is intentionally not propagated here */
	(*filter->flush_function)(filter->data);
	return 0;
}
4138
/* Byte-at-a-time encoder: wchar -> ISO-2022-KR
 *
 * filter->status bits used here:
 *   0x100  the ESC $ ) C designator has already been written
 *   0x10   output is currently in KSC5601 (Shift Out) mode
 *
 * `c` is looked up in the UHC conversion tables; only results whose two bytes are
 * both >= 0xA1 (i.e. outside the UHC extension area) can be represented
 * Codepoints without such a mapping are passed through only if they are ASCII;
 * everything else is reported via mbfl_filt_conv_illegal_output
 * (Previously any unmapped codepoint from 0x2121 to 0x8080 was written out
 * verbatim as if it were a KSC5601 byte pair) */
static int mbfl_filt_conv_wchar_2022kr(int c, mbfl_convert_filter *filter)
{
	int c1, c2, s = 0;

	if ((filter->status & 0x100) == 0) {
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('$', filter->data));
		CK((*filter->output_function)(')', filter->data));
		CK((*filter->output_function)('C', filter->data));
		filter->status |= 0x100;
	}

	if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
		s = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
	} else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) {
		s = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min];
	} else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) {
		s = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min];
	} else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) {
		s = ucs_i_uhc_table[c - ucs_i_uhc_table_min];
	} else if (c >= ucs_s_uhc_table_min && c < ucs_s_uhc_table_max) {
		s = ucs_s_uhc_table[c - ucs_s_uhc_table_min];
	} else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) {
		s = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min];
	} else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) {
		s = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
	}

	c1 = (s >> 8) & 0xff;
	c2 = s & 0xff;
	if (c1 >= 0xa1 && c2 >= 0xa1) {
		/* Table values have the high bit set in both bytes; ISO-2022-KR is 7-bit */
		s -= 0x8080;
	} else if (c >= 0 && c < 0x80) {
		/* No usable table entry (unmapped, or UHC extension area): ASCII passes through */
		s = c;
	} else {
		s = -1;
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (s < 0x80) { /* ASCII */
		if (filter->status & 0x10) {
			CK((*filter->output_function)(0x0f, filter->data)); /* shift in */
			filter->status &= ~0x10;
		}
		CK((*filter->output_function)(s, filter->data));
	} else { /* KSC5601 byte pair */
		if ((filter->status & 0x10) == 0) {
			CK((*filter->output_function)(0x0e, filter->data)); /* shift out */
			filter->status |= 0x10;
		}
		CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(s & 0xff, filter->data));
	}

	return 0;
}
4207
/* Flush for the wchar -> ISO-2022-KR filter
 *
 * Emits MBFL_BAD_INPUT if the low nibble of filter->status is non-zero, writes
 * Shift In (0x0F) if the output was left in KSC5601 mode, clears status and
 * cache, and finally returns the result of the downstream flush function
 * (or 0 when there is none) */
static int mbfl_filt_conv_any_2022kr_flush(mbfl_convert_filter *filter)
{
	const int status = filter->status;

	if ((status & 0xF) != 0) {
		/* Escape sequence or 2-byte character was truncated */
		/* NOTE: result of this call is not checked, unlike the one below */
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}

	if ((status & 0x10) != 0) {
		/* back to ascii */
		CK((*filter->output_function)(0x0f, filter->data)); /* shift in */
	}

	filter->cache = 0;
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}
	return (*filter->flush_function)(filter->data);
}
4227
/* Shift states for the buffer-based ISO-2022-KR converters below
 * ASCII is selected by Shift In (0x0F), KSC5601 by Shift Out (0x0E) */
#define ASCII 0
#define KSC5601 1
4230
mb_iso2022kr_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)4231 static size_t mb_iso2022kr_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
4232 {
4233 unsigned char *p = *in, *e = p + *in_len;
4234 uint32_t *out = buf, *limit = buf + bufsize;
4235
4236 while (p < e && out < limit) {
4237 unsigned char c = *p++;
4238
4239 if (c == 0x1B) {
4240 if ((e - p) < 3) {
4241 *out++ = MBFL_BAD_INPUT;
4242 if (p < e && *p++ == '$') {
4243 if (p < e) {
4244 p++;
4245 }
4246 }
4247 continue;
4248 }
4249 unsigned char c2 = *p++;
4250 unsigned char c3 = *p++;
4251 unsigned char c4 = *p++;
4252 if (c2 == '$' && c3 == ')' && c4 == 'C') {
4253 *state = ASCII;
4254 } else {
4255 if (c3 != ')') {
4256 p--;
4257 if (c2 != '$')
4258 p--;
4259 }
4260 *out++ = MBFL_BAD_INPUT;
4261 }
4262 } else if (c == 0xF) {
4263 *state = ASCII;
4264 } else if (c == 0xE) {
4265 *state = KSC5601;
4266 } else if (c >= 0x21 && c <= 0x7E && *state == KSC5601) {
4267 if (p == e) {
4268 *out++ = MBFL_BAD_INPUT;
4269 break;
4270 }
4271 unsigned char c2 = *p++;
4272 unsigned int w = 0;
4273
4274 if (c2 < 0x21 || c2 > 0x7E) {
4275 *out++ = MBFL_BAD_INPUT;
4276 continue;
4277 }
4278
4279 if (c < 0x47) {
4280 if (c != 0x22 || c2 <= 0x65) {
4281 w = (c - 1)*190 + c2 - 0x41 + 0x80;
4282 ZEND_ASSERT(w < uhc1_ucs_table_size);
4283 w = uhc1_ucs_table[w];
4284 }
4285 } else if (c != 0x49 && c <= 0x7D) {
4286 w = (c - 0x47)*94 + c2 - 0x21;
4287 ZEND_ASSERT(w < uhc3_ucs_table_size);
4288 w = uhc3_ucs_table[w];
4289 }
4290
4291 if (!w)
4292 w = MBFL_BAD_INPUT;
4293 *out++ = w;
4294 } else if (c < 0x80 && *state == ASCII) {
4295 *out++ = c;
4296 } else {
4297 *out++ = MBFL_BAD_INPUT;
4298 }
4299 }
4300
4301 *in_len = e - p;
4302 *in = p;
4303 return out - buf;
4304 }
4305
/* Bit in buf->state which mb_wchar_to_iso2022kr sets once ESC $ ) C has been written */
#define EMITTED_ESC_SEQUENCE 0x10
4307
/* Buffer-based encoder: wchar -> ISO-2022-KR
 *
 * Converts `len` codepoints from `in` and appends the result to `buf`
 * buf->state carries EMITTED_ESC_SEQUENCE plus the current shift state in bit 0
 * When `end` is true and the output is still in KSC5601 mode, a final Shift In
 * byte is appended
 *
 * A codepoint is written as a KSC5601 byte pair only when the UHC tables give a
 * code whose bytes are both >= 0xA1; otherwise it is passed through if it is
 * ASCII, and reported through MB_CONVERT_ERROR if not. (Previously an unmapped
 * codepoint from 0x2121 to 0x8080 was written verbatim as a KSC5601 byte pair.) */
static void mb_wchar_to_iso2022kr(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);

	/* This escape sequence needs to come *somewhere* at the beginning of a line before
	 * we can use the Shift In/Shift Out bytes, but it only needs to come once in a string
	 * Rather than tracking newlines, we can just emit the sequence once at the beginning
	 * of the output string... since that will always be "the beginning of a line" */
	if (len && !(buf->state & EMITTED_ESC_SEQUENCE)) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 4 + len);
		out = mb_convert_buf_add4(out, 0x1B, '$', ')', 'C');
		buf->state |= EMITTED_ESC_SEQUENCE;
	} else {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
	}

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_uhc_table_min && w < ucs_a1_uhc_table_max) {
			s = ucs_a1_uhc_table[w - ucs_a1_uhc_table_min];
		} else if (w >= ucs_a2_uhc_table_min && w < ucs_a2_uhc_table_max) {
			s = ucs_a2_uhc_table[w - ucs_a2_uhc_table_min];
		} else if (w >= ucs_a3_uhc_table_min && w < ucs_a3_uhc_table_max) {
			s = ucs_a3_uhc_table[w - ucs_a3_uhc_table_min];
		} else if (w >= ucs_i_uhc_table_min && w < ucs_i_uhc_table_max) {
			s = ucs_i_uhc_table[w - ucs_i_uhc_table_min];
		} else if (w >= ucs_s_uhc_table_min && w < ucs_s_uhc_table_max) {
			s = ucs_s_uhc_table[w - ucs_s_uhc_table_min];
		} else if (w >= ucs_r1_uhc_table_min && w < ucs_r1_uhc_table_max) {
			s = ucs_r1_uhc_table[w - ucs_r1_uhc_table_min];
		} else if (w >= ucs_r2_uhc_table_min && w < ucs_r2_uhc_table_max) {
			s = ucs_r2_uhc_table[w - ucs_r2_uhc_table_min];
		}

		if (((s >> 8) & 0xFF) >= 0xA1 && (s & 0xFF) >= 0xA1) {
			/* Representable in KSC5601; drop the high bits to get the 7-bit form */
			s -= 0x8080;
			if ((buf->state & 1) != KSC5601) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
				out = mb_convert_buf_add(out, 0xE);
				buf->state |= KSC5601;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else if (w < 0x80) {
			/* No usable table entry, but ASCII can be written as-is */
			if ((buf->state & 1) != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
				out = mb_convert_buf_add(out, 0xF);
				buf->state &= ~KSC5601;
			}
			out = mb_convert_buf_add(out, w);
		} else {
			/* Unmapped, or only available in the UHC extension area */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022kr);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	if (end && (buf->state & 1) != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
		out = mb_convert_buf_add(out, 0xF);
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
4380
/* Filter table for the JIS -> wchar direction
 * (slot meanings are fixed by struct mbfl_convert_vtbl, which is declared elsewhere;
 * here: source id, target id, common ctor, NULL, filter function, flush function, NULL) */
static const struct mbfl_convert_vtbl vtbl_jis_wchar = {
	mbfl_no_encoding_jis,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis_wchar,
	mbfl_filt_conv_jis_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> JIS direction */
static const struct mbfl_convert_vtbl vtbl_wchar_jis = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_jis,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_jis,
	mbfl_filt_conv_any_jis_flush,
	NULL,
};

/* Descriptor for the encoding named "JIS"
 * It references the two filter tables above, the buffer-based converters
 * mb_iso2022jp_to_wchar / mb_wchar_to_jis, and the checker mb_check_jis
 * NOTE(review): the second string is presumably the MIME name and the NULL slots
 * optional members of mbfl_encoding -- confirm against its declaration */
const mbfl_encoding mbfl_encoding_jis = {
	mbfl_no_encoding_jis,
	"JIS",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_jis_wchar,
	&vtbl_wchar_jis,
	mb_iso2022jp_to_wchar,
	mb_wchar_to_jis,
	mb_check_jis,
	NULL,
};
4415
/* Filter table for the ISO-2022-JP -> wchar direction
 * Uses the same filter and flush functions as vtbl_jis_wchar */
static const struct mbfl_convert_vtbl vtbl_2022jp_wchar = {
	mbfl_no_encoding_2022jp,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis_wchar,
	mbfl_filt_conv_jis_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> ISO-2022-JP direction
 * Has its own filter function, but shares mbfl_filt_conv_any_jis_flush with JIS */
static const struct mbfl_convert_vtbl vtbl_wchar_2022jp = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jp,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_2022jp,
	mbfl_filt_conv_any_jis_flush,
	NULL,
};

/* Descriptor for the encoding named "ISO-2022-JP"
 * Decoding uses the same buffer-based function as JIS (mb_iso2022jp_to_wchar);
 * encoding and checking use mb_wchar_to_iso2022jp and mb_check_iso2022jp */
const mbfl_encoding mbfl_encoding_2022jp = {
	mbfl_no_encoding_2022jp,
	"ISO-2022-JP",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022jp_wchar,
	&vtbl_wchar_2022jp,
	mb_iso2022jp_to_wchar,
	mb_wchar_to_iso2022jp,
	mb_check_iso2022jp,
	NULL,
};
4450
/* NULL-terminated list of alternative names for ISO-2022-JP-MOBILE#KDDI */
static const char *mbfl_encoding_2022jp_kddi_aliases[] = {"ISO-2022-JP-KDDI", NULL};

/* Filter table for the ISO-2022-JP-MOBILE#KDDI -> wchar direction */
static const struct mbfl_convert_vtbl vtbl_2022jp_kddi_wchar = {
	mbfl_no_encoding_2022jp_kddi,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_2022jp_mobile_wchar,
	mbfl_filt_conv_2022jp_mobile_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> ISO-2022-JP-MOBILE#KDDI direction */
static const struct mbfl_convert_vtbl vtbl_wchar_2022jp_kddi = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jp_kddi,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_2022jp_mobile,
	mbfl_filt_conv_wchar_2022jp_mobile_flush,
	NULL,
};

/* Descriptor for the encoding named "ISO-2022-JP-MOBILE#KDDI"
 * The slot which holds a check function for JIS and ISO-2022-JP is NULL here */
const mbfl_encoding mbfl_encoding_2022jp_kddi = {
	mbfl_no_encoding_2022jp_kddi,
	"ISO-2022-JP-MOBILE#KDDI",
	"ISO-2022-JP",
	mbfl_encoding_2022jp_kddi_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022jp_kddi_wchar,
	&vtbl_wchar_2022jp_kddi,
	mb_iso2022jp_kddi_to_wchar,
	mb_wchar_to_iso2022jp_kddi,
	NULL,
	NULL,
};
4487
/* Filter table for the ISO-2022-JP-2004 -> wchar direction */
static const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = {
	mbfl_no_encoding_2022jp_2004,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis2004_wchar,
	mbfl_filt_conv_jis2004_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> ISO-2022-JP-2004 direction */
static const struct mbfl_convert_vtbl vtbl_wchar_2022jp_2004 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jp_2004,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_jis2004,
	mbfl_filt_conv_wchar_jis2004_flush,
	NULL,
};

/* Descriptor for the encoding named "ISO-2022-JP-2004"
 * Both name strings are identical; no aliases and no check function are registered */
const mbfl_encoding mbfl_encoding_2022jp_2004 = {
	mbfl_no_encoding_2022jp_2004,
	"ISO-2022-JP-2004",
	"ISO-2022-JP-2004",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022jp_2004_wchar,
	&vtbl_wchar_2022jp_2004,
	mb_iso2022jp2004_to_wchar,
	mb_wchar_to_iso2022jp2004,
	NULL,
	NULL,
};
4522
/* Previously, a dubious 'encoding' called 'cp50220raw' was supported
 * This was just CP50220, but the implementation was less strict regarding
 * invalid characters; it would silently pass some through
 * This 'encoding' only existed in mbstring. In case some poor, lost soul is
 * still using it, retain minimal support by aliasing it to CP50220
 *
 * Further, mbstring also had a made-up encoding called "JIS-ms"
 * This was the same as CP5022{0,1,2}, but without their special ways of
 * handling conversion of Unicode half-width katakana
 *
 * The list below is NULL-terminated and is referenced from mbfl_encoding_cp50220 */
static const char *cp50220_aliases[] = {"cp50220raw", "cp50220-raw", "JIS-ms", NULL};
4533
/* Filter tables for CP50220, CP50221 and CP50222
 * All three decode with the same functions (mbfl_filt_conv_cp5022x_wchar and its
 * flush); they differ only in the wchar -> CP5022x direction */

/* CP50220 -> wchar */
static const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {
	mbfl_no_encoding_cp50220,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp5022x_wchar,
	mbfl_filt_conv_cp5022x_wchar_flush,
	NULL,
};

/* wchar -> CP50220; has a dedicated flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_cp50220 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp50220,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp50220,
	mbfl_filt_conv_wchar_cp50220_flush,
	NULL,
};

/* CP50221 -> wchar */
static const struct mbfl_convert_vtbl vtbl_cp50221_wchar = {
	mbfl_no_encoding_cp50221,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp5022x_wchar,
	mbfl_filt_conv_cp5022x_wchar_flush,
	NULL,
};

/* wchar -> CP50221; unlike CP50220/CP50222 this one flushes with the generic
 * mbfl_filt_conv_any_jis_flush */
static const struct mbfl_convert_vtbl vtbl_wchar_cp50221 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp50221,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp50221,
	mbfl_filt_conv_any_jis_flush,
	NULL,
};

/* CP50222 -> wchar */
static const struct mbfl_convert_vtbl vtbl_cp50222_wchar = {
	mbfl_no_encoding_cp50222,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp5022x_wchar,
	mbfl_filt_conv_cp5022x_wchar_flush,
	NULL,
};

/* wchar -> CP50222; has a dedicated flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_cp50222 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp50222,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp50222,
	mbfl_filt_conv_wchar_cp50222_flush,
	NULL,
};
4593
/* Descriptor for the encoding named "CP50220"
 * This is the only CP5022x descriptor which carries an alias list (cp50220_aliases)
 * Decoding goes through mb_cp5022x_to_wchar, encoding through mb_wchar_to_cp50220 */
const mbfl_encoding mbfl_encoding_cp50220 = {
	mbfl_no_encoding_cp50220,
	"CP50220",
	"ISO-2022-JP",
	cp50220_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp50220_wchar,
	&vtbl_wchar_cp50220,
	mb_cp5022x_to_wchar,
	mb_wchar_to_cp50220,
	NULL,
	NULL,
};
4608
/* Descriptor for the encoding named "CP50221" (no aliases)
 * Decoding goes through mb_cp5022x_to_wchar, encoding through mb_wchar_to_cp50221 */
const mbfl_encoding mbfl_encoding_cp50221 = {
	mbfl_no_encoding_cp50221,
	"CP50221",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp50221_wchar,
	&vtbl_wchar_cp50221,
	mb_cp5022x_to_wchar,
	mb_wchar_to_cp50221,
	NULL,
	NULL,
};
4623
/* Descriptor for the encoding named "CP50222" (no aliases)
 * Decoding goes through mb_cp5022x_to_wchar, encoding through mb_wchar_to_cp50222 */
const mbfl_encoding mbfl_encoding_cp50222 = {
	mbfl_no_encoding_cp50222,
	"CP50222",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp50222_wchar,
	&vtbl_wchar_cp50222,
	mb_cp5022x_to_wchar,
	mb_wchar_to_cp50222,
	NULL,
	NULL,
};
4638
/* NULL-terminated list of alternative names for ISO-2022-JP-MS */
static const char *mbfl_encoding_2022jpms_aliases[] = {"ISO2022JPMS", NULL};

/* Filter table for the ISO-2022-JP-MS -> wchar direction */
static const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {
	mbfl_no_encoding_2022jpms,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_2022jpms_wchar,
	mbfl_filt_conv_2022jpms_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> ISO-2022-JP-MS direction */
static const struct mbfl_convert_vtbl vtbl_wchar_2022jpms = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jpms,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_2022jpms,
	mbfl_filt_conv_any_2022jpms_flush,
	NULL,
};

/* Descriptor for the encoding named "ISO-2022-JP-MS"
 * Buffer-based conversion is done by mb_iso2022jpms_to_wchar and
 * mb_wchar_to_iso2022jpms; no check function is registered */
const mbfl_encoding mbfl_encoding_2022jpms = {
	mbfl_no_encoding_2022jpms,
	"ISO-2022-JP-MS",
	"ISO-2022-JP",
	mbfl_encoding_2022jpms_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022jpms_wchar,
	&vtbl_wchar_2022jpms,
	mb_iso2022jpms_to_wchar,
	mb_wchar_to_iso2022jpms,
	NULL,
	NULL,
};
4675
/* ISO-2022-KR is defined in RFC 1557
 *
 * The RFC says that ESC $ ) C must appear once in an ISO-2022-KR string,
 * at the beginning of a line, before any instances of the Shift In or
 * Shift Out bytes which are used to switch between ASCII/KSC 5601 modes
 *
 * We don't enforce that for ISO-2022-KR input */
4683
/* Filter table for the wchar -> ISO-2022-KR direction */
static const struct mbfl_convert_vtbl vtbl_wchar_2022kr = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022kr,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_2022kr,
	mbfl_filt_conv_any_2022kr_flush,
	NULL,
};

/* Filter table for the ISO-2022-KR -> wchar direction */
static const struct mbfl_convert_vtbl vtbl_2022kr_wchar = {
	mbfl_no_encoding_2022kr,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_2022kr_wchar,
	mbfl_filt_conv_2022kr_wchar_flush,
	NULL,
};

/* Descriptor for the encoding named "ISO-2022-KR"
 * References the two filter tables above plus the buffer-based converters
 * mb_iso2022kr_to_wchar and mb_wchar_to_iso2022kr; no aliases or check function */
const mbfl_encoding mbfl_encoding_2022kr = {
	mbfl_no_encoding_2022kr,
	"ISO-2022-KR",
	"ISO-2022-KR",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022kr_wchar,
	&vtbl_wchar_2022kr,
	mb_iso2022kr_to_wchar,
	mb_wchar_to_iso2022kr,
	NULL,
	NULL,
};
4718
4719 /*
4720 * SJIS variants
4721 */
4722
/* Byte-at-a-time decoder: Shift-JIS -> wchar
 *
 * filter->status is 0 when idle and 1 after a double-byte lead byte has been
 * stored in filter->cache. Each result (a codepoint or MBFL_BAD_INPUT) goes to
 * filter->output_function. Returns 0 when control reaches the end. */
static int mbfl_filt_conv_sjis_wchar(int c, mbfl_convert_filter *filter)
{
	if (filter->status == 0) {
		if (c >= 0 && c < 0x80) {
			/* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xA0 && c < 0xE0) {
			/* Kana; single bytes 0xA1..0xDF map to 0xFEC0 + byte */
			CK((*filter->output_function)(0xFEC0 + c, filter->data));
		} else if (c > 0x80 && c < 0xF0 && c != 0xA0) {
			/* Kanji, first byte */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
	} else if (filter->status == 1) {
		/* Kanji, second byte */
		int lead = filter->cache;
		int row, cell;
		int result = MBFL_BAD_INPUT;

		filter->status = 0;

		if (c >= 0x40 && c <= 0xFC && c != 0x7F) {
			SJIS_DECODE(lead, c, row, cell);
			int idx = (row - 0x21)*94 + cell - 0x21;
			if (idx >= 0 && idx < jisx0208_ucs_table_size) {
				int mapped = jisx0208_ucs_table[idx];
				/* A zero entry means the table has no character at this position */
				if (mapped) {
					result = mapped;
				}
			}
		}

		CK((*filter->output_function)(result, filter->data));
	}

	return 0;
}
4762
/* Flush for the Shift-JIS -> wchar filter
 *
 * Any leftover status other than 0 or 4 is reported as one MBFL_BAD_INPUT
 * (status 1 means a lead byte was seen with no second byte)
 * The status is then reset and the downstream flush function, if set, is called */
static int mbfl_filt_conv_sjis_wchar_flush(mbfl_convert_filter *filter)
{
	const int leftover = filter->status;

	if (leftover != 0 && leftover != 4) {
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}

	(*filter->flush_function)(filter->data);
	return 0;
}
4776
/* Byte-at-a-time encoder: wchar -> Shift-JIS
 *
 * Looks `c` up in the ucs_*_jis tables, falls back to a handful of explicit
 * substitutions when there is no table entry, and rejects table values
 * >= 0x8080 (JIS X 0212). Values below 0x100 are written as one byte; anything
 * else is split into row/cell and run through SJIS_ENCODE to get two bytes.
 * Unconvertible codepoints go to mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
{
	int jis = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		jis = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		jis = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		jis = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		jis = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (jis <= 0) {
		switch (c) {
		case 0xA5: /* YEN SIGN */
			jis = 0x216F; /* FULLWIDTH YEN SIGN */
			break;
		case 0xAF:   /* MACRON */
		case 0x203E: /* OVERLINE */
			jis = 0x2131; /* FULLWIDTH MACRON */
			break;
		case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
			jis = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			jis = 0x2142;
			break;
		case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
			jis = 0x215D;
			break;
		case 0xFFE0: /* FULLWIDTH CENT SIGN */
			jis = 0x2171;
			break;
		case 0xFFE1: /* FULLWIDTH POUND SIGN */
			jis = 0x2172;
			break;
		case 0xFFE2: /* FULLWIDTH NOT SIGN */
			jis = 0x224C;
			break;
		case 0: /* NUL is written as a zero byte */
			jis = 0;
			break;
		default:
			jis = -1;
			break;
		}
	} else if (jis >= 0x8080) { /* JIS X 0212; not supported */
		jis = -1;
	}

	if (jis < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (jis < 0x100) { /* Latin/Kana */
		CK((*filter->output_function)(jis, filter->data));
		return 0;
	}

	/* Kanji */
	int row = (jis >> 8) & 0xFF;
	int cell = jis & 0xFF;
	int byte1, byte2;
	SJIS_ENCODE(row, cell, byte1, byte2);
	CK((*filter->output_function)(byte1, filter->data));
	CK((*filter->output_function)(byte2, filter->data));

	return 0;
}
4832
4833 static const unsigned short sjis_decode_tbl1[] = {
4834 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFFFF, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632, 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640, 0xFFFF, -6016, -5828, -5640, -5452, -5264, -5076, -4888, -4700, -4512, -4324, -4136, -3948, -3760, -3572, -3384, -3196, -3008, -2820, -2632, -2444, -2256, -2068, -1880, -1692, -1504, -1316, -1128, -940, -752, -564, -376, -188, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632, 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640, 5828, 6016, 6204, 6392, 6580, 6768, 6956, 7144, 7332, 7520, 7708, 7896, 8084, 8272, 8460, 8648, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF
4835 };
4836
4837 static const unsigned short sjis_decode_tbl2[] = {
4838 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 0xFFFF, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 0xFFFF, 0xFFFF, 0xFFFF
4839 };
4840
mb_sjis_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)4841 static size_t mb_sjis_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
4842 {
4843 unsigned char *p = *in, *e = p + *in_len;
4844 uint32_t *out = buf, *limit = buf + bufsize;
4845
4846 e--; /* Stop the main loop 1 byte short of the end of the input */
4847
4848 while (p < e && out < limit) {
4849 unsigned char c = *p++;
4850
4851 if (c <= 0x7F) {
4852 *out++ = c;
4853 } else if (c >= 0xA1 && c <= 0xDF) { /* Kana */
4854 *out++ = 0xFEC0 + c;
4855 } else {
4856 /* Don't need to check p < e; it's not possible to go out of bounds here, due to e-- above */
4857 unsigned char c2 = *p++;
4858 /* This is only legal if c2 >= 0x40 && c2 <= 0xFC && c2 != 0x7F
4859 * But the values in the above conversion tables have been chosen such that
4860 * illegal values of c2 will always result in w > jisx0208_ucs_table_size,
4861 * so we don't need to do a separate bounds check on c2
4862 * Likewise, the values in the conversion tables are such that illegal values
4863 * for c will always result in w > jisx0208_ucs_table_size */
4864 uint32_t w = sjis_decode_tbl1[c] + sjis_decode_tbl2[c2];
4865 if (w < jisx0208_ucs_table_size) {
4866 w = jisx0208_ucs_table[w];
4867 if (!w)
4868 w = MBFL_BAD_INPUT;
4869 *out++ = w;
4870 } else {
4871 if (c == 0x80 || c == 0xA0 || c > 0xEF) {
4872 p--;
4873 }
4874 *out++ = MBFL_BAD_INPUT;
4875 }
4876 }
4877 }
4878
4879 /* Finish up last byte of input string if there is one */
4880 if (p == e && out < limit) {
4881 unsigned char c = *p++;
4882 if (c <= 0x7F) {
4883 *out++ = c;
4884 } else if (c >= 0xA1 && c <= 0xDF) {
4885 *out++ = 0xFEC0 + c;
4886 } else {
4887 *out++ = MBFL_BAD_INPUT;
4888 }
4889 }
4890
4891 *in_len = e - p + 1;
4892 *in = p;
4893 return out - buf;
4894 }
4895
/* Buffer-based encoder: wchar -> Shift-JIS
 *
 * Converts `len` codepoints from `in` and appends the bytes to `buf`
 * Space for two bytes per remaining codepoint is reserved up front and again
 * after every MB_CONVERT_ERROR call
 * `end` is not used; nothing extra is written at the end of the output */
static void mb_wchar_to_sjis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int jis = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			jis = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			jis = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			jis = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			jis = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (jis >= 0x8080) { /* JIS X 0212; not supported */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			continue;
		}

		if (jis == 0) {
			/* No table entry; try the explicit substitutions */
			switch (w) {
			case 0xA5: /* YEN SIGN */
				jis = 0x216F; /* FULLWIDTH YEN SIGN */
				break;
			case 0xAF:
			case 0x203E:
				jis = 0x2131; /* FULLWIDTH MACRON */
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				jis = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				jis = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				jis = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				jis = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				jis = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				jis = 0x224C;
				break;
			case 0: /* NUL is written as a zero byte */
				break;
			default: {
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue; /* next codepoint */
			}
			}
		}

		if (jis > 0xFF) {
			/* Kanji */
			unsigned int row = (jis >> 8) & 0xFF, cell = jis & 0xFF;
			unsigned int byte1, byte2;
			SJIS_ENCODE(row, cell, byte1, byte2);
			out = mb_convert_buf_add2(out, byte1, byte2);
		} else {
			/* Latin/Kana */
			out = mb_convert_buf_add(out, jis);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
4957
/* Streaming decoder for SJIS-mac: takes one input byte `c` per call and passes
 * the resulting Unicode codepoint(s) to filter->output_function.
 *
 * filter->status is 0 between characters and 1 after the first byte of a
 * double-byte character has been stored in filter->cache.
 * Returns 0 on the normal path; CK() (defined outside this block) may return
 * early when an output call fails. */
static int mbfl_filt_conv_sjis_mac_wchar(int c, mbfl_convert_filter *filter)
{
	int row, col, last;
	int lead, kuten, jis1, jis2, mark, w;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* 7-bit bytes map to themselves, except 0x5C -> U+00A5 YEN SIGN */
			CK((*filter->output_function)(c == 0x5c ? 0x00a5 : c, filter->data));
		} else if (c > 0xa0 && c < 0xe0) { /* kana */
			CK((*filter->output_function)(0xfec0 + c, filter->data));
		} else if (c > 0x80 && c <= 0xed && c != 0xa0) { /* first byte of a double-byte character */
			filter->status = 1;
			filter->cache = c;
		} else {
			/* Remaining single-byte codes which have a dedicated mapping */
			switch (c) {
			case 0x80:
				CK((*filter->output_function)(0x005c, filter->data)); /* REVERSE SOLIDUS */
				break;
			case 0xa0:
				CK((*filter->output_function)(0x00a0, filter->data)); /* NO-BREAK SPACE */
				break;
			case 0xfd:
				CK((*filter->output_function)(0x00a9, filter->data)); /* COPYRIGHT SIGN */
				break;
			case 0xfe:
				CK((*filter->output_function)(0x2122, filter->data)); /* TRADE MARK SIGN */
				break;
			case 0xff:
				/* One byte becomes two codepoints: U+2026 followed by U+F87F */
				CK((*filter->output_function)(0x2026, filter->data));
				CK((*filter->output_function)(0xf87f, filter->data));
				break;
			default:
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		}
		break;

	case 1: /* second byte of a double-byte character */
		filter->status = 0;
		lead = filter->cache;

		if (c < 0x40 || c > 0xfc || c == 0x7f) {
			/* Not a valid trailing byte */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		SJIS_DECODE(lead, c, jis1, jis2);
		kuten = (jis1 - 0x21)*94 + jis2 - 0x21;

		/* A few positions are mapped differently from the JIS X 0208 table */
		switch (kuten) {
		case 0x1c: w = 0x2014; break; /* EM DASH */
		case 0x1f: w = 0xff3c; break; /* FULLWIDTH REVERSE SOLIDUS */
		case 0x20: w = 0x301c; break; /* WAVE DASH */
		case 0x21: w = 0x2016; break; /* DOUBLE VERTICAL LINE */
		case 0x3c: w = 0x2212; break; /* MINUS SIGN */
		case 0x50: w = 0x00a2; break; /* CENT SIGN */
		case 0x51: w = 0x00a3; break; /* POUND SIGN */
		case 0x89: w = 0x00ac; break; /* NOT SIGN */
		default:   w = 0;      break;
		}

		/* apple gaiji area 0x8540 - 0x886d: contiguous ranges {first, last, base codepoint} */
		if (w == 0) {
			for (row = 0; row < 7; row++) {
				if (kuten >= code_tbl[row][0] && kuten <= code_tbl[row][1]) {
					w = kuten - code_tbl[row][0] + code_tbl[row][2];
					break;
				}
			}
		}

		/* Codes which expand to a multi-codepoint sequence. Column 1 of the
		 * matching row (U+F860, U+F861 or anything else) determines how many
		 * columns are emitted; the last one is emitted at the bottom of this case */
		if (w == 0) {
			for (row = 0; row < code_tbl_m_len; row++) {
				if (kuten != code_tbl_m[row][0]) {
					continue;
				}
				if (code_tbl_m[row][1] == 0xf860) {
					last = 3;
				} else if (code_tbl_m[row][1] == 0xf861) {
					last = 4;
				} else {
					last = 5;
				}
				for (col = 1; col < last; col++) {
					CK((*filter->output_function)(code_tbl_m[row][col], filter->data));
				}
				w = code_tbl_m[row][last];
				break;
			}
		}

		/* Codes looked up through the per-range maps; some of them are followed
		 * by an extra marker codepoint */
		if (w == 0) {
			for (row = 0; row < 8; row++) {
				if (kuten < code_ofst_tbl[row][0] || kuten > code_ofst_tbl[row][1]) {
					continue;
				}

				w = code_map[row][kuten - code_ofst_tbl[row][0]];
				if (w == 0) {
					/* Inside a mapped range, but no codepoint assigned */
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
					return 0;
				}

				mark = 0;
				if (kuten >= 0x043e && kuten <= 0x0441) {
					mark = 0xf87a;
				} else if (kuten == 0x03b1 || kuten == 0x03b7) {
					mark = 0xf87f;
				} else if (kuten == 0x04b8 || kuten == 0x04b9 || kuten == 0x04c4) {
					mark = 0x20dd;
				} else if (kuten == 0x1ed9 || kuten == 0x1eda || kuten == 0x1ee8 || kuten == 0x1ef3 ||
						   (kuten >= 0x1ef5 && kuten <= 0x1efb) || kuten == 0x1f05 || kuten == 0x1f06 ||
						   kuten == 0x1f18 || (kuten >= 0x1ff2 && kuten <= 0x20a5)) {
					mark = 0xf87e;
				}

				if (mark > 0) {
					/* Base codepoint goes out now; the marker is emitted below */
					CK((*filter->output_function)(w, filter->data));
					w = mark;
				}
				break;
			}
		}

		if (w == 0 && kuten >= 0 && kuten < jisx0208_ucs_table_size) { /* X 0208 */
			w = jisx0208_ucs_table[kuten];
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
5094
/* Emit the two SJIS bytes for `kuten`, a 0-based (row * 94 + cell) index.
 * Returns 0, or whatever CK() returns when an output call fails. */
static int sjis_mac_emit_kuten_bytes(int kuten, mbfl_convert_filter *filter)
{
	int c1 = kuten/94+0x21;
	int c2 = kuten-94*(c1-0x21)+0x21;
	int b1, b2;

	SJIS_ENCODE(c1, c2, b1, b2);
	CK((*filter->output_function)(b1, filter->data));
	CK((*filter->output_function)(b2, filter->data));
	return 0;
}

/* Streaming encoder for SJIS-mac: takes one Unicode codepoint `c` per call and
 * passes the resulting bytes to filter->output_function.
 *
 * filter->status values used here:
 *   0  no pending input
 *   1  filter->cache holds a codepoint from s_form_tbl which may combine with
 *      the next codepoint (U+F87A, U+20DD, U+F87F or U+F87E)
 *   2  filter->cache holds a transcoding hint (U+F860, U+F861 or U+F862)
 *   3-5  part-way through a transcoding hint sequence; filter->cache holds the
 *      previous codepoint in its low 16 bits and a mode flag (1, 2 or 4 for
 *      U+F860, U+F861, U+F862) in bits 16-19
 *
 * Returns 0 on the normal path; CK() may return early when an output call fails. */
static int mbfl_filt_conv_wchar_sjis_mac(int c, mbfl_convert_filter *filter)
{
	int i, c1, c2, s1 = 0, s2 = 0, mode;
	int first, count, flag;

	// a1: U+0000 -> U+046F
	// a2: U+2000 -> U+30FF
	// i: U+4E00 -> U+9FFF
	// r: U+FF00 -> U+FFFF

	switch (filter->status) {
	case 1:
		c1 = filter->cache;
		filter->cache = filter->status = 0;

		/* Each combining codepoint pairs only with one slice of s_form_tbl */
		switch (c) {
		case 0xf87a: first = 34+3+3; count = 4;  break;
		case 0x20dd: first = 34+3;   count = 3;  break;
		case 0xf87f: first = 34;     count = 3;  break;
		case 0xf87e: first = 0;      count = 34; break;
		default:     first = 0;      count = -1; break; /* `c` does not combine at all */
		}

		if (count < 0) {
			s2 = c1;
			s1 = c;
		} else {
			for (i = first; i < first + count; i++) {
				if (c1 == s_form_tbl[i]) {
					s1 = s_form_sjis_tbl[i];
					break;
				}
			}
			if (s1 <= 0) {
				/* No combined form for this pair */
				s2 = c1;
				if (c == 0xf87f || c == 0xf87e) {
					s1 = -1;
				}
			}
		}

		/* s2 > 0 means the pending codepoint has to be written by itself,
		 * using its fallback SJIS code */
		if (s2 > 0) {
			for (i = 0; i < s_form_tbl_len; i++) {
				if (c1 == s_form_tbl[i]) {
					s1 = s_form_sjis_fallback_tbl[i];
					break;
				}
			}
		}

		if (s1 < 0) {
			CK(mbfl_filt_conv_illegal_output(c, filter));
		} else if (s1 < 0x100) {
			CK((*filter->output_function)(s1, filter->data));
		} else {
			CK((*filter->output_function)((s1 >> 8) & 0xff, filter->data));
			CK((*filter->output_function)(s1 & 0xff, filter->data));
		}

		if (s2 <= 0 || s1 == -1) {
			break;
		}
		/* `c` was not consumed above; handle it as a fresh codepoint */
		s1 = s2 = 0;
		ZEND_FALLTHROUGH;

	case 0:
		/* Generic UCS -> JIS tables, with a few SJIS-mac specific overrides */
		if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
			if (c == 0x5c) {
				s1 = 0x80;
			} else if (c == 0xa9) {
				s1 = 0xfd;
			} else {
				s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
			}
		} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
			if (c == 0x2122) {
				s1 = 0xfe;
			} else if (c == 0x2014) {
				s1 = 0x213d;
			} else if (c == 0x2116) {
				s1 = 0x2c1d;
			} else {
				s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
			}
		} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
			s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
		} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
			s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
		}

		if (c >= 0x2000) {
			/* Codepoints which may combine with the following one are held back */
			for (i = 0; i < s_form_tbl_len; i++) {
				if (c == s_form_tbl[i]) {
					filter->status = 1;
					filter->cache = c;
					return 0;
				}
			}

			if (c == 0xf860 || c == 0xf861 || c == 0xf862) {
				/* Apple 'transcoding hint' codepoints (from private use area) */
				filter->status = 2;
				filter->cache = c;
				return 0;
			}
		}

		if (s1 <= 0) {
			if (c == 0xa0) {
				s1 = 0x00a0;
			} else if (c == 0xa5) { /* YEN SIGN */
				/* Unicode has codepoint 0xFFE5 for a fullwidth Yen sign;
				 * convert codepoint 0xA5 to halfwidth Yen sign */
				s1 = 0x5c; /* HALFWIDTH YEN SIGN */
			} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
				s1 = 0x2140;
			}
		}

		if (s1 <= 0) {
			/* SJIS-mac specific tables; these yield a kuten index */
			for (i = 0; i < wchar2sjis_mac_r_tbl_len; i++) {
				if (c >= wchar2sjis_mac_r_tbl[i][0] && c <= wchar2sjis_mac_r_tbl[i][1]) {
					s1 = c - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2];
					break;
				}
			}

			if (s1 <= 0) {
				for (i = 0; i < wchar2sjis_mac_r_map_len; i++) {
					if (c >= wchar2sjis_mac_r_map[i][0] && c <= wchar2sjis_mac_r_map[i][1]) {
						s1 = wchar2sjis_mac_code_map[i][c - wchar2sjis_mac_r_map[i][0]];
						break;
					}
				}
			}

			if (s1 <= 0) {
				for (i = 0; i < wchar2sjis_mac_wchar_tbl_len; i++) {
					if (c == wchar2sjis_mac_wchar_tbl[i][0]) {
						s1 = wchar2sjis_mac_wchar_tbl[i][1] & 0xffff;
						break;
					}
				}
			}

			if (s1 > 0) {
				/* kuten index -> two-byte JIS code; s2 = 1 marks the value as
				 * coming from the SJIS-mac tables */
				c1 = s1/94+0x21;
				c2 = s1-94*(c1-0x21)+0x21;
				s1 = (c1 << 8) | c2;
				s2 = 1;
			}
		}

		if ((s1 <= 0) || (s1 >= 0x8080 && s2 == 0)) { /* not found or X 0212 */
			/* Only codepoint 0 is allowed to produce byte 0 */
			s1 = (c == 0) ? 0 : -1;
		}

		if (s1 < 0) {
			CK(mbfl_filt_conv_illegal_output(c, filter));
		} else if (s1 < 0x100) { /* latin or kana */
			CK((*filter->output_function)(s1, filter->data));
		} else { /* kanji */
			c1 = (s1 >> 8) & 0xff;
			c2 = s1 & 0xff;
			SJIS_ENCODE(c1, c2, s1, s2);
			CK((*filter->output_function)(s1, filter->data));
			CK((*filter->output_function)(s2, filter->data));
		}
		break;

	case 2:
		c1 = filter->cache;
		filter->cache = 0;
		filter->status = 0;

		/* Rows of code_tbl_m searched for each hint: 0-4 for U+F860,
		 * 5-7 for U+F861, 8-11 for U+F862 */
		first = count = flag = 0;
		if (c1 == 0xf860) {
			count = 5;
			flag = 0x10000;
		} else if (c1 == 0xf861) {
			first = 5;
			count = 3;
			flag = 0x20000;
		} else if (c1 == 0xf862) {
			first = 5+3;
			count = 4;
			flag = 0x40000;
		}

		for (i = 0; i < count; i++) {
			if (c == code_tbl_m[first + i][2]) {
				filter->cache = c | flag;
				filter->status = 3;
				break;
			}
		}

		if (filter->status == 0) {
			/* Didn't find any of expected codepoints after Apple transcoding hint */
			CK(mbfl_filt_conv_illegal_output(c1, filter));
			return mbfl_filt_conv_wchar_sjis_mac(c, filter);
		}
		break;

	case 3:
		s1 = 0;
		c1 = filter->cache & 0xffff;
		mode = (filter->cache & 0xf0000) >> 16;

		filter->cache = filter->status = 0;

		if (mode == 0x1) {
			/* U+F860 sequences end here */
			for (i = 0; i < 5; i++) {
				if (c1 == code_tbl_m[i][2] && c == code_tbl_m[i][3]) {
					s1 = code_tbl_m[i][0];
					break;
				}
			}

			if (s1 > 0) {
				CK(sjis_mac_emit_kuten_bytes(s1, filter));
			} else {
				CK(mbfl_filt_conv_illegal_output(0xf860, filter));
				CK(mbfl_filt_conv_illegal_output(c1, filter));
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		} else if (mode == 0x2 || mode == 0x4) {
			/* Longer sequences: keep going if the pair matches; if it does
			 * not, nothing is emitted and the state stays reset */
			first = (mode == 0x2) ? 5 : 8;
			count = (mode == 0x2) ? 3 : 4;
			for (i = 0; i < count; i++) {
				if (c1 == code_tbl_m[first + i][2] && c == code_tbl_m[first + i][3]) {
					filter->cache = c | (mode << 16);
					filter->status = 4;
					break;
				}
			}
		}
		break;

	case 4:
		s1 = 0;
		c1 = filter->cache & 0xffff;
		mode = (filter->cache & 0xf0000) >> 16;

		filter->cache = 0;
		filter->status = 0;

		if (mode == 0x2) {
			/* U+F861 sequences end here */
			for (i = 0; i < 3; i++) {
				if (c1 == code_tbl_m[i+5][3] && c == code_tbl_m[i+5][4]) {
					s1 = code_tbl_m[i+5][0];
					break;
				}
			}

			if (s1 > 0) {
				CK(sjis_mac_emit_kuten_bytes(s1, filter));
			} else {
				/* Report every codepoint of the failed sequence; the one
				 * before c1 is recovered from the table */
				CK(mbfl_filt_conv_illegal_output(0xf861, filter));
				for (i = 0; i < 3; i++) {
					if (c1 == code_tbl_m[i+5][3]) {
						CK(mbfl_filt_conv_illegal_output(code_tbl_m[i+5][2], filter));
						break;
					}
				}
				CK(mbfl_filt_conv_illegal_output(c1, filter));
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		} else if (mode == 0x4) {
			for (i = 0; i < 4; i++) {
				if (c1 == code_tbl_m[i+8][3] && c == code_tbl_m[i+8][4]) {
					filter->cache = c | 0x40000;
					filter->status = 5;
					break;
				}
			}
		}
		break;

	case 5:
		s1 = 0;
		c1 = filter->cache & 0xffff;
		mode = (filter->cache & 0xf0000) >> 16;

		filter->cache = filter->status = 0;

		if (mode == 0x4) {
			/* U+F862 sequences end here */
			for (i = 0; i < 4; i++) {
				if (c1 == code_tbl_m[i+8][4] && c == code_tbl_m[i+8][5]) {
					s1 = code_tbl_m[i+8][0];
					break;
				}
			}

			if (s1 > 0) {
				CK(sjis_mac_emit_kuten_bytes(s1, filter));
			} else {
				CK(mbfl_filt_conv_illegal_output(0xf862, filter));
				for (i = 0; i < 4; i++) {
					if (c1 == code_tbl_m[i+8][4]) {
						CK(mbfl_filt_conv_illegal_output(code_tbl_m[i+8][2], filter));
						CK(mbfl_filt_conv_illegal_output(code_tbl_m[i+8][3], filter));
						break;
					}
				}
				CK(mbfl_filt_conv_illegal_output(c1, filter));
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
5461
/* Flush handler for the Unicode -> SJIS-mac filter.
 *
 * If a codepoint from s_form_tbl is still being held back (status 1), write
 * the two bytes of its fallback SJIS code. Other pending states are discarded
 * without output. The filter state is then cleared and the next filter's
 * flush function, if any, is called and its result returned. */
static int mbfl_filt_conv_wchar_sjis_mac_flush(mbfl_convert_filter *filter)
{
	if (filter->status == 1 && filter->cache > 0) {
		int pending = filter->cache;
		int fallback = 0;

		for (int k = 0; k < s_form_tbl_len; k++) {
			if (pending == s_form_tbl[k]) {
				fallback = s_form_sjis_fallback_tbl[k];
				break;
			}
		}

		if (fallback > 0) {
			CK((*filter->output_function)((fallback >> 8) & 0xff, filter->data));
			CK((*filter->output_function)(fallback & 0xff, filter->data));
		}
	}

	filter->cache = 0;
	filter->status = 0;

	if (filter->flush_function == NULL) {
		return 0;
	}
	return (*filter->flush_function)(filter->data);
}
5487
mb_sjismac_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)5488 static size_t mb_sjismac_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
5489 {
5490 /* A single SJIS-Mac kuten code can convert to up to 5 Unicode codepoints, oh my! */
5491 ZEND_ASSERT(bufsize >= 5);
5492
5493 unsigned char *p = *in, *e = p + *in_len;
5494 uint32_t *out = buf, *limit = buf + bufsize;
5495
5496 while (p < e && out < limit) {
5497 unsigned char c = *p++;
5498
5499 if (c <= 0x80 || c == 0xA0) {
5500 if (c == 0x5C) {
5501 *out++ = 0xA5;
5502 } else if (c == 0x80) {
5503 *out++ = 0x5C;
5504 } else {
5505 *out++ = c;
5506 }
5507 } else if (c >= 0xA1 && c <= 0xDF) {
5508 *out++ = 0xFEC0 + c;
5509 } else if (c <= 0xED) {
5510 if (p == e) {
5511 *out++ = MBFL_BAD_INPUT;
5512 break;
5513 }
5514 unsigned char c2 = *p++;
5515 uint32_t w = sjis_decode_tbl1[c] + sjis_decode_tbl2[c2];
5516
5517 if (w <= 0x89) {
5518 if (w == 0x1C) {
5519 *out++ = 0x2014; /* EM DASH */
5520 continue;
5521 } else if (w == 0x1F) {
5522 *out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
5523 continue;
5524 } else if (w == 0x20) {
5525 *out++ = 0x301C; /* FULLWIDTH TILDE */
5526 continue;
5527 } else if (w == 0x21) {
5528 *out++ = 0x2016; /* PARALLEL TO */
5529 continue;
5530 } else if (w == 0x3C) {
5531 *out++ = 0x2212; /* FULLWIDTH HYPHEN-MINUS */
5532 continue;
5533 } else if (w == 0x50) {
5534 *out++ = 0xA2; /* FULLWIDTH CENT SIGN */
5535 continue;
5536 } else if (w == 0x51) {
5537 *out++ = 0xA3; /* FULLWIDTH POUND SIGN */
5538 continue;
5539 } else if (w == 0x89) {
5540 *out++ = 0xAC; /* FULLWIDTH NOT SIGN */
5541 continue;
5542 }
5543 } else {
5544 if (w >= 0x2F0 && w <= 0x3A3) {
5545 for (int i = 0; i < 7; i++) {
5546 if (w >= code_tbl[i][0] && w <= code_tbl[i][1]) {
5547 *out++ = w - code_tbl[i][0] + code_tbl[i][2];
5548 goto next_iteration;
5549 }
5550 }
5551 }
5552
5553 if (w >= 0x340 && w <= 0x523) {
5554 for (int i = 0; i < code_tbl_m_len; i++) {
5555 if (w == code_tbl_m[i][0]) {
5556 int n = 5;
5557 if (code_tbl_m[i][1] == 0xF860) {
5558 n = 3;
5559 } else if (code_tbl_m[i][1] == 0xF861) {
5560 n = 4;
5561 }
5562 if ((limit - out) < n) {
5563 p -= 2;
5564 goto finished;
5565 }
5566 for (int j = 1; j <= n; j++) {
5567 *out++ = code_tbl_m[i][j];
5568 }
5569 goto next_iteration;
5570 }
5571 }
5572 }
5573
5574 if (w >= 0x3AC && w <= 0x20A5) {
5575 for (int i = 0; i < 8; i++) {
5576 if (w >= code_ofst_tbl[i][0] && w <= code_ofst_tbl[i][1]) {
5577 uint32_t w2 = code_map[i][w - code_ofst_tbl[i][0]];
5578 if (!w2) {
5579 *out++ = MBFL_BAD_INPUT;
5580 goto next_iteration;
5581 }
5582 if ((limit - out) < 2) {
5583 p -= 2;
5584 goto finished;
5585 }
5586 *out++ = w2;
5587 if (w >= 0x43E && w <= 0x441) {
5588 *out++ = 0xF87A;
5589 } else if (w == 0x3B1 || w == 0x3B7) {
5590 *out++ = 0xF87F;
5591 } else if (w == 0x4B8 || w == 0x4B9 || w == 0x4C4) {
5592 *out++ = 0x20DD;
5593 } else if (w == 0x1ED9 || w == 0x1EDA || w == 0x1EE8 || w == 0x1EF3 || (w >= 0x1EF5 && w <= 0x1EFB) || w == 0x1F05 || w == 0x1F06 || w == 0x1F18 || (w >= 0x1FF2 && w <= 0x20A5)) {
5594 *out++ = 0xF87E;
5595 }
5596 goto next_iteration;
5597 }
5598 }
5599 }
5600 }
5601
5602 if (w < jisx0208_ucs_table_size) {
5603 w = jisx0208_ucs_table[w];
5604 if (!w)
5605 w = MBFL_BAD_INPUT;
5606 *out++ = w;
5607 } else {
5608 *out++ = MBFL_BAD_INPUT;
5609 }
5610 } else if (c == 0xFD) {
5611 *out++ = 0xA9;
5612 } else if (c == 0xFE) {
5613 *out++ = 0x2122;
5614 } else if (c == 0xFF) {
5615 if ((limit - out) < 2) {
5616 p--;
5617 break;
5618 }
5619 *out++ = 0x2026;
5620 *out++ = 0xF87F;
5621 } else {
5622 *out++ = MBFL_BAD_INPUT;
5623 }
5624 next_iteration: ;
5625 }
5626
5627 finished:
5628 *in_len = e - p;
5629 *in = p;
5630 return out - buf;
5631 }
5632
process_s_form(uint32_t w,uint32_t w2,unsigned int * s)5633 static bool process_s_form(uint32_t w, uint32_t w2, unsigned int *s)
5634 {
5635 if (w2 == 0xF87A) {
5636 for (int i = 0; i < 4; i++) {
5637 if (w == s_form_tbl[i+34+3+3]) {
5638 *s = s_form_sjis_tbl[i+34+3+3];
5639 return true;
5640 }
5641 }
5642 } else if (w2 == 0x20DD) {
5643 for (int i = 0; i < 3; i++) {
5644 if (w == s_form_tbl[i+34+3]) {
5645 *s = s_form_sjis_tbl[i+34+3];
5646 return true;
5647 }
5648 }
5649 } else if (w2 == 0xF87F) {
5650 for (int i = 0; i < 3; i++) {
5651 if (w == s_form_tbl[i+34]) {
5652 *s = s_form_sjis_tbl[i+34];
5653 return true;
5654 }
5655 }
5656 } else if (w2 == 0xF87E) {
5657 for (int i = 0; i < 34; i++) {
5658 if (w == s_form_tbl[i]) {
5659 *s = s_form_sjis_tbl[i];
5660 return true;
5661 }
5662 }
5663 }
5664
5665 return false;
5666 }
5667
/* For codepoints F860-F862, which are treated specially in MacJapanese.
 * Indexed by (hint codepoint - 0xF860); the value is the highest column of a
 * code_tbl_m row which belongs to a sequence starting with that hint.
 * Read-only lookup table, hence const. */
static const int transcoding_hint_cp_width[3] = { 3, 4, 5 };
5670
/* Encode `len` Unicode codepoints from `in` as SJIS-mac bytes appended to `buf`.
 *
 * Some codepoints can only be encoded once the following codepoint(s) are
 * known. If the input chunk ends in the middle of such a sequence and `end` is
 * false, the position is saved in buf->state and picked up by the next call:
 *   - bits 0-15:  the pending codepoint (s_form_tbl entry or transcoding hint)
 *   - bits 16-23: next column of code_tbl_m to compare (only set when part-way
 *                 through a transcoding hint sequence; always >= 3)
 *   - bits 24-31: row of code_tbl_m being matched (same condition; may be 0)
 * If `end` is true, pending input is written out (fallback code) or reported
 * through MB_CONVERT_ERROR instead. */
static void mb_wchar_to_sjismac(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;

	if (buf->state) {
		w = buf->state & 0xFFFF;
		/* The row number in bits 24-31 is 0 for the first row of code_tbl_m,
		 * so the column index in bits 16-23 (never 0 for a suspended sequence)
		 * has to be tested as well; otherwise a suspended sequence on row 0
		 * would be mistaken for a lone pending codepoint */
		if (buf->state & 0xFFFF0000U) {
			goto resume_transcoding_hint;
		} else {
			buf->state = 0;
			goto process_codepoint;
		}
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		unsigned int s = 0;

		/* Generic UCS -> JIS tables, with a few SJIS-mac specific overrides */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			if (w == 0x5C) {
				s = 0x80;
			} else if (w == 0xA9) {
				s = 0xFD;
			} else {
				s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
			}
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			if (w == 0x2122) {
				s = 0xFE;
			} else if (w == 0x2014) {
				s = 0x213D;
			} else if (w == 0x2116) {
				s = 0x2C1D;
			} else {
				s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
			}
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (w >= 0x2000) {
			/* Codepoints which may combine with the one that follows */
			for (int i = 0; i < s_form_tbl_len; i++) {
				if (w == s_form_tbl[i]) {
					if (!len) {
						if (end) {
							/* Nothing follows; use the stand-alone fallback code */
							s = s_form_sjis_fallback_tbl[i];
							if (s) {
								MB_CONVERT_BUF_ENSURE(buf, out, limit, 2);
								out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
							} else {
								MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
							}
						} else {
							/* Wait for the next chunk */
							buf->state = w;
						}
						MB_CONVERT_BUF_STORE(buf, out, limit);
						return;
					}
					uint32_t w2 = *in++;
					len--;

					if (!process_s_form(w, w2, &s)) {
						/* w2 does not combine with w: put it back and encode w
						 * alone. `i` is already the first index at which w
						 * occurs in s_form_tbl, so no second search is needed */
						in--; len++;
						s = s_form_sjis_fallback_tbl[i];
					}

					if (s <= 0xFF) {
						out = mb_convert_buf_add(out, s);
					} else {
						MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
						out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
					}

					goto next_iteration;
				}
			}

			if (w == 0xF860 || w == 0xF861 || w == 0xF862) {
				/* Apple 'transcoding hint' codepoints (from private use area) */
				if (!len) {
					if (end) {
						MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
					} else {
						buf->state = w;
					}
					MB_CONVERT_BUF_STORE(buf, out, limit);
					return;
				}

				uint32_t w2 = *in++;
				len--;

				for (int i = 0; i < code_tbl_m_len; i++) {
					if (w == code_tbl_m[i][1] && w2 == code_tbl_m[i][2]) {
						/* This might be a valid transcoding hint sequence */
						int index = 3;

						if (buf->state) {
							/* Entered by goto only: restore the row and column
							 * saved by a previous call */
resume_transcoding_hint:
							i = buf->state >> 24;
							index = (buf->state >> 16) & 0xFF;
							buf->state = 0;
						}

						int expected = transcoding_hint_cp_width[w - 0xF860];

						while (index <= expected) {
							if (!len) {
								if (end) {
									/* Sequence is incomplete; report what was matched so far */
									for (int j = 1; j < index; j++) {
										MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac);
									}
								} else {
									buf->state = (i << 24) | (index << 16) | (w & 0xFFFF);
								}
								MB_CONVERT_BUF_STORE(buf, out, limit);
								return;
							}

							w2 = *in++;
							len--;

							if (w2 != code_tbl_m[i][index]) {
								/* Didn't match */
								for (int j = 1; j < index; j++) {
									MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac);
								}
								MB_CONVERT_ERROR(buf, out, limit, w2, mb_wchar_to_sjismac);
								MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
								goto next_iteration;
							}

							index++;
						}

						/* Successful match, emit SJIS-mac bytes */
						s = code_tbl_m[i][0];
						unsigned int c1 = (s / 94) + 0x21, c2 = (s % 94) + 0x21, s1, s2;
						SJIS_ENCODE(c1, c2, s1, s2);
						MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
						out = mb_convert_buf_add2(out, s1, s2);
						goto next_iteration;
					}
				}

				/* No valid transcoding hint sequence found */
				in--; len++;
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
				continue;
			}
		}

		if (!s) {
			if (w == 0xA0) {
				s = 0xA0;
			} else if (w == 0xA5) { /* YEN SIGN */
				/* Unicode has codepoint 0xFFE5 for a fullwidth Yen sign;
				 * convert codepoint 0xA5 to halfwidth Yen sign */
				s = 0x5C; /* HALFWIDTH YEN SIGN */
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else {
				/* SJIS-mac specific tables. They yield a kuten index, which is
				 * converted to a two-byte JIS code before the common output step */
				for (int i = 0; i < wchar2sjis_mac_r_tbl_len; i++) {
					if (w >= wchar2sjis_mac_r_tbl[i][0] && w <= wchar2sjis_mac_r_tbl[i][1]) {
						s = w - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto found_kuten_code;
					}
				}

				for (int i = 0; i < wchar2sjis_mac_r_map_len; i++) {
					if (w >= wchar2sjis_mac_r_map[i][0] && w <= wchar2sjis_mac_r_map[i][1]) {
						s = wchar2sjis_mac_code_map[i][w - wchar2sjis_mac_r_map[i][0]];
						if (s) {
							s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
							goto found_kuten_code;
						}
					}
				}

				for (int i = 0; i < wchar2sjis_mac_wchar_tbl_len; i++) {
					if (w == wchar2sjis_mac_wchar_tbl[i][0]) {
						s = wchar2sjis_mac_wchar_tbl[i][1];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto found_kuten_code;
					}
				}
			}
		}

found_kuten_code:
		if ((!s && w) || s >= 0x8080) {
			/* Not representable (only codepoint 0 may encode to byte 0) */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			out = mb_convert_buf_add(out, s);
		} else {
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}

next_iteration: ;
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
5893
mbfilter_sjis_emoji_docomo2unicode(int s,int * snd)5894 int mbfilter_sjis_emoji_docomo2unicode(int s, int *snd)
5895 {
5896 /* All three mobile vendors had emoji for numbers on a telephone keypad
5897 * Unicode doesn't have those, but it has a combining character which puts
5898 * a 'keypad button' around the following character, making it look like
5899 * a key on a telephone or keyboard. That combining char is codepoint 0x20E3. */
5900 if (s >= mb_tbl_code2uni_docomo1_min && s <= mb_tbl_code2uni_docomo1_max) {
5901 if ((s >= DOCOMO_KEYPAD(1) && s <= DOCOMO_KEYPAD(9)) || s == DOCOMO_KEYPAD(0) || s == DOCOMO_KEYPAD_HASH) {
5902 EMIT_KEYPAD_EMOJI(convert_emoji_cp(mb_tbl_code2uni_docomo1[s - mb_tbl_code2uni_docomo1_min]));
5903 } else {
5904 *snd = 0;
5905 return convert_emoji_cp(mb_tbl_code2uni_docomo1[s - mb_tbl_code2uni_docomo1_min]);
5906 }
5907 }
5908 return 0;
5909 }
5910
mbfilter_sjis_emoji_sb2unicode(int s,int * snd)5911 int mbfilter_sjis_emoji_sb2unicode(int s, int *snd)
5912 {
5913 if (s >= mb_tbl_code2uni_sb1_min && s <= mb_tbl_code2uni_sb1_max) {
5914 if (s == 0x2817 || (s >= 0x2823 && s <= 0x282C)) {
5915 EMIT_KEYPAD_EMOJI(mb_tbl_code2uni_sb1[s - mb_tbl_code2uni_sb1_min]);
5916 } else {
5917 *snd = 0;
5918 return convert_emoji_cp(mb_tbl_code2uni_sb1[s - mb_tbl_code2uni_sb1_min]);
5919 }
5920 } else if (s >= mb_tbl_code2uni_sb2_min && s <= mb_tbl_code2uni_sb2_max) {
5921 *snd = 0;
5922 return convert_emoji_cp(mb_tbl_code2uni_sb2[s - mb_tbl_code2uni_sb2_min]);
5923 } else if (s >= mb_tbl_code2uni_sb3_min && s <= mb_tbl_code2uni_sb3_max) {
5924 if (s >= 0x2B02 && s <= 0x2B0B) {
5925 EMIT_FLAG_EMOJI(nflags_sb[s - 0x2B02]);
5926 } else {
5927 *snd = 0;
5928 return convert_emoji_cp(mb_tbl_code2uni_sb3[s - mb_tbl_code2uni_sb3_min]);
5929 }
5930 }
5931 return 0;
5932 }
5933
/* Try to convert Unicode codepoint `c` to a DoCoMo emoji code.
 *
 * Returns 1 and stores the code in `*s1` when an emoji was produced. Returns 0
 * when `c` is not an emoji, or when `c` ('#' or a digit) has been held back in
 * filter->cache (status 1) until the next codepoint is seen. CK() may return
 * early when the output function fails. */
int mbfilter_unicode2sjis_emoji_docomo(int c, int *s1, mbfl_convert_filter *filter)
{
	/* When converting SJIS-Mobile to Unicode, we convert keypad symbol emoji
	 * to a sequence of 2 codepoints, one of which is a combining character which
	 * adds the 'key' image around the other
	 *
	 * In the other direction, look for such sequences and convert them to a
	 * single emoji */
	if (filter->status == 1) {
		int held = filter->cache;
		filter->cache = filter->status = 0;

		if (c == 0x20E3) {
			/* '#' or '0'-'9' followed by the keycap combining character */
			*s1 = (held == '#') ? DOCOMO_KEYPAD_HASH : DOCOMO_KEYPAD(held - '0');
			return 1;
		}

		/* This character wasn't combining character to make keypad symbol,
		 * so pass the previous character through... and proceed to process the
		 * current character as usual
		 * (Single-byte ASCII characters are valid in Shift-JIS...) */
		CK((*filter->output_function)(held, filter->data));
	}

	if (c == '#' || (c >= '0' && c <= '9')) {
		filter->status = 1;
		filter->cache = c;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x29B5;
		return 1;
	}
	if (c == 0x00AE) { /* Registered sign */
		*s1 = 0x29BA;
		return 1;
	}

	/* Only the first table whose range contains `c` is searched */
	if (c >= mb_tbl_uni_docomo2code2_min && c <= mb_tbl_uni_docomo2code2_max) {
		int pos = mbfl_bisec_srch2(c, mb_tbl_uni_docomo2code2_key, mb_tbl_uni_docomo2code2_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_docomo2code2_value[pos];
			return 1;
		}
	} else if (c >= mb_tbl_uni_docomo2code3_min && c <= mb_tbl_uni_docomo2code3_max) {
		int pos = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_docomo2code3_key, mb_tbl_uni_docomo2code3_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_docomo2code3_value[pos];
			return 1;
		}
	} else if (c >= mb_tbl_uni_docomo2code5_min && c <= mb_tbl_uni_docomo2code5_max) {
		int pos = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_docomo2code5_key, mb_tbl_uni_docomo2code5_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_docomo2code5_val[pos];
			return 1;
		}
	}

	return 0;
}
5996
/* Try to convert Unicode codepoint `c` to a KDDI emoji code.
 *
 * Returns 1 and stores the code in `*s1` when an emoji was produced. Returns 0
 * when `c` is not an emoji, or when `c` has been held back in filter->cache
 * until the next codepoint is seen:
 *   status 1: '#' or a digit, which may be followed by 0x20E3 (keypad emoji)
 *   status 2: a Regional Indicator, which may be the first half of a flag
 * CK() may return early when an output call fails. */
int mbfilter_unicode2sjis_emoji_kddi_sjis(int c, int *s1, mbfl_convert_filter *filter)
{
	if (filter->status == 1) {
		int c1 = filter->cache;
		filter->cache = filter->status = 0;
		if (c == 0x20E3) {
			if (c1 == '#') {
				*s1 = 0x25BC;
			} else if (c1 == '0') {
				*s1 = 0x2830;
			} else { /* Previous character was '1'-'9' */
				*s1 = 0x27a6 + (c1 - '1');
			}
			return 1;
		} else {
			/* Not a keypad sequence; pass the held character through and
			 * go on to process `c` as usual */
			CK((*filter->output_function)(c1, filter->data));
		}
	} else if (filter->status == 2) {
		int c1 = filter->cache;
		filter->cache = filter->status = 0;
		if (c >= NFLAGS('B') && c <= NFLAGS('U')) { /* B for GB, U for RU */
			for (int i = 0; i < 10; i++) {
				if (c1 == NFLAGS(nflags_s[i][0]) && c == NFLAGS(nflags_s[i][1])) {
					*s1 = nflags_code_kddi[i];
					return 1;
				}
			}
		}

		/* If none of the KDDI national flag emoji matched, then we have no way
		 * to convert the previous codepoint...
		 * The result is checked like every other output call, so that a
		 * failure is not silently ignored */
		CK(mbfl_filt_conv_illegal_output(c1, filter));
	}

	if (c == '#' || (c >= '0' && c <= '9')) {
		filter->status = 1;
		filter->cache = c;
		return 0;
	} else if (c >= NFLAGS('C') && c <= NFLAGS('U')) { /* C for CN, U for US */
		filter->status = 2;
		filter->cache = c;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x27DC;
		return 1;
	} else if (c == 0xAE) { /* Registered sign */
		*s1 = 0x27DD;
		return 1;
	} else if (c >= mb_tbl_uni_kddi2code2_min && c <= mb_tbl_uni_kddi2code2_max) {
		int i = mbfl_bisec_srch2(c, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_kddi2code2_value[i];
			return 1;
		}
	} else if (c >= mb_tbl_uni_kddi2code3_min && c <= mb_tbl_uni_kddi2code3_max) {
		int i = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_kddi2code3_value[i];
			return 1;
		}
	} else if (c >= mb_tbl_uni_kddi2code5_min && c <= mb_tbl_uni_kddi2code5_max) {
		int i = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_kddi2code5_val[i];
			return 1;
		}
	}
	return 0;
}
6068
/* Try to convert Unicode codepoint `c` to a SoftBank emoji code.
 *
 * Returns 1 and stores the code in `*s1` when an emoji was produced. Returns 0
 * when `c` is not an emoji, or when `c` has been held back in filter->cache
 * until the next codepoint is seen:
 *   status 1: '#' or a digit, which may be followed by 0x20E3 (keypad emoji)
 *   status 2: a Regional Indicator, which may be the first half of a flag
 * CK() may return early when an output call fails. */
int mbfilter_unicode2sjis_emoji_sb(int c, int *s1, mbfl_convert_filter *filter)
{
	if (filter->status == 1) {
		int c1 = filter->cache;
		filter->cache = filter->status = 0;
		if (c == 0x20E3) {
			if (c1 == '#') {
				*s1 = 0x2817;
			} else if (c1 == '0') {
				*s1 = 0x282c;
			} else { /* Previous character was '1'-'9' */
				*s1 = 0x2823 + (c1 - '1');
			}
			return 1;
		} else {
			/* Not a keypad sequence; pass the held character through and
			 * go on to process `c` as usual. The result is checked, as in
			 * the DoCoMo and KDDI variants of this function */
			CK((*filter->output_function)(c1, filter->data));
		}
	} else if (filter->status == 2) {
		int c1 = filter->cache;
		filter->cache = filter->status = 0;
		if (c >= NFLAGS('B') && c <= NFLAGS('U')) { /* B for GB, U for RU */
			for (int i = 0; i < 10; i++) {
				if (c1 == NFLAGS(nflags_s[i][0]) && c == NFLAGS(nflags_s[i][1])) {
					*s1 = nflags_code_sb[i];
					return 1;
				}
			}
		}

		/* If none of the SoftBank national flag emoji matched, then we have no way
		 * to convert the previous codepoint...
		 * The result is checked like every other output call, so that a
		 * failure is not silently ignored */
		CK(mbfl_filt_conv_illegal_output(c1, filter));
	}

	if (c == '#' || (c >= '0' && c <= '9')) {
		filter->status = 1;
		filter->cache = c;
		return 0;
	} else if (c >= NFLAGS('C') && c <= NFLAGS('U')) { /* C for CN, U for US */
		filter->status = 2;
		filter->cache = c;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x2855;
		return 1;
	} else if (c == 0xAE) { /* Registered sign */
		*s1 = 0x2856;
		return 1;
	} else if (c >= mb_tbl_uni_sb2code2_min && c <= mb_tbl_uni_sb2code2_max) {
		int i = mbfl_bisec_srch2(c, mb_tbl_uni_sb2code2_key, mb_tbl_uni_sb2code2_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_sb2code2_value[i];
			return 1;
		}
	} else if (c >= mb_tbl_uni_sb2code3_min && c <= mb_tbl_uni_sb2code3_max) {
		int i = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_sb2code3_key, mb_tbl_uni_sb2code3_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_sb2code3_value[i];
			return 1;
		}
	} else if (c >= mb_tbl_uni_sb2code5_min && c <= mb_tbl_uni_sb2code5_max) {
		int i = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_sb2code5_key, mb_tbl_uni_sb2code5_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_sb2code5_val[i];
			return 1;
		}
	}
	return 0;
}
6140
/* Byte-at-a-time decoder: SJIS-Mobile (DoCoMo / KDDI / SoftBank) -> Unicode.
 *
 * `c` is the next input byte. `filter->status` tracks progress:
 *   0  between characters
 *   1  first byte of a double-byte character is in `filter->cache`
 *   2  ESC seen (SoftBank only)
 *   3  ESC $ seen
 *   4  ESC $ <letter> seen; `filter->cache` holds the letter (one of GEFOPQ)
 *      and each following byte encodes one emoji until 0x0F ends the run
 *
 * Decoded codepoints (or MBFL_BAD_INPUT) are handed to
 * `filter->output_function`. Returns 0; the CK() macro (defined elsewhere)
 * appears to return early with a negative value if the output function
 * fails -- for that reason every error path resets the filter state BEFORE
 * emitting, so a failed emit cannot leave the filter stuck mid-sequence. */
static int mbfl_filt_conv_sjis_mobile_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, s1, s2, w, snd = 0;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* ASCII */
			if (filter->from == &mbfl_encoding_sjis_sb && c == 0x1B) {
				/* ESC; escape sequences were used on older SoftBank phones for emoji */
				filter->cache = c;
				filter->status = 2;
			} else {
				CK((*filter->output_function)(c, filter->data));
			}
		} else if (c > 0xA0 && c < 0xE0) { /* Kana */
			CK((*filter->output_function)(0xFEC0 + c, filter->data));
		} else if (c > 0x80 && c < 0xFD && c != 0xA0) { /* Kanji, first byte */
			filter->status = 1;
			filter->cache = c;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* Kanji, second byte */
		filter->status = 0;
		c1 = filter->cache;
		if (c >= 0x40 && c <= 0xFC && c != 0x7F) {
			w = 0;
			SJIS_DECODE(c1, c, s1, s2);
			/* Linear cell index: (ku * 94) + ten, both zero-based */
			s = ((s1 - 0x21) * 94) + s2 - 0x21;
			if (s <= 137) {
				/* Cells whose mapping here differs from the JIS X 0208 table */
				if (s == 31) {
					w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
				} else if (s == 32) {
					w = 0xFF5E; /* FULLWIDTH TILDE */
				} else if (s == 33) {
					w = 0x2225; /* PARALLEL TO */
				} else if (s == 60) {
					w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
				} else if (s == 80) {
					w = 0xFFE0; /* FULLWIDTH CENT SIGN */
				} else if (s == 81) {
					w = 0xFFE1; /* FULLWIDTH POUND SIGN */
				} else if (s == 137) {
					w = 0xFFE2; /* FULLWIDTH NOT SIGN */
				}
			}
			if (w == 0) {
				if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { /* vendor ext1 (13ku) */
					w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
				} else if (s >= 0 && s < jisx0208_ucs_table_size) { /* X 0208 */
					w = jisx0208_ucs_table[s];
				} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) { /* vendor ext2 (89ku - 92ku) */
					w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
				}

				/* Emoji: the carrier-specific lookup overrides whatever the
				 * generic tables produced. Some emoji decode to two codepoints;
				 * the first comes back through `snd` and is emitted here. */
				if (filter->from == &mbfl_encoding_sjis_docomo && s >= mb_tbl_code2uni_docomo1_min && s <= mb_tbl_code2uni_docomo1_max) {
					w = mbfilter_sjis_emoji_docomo2unicode(s, &snd);
					if (snd > 0) {
						CK((*filter->output_function)(snd, filter->data));
					}
				} else if (filter->from == &mbfl_encoding_sjis_kddi && s >= mb_tbl_code2uni_kddi1_min && s <= mb_tbl_code2uni_kddi2_max) {
					w = mbfilter_sjis_emoji_kddi2unicode(s, &snd);
					if (snd > 0) {
						CK((*filter->output_function)(snd, filter->data));
					}
				} else if (filter->from == &mbfl_encoding_sjis_sb && s >= mb_tbl_code2uni_sb1_min && s <= mb_tbl_code2uni_sb3_max) {
					w = mbfilter_sjis_emoji_sb2unicode(s, &snd);
					if (snd > 0) {
						CK((*filter->output_function)(snd, filter->data));
					}
				}

				if (w == 0) {
					if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) { /* vendor ext3 (115ku - 119ku) */
						w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
					} else if (s >= (94*94) && s < (114*94)) { /* user (95ku - 114ku) */
						w = s - (94*94) + 0xe000;
					}
				}
			}
			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC: Softbank Emoji */
	case 2:
		if (c == '$') {
			filter->cache = c;
			filter->status++;
		} else {
			/* Reset first: CK() may return before anything after it runs */
			filter->status = filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $: Softbank Emoji */
	case 3:
		if ((c >= 'E' && c <= 'G') || (c >= 'O' && c <= 'Q')) {
			filter->cache = c;
			filter->status++;
		} else {
			filter->status = filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ [GEFOPQ]: Softbank Emoji */
	case 4:
		c1 = filter->cache;
		if (c == 0xF) { /* Terminate sequence of emoji */
			filter->status = filter->cache = 0;
			return 0;
		} else {
			/* The selector letter picks a row; each letter accepts its own
			 * range of payload bytes starting at 0x21 */
			if (c1 == 'G' && c >= 0x21 && c <= 0x7a) {
				s1 = (0x91 - 0x21) * 94;
			} else if (c1 == 'E' && c >= 0x21 && c <= 0x7A) {
				s1 = (0x8D - 0x21) * 94;
			} else if (c1 == 'F' && c >= 0x21 && c <= 0x7A) {
				s1 = (0x8E - 0x21) * 94;
			} else if (c1 == 'O' && c >= 0x21 && c <= 0x6D) {
				s1 = (0x92 - 0x21) * 94;
			} else if (c1 == 'P' && c >= 0x21 && c <= 0x6C) {
				s1 = (0x95 - 0x21) * 94;
			} else if (c1 == 'Q' && c >= 0x21 && c <= 0x5E) {
				s1 = (0x96 - 0x21) * 94;
			} else {
				/* Payload byte out of range: abandon the escape sequence */
				filter->status = filter->cache = 0;
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				return 0;
			}

			w = mbfilter_sjis_emoji_sb2unicode(s1 + c - 0x21, &snd);
			if (w > 0) {
				if (snd > 0) {
					CK((*filter->output_function)(snd, filter->data));
				}
				CK((*filter->output_function)(w, filter->data));
			} else {
				filter->status = filter->cache = 0;
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;

	default:
		break;
	}

	return 0;
}
6295
/* Codepoint-at-a-time encoder: Unicode -> SJIS-Mobile (DoCoMo / KDDI / SoftBank).
 *
 * Works in three stages:
 *   1. look `c` up in the generic JIS tables (plus a few hand-mapped symbols
 *      and the CP932 vendor extensions),
 *   2. give the carrier-specific emoji routine selected by `filter->to` a
 *      chance to override the result or to park `c` as the first half of a
 *      two-codepoint emoji,
 *   3. unless something was parked, emit one byte, two Shift_JIS bytes, or an
 *      illegal-character marker.
 *
 * Returns 0; CK() (defined elsewhere) appears to return early with a negative
 * value when an output call fails. */
static int mbfl_filt_conv_wchar_sjis_mobile(int c, mbfl_convert_filter *filter)
{
	int code = 0;           /* <=0: not found; <0x100: single byte; else (row << 8) | cell */
	int from_user_area = 0; /* set when `code` came from the Private Use Area mapping */
	int idx;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c < (0xE000 + 20*94)) {
		/* Private User Area (95ku - 114ku) */
		const int offset = c - 0xE000;
		code = (((offset / 94) + 0x7F) << 8) | ((offset % 94) + 0x21);
		from_user_area = 1;
	}

	if (code <= 0) {
		/* Symbols the tables leave unmapped */
		switch (c) {
		case 0xA5:   code = 0x216F; break; /* YEN SIGN -> FULLWIDTH YEN SIGN */
		case 0xFF3C: code = 0x2140; break; /* FULLWIDTH REVERSE SOLIDUS */
		case 0x2225: code = 0x2142; break; /* PARALLEL TO */
		case 0xFF0D: code = 0x215D; break; /* FULLWIDTH HYPHEN-MINUS */
		case 0xFFE0: code = 0x2171; break; /* FULLWIDTH CENT SIGN */
		case 0xFFE1: code = 0x2172; break; /* FULLWIDTH POUND SIGN */
		case 0xFFE2: code = 0x224C; break; /* FULLWIDTH NOT SIGN */
		default: break;
		}
	}

	if (code <= 0 || (code >= 0x8080 && !from_user_area)) { /* not found or X 0212 */
		const int ext1_count = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
		const int ext2_count = cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;

		code = -1;

		/* CP932 vendor ext1 (13ku): reverse lookup by linear scan */
		for (idx = 0; idx < ext1_count; idx++) {
			if (c == cp932ext1_ucs_table[idx]) {
				code = (((idx / 94) + 0x2D) << 8) + (idx % 94) + 0x21;
				break;
			}
		}

		if (code <= 0) {
			/* CP932 vendor ext2 (89ku - 92ku; row base 0x79) */
			for (idx = 0; idx < ext2_count; idx++) {
				if (c == cp932ext2_ucs_table[idx]) {
					code = (((idx / 94) + 0x79) << 8) + (idx % 94) + 0x21;
					break;
				}
			}
		}

		if (c == 0) {
			/* NUL passes through as a single zero byte */
			code = 0;
		}
	}

	/* Carrier-specific emoji. A nonzero return means `code` now holds a
	 * (ku*94)+ten value that still has to be split into row and cell. */
	int emoji_found = 0;
	if (filter->to == &mbfl_encoding_sjis_docomo) {
		emoji_found = mbfilter_unicode2sjis_emoji_docomo(c, &code, filter);
	} else if (filter->to == &mbfl_encoding_sjis_kddi) {
		emoji_found = mbfilter_unicode2sjis_emoji_kddi_sjis(c, &code, filter);
	} else if (filter->to == &mbfl_encoding_sjis_sb) {
		emoji_found = mbfilter_unicode2sjis_emoji_sb(c, &code, filter);
	}
	if (emoji_found) {
		code = (((code / 94) + 0x21) << 8) | ((code % 94) + 0x21);
	}

	if (filter->status) {
		/* `c` was parked as the start of a possible emoji sequence */
		return 0;
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x100) { /* Latin/Kana */
		CK((*filter->output_function)(code, filter->data));
		return 0;
	}

	/* Kanji: turn row/cell into the two Shift_JIS bytes */
	{
		int row = (code >> 8) & 0xff;
		int cell = code & 0xff;
		int byte1 = 0, byte2 = 0;
		SJIS_ENCODE(row, cell, byte1, byte2);
		CK((*filter->output_function)(byte1, filter->data));
		CK((*filter->output_function)(byte2, filter->data));
	}

	return 0;
}
6387
/* End-of-input handler for the Unicode -> SJIS-Mobile filter.
 *
 * If the input stopped while the first half of a two-codepoint emoji was
 * parked in `filter->cache`, deal with it:
 *   - a parked '#' or digit (status 1) is written out as-is;
 *   - a parked Regional Indicator (status 2) is reported as unconvertible.
 * Afterwards the downstream flush function, if any, is invoked.
 *
 * Returns 0; CK() (defined elsewhere) appears to return early with a negative
 * value when the output call fails, in which case the downstream flush is
 * skipped. */
int mbfl_filt_conv_sjis_mobile_flush(mbfl_convert_filter *filter)
{
	const int pending = filter->cache;
	const int pending_is_keypad = (pending == '#') || (pending >= '0' && pending <= '9');

	if (filter->status == 2) {
		/* First of a pair of Regional Indicator codepoints came at the end of a string */
		filter->status = 0;
		filter->cache = 0;
		mbfl_filt_conv_illegal_output(pending, filter);
	} else if (filter->status == 1 && pending_is_keypad) {
		filter->status = 0;
		filter->cache = 0;
		CK((*filter->output_function)(pending, filter->data));
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
6406
/* First-byte lookup for the SJIS-Mobile decoders, indexed by the lead byte.
 *
 * The decoders in this file add the entry to `sjis_decode_tbl2[second byte]`
 * to obtain a linear cell index. Consecutive valid lead bytes are 188 (2 * 94)
 * apart.
 *
 * 0xFFFF sits at 0x80, 0xA0 and 0xFD-0xFF, the lead bytes the decoders treat
 * as invalid on their own (presumably the large value pushes the sum outside
 * every valid range).
 *
 * Entries for 0x00-0x7F and 0xA1-0xDF are never consulted by those decoders,
 * which handle ASCII and single-byte kana before reaching the table. The
 * negative initializers wrap modulo 65536 when stored as unsigned short. */
static const unsigned short sjis_mobile_decode_tbl1[] = {
	/* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 */ 0xFFFF, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632,
	/* 0x90 */ 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640,
	/* 0xA0 */ 0xFFFF, -6016, -5828, -5640, -5452, -5264, -5076, -4888, -4700, -4512, -4324, -4136, -3948, -3760, -3572, -3384,
	/* 0xB0 */ -3196, -3008, -2820, -2632, -2444, -2256, -2068, -1880, -1692, -1504, -1316, -1128, -940, -752, -564, -376,
	/* 0xC0 */ -188, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632,
	/* 0xD0 */ 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640,
	/* 0xE0 */ 5828, 6016, 6204, 6392, 6580, 6768, 6956, 7144, 7332, 7520, 7708, 7896, 8084, 8272, 8460, 8648,
	/* 0xF0 */ 8836, 9024, 9212, 9400, 9588, 9776, 9964, 10152, 10340, 10528, 10716, 10904, 11092, 0xFFFF, 0xFFFF, 0xFFFF
};
6410
mb_sjis_docomo_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)6411 static size_t mb_sjis_docomo_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
6412 {
6413 unsigned char *p = *in, *e = p + *in_len;
6414 /* Leave one extra space available in output buffer, since some iterations of
6415 * main loop (below) may emit two wchars */
6416 uint32_t *out = buf, *limit = buf + bufsize - 1;
6417
6418 while (p < e && out < limit) {
6419 unsigned char c = *p++;
6420
6421 if (c <= 0x7F) {
6422 *out++ = c;
6423 } else if (c >= 0xA1 && c <= 0xDF) {
6424 /* Kana */
6425 *out++ = 0xFEC0 + c;
6426 } else {
6427 /* Kanji */
6428 if (p == e) {
6429 *out++ = MBFL_BAD_INPUT;
6430 break;
6431 }
6432 unsigned char c2 = *p++;
6433 uint32_t w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
6434
6435 if (w <= 137) {
6436 if (w == 31) {
6437 *out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
6438 continue;
6439 } else if (w == 32) {
6440 *out++ = 0xFF5E; /* FULLWIDTH TILDE */
6441 continue;
6442 } else if (w == 33) {
6443 *out++ = 0x2225; /* PARALLEL TO */
6444 continue;
6445 } else if (w == 60) {
6446 *out++ = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
6447 continue;
6448 } else if (w == 80) {
6449 *out++ = 0xFFE0; /* FULLWIDTH CENT SIGN */
6450 continue;
6451 } else if (w == 81) {
6452 *out++ = 0xFFE1; /* FULLWIDTH POUND SIGN */
6453 continue;
6454 } else if (w == 137) {
6455 *out++ = 0xFFE2; /* FULLWIDTH NOT SIGN */
6456 continue;
6457 }
6458 }
6459
6460 if (w >= mb_tbl_code2uni_docomo1_min && w <= mb_tbl_code2uni_docomo1_max) {
6461 int snd = 0;
6462 w = mbfilter_sjis_emoji_docomo2unicode(w, &snd);
6463 if (snd) {
6464 *out++ = snd;
6465 }
6466 } else if (w >= cp932ext1_ucs_table_min && w < cp932ext1_ucs_table_max) {
6467 w = cp932ext1_ucs_table[w - cp932ext1_ucs_table_min];
6468 } else if (w < jisx0208_ucs_table_size) {
6469 w = jisx0208_ucs_table[w];
6470 } else if (w >= cp932ext2_ucs_table_min && w < cp932ext2_ucs_table_max) {
6471 w = cp932ext2_ucs_table[w - cp932ext2_ucs_table_min];
6472 } else if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
6473 w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
6474 } else if (w >= (94*94) && w < (114*94)) {
6475 w = w - (94*94) + 0xE000;
6476 } else {
6477 if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
6478 p--;
6479 }
6480 *out++ = MBFL_BAD_INPUT;
6481 continue;
6482 }
6483
6484 *out++ = w ? w : MBFL_BAD_INPUT;
6485 }
6486 }
6487
6488 *in_len = e - p;
6489 *in = p;
6490 return out - buf;
6491 }
6492
/* Bulk encoder: Unicode codepoints -> SJIS-Mobile (DoCoMo) bytes.
 *
 * `in`/`len` is the current chunk of codepoints and `end` tells whether it is
 * the last chunk. Output is appended to `buf` through the MB_CONVERT_BUF_*
 * macros (defined elsewhere).
 *
 * `buf->state` carries at most one codepoint between calls: a '#' or digit
 * that was the last element of a chunk and might be followed by U+20E3 at the
 * start of the next one. */
static void mb_wchar_to_sjis_docomo(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* One extra slot when a carried-over codepoint still has to be written */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len + (buf->state ? 1 : 0));

	uint32_t w;
	unsigned int s = 0;

	if (buf->state) {
		/* Continue what we were doing on the previous call */
		w = buf->state;
		buf->state = 0;
		goto reprocess_wchar;
	}

	while (len--) {
		w = *in++;
reprocess_wchar:
		/* s == 0 means "no mapping found yet" */
		s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* Private User Area (95ku - 114ku) */
			s = w - 0xE000;
			s = (((s / 94) + 0x7F) << 8) | ((s % 94) + 0x21);
			goto process_emoji;
		}

		if (!s) {
			/* Symbols the tables leave unmapped */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* Not found, or a code >= 0x8080 (the filter-style encoder in this
		 * file labels that case "X 0212"): discard it and try a reverse
		 * lookup in the CP932 vendor extension tables instead */
		if (w && (!s || s >= 0x8080)) {
			s = 0;

			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}

			for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
				if (w == cp932ext2_ucs_table[i]) {
					s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}
		}

process_emoji:
		/* When converting SJIS-Mobile to Unicode, we convert keypad symbol emoji
		 * to a sequence of 2 codepoints, one of which is a combining character which
		 * adds the 'key' image around the other
		 *
		 * In the other direction, look for such sequences and convert them to a
		 * single emoji */
		if (w == '#' || (w >= '0' && w <= '9')) {
			if (!len) {
				if (end) {
					/* Nothing can follow; emit the plain character */
					goto emit_output;
				} else {
					/* If we are at the end of the current buffer of codepoints, but another
					 * buffer is coming, then remember that we have to reprocess `w` */
					buf->state = w;
					break;
				}
			}
			/* Peek at the next codepoint */
			uint32_t w2 = *in++; len--;
			if (w2 == 0x20E3) {
				/* Values below are (ku*94)+ten codes, split into row/cell afterwards */
				if (w == '#') {
					s = 0x2964;
				} else if (w == '0') {
					s = 0x296F;
				} else { /* Previous character was '1'-'9' */
					s = 0x2966 + (w - '1');
				}
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			} else {
				/* Not a keypad sequence; un-read the peeked codepoint */
				in--; len++;
			}
		} else if (w == 0xA9) { /* Copyright sign */
			s = (((0x29B5 / 94) + 0x21) << 8) | ((0x29B5 % 94) + 0x21);
		} else if (w == 0xAE) { /* Registered sign */
			s = (((0x29BA / 94) + 0x21) << 8) | ((0x29BA % 94) + 0x21);
		} else if (w >= mb_tbl_uni_docomo2code2_min && w <= mb_tbl_uni_docomo2code2_max) {
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_docomo2code2_key, mb_tbl_uni_docomo2code2_len);
			if (i >= 0) {
				s = mb_tbl_uni_docomo2code2_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_docomo2code3_min && w <= mb_tbl_uni_docomo2code3_max) {
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_docomo2code3_key, mb_tbl_uni_docomo2code3_len);
			if (i >= 0) {
				s = mb_tbl_uni_docomo2code3_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_docomo2code5_min && w <= mb_tbl_uni_docomo2code5_max) {
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_docomo2code5_key, mb_tbl_uni_docomo2code5_len);
			if (i >= 0) {
				s = mb_tbl_uni_docomo2code5_val[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		}

emit_output:
		if (!s && w) {
			/* No mapping for a non-NUL codepoint. MB_CONVERT_ERROR is defined
			 * elsewhere; space for the remaining input is re-reserved after it */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_docomo);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			/* Single byte */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Row/cell pair -> two Shift_JIS bytes */
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
6636
mb_sjis_kddi_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)6637 static size_t mb_sjis_kddi_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
6638 {
6639 unsigned char *p = *in, *e = p + *in_len;
6640 uint32_t *out = buf, *limit = buf + bufsize - 1;
6641
6642 while (p < e && out < limit) {
6643 unsigned char c = *p++;
6644
6645 if (c <= 0x7F) {
6646 *out++ = c;
6647 } else if (c >= 0xA1 && c <= 0xDF) {
6648 /* Kana */
6649 *out++ = 0xFEC0 + c;
6650 } else {
6651 /* Kanji */
6652 if (p == e) {
6653 *out++ = MBFL_BAD_INPUT;
6654 break;
6655 }
6656 unsigned char c2 = *p++;
6657 uint32_t w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
6658
6659 if (w <= 137) {
6660 if (w == 31) {
6661 *out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
6662 continue;
6663 } else if (w == 32) {
6664 *out++ = 0xFF5E; /* FULLWIDTH TILDE */
6665 continue;
6666 } else if (w == 33) {
6667 *out++ = 0x2225; /* PARALLEL TO */
6668 continue;
6669 } else if (w == 60) {
6670 *out++ = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
6671 continue;
6672 } else if (w == 80) {
6673 *out++ = 0xFFE0; /* FULLWIDTH CENT SIGN */
6674 continue;
6675 } else if (w == 81) {
6676 *out++ = 0xFFE1; /* FULLWIDTH POUND SIGN */
6677 continue;
6678 } else if (w == 137) {
6679 *out++ = 0xFFE2; /* FULLWIDTH NOT SIGN */
6680 continue;
6681 }
6682 }
6683
6684 if (w >= mb_tbl_code2uni_kddi1_min && w <= mb_tbl_code2uni_kddi2_max) {
6685 int snd = 0;
6686 w = mbfilter_sjis_emoji_kddi2unicode(w, &snd);
6687 if (!w) {
6688 w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
6689 if (w >= (94*94) && w < (114*94)) {
6690 w = w - (94*94) + 0xE000;
6691 }
6692 } else if (snd) {
6693 *out++ = snd;
6694 }
6695 } else if (w >= cp932ext1_ucs_table_min && w < cp932ext1_ucs_table_max) {
6696 w = cp932ext1_ucs_table[w - cp932ext1_ucs_table_min];
6697 } else if (w < jisx0208_ucs_table_size) {
6698 w = jisx0208_ucs_table[w];
6699 } else if (w >= cp932ext2_ucs_table_min && w < cp932ext2_ucs_table_max) {
6700 w = cp932ext2_ucs_table[w - cp932ext2_ucs_table_min];
6701 } else if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
6702 w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
6703 } else if (w >= (94*94) && w < (114*94)) {
6704 w = w - (94*94) + 0xE000;
6705 } else {
6706 if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
6707 p--;
6708 }
6709 *out++ = MBFL_BAD_INPUT;
6710 continue;
6711 }
6712
6713 *out++ = w ? w : MBFL_BAD_INPUT;
6714 }
6715 }
6716
6717 *in_len = e - p;
6718 *in = p;
6719 return out - buf;
6720 }
6721
/* Bulk encoder: Unicode codepoints -> SJIS-Mobile (KDDI) bytes.
 *
 * `in`/`len` is the current chunk of codepoints and `end` tells whether it is
 * the last chunk. Output is appended to `buf` through the MB_CONVERT_BUF_*
 * macros (defined elsewhere).
 *
 * `buf->state` carries at most one codepoint between calls: the first half of
 * a possible two-codepoint emoji (a keypad character or a Regional Indicator)
 * that happened to be the last element of a chunk. */
static void mb_wchar_to_sjis_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* One extra slot when a carried-over codepoint still has to be written */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len + (buf->state ? 1 : 0));

	uint32_t w;
	unsigned int s = 0;

	if (buf->state) {
		/* Resume with the codepoint left over from the previous call */
		w = buf->state;
		buf->state = 0;
		goto reprocess_wchar;
	}

	while (len--) {
		w = *in++;
reprocess_wchar:
		/* s == 0 means "no mapping found yet" */
		s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* Private User Area (95ku - 114ku) */
			s = w - 0xE000;
			s = (((s / 94) + 0x7F) << 8) | ((s % 94) + 0x21);
			goto process_emoji;
		}

		if (!s) {
			/* Symbols the tables leave unmapped */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3c) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* Not found, or a code >= 0x8080: discard it and try a reverse lookup
		 * in the CP932 vendor extension tables instead */
		if (w && (!s || s >= 0x8080)) {
			s = 0;

			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}

			for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
				if (w == cp932ext2_ucs_table[i]) {
					s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}
		}

process_emoji:
		/* Keypad emoji: '#' or a digit followed by U+20E3 collapses into one code */
		if (w == '#' || (w >= '0' && w <= '9')) {
			if (!len) {
				if (end) {
					/* Nothing can follow; emit the plain character */
					goto emit_output;
				} else {
					/* If we are at the end of the current buffer of codepoints, but another
					 * buffer is coming, then remember that we have to reprocess `w` */
					buf->state = w;
					break;
				}
			}
			/* Peek at the next codepoint */
			uint32_t w2 = *in++; len--;
			if (w2 == 0x20E3) {
				/* Values below are (ku*94)+ten codes, split into row/cell afterwards */
				if (w == '#') {
					s = 0x25BC;
				} else if (w == '0') {
					s = 0x2830;
				} else { /* Previous character was '1'-'9' */
					s = 0x27A6 + (w - '1');
				}
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			} else {
				/* Not a keypad sequence; un-read the peeked codepoint */
				in--; len++;
			}
		} else if (w >= NFLAGS('C') && w <= NFLAGS('U')) { /* C for CN, U for US */
			/* Possible first half of a national flag (pair of Regional Indicators) */
			if (!len) {
				if (end) {
					/* A lone Regional Indicator at end of input is not convertible */
					MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi);
				} else {
					/* Reprocess `w` when this function is called again with another buffer
					 * of wchars */
					buf->state = w;
				}
				break;
			}
			uint32_t w2 = *in++; len--;
			if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */
				for (int i = 0; i < 10; i++) {
					if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) {
						s = nflags_code_kddi[i];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto emit_output;
					}
				}
			}
			/* No supported flag: un-read the second codepoint so it is processed
			 * on its own, and report the first as unconvertible */
			in--; len++;
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		} else if (w == 0xA9) { /* Copyright sign */
			s = (((0x27DC / 94) + 0x21) << 8) | ((0x27DC % 94) + 0x21);
		} else if (w == 0xAE) { /* Registered sign */
			s = (((0x27DD / 94) + 0x21) << 8) | ((0x27DD % 94) + 0x21);
		} else if (w >= mb_tbl_uni_kddi2code2_min && w <= mb_tbl_uni_kddi2code2_max) {
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
			if (i >= 0) {
				s = mb_tbl_uni_kddi2code2_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_kddi2code3_min && w <= mb_tbl_uni_kddi2code3_max) {
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
			if (i >= 0) {
				s = mb_tbl_uni_kddi2code3_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_kddi2code5_min && w <= mb_tbl_uni_kddi2code5_max) {
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
			if (i >= 0) {
				s = mb_tbl_uni_kddi2code5_val[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		}

emit_output:
		if (!s && w) {
			/* No mapping for a non-NUL codepoint. MB_CONVERT_ERROR is defined
			 * elsewhere; space for the remaining input is re-reserved after it */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			/* Single byte */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Row/cell pair -> two Shift_JIS bytes */
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
6883
/* Bulk decoder: SJIS-Mobile (SoftBank) bytes -> Unicode codepoints.
 *
 * Reads from `*in` (`*in_len` bytes) and writes at most `bufsize` codepoints
 * into `buf`. On return `*in` / `*in_len` describe the unconsumed input and
 * the return value is the number of codepoints written.
 *
 * Besides ordinary double-byte characters, SoftBank input may contain
 * `ESC $ <letter>` followed by a run of single bytes, each encoding one
 * emoji, ended by 0x0F. `*state` is nonzero while inside such a run and holds
 * the selector letter (one of E, F, G, O, P, Q), so decoding can resume in
 * the middle of a run on the next call. */
static size_t mb_sjis_sb_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	unsigned char *p = *in, *e = p + *in_len;
	/* Keep one slot in reserve: a single emoji may decode to two codepoints */
	uint32_t *out = buf, *limit = buf + bufsize - 1;

	if (*state) {
		/* The previous call stopped inside an emoji run; pick up there */
		goto softbank_emoji_escapes;
	}

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c == 0x1B) {
			/* Escape sequence */
			/* Needs '$' and then a selector byte; the byte after ESC is
			 * consumed even when it is not '$' */
			if (p == e || *p++ != '$' || p == e) {
				*out++ = MBFL_BAD_INPUT;
				continue;
			}
			unsigned char c2 = *p++;
			if ((c2 < 'E' || c2 > 'G') && (c2 < 'O' || c2 > 'Q')) {
				*out++ = MBFL_BAD_INPUT;
				continue;
			}
			/* Escape sequence was valid, next should be a series of specially
			 * encoded Softbank emoji */
			*state = c2;

softbank_emoji_escapes:
			while (p < e && out < limit) {
				c = *p++;
				if (c == 0xF) {
					/* End of the emoji run */
					*state = 0;
					break;
				}
				/* The selector letter picks a row; each letter accepts its own
				 * range of payload bytes starting at 0x21 */
				unsigned int s = 0;
				if (*state == 'G' && c >= 0x21 && c <= 0x7A) {
					s = (0x91 - 0x21) * 94;
				} else if (*state == 'E' && c >= 0x21 && c <= 0x7A) {
					s = (0x8D - 0x21) * 94;
				} else if (*state == 'F' && c >= 0x21 && c <= 0x7A) {
					s = (0x8E - 0x21) * 94;
				} else if (*state == 'O' && c >= 0x21 && c <= 0x6D) {
					s = (0x92 - 0x21) * 94;
				} else if (*state == 'P' && c >= 0x21 && c <= 0x6C) {
					s = (0x95 - 0x21) * 94;
				} else if (*state == 'Q' && c >= 0x21 && c <= 0x5E) {
					s = (0x96 - 0x21) * 94;
				} else {
					/* Payload byte out of range: abandon the run */
					*out++ = MBFL_BAD_INPUT;
					*state = 0;
					break;
				}

				/* A leading codepoint, if any, comes back through `snd` */
				int snd = 0;
				uint32_t w = mbfilter_sjis_emoji_sb2unicode(s + c - 0x21, &snd);
				if (w) {
					if (snd) {
						*out++ = snd;
					}
					*out++ = w;
				} else {
					*out++ = MBFL_BAD_INPUT;
					*state = 0;
					break;
				}
			}
		} else if (c <= 0x7F) {
			*out++ = c;
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* Kana */
			*out++ = 0xFEC0 + c;
		} else {
			/* Kanji */
			if (p == e) {
				/* Truncated double-byte character */
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			uint32_t w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];

			if (w <= 137) {
				/* Cells whose mapping here differs from the JIS X 0208 table */
				if (w == 31) {
					*out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
					continue;
				} else if (w == 32) {
					*out++ = 0xFF5E; /* FULLWIDTH TILDE */
					continue;
				} else if (w == 33) {
					*out++ = 0x2225; /* PARALLEL TO */
					continue;
				} else if (w == 60) {
					*out++ = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
					continue;
				} else if (w == 80) {
					*out++ = 0xFFE0; /* FULLWIDTH CENT SIGN */
					continue;
				} else if (w == 81) {
					*out++ = 0xFFE1; /* FULLWIDTH POUND SIGN */
					continue;
				} else if (w == 137) {
					*out++ = 0xFFE2; /* FULLWIDTH NOT SIGN */
					continue;
				}
			}

			if (w >= mb_tbl_code2uni_sb1_min && w <= mb_tbl_code2uni_sb3_max) {
				int snd = 0;
				w = mbfilter_sjis_emoji_sb2unicode(w, &snd);
				if (!w) {
					/* Not an emoji after all: recompute the raw cell index and
					 * try the ranges that overlap the emoji range */
					w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
					if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
						w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
					} else if (w >= (94*94) && w < (114*94)) {
						w = w - (94*94) + 0xE000;
					}
				} else if (snd) {
					*out++ = snd;
				}
			} else if (w >= cp932ext1_ucs_table_min && w < cp932ext1_ucs_table_max) {
				w = cp932ext1_ucs_table[w - cp932ext1_ucs_table_min];
			} else if (w < jisx0208_ucs_table_size) {
				w = jisx0208_ucs_table[w];
			} else if (w >= cp932ext2_ucs_table_min && w < cp932ext2_ucs_table_max) {
				w = cp932ext2_ucs_table[w - cp932ext2_ucs_table_min];
			} else if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
				w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
			} else if (w >= (94*94) && w < (114*94)) {
				/* User-defined area -> Private Use Area */
				w = w - (94*94) + 0xE000;
			} else {
				if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
					/* Lead byte was invalid on its own; give the second byte back */
					p--;
				}
				*out++ = MBFL_BAD_INPUT;
				continue;
			}

			/* A zero result means the cell is unassigned */
			*out++ = w ? w : MBFL_BAD_INPUT;
		}
	}

	*in_len = e - p;
	*in = p;
	return out - buf;
}
7028
/* Convert a buffer of Unicode codepoints to SJIS-Mobile#SOFTBANK bytes,
 * appending the result to `buf`.
 *
 * Besides ordinary table lookups, two kinds of two-codepoint sequences are
 * recognized and emitted as one 2-byte character:
 *  - '#' or '0'-'9' followed by U+20E3 (keypad emoji)
 *  - two Regional Indicator codepoints matching an entry of `nflags_s`
 *
 * in/len: codepoints to convert
 * buf:    output buffer; `buf->state` carries a pending codepoint between calls
 * end:    true if no more input will follow this call
 *
 * If the input runs out right after the first codepoint of a potential
 * sequence and `end` is false, that codepoint is saved in `buf->state` and is
 * reprocessed at the start of the next call. */
static void mb_wchar_to_sjis_sb(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Reserve one byte per codepoint, plus one for a codepoint carried over in
	 * `buf->state`; 2-byte outputs reserve more space right before writing */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len + (buf->state ? 1 : 0));

	uint32_t w;
	unsigned int s = 0;

	if (buf->state) {
		/* Resume with the codepoint which was left pending by the previous call */
		w = buf->state;
		buf->state = 0;
		goto reprocess_wchar;
	}

	while (len--) {
		w = *in++;
reprocess_wchar:
		s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* Private User Area (95ku - 114ku) */
			s = w - 0xE000;
			s = (((s / 94) + 0x7F) << 8) | ((s % 94) + 0x21);
			goto process_emoji;
		}

		/* Codepoints which the tables above did not resolve, but which have a
		 * fixed 2-byte equivalent */
		if (!s) {
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* Still nothing usable (a value >= 0x8080 is discarded as well; the CP932
		 * converters in this file label that case "X 0212"): fall back to a linear
		 * search of the CP932 extension tables */
		if (w && (!s || s >= 0x8080)) {
			s = 0;

			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}

			for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
				if (w == cp932ext2_ucs_table[i]) {
					s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}
		}

process_emoji:
		if (w == '#' || (w >= '0' && w <= '9')) {
			/* Possible first half of a keypad emoji; we need to look at the
			 * following codepoint to know */
			if (!len) {
				if (end) {
					/* Nothing follows; emit the plain character found above */
					goto emit_output;
				} else {
					/* If we are at the end of the current buffer of codepoints, but another
					 * buffer is coming, then remember that we have to reprocess `w` */
					buf->state = w;
					break;
				}
			}
			uint32_t w2 = *in++; len--;
			if (w2 == 0x20E3) {
				if (w == '#') {
					s = 0x2817;
				} else if (w == '0') {
					s = 0x282c;
				} else { /* Previous character was '1'-'9' */
					s = 0x2823 + (w - '1');
				}
				/* Split the (ku*94)+ten style value into two 7-bit bytes */
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			} else {
				/* Not a keypad emoji; put the second codepoint back */
				in--; len++;
			}
		} else if (w >= NFLAGS('C') && w <= NFLAGS('U')) { /* C for CN, U for US */
			if (!len) {
				if (end) {
					/* A lone Regional Indicator cannot be converted */
					MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb);
				} else {
					/* Reprocess `w` when this function is called again with
					 * another buffer of wchars */
					buf->state = w;
				}
				break;
			}
			uint32_t w2 = *in++; len--;
			if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */
				for (int i = 0; i < 10; i++) {
					if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) {
						s = nflags_code_sb[i];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto emit_output;
					}
				}
			}
			/* No flag matched: put the second codepoint back and report only the
			 * first one as unconvertible */
			in--; len++;
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		} else if (w == 0xA9) { /* Copyright sign */
			s = (((0x2855 / 94) + 0x21) << 8) | ((0x2855 % 94) + 0x21);
		} else if (w == 0xAE) { /* Registered sign */
			s = (((0x2856 / 94) + 0x21) << 8) | ((0x2856 % 94) + 0x21);
		} else if (w >= mb_tbl_uni_sb2code2_min && w <= mb_tbl_uni_sb2code2_max) {
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_sb2code2_key, mb_tbl_uni_sb2code2_len);
			if (i >= 0) {
				s = mb_tbl_uni_sb2code2_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_sb2code3_min && w <= mb_tbl_uni_sb2code3_max) {
			/* This table is searched with the codepoint minus 0x10000 */
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_sb2code3_key, mb_tbl_uni_sb2code3_len);
			if (i >= 0) {
				s = mb_tbl_uni_sb2code3_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_sb2code5_min && w <= mb_tbl_uni_sb2code5_max) {
			/* This table is searched with the codepoint minus 0xF0000 */
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_sb2code5_key, mb_tbl_uni_sb2code5_len);
			if (i >= 0) {
				s = mb_tbl_uni_sb2code5_val[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		}

emit_output:
		if (!s && w) {
			/* No mapping found (a zero codepoint is not an error; it is written
			 * out as a zero byte by the next branch) */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			out = mb_convert_buf_add(out, s);
		} else {
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			/* Two bytes for this character, plus one for each remaining codepoint */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7190
mb_sjis2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)7191 static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
7192 {
7193 unsigned char *p = *in, *e = p + *in_len;
7194 uint32_t *out = buf, *limit = buf + bufsize - 1;
7195
7196 while (p < e && out < limit) {
7197 unsigned char c = *p++;
7198
7199 if (c <= 0x7F) {
7200 if (c == 0x5C) {
7201 *out++ = 0xA5;
7202 } else if (c == 0x7E) {
7203 *out++ = 0x203E;
7204 } else {
7205 *out++ = c;
7206 }
7207 } else if (c >= 0xA1 && c <= 0xDF) {
7208 *out++ = 0xFEC0 + c;
7209 } else {
7210 if (p == e) {
7211 *out++ = MBFL_BAD_INPUT;
7212 break;
7213 }
7214 unsigned char c2 = *p++;
7215 uint32_t w1 = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
7216
7217 /* Conversion for combining characters */
7218 if (w1 >= 0x0170 && w1 <= 0x03F1) {
7219 int k = mbfl_bisec_srch2(w1, jisx0213_u2_key_b, jisx0213_u2_tbl_len);
7220 if (k >= 0) {
7221 *out++ = jisx0213_u2_tbl[2*k];
7222 *out++ = jisx0213_u2_tbl[2*k+1];
7223 continue;
7224 }
7225 }
7226
7227 /* Conversion for BMP */
7228 if (w1 < jisx0213_ucs_table_size) {
7229 uint32_t w = jisx0213_ucs_table[w1];
7230 if (w) {
7231 *out++ = w;
7232 continue;
7233 }
7234 }
7235
7236 /* Conversion for CJK Unified Ideographs extension B (U+2XXXX) */
7237 int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
7238 if (k >= 0) {
7239 *out++ = jisx0213_jis_u5_tbl[k] + 0x20000;
7240 } else {
7241 if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
7242 p--;
7243 }
7244 *out++ = MBFL_BAD_INPUT;
7245 }
7246 }
7247 }
7248
7249 *in_len = e - p;
7250 *in = p;
7251 return out - buf;
7252 }
7253
/* Convert a buffer of Unicode codepoints to Shift-JIS-2004 bytes, appending
 * the result to `buf`.
 *
 * in/len: codepoints to convert
 * buf:    output buffer; `buf->state` carries a pending codepoint between calls
 * end:    true if no more input will follow this call
 *
 * Some codepoints may form a two-codepoint sequence with the codepoint which
 * follows them (pairs listed in `jisx0213_u2_tbl`). If the input runs out
 * right after such a codepoint and `end` is false, it is saved in
 * `buf->state` and reprocessed at the start of the next call. */
static void mb_wchar_to_sjis2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;
	if (buf->state) {
		/* Resume with the codepoint which was left pending by the previous call */
		w = buf->state;
		buf->state = 0;
		goto process_codepoint;
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		unsigned int s = 0;

		/* Quick range check for codepoints which might be the first half of a
		 * pair in `jisx0213_u2_tbl`; entry k of that table is stored as
		 * [2*k] = first codepoint, [2*k+1] = second codepoint */
		if (w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7) {
			for (int k = 0; k < jisx0213_u2_tbl_len; k++) {
				if (w == jisx0213_u2_tbl[2*k]) {
					if (!len) {
						if (!end) {
							/* Cannot decide without seeing the next codepoint */
							buf->state = w;
							MB_CONVERT_BUF_STORE(buf, out, limit);
							return;
						}
					} else {
						uint32_t w2 = *in++; len--;
						/* For these four codepoints, a following U+0301 is matched
						 * against the next table entry instead (presumably each of
						 * them has two consecutive entries; verify against the table) */
						if ((w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A) && w2 == 0x301) {
							k++;
						}
						if (w2 == jisx0213_u2_tbl[2*k+1]) {
							s = jisx0213_u2_key[k];
							break;
						}
						/* Second codepoint does not complete the pair; put it back */
						in--; len++;
					}

					/* Fallback */
					s = jisx0213_u2_fb_tbl[k];
					break;
				}
			}
		}

		/* Check for major Japanese chars: U+4E00-U+9FFF */
		if (!s) {
			for (int k = 0; k < uni2jis_tbl_len; k++) {
				if (w >= uni2jis_tbl_range[k][0] && w <= uni2jis_tbl_range[k][1]) {
					s = uni2jis_tbl[k][w - uni2jis_tbl_range[k][0]];
					break;
				}
			}
		}

		/* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */
		if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) {
			int k = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (k >= 0) {
				/* Range k starts at tbl[2*k]; add the offset of `w` within that
				 * range to the range's base output value */
				s = ucs_c1_jisx0213_ofst[k] + w - ucs_c1_jisx0213_tbl[2*k];
			}
		}

		/* Check for Japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
		if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) {
			int k = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				s = jisx0213_u5_jis_tbl[k];
			}
		}

		if (!s) {
			/* CJK Compatibility Forms: U+FE30-U+FE4F */
			if (w == 0xFE45) {
				s = 0x233E;
			} else if (w == 0xFE46) {
				s = 0x233D;
			} else if (w >= 0xF91D && w <= 0xF9DC) {
				/* CJK Compatibility Ideographs: U+F900-U+F92A */
				int k = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (k >= 0) {
					s = ucs_r2b_jisx0213_cmap_val[k];
				}
			}
		}

		if (!s && w) {
			/* No mapping found (a zero codepoint is not an error; it is written
			 * out as a zero byte by the next branch) */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			out = mb_convert_buf_add(out, s);
		} else {
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			/* Two bytes for this character, plus one for each remaining codepoint */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7356
/* Byte-at-a-time filter: CP932 -> Unicode.
 *
 * `filter->status` is 0 when a new character is expected and 1 when the
 * lead byte of a 2-byte character (kept in `filter->cache`) has been seen.
 * Returns 0, or -1 (via CK) if the output function reports failure. */
static int mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, cell, row, col, wc;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* ASCII range passes through */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xa0 && c < 0xe0) {
			/* Single-byte kana */
			CK((*filter->output_function)(0xfec0 + c, filter->data));
		} else if (c > 0x80 && c < 0xfd && c != 0xa0) {
			/* Lead byte of a 2-byte character; wait for the trail byte */
			filter->status = 1;
			filter->cache = c;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1:
		filter->status = 0;
		lead = filter->cache;

		if (c < 0x40 || c > 0xfc || c == 0x7f) {
			/* Not a valid trail byte */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		SJIS_DECODE(lead, c, row, col);
		cell = (row - 0x21)*94 + col - 0x21;

		/* A handful of cells are mapped differently from the JIS X 0208 table */
		switch (cell) {
		case 31:
			wc = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
			break;
		case 32:
			wc = 0xff5e; /* FULLWIDTH TILDE */
			break;
		case 33:
			wc = 0x2225; /* PARALLEL TO */
			break;
		case 60:
			wc = 0xff0d; /* FULLWIDTH HYPHEN-MINUS */
			break;
		case 80:
			wc = 0xffe0; /* FULLWIDTH CENT SIGN */
			break;
		case 81:
			wc = 0xffe1; /* FULLWIDTH POUND SIGN */
			break;
		case 137:
			wc = 0xffe2; /* FULLWIDTH NOT SIGN */
			break;
		default:
			wc = 0;
			break;
		}

		if (wc == 0) {
			if (cell >= cp932ext1_ucs_table_min && cell < cp932ext1_ucs_table_max) {
				/* vendor ext1 (13ku) */
				wc = cp932ext1_ucs_table[cell - cp932ext1_ucs_table_min];
			} else if (cell >= 0 && cell < jisx0208_ucs_table_size) {
				/* X 0208 */
				wc = jisx0208_ucs_table[cell];
			} else if (cell >= cp932ext2_ucs_table_min && cell < cp932ext2_ucs_table_max) {
				/* vendor ext2 (89ku - 92ku) */
				wc = cp932ext2_ucs_table[cell - cp932ext2_ucs_table_min];
			} else if (cell >= cp932ext3_ucs_table_min && cell < cp932ext3_ucs_table_max) {
				/* vendor ext3 (115ku - 119ku) */
				wc = cp932ext3_ucs_table[cell - cp932ext3_ucs_table_min];
			} else if (cell >= (94*94) && cell < (114*94)) {
				/* user (95ku - 114ku) */
				wc = cell - (94*94) + 0xe000;
			}
		}

		if (wc <= 0) {
			wc = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(wc, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
7428
/* Flush handler for the CP932 -> Unicode filter. A non-zero status means the
 * input ended after a lead byte, which is reported as MBFL_BAD_INPUT. The
 * flush is then passed on to the next filter, if there is one.
 * Always returns 0. */
static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
7442
/* Codepoint-at-a-time filter: Unicode -> CP932.
 *
 * Looks `c` up in the JIS tables, then in a short list of special cases, and
 * finally in the CP932 vendor extension tables. Codepoints with no mapping
 * are passed to mbfl_filt_conv_illegal_output().
 * Returns 0, or -1 (via CK) if a downstream call reports failure. */
static int mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter)
{
	int code = 0;
	int in_user_area = 0;
	int i, count;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) {
		code = 0x7E;
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xe000 && c < (0xe000 + 20*94)) {
		/* user (95ku - 114ku) */
		int offset = c - 0xe000;
		code = ((offset/94 + 0x7f) << 8) | (offset%94 + 0x21);
		in_user_area = 1;
	}

	if (code <= 0) {
		switch (c) {
		case 0xa5: /* YEN SIGN */
			code = 0x5C;
			break;
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			code = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			code = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			code = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			code = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			code = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			code = 0x224c;
			break;
		default:
			break;
		}
	}

	if (code <= 0 || (code >= 0x8080 && !in_user_area)) {
		/* not found or X 0212 */
		code = -1;

		/* CP932 vendor ext1 (13ku) */
		count = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
		for (i = 0; i < count; i++) {
			if (c == cp932ext1_ucs_table[i]) {
				code = ((i/94 + 0x2d) << 8) + (i%94 + 0x21);
				break;
			}
		}

		if (code <= 0) {
			/* CP932 vendor ext3 (115ku - 119ku) */
			count = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
			for (i = 0; i < count; i++) {
				if (c == cp932ext3_ucs_table[i]) {
					code = ((i/94 + 0x93) << 8) + (i%94 + 0x21);
					break;
				}
			}
		}

		if (c == 0) {
			/* A zero codepoint is output as a zero byte, not as an error */
			code = 0;
		} else if (code <= 0) {
			code = -1;
		}
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (code < 0x100) {
		/* latin or kana */
		CK((*filter->output_function)(code, filter->data));
	} else {
		/* kanji */
		int hi = (code >> 8) & 0xff;
		int lo = code & 0xff;
		int byte1, byte2;
		SJIS_ENCODE(hi, lo, byte1, byte2);
		CK((*filter->output_function)(byte1, filter->data));
		CK((*filter->output_function)(byte2, filter->data));
	}

	return 0;
}
7527
/* Codepoint-at-a-time filter: Unicode -> SJIS-win.
 *
 * U+00A5 and U+203E are written as the 2-byte sequences 0x81 0x8F and
 * 0x81 0x50 respectively; every other codepoint is delegated to the CP932
 * filter. Returns 0, or -1 (via CK) if the output function reports failure. */
static int mbfl_filt_conv_wchar_sjiswin(int c, mbfl_convert_filter *filter)
{
	int trail;

	switch (c) {
	case 0xA5:
		trail = 0x8F;
		break;
	case 0x203E:
		trail = 0x50;
		break;
	default:
		return mbfl_filt_conv_wchar_cp932(c, filter);
	}

	CK((*filter->output_function)(0x81, filter->data));
	CK((*filter->output_function)(trail, filter->data));
	return 0;
}
7541
mb_cp932_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)7542 static size_t mb_cp932_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
7543 {
7544 unsigned char *p = *in, *e = p + *in_len;
7545 uint32_t *out = buf, *limit = buf + bufsize;
7546
7547 while (p < e && out < limit) {
7548 unsigned char c = *p++;
7549
7550 if (c < 0x80) {
7551 *out++ = c;
7552 } else if (c > 0xA0 && c < 0xE0) {
7553 /* Kana */
7554 *out++ = 0xFEC0 + c;
7555 } else {
7556 if (p == e) {
7557 *out++ = MBFL_BAD_INPUT;
7558 break;
7559 }
7560 unsigned char c2 = *p++;
7561 unsigned int w = 0;
7562 unsigned int s = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
7563
7564 if (s <= 137) {
7565 if (s == 31) {
7566 w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
7567 } else if (s == 32) {
7568 w = 0xFF5E; /* FULLWIDTH TILDE */
7569 } else if (s == 33) {
7570 w = 0x2225; /* PARALLEL TO */
7571 } else if (s == 60) {
7572 w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
7573 } else if (s == 80) {
7574 w = 0xFFE0; /* FULLWIDTH CENT SIGN */
7575 } else if (s == 81) {
7576 w = 0xFFE1; /* FULLWIDTH POUND SIGN */
7577 } else if (s == 137) {
7578 w = 0xFFE2; /* FULLWIDTH NOT SIGN */
7579 }
7580 }
7581
7582 if (w == 0) {
7583 if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
7584 w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
7585 } else if (s < jisx0208_ucs_table_size) {
7586 w = jisx0208_ucs_table[s];
7587 } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
7588 w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
7589 } else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
7590 w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
7591 } else if (s >= (94*94) && s < (114*94)) {
7592 w = s - (94*94) + 0xE000;
7593 }
7594 }
7595
7596 if (!w) {
7597 if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
7598 p--;
7599 }
7600 w = MBFL_BAD_INPUT;
7601 }
7602 *out++ = w;
7603 }
7604 }
7605
7606 *in_len = e - p;
7607 *in = p;
7608 return out - buf;
7609 }
7610
/* Convert a buffer of Unicode codepoints to CP932 bytes, appending the
 * result to `buf`.
 *
 * in/len: codepoints to convert
 * buf:    output buffer
 * end:    not used by this converter
 *
 * Codepoints with no CP932 equivalent are handed to MB_CONVERT_ERROR. */
static void mb_wchar_to_cp932(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Each codepoint produces at most two bytes */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int code = 0;
		bool in_user_area = false;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			code = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w == 0x203E) {
			code = 0x7E;
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			code = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			code = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			code = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			unsigned int offset = w - 0xE000;
			code = ((offset/94 + 0x7F) << 8) | (offset%94 + 0x21);
			in_user_area = true;
		}

		/* These codepoints always use the value given here, whatever the
		 * tables above returned */
		switch (w) {
		case 0xA5: /* YEN SIGN */
			code = 0x5C;
			break;
		case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
			code = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			code = 0x2142;
			break;
		case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
			code = 0x215D;
			break;
		case 0xFFE0: /* FULLWIDTH CENT SIGN */
			code = 0x2171;
			break;
		case 0xFFE1: /* FULLWIDTH POUND SIGN */
			code = 0x2172;
			break;
		case 0xFFE2: /* FULLWIDTH NOT SIGN */
			code = 0x224C;
			break;
		case 0:
			out = mb_convert_buf_add(out, 0);
			continue;
		default:
			break;
		}

		if (!code || (code >= 0x8080 && !in_user_area)) {
			/* not found or X 0212 */
			const unsigned short *hit = mbfl_binary_search_paired_sorted_table(w, cp932ext1_ucs_table_paired_sorted, sizeof(cp932ext1_ucs_table_paired_sorted) / sizeof(*cp932ext1_ucs_table_paired_sorted));
			if (hit) {
				code = ((*hit/94 + 0x2D) << 8) + (*hit%94 + 0x21);
			} else {
				hit = mbfl_binary_search_paired_sorted_table(w, cp932ext3_ucs_table_paired_sorted, sizeof(cp932ext3_ucs_table_paired_sorted) / sizeof(*cp932ext3_ucs_table_paired_sorted));
				if (hit) {
					code = ((*hit/94 + 0x93) << 8) + (*hit%94 + 0x21);
				} else {
					MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp932);
					MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
					continue;
				}
			}
		}

		if (code < 0x100) {
			out = mb_convert_buf_add(out, code);
		} else {
			unsigned int hi = (code >> 8) & 0xFF;
			unsigned int lo = code & 0xFF;
			unsigned int byte1, byte2;
			SJIS_ENCODE(hi, lo, byte1, byte2);
			out = mb_convert_buf_add2(out, byte1, byte2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7689
/* Convert a buffer of Unicode codepoints to SJIS-win bytes, appending the
 * result to `buf`.
 *
 * in/len: codepoints to convert
 * buf:    output buffer
 * end:    not used by this converter
 *
 * This mirrors mb_wchar_to_cp932, except that U+00A5 becomes the 2-byte
 * FULLWIDTH YEN SIGN and U+203E has no single-byte special case.
 *
 * Codepoints with no equivalent are handed to MB_CONVERT_ERROR. This function
 * itself is passed as the conversion callback (as every other converter in
 * this file does), so that whatever is output in place of the bad codepoint
 * is encoded with the SJIS-win mappings rather than the CP932 ones. */
static void mb_wchar_to_sjiswin(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Each codepoint produces at most two bytes */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		/* s2 doubles as a "came from the user-defined area" flag until it is
		 * reused as the second output byte */
		unsigned int s1 = 0, s2 = 0, c1, c2;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s1 = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s1 = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s1 = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s1 = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			s1 = w - 0xE000;
			c1 = s1/94 + 0x7F;
			c2 = s1%94 + 0x21;
			s1 = (c1 << 8) | c2;
			s2 = 1;
		}

		/* These codepoints always use the value given here, whatever the
		 * tables above returned */
		if (w == 0xA5) { /* YEN SIGN */
			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
		} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
		} else if (w == 0x2225) { /* PARALLEL TO */
			s1 = 0x2142;
		} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215D;
		} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
		} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
		} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
			s1 = 0x224C;
		} else if (w == 0) {
			out = mb_convert_buf_add(out, 0);
			continue;
		}

		if (!s1 || (s1 >= 0x8080 && !s2)) { /* not found or X 0212 */
			const unsigned short *lookup = mbfl_binary_search_paired_sorted_table(w, cp932ext1_ucs_table_paired_sorted, sizeof(cp932ext1_ucs_table_paired_sorted) / sizeof(*cp932ext1_ucs_table_paired_sorted));
			if (lookup) {
				s1 = ((*lookup/94 + 0x2D) << 8) + (*lookup%94 + 0x21);
				goto emit_output;
			}

			lookup = mbfl_binary_search_paired_sorted_table(w, cp932ext3_ucs_table_paired_sorted, sizeof(cp932ext3_ucs_table_paired_sorted) / sizeof(*cp932ext3_ucs_table_paired_sorted));
			if (lookup) {
				s1 = ((*lookup/94 + 0x93) << 8) + (*lookup%94 + 0x21);
				goto emit_output;
			}

			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjiswin);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			continue;
		}

emit_output:
		if (s1 < 0x100) {
			out = mb_convert_buf_add(out, s1);
		} else {
			c1 = (s1 >> 8) & 0xFF;
			c2 = s1 & 0xFF;
			SJIS_ENCODE(c1, c2, s1, s2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7766
/* Character length in bytes, indexed by the first byte of a character.
 * 2-byte lead bytes: 0x81-0x9F, 0xE0-0xEF */
static const unsigned char mblen_table_sjis[] = {
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */
	1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0x80-0x8F */
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0x90-0x9F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xA0-0xAF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xB0-0xBF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0-0xCF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xD0-0xDF */
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xE0-0xEF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1  /* 0xF0-0xFF */
};
7785
/* Character length in bytes, indexed by the first byte of a character.
 * 2-byte lead bytes: 0x81-0x9F, 0xE0-0xED */
static const unsigned char mblen_table_sjismac[] = {
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */
	1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0x80-0x8F */
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0x90-0x9F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xA0-0xAF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xB0-0xBF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0-0xCF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xD0-0xDF */
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1, /* 0xE0-0xEF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1  /* 0xF0-0xFF */
};
7804
/* Character length in bytes, indexed by the first byte of a character.
 * 2-byte lead bytes: 0x81-0x9F, 0xE0-0xFC */
static const unsigned char mblen_table_sjis_mobile[] = {
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */
	1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0x80-0x8F */
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0x90-0x9F */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xA0-0xAF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xB0-0xBF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0-0xCF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xD0-0xDF */
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xE0-0xEF */
	2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1  /* 0xF0-0xFF */
};
7823
/* NULL-terminated list of alias names for the "SJIS" encoding */
static const char *mbfl_encoding_sjis_aliases[] = {"x-sjis", "SHIFT-JIS", NULL};

/* Filter table for SJIS -> wchar: the two encoding IDs, the common
 * constructor, then the per-byte filter function and its flush function */
static const struct mbfl_convert_vtbl vtbl_sjis_wchar = {
	mbfl_no_encoding_sjis,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL
};

/* Filter table for wchar -> SJIS; uses the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis,
	mbfl_filt_conv_common_flush,
	NULL
};

/* Descriptor which ties together everything belonging to "SJIS": its names
 * (the second string is presumably the MIME name; confirm against the struct
 * declaration), aliases, byte length table, the two filter tables above, and
 * the buffer-at-a-time converters mb_sjis_to_wchar / mb_wchar_to_sjis */
const mbfl_encoding mbfl_encoding_sjis = {
	mbfl_no_encoding_sjis,
	"SJIS",
	"Shift_JIS",
	mbfl_encoding_sjis_aliases,
	mblen_table_sjis,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_wchar,
	&vtbl_wchar_sjis,
	mb_sjis_to_wchar,
	mb_wchar_to_sjis,
	NULL,
	NULL,
};
7860
/* NULL-terminated list of alias names for the "SJIS-mac" encoding */
static const char *mbfl_encoding_sjis_mac_aliases[] = {"MacJapanese", "x-Mac-Japanese", NULL};

/* Filter table for SJIS-mac -> wchar; note that it shares its flush function
 * (mbfl_filt_conv_sjis_wchar_flush) with the plain SJIS filter */
static const struct mbfl_convert_vtbl vtbl_sjis_mac_wchar = {
	mbfl_no_encoding_sjis_mac,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mac_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Filter table for wchar -> SJIS-mac; has a dedicated flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_mac = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_mac,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mac,
	mbfl_filt_conv_wchar_sjis_mac_flush,
	NULL,
};

/* Descriptor which ties together everything belonging to "SJIS-mac": names,
 * aliases, byte length table, the two filter tables above, and the
 * buffer-at-a-time converters mb_sjismac_to_wchar / mb_wchar_to_sjismac */
const mbfl_encoding mbfl_encoding_sjis_mac = {
	mbfl_no_encoding_sjis_mac,
	"SJIS-mac",
	"Shift_JIS",
	mbfl_encoding_sjis_mac_aliases,
	mblen_table_sjismac,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_mac_wchar,
	&vtbl_wchar_sjis_mac,
	mb_sjismac_to_wchar,
	mb_wchar_to_sjismac,
	NULL,
	NULL,
};
7897
/* NULL-terminated alias lists for the three mobile Shift-JIS variants */
static const char *mbfl_encoding_sjis_docomo_aliases[] = {"SJIS-DOCOMO", "shift_jis-imode", "x-sjis-emoji-docomo", NULL};
static const char *mbfl_encoding_sjis_kddi_aliases[] = {"SJIS-KDDI", "shift_jis-kddi", "x-sjis-emoji-kddi", NULL};
static const char *mbfl_encoding_sjis_sb_aliases[] = {"SJIS-SOFTBANK", "shift_jis-softbank", "x-sjis-emoji-softbank", NULL};

/* Filter table for SJIS-Mobile#DOCOMO -> wchar. All three mobile variants
 * use the same filter function (mbfl_filt_conv_sjis_mobile_wchar); only the
 * encoding ID in the first slot differs between their tables */
static const struct mbfl_convert_vtbl vtbl_sjis_docomo_wchar = {
	mbfl_no_encoding_sjis_docomo,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mobile_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Filter table for wchar -> SJIS-Mobile#DOCOMO; the filter and flush
 * functions are likewise shared by the three mobile variants */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_docomo = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_docomo,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};

/* Descriptor for "SJIS-Mobile#DOCOMO": names, aliases, byte length table,
 * the two filter tables above, and the buffer-at-a-time converters
 * mb_sjis_docomo_to_wchar / mb_wchar_to_sjis_docomo */
const mbfl_encoding mbfl_encoding_sjis_docomo = {
	mbfl_no_encoding_sjis_docomo,
	"SJIS-Mobile#DOCOMO",
	"Shift_JIS",
	mbfl_encoding_sjis_docomo_aliases,
	mblen_table_sjis_mobile,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_docomo_wchar,
	&vtbl_wchar_sjis_docomo,
	mb_sjis_docomo_to_wchar,
	mb_wchar_to_sjis_docomo,
	NULL,
	NULL,
};
7936
/* Filter table for SJIS-Mobile#KDDI -> wchar; same filter and flush functions
 * as the other mobile variants, with the KDDI encoding ID */
static const struct mbfl_convert_vtbl vtbl_sjis_kddi_wchar = {
	mbfl_no_encoding_sjis_kddi,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mobile_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Filter table for wchar -> SJIS-Mobile#KDDI */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_kddi = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_kddi,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};

/* Descriptor for "SJIS-Mobile#KDDI": names, aliases, byte length table,
 * the two filter tables above, and the buffer-at-a-time converters
 * mb_sjis_kddi_to_wchar / mb_wchar_to_sjis_kddi */
const mbfl_encoding mbfl_encoding_sjis_kddi = {
	mbfl_no_encoding_sjis_kddi,
	"SJIS-Mobile#KDDI",
	"Shift_JIS",
	mbfl_encoding_sjis_kddi_aliases,
	mblen_table_sjis_mobile,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_kddi_wchar,
	&vtbl_wchar_sjis_kddi,
	mb_sjis_kddi_to_wchar,
	mb_wchar_to_sjis_kddi,
	NULL,
	NULL,
};
7971
/* Filter table for SJIS-Mobile#SOFTBANK -> wchar; same filter and flush
 * functions as the other mobile variants, with the SoftBank encoding ID */
static const struct mbfl_convert_vtbl vtbl_sjis_sb_wchar = {
	mbfl_no_encoding_sjis_sb,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mobile_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Filter table for wchar -> SJIS-Mobile#SOFTBANK */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_sb = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_sb,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};

/* Descriptor for "SJIS-Mobile#SOFTBANK": names, aliases, byte length table,
 * the two filter tables above, and the buffer-at-a-time converters
 * mb_sjis_sb_to_wchar / mb_wchar_to_sjis_sb */
const mbfl_encoding mbfl_encoding_sjis_sb = {
	mbfl_no_encoding_sjis_sb,
	"SJIS-Mobile#SOFTBANK",
	"Shift_JIS",
	mbfl_encoding_sjis_sb_aliases,
	mblen_table_sjis_mobile,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_sb_wchar,
	&vtbl_wchar_sjis_sb,
	mb_sjis_sb_to_wchar,
	mb_wchar_to_sjis_sb,
	NULL,
	NULL,
};
8006
8007 /* Although the specification for Shift-JIS-2004 indicates that 0x5C and
8008 * 0x7E should (respectively) represent a Yen sign and an overbar, feedback
8009 * from Japanese PHP users indicates that they prefer 0x5C and 0x7E to be
8010 * treated as equivalent to U+005C and U+007E. This is the historical
8011 * behavior of mbstring, and promotes compatibility with other software
8012 * which handles Shift-JIS and Shift-JIS-2004 text in this way. */
8013
/* NULL-terminated list of alias names for the "SJIS-2004" encoding */
static const char *mbfl_encoding_sjis2004_aliases[] = {"SJIS2004","Shift_JIS-2004", NULL};

/* Filter table for SJIS-2004 -> wchar; the filter and flush functions carry
 * the generic "jis2004" name rather than an SJIS-specific one */
static const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = {
	mbfl_no_encoding_sjis2004,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis2004_wchar,
	mbfl_filt_conv_jis2004_wchar_flush,
	NULL,
};

/* Filter table for wchar -> SJIS-2004 */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis2004 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis2004,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_jis2004,
	mbfl_filt_conv_wchar_jis2004_flush,
	NULL,
};

/* Descriptor for "SJIS-2004": names, aliases, byte length table, the two
 * filter tables above, and the buffer-at-a-time converters
 * mb_sjis2004_to_wchar / mb_wchar_to_sjis2004 */
const mbfl_encoding mbfl_encoding_sjis2004 = {
	mbfl_no_encoding_sjis2004,
	"SJIS-2004",
	"Shift_JIS",
	mbfl_encoding_sjis2004_aliases,
	mblen_table_sjis_mobile, /* Leading byte values used for SJIS-2004 are the same as mobile SJIS variants */
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis2004_wchar,
	&vtbl_wchar_sjis2004,
	mb_sjis2004_to_wchar,
	mb_wchar_to_sjis2004,
	NULL,
	NULL,
};
8050
/* CP932 is Microsoft's version of Shift-JIS.
 *
 * What we call "SJIS-win" is a variant of CP932 which maps U+00A5
 * and U+203E the same way as eucJP-win; namely, instead of mapping
 * U+00A5 (YEN SIGN) to 0x5C and U+203E (OVERLINE) to 0x7E,
 * these codepoints are mapped to appropriate JIS X 0208 characters.
 *
 * When converting from Shift-JIS to Unicode, there is no difference
 * between CP932 and "SJIS-win".
 *
 * Additional facts:
 *
 * • In the libmbfl library which formed the base for mbstring, "CP932" and
 *   "SJIS-win" were originally aliases. The differing mappings were added in
 *   December 2002. The libmbfl author later stated that this was done so that
 *   "CP932" would comply with a certain specification, while "SJIS-win" would
 *   maintain the existing mappings. He does not remember which specification
 *   it was.
 * • The WHATWG specification for "Shift_JIS" (followed by web browsers)
 *   agrees with our mappings for "CP932".
 * • Microsoft Windows' "best-fit" mappings for CP932 (via the
 *   WideCharToMultiByte API) convert U+00A5 to 0x5C, which also agrees with
 *   our mappings for "CP932".
 * • glibc's iconv converts U+203E to CP932 0x7E, which again agrees with
 *   our mappings for "CP932".
 * • When converting Shift-JIS to CP932, the conversion goes through Unicode.
 *   Shift-JIS 0x7E converts to U+203E, so mapping U+203E to 0x7E means that
 *   0x7E will go to 0x7E when converting Shift-JIS to CP932.
 */
8080
/* 256-entry table, laid out here as one row per high nibble of the index.
 * Entries are 2 for indexes 0x81-0x9F and 0xE0-0xFF and 1 everywhere else.
 * NOTE(review): presumably indexed by the first byte of a character and
 * yielding that character's byte length; confirm at the point of use. */
static const unsigned char mblen_table_sjiswin[] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
8099
/* NULL-terminated alias list for CP932. */
static const char *mbfl_encoding_cp932_aliases[] = {
	"MS932",
	"Windows-31J",
	"MS_Kanji",
	NULL
};

/* NULL-terminated alias list for SJIS-win. */
static const char *mbfl_encoding_sjiswin_aliases[] = {
	"SJIS-ms",
	"SJIS-open",
	NULL
};
8102
8103 static const struct mbfl_convert_vtbl vtbl_cp932_wchar = {
8104 mbfl_no_encoding_cp932,
8105 mbfl_no_encoding_wchar,
8106 mbfl_filt_conv_common_ctor,
8107 NULL,
8108 mbfl_filt_conv_cp932_wchar,
8109 mbfl_filt_conv_cp932_wchar_flush,
8110 NULL,
8111 };
8112
8113 static const struct mbfl_convert_vtbl vtbl_wchar_cp932 = {
8114 mbfl_no_encoding_wchar,
8115 mbfl_no_encoding_cp932,
8116 mbfl_filt_conv_common_ctor,
8117 NULL,
8118 mbfl_filt_conv_wchar_cp932,
8119 mbfl_filt_conv_common_flush,
8120 NULL,
8121 };
8122
8123 const mbfl_encoding mbfl_encoding_cp932 = {
8124 mbfl_no_encoding_cp932,
8125 "CP932",
8126 "Shift_JIS",
8127 mbfl_encoding_cp932_aliases,
8128 mblen_table_sjiswin,
8129 MBFL_ENCTYPE_GL_UNSAFE,
8130 &vtbl_cp932_wchar,
8131 &vtbl_wchar_cp932,
8132 mb_cp932_to_wchar,
8133 mb_wchar_to_cp932,
8134 NULL,
8135 NULL,
8136 };
8137
8138 static const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = {
8139 mbfl_no_encoding_sjiswin,
8140 mbfl_no_encoding_wchar,
8141 mbfl_filt_conv_common_ctor,
8142 NULL,
8143 mbfl_filt_conv_cp932_wchar,
8144 mbfl_filt_conv_cp932_wchar_flush,
8145 NULL,
8146 };
8147
8148 static const struct mbfl_convert_vtbl vtbl_wchar_sjiswin = {
8149 mbfl_no_encoding_wchar,
8150 mbfl_no_encoding_sjiswin,
8151 mbfl_filt_conv_common_ctor,
8152 NULL,
8153 mbfl_filt_conv_wchar_sjiswin,
8154 mbfl_filt_conv_common_flush,
8155 NULL,
8156 };
8157
8158 const mbfl_encoding mbfl_encoding_sjiswin = {
8159 mbfl_no_encoding_sjiswin,
8160 "SJIS-win",
8161 "Shift_JIS",
8162 mbfl_encoding_sjiswin_aliases,
8163 mblen_table_sjiswin,
8164 MBFL_ENCTYPE_GL_UNSAFE,
8165 &vtbl_sjiswin_wchar,
8166 &vtbl_wchar_sjiswin,
8167 mb_cp932_to_wchar,
8168 mb_wchar_to_sjiswin,
8169 NULL,
8170 NULL,
8171 };
8172
/*
 * EUC variants
 */
8176
/* Streaming decoder: EUC-JP -> wchar, fed one input byte `c` per call.
 *
 * filter->status records how far into a multi-byte sequence we are:
 *   0  between characters
 *   1  lead byte of a two-byte JIS X 0208 character is in filter->cache
 *   2  0x8E was seen; a JIS X 0201 kana byte must follow
 *   3  0x8F was seen; the first JIS X 0212 byte must follow
 *   4  the first JIS X 0212 byte is in filter->cache
 *
 * Whatever cannot be decoded is passed downstream as MBFL_BAD_INPUT.
 * Returns 0; CK() returns early if the output callback reports failure. */
static int mbfl_filt_conv_eucjp_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* ASCII range is passed through unchanged */
			CK((*filter->output_function)(c, filter->data));
		} else if (c == 0x8e) {
			filter->status = 2;
		} else if (c == 0x8f) {
			filter->status = 3;
		} else if (c > 0xa0 && c < 0xff) {
			/* JIS X 0208 lead byte: remember it and wait for the trail byte */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: {
		/* Second byte of a JIS X 0208 character */
		int lead = filter->cache;
		int wc = MBFL_BAD_INPUT;

		filter->status = 0;
		if (c > 0xa0 && c < 0xff) {
			int idx = (lead - 0xa1) * 94 + (c - 0xa1);

			/* A zero table entry means "unassigned" */
			if (idx >= 0 && idx < jisx0208_ucs_table_size && jisx0208_ucs_table[idx] != 0) {
				wc = jisx0208_ucs_table[idx];
			}
		}
		CK((*filter->output_function)(wc, filter->data));
		break;
	}

	case 2: {
		/* Byte following 0x8E: halfwidth kana, offset into U+FF61.. */
		int wc = (c > 0xa0 && c < 0xe0) ? 0xfec0 + c : MBFL_BAD_INPUT;

		filter->status = 0;
		CK((*filter->output_function)(wc, filter->data));
		break;
	}

	case 3:
		/* First byte after 0x8F; it is validated together with the next one */
		filter->cache = c;
		filter->status = 4;
		break;

	case 4: {
		/* Final byte of a three-byte JIS X 0212 character */
		int lead = filter->cache;
		int wc = MBFL_BAD_INPUT;

		filter->status = 0;
		if (lead > 0xA0 && lead < 0xFF && c > 0xA0 && c < 0xFF) {
			int idx = (lead - 0xa1) * 94 + (c - 0xa1);

			if (idx >= 0 && idx < jisx0212_ucs_table_size && jisx0212_ucs_table[idx] != 0) {
				wc = jisx0212_ucs_table[idx];
			}
		}
		CK((*filter->output_function)(wc, filter->data));
		break;
	}

	EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
8255
/* End-of-input hook for the EUC-JP -> wchar filter.
 *
 * A non-zero status means the input stopped in the middle of a multi-byte
 * character; one MBFL_BAD_INPUT is emitted for it and the state is reset.
 * The downstream flush callback, if any, is then invoked. Always returns 0. */
static int mbfl_filt_conv_eucjp_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
8269
/* Streaming encoder: one wchar codepoint `c` -> EUC-JP bytes.
 *
 * The codepoint is first looked up in the ucs_*_jis tables. The resulting
 * code selects the output form:
 *   < 0x80      one byte
 *   < 0x100     0x8E followed by the kana byte
 *   < 0x8080    two bytes (JIS X 0208), each with the high bit set
 *   otherwise   0x8F followed by two bytes (JIS X 0212)
 * Codepoints with no mapping go to mbfl_filt_conv_illegal_output().
 * Returns 0; CK() returns early if a callback reports failure. */
static int mbfl_filt_conv_wchar_eucjp(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	if (c == 0xAF) {
		/* U+00AF MACRON is written as the JIS X 0212 overline */
		code = 0xA2B4;
	} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (code <= 0) {
		/* Not in the tables: try a handful of fullwidth fallbacks */
		switch (c) {
		case 0xff3c: code = 0x2140; break; /* FULLWIDTH REVERSE SOLIDUS */
		case 0x2225: code = 0x2142; break; /* PARALLEL TO */
		case 0xff0d: code = 0x215d; break; /* FULLWIDTH HYPHEN-MINUS */
		case 0xffe0: code = 0x2171; break; /* FULLWIDTH CENT SIGN */
		case 0xffe1: code = 0x2172; break; /* FULLWIDTH POUND SIGN */
		case 0xffe2: code = 0x224c; break; /* FULLWIDTH NOT SIGN */
		case 0:      code = 0;      break; /* NUL is a legitimate output */
		default:     code = -1;     break; /* unmappable */
		}
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (code < 0x80) {
		CK((*filter->output_function)(code, filter->data));
	} else if (code < 0x100) {
		CK((*filter->output_function)(0x8e, filter->data));
		CK((*filter->output_function)(code, filter->data));
	} else {
		if (code >= 0x8080) {
			/* JIS X 0212 needs the 0x8F prefix */
			CK((*filter->output_function)(0x8f, filter->data));
		}
		CK((*filter->output_function)(((code >> 8) & 0xff) | 0x80, filter->data));
		CK((*filter->output_function)((code & 0xff) | 0x80, filter->data));
	}

	return 0;
}
8324
mb_eucjp_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)8325 static size_t mb_eucjp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
8326 {
8327 unsigned char *p = *in, *e = p + *in_len;
8328 uint32_t *out = buf, *limit = buf + bufsize;
8329
8330 while (p < e && out < limit) {
8331 unsigned char c = *p++;
8332
8333 if (c < 0x80) {
8334 *out++ = c;
8335 } else if (c >= 0xA1 && c <= 0xFE && p < e) {
8336 /* JISX 0208 */
8337 unsigned char c2 = *p++;
8338 if (c2 >= 0xA1 && c2 <= 0xFE) {
8339 unsigned int s = (c - 0xA1)*94 + c2 - 0xA1;
8340 if (s < jisx0208_ucs_table_size) {
8341 uint32_t w = jisx0208_ucs_table[s];
8342 if (!w)
8343 w = MBFL_BAD_INPUT;
8344 *out++ = w;
8345 } else {
8346 *out++ = MBFL_BAD_INPUT;
8347 }
8348 } else {
8349 *out++ = MBFL_BAD_INPUT;
8350 }
8351 } else if (c == 0x8E && p < e) {
8352 /* Kana */
8353 unsigned char c2 = *p++;
8354 *out++ = (c2 >= 0xA1 && c2 <= 0xDF) ? 0xFEC0 + c2 : MBFL_BAD_INPUT;
8355 } else if (c == 0x8F) {
8356 /* JISX 0212 */
8357 if ((e - p) >= 2) {
8358 unsigned char c2 = *p++;
8359 unsigned char c3 = *p++;
8360 if (c3 >= 0xA1 && c3 <= 0xFE && c2 >= 0xA1 && c2 <= 0xFE) {
8361 unsigned int s = (c2 - 0xA1)*94 + c3 - 0xA1;
8362 if (s < jisx0212_ucs_table_size) {
8363 uint32_t w = jisx0212_ucs_table[s];
8364 if (!w)
8365 w = MBFL_BAD_INPUT;
8366 *out++ = w;
8367 } else {
8368 *out++ = MBFL_BAD_INPUT;
8369 }
8370 } else {
8371 *out++ = MBFL_BAD_INPUT;
8372 }
8373 } else {
8374 *out++ = MBFL_BAD_INPUT;
8375 p = e; /* Jump to end of string */
8376 }
8377 } else {
8378 *out++ = MBFL_BAD_INPUT;
8379 }
8380 }
8381
8382 *in_len = e - p;
8383 *in = p;
8384 return out - buf;
8385 }
8386
/* Buffer encoder: wchar -> EUC-JP.
 *
 * Converts `len` codepoints from `in` and appends the bytes to `buf`.
 * Space for two bytes per remaining codepoint is reserved up front; the
 * three-byte JIS X 0212 form reserves its extra room just before writing.
 * Unmappable codepoints are handed to MB_CONVERT_ERROR. `end` is not
 * referenced. */
static void mb_wchar_to_eucjp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int code = 0;

		if (w == 0xAF) {
			/* U+00AF MACRON is written as the JIS X 0212 overline */
			code = 0xA2B4;
		} else if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			code = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			code = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			code = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			code = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (code == 0) {
			/* Not in the tables: fullwidth fallbacks, NUL, or an error */
			switch (w) {
			case 0xFF3C: code = 0x2140; break; /* FULLWIDTH REVERSE SOLIDUS */
			case 0x2225: code = 0x2142; break; /* PARALLEL TO */
			case 0xFF0D: code = 0x215D; break; /* FULLWIDTH HYPHEN-MINUS */
			case 0xFFE0: code = 0x2171; break; /* FULLWIDTH CENT SIGN */
			case 0xFFE1: code = 0x2172; break; /* FULLWIDTH POUND SIGN */
			case 0xFFE2: code = 0x224C; break; /* FULLWIDTH NOT SIGN */
			case 0: {
				out = mb_convert_buf_add(out, 0);
				continue;
			}
			default: {
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjp);
				/* The error handler may have used buffer space; re-reserve */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
			}
		}

		if (code < 0x80) {
			out = mb_convert_buf_add(out, code);
		} else if (code < 0x100) {
			out = mb_convert_buf_add2(out, 0x8E, code);
		} else {
			unsigned int hi = ((code >> 8) & 0xFF) | 0x80;
			unsigned int lo = (code & 0xFF) | 0x80;

			if (code < 0x8080) {
				out = mb_convert_buf_add2(out, hi, lo);
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 3);
				out = mb_convert_buf_add3(out, 0x8F, hi, lo);
			}
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
8446
/* Streaming decoder: eucJP-win -> wchar, fed one input byte `c` per call.
 *
 * filter->status:
 *   0  between characters
 *   1  lead byte of a two-byte character is in filter->cache
 *   2  0x8E was seen; a JIS X 0201 kana byte must follow
 *   3  0x8F was seen; the first byte of a three-byte character must follow
 *   4  that first byte is in filter->cache
 *
 * On top of JIS X 0208 / JIS X 0212 this handles the CP932 vendor extension
 * tables and maps rows 85-94 of each plane onto the U+E000.. user area.
 * Undecodable input is passed downstream as MBFL_BAD_INPUT.
 * Returns 0; CK() returns early if the output callback reports failure. */
static int mbfl_filt_conv_eucjpwin_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, idx, wc;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* ASCII range is passed through unchanged */
			CK((*filter->output_function)(c, filter->data));
		} else if (c == 0x8e) {
			filter->status = 2;
		} else if (c == 0x8f) {
			filter->status = 3;
		} else if (c >= 0xa1 && c <= 0xfe) {
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* second byte of a two-byte character */
		filter->status = 0;
		lead = filter->cache;
		if (c <= 0xa0 || c >= 0xff) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		idx = (lead - 0xa1)*94 + c - 0xa1;

		/* A few positions are forced to fullwidth forms */
		switch (idx) {
		case 31:  wc = 0xff3c; break; /* FULLWIDTH REVERSE SOLIDUS */
		case 32:  wc = 0xff5e; break; /* FULLWIDTH TILDE */
		case 33:  wc = 0x2225; break; /* PARALLEL TO */
		case 60:  wc = 0xff0d; break; /* FULLWIDTH HYPHEN-MINUS */
		case 80:  wc = 0xffe0; break; /* FULLWIDTH CENT SIGN */
		case 81:  wc = 0xffe1; break; /* FULLWIDTH POUND SIGN */
		case 137: wc = 0xffe2; break; /* FULLWIDTH NOT SIGN */
		default:  wc = 0;      break;
		}

		if (wc == 0) {
			if (idx >= cp932ext1_ucs_table_min && idx < cp932ext1_ucs_table_max) {
				/* vendor ext1 (13ku) */
				wc = cp932ext1_ucs_table[idx - cp932ext1_ucs_table_min];
			} else if (idx >= 0 && idx < jisx0208_ucs_table_size) {
				/* X 0208 */
				wc = jisx0208_ucs_table[idx];
			} else if (idx >= (84 * 94)) {
				/* user area (85ku - 94ku) */
				wc = idx - (84 * 94) + 0xe000;
			}
		}

		if (wc <= 0) {
			wc = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(wc, filter->data));
		break;

	case 2: /* byte following 0x8E: X 0201 kana */
		filter->status = 0;
		wc = (c > 0xa0 && c < 0xe0) ? 0xfec0 + c : MBFL_BAD_INPUT;
		CK((*filter->output_function)(wc, filter->data));
		break;

	case 3: /* first byte after 0x8F; validated together with the next one */
		filter->cache = c;
		filter->status = 4;
		break;

	case 4: /* final byte of a three-byte character */
		filter->status = 0;
		lead = filter->cache;
		if (lead <= 0xa0 || lead >= 0xff || c <= 0xa0 || c >= 0xff) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		idx = (lead - 0xa1)*94 + c - 0xa1;
		wc = 0;

		if (idx >= 0 && idx < jisx0212_ucs_table_size) {
			wc = jisx0212_ucs_table[idx];
			if (wc == 0x007e) {
				wc = 0xff5e; /* FULLWIDTH TILDE */
			}
		} else if (idx >= (82*94) && idx < (84*94)) {
			/* vendor ext3 (83ku - 84ku) <-> CP932 (115ku - 120ku):
			 * find the raw byte pair in the EUC-JP side table and take the
			 * Unicode value stored at the same position. */
			const int packed = (lead << 8) | c;

			for (int n = 0; n < cp932ext3_eucjp_table_size; n++) {
				if (packed == cp932ext3_eucjp_table[n]) {
					if (n < (cp932ext3_ucs_table_max - cp932ext3_ucs_table_min)) {
						wc = cp932ext3_ucs_table[n];
					}
					break;
				}
			}
		} else if (idx >= (84*94)) {
			/* user area (85ku - 94ku), placed after the two-byte user area */
			wc = idx - (84*94) + (0xe000 + (94*10));
		}

		if (wc == 0x00A6) {
			wc = 0xFFE4; /* FULLWIDTH BROKEN BAR */
		}
		if (wc <= 0) {
			wc = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(wc, filter->data));
		break;

	EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
8574
/* End-of-input hook for the eucJP-win -> wchar filter.
 *
 * A non-zero status means the input stopped in the middle of a multi-byte
 * character; one MBFL_BAD_INPUT is emitted for it and the state is reset.
 * The downstream flush callback, if any, is then invoked. Always returns 0. */
static int mbfl_filt_conv_eucjpwin_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
8588
/* Streaming encoder: one wchar codepoint `c` -> eucJP-win bytes.
 *
 * Lookup order: fixed overrides, the ucs_*_jis tables, the two U+E000..
 * user-area ranges, a set of single-codepoint fallbacks, and finally linear
 * searches of the CP932 vendor extension tables. The resulting code selects
 * the output form:
 *   < 0x80      one byte
 *   < 0x100     0x8E followed by the kana byte
 *   < 0x8080    two bytes, each with the high bit set
 *   otherwise   0x8F followed by two bytes
 * Codepoints with no mapping go to mbfl_filt_conv_illegal_output().
 * Returns 0; CK() returns early if a callback reports failure. */
static int mbfl_filt_conv_wchar_eucjpwin(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	if (c == 0xAF) {
		/* U+00AF MACRON is written as the JIS X 0212 overline */
		code = 0xA2B4;
	} else if (c == 0x203E) {
		code = 0x7E;
	} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xe000 && c < (0xe000 + 10*94)) {
		/* user area (X0208 85ku - 94ku) */
		int off = c - 0xe000;
		code = ((off/94 + 0x75) << 8) | (off%94 + 0x21);
	} else if (c >= (0xe000 + 10*94) && c < (0xe000 + 20*94)) {
		/* user area (X0212 85ku - 94ku) */
		int off = c - (0xe000 + 10*94);
		code = ((off/94 + 0xf5) << 8) | (off%94 + 0xa1);
	}

	if (code == 0xa2f1) {
		code = 0x2d62; /* NUMERO SIGN */
	}

	if (code <= 0) {
		switch (c) {
		case 0xa5:   code = 0x5C;   break; /* YEN SIGN */
		case 0x2014: code = 0x213D; break;
		case 0xff3c: code = 0x2140; break; /* FULLWIDTH REVERSE SOLIDUS */
		case 0x2225: code = 0x2142; break; /* PARALLEL TO */
		case 0xff0d: code = 0x215d; break; /* FULLWIDTH HYPHEN-MINUS */
		case 0xffe0: code = 0x2171; break; /* FULLWIDTH CENT SIGN */
		case 0xffe1: code = 0x2172; break; /* FULLWIDTH POUND SIGN */
		case 0xffe2: code = 0x224c; break; /* FULLWIDTH NOT SIGN */
		default: {
			const int ext1_len = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
			const int ext1_row = cp932ext1_ucs_table_min / 94;
			const int ext3_len = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;

			code = -1;

			/* CP932 vendor ext1 (13ku) */
			for (int i = 0; i < ext1_len; i++) {
				if (c == cp932ext1_ucs_table[i]) {
					code = ((i / 94 + ext1_row + 0x21) << 8) + (i % 94 + 0x21);
					break;
				}
			}

			/* CP932 vendor ext3 (115ku - 119ku): the EUC-JP byte pair is
			 * stored at the same position in cp932ext3_eucjp_table */
			if (code < 0) {
				for (int i = 0; i < ext3_len; i++) {
					if (c == cp932ext3_ucs_table[i]) {
						if (i < cp932ext3_eucjp_table_size) {
							code = cp932ext3_eucjp_table[i];
						}
						break;
					}
				}
			}
			break;
		}
		}

		/* NUL encodes as itself; any other non-positive result is a miss */
		if (c == 0) {
			code = 0;
		} else if (code <= 0) {
			code = -1;
		}
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (code < 0x80) {
		CK((*filter->output_function)(code, filter->data));
	} else if (code < 0x100) {
		CK((*filter->output_function)(0x8e, filter->data));
		CK((*filter->output_function)(code, filter->data));
	} else {
		if (code >= 0x8080) {
			/* three-byte form needs the 0x8F prefix */
			CK((*filter->output_function)(0x8f, filter->data));
		}
		CK((*filter->output_function)(((code >> 8) & 0xff) | 0x80, filter->data));
		CK((*filter->output_function)((code & 0xff) | 0x80, filter->data));
	}

	return 0;
}
8693
mb_eucjpwin_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)8694 static size_t mb_eucjpwin_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
8695 {
8696 unsigned char *p = *in, *e = p + *in_len;
8697 uint32_t *out = buf, *limit = buf + bufsize;
8698
8699 while (p < e && out < limit) {
8700 unsigned char c = *p++;
8701
8702 if (c < 0x80) {
8703 *out++ = c;
8704 } else if (c >= 0xA1 && c <= 0xFE && p < e) {
8705 unsigned char c2 = *p++;
8706
8707 if (c2 >= 0xA1 && c2 <= 0xFE) {
8708 unsigned int s = (c - 0xA1)*94 + c2 - 0xA1, w = 0;
8709
8710 if (s <= 137) {
8711 if (s == 31) {
8712 w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
8713 } else if (s == 32) {
8714 w = 0xFF5E; /* FULLWIDTH TILDE */
8715 } else if (s == 33) {
8716 w = 0x2225; /* PARALLEL TO */
8717 } else if (s == 60) {
8718 w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
8719 } else if (s == 80) {
8720 w = 0xFFE0; /* FULLWIDTH CENT SIGN */
8721 } else if (s == 81) {
8722 w = 0xFFE1; /* FULLWIDTH POUND SIGN */
8723 } else if (s == 137) {
8724 w = 0xFFE2; /* FULLWIDTH NOT SIGN */
8725 }
8726 }
8727
8728 if (w == 0) {
8729 if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
8730 w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
8731 } else if (s < jisx0208_ucs_table_size) {
8732 w = jisx0208_ucs_table[s];
8733 } else if (s >= (84 * 94)) {
8734 w = s - (84 * 94) + 0xE000;
8735 }
8736 }
8737
8738 if (!w)
8739 w = MBFL_BAD_INPUT;
8740 *out++ = w;
8741 } else {
8742 *out++ = MBFL_BAD_INPUT;
8743 }
8744 } else if (c == 0x8E && p < e) {
8745 unsigned char c2 = *p++;
8746 if (c2 >= 0xA1 && c2 <= 0xDF) {
8747 *out++ = 0xFEC0 + c2;
8748 } else {
8749 *out++ = MBFL_BAD_INPUT;
8750 }
8751 } else if (c == 0x8F && p < e) {
8752 unsigned char c2 = *p++;
8753 if (p == e) {
8754 *out++ = MBFL_BAD_INPUT;
8755 continue;
8756 }
8757 unsigned char c3 = *p++;
8758
8759 if (c2 >= 0xA1 && c2 <= 0xFE && c3 >= 0xA1 && c3 <= 0xFE) {
8760 unsigned int s = (c2 - 0xA1)*94 + c3 - 0xA1, w = 0;
8761
8762 if (s < jisx0212_ucs_table_size) {
8763 w = jisx0212_ucs_table[s];
8764 if (w == 0x7E)
8765 w = 0xFF5E; /* FULLWIDTH TILDE */
8766 } else if (s >= (82*94) && s < (84*94)) {
8767 s = (c2 << 8) | c3;
8768 for (int i = 0; i < cp932ext3_eucjp_table_size; i++) {
8769 if (cp932ext3_eucjp_table[i] == s) {
8770 w = cp932ext3_ucs_table[i];
8771 break;
8772 }
8773 }
8774 } else if (s >= (84*94)) {
8775 w = s - (84*94) + 0xE000 + (94*10);
8776 }
8777
8778 if (w == 0xA6)
8779 w = 0xFFE4; /* FULLWIDTH BROKEN BAR */
8780
8781 if (!w)
8782 w = MBFL_BAD_INPUT;
8783 *out++ = w;
8784 } else {
8785 *out++ = MBFL_BAD_INPUT;
8786 }
8787 } else {
8788 *out++ = MBFL_BAD_INPUT;
8789 }
8790 }
8791
8792 *in_len = e - p;
8793 *in = p;
8794 return out - buf;
8795 }
8796
/* Buffer encoder: wchar -> eucJP-win.
 *
 * Converts `len` codepoints from `in` and appends the bytes to `buf`.
 * Space for two bytes per remaining codepoint is reserved up front; the
 * three-byte form reserves its extra room just before writing. Codepoints
 * with no mapping are handed to MB_CONVERT_ERROR. `end` is not referenced. */
static void mb_wchar_to_eucjpwin(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w == 0) {
			/* NUL encodes as itself; handled first so that s == 0 can mean
			 * "no mapping" for the rest of the loop body */
			out = mb_convert_buf_add(out, 0);
			continue;
		} else if (w == 0xAF) { /* U+00AF is MACRON */
			s = 0xA2B4; /* Use JIS X 0212 overline */
		} else if (w == 0x203E) {
			s = 0x7E;
		} else if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 10*94)) {
			/* user area (X0208 85ku - 94ku) */
			s = w - 0xE000;
			s = ((s/94 + 0x75) << 8) + (s%94) + 0x21;
		} else if (w >= (0xE000 + 10*94) && w < (0xE000 + 20*94)) {
			/* user area (X0212 85ku - 94ku) */
			s = w - (0xE000 + 10*94);
			s = ((s/94 + 0xF5) << 8) + (s%94) + 0xA1;
		}

		if (s == 0xA2F1)
			s = 0x2D62; /* NUMERO SIGN */

		if (s == 0) {
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x5C;
			} else if (w == 0x2014) { /* EM DASH */
				s = 0x213D;
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			} else {
				/* CP932 vendor ext1 (13ku) */
				for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
					if (cp932ext1_ucs_table[i] == w) {
						s = (((i/94) + (cp932ext1_ucs_table_min/94) + 0x21) << 8) + (i%94) + 0x21;
						break;
					}
				}

				/* CP932 vendor ext3: the EUC-JP byte pair is stored at the
				 * same position in cp932ext3_eucjp_table */
				if (!s) {
					for (int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
						if (cp932ext3_ucs_table[i] == w) {
							/* Same guard as mbfl_filt_conv_wchar_eucjpwin:
							 * keeps the read inside cp932ext3_eucjp_table
							 * should the two tables differ in length. */
							if (i < cp932ext3_eucjp_table_size) {
								s = cp932ext3_eucjp_table[i];
							}
							break;
						}
					}
				}
			}
		}

		if (!s) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjpwin);
			/* The error handler may have used buffer space; re-reserve */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		} else if (s < 0x80) {
			out = mb_convert_buf_add(out, s);
		} else if (s < 0x100) {
			out = mb_convert_buf_add2(out, 0x8E, s);
		} else if (s < 0x8080) {
			out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
		} else {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 3);
			out = mb_convert_buf_add3(out, 0x8F, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
8886
/* Streaming decoder: CP51932 -> wchar, fed one input byte `c` per call.
 *
 * filter->status:
 *   0  between characters
 *   1  lead byte of a two-byte character is in filter->cache
 *   2  0x8E was seen; a JIS X 0201 kana byte must follow
 *
 * There is no 0x8F (three-byte) form here. Undecodable input is passed
 * downstream as MBFL_BAD_INPUT.
 * Returns 0; CK() returns early if the output callback reports failure. */
static int mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, idx, wc;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* ASCII range is passed through unchanged */
			CK((*filter->output_function)(c, filter->data));
		} else if (c == 0x8e) {
			filter->status = 2;
		} else if (c >= 0xA1 && c <= 0xFE) {
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* second byte of a two-byte character */
		filter->status = 0;
		lead = filter->cache;
		if (c <= 0xa0 || c >= 0xff) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		idx = (lead - 0xa1)*94 + c - 0xa1;

		/* A few positions are forced to fullwidth forms */
		switch (idx) {
		case 31:  wc = 0xff3c; break; /* FULLWIDTH REVERSE SOLIDUS */
		case 32:  wc = 0xff5e; break; /* FULLWIDTH TILDE */
		case 33:  wc = 0x2225; break; /* PARALLEL TO */
		case 60:  wc = 0xff0d; break; /* FULLWIDTH HYPHEN-MINUS */
		case 80:  wc = 0xffe0; break; /* FULLWIDTH CENT SIGN */
		case 81:  wc = 0xffe1; break; /* FULLWIDTH POUND SIGN */
		case 137: wc = 0xffe2; break; /* FULLWIDTH NOT SIGN */
		default:  wc = 0;      break;
		}

		if (wc == 0) {
			if (idx >= cp932ext1_ucs_table_min && idx < cp932ext1_ucs_table_max) {
				/* vendor ext1 (13ku) */
				wc = cp932ext1_ucs_table[idx - cp932ext1_ucs_table_min];
			} else if (idx >= 0 && idx < jisx0208_ucs_table_size) {
				/* X 0208 */
				wc = jisx0208_ucs_table[idx];
			} else if (idx >= cp932ext2_ucs_table_min && idx < cp932ext2_ucs_table_max) {
				/* vendor ext2 (89ku - 92ku) */
				wc = cp932ext2_ucs_table[idx - cp932ext2_ucs_table_min];
			}
		}

		if (wc <= 0) {
			wc = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(wc, filter->data));
		break;

	case 2: /* byte following 0x8E: X 0201 kana */
		filter->status = 0;
		wc = (c > 0xa0 && c < 0xe0) ? 0xfec0 + c : MBFL_BAD_INPUT;
		CK((*filter->output_function)(wc, filter->data));
		break;

	EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
8961
/* End-of-input hook for the CP51932 -> wchar filter.
 *
 * A non-zero status means the input string was truncated in the middle of a
 * multi-byte character; one MBFL_BAD_INPUT is emitted for it and the state is
 * reset. The downstream flush callback, if any, is then invoked.
 * Always returns 0. */
static int mbfl_filt_conv_cp51932_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
8976
/* Streaming encoder: one wchar codepoint `c` -> CP51932 bytes.
 *
 * The codepoint is looked up in the ucs_*_jis tables, then in a set of
 * single-codepoint fallbacks, then by linear search in the CP932 vendor
 * extension tables (ext1 and ext2). The resulting code selects the output:
 *   < 0x80      one byte
 *   < 0x100     0x8E followed by the kana byte
 *   < 0x8080    two bytes, each with the high bit set
 * Everything else goes to mbfl_filt_conv_illegal_output().
 * Returns 0; CK() returns early if a callback reports failure. */
static int mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	/* Table values >= 0x8080 are the ones the EUC-JP encoders in this file
	 * emit in three-byte 0x8F form (JIS X 0212). This encoder has no such
	 * form, so treat them as unmapped. */
	if (code >= 0x8080) {
		code = -1;
	}

	if (code <= 0) {
		switch (c) {
		case 0xa5:   code = 0x216F; break; /* YEN SIGN -> FULLWIDTH YEN SIGN */
		case 0xff3c: code = 0x2140; break; /* FULLWIDTH REVERSE SOLIDUS */
		case 0x2225: code = 0x2142; break; /* PARALLEL TO */
		case 0xff0d: code = 0x215d; break; /* FULLWIDTH HYPHEN-MINUS */
		case 0xffe0: code = 0x2171; break; /* FULLWIDTH CENT SIGN */
		case 0xffe1: code = 0x2172; break; /* FULLWIDTH POUND SIGN */
		case 0xffe2: code = 0x224c; break; /* FULLWIDTH NOT SIGN */
		default: {
			const int ext1_len = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
			const int ext2_len = cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;

			code = -1;

			/* CP932 vendor ext1 (13ku) */
			for (int i = 0; i < ext1_len; i++) {
				if (c == cp932ext1_ucs_table[i]) {
					code = ((i/94 + 0x2d) << 8) + (i%94 + 0x21);
					break;
				}
			}

			/* CP932 vendor ext2 (starting at 89ku, i.e. row byte 0x79) */
			if (code < 0) {
				for (int i = 0; i < ext2_len; i++) {
					if (c == cp932ext2_ucs_table[i]) {
						code = ((i/94 + 0x79) << 8) + (i%94 + 0x21);
						break;
					}
				}
			}
			break;
		}
		}

		/* NUL encodes as itself; any other non-positive result is a miss */
		if (c == 0) {
			code = 0;
		} else if (code <= 0) {
			code = -1;
		}
	}

	if (code < 0 || code >= 0x8080) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (code < 0x80) {
		/* latin */
		CK((*filter->output_function)(code, filter->data));
	} else if (code < 0x100) {
		/* kana */
		CK((*filter->output_function)(0x8e, filter->data));
		CK((*filter->output_function)(code, filter->data));
	} else {
		/* two-byte character */
		CK((*filter->output_function)(((code >> 8) & 0xff) | 0x80, filter->data));
		CK((*filter->output_function)((code & 0xff) | 0x80, filter->data));
	}

	return 0;
}
9055
mb_cp51932_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9056 static size_t mb_cp51932_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9057 {
9058 unsigned char *p = *in, *e = p + *in_len;
9059 uint32_t *out = buf, *limit = buf + bufsize;
9060
9061 while (p < e && out < limit) {
9062 unsigned char c = *p++;
9063
9064 if (c < 0x80) {
9065 *out++ = c;
9066 } else if (c >= 0xA1 && c <= 0xFE && p < e) {
9067 unsigned char c2 = *p++;
9068 if (c2 >= 0xA1 && c2 <= 0xFE) {
9069 unsigned int s = (c - 0xA1)*94 + c2 - 0xA1, w = 0;
9070
9071 if (s <= 137) {
9072 if (s == 31) {
9073 w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
9074 } else if (s == 32) {
9075 w = 0xFF5E; /* FULLWIDTH TILDE */
9076 } else if (s == 33) {
9077 w = 0x2225; /* PARALLEL TO */
9078 } else if (s == 60) {
9079 w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
9080 } else if (s == 80) {
9081 w = 0xFFE0; /* FULLWIDTH CENT SIGN */
9082 } else if (s == 81) {
9083 w = 0xFFE1; /* FULLWIDTH POUND SIGN */
9084 } else if (s == 137) {
9085 w = 0xFFE2; /* FULLWIDTH NOT SIGN */
9086 }
9087 }
9088
9089 if (w == 0) {
9090 if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
9091 w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
9092 } else if (s < jisx0208_ucs_table_size) {
9093 w = jisx0208_ucs_table[s];
9094 } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
9095 w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
9096 }
9097 }
9098
9099 if (!w)
9100 w = MBFL_BAD_INPUT;
9101 *out++ = w;
9102 } else {
9103 *out++ = MBFL_BAD_INPUT;
9104 }
9105 } else if (c == 0x8E && p < e) {
9106 unsigned char c2 = *p++;
9107 if (c2 >= 0xA1 && c2 <= 0xDF) {
9108 *out++ = 0xFEC0 + c2;
9109 } else {
9110 *out++ = MBFL_BAD_INPUT;
9111 }
9112 } else {
9113 *out++ = MBFL_BAD_INPUT;
9114 }
9115 }
9116
9117 *in_len = e - p;
9118 *in = p;
9119 return out - buf;
9120 }
9121
/* Encode `len` Unicode codepoints from `in` as CP51932, appending to `buf`.
 *
 * Each codepoint produces at most two bytes, which is why the buffer is grown
 * for `len * 2`. Codepoints with no mapping are handed to MB_CONVERT_ERROR.
 * `end` is not used by this encoder. */
static void mb_wchar_to_cp51932(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;

		if (!w) {
			/* NUL is emitted as a single zero byte */
			out = mb_convert_buf_add(out, 0);
			continue;
		}

		unsigned int s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		/* We don't support JIS X0213 */
		if (s >= 0x8080) {
			s = 0;
		}

		if (!s) {
			switch (w) {
			case 0xA5: /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				s = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				s = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				s = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				s = 0x224C;
				break;
			default:
				/* Reverse lookup by linear scan: first the ext1 table (rows from 0x2D)... */
				for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
					if (cp932ext1_ucs_table[i] == w) {
						s = ((i/94 + 0x2D) << 8) + (i%94) + 0x21;
						break;
					}
				}
				/* ...and only if that found nothing, the ext2 table (rows from 0x79) */
				for (int i = 0; !s && i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
					if (cp932ext2_ucs_table[i] == w) {
						s = ((i/94 + 0x79) << 8) + (i%94) + 0x21;
					}
				}
				break;
			}
		}

		if (s && s < 0x8080) {
			if (s < 0x80) {
				out = mb_convert_buf_add(out, s);
			} else if (s < 0x100) {
				/* Values 0x80..0xFF are written behind an 0x8E prefix byte */
				out = mb_convert_buf_add2(out, 0x8E, s);
			} else {
				/* Two-byte code: set the high bit on both bytes */
				out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
			}
		} else {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp51932);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9194
mb_eucjp2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9195 static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9196 {
9197 unsigned char *p = *in, *e = p + *in_len;
9198 uint32_t *out = buf, *limit = buf + bufsize - 1;
9199
9200 while (p < e && out < limit) {
9201 unsigned char c = *p++;
9202
9203 if (c <= 0x7F) {
9204 *out++ = c;
9205 } else if (c >= 0xA1 && c <= 0xFE) {
9206 /* Kanji */
9207 if (p == e) {
9208 *out++ = MBFL_BAD_INPUT;
9209 break;
9210 }
9211 unsigned char c2 = *p++;
9212 if (c2 <= 0xA0 || c2 == 0xFF) {
9213 *out++ = MBFL_BAD_INPUT;
9214 continue;
9215 }
9216
9217 unsigned int s1 = c - 0x80, s2 = c2 - 0x80;
9218 unsigned int w1 = (s1 << 8) | s2, w = 0;
9219
9220 /* Conversion for combining characters */
9221 if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
9222 int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
9223 if (k >= 0) {
9224 *out++ = jisx0213_u2_tbl[2*k];
9225 *out++ = jisx0213_u2_tbl[2*k+1];
9226 continue;
9227 }
9228 }
9229
9230 /* Conversion for BMP */
9231 w1 = (s1 - 0x21)*94 + s2 - 0x21;
9232 if (w1 < jisx0213_ucs_table_size) {
9233 w = jisx0213_ucs_table[w1];
9234 }
9235
9236 /* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
9237 if (!w) {
9238 int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
9239 if (k >= 0) {
9240 w = jisx0213_jis_u5_tbl[k] + 0x20000;
9241 }
9242 }
9243
9244 *out++ = w ? w : MBFL_BAD_INPUT;
9245 } else if (c == 0x8E && p < e) {
9246 /* Kana */
9247 unsigned char c2 = *p++;
9248 if (c2 >= 0xA1 && c2 <= 0xDF) {
9249 *out++ = 0xFEC0 + c2;
9250 } else {
9251 *out++ = MBFL_BAD_INPUT;
9252 }
9253 } else if (c == 0x8F && p < e) {
9254 unsigned char c2 = *p++;
9255 if ((c2 == 0xA1 || (c2 >= 0xA3 && c2 <= 0xA5) || c2 == 0xA8 || (c2 >= 0xAC && c2 <= 0xAF) || (c2 >= 0xEE && c2 <= 0xFE)) && p < e) {
9256 unsigned char c3 = *p++;
9257
9258 if (c3 < 0xA1 || c3 == 0xFF) {
9259 *out++ = MBFL_BAD_INPUT;
9260 continue;
9261 }
9262
9263 unsigned int s1 = c2 - 0xA1, s2 = c3 - 0xA1;
9264
9265 if (((s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) || (s1 >= 77 && s1 < 94)) && s2 < 94) {
9266 int k;
9267 for (k = 0; k < jisx0213_p2_ofst_len; k++) {
9268 if (s1 == jisx0213_p2_ofst[k]) {
9269 break;
9270 }
9271 }
9272 k -= jisx0213_p2_ofst[k];
9273
9274 /* Check for Japanese chars in BMP */
9275 unsigned int s = (s1 + 94 + k)*94 + s2;
9276 ZEND_ASSERT(s < jisx0213_ucs_table_size);
9277 unsigned int w = jisx0213_ucs_table[s];
9278
9279 /* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
9280 if (!w) {
9281 k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
9282 if (k >= 0) {
9283 w = jisx0213_jis_u5_tbl[k] + 0x20000;
9284 }
9285 }
9286
9287 *out++ = w ? w : MBFL_BAD_INPUT;
9288 } else {
9289 *out++ = MBFL_BAD_INPUT;
9290 }
9291 } else {
9292 *out++ = MBFL_BAD_INPUT;
9293 }
9294 } else {
9295 *out++ = MBFL_BAD_INPUT;
9296 }
9297 }
9298
9299 *in_len = e - p;
9300 *in = p;
9301 return out - buf;
9302 }
9303
/* Encode `len` Unicode codepoints from `in` as EUC-JP-2004, appending to `buf`.
 *
 * Some codepoints may combine with the codepoint which follows them. When such
 * a codepoint is the last one in `in` and `end` is false, it is parked in
 * buf->state and picked up again at the start of the next call. */
static void mb_wchar_to_eucjp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;
	if (buf->state) {
		/* Resume with the codepoint held over from the previous call */
		w = buf->state;
		buf->state = 0;
		goto process_codepoint;
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		unsigned int s = 0;

		/* Check for 1st char of combining characters */
		bool may_combine = w == 0xE6
			|| (w >= 0x254 && w <= 0x2E9)
			|| (w >= 0x304B && w <= 0x3053)
			|| (w >= 0x30AB && w <= 0x30C8)
			|| w == 0x31F7;
		if (may_combine) {
			for (int k = 0; k < jisx0213_u2_tbl_len; k++) {
				if (jisx0213_u2_tbl[2*k] != w) {
					continue;
				}

				if (len) {
					/* Peek at the following codepoint */
					uint32_t w2 = *in++; len--;
					if (w2 == 0x301 && (w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A)) {
						k++;
					}
					if (jisx0213_u2_tbl[2*k+1] == w2) {
						s = jisx0213_u2_key[k];
						break;
					}
					/* Not a pair after all; put the peeked codepoint back */
					in--; len++;
				} else if (!end) {
					/* More input may follow: defer the decision to the next call */
					buf->state = w;
					MB_CONVERT_BUF_STORE(buf, out, limit);
					return;
				}

				/* Fallback */
				s = jisx0213_u2_fb_tbl[k];
				break;
			}
		}

		/* Check for major Japanese chars: U+4E00-U+9FFF */
		if (!s) {
			for (int k = 0; k < uni2jis_tbl_len; k++) {
				if (w < uni2jis_tbl_range[k][0] || w > uni2jis_tbl_range[k][1]) {
					continue;
				}
				s = uni2jis_tbl[k][w - uni2jis_tbl_range[k][0]];
				break;
			}
		}

		/* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */
		if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) {
			int k = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (k >= 0) {
				s = ucs_c1_jisx0213_ofst[k] + w - ucs_c1_jisx0213_tbl[2*k];
			}
		}

		/* Check for Japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
		if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) {
			int k = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				s = jisx0213_u5_jis_tbl[k];
			}
		}

		if (!s) {
			if (w == 0xFE45 || w == 0xFE46) {
				/* CJK Compatibility Forms: U+FE30-U+FE4F */
				s = (w == 0xFE45) ? 0x233E : 0x233D;
			} else if (w >= 0xF91D && w <= 0xF9DC) {
				/* CJK Compatibility Ideographs: U+F900-U+F92A */
				int k = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (k >= 0) {
					s = ucs_r2b_jisx0213_cmap_val[k];
				}
			}
		}

		if (!s && w) {
			/* No mapping found (codepoint zero is not an error; it is emitted below) */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0x7F) {
			out = mb_convert_buf_add(out, s);
		} else if (s <= 0xFF) {
			/* Values 0x80..0xFF are written behind an 0x8E prefix byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, 0x8E, s);
		} else if (s <= 0x7EFF) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) + 0x80, (s & 0xFF) + 0x80);
		} else {
			/* Values above 0x7EFF become a three-byte sequence behind 0x8F;
			 * the row byte is recovered through the offset table */
			unsigned int low = s & 0xFF;
			int k = ((s >> 8) & 0xFF) - 0x7F;
			ZEND_ASSERT(k < jisx0213_p2_ofst_len);
			unsigned int row = jisx0213_p2_ofst[k] + 0x21;
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
			out = mb_convert_buf_add3(out, 0x8F, row | 0x80, low | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9415
/* Byte-at-a-time EUC-CN decoder for the filter interface.
 *
 * filter->status is 0 when the next byte starts a character and 1 when a lead
 * byte (kept in filter->cache) is waiting for its trailing byte. Results are
 * passed to filter->output_function; every such call is wrapped in CK, which
 * is defined elsewhere. Returns 0 when the end of the function is reached. */
static int mbfl_filt_conv_euccn_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if ((c >= 0xA1 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7)) { /* dbcs lead byte */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: { /* dbcs second byte */
		int lead = filter->cache;
		filter->status = 0;

		if (c <= 0xA0 || c >= 0xFF) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		/* Index into the CP936 table, which this decoder shares */
		int w = (lead - 0x81)*192 + c - 0x40;
		ZEND_ASSERT(w < cp936_ucs_table_size);

		if (w == 0x1864) {
			w = 0x30FB;
		} else if (w == 0x186A) {
			w = 0x2015;
		} else if ((w >= 0x1921 && w <= 0x192A) || w == 0x1963 || (w >= 0x1C59 && w <= 0x1C7E) || (w >= 0x1DBB && w <= 0x1DC4)) {
			/* These table positions are deliberately treated as unmapped here */
			w = 0;
		} else {
			w = cp936_ucs_table[w];
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;
	}

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
9463
/* Encode one Unicode codepoint `c` as EUC-CN through the filter interface.
 *
 * The CP936 reverse tables are consulted, with a few codepoints overridden or
 * blocked, and any result outside the 0xA1+ / 0xA1+ byte range is discarded.
 * Codepoints below 0x80 which have no table mapping are emitted as themselves;
 * everything else unmapped goes to mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_euccn(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		if (c != 0xB7 && c != 0x144 && c != 0x148 && c != 0x251 && c != 0x261) {
			s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
		}
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		if (c == 0x2015) {
			s = 0xA1AA;
		} else if (c != 0x2014 && (c < 0x2170 || c > 0x2179)) {
			s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		s = (c == 0x30FB) ? 0xA1A4 : ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
	} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
		s = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		/* The two explicit overrides must be tested before the range they fall in */
		if (c == 0xFF04) {
			s = 0xA1E7;
		} else if (c == 0xFF5E) {
			s = 0xA1AB;
		} else if (c >= 0xFF01 && c <= 0xFF5D) {
			s = c - 0xFF01 + 0xA3A1;
		} else if (c >= 0xFFE0 && c <= 0xFFE5) {
			s = ucs_hff_s_cp936_table[c - 0xFFE0];
		}
	}

	/* exclude CP936 extensions */
	int high = (s >> 8) & 0xFF, low = s & 0xFF;
	if (high < 0xA1 || low < 0xA1) {
		s = 0;
	}

	if (s <= 0) {
		/* Nothing usable from the tables: pass low codepoints through, reject the rest */
		s = (c < 0x80) ? c : -1;
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (s < 0x80) { /* latin */
		CK((*filter->output_function)(s, filter->data));
	} else {
		CK((*filter->output_function)((s >> 8) & 0xFF, filter->data));
		CK((*filter->output_function)(s & 0xFF, filter->data));
	}

	return 0;
}
9528
/* Finish an EUC-CN decoding pass: if a lead byte is still pending, report it
 * as bad input, then invoke the downstream flush function when one is set. */
static int mbfl_filt_conv_euccn_wchar_flush(mbfl_convert_filter *filter)
{
	bool truncated = (filter->status == 1);

	if (truncated) {
		/* Input ended between the two bytes of a character */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
9543
mb_euccn_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9544 static size_t mb_euccn_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9545 {
9546 unsigned char *p = *in, *e = p + *in_len;
9547 uint32_t *out = buf, *limit = buf + bufsize;
9548
9549 while (p < e && out < limit) {
9550 unsigned char c = *p++;
9551
9552 if (c < 0x80) {
9553 *out++ = c;
9554 } else if (((c >= 0xA1 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7)) && p < e) {
9555 unsigned char c2 = *p++;
9556
9557 if (c2 >= 0xA1 && c2 <= 0xFE) {
9558 unsigned int w = (c - 0x81)*192 + c2 - 0x40;
9559 ZEND_ASSERT(w < cp936_ucs_table_size);
9560 if (w == 0x1864) {
9561 w = 0x30FB;
9562 } else if (w == 0x186A) {
9563 w = 0x2015;
9564 } else if ((w >= 0x1921 && w <= 0x192A) || w == 0x1963 || (w >= 0x1C59 && w <= 0x1C7E) || (w >= 0x1DBB && w <= 0x1DC4)) {
9565 w = 0;
9566 } else {
9567 w = cp936_ucs_table[w];
9568 }
9569
9570 if (!w)
9571 w = MBFL_BAD_INPUT;
9572 *out++ = w;
9573 } else {
9574 *out++ = MBFL_BAD_INPUT;
9575 }
9576 } else {
9577 *out++ = MBFL_BAD_INPUT;
9578 }
9579 }
9580
9581 *in_len = e - p;
9582 *in = p;
9583 return out - buf;
9584 }
9585
/* Encode `len` Unicode codepoints from `in` as EUC-CN, appending to `buf`.
 *
 * Each codepoint produces at most two bytes, hence the `len * 2` reservation.
 * Codepoints below 0x80 with no table mapping are copied through; other
 * unmapped codepoints go to MB_CONVERT_ERROR. `end` is not used here. */
static void mb_wchar_to_euccn(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			bool blocked = (w == 0xB7 || w == 0x144 || w == 0x148 || w == 0x251 || w == 0x261);
			if (!blocked) {
				s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
			}
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			bool blocked = (w == 0x2014 || (w >= 0x2170 && w <= 0x2179));
			if (w == 0x2015) {
				s = 0xA1AA;
			} else if (!blocked) {
				s = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			if (w != 0x30FB) {
				s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
			} else {
				s = 0xA1A4;
			}
		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
			s = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			/* The two explicit overrides must be tested before the range they fall in */
			if (w == 0xFF04) {
				s = 0xA1E7;
			} else if (w == 0xFF5E) {
				s = 0xA1AB;
			} else if (w >= 0xFF01 && w <= 0xFF5D) {
				s = w - 0xFF01 + 0xA3A1;
			} else if (w >= 0xFFE0 && w <= 0xFFE5) {
				s = ucs_hff_s_cp936_table[w - 0xFFE0];
			}
		}

		/* Exclude CP936 extensions */
		unsigned int high = (s >> 8) & 0xFF, low = s & 0xFF;
		if (high < 0xA1 || low < 0xA1) {
			s = 0;
		}

		if (s >= 0x80) {
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else if (s) {
			out = mb_convert_buf_add(out, s);
		} else if (w < 0x80) {
			/* No table entry, but low codepoints are passed through as-is */
			out = mb_convert_buf_add(out, w);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_euccn);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9647
/* Byte-at-a-time EUC-TW decoder for the filter interface.
 *
 * filter->status tracks the position within a character:
 *   0 = start of a character
 *   1 = second byte of a 2-byte character (lead byte in filter->cache)
 *   2 = second byte after 0x8E
 *   3 = third byte after 0x8E (filter->cache holds the plane minus one)
 *   4 = fourth byte (filter->cache holds plane-minus-one << 8 | row)
 * Results are passed to filter->output_function; every such call is wrapped
 * in CK, which is defined elsewhere. */
static int mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
{
	int prev, idx, w, valid;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c == 0x8E) { /* 4-byte character, first byte */
			filter->status = 2;
		} else if (c != 0xC3 && ((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD))) { /* 2-byte character, first byte */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* 2-byte character, second byte */
		prev = filter->cache;
		filter->status = 0;

		if (c <= 0xA0 || c >= 0xFF) {
			filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		idx = (prev - 0xA1)*94 + (c - 0xA1);
		w = (idx >= 0 && idx < cns11643_1_ucs_table_size) ? cns11643_1_ucs_table[idx] : 0;
		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2: /* got 0x8e, second byte */
		if (c != 0xA1 && c != 0xA2 && c != 0xAE) {
			filter->cache = 0;
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}
		filter->cache = c - 0xA1;
		filter->status = 3;
		break;

	case 3: /* got 0x8e, third byte */
		prev = filter->cache;
		filter->status = 0;

		/* Which row bytes are acceptable depends on the plane chosen by the second byte */
		valid = 0;
		if (c >= 0xA1) {
			if (prev == 0) {
				valid = ((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD)) && c != 0xC3;
			} else if (prev == 1) {
				valid = (c <= 0xF2);
			} else if (prev == 13) {
				valid = (c <= 0xE7);
			}
		}

		if (valid) {
			filter->cache = (prev << 8) + c - 0xA1;
			filter->status = 4;
		} else {
			filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 4: /* multi-byte character, fourth byte */
		prev = filter->cache;
		filter->status = 0;

		if (prev > 0xDFF || c <= 0xA0 || c >= 0xFF) {
			filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		{
			int plane = (prev & 0xF00) >> 8; /* This is actually the CNS-11643 plane minus one */
			idx = (prev & 0xFF)*94 + c - 0xA1;
			w = 0;

			if (idx >= 0) {
				/* A later version of CNS-11643 moved all the characters in "plane 14" to "plane 3",
				 * and added tens of thousands more characters in planes 4, 5, 6, and 7
				 * We only support the older version of CNS-11643
				 * This is the same as iconv from glibc 2.2 */
				if (plane == 0 && idx < cns11643_1_ucs_table_size) {
					w = cns11643_1_ucs_table[idx];
				} else if (plane == 1 && idx < cns11643_2_ucs_table_size) {
					w = cns11643_2_ucs_table[idx];
				} else if (plane == 13 && idx < cns11643_14_ucs_table_size) {
					w = cns11643_14_ucs_table[idx];
				}
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
9748
/* Encode one Unicode codepoint `c` as EUC-TW through the filter interface.
 *
 * The table value carries a plane number in bits 16..20 and a two-byte code in
 * the low 16 bits. Plane values 0 and 1 produce one byte (value below 0x80) or
 * two bytes; higher planes produce a four-byte sequence starting with 0x8E.
 * Unmapped codepoints other than zero go to mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_euctw(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_cns11643_table_min && c < ucs_a1_cns11643_table_max) {
		s = ucs_a1_cns11643_table[c - ucs_a1_cns11643_table_min];
	} else if (c >= ucs_a2_cns11643_table_min && c < ucs_a2_cns11643_table_max) {
		s = ucs_a2_cns11643_table[c - ucs_a2_cns11643_table_min];
	} else if (c >= ucs_a3_cns11643_table_min && c < ucs_a3_cns11643_table_max) {
		s = ucs_a3_cns11643_table[c - ucs_a3_cns11643_table_min];
	} else if (c >= ucs_i_cns11643_table_min && c < ucs_i_cns11643_table_max) {
		s = ucs_i_cns11643_table[c - ucs_i_cns11643_table_min];
	} else if (c >= ucs_r_cns11643_table_min && c < ucs_r_cns11643_table_max) {
		s = ucs_r_cns11643_table[c - ucs_r_cns11643_table_min];
	}

	if (s <= 0) {
		/* Only codepoint zero is allowed through without a table mapping */
		s = (c == 0) ? 0 : -1;
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	int plane = (s & 0x1F0000) >> 16;

	if (plane <= 1 && s < 0x80) { /* latin */
		CK((*filter->output_function)(s, filter->data));
		return 0;
	}

	/* Both trailing bytes always have their high bit set */
	int high = ((s >> 8) & 0xFF) | 0x80;
	int low = (s & 0xFF) | 0x80;

	if (plane > 1) {
		/* Four-byte form: 0x8E, then 0xA0 + plane, then the two code bytes */
		CK((*filter->output_function)(0x8e, filter->data));
		CK((*filter->output_function)(0xA0 + plane, filter->data));
	}
	CK((*filter->output_function)(high, filter->data));
	CK((*filter->output_function)(low, filter->data));

	return 0;
}
9795
/* Finish an EUC-TW decoding pass: any non-zero status means a multi-byte
 * character was cut short, which is reported as bad input. The downstream
 * flush function is then invoked when one is set. */
static int mbfl_filt_conv_euctw_wchar_flush(mbfl_convert_filter *filter)
{
	bool truncated = (filter->status != 0);

	if (truncated) {
		/* Input ended part-way through a 2-byte or 4-byte character */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
9810
mb_euctw_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9811 static size_t mb_euctw_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9812 {
9813 unsigned char *p = *in, *e = p + *in_len;
9814 uint32_t *out = buf, *limit = buf + bufsize;
9815
9816 while (p < e && out < limit) {
9817 unsigned char c = *p++;
9818
9819 if (c < 0x80) {
9820 *out++ = c;
9821 } else if (((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD)) && c != 0xC3 && p < e) {
9822 unsigned char c2 = *p++;
9823
9824 if (c2 >= 0xA1 && c2 <= 0xFE) {
9825 unsigned int w = (c - 0xA1)*94 + (c2 - 0xA1);
9826 if (w < cns11643_1_ucs_table_size) {
9827 w = cns11643_1_ucs_table[w];
9828 } else {
9829 w = 0;
9830 }
9831 if (!w)
9832 w = MBFL_BAD_INPUT;
9833 *out++ = w;
9834 } else {
9835 *out++ = MBFL_BAD_INPUT;
9836 }
9837 } else if (c == 0x8E && p < e) {
9838 unsigned char c2 = *p++;
9839
9840 if ((c2 == 0xA1 || c2 == 0xA2 || c2 == 0xAE) && p < e) {
9841 unsigned int plane = c2 - 0xA1; /* This is actually the CNS-11643 plane minus one */
9842 unsigned char c3 = *p++;
9843
9844 if (c3 >= 0xA1 && ((plane == 0 && ((c3 >= 0xA1 && c3 <= 0xA6) || (c3 >= 0xC2 && c3 <= 0xFD)) && c3 != 0xC3) || (plane == 1 && c3 <= 0xF2) || (plane == 13 && c3 <= 0xE7)) && p < e) {
9845 unsigned char c4 = *p++;
9846
9847 if (c2 <= 0xAE && c4 > 0xA0 && c4 < 0xFF) {
9848 unsigned int s = (c3 - 0xA1)*94 + c4 - 0xA1, w = 0;
9849
9850 /* A later version of CNS-11643 moved all the characters in "plane 14" to "plane 3",
9851 * and added tens of thousands more characters in planes 4, 5, 6, and 7
9852 * We only support the older version of CNS-11643
9853 * This is the same as iconv from glibc 2.2 */
9854 if (plane == 0 && s < cns11643_1_ucs_table_size) {
9855 w = cns11643_1_ucs_table[s];
9856 } else if (plane == 1 && s < cns11643_2_ucs_table_size) {
9857 w = cns11643_2_ucs_table[s];
9858 } else if (plane == 13 && s < cns11643_14_ucs_table_size) {
9859 w = cns11643_14_ucs_table[s];
9860 }
9861
9862 if (!w)
9863 w = MBFL_BAD_INPUT;
9864 *out++ = w;
9865 continue;
9866 }
9867 }
9868 }
9869
9870 *out++ = MBFL_BAD_INPUT;
9871 } else {
9872 *out++ = MBFL_BAD_INPUT;
9873 }
9874 }
9875
9876 *in_len = e - p;
9877 *in = p;
9878 return out - buf;
9879 }
9880
/* Encode `len` Unicode codepoints from `in` as EUC-TW, appending to `buf`.
 *
 * Two bytes per codepoint are reserved up front; the four-byte form reserves
 * its extra room just before it is written. Codepoint zero is emitted as a
 * zero byte, other unmapped codepoints go to MB_CONVERT_ERROR.
 * `end` is not used here. */
static void mb_wchar_to_euctw(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_cns11643_table_min && w < ucs_a1_cns11643_table_max) {
			s = ucs_a1_cns11643_table[w - ucs_a1_cns11643_table_min];
		} else if (w >= ucs_a2_cns11643_table_min && w < ucs_a2_cns11643_table_max) {
			s = ucs_a2_cns11643_table[w - ucs_a2_cns11643_table_min];
		} else if (w >= ucs_a3_cns11643_table_min && w < ucs_a3_cns11643_table_max) {
			s = ucs_a3_cns11643_table[w - ucs_a3_cns11643_table_min];
		} else if (w >= ucs_i_cns11643_table_min && w < ucs_i_cns11643_table_max) {
			s = ucs_i_cns11643_table[w - ucs_i_cns11643_table_min];
		} else if (w >= ucs_r_cns11643_table_min && w < ucs_r_cns11643_table_max) {
			s = ucs_r_cns11643_table[w - ucs_r_cns11643_table_min];
		}

		if (s) {
			/* Bits 16 and up of the table value give the plane */
			unsigned int plane = s >> 16;

			if (plane > 1) {
				/* Four-byte form: 0x8E, then 0xA0 + plane, then the two code bytes */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add4(out, 0x8E, 0xA0 + plane, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
			} else if (s >= 0x80) {
				out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
			} else {
				out = mb_convert_buf_add(out, s);
			}
		} else if (!w) {
			out = mb_convert_buf_add(out, 0);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_euctw);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9927
/* Byte-at-a-time EUC-KR decoder for the filter interface.
 *
 * filter->status is 0 when the next byte starts a character and 1 when a lead
 * byte (kept in filter->cache) is waiting for its trailing byte. Lookups use
 * the UHC tables. Results are passed to filter->output_function; every such
 * call is wrapped in CK, which is defined elsewhere. */
static int mbfl_filt_conv_euckr_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c != 0xC9 && ((c >= 0xA1 && c <= 0xAC) || (c >= 0xB0 && c <= 0xFD))) { /* dbcs lead byte */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: { /* dbcs second byte */
		int lead = filter->cache;
		filter->status = 0;

		/* The lead byte selects which of the two tables is indexed */
		bool use_uhc1 = (lead >= 0xa1 && lead <= 0xc6);
		bool use_uhc3 = !use_uhc1 && lead >= 0xc7 && lead <= 0xfe && lead != 0xc9;

		if (!(use_uhc1 || use_uhc3) || c < 0xa1 || c > 0xfe) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		int w;
		if (use_uhc1) { /* 1st: 0xa1..0xc6, 2nd: 0x41..0x7a, 0x81..0xfe */
			w = (lead - 0x81)*190 + c - 0x41;
			ZEND_ASSERT(w < uhc1_ucs_table_size);
			w = uhc1_ucs_table[w];
		} else { /* 1st: 0xc7..0xc8,0xca..0xfe, 2nd: 0xa1..0xfe */
			w = (lead - 0xc7)*94 + c - 0xa1;
			ZEND_ASSERT(w < uhc3_ucs_table_size);
			w = uhc3_ucs_table[w];
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;
	}

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
9978
/* Encode one Unicode codepoint `c` as EUC-KR through the filter interface.
 *
 * The UHC reverse tables are consulted, and any result whose bytes are not
 * both 0xA1 or above is discarded. Codepoints below 0x80 with no usable table
 * entry are emitted as themselves; everything else unmapped goes to
 * mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_euckr(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
		s = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
	} else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) {
		s = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min];
	} else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) {
		s = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min];
	} else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) {
		s = ucs_i_uhc_table[c - ucs_i_uhc_table_min];
	} else if (c >= ucs_s_uhc_table_min && c < ucs_s_uhc_table_max) {
		s = ucs_s_uhc_table[c - ucs_s_uhc_table_min];
	} else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) {
		s = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min];
	} else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) {
		s = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
	}

	/* exclude UHC extension area (although we are using the UHC conversion tables) */
	int high = (s >> 8) & 0xFF, low = s & 0xFF;
	if (high < 0xA1 || low < 0xA1) {
		s = 0;
	}

	if (s <= 0) {
		/* Nothing usable from the tables: pass low codepoints through, reject the rest */
		s = (c < 0x80) ? c : -1;
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (s < 0x80) { /* latin */
		CK((*filter->output_function)(s, filter->data));
	} else {
		CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(s & 0xff, filter->data));
	}

	return 0;
}
10025
/* Finish an EUC-KR decoding pass: if a lead byte is still pending, report it
 * as bad input, then invoke the downstream flush function when one is set. */
static int mbfl_filt_conv_euckr_wchar_flush(mbfl_convert_filter *filter)
{
	bool truncated = (filter->status == 1);

	if (truncated) {
		/* Input ended between the two bytes of a character */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
10040
mb_euckr_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)10041 static size_t mb_euckr_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
10042 {
10043 unsigned char *p = *in, *e = p + *in_len;
10044 uint32_t *out = buf, *limit = buf + bufsize;
10045
10046 while (p < e && out < limit) {
10047 unsigned char c = *p++;
10048
10049 if (c < 0x80) {
10050 *out++ = c;
10051 } else if (((c >= 0xA1 && c <= 0xAC) || (c >= 0xB0 && c <= 0xFD)) && c != 0xC9 && p < e) {
10052 unsigned char c2 = *p++;
10053 if (c2 < 0xA1 || c2 == 0xFF) {
10054 *out++ = MBFL_BAD_INPUT;
10055 continue;
10056 }
10057
10058 if (c <= 0xC6) {
10059 unsigned int w = (c - 0x81)*190 + c2 - 0x41;
10060 ZEND_ASSERT(w < uhc1_ucs_table_size);
10061 w = uhc1_ucs_table[w];
10062 if (!w)
10063 w = MBFL_BAD_INPUT;
10064 *out++ = w;
10065 } else {
10066 unsigned int w = (c - 0xC7)*94 + c2 - 0xA1;
10067 ZEND_ASSERT(w < uhc3_ucs_table_size);
10068 w = uhc3_ucs_table[w];
10069 if (!w)
10070 w = MBFL_BAD_INPUT;
10071 *out++ = w;
10072 }
10073 } else {
10074 *out++ = MBFL_BAD_INPUT;
10075 }
10076 }
10077
10078 *in_len = e - p;
10079 *in = p;
10080 return out - buf;
10081 }
10082
/* Encode `len` wchar code points from `in` as EUC-KR, appending to `buf`.
 *
 * The UHC lookup tables are reused, but any result whose high or low byte is
 * below 0xA1 is discarded, so only codes inside the EUC-KR area are emitted.
 * Code points below 0x80 without a table mapping are written as one byte;
 * any other unmapped code point is reported through MB_CONVERT_ERROR.
 * `end` is not used by this encoder. */
static void mb_wchar_to_euckr(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int code = 0;

		if (cp >= ucs_a1_uhc_table_min && cp < ucs_a1_uhc_table_max) {
			code = ucs_a1_uhc_table[cp - ucs_a1_uhc_table_min];
		} else if (cp >= ucs_a2_uhc_table_min && cp < ucs_a2_uhc_table_max) {
			code = ucs_a2_uhc_table[cp - ucs_a2_uhc_table_min];
		} else if (cp >= ucs_a3_uhc_table_min && cp < ucs_a3_uhc_table_max) {
			code = ucs_a3_uhc_table[cp - ucs_a3_uhc_table_min];
		} else if (cp >= ucs_i_uhc_table_min && cp < ucs_i_uhc_table_max) {
			code = ucs_i_uhc_table[cp - ucs_i_uhc_table_min];
		} else if (cp >= ucs_s_uhc_table_min && cp < ucs_s_uhc_table_max) {
			code = ucs_s_uhc_table[cp - ucs_s_uhc_table_min];
		} else if (cp >= ucs_r1_uhc_table_min && cp < ucs_r1_uhc_table_max) {
			code = ucs_r1_uhc_table[cp - ucs_r1_uhc_table_min];
		} else if (cp >= ucs_r2_uhc_table_min && cp < ucs_r2_uhc_table_max) {
			code = ucs_r2_uhc_table[cp - ucs_r2_uhc_table_min];
		}

		/* Drop anything that lies in the UHC extension area */
		unsigned int hi = (code >> 8) & 0xFF;
		unsigned int lo = code & 0xFF;
		if (hi < 0xA1 || lo < 0xA1) {
			code = 0;
		}

		if (code >= 0x80) {
			/* Two output bytes now, plus room for the code points still to come */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
		} else if (code) {
			out = mb_convert_buf_add(out, code);
		} else if (cp < 0x80) {
			out = mb_convert_buf_add(out, cp);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_euckr);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
10131
/* Byte-at-a-time UHC -> wchar filter.
 *
 * status 0: expecting the first byte of a character. Bytes below 0x80 are
 *           passed through; 0x81-0xFD except 0xC9 are stored as a lead byte;
 *           everything else yields MBFL_BAD_INPUT.
 * status 1: `c` is the second byte of a 2-byte character whose lead byte is
 *           in filter->cache.
 *
 * A byte pair with no mapping yields one MBFL_BAD_INPUT marker. Returns 0 on
 * the normal path. */
static int mbfl_filt_conv_uhc_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0x80 && c < 0xfe && c != 0xc9) { /* dbcs lead byte */
			filter->status = 1;
			filter->cache = c;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: { /* dbcs second byte */
		int c1 = filter->cache, w = 0;
		filter->status = 0;

		/* `w` is only ever assigned a value read from a table. If the index
		 * were to fall outside the table, `w` stays 0 and the pair is reported
		 * as bad input, instead of the raw index being emitted as if it were
		 * a code point. */
		if (c1 >= 0x81 && c1 <= 0xc6 && c >= 0x41 && c <= 0xfe) {
			int idx = (c1 - 0x81)*190 + (c - 0x41);
			if (idx >= 0 && idx < uhc1_ucs_table_size) {
				w = uhc1_ucs_table[idx];
			}
		} else if (c1 >= 0xc7 && c1 < 0xfe && c >= 0xa1 && c <= 0xfe) {
			int idx = (c1 - 0xc7)*94 + (c - 0xa1);
			if (idx >= 0 && idx < uhc3_ucs_table_size) {
				w = uhc3_ucs_table[idx];
			}
		}

		if (w == 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;
	}

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
10173
/* Flush handler of the UHC -> wchar filter.
 *
 * A pending lead byte (status 1) at end of input is reported as a single
 * MBFL_BAD_INPUT marker and the status is cleared. The filter's
 * flush_function callback, when set, is then called with filter->data.
 * Returns 0 on the normal path. */
static int mbfl_filt_conv_uhc_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status == 1) {
		/* Second byte of a 2-byte character never arrived */
		filter->status = 0;
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function == NULL) {
		return 0;
	}

	filter->flush_function(filter->data);
	return 0;
}
10188
/* Byte-at-a-time wchar -> UHC filter.
 *
 * Looks up code point `c` in the UHC reverse tables. A result below 0x80 is
 * written as one byte, any other result as two bytes (high byte first).
 * A code point other than 0 that has no mapping is passed to
 * mbfl_filt_conv_illegal_output. Returns 0 on the normal path. */
static int mbfl_filt_conv_wchar_uhc(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
		code = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
	} else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) {
		code = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min];
	} else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) {
		code = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min];
	} else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) {
		code = ucs_i_uhc_table[c - ucs_i_uhc_table_min];
	} else if (c >= ucs_s_uhc_table_min && c < ucs_s_uhc_table_max) {
		code = ucs_s_uhc_table[c - ucs_s_uhc_table_min];
	} else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) {
		code = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min];
	} else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) {
		code = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
	}

	/* Zero means "not found", except that U+0000 itself encodes as byte 0 */
	if (code < 0 || (code == 0 && c != 0)) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x80) { /* latin */
		CK(filter->output_function(code, filter->data));
	} else {
		CK(filter->output_function((code >> 8) & 0xff, filter->data));
		CK(filter->output_function(code & 0xff, filter->data));
	}

	return 0;
}
10226
mb_uhc_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)10227 static size_t mb_uhc_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
10228 {
10229 unsigned char *p = *in, *e = p + *in_len;
10230 uint32_t *out = buf, *limit = buf + bufsize;
10231
10232 e--; /* Stop the main loop 1 byte short of the end of the input */
10233
10234 while (p < e && out < limit) {
10235 unsigned char c = *p++;
10236
10237 if (c < 0x80) {
10238 *out++ = c;
10239 } else if (c > 0x80 && c < 0xFE) {
10240 /* We don't need to check p < e here; it's not possible that this pointer dereference
10241 * will be outside the input string, because of e-- above */
10242 unsigned char c2 = *p++;
10243 if (c2 < 0x41 || c2 == 0xFF) {
10244 *out++ = MBFL_BAD_INPUT;
10245 continue;
10246 }
10247 unsigned int w = 0;
10248
10249 if (c <= 0xC6) {
10250 w = (c - 0x81)*190 + c2 - 0x41;
10251 ZEND_ASSERT(w < uhc1_ucs_table_size);
10252 w = uhc1_ucs_table[w];
10253 } else if (c2 >= 0xA1) {
10254 w = (c - 0xC7)*94 + c2 - 0xA1;
10255 ZEND_ASSERT(w < uhc3_ucs_table_size);
10256 w = uhc3_ucs_table[w];
10257 }
10258 if (!w) {
10259 /* If c == 0xC9, we shouldn't have tried to read a 2-byte char at all... but it is faster
10260 * to fix up that rare case here rather than include an extra check in the hot path */
10261 if (c == 0xC9) {
10262 p--;
10263 }
10264 w = MBFL_BAD_INPUT;
10265 }
10266 *out++ = w;
10267 } else {
10268 *out++ = MBFL_BAD_INPUT;
10269 }
10270 }
10271
10272 /* Finish up last byte of input string if there is one */
10273 if (p == e && out < limit) {
10274 unsigned char c = *p++;
10275 *out++ = (c < 0x80) ? c : MBFL_BAD_INPUT;
10276 }
10277
10278 *in_len = e - p + 1;
10279 *in = p;
10280 return out - buf;
10281 }
10282
/* Encode `len` wchar code points from `in` as UHC, appending to `buf`.
 *
 * Each code point is looked up in the UHC reverse tables. A result below 0x80
 * is written as one byte, any other result as two bytes (high byte first).
 * U+0000 is written as a zero byte; any other unmapped code point is reported
 * through MB_CONVERT_ERROR. `end` is not used by this encoder. */
static void mb_wchar_to_uhc(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int code = 0;

		if (cp >= ucs_a1_uhc_table_min && cp < ucs_a1_uhc_table_max) {
			code = ucs_a1_uhc_table[cp - ucs_a1_uhc_table_min];
		} else if (cp >= ucs_a2_uhc_table_min && cp < ucs_a2_uhc_table_max) {
			code = ucs_a2_uhc_table[cp - ucs_a2_uhc_table_min];
		} else if (cp >= ucs_a3_uhc_table_min && cp < ucs_a3_uhc_table_max) {
			code = ucs_a3_uhc_table[cp - ucs_a3_uhc_table_min];
		} else if (cp >= ucs_i_uhc_table_min && cp < ucs_i_uhc_table_max) {
			code = ucs_i_uhc_table[cp - ucs_i_uhc_table_min];
		} else if (cp >= ucs_s_uhc_table_min && cp < ucs_s_uhc_table_max) {
			code = ucs_s_uhc_table[cp - ucs_s_uhc_table_min];
		} else if (cp >= ucs_r1_uhc_table_min && cp < ucs_r1_uhc_table_max) {
			code = ucs_r1_uhc_table[cp - ucs_r1_uhc_table_min];
		} else if (cp >= ucs_r2_uhc_table_min && cp < ucs_r2_uhc_table_max) {
			code = ucs_r2_uhc_table[cp - ucs_r2_uhc_table_min];
		}

		if (code >= 0x80) {
			/* Two output bytes now, plus room for the code points still to come */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
		} else if (code) {
			out = mb_convert_buf_add(out, code);
		} else if (cp == 0) {
			out = mb_convert_buf_add(out, 0);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_uhc);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
10326
/* Per-byte length table with one entry for each of the 256 byte values.
 * Bytes 0xA1-0xFE map to 2, byte 0x8E maps to 2 and byte 0x8F maps to 3;
 * every other byte maps to 1. */
static const unsigned char mblen_table_eucjp[] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xB0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
10345
10346 static const char *mbfl_encoding_euc_jp_aliases[] = {"EUC", "EUC_JP", "eucJP", "x-euc-jp", NULL};
10347
10348 static const struct mbfl_convert_vtbl vtbl_eucjp_wchar = {
10349 mbfl_no_encoding_euc_jp,
10350 mbfl_no_encoding_wchar,
10351 mbfl_filt_conv_common_ctor,
10352 NULL,
10353 mbfl_filt_conv_eucjp_wchar,
10354 mbfl_filt_conv_eucjp_wchar_flush,
10355 NULL,
10356 };
10357
10358 static const struct mbfl_convert_vtbl vtbl_wchar_eucjp = {
10359 mbfl_no_encoding_wchar,
10360 mbfl_no_encoding_euc_jp,
10361 mbfl_filt_conv_common_ctor,
10362 NULL,
10363 mbfl_filt_conv_wchar_eucjp,
10364 mbfl_filt_conv_common_flush,
10365 NULL,
10366 };
10367
10368 const mbfl_encoding mbfl_encoding_euc_jp = {
10369 mbfl_no_encoding_euc_jp,
10370 "EUC-JP",
10371 "EUC-JP",
10372 mbfl_encoding_euc_jp_aliases,
10373 mblen_table_eucjp,
10374 0,
10375 &vtbl_eucjp_wchar,
10376 &vtbl_wchar_eucjp,
10377 mb_eucjp_to_wchar,
10378 mb_wchar_to_eucjp,
10379 NULL,
10380 NULL,
10381 };
10382
10383 static const char *mbfl_encoding_eucjp2004_aliases[] = {"EUC_JP-2004", NULL};
10384
10385 static const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = {
10386 mbfl_no_encoding_eucjp2004,
10387 mbfl_no_encoding_wchar,
10388 mbfl_filt_conv_common_ctor,
10389 NULL,
10390 mbfl_filt_conv_jis2004_wchar,
10391 mbfl_filt_conv_jis2004_wchar_flush,
10392 NULL,
10393 };
10394
10395 static const struct mbfl_convert_vtbl vtbl_wchar_eucjp2004 = {
10396 mbfl_no_encoding_wchar,
10397 mbfl_no_encoding_eucjp2004,
10398 mbfl_filt_conv_common_ctor,
10399 NULL,
10400 mbfl_filt_conv_wchar_jis2004,
10401 mbfl_filt_conv_wchar_jis2004_flush,
10402 NULL,
10403 };
10404
10405 const mbfl_encoding mbfl_encoding_eucjp2004 = {
10406 mbfl_no_encoding_eucjp2004,
10407 "EUC-JP-2004",
10408 "EUC-JP",
10409 mbfl_encoding_eucjp2004_aliases,
10410 mblen_table_eucjp,
10411 0,
10412 &vtbl_eucjp2004_wchar,
10413 &vtbl_wchar_eucjp2004,
10414 mb_eucjp2004_to_wchar,
10415 mb_wchar_to_eucjp2004,
10416 NULL,
10417 NULL,
10418 };
10419
10420 static const char *mbfl_encoding_eucjp_win_aliases[] = {"eucJP-open", "eucJP-ms", NULL};
10421
10422 static const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = {
10423 mbfl_no_encoding_eucjp_win,
10424 mbfl_no_encoding_wchar,
10425 mbfl_filt_conv_common_ctor,
10426 NULL,
10427 mbfl_filt_conv_eucjpwin_wchar,
10428 mbfl_filt_conv_eucjpwin_wchar_flush,
10429 NULL,
10430 };
10431
10432 static const struct mbfl_convert_vtbl vtbl_wchar_eucjpwin = {
10433 mbfl_no_encoding_wchar,
10434 mbfl_no_encoding_eucjp_win,
10435 mbfl_filt_conv_common_ctor,
10436 NULL,
10437 mbfl_filt_conv_wchar_eucjpwin,
10438 mbfl_filt_conv_common_flush,
10439 NULL,
10440 };
10441
10442 const mbfl_encoding mbfl_encoding_eucjp_win = {
10443 mbfl_no_encoding_eucjp_win,
10444 "eucJP-win",
10445 "EUC-JP",
10446 mbfl_encoding_eucjp_win_aliases,
10447 mblen_table_eucjp,
10448 0,
10449 &vtbl_eucjpwin_wchar,
10450 &vtbl_wchar_eucjpwin,
10451 mb_eucjpwin_to_wchar,
10452 mb_wchar_to_eucjpwin,
10453 NULL,
10454 NULL,
10455 };
10456
10457 static const char *mbfl_encoding_cp51932_aliases[] = {"cp51932", NULL};
10458
10459 static const struct mbfl_convert_vtbl vtbl_cp51932_wchar = {
10460 mbfl_no_encoding_cp51932,
10461 mbfl_no_encoding_wchar,
10462 mbfl_filt_conv_common_ctor,
10463 NULL,
10464 mbfl_filt_conv_cp51932_wchar,
10465 mbfl_filt_conv_cp51932_wchar_flush,
10466 NULL,
10467 };
10468
10469 static const struct mbfl_convert_vtbl vtbl_wchar_cp51932 = {
10470 mbfl_no_encoding_wchar,
10471 mbfl_no_encoding_cp51932,
10472 mbfl_filt_conv_common_ctor,
10473 NULL,
10474 mbfl_filt_conv_wchar_cp51932,
10475 mbfl_filt_conv_common_flush,
10476 NULL,
10477 };
10478
10479 const mbfl_encoding mbfl_encoding_cp51932 = {
10480 mbfl_no_encoding_cp51932,
10481 "CP51932",
10482 "CP51932",
10483 mbfl_encoding_cp51932_aliases,
10484 mblen_table_eucjp,
10485 0,
10486 &vtbl_cp51932_wchar,
10487 &vtbl_wchar_cp51932,
10488 mb_cp51932_to_wchar,
10489 mb_wchar_to_cp51932,
10490 NULL,
10491 NULL,
10492 };
10493
/* Per-byte length table with one entry for each of the 256 byte values.
 * Bytes 0xA1-0xFE map to 2; every other byte maps to 1. */
static const unsigned char mblen_table_euccn[] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xB0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
10512
10513 static const char *mbfl_encoding_euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", "gb2312", NULL};
10514
10515 static const struct mbfl_convert_vtbl vtbl_euccn_wchar = {
10516 mbfl_no_encoding_euc_cn,
10517 mbfl_no_encoding_wchar,
10518 mbfl_filt_conv_common_ctor,
10519 NULL,
10520 mbfl_filt_conv_euccn_wchar,
10521 mbfl_filt_conv_euccn_wchar_flush,
10522 NULL,
10523 };
10524
10525 static const struct mbfl_convert_vtbl vtbl_wchar_euccn = {
10526 mbfl_no_encoding_wchar,
10527 mbfl_no_encoding_euc_cn,
10528 mbfl_filt_conv_common_ctor,
10529 NULL,
10530 mbfl_filt_conv_wchar_euccn,
10531 mbfl_filt_conv_common_flush,
10532 NULL,
10533 };
10534
10535 const mbfl_encoding mbfl_encoding_euc_cn = {
10536 mbfl_no_encoding_euc_cn,
10537 "EUC-CN",
10538 "CN-GB",
10539 mbfl_encoding_euc_cn_aliases,
10540 mblen_table_euccn,
10541 0,
10542 &vtbl_euccn_wchar,
10543 &vtbl_wchar_euccn,
10544 mb_euccn_to_wchar,
10545 mb_wchar_to_euccn,
10546 NULL,
10547 NULL,
10548 };
10549
10550 static const char *mbfl_encoding_euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL};
10551
10552 static const struct mbfl_convert_vtbl vtbl_euctw_wchar = {
10553 mbfl_no_encoding_euc_tw,
10554 mbfl_no_encoding_wchar,
10555 mbfl_filt_conv_common_ctor,
10556 NULL,
10557 mbfl_filt_conv_euctw_wchar,
10558 mbfl_filt_conv_euctw_wchar_flush,
10559 NULL,
10560 };
10561
10562 static const struct mbfl_convert_vtbl vtbl_wchar_euctw = {
10563 mbfl_no_encoding_wchar,
10564 mbfl_no_encoding_euc_tw,
10565 mbfl_filt_conv_common_ctor,
10566 NULL,
10567 mbfl_filt_conv_wchar_euctw,
10568 mbfl_filt_conv_common_flush,
10569 NULL,
10570 };
10571
10572 const mbfl_encoding mbfl_encoding_euc_tw = {
10573 mbfl_no_encoding_euc_tw,
10574 "EUC-TW",
10575 "EUC-TW",
10576 mbfl_encoding_euc_tw_aliases,
10577 mblen_table_euccn,
10578 0,
10579 &vtbl_euctw_wchar,
10580 &vtbl_wchar_euctw,
10581 mb_euctw_to_wchar,
10582 mb_wchar_to_euctw,
10583 NULL,
10584 NULL,
10585 };
10586
10587 static const char *mbfl_encoding_euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL};
10588
10589 static const struct mbfl_convert_vtbl vtbl_euckr_wchar = {
10590 mbfl_no_encoding_euc_kr,
10591 mbfl_no_encoding_wchar,
10592 mbfl_filt_conv_common_ctor,
10593 NULL,
10594 mbfl_filt_conv_euckr_wchar,
10595 mbfl_filt_conv_euckr_wchar_flush,
10596 NULL,
10597 };
10598
10599 static const struct mbfl_convert_vtbl vtbl_wchar_euckr = {
10600 mbfl_no_encoding_wchar,
10601 mbfl_no_encoding_euc_kr,
10602 mbfl_filt_conv_common_ctor,
10603 NULL,
10604 mbfl_filt_conv_wchar_euckr,
10605 mbfl_filt_conv_common_flush,
10606 NULL,
10607 };
10608
10609 const mbfl_encoding mbfl_encoding_euc_kr = {
10610 mbfl_no_encoding_euc_kr,
10611 "EUC-KR",
10612 "EUC-KR",
10613 mbfl_encoding_euc_kr_aliases,
10614 mblen_table_euccn,
10615 0,
10616 &vtbl_euckr_wchar,
10617 &vtbl_wchar_euckr,
10618 mb_euckr_to_wchar,
10619 mb_wchar_to_euckr,
10620 NULL,
10621 NULL,
10622 };
10623
/* UHC was introduced by Microsoft in Windows 95, and is also known as CP949.
 * It is the same as EUC-KR, but with 8,822 additional characters added to
 * complete all the characters in the Johab charset. */
10627
/* Per-byte length table with one entry for each of the 256 byte values.
 * Bytes 0x81-0xFE map to 2; every other byte maps to 1. */
static const unsigned char mblen_table_81_to_fe[] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xA0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xB0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
10646
10647 static const char *mbfl_encoding_uhc_aliases[] = {"CP949", NULL};
10648
10649 static const struct mbfl_convert_vtbl vtbl_uhc_wchar = {
10650 mbfl_no_encoding_uhc,
10651 mbfl_no_encoding_wchar,
10652 mbfl_filt_conv_common_ctor,
10653 NULL,
10654 mbfl_filt_conv_uhc_wchar,
10655 mbfl_filt_conv_uhc_wchar_flush,
10656 NULL,
10657 };
10658
10659 static const struct mbfl_convert_vtbl vtbl_wchar_uhc = {
10660 mbfl_no_encoding_wchar,
10661 mbfl_no_encoding_uhc,
10662 mbfl_filt_conv_common_ctor,
10663 NULL,
10664 mbfl_filt_conv_wchar_uhc,
10665 mbfl_filt_conv_common_flush,
10666 NULL,
10667 };
10668
10669 const mbfl_encoding mbfl_encoding_uhc = {
10670 mbfl_no_encoding_uhc,
10671 "UHC",
10672 "UHC",
10673 mbfl_encoding_uhc_aliases,
10674 mblen_table_81_to_fe,
10675 0,
10676 &vtbl_uhc_wchar,
10677 &vtbl_wchar_uhc,
10678 mb_uhc_to_wchar,
10679 mb_wchar_to_uhc,
10680 NULL,
10681 NULL,
10682 };
10683
10684 /*
10685 * GB18030/CP936
10686 */
10687
/* Byte-at-a-time GB18030 -> wchar filter.
 *
 * filter->status tracks the position inside a multi-byte character:
 *   0: first byte; 1: second byte; 2: third byte of a 4-byte code;
 *   3: fourth byte of a 4-byte code.
 * filter->cache holds the bytes collected so far. Exactly one value is sent to
 * the output callback per completed or rejected character; rejected input is
 * reported as MBFL_BAD_INPUT. Returns 0 on the normal path. */
static int mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK(filter->output_function(c, filter->data));
		} else if (c > 0x80 && c < 0xff) { /* dbcs/qbcs lead byte */
			filter->status = 1;
			filter->cache = c;
		} else {
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: { /* dbcs/qbcs second byte */
		int lead = filter->cache;
		int pair = (lead << 8) | c;
		int w = -1;

		filter->status = 0;

		/* A digit after a lead byte in 0x81-0x84 (BMP) or 0x90-0xE3 (other planes)
		 * announces a 4-byte code; keep collecting bytes */
		if (((lead >= 0x81 && lead <= 0x84) || (lead >= 0x90 && lead <= 0xe3)) && c >= 0x30 && c <= 0x39) {
			filter->status = 2;
			filter->cache = pair;
			return 0;
		}

		if (((lead >= 0xaa && lead <= 0xaf) || (lead >= 0xf8 && lead <= 0xfe)) && (c >= 0xa1 && c <= 0xfe)) {
			/* UDA part 1,2: U+E000-U+E4C5 */
			w = 94*(lead >= 0xf8 ? lead - 0xf2 : lead - 0xaa) + (c - 0xa1) + 0xe000;
		} else if (lead >= 0xa1 && lead <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) {
			/* UDA part 3: U+E4C6-U+E765 */
			w = 96*(lead - 0xa1) + c - (c >= 0x80 ? 0x41 : 0x40) + 0xe4c6;
		} else {
			/* Only scan the PUA range table when the pair lies in one of the
			 * three windows that the table can match */
			int in_pua_window =
				(pair >= 0xa2ab && pair <= 0xa9f0 + (0xe80f-0xe801)) ||
				(pair >= 0xd7fa && pair <= 0xd7fa + (0xe814-0xe810)) ||
				(pair >= 0xfe50 && pair <= 0xfe80 + (0xe864-0xe844));

			if (in_pua_window) {
				for (int k = 0; k < mbfl_gb18030_pua_tbl_max; k++) {
					if (pair >= mbfl_gb18030_pua_tbl[k][2] && pair <= mbfl_gb18030_pua_tbl[k][2] + mbfl_gb18030_pua_tbl[k][1] - mbfl_gb18030_pua_tbl[k][0]) {
						w = pair - mbfl_gb18030_pua_tbl[k][2] + mbfl_gb18030_pua_tbl[k][0];
						break;
					}
				}
			}

			if (w <= 0) {
				/* Ordinary 2-byte code: look it up in the CP936 table */
				int in_cp936 =
					(lead >= 0xa1 && lead <= 0xa9 && c >= 0xa1 && c <= 0xfe) ||
					(lead >= 0xb0 && lead <= 0xf7 && c >= 0xa1 && c <= 0xfe) ||
					(lead >= 0x81 && lead <= 0xa0 && c >= 0x40 && c <= 0xfe && c != 0x7f) ||
					(lead >= 0xaa && lead <= 0xfe && c >= 0x40 && c <= 0xa0 && c != 0x7f) ||
					(lead >= 0xa8 && lead <= 0xa9 && c >= 0x40 && c <= 0xa0 && c != 0x7f);

				if (in_cp936) {
					int idx = (lead - 0x81)*192 + c - 0x40;
					ZEND_ASSERT(idx < cp936_ucs_table_size);
					w = cp936_ucs_table[idx];
				} else {
					w = MBFL_BAD_INPUT;
				}
			}
		}

		CK(filter->output_function(w, filter->data));
		break;
	}

	case 2: { /* qbcs third byte */
		int b1 = (filter->cache >> 8) & 0xff;
		int b2 = filter->cache & 0xff;

		filter->cache = 0;
		filter->status = 0;

		if (((b1 >= 0x81 && b1 <= 0x84) || (b1 >= 0x90 && b1 <= 0xe3)) && b2 >= 0x30 && b2 <= 0x39 && c >= 0x81 && c <= 0xfe) {
			filter->cache = (b1 << 16) | (b2 << 8) | c;
			filter->status = 3;
		} else {
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

	case 3: { /* qbcs fourth byte */
		int b1 = (filter->cache >> 16) & 0xff;
		int b2 = (filter->cache >> 8) & 0xff;
		int b3 = filter->cache & 0xff;
		int w;

		filter->cache = 0;
		filter->status = 0;

		if (!(((b1 >= 0x81 && b1 <= 0x84) || (b1 >= 0x90 && b1 <= 0xe3)) && b2 >= 0x30 && b2 <= 0x39 && b3 >= 0x81 && b3 <= 0xfe && c >= 0x30 && c <= 0x39)) {
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
			break;
		}

		if (b1 >= 0x90 && b1 <= 0xe3) {
			/* Linear index above the BMP; reject anything past U+10FFFF */
			w = ((((b1 - 0x90)*10 + (b2 - 0x30))*126 + (b3 - 0x81)))*10 + (c - 0x30) + 0x10000;
			if (w > 0x10FFFF) {
				CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
				return 0;
			}
		} else { /* Unicode BMP */
			w = (((b1 - 0x81)*10 + (b2 - 0x30))*126 + (b3 - 0x81))*10 + (c - 0x30);
			if (w < 0 || w > 39419) {
				CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
				return 0;
			}
			/* Shift the linear index by the offset of the range it falls in */
			w += mbfl_gb_uni_ofst[mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max)];
		}

		CK(filter->output_function(w, filter->data));
		break;
	}

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
10804
/* Flush handler of the GB18030 -> wchar filter.
 *
 * Any non-zero status at end of input means a multi-byte character was cut
 * short; it is reported as one MBFL_BAD_INPUT marker and the status is reset.
 * The filter's flush_function callback, when set, is then called with
 * filter->data. Returns 0 on the normal path. */
static int mbfl_filt_conv_gb18030_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* Input ended part-way through a 2-byte or 4-byte character */
		filter->status = 0;
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
10819
/* Byte-at-a-time wchar -> GB18030 filter.
 *
 * Resolution order for code point `c`:
 *   1. the CP936 reverse tables (with a few hard-coded exceptions),
 *   2. the table of code points whose GB18030 mapping differs from CP936,
 *   3. the private-use area U+E000-U+E864, which overrides steps 1 and 2,
 *   4. a computed 4-byte code when nothing above produced a result.
 * The result is written as 1, 2 or 4 bytes. A code point without any mapping
 * is passed to mbfl_filt_conv_illegal_output. Returns 0 on the normal path. */
static int mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
{
	int code = 0;   /* 1- or 2-byte code, or the last three bytes of a 4-byte code */
	int lead4 = 0;  /* first byte of a 4-byte code; stays 0 for shorter codes */

	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		if (c == 0x01f9) {
			code = 0xa8bf;
		} else {
			code = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
		}
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		if (c == 0x20ac) { /* euro-sign */
			code = 0xa2e3;
		} else {
			code = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		code = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
	} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
		code = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
	} else if (c >= ucs_ci_cp936_table_min && c < ucs_ci_cp936_table_max) {
		/* U+F900-FA2F CJK Compatibility Ideographs */
		switch (c) {
		case 0xf92c: code = 0xfd9c; break;
		case 0xf979: code = 0xfd9d; break;
		case 0xf995: code = 0xfd9e; break;
		case 0xf9e7: code = 0xfd9f; break;
		case 0xf9f1: code = 0xfda0; break;
		default:
			if (c >= 0xfa0c && c <= 0xfa29) {
				code = ucs_ci_s_cp936_table[c - 0xfa0c];
			}
			break;
		}
	} else if (c >= ucs_cf_cp936_table_min && c < ucs_cf_cp936_table_max) {
		/* FE30h CJK Compatibility Forms */
		code = ucs_cf_cp936_table[c - ucs_cf_cp936_table_min];
	} else if (c >= ucs_sfv_cp936_table_min && c < ucs_sfv_cp936_table_max) {
		/* U+FE50-FE6F Small Form Variants */
		code = ucs_sfv_cp936_table[c - ucs_sfv_cp936_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		/* U+FF00-FFFF HW/FW Forms */
		if (c == 0xff04) {
			code = 0xa1e7;
		} else if (c == 0xff5e) {
			code = 0xa1ab;
		} else if (c >= 0xff01 && c <= 0xff5d) {
			code = c - 0xff01 + 0xa3a1;
		} else if (c >= 0xffe0 && c <= 0xffe5) {
			code = ucs_hff_s_cp936_table[c-0xffe0];
		}
	}

	/* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
	 * do a binary search in a table of differing codepoints to see if we have one */
	if (code <= 0 && c >= mbfl_gb18030_c_tbl_key[0] && c <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
		int hit = mbfl_bisec_srch2(c, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
		if (hit >= 0) {
			code = mbfl_gb18030_c_tbl_val[hit];
		}
	}

	if (c >= 0xe000 && c <= 0xe864) { /* PUA */
		if (c < 0xe4c6) {
			/* U+E000..U+E4C5: rows of 94 cells, lead bytes 0xAA-0xAF then 0xF8 upwards */
			int off = c - 0xe000;
			int row = off / 94;
			code = ((row < 0x06 ? row + 0xaa : row + 0xf2) << 8) | ((off % 94) + 0xa1);
		} else if (c < 0xe766) {
			/* U+E4C6..U+E765: rows of 96 cells, trail bytes 0x40-0xA0 skipping 0x7F */
			int off = c - 0xe4c6;
			int col = off % 96;
			code = (((off / 96) + 0xa1) << 8) | (col + (col >= 0x3f ? 0x41 : 0x40));
		} else {
			/* U+E766..U+E864: binary search over the [first, last, code] range table */
			int lo = 0, hi = mbfl_gb18030_pua_tbl_max;
			while (lo < hi) {
				int mid = (lo + hi) >> 1;
				if (c < mbfl_gb18030_pua_tbl[mid][0]) {
					hi = mid;
				} else if (c > mbfl_gb18030_pua_tbl[mid][1]) {
					lo = mid + 1;
				} else {
					code = c - mbfl_gb18030_pua_tbl[mid][0] + mbfl_gb18030_pua_tbl[mid][2];
					break;
				}
			}
		}
	}

	/* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
	int linear = 0;  /* linear index to be split into the last three bytes */
	int first4 = 0;  /* base of the first byte: 0x81 for BMP, 0x90 above it; 0 = no 4-byte code */

	if (code <= 0 && c >= 0x0080 && c <= 0xffff) {
		/* BMP */
		int range = mbfl_bisec_srch(c, mbfl_uni2gb_tbl, mbfl_gb_uni_max);
		if (range >= 0) {
			linear = c - mbfl_gb_uni_ofst[range];
			first4 = 0x81;
		} else {
			/* No range contains c; the negative result marks it as unmapped */
			code = range;
		}
	} else if (c >= 0x10000 && c <= 0x10ffff) {
		/* Code set 3: Unicode U+10000..U+10FFFF */
		linear = c - 0x10000;
		first4 = 0x90;
	}

	if (first4) {
		/* Mixed-radix split: digit (10), byte (126), digit (10), then the lead byte */
		code = (linear % 10) + 0x30;
		linear /= 10;
		code |= ((linear % 126) + 0x81) << 8;
		linear /= 126;
		code |= ((linear % 10) + 0x30) << 16;
		linear /= 10;
		lead4 = linear + first4;
	}

	/* Zero means "not found", except that U+0000 itself encodes as byte 0 */
	if (c == 0) {
		code = 0;
	} else if (code == 0) {
		code = -1;
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (code <= 0x80) { /* latin */
		CK(filter->output_function(code, filter->data));
	} else if (lead4 > 0) { /* qbcs */
		CK(filter->output_function(lead4 & 0xff, filter->data));
		CK(filter->output_function((code >> 16) & 0xff, filter->data));
		CK(filter->output_function((code >> 8) & 0xff, filter->data));
		CK(filter->output_function(code & 0xff, filter->data));
	} else { /* dbcs */
		CK(filter->output_function((code >> 8) & 0xff, filter->data));
		CK(filter->output_function(code & 0xff, filter->data));
	}

	return 0;
}
10965
/* Private-use code points for the 2-byte codes 0xFE50-0xFEA0 (81 entries).
 * The index is the 2-byte code minus 0xFE50; entries not listed are zero,
 * which the decoder treats as "no entry here". */
static const unsigned short gb18030_pua_tbl3[] = {
	[0xFE51 - 0xFE50] = 0xE816,
	[0xFE52 - 0xFE50] = 0xE817,
	[0xFE53 - 0xFE50] = 0xE818,
	[0xFE59 - 0xFE50] = 0xE81E,
	[0xFE61 - 0xFE50] = 0xE826,
	[0xFE66 - 0xFE50] = 0xE82B,
	[0xFE67 - 0xFE50] = 0xE82C,
	[0xFE6C - 0xFE50] = 0xE831,
	[0xFE6D - 0xFE50] = 0xE832,
	[0xFE76 - 0xFE50] = 0xE83B,
	[0xFE7E - 0xFE50] = 0xE843,
	[0xFE90 - 0xFE50] = 0xE854,
	[0xFE91 - 0xFE50] = 0xE855,
	[0xFEA0 - 0xFE50] = 0xE864
};
10981
/* Decode GB18030 text into Unicode codepoints.
 *
 * Reads bytes starting at `*in` (`*in_len` bytes are available) and writes at most
 * `bufsize` codepoints to `buf`. On return, `*in` and `*in_len` describe the input
 * which has not been consumed yet; the return value is the number of codepoints
 * written. Invalid or truncated sequences are reported in-band as MBFL_BAD_INPUT.
 * `state` is not touched by this function. */
static size_t mb_gb18030_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c < 0x80) {
			/* Single-byte character, passed through unchanged */
			*out++ = c;
		} else if (c == 0x80 || c == 0xFF) {
			/* These two byte values are never accepted as a lead byte */
			*out++ = MBFL_BAD_INPUT;
		} else {
			if (p == e) {
				/* Lead byte is the last byte of the input: truncated character */
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;

			if (((c >= 0x81 && c <= 0x84) || (c >= 0x90 && c <= 0xE3)) && c2 >= 0x30 && c2 <= 0x39) {
				/* 4-byte form: the 2nd and 4th bytes are 0x30-0x39, the 3rd is 0x81-0xFE */
				if (p >= e) {
					*out++ = MBFL_BAD_INPUT;
					break;
				}
				unsigned char c3 = *p++;

				/* Note that c3 has already been consumed even if it turns out to be
				 * invalid, or if the 4th byte is missing */
				if (c3 >= 0x81 && c3 <= 0xFE && p < e) {
					unsigned char c4 = *p++;

					if (c4 >= 0x30 && c4 <= 0x39) {
						if (c >= 0x90 && c <= 0xE3) {
							/* Lead bytes 0x90-0xE3: the 4 bytes form a mixed-radix (10, 126, 10)
							 * number which is a linear offset from U+10000 */
							unsigned int w = ((((c - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c4 - 0x30) + 0x10000;
							*out++ = (w > 0x10FFFF) ? MBFL_BAD_INPUT : w;
						} else {
							/* Unicode BMP */
							unsigned int w = (((c - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c4 - 0x30);
							if (w <= 39419) {
								/* Find the range of 4-byte indices containing `w`, then add that
								 * range's offset to get the codepoint. The search result is used
								 * unchecked as an index; presumably the bound above guarantees
								 * that a range is always found -- verify against the table */
								*out++ = w + mbfl_gb_uni_ofst[mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max)];
							} else {
								*out++ = MBFL_BAD_INPUT;
							}
						}
					} else {
						*out++ = MBFL_BAD_INPUT;
					}
				} else {
					*out++ = MBFL_BAD_INPUT;
				}
			} else if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && (c2 >= 0xA1 && c2 <= 0xFE)) {
				/* UDA part 1, 2: U+E000-U+E4C5 */
				/* 94 trail bytes per lead byte; lead bytes 0xF8-0xFE continue the
				 * numbering after the 6 lead bytes 0xAA-0xAF (0xF8 - 0xF2 == 6) */
				*out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
			} else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F) {
				/* UDA part 3: U+E4C6-U+E765 */
				/* 96 trail bytes per lead byte (0x40-0xA0, skipping 0x7F) */
				*out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
			} else if (c2 >= 0x40 && c2 != 0x7F && c2 != 0xFF) {
				/* Other 2-byte characters: build a linear table index with 192 slots
				 * per lead byte */
				unsigned int w = (c - 0x81)*192 + c2 - 0x40;

				/* Three auxiliary tables cover specific index ranges and are consulted
				 * before the main table */
				if (w >= 0x192B) {
					if (w <= 0x1EBE) {
						/* The indices excluded here are not taken from the auxiliary table;
						 * they fall through to the main table lookup below */
						if (w != 0x1963 && w != 0x1DBF && (w < 0x1E49 || w > 0x1E55) && w != 0x1E7F) {
							*out++ = cp936_pua_tbl1[w - 0x192B];
							continue;
						}
					} else if (w >= 0x413A) {
						if (w <= 0x413E) {
							*out++ = cp936_pua_tbl2[w - 0x413A];
							continue;
						} else if (w >= 0x5DD0 && w <= 0x5E20) {
							/* NOTE: this inner `c` shadows the lead byte only within this block.
							 * A zero table entry means "no entry here"; fall through */
							unsigned int c = gb18030_pua_tbl3[w - 0x5DD0];
							if (c) {
								*out++ = c;
								continue;
							}
						}
					}
				}

				/* Only these lead/trail byte combinations are looked up in the main table;
				 * `c` here is the lead byte again */
				if ((c >= 0x81 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7 && c2 >= 0xA1) || (c >= 0xAA && c <= 0xFE && c2 <= 0xA0)) {
					ZEND_ASSERT(w < cp936_ucs_table_size);
					*out++ = cp936_ucs_table[w];
				} else {
					*out++ = MBFL_BAD_INPUT;
				}
			} else {
				/* Trail byte is not valid for any 2-byte or 4-byte form */
				*out++ = MBFL_BAD_INPUT;
			}
		}
	}

	/* Report how much input remains for the next call */
	*in_len = e - p;
	*in = p;
	return out - buf;
}
11075
/* Encode `len` Unicode codepoints from `in` as GB18030 and append them to `buf`.
 *
 * Each codepoint is first looked up in the CP936 tables (with a few GB18030-specific
 * overrides), then in a table of mappings which differ between GB18030 and CP936,
 * and finally falls back to the 4-byte form. Codepoints with no mapping go through
 * MB_CONVERT_ERROR. `end` is not used by this function. */
static void mb_wchar_to_gb18030(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Initial request is one byte per input codepoint; the 2-byte and 4-byte output
	 * paths below request more before they write */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		/* `s` holds the output bytes packed big-endian into one integer; 0 means
		 * "no mapping found yet" */
		unsigned int s = 0;

		if (w == 0) {
			/* NUL must be handled up front, since s == 0 is the "not found" marker */
			out = mb_convert_buf_add(out, 0);
			continue;
		} else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			if (w == 0x1F9) {
				/* Override of the CP936 table entry for this codepoint */
				s = 0xA8BF;
			} else {
				s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
			}
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			if (w == 0x20AC) { /* Euro sign */
				s = 0xA2E3;
			} else {
				s = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
			s = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
		} else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max) {
			/* U+F900-U+FA2F CJK Compatibility Ideographs */
			/* Only the codepoints listed here get a 2-byte code; the rest of this
			 * range leaves s == 0 and is handled further below */
			if (w == 0xF92C) {
				s = 0xFD9C;
			} else if (w == 0xF979) {
				s = 0xFD9D;
			} else if (w == 0xF995) {
				s = 0xFD9E;
			} else if (w == 0xF9E7) {
				s = 0xFD9F;
			} else if (w == 0xF9F1) {
				s = 0xFDA0;
			} else if (w >= 0xFA0C && w <= 0xFA29) {
				s = ucs_ci_s_cp936_table[w - 0xFA0C];
			}
		} else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max) {
			/* CJK Compatibility Forms */
			s = ucs_cf_cp936_table[w - ucs_cf_cp936_table_min];
		} else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max) {
			/* U+FE50-U+FE6F Small Form Variants */
			s = ucs_sfv_cp936_table[w - ucs_sfv_cp936_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			/* U+FF00-U+FFFF HW/FW Forms */
			if (w == 0xFF04) {
				s = 0xA1E7;
			} else if (w == 0xFF5E) {
				s = 0xA1AB;
			} else if (w >= 0xFF01 && w <= 0xFF5D) {
				/* Contiguous run: U+FF01 corresponds to 0xA3A1 */
				s = w - 0xFF01 + 0xA3A1;
			} else if (w >= 0xFFE0 && w <= 0xFFE5) {
				s = ucs_hff_s_cp936_table[w - 0xFFE0];
			}
		} else if (w >= 0xE000 && w <= 0xE864) {
			/* PUA */
			if (w < 0xE766) {
				if (w < 0xE4C6) {
					/* U+E000-U+E4C5: 94 trail bytes (0xA1-0xFE) per lead byte; the first
					 * 6 lead bytes start at 0xAA, the remainder continue from 0xF8 */
					unsigned int c1 = w - 0xE000;
					s = (c1 % 94) + 0xA1;
					c1 /= 94;
					s |= (c1 + (c1 < 0x06 ? 0xAA : 0xF2)) << 8;
				} else {
					/* U+E4C6-U+E765: 96 trail bytes per lead byte, starting at 0x40 and
					 * skipping 0x7F */
					unsigned int c1 = w - 0xE4C6;
					s = ((c1 / 96) + 0xA1) << 8;
					c1 %= 96;
					s |= c1 + (c1 >= 0x3F ? 0x41 : 0x40);
				}
			} else {
				/* U+E766-U+E864 */
				/* Binary search over rows of {first codepoint, last codepoint, first code};
				 * if no row contains `w`, s stays 0 */
				unsigned int k1 = 0, k2 = mbfl_gb18030_pua_tbl_max;
				while (k1 < k2) {
					unsigned int k = (k1 + k2) >> 1;
					if (w < mbfl_gb18030_pua_tbl[k][0]) {
						k2 = k;
					} else if (w > mbfl_gb18030_pua_tbl[k][1]) {
						k1 = k + 1;
					} else {
						s = w - mbfl_gb18030_pua_tbl[k][0] + mbfl_gb18030_pua_tbl[k][2];
						break;
					}
				}
			}
		}

		/* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
		 * do a binary search in a table of differing codepoints to see if we have one */
		if (!s && w >= mbfl_gb18030_c_tbl_key[0] && w <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
			int i = mbfl_bisec_srch2(w, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
			if (i >= 0) {
				s = mbfl_gb18030_c_tbl_val[i];
			}
		}

		/* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
		if (!s && w >= 0x80 && w <= 0xFFFF) {
			/* BMP */
			int i = mbfl_bisec_srch(w, mbfl_uni2gb_tbl, mbfl_gb_uni_max);
			if (i >= 0) {
				/* Subtract the range's offset to get a linear index, then split it into
				 * mixed-radix digits (10, 126, 10) from the last byte to the first */
				unsigned int c1 = w - mbfl_gb_uni_ofst[i];
				s = (c1 % 10) + 0x30;
				c1 /= 10;
				s |= ((c1 % 126) + 0x81) << 8;
				c1 /= 126;
				s |= ((c1 % 10) + 0x30) << 16;
				c1 /= 10;
				s |= (c1 + 0x81) << 24;
			}
		} else if (w >= 0x10000 && w <= 0x10FFFF) {
			/* Code set 3: Unicode U+10000-U+10FFFF */
			/* Same digit split as above, but the lead byte starts at 0x90. This branch
			 * is the `else` of the test above and does not itself check `s` */
			unsigned int c1 = w - 0x10000;
			s = (c1 % 10) + 0x30;
			c1 /= 10;
			s |= ((c1 % 126) + 0x81) << 8;
			c1 /= 126;
			s |= ((c1 % 10) + 0x30) << 16;
			c1 /= 10;
			s |= (c1 + 0x90) << 24;
		}

		if (!s) {
			/* No mapping at all; hand over to the error handler, then request room
			 * again for the codepoints still to be processed */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_gb18030);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s < 0x80) {
			out = mb_convert_buf_add(out, s);
		} else if (s > 0xFFFFFF) {
			/* Top byte is set, so this is a 4-byte code */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			out = mb_convert_buf_add4(out, (s >> 24) & 0xFF, (s >> 16) & 0xFF, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
11219
/* Byte-at-a-time CP936 -> wchar conversion filter.
 *
 * `c` is the next input byte. `filter->status` is 0 when a new character is expected
 * and 1 when a lead byte has been stored in `filter->cache` and the trail byte is
 * awaited. Decoded codepoints (or MBFL_BAD_INPUT) are passed to
 * `filter->output_function`. Returns 0 at the end of the function; the CK() wrapper
 * around each output call presumably returns early on failure -- see its definition. */
static int mbfl_filt_conv_cp936_wchar(int c, mbfl_convert_filter *filter)
{
	int k;
	/* w == -1 means that no codepoint has been emitted for this byte pair yet */
	int c1, c2, w = -1;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c == 0x80) { /* euro sign */
			CK((*filter->output_function)(0x20ac, filter->data));
		} else if (c < 0xff) { /* dbcs lead byte */
			filter->status = 1;
			filter->cache = c;
		} else { /* 0xff */
			CK((*filter->output_function)(0xf8f5, filter->data));
		}
		break;

	case 1: /* dbcs second byte */
		filter->status = 0;
		c1 = filter->cache;

		if (((c1 >= 0xaa && c1 <= 0xaf) || (c1 >= 0xf8 && c1 <= 0xfe)) &&
			(c >= 0xa1 && c <= 0xfe)) {
			/* UDA part1,2: U+E000-U+E4C5 */
			/* 94 trail bytes per lead byte; lead bytes 0xf8-0xfe continue the numbering
			 * after the 6 lead bytes 0xaa-0xaf */
			w = 94*(c1 >= 0xf8 ? c1 - 0xf2 : c1 - 0xaa) + (c - 0xa1) + 0xe000;
			CK((*filter->output_function)(w, filter->data));
		} else if (c1 >= 0xa1 && c1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) {
			/* UDA part3 : U+E4C6-U+E765*/
			/* 96 trail bytes per lead byte (0x40-0xa0, skipping 0x7f) */
			w = 96*(c1 - 0xa1) + c - (c >= 0x80 ? 0x41 : 0x40) + 0xe4c6;
			CK((*filter->output_function)(w, filter->data));
		}

		/* Both bytes combined into a single 16-bit code */
		c2 = (c1 << 8) | c;

		/* If nothing was emitted above and the code lies in one of three coarse
		 * ranges, scan the PUA table linearly. Rows are {first codepoint,
		 * last codepoint, first code}, so each row covers a run of codes of the
		 * same length as its codepoint range */
		if (w <= 0 &&
			((c2 >= 0xa2ab && c2 <= 0xa9f0 + (0xe80f-0xe801)) ||
			 (c2 >= 0xd7fa && c2 <= 0xd7fa + (0xe814-0xe810)) ||
			 (c2 >= 0xfe50 && c2 <= 0xfe80 + (0xe864-0xe844)))) {
			for (k = 0; k < mbfl_cp936_pua_tbl_max; k++) {
				if (c2 >= mbfl_cp936_pua_tbl[k][2] &&
					c2 <= mbfl_cp936_pua_tbl[k][2] +
					mbfl_cp936_pua_tbl[k][1] - mbfl_cp936_pua_tbl[k][0]) {
					w = c2 - mbfl_cp936_pua_tbl[k][2] + mbfl_cp936_pua_tbl[k][0];
					CK((*filter->output_function)(w, filter->data));
					break;
				}
			}
		}

		/* Still nothing emitted: use the main table, or report invalid input */
		if (w <= 0) {
			if (c1 < 0xff && c1 > 0x80 && c >= 0x40 && c < 0xff && c != 0x7f) {
				/* Linear index with 192 slots per lead byte */
				w = (c1 - 0x81)*192 + c - 0x40;
				ZEND_ASSERT(w < cp936_ucs_table_size);
				CK((*filter->output_function)(cp936_ucs_table[w], filter->data));
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
11287
/* Finish a CP936 -> wchar conversion.
 *
 * If a lead byte is still pending (non-zero status), the input ended in the middle
 * of a 2-byte character: the state is reset and MBFL_BAD_INPUT is emitted. After
 * that the downstream flush function, if one is set, is invoked. Returns 0. */
static int mbfl_filt_conv_cp936_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* Dangling lead byte with no trail byte */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
11302
/* Codepoint-at-a-time wchar -> CP936 conversion filter.
 *
 * `c` is the codepoint to encode. The CP936 code is assembled in `code` (lead byte
 * in bits 8-15, trail byte in bits 0-7, or a single byte value) and its bytes are
 * passed to `filter->output_function`. A codepoint with no mapping is handed to
 * mbfl_filt_conv_illegal_output(). Returns 0 at the end of the function. */
static int mbfl_filt_conv_wchar_cp936(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		/* U+0000 - U+0451 */
		code = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		/* U+2000 - U+26FF, with three entries overriding the table */
		switch (c) {
		case 0x203e:
			code = 0xa3fe;
			break;
		case 0x2218:
			code = 0xa1e3;
			break;
		case 0x223c:
			code = 0xa1ab;
			break;
		default:
			code = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
			break;
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		/* U+2F00 - U+33FF */
		code = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
	} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
		/* U+4D00-9FFF CJK Unified Ideographs (+ Extension A) */
		code = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
	} else if (c >= 0xe000 && c <= 0xe864) {
		/* Private Use Area */
		if (c < 0xe4c6) {
			/* U+E000..U+E4C5: 94 trail bytes per lead byte; the first six lead bytes
			 * start at 0xaa, later ones continue from 0xf8 */
			int offset = c - 0xe000;
			int row = offset / 94;
			int lead = (row < 0x06) ? row + 0xaa : row + 0xf2;
			code = (lead << 8) | ((offset % 94) + 0xa1);
		} else if (c < 0xe766) {
			/* U+E4C6..U+E765: 96 trail bytes per lead byte, skipping 0x7f */
			int offset = c - 0xe4c6;
			int col = offset % 96;
			code = (((offset / 96) + 0xa1) << 8) | (col + (col >= 0x3f ? 0x41 : 0x40));
		} else {
			/* U+E766..U+E864: binary search over rows of
			 * {first codepoint, last codepoint, first code} */
			int lo = 0;
			int hi = mbfl_cp936_pua_tbl_max;
			while (lo < hi) {
				int mid = (lo + hi) >> 1;
				if (c < mbfl_cp936_pua_tbl[mid][0]) {
					hi = mid;
				} else if (c > mbfl_cp936_pua_tbl[mid][1]) {
					lo = mid + 1;
				} else {
					code = c - mbfl_cp936_pua_tbl[mid][0] + mbfl_cp936_pua_tbl[mid][2];
					break;
				}
			}
		}
	} else if (c == 0xf8f5) {
		/* Encoded as the single byte 0xff */
		code = 0xff;
	} else if (c >= ucs_ci_cp936_table_min && c < ucs_ci_cp936_table_max) {
		/* U+F900-FA2F CJK Compatibility Ideographs */
		code = ucs_ci_cp936_table[c - ucs_ci_cp936_table_min];
	} else if (c >= ucs_cf_cp936_table_min && c < ucs_cf_cp936_table_max) {
		code = ucs_cf_cp936_table[c - ucs_cf_cp936_table_min];
	} else if (c >= ucs_sfv_cp936_table_min && c < ucs_sfv_cp936_table_max) {
		/* U+FE50-FE6F Small Form Variants */
		code = ucs_sfv_cp936_table[c - ucs_sfv_cp936_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		/* U+FF00-FFFF HW/FW Forms */
		switch (c) {
		case 0xff04:
			code = 0xa1e7;
			break;
		case 0xff5e:
			code = 0xa1ab;
			break;
		default:
			if (c >= 0xff01 && c <= 0xff5d) {
				code = c - 0xff01 + 0xa3a1;
			} else if (c >= 0xffe0 && c <= 0xffe5) {
				code = ucs_hff_s_cp936_table[c - 0xffe0];
			}
			break;
		}
	}

	/* A zero result is only legitimate for U+0000 itself; otherwise it means
	 * that no mapping was found */
	if (code <= 0) {
		code = (c == 0) ? 0 : -1;
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (code <= 0x80 || code == 0xff) {
		/* Single-byte output */
		CK((*filter->output_function)(code, filter->data));
	} else {
		/* Two-byte output: lead byte first */
		CK((*filter->output_function)((code >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(code & 0xff, filter->data));
	}

	return 0;
}
11397
mb_cp936_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)11398 static size_t mb_cp936_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
11399 {
11400 unsigned char *p = *in, *e = p + *in_len;
11401 uint32_t *out = buf, *limit = buf + bufsize;
11402
11403 while (p < e && out < limit) {
11404 unsigned char c = *p++;
11405
11406 if (c < 0x80) {
11407 *out++ = c;
11408 } else if (c == 0x80) {
11409 *out++ = 0x20AC; /* Euro sign */
11410 } else if (c < 0xFF) {
11411 if (p >= e) {
11412 *out++ = MBFL_BAD_INPUT;
11413 continue;
11414 }
11415
11416 unsigned char c2 = *p++;
11417 if (c2 < 0x40 || c2 == 0x7F || c2 == 0xFF) {
11418 *out++ = MBFL_BAD_INPUT;
11419 continue;
11420 }
11421
11422 if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && c2 >= 0xA1) {
11423 /* UDA part 1, 2: U+E000-U+E4C5 */
11424 *out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
11425 } else if (c >= 0xA1 && c <= 0xA7 && c2 < 0xA1) {
11426 /* UDA part 3: U+E4C6-U+E765*/
11427 *out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
11428 } else {
11429 unsigned int w = (c - 0x81)*192 + c2 - 0x40; /* Convert c, c2 into GB 2312 table lookup index */
11430
11431 /* For CP936 and GB18030, certain GB 2312 byte combinations are mapped to PUA codepoints,
11432 * whereas the same combinations aren't mapped to any codepoint for HZ and EUC-CN
11433 * To avoid duplicating the entire GB 2312 -> Unicode lookup table, we have three
11434 * auxiliary tables which are consulted instead for specific ranges of lookup indices */
11435 if (w >= 0x192B) {
11436 if (w <= 0x1EBE) {
11437 *out++ = cp936_pua_tbl1[w - 0x192B];
11438 continue;
11439 } else if (w >= 0x413A) {
11440 if (w <= 0x413E) {
11441 *out++ = cp936_pua_tbl2[w - 0x413A];
11442 continue;
11443 } else if (w >= 0x5DD0 && w <= 0x5E20) {
11444 *out++ = cp936_pua_tbl3[w - 0x5DD0];
11445 continue;
11446 }
11447 }
11448 }
11449
11450 ZEND_ASSERT(w < cp936_ucs_table_size);
11451 *out++ = cp936_ucs_table[w];
11452 }
11453 } else {
11454 *out++ = 0xF8F5;
11455 }
11456 }
11457
11458 *in_len = e - p;
11459 *in = p;
11460 return out - buf;
11461 }
11462
/* Encode `len` Unicode codepoints from `in` as CP936 and append them to `buf`.
 *
 * The CP936 code for each codepoint is assembled in `code` (lead byte in bits 8-15,
 * trail byte in bits 0-7, or a single byte value). Codepoints without a mapping are
 * passed to MB_CONVERT_ERROR. `end` is not used. */
static void mb_wchar_to_cp936(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* No output from the tables is longer than two bytes */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int code = 0;

		if (cp >= ucs_a1_cp936_table_min && cp < ucs_a1_cp936_table_max) {
			/* U+0000-U+0451 */
			code = ucs_a1_cp936_table[cp - ucs_a1_cp936_table_min];
		} else if (cp >= ucs_a2_cp936_table_min && cp < ucs_a2_cp936_table_max) {
			/* U+2000-U+26FF, with three entries overriding the table */
			switch (cp) {
			case 0x203E:
				code = 0xA3FE;
				break;
			case 0x2218:
				code = 0xA1E3;
				break;
			case 0x223C:
				code = 0xA1AB;
				break;
			default:
				code = ucs_a2_cp936_table[cp - ucs_a2_cp936_table_min];
				break;
			}
		} else if (cp >= ucs_a3_cp936_table_min && cp < ucs_a3_cp936_table_max) {
			/* U+2F00-U+33FF */
			code = ucs_a3_cp936_table[cp - ucs_a3_cp936_table_min];
		} else if (cp >= ucs_i_cp936_table_min && cp < ucs_i_cp936_table_max) {
			/* U+4D00-9FFF CJK Unified Ideographs (+ Extension A) */
			code = ucs_i_cp936_table[cp - ucs_i_cp936_table_min];
		} else if (cp >= 0xE000 && cp <= 0xE864) {
			/* Private Use Area */
			if (cp < 0xE4C6) {
				/* U+E000-U+E4C5: 94 trail bytes per lead byte; the first six lead
				 * bytes start at 0xAA, later ones continue from 0xF8 */
				unsigned int offset = cp - 0xE000;
				unsigned int row = offset / 94;
				unsigned int lead = (row < 0x6) ? row + 0xAA : row + 0xF2;
				code = (lead << 8) | ((offset % 94) + 0xA1);
			} else if (cp < 0xE766) {
				/* U+E4C6-U+E765: 96 trail bytes per lead byte, skipping 0x7F */
				unsigned int offset = cp - 0xE4C6;
				unsigned int col = offset % 96;
				code = (((offset / 96) + 0xA1) << 8) | (col + (col >= 0x3F ? 0x41 : 0x40));
			} else {
				/* U+E766-U+E864: binary search over rows of
				 * {first codepoint, last codepoint, first code} */
				unsigned int lo = 0;
				unsigned int hi = mbfl_cp936_pua_tbl_max;
				while (lo < hi) {
					unsigned int mid = (lo + hi) >> 1;
					if (cp < mbfl_cp936_pua_tbl[mid][0]) {
						hi = mid;
					} else if (cp > mbfl_cp936_pua_tbl[mid][1]) {
						lo = mid + 1;
					} else {
						code = cp - mbfl_cp936_pua_tbl[mid][0] + mbfl_cp936_pua_tbl[mid][2];
						break;
					}
				}
			}
		} else if (cp == 0xF8F5) {
			/* Encoded as the single byte 0xFF */
			code = 0xFF;
		} else if (cp >= ucs_ci_cp936_table_min && cp < ucs_ci_cp936_table_max) {
			/* U+F900-U+FA2F CJK Compatibility Ideographs */
			code = ucs_ci_cp936_table[cp - ucs_ci_cp936_table_min];
		} else if (cp >= ucs_cf_cp936_table_min && cp < ucs_cf_cp936_table_max) {
			code = ucs_cf_cp936_table[cp - ucs_cf_cp936_table_min];
		} else if (cp >= ucs_sfv_cp936_table_min && cp < ucs_sfv_cp936_table_max) {
			/* U+FE50-U+FE6F Small Form Variants */
			code = ucs_sfv_cp936_table[cp - ucs_sfv_cp936_table_min];
		} else if (cp >= ucs_hff_cp936_table_min && cp < ucs_hff_cp936_table_max) {
			/* U+FF00-U+FFFF HW/FW Forms */
			switch (cp) {
			case 0xFF04:
				code = 0xA1E7;
				break;
			case 0xFF5E:
				code = 0xA1AB;
				break;
			default:
				if (cp >= 0xFF01 && cp <= 0xFF5D) {
					code = cp - 0xFF01 + 0xA3A1;
				} else if (cp >= 0xFFE0 && cp <= 0xFFE5) {
					code = ucs_hff_s_cp936_table[cp - 0xFFE0];
				}
				break;
			}
		}

		if (code == 0 && cp != 0) {
			/* No mapping found; a zero code is only legitimate for U+0000 */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_cp936);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		} else if (code <= 0x80 || code == 0xFF) {
			/* Single-byte output (this also emits the NUL byte for U+0000) */
			out = mb_convert_buf_add(out, code);
		} else {
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
11562
/* Auxiliary lookup table used by mb_gb18030_2022_to_wchar for 2-byte table indices
 * 0x5DD0-0x5E20 (81 entries); it is indexed by `index - 0x5DD0`. The row markers
 * below give the 2-byte code of the first entry in the row.
 * A zero entry means that this table has no mapping for that index; the caller then
 * falls through to its generic lookup.
 * Eight entries hold U+9FB4-U+9FBB; the remaining non-zero entries are PUA codepoints. */
static const unsigned short gb18030_2022_pua_tbl3[] = {
/* 0xFE50 */
0x0000,0xE816,0xE817,0xE818,0x0000,0x0000,0x0000,0x0000,
0x0000,0x9FB4,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x9FB5,0x0000,0x0000,0x0000,0x0000,0x9FB6,0x9FB7,
0x0000,0x0000,0x0000,0x0000,0xE831,0x9FB8,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xE83B,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x9FB9,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x9FBA,0xE855,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
/* 0xFEA0 */
0x9FBB
};
11578
/* Decode GB18030-2022 text into Unicode codepoints.
 *
 * Reads bytes starting at `*in` (`*in_len` bytes are available) and writes at most
 * `bufsize` codepoints to `buf`. On return, `*in` and `*in_len` describe the input
 * which has not been consumed yet; the return value is the number of codepoints
 * written. Invalid or truncated sequences are reported in-band as MBFL_BAD_INPUT.
 * `state` is not touched by this function.
 *
 * The structure mirrors mb_gb18030_to_wchar; the differences are the fixed 4-byte
 * special cases in the BMP branch and the use of the gb18030_2022_* auxiliary tables. */
static size_t mb_gb18030_2022_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c < 0x80) {
			/* Single-byte character, passed through unchanged */
			*out++ = c;
		} else if (c == 0x80 || c == 0xFF) {
			/* These two byte values are never accepted as a lead byte */
			*out++ = MBFL_BAD_INPUT;
		} else {
			if (p == e) {
				/* Lead byte is the last byte of the input: truncated character */
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;

			if (((c >= 0x81 && c <= 0x84) || (c >= 0x90 && c <= 0xE3)) && c2 >= 0x30 && c2 <= 0x39) {
				/* 4-byte form: the 2nd and 4th bytes are 0x30-0x39, the 3rd is 0x81-0xFE */
				if (p >= e) {
					*out++ = MBFL_BAD_INPUT;
					break;
				}
				unsigned char c3 = *p++;

				/* Note that c3 has already been consumed even if it turns out to be
				 * invalid, or if the 4th byte is missing */
				if (c3 >= 0x81 && c3 <= 0xFE && p < e) {
					unsigned char c4 = *p++;

					if (c4 >= 0x30 && c4 <= 0x39) {
						if (c >= 0x90 && c <= 0xE3) {
							/* Lead bytes 0x90-0xE3: the 4 bytes form a mixed-radix (10, 126, 10)
							 * number which is a linear offset from U+10000 */
							unsigned int w = ((((c - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c4 - 0x30) + 0x10000;
							*out++ = (w > 0x10FFFF) ? MBFL_BAD_INPUT : w;
						} else {
							/* Unicode BMP */
							unsigned int w = (((c - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c4 - 0x30);
							/* A handful of 4-byte indices decode to fixed PUA codepoints; they
							 * are tested before the generic range table is consulted */
							if (w == 0x98A4) {
								*out++ = 0xE78D;
							} else if (w == 0x98A6) {
								*out++ = 0xE78E;
							} else if (w == 0x98A5) {
								*out++ = 0xE78F;
							} else if (w >= 0x98A7 && w <= 0x98AD) {
								*out++ = w + (0xE790 - 0x98A7);
							} else if (w == 0x1D21) {
								*out++ = 0xE7C7;
							} else if (w == 0x4A71) {
								*out++ = 0xE81E;
							} else if (w == 0x4A72) {
								*out++ = 0xE826;
							} else if (w >= 0x4A73 && w <= 0x4A74) {
								*out++ = w + (0xE82B - 0x4A73);
							} else if (w == 0x4A75) {
								*out++ = 0xE832;
							} else if (w == 0x4A76) {
								*out++ = 0xE843;
							} else if (w == 0x4A77) {
								*out++ = 0xE854;
							} else if (w == 0x4A78) {
								*out++ = 0xE864;
							} else if (w <= 0x99FB) {
								/* Find the range of 4-byte indices containing `w`, then add that
								 * range's offset to get the codepoint. The search result is used
								 * unchecked as an index; presumably the bound above guarantees
								 * that a range is always found -- verify against the table */
								*out++ = w + mbfl_gb_uni_ofst[mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max)];
							} else {
								*out++ = MBFL_BAD_INPUT;
							}
						}
					} else {
						*out++ = MBFL_BAD_INPUT;
					}
				} else {
					*out++ = MBFL_BAD_INPUT;
				}
			} else if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && (c2 >= 0xA1 && c2 <= 0xFE)) {
				/* UDA part 1, 2: U+E000-U+E4C5 */
				/* 94 trail bytes per lead byte; lead bytes 0xF8-0xFE continue the
				 * numbering after the 6 lead bytes 0xAA-0xAF (0xF8 - 0xF2 == 6) */
				*out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
			} else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F) {
				/* UDA part 3: U+E4C6-U+E765 */
				/* 96 trail bytes per lead byte (0x40-0xA0, skipping 0x7F) */
				*out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
			} else if (c2 >= 0x40 && c2 != 0x7F && c2 != 0xFF) {
				/* Other 2-byte characters: build a linear table index with 192 slots
				 * per lead byte */
				unsigned int w = (c - 0x81)*192 + c2 - 0x40;

				/* Three auxiliary tables cover specific index ranges and are consulted
				 * before the main table */
				if (w >= 0x192B) {
					if (w <= 0x1EBE) {
						/* The indices excluded here are not taken from the auxiliary table;
						 * they fall through to the main table lookup below */
						if (w != 0x1963 && w != 0x1DBF && (w < 0x1E49 || w > 0x1E55) && w != 0x1E7F) {
							*out++ = gb18030_2022_pua_tbl1[w - 0x192B];
							continue;
						}
					} else if (w >= 0x413A) {
						if (w <= 0x413E) {
							*out++ = cp936_pua_tbl2[w - 0x413A];
							continue;
						} else if (w >= 0x5DD0 && w <= 0x5E20) {
							/* NOTE: this inner `c` shadows the lead byte only within this block.
							 * A zero table entry means "no entry here"; fall through */
							unsigned int c = gb18030_2022_pua_tbl3[w - 0x5DD0];
							if (c) {
								*out++ = c;
								continue;
							}
						}
					}
				}

				/* Only these lead/trail byte combinations are looked up in the main table;
				 * `c` here is the lead byte again */
				if ((c >= 0x81 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7 && c2 >= 0xA1) || (c >= 0xAA && c <= 0xFE && c2 <= 0xA0)) {
					ZEND_ASSERT(w < cp936_ucs_table_size);
					*out++ = cp936_ucs_table[w];
				} else {
					*out++ = MBFL_BAD_INPUT;
				}
			} else {
				/* Trail byte is not valid for any 2-byte or 4-byte form */
				*out++ = MBFL_BAD_INPUT;
			}
		}
	}

	/* Report how much input remains for the next call */
	*in_len = e - p;
	*in = p;
	return out - buf;
}
11696
/* Encode `len` Unicode codepoints from `in` as GB18030-2022 and append them to `buf`.
 *
 * Each codepoint is first looked up in the CP936 tables (with GB18030-2022 specific
 * overrides and additions), then in a table of mappings which differ between
 * GB18030 and CP936, and finally falls back to the 4-byte form. Codepoints with no
 * mapping go through MB_CONVERT_ERROR, which is given this same function so that
 * any error marker is encoded with the GB18030-2022 mapping as well.
 * `end` is not used by this function. */
static void mb_wchar_to_gb18030_2022(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Initial request is one byte per input codepoint; the 2-byte and 4-byte output
	 * paths below request more before they write */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		/* `s` holds the output bytes packed big-endian into one integer; 0 means
		 * "no mapping found yet" */
		unsigned int s = 0;

		if (w == 0) {
			/* NUL must be handled up front, since s == 0 is the "not found" marker */
			out = mb_convert_buf_add(out, 0);
			continue;
		} else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			if (w == 0x1F9) {
				/* Override of the CP936 table entry for this codepoint */
				s = 0xA8BF;
			} else {
				s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
			}
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			if (w == 0x20AC) { /* Euro sign */
				s = 0xA2E3;
			} else {
				s = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
		} else if (w >= 0x9FB4 && w <= 0x9FBB) {
			/* Newly mapped in GB18030-2022 */
			/* This test must come before the ideograph table test below so that
			 * these eight codepoints receive their 2-byte codes */
			if (w == 0x9FB4) {
				s = 0xFE59;
			} else if (w == 0x9FB5) {
				s = 0xFE61;
			} else if (w == 0x9FB6) {
				s = 0xFE66;
			} else if (w == 0x9FB7) {
				s = 0xFE67;
			} else if (w == 0x9FB8) {
				s = 0xFE6D;
			} else if (w == 0x9FB9) {
				s = 0xFE7E;
			} else if (w == 0x9FBA) {
				s = 0xFE90;
			} else {
				s = 0xFEA0;
			}
		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
			s = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
		} else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max) {
			/* U+F900-U+FA2F CJK Compatibility Ideographs */
			/* Only the codepoints listed here get a 2-byte code; the rest of this
			 * range leaves s == 0 and is handled further below */
			if (w == 0xF92C) {
				s = 0xFD9C;
			} else if (w == 0xF979) {
				s = 0xFD9D;
			} else if (w == 0xF995) {
				s = 0xFD9E;
			} else if (w == 0xF9E7) {
				s = 0xFD9F;
			} else if (w == 0xF9F1) {
				s = 0xFDA0;
			} else if (w >= 0xFA0C && w <= 0xFA29) {
				s = ucs_ci_s_cp936_table[w - 0xFA0C];
			}
		} else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max) {
			/* CJK Compatibility Forms */
			s = ucs_cf_cp936_table[w - ucs_cf_cp936_table_min];
		} else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max) {
			/* U+FE50-U+FE6F Small Form Variants */
			s = ucs_sfv_cp936_table[w - ucs_sfv_cp936_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			/* U+FF00-U+FFFF HW/FW Forms */
			if (w == 0xFF04) {
				s = 0xA1E7;
			} else if (w == 0xFF5E) {
				s = 0xA1AB;
			} else if (w >= 0xFF01 && w <= 0xFF5D) {
				/* Contiguous run: U+FF01 corresponds to 0xA3A1 */
				s = w - 0xFF01 + 0xA3A1;
			} else if (w >= 0xFFE0 && w <= 0xFFE5) {
				s = ucs_hff_s_cp936_table[w - 0xFFE0];
			}
		} else if (w >= 0xE000 && w <= 0xE864) {
			/* PUA */
			if (w < 0xE766) {
				if (w < 0xE4C6) {
					/* U+E000-U+E4C5: 94 trail bytes (0xA1-0xFE) per lead byte; the first
					 * 6 lead bytes start at 0xAA, the remainder continue from 0xF8 */
					unsigned int c1 = w - 0xE000;
					s = (c1 % 94) + 0xA1;
					c1 /= 94;
					s |= (c1 + (c1 < 0x06 ? 0xAA : 0xF2)) << 8;
				} else {
					/* U+E4C6-U+E765: 96 trail bytes per lead byte, starting at 0x40 and
					 * skipping 0x7F */
					unsigned int c1 = w - 0xE4C6;
					s = ((c1 / 96) + 0xA1) << 8;
					c1 %= 96;
					s |= c1 + (c1 >= 0x3F ? 0x41 : 0x40);
				}
			} else {
				/* U+E766-U+E864 */
				/* Binary search over rows of {first codepoint, last codepoint, first code};
				 * if no row contains `w`, s stays 0 */
				unsigned int k1 = 0, k2 = mbfl_gb18030_2022_pua_tbl_max;
				while (k1 < k2) {
					unsigned int k = (k1 + k2) >> 1;
					if (w < mbfl_gb18030_2022_pua_tbl[k][0]) {
						k2 = k;
					} else if (w > mbfl_gb18030_2022_pua_tbl[k][1]) {
						k1 = k + 1;
					} else {
						s = w - mbfl_gb18030_2022_pua_tbl[k][0] + mbfl_gb18030_2022_pua_tbl[k][2];
						break;
					}
				}
			}
		} else if (w >= 0xFE10 && w <= 0xFE19) {
			/* Newly mapped codepoints in GB18030-2022 */
			if (w == 0xFE11) {
				s = 0xA6DB;
			} else if (w == 0xFE12) {
				s = 0xA6DA;
			} else if (w <= 0xFE16) {
				/* U+FE10 and U+FE13-U+FE16 (the two codepoints in between were handled above) */
				s = w - (0xFE10 - 0xA6D9);
			} else if (w <= 0xFE18) {
				s = w - (0xFE17 - 0xA6EC);
			} else {
				s = 0xA6F3;
			}
		} else if (w == 0x1E3F) {
			/* Newly mapped codepoint in GB18030-2022 */
			s = 0xA8BC;
		}

		/* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
		 * do a binary search in a table of differing codepoints to see if we have one */
		if (!s && w >= mbfl_gb18030_c_tbl_key[0] && w <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
			int i = mbfl_bisec_srch2(w, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
			if (i >= 0) {
				s = mbfl_gb18030_c_tbl_val[i];
			}
		}

		/* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
		if (!s && w >= 0x80 && w <= 0xFFFF) {
			/* BMP */
			int i = mbfl_bisec_srch(w, mbfl_uni2gb2022_tbl, mbfl_gb2022_uni_max);
			if (i >= 0) {
				/* Subtract the range's offset to get a linear index, then split it into
				 * mixed-radix digits (10, 126, 10) from the last byte to the first */
				unsigned int c1 = w - mbfl_gb2022_uni_ofst[i];
				s = (c1 % 10) + 0x30;
				c1 /= 10;
				s |= ((c1 % 126) + 0x81) << 8;
				c1 /= 126;
				s |= ((c1 % 10) + 0x30) << 16;
				c1 /= 10;
				s |= (c1 + 0x81) << 24;
			}
		} else if (w >= 0x10000 && w <= 0x10FFFF) {
			/* Code set 3: Unicode U+10000-U+10FFFF */
			/* Same digit split as above, but the lead byte starts at 0x90. This branch
			 * is the `else` of the test above and does not itself check `s` */
			unsigned int c1 = w - 0x10000;
			s = (c1 % 10) + 0x30;
			c1 /= 10;
			s |= ((c1 % 126) + 0x81) << 8;
			c1 /= 126;
			s |= ((c1 % 10) + 0x30) << 16;
			c1 /= 10;
			s |= (c1 + 0x90) << 24;
		}

		if (!s) {
			/* No mapping at all. The converter passed here must be this function (not
			 * the plain GB18030 one), matching how the sibling encoders pass themselves */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_gb18030_2022);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s < 0x80) {
			out = mb_convert_buf_add(out, s);
		} else if (s > 0xFFFFFF) {
			/* Top byte is set, so this is a 4-byte code */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			out = mb_convert_buf_add4(out, (s >> 24) & 0xFF, (s >> 16) & 0xFF, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
11875
11876 /* Step through a GB18030 string one character at a time. Find the last position at or
11877 * before `limit` which falls directly after the end of a (single or multi-byte) character */
step_through_gb18030_str(unsigned char * p,unsigned char * limit)11878 static zend_always_inline unsigned char* step_through_gb18030_str(unsigned char *p, unsigned char *limit)
11879 {
11880 while (p < limit) {
11881 unsigned char c = *p;
11882 if (c < 0x81 || c == 0xFF) {
11883 p++;
11884 } else {
11885 if (limit - p == 1) {
11886 break;
11887 }
11888 unsigned char c2 = p[1];
11889 /* For a 4-byte char, the 2nd byte will be 0x30-0x39 */
11890 unsigned int w = (c2 >= 0x30 && c2 <= 0x39) ? 4 : 2;
11891 if (limit - p < w) {
11892 break;
11893 }
11894 p += w;
11895 }
11896 }
11897 return p;
11898 }
11899
mb_cut_gb18030(unsigned char * str,size_t from,size_t len,unsigned char * end)11900 static zend_string* mb_cut_gb18030(unsigned char *str, size_t from, size_t len, unsigned char *end)
11901 {
11902 ZEND_ASSERT(str + from <= end);
11903 unsigned char *start = step_through_gb18030_str(str, str + from);
11904 if (str + from + len > end) {
11905 len = (end - str) - from;
11906 }
11907 if (start + len >= end) {
11908 return zend_string_init_fast((const char*)start, end - start);
11909 } else {
11910 unsigned char *_end = step_through_gb18030_str(start, start + len);
11911 return zend_string_init_fast((const char*)start, _end - start);
11912 }
11913 }
11914
/* Alternative names accepted for GB18030 (NULL-terminated list) */
static const char *mbfl_encoding_gb18030_aliases[] = {"gb-18030", "gb-18030-2000", NULL};

/* Filter-based conversion table for GB18030 -> wchar: source and destination
 * encoding IDs, followed by the functions used by the byte-at-a-time filter.
 * The meaning of each position is fixed by struct mbfl_convert_vtbl, which is
 * declared outside this section. */
static const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
mbfl_no_encoding_gb18030,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_gb18030_wchar,
mbfl_filt_conv_gb18030_wchar_flush,
NULL,
};

/* Filter-based conversion table for wchar -> GB18030; uses the common flush
 * function rather than an encoding-specific one */
static const struct mbfl_convert_vtbl vtbl_wchar_gb18030 = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_gb18030,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_gb18030,
mbfl_filt_conv_common_flush,
NULL,
};

/* Encoding descriptor for GB18030. It ties together the names and aliases, the two
 * filter tables above, the buffer-based converters mb_gb18030_to_wchar /
 * mb_wchar_to_gb18030 and the string-cutting function mb_cut_gb18030.
 * Field order is defined by the mbfl_encoding declaration (not visible here). */
const mbfl_encoding mbfl_encoding_gb18030 = {
mbfl_no_encoding_gb18030,
"GB18030",
"GB18030",
mbfl_encoding_gb18030_aliases,
NULL,
MBFL_ENCTYPE_GL_UNSAFE,
&vtbl_gb18030_wchar,
&vtbl_wchar_gb18030,
mb_gb18030_to_wchar,
mb_wchar_to_gb18030,
NULL,
mb_cut_gb18030,
};
11951
/* Alternative names accepted for CP936 (NULL-terminated list) */
static const char *mbfl_encoding_cp936_aliases[] = {"CP-936", "GBK", NULL};

/* Filter-based conversion table for CP936 -> wchar: source and destination
 * encoding IDs, followed by the functions used by the byte-at-a-time filter.
 * The meaning of each position is fixed by struct mbfl_convert_vtbl, which is
 * declared outside this section. */
static const struct mbfl_convert_vtbl vtbl_cp936_wchar = {
mbfl_no_encoding_cp936,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_cp936_wchar,
mbfl_filt_conv_cp936_wchar_flush,
NULL,
};

/* Filter-based conversion table for wchar -> CP936; uses the common flush
 * function rather than an encoding-specific one */
static const struct mbfl_convert_vtbl vtbl_wchar_cp936 = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_cp936,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_cp936,
mbfl_filt_conv_common_flush,
NULL,
};

/* Encoding descriptor for CP936. It ties together the names and aliases, the
 * mblen_table_81_to_fe table, the two filter tables above and the buffer-based
 * converters mb_cp936_to_wchar / mb_wchar_to_cp936. Unlike the GB18030 descriptors,
 * the final slot (where those have mb_cut_gb18030) is NULL here.
 * Field order is defined by the mbfl_encoding declaration (not visible here). */
const mbfl_encoding mbfl_encoding_cp936 = {
mbfl_no_encoding_cp936,
"CP936",
"CP936",
mbfl_encoding_cp936_aliases,
mblen_table_81_to_fe,
MBFL_ENCTYPE_GL_UNSAFE,
&vtbl_cp936_wchar,
&vtbl_wchar_cp936,
mb_cp936_to_wchar,
mb_wchar_to_cp936,
NULL,
NULL,
};
11988
/* Public descriptor for the GB18030-2022 encoding. It has no aliases and no legacy
 * filter tables (both vtbl entries are NULL); only the buffer-based converters
 * mb_gb18030_2022_to_wchar / mb_wchar_to_gb18030_2022 are provided. The cut function is
 * shared with GB18030. */
const mbfl_encoding mbfl_encoding_gb18030_2022 = {
	mbfl_no_encoding_gb18030_2022,
	"GB18030-2022",
	"GB18030-2022",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	NULL,
	NULL,
	mb_gb18030_2022_to_wchar,
	mb_wchar_to_gb18030_2022,
	NULL,
	mb_cut_gb18030,
};
12003
12004 /*
12005 * BIG5/CP950
12006 */
12007
/* Mapping between Unicode codepoint ranges and CP950 double-byte code ranges.
 *
 * Each row is { first codepoint, last codepoint, first CP950 code, last CP950 code },
 * all bounds inclusive. Rows are ordered by ascending codepoint, which the encoders rely
 * on when they pick the first row whose last codepoint is >= the input.
 *
 * A row whose first CP950 code has trail byte 0x40 covers 157 trail bytes per lead byte
 * (63 in 0x40-0x7e plus 94 in 0xa1-0xfe); a row starting at trail byte 0xa1 covers only
 * the 94 high trail bytes of a single lead byte.
 *
 * The table is only ever read, so it is declared const. */
static const unsigned short cp950_pua_tbl[][4] = {
	{0xe000, 0xe310, 0xfa40, 0xfefe},
	{0xe311, 0xeeb7, 0x8e40, 0xa0fe},
	{0xeeb8, 0xf6b0, 0x8140, 0x8dfe},
	{0xf6b1, 0xf70e, 0xc6a1, 0xc6fe},
	{0xf70f, 0xf848, 0xc740, 0xc8fe},
};
12016
/* Tell whether the double-byte sequence (lead byte `c1`, trail byte `c`) falls inside one
 * of the CP950 ranges listed in cp950_pua_tbl. Returns 1 if so, 0 otherwise. */
static inline int is_in_cp950_pua(int c1, int c)
{
	int low_trail = (c >= 0x40 && c <= 0x7e);
	int high_trail = (c >= 0xa1 && c <= 0xfe);

	/* Lead byte 0xc6 is only covered for the high trail bytes */
	if (c1 == 0xc6) {
		return high_trail;
	}

	/* 0x81-0x8d and 0x8e-0xa0 are adjacent, so they are tested as one range */
	if ((c1 >= 0x81 && c1 <= 0xa0) || c1 == 0xc7 || c1 == 0xc8 || (c1 >= 0xfa && c1 <= 0xfe)) {
		return low_trail || high_trail;
	}

	return 0;
}
12026
/* Legacy byte-at-a-time decoder shared by BIG5 and CP950; the variant is chosen by
 * looking at filter->from->no_encoding.
 *
 * filter->status is 0 while a single byte or a lead byte is expected, and 1 while the
 * lead byte is held in filter->cache and the trail byte is expected. Each decoded
 * codepoint (or MBFL_BAD_INPUT for an invalid sequence) is handed to
 * filter->output_function. Returns 0 when the end of the function is reached; every
 * output call is wrapped in CK(), which is defined outside this section. */
static int mbfl_filt_conv_big5_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* latin */
			CK((*filter->output_function)(c, filter->data));
		} else {
			/* The set of valid lead bytes depends on the variant being decoded */
			int is_lead_byte;
			if (filter->from->no_encoding == mbfl_no_encoding_cp950) {
				is_lead_byte = (c > 0x80 && c <= 0xFE);
			} else {
				is_lead_byte = (c > 0xA0 && c <= 0xF9 && c != 0xC8);
			}

			if (is_lead_byte) {
				filter->status = 1;
				filter->cache = c;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;

	case 1: { /* dbcs second byte */
		int lead = filter->cache;
		int w;

		filter->status = 0;

		if (!((c >= 0x40 && c <= 0x7e) || (c >= 0xa1 && c <= 0xfe))) {
			/* Not a valid trail byte */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		/* Index into the BIG5 table: 157 trail bytes per lead byte */
		if (c <= 0x7e) {
			w = (lead - 0xa1)*157 + (c - 0x40);
		} else {
			w = (lead - 0xa1)*157 + (c - 0xa1) + 0x3f;
		}
		if (w >= 0 && w < big5_ucs_table_size) {
			w = big5_ucs_table[w];
		} else {
			w = 0;
		}

		if (filter->from->no_encoding == mbfl_no_encoding_cp950) {
			if (is_in_cp950_pua(lead, c)) {
				/* PUA for CP950 */
				int code = (lead << 8) | c;
				int k = 0;

				while (k < sizeof(cp950_pua_tbl) / sizeof(cp950_pua_tbl[0])
						&& !(code >= cp950_pua_tbl[k][2] && code <= cp950_pua_tbl[k][3])) {
					k++;
				}

				const unsigned short *row = cp950_pua_tbl[k];
				if ((row[2] & 0xff) == 0x40) {
					w = 157*(lead - (row[2] >> 8)) + c - (c >= 0xa1 ? 0x62 : 0x40) + row[0];
				} else {
					w = code - row[2] + row[0];
				}
			} else if (lead == 0xA1) {
				/* Codes which CP950 maps differently from the BIG5 table */
				switch (c) {
				case 0x45: w = 0x2027; break;
				case 0x4E: w = 0xFE51; break;
				case 0x5A: w = 0x2574; break;
				case 0xC2: w = 0x00AF; break;
				case 0xC3: w = 0xFFE3; break;
				case 0xC5: w = 0x02CD; break;
				case 0xE3: w = 0xFF5E; break;
				case 0xF2: w = 0x2295; break;
				case 0xF3: w = 0x2299; break;
				case 0xFE: w = 0xFF0F; break;
				default: break;
				}
			} else if (lead == 0xA2) {
				switch (c) {
				case 0x40: w = 0xFF3C; break;
				case 0x41: w = 0x2215; break;
				case 0x42: w = 0xFE68; break;
				case 0x46: w = 0xFFE0; break;
				case 0x47: w = 0xFFE1; break;
				case 0xCC: w = 0x5341; break;
				case 0xCE: w = 0x5345; break;
				default: break;
				}
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;
	}

	EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
12132
/* Flush handler for the legacy BIG5/CP950 decoder: reports a dangling lead byte as
 * MBFL_BAD_INPUT, then forwards the flush to the next stage if there is one. */
static int mbfl_filt_conv_big5_wchar_flush(mbfl_convert_filter *filter)
{
	int lead_byte_pending = (filter->status == 1);

	if (lead_byte_pending) {
		/* Input ended in the middle of a 2-byte character */
		filter->status = 0;
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return 0;
}
12147
/* Legacy codepoint-at-a-time encoder shared by BIG5 and CP950; the variant is chosen by
 * looking at filter->to->no_encoding.
 *
 * The codepoint `c` is first looked up in the BIG5 reverse tables. For CP950 the result
 * is then overridden: PUA codepoints are computed from cp950_pua_tbl, a number of
 * codepoints are treated as unencodable, and a few are given CP950-specific codes.
 * The resulting byte(s) go to filter->output_function; an unencodable codepoint is
 * passed to mbfl_filt_conv_illegal_output(). */
static int mbfl_filt_conv_wchar_big5(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_big5_table_min && c < ucs_a1_big5_table_max) {
		s = ucs_a1_big5_table[c - ucs_a1_big5_table_min];
	} else if (c >= ucs_a2_big5_table_min && c < ucs_a2_big5_table_max) {
		s = ucs_a2_big5_table[c - ucs_a2_big5_table_min];
	} else if (c >= ucs_a3_big5_table_min && c < ucs_a3_big5_table_max) {
		s = ucs_a3_big5_table[c - ucs_a3_big5_table_min];
	} else if (c >= ucs_i_big5_table_min && c < ucs_i_big5_table_max) {
		s = ucs_i_big5_table[c - ucs_i_big5_table_min];
	} else if (c >= ucs_r1_big5_table_min && c < ucs_r1_big5_table_max) {
		s = ucs_r1_big5_table[c - ucs_r1_big5_table_min];
	} else if (c >= ucs_r2_big5_table_min && c < ucs_r2_big5_table_max) {
		s = ucs_r2_big5_table[c - ucs_r2_big5_table_min];
	}

	if (filter->to->no_encoding == mbfl_no_encoding_cp950) {
		if (c >= 0xe000 && c <= 0xf848) {
			/* PUA for CP950: find the first row whose last codepoint is >= c */
			int k = 0;
			while (k < sizeof(cp950_pua_tbl) / sizeof(cp950_pua_tbl[0]) && c > cp950_pua_tbl[k][1]) {
				k++;
			}

			const unsigned short *row = cp950_pua_tbl[k];
			int offset = c - row[0];
			if ((row[2] & 0xff) == 0x40) {
				/* Row spans several lead bytes, 157 trail bytes each */
				int lead = (offset / 157) + (row[2] >> 8);
				int trail = offset % 157;
				s = lead << 8;
				s |= trail + (trail >= 0x3f ? 0x62 : 0x40);
			} else {
				s = offset + row[2];
			}
		} else if ((c >= 0x0414 && c <= 0x041C) || (c >= 0x0423 && c <= 0x044F) ||
				(c >= 0x2460 && c <= 0x247D) || (c >= 0x302A && c <= 0x30FF)) {
			/* Ranges which are not encodable in CP950 */
			s = 0;
		} else {
			switch (c) {
			/* Single codepoints which are not encodable in CP950 */
			case 0x00A2:
			case 0x00A3:
			case 0x0401:
			case 0x0451:
			case 0x2022:
			case 0x203E:
			case 0x223C:
			case 0x2609:
			case 0x2641:
			case 0x3005:
			case 0xFF64:
				s = 0;
				break;

			/* Codepoints with a CP950-specific code */
			case 0x00AF: s = 0xA1C2; break;
			case 0x02CD: s = 0xA1C5; break;
			case 0x2027: s = 0xA145; break;
			case 0x2215: s = 0xA241; break;
			case 0x2295: s = 0xA1F2; break;
			case 0x2299: s = 0xA1F3; break;
			case 0x2574: s = 0xA15A; break;
			case 0xFE51: s = 0xA14E; break;
			case 0xFE68: s = 0xA242; break;
			case 0xFF3C: s = 0xA240; break;
			case 0xFF5E: s = 0xA1E3; break;
			case 0xFFE0: s = 0xA246; break;
			case 0xFFE1: s = 0xA247; break;
			case 0xFFE3: s = 0xA1C3; break;
			case 0xFF0F: s = 0xA1FE; break;

			default:
				break;
			}
		}
	}

	if (s <= 0) {
		if (c != 0) {
			/* Nothing found, and it is not the NUL codepoint */
			CK(mbfl_filt_conv_illegal_output(c, filter));
			return 0;
		}
		s = 0;
	}

	if (s <= 0x80) {
		/* latin */
		CK((*filter->output_function)(s, filter->data));
	} else {
		CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(s & 0xff, filter->data));
	}

	return 0;
}
12265
mb_big5_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)12266 static size_t mb_big5_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
12267 {
12268 unsigned char *p = *in, *e = p + *in_len;
12269 uint32_t *out = buf, *limit = buf + bufsize;
12270
12271 e--; /* Stop the main loop 1 byte short of the end of the input */
12272
12273 while (p < e && out < limit) {
12274 unsigned char c = *p++;
12275
12276 if (c <= 0x7F) {
12277 *out++ = c;
12278 } else if (c > 0xA0 && c <= 0xF9) {
12279 /* We don't need to check p < e here; it's not possible that this pointer dereference
12280 * will be outside the input string, because of e-- above */
12281 unsigned char c2 = *p++;
12282
12283 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE)) {
12284 unsigned int w = (c - 0xA1)*157 + c2 - ((c2 <= 0x7E) ? 0x40 : 0xA1 - 0x3F);
12285 ZEND_ASSERT(w < big5_ucs_table_size);
12286 w = big5_ucs_table[w];
12287 if (!w) {
12288 if (c == 0xC8) {
12289 p--;
12290 }
12291 w = MBFL_BAD_INPUT;
12292 }
12293 *out++ = w;
12294 } else {
12295 *out++ = MBFL_BAD_INPUT;
12296 }
12297 } else {
12298 *out++ = MBFL_BAD_INPUT;
12299 }
12300 }
12301
12302 /* Finish up last byte of input string if there is one */
12303 if (p == e && out < limit) {
12304 unsigned char c = *p++;
12305 *out++ = (c <= 0x7F) ? c : MBFL_BAD_INPUT;
12306 }
12307
12308 *in_len = e - p + 1;
12309 *in = p;
12310 return out - buf;
12311 }
12312
/* Encode `len` codepoints from `in` as BIG5 and append them to the conversion buffer
 * `buf`. Codepoints without a BIG5 code are reported through MB_CONVERT_ERROR.
 * `end` is not used. */
static void mb_wchar_to_big5(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		/* Reverse lookup; each table covers one contiguous codepoint range */
		if (w >= ucs_a1_big5_table_min && w < ucs_a1_big5_table_max) {
			s = ucs_a1_big5_table[w - ucs_a1_big5_table_min];
		} else if (w >= ucs_a2_big5_table_min && w < ucs_a2_big5_table_max) {
			s = ucs_a2_big5_table[w - ucs_a2_big5_table_min];
		} else if (w >= ucs_a3_big5_table_min && w < ucs_a3_big5_table_max) {
			s = ucs_a3_big5_table[w - ucs_a3_big5_table_min];
		} else if (w >= ucs_i_big5_table_min && w < ucs_i_big5_table_max) {
			s = ucs_i_big5_table[w - ucs_i_big5_table_min];
		} else if (w >= ucs_r1_big5_table_min && w < ucs_r1_big5_table_max) {
			s = ucs_r1_big5_table[w - ucs_r1_big5_table_min];
		} else if (w >= ucs_r2_big5_table_min && w < ucs_r2_big5_table_max) {
			s = ucs_r2_big5_table[w - ucs_r2_big5_table_min];
		}

		if (s > 0x80) {
			/* Double-byte code: one byte more than was reserved for this codepoint */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else if (s) {
			out = mb_convert_buf_add(out, s);
		} else if (w == 0) {
			/* NUL has no table entry but is always encodable */
			out = mb_convert_buf_add(out, 0);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_big5);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
12354
mb_cp950_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)12355 static size_t mb_cp950_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
12356 {
12357 unsigned char *p = *in, *e = p + *in_len;
12358 uint32_t *out = buf, *limit = buf + bufsize;
12359
12360 while (p < e && out < limit) {
12361 unsigned char c = *p++;
12362
12363 if (c <= 0x7F) {
12364 *out++ = c;
12365 } else if (c > 0x80 && c <= 0xFE && p < e) {
12366 unsigned char c2 = *p++;
12367
12368 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE)) {
12369 unsigned int w = ((c - 0xA1)*157) + c2 - ((c2 <= 0x7E) ? 0x40 : 0xA1 - 0x3F);
12370 w = (w < big5_ucs_table_size) ? big5_ucs_table[w] : 0;
12371
12372 /* PUA for CP950 */
12373 if (is_in_cp950_pua(c, c2)) {
12374 unsigned int s = (c << 8) | c2;
12375
12376 int k;
12377 for (k = 0; k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); k++) {
12378 if (s >= cp950_pua_tbl[k][2] && s <= cp950_pua_tbl[k][3]) {
12379 break;
12380 }
12381 }
12382
12383 if ((cp950_pua_tbl[k][2] & 0xFF) == 0x40) {
12384 w = 157*(c - (cp950_pua_tbl[k][2] >> 8)) + c2 - (c2 >= 0xA1 ? 0x62 : 0x40) + cp950_pua_tbl[k][0];
12385 } else {
12386 w = s - cp950_pua_tbl[k][2] + cp950_pua_tbl[k][0];
12387 }
12388 } else if (c == 0xA1) {
12389 if (c2 == 0x45) {
12390 w = 0x2027;
12391 } else if (c2 == 0x4E) {
12392 w = 0xFE51;
12393 } else if (c2 == 0x5A) {
12394 w = 0x2574;
12395 } else if (c2 == 0xC2) {
12396 w = 0x00AF;
12397 } else if (c2 == 0xC3) {
12398 w = 0xFFE3;
12399 } else if (c2 == 0xC5) {
12400 w = 0x02CD;
12401 } else if (c2 == 0xE3) {
12402 w = 0xFF5E;
12403 } else if (c2 == 0xF2) {
12404 w = 0x2295;
12405 } else if (c2 == 0xF3) {
12406 w = 0x2299;
12407 } else if (c2 == 0xFE) {
12408 w = 0xFF0F;
12409 }
12410 } else if (c == 0xA2) {
12411 if (c2 == 0x40) {
12412 w = 0xFF3C;
12413 } else if (c2 == 0x41) {
12414 w = 0x2215;
12415 } else if (c2 == 0x42) {
12416 w = 0xFE68;
12417 } else if (c2 == 0x46) {
12418 w = 0xFFE0;
12419 } else if (c2 == 0x47) {
12420 w = 0xFFE1;
12421 } else if (c2 == 0xCC) {
12422 w = 0x5341;
12423 } else if (c2 == 0xCE) {
12424 w = 0x5345;
12425 }
12426 }
12427
12428 if (!w)
12429 w = MBFL_BAD_INPUT;
12430 *out++ = w;
12431 } else {
12432 *out++ = MBFL_BAD_INPUT;
12433 }
12434 } else {
12435 *out++ = MBFL_BAD_INPUT;
12436 }
12437 }
12438
12439 *in_len = e - p;
12440 *in = p;
12441 return out - buf;
12442 }
12443
/* Encode `len` codepoints from `in` as CP950 and append them to the conversion buffer
 * `buf`. `end` is not used.
 *
 * Each codepoint is first looked up in the BIG5 reverse tables. The result is then
 * overridden for CP950: codepoints U+E000..U+F848 are computed from cp950_pua_tbl, a set
 * of codepoints is treated as unencodable, and a few get CP950-specific codes.
 * Unencodable codepoints are reported through MB_CONVERT_ERROR, passing this function
 * itself as the converter so that the error output is encoded by CP950 rules rather than
 * by the plain BIG5 encoder. */
static void mb_wchar_to_cp950(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		/* Reverse lookup in the BIG5 tables */
		if (w >= ucs_a1_big5_table_min && w < ucs_a1_big5_table_max) {
			s = ucs_a1_big5_table[w - ucs_a1_big5_table_min];
		} else if (w >= ucs_a2_big5_table_min && w < ucs_a2_big5_table_max) {
			s = ucs_a2_big5_table[w - ucs_a2_big5_table_min];
		} else if (w >= ucs_a3_big5_table_min && w < ucs_a3_big5_table_max) {
			s = ucs_a3_big5_table[w - ucs_a3_big5_table_min];
		} else if (w >= ucs_i_big5_table_min && w < ucs_i_big5_table_max) {
			s = ucs_i_big5_table[w - ucs_i_big5_table_min];
		} else if (w >= ucs_r1_big5_table_min && w < ucs_r1_big5_table_max) {
			s = ucs_r1_big5_table[w - ucs_r1_big5_table_min];
		} else if (w >= ucs_r2_big5_table_min && w < ucs_r2_big5_table_max) {
			s = ucs_r2_big5_table[w - ucs_r2_big5_table_min];
		}

		if (w >= 0xE000 && w <= 0xF848) {
			/* PUA for CP950: pick the first row whose last codepoint is >= w.
			 * The range check above guarantees that a row is found. */
			int k;
			for (k = 0; k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); k++) {
				if (w <= cp950_pua_tbl[k][1]) {
					break;
				}
			}

			int c1 = w - cp950_pua_tbl[k][0];
			if ((cp950_pua_tbl[k][2] & 0xFF) == 0x40) {
				/* Row spans several lead bytes, 157 trail bytes each */
				int c2 = cp950_pua_tbl[k][2] >> 8;
				s = ((c1 / 157) + c2) << 8;
				c1 %= 157;
				s |= c1 + (c1 >= 0x3F ? 0x62 : 0x40);
			} else {
				s = c1 + cp950_pua_tbl[k][2];
			}
		} else if (w == 0xA2 || w == 0xA3 || w == 0x401 || (w >= 0x414 && w <= 0x41C) || (w >= 0x423 && w <= 0x44F) || w == 0x451 || w == 0x2022 || w == 0x203E || w == 0x223C || (w >= 0x2460 && w <= 0x247D) || w == 0x2609 || w == 0x2641 || w == 0x3005 || (w >= 0x302A && w <= 0x30FF) || w == 0xFF64) {
			/* Not encodable in CP950, whatever the BIG5 tables say */
			s = 0;
		} else if (w == 0xAF) {
			s = 0xA1C2;
		} else if (w == 0x2CD) {
			s = 0xA1C5;
		} else if (w == 0x2027) {
			s = 0xA145;
		} else if (w == 0x2215) {
			s = 0xA241;
		} else if (w == 0x2295) {
			s = 0xA1F2;
		} else if (w == 0x2299) {
			s = 0xA1F3;
		} else if (w == 0x2574) {
			s = 0xA15A;
		} else if (w == 0xFE51) {
			s = 0xA14E;
		} else if (w == 0xFE68) {
			s = 0xA242;
		} else if (w == 0xFF3C) {
			s = 0xA240;
		} else if (w == 0xFF5E) {
			s = 0xA1E3;
		} else if (w == 0xFFE0) {
			s = 0xA246;
		} else if (w == 0xFFE1) {
			s = 0xA247;
		} else if (w == 0xFFE3) {
			s = 0xA1C3;
		} else if (w == 0xFF0F) {
			s = 0xA1FE;
		}

		if (!s) {
			if (w == 0) {
				/* NUL has no table entry but is always encodable */
				out = mb_convert_buf_add(out, 0);
			} else {
				/* Pass the CP950 encoder (not the BIG5 one) as the converter */
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp950);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			}
		} else if (s <= 0x80) {
			out = mb_convert_buf_add(out, s);
		} else {
			/* Double-byte code: one byte more than was reserved for this codepoint */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
12536
/* Alternate names under which BIG5 can be looked up (NULL-terminated list) */
static const char *mbfl_encoding_big5_aliases[] = {"CN-BIG5", "BIG-FIVE", "BIGFIVE", NULL};

/* Legacy filter table for converting BIG5 to wchar.
 * NOTE(review): entry order follows struct mbfl_convert_vtbl, which is declared outside
 * this section (source encoding, destination encoding, constructor, destructor, filter
 * function, flush function, one unused hook) -- confirm. */
static const struct mbfl_convert_vtbl vtbl_big5_wchar = {
	mbfl_no_encoding_big5,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_big5_wchar,
	mbfl_filt_conv_big5_wchar_flush,
	NULL,
};

/* Legacy filter table for converting wchar to BIG5 (same layout as above) */
static const struct mbfl_convert_vtbl vtbl_wchar_big5 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_big5,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_big5,
	mbfl_filt_conv_common_flush,
	NULL
};

/* Public descriptor for the BIG5 encoding: names ("BIG-5" and "BIG5"), alias list,
 * mblen_table_81_to_fe, both legacy filter tables and the buffer-based converters.
 * The last two entries are left NULL. */
const mbfl_encoding mbfl_encoding_big5 = {
	mbfl_no_encoding_big5,
	"BIG-5",
	"BIG5",
	mbfl_encoding_big5_aliases,
	mblen_table_81_to_fe,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_big5_wchar,
	&vtbl_wchar_big5,
	mb_big5_to_wchar,
	mb_wchar_to_big5,
	NULL,
	NULL,
};
12573
/* Legacy filter table for converting CP950 to wchar. It reuses the BIG5 filter
 * functions, which check the encoding id at run time to apply CP950 rules. */
static const struct mbfl_convert_vtbl vtbl_cp950_wchar = {
	mbfl_no_encoding_cp950,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_big5_wchar,
	mbfl_filt_conv_big5_wchar_flush,
	NULL,
};

/* Legacy filter table for converting wchar to CP950; also shares the BIG5 filter
 * function. */
static const struct mbfl_convert_vtbl vtbl_wchar_cp950 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp950,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_big5,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Public descriptor for the CP950 encoding. It has no alias list; the second string is
 * "BIG5", the same as in the BIG5 descriptor. The buffer-based converters are the
 * CP950-specific ones. */
const mbfl_encoding mbfl_encoding_cp950 = {
	mbfl_no_encoding_cp950,
	"CP950",
	"BIG5",
	NULL,
	mblen_table_81_to_fe,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp950_wchar,
	&vtbl_wchar_cp950,
	mb_cp950_to_wchar,
	mb_wchar_to_cp950,
	NULL,
	NULL,
};
12608
12609 /*
12610 * HZ
12611 */
12612
/* Legacy byte-at-a-time decoder for HZ.
 *
 * filter->status holds the shift mode in the high nibble (0x00 = ASCII, 0x10 = GB2312)
 * and the position in the low nibble:
 *   0 = start of a character
 *   1 = first byte of a GB2312 character is in filter->cache
 *   2 = a '~' was just seen
 * Each decoded codepoint (or MBFL_BAD_INPUT for an invalid sequence) is handed to
 * filter->output_function. */
static int mbfl_filt_conv_hz_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status & 0xf) {
	case 0:
		if (c == '~') {
			filter->status += 2;
		} else if (filter->status == 0x10) {
			/* GB2312 mode: only a valid first byte is acceptable here */
			if ((c > 0x20 && c <= 0x29) || (c >= 0x30 && c <= 0x77)) {
				filter->cache = c;
				filter->status += 1;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		} else if (filter->status == 0 && c >= 0 && c < 0x80) {
			/* ASCII mode: latin, CTLs */
			CK((*filter->output_function)(c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: { /* GB2312 second byte */
		int first = filter->cache;
		int w;

		filter->status &= ~0xf;

		if (!(first > 0x20 && first < 0x7F && c > 0x20 && c < 0x7F)) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		int s = (first - 1)*192 + c + 0x40; /* GB2312 */
		ZEND_ASSERT(s < cp936_ucs_table_size);

		switch (s) {
		/* Entries for which HZ does not use the value in the CP936 table */
		case 0x1864: w = 0x30FB; break;
		case 0x186A: w = 0x2015; break;
		case 0x186C: w = 0x2225; break;
		default:
			if ((s >= 0x1920 && s <= 0x192A) || s == 0x1963 || (s >= 0x1C60 && s <= 0x1C7F) || (s >= 0x1DBB && s <= 0x1DC4)) {
				/* Present in the CP936 table, but rejected for HZ */
				w = 0;
			} else {
				w = cp936_ucs_table[s];
			}
			break;
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;
	}

	case 2: /* byte following '~' */
		if (c == '}' && filter->status == 0x12) {
			/* "~}" in GB2312 mode: shift to ASCII */
			filter->status = 0;
		} else if (c == '{' && filter->status == 2) {
			/* "~{" in ASCII mode: shift to GB2312 */
			filter->status = 0x10;
		} else if (c == '~' && filter->status == 2) {
			/* "~~" in ASCII mode is a literal tilde */
			CK((*filter->output_function)('~', filter->data));
			filter->status -= 2;
		} else if (c == '\n') {
			/* "~\n" is a line continuation; no output is needed, nor should we shift modes */
			filter->status -= 2;
		} else {
			/* Invalid character after ~ */
			filter->status -= 2;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
12687
/* Flush handler for the legacy HZ decoder: reports a dangling first byte of a GB2312
 * character as MBFL_BAD_INPUT, resets the filter to its initial state and forwards the
 * flush to the next stage if there is one. */
static int mbfl_filt_conv_hz_wchar_flush(mbfl_convert_filter *filter)
{
	int truncated = (filter->status == 0x11);

	if (truncated) {
		/* Input ended in the middle of a 2-byte character */
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}

	filter->status = 0;

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return 0;
}
12703
/* Legacy codepoint-at-a-time encoder for HZ.
 *
 * The codepoint `c` is looked up in the CP936/GB2312 reverse tables, skipping entries
 * which are excluded for HZ. filter->status is 0 while in ASCII mode and 0x200 while in
 * GB2312 mode; the "~{" / "~}" shift sequences are emitted whenever the mode has to
 * change. An unencodable codepoint is passed to mbfl_filt_conv_illegal_output(). */
static int mbfl_filt_conv_wchar_hz(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		switch (c) {
		case 0xB7:
		case 0x144:
		case 0x148:
		case 0x251:
		case 0x261:
		case 0x2CA:
		case 0x2CB:
		case 0x2D9:
			/* Excluded; s stays 0 */
			break;
		default:
			s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
			break;
		}
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		if (c == 0x2015) {
			s = 0xA1AA;
		} else if (!(c == 0x2010 || c == 0x2013 || c == 0x2014 || c == 0x2016 || c == 0x2025 || c == 0x2035 ||
				c == 0x2105 || c == 0x2109 || c == 0x2121 || (c >= 0x2170 && c <= 0x2179) || (c >= 0x2196 && c <= 0x2199) ||
				c == 0x2215 || c == 0x221F || c == 0x2223 || c == 0x2252 || c == 0x2266 || c == 0x2267 || c == 0x2295 ||
				(c >= 0x2550 && c <= 0x2573) || c == 0x22BF || c == 0x2609 || (c >= 0x2581 && c <= 0x258F) ||
				(c >= 0x2593 && c <= 0x2595) || c == 0x25BC || c == 0x25BD || (c >= 0x25E2 && c <= 0x25E5))) {
			s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		if (c == 0x30FB) {
			s = 0xA1A4;
		} else if (!(c == 0x3006 || c == 0x3007 || c == 0x3012 || c == 0x3231 || c == 0x32A3 || c >= 0x3300 ||
				(c >= 0x3018 && c <= 0x3040) || (c >= 0x309B && c <= 0x309E) || (c >= 0x30FC && c <= 0x30FE))) {
			s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
		}
	} else if (c >= ucs_i_gb2312_table_min && c < ucs_i_gb2312_table_max) {
		s = ucs_i_gb2312_table[c - ucs_i_gb2312_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		switch (c) {
		case 0xFF04:
			s = 0xA1E7;
			break;
		case 0xFF5E:
			s = 0xA1AB;
			break;
		case 0xFFE0:
		case 0xFFE1:
		case 0xFFE3:
		case 0xFFE5:
			s = ucs_hff_s_cp936_table[c - 0xFFE0];
			break;
		default:
			if (c >= 0xFF01 && c <= 0xFF5D) {
				s = c - 0xFF01 + 0xA3A1;
			}
			break;
		}
	}

	/* Strip the high bit of both bytes */
	if (s & 0x8000) {
		s -= 0x8080;
	}

	if (s <= 0) {
		s = (c == 0) ? 0 : -1;
	} else if ((s >= 0x80 && s < 0x2121) || s > 0x8080) {
		s = -1;
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (s < 0x80) {
		/* ASCII */
		if ((filter->status & 0xff00) != 0) {
			CK((*filter->output_function)('~', filter->data));
			CK((*filter->output_function)('}', filter->data));
		}
		filter->status = 0;
		if (s == 0x7E) {
			/* A literal tilde is written as "~~" */
			CK((*filter->output_function)('~', filter->data));
		}
		CK((*filter->output_function)(s, filter->data));
	} else {
		/* GB 2312-80 */
		if ((filter->status & 0xFF00) != 0x200) {
			CK((*filter->output_function)('~', filter->data));
			CK((*filter->output_function)('{', filter->data));
		}
		filter->status = 0x200;
		CK((*filter->output_function)((s >> 8) & 0x7F, filter->data));
		CK((*filter->output_function)(s & 0x7F, filter->data));
	}

	return 0;
}
12785
/* Flush handler for the legacy HZ encoder.
 *
 * If the encoder is not in ASCII mode (some bit of 0xFF00 is set in filter->status), the
 * closing "~}" sequence is emitted so that the output ends in ASCII mode. The state is
 * then reset and, as in the other flush handlers of this file, the flush is forwarded
 * to the next stage if there is one, so that downstream filters are flushed too. */
static int mbfl_filt_conv_any_hz_flush(mbfl_convert_filter *filter)
{
	/* back to latin */
	if (filter->status & 0xFF00) {
		CK((*filter->output_function)('~', filter->data));
		CK((*filter->output_function)('}', filter->data));
	}
	filter->status = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
12796
/* Shift modes of the buffer-based HZ converters below; stored in `*state` by
 * mb_hz_to_wchar and in `buf->state` by mb_wchar_to_hz */
#define ASCII 0
#define GB2312 1
12799
mb_hz_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)12800 static size_t mb_hz_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
12801 {
12802 unsigned char *p = *in, *e = p + *in_len;
12803 uint32_t *out = buf, *limit = buf + bufsize;
12804
12805 while (p < e && out < limit) {
12806 unsigned char c = *p++;
12807
12808 if (c == '~') {
12809 if (p == e) {
12810 break;
12811 }
12812 unsigned char c2 = *p++;
12813
12814 if (c2 == '}' && *state == GB2312) {
12815 *state = ASCII;
12816 } else if (c2 == '{' && *state == ASCII) {
12817 *state = GB2312;
12818 } else if (c2 == '~' && *state == ASCII) {
12819 *out++ = '~';
12820 } else if (c2 == '\n') {
12821 /* "~\n" is a line continuation; no output is needed, nor should we shift modes */
12822 } else {
12823 /* Invalid character after ~ */
12824 *out++ = MBFL_BAD_INPUT;
12825 }
12826 } else if (((c > 0x20 && c <= 0x29) || (c >= 0x30 && c <= 0x77)) && p < e && *state == GB2312) {
12827 unsigned char c2 = *p++;
12828
12829 if (c > 0x20 && c < 0x7F && c2 > 0x20 && c2 < 0x7F) {
12830 unsigned int s = (c - 1)*192 + c2 + 0x40;
12831 ZEND_ASSERT(s < cp936_ucs_table_size);
12832
12833 if (s == 0x1864) {
12834 s = 0x30FB;
12835 } else if (s == 0x186A) {
12836 s = 0x2015;
12837 } else if (s == 0x186C) {
12838 s = 0x2225;
12839 } else if ((s >= 0x1920 && s <= 0x192A) || s == 0x1963 || (s >= 0x1C60 && s <= 0x1C7F) || (s >= 0x1DBB && s <= 0x1DC4)) {
12840 s = 0;
12841 } else {
12842 s = cp936_ucs_table[s];
12843 }
12844 if (!s)
12845 s = MBFL_BAD_INPUT;
12846 *out++ = s;
12847 } else {
12848 *out++ = MBFL_BAD_INPUT;
12849 }
12850 } else if (c < 0x80 && *state == ASCII) {
12851 *out++ = c;
12852 } else {
12853 *out++ = MBFL_BAD_INPUT;
12854 }
12855 }
12856
12857 *in_len = e - p;
12858 *in = p;
12859 return out - buf;
12860 }
12861
/* Encode `len` codepoints from `in` as HZ and append them to the conversion buffer `buf`.
 * buf->state carries the current shift mode (ASCII or GB2312) between calls; when `end`
 * is set and the output is not in ASCII mode, the closing "~}" is appended.
 *
 * Codepoints are looked up in the CP936/GB2312 reverse tables, skipping entries which
 * are excluded for HZ. Unencodable codepoints are reported through MB_CONVERT_ERROR. */
static void mb_wchar_to_hz(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			switch (w) {
			case 0xB7:
			case 0x144:
			case 0x148:
			case 0x251:
			case 0x261:
			case 0x2CA:
			case 0x2CB:
			case 0x2D9:
				/* Excluded; s stays 0 */
				break;
			default:
				s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
				break;
			}
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			if (w == 0x2015) {
				s = 0xA1AA;
			} else if (!(w == 0x2010 || w == 0x2013 || w == 0x2014 || w == 0x2016 || w == 0x2025 || w == 0x2035 ||
					w == 0x2105 || w == 0x2109 || w == 0x2121 || (w >= 0x2170 && w <= 0x2179) || (w >= 0x2196 && w <= 0x2199) ||
					w == 0x2215 || w == 0x221F || w == 0x2223 || w == 0x2252 || w == 0x2266 || w == 0x2267 || w == 0x2295 ||
					(w >= 0x2550 && w <= 0x2573) || w == 0x22BF || w == 0x2609 || (w >= 0x2581 && w <= 0x258F) ||
					(w >= 0x2593 && w <= 0x2595) || w == 0x25BC || w == 0x25BD || (w >= 0x25E2 && w <= 0x25E5))) {
				s = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			if (w == 0x30FB) {
				s = 0xA1A4;
			} else if (!(w == 0x3006 || w == 0x3007 || w == 0x3012 || w == 0x3231 || w == 0x32A3 || w >= 0x3300 ||
					(w >= 0x3018 && w <= 0x3040) || (w >= 0x309B && w <= 0x309E) || (w >= 0x30FC && w <= 0x30FE))) {
				s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
			}
		} else if (w >= ucs_i_gb2312_table_min && w < ucs_i_gb2312_table_max) {
			s = ucs_i_gb2312_table[w - ucs_i_gb2312_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			switch (w) {
			case 0xFF04:
				s = 0xA1E7;
				break;
			case 0xFF5E:
				s = 0xA1AB;
				break;
			case 0xFFE0:
			case 0xFFE1:
			case 0xFFE3:
			case 0xFFE5:
				s = ucs_hff_s_cp936_table[w - 0xFFE0];
				break;
			default:
				if (w >= 0xFF01 && w <= 0xFF5D) {
					s = w - 0xFF01 + 0xA3A1;
				}
				break;
			}
		}

		/* Strip the high bit of both bytes */
		s &= ~0x8080;

		if ((!s && w) || (s >= 0x80 && s < 0x2121)) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_hz);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		}

		if (s >= 0x80) {
			/* GB 2312-80 */
			if (buf->state != GB2312) {
				/* Room for "~{" plus the 2-byte character */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add2(out, '~', '{');
				buf->state = GB2312;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
			continue;
		}

		/* ASCII */
		if (buf->state != ASCII) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
			out = mb_convert_buf_add2(out, '~', '}');
			buf->state = ASCII;
		}
		if (s == '~') {
			/* A literal tilde is written as "~~" */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, '~', '~');
		} else {
			out = mb_convert_buf_add(out, s);
		}
	}

	if (end && buf->state != ASCII) {
		/* If not in ASCII state, need to emit closing control chars */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 2);
		out = mb_convert_buf_add2(out, '~', '}');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
12947
/* Legacy filter table for converting HZ to wchar.
 * NOTE(review): entry order follows struct mbfl_convert_vtbl, which is declared outside
 * this section (source encoding, destination encoding, constructor, destructor, filter
 * function, flush function, one unused hook) -- confirm. */
static const struct mbfl_convert_vtbl vtbl_hz_wchar = {
	mbfl_no_encoding_hz,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_hz_wchar,
	mbfl_filt_conv_hz_wchar_flush,
	NULL,
};

/* Legacy filter table for converting wchar to HZ. It uses the HZ-specific flush
 * function, which emits the closing "~}" if needed. */
static const struct mbfl_convert_vtbl vtbl_wchar_hz = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_hz,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_hz,
	mbfl_filt_conv_any_hz_flush,
	NULL,
};

/* Public descriptor for the HZ encoding: names ("HZ" and "HZ-GB-2312"), no alias list,
 * both legacy filter tables and the buffer-based converters. The entry which other
 * descriptors in this file fill with mblen_table_81_to_fe is NULL here, as are the
 * last two entries. */
const mbfl_encoding mbfl_encoding_hz = {
	mbfl_no_encoding_hz,
	"HZ",
	"HZ-GB-2312",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_hz_wchar,
	&vtbl_wchar_hz,
	mb_hz_to_wchar,
	mb_wchar_to_hz,
	NULL,
	NULL,
};
12982