1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file: Moriyoshi Koizumi <koizumi@gree.co.jp>
22 *
23 */
24
25 #include "mbfilter.h"
26 #include "mbfilter_cp5022x.h"
27 #include "mbfilter_jis.h"
28
29 #include "unicode_table_cp932_ext.h"
30 #include "unicode_table_jis.h"
31 #include "cp932_table.h"
32 #include "translit_kana_jisx0201_jisx0208.h"
33
34 static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter);
35 static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter);
36 static int mbfl_filt_conv_wchar_cp50222_flush(mbfl_convert_filter *filter);
37 static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter);
38 static size_t mb_cp5022x_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
39 static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
40 static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
41 static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
42
/* Alternative names which are accepted for CP50220.
 *
 * Previously, a dubious 'encoding' called 'cp50220raw' was supported. This
 * was just CP50220, but the implementation was less strict regarding invalid
 * characters; it would silently pass some of them through. This 'encoding'
 * only ever existed in mbstring. In case somebody is still using it, minimal
 * support is retained by aliasing it to CP50220.
 *
 * Further, mbstring also had a made-up encoding called "JIS-ms". It was the
 * same as CP5022{0,1,2}, but without their special ways of handling
 * conversion of Unicode half-width katakana. */
static const char *cp50220_aliases[] = {
	"cp50220raw",
	"cp50220-raw",
	"JIS-ms",
	NULL
};
53
54 const mbfl_encoding mbfl_encoding_cp50220 = {
55 mbfl_no_encoding_cp50220,
56 "CP50220",
57 "ISO-2022-JP",
58 cp50220_aliases,
59 NULL,
60 MBFL_ENCTYPE_GL_UNSAFE,
61 &vtbl_cp50220_wchar,
62 &vtbl_wchar_cp50220,
63 mb_cp5022x_to_wchar,
64 mb_wchar_to_cp50220,
65 NULL
66 };
67
68 const mbfl_encoding mbfl_encoding_cp50221 = {
69 mbfl_no_encoding_cp50221,
70 "CP50221",
71 "ISO-2022-JP",
72 NULL,
73 NULL,
74 MBFL_ENCTYPE_GL_UNSAFE,
75 &vtbl_cp50221_wchar,
76 &vtbl_wchar_cp50221,
77 mb_cp5022x_to_wchar,
78 mb_wchar_to_cp50221,
79 NULL
80 };
81
82 const mbfl_encoding mbfl_encoding_cp50222 = {
83 mbfl_no_encoding_cp50222,
84 "CP50222",
85 "ISO-2022-JP",
86 NULL,
87 NULL,
88 MBFL_ENCTYPE_GL_UNSAFE,
89 &vtbl_cp50222_wchar,
90 &vtbl_wchar_cp50222,
91 mb_cp5022x_to_wchar,
92 mb_wchar_to_cp50222,
93 NULL
94 };
95
96 const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {
97 mbfl_no_encoding_cp50220,
98 mbfl_no_encoding_wchar,
99 mbfl_filt_conv_common_ctor,
100 NULL,
101 mbfl_filt_conv_cp5022x_wchar,
102 mbfl_filt_conv_cp5022x_wchar_flush,
103 NULL,
104 };
105
106 const struct mbfl_convert_vtbl vtbl_wchar_cp50220 = {
107 mbfl_no_encoding_wchar,
108 mbfl_no_encoding_cp50220,
109 mbfl_filt_conv_common_ctor,
110 NULL,
111 mbfl_filt_conv_wchar_cp50220,
112 mbfl_filt_conv_wchar_cp50220_flush,
113 NULL,
114 };
115
116 const struct mbfl_convert_vtbl vtbl_cp50221_wchar = {
117 mbfl_no_encoding_cp50221,
118 mbfl_no_encoding_wchar,
119 mbfl_filt_conv_common_ctor,
120 NULL,
121 mbfl_filt_conv_cp5022x_wchar,
122 mbfl_filt_conv_cp5022x_wchar_flush,
123 NULL,
124 };
125
126 const struct mbfl_convert_vtbl vtbl_wchar_cp50221 = {
127 mbfl_no_encoding_wchar,
128 mbfl_no_encoding_cp50221,
129 mbfl_filt_conv_common_ctor,
130 NULL,
131 mbfl_filt_conv_wchar_cp50221,
132 mbfl_filt_conv_any_jis_flush,
133 NULL,
134 };
135
136 const struct mbfl_convert_vtbl vtbl_cp50222_wchar = {
137 mbfl_no_encoding_cp50222,
138 mbfl_no_encoding_wchar,
139 mbfl_filt_conv_common_ctor,
140 NULL,
141 mbfl_filt_conv_cp5022x_wchar,
142 mbfl_filt_conv_cp5022x_wchar_flush,
143 NULL,
144 };
145
146 const struct mbfl_convert_vtbl vtbl_wchar_cp50222 = {
147 mbfl_no_encoding_wchar,
148 mbfl_no_encoding_cp50222,
149 mbfl_filt_conv_common_ctor,
150 NULL,
151 mbfl_filt_conv_wchar_cp50222,
152 mbfl_filt_conv_wchar_cp50222_flush,
153 NULL,
154 };
155
/* Evaluate `statement` once; if its value is negative, make the ENCLOSING
 * function return -1 immediately. Only usable inside functions which
 * return an integer. */
#define CK(statement) \
	do { \
		if ((statement) < 0) \
			return (-1); \
	} while (0)
157
/* Streaming decoder shared by CP50220, CP50221 and CP50222: consume one input
 * byte `c` and pass zero or more codepoints to `filter->output_function`.
 *
 * `filter->status` holds two things at once:
 *
 *   Upper nibble - character set which is currently selected
 *     0x00  ASCII
 *     0x10  JIS X 0201 Latin
 *     0x20  JIS X 0201 Kana
 *     0x80  JIS X 0208
 *     0x90  JIS X 0212
 *
 *   Lower nibble - progress through a multi-byte unit
 *     0  at a character boundary
 *     1  first byte of a 2-byte character is held in `filter->cache`
 *     2  seen ESC
 *     3  seen ESC $
 *     4  seen ESC $ (
 *     5  seen ESC (
 *
 * When an escape sequence turns out to be invalid, MBFL_BAD_INPUT is emitted,
 * followed by the '$' and/or '(' bytes which had already been swallowed, and
 * then the current byte is processed again from a character boundary.
 *
 * Returns 0, or -1 as soon as the output function returns a negative value. */
int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter)
{
	for (;;) {
		switch (filter->status & 0xf) {
		case 0: /* At a character boundary */
			if (c == 0x1b) {
				filter->status += 2;
			} else if (c == 0x0e) {
				/* "kana in" */
				filter->status = 0x20;
			} else if (c == 0x0f) {
				/* "kana out" */
				filter->status = 0;
			} else if (filter->status == 0x10 && (c == 0x5c || c == 0x7e)) {
				/* JIS X 0201 Latin: 0x5C is YEN SIGN, 0x7E is OVERLINE */
				CK((*filter->output_function)(c == 0x5c ? 0xa5 : 0x203e, filter->data));
			} else if (filter->status == 0x20 && c > 0x20 && c < 0x60) {
				/* 7-bit JIS X 0201 kana */
				CK((*filter->output_function)(0xff40 + c, filter->data));
			} else if ((filter->status == 0x80 || filter->status == 0x90) && c > 0x20 && c <= 0x97) {
				/* First byte of a 2-byte character; wait for the second one */
				filter->cache = c;
				filter->status += 1;
			} else if (c >= 0 && c < 0x80) {
				/* Latin letters and control characters */
				CK((*filter->output_function)(c, filter->data));
			} else if (c > 0xa0 && c < 0xe0) {
				/* 8-bit (GR) kana */
				CK((*filter->output_function)(0xfec0 + c, filter->data));
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
			break;

		case 1: { /* Second byte of a JIS X 0208 or JIS X 0212 character */
			const int lead = filter->cache;
			int w = 0;

			filter->status &= ~0xf;
			if (c > 0x20 && c < 0x7f) {
				const int s = (lead - 0x21)*94 + c - 0x21;

				if (filter->status == 0x80) {
					/* The order of these range checks is significant */
					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
					} else if (s >= 0 && s < jisx0208_ucs_table_size) {
						w = jisx0208_ucs_table[s];
					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
					} else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
						w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
					} else if (s >= 94 * 94 && s < 114 * 94) {
						/* User-defined area => Private Use Area (Microsoft extension) */
						w = s - 94*94 + 0xe000;
					}
				} else if (s >= 0 && s < jisx0212_ucs_table_size) {
					w = jisx0212_ucs_table[s];
				}
			}

			/* Either the second byte was out of range, or no mapping exists */
			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}
			CK((*filter->output_function)(w, filter->data));
			break;
		}

		case 2: /* ESC */
			if (c == '$') {
				filter->status++;
			} else if (c == '(') {
				filter->status += 3;
			} else {
				filter->status &= ~0xf;
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				continue;
			}
			break;

		case 3: /* ESC $ */
			if (c == '@' || c == 'B') {
				filter->status = 0x80;
			} else if (c == '(') {
				filter->status++;
			} else {
				filter->status &= ~0xf;
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				CK((*filter->output_function)('$', filter->data));
				continue;
			}
			break;

		case 4: /* ESC $ ( */
			if (c == '@' || c == 'B') {
				filter->status = 0x80;
			} else if (c == 'D') {
				filter->status = 0x90;
			} else {
				filter->status &= ~0xf;
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				CK((*filter->output_function)('$', filter->data));
				CK((*filter->output_function)('(', filter->data));
				continue;
			}
			break;

		case 5: /* ESC ( */
			if (c == 'B' || c == 'H') {
				filter->status = 0;
			} else if (c == 'J') {
				filter->status = 0x10;
			} else if (c == 'I') {
				filter->status = 0x20;
			} else {
				filter->status &= ~0xf;
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				CK((*filter->output_function)('(', filter->data));
				continue;
			}
			break;

		EMPTY_SWITCH_DEFAULT_CASE();
		}

		return 0;
	}
}
320
/* End-of-input handler for the CP5022x decoder.
 *
 * If the low nibble of `filter->status` is non-zero, the input stopped in the
 * middle of a 2-byte (JIS X 0208 or 0212) character or in the middle of an
 * escape sequence; MBFL_BAD_INPUT is emitted for it. The decoder is then
 * reset to its initial state and the flush function (if any) is invoked;
 * the return value of that flush function is not examined.
 *
 * Returns 0, or -1 if emitting the error marker failed. */
static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter)
{
	const int unfinished = filter->status & 0xF;

	if (unfinished != 0) {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
336
337 /* Apply various transforms to input codepoint, such as converting halfwidth katakana
338 * to fullwidth katakana. `mode` is a bitfield which controls which transforms are
339 * actually performed. The bit values are defined in translit_kana_jisx0201_jisx0208.h.
340 * `mode` must not call for transforms which are inverses (i.e. which would cancel
341 * each other out).
342 *
343 * In some cases, successive input codepoints may be merged into one output codepoint.
344 * (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed
345 * and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed`
346 * will not be modified. If there is no following codepoint, `next` should be zero.
347 *
348 * Again, in some cases, one input codepoint may convert to two output codepoints.
349 * If so, the second output codepoint will be stored in `*second`.
350 *
351 * Return the resulting codepoint. If none of the requested transforms apply, return
352 * the input codepoint unchanged.
353 */
mb_convert_kana_codepoint(uint32_t c,uint32_t next,bool * consumed,uint32_t * second,unsigned int mode)354 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode)
355 {
356 if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7D && c != '"' && c != '\'' && c != '\\') {
357 return c + 0xFEE0;
358 }
359 if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
360 return c + 0xFEE0;
361 }
362 if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') {
363 return c + 0xFEE0;
364 }
365 if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') {
366 return 0x3000;
367 }
368
369 if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) {
370 /* Convert Hankaku kana to Zenkaku kana
371 * Either all Hankaku kana (including katakana and hiragana) will be converted
372 * to Zenkaku katakana, or to Zenkaku hiragana */
373 if ((mode & MBFL_HAN2ZEN_KATAKANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
374 if (c >= 0xFF61 && c <= 0xFF9F) {
375 int n = c - 0xFF60;
376
377 if (next >= 0xFF61 && next <= 0xFF9F) {
378 if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
379 *consumed = true;
380 return 0x3001 + hankana2zenkana_table[n];
381 }
382 if (next == 0xFF9E && n == 19) {
383 *consumed = true;
384 return 0x30F4;
385 }
386 if (next == 0xFF9F && n >= 42 && n <= 46) {
387 *consumed = true;
388 return 0x3002 + hankana2zenkana_table[n];
389 }
390 }
391
392 return 0x3000 + hankana2zenkana_table[n];
393 }
394 }
395 if ((mode & MBFL_HAN2ZEN_HIRAGANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
396 if (c >= 0xFF61 && c <= 0xFF9F) {
397 int n = c - 0xFF60;
398
399 if (next >= 0xFF61 && next <= 0xFF9F) {
400 if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
401 *consumed = true;
402 return 0x3001 + hankana2zenhira_table[n];
403 }
404 if (next == 0xFF9F && n >= 42 && n <= 46) {
405 *consumed = true;
406 return 0x3002 + hankana2zenhira_table[n];
407 }
408 }
409
410 return 0x3000 + hankana2zenhira_table[n];
411 }
412 }
413 if ((mode & MBFL_HAN2ZEN_KATAKANA) && c >= 0xFF61 && c <= 0xFF9F) {
414 return 0x3000 + hankana2zenkana_table[c - 0xFF60];
415 }
416 if ((mode & MBFL_HAN2ZEN_HIRAGANA) && c >= 0xFF61 && c <= 0xFF9F) {
417 return 0x3000 + hankana2zenhira_table[c - 0xFF60];
418 }
419 }
420
421 if (mode & MBFL_HAN2ZEN_SPECIAL) { /* special ascii to symbol */
422 if (c == '\\' || c == 0xA5) { /* YEN SIGN */
423 return 0xFFE5; /* FULLWIDTH YEN SIGN */
424 }
425 if (c == 0x7E || c == 0x203E) {
426 return 0xFFE3; /* FULLWIDTH MACRON */
427 }
428 if (c == '\'') {
429 return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
430 }
431 if (c == '"') {
432 return 0x201D; /* RIGHT DOUBLE QUOTATION MARK */
433 }
434 }
435
436 if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE)) {
437 /* Zenkaku to Hankaku */
438 if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xFF01 && c <= 0xFF5D && c != 0xFF02 && c != 0xFF07 && c != 0xFF3C) {
439 /* all except " ' \ ~ */
440 return c - 0xFEE0;
441 }
442 if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xFF21 && c <= 0xFF3A) || (c >= 0xFF41 && c <= 0xFF5A))) {
443 return c - 0xFEE0;
444 }
445 if ((mode & MBFL_ZEN2HAN_NUMERIC) && (c >= 0xFF10 && c <= 0xFF19)) {
446 return c - 0xFEE0;
447 }
448 if ((mode & MBFL_ZEN2HAN_SPACE) && (c == 0x3000)) {
449 return ' ';
450 }
451 if ((mode & MBFL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
452 return '-';
453 }
454 }
455
456 if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) {
457 /* Zenkaku kana to hankaku kana */
458 if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30A1 && c <= 0x30F4) {
459 /* Zenkaku katakana to hankaku kana */
460 int n = c - 0x30A1;
461 if (zenkana2hankana_table[n][1]) {
462 *second = 0xFF00 + zenkana2hankana_table[n][1];
463 }
464 return 0xFF00 + zenkana2hankana_table[n][0];
465 }
466 if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
467 /* Zenkaku hiragana to hankaku kana */
468 int n = c - 0x3041;
469 if (zenkana2hankana_table[n][1]) {
470 *second = 0xFF00 + zenkana2hankana_table[n][1];
471 }
472 return 0xFF00 + zenkana2hankana_table[n][0];
473 }
474 if (c == 0x3001) {
475 return 0xFF64; /* HALFWIDTH IDEOGRAPHIC COMMA */
476 }
477 if (c == 0x3002) {
478 return 0xFF61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
479 }
480 if (c == 0x300C) {
481 return 0xFF62; /* HALFWIDTH LEFT CORNER BRACKET */
482 }
483 if (c == 0x300D) {
484 return 0xFF63; /* HALFWIDTH RIGHT CORNER BRACKET */
485 }
486 if (c == 0x309B) {
487 return 0xFF9E; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
488 }
489 if (c == 0x309C) {
490 return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
491 }
492 if (c == 0x30FC) {
493 return 0xFF70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
494 }
495 if (c == 0x30FB) {
496 return 0xFF65; /* HALFWIDTH KATAKANA MIDDLE DOT */
497 }
498 }
499
500 if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) {
501 if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309D || c == 0x309E)) {
502 /* Zenkaku hiragana to Zenkaku katakana */
503 return c + 0x60;
504 }
505 if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30A1 && c <= 0x30F3) || c == 0x30FD || c == 0x30FE)) {
506 /* Zenkaku katakana to Zenkaku hiragana */
507 return c - 0x60;
508 }
509 }
510
511 if (mode & MBFL_ZEN2HAN_SPECIAL) { /* special symbol to ascii */
512 if (c == 0xFFE5 || c == 0xFF3C) { /* FULLWIDTH YEN SIGN/FULLWIDTH REVERSE SOLIDUS */
513 return '\\';
514 }
515 if (c == 0xFFE3 || c == 0x203E) { /* FULLWIDTH MACRON/OVERLINE */
516 return '~';
517 }
518 if (c == 0x2018 || c == 0x2019) { /* LEFT/RIGHT SINGLE QUOTATION MARK*/
519 return '\'';
520 }
521 if (c == 0x201C || c == 0x201D) { /* LEFT/RIGHT DOUBLE QUOTATION MARK */
522 return '"';
523 }
524 }
525
526 return c;
527 }
528
/* Streaming encoder: wchar => CP50220.
 *
 * CP50220 output is produced by first converting halfwidth katakana to
 * fullwidth katakana and then encoding the result as CP50221. Since a
 * halfwidth kana may merge with the sound mark which follows it, every
 * codepoint is held back in `filter->cache` until its successor is known.
 * `filter->cache == 0` means that nothing is being held back, which is why
 * an input codepoint of zero needs special treatment.
 *
 * Always returns 0; the return values of the CP50221 encoder and of the
 * output function are not examined here. */
static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
{
	const int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;

	if (!filter->cache) {
		if (c == 0) {
			/* Zero cannot be cached (see above), so pass it through directly */
			(*filter->output_function)(0, filter->data);
		} else {
			filter->cache = c;
		}
		return 0;
	}

	bool merged = false;
	int converted = mb_convert_kana_codepoint(filter->cache, c, &merged, NULL, mode);

	/* If `c` was merged into the held-back codepoint, nothing remains to be
	 * held back; otherwise `c` becomes the new held-back codepoint */
	filter->cache = merged ? 0 : c;

	/* Terrible hack to get CP50220 to emit error markers in the proper
	 * position, not reordering them with subsequent characters */
	filter->filter_function = mbfl_filt_conv_wchar_cp50221;
	mbfl_filt_conv_wchar_cp50221(converted, filter);
	filter->filter_function = mbfl_filt_conv_wchar_cp50220;

	if (c == 0 && !merged) {
		(*filter->output_function)(0, filter->data);
	}

	return 0;
}
555
/* End-of-input handler for the CP50220 encoder.
 *
 * Encodes the codepoint which is still held back in `filter->cache` (if any),
 * telling the kana converter that nothing follows it, and then delegates to
 * mbfl_filt_conv_any_jis_flush, whose return value is returned. The return
 * value of the CP50221 encoder is not examined. */
static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter)
{
	if (filter->cache != 0) {
		const int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
		/* `next` is zero here, so the converter has nothing to merge */
		int converted = mb_convert_kana_codepoint(filter->cache, 0, NULL, NULL, mode);

		/* Same temporary swap of the filter function as in
		 * mbfl_filt_conv_wchar_cp50220 */
		filter->filter_function = mbfl_filt_conv_wchar_cp50221;
		mbfl_filt_conv_wchar_cp50221(converted, filter);
		filter->filter_function = mbfl_filt_conv_wchar_cp50220;
		filter->cache = 0;
	}

	return mbfl_filt_conv_any_jis_flush(filter);
}
570
mbfl_filt_conv_wchar_cp50221(int c,mbfl_convert_filter * filter)571 int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter)
572 {
573 int s = 0;
574
575 if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
576 s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
577 } else if (c == 0x203E) { /* OVERLINE */
578 s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
579 } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
580 s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
581 } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
582 s = ucs_i_jis_table[c - ucs_i_jis_table_min];
583 } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
584 s = ucs_r_jis_table[c - ucs_r_jis_table_min];
585 } else if (c >= 0xE000 && c <= 0xE757) {
586 /* 'private'/'user' codepoints */
587 s = c - 0xE000;
588 s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
589 }
590
591 if (s <= 0) {
592 if (c == 0xa5) { /* YEN SIGN */
593 s = 0x1005c;
594 } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
595 s = 0x2140;
596 } else if (c == 0x2225) { /* PARALLEL TO */
597 s = 0x2142;
598 } else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
599 s = 0x215d;
600 } else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
601 s = 0x2171;
602 } else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
603 s = 0x2172;
604 } else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
605 s = 0x224c;
606 }
607 }
608
609 /* Above, we do a series of lookups in `ucs_*_jis_table` to find a
610 * corresponding kuten code for this Unicode codepoint
611 * If we get zero, that means the codepoint is not in JIS X 0208
612 * On the other hand, if we get a result with the high bits set on both
613 * upper and lower bytes, that is not a code in JIS X 0208 but rather
614 * in JIS X 0213
615 * In either case, check if this codepoint is one of the extensions added
616 * to JIS X 0208 by MicroSoft (to make CP932) */
617 if (s == 0 || ((s & 0x8000) && (s & 0x80))) {
618 int i;
619 s = -1;
620
621 for (i = 0;
622 i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
623 i++) {
624 const int oh = cp932ext1_ucs_table_min / 94;
625
626 if (c == cp932ext1_ucs_table[i]) {
627 s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
628 break;
629 }
630 }
631
632 if (s < 0) {
633 const int oh = cp932ext2_ucs_table_min / 94;
634 const int cp932ext2_ucs_table_size =
635 cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
636 for (i = 0; i < cp932ext2_ucs_table_size; i++) {
637 if (c == cp932ext2_ucs_table[i]) {
638 s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
639 break;
640 }
641 }
642 }
643
644 if (c == 0) {
645 s = 0;
646 } else if (s <= 0) {
647 s = -1;
648 }
649 }
650
651 if (s >= 0) {
652 if (s < 0x80) { /* ASCII */
653 if ((filter->status & 0xff00) != 0) {
654 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
655 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
656 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
657 filter->status = 0;
658 }
659 CK((*filter->output_function)(s, filter->data));
660 } else if (s >= 0xa0 && s < 0xe0) { /* X 0201 kana */
661 if ((filter->status & 0xff00) != 0x500) {
662 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
663 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
664 CK((*filter->output_function)(0x49, filter->data)); /* 'I' */
665 filter->status = 0x500;
666 }
667 CK((*filter->output_function)(s - 0x80, filter->data));
668 } else if (s <= 0x927E) { /* X 0208 + extensions */
669 if ((filter->status & 0xff00) != 0x200) {
670 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
671 CK((*filter->output_function)(0x24, filter->data)); /* '$' */
672 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
673 filter->status = 0x200;
674 }
675 CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
676 CK((*filter->output_function)(s & 0xff, filter->data));
677 } else if (s < 0x10000) { /* X0212 */
678 CK(mbfl_filt_conv_illegal_output(c, filter));
679 } else { /* X 0201 latin */
680 if ((filter->status & 0xff00) != 0x400) {
681 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
682 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
683 CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
684 }
685 filter->status = 0x400;
686 CK((*filter->output_function)(s & 0x7f, filter->data));
687 }
688 } else {
689 CK(mbfl_filt_conv_illegal_output(c, filter));
690 }
691
692 return 0;
693 }
694
695 /*
696 * wchar => CP50222
697 */
mbfl_filt_conv_wchar_cp50222(int c,mbfl_convert_filter * filter)698 int mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter)
699 {
700 int s = 0;
701
702 if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
703 s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
704 } else if (c == 0x203E) { /* OVERLINE */
705 s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
706 } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
707 s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
708 } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
709 s = ucs_i_jis_table[c - ucs_i_jis_table_min];
710 } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
711 s = ucs_r_jis_table[c - ucs_r_jis_table_min];
712 } else if (c >= 0xE000 && c <= 0xE757) {
713 /* 'private'/'user' codepoints */
714 s = c - 0xE000;
715 s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
716 }
717
718 if (s <= 0) {
719 if (c == 0xa5) { /* YEN SIGN */
720 s = 0x1005c;
721 } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
722 s = 0x2140;
723 } else if (c == 0x2225) { /* PARALLEL TO */
724 s = 0x2142;
725 } else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
726 s = 0x215d;
727 } else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
728 s = 0x2171;
729 } else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
730 s = 0x2172;
731 } else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
732 s = 0x224c;
733 }
734 }
735 if (s == 0 || ((s & 0x8000) && (s & 0x80))) {
736 int i;
737 s = -1;
738
739 for (i = 0;
740 i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
741 const int oh = cp932ext1_ucs_table_min / 94;
742
743 if (c == cp932ext1_ucs_table[i]) {
744 s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
745 break;
746 }
747 }
748
749 if (s <= 0) {
750 const int oh = cp932ext2_ucs_table_min / 94;
751 const int cp932ext2_ucs_table_size =
752 cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
753 for (i = 0; i < cp932ext2_ucs_table_size; i++) {
754 if (c == cp932ext2_ucs_table[i]) {
755 s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
756 break;
757 }
758 }
759 }
760
761 if (c == 0) {
762 s = 0;
763 } else if (s <= 0) {
764 s = -1;
765 }
766 }
767
768 if (s >= 0) {
769 if (s < 0x80) { /* ASCII */
770 if ((filter->status & 0xff00) == 0x500) {
771 CK((*filter->output_function)(0x0f, filter->data)); /* SO */
772 filter->status = 0;
773 } else if ((filter->status & 0xff00) != 0) {
774 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
775 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
776 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
777 filter->status = 0;
778 }
779 CK((*filter->output_function)(s, filter->data));
780 } else if (s >= 0xa0 && s < 0xe0) { /* X 0201 kana */
781 if ((filter->status & 0xff00) != 0x500) {
782 CK((*filter->output_function)(0x0e, filter->data)); /* SI */
783 filter->status = 0x500;
784 }
785 CK((*filter->output_function)(s - 0x80, filter->data));
786 } else if (s <= 0x927E) { /* X 0208 */
787 if ((filter->status & 0xff00) == 0x500) {
788 CK((*filter->output_function)(0x0f, filter->data)); /* SO */
789 filter->status = 0;
790 }
791 if ((filter->status & 0xff00) != 0x200) {
792 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
793 CK((*filter->output_function)(0x24, filter->data)); /* '$' */
794 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
795 filter->status = 0x200;
796 }
797 CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
798 CK((*filter->output_function)(s & 0xff, filter->data));
799 } else if (s < 0x10000) { /* X0212 */
800 CK(mbfl_filt_conv_illegal_output(c, filter));
801 } else { /* X 0201 latin */
802 if ((filter->status & 0xff00) == 0x500) {
803 CK((*filter->output_function)(0x0f, filter->data)); /* SO */
804 filter->status = 0;
805 }
806 if ((filter->status & 0xff00) != 0x400) {
807 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
808 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
809 CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
810 }
811 filter->status = 0x400;
812 CK((*filter->output_function)(s & 0x7f, filter->data));
813 }
814 } else {
815 CK(mbfl_filt_conv_illegal_output(c, filter));
816 }
817
818 return 0;
819 }
820
/* End-of-input handler for the CP50222 encoder: bring the output back to
 * ASCII, reset the encoder state, and invoke the flush function (if any).
 * The return value of that flush function is not examined.
 *
 * Returns 0, or -1 as soon as an output call returns a negative value. */
static int mbfl_filt_conv_wchar_cp50222_flush(mbfl_convert_filter *filter)
{
	switch (filter->status & 0xff00) {
	case 0:
		/* Already in ASCII */
		break;

	case 0x500:
		/* JIS X 0201 Kana was entered with SO, so leave it with SI */
		CK((*filter->output_function)(0x0f, filter->data));
		break;

	default:
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
		break;
	}
	filter->status = 0;

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
839
/* Character set selected in the input; this is what mb_cp5022x_to_wchar keeps
 * in `*state`. The two 2-byte sets carry the highest values because the
 * decoder tests for them with `*state >= JISX_0208`. */
#define ASCII           0
#define JISX_0201_LATIN 1
#define JISX_0201_KANA  2
#define JISX_0208       3
#define JISX_0212       4
845
mb_cp5022x_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)846 static size_t mb_cp5022x_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
847 {
848 ZEND_ASSERT(bufsize >= 3);
849
850 unsigned char *p = *in, *e = p + *in_len;
851 uint32_t *out = buf, *limit = buf + bufsize;
852
853 while (p < e && out < limit) {
854 unsigned char c = *p++;
855
856 if (c == 0x1B) {
857 /* Escape sequence */
858 if ((e - p) < 2) {
859 *out++ = MBFL_BAD_INPUT;
860 /* Duplicate error-handling behavior of legacy code */
861 if (p < e && (*p == '(' || *p == '$'))
862 p++;
863 continue;
864 }
865 unsigned char c2 = *p++;
866 if (c2 == '$') {
867 unsigned char c3 = *p++;
868 if (c3 == '@' || c3 == 'B') {
869 *state = JISX_0208;
870 } else if (c3 == '(') {
871 if (p == e) {
872 *out++ = MBFL_BAD_INPUT;
873 break;
874 }
875 unsigned char c4 = *p++;
876 if (c4 == '@' || c4 == 'B') {
877 *state = JISX_0208;
878 } else if (c4 == 'D') {
879 *state = JISX_0212;
880 } else {
881 if ((limit - out) < 3) {
882 p -= 4;
883 break;
884 }
885 *out++ = MBFL_BAD_INPUT;
886 *out++ = '$';
887 *out++ = '(';
888 p--;
889 }
890 } else {
891 if ((limit - out) < 2) {
892 p -= 3;
893 break;
894 }
895 *out++ = MBFL_BAD_INPUT;
896 *out++ = '$';
897 p--;
898 }
899 } else if (c2 == '(') {
900 unsigned char c3 = *p++;
901 if (c3 == 'B' || c3 == 'H') {
902 *state = ASCII;
903 } else if (c3 == 'J') {
904 *state = JISX_0201_LATIN;
905 } else if (c3 == 'I') {
906 *state = JISX_0201_KANA;
907 } else {
908 if ((limit - out) < 2) {
909 p -= 3;
910 break;
911 }
912 *out++ = MBFL_BAD_INPUT;
913 *out++ = '(';
914 p--;
915 }
916 } else {
917 *out++ = MBFL_BAD_INPUT;
918 p--;
919 }
920 } else if (c == 0xE) {
921 *state = JISX_0201_KANA;
922 } else if (c == 0xF) {
923 *state = ASCII;
924 } else if (*state == JISX_0201_LATIN && c == 0x5C) { /* YEN SIGN */
925 *out++ = 0xA5;
926 } else if (*state == JISX_0201_LATIN && c == 0x7E) { /* OVER LINE */
927 *out++ = 0x203E;
928 } else if (*state == JISX_0201_KANA && c > 0x20 && c < 0x60) {
929 *out++ = 0xFF40 + c;
930 } else if (*state >= JISX_0208 && c > 0x20 && c <= 0x97) {
931 if (p == e) {
932 *out++ = MBFL_BAD_INPUT;
933 break;
934 }
935 unsigned char c2 = *p++;
936 if (c2 > 0x20 && c2 < 0x7F) {
937 unsigned int s = (c - 0x21)*94 + c2 - 0x21;
938 uint32_t w = 0;
939 if (*state == JISX_0208) {
940 if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
941 w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
942 } else if (s < jisx0208_ucs_table_size) {
943 w = jisx0208_ucs_table[s];
944 } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
945 w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
946 } else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
947 w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
948 } else if (s >= 94*94 && s < 114*94) {
949 /* MicroSoft extension */
950 w = s - 94*94 + 0xE000;
951 }
952 if (!w)
953 w = MBFL_BAD_INPUT;
954 } else {
955 if (s < jisx0212_ucs_table_size) {
956 w = jisx0212_ucs_table[s];
957 }
958 if (!w)
959 w = MBFL_BAD_INPUT;
960 }
961 *out++ = w;
962 } else {
963 *out++ = MBFL_BAD_INPUT;
964 }
965 } else if (c < 0x80) {
966 *out++ = c;
967 } else if (c >= 0xA1 && c <= 0xDF) {
968 *out++ = 0xFEC0 + c;
969 } else {
970 *out++ = MBFL_BAD_INPUT;
971 }
972 }
973
974 *in_len = e - p;
975 *in = p;
976 return out - buf;
977 }
978
lookup_wchar(uint32_t w)979 static unsigned int lookup_wchar(uint32_t w)
980 {
981 unsigned int s = 0;
982
983 if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
984 s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
985 } else if (w == 0x203E) { /* OVERLINE */
986 s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
987 } else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
988 s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
989 } else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
990 s = ucs_i_jis_table[w - ucs_i_jis_table_min];
991 } else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
992 s = ucs_r_jis_table[w - ucs_r_jis_table_min];
993 } else if (w >= 0xE000 && w <= 0xE757) {
994 /* Private Use Area codepoints */
995 s = w - 0xE000;
996 s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
997 }
998
999 if (!s) {
1000 if (w == 0xA5) { /* YEN SIGN */
1001 s = 0x1005C;
1002 } else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
1003 s = 0x2140;
1004 } else if (w == 0x2225) { /* PARALLEL TO */
1005 s = 0x2142;
1006 } else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
1007 s = 0x215D;
1008 } else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
1009 s = 0x2171;
1010 } else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
1011 s = 0x2172;
1012 } else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
1013 s = 0x224C;
1014 } else if (w == 0) {
1015 return 0;
1016 }
1017 }
1018
1019 /* Above, we do a series of lookups in `ucs_*_jis_table` to find a
1020 * corresponding kuten code for this Unicode codepoint
1021 * If we get zero, that means the codepoint is not in JIS X 0208
1022 * On the other hand, if we get a result with the high bits set on both
1023 * upper and lower bytes, that is not a code in JIS X 0208 but rather
1024 * in JIS X 0213
1025 * In either case, check if this codepoint is one of the extensions added
1026 * to JIS X 0208 by MicroSoft (to make CP932) */
1027 if (!s || s >= 0x8080) {
1028 for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
1029 if (w == cp932ext1_ucs_table[i]) {
1030 return (((i / 94) + (cp932ext1_ucs_table_min / 94) + 0x21) << 8) + (i % 94) + 0x21;
1031 }
1032 }
1033
1034 for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
1035 if (w == cp932ext2_ucs_table[i]) {
1036 return (((i / 94) + (cp932ext2_ucs_table_min / 94) + 0x21) << 8) + (i % 94) + 0x21;
1037 }
1038 }
1039 }
1040
1041 return s;
1042 }
1043
/* Encode `len` Unicode codepoints from `in` as CP50220, appending to `buf`.
 *
 * Every codepoint is first passed through mb_convert_kana_codepoint() with
 * the MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE flags, which may merge it
 * with the codepoint that follows. The result is then mapped by
 * lookup_wchar() and written out, preceded by an escape sequence whenever
 * the active character set changes.
 *
 * Layout of buf->state: the low 8 bits hold the active character set; the
 * bits selected by 0xFFFF00 hold a codepoint in 0xFF61..0xFF9F which was the
 * last one in the previous call and is still waiting for its successor.
 *
 * If `end` is true, no codepoint is held back, and ESC ( B is appended when
 * the output is not already in ASCII mode. */
static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t cp = 0;

	/* Pick up a codepoint which the previous call held back, if any */
	bool replay = (buf->state & 0xFFFF00) != 0;
	if (replay) {
		cp = buf->state >> 8;
		buf->state &= 0xFF;
	}

	/* The held-back codepoint is handled first, without touching `in` or
	 * `len`; inside the body, `len` is always the number of codepoints
	 * which have not been fetched yet */
	while (replay || len--) {
		if (replay) {
			replay = false;
		} else {
			cp = *in++;
		}

		if (!end && !len && cp >= 0xFF61 && cp <= 0xFF9F) {
			/* This might have to be combined with its successor, which
			 * will only arrive with the next buffer; hold it back */
			buf->state |= cp << 8;
			break;
		}

		bool merged = false;
		cp = mb_convert_kana_codepoint(cp, len ? *in : 0, &merged, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
		if (merged) {
			/* The following codepoint was absorbed into this one */
			in++;
			len--;
		}

		unsigned int code = lookup_wchar(cp);

		if ((!code && cp) || (code > 0x927E && code < 0x10000)) {
			/* Either no mapping at all, or one which CP50220 cannot carry.
			 * Error output goes through mb_wchar_to_cp50221, exactly as
			 * before (NOTE(review): presumably so that it bypasses the kana
			 * conversion and the held-back codepoint logic -- confirm) */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_cp50221);
			continue;
		}

		if (code >= 0x10000) {
			/* JIS X 0201 Latin, 'tagged' by lookup_wchar with 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, code & 0x7F);
		} else if (code < 0x80) {
			/* ASCII */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, code);
		} else if (code >= 0xA0 && code < 0xE0) {
			/* JIS X 0201 kana */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, code - 0x80);
		} else {
			/* Everything left is a two-byte JIS X 0208 code (<= 0x927E) */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
		}
	}

	if (end && buf->state != ASCII) {
		/* Leave the output in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1125
/* Encode `len` Unicode codepoints from `in` as CP50221, appending to `buf`.
 *
 * Each codepoint is mapped by lookup_wchar() and written in one of four
 * character sets, selected by these escape sequences:
 *   ESC ( B  ASCII
 *   ESC ( I  JIS X 0201 kana
 *   ESC $ B  JIS X 0208
 *   ESC ( J  JIS X 0201 Latin
 * An escape sequence is only written when the character set recorded in
 * buf->state differs from the one needed. Codepoints without a usable
 * mapping are passed to MB_CONVERT_ERROR.
 *
 * If `end` is true, ESC ( B is appended when the output is not already in
 * ASCII mode. */
static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len) {
		uint32_t cp = *in++;
		/* From here on, `len` counts the codepoints still to be fetched */
		len--;

		unsigned int code = lookup_wchar(cp);

		if ((!code && cp) || (code > 0x927E && code < 0x10000)) {
			/* Either no mapping at all, or one which CP50221 cannot carry */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_cp50221);
			continue;
		}

		if (code >= 0x10000) {
			/* JIS X 0201 Latin, 'tagged' by lookup_wchar with 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, code & 0x7F);
		} else if (code < 0x80) {
			/* ASCII */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, code);
		} else if (code >= 0xA0 && code < 0xE0) {
			/* JIS X 0201 kana */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, code - 0x80);
		} else {
			/* Everything left is a two-byte JIS X 0208 code (<= 0x927E) */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
		}
	}

	if (end && buf->state != ASCII) {
		/* Leave the output in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1182
/* Encode `len` Unicode codepoints from `in` as CP50222, appending to `buf`.
 *
 * Each codepoint is mapped by lookup_wchar(). JIS X 0201 kana are not
 * selected by an escape sequence here: the byte 0x0E (Shift Out) is written
 * when entering kana mode and 0x0F (Shift In) when leaving it. The other
 * character sets use escape sequences:
 *   ESC ( B  ASCII (not written when 0x0F has just returned to ASCII)
 *   ESC $ B  JIS X 0208
 *   ESC ( J  JIS X 0201 Latin
 * Codepoints without a usable mapping are passed to MB_CONVERT_ERROR.
 *
 * If `end` is true, the output is returned to ASCII mode: with 0x0F when
 * in kana mode, with ESC ( B when in any other non-ASCII mode. */
static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len) {
		uint32_t cp = *in++;
		/* From here on, `len` counts the codepoints still to be fetched */
		len--;

		unsigned int code = lookup_wchar(cp);

		if ((!code && cp) || (code > 0x927E && code < 0x10000)) {
			/* Either no mapping at all, or one which CP50222 cannot carry */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_cp50222);
			continue;
		}

		if (code >= 0xA0 && code < 0xE0) {
			/* JIS X 0201 kana; entered with Shift Out */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add(out, 0xE);
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, code - 0x80);
		} else if (code < 0x80) {
			/* ASCII; after kana, Shift In alone is enough to get there */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state == JISX_0201_KANA) {
				out = mb_convert_buf_add(out, 0xF);
				buf->state = ASCII;
			} else if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, code);
		} else if (code >= 0x10000) {
			/* JIS X 0201 Latin, 'tagged' by lookup_wchar with 0x10000.
			 * Room for Shift In + escape sequence + one byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state == JISX_0201_KANA) {
				out = mb_convert_buf_add(out, 0xF);
			}
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, code & 0x7F);
		} else {
			/* Everything left is a two-byte JIS X 0208 code (<= 0x927E).
			 * Room for Shift In + escape sequence + two bytes */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
			if (buf->state == JISX_0201_KANA) {
				out = mb_convert_buf_add(out, 0xF);
			}
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
		}
	}

	if (end) {
		/* Leave the output in ASCII mode */
		if (buf->state == JISX_0201_KANA) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
			out = mb_convert_buf_add(out, 0xF);
		} else if (buf->state != ASCII) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
			out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1253