1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file: Moriyoshi Koizumi <koizumi@gree.co.jp>
22  *
23  */
24 
25 #include "mbfilter.h"
26 #include "mbfilter_cp5022x.h"
27 #include "mbfilter_jis.h"
28 
29 #include "unicode_table_cp932_ext.h"
30 #include "unicode_table_jis.h"
31 #include "cp932_table.h"
32 #include "translit_kana_jisx0201_jisx0208.h"
33 
34 static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter);
35 static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter);
36 static int mbfl_filt_conv_wchar_cp50222_flush(mbfl_convert_filter *filter);
37 static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter);
38 static size_t mb_cp5022x_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
39 static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
40 static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
41 static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
42 
/* Alternative names which resolve to CP50220.
 *
 * mbstring used to support a dubious 'encoding' called 'cp50220raw'. It was
 * just CP50220 with an implementation that was less strict about invalid
 * characters and silently passed some of them through. That 'encoding' only
 * ever existed in mbstring; in case somebody still relies on it, minimal
 * support is retained by aliasing it to CP50220.
 *
 * mbstring also had a made-up encoding called "JIS-ms". It was the same as
 * CP5022{0,1,2}, but without their special handling of Unicode half-width
 * katakana on conversion. */
static const char *cp50220_aliases[] = {
	"cp50220raw",
	"cp50220-raw",
	"JIS-ms",
	NULL
};
53 
/* Encoding descriptor for CP50220.
 * It references the filter tables vtbl_cp50220_wchar / vtbl_wchar_cp50220, the
 * conversion functions mb_cp5022x_to_wchar / mb_wchar_to_cp50220, and the
 * legacy names in cp50220_aliases.
 * NOTE(review): the initializers are positional, so their meaning depends on
 * the field order of mbfl_encoding (declared outside this file) -- confirm
 * there before reordering or inserting entries. */
const mbfl_encoding mbfl_encoding_cp50220 = {
	mbfl_no_encoding_cp50220,
	"CP50220",
	"ISO-2022-JP",
	cp50220_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp50220_wchar,
	&vtbl_wchar_cp50220,
	mb_cp5022x_to_wchar,
	mb_wchar_to_cp50220,
	NULL
};
67 
/* Encoding descriptor for CP50221.
 * It references the filter tables vtbl_cp50221_wchar / vtbl_wchar_cp50221 and
 * the conversion functions mb_cp5022x_to_wchar / mb_wchar_to_cp50221. Unlike
 * CP50220, no alias list is given.
 * NOTE(review): the initializers are positional, so their meaning depends on
 * the field order of mbfl_encoding (declared outside this file) -- confirm
 * there before reordering or inserting entries. */
const mbfl_encoding mbfl_encoding_cp50221 = {
	mbfl_no_encoding_cp50221,
	"CP50221",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp50221_wchar,
	&vtbl_wchar_cp50221,
	mb_cp5022x_to_wchar,
	mb_wchar_to_cp50221,
	NULL
};
81 
/* Encoding descriptor for CP50222.
 * It references the filter tables vtbl_cp50222_wchar / vtbl_wchar_cp50222 and
 * the conversion functions mb_cp5022x_to_wchar / mb_wchar_to_cp50222. Unlike
 * CP50220, no alias list is given.
 * NOTE(review): the initializers are positional, so their meaning depends on
 * the field order of mbfl_encoding (declared outside this file) -- confirm
 * there before reordering or inserting entries. */
const mbfl_encoding mbfl_encoding_cp50222 = {
	mbfl_no_encoding_cp50222,
	"CP50222",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp50222_wchar,
	&vtbl_wchar_cp50222,
	mb_cp5022x_to_wchar,
	mb_wchar_to_cp50222,
	NULL
};
95 
/* Filter table for CP50220 => wchar. The filter and flush functions are the
 * same ones used by the CP50221 and CP50222 decoder tables in this file.
 * NOTE(review): entries are positional (struct mbfl_convert_vtbl is declared
 * elsewhere); the NULL entries are presumably unused optional hooks -- confirm
 * against the struct declaration. */
const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {
	mbfl_no_encoding_cp50220,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp5022x_wchar,
	mbfl_filt_conv_cp5022x_wchar_flush,
	NULL,
};
105 
/* Filter table for wchar => CP50220. CP50220 has its own filter and flush
 * functions because its filter holds one codepoint back in `filter->cache`.
 * NOTE(review): entries are positional (struct mbfl_convert_vtbl is declared
 * elsewhere); the NULL entries are presumably unused optional hooks -- confirm
 * against the struct declaration. */
const struct mbfl_convert_vtbl vtbl_wchar_cp50220 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp50220,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp50220,
	mbfl_filt_conv_wchar_cp50220_flush,
	NULL,
};
115 
/* Filter table for CP50221 => wchar. The filter and flush functions are the
 * same ones used by the CP50220 and CP50222 decoder tables in this file.
 * NOTE(review): entries are positional (struct mbfl_convert_vtbl is declared
 * elsewhere); the NULL entries are presumably unused optional hooks -- confirm
 * against the struct declaration. */
const struct mbfl_convert_vtbl vtbl_cp50221_wchar = {
	mbfl_no_encoding_cp50221,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp5022x_wchar,
	mbfl_filt_conv_cp5022x_wchar_flush,
	NULL,
};
125 
/* Filter table for wchar => CP50221. Flushing is delegated to
 * mbfl_filt_conv_any_jis_flush, which is not defined in this file.
 * NOTE(review): entries are positional (struct mbfl_convert_vtbl is declared
 * elsewhere); the NULL entries are presumably unused optional hooks -- confirm
 * against the struct declaration. */
const struct mbfl_convert_vtbl vtbl_wchar_cp50221 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp50221,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp50221,
	mbfl_filt_conv_any_jis_flush,
	NULL,
};
135 
/* Filter table for CP50222 => wchar. The filter and flush functions are the
 * same ones used by the CP50220 and CP50221 decoder tables in this file.
 * NOTE(review): entries are positional (struct mbfl_convert_vtbl is declared
 * elsewhere); the NULL entries are presumably unused optional hooks -- confirm
 * against the struct declaration. */
const struct mbfl_convert_vtbl vtbl_cp50222_wchar = {
	mbfl_no_encoding_cp50222,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp5022x_wchar,
	mbfl_filt_conv_cp5022x_wchar_flush,
	NULL,
};
145 
/* Filter table for wchar => CP50222. CP50222 has its own flush function,
 * because it may need to emit byte 0x0F to leave JIS X 0201 kana mode.
 * NOTE(review): entries are positional (struct mbfl_convert_vtbl is declared
 * elsewhere); the NULL entries are presumably unused optional hooks -- confirm
 * against the struct declaration. */
const struct mbfl_convert_vtbl vtbl_wchar_cp50222 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp50222,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp50222,
	mbfl_filt_conv_wchar_cp50222_flush,
	NULL,
};
155 
/* Evaluate `statement` once; if the result is negative, make the enclosing
 * function return -1. Usable wherever a single statement is expected. */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
157 
/* Decode one byte `c` of CP50220/CP50221/CP50222 input and pass the resulting
 * codepoints (if any) to `filter->output_function`.
 *
 * `filter->status` holds the decoder state. Its high nibble is the active
 * character set:
 *   0x00 ASCII, 0x10 JIS X 0201 latin, 0x20 JIS X 0201 kana,
 *   0x80 JIS X 0208, 0x90 JIS X 0212
 * Its low nibble is the position inside a multi-byte unit:
 *   0 nothing pending
 *   1 first byte of a 2-byte character is stored in `filter->cache`
 *   2 ESC seen, 3 ESC $ seen, 4 ESC $ ( seen, 5 ESC ( seen
 *
 * When an escape sequence turns out to be invalid, MBFL_BAD_INPUT is emitted,
 * followed by the already-consumed '$' and/or '(' bytes; the current byte is
 * then decoded again as ordinary input.
 *
 * Returns 0 on success, or -1 if the output function reported a failure. */
int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter)
{
	/* Each pass decodes `c` in the current state; `continue` means "the state
	 * was reset, decode the same byte once more" */
	for (;;) {
		switch (filter->status & 0xf) {
		case 0: {
			/* Low nibble is zero, so the whole status is the character set */
			const int charset = filter->status;
			int w;

			if (c == 0x1b) {
				/* ESC: beginning of an escape sequence */
				filter->status += 2;
				return 0;
			}
			if (c == 0x0e) {
				/* "kana in" */
				filter->status = 0x20;
				return 0;
			}
			if (c == 0x0f) {
				/* "kana out" */
				filter->status = 0;
				return 0;
			}
			if ((charset == 0x80 || charset == 0x90) && c > 0x20 && c <= 0x97) {
				/* First byte of a 2-byte character; wait for the second one */
				filter->cache = c;
				filter->status += 1;
				return 0;
			}

			if (charset == 0x10 && c == 0x5c) {
				w = 0xa5; /* YEN SIGN */
			} else if (charset == 0x10 && c == 0x7e) {
				w = 0x203e; /* OVERLINE */
			} else if (charset == 0x20 && c > 0x20 && c < 0x60) {
				w = 0xff40 + c; /* 7-bit kana */
			} else if (c >= 0 && c < 0x80) {
				w = c; /* latin, control characters */
			} else if (c > 0xa0 && c < 0xe0) {
				w = 0xfec0 + c; /* 8-bit (GR) kana */
			} else {
				w = MBFL_BAD_INPUT;
			}
			CK((*filter->output_function)(w, filter->data));
			break;
		}

		case 1: {
			/* Second byte of a JIS X 0208 (0x81) or JIS X 0212 (0x91) character */
			const int lead = filter->cache;
			int w = MBFL_BAD_INPUT;

			filter->status &= ~0xf;
			if (c > 0x20 && c < 0x7f) {
				const int idx = (lead - 0x21)*94 + c - 0x21;

				w = 0;
				if (filter->status != 0x80) {
					/* JIS X 0212 */
					if (idx >= 0 && idx < jisx0212_ucs_table_size) {
						w = jisx0212_ucs_table[idx];
					}
				} else if (idx >= cp932ext1_ucs_table_min && idx < cp932ext1_ucs_table_max) {
					w = cp932ext1_ucs_table[idx - cp932ext1_ucs_table_min];
				} else if (idx >= 0 && idx < jisx0208_ucs_table_size) {
					w = jisx0208_ucs_table[idx];
				} else if (idx >= cp932ext2_ucs_table_min && idx < cp932ext2_ucs_table_max) {
					w = cp932ext2_ucs_table[idx - cp932ext2_ucs_table_min];
				} else if (idx >= cp932ext3_ucs_table_min && idx < cp932ext3_ucs_table_max) {
					w = cp932ext3_ucs_table[idx - cp932ext3_ucs_table_min];
				} else if (idx >= 94 * 94 && idx < 114 * 94) {
					/* user-defined => PUA (Microsoft extended) */
					w = idx - 94*94 + 0xe000;
				}

				if (w <= 0) {
					/* No mapping for this pair of bytes */
					w = MBFL_BAD_INPUT;
				}
			}
			CK((*filter->output_function)(w, filter->data));
			break;
		}

		case 2:
			/* ESC */
			switch (c) {
			case 0x24: /* '$' */
				filter->status += 1;
				return 0;
			case 0x28: /* '(' */
				filter->status += 3;
				return 0;
			}
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			continue;

		case 3:
			/* ESC $ */
			switch (c) {
			case 0x40: /* '@' */
			case 0x42: /* 'B' */
				filter->status = 0x80;
				return 0;
			case 0x28: /* '(' */
				filter->status += 1;
				return 0;
			}
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x24, filter->data));
			continue;

		case 4:
			/* ESC $ ( */
			switch (c) {
			case 0x40: /* '@' */
			case 0x42: /* 'B' */
				filter->status = 0x80;
				return 0;
			case 0x44: /* 'D' */
				filter->status = 0x90;
				return 0;
			}
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x24, filter->data));
			CK((*filter->output_function)(0x28, filter->data));
			continue;

		case 5:
			/* ESC ( */
			switch (c) {
			case 0x42: /* 'B' */
			case 0x48: /* 'H' */
				filter->status = 0;
				return 0;
			case 0x4a: /* 'J' */
				filter->status = 0x10;
				return 0;
			case 0x49: /* 'I' */
				filter->status = 0x20;
				return 0;
			}
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x28, filter->data));
			continue;

			EMPTY_SWITCH_DEFAULT_CASE();
		}

		return 0;
	}
}
320 
/* Finish decoding: report input which ended in the middle of a 2-byte
 * character or an escape sequence, reset the decoder state, and then invoke
 * the flush function of the filter (if it has one).
 *
 * Returns 0 on success, or -1 if emitting the error marker failed. */
static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter)
{
	/* A non-zero low nibble means that some multi-byte unit is incomplete */
	const int pending = filter->status & 0xF;

	if (pending != 0) {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}
	(*filter->flush_function)(filter->data);
	return 0;
}
336 
337 /* Apply various transforms to input codepoint, such as converting halfwidth katakana
338  * to fullwidth katakana. `mode` is a bitfield which controls which transforms are
339  * actually performed. The bit values are defined in translit_kana_jisx0201_jisx0208.h.
340  * `mode` must not call for transforms which are inverses (i.e. which would cancel
341  * each other out).
342  *
343  * In some cases, successive input codepoints may be merged into one output codepoint.
344  * (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed
345  * and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed`
346  * will not be modified. If there is no following codepoint, `next` should be zero.
347  *
348  * Again, in some cases, one input codepoint may convert to two output codepoints.
349  * If so, the second output codepoint will be stored in `*second`.
350  *
351  * Return the resulting codepoint. If none of the requested transforms apply, return
352  * the input codepoint unchanged.
353  */
mb_convert_kana_codepoint(uint32_t c,uint32_t next,bool * consumed,uint32_t * second,unsigned int mode)354 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode)
355 {
356 	if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7D && c != '"' && c != '\'' && c != '\\') {
357 		return c + 0xFEE0;
358 	}
359 	if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
360 		return c + 0xFEE0;
361 	}
362 	if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') {
363 		return c + 0xFEE0;
364 	}
365 	if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') {
366 		return 0x3000;
367 	}
368 
369 	if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) {
370 		/* Convert Hankaku kana to Zenkaku kana
371 		 * Either all Hankaku kana (including katakana and hiragana) will be converted
372 		 * to Zenkaku katakana, or to Zenkaku hiragana */
373 		if ((mode & MBFL_HAN2ZEN_KATAKANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
374 			if (c >= 0xFF61 && c <= 0xFF9F) {
375 				int n = c - 0xFF60;
376 
377 				if (next >= 0xFF61 && next <= 0xFF9F) {
378 					if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
379 						*consumed = true;
380 						return 0x3001 + hankana2zenkana_table[n];
381 					}
382 					if (next == 0xFF9E && n == 19) {
383 						*consumed = true;
384 						return 0x30F4;
385 					}
386 					if (next == 0xFF9F && n >= 42 && n <= 46) {
387 						*consumed = true;
388 						return 0x3002 + hankana2zenkana_table[n];
389 					}
390 				}
391 
392 				return 0x3000 + hankana2zenkana_table[n];
393 			}
394 		}
395 		if ((mode & MBFL_HAN2ZEN_HIRAGANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
396 			if (c >= 0xFF61 && c <= 0xFF9F) {
397 				int n = c - 0xFF60;
398 
399 				if (next >= 0xFF61 && next <= 0xFF9F) {
400 					if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
401 						*consumed = true;
402 						return 0x3001 + hankana2zenhira_table[n];
403 					}
404 					if (next == 0xFF9F && n >= 42 && n <= 46) {
405 						*consumed = true;
406 						return 0x3002 + hankana2zenhira_table[n];
407 					}
408 				}
409 
410 				return 0x3000 + hankana2zenhira_table[n];
411 			}
412 		}
413 		if ((mode & MBFL_HAN2ZEN_KATAKANA) && c >= 0xFF61 && c <= 0xFF9F) {
414 			return 0x3000 + hankana2zenkana_table[c - 0xFF60];
415 		}
416 		if ((mode & MBFL_HAN2ZEN_HIRAGANA) && c >= 0xFF61 && c <= 0xFF9F) {
417 			return 0x3000 + hankana2zenhira_table[c - 0xFF60];
418 		}
419 	}
420 
421 	if (mode & MBFL_HAN2ZEN_SPECIAL) { /* special ascii to symbol */
422 		if (c == '\\' || c == 0xA5) { /* YEN SIGN */
423 			return 0xFFE5; /* FULLWIDTH YEN SIGN */
424 		}
425 		if (c == 0x7E || c == 0x203E) {
426 			return 0xFFE3; /* FULLWIDTH MACRON */
427 		}
428 		if (c == '\'') {
429 			return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
430 		}
431 		if (c == '"') {
432 			return 0x201D; /* RIGHT DOUBLE QUOTATION MARK */
433 		}
434 	}
435 
436 	if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE)) {
437 		/* Zenkaku to Hankaku */
438 		if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xFF01 && c <= 0xFF5D && c != 0xFF02 && c != 0xFF07 && c != 0xFF3C) {
439 			/* all except " ' \ ~ */
440 			return c - 0xFEE0;
441 		}
442 		if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xFF21 && c <= 0xFF3A) || (c >= 0xFF41 && c <= 0xFF5A))) {
443 			return c - 0xFEE0;
444 		}
445 		if ((mode & MBFL_ZEN2HAN_NUMERIC) && (c >= 0xFF10 && c <= 0xFF19)) {
446 			return c - 0xFEE0;
447 		}
448 		if ((mode & MBFL_ZEN2HAN_SPACE) && (c == 0x3000)) {
449 			return ' ';
450 		}
451 		if ((mode & MBFL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
452 			return '-';
453 		}
454 	}
455 
456 	if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) {
457 		/* Zenkaku kana to hankaku kana */
458 		if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30A1 && c <= 0x30F4) {
459 			/* Zenkaku katakana to hankaku kana */
460 			int n = c - 0x30A1;
461 			if (zenkana2hankana_table[n][1]) {
462 				*second = 0xFF00 + zenkana2hankana_table[n][1];
463 			}
464 			return 0xFF00 + zenkana2hankana_table[n][0];
465 		}
466 		if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
467 			/* Zenkaku hiragana to hankaku kana */
468 			int n = c - 0x3041;
469 			if (zenkana2hankana_table[n][1]) {
470 				*second = 0xFF00 + zenkana2hankana_table[n][1];
471 			}
472 			return 0xFF00 + zenkana2hankana_table[n][0];
473 		}
474 		if (c == 0x3001) {
475 			return 0xFF64; /* HALFWIDTH IDEOGRAPHIC COMMA */
476 		}
477 		if (c == 0x3002) {
478 			return 0xFF61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
479 		}
480 		if (c == 0x300C) {
481 			return 0xFF62; /* HALFWIDTH LEFT CORNER BRACKET */
482 		}
483 		if (c == 0x300D) {
484 			return 0xFF63; /* HALFWIDTH RIGHT CORNER BRACKET */
485 		}
486 		if (c == 0x309B) {
487 			return 0xFF9E; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
488 		}
489 		if (c == 0x309C) {
490 			return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
491 		}
492 		if (c == 0x30FC) {
493 			return 0xFF70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
494 		}
495 		if (c == 0x30FB) {
496 			return 0xFF65; /* HALFWIDTH KATAKANA MIDDLE DOT */
497 		}
498 	}
499 
500 	if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) {
501 		if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309D || c == 0x309E)) {
502 			/* Zenkaku hiragana to Zenkaku katakana */
503 			return c + 0x60;
504 		}
505 		if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30A1 && c <= 0x30F3) || c == 0x30FD || c == 0x30FE)) {
506 			/* Zenkaku katakana to Zenkaku hiragana */
507 			return c - 0x60;
508 		}
509 	}
510 
511 	if (mode & MBFL_ZEN2HAN_SPECIAL) { /* special symbol to ascii */
512 		if (c == 0xFFE5 || c == 0xFF3C) { /* FULLWIDTH YEN SIGN/FULLWIDTH REVERSE SOLIDUS */
513 			return '\\';
514 		}
515 		if (c == 0xFFE3 || c == 0x203E) { /* FULLWIDTH MACRON/OVERLINE */
516 			return '~';
517 		}
518 		if (c == 0x2018 || c == 0x2019) { /* LEFT/RIGHT SINGLE QUOTATION MARK*/
519 			return '\'';
520 		}
521 		if (c == 0x201C || c == 0x201D) { /* LEFT/RIGHT DOUBLE QUOTATION MARK */
522 			return '"';
523 		}
524 	}
525 
526 	return c;
527 }
528 
/* Filter function for wchar => CP50220.
 *
 * CP50220 converts halfwidth katakana to fullwidth katakana, and a halfwidth
 * kana followed by a sound mark may merge into a single character. Hence each
 * codepoint is held back in `filter->cache` until the following one is known,
 * and is only then converted and encoded by the CP50221 filter function.
 *
 * Since `filter->cache == 0` means "nothing is cached", a zero codepoint can
 * never be held back; it is always written to the output immediately.
 *
 * Returns 0 on success, or -1 if encoding or output failed. */
static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
{
	int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
	bool consumed = false;

	if (filter->cache) {
		int s = mb_convert_kana_codepoint(filter->cache, c, &consumed, NULL, mode);
		int ret;

		/* If `c` was merged into the cached codepoint, nothing remains to hold back */
		filter->cache = consumed ? 0 : c;
		/* Terrible hack to get CP50220 to emit error markers in the proper
		 * position, not reordering them with subsequent characters */
		filter->filter_function = mbfl_filt_conv_wchar_cp50221;
		ret = mbfl_filt_conv_wchar_cp50221(s, filter);
		/* Always undo the hack, even if encoding failed */
		filter->filter_function = mbfl_filt_conv_wchar_cp50220;
		if (ret < 0) {
			return -1;
		}
		if (c == 0 && !consumed) {
			if ((*filter->output_function)(0, filter->data) < 0) {
				return -1;
			}
		}
	} else if (c == 0) {
		/* This case has to be handled separately, since `filter->cache == 0` means
		 * no codepoint is cached */
		if ((*filter->output_function)(0, filter->data) < 0) {
			return -1;
		}
	} else {
		filter->cache = c;
	}

	return 0;
}
555 
/* Flush function for wchar => CP50220.
 *
 * Encodes the codepoint still held back in `filter->cache` (if any), then
 * hands over to mbfl_filt_conv_any_jis_flush.
 *
 * Returns -1 if encoding the cached codepoint failed; otherwise returns
 * whatever mbfl_filt_conv_any_jis_flush returns. */
static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter)
{
	int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;

	if (filter->cache) {
		/* There is no following codepoint, so `next` is zero. With `next == 0`
		 * no merge can happen and `consumed` is never written, which is why
		 * NULL may be passed for it. */
		int s = mb_convert_kana_codepoint(filter->cache, 0, NULL, NULL, mode);
		int ret;

		/* Same hack as in mbfl_filt_conv_wchar_cp50220: encode with the
		 * CP50221 filter function installed, then restore our own */
		filter->filter_function = mbfl_filt_conv_wchar_cp50221;
		ret = mbfl_filt_conv_wchar_cp50221(s, filter);
		filter->filter_function = mbfl_filt_conv_wchar_cp50220;
		filter->cache = 0;
		if (ret < 0) {
			return -1;
		}
	}

	return mbfl_filt_conv_any_jis_flush(filter);
}
570 
/* Filter function for wchar => CP50221.
 *
 * The codepoint `c` is first mapped to an internal code:
 *   below 0x80               ASCII
 *   0xA0..0xDF               JIS X 0201 kana
 *   other values <= 0x927E   two-byte code (JIS X 0208 plus extensions)
 *   0x10000 and above        JIS X 0201 latin (low 7 bits are the byte)
 * and then written out, preceded by an escape sequence if the character set
 * differs from the one recorded in `filter->status`
 * (0 ASCII, 0x200 JIS X 0208, 0x400 JIS X 0201 latin, 0x500 JIS X 0201 kana).
 * Codepoints which cannot be represented go to mbfl_filt_conv_illegal_output.
 *
 * Returns 0 on success, or -1 if any output call failed. */
int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) {
		/* OVERLINE => JIS X 0201 OVERLINE */
		code = 0x1007E;
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c <= 0xE757) {
		/* 'private'/'user' codepoints; rows of 94 starting at lead byte 0x7F */
		const int offset = c - 0xE000;
		code = ((offset / 94) + 0x7F) << 8 | ((offset % 94) + 0x21);
	}

	if (code <= 0) {
		/* Special cases which the tables did not resolve */
		switch (c) {
		case 0xa5: /* YEN SIGN */
			code = 0x1005c;
			break;
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			code = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			code = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			code = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			code = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			code = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			code = 0x224c;
			break;
		default:
			break;
		}
	}

	/* Zero means that the lookups above found nothing. A result with the high
	 * bit set in both the upper and the lower byte is not a JIS X 0208 code
	 * either. In both cases, search the extensions which Microsoft added to
	 * JIS X 0208 (to make CP932). */
	if (code == 0 || ((code & 0x8000) && (code & 0x80))) {
		const int ext1_len = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
		const int ext1_row = cp932ext1_ucs_table_min / 94;
		int i;

		code = -1;
		for (i = 0; i < ext1_len; i++) {
			if (c == cp932ext1_ucs_table[i]) {
				code = ((i / 94 + ext1_row + 0x21) << 8) + (i % 94 + 0x21);
				break;
			}
		}

		if (code < 0) {
			const int ext2_len = cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
			const int ext2_row = cp932ext2_ucs_table_min / 94;

			for (i = 0; i < ext2_len; i++) {
				if (c == cp932ext2_ucs_table[i]) {
					code = ((i / 94 + ext2_row + 0x21) << 8) + (i % 94 + 0x21);
					break;
				}
			}
		}

		if (c == 0) {
			/* NUL always encodes as itself, whatever the searches found */
			code = 0;
		} else if (code <= 0) {
			code = -1;
		}
	}

	if (code < 0) {
		/* Not representable */
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x80) {
		/* ASCII */
		if ((filter->status & 0xff00) != 0) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)(0x28, filter->data)); /* '(' */
			CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
			filter->status = 0;
		}
		CK((*filter->output_function)(code, filter->data));
		return 0;
	}

	if (code >= 0xa0 && code < 0xe0) {
		/* JIS X 0201 kana */
		if ((filter->status & 0xff00) != 0x500) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)(0x28, filter->data)); /* '(' */
			CK((*filter->output_function)(0x49, filter->data)); /* 'I' */
			filter->status = 0x500;
		}
		CK((*filter->output_function)(code - 0x80, filter->data));
		return 0;
	}

	if (code <= 0x927E) {
		/* JIS X 0208 + extensions */
		if ((filter->status & 0xff00) != 0x200) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)(0x24, filter->data)); /* '$' */
			CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
			filter->status = 0x200;
		}
		CK((*filter->output_function)((code >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(code & 0xff, filter->data));
		return 0;
	}

	if (code < 0x10000) {
		/* X0212: cannot be emitted by this encoder */
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	/* JIS X 0201 latin */
	if ((filter->status & 0xff00) != 0x400) {
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)(0x28, filter->data)); /* '(' */
		CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
	}
	filter->status = 0x400;
	CK((*filter->output_function)(code & 0x7f, filter->data));

	return 0;
}
694 
695 /*
696  * wchar => CP50222
697  */
mbfl_filt_conv_wchar_cp50222(int c,mbfl_convert_filter * filter)698 int mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter)
699 {
700 	int s = 0;
701 
702 	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
703 		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
704 	} else if (c == 0x203E) { /* OVERLINE */
705 		s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
706 	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
707 		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
708 	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
709 		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
710 	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
711 		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
712 	} else if (c >= 0xE000 && c <= 0xE757) {
713 		/* 'private'/'user' codepoints */
714 		s = c - 0xE000;
715 		s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
716 	}
717 
718 	if (s <= 0) {
719 		if (c == 0xa5) {			/* YEN SIGN */
720 			s = 0x1005c;
721 		} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
722 			s = 0x2140;
723 		} else if (c == 0x2225) {	/* PARALLEL TO */
724 			s = 0x2142;
725 		} else if (c == 0xff0d) {	/* FULLWIDTH HYPHEN-MINUS */
726 			s = 0x215d;
727 		} else if (c == 0xffe0) {	/* FULLWIDTH CENT SIGN */
728 			s = 0x2171;
729 		} else if (c == 0xffe1) {	/* FULLWIDTH POUND SIGN */
730 			s = 0x2172;
731 		} else if (c == 0xffe2) {	/* FULLWIDTH NOT SIGN */
732 			s = 0x224c;
733 		}
734 	}
735 	if (s == 0 || ((s & 0x8000) && (s & 0x80))) {
736 		int i;
737 		s = -1;
738 
739 		for (i = 0;
740 				i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
741 			const int oh = cp932ext1_ucs_table_min / 94;
742 
743 			if (c == cp932ext1_ucs_table[i]) {
744 				s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
745 				break;
746 			}
747 		}
748 
749 		if (s <= 0) {
750 			const int oh = cp932ext2_ucs_table_min / 94;
751 			const int cp932ext2_ucs_table_size =
752 					cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
753 			for (i = 0; i < cp932ext2_ucs_table_size; i++) {
754 				if (c == cp932ext2_ucs_table[i]) {
755 					s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
756 					break;
757 				}
758 			}
759 		}
760 
761 		if (c == 0) {
762 			s = 0;
763 		} else if (s <= 0) {
764 			s = -1;
765 		}
766 	}
767 
768 	if (s >= 0) {
769 		if (s < 0x80) { /* ASCII */
770 			if ((filter->status & 0xff00) == 0x500) {
771 				CK((*filter->output_function)(0x0f, filter->data));		/* SO */
772 				filter->status = 0;
773 			} else if ((filter->status & 0xff00) != 0) {
774 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
775 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
776 				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
777 				filter->status = 0;
778 			}
779 			CK((*filter->output_function)(s, filter->data));
780 		} else if (s >= 0xa0 && s < 0xe0) { /* X 0201 kana */
781 			if ((filter->status & 0xff00) != 0x500) {
782 				CK((*filter->output_function)(0x0e, filter->data));		/* SI */
783 				filter->status = 0x500;
784 			}
785 			CK((*filter->output_function)(s - 0x80, filter->data));
786 		} else if (s <= 0x927E) { /* X 0208 */
787 			if ((filter->status & 0xff00) == 0x500) {
788 				CK((*filter->output_function)(0x0f, filter->data));		/* SO */
789 				filter->status = 0;
790 			}
791 			if ((filter->status & 0xff00) != 0x200) {
792 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
793 				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
794 				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
795 				filter->status = 0x200;
796 			}
797 			CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
798 			CK((*filter->output_function)(s & 0xff, filter->data));
799 		} else if (s < 0x10000) { /* X0212 */
800 			CK(mbfl_filt_conv_illegal_output(c, filter));
801 		} else { /* X 0201 latin */
802 			if ((filter->status & 0xff00) == 0x500) {
803 				CK((*filter->output_function)(0x0f, filter->data));		/* SO */
804 				filter->status = 0;
805 			}
806 			if ((filter->status & 0xff00) != 0x400) {
807 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
808 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
809 				CK((*filter->output_function)(0x4a, filter->data));		/* 'J' */
810 			}
811 			filter->status = 0x400;
812 			CK((*filter->output_function)(s & 0x7f, filter->data));
813 		}
814 	} else {
815 		CK(mbfl_filt_conv_illegal_output(c, filter));
816 	}
817 
818 	return 0;
819 }
820 
/* Flush function for wchar => CP50222: return the output to ASCII, reset the
 * encoder state, and invoke the flush function of the filter (if it has one).
 *
 * Returns 0 on success, or -1 if an output call failed. */
static int mbfl_filt_conv_wchar_cp50222_flush(mbfl_convert_filter *filter)
{
	switch (filter->status & 0xff00) {
	case 0:
		/* Already in ASCII */
		break;
	case 0x500:
		/* In JIS X 0201 kana mode: byte 0x0F (SI) leaves it */
		CK((*filter->output_function)(0x0f, filter->data));
		break;
	default:
		/* Any other character set: select ASCII with ESC ( B */
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)(0x28, filter->data)); /* '(' */
		CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
		break;
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}
	(*filter->flush_function)(filter->data);
	return 0;
}
839 
/* Values kept in `*state` by mb_cp5022x_to_wchar, recording which character
 * set the input is currently switched to */

/* Selected by ESC ( B, ESC ( H, or byte 0x0F */
#define ASCII 0
/* Selected by ESC ( J */
#define JISX_0201_LATIN 1
/* Selected by ESC ( I, or byte 0x0E */
#define JISX_0201_KANA 2
/* Selected by ESC $ @, ESC $ B, ESC $ ( @, or ESC $ ( B */
#define JISX_0208 3
/* Selected by ESC $ ( D */
#define JISX_0212 4
845 
mb_cp5022x_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)846 static size_t mb_cp5022x_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
847 {
848 	ZEND_ASSERT(bufsize >= 3);
849 
850 	unsigned char *p = *in, *e = p + *in_len;
851 	uint32_t *out = buf, *limit = buf + bufsize;
852 
853 	while (p < e && out < limit) {
854 		unsigned char c = *p++;
855 
856 		if (c == 0x1B) {
857 			/* Escape sequence */
858 			if ((e - p) < 2) {
859 				*out++ = MBFL_BAD_INPUT;
860 				/* Duplicate error-handling behavior of legacy code */
861 				if (p < e && (*p == '(' || *p == '$'))
862 					p++;
863 				continue;
864 			}
865 			unsigned char c2 = *p++;
866 			if (c2 == '$') {
867 				unsigned char c3 = *p++;
868 				if (c3 == '@' || c3 == 'B') {
869 					*state = JISX_0208;
870 				} else if (c3 == '(') {
871 					if (p == e) {
872 						*out++ = MBFL_BAD_INPUT;
873 						break;
874 					}
875 					unsigned char c4 = *p++;
876 					if (c4 == '@' || c4 == 'B') {
877 						*state = JISX_0208;
878 					} else if (c4 == 'D') {
879 						*state = JISX_0212;
880 					} else {
881 						if ((limit - out) < 3) {
882 							p -= 4;
883 							break;
884 						}
885 						*out++ = MBFL_BAD_INPUT;
886 						*out++ = '$';
887 						*out++ = '(';
888 						p--;
889 					}
890 				} else {
891 					if ((limit - out) < 2) {
892 						p -= 3;
893 						break;
894 					}
895 					*out++ = MBFL_BAD_INPUT;
896 					*out++ = '$';
897 					p--;
898 				}
899 			} else if (c2 == '(') {
900 				unsigned char c3 = *p++;
901 				if (c3 == 'B' || c3 == 'H') {
902 					*state = ASCII;
903 				} else if (c3 == 'J') {
904 					*state = JISX_0201_LATIN;
905 				} else if (c3 == 'I') {
906 					*state = JISX_0201_KANA;
907 				} else {
908 					if ((limit - out) < 2) {
909 						p -= 3;
910 						break;
911 					}
912 					*out++ = MBFL_BAD_INPUT;
913 					*out++ = '(';
914 					p--;
915 				}
916 			} else {
917 				*out++ = MBFL_BAD_INPUT;
918 				p--;
919 			}
920 		} else if (c == 0xE) {
921 			*state = JISX_0201_KANA;
922 		} else if (c == 0xF) {
923 			*state = ASCII;
924 		} else if (*state == JISX_0201_LATIN && c == 0x5C) { /* YEN SIGN */
925 			*out++ = 0xA5;
926 		} else if (*state == JISX_0201_LATIN && c == 0x7E) { /* OVER LINE */
927 			*out++ = 0x203E;
928 		} else if (*state == JISX_0201_KANA && c > 0x20 && c < 0x60) {
929 			*out++ = 0xFF40 + c;
930 		} else if (*state >= JISX_0208 && c > 0x20 && c <= 0x97) {
931 			if (p == e) {
932 				*out++ = MBFL_BAD_INPUT;
933 				break;
934 			}
935 			unsigned char c2 = *p++;
936 			if (c2 > 0x20 && c2 < 0x7F) {
937 				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
938 				uint32_t w = 0;
939 				if (*state == JISX_0208) {
940 					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
941 						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
942 					} else if (s < jisx0208_ucs_table_size) {
943 						w = jisx0208_ucs_table[s];
944 					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
945 						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
946 					} else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
947 						w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
948 					} else if (s >= 94*94 && s < 114*94) {
949 						/* MicroSoft extension */
950 						w = s - 94*94 + 0xE000;
951 					}
952 					if (!w)
953 						w = MBFL_BAD_INPUT;
954 				} else {
955 					if (s < jisx0212_ucs_table_size) {
956 						w = jisx0212_ucs_table[s];
957 					}
958 					if (!w)
959 						w = MBFL_BAD_INPUT;
960 				}
961 				*out++ = w;
962 			} else {
963 				*out++ = MBFL_BAD_INPUT;
964 			}
965 		} else if (c < 0x80) {
966 			*out++ = c;
967 		} else if (c >= 0xA1 && c <= 0xDF) {
968 			*out++ = 0xFEC0 + c;
969 		} else {
970 			*out++ = MBFL_BAD_INPUT;
971 		}
972 	}
973 
974 	*in_len = e - p;
975 	*in = p;
976 	return out - buf;
977 }
978 
lookup_wchar(uint32_t w)979 static unsigned int lookup_wchar(uint32_t w)
980 {
981 	unsigned int s = 0;
982 
983 	if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
984 		s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
985 	} else if (w == 0x203E) { /* OVERLINE */
986 		s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
987 	} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
988 		s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
989 	} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
990 		s = ucs_i_jis_table[w - ucs_i_jis_table_min];
991 	} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
992 		s = ucs_r_jis_table[w - ucs_r_jis_table_min];
993 	} else if (w >= 0xE000 && w <= 0xE757) {
994 		/* Private Use Area codepoints */
995 		s = w - 0xE000;
996 		s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
997 	}
998 
999 	if (!s) {
1000 		if (w == 0xA5) { /* YEN SIGN */
1001 			s = 0x1005C;
1002 		} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
1003 			s = 0x2140;
1004 		} else if (w == 0x2225) { /* PARALLEL TO */
1005 			s = 0x2142;
1006 		} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
1007 			s = 0x215D;
1008 		} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
1009 			s = 0x2171;
1010 		} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
1011 			s = 0x2172;
1012 		} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
1013 			s = 0x224C;
1014 		} else if (w == 0) {
1015 			return 0;
1016 		}
1017 	}
1018 
1019 	/* Above, we do a series of lookups in `ucs_*_jis_table` to find a
1020 	 * corresponding kuten code for this Unicode codepoint
1021 	 * If we get zero, that means the codepoint is not in JIS X 0208
1022 	 * On the other hand, if we get a result with the high bits set on both
1023 	 * upper and lower bytes, that is not a code in JIS X 0208 but rather
1024 	 * in JIS X 0213
1025 	 * In either case, check if this codepoint is one of the extensions added
1026 	 * to JIS X 0208 by MicroSoft (to make CP932) */
1027 	if (!s || s >= 0x8080) {
1028 		for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
1029 			if (w == cp932ext1_ucs_table[i]) {
1030 				return (((i / 94) + (cp932ext1_ucs_table_min / 94) + 0x21) << 8) + (i % 94) + 0x21;
1031 			}
1032 		}
1033 
1034 		for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
1035 			if (w == cp932ext2_ucs_table[i]) {
1036 				return (((i / 94) + (cp932ext2_ucs_table_min / 94) + 0x21) << 8) + (i % 94) + 0x21;
1037 			}
1038 		}
1039 	}
1040 
1041 	return s;
1042 }
1043 
/*
 * Encode Unicode codepoints as CP50220.
 *
 * Each codepoint is first passed through mb_convert_kana_codepoint() with
 * MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE, which may merge it with the
 * codepoint that follows. A codepoint in U+FF61..U+FF9F which is the last
 * one of a non-final buffer cannot be converted yet, so it is parked in
 * bits 8 and up of `buf->state` and picked up again on the next call.
 *
 * in, len - codepoints to encode
 * buf     - output buffer; the low byte of `buf->state` holds the currently
 *           selected character set
 * end     - true when this is the last chunk of input; the output is then
 *           switched back to ASCII if it is in some other character set
 */
static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t cp;

	if (buf->state & 0xFFFF00) {
		/* A codepoint was parked by the previous call; handle it first */
		cp = buf->state >> 8;
		buf->state &= 0xFF;
		goto have_codepoint;
	}

	while (len--) {
		cp = *in++;
have_codepoint:

		if (!len && !end && cp >= 0xFF61 && cp <= 0xFF9F) {
			/* The codepoint it might combine with has not arrived yet;
			 * park this one until the next buffer comes in */
			buf->state |= cp << 8;
			break;
		}

		bool merged = false;
		uint32_t next = len ? *in : 0;
		cp = mb_convert_kana_codepoint(cp, next, &merged, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
		if (merged) {
			/* `next` was folded into `cp`, so skip over it */
			in++;
			len--;
		}

		unsigned int code = lookup_wchar(cp);

		if ((!code && cp) || (code > 0x927E && code < 0x10000)) {
			/* Not representable. Error output is generated via the CP50221
			 * encoder -- NOTE(review): presumably so that the replacement
			 * text is not put through the kana conversion above; confirm */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_cp50221);
			continue;
		}

		if (code < 0x80) {
			/* ASCII: up to 3 escape bytes + 1 byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, code);
		} else if (code >= 0xA0 && code < 0xE0) {
			/* JIS X 0201 Kana: up to 3 escape bytes + 1 byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, code - 0x80);
		} else if (code <= 0x927E) {
			/* JIS X 0208 Kanji: up to 3 escape bytes + 2 bytes */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
		} else {
			/* code >= 0x10000: JIS X 0201 Latin, tagged by adding 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, code & 0x7F);
		}
	}

	if (end && buf->state != ASCII) {
		/* Finish the output in ASCII */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1125 
/*
 * Encode Unicode codepoints as CP50221.
 *
 * Character sets are selected with three-byte escape sequences:
 * ESC ( B for ASCII, ESC ( I for JIS X 0201 Kana, ESC $ B for JIS X 0208
 * and ESC ( J for JIS X 0201 Latin.
 *
 * in, len - codepoints to encode
 * buf     - output buffer; `buf->state` holds the currently selected
 *           character set
 * end     - true when this is the last chunk of input; the output is then
 *           switched back to ASCII if it is in some other character set
 */
static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	for (; len--; in++) {
		uint32_t cp = *in;
		unsigned int code = lookup_wchar(cp);

		/* Either nothing was found for a non-zero codepoint, or the code
		 * lies between the two-byte range and the tagged Latin range */
		if ((code == 0 && cp != 0) || (code > 0x927E && code < 0x10000)) {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_cp50221);
			continue;
		}

		if (code < 0x80) {
			/* ASCII: up to 3 escape bytes + 1 byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, code);
		} else if (code >= 0xA0 && code < 0xE0) {
			/* JIS X 0201 Kana: up to 3 escape bytes + 1 byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, code - 0x80);
		} else if (code <= 0x927E) {
			/* JIS X 0208 Kanji: up to 3 escape bytes + 2 bytes */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
		} else {
			/* code >= 0x10000: JIS X 0201 Latin, tagged by adding 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, code & 0x7F);
		}
	}

	if (end && buf->state != ASCII) {
		/* Finish the output in ASCII */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1182 
/*
 * Encode Unicode codepoints as CP50222.
 *
 * Unlike the escape-sequence-only encoders, JIS X 0201 Kana is entered with
 * a single 0x0E byte and left with a single 0x0F byte. The other character
 * sets use ESC ( B (ASCII), ESC $ B (JIS X 0208) and ESC ( J (JIS X 0201
 * Latin).
 *
 * in, len - codepoints to encode
 * buf     - output buffer; `buf->state` holds the currently selected
 *           character set
 * end     - true when this is the last chunk of input; a pending kana shift
 *           is then closed with 0x0F, and any other non-ASCII character set
 *           is closed with ESC ( B
 */
static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int code = lookup_wchar(cp);

		/* Nothing found for a non-zero codepoint, or a code which lies
		 * between the two-byte range and the tagged Latin range */
		bool unmappable = (!code && cp) || (code > 0x927E && code < 0x10000);
		if (unmappable) {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_cp50222);
			continue;
		}

		if (code < 0x80) {
			/* ASCII: up to 3 switching bytes + 1 byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state == JISX_0201_KANA) {
				out = mb_convert_buf_add(out, 0xF);
				buf->state = ASCII;
			} else if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, code);
			continue;
		}

		if (code >= 0xA0 && code < 0xE0) {
			/* JIS X 0201 Kana: 1 shift byte + 1 byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add(out, 0xE);
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, code - 0x80);
			continue;
		}

		if (code <= 0x927E) {
			/* JIS X 0208 Kanji: 1 shift byte + 3 escape bytes + 2 bytes */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
			if (buf->state == JISX_0201_KANA) {
				/* Leave kana mode before designating the new set */
				out = mb_convert_buf_add(out, 0xF);
			}
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
			continue;
		}

		/* code >= 0x10000: JIS X 0201 Latin, tagged by adding 0x10000;
		 * 1 shift byte + 3 escape bytes + 1 byte */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
		if (buf->state == JISX_0201_KANA) {
			/* Leave kana mode before designating the new set */
			out = mb_convert_buf_add(out, 0xF);
		}
		if (buf->state != JISX_0201_LATIN) {
			out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
			buf->state = JISX_0201_LATIN;
		}
		out = mb_convert_buf_add(out, code & 0x7F);
	}

	if (end && buf->state == JISX_0201_KANA) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
		out = mb_convert_buf_add(out, 0xF);
	} else if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1253