1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
 * The source code included in this file was separated from mbfilter_sjis.c
26  * by rui hirokawa <hirokawa@php.net> on 15 aug 2011.
27  */
28 
29 /* Although the specification for Shift-JIS-2004 indicates that 0x5C and
30  * 0x7E should (respectively) represent a Yen sign and an overbar, feedback
31  * from Japanese PHP users indicates that they prefer 0x5C and 0x7E to be
32  * treated as equivalent to U+005C and U+007E. This is the historical
33  * behavior of mbstring, and promotes compatibility with other software
34  * which handles Shift-JIS and Shift-JIS-2004 text in this way. */
35 
36 #include "mbfilter.h"
37 #include "mbfilter_sjis_2004.h"
38 #include "mbfilter_euc_jp_2004.h"
39 #include "mbfilter_iso2022jp_2004.h"
40 
41 #include "unicode_table_jis2004.h"
42 #include "unicode_table_jis.h"
43 
44 extern const unsigned char mblen_table_sjis_mobile[];
45 extern const unsigned char mblen_table_eucjp[];
46 
47 static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
48 static void mb_wchar_to_sjis2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
49 static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
50 static void mb_wchar_to_eucjp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
51 static size_t mb_iso2022jp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
52 static void mb_wchar_to_iso2022jp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
53 
54 extern int mbfl_bisec_srch(int w, const unsigned short *tbl, int n);
55 extern int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n);
56 
57 static const char *mbfl_encoding_sjis2004_aliases[] = {"SJIS2004","Shift_JIS-2004", NULL};
58 static const char *mbfl_encoding_eucjp2004_aliases[] = {"EUC_JP-2004", NULL};
59 
60 const mbfl_encoding mbfl_encoding_sjis2004 = {
61 	mbfl_no_encoding_sjis2004,
62 	"SJIS-2004",
63 	"Shift_JIS",
64 	mbfl_encoding_sjis2004_aliases,
65 	mblen_table_sjis_mobile, /* Leading byte values used for SJIS-2004 are the same as mobile SJIS variants */
66 	MBFL_ENCTYPE_GL_UNSAFE,
67 	&vtbl_sjis2004_wchar,
68 	&vtbl_wchar_sjis2004,
69 	mb_sjis2004_to_wchar,
70 	mb_wchar_to_sjis2004,
71 	NULL
72 };
73 
74 const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = {
75 	mbfl_no_encoding_sjis2004,
76 	mbfl_no_encoding_wchar,
77 	mbfl_filt_conv_common_ctor,
78 	NULL,
79 	mbfl_filt_conv_jis2004_wchar,
80 	mbfl_filt_conv_jis2004_wchar_flush,
81 	NULL,
82 };
83 
84 const struct mbfl_convert_vtbl vtbl_wchar_sjis2004 = {
85 	mbfl_no_encoding_wchar,
86 	mbfl_no_encoding_sjis2004,
87 	mbfl_filt_conv_common_ctor,
88 	NULL,
89 	mbfl_filt_conv_wchar_jis2004,
90 	mbfl_filt_conv_wchar_jis2004_flush,
91 	NULL,
92 };
93 
94 const mbfl_encoding mbfl_encoding_eucjp2004 = {
95 	mbfl_no_encoding_eucjp2004,
96 	"EUC-JP-2004",
97 	"EUC-JP",
98 	mbfl_encoding_eucjp2004_aliases,
99 	mblen_table_eucjp,
100 	0,
101 	&vtbl_eucjp2004_wchar,
102 	&vtbl_wchar_eucjp2004,
103 	mb_eucjp2004_to_wchar,
104 	mb_wchar_to_eucjp2004,
105 	NULL
106 };
107 
108 const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = {
109 	mbfl_no_encoding_eucjp2004,
110 	mbfl_no_encoding_wchar,
111 	mbfl_filt_conv_common_ctor,
112 	NULL,
113 	mbfl_filt_conv_jis2004_wchar,
114 	mbfl_filt_conv_jis2004_wchar_flush,
115 	NULL,
116 };
117 
118 const struct mbfl_convert_vtbl vtbl_wchar_eucjp2004 = {
119 	mbfl_no_encoding_wchar,
120 	mbfl_no_encoding_eucjp2004,
121 	mbfl_filt_conv_common_ctor,
122 	NULL,
123 	mbfl_filt_conv_wchar_jis2004,
124 	mbfl_filt_conv_wchar_jis2004_flush,
125 	NULL,
126 };
127 
128 const mbfl_encoding mbfl_encoding_2022jp_2004 = {
129 	mbfl_no_encoding_2022jp_2004,
130 	"ISO-2022-JP-2004",
131 	"ISO-2022-JP-2004",
132 	NULL,
133 	NULL,
134 	MBFL_ENCTYPE_GL_UNSAFE,
135 	&vtbl_2022jp_2004_wchar,
136 	&vtbl_wchar_2022jp_2004,
137 	mb_iso2022jp2004_to_wchar,
138 	mb_wchar_to_iso2022jp2004,
139 	NULL
140 };
141 
142 const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = {
143 	mbfl_no_encoding_2022jp_2004,
144 	mbfl_no_encoding_wchar,
145 	mbfl_filt_conv_common_ctor,
146 	NULL,
147 	mbfl_filt_conv_jis2004_wchar,
148 	mbfl_filt_conv_jis2004_wchar_flush,
149 	NULL,
150 };
151 
152 const struct mbfl_convert_vtbl vtbl_wchar_2022jp_2004 = {
153 	mbfl_no_encoding_wchar,
154 	mbfl_no_encoding_2022jp_2004,
155 	mbfl_filt_conv_common_ctor,
156 	NULL,
157 	mbfl_filt_conv_wchar_jis2004,
158 	mbfl_filt_conv_wchar_jis2004_flush,
159 	NULL,
160 };
161 
/* Evaluate `statement` once; if it yields a negative value, make the
 * enclosing function return -1 immediately. */
#define CK(statement)	\
	do {	\
		if ((statement) < 0) {	\
			return (-1);	\
		}	\
	} while (0)
163 
/* Convert a two-byte JIS code (c1 = first byte, c2 = second byte) into the
 * corresponding Shift_JIS byte pair, stored in s1 and s2.
 * s1 and s2 must be assignable lvalues; c1 and c2 are read several times. */
#define SJIS_ENCODE(c1,c2,s1,s2)	\
		do {	\
			(s1) = (c1);	\
			(s1) = ((s1) - 1) >> 1;	\
			(s1) += ((c1) < 0x5f) ? 0x71 : 0xb1;	\
			(s2) = (c2);	\
			if (((c1) & 1) == 0) {	\
				(s2) += 0x7e;	\
			} else {	\
				(s2) += ((c2) < 0x60) ? 0x1f : 0x20;	\
			}	\
		} while (0)
184 
/* Convert a Shift_JIS byte pair (c1 = lead byte, c2 = trail byte) into the
 * corresponding two-byte JIS code, stored in s1 (first byte) and s2 (second
 * byte). s1 and s2 must be assignable lvalues. */
#define SJIS_DECODE(c1,c2,s1,s2)	\
		do {	\
			(s1) = (c1);	\
			(s1) -= ((s1) < 0xa0) ? 0x81 : 0xc1;	\
			(s1) = ((s1) << 1) + 0x21;	\
			(s2) = (c2);	\
			if ((s2) >= 0x9f) {	\
				(s1)++;	\
				(s2) -= 0x7e;	\
			} else {	\
				(s2) -= ((s2) < 0x7f) ? 0x1f : 0x20;	\
			}	\
		} while (0)
206 
/*
 * Legacy byte-at-a-time decoder shared by Shift_JIS-2004, EUC-JP-2004 and
 * ISO-2022-JP-2004. The source encoding is chosen at run time from
 * filter->from->no_encoding.
 *
 * c      - next input byte
 * filter - conversion filter; filter->status carries the decoder state and
 *          filter->cache the pending first byte of a multi-byte character
 *
 * Returns 0 on success, or -1 as soon as the output function reports failure.
 *
 * Layout of filter->status:
 *   low nibble  - 0: idle
 *                 1: expecting second byte of a plane 1 character
 *                 2: EUC-JP-2004, after 0x8E (kana)
 *                 3: EUC-JP-2004, after 0x8F (plane 2 first byte expected)
 *                 4: expecting second byte of a plane 2 character
 *                 5: ISO-2022-JP-2004, expecting second byte of a JIS X 0208 character
 *                 6..9: ISO-2022-JP-2004, inside an escape sequence
 *   high nibble - ISO-2022-JP-2004 charset in effect:
 *                 0x00 ASCII, 0x80 JIS X 0208, 0x90 JIS X 0213 plane 1,
 *                 0xa0 JIS X 0213 plane 2
 */
int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter)
{
	int k;
	int c1, c2, s, s1 = 0, s2 = 0, w = 0, w1;

	switch (filter->status & 0xf) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004 ||
				filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
				/* For Shift_JIS-2004, 0x5C and 0x7E are intentionally passed
				 * through as U+005C and U+007E rather than being mapped to a
				 * Yen sign and an overbar (see the note at the top of this file) */
				CK((*filter->output_function)(c, filter->data));
			} else { /* ISO-2022-JP-2004 */
				if (c == 0x1b) {
					filter->status += 6;
				} else if ((filter->status == 0x80 || filter->status == 0x90 || filter->status == 0xa0)
				   && c > 0x20 && c < 0x7f) { /* kanji first char */
					filter->cache = c;
					if (filter->status == 0x90) {
						filter->status += 1; /* JIS X 0213 plane 1 */
					} else if (filter->status == 0xa0) {
						filter->status += 4; /* JIS X 0213 plane 2 */
					} else {
						filter->status += 5; /* JIS X 0208 */
					}
				} else {
					CK((*filter->output_function)(c, filter->data));
				}
			}
		} else {
			if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
				if (c > 0xa0 && c < 0xff) { /* X 0213 plane 1 first char */
					filter->status = 1;
					filter->cache = c;
				} else if (c == 0x8e) { /* kana first char */
					filter->cache = 0x8E; /* So error will be reported if input is truncated right here */
					filter->status = 2;
				} else if (c == 0x8f) { /* X 0213 plane 2 first char */
					filter->status = 3;
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				}
			} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
				if (c > 0xa0 && c < 0xe0) { /* kana */
					CK((*filter->output_function)(0xfec0 + c, filter->data));
				} else if (c > 0x80 && c < 0xfd && c != 0xa0) {	/* kanji first char */
					filter->status = 1;
					filter->cache = c;
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				}
			} else {
				/* ISO-2022-JP-2004 is a 7-bit encoding; anything else is invalid */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;

	case 1: /* kanji second char */
		filter->status &= ~0xf;
		c1 = filter->cache;

		/* Normalize the byte pair to a 7-bit JIS code (s1, s2) */
		if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
			if (c > 0xa0 && c < 0xff) {
				s1 = c1 - 0x80;
				s2 = c - 0x80;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
			if (c >= 0x40 && c <= 0xfc && c != 0x7f) {
				SJIS_DECODE(c1, c, s1, s2);
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		} else { /* ISO-2022-JP-2004 */
			if (c >= 0x21 && c <= 0x7E) {
				s1 = c1;
				s2 = c;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		}
		w1 = (s1 << 8) | s2;

		/* conversion for combining characters: these JIS codes decode to a
		 * pair of codepoints; the first is emitted here, the second below */
		if ((w1 >= 0x2477 && w1 <= 0x247B) ||
			(w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 ||
			(w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
			k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
			if (k >= 0) {
				w = jisx0213_u2_tbl[2*k];
				CK((*filter->output_function)(w, filter->data));
				w = jisx0213_u2_tbl[2*k+1];
			}
		}

		/* conversion for BMP  */
		if (w <= 0) {
			w1 = (s1 - 0x21)*94 + s2 - 0x21;
			if (w1 >= 0 && w1 < jisx0213_ucs_table_size) {
				w = jisx0213_ucs_table[w1];
			}
		}

		/* conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
		if (w <= 0) {
			w1 = (s1 << 8) | s2;
			k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				w = jisx0213_jis_u5_tbl[k] + 0x20000;
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2: /* got 0x8e: EUC-JP-2004 kana */
		filter->status = 0;
		if (c > 0xa0 && c < 0xe0) {
			w = 0xfec0 + c;
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 3: /* X 0213 plane 2 first char: EUC-JP-2004 (0x8f) */
		if (c == 0xA1 || (c >= 0xA3 && c <= 0xA5) || c == 0xA8 || (c >= 0xAC && c <= 0xAF) || (c >= 0xEE && c <= 0xFE)) {
			filter->cache = c - 0x80;
			filter->status++;
		} else {
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 4: /* X 0213 plane 2 second char: EUC-JP-2004, ISO-2022-JP-2004 */
		filter->status &= ~0xF;
		c1 = filter->cache;
		if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
			c2 = c - 0x80;
		} else {
			c2 = c;
		}

		if (c2 < 0x21 || c2 > 0x7E) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		/* zero-based row and column */
		s1 = c1 - 0x21;
		s2 = c2 - 0x21;

		if (((s1 >= 0 && s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) ||
			(s1 >= 77 && s1 < 94)) && s2 >= 0 && s2 < 94) {
			/* calc offset from ku */
			for (k = 0; k < jisx0213_p2_ofst_len; k++) {
				if (s1 == jisx0213_p2_ofst[k]) {
					break;
				}
			}
			k -= jisx0213_p2_ofst[k];

			/* check for japanese chars in BMP */
			s = (s1 + 94 + k)*94 + s2;
			ZEND_ASSERT(s < jisx0213_ucs_table_size);
			w = jisx0213_ucs_table[s];

			/* check for japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
			if (w <= 0) {
				w1 = ((c1 + k + 94) << 8) | c2;
				k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
				if (k >= 0) {
					w = jisx0213_jis_u5_tbl[k] + 0x20000;
				}
			}

			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}

			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 5: /* X 0208: ISO-2022-JP-2004 */
		filter->status &= ~0xf;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7f) {
			s = (c1 - 0x21)*94 + c - 0x21;
			if (s >= 0 && s < jisx0208_ucs_table_size) {
				w = jisx0208_ucs_table[s];
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}

		CK((*filter->output_function)(w, filter->data));
		break;

	/* ESC: ISO-2022-JP-2004 (reached from any charset state) */
	case 6:
		if (c == '$') {
			filter->status++;
		} else if (c == '(') {
			filter->status += 3;
		} else {
			/* Invalid escape sequence: keep the current charset */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $: ISO-2022-JP-2004 */
	case 7:
		if (c == 'B') { /* JIS X 0208-1983 */
			filter->status = 0x80;
		} else if (c == '(') {
			filter->status++;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ (: ISO-2022-JP-2004 */
	case 8:
		if (c == 'Q') { /* JIS X 0213 plane 1 */
			filter->status = 0x90;
		} else if (c == 'P') { /* JIS X 0213 plane 2 */
			filter->status = 0xa0;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC (: ISO-2022-JP-2004 */
	case 9:
		if (c == 'B') { /* back to ASCII */
			filter->status = 0;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
497 
/*
 * Finish decoding: if the input stopped in the middle of a multi-byte
 * character or escape sequence (low nibble of filter->status non-zero),
 * emit MBFL_BAD_INPUT. The decoder state is then reset and the flush is
 * forwarded to the next stage when one is installed.
 *
 * Returns -1 if the output function fails, otherwise the result of the
 * chained flush function, or 0 when there is none.
 */
int mbfl_filt_conv_jis2004_wchar_flush(mbfl_convert_filter *filter)
{
	int pending = filter->status & 0xF;

	if (pending != 0) {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}
	return (*filter->flush_function)(filter->data);
}
511 
/*
 * Legacy codepoint-at-a-time encoder shared by Shift_JIS-2004, EUC-JP-2004
 * and ISO-2022-JP-2004. The target encoding is chosen at run time from
 * filter->to->no_encoding.
 *
 * c      - Unicode codepoint to encode
 * filter - conversion filter
 *
 * Returns 0 on success, or -1 as soon as an output call reports failure.
 *
 * Layout of filter->status:
 *   low nibble - 1 while the first codepoint of a possible two-codepoint
 *                (combining) sequence is held back; filter->cache then holds
 *                its index into jisx0213_u2_tbl
 *   0xff00     - ISO-2022-JP-2004 charset currently selected in the output:
 *                0 ASCII, 0x200 JIS X 0213 plane 1, 0x300 JIS X 0213 plane 2.
 *                The two planes need distinct values so that a switch between
 *                them emits the proper escape sequence; any non-zero value
 *                makes the next ASCII character (and the flush routine) emit
 *                ESC ( B.
 */
int mbfl_filt_conv_wchar_jis2004(int c, mbfl_convert_filter *filter)
{
	int k;
	int c1, c2, s1, s2;

retry:
	s1 = 0;
	/* check for 1st char of combining characters */
	if ((filter->status & 0xf) == 0 && (
			c == 0x00E6 ||
			(c >= 0x0254 && c <= 0x02E9) ||
			(c >= 0x304B && c <= 0x3053) ||
			(c >= 0x30AB && c <= 0x30C8) ||
			c == 0x31F7)) {
		for (k = 0; k < jisx0213_u2_tbl_len; k++) {
			if (c == jisx0213_u2_tbl[2*k]) {
				/* Hold this codepoint back until the following one is seen */
				filter->status++;
				filter->cache = k;
				return 0;
			}
		}
	}

	/* check for 2nd char of combining characters */
	if ((filter->status & 0xf) == 1 && filter->cache >= 0 && filter->cache < jisx0213_u2_tbl_len) {
		k = filter->cache;
		filter->status &= ~0xf;
		filter->cache = 0;

		c1 = jisx0213_u2_tbl[2*k];
		/* These bases have two table entries; U+0301 selects the second */
		if ((c1 == 0x0254 || c1 == 0x028C || c1 == 0x0259 || c1 == 0x025A) && c == 0x0301) {
			k++;
		}
		if (c == jisx0213_u2_tbl[2*k+1]) {
			/* The pair maps to a single JIS code, emitted below */
			s1 = jisx0213_u2_key[k];
		} else { /* fallback */
			/* No pair: emit the held-back codepoint on its own, then
			 * process `c` from scratch */
			s1 = jisx0213_u2_fb_tbl[k];

			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
			} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
				s2 = (s1 & 0xff) + 0x80;
				s1 = ((s1 >> 8) & 0xff) + 0x80;
			} else {
				/* Low nibble was cleared above, so only the charset bits remain */
				if (filter->status != 0x200) {
					CK((*filter->output_function)(0x1b, filter->data));
					CK((*filter->output_function)('$', filter->data));
					CK((*filter->output_function)('(', filter->data));
					CK((*filter->output_function)('Q', filter->data));
				}
				filter->status = 0x200;

				s2 = s1 & 0x7f;
				s1 = (s1 >> 8) & 0x7f;
			}

			/* Flush out cached data */
			CK((*filter->output_function)(s1, filter->data));
			CK((*filter->output_function)(s2, filter->data));
			goto retry;
		}
	}

	/* check for major japanese chars: U+4E00 - U+9FFF */
	if (s1 <= 0) {
		for (k = 0; k < uni2jis_tbl_len; k++) {
			if (c >= uni2jis_tbl_range[k][0] && c <= uni2jis_tbl_range[k][1]) {
				s1 = uni2jis_tbl[k][c-uni2jis_tbl_range[k][0]];
				break;
			}
		}
	}

	/* check for japanese chars in compressed mapping area: U+1E00 - U+4DBF */
	if (s1 <= 0 && c >= ucs_c1_jisx0213_min && c <= ucs_c1_jisx0213_max) {
		k = mbfl_bisec_srch(c, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
		if (k >= 0) {
			s1 = ucs_c1_jisx0213_ofst[k] + c - ucs_c1_jisx0213_tbl[2*k];
		}
	}

	/* check for japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
	if (s1 <= 0 && c >= jisx0213_u5_tbl_min && c <= jisx0213_u5_tbl_max) {
		k = mbfl_bisec_srch2(c - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
		if (k >= 0) {
			s1 = jisx0213_u5_jis_tbl[k];
		}
	}

	if (s1 <= 0) {
		/* CJK Compatibility Forms: U+FE30 - U+FE4F */
		if (c == 0xfe45) {
			s1 = 0x233e;
		} else if (c == 0xfe46) {
			s1 = 0x233d;
		} else if (c >= 0xf91d && c <= 0xf9dc) {
			/* CJK Compatibility Ideographs: U+F900 - U+F92A */
			k = mbfl_bisec_srch2(c, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
			if (k >= 0) {
				s1 = ucs_r2b_jisx0213_cmap_val[k];
			}
		}
	}

	/* Only U+0000 may legitimately map to 0; everything else is unmapped */
	if (s1 <= 0) {
		if (c == 0) {
			s1 = 0;
		} else {
			s1 = -1;
		}
	}

	if (s1 >= 0) {
		if (s1 < 0x80) { /* ASCII */
			if (filter->to->no_encoding == mbfl_no_encoding_2022jp_2004 && (filter->status & 0xff00)) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('B', filter->data));
			}
			filter->status = 0;
			CK((*filter->output_function)(s1, filter->data));
		} else if (s1 < 0x100) { /* latin or kana */
			if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
				CK((*filter->output_function)(0x8e, filter->data));
				CK((*filter->output_function)(s1, filter->data));
			} else if (filter->to->no_encoding == mbfl_no_encoding_sjis2004 && (s1 >= 0xA1 && s1 <= 0xDF)) {
				CK((*filter->output_function)(s1, filter->data));
			} else {
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		} else if (s1 < 0x7f00) { /* X 0213 plane 1 */
			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
			} else if  (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
				s2 = (s1 & 0xff) + 0x80;
				s1 = ((s1 >> 8) & 0xff) + 0x80;
			} else {
				if ((filter->status & 0xff00) != 0x200) {
					CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
					CK((*filter->output_function)('$', filter->data));
					CK((*filter->output_function)('(', filter->data));
					CK((*filter->output_function)('Q', filter->data));
				}
				filter->status = 0x200;
				s2 = s1 & 0xff;
				s1 = (s1 >> 8) & 0xff;
			}
			CK((*filter->output_function)(s1, filter->data));
			CK((*filter->output_function)(s2, filter->data));
		} else { /* X 0213 plane 2 */
			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
			} else {
				s2 = s1 & 0xff;
				/* Translate the internal row number back to the real plane 2 row */
				k = ((s1 >> 8) & 0xff) - 0x7f;
				if (k >= 0 && k < jisx0213_p2_ofst_len) {
					s1 = jisx0213_p2_ofst[k] + 0x21;
				}
				if  (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
					s2 |= 0x80;
					s1 |= 0x80;
					CK((*filter->output_function)(0x8f, filter->data));
				} else {
					/* Plane 2 is tracked separately from plane 1 (0x200);
					 * otherwise a plane 1 -> plane 2 (or reverse) transition
					 * would omit the escape sequence and the bytes would be
					 * decoded in the wrong plane */
					if ((filter->status & 0xff00) != 0x300) {
						CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
						CK((*filter->output_function)('$', filter->data));
						CK((*filter->output_function)('(', filter->data));
						CK((*filter->output_function)('P', filter->data));
					}
					filter->status = 0x300;
				}
			}

			CK((*filter->output_function)(s1, filter->data));
			CK((*filter->output_function)(s2, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
700 
/*
 * Finish encoding for Shift_JIS-2004, EUC-JP-2004 and ISO-2022-JP-2004.
 *
 * If the encoder is still holding back the first codepoint of a possible
 * combining sequence (low nibble of filter->status == 1, table index in
 * filter->cache), that codepoint is written out on its own using its entry
 * in jisx0213_u2_fb_tbl. For ISO-2022-JP-2004 the output is then switched
 * back to ASCII if another charset is selected. Finally the state is reset
 * and the flush is forwarded to the next stage when one is installed.
 *
 * Returns -1 if an output call fails, otherwise the result of the chained
 * flush function, or 0 when there is none.
 */
int mbfl_filt_conv_wchar_jis2004_flush(mbfl_convert_filter *filter)
{
	int k, c1, c2, s1, s2;

	k = filter->cache;
	filter->cache = 0;

	/* Only the low nibble marks a pending codepoint; the upper bits may carry
	 * the ISO-2022-JP-2004 charset state and must not hide it. The index is
	 * valid only when strictly below the table length. */
	if ((filter->status & 0xf) == 1 && k >= 0 && k < jisx0213_u2_tbl_len) {
		s1 = jisx0213_u2_fb_tbl[k];

		if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
			c1 = (s1 >> 8) & 0xff;
			c2 = s1 & 0xff;
			SJIS_ENCODE(c1, c2, s1, s2);
		} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
			s2 = (s1 & 0xff) | 0x80;
			s1 = ((s1 >> 8) & 0xff) | 0x80;
		} else {
			s2 = s1 & 0x7f;
			s1 = (s1 >> 8) & 0x7f;
			/* Select JIS X 0213 plane 1 unless it is already in effect */
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)('$', filter->data));
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('Q', filter->data));
			}
			filter->status = 0x200;
		}

		CK((*filter->output_function)(s1, filter->data));
		CK((*filter->output_function)(s2, filter->data));
	}

	/* If we had switched to a different charset, go back to ASCII mode
	 * This makes it possible to concatenate arbitrary valid strings
	 * together and get a valid string */
	if (filter->status & 0xff00) {
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}

	filter->status = 0;

	if (filter->flush_function) {
		return (*filter->flush_function)(filter->data);
	}

	return 0;
}
751 
mb_sjis2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)752 static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
753 {
754 	unsigned char *p = *in, *e = p + *in_len;
755 	uint32_t *out = buf, *limit = buf + bufsize - 1;
756 
757 	while (p < e && out < limit) {
758 		unsigned char c = *p++;
759 
760 		if (c <= 0x7F) {
761 			if (c == 0x5C) {
762 				*out++ = 0xA5;
763 			} else if (c == 0x7E) {
764 				*out++ = 0x203E;
765 			} else {
766 				*out++ = c;
767 			}
768 		} else if (c >= 0xA1 && c <= 0xDF) {
769 			*out++ = 0xFEC0 + c;
770 		} else if (c > 0x80 && c < 0xFD && c != 0xA0) {
771 			if (p == e) {
772 				*out++ = MBFL_BAD_INPUT;
773 				break;
774 			}
775 			unsigned char c2 = *p++;
776 
777 			if (c2 < 0x40 || c2 > 0xFC || c2 == 0x7F) {
778 				*out++ = MBFL_BAD_INPUT;
779 				continue;
780 			}
781 
782 			unsigned int s1, s2;
783 			SJIS_DECODE(c, c2, s1, s2);
784 			unsigned int w1 = (s1 << 8) | s2, w = 0;
785 
786 			/* Conversion for combining characters */
787 			if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
788 				int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
789 				if (k >= 0) {
790 					*out++ = jisx0213_u2_tbl[2*k];
791 					*out++ = jisx0213_u2_tbl[2*k+1];
792 					continue;
793 				}
794 			}
795 
796 			/* Conversion for BMP */
797 			w1 = (s1 - 0x21)*94 + s2 - 0x21;
798 			if (w1 < jisx0213_ucs_table_size) {
799 				w = jisx0213_ucs_table[w1];
800 			}
801 
802 			/* Conversion for CJK Unified Ideographs extension B (U+2XXXX) */
803 			if (!w) {
804 				w1 = (s1 << 8) | s2;
805 				int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
806 				if (k >= 0) {
807 					w = jisx0213_jis_u5_tbl[k] + 0x20000;
808 				}
809 			}
810 
811 			*out++ = w ? w : MBFL_BAD_INPUT;
812 		} else {
813 			*out++ = MBFL_BAD_INPUT;
814 		}
815 	}
816 
817 	 *in_len = e - p;
818 	 *in = p;
819 	 return out - buf;
820 }
821 
/*
 * Buffer-oriented Shift_JIS-2004 encoder.
 *
 * in  - codepoints to encode
 * len - number of codepoints in `in`
 * buf - output buffer; buf->state carries a codepoint held back from the
 *       previous call (the possible first half of a combining sequence)
 * end - true when no further input will follow
 *
 * Codepoints with no mapping are reported through MB_CONVERT_ERROR.
 */
static void mb_wchar_to_sjis2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;
	if (buf->state) {
		/* Resume with the codepoint left pending by the previous call */
		w = buf->state;
		buf->state = 0;
		goto process_codepoint;
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		unsigned int jis = 0;

		/* Possible first codepoint of a two-codepoint (combining) sequence? */
		bool may_combine = w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7;

		if (may_combine) {
			for (int idx = 0; idx < jisx0213_u2_tbl_len; idx++) {
				if (w != jisx0213_u2_tbl[2*idx]) {
					continue;
				}

				if (len) {
					/* Peek at the following codepoint; consume it only on a match */
					uint32_t next = *in;
					if (next == 0x301 && (w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A)) {
						idx++;
					}
					if (next == jisx0213_u2_tbl[2*idx+1]) {
						in++;
						len--;
						jis = jisx0213_u2_key[idx];
						break;
					}
				} else if (!end) {
					/* More input may follow: remember this codepoint for the next call */
					buf->state = w;
					MB_CONVERT_BUF_STORE(buf, out, limit);
					return;
				}

				/* Fallback */
				jis = jisx0213_u2_fb_tbl[idx];
				break;
			}
		}

		/* Check for major Japanese chars: U+4E00-U+9FFF */
		if (!jis) {
			for (int r = 0; r < uni2jis_tbl_len; r++) {
				if (w >= uni2jis_tbl_range[r][0] && w <= uni2jis_tbl_range[r][1]) {
					jis = uni2jis_tbl[r][w - uni2jis_tbl_range[r][0]];
					break;
				}
			}
		}

		/* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */
		if (!jis && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) {
			int pos = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (pos >= 0) {
				jis = ucs_c1_jisx0213_ofst[pos] + w - ucs_c1_jisx0213_tbl[2*pos];
			}
		}

		/* Check for Japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
		if (!jis && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) {
			int pos = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (pos >= 0) {
				jis = jisx0213_u5_jis_tbl[pos];
			}
		}

		if (!jis) {
			if (w == 0xFE45) {
				/* CJK Compatibility Forms: U+FE30-U+FE4F */
				jis = 0x233E;
			} else if (w == 0xFE46) {
				jis = 0x233D;
			} else if (w >= 0xF91D && w <= 0xF9DC) {
				/* CJK Compatibility Ideographs: U+F900-U+F92A */
				int pos = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (pos >= 0) {
					jis = ucs_r2b_jisx0213_cmap_val[pos];
				}
			}
		}

		if (jis > 0xFF) {
			/* Two-byte character */
			unsigned int hi = (jis >> 8) & 0xFF, lo = jis & 0xFF, b1, b2;
			SJIS_ENCODE(hi, lo, b1, b2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, b1, b2);
		} else if (jis || !w) {
			/* Single byte (including U+0000, which maps to 0) */
			out = mb_convert_buf_add(out, jis);
		} else {
			/* No mapping found */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
924 
mb_eucjp2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)925 static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
926 {
927 	unsigned char *p = *in, *e = p + *in_len;
928 	uint32_t *out = buf, *limit = buf + bufsize - 1;
929 
930 	while (p < e && out < limit) {
931 		unsigned char c = *p++;
932 
933 		if (c <= 0x7F) {
934 			*out++ = c;
935 		} else if (c >= 0xA1 && c <= 0xFE) {
936 			/* Kanji */
937 			if (p == e) {
938 				*out++ = MBFL_BAD_INPUT;
939 				break;
940 			}
941 			unsigned char c2 = *p++;
942 			if (c2 <= 0xA0 || c2 == 0xFF) {
943 				*out++ = MBFL_BAD_INPUT;
944 				continue;
945 			}
946 
947 			unsigned int s1 = c - 0x80, s2 = c2 - 0x80;
948 			unsigned int w1 = (s1 << 8) | s2, w = 0;
949 
950 			/* Conversion for combining characters */
951 			if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
952 				int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
953 				if (k >= 0) {
954 					*out++ = jisx0213_u2_tbl[2*k];
955 					*out++ = jisx0213_u2_tbl[2*k+1];
956 					continue;
957 				}
958 			}
959 
960 			/* Conversion for BMP  */
961 			w1 = (s1 - 0x21)*94 + s2 - 0x21;
962 			if (w1 < jisx0213_ucs_table_size) {
963 				w = jisx0213_ucs_table[w1];
964 			}
965 
966 			/* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
967 			if (!w) {
968 				w1 = (s1 << 8) | s2;
969 				int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
970 				if (k >= 0) {
971 					w = jisx0213_jis_u5_tbl[k] + 0x20000;
972 				}
973 			}
974 
975 			*out++ = w ? w : MBFL_BAD_INPUT;
976 		} else if (c == 0x8E && p < e) {
977 			/* Kana */
978 			unsigned char c2 = *p++;
979 			if (c2 >= 0xA1 && c2 <= 0xDF) {
980 				*out++ = 0xFEC0 + c2;
981 			} else {
982 				*out++ = MBFL_BAD_INPUT;
983 			}
984 		} else if (c == 0x8F && p < e) {
985 			unsigned char c2 = *p++;
986 			if ((c2 == 0xA1 || (c2 >= 0xA3 && c2 <= 0xA5) || c2 == 0xA8 || (c2 >= 0xAC && c2 <= 0xAF) || (c2 >= 0xEE && c2 <= 0xFE)) && p < e) {
987 				unsigned char c3 = *p++;
988 
989 				if (c3 < 0xA1 || c3 == 0xFF) {
990 					*out++ = MBFL_BAD_INPUT;
991 					continue;
992 				}
993 
994 				unsigned int s1 = c2 - 0xA1, s2 = c3 - 0xA1;
995 
996 				if (((s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) || (s1 >= 77 && s1 < 94)) && s2 < 94) {
997 					int k;
998 					for (k = 0; k < jisx0213_p2_ofst_len; k++) {
999 						if (s1 == jisx0213_p2_ofst[k]) {
1000 							break;
1001 						}
1002 					}
1003 					k -= jisx0213_p2_ofst[k];
1004 
1005 					/* Check for Japanese chars in BMP */
1006 					unsigned int s = (s1 + 94 + k)*94 + s2;
1007 					ZEND_ASSERT(s < jisx0213_ucs_table_size);
1008 					unsigned int w = jisx0213_ucs_table[s];
1009 
1010 					/* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
1011 					if (!w) {
1012 						k = mbfl_bisec_srch2(((c2 - 0x80 + k + 94) << 8) | (c3 - 0x80), jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
1013 						if (k >= 0) {
1014 							w = jisx0213_jis_u5_tbl[k] + 0x20000;
1015 						}
1016 					}
1017 
1018 					*out++ = w ? w : MBFL_BAD_INPUT;
1019 				} else {
1020 					*out++ = MBFL_BAD_INPUT;
1021 				}
1022 			} else {
1023 				*out++ = MBFL_BAD_INPUT;
1024 			}
1025 		} else {
1026 			*out++ = MBFL_BAD_INPUT;
1027 		}
1028 	}
1029 
1030 	*in_len = e - p;
1031 	*in = p;
1032 	return out - buf;
1033 }
1034 
/* Encode Unicode codepoints as EUC-JP-2004 and append the resulting bytes
 * to `buf`.
 *
 * in  - codepoints to convert
 * len - number of codepoints available in `in`
 * buf - output buffer; `buf->state` carries a codepoint over to the next call
 *       when it may be the first half of a two-codepoint combining sequence
 *       and the input ran out before the second half could be examined
 * end - true if no more input will follow this call */
static void mb_wchar_to_eucjp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	/* A codepoint held back by the previous call is handled before any new
	 * input is read */
	uint32_t w = 0;
	bool resumed = false;
	if (buf->state) {
		w = buf->state;
		buf->state = 0;
		resumed = true;
	}

	for (;;) {
		if (resumed) {
			resumed = false;
		} else if (len) {
			len--;
			w = *in++;
		} else {
			break;
		}

		unsigned int s = 0;

		/* Could this be the 1st codepoint of a combining sequence? */
		if (w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7) {
			int k = 0;
			while (k < jisx0213_u2_tbl_len && w != jisx0213_u2_tbl[2*k]) {
				k++;
			}

			if (k < jisx0213_u2_tbl_len) {
				if (len) {
					/* Peek at the following codepoint without consuming it yet */
					uint32_t next = *in;
					if (next == 0x301 && (w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A)) {
						k++;
					}
					if (next == jisx0213_u2_tbl[2*k+1]) {
						/* Both halves match; the 2nd codepoint is consumed too */
						in++;
						len--;
						s = jisx0213_u2_key[k];
					} else {
						/* Fallback */
						s = jisx0213_u2_fb_tbl[k];
					}
				} else if (end) {
					/* Nothing can follow, so take the fallback */
					s = jisx0213_u2_fb_tbl[k];
				} else {
					/* More input may arrive; hold this codepoint until then */
					buf->state = w;
					MB_CONVERT_BUF_STORE(buf, out, limit);
					return;
				}
			}
		}

		/* Check for major Japanese chars: U+4E00-U+9FFF */
		if (!s) {
			for (int i = 0; i < uni2jis_tbl_len; i++) {
				if (w >= uni2jis_tbl_range[i][0] && w <= uni2jis_tbl_range[i][1]) {
					s = uni2jis_tbl[i][w - uni2jis_tbl_range[i][0]];
					break;
				}
			}
		}

		/* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */
		if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) {
			int pos = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (pos >= 0) {
				s = ucs_c1_jisx0213_ofst[pos] + w - ucs_c1_jisx0213_tbl[2*pos];
			}
		}

		/* Check for Japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
		if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) {
			int pos = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (pos >= 0) {
				s = jisx0213_u5_jis_tbl[pos];
			}
		}

		if (!s) {
			if (w == 0xFE45) {
				/* CJK Compatibility Forms */
				s = 0x233E;
			} else if (w == 0xFE46) {
				s = 0x233D;
			} else if (w >= 0xF91D && w <= 0xF9DC) {
				/* CJK Compatibility Ideographs; only U+F91D-U+F9DC is searched */
				int pos = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (pos >= 0) {
					s = ucs_r2b_jisx0213_cmap_val[pos];
				}
			}
		}

		if (s == 0 && w != 0) {
			/* No mapping was found */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s < 0x80) {
			out = mb_convert_buf_add(out, s);
		} else if (s < 0x100) {
			/* Written as 0x8E followed by the value itself */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, 0x8E, s);
		} else if (s < 0x7F00) {
			/* Two bytes, each with 0x80 added */
			unsigned int hi = (s >> 8) & 0xFF, lo = s & 0xFF;
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, hi + 0x80, lo + 0x80);
		} else {
			/* Three bytes: 0x8F, then a row byte taken from jisx0213_p2_ofst,
			 * then the low byte of `s`; the high bit is set on both */
			unsigned int lo = s & 0xFF;
			int k = ((s >> 8) & 0xFF) - 0x7F;
			ZEND_ASSERT(k < jisx0213_p2_ofst_len);
			unsigned int row = jisx0213_p2_ofst[k] + 0x21;
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
			out = mb_convert_buf_add3(out, 0x8F, row | 0x80, lo | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1146 
/* Character set selectors used by the ISO-2022-JP-2004 converters below.
 * The decoder stores one of these in `*state`; the encoder stores one in the
 * low byte of `buf->state`. */
#define ASCII 0
#define JISX0208 1
#define JISX0213_PLANE1 2
#define JISX0213_PLANE2 3
1151 
mb_iso2022jp2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)1152 static size_t mb_iso2022jp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
1153 {
1154 	unsigned char *p = *in, *e = p + *in_len;
1155 	uint32_t *out = buf, *limit = buf + bufsize - 1;
1156 
1157 	while (p < e && out < limit) {
1158 		unsigned char c = *p++;
1159 
1160 		if (c <= 0x7F) {
1161 			if (c == 0x1B) {
1162 				if ((e - p) < 2) {
1163 					*out++ = MBFL_BAD_INPUT;
1164 					p = e;
1165 					break;
1166 				}
1167 				unsigned char c2 = *p++;
1168 				unsigned char c3 = *p++;
1169 				if (c2 == '$') {
1170 					if (c3 == 'B') {
1171 						*state = JISX0208;
1172 					} else if (c3 == '(') {
1173 						if (p == e) {
1174 							*out++ = MBFL_BAD_INPUT;
1175 							break;
1176 						}
1177 						unsigned char c4 = *p++;
1178 						if (c4 == 'Q') {
1179 							*state = JISX0213_PLANE1;
1180 						} else if (c4 == 'P') {
1181 							*state = JISX0213_PLANE2;
1182 						} else {
1183 							*out++ = MBFL_BAD_INPUT;
1184 						}
1185 					} else {
1186 						*out++ = MBFL_BAD_INPUT;
1187 					}
1188 				} else if (c2 == '(') {
1189 					if (c3 == 'B') {
1190 						*state = ASCII;
1191 					} else {
1192 						*out++ = MBFL_BAD_INPUT;
1193 					}
1194 				} else {
1195 					p--;
1196 					*out++ = MBFL_BAD_INPUT;
1197 				}
1198 			} else if (*state >= JISX0208 && c > 0x20 && c < 0x7F) {
1199 				if (p == e) {
1200 					*out++ = MBFL_BAD_INPUT;
1201 					break;
1202 				}
1203 				unsigned char c2 = *p++;
1204 				if (c2 < 0x21 || c2 > 0x7E) {
1205 					*out++ = MBFL_BAD_INPUT;
1206 					continue;
1207 				}
1208 
1209 				if (*state == JISX0213_PLANE1) {
1210 					unsigned int w1 = (c << 8) | c2;
1211 
1212 					/* Conversion for combining characters */
1213 					if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
1214 						int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
1215 						if (k >= 0) {
1216 							*out++ = jisx0213_u2_tbl[2*k];
1217 							*out++ = jisx0213_u2_tbl[2*k+1];
1218 							continue;
1219 						}
1220 					}
1221 
1222 					/* Conversion for BMP */
1223 					uint32_t w = 0;
1224 					w1 = (c - 0x21)*94 + c2 - 0x21;
1225 					if (w1 < jisx0213_ucs_table_size) {
1226 						w = jisx0213_ucs_table[w1];
1227 					}
1228 
1229 					/* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
1230 					if (!w) {
1231 						int k = mbfl_bisec_srch2((c << 8) | c2, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
1232 						if (k >= 0) {
1233 							w = jisx0213_jis_u5_tbl[k] + 0x20000;
1234 						}
1235 					}
1236 
1237 					*out++ = w ? w : MBFL_BAD_INPUT;
1238 				} else if (*state == JISX0213_PLANE2) {
1239 
1240 					unsigned int s1 = c - 0x21, s2 = c2 - 0x21;
1241 
1242 					if (((s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) || (s1 >= 77 && s1 < 94)) && s2 < 94) {
1243 						int k;
1244 						for (k = 0; k < jisx0213_p2_ofst_len; k++) {
1245 							if (s1 == jisx0213_p2_ofst[k]) {
1246 								break;
1247 							}
1248 						}
1249 						k -= jisx0213_p2_ofst[k];
1250 
1251 						/* Check for Japanese chars in BMP */
1252 						unsigned int s = (s1 + 94 + k)*94 + s2;
1253 						ZEND_ASSERT(s < jisx0213_ucs_table_size);
1254 						uint32_t w = jisx0213_ucs_table[s];
1255 
1256 						/* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
1257 						if (!w) {
1258 							k = mbfl_bisec_srch2(((c + k + 94) << 8) | c2, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
1259 							if (k >= 0) {
1260 								w = jisx0213_jis_u5_tbl[k] + 0x20000;
1261 							}
1262 						}
1263 
1264 						*out++ = w ? w : MBFL_BAD_INPUT;
1265 					} else {
1266 						*out++ = MBFL_BAD_INPUT;
1267 					}
1268 				} else { /* state == JISX0208 */
1269 					unsigned int s = (c - 0x21)*94 + c2 - 0x21;
1270 					uint32_t w = 0;
1271 					if (s < jisx0208_ucs_table_size) {
1272 						w = jisx0208_ucs_table[s];
1273 					}
1274 					*out++ = w ? w : MBFL_BAD_INPUT;
1275 				}
1276 			} else {
1277 				*out++ = c;
1278 			}
1279 		} else {
1280 			*out++ = MBFL_BAD_INPUT;
1281 		}
1282 	}
1283 
1284 	*in_len = e - p;
1285 	*in = p;
1286 	return out - buf;
1287 }
1288 
/* Encode Unicode codepoints as ISO-2022-JP-2004 and append the resulting
 * bytes to `buf`.
 *
 * in  - codepoints to convert
 * len - number of codepoints available in `in`
 * buf - output buffer. The low byte of `buf->state` records the character set
 *       most recently selected in the output (ASCII, JISX0213_PLANE1 or
 *       JISX0213_PLANE2). When a possible first half of a combining sequence
 *       is the last codepoint of a call, its index in jisx0213_u2_tbl plus
 *       one is stored shifted left by 8 bits, so the next call can resume.
 * end - true if no more input will follow this call; the output is then
 *       switched back to ASCII if another character set is selected */
static void mb_wchar_to_iso2022jp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	/* Recover a codepoint held back by the previous call, if there is one */
	uint32_t w = 0;
	bool resumed = false;
	if (buf->state & 0xFF00) {
		int held = (buf->state >> 8) - 1;
		w = jisx0213_u2_tbl[2*held];
		buf->state &= 0xFF;
		resumed = true;
	}

	for (;;) {
		if (resumed) {
			resumed = false;
		} else if (len) {
			len--;
			w = *in++;
		} else {
			break;
		}

		unsigned int s = 0;

		/* Could this be the 1st codepoint of a combining sequence? */
		if (w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7) {
			int k = 0;
			while (k < jisx0213_u2_tbl_len && w != jisx0213_u2_tbl[2*k]) {
				k++;
			}

			if (k < jisx0213_u2_tbl_len) {
				if (len) {
					/* Peek at the following codepoint without consuming it yet */
					uint32_t next = *in;
					if (next == 0x301 && (w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A)) {
						k++;
					}
					if (next == jisx0213_u2_tbl[2*k+1]) {
						/* Both halves match; the 2nd codepoint is consumed too */
						in++;
						len--;
						s = jisx0213_u2_key[k];
					} else {
						s = jisx0213_u2_fb_tbl[k];
					}
				} else if (end) {
					/* Nothing can follow, so take the fallback */
					s = jisx0213_u2_fb_tbl[k];
				} else {
					/* More input may arrive; remember which entry matched */
					buf->state |= (k+1) << 8;
					MB_CONVERT_BUF_STORE(buf, out, limit);
					return;
				}
			}
		}

		/* Check for major Japanese chars: U+4E00-U+9FFF */
		if (!s) {
			for (int i = 0; i < uni2jis_tbl_len; i++) {
				if (w >= uni2jis_tbl_range[i][0] && w <= uni2jis_tbl_range[i][1]) {
					s = uni2jis_tbl[i][w - uni2jis_tbl_range[i][0]];
					break;
				}
			}
		}

		/* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */
		if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) {
			int pos = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (pos >= 0) {
				s = ucs_c1_jisx0213_ofst[pos] + w - ucs_c1_jisx0213_tbl[2*pos];
			}
		}

		/* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
		if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) {
			int pos = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (pos >= 0) {
				s = jisx0213_u5_jis_tbl[pos];
			}
		}

		if (!s) {
			if (w == 0xFE45) {
				/* CJK Compatibility Forms */
				s = 0x233E;
			} else if (w == 0xFE46) {
				s = 0x233D;
			} else if (w >= 0xF91D && w <= 0xF9DC) {
				/* CJK Compatibility Ideographs; only U+F91D-U+F9DC is searched */
				int pos = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (pos >= 0) {
					s = ucs_r2b_jisx0213_cmap_val[pos];
				}
			}
		}

		if (s == 0 && w != 0) {
			/* No mapping was found */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s < 0x80) {
			if (buf->state != ASCII) {
				/* Switch the output back to ASCII first */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s < 0x100) {
			/* Values 0x80-0xFF are not written by this encoder; they are
			 * reported through MB_CONVERT_ERROR instead */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s < 0x7F00) {
			if (buf->state == JISX0213_PLANE1) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			} else {
				/* Select plane 1 before writing the character */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'Q');
				buf->state = JISX0213_PLANE1;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			if (buf->state == JISX0213_PLANE2) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			} else {
				/* Select plane 2 before writing the character */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'P');
				buf->state = JISX0213_PLANE2;
			}
			/* The row byte is taken from jisx0213_p2_ofst; the 2nd byte is the
			 * low byte of `s` */
			unsigned int lo = s & 0xFF;
			int k = ((s >> 8) & 0xFF) - 0x7F;
			ZEND_ASSERT(k < jisx0213_p2_ofst_len);
			unsigned int row = jisx0213_p2_ofst[k] + 0x21;
			out = mb_convert_buf_add2(out, row, lo);
		}
	}

	/* At the very end of the output, return to ASCII if needed */
	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1421