1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
25  * The source code included in this file was separated from mbfilter_ja.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 #include "mbfilter.h"
31 #include "mbfilter_jis.h"
32 
33 #include "unicode_table_cp932_ext.h"
34 #include "unicode_table_jis.h"
35 
36 static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter);
37 static size_t mb_iso2022jp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
38 static void mb_wchar_to_iso2022jp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
39 static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
40 static bool mb_check_iso2022jp(unsigned char *in, size_t in_len);
41 static bool mb_check_jis(unsigned char *in, size_t in_len);
42 
43 const mbfl_encoding mbfl_encoding_jis = {
44 	mbfl_no_encoding_jis,
45 	"JIS",
46 	"ISO-2022-JP",
47 	NULL,
48 	NULL,
49 	MBFL_ENCTYPE_GL_UNSAFE,
50 	&vtbl_jis_wchar,
51 	&vtbl_wchar_jis,
52 	mb_iso2022jp_to_wchar,
53 	mb_wchar_to_jis,
54 	mb_check_jis
55 };
56 
57 const mbfl_encoding mbfl_encoding_2022jp = {
58 	mbfl_no_encoding_2022jp,
59 	"ISO-2022-JP",
60 	"ISO-2022-JP",
61 	NULL,
62 	NULL,
63 	MBFL_ENCTYPE_GL_UNSAFE,
64 	&vtbl_2022jp_wchar,
65 	&vtbl_wchar_2022jp,
66 	mb_iso2022jp_to_wchar,
67 	mb_wchar_to_iso2022jp,
68 	mb_check_iso2022jp
69 };
70 
71 const struct mbfl_convert_vtbl vtbl_jis_wchar = {
72 	mbfl_no_encoding_jis,
73 	mbfl_no_encoding_wchar,
74 	mbfl_filt_conv_common_ctor,
75 	NULL,
76 	mbfl_filt_conv_jis_wchar,
77 	mbfl_filt_conv_jis_wchar_flush,
78 	NULL,
79 };
80 
81 const struct mbfl_convert_vtbl vtbl_wchar_jis = {
82 	mbfl_no_encoding_wchar,
83 	mbfl_no_encoding_jis,
84 	mbfl_filt_conv_common_ctor,
85 	NULL,
86 	mbfl_filt_conv_wchar_jis,
87 	mbfl_filt_conv_any_jis_flush,
88 	NULL,
89 };
90 
91 const struct mbfl_convert_vtbl vtbl_2022jp_wchar = {
92 	mbfl_no_encoding_2022jp,
93 	mbfl_no_encoding_wchar,
94 	mbfl_filt_conv_common_ctor,
95 	NULL,
96 	mbfl_filt_conv_jis_wchar,
97 	mbfl_filt_conv_jis_wchar_flush,
98 	NULL,
99 };
100 
101 const struct mbfl_convert_vtbl vtbl_wchar_2022jp = {
102 	mbfl_no_encoding_wchar,
103 	mbfl_no_encoding_2022jp,
104 	mbfl_filt_conv_common_ctor,
105 	NULL,
106 	mbfl_filt_conv_wchar_2022jp,
107 	mbfl_filt_conv_any_jis_flush,
108 	NULL,
109 };
110 
/* Evaluate `statement`; when it yields a negative value, make the
 * enclosing function return -1 right away. */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
112 
113 /*
114  * JIS => wchar
115  */
116 int
mbfl_filt_conv_jis_wchar(int c,mbfl_convert_filter * filter)117 mbfl_filt_conv_jis_wchar(int c, mbfl_convert_filter *filter)
118 {
119 	int c1, s, w;
120 
121 retry:
122 	switch (filter->status & 0xf) {
123 /*	case 0x00:	 ASCII */
124 /*	case 0x10:	 X 0201 latin */
125 /*	case 0x20:	 X 0201 kana */
126 /*	case 0x80:	 X 0208 */
127 /*	case 0x90:	 X 0212 */
128 	case 0:
129 		if (c == 0x1b) {
130 			filter->status += 2;
131 		} else if (c == 0x0e) {		/* "kana in" */
132 			filter->status = 0x20;
133 		} else if (c == 0x0f) {		/* "kana out" */
134 			filter->status = 0;
135 		} else if (filter->status == 0x10 && c == 0x5c) {	/* YEN SIGN */
136 			CK((*filter->output_function)(0xa5, filter->data));
137 		} else if (filter->status == 0x10 && c == 0x7e) {	/* OVER LINE */
138 			CK((*filter->output_function)(0x203e, filter->data));
139 		} else if (filter->status == 0x20 && c > 0x20 && c < 0x60) {		/* kana */
140 			CK((*filter->output_function)(0xff40 + c, filter->data));
141 		} else if ((filter->status == 0x80 || filter->status == 0x90) && c > 0x20 && c < 0x7f) {		/* kanji first char */
142 			filter->cache = c;
143 			filter->status += 1;
144 		} else if (c >= 0 && c < 0x80) {		/* latin, CTLs */
145 			CK((*filter->output_function)(c, filter->data));
146 		} else if (c > 0xa0 && c < 0xe0) {	/* GR kana */
147 			CK((*filter->output_function)(0xfec0 + c, filter->data));
148 		} else {
149 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
150 		}
151 		break;
152 
153 /*	case 0x81:	 X 0208 second char */
154 /*	case 0x91:	 X 0212 second char */
155 	case 1:
156 		filter->status &= ~0xf;
157 		c1 = filter->cache;
158 		if (c > 0x20 && c < 0x7f) {
159 			s = (c1 - 0x21)*94 + c - 0x21;
160 			if (filter->status == 0x80) {
161 				if (s >= 0 && s < jisx0208_ucs_table_size) {
162 					w = jisx0208_ucs_table[s];
163 				} else {
164 					w = 0;
165 				}
166 
167 				if (w <= 0) {
168 					w = MBFL_BAD_INPUT;
169 				}
170 			} else {
171 				if (s >= 0 && s < jisx0212_ucs_table_size) {
172 					w = jisx0212_ucs_table[s];
173 				} else {
174 					w = 0;
175 				}
176 
177 				if (w <= 0) {
178 					w = MBFL_BAD_INPUT;
179 				}
180 			}
181 			CK((*filter->output_function)(w, filter->data));
182 		} else {
183 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
184 		}
185 		break;
186 
187 	/* ESC */
188 /*	case 0x02:	*/
189 /*	case 0x12:	*/
190 /*	case 0x22:	*/
191 /*	case 0x82:	*/
192 /*	case 0x92:	*/
193 	case 2:
194 		if (c == 0x24) {		/* '$' */
195 			filter->status++;
196 		} else if (c == 0x28) {		/* '(' */
197 			filter->status += 3;
198 		} else {
199 			filter->status &= ~0xf;
200 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
201 			goto retry;
202 		}
203 		break;
204 
205 	/* ESC $ */
206 /*	case 0x03:	*/
207 /*	case 0x13:	*/
208 /*	case 0x23:	*/
209 /*	case 0x83:	*/
210 /*	case 0x93:	*/
211 	case 3:
212 		if (c == 0x40 || c == 0x42) {	/* '@' or 'B' */
213 			filter->status = 0x80;
214 		} else if (c == 0x28) {			/* '(' */
215 			filter->status++;
216 		} else {
217 			filter->status &= ~0xf;
218 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
219 			CK((*filter->output_function)(0x24, filter->data));
220 			goto retry;
221 		}
222 		break;
223 
224 	/* ESC $ ( */
225 /*	case 0x04:	*/
226 /*	case 0x14:	*/
227 /*	case 0x24:	*/
228 /*	case 0x84:	*/
229 /*	case 0x94:	*/
230 	case 4:
231 		if (c == 0x40 || c == 0x42) {	/* '@' or 'B' */
232 			filter->status = 0x80;
233 		} else if (c == 0x44) {			/* 'D' */
234 			filter->status = 0x90;
235 		} else {
236 			filter->status &= ~0xf;
237 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
238 			CK((*filter->output_function)(0x24, filter->data));
239 			CK((*filter->output_function)(0x28, filter->data));
240 			goto retry;
241 		}
242 		break;
243 
244 	/* ESC ( */
245 /*	case 0x05:	*/
246 /*	case 0x15:	*/
247 /*	case 0x25:	*/
248 /*	case 0x85:	*/
249 /*	case 0x95:	*/
250 	case 5:
251 		if (c == 0x42 || c == 0x48) {		/* 'B' or 'H' */
252 			filter->status = 0;
253 		} else if (c == 0x4a) {		/* 'J' */
254 			filter->status = 0x10;
255 		} else if (c == 0x49) {		/* 'I' */
256 			filter->status = 0x20;
257 		} else {
258 			filter->status &= ~0xf;
259 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
260 			CK((*filter->output_function)(0x28, filter->data));
261 			goto retry;
262 		}
263 		break;
264 
265 		EMPTY_SWITCH_DEFAULT_CASE();
266 	}
267 
268 	return 0;
269 }
270 
/*
 * End-of-input hook of the JIS / ISO-2022-JP => wchar filter.
 *
 * A non-zero low nibble in filter->status means the input stopped in the
 * middle of an escape sequence or of a 2-byte character; that is reported
 * as one MBFL_BAD_INPUT. The state is then reset and the next stage, if
 * any, is flushed.
 *
 * Returns -1 if the output function returns a negative value; otherwise the
 * result of the downstream flush function (0 when there is none). The
 * downstream result is passed on in the same way as in
 * mbfl_filt_conv_any_jis_flush, rather than being discarded.
 */
static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status & 0xF) {
		/* 2-byte (JIS X 0208 or 0212) character was truncated,
		 * or else escape sequence was truncated */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (filter->flush_function) {
		return (*filter->flush_function)(filter->data);
	}

	return 0;
}
286 
287 /*
288  * wchar => JIS
289  */
290 int
mbfl_filt_conv_wchar_jis(int c,mbfl_convert_filter * filter)291 mbfl_filt_conv_wchar_jis(int c, mbfl_convert_filter *filter)
292 {
293 	int s = 0;
294 
295 	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
296 		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
297 	} else if (c == 0x203E) { /* OVERLINE */
298 		s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
299 	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
300 		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
301 	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
302 		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
303 	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
304 		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
305 	}
306 	if (s <= 0) {
307 		if (c == 0xa5) {		/* YEN SIGN */
308 			s = 0x1005c;
309 		} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
310 			s = 0x2140;
311 		} else if (c == 0x2225) {	/* PARALLEL TO */
312 			s = 0x2142;
313 		} else if (c == 0xff0d) {	/* FULLWIDTH HYPHEN-MINUS */
314 			s = 0x215d;
315 		} else if (c == 0xffe0) {	/* FULLWIDTH CENT SIGN */
316 			s = 0x2171;
317 		} else if (c == 0xffe1) {	/* FULLWIDTH POUND SIGN */
318 			s = 0x2172;
319 		} else if (c == 0xffe2) {	/* FULLWIDTH NOT SIGN */
320 			s = 0x224c;
321 		}
322 		if (c == 0) {
323 			s = 0;
324 		} else if (s <= 0) {
325 			s = -1;
326 		}
327 	}
328 	if (s >= 0) {
329 		if (s < 0x80) { /* ASCII */
330 			if ((filter->status & 0xff00) != 0) {
331 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
332 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
333 				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
334 			}
335 			filter->status = 0;
336 			CK((*filter->output_function)(s, filter->data));
337 		} else if (s < 0x8080) { /* X 0208 */
338 			if ((filter->status & 0xff00) != 0x200) {
339 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
340 				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
341 				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
342 			}
343 			filter->status = 0x200;
344 			CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
345 			CK((*filter->output_function)(s & 0x7f, filter->data));
346 		} else if (s < 0x10000) { /* X 0212 */
347 			if ((filter->status & 0xff00) != 0x300) {
348 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
349 				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
350 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
351 				CK((*filter->output_function)(0x44, filter->data));		/* 'D' */
352 			}
353 			filter->status = 0x300;
354 			CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
355 			CK((*filter->output_function)(s & 0x7f, filter->data));
356 		} else { /* X 0201 latin */
357 			if ((filter->status & 0xff00) != 0x400) {
358 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
359 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
360 				CK((*filter->output_function)(0x4a, filter->data));		/* 'J' */
361 			}
362 			filter->status = 0x400;
363 			CK((*filter->output_function)(s & 0x7f, filter->data));
364 		}
365 	} else {
366 		CK(mbfl_filt_conv_illegal_output(c, filter));
367 	}
368 
369 	return 0;
370 }
371 
372 
373 /*
374  * wchar => ISO-2022-JP
375  */
376 int
mbfl_filt_conv_wchar_2022jp(int c,mbfl_convert_filter * filter)377 mbfl_filt_conv_wchar_2022jp(int c, mbfl_convert_filter *filter)
378 {
379 	int s;
380 
381 	s = 0;
382 	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
383 		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
384 	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
385 		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
386 	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
387 		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
388 	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
389 		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
390 	}
391 
392 	if (s <= 0) {
393 		if (c == 0xa5) {			/* YEN SIGN */
394 			s = 0x1005c;
395 		} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
396 			s = 0x2140;
397 		} else if (c == 0x2225) {	/* PARALLEL TO */
398 			s = 0x2142;
399 		} else if (c == 0xff0d) {	/* FULLWIDTH HYPHEN-MINUS */
400 			s = 0x215d;
401 		} else if (c == 0xffe0) {	/* FULLWIDTH CENT SIGN */
402 			s = 0x2171;
403 		} else if (c == 0xffe1) {	/* FULLWIDTH POUND SIGN */
404 			s = 0x2172;
405 		} else if (c == 0xffe2) {	/* FULLWIDTH NOT SIGN */
406 			s = 0x224c;
407 		}
408 		if (c == 0) {
409 			s = 0;
410 		} else if (s <= 0) {
411 			s = -1;
412 		}
413 	} else if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
414 		s = -1;
415 	}
416 	if (s >= 0) {
417 		if (s < 0x80) { /* ASCII */
418 			if ((filter->status & 0xff00) != 0) {
419 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
420 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
421 				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
422 			}
423 			filter->status = 0;
424 			CK((*filter->output_function)(s, filter->data));
425 		} else if (s < 0x10000) { /* X 0208 */
426 			if ((filter->status & 0xff00) != 0x200) {
427 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
428 				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
429 				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
430 			}
431 			filter->status = 0x200;
432 			CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
433 			CK((*filter->output_function)(s & 0x7f, filter->data));
434 		} else { /* X 0201 latin */
435 			if ((filter->status & 0xff00) != 0x400) {
436 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
437 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
438 				CK((*filter->output_function)(0x4a, filter->data));		/* 'J' */
439 			}
440 			filter->status = 0x400;
441 			CK((*filter->output_function)(s & 0x7f, filter->data));
442 		}
443 	} else {
444 		CK(mbfl_filt_conv_illegal_output(c, filter));
445 	}
446 
447 	return 0;
448 }
449 
450 int
mbfl_filt_conv_any_jis_flush(mbfl_convert_filter * filter)451 mbfl_filt_conv_any_jis_flush(mbfl_convert_filter *filter)
452 {
453 	/* back to latin */
454 	if ((filter->status & 0xff00) != 0) {
455 		CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
456 		CK((*filter->output_function)(0x28, filter->data));		/* '(' */
457 		CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
458 	}
459 	filter->status = 0;
460 
461 	if (filter->flush_function != NULL) {
462 		return (*filter->flush_function)(filter->data);
463 	}
464 
465 	return 0;
466 }
467 
/* Character-set states used by the buffer-based converters and checkers
 * below (stored in `*state` when decoding and in `buf->state` when
 * encoding). */
#define ASCII            0	/* ESC ( B */
#define JISX_0201_LATIN  1	/* ESC ( J */
#define JISX_0201_KANA   2	/* ESC ( I */
#define JISX_0208        3	/* ESC $ B */
#define JISX_0212        4	/* ESC $ ( D */
473 
mb_iso2022jp_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)474 static size_t mb_iso2022jp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
475 {
476 	ZEND_ASSERT(bufsize >= 3);
477 
478 	unsigned char *p = *in, *e = p + *in_len;
479 	uint32_t *out = buf, *limit = buf + bufsize;
480 
481 	while (p < e && out < limit) {
482 		unsigned char c = *p++;
483 
484 		if (c == 0x1B) {
485 			/* ESC seen; this is an escape sequence */
486 			if ((e - p) < 2) {
487 				*out++ = MBFL_BAD_INPUT;
488 				if (p != e && (*p == '$' || *p == '('))
489 					p++;
490 				continue;
491 			}
492 
493 			unsigned char c2 = *p++;
494 			if (c2 == '$') {
495 				unsigned char c3 = *p++;
496 				if (c3 == '@' || c3 == 'B') {
497 					*state = JISX_0208;
498 				} else if (c3 == '(') {
499 					if (p == e) {
500 						*out++ = MBFL_BAD_INPUT;
501 						break;
502 					}
503 					unsigned char c4 = *p++;
504 					if (c4 == '@' || c4 == 'B') {
505 						*state = JISX_0208;
506 					} else if (c4 == 'D') {
507 						*state = JISX_0212;
508 					} else {
509 						if ((limit - out) < 3) {
510 							p -= 4;
511 							break;
512 						}
513 						*out++ = MBFL_BAD_INPUT;
514 						*out++ = '$';
515 						*out++ = '(';
516 						p--;
517 					}
518 				} else {
519 					if ((limit - out) < 2) {
520 						p -= 3;
521 						break;
522 					}
523 					*out++ = MBFL_BAD_INPUT;
524 					*out++ = '$';
525 					p--;
526 				}
527 			} else if (c2 == '(') {
528 				unsigned char c3 = *p++;
529 				if (c3 == 'B' || c3 == 'H') {
530 					*state = ASCII;
531 				} else if (c3 == 'J') {
532 					*state = JISX_0201_LATIN;
533 				} else if (c3 == 'I') {
534 					*state = JISX_0201_KANA;
535 				} else {
536 					if ((limit - out) < 2) {
537 						p -= 3;
538 						break;
539 					}
540 					*out++ = MBFL_BAD_INPUT;
541 					*out++ = '(';
542 					p--;
543 				}
544 			} else {
545 				*out++ = MBFL_BAD_INPUT;
546 				p--;
547 			}
548 		} else if (c == 0xE) {
549 			/* "Kana In" marker; this is just for JIS-7/8, but we also accept it for ISO-2022-JP */
550 			*state = JISX_0201_KANA;
551 		} else if (c == 0xF) {
552 			/* "Kana Out" marker */
553 			*state = ASCII;
554 		} else if (*state == JISX_0201_LATIN && c == 0x5C) { /* YEN SIGN */
555 			*out++ = 0xA5;
556 		} else if (*state == JISX_0201_LATIN && c == 0x7E) { /* OVER LINE */
557 			*out++ = 0x203E;
558 		} else if (*state == JISX_0201_KANA && c > 0x20 && c < 0x60) {
559 			*out++ = 0xFF40 + c;
560 		} else if (*state >= JISX_0208 && c > 0x20 && c < 0x7F) {
561 			if (p == e) {
562 				*out++ = MBFL_BAD_INPUT;
563 				break;
564 			}
565 			unsigned char c2 = *p++;
566 			if (c2 > 0x20 && c2 < 0x7F) {
567 				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
568 				uint32_t w = 0;
569 				if (*state == JISX_0208) {
570 					if (s < jisx0208_ucs_table_size) {
571 						w = jisx0208_ucs_table[s];
572 					}
573 					if (!w) {
574 						w = MBFL_BAD_INPUT;
575 					}
576 				} else {
577 					if (s < jisx0212_ucs_table_size) {
578 						w = jisx0212_ucs_table[s];
579 					}
580 					if (!w) {
581 						w = MBFL_BAD_INPUT;
582 					}
583 				}
584 				*out++ = w;
585 			} else {
586 				*out++ = MBFL_BAD_INPUT;
587 			}
588 		} else if (c < 0x80) {
589 			*out++ = c;
590 		} else if (c >= 0xA1 && c <= 0xDF) {
591 			/* GR-invoked Kana; "GR" stands for "graphics right" and refers to bytes
592 			 * with the MSB bit (in the context of ISO-2022 encoding).
593 			 *
594 			 * In this regard, Wikipedia states:
595 			 * "Other, older variants known as JIS7 and JIS8 build directly on the 7-bit and 8-bit
596 			 * encodings defined by JIS X 0201 and allow use of JIS X 0201 kana from G1 without
597 			 * escape sequences, using Shift Out and Shift In or setting the eighth bit
598 			 * (GR-invoked), respectively."
599 			 *
600 			 * Note that we support both the 'JIS7' use of 0xE/0xF Shift In/Shift Out codes
601 			 * and the 'JIS8' use of GR-invoked Kana */
602 			*out++ = 0xFEC0 + c;
603 		} else {
604 			*out++ = MBFL_BAD_INPUT;
605 		}
606 	}
607 
608 	*in_len = e - p;
609 	*in = p;
610 	return out - buf;
611 }
612 
/*
 * Buffer-based encoder: wchar => ISO-2022-JP.
 *
 * Converts `len` code points from `in` and appends the bytes to `buf`,
 * inserting an escape sequence whenever the character set recorded in
 * buf->state has to change. Table values in 0x80..0x2120 or above 0x8080
 * and code points without any mapping are handed to MB_CONVERT_ERROR.
 * When `end` is set and the output is not in ASCII, ESC ( B is appended.
 */
static void mb_wchar_to_iso2022jp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int jis = 0;

		/* Table lookup by code point range */
		if (cp >= ucs_a1_jis_table_min && cp < ucs_a1_jis_table_max) {
			jis = ucs_a1_jis_table[cp - ucs_a1_jis_table_min];
		} else if (cp >= ucs_a2_jis_table_min && cp < ucs_a2_jis_table_max) {
			jis = ucs_a2_jis_table[cp - ucs_a2_jis_table_min];
		} else if (cp >= ucs_i_jis_table_min && cp < ucs_i_jis_table_max) {
			jis = ucs_i_jis_table[cp - ucs_i_jis_table_min];
		} else if (cp >= ucs_r_jis_table_min && cp < ucs_r_jis_table_max) {
			jis = ucs_r_jis_table[cp - ucs_r_jis_table_min];
		}

		if (jis == 0) {
			/* No table entry: a few code points are mapped by hand */
			switch (cp) {
			case 0xA5: /* YEN SIGN */
				jis = 0x1005C;
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				jis = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				jis = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				jis = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				jis = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				jis = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				jis = 0x224C;
				break;
			case 0:
				/* NUL is emitted as the ASCII byte 0 */
				break;
			default:
				MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_iso2022jp);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
		} else if ((jis >= 0x80 && jis < 0x2121) || (jis > 0x8080)) {
			/* Table value outside what this encoder accepts */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_iso2022jp);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			continue;
		}

		if (jis < 0x80) { /* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, jis);
		} else if (jis < 0x8080) { /* JIS X 0208 */
			if (buf->state != JISX_0208) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (jis >> 8) & 0x7F, jis & 0x7F);
		} else if (jis < 0x10000) { /* JIS X 0212 */
			if (buf->state != JISX_0212) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'D');
				buf->state = JISX_0212;
			}
			out = mb_convert_buf_add2(out, (jis >> 8) & 0x7F, jis & 0x7F);
		} else { /* X 0201 Latin */
			if (buf->state != JISX_0201_LATIN) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, jis & 0x7F);
		}
	}

	/* Leave the output in ASCII at the very end */
	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
697 
/*
 * Buffer-based encoder: wchar => JIS.
 *
 * Converts `len` code points from `in` and appends the bytes to `buf`,
 * inserting an escape sequence whenever the character set recorded in
 * buf->state has to change. The looked-up value `s` selects the set:
 *   s <  0x80            ASCII                 (ESC ( B)
 *   0xA1 <= s <= 0xDF    JIS X 0201 Kana       (ESC ( I), byte is s & 0x7F
 *   s <  0x8080          JIS X 0208            (ESC $ B)
 *   s <  0x10000         JIS X 0212            (ESC $ ( D)
 *   otherwise            JIS X 0201 Latin      (ESC ( J), byte is s & 0x7F
 *
 * Code points without a mapping are handed to MB_CONVERT_ERROR, with this
 * function itself as the conversion callback (the ISO-2022-JP encoder
 * likewise passes itself).
 * When `end` is set and the output is not in ASCII, ESC ( B is appended.
 */
static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w == 0x203E) { /* OVERLINE */
			s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (s == 0) {
			/* No table entry: a few code points are mapped by hand */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x1005C;
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			} else if (w != 0) {
				/* Pass this encoder, not the ISO-2022-JP one, as the
				 * conversion callback */
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_jis);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
		}

		if (s < 0x80) { /* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA1 && s <= 0xDF) { /* JIS X 0201 Kana */
			if (buf->state != JISX_0201_KANA) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else if (s < 0x8080) { /* JIS X 0208 */
			if (buf->state != JISX_0208) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else if (s < 0x10000) { /* JIS X 0212 */
			if (buf->state != JISX_0212) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'D');
				buf->state = JISX_0212;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else { /* X 0201 Latin */
			if (buf->state != JISX_0201_LATIN) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		}
	}

	/* Leave the output in ASCII at the very end */
	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
787 
788 #define JISX_0201_KANA_SO 5
789 
mb_check_jis(unsigned char * in,size_t in_len)790 static bool mb_check_jis(unsigned char *in, size_t in_len)
791 {
792 	unsigned char *p = in, *e = p + in_len;
793 	unsigned int state = ASCII;
794 
795 	while (p < e) {
796 		unsigned char c = *p++;
797 		if (c == 0x1B) {
798 			/* ESC seen; this is an escape sequence */
799 			if (state == JISX_0201_KANA_SO) {
800 				return false;
801 			}
802 			if ((e - p) < 2) {
803 				return false;
804 			}
805 			unsigned char c2 = *p++;
806 			if (c2 == '$') {
807 				unsigned char c3 = *p++;
808 				if (c3 == '@' || c3 == 'B') {
809 					state = JISX_0208;
810 				} else if (c3 == '(') {
811 					if (p == e) {
812 						return false;
813 					}
814 					unsigned char c4 = *p++;
815 					if (c4 == '@' || c4 == 'B') {
816 						state = JISX_0208;
817 					} else if (c4 == 'D') {
818 						state = JISX_0212;
819 					} else {
820 						return false;
821 					}
822 				} else {
823 					return false;
824 				}
825 			} else if (c2 == '(') {
826 				unsigned char c3 = *p++;
827 				/* ESC ( H is treated as a sequence transitioning to ASCII for historical reasons.
828 				 * see https://github.com/php/php-src/pull/10828#issuecomment-1478342432. */
829 				if (c3 == 'B' || c3 == 'H') {
830 					state = ASCII;
831 				} else if (c3 == 'J') {
832 					state = JISX_0201_LATIN;
833 				} else if (c3 == 'I') {
834 					state = JISX_0201_KANA;
835 				} else {
836 					return false;
837 				}
838 			} else {
839 				return false;
840 			}
841 		} else if (c == 0xE) {
842 			/* "Kana In" marker */
843 			if (state != ASCII) {
844 				return false;
845 			}
846 			state = JISX_0201_KANA_SO;
847 		} else if (c == 0xF) {
848 			/* "Kana Out" marker */
849 			if (state != JISX_0201_KANA_SO) {
850 				return false;
851 			}
852 			state = ASCII;
853 		} else if ((state == JISX_0208 || state == JISX_0212) && (c > 0x20 && c < 0x7F)) {
854 			if (p == e) {
855 				return false;
856 			}
857 			unsigned char c2 = *p++;
858 			if (c2 > 0x20 && c2 < 0x7F) {
859 				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
860 				if (state == JISX_0208) {
861 					if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
862 						continue;
863 					}
864 				} else {
865 					if (s < jisx0212_ucs_table_size && jisx0212_ucs_table[s]) {
866 						continue;
867 					}
868 				}
869 				return false;
870 			} else {
871 				return false;
872 			}
873 		} else if (c < 0x80) {
874 			continue;
875 		} else if (c >= 0xA1 && c <= 0xDF) {
876 			/* GR-invoked Kana */
877 			continue;
878 		} else {
879 			return false;
880 		}
881 	}
882 
883 	return state == ASCII;
884 }
885 
886 
mb_check_iso2022jp(unsigned char * in,size_t in_len)887 static bool mb_check_iso2022jp(unsigned char *in, size_t in_len)
888 {
889 	unsigned char *p = in, *e = p + in_len;
890 	unsigned int state = ASCII;
891 
892 	while (p < e) {
893 		unsigned char c = *p++;
894 		if (c == 0x1B) {
895 			/* ESC seen; this is an escape sequence */
896 			if ((e - p) < 2) {
897 				return false;
898 			}
899 			unsigned char c2 = *p++;
900 			if (c2 == '$') {
901 				unsigned char c3 = *p++;
902 				if (c3 == '@' || c3 == 'B') {
903 					state = JISX_0208;
904 				} else {
905 					return false;
906 				}
907 			} else if (c2 == '(') {
908 				unsigned char c3 = *p++;
909 				if (c3 == 'B') {
910 					state = ASCII;
911 				} else if (c3 == 'J') {
912 					state = JISX_0201_LATIN;
913 				} else {
914 					return false;
915 				}
916 			} else {
917 				return false;
918 			}
919 		} else if (c == 0xE || c == 0xF) {
920 			/* "Kana In" or "Kana Out" marker; ISO-2022-JP is not accepted. */
921 			return false;
922 		} else if (state == JISX_0208 && (c > 0x20 && c < 0x7F)) {
923 			if (p == e) {
924 				return false;
925 			}
926 			unsigned char c2 = *p++;
927 			if (c2 > 0x20 && c2 < 0x7F) {
928 				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
929 				if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
930 					continue;
931 				}
932 				return false;
933 			} else {
934 				return false;
935 			}
936 		} else if (c < 0x80) {
937 			continue;
938 		} else {
939 			return false;
940 		}
941 	}
942 
943 	return state == ASCII;
944 }
945