1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
/*
 * The source code included in this file was separated from mbfilter_ja.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29 
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33 
34 #include "mbfilter.h"
35 #include "mbfilter_iso2022_jp_ms.h"
36 
37 #include "unicode_table_cp932_ext.h"
38 #include "unicode_table_jis.h"
39 #include "cp932_table.h"
40 
/*
 * Forward declaration: vtbl_identify_2022jpms below refers to this function
 * before its definition at the end of this file.
 */
int mbfl_filt_ident_2022jpms(int c, mbfl_identify_filter *filter);
42 
/* NULL-terminated alias list referenced from mbfl_encoding_2022jpms. */
static const char *mbfl_encoding_2022jpms_aliases[] = {"ISO2022JPMS", NULL};
44 
/*
 * Encoding descriptor for ISO-2022-JP-MS.
 *
 * The initializer is positional; the per-field notes below are inferred from
 * the values and should be confirmed against the mbfl_encoding declaration.
 */
const mbfl_encoding mbfl_encoding_2022jpms = {
	mbfl_no_encoding_2022jpms,	/* encoding id */
	"ISO-2022-JP-MS",		/* presumably the canonical name */
	"ISO-2022-JP",			/* presumably the MIME name */
	(const char *(*)[])&mbfl_encoding_2022jpms_aliases,	/* alias list defined above */
	NULL,
	MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_SHFTCODE | MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022jpms_wchar,		/* ISO-2022-JP-MS => wchar converter */
	&vtbl_wchar_2022jpms		/* wchar => ISO-2022-JP-MS converter */
};
55 
/*
 * Identify-filter vtable: common constructor/destructor plus the
 * ISO-2022-JP-MS specific per-byte checker defined in this file.
 */
const struct mbfl_identify_vtbl vtbl_identify_2022jpms = {
	mbfl_no_encoding_2022jpms,
	mbfl_filt_ident_common_ctor,
	mbfl_filt_ident_common_dtor,
	mbfl_filt_ident_2022jpms
};
62 
/*
 * Conversion vtable for ISO-2022-JP-MS => wchar. Uses the common
 * constructor, destructor and flush; only the per-byte function is specific.
 */
const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {
	mbfl_no_encoding_2022jpms,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	mbfl_filt_conv_common_dtor,
	mbfl_filt_conv_2022jpms_wchar,
	mbfl_filt_conv_common_flush
};
71 
/*
 * Conversion vtable for wchar => ISO-2022-JP-MS. Uses a dedicated flush
 * (mbfl_filt_conv_any_2022jpms_flush) which emits ESC ( B when the
 * encoder is not in its initial character set.
 */
const struct mbfl_convert_vtbl vtbl_wchar_2022jpms = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jpms,
	mbfl_filt_conv_common_ctor,
	mbfl_filt_conv_common_dtor,
	mbfl_filt_conv_wchar_2022jpms,
	mbfl_filt_conv_any_2022jpms_flush
};
80 
/*
 * Evaluate `statement`; if its value is negative, return -1 from the
 * enclosing function. Only usable inside functions returning int.
 */
#define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
82 
/*
 * sjistoidx(c1, c2): linear index of the Shift_JIS double-byte code c1,c2.
 * Each lead byte spans 188 trail positions. Lead bytes above 0x9f are counted
 * from 0xc1, the others from 0x81; trail bytes above 0x7e are counted from
 * 0x41 (skipping the unused 0x7f slot), the others from 0x40.
 * Both arguments are evaluated more than once.
 */
#define sjistoidx(c1, c2) \
        (((c1) - (((c1) > 0x9f) ? 0xc1 : 0x81)) * 188 \
         + (c2) - (((c2) > 0x7e) ? 0x41 : 0x40))
/* idxtojis1(c) / idxtojis2(c): first and second JIS byte for a linear
 * index counted in rows of 94 cells, both bytes starting at 0x21. */
#define idxtojis1(c) (0x21 + ((c) / 94))
#define idxtojis2(c) (0x21 + ((c) % 94))
89 
90 /*
91  * ISO-2022-JP-MS => wchar
92  */
93 int
mbfl_filt_conv_2022jpms_wchar(int c,mbfl_convert_filter * filter)94 mbfl_filt_conv_2022jpms_wchar(int c, mbfl_convert_filter *filter)
95 {
96 	int c1, s, w;
97 
98 retry:
99 	switch (filter->status & 0xf) {
100 /*	case 0x00:	 ASCII */
101 /*	case 0x10:	 X 0201 latin */
102 /*	case 0x20:	 X 0201 kana */
103 /*	case 0x80:	 X 0208 */
104 /*	case 0xa0:	 UDC */
105 	case 0:
106 		if (c == 0x1b) {
107 			filter->status += 2;
108 		} else if (filter->status == 0x20 && c > 0x20 && c < 0x60) {		/* kana */
109 			CK((*filter->output_function)(0xff40 + c, filter->data));
110 		} else if ((filter->status == 0x80 || filter->status == 0xa0) && c > 0x20 && c < 0x80) {		/* kanji first char */
111 			filter->cache = c;
112 			filter->status += 1;
113 		} else if (c >= 0 && c < 0x80) {		/* latin, CTLs */
114 			CK((*filter->output_function)(c, filter->data));
115 		} else if (c > 0xa0 && c < 0xe0) {	/* GR kana */
116 			CK((*filter->output_function)(0xfec0 + c, filter->data));
117 		} else {
118 			w = c & MBFL_WCSGROUP_MASK;
119 			w |= MBFL_WCSGROUP_THROUGH;
120 			CK((*filter->output_function)(w, filter->data));
121 		}
122 		break;
123 
124 /*	case 0x81:	 X 0208 second char */
125 /*	case 0xa1:	 UDC second char */
126 	case 1:
127 		w = 0;
128 		filter->status &= ~0xf;
129 		c1 = filter->cache;
130 		if (c > 0x20 && c < 0x7f) {
131 			s = (c1 - 0x21)*94 + c - 0x21;
132 			if (filter->status == 0x80) {
133 				if (s <= 137) {
134 					if (s == 31) {
135 						w = 0xff3c;			/* FULLWIDTH REVERSE SOLIDUS */
136 					} else if (s == 32) {
137 						w = 0xff5e;			/* FULLWIDTH TILDE */
138 					} else if (s == 33) {
139 						w = 0x2225;			/* PARALLEL TO */
140 					} else if (s == 60) {
141 						w = 0xff0d;			/* FULLWIDTH HYPHEN-MINUS */
142 					} else if (s == 80) {
143 						w = 0xffe0;			/* FULLWIDTH CENT SIGN */
144 					} else if (s == 81) {
145 						w = 0xffe1;			/* FULLWIDTH POUND SIGN */
146 					} else if (s == 137) {
147 						w = 0xffe2;			/* FULLWIDTH NOT SIGN */
148 					}
149 				}
150 				if (w == 0) {
151 					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {		/* vendor ext1 (13ku) */
152 						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
153 					} else if (s >= 0 && s < jisx0208_ucs_table_size) {
154 						w = jisx0208_ucs_table[s];
155 					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {		/* vendor ext2 (89ku - 92ku) */
156 						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
157 					} else {
158 						w = 0;
159 					}
160 				}
161 				if (w <= 0) {
162 					w = (c1 << 8) | c;
163 					w &= MBFL_WCSPLANE_MASK;
164 					w |= MBFL_WCSPLANE_JIS0208;
165 				}
166 				CK((*filter->output_function)(w, filter->data));
167 			} else {
168 				if (c1 > 0x20 && c1 < 0x35) {
169 					w = 0xe000 + (c1 - 0x21)*94 + c - 0x21;
170 				}
171 				if (w <= 0) {
172 					w = (((c1 - 0x21) + 0x7f) << 8) | c;
173 					w &= MBFL_WCSPLANE_MASK;
174 					w |= MBFL_WCSPLANE_JIS0208;
175 				}
176 				CK((*filter->output_function)(w, filter->data));
177 			}
178 		} else if (c == 0x1b) {
179 			filter->status += 2;
180 		} else if ((c >= 0 && c < 0x21) || c == 0x7f) {		/* CTLs */
181 			CK((*filter->output_function)(c, filter->data));
182 		} else {
183 			w = (c1 << 8) | c;
184 			w &= MBFL_WCSGROUP_MASK;
185 			w |= MBFL_WCSGROUP_THROUGH;
186 			CK((*filter->output_function)(w, filter->data));
187 		}
188 		break;
189 
190 	/* ESC */
191 /*	case 0x02:	*/
192 /*	case 0x12:	*/
193 /*	case 0x22:	*/
194 /*	case 0x82:	*/
195 /*	case 0xa2:	*/
196 	case 2:
197 		if (c == 0x24) {		/* '$' */
198 			filter->status++;
199 		} else if (c == 0x28) {		/* '(' */
200 			filter->status += 3;
201 		} else {
202 			filter->status &= ~0xf;
203 			CK((*filter->output_function)(0x1b, filter->data));
204 			goto retry;
205 		}
206 		break;
207 
208 	/* ESC $ */
209 /*	case 0x03:	*/
210 /*	case 0x13:	*/
211 /*	case 0x23:	*/
212 /*	case 0x83:	*/
213 /*	case 0xa3:	*/
214 	case 3:
215 		if (c == 0x40 || c == 0x42) {	/* '@' or 'B' */
216 			filter->status = 0x80;
217 		} else if (c == 0x28) {     /* '(' */
218 			filter->status++;
219 		} else {
220 			filter->status &= ~0xf;
221 			CK((*filter->output_function)(0x1b, filter->data));
222 			CK((*filter->output_function)(0x24, filter->data));
223 			goto retry;
224 		}
225 		break;
226 
227 	/* ESC $ ( */
228 /*	case 0x04:	*/
229 /*	case 0x14:	*/
230 /*	case 0x24:	*/
231 /*	case 0x84:	*/
232 /*	case 0xa4:	*/
233 	case 4:
234 		if (c == 0x40 || c == 0x42) {	/* '@' or 'B' */
235 			filter->status = 0x80;
236 		} else if (c == 0x3f) {			/* '?' */
237 			filter->status = 0xa0;
238 		} else {
239 			filter->status &= ~0xf;
240 			CK((*filter->output_function)(0x1b, filter->data));
241 			CK((*filter->output_function)(0x24, filter->data));
242 			CK((*filter->output_function)(0x28, filter->data));
243 			goto retry;
244 		}
245 		break;
246 
247 	/* ESC ( */
248 /*	case 0x05:	*/
249 /*	case 0x15:	*/
250 /*	case 0x25:	*/
251 /*	case 0x85:	*/
252 /*	case 0xa5:	*/
253 	case 5:
254 		if (c == 0x42) {		/* 'B' */
255 			filter->status = 0;
256 		} else if (c == 0x4a) {		/* 'J' */
257 			filter->status = 0;
258 		} else if (c == 0x49) {		/* 'I' */
259 			filter->status = 0x20;
260 		} else {
261 			filter->status &= ~0xf;
262 			CK((*filter->output_function)(0x1b, filter->data));
263 			CK((*filter->output_function)(0x28, filter->data));
264 			goto retry;
265 		}
266 		break;
267 
268 	default:
269 		filter->status = 0;
270 		break;
271 	}
272 
273 	return c;
274 }
275 
/*
 * Convert an offset `c` into cp932ext3_ucs_table to a two-byte JIS code
 * (first byte in bits 8..15, second byte in bits 0..7).
 *
 * The offset is first turned into a linear Shift_JIS index counted from
 * 0xFA40, then relocated:
 *   - indices at or after 0xFA5C are moved so that 0xFA5C lands on 0xED40,
 *   - indices from 0xFA55 up to 0xFA5B are moved so that 0xFA55 lands on 0xEEFA,
 *   - indices from 0xFA40 up to 0xFA54 are moved so that 0xFA40 lands on 0xEEEF.
 * The relocated index is finally split into the two JIS bytes.
 */
static int
cp932ext3_cp932ext2_jis(int c)
{
	int idx;

	idx = sjistoidx(0xfa, 0x40) + c;
	if (idx >= sjistoidx(0xfa, 0x5c)) {
		idx -= sjistoidx(0xfa, 0x5c) - sjistoidx(0xed, 0x40);
	} else if (idx >= sjistoidx(0xfa, 0x55)) {
		idx -= sjistoidx(0xfa, 0x55) - sjistoidx(0xee, 0xfa);
	} else if (idx >= sjistoidx(0xfa, 0x40)) {
		idx -= sjistoidx(0xfa, 0x40) - sjistoidx(0xee, 0xef);
	}
	return (idxtojis1(idx) << 8) | idxtojis2(idx);
}
290 
291 /*
292  * wchar => ISO-2022-JP-MS
293  */
294 int
mbfl_filt_conv_wchar_2022jpms(int c,mbfl_convert_filter * filter)295 mbfl_filt_conv_wchar_2022jpms(int c, mbfl_convert_filter *filter)
296 {
297 	int c1, c2, s1, s2;
298 
299 	s1 = 0;
300 	s2 = 0;
301 	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
302 		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
303 	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
304 		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
305 	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
306 		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
307 	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
308 		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
309 	} else if (c >= 0xe000 && c < (0xe000 + 20*94)) {	/* user  (95ku - 114ku) */
310 		s1 = c - 0xe000;
311 		c1 = s1/94 + 0x7f;
312 		c2 = s1%94 + 0x21;
313 		s1 = (c1 << 8) | c2;
314 	}
315 	if (s1 <= 0) {
316 		c1 = c & ~MBFL_WCSPLANE_MASK;
317 		if (c1 == MBFL_WCSPLANE_WINCP932) {
318 			s1 = c & MBFL_WCSPLANE_MASK;
319 			s2 = 1;
320 		} else if (c1 == MBFL_WCSPLANE_JIS0208) {
321 			s1 = c & MBFL_WCSPLANE_MASK;
322 		} else if (c1 == MBFL_WCSPLANE_JIS0212) {
323 			s1 = c & MBFL_WCSPLANE_MASK;
324 			s1 |= 0x8080;
325 		} else if (c == 0xa5) {		/* YEN SIGN */
326 			s1 = 0x216f;	            /* FULLWIDTH YEN SIGN */
327 		} else if (c == 0x203e) {	/* OVER LINE */
328 			s1 = 0x2131;	/* FULLWIDTH MACRON */
329 		} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
330 			s1 = 0x2140;
331 		} else if (c == 0xff5e) {	/* FULLWIDTH TILDE */
332 			s1 = 0x2141;
333 		} else if (c == 0x2225) {	/* PARALLEL TO */
334 			s1 = 0x2142;
335 		} else if (c == 0xff0d) {	/* FULLWIDTH HYPHEN-MINUS */
336 			s1 = 0x215d;
337 		} else if (c == 0xffe0) {	/* FULLWIDTH CENT SIGN */
338 			s1 = 0x2171;
339 		} else if (c == 0xffe1) {	/* FULLWIDTH POUND SIGN */
340 			s1 = 0x2172;
341 		} else if (c == 0xffe2) {	/* FULLWIDTH NOT SIGN */
342 			s1 = 0x224c;
343 		}
344 	}
345 	if ((s1 <= 0) || (s1 >= 0xa1a1 && s2 == 0)) { /* not found or X 0212 */
346 		s1 = -1;
347 		c1 = 0;
348 		c2 = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
349 		while (c1 < c2) {		/* CP932 vendor ext1 (13ku) */
350 			if (c == cp932ext1_ucs_table[c1]) {
351 				s1 = ((c1/94 + 0x2d) << 8) + (c1%94 + 0x21);
352 				break;
353 			}
354 			c1++;
355 		}
356 		if (s1 <= 0) {
357 			c1 = 0;
358 			c2 = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
359 			while (c1 < c2) {		/* CP932 vendor ext3 (115ku - 119ku) */
360 				if (c == cp932ext3_ucs_table[c1]) {
361 					s1 = cp932ext3_cp932ext2_jis(c1);
362 					break;
363 				}
364 				c1++;
365 			}
366 		}
367 		if (c == 0) {
368 			s1 = 0;
369 		} else if (s1 <= 0) {
370 			s1 = -1;
371 		}
372 	}
373 	if (s1 >= 0) {
374 		if (s1 < 0x80) { /* latin */
375 			if ((filter->status & 0xff00) != 0) {
376 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
377 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
378 				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
379 			}
380 			CK((*filter->output_function)(s1, filter->data));
381 			filter->status = 0;
382 		} else if (s1 > 0xa0 && s1 < 0xe0) { /* kana */
383 			if ((filter->status & 0xff00) != 0x100) {
384 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
385 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
386 				CK((*filter->output_function)(0x49, filter->data));		/* 'I' */
387 			}
388 			filter->status = 0x100;
389 			CK((*filter->output_function)(s1 & 0x7f, filter->data));
390 		} else if (s1 < 0x7e7f) { /* X 0208 */
391 			if ((filter->status & 0xff00) != 0x200) {
392 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
393 				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
394 				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
395 			}
396 			filter->status = 0x200;
397 			CK((*filter->output_function)((s1 >> 8) & 0xff, filter->data));
398 			CK((*filter->output_function)(s1 & 0x7f, filter->data));
399 		} else if (s1 < 0x927f) { /* UDC */
400 			if ((filter->status & 0xff00) != 0x800) {
401 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
402 				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
403 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
404 				CK((*filter->output_function)(0x3f, filter->data));		/* '?' */
405 			}
406 			filter->status = 0x800;
407 			CK((*filter->output_function)(((s1 >> 8) - 0x5e) & 0x7f, filter->data));
408 			CK((*filter->output_function)(s1 & 0x7f, filter->data));
409 		}
410 	} else {
411 		CK(mbfl_filt_conv_illegal_output(c, filter));
412 	}
413 
414 	return c;
415 }
416 
417 int
mbfl_filt_conv_any_2022jpms_flush(mbfl_convert_filter * filter)418 mbfl_filt_conv_any_2022jpms_flush(mbfl_convert_filter *filter)
419 {
420 	/* back to latin */
421 	if ((filter->status & 0xff00) != 0) {
422 		CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
423 		CK((*filter->output_function)(0x28, filter->data));		/* '(' */
424 		CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
425 	}
426 
427 	filter->status &= 0xff;
428 
429 	if (filter->flush_function != NULL) {
430 		return (*filter->flush_function)(filter->data);
431 	}
432 
433 	return 0;
434 }
435 
/*
 * Identify filter for ISO-2022-JP-MS.
 *
 * Consumes one byte `c` per call and sets filter->flag to 1 once the input
 * can no longer be valid ISO-2022-JP-MS. filter->status uses the same
 * layout as the decoder in this file:
 *   high nibble - 0x00 ASCII, 0x20 X 0201 kana, 0x80 X 0208, 0xa0 UDC
 *   low nibble  - 0 idle, 1 second byte expected, 2 after ESC,
 *                 3 after ESC $, 4 after ESC $ (, 5 after ESC (
 *
 * After a bad escape sequence the offending byte is re-examined in the idle
 * state (the `goto retry` paths). Always returns c.
 */
int mbfl_filt_ident_2022jpms(int c, mbfl_identify_filter *filter)
{
retry:
	switch (filter->status & 0xf) {
	/* idle */
	case 0:
		if (c == 0x1b) {
			filter->status += 2;
		} else if ((filter->status == 0x80 || filter->status == 0xa0) && c > 0x20 && c < 0x80) {		/* kanji first char */
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) {		/* latin, CTLs */
			;
		} else {
			filter->flag = 1;	/* bad */
		}
		break;

	/* second byte: status was 0x81 (X 0208) or 0xa1 (UDC) */
	case 1:
		filter->status &= ~0xf;
		if (c == 0x1b) {
			/* ESC instead of a second byte: handle it from the idle state */
			goto retry;
		} else if (c < 0x21 || c > 0x7e) {		/* bad */
			filter->flag = 1;
		}
		break;

	/* ESC */
	case 2:
		if (c == 0x24) {		/* '$' */
			filter->status++;
		} else if (c == 0x28) {		/* '(' */
			filter->status += 3;
		} else {
			filter->flag = 1;	/* bad */
			filter->status &= ~0xf;
			goto retry;
		}
		break;

	/* ESC $ */
	case 3:
		if (c == 0x40 || c == 0x42) {		/* '@' or 'B' */
			filter->status = 0x80;
		} else if (c == 0x28) {     /* '(' */
			filter->status++;
		} else {
			filter->flag = 1;	/* bad */
			filter->status &= ~0xf;
			goto retry;
		}
		break;

	/* ESC $ ( */
	case 4:
		if (c == 0x40 || c == 0x42) {		/* '@' or 'B' */
			filter->status = 0x80;
		} else if (c == 0x3f) {		/* '?' */
			filter->status = 0xa0;
		} else {
			filter->flag = 1;	/* bad */
			filter->status &= ~0xf;
			goto retry;
		}
		break;

	/* ESC ( */
	case 5:
		if (c == 0x42) {		/* 'B' */
			filter->status = 0;
		} else if (c == 0x4a) {		/* 'J' */
			filter->status = 0;
		} else if (c == 0x49) {		/* 'I' */
			filter->status = 0x20;
		} else {
			filter->flag = 1;	/* bad */
			filter->status &= ~0xf;
			goto retry;
		}
		break;

	default:
		filter->status = 0;
		break;
	}

	return c;
}
529