1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
 * The source code included in this file was separated from mbfilter_ja.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 #include "mbfilter.h"
31 #include "mbfilter_iso2022_jp_ms.h"
32 
33 #include "unicode_table_cp932_ext.h"
34 #include "unicode_table_jis.h"
35 #include "cp932_table.h"
36 
37 int mbfl_filt_ident_2022jpms(int c, mbfl_identify_filter *filter);
38 
/* NULL-terminated list of alternative names for ISO-2022-JP-MS. */
static const char *mbfl_encoding_2022jpms_aliases[] = {
	"ISO2022JPMS",
	NULL
};
40 
41 const mbfl_encoding mbfl_encoding_2022jpms = {
42 	mbfl_no_encoding_2022jpms,
43 	"ISO-2022-JP-MS",
44 	"ISO-2022-JP",
45 	(const char *(*)[])&mbfl_encoding_2022jpms_aliases,
46 	NULL,
47 	MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE,
48 	&vtbl_2022jpms_wchar,
49 	&vtbl_wchar_2022jpms
50 };
51 
52 const struct mbfl_identify_vtbl vtbl_identify_2022jpms = {
53 	mbfl_no_encoding_2022jpms,
54 	mbfl_filt_ident_common_ctor,
55 	mbfl_filt_ident_2022jpms
56 };
57 
58 const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {
59 	mbfl_no_encoding_2022jpms,
60 	mbfl_no_encoding_wchar,
61 	mbfl_filt_conv_common_ctor,
62 	NULL,
63 	mbfl_filt_conv_2022jpms_wchar,
64 	mbfl_filt_conv_common_flush,
65 	NULL,
66 };
67 
68 const struct mbfl_convert_vtbl vtbl_wchar_2022jpms = {
69 	mbfl_no_encoding_wchar,
70 	mbfl_no_encoding_2022jpms,
71 	mbfl_filt_conv_common_ctor,
72 	NULL,
73 	mbfl_filt_conv_wchar_2022jpms,
74 	mbfl_filt_conv_any_2022jpms_flush,
75 	NULL,
76 };
77 
/* Evaluate statement once; leave the enclosing function with -1 if it is negative. */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)

/*
 * Linear index of the two-byte code (c1, c2): 188 positions per lead byte.
 * Lead bytes above 0x9f are rebased from 0xc1, all others from 0x81; trail
 * bytes above 0x7e are rebased from 0x41, all others from 0x40.
 * Both arguments are evaluated more than once.
 */
#define sjistoidx(c1, c2) \
	(((c1) - (((c1) > 0x9f) ? 0xc1 : 0x81)) * 188 \
	 + (c2) - (((c2) > 0x7e) ? 0x41 : 0x40))

/* Split a linear index into two bytes: 94 positions per row, both offset by 0x21. */
#define idxtojis1(c) (((c) / 94) + 0x21)
#define idxtojis2(c) (((c) % 94) + 0x21)
86 
87 /*
88  * ISO-2022-JP-MS => wchar
89  */
90 int
mbfl_filt_conv_2022jpms_wchar(int c,mbfl_convert_filter * filter)91 mbfl_filt_conv_2022jpms_wchar(int c, mbfl_convert_filter *filter)
92 {
93 	int c1, s, w;
94 
95 retry:
96 	switch (filter->status & 0xf) {
97 /*	case 0x00:	 ASCII */
98 /*	case 0x10:	 X 0201 latin */
99 /*	case 0x20:	 X 0201 kana */
100 /*	case 0x80:	 X 0208 */
101 /*	case 0xa0:	 UDC */
102 	case 0:
103 		if (c == 0x1b) {
104 			filter->status += 2;
105 		} else if (filter->status == 0x20 && c > 0x20 && c < 0x60) {		/* kana */
106 			CK((*filter->output_function)(0xff40 + c, filter->data));
107 		} else if ((filter->status == 0x80 || filter->status == 0xa0) && c > 0x20 && c < 0x80) {		/* kanji first char */
108 			filter->cache = c;
109 			filter->status += 1;
110 		} else if (c >= 0 && c < 0x80) {		/* latin, CTLs */
111 			CK((*filter->output_function)(c, filter->data));
112 		} else if (c > 0xa0 && c < 0xe0) {	/* GR kana */
113 			CK((*filter->output_function)(0xfec0 + c, filter->data));
114 		} else {
115 			w = c & MBFL_WCSGROUP_MASK;
116 			w |= MBFL_WCSGROUP_THROUGH;
117 			CK((*filter->output_function)(w, filter->data));
118 		}
119 		break;
120 
121 /*	case 0x81:	 X 0208 second char */
122 /*	case 0xa1:	 UDC second char */
123 	case 1:
124 		w = 0;
125 		filter->status &= ~0xf;
126 		c1 = filter->cache;
127 		if (c > 0x20 && c < 0x7f) {
128 			s = (c1 - 0x21)*94 + c - 0x21;
129 			if (filter->status == 0x80) {
130 				if (s <= 137) {
131 					if (s == 31) {
132 						w = 0xff3c;			/* FULLWIDTH REVERSE SOLIDUS */
133 					} else if (s == 32) {
134 						w = 0xff5e;			/* FULLWIDTH TILDE */
135 					} else if (s == 33) {
136 						w = 0x2225;			/* PARALLEL TO */
137 					} else if (s == 60) {
138 						w = 0xff0d;			/* FULLWIDTH HYPHEN-MINUS */
139 					} else if (s == 80) {
140 						w = 0xffe0;			/* FULLWIDTH CENT SIGN */
141 					} else if (s == 81) {
142 						w = 0xffe1;			/* FULLWIDTH POUND SIGN */
143 					} else if (s == 137) {
144 						w = 0xffe2;			/* FULLWIDTH NOT SIGN */
145 					}
146 				}
147 				if (w == 0) {
148 					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {		/* vendor ext1 (13ku) */
149 						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
150 					} else if (s >= 0 && s < jisx0208_ucs_table_size) {
151 						w = jisx0208_ucs_table[s];
152 					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {		/* vendor ext2 (89ku - 92ku) */
153 						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
154 					} else {
155 						w = 0;
156 					}
157 				}
158 				if (w <= 0) {
159 					w = (c1 << 8) | c;
160 					w &= MBFL_WCSPLANE_MASK;
161 					w |= MBFL_WCSPLANE_JIS0208;
162 				}
163 				CK((*filter->output_function)(w, filter->data));
164 			} else {
165 				if (c1 > 0x20 && c1 < 0x35) {
166 					w = 0xe000 + (c1 - 0x21)*94 + c - 0x21;
167 				}
168 				if (w <= 0) {
169 					w = (((c1 - 0x21) + 0x7f) << 8) | c;
170 					w &= MBFL_WCSPLANE_MASK;
171 					w |= MBFL_WCSPLANE_JIS0208;
172 				}
173 				CK((*filter->output_function)(w, filter->data));
174 			}
175 		} else if (c == 0x1b) {
176 			filter->status += 2;
177 		} else if ((c >= 0 && c < 0x21) || c == 0x7f) {		/* CTLs */
178 			CK((*filter->output_function)(c, filter->data));
179 		} else {
180 			w = (c1 << 8) | c;
181 			w &= MBFL_WCSGROUP_MASK;
182 			w |= MBFL_WCSGROUP_THROUGH;
183 			CK((*filter->output_function)(w, filter->data));
184 		}
185 		break;
186 
187 	/* ESC */
188 /*	case 0x02:	*/
189 /*	case 0x12:	*/
190 /*	case 0x22:	*/
191 /*	case 0x82:	*/
192 /*	case 0xa2:	*/
193 	case 2:
194 		if (c == 0x24) {		/* '$' */
195 			filter->status++;
196 		} else if (c == 0x28) {		/* '(' */
197 			filter->status += 3;
198 		} else {
199 			filter->status &= ~0xf;
200 			CK((*filter->output_function)(0x1b, filter->data));
201 			goto retry;
202 		}
203 		break;
204 
205 	/* ESC $ */
206 /*	case 0x03:	*/
207 /*	case 0x13:	*/
208 /*	case 0x23:	*/
209 /*	case 0x83:	*/
210 /*	case 0xa3:	*/
211 	case 3:
212 		if (c == 0x40 || c == 0x42) {	/* '@' or 'B' */
213 			filter->status = 0x80;
214 		} else if (c == 0x28) {     /* '(' */
215 			filter->status++;
216 		} else {
217 			filter->status &= ~0xf;
218 			CK((*filter->output_function)(0x1b, filter->data));
219 			CK((*filter->output_function)(0x24, filter->data));
220 			goto retry;
221 		}
222 		break;
223 
224 	/* ESC $ ( */
225 /*	case 0x04:	*/
226 /*	case 0x14:	*/
227 /*	case 0x24:	*/
228 /*	case 0x84:	*/
229 /*	case 0xa4:	*/
230 	case 4:
231 		if (c == 0x40 || c == 0x42) {	/* '@' or 'B' */
232 			filter->status = 0x80;
233 		} else if (c == 0x3f) {			/* '?' */
234 			filter->status = 0xa0;
235 		} else {
236 			filter->status &= ~0xf;
237 			CK((*filter->output_function)(0x1b, filter->data));
238 			CK((*filter->output_function)(0x24, filter->data));
239 			CK((*filter->output_function)(0x28, filter->data));
240 			goto retry;
241 		}
242 		break;
243 
244 	/* ESC ( */
245 /*	case 0x05:	*/
246 /*	case 0x15:	*/
247 /*	case 0x25:	*/
248 /*	case 0x85:	*/
249 /*	case 0xa5:	*/
250 	case 5:
251 		if (c == 0x42) {		/* 'B' */
252 			filter->status = 0;
253 		} else if (c == 0x4a) {		/* 'J' */
254 			filter->status = 0;
255 		} else if (c == 0x49) {		/* 'I' */
256 			filter->status = 0x20;
257 		} else {
258 			filter->status &= ~0xf;
259 			CK((*filter->output_function)(0x1b, filter->data));
260 			CK((*filter->output_function)(0x28, filter->data));
261 			goto retry;
262 		}
263 		break;
264 
265 	default:
266 		filter->status = 0;
267 		break;
268 	}
269 
270 	return c;
271 }
272 
/*
 * Linear index of the two-byte code (lead, trail), 188 positions per lead
 * byte.  Same arithmetic as the sjistoidx() macro.
 */
static int
cp932ext3_sjis_index(int lead, int trail)
{
	const int lead_base = (lead > 0x9f) ? 0xc1 : 0x81;
	const int trail_base = (trail > 0x7e) ? 0x41 : 0x40;

	return (lead - lead_base) * 188 + trail - trail_base;
}

/*
 * Convert an offset c, counted from the code 0xFA40, into a two-byte JIS
 * value (first byte in bits 8 and up, second byte in the low bits).
 *
 * The offset is first turned into a linear index and then shifted into one
 * of three target regions, depending on where it falls:
 *
 *   at or after 0xFA5C      -> region starting at 0xED40
 *   0xFA55 up to 0xFA5B     -> region starting at 0xEEFA
 *   0xFA40 up to 0xFA54     -> region starting at 0xEEEF
 *
 * An index below 0xFA40 (negative c) is converted without any shift.
 */
static int
cp932ext3_cp932ext2_jis(int c)
{
	const int from_fa40 = cp932ext3_sjis_index(0xfa, 0x40);
	const int from_fa55 = cp932ext3_sjis_index(0xfa, 0x55);
	const int from_fa5c = cp932ext3_sjis_index(0xfa, 0x5c);
	int pos = from_fa40 + c;

	if (pos >= from_fa5c) {
		pos -= from_fa5c - cp932ext3_sjis_index(0xed, 0x40);
	} else if (pos >= from_fa55) {
		pos -= from_fa55 - cp932ext3_sjis_index(0xee, 0xfa);
	} else if (pos >= from_fa40) {
		pos -= from_fa40 - cp932ext3_sjis_index(0xee, 0xef);
	}

	/* 94 cells per row, both bytes offset by 0x21 */
	return ((pos / 94) + 0x21) << 8 | ((pos % 94) + 0x21);
}
287 
288 /*
289  * wchar => ISO-2022-JP-MS
290  */
291 int
mbfl_filt_conv_wchar_2022jpms(int c,mbfl_convert_filter * filter)292 mbfl_filt_conv_wchar_2022jpms(int c, mbfl_convert_filter *filter)
293 {
294 	int c1, c2, s1, s2;
295 
296 	s1 = 0;
297 	s2 = 0;
298 	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
299 		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
300 	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
301 		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
302 	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
303 		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
304 	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
305 		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
306 	} else if (c >= 0xe000 && c < (0xe000 + 20*94)) {	/* user  (95ku - 114ku) */
307 		s1 = c - 0xe000;
308 		c1 = s1/94 + 0x7f;
309 		c2 = s1%94 + 0x21;
310 		s1 = (c1 << 8) | c2;
311 	}
312 	if (s1 <= 0) {
313 		c1 = c & ~MBFL_WCSPLANE_MASK;
314 		if (c1 == MBFL_WCSPLANE_WINCP932) {
315 			s1 = c & MBFL_WCSPLANE_MASK;
316 			s2 = 1;
317 		} else if (c1 == MBFL_WCSPLANE_JIS0208) {
318 			s1 = c & MBFL_WCSPLANE_MASK;
319 		} else if (c1 == MBFL_WCSPLANE_JIS0212) {
320 			s1 = c & MBFL_WCSPLANE_MASK;
321 			s1 |= 0x8080;
322 		} else if (c == 0xa5) {		/* YEN SIGN */
323 			s1 = 0x216f;	            /* FULLWIDTH YEN SIGN */
324 		} else if (c == 0x203e) {	/* OVER LINE */
325 			s1 = 0x2131;	/* FULLWIDTH MACRON */
326 		} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
327 			s1 = 0x2140;
328 		} else if (c == 0xff5e) {	/* FULLWIDTH TILDE */
329 			s1 = 0x2141;
330 		} else if (c == 0x2225) {	/* PARALLEL TO */
331 			s1 = 0x2142;
332 		} else if (c == 0xff0d) {	/* FULLWIDTH HYPHEN-MINUS */
333 			s1 = 0x215d;
334 		} else if (c == 0xffe0) {	/* FULLWIDTH CENT SIGN */
335 			s1 = 0x2171;
336 		} else if (c == 0xffe1) {	/* FULLWIDTH POUND SIGN */
337 			s1 = 0x2172;
338 		} else if (c == 0xffe2) {	/* FULLWIDTH NOT SIGN */
339 			s1 = 0x224c;
340 		}
341 	}
342 	if ((s1 <= 0) || (s1 >= 0xa1a1 && s2 == 0)) { /* not found or X 0212 */
343 		s1 = -1;
344 		c1 = 0;
345 		c2 = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
346 		while (c1 < c2) {		/* CP932 vendor ext1 (13ku) */
347 			if (c == cp932ext1_ucs_table[c1]) {
348 				s1 = ((c1/94 + 0x2d) << 8) + (c1%94 + 0x21);
349 				break;
350 			}
351 			c1++;
352 		}
353 		if (s1 <= 0) {
354 			c1 = 0;
355 			c2 = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
356 			while (c1 < c2) {		/* CP932 vendor ext3 (115ku - 119ku) */
357 				if (c == cp932ext3_ucs_table[c1]) {
358 					s1 = cp932ext3_cp932ext2_jis(c1);
359 					break;
360 				}
361 				c1++;
362 			}
363 		}
364 		if (c == 0) {
365 			s1 = 0;
366 		} else if (s1 <= 0) {
367 			s1 = -1;
368 		}
369 	}
370 	if (s1 >= 0) {
371 		if (s1 < 0x80) { /* latin */
372 			if ((filter->status & 0xff00) != 0) {
373 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
374 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
375 				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
376 			}
377 			CK((*filter->output_function)(s1, filter->data));
378 			filter->status = 0;
379 		} else if (s1 > 0xa0 && s1 < 0xe0) { /* kana */
380 			if ((filter->status & 0xff00) != 0x100) {
381 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
382 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
383 				CK((*filter->output_function)(0x49, filter->data));		/* 'I' */
384 			}
385 			filter->status = 0x100;
386 			CK((*filter->output_function)(s1 & 0x7f, filter->data));
387 		} else if (s1 < 0x7e7f) { /* X 0208 */
388 			if ((filter->status & 0xff00) != 0x200) {
389 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
390 				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
391 				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
392 			}
393 			filter->status = 0x200;
394 			CK((*filter->output_function)((s1 >> 8) & 0xff, filter->data));
395 			CK((*filter->output_function)(s1 & 0x7f, filter->data));
396 		} else if (s1 < 0x927f) { /* UDC */
397 			if ((filter->status & 0xff00) != 0x800) {
398 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
399 				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
400 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
401 				CK((*filter->output_function)(0x3f, filter->data));		/* '?' */
402 			}
403 			filter->status = 0x800;
404 			CK((*filter->output_function)(((s1 >> 8) - 0x5e) & 0x7f, filter->data));
405 			CK((*filter->output_function)(s1 & 0x7f, filter->data));
406 		}
407 	} else {
408 		CK(mbfl_filt_conv_illegal_output(c, filter));
409 	}
410 
411 	return c;
412 }
413 
414 int
mbfl_filt_conv_any_2022jpms_flush(mbfl_convert_filter * filter)415 mbfl_filt_conv_any_2022jpms_flush(mbfl_convert_filter *filter)
416 {
417 	/* back to latin */
418 	if ((filter->status & 0xff00) != 0) {
419 		CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
420 		CK((*filter->output_function)(0x28, filter->data));		/* '(' */
421 		CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
422 	}
423 
424 	filter->status &= 0xff;
425 
426 	if (filter->flush_function != NULL) {
427 		return (*filter->flush_function)(filter->data);
428 	}
429 
430 	return 0;
431 }
432 
/*
 * Identify filter for ISO-2022-JP-MS.
 *
 * Tracks the same escape sequences as the decoder and sets filter->flag to 1
 * when a byte cannot belong to the encoding.  filter->status carries the
 * selected character set in its upper bits and the parser position in its
 * low nibble:
 *
 *   0x00  ASCII (ESC ( B, ESC ( J)    0x20  X 0201 kana (ESC ( I)
 *   0x80  X 0208                      0xa0  UDC (ESC $ ( ?)
 *
 *   +1  second byte of a two-byte character expected
 *   +2  ESC seen      +3  ESC $ seen
 *   +4  ESC $ ( seen  +5  ESC ( seen
 *
 * Always returns c.
 */
int mbfl_filt_ident_2022jpms(int c, mbfl_identify_filter *filter)
{
	for (;;) {
		switch (filter->status & 0xf) {
		/* base position of the selected character set */
		case 0:
			if (c == 0x1b) {
				filter->status += 2;
			} else if ((filter->status == 0x80 || filter->status == 0xa0) && c > 0x20 && c < 0x80) {
				/* kanji first char */
				filter->status += 1;
			} else if (c < 0 || c >= 0x80) {
				/* anything outside latin and CTLs is bad */
				filter->flag = 1;
			}
			return c;

		/* second byte of an X 0208 or UDC character */
		case 1:
			filter->status &= ~0xf;
			if (c == 0x1b) {
				/* parse the ESC again from the base position */
				continue;
			}
			if (c < 0x21 || c > 0x7e) {
				filter->flag = 1;	/* bad */
			}
			return c;

		/* ESC */
		case 2:
			if (c == 0x24) {		/* '$' */
				filter->status++;
				return c;
			}
			if (c == 0x28) {		/* '(' */
				filter->status += 3;
				return c;
			}
			break;

		/* ESC $ */
		case 3:
			if (c == 0x40 || c == 0x42) {	/* '@' or 'B' */
				filter->status = 0x80;
				return c;
			}
			if (c == 0x28) {		/* '(' */
				filter->status++;
				return c;
			}
			break;

		/* ESC $ ( */
		case 4:
			if (c == 0x40 || c == 0x42) {	/* '@' or 'B' */
				filter->status = 0x80;
				return c;
			}
			if (c == 0x3f) {		/* '?' */
				filter->status = 0xa0;
				return c;
			}
			break;

		/* ESC ( */
		case 5:
			if (c == 0x42 || c == 0x4a) {	/* 'B' or 'J' */
				filter->status = 0;
				return c;
			}
			if (c == 0x49) {		/* 'I' */
				filter->status = 0x20;
				return c;
			}
			break;

		default:
			filter->status = 0;
			return c;
		}

		/*
		 * Only reached from positions 2..5 with an unrecognised escape
		 * sequence: mark the input bad, drop back to the base position and
		 * look at the same byte again.
		 */
		filter->flag = 1;
		filter->status &= ~0xf;
	}
}
526