1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
25  * The source code included in this files was separated from mbfilter_ja.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33 
34 #include "mbfilter.h"
35 #include "mbfilter_iso2022_jp_ms.h"
36 
37 #include "unicode_table_cp932_ext.h"
38 #include "unicode_table_jis.h"
39 #include "cp932_table.h"
40 
/* Forward declaration: the identify vtable below references this function. */
int mbfl_filt_ident_2022jpms(int c, mbfl_identify_filter *filter);

/* NULL-terminated list of alternative names accepted for this encoding. */
static const char *mbfl_encoding_2022jpms_aliases[] = {"ISO2022JPMS", NULL};

/*
 * Encoding descriptor for ISO-2022-JP-MS.
 * NOTE(review): the meaning of each positional field comes from the
 * mbfl_encoding declaration, which is not visible here; the per-field
 * remarks below are inferred from the values and should be confirmed.
 */
const mbfl_encoding mbfl_encoding_2022jpms = {
	mbfl_no_encoding_2022jpms,
	"ISO-2022-JP-MS",	/* presumably the canonical name */
	"ISO-2022-JP",		/* presumably the MIME name */
	(const char *(*)[])&mbfl_encoding_2022jpms_aliases,
	NULL,
	/* multi-byte, uses shift (escape) sequences, GL range not safe */
	MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_SHFTCODE | MBFL_ENCTYPE_GL_UNSAFE
};
53 
/*
 * Identification vtable: pairs the ISO-2022-JP-MS encoding id with the
 * common constructor/destructor and the identify function of this file.
 */
const struct mbfl_identify_vtbl vtbl_identify_2022jpms = {
	mbfl_no_encoding_2022jpms,
	mbfl_filt_ident_common_ctor,
	mbfl_filt_ident_common_dtor,
	mbfl_filt_ident_2022jpms
};

/*
 * Conversion vtable ISO-2022-JP-MS => wchar.  Uses the common flush
 * routine, i.e. no encoding-specific work is done at end of input.
 */
const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {
	mbfl_no_encoding_2022jpms,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	mbfl_filt_conv_common_dtor,
	mbfl_filt_conv_2022jpms_wchar,
	mbfl_filt_conv_common_flush
};

/*
 * Conversion vtable wchar => ISO-2022-JP-MS.  Uses the dedicated flush
 * routine of this file, which emits ESC ( B when a non-ASCII character
 * set is still designated.
 */
const struct mbfl_convert_vtbl vtbl_wchar_2022jpms = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jpms,
	mbfl_filt_conv_common_ctor,
	mbfl_filt_conv_common_dtor,
	mbfl_filt_conv_wchar_2022jpms,
	mbfl_filt_conv_any_2022jpms_flush
};
78 
/* Evaluate `statement`; leave the enclosing function with -1 if it is negative. */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)

/*
 * Linear index of the Shift_JIS byte pair (c1, c2): 188 cells per lead
 * byte.  Lead bytes above 0x9f are counted from 0xc1, the others from
 * 0x81; trail bytes above 0x7e are shifted down by one extra position.
 */
#define sjistoidx(c1, c2) \
	(((((c1) > 0x9f) ? ((c1) - 0xc1) : ((c1) - 0x81)) * 188) \
	 + (c2) - (((c2) > 0x7e) ? 0x41 : 0x40))

/* Split a linear index into 94-cell rows: first and second JIS byte. */
#define idxtojis1(c) (0x21 + ((c) / 94))
#define idxtojis2(c) (0x21 + ((c) % 94))
87 
88 /*
89  * ISO-2022-JP-MS => wchar
90  */
91 int
mbfl_filt_conv_2022jpms_wchar(int c,mbfl_convert_filter * filter)92 mbfl_filt_conv_2022jpms_wchar(int c, mbfl_convert_filter *filter)
93 {
94 	int c1, s, w;
95 
96 retry:
97 	switch (filter->status & 0xf) {
98 /*	case 0x00:	 ASCII */
99 /*	case 0x10:	 X 0201 latin */
100 /*	case 0x20:	 X 0201 kana */
101 /*	case 0x80:	 X 0208 */
102 /*	case 0xa0:	 UDC */
103 	case 0:
104 		if (c == 0x1b) {
105 			filter->status += 2;
106 		} else if (filter->status == 0x20 && c > 0x20 && c < 0x60) {		/* kana */
107 			CK((*filter->output_function)(0xff40 + c, filter->data));
108 		} else if ((filter->status == 0x80 || filter->status == 0xa0) && c > 0x20 && c < 0x80) {		/* kanji first char */
109 			filter->cache = c;
110 			filter->status += 1;
111 		} else if (c >= 0 && c < 0x80) {		/* latin, CTLs */
112 			CK((*filter->output_function)(c, filter->data));
113 		} else if (c > 0xa0 && c < 0xe0) {	/* GR kana */
114 			CK((*filter->output_function)(0xfec0 + c, filter->data));
115 		} else {
116 			w = c & MBFL_WCSGROUP_MASK;
117 			w |= MBFL_WCSGROUP_THROUGH;
118 			CK((*filter->output_function)(w, filter->data));
119 		}
120 		break;
121 
122 /*	case 0x81:	 X 0208 second char */
123 /*	case 0xa1:	 UDC second char */
124 	case 1:
125 		w = 0;
126 		filter->status &= ~0xf;
127 		c1 = filter->cache;
128 		if (c > 0x20 && c < 0x7f) {
129 			s = (c1 - 0x21)*94 + c - 0x21;
130 			if (filter->status == 0x80) {
131 				if (s <= 137) {
132 					if (s == 31) {
133 						w = 0xff3c;			/* FULLWIDTH REVERSE SOLIDUS */
134 					} else if (s == 32) {
135 						w = 0xff5e;			/* FULLWIDTH TILDE */
136 					} else if (s == 33) {
137 						w = 0x2225;			/* PARALLEL TO */
138 					} else if (s == 60) {
139 						w = 0xff0d;			/* FULLWIDTH HYPHEN-MINUS */
140 					} else if (s == 80) {
141 						w = 0xffe0;			/* FULLWIDTH CENT SIGN */
142 					} else if (s == 81) {
143 						w = 0xffe1;			/* FULLWIDTH POUND SIGN */
144 					} else if (s == 137) {
145 						w = 0xffe2;			/* FULLWIDTH NOT SIGN */
146 					}
147 				}
148 				if (w == 0) {
149 					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {		/* vendor ext1 (13ku) */
150 						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
151 					} else if (s >= 0 && s < jisx0208_ucs_table_size) {
152 						w = jisx0208_ucs_table[s];
153 					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {		/* vendor ext2 (89ku - 92ku) */
154 						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
155 					} else {
156 						w = 0;
157 					}
158 				}
159 				if (w <= 0) {
160 					w = (c1 << 8) | c;
161 					w &= MBFL_WCSPLANE_MASK;
162 					w |= MBFL_WCSPLANE_JIS0208;
163 				}
164 				CK((*filter->output_function)(w, filter->data));
165 			} else {
166 				if (c1 > 0x20 && c1 < 0x35) {
167 					w = 0xe000 + (c1 - 0x21)*94 + c - 0x21;
168 				}
169 				if (w <= 0) {
170 					w = (((c1 - 0x21) + 0x7f) << 8) | c;
171 					w &= MBFL_WCSPLANE_MASK;
172 					w |= MBFL_WCSPLANE_JIS0208;
173 				}
174 				CK((*filter->output_function)(w, filter->data));
175 			}
176 		} else if (c == 0x1b) {
177 			filter->status += 2;
178 		} else if ((c >= 0 && c < 0x21) || c == 0x7f) {		/* CTLs */
179 			CK((*filter->output_function)(c, filter->data));
180 		} else {
181 			w = (c1 << 8) | c;
182 			w &= MBFL_WCSGROUP_MASK;
183 			w |= MBFL_WCSGROUP_THROUGH;
184 			CK((*filter->output_function)(w, filter->data));
185 		}
186 		break;
187 
188 	/* ESC */
189 /*	case 0x02:	*/
190 /*	case 0x12:	*/
191 /*	case 0x22:	*/
192 /*	case 0x82:	*/
193 /*	case 0xa2:	*/
194 	case 2:
195 		if (c == 0x24) {		/* '$' */
196 			filter->status++;
197 		} else if (c == 0x28) {		/* '(' */
198 			filter->status += 3;
199 		} else {
200 			filter->status &= ~0xf;
201 			CK((*filter->output_function)(0x1b, filter->data));
202 			goto retry;
203 		}
204 		break;
205 
206 	/* ESC $ */
207 /*	case 0x03:	*/
208 /*	case 0x13:	*/
209 /*	case 0x23:	*/
210 /*	case 0x83:	*/
211 /*	case 0xa3:	*/
212 	case 3:
213 		if (c == 0x40 || c == 0x42) {	/* '@' or 'B' */
214 			filter->status = 0x80;
215 		} else if (c == 0x28) {     /* '(' */
216 			filter->status++;
217 		} else {
218 			filter->status &= ~0xf;
219 			CK((*filter->output_function)(0x1b, filter->data));
220 			CK((*filter->output_function)(0x24, filter->data));
221 			goto retry;
222 		}
223 		break;
224 
225 	/* ESC $ ( */
226 /*	case 0x04:	*/
227 /*	case 0x14:	*/
228 /*	case 0x24:	*/
229 /*	case 0x84:	*/
230 /*	case 0xa4:	*/
231 	case 4:
232 		if (c == 0x40 || c == 0x42) {	/* '@' or 'B' */
233 			filter->status = 0x80;
234 		} else if (c == 0x3f) {			/* '?' */
235 			filter->status = 0xa0;
236 		} else {
237 			filter->status &= ~0xf;
238 			CK((*filter->output_function)(0x1b, filter->data));
239 			CK((*filter->output_function)(0x24, filter->data));
240 			CK((*filter->output_function)(0x28, filter->data));
241 			goto retry;
242 		}
243 		break;
244 
245 	/* ESC ( */
246 /*	case 0x05:	*/
247 /*	case 0x15:	*/
248 /*	case 0x25:	*/
249 /*	case 0x85:	*/
250 /*	case 0xa5:	*/
251 	case 5:
252 		if (c == 0x42) {		/* 'B' */
253 			filter->status = 0;
254 		} else if (c == 0x4a) {		/* 'J' */
255 			filter->status = 0;
256 		} else if (c == 0x49) {		/* 'I' */
257 			filter->status = 0x20;
258 		} else {
259 			filter->status &= ~0xf;
260 			CK((*filter->output_function)(0x1b, filter->data));
261 			CK((*filter->output_function)(0x28, filter->data));
262 			goto retry;
263 		}
264 		break;
265 
266 	default:
267 		filter->status = 0;
268 		break;
269 	}
270 
271 	return c;
272 }
273 
/*
 * Translate an offset into the CP932 ext3 table (counted from Shift_JIS
 * 0xfa40) into a two-byte JIS code, (first byte << 8) | second byte.
 *
 * The offset is first turned into a linear Shift_JIS index (188 cells per
 * lead byte, lead bytes counted from 0xc1).  Depending on which of three
 * sub-ranges the index falls into, it is moved down so that the sub-range
 * starts at 0xed40, 0xeefa or 0xeeef respectively, and the result is split
 * into 94-cell rows.
 */
static int
cp932ext3_cp932ext2_jis(int c)
{
	/* Linear indices of the Shift_JIS codes named in the identifiers. */
	enum {
		IDX_FA40 = (0xfa - 0xc1) * 188 + (0x40 - 0x40),
		IDX_FA55 = (0xfa - 0xc1) * 188 + (0x55 - 0x40),
		IDX_FA5C = (0xfa - 0xc1) * 188 + (0x5c - 0x40),
		IDX_ED40 = (0xed - 0xc1) * 188 + (0x40 - 0x40),
		IDX_EEEF = (0xee - 0xc1) * 188 + (0xef - 0x41),
		IDX_EEFA = (0xee - 0xc1) * 188 + (0xfa - 0x41)
	};
	int pos = IDX_FA40 + c;
	int shift = 0;

	if (pos >= IDX_FA5C) {
		shift = IDX_FA5C - IDX_ED40;
	} else if (pos >= IDX_FA55) {
		shift = IDX_FA55 - IDX_EEFA;
	} else if (pos >= IDX_FA40) {
		shift = IDX_FA40 - IDX_EEEF;
	}
	pos -= shift;

	return ((pos / 94 + 0x21) << 8) | (pos % 94 + 0x21);
}
288 
289 /*
290  * wchar => ISO-2022-JP-MS
291  */
292 int
mbfl_filt_conv_wchar_2022jpms(int c,mbfl_convert_filter * filter)293 mbfl_filt_conv_wchar_2022jpms(int c, mbfl_convert_filter *filter)
294 {
295 	int c1, c2, s1, s2;
296 
297 	s1 = 0;
298 	s2 = 0;
299 	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
300 		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
301 	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
302 		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
303 	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
304 		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
305 	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
306 		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
307 	} else if (c >= 0xe000 && c < (0xe000 + 20*94)) {	/* user  (95ku - 114ku) */
308 		s1 = c - 0xe000;
309 		c1 = s1/94 + 0x7f;
310 		c2 = s1%94 + 0x21;
311 		s1 = (c1 << 8) | c2;
312 	}
313 	if (s1 <= 0) {
314 		c1 = c & ~MBFL_WCSPLANE_MASK;
315 		if (c1 == MBFL_WCSPLANE_WINCP932) {
316 			s1 = c & MBFL_WCSPLANE_MASK;
317 			s2 = 1;
318 		} else if (c1 == MBFL_WCSPLANE_JIS0208) {
319 			s1 = c & MBFL_WCSPLANE_MASK;
320 		} else if (c1 == MBFL_WCSPLANE_JIS0212) {
321 			s1 = c & MBFL_WCSPLANE_MASK;
322 			s1 |= 0x8080;
323 		} else if (c == 0xa5) {		/* YEN SIGN */
324 			s1 = 0x216f;	            /* FULLWIDTH YEN SIGN */
325 		} else if (c == 0x203e) {	/* OVER LINE */
326 			s1 = 0x2131;	/* FULLWIDTH MACRON */
327 		} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
328 			s1 = 0x2140;
329 		} else if (c == 0xff5e) {	/* FULLWIDTH TILDE */
330 			s1 = 0x2141;
331 		} else if (c == 0x2225) {	/* PARALLEL TO */
332 			s1 = 0x2142;
333 		} else if (c == 0xff0d) {	/* FULLWIDTH HYPHEN-MINUS */
334 			s1 = 0x215d;
335 		} else if (c == 0xffe0) {	/* FULLWIDTH CENT SIGN */
336 			s1 = 0x2171;
337 		} else if (c == 0xffe1) {	/* FULLWIDTH POUND SIGN */
338 			s1 = 0x2172;
339 		} else if (c == 0xffe2) {	/* FULLWIDTH NOT SIGN */
340 			s1 = 0x224c;
341 		}
342 	}
343 	if ((s1 <= 0) || (s1 >= 0xa1a1 && s2 == 0)) { /* not found or X 0212 */
344 		s1 = -1;
345 		c1 = 0;
346 		c2 = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
347 		while (c1 < c2) {		/* CP932 vendor ext1 (13ku) */
348 			if (c == cp932ext1_ucs_table[c1]) {
349 				s1 = ((c1/94 + 0x2d) << 8) + (c1%94 + 0x21);
350 				break;
351 			}
352 			c1++;
353 		}
354 		if (s1 <= 0) {
355 			c1 = 0;
356 			c2 = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
357 			while (c1 < c2) {		/* CP932 vendor ext3 (115ku - 119ku) */
358 				if (c == cp932ext3_ucs_table[c1]) {
359 					s1 = cp932ext3_cp932ext2_jis(c1);
360 					break;
361 				}
362 				c1++;
363 			}
364 		}
365 		if (c == 0) {
366 			s1 = 0;
367 		} else if (s1 <= 0) {
368 			s1 = -1;
369 		}
370 	}
371 	if (s1 >= 0) {
372 		if (s1 < 0x80) { /* latin */
373 			if ((filter->status & 0xff00) != 0) {
374 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
375 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
376 				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
377 			}
378 			CK((*filter->output_function)(s1, filter->data));
379 			filter->status = 0;
380 		} else if (s1 > 0xa0 && s1 < 0xe0) { /* kana */
381 			if ((filter->status & 0xff00) != 0x100) {
382 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
383 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
384 				CK((*filter->output_function)(0x49, filter->data));		/* 'I' */
385 			}
386 			filter->status = 0x100;
387 			CK((*filter->output_function)(s1 & 0x7f, filter->data));
388 		} else if (s1 < 0x7e7f) { /* X 0208 */
389 			if ((filter->status & 0xff00) != 0x200) {
390 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
391 				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
392 				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
393 			}
394 			filter->status = 0x200;
395 			CK((*filter->output_function)((s1 >> 8) & 0xff, filter->data));
396 			CK((*filter->output_function)(s1 & 0x7f, filter->data));
397 		} else if (s1 < 0x927f) { /* UDC */
398 			if ((filter->status & 0xff00) != 0x800) {
399 				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
400 				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
401 				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
402 				CK((*filter->output_function)(0x3f, filter->data));		/* '?' */
403 			}
404 			filter->status = 0x800;
405 			CK((*filter->output_function)(((s1 >> 8) - 0x5e) & 0x7f, filter->data));
406 			CK((*filter->output_function)(s1 & 0x7f, filter->data));
407 		}
408 	} else {
409 		if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
410 			CK(mbfl_filt_conv_illegal_output(c, filter));
411 		}
412 	}
413 
414 	return c;
415 }
416 
417 int
mbfl_filt_conv_any_2022jpms_flush(mbfl_convert_filter * filter)418 mbfl_filt_conv_any_2022jpms_flush(mbfl_convert_filter *filter)
419 {
420 	/* back to latin */
421 	if ((filter->status & 0xff00) != 0) {
422 		CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
423 		CK((*filter->output_function)(0x28, filter->data));		/* '(' */
424 		CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
425 	}
426 
427 	filter->status &= 0xff;
428 
429 	if (filter->flush_function != NULL) {
430 		return (*filter->flush_function)(filter->data);
431 	}
432 
433 	return 0;
434 }
435 
/*
 * Identification filter for ISO-2022-JP-MS.
 *
 * Follows the same escape-sequence state machine as the decoder
 * (filter->status: upper bits = designated character set, low nibble =
 * parser position) but produces no output.  filter->flag is set to 1 as
 * soon as a byte cannot belong to this encoding: a byte outside 0..0x7f in
 * the idle state, an invalid second byte of a double-byte character, or an
 * unknown escape sequence.
 *
 * Always returns c.
 */
int mbfl_filt_ident_2022jpms(int c, mbfl_identify_filter *filter)
{
again:
	switch (filter->status & 0xf) {
	case 0:		/* idle */
		if (c == 0x1b) {
			filter->status += 2;
			break;
		}
		if ((filter->status == 0x80 || filter->status == 0xa0) && c > 0x20 && c < 0x80) {
			/* first byte of a double-byte character */
			filter->status += 1;
			break;
		}
		if (c < 0 || c >= 0x80) {
			filter->flag = 1;	/* bad */
		}
		break;

	case 1:		/* second byte of an X 0208 / UDC character */
		filter->status &= ~0xf;
		if (c == 0x1b) {
			goto again;
		}
		if (c < 0x21 || c > 0x7e) {
			filter->flag = 1;	/* bad */
		}
		break;

	case 2:		/* ESC */
		switch (c) {
		case 0x24:	/* '$' */
			filter->status++;
			break;
		case 0x28:	/* '(' */
			filter->status += 3;
			break;
		default:
			goto bad_escape;
		}
		break;

	case 3:		/* ESC $ */
		switch (c) {
		case 0x40:	/* '@' */
		case 0x42:	/* 'B' */
			filter->status = 0x80;
			break;
		case 0x28:	/* '(' */
			filter->status++;
			break;
		default:
			goto bad_escape;
		}
		break;

	case 4:		/* ESC $ ( */
		switch (c) {
		case 0x40:	/* '@' */
		case 0x42:	/* 'B' */
			filter->status = 0x80;
			break;
		case 0x3f:	/* '?' */
			filter->status = 0xa0;
			break;
		default:
			goto bad_escape;
		}
		break;

	case 5:		/* ESC ( */
		switch (c) {
		case 0x42:	/* 'B' */
		case 0x4a:	/* 'J' */
			filter->status = 0;
			break;
		case 0x49:	/* 'I' */
			filter->status = 0x20;
			break;
		default:
			goto bad_escape;
		}
		break;

	default:
		filter->status = 0;
		break;
	}

	return c;

bad_escape:
	/* unknown escape sequence: flag it, then look at the byte again in the idle state */
	filter->flag = 1;
	filter->status &= ~0xf;
	goto again;
}
529