1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
/*
 * The source code included in this file was separated from mbfilter_ja.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29
30 #include "mbfilter.h"
31 #include "mbfilter_iso2022_jp_ms.h"
32
33 #include "unicode_table_cp932_ext.h"
34 #include "unicode_table_jis.h"
35 #include "cp932_table.h"
36
37 int mbfl_filt_ident_2022jpms(int c, mbfl_identify_filter *filter);
38
39 static const char *mbfl_encoding_2022jpms_aliases[] = {"ISO2022JPMS", NULL};
40
41 const mbfl_encoding mbfl_encoding_2022jpms = {
42 mbfl_no_encoding_2022jpms,
43 "ISO-2022-JP-MS",
44 "ISO-2022-JP",
45 (const char *(*)[])&mbfl_encoding_2022jpms_aliases,
46 NULL,
47 MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE,
48 &vtbl_2022jpms_wchar,
49 &vtbl_wchar_2022jpms
50 };
51
52 const struct mbfl_identify_vtbl vtbl_identify_2022jpms = {
53 mbfl_no_encoding_2022jpms,
54 mbfl_filt_ident_common_ctor,
55 mbfl_filt_ident_2022jpms
56 };
57
58 const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {
59 mbfl_no_encoding_2022jpms,
60 mbfl_no_encoding_wchar,
61 mbfl_filt_conv_common_ctor,
62 NULL,
63 mbfl_filt_conv_2022jpms_wchar,
64 mbfl_filt_conv_common_flush,
65 NULL,
66 };
67
68 const struct mbfl_convert_vtbl vtbl_wchar_2022jpms = {
69 mbfl_no_encoding_wchar,
70 mbfl_no_encoding_2022jpms,
71 mbfl_filt_conv_common_ctor,
72 NULL,
73 mbfl_filt_conv_wchar_2022jpms,
74 mbfl_filt_conv_any_2022jpms_flush,
75 NULL,
76 };
77
/*
 * Evaluate `statement`; when its value is negative, return -1 from the
 * enclosing function.  Only usable inside functions returning int.
 */
#define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)

/*
 * Map a Shift_JIS style lead/trail byte pair (c1, c2) to a linear index with
 * 188 cells per lead byte.
 *   - lead bytes up to 0x9f are counted from 0x81, lead bytes above 0x9f from
 *     0xc1, so the index continues without a gap from lead 0x9f to lead 0xe0;
 *   - trail bytes up to 0x7e are counted from 0x40, trail bytes above 0x7e
 *     from 0x41, i.e. trail byte 0x7f occupies no cell.
 * Both arguments are evaluated more than once; pass side-effect free
 * expressions only.
 */
#define sjistoidx(c1, c2) \
	(((c1) > 0x9f) \
	? (((c1) - 0xc1) * 188 + (c2) - (((c2) > 0x7e) ? 0x41 : 0x40)) \
	: (((c1) - 0x81) * 188 + (c2) - (((c2) > 0x7e) ? 0x41 : 0x40)))

/*
 * Split a linear index into the two bytes of a 94x94 code:
 * idxtojis1 yields the first byte (row + 0x21), idxtojis2 the second
 * (cell + 0x21).
 */
#define idxtojis1(c) (((c) / 94) + 0x21)
#define idxtojis2(c) (((c) % 94) + 0x21)
86
87 /*
88 * ISO-2022-JP-MS => wchar
89 */
90 int
mbfl_filt_conv_2022jpms_wchar(int c,mbfl_convert_filter * filter)91 mbfl_filt_conv_2022jpms_wchar(int c, mbfl_convert_filter *filter)
92 {
93 int c1, s, w;
94
95 retry:
96 switch (filter->status & 0xf) {
97 /* case 0x00: ASCII */
98 /* case 0x10: X 0201 latin */
99 /* case 0x20: X 0201 kana */
100 /* case 0x80: X 0208 */
101 /* case 0xa0: UDC */
102 case 0:
103 if (c == 0x1b) {
104 filter->status += 2;
105 } else if (filter->status == 0x20 && c > 0x20 && c < 0x60) { /* kana */
106 CK((*filter->output_function)(0xff40 + c, filter->data));
107 } else if ((filter->status == 0x80 || filter->status == 0xa0) && c > 0x20 && c < 0x80) { /* kanji first char */
108 filter->cache = c;
109 filter->status += 1;
110 } else if (c >= 0 && c < 0x80) { /* latin, CTLs */
111 CK((*filter->output_function)(c, filter->data));
112 } else if (c > 0xa0 && c < 0xe0) { /* GR kana */
113 CK((*filter->output_function)(0xfec0 + c, filter->data));
114 } else {
115 w = c & MBFL_WCSGROUP_MASK;
116 w |= MBFL_WCSGROUP_THROUGH;
117 CK((*filter->output_function)(w, filter->data));
118 }
119 break;
120
121 /* case 0x81: X 0208 second char */
122 /* case 0xa1: UDC second char */
123 case 1:
124 w = 0;
125 filter->status &= ~0xf;
126 c1 = filter->cache;
127 if (c > 0x20 && c < 0x7f) {
128 s = (c1 - 0x21)*94 + c - 0x21;
129 if (filter->status == 0x80) {
130 if (s <= 137) {
131 if (s == 31) {
132 w = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
133 } else if (s == 32) {
134 w = 0xff5e; /* FULLWIDTH TILDE */
135 } else if (s == 33) {
136 w = 0x2225; /* PARALLEL TO */
137 } else if (s == 60) {
138 w = 0xff0d; /* FULLWIDTH HYPHEN-MINUS */
139 } else if (s == 80) {
140 w = 0xffe0; /* FULLWIDTH CENT SIGN */
141 } else if (s == 81) {
142 w = 0xffe1; /* FULLWIDTH POUND SIGN */
143 } else if (s == 137) {
144 w = 0xffe2; /* FULLWIDTH NOT SIGN */
145 }
146 }
147 if (w == 0) {
148 if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { /* vendor ext1 (13ku) */
149 w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
150 } else if (s >= 0 && s < jisx0208_ucs_table_size) {
151 w = jisx0208_ucs_table[s];
152 } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) { /* vendor ext2 (89ku - 92ku) */
153 w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
154 } else {
155 w = 0;
156 }
157 }
158 if (w <= 0) {
159 w = (c1 << 8) | c;
160 w &= MBFL_WCSPLANE_MASK;
161 w |= MBFL_WCSPLANE_JIS0208;
162 }
163 CK((*filter->output_function)(w, filter->data));
164 } else {
165 if (c1 > 0x20 && c1 < 0x35) {
166 w = 0xe000 + (c1 - 0x21)*94 + c - 0x21;
167 }
168 if (w <= 0) {
169 w = (((c1 - 0x21) + 0x7f) << 8) | c;
170 w &= MBFL_WCSPLANE_MASK;
171 w |= MBFL_WCSPLANE_JIS0208;
172 }
173 CK((*filter->output_function)(w, filter->data));
174 }
175 } else if (c == 0x1b) {
176 filter->status += 2;
177 } else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
178 CK((*filter->output_function)(c, filter->data));
179 } else {
180 w = (c1 << 8) | c;
181 w &= MBFL_WCSGROUP_MASK;
182 w |= MBFL_WCSGROUP_THROUGH;
183 CK((*filter->output_function)(w, filter->data));
184 }
185 break;
186
187 /* ESC */
188 /* case 0x02: */
189 /* case 0x12: */
190 /* case 0x22: */
191 /* case 0x82: */
192 /* case 0xa2: */
193 case 2:
194 if (c == 0x24) { /* '$' */
195 filter->status++;
196 } else if (c == 0x28) { /* '(' */
197 filter->status += 3;
198 } else {
199 filter->status &= ~0xf;
200 CK((*filter->output_function)(0x1b, filter->data));
201 goto retry;
202 }
203 break;
204
205 /* ESC $ */
206 /* case 0x03: */
207 /* case 0x13: */
208 /* case 0x23: */
209 /* case 0x83: */
210 /* case 0xa3: */
211 case 3:
212 if (c == 0x40 || c == 0x42) { /* '@' or 'B' */
213 filter->status = 0x80;
214 } else if (c == 0x28) { /* '(' */
215 filter->status++;
216 } else {
217 filter->status &= ~0xf;
218 CK((*filter->output_function)(0x1b, filter->data));
219 CK((*filter->output_function)(0x24, filter->data));
220 goto retry;
221 }
222 break;
223
224 /* ESC $ ( */
225 /* case 0x04: */
226 /* case 0x14: */
227 /* case 0x24: */
228 /* case 0x84: */
229 /* case 0xa4: */
230 case 4:
231 if (c == 0x40 || c == 0x42) { /* '@' or 'B' */
232 filter->status = 0x80;
233 } else if (c == 0x3f) { /* '?' */
234 filter->status = 0xa0;
235 } else {
236 filter->status &= ~0xf;
237 CK((*filter->output_function)(0x1b, filter->data));
238 CK((*filter->output_function)(0x24, filter->data));
239 CK((*filter->output_function)(0x28, filter->data));
240 goto retry;
241 }
242 break;
243
244 /* ESC ( */
245 /* case 0x05: */
246 /* case 0x15: */
247 /* case 0x25: */
248 /* case 0x85: */
249 /* case 0xa5: */
250 case 5:
251 if (c == 0x42) { /* 'B' */
252 filter->status = 0;
253 } else if (c == 0x4a) { /* 'J' */
254 filter->status = 0;
255 } else if (c == 0x49) { /* 'I' */
256 filter->status = 0x20;
257 } else {
258 filter->status &= ~0xf;
259 CK((*filter->output_function)(0x1b, filter->data));
260 CK((*filter->output_function)(0x28, filter->data));
261 goto retry;
262 }
263 break;
264
265 default:
266 filter->status = 0;
267 break;
268 }
269
270 return c;
271 }
272
/*
 * Translate an offset `c` counted from Shift_JIS position 0xfa40 (the caller
 * in this file passes an index into cp932ext3_ucs_table) into a two-byte
 * code in the 94x94 layout.
 *
 * The linear index is relocated in three ranges:
 *   index >= sjistoidx(0xfa, 0x5c)  -> range starting at sjistoidx(0xed, 0x40)
 *   index >= sjistoidx(0xfa, 0x55)  -> range starting at sjistoidx(0xee, 0xfa)
 *   index >= sjistoidx(0xfa, 0x40)  -> range starting at sjistoidx(0xee, 0xef)
 *
 * Returns (first byte << 8) | second byte, as produced by idxtojis1 and
 * idxtojis2.
 */
static int
cp932ext3_cp932ext2_jis(int c)
{
	int idx;

	idx = sjistoidx(0xfa, 0x40) + c;
	if (idx >= sjistoidx(0xfa, 0x5c)) {
		idx -= sjistoidx(0xfa, 0x5c) - sjistoidx(0xed, 0x40);
	} else if (idx >= sjistoidx(0xfa, 0x55)) {
		idx -= sjistoidx(0xfa, 0x55) - sjistoidx(0xee, 0xfa);
	} else if (idx >= sjistoidx(0xfa, 0x40)) {
		idx -= sjistoidx(0xfa, 0x40) - sjistoidx(0xee, 0xef);
	}
	return (idxtojis1(idx) << 8) | idxtojis2(idx);
}
287
288 /*
289 * wchar => ISO-2022-JP-MS
290 */
291 int
mbfl_filt_conv_wchar_2022jpms(int c,mbfl_convert_filter * filter)292 mbfl_filt_conv_wchar_2022jpms(int c, mbfl_convert_filter *filter)
293 {
294 int c1, c2, s1, s2;
295
296 s1 = 0;
297 s2 = 0;
298 if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
299 s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
300 } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
301 s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
302 } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
303 s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
304 } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
305 s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
306 } else if (c >= 0xe000 && c < (0xe000 + 20*94)) { /* user (95ku - 114ku) */
307 s1 = c - 0xe000;
308 c1 = s1/94 + 0x7f;
309 c2 = s1%94 + 0x21;
310 s1 = (c1 << 8) | c2;
311 }
312 if (s1 <= 0) {
313 c1 = c & ~MBFL_WCSPLANE_MASK;
314 if (c1 == MBFL_WCSPLANE_WINCP932) {
315 s1 = c & MBFL_WCSPLANE_MASK;
316 s2 = 1;
317 } else if (c1 == MBFL_WCSPLANE_JIS0208) {
318 s1 = c & MBFL_WCSPLANE_MASK;
319 } else if (c1 == MBFL_WCSPLANE_JIS0212) {
320 s1 = c & MBFL_WCSPLANE_MASK;
321 s1 |= 0x8080;
322 } else if (c == 0xa5) { /* YEN SIGN */
323 s1 = 0x216f; /* FULLWIDTH YEN SIGN */
324 } else if (c == 0x203e) { /* OVER LINE */
325 s1 = 0x2131; /* FULLWIDTH MACRON */
326 } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
327 s1 = 0x2140;
328 } else if (c == 0xff5e) { /* FULLWIDTH TILDE */
329 s1 = 0x2141;
330 } else if (c == 0x2225) { /* PARALLEL TO */
331 s1 = 0x2142;
332 } else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
333 s1 = 0x215d;
334 } else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
335 s1 = 0x2171;
336 } else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
337 s1 = 0x2172;
338 } else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
339 s1 = 0x224c;
340 }
341 }
342 if ((s1 <= 0) || (s1 >= 0xa1a1 && s2 == 0)) { /* not found or X 0212 */
343 s1 = -1;
344 c1 = 0;
345 c2 = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
346 while (c1 < c2) { /* CP932 vendor ext1 (13ku) */
347 if (c == cp932ext1_ucs_table[c1]) {
348 s1 = ((c1/94 + 0x2d) << 8) + (c1%94 + 0x21);
349 break;
350 }
351 c1++;
352 }
353 if (s1 <= 0) {
354 c1 = 0;
355 c2 = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
356 while (c1 < c2) { /* CP932 vendor ext3 (115ku - 119ku) */
357 if (c == cp932ext3_ucs_table[c1]) {
358 s1 = cp932ext3_cp932ext2_jis(c1);
359 break;
360 }
361 c1++;
362 }
363 }
364 if (c == 0) {
365 s1 = 0;
366 } else if (s1 <= 0) {
367 s1 = -1;
368 }
369 }
370 if (s1 >= 0) {
371 if (s1 < 0x80) { /* latin */
372 if ((filter->status & 0xff00) != 0) {
373 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
374 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
375 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
376 }
377 CK((*filter->output_function)(s1, filter->data));
378 filter->status = 0;
379 } else if (s1 > 0xa0 && s1 < 0xe0) { /* kana */
380 if ((filter->status & 0xff00) != 0x100) {
381 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
382 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
383 CK((*filter->output_function)(0x49, filter->data)); /* 'I' */
384 }
385 filter->status = 0x100;
386 CK((*filter->output_function)(s1 & 0x7f, filter->data));
387 } else if (s1 < 0x7e7f) { /* X 0208 */
388 if ((filter->status & 0xff00) != 0x200) {
389 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
390 CK((*filter->output_function)(0x24, filter->data)); /* '$' */
391 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
392 }
393 filter->status = 0x200;
394 CK((*filter->output_function)((s1 >> 8) & 0xff, filter->data));
395 CK((*filter->output_function)(s1 & 0x7f, filter->data));
396 } else if (s1 < 0x927f) { /* UDC */
397 if ((filter->status & 0xff00) != 0x800) {
398 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
399 CK((*filter->output_function)(0x24, filter->data)); /* '$' */
400 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
401 CK((*filter->output_function)(0x3f, filter->data)); /* '?' */
402 }
403 filter->status = 0x800;
404 CK((*filter->output_function)(((s1 >> 8) - 0x5e) & 0x7f, filter->data));
405 CK((*filter->output_function)(s1 & 0x7f, filter->data));
406 }
407 } else {
408 CK(mbfl_filt_conv_illegal_output(c, filter));
409 }
410
411 return c;
412 }
413
414 int
mbfl_filt_conv_any_2022jpms_flush(mbfl_convert_filter * filter)415 mbfl_filt_conv_any_2022jpms_flush(mbfl_convert_filter *filter)
416 {
417 /* back to latin */
418 if ((filter->status & 0xff00) != 0) {
419 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
420 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
421 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
422 }
423
424 filter->status &= 0xff;
425
426 if (filter->flush_function != NULL) {
427 return (*filter->flush_function)(filter->data);
428 }
429
430 return 0;
431 }
432
/*
 * Identify filter for ISO-2022-JP-MS.
 *
 * Feeds one byte `c` through the same escape-sequence state machine as the
 * decoder and sets filter->flag to 1 as soon as a byte cannot belong to this
 * encoding.  Nothing is ever output.
 *
 * filter->status as used by this function:
 *   low nibble  - 0: idle, 1: second byte of a double-byte character,
 *                 2: after ESC, 3: after ESC $, 4: after ESC $ (,
 *                 5: after ESC (
 *   upper bits  - 0x00: ASCII (also after ESC ( J), 0x20: X 0201 kana,
 *                 0x80: X 0208, 0xa0: UDC
 *
 * Always returns c.
 */
int
mbfl_filt_ident_2022jpms(int c, mbfl_identify_filter *filter)
{
retry:
	switch (filter->status & 0xf) {
	/* idle: single byte, or first byte of a double-byte character */
	case 0:
		if (c == 0x1b) {
			filter->status += 2;
		} else if ((filter->status == 0x80 || filter->status == 0xa0) && c > 0x20 && c < 0x80) {		/* kanji first char */
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) {		/* latin, CTLs */
			;
		} else {
			filter->flag = 1;	/* bad */
		}
		break;

	/* second byte of an X 0208 (status 0x81) or UDC (status 0xa1) character */
	case 1:
		filter->status &= ~0xf;
		if (c == 0x1b) {
			/* ESC instead of a second byte: handle it from the idle state */
			goto retry;
		} else if (c < 0x21 || c > 0x7e) {		/* bad */
			filter->flag = 1;
		}
		break;

	/* ESC */
	case 2:
		if (c == 0x24) {		/* '$' */
			filter->status++;
		} else if (c == 0x28) {		/* '(' */
			filter->status += 3;
		} else {
			filter->flag = 1;	/* bad */
			filter->status &= ~0xf;
			goto retry;
		}
		break;

	/* ESC $ */
	case 3:
		if (c == 0x40 || c == 0x42) {		/* '@' or 'B' */
			filter->status = 0x80;
		} else if (c == 0x28) {			/* '(' */
			filter->status++;
		} else {
			filter->flag = 1;	/* bad */
			filter->status &= ~0xf;
			goto retry;
		}
		break;

	/* ESC $ ( */
	case 4:
		if (c == 0x40 || c == 0x42) {		/* '@' or 'B' */
			filter->status = 0x80;
		} else if (c == 0x3f) {			/* '?' */
			filter->status = 0xa0;
		} else {
			filter->flag = 1;	/* bad */
			filter->status &= ~0xf;
			goto retry;
		}
		break;

	/* ESC ( */
	case 5:
		if (c == 0x42) {		/* 'B' */
			filter->status = 0;
		} else if (c == 0x4a) {		/* 'J' */
			filter->status = 0;
		} else if (c == 0x49) {		/* 'I' */
			filter->status = 0x20;
		} else {
			filter->flag = 1;	/* bad */
			filter->status &= ~0xf;
			goto retry;
		}
		break;

	default:
		/* unexpected state value: fall back to the initial state */
		filter->status = 0;
		break;
	}

	return c;
}
526