1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
/*
 * The source code in this file was separated from mbfilter_sjis_open.c
 * by Rui Hirokawa <hirokawa@php.net> on 25 July 2011.
 *
 */
29
30 #include "mbfilter.h"
31 #include "mbfilter_sjis_mac.h"
32
33 #include "unicode_table_cp932_ext.h"
34 #include "unicode_table_jis.h"
35
36 #include "sjis_mac2uni.h"
37
38 extern int mbfl_filt_ident_sjis(int c, mbfl_identify_filter *filter);
39 extern const unsigned char mblen_table_sjis[];
40
41 static int mbfl_filt_conv_sjis_mac_flush(mbfl_convert_filter *filter);
42
/* NULL-terminated list of alternative names for this encoding. */
static const char *mbfl_encoding_sjis_mac_aliases[] = {
	"MacJapanese",
	"x-Mac-Japanese",
	NULL
};
44
45 const mbfl_encoding mbfl_encoding_sjis_mac = {
46 mbfl_no_encoding_sjis_mac,
47 "SJIS-mac",
48 "Shift_JIS",
49 (const char *(*)[])&mbfl_encoding_sjis_mac_aliases,
50 mblen_table_sjis,
51 MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE,
52 &vtbl_sjis_mac_wchar,
53 &vtbl_wchar_sjis_mac
54 };
55
56 const struct mbfl_identify_vtbl vtbl_identify_sjis_mac = {
57 mbfl_no_encoding_sjis_mac,
58 mbfl_filt_ident_common_ctor,
59 mbfl_filt_ident_sjis
60 };
61
62 const struct mbfl_convert_vtbl vtbl_sjis_mac_wchar = {
63 mbfl_no_encoding_sjis_mac,
64 mbfl_no_encoding_wchar,
65 mbfl_filt_conv_common_ctor,
66 NULL,
67 mbfl_filt_conv_sjis_mac_wchar,
68 mbfl_filt_conv_common_flush,
69 NULL,
70 };
71
72 const struct mbfl_convert_vtbl vtbl_wchar_sjis_mac = {
73 mbfl_no_encoding_wchar,
74 mbfl_no_encoding_sjis_mac,
75 mbfl_filt_conv_common_ctor,
76 NULL,
77 mbfl_filt_conv_wchar_sjis_mac,
78 mbfl_filt_conv_sjis_mac_flush,
79 NULL,
80 };
81
/*
 * Evaluate `statement` once; if its value is negative, make the enclosing
 * function return -1. Only usable inside functions returning int.
 */
#define CK(statement) \
	do { \
		if ((statement) < 0) \
			return (-1); \
	} while (0)
83
/*
 * SJIS_ENCODE(c1, c2, s1, s2)
 *
 * Converts a JIS row/cell byte pair (c1, c2) into a Shift_JIS lead/trail
 * byte pair, stored in the lvalues s1 and s2.
 *
 * Both inputs are evaluated exactly once and copied into block-local
 * variables before either output is written. The macro therefore stays
 * correct when an input expression has side effects or names the same
 * object as an output (e.g. SJIS_ENCODE(a, b, a, b)).
 */
#define SJIS_ENCODE(c1,c2,s1,s2) \
	do { \
		const int sjis_enc_row_ = (c1); \
		const int sjis_enc_cell_ = (c2); \
		(s1) = sjis_enc_row_; \
		(s1)--; \
		(s1) >>= 1; \
		if (sjis_enc_row_ < 0x5f) { \
			(s1) += 0x71; \
		} else { \
			(s1) += 0xb1; \
		} \
		(s2) = sjis_enc_cell_; \
		if (sjis_enc_row_ & 1) { \
			/* odd row: lower half of the trail-byte range, skipping 0x7f */ \
			if (sjis_enc_cell_ < 0x60) { \
				(s2)--; \
			} \
			(s2) += 0x20; \
		} else { \
			/* even row: upper half of the trail-byte range */ \
			(s2) += 0x7e; \
		} \
	} while (0)
104
/*
 * SJIS_DECODE(c1, c2, s1, s2)
 *
 * Converts a Shift_JIS lead/trail byte pair (c1, c2) into a JIS row/cell
 * byte pair, stored in the lvalues s1 and s2.
 *
 * Both inputs are evaluated exactly once and copied into block-local
 * variables before either output is written, so an input may safely alias
 * an output or carry side effects.
 *
 * NOTE(review): callers are expected to pass a lead byte >= 0x81; a smaller
 * value makes the intermediate negative before the left shift.
 */
#define SJIS_DECODE(c1,c2,s1,s2) \
	do { \
		const int sjis_dec_lead_ = (c1); \
		const int sjis_dec_trail_ = (c2); \
		(s1) = sjis_dec_lead_; \
		if (sjis_dec_lead_ < 0xa0) { \
			(s1) -= 0x81; \
		} else { \
			(s1) -= 0xc1; \
		} \
		(s1) <<= 1; \
		(s1) += 0x21; \
		(s2) = sjis_dec_trail_; \
		if (sjis_dec_trail_ < 0x9f) { \
			/* odd row; trail bytes below 0x7f are shifted up by one */ \
			if (sjis_dec_trail_ < 0x7f) { \
				(s2)++; \
			} \
			(s2) -= 0x20; \
		} else { \
			/* even row */ \
			(s1)++; \
			(s2) -= 0x7e; \
		} \
	} while (0)
126
127 /*
128 * SJIS-mac => wchar
129 */
130 int
mbfl_filt_conv_sjis_mac_wchar(int c,mbfl_convert_filter * filter)131 mbfl_filt_conv_sjis_mac_wchar(int c, mbfl_convert_filter *filter)
132 {
133 int i, j, n;
134 int c1, s, s1, s2, w;
135
136 switch (filter->status) {
137 case 0:
138 if (c >= 0 && c < 0x80 && c != 0x5c) { /* latin */
139 CK((*filter->output_function)(c, filter->data));
140 } else if (c > 0xa0 && c < 0xe0) { /* kana */
141 CK((*filter->output_function)(0xfec0 + c, filter->data));
142 } else if (c > 0x80 && c < 0xfd && c != 0xa0) { /* kanji first char */
143 filter->status = 1;
144 filter->cache = c;
145 } else if (c == 0x5c) {
146 CK((*filter->output_function)(0x00a5, filter->data));
147 } else if (c == 0x80) {
148 CK((*filter->output_function)(0x005c, filter->data));
149 } else if (c == 0xa0) {
150 CK((*filter->output_function)(0x00a0, filter->data));
151 } else if (c == 0xfd) {
152 CK((*filter->output_function)(0x00a9, filter->data));
153 } else if (c == 0xfe) {
154 CK((*filter->output_function)(0x2122, filter->data));
155 } else if (c == 0xff) {
156 CK((*filter->output_function)(0x2026, filter->data));
157 CK((*filter->output_function)(0xf87f, filter->data));
158 } else {
159 w = c & MBFL_WCSGROUP_MASK;
160 w |= MBFL_WCSGROUP_THROUGH;
161 CK((*filter->output_function)(w, filter->data));
162 }
163 break;
164
165 case 1: /* kanji second char */
166 filter->status = 0;
167 c1 = filter->cache;
168 if (c >= 0x40 && c <= 0xfc && c != 0x7f) {
169 w = 0;
170 SJIS_DECODE(c1, c, s1, s2);
171 s = (s1 - 0x21)*94 + s2 - 0x21;
172 if (s <= 0x89) {
173 if (s == 0x1c) {
174 w = 0x2014; /* EM DASH */
175 } else if (s == 0x1f) {
176 w = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
177 } else if (s == 0x20) {
178 w = 0x301c; /* FULLWIDTH TILDE */
179 } else if (s == 0x21) {
180 w = 0x2016; /* PARALLEL TO */
181 } else if (s == 0x3c) {
182 w = 0x2212; /* FULLWIDTH HYPHEN-MINUS */
183 } else if (s == 0x50) {
184 w = 0x00a2; /* FULLWIDTH CENT SIGN */
185 } else if (s == 0x51) {
186 w = 0x00a3; /* FULLWIDTH POUND SIGN */
187 } else if (s == 0x89) {
188 w = 0x00ac; /* FULLWIDTH NOT SIGN */
189 }
190 }
191
192 /* apple gaiji area 0x8540 - 0x886d */
193 if (w == 0) {
194 for (i=0; i<7; i++) {
195 if (s >= code_tbl[i][0] && s <= code_tbl[i][1]) {
196 w = s - code_tbl[i][0] + code_tbl[i][2];
197 break;
198 }
199 }
200 }
201
202 if (w == 0) {
203
204 for (i=0; i<code_tbl_m_len; i++) {
205 if (s == code_tbl_m[i][0]) {
206 if (code_tbl_m[i][1] == 0xf860) {
207 n = 4;
208 } else if (code_tbl_m[i][1] == 0xf861) {
209 n = 5;
210 } else {
211 n = 6;
212 }
213 for (j=1; j<n-1; j++) {
214 CK((*filter->output_function)(code_tbl_m[i][j], filter->data));
215 }
216 w = code_tbl_m[i][n-1];
217 break;
218 }
219 }
220 }
221
222 if (w == 0) {
223 for (i=0; i<8; i++) {
224 if (s >= code_ofst_tbl[i][0] && s <= code_ofst_tbl[i][1]) {
225 w = code_map[i][s - code_ofst_tbl[i][0]];
226 s2 = 0;
227 if (s >= 0x043e && s <= 0x0441) {
228 s2 = 0xf87a;
229 } else if (s == 0x03b1 || s == 0x03b7) {
230 s2 = 0xf87f;
231 } else if (s == 0x04b8 || s == 0x04b9 || s == 0x04c4) {
232 s2 = 0x20dd;
233 } else if (s == 0x1ed9 || s == 0x1eda || s == 0x1ee8 || s == 0x1ef3 ||
234 (s >= 0x1ef5 && s <= 0x1efb) || s == 0x1f05 || s == 0x1f06 ||
235 s == 0x1f18 || (s >= 0x1ff2 && s <= 0x20a5)) {
236 s2 = 0xf87e;
237 }
238 if (s2 > 0) {
239 CK((*filter->output_function)(w, filter->data));
240 w = s2;
241 }
242 break;
243 }
244 }
245 }
246
247 if (w == 0 && s >= 0 && s < jisx0208_ucs_table_size) { /* X 0208 */
248 w = jisx0208_ucs_table[s];
249 }
250
251 if (w <= 0) {
252 w = (s1 << 8) | s2;
253 w &= MBFL_WCSPLANE_MASK;
254 w |= MBFL_WCSPLANE_WINCP932;
255 }
256 CK((*filter->output_function)(w, filter->data));
257 } else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
258 CK((*filter->output_function)(c, filter->data));
259 } else {
260 w = (c1 << 8) | c;
261 w &= MBFL_WCSGROUP_MASK;
262 w |= MBFL_WCSGROUP_THROUGH;
263 CK((*filter->output_function)(w, filter->data));
264 }
265 break;
266
267 default:
268 filter->status = 0;
269 break;
270 }
271
272 return c;
273 }
274
275 /*
276 * wchar => SJIS-mac
277 */
278 int
mbfl_filt_conv_wchar_sjis_mac(int c,mbfl_convert_filter * filter)279 mbfl_filt_conv_wchar_sjis_mac(int c, mbfl_convert_filter *filter)
280 {
281 int i;
282 int c1, c2, s1, s2, mode;
283
284 s1 = 0;
285 s2 = 0;
286
287 // a1: U+0000 -> U+046F
288 // a2: U+2000 -> U+30FF
289 // i: U+4E00 -> U+9FFF
290 // r: U+FF00 -> U+FFFF
291
292 switch (filter->status) {
293
294 case 1:
295 c1 = filter->cache;
296 filter->cache = 0;
297 filter->status = 0;
298
299 s1 = 0;
300 s2 = 0;
301
302 if (c == 0xf87a) {
303 for (i=0;i<4;i++) {
304 if (c1 == s_form_tbl[i+34+3+3]) {
305 s1 = s_form_sjis_tbl[i+34+3+3];
306 break;
307 }
308 }
309 if (s1 <= 0) {
310 s2 = c1;
311 }
312 } else if (c == 0x20dd) {
313 for (i=0;i<3;i++) {
314 if (c1 == s_form_tbl[i+34+3]) {
315 s1 = s_form_sjis_tbl[i+34+3];
316 break;
317 }
318 }
319 if (s1 <= 0) {
320 s2 = c1;
321 }
322 } else if (c == 0xf87f) {
323 for (i=0;i<3;i++) {
324 if (c1 == s_form_tbl[i+34]) {
325 s1 = s_form_sjis_tbl[i+34];
326 break;
327 }
328 }
329 if (s1 <= 0) {
330 s2 = c1; s1 = -1;
331 }
332 } else if (c == 0xf87e) {
333 for (i=0;i<34;i++) {
334 if (c1 == s_form_tbl[i]) {
335 s1 = s_form_sjis_tbl[i];
336 break;
337 }
338 }
339 if (s1 <= 0) {
340 s2 = c1; s1 = -1;
341 }
342 } else {
343 s2 = c1;
344 s1 = c;
345 }
346
347 if (s2 > 0) {
348 for (i=0;i<s_form_tbl_len;i++) {
349 if (c1 == s_form_tbl[i]) {
350 s1 = s_form_sjis_fallback_tbl[i];
351 break;
352 }
353 }
354 }
355
356 if (s1 >= 0) {
357 if (s1 < 0x100) {
358 CK((*filter->output_function)(s1, filter->data));
359 } else {
360 CK((*filter->output_function)((s1 >> 8) & 0xff, filter->data));
361 CK((*filter->output_function)(s1 & 0xff, filter->data));
362 }
363 } else {
364 CK(mbfl_filt_conv_illegal_output(c, filter));
365 }
366
367 if (s2 <= 0 || s1 == -1) {
368 break;
369 }
370
371 case 0:
372
373 if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
374 s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
375 if (c == 0x5c) {
376 s1 = 0x80;
377 } else if (c == 0xa9) {
378 s1 = 0xfd;
379 }
380 } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
381 s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
382 if (c == 0x2122) {
383 s1 = 0xfe;
384 } else if (c == 0x2014) {
385 s1 = 0x213d;
386 } else if (c == 0x2116) {
387 s1 = 0x2c1d;
388 }
389 } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
390 s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
391 } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
392 s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
393 }
394
395 if (c >= 0x2000) {
396 for (i=0;i<s_form_tbl_len;i++) {
397 if (c == s_form_tbl[i]) {
398 filter->status = 1;
399 filter->cache = c;
400 return c;
401 }
402 }
403
404 if (c == 0xf860 || c == 0xf861 || c == 0xf862) {
405 filter->status = 2;
406 filter->cache = c;
407 return c;
408 }
409 }
410
411 if (s1 <= 0) {
412 c1 = c & ~MBFL_WCSPLANE_MASK;
413 if (c1 == MBFL_WCSPLANE_WINCP932) {
414 s1 = c & MBFL_WCSPLANE_MASK;
415 s2 = 1;
416 } else if (c1 == MBFL_WCSPLANE_JIS0208) {
417 s1 = c & MBFL_WCSPLANE_MASK;
418 } else if (c1 == MBFL_WCSPLANE_JIS0212) {
419 s1 = c & MBFL_WCSPLANE_MASK;
420 s1 |= 0x8080;
421 } else if (c == 0xa0) {
422 s1 = 0x00a0;
423 } else if (c == 0xa5) { /* YEN SIGN */
424 s1 = 0x216f; /* FULLWIDTH YEN SIGN */
425 } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
426 s1 = 0x2140;
427 }
428 }
429
430 if (s1 <= 0) {
431 for (i=0; i<wchar2sjis_mac_r_tbl_len; i++) {
432 if (c >= wchar2sjis_mac_r_tbl[i][0] && c <= wchar2sjis_mac_r_tbl[i][1]) {
433 s1 = c - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2];
434 break;
435 }
436 }
437
438 if (s1 <= 0) {
439 for (i=0; i<wchar2sjis_mac_r_map_len; i++) {
440 if (c >= wchar2sjis_mac_r_map[i][0] && c <= wchar2sjis_mac_r_map[i][1]) {
441 s1 = wchar2sjis_mac_code_map[i][c-wchar2sjis_mac_r_map[i][0]];
442 break;
443 }
444 }
445 }
446
447 if (s1 <= 0) {
448 for (i=0; i<wchar2sjis_mac_wchar_tbl_len ; i++) {
449 if ( c == wchar2sjis_mac_wchar_tbl[i][0]) {
450 s1 = wchar2sjis_mac_wchar_tbl[i][1] & 0xffff;
451 break;
452 }
453 }
454 }
455
456 if (s1 > 0) {
457 c1 = s1/94+0x21;
458 c2 = s1-94*(c1-0x21)+0x21;
459 s1 = (c1 << 8) | c2;
460 s2 = 1;
461 }
462 }
463
464 if ((s1 <= 0) || (s1 >= 0x8080 && s2 == 0)) { /* not found or X 0212 */
465 s1 = -1;
466 c1 = 0;
467
468 if (c == 0) {
469 s1 = 0;
470 } else if (s1 <= 0) {
471 s1 = -1;
472 }
473 }
474
475 if (s1 >= 0) {
476 if (s1 < 0x100) { /* latin or kana */
477 CK((*filter->output_function)(s1, filter->data));
478 } else { /* kanji */
479 c1 = (s1 >> 8) & 0xff;
480 c2 = s1 & 0xff;
481 SJIS_ENCODE(c1, c2, s1, s2);
482 CK((*filter->output_function)(s1, filter->data));
483 CK((*filter->output_function)(s2, filter->data));
484 }
485 } else {
486 CK(mbfl_filt_conv_illegal_output(c, filter));
487 }
488 break;
489
490
491 case 2:
492 c1 = filter->cache;
493 filter->cache = 0;
494 filter->status = 0;
495 if (c1 == 0xf860) {
496 for (i=0; i<5; i++) {
497 if (c == code_tbl_m[i][2]) {
498 filter->cache = c | 0x10000;
499 filter->status = 3;
500 break;
501 }
502 }
503 } else if (c1 == 0xf861) {
504 for (i=0; i<3; i++) {
505 if (c == code_tbl_m[i+5][2]) {
506 filter->cache = c | 0x20000;
507 filter->status = 3;
508 break;
509 }
510 }
511 } else if (c1 == 0xf862) {
512 for (i=0; i<4; i++) {
513 if (c == code_tbl_m[i+5+3][2]) {
514 filter->cache = c | 0x40000;
515 filter->status = 3;
516 break;
517 }
518 }
519 }
520
521 if (filter->status == 0) {
522 CK(mbfl_filt_conv_illegal_output(c1, filter));
523 CK(mbfl_filt_conv_illegal_output(c, filter));
524 }
525
526 break;
527
528 case 3:
529 s1 = 0;
530 c1 = filter->cache & 0xffff;
531 mode = (filter->cache & 0xf0000) >> 16;
532
533 filter->cache = 0;
534 filter->status = 0;
535
536 if (mode == 0x1) {
537 for (i=0; i<5; i++) {
538 if (c1 == code_tbl_m[i][2] && c == code_tbl_m[i][3]) {
539 s1 = code_tbl_m[i][0];
540 break;
541 }
542 }
543
544 if (s1 > 0) {
545 c1 = s1/94+0x21;
546 c2 = s1-94*(c1-0x21)+0x21;
547 SJIS_ENCODE(c1, c2, s1, s2);
548 CK((*filter->output_function)(s1, filter->data));
549 CK((*filter->output_function)(s2, filter->data));
550 }
551
552 if (s1 <= 0) {
553 CK(mbfl_filt_conv_illegal_output(0xf860, filter));
554 CK(mbfl_filt_conv_illegal_output(c1, filter));
555 CK(mbfl_filt_conv_illegal_output(c, filter));
556 }
557
558 } else if (mode == 0x2) {
559 for (i=0; i<3; i++) {
560 if (c1 == code_tbl_m[i+5][2] && c == code_tbl_m[i+5][3]) {
561 filter->cache = c | 0x20000;
562 filter->status = 4;
563 break;
564 }
565 }
566 } else if (mode == 0x4) {
567 for (i=0; i<4; i++) {
568 if (c1 == code_tbl_m[i+8][2] && c == code_tbl_m[i+8][3]) {
569 filter->cache = c | 0x40000;
570 filter->status = 4;
571 break;
572 }
573 }
574 }
575 break;
576
577 case 4:
578 s1 = 0;
579 c1 = filter->cache & 0xffff;
580 mode = (filter->cache & 0xf0000) >> 16;
581
582 filter->cache = 0;
583 filter->status = 0;
584
585 if (mode == 0x2) {
586 for (i=0; i<3; i++) {
587 if (c1 == code_tbl_m[i+5][3] && c == code_tbl_m[i+5][4]) {
588 s1 = code_tbl_m[i+5][0];
589 break;
590 }
591 }
592
593 if (s1 > 0) {
594 c1 = s1/94+0x21;
595 c2 = s1-94*(c1-0x21)+0x21;
596 SJIS_ENCODE(c1, c2, s1, s2);
597 CK((*filter->output_function)(s1, filter->data));
598 CK((*filter->output_function)(s2, filter->data));
599 }
600
601 if (s1 <= 0) {
602 CK(mbfl_filt_conv_illegal_output(0xf861, filter));
603 for (i=0; i<3; i++) {
604 if (c1 == code_tbl_m[i+5][3]) {
605 CK(mbfl_filt_conv_illegal_output(code_tbl_m[i+5][2], filter));
606 break;
607 }
608 }
609 CK(mbfl_filt_conv_illegal_output(c1, filter));
610 CK(mbfl_filt_conv_illegal_output(c, filter));
611 }
612 } else if (mode == 0x4) {
613 for (i=0; i<4; i++) {
614 if (c1 == code_tbl_m[i+8][3] && c == code_tbl_m[i+8][4]) {
615 filter->cache = c | 0x40000;
616 filter->status = 5;
617 break;
618 }
619 }
620 }
621 break;
622
623 case 5:
624 s1 = 0;
625 c1 = filter->cache & 0xffff;
626 mode = (filter->cache & 0xf0000) >> 16;
627
628 filter->cache = 0;
629 filter->status = 0;
630
631 if (mode == 0x4) {
632 for (i=0; i<4; i++) {
633 if (c1 == code_tbl_m[i+8][4] && c == code_tbl_m[i+8][5]) {
634 s1 = code_tbl_m[i+8][0];
635 break;
636 }
637 }
638
639 if (s1 > 0) {
640 c1 = s1/94+0x21;
641 c2 = s1-94*(c1-0x21)+0x21;
642 SJIS_ENCODE(c1, c2, s1, s2);
643 CK((*filter->output_function)(s1, filter->data));
644 CK((*filter->output_function)(s2, filter->data));
645 }
646
647 if (s1 <= 0) {
648 CK(mbfl_filt_conv_illegal_output(0xf862, filter));
649 for (i=0; i<4; i++) {
650 if (c1 == code_tbl_m[i+8][4]) {
651 CK(mbfl_filt_conv_illegal_output( code_tbl_m[i+8][2], filter));
652 CK(mbfl_filt_conv_illegal_output( code_tbl_m[i+8][3], filter));
653 break;
654 }
655 }
656 CK(mbfl_filt_conv_illegal_output(c1, filter));
657 CK(mbfl_filt_conv_illegal_output(c, filter));
658 }
659 }
660 break;
661
662 default:
663 filter->status = 0;
664 break;
665 }
666 return c;
667 }
668
669 static int
mbfl_filt_conv_sjis_mac_flush(mbfl_convert_filter * filter)670 mbfl_filt_conv_sjis_mac_flush(mbfl_convert_filter *filter)
671 {
672 int i, c1, s1 = 0;
673 if (filter->status == 1 && filter->cache > 0) {
674 c1 = filter->cache;
675 for (i=0;i<s_form_tbl_len;i++) {
676 if (c1 == s_form_tbl[i]) {
677 s1 = s_form_sjis_fallback_tbl[i];
678 break;
679 }
680 }
681 if (s1 > 0) {
682 CK((*filter->output_function)((s1 >> 8) & 0xff, filter->data));
683 CK((*filter->output_function)(s1 & 0xff, filter->data));
684 }
685 }
686 filter->cache = 0;
687 filter->status = 0;
688
689 if (filter->flush_function != NULL) {
690 return (*filter->flush_function)(filter->data);
691 }
692
693 return 0;
694 }
695