1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
/*
 * The source code included in this file was separated from mbfilter_ja.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29
30 #include "mbfilter.h"
31 #include "mbfilter_jis.h"
32
33 #include "unicode_table_cp932_ext.h"
34 #include "unicode_table_jis.h"
35
36 static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter);
37 static bool mb_check_iso2022jp(unsigned char *in, size_t in_len);
38 static bool mb_check_jis(unsigned char *in, size_t in_len);
39
40 const mbfl_encoding mbfl_encoding_jis = {
41 mbfl_no_encoding_jis,
42 "JIS",
43 "ISO-2022-JP",
44 NULL,
45 NULL,
46 MBFL_ENCTYPE_GL_UNSAFE,
47 &vtbl_jis_wchar,
48 &vtbl_wchar_jis,
49 mb_check_jis
50 };
51
52 const mbfl_encoding mbfl_encoding_2022jp = {
53 mbfl_no_encoding_2022jp,
54 "ISO-2022-JP",
55 "ISO-2022-JP",
56 NULL,
57 NULL,
58 MBFL_ENCTYPE_GL_UNSAFE,
59 &vtbl_2022jp_wchar,
60 &vtbl_wchar_2022jp,
61 mb_check_iso2022jp
62 };
63
64 const struct mbfl_convert_vtbl vtbl_jis_wchar = {
65 mbfl_no_encoding_jis,
66 mbfl_no_encoding_wchar,
67 mbfl_filt_conv_common_ctor,
68 NULL,
69 mbfl_filt_conv_jis_wchar,
70 mbfl_filt_conv_jis_wchar_flush,
71 NULL,
72 };
73
74 const struct mbfl_convert_vtbl vtbl_wchar_jis = {
75 mbfl_no_encoding_wchar,
76 mbfl_no_encoding_jis,
77 mbfl_filt_conv_common_ctor,
78 NULL,
79 mbfl_filt_conv_wchar_jis,
80 mbfl_filt_conv_any_jis_flush,
81 NULL,
82 };
83
84 const struct mbfl_convert_vtbl vtbl_2022jp_wchar = {
85 mbfl_no_encoding_2022jp,
86 mbfl_no_encoding_wchar,
87 mbfl_filt_conv_common_ctor,
88 NULL,
89 mbfl_filt_conv_jis_wchar,
90 mbfl_filt_conv_jis_wchar_flush,
91 NULL,
92 };
93
94 const struct mbfl_convert_vtbl vtbl_wchar_2022jp = {
95 mbfl_no_encoding_wchar,
96 mbfl_no_encoding_2022jp,
97 mbfl_filt_conv_common_ctor,
98 NULL,
99 mbfl_filt_conv_wchar_2022jp,
100 mbfl_filt_conv_any_jis_flush,
101 NULL,
102 };
103
/*
 * Evaluate `expr` exactly once; if its value is negative, make the enclosing
 * function return -1 immediately. Only usable inside functions returning int.
 */
#define CK(expr) \
	do { \
		if ((expr) < 0) { \
			return (-1); \
		} \
	} while (0)
105
106 /*
107 * JIS => wchar
108 */
109 int
mbfl_filt_conv_jis_wchar(int c,mbfl_convert_filter * filter)110 mbfl_filt_conv_jis_wchar(int c, mbfl_convert_filter *filter)
111 {
112 int c1, s, w;
113
114 retry:
115 switch (filter->status & 0xf) {
116 /* case 0x00: ASCII */
117 /* case 0x10: X 0201 latin */
118 /* case 0x20: X 0201 kana */
119 /* case 0x80: X 0208 */
120 /* case 0x90: X 0212 */
121 case 0:
122 if (c == 0x1b) {
123 filter->status += 2;
124 } else if (c == 0x0e) { /* "kana in" */
125 filter->status = 0x20;
126 } else if (c == 0x0f) { /* "kana out" */
127 filter->status = 0;
128 } else if (filter->status == 0x10 && c == 0x5c) { /* YEN SIGN */
129 CK((*filter->output_function)(0xa5, filter->data));
130 } else if (filter->status == 0x10 && c == 0x7e) { /* OVER LINE */
131 CK((*filter->output_function)(0x203e, filter->data));
132 } else if (filter->status == 0x20 && c > 0x20 && c < 0x60) { /* kana */
133 CK((*filter->output_function)(0xff40 + c, filter->data));
134 } else if ((filter->status == 0x80 || filter->status == 0x90) && c > 0x20 && c < 0x7f) { /* kanji first char */
135 filter->cache = c;
136 filter->status += 1;
137 } else if (c >= 0 && c < 0x80) { /* latin, CTLs */
138 CK((*filter->output_function)(c, filter->data));
139 } else if (c > 0xa0 && c < 0xe0) { /* GR kana */
140 CK((*filter->output_function)(0xfec0 + c, filter->data));
141 } else {
142 CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
143 }
144 break;
145
146 /* case 0x81: X 0208 second char */
147 /* case 0x91: X 0212 second char */
148 case 1:
149 filter->status &= ~0xf;
150 c1 = filter->cache;
151 if (c > 0x20 && c < 0x7f) {
152 s = (c1 - 0x21)*94 + c - 0x21;
153 if (filter->status == 0x80) {
154 if (s >= 0 && s < jisx0208_ucs_table_size) {
155 w = jisx0208_ucs_table[s];
156 } else {
157 w = 0;
158 }
159
160 if (w <= 0) {
161 w = MBFL_BAD_INPUT;
162 }
163 } else {
164 if (s >= 0 && s < jisx0212_ucs_table_size) {
165 w = jisx0212_ucs_table[s];
166 } else {
167 w = 0;
168 }
169
170 if (w <= 0) {
171 w = MBFL_BAD_INPUT;
172 }
173 }
174 CK((*filter->output_function)(w, filter->data));
175 } else {
176 CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
177 }
178 break;
179
180 /* ESC */
181 /* case 0x02: */
182 /* case 0x12: */
183 /* case 0x22: */
184 /* case 0x82: */
185 /* case 0x92: */
186 case 2:
187 if (c == 0x24) { /* '$' */
188 filter->status++;
189 } else if (c == 0x28) { /* '(' */
190 filter->status += 3;
191 } else {
192 filter->status &= ~0xf;
193 CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
194 goto retry;
195 }
196 break;
197
198 /* ESC $ */
199 /* case 0x03: */
200 /* case 0x13: */
201 /* case 0x23: */
202 /* case 0x83: */
203 /* case 0x93: */
204 case 3:
205 if (c == 0x40 || c == 0x42) { /* '@' or 'B' */
206 filter->status = 0x80;
207 } else if (c == 0x28) { /* '(' */
208 filter->status++;
209 } else {
210 filter->status &= ~0xf;
211 CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
212 CK((*filter->output_function)(0x24, filter->data));
213 goto retry;
214 }
215 break;
216
217 /* ESC $ ( */
218 /* case 0x04: */
219 /* case 0x14: */
220 /* case 0x24: */
221 /* case 0x84: */
222 /* case 0x94: */
223 case 4:
224 if (c == 0x40 || c == 0x42) { /* '@' or 'B' */
225 filter->status = 0x80;
226 } else if (c == 0x44) { /* 'D' */
227 filter->status = 0x90;
228 } else {
229 filter->status &= ~0xf;
230 CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
231 CK((*filter->output_function)(0x24, filter->data));
232 CK((*filter->output_function)(0x28, filter->data));
233 goto retry;
234 }
235 break;
236
237 /* ESC ( */
238 /* case 0x05: */
239 /* case 0x15: */
240 /* case 0x25: */
241 /* case 0x85: */
242 /* case 0x95: */
243 case 5:
244 if (c == 0x42 || c == 0x48) { /* 'B' or 'H' */
245 filter->status = 0;
246 } else if (c == 0x4a) { /* 'J' */
247 filter->status = 0x10;
248 } else if (c == 0x49) { /* 'I' */
249 filter->status = 0x20;
250 } else {
251 filter->status &= ~0xf;
252 CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
253 CK((*filter->output_function)(0x28, filter->data));
254 goto retry;
255 }
256 break;
257
258 default:
259 filter->status = 0;
260 break;
261 }
262
263 return 0;
264 }
265
/*
 * End-of-input handler for the JIS / ISO-2022-JP => wchar decoder.
 *
 * A non-zero low nibble in filter->status means the input stopped in the
 * middle of a unit: either after the first byte of a 2-byte (JIS X 0208 or
 * 0212) character, or part-way through an escape sequence. Both are
 * malformed input, so a single MBFL_BAD_INPUT is emitted. The decoder state
 * is then reset and the downstream flush function, if any, is invoked.
 *
 * Returns -1 if the output function reports a negative result; otherwise
 * the downstream flush function's result, or 0 when there is none.
 */
static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status & 0xF) {
		/* truncated 2-byte character or truncated escape sequence */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (filter->flush_function != NULL) {
		/* propagate the downstream result, as mbfl_filt_conv_any_jis_flush does */
		return (*filter->flush_function)(filter->data);
	}

	return 0;
}
280
281 /*
282 * wchar => JIS
283 */
284 int
mbfl_filt_conv_wchar_jis(int c,mbfl_convert_filter * filter)285 mbfl_filt_conv_wchar_jis(int c, mbfl_convert_filter *filter)
286 {
287 int s = 0;
288
289 if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
290 s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
291 } else if (c == 0x203E) { /* OVERLINE */
292 s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
293 } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
294 s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
295 } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
296 s = ucs_i_jis_table[c - ucs_i_jis_table_min];
297 } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
298 s = ucs_r_jis_table[c - ucs_r_jis_table_min];
299 }
300 if (s <= 0) {
301 if (c == 0xa5) { /* YEN SIGN */
302 s = 0x1005c;
303 } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
304 s = 0x2140;
305 } else if (c == 0x2225) { /* PARALLEL TO */
306 s = 0x2142;
307 } else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
308 s = 0x215d;
309 } else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
310 s = 0x2171;
311 } else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
312 s = 0x2172;
313 } else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
314 s = 0x224c;
315 }
316 if (c == 0) {
317 s = 0;
318 } else if (s <= 0) {
319 s = -1;
320 }
321 }
322 if (s >= 0) {
323 if (s < 0x80) { /* ASCII */
324 if ((filter->status & 0xff00) != 0) {
325 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
326 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
327 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
328 }
329 filter->status = 0;
330 CK((*filter->output_function)(s, filter->data));
331 } else if (s < 0x100) { /* kana */
332 if ((filter->status & 0xff00) != 0x100) {
333 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
334 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
335 CK((*filter->output_function)(0x49, filter->data)); /* 'I' */
336 }
337 filter->status = 0x100;
338 CK((*filter->output_function)(s & 0x7f, filter->data));
339 } else if (s < 0x8080) { /* X 0208 */
340 if ((filter->status & 0xff00) != 0x200) {
341 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
342 CK((*filter->output_function)(0x24, filter->data)); /* '$' */
343 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
344 }
345 filter->status = 0x200;
346 CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
347 CK((*filter->output_function)(s & 0x7f, filter->data));
348 } else if (s < 0x10000) { /* X 0212 */
349 if ((filter->status & 0xff00) != 0x300) {
350 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
351 CK((*filter->output_function)(0x24, filter->data)); /* '$' */
352 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
353 CK((*filter->output_function)(0x44, filter->data)); /* 'D' */
354 }
355 filter->status = 0x300;
356 CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
357 CK((*filter->output_function)(s & 0x7f, filter->data));
358 } else { /* X 0201 latin */
359 if ((filter->status & 0xff00) != 0x400) {
360 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
361 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
362 CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
363 }
364 filter->status = 0x400;
365 CK((*filter->output_function)(s & 0x7f, filter->data));
366 }
367 } else {
368 CK(mbfl_filt_conv_illegal_output(c, filter));
369 }
370
371 return 0;
372 }
373
374
375 /*
376 * wchar => ISO-2022-JP
377 */
378 int
mbfl_filt_conv_wchar_2022jp(int c,mbfl_convert_filter * filter)379 mbfl_filt_conv_wchar_2022jp(int c, mbfl_convert_filter *filter)
380 {
381 int s;
382
383 s = 0;
384 if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
385 s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
386 } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
387 s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
388 } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
389 s = ucs_i_jis_table[c - ucs_i_jis_table_min];
390 } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
391 s = ucs_r_jis_table[c - ucs_r_jis_table_min];
392 }
393 if (s <= 0) {
394 if (c == 0xa5) { /* YEN SIGN */
395 s = 0x1005c;
396 } else if (c == 0x203e) { /* OVER LINE */
397 s = 0x1007e;
398 } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
399 s = 0x2140;
400 } else if (c == 0x2225) { /* PARALLEL TO */
401 s = 0x2142;
402 } else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
403 s = 0x215d;
404 } else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
405 s = 0x2171;
406 } else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
407 s = 0x2172;
408 } else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
409 s = 0x224c;
410 }
411 if (c == 0) {
412 s = 0;
413 } else if (s <= 0) {
414 s = -1;
415 }
416 } else if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
417 s = -1;
418 }
419 if (s >= 0) {
420 if (s < 0x80) { /* ASCII */
421 if ((filter->status & 0xff00) != 0) {
422 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
423 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
424 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
425 }
426 filter->status = 0;
427 CK((*filter->output_function)(s, filter->data));
428 } else if (s < 0x10000) { /* X 0208 */
429 if ((filter->status & 0xff00) != 0x200) {
430 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
431 CK((*filter->output_function)(0x24, filter->data)); /* '$' */
432 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
433 }
434 filter->status = 0x200;
435 CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
436 CK((*filter->output_function)(s & 0x7f, filter->data));
437 } else { /* X 0201 latin */
438 if ((filter->status & 0xff00) != 0x400) {
439 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
440 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
441 CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
442 }
443 filter->status = 0x400;
444 CK((*filter->output_function)(s & 0x7f, filter->data));
445 }
446 } else {
447 CK(mbfl_filt_conv_illegal_output(c, filter));
448 }
449
450 return 0;
451 }
452
453 int
mbfl_filt_conv_any_jis_flush(mbfl_convert_filter * filter)454 mbfl_filt_conv_any_jis_flush(mbfl_convert_filter *filter)
455 {
456 /* back to latin */
457 if ((filter->status & 0xff00) != 0) {
458 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
459 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
460 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
461 }
462 filter->status = 0;
463
464 if (filter->flush_function != NULL) {
465 return (*filter->flush_function)(filter->data);
466 }
467
468 return 0;
469 }
470
/* Character-set states tracked by the mb_check_* validators below. */
#define ASCII              0
#define JISX_0201_LATIN    1
#define JISX_0201_KANA     2  /* selected by ESC ( I */
#define JISX_0208          3
#define JISX_0212          4
#define JISX_0201_KANA_SO  5  /* selected by the 0x0E "Kana In" byte */
477
mb_check_jis(unsigned char * in,size_t in_len)478 static bool mb_check_jis(unsigned char *in, size_t in_len)
479 {
480 unsigned char *p = in, *e = p + in_len;
481 unsigned int state = ASCII;
482
483 while (p < e) {
484 unsigned char c = *p++;
485 if (c == 0x1B) {
486 /* ESC seen; this is an escape sequence */
487 if (state == JISX_0201_KANA_SO) {
488 return false;
489 }
490 if ((e - p) < 2) {
491 return false;
492 }
493 unsigned char c2 = *p++;
494 if (c2 == '$') {
495 unsigned char c3 = *p++;
496 if (c3 == '@' || c3 == 'B') {
497 state = JISX_0208;
498 } else if (c3 == '(') {
499 if (p == e) {
500 return false;
501 }
502 unsigned char c4 = *p++;
503 if (c4 == '@' || c4 == 'B') {
504 state = JISX_0208;
505 } else if (c4 == 'D') {
506 state = JISX_0212;
507 } else {
508 return false;
509 }
510 } else {
511 return false;
512 }
513 } else if (c2 == '(') {
514 unsigned char c3 = *p++;
515 /* ESC ( H is treated as a sequence transitioning to ASCII for historical reasons.
516 * see https://github.com/php/php-src/pull/10828#issuecomment-1478342432. */
517 if (c3 == 'B' || c3 == 'H') {
518 state = ASCII;
519 } else if (c3 == 'J') {
520 state = JISX_0201_LATIN;
521 } else if (c3 == 'I') {
522 state = JISX_0201_KANA;
523 } else {
524 return false;
525 }
526 } else {
527 return false;
528 }
529 } else if (c == 0xE) {
530 /* "Kana In" marker */
531 if (state != ASCII) {
532 return false;
533 }
534 state = JISX_0201_KANA_SO;
535 } else if (c == 0xF) {
536 /* "Kana Out" marker */
537 if (state != JISX_0201_KANA_SO) {
538 return false;
539 }
540 state = ASCII;
541 } else if ((state == JISX_0208 || state == JISX_0212) && (c > 0x20 && c < 0x7F)) {
542 if (p == e) {
543 return false;
544 }
545 unsigned char c2 = *p++;
546 if (c2 > 0x20 && c2 < 0x7F) {
547 unsigned int s = (c - 0x21)*94 + c2 - 0x21;
548 if (state == JISX_0208) {
549 if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
550 continue;
551 }
552 } else {
553 if (s < jisx0212_ucs_table_size && jisx0212_ucs_table[s]) {
554 continue;
555 }
556 }
557 return false;
558 } else {
559 return false;
560 }
561 } else if (c < 0x80) {
562 continue;
563 } else if (c >= 0xA1 && c <= 0xDF) {
564 /* GR-invoked Kana */
565 continue;
566 } else {
567 return false;
568 }
569 }
570
571 return state == ASCII;
572 }
573
574
mb_check_iso2022jp(unsigned char * in,size_t in_len)575 static bool mb_check_iso2022jp(unsigned char *in, size_t in_len)
576 {
577 unsigned char *p = in, *e = p + in_len;
578 unsigned int state = ASCII;
579
580 while (p < e) {
581 unsigned char c = *p++;
582 if (c == 0x1B) {
583 /* ESC seen; this is an escape sequence */
584 if ((e - p) < 2) {
585 return false;
586 }
587 unsigned char c2 = *p++;
588 if (c2 == '$') {
589 unsigned char c3 = *p++;
590 if (c3 == '@' || c3 == 'B') {
591 state = JISX_0208;
592 } else {
593 return false;
594 }
595 } else if (c2 == '(') {
596 unsigned char c3 = *p++;
597 if (c3 == 'B') {
598 state = ASCII;
599 } else if (c3 == 'J') {
600 state = JISX_0201_LATIN;
601 } else {
602 return false;
603 }
604 } else {
605 return false;
606 }
607 } else if (c == 0xE || c == 0xF) {
608 /* "Kana In" or "Kana Out" marker; ISO-2022-JP is not accepted. */
609 return false;
610 } else if (state == JISX_0208 && (c > 0x20 && c < 0x7F)) {
611 if (p == e) {
612 return false;
613 }
614 unsigned char c2 = *p++;
615 if (c2 > 0x20 && c2 < 0x7F) {
616 unsigned int s = (c - 0x21)*94 + c2 - 0x21;
617 if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
618 continue;
619 }
620 return false;
621 } else {
622 return false;
623 }
624 } else if (c < 0x80) {
625 continue;
626 } else {
627 return false;
628 }
629 }
630
631 return state == ASCII;
632 }
633