1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
/*
 * The source code included in this file was separated from mbfilter_ja.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29
30 #include "mbfilter.h"
31 #include "mbfilter_jis.h"
32
33 #include "unicode_table_cp932_ext.h"
34 #include "unicode_table_jis.h"
35
/* Forward declarations of the file-local functions that are referenced by the
 * encoding descriptors and conversion vtables defined below, before their
 * definitions appear later in this file. */
static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_iso2022jp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_iso2022jp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static bool mb_check_iso2022jp(unsigned char *in, size_t in_len);
static bool mb_check_jis(unsigned char *in, size_t in_len);
42
/* Encoding descriptor for "JIS".
 *
 * It shares the decoder (mb_iso2022jp_to_wchar) with ISO-2022-JP, but has its
 * own encoder (mb_wchar_to_jis) and validator (mb_check_jis).
 *
 * NOTE(review): the initializer is positional; the meaning of each slot is
 * fixed by the mbfl_encoding declaration, which is not visible in this file
 * (presumably mbfilter.h) -- confirm there before reordering anything. */
const mbfl_encoding mbfl_encoding_jis = {
	mbfl_no_encoding_jis,
	"JIS",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_jis_wchar,
	&vtbl_wchar_jis,
	mb_iso2022jp_to_wchar,
	mb_wchar_to_jis,
	mb_check_jis
};
56
/* Encoding descriptor for "ISO-2022-JP".
 *
 * It uses the same decoder as the "JIS" descriptor (mb_iso2022jp_to_wchar),
 * but its own encoder (mb_wchar_to_iso2022jp) and validator
 * (mb_check_iso2022jp).
 *
 * NOTE(review): the initializer is positional; the meaning of each slot is
 * fixed by the mbfl_encoding declaration, which is not visible in this file
 * (presumably mbfilter.h) -- confirm there before reordering anything. */
const mbfl_encoding mbfl_encoding_2022jp = {
	mbfl_no_encoding_2022jp,
	"ISO-2022-JP",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022jp_wchar,
	&vtbl_wchar_2022jp,
	mb_iso2022jp_to_wchar,
	mb_wchar_to_iso2022jp,
	mb_check_iso2022jp
};
70
/* Conversion vtable: JIS => wchar.
 * Wires mbfl_filt_conv_jis_wchar (per-character step) and
 * mbfl_filt_conv_jis_wchar_flush together with the common constructor.
 * NOTE(review): the role of each positional slot (including the two NULLs)
 * is defined by struct mbfl_convert_vtbl, which is not shown here -- confirm. */
const struct mbfl_convert_vtbl vtbl_jis_wchar = {
	mbfl_no_encoding_jis,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis_wchar,
	mbfl_filt_conv_jis_wchar_flush,
	NULL,
};
80
/* Conversion vtable: wchar => JIS.
 * Wires mbfl_filt_conv_wchar_jis (per-character step) and
 * mbfl_filt_conv_any_jis_flush together with the common constructor.
 * NOTE(review): the role of each positional slot (including the two NULLs)
 * is defined by struct mbfl_convert_vtbl, which is not shown here -- confirm. */
const struct mbfl_convert_vtbl vtbl_wchar_jis = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_jis,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_jis,
	mbfl_filt_conv_any_jis_flush,
	NULL,
};
90
/* Conversion vtable: ISO-2022-JP => wchar.
 * Reuses the JIS decoder step (mbfl_filt_conv_jis_wchar) and its flush
 * function, so decoding accepts the same input as the JIS vtable.
 * NOTE(review): the role of each positional slot (including the two NULLs)
 * is defined by struct mbfl_convert_vtbl, which is not shown here -- confirm. */
const struct mbfl_convert_vtbl vtbl_2022jp_wchar = {
	mbfl_no_encoding_2022jp,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis_wchar,
	mbfl_filt_conv_jis_wchar_flush,
	NULL,
};
100
/* Conversion vtable: wchar => ISO-2022-JP.
 * Wires mbfl_filt_conv_wchar_2022jp (per-character step) and the shared
 * mbfl_filt_conv_any_jis_flush together with the common constructor.
 * NOTE(review): the role of each positional slot (including the two NULLs)
 * is defined by struct mbfl_convert_vtbl, which is not shown here -- confirm. */
const struct mbfl_convert_vtbl vtbl_wchar_2022jp = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jp,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_2022jp,
	mbfl_filt_conv_any_jis_flush,
	NULL,
};
110
/* Evaluate `statement`; if its value is negative, make the *enclosing
 * function* return -1. Only usable inside functions returning int. */
#define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
112
113 /*
114 * JIS => wchar
115 */
116 int
mbfl_filt_conv_jis_wchar(int c,mbfl_convert_filter * filter)117 mbfl_filt_conv_jis_wchar(int c, mbfl_convert_filter *filter)
118 {
119 int c1, s, w;
120
121 retry:
122 switch (filter->status & 0xf) {
123 /* case 0x00: ASCII */
124 /* case 0x10: X 0201 latin */
125 /* case 0x20: X 0201 kana */
126 /* case 0x80: X 0208 */
127 /* case 0x90: X 0212 */
128 case 0:
129 if (c == 0x1b) {
130 filter->status += 2;
131 } else if (c == 0x0e) { /* "kana in" */
132 filter->status = 0x20;
133 } else if (c == 0x0f) { /* "kana out" */
134 filter->status = 0;
135 } else if (filter->status == 0x10 && c == 0x5c) { /* YEN SIGN */
136 CK((*filter->output_function)(0xa5, filter->data));
137 } else if (filter->status == 0x10 && c == 0x7e) { /* OVER LINE */
138 CK((*filter->output_function)(0x203e, filter->data));
139 } else if (filter->status == 0x20 && c > 0x20 && c < 0x60) { /* kana */
140 CK((*filter->output_function)(0xff40 + c, filter->data));
141 } else if ((filter->status == 0x80 || filter->status == 0x90) && c > 0x20 && c < 0x7f) { /* kanji first char */
142 filter->cache = c;
143 filter->status += 1;
144 } else if (c >= 0 && c < 0x80) { /* latin, CTLs */
145 CK((*filter->output_function)(c, filter->data));
146 } else if (c > 0xa0 && c < 0xe0) { /* GR kana */
147 CK((*filter->output_function)(0xfec0 + c, filter->data));
148 } else {
149 CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
150 }
151 break;
152
153 /* case 0x81: X 0208 second char */
154 /* case 0x91: X 0212 second char */
155 case 1:
156 filter->status &= ~0xf;
157 c1 = filter->cache;
158 if (c > 0x20 && c < 0x7f) {
159 s = (c1 - 0x21)*94 + c - 0x21;
160 if (filter->status == 0x80) {
161 if (s >= 0 && s < jisx0208_ucs_table_size) {
162 w = jisx0208_ucs_table[s];
163 } else {
164 w = 0;
165 }
166
167 if (w <= 0) {
168 w = MBFL_BAD_INPUT;
169 }
170 } else {
171 if (s >= 0 && s < jisx0212_ucs_table_size) {
172 w = jisx0212_ucs_table[s];
173 } else {
174 w = 0;
175 }
176
177 if (w <= 0) {
178 w = MBFL_BAD_INPUT;
179 }
180 }
181 CK((*filter->output_function)(w, filter->data));
182 } else {
183 CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
184 }
185 break;
186
187 /* ESC */
188 /* case 0x02: */
189 /* case 0x12: */
190 /* case 0x22: */
191 /* case 0x82: */
192 /* case 0x92: */
193 case 2:
194 if (c == 0x24) { /* '$' */
195 filter->status++;
196 } else if (c == 0x28) { /* '(' */
197 filter->status += 3;
198 } else {
199 filter->status &= ~0xf;
200 CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
201 goto retry;
202 }
203 break;
204
205 /* ESC $ */
206 /* case 0x03: */
207 /* case 0x13: */
208 /* case 0x23: */
209 /* case 0x83: */
210 /* case 0x93: */
211 case 3:
212 if (c == 0x40 || c == 0x42) { /* '@' or 'B' */
213 filter->status = 0x80;
214 } else if (c == 0x28) { /* '(' */
215 filter->status++;
216 } else {
217 filter->status &= ~0xf;
218 CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
219 CK((*filter->output_function)(0x24, filter->data));
220 goto retry;
221 }
222 break;
223
224 /* ESC $ ( */
225 /* case 0x04: */
226 /* case 0x14: */
227 /* case 0x24: */
228 /* case 0x84: */
229 /* case 0x94: */
230 case 4:
231 if (c == 0x40 || c == 0x42) { /* '@' or 'B' */
232 filter->status = 0x80;
233 } else if (c == 0x44) { /* 'D' */
234 filter->status = 0x90;
235 } else {
236 filter->status &= ~0xf;
237 CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
238 CK((*filter->output_function)(0x24, filter->data));
239 CK((*filter->output_function)(0x28, filter->data));
240 goto retry;
241 }
242 break;
243
244 /* ESC ( */
245 /* case 0x05: */
246 /* case 0x15: */
247 /* case 0x25: */
248 /* case 0x85: */
249 /* case 0x95: */
250 case 5:
251 if (c == 0x42 || c == 0x48) { /* 'B' or 'H' */
252 filter->status = 0;
253 } else if (c == 0x4a) { /* 'J' */
254 filter->status = 0x10;
255 } else if (c == 0x49) { /* 'I' */
256 filter->status = 0x20;
257 } else {
258 filter->status &= ~0xf;
259 CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
260 CK((*filter->output_function)(0x28, filter->data));
261 goto retry;
262 }
263 break;
264
265 EMPTY_SWITCH_DEFAULT_CASE();
266 }
267
268 return 0;
269 }
270
/*
 * Flush function for the JIS / ISO-2022-JP => wchar filters.
 *
 * If the input ended in the middle of a two-byte character or an escape
 * sequence (low nibble of filter->status is non-zero), one MBFL_BAD_INPUT
 * marker is emitted. The decoder state is then reset and the flush is
 * forwarded to the next stage, if there is one.
 *
 * Returns -1 if emitting the error marker fails; otherwise the result of the
 * downstream flush function (as mbfl_filt_conv_any_jis_flush does), or 0 when
 * no downstream flush function is set.
 */
static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status & 0xF) {
		/* 2-byte (JIS X 0208 or 0212) character was truncated,
		 * or else escape sequence was truncated */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (filter->flush_function) {
		/* Do not swallow a failure reported by the next stage */
		return (*filter->flush_function)(filter->data);
	}

	return 0;
}
286
287 /*
288 * wchar => JIS
289 */
290 int
mbfl_filt_conv_wchar_jis(int c,mbfl_convert_filter * filter)291 mbfl_filt_conv_wchar_jis(int c, mbfl_convert_filter *filter)
292 {
293 int s = 0;
294
295 if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
296 s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
297 } else if (c == 0x203E) { /* OVERLINE */
298 s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
299 } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
300 s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
301 } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
302 s = ucs_i_jis_table[c - ucs_i_jis_table_min];
303 } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
304 s = ucs_r_jis_table[c - ucs_r_jis_table_min];
305 }
306 if (s <= 0) {
307 if (c == 0xa5) { /* YEN SIGN */
308 s = 0x1005c;
309 } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
310 s = 0x2140;
311 } else if (c == 0x2225) { /* PARALLEL TO */
312 s = 0x2142;
313 } else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
314 s = 0x215d;
315 } else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
316 s = 0x2171;
317 } else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
318 s = 0x2172;
319 } else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
320 s = 0x224c;
321 }
322 if (c == 0) {
323 s = 0;
324 } else if (s <= 0) {
325 s = -1;
326 }
327 }
328 if (s >= 0) {
329 if (s < 0x80) { /* ASCII */
330 if ((filter->status & 0xff00) != 0) {
331 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
332 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
333 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
334 }
335 filter->status = 0;
336 CK((*filter->output_function)(s, filter->data));
337 } else if (s < 0x8080) { /* X 0208 */
338 if ((filter->status & 0xff00) != 0x200) {
339 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
340 CK((*filter->output_function)(0x24, filter->data)); /* '$' */
341 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
342 }
343 filter->status = 0x200;
344 CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
345 CK((*filter->output_function)(s & 0x7f, filter->data));
346 } else if (s < 0x10000) { /* X 0212 */
347 if ((filter->status & 0xff00) != 0x300) {
348 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
349 CK((*filter->output_function)(0x24, filter->data)); /* '$' */
350 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
351 CK((*filter->output_function)(0x44, filter->data)); /* 'D' */
352 }
353 filter->status = 0x300;
354 CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
355 CK((*filter->output_function)(s & 0x7f, filter->data));
356 } else { /* X 0201 latin */
357 if ((filter->status & 0xff00) != 0x400) {
358 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
359 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
360 CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
361 }
362 filter->status = 0x400;
363 CK((*filter->output_function)(s & 0x7f, filter->data));
364 }
365 } else {
366 CK(mbfl_filt_conv_illegal_output(c, filter));
367 }
368
369 return 0;
370 }
371
372
373 /*
374 * wchar => ISO-2022-JP
375 */
376 int
mbfl_filt_conv_wchar_2022jp(int c,mbfl_convert_filter * filter)377 mbfl_filt_conv_wchar_2022jp(int c, mbfl_convert_filter *filter)
378 {
379 int s;
380
381 s = 0;
382 if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
383 s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
384 } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
385 s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
386 } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
387 s = ucs_i_jis_table[c - ucs_i_jis_table_min];
388 } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
389 s = ucs_r_jis_table[c - ucs_r_jis_table_min];
390 }
391
392 if (s <= 0) {
393 if (c == 0xa5) { /* YEN SIGN */
394 s = 0x1005c;
395 } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
396 s = 0x2140;
397 } else if (c == 0x2225) { /* PARALLEL TO */
398 s = 0x2142;
399 } else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
400 s = 0x215d;
401 } else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
402 s = 0x2171;
403 } else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
404 s = 0x2172;
405 } else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
406 s = 0x224c;
407 }
408 if (c == 0) {
409 s = 0;
410 } else if (s <= 0) {
411 s = -1;
412 }
413 } else if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
414 s = -1;
415 }
416 if (s >= 0) {
417 if (s < 0x80) { /* ASCII */
418 if ((filter->status & 0xff00) != 0) {
419 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
420 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
421 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
422 }
423 filter->status = 0;
424 CK((*filter->output_function)(s, filter->data));
425 } else if (s < 0x10000) { /* X 0208 */
426 if ((filter->status & 0xff00) != 0x200) {
427 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
428 CK((*filter->output_function)(0x24, filter->data)); /* '$' */
429 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
430 }
431 filter->status = 0x200;
432 CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
433 CK((*filter->output_function)(s & 0x7f, filter->data));
434 } else { /* X 0201 latin */
435 if ((filter->status & 0xff00) != 0x400) {
436 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
437 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
438 CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
439 }
440 filter->status = 0x400;
441 CK((*filter->output_function)(s & 0x7f, filter->data));
442 }
443 } else {
444 CK(mbfl_filt_conv_illegal_output(c, filter));
445 }
446
447 return 0;
448 }
449
450 int
mbfl_filt_conv_any_jis_flush(mbfl_convert_filter * filter)451 mbfl_filt_conv_any_jis_flush(mbfl_convert_filter *filter)
452 {
453 /* back to latin */
454 if ((filter->status & 0xff00) != 0) {
455 CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
456 CK((*filter->output_function)(0x28, filter->data)); /* '(' */
457 CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
458 }
459 filter->status = 0;
460
461 if (filter->flush_function != NULL) {
462 return (*filter->flush_function)(filter->data);
463 }
464
465 return 0;
466 }
467
/* Character-set states used by the buffer-based converters and validators
 * below: stored in `*state` by mb_iso2022jp_to_wchar, in `buf->state` by the
 * mb_wchar_to_* encoders, and in a local variable by the mb_check_* functions.
 * The numeric order matters: mb_iso2022jp_to_wchar tests
 * `*state >= JISX_0208` to detect the two-byte character sets. */
#define ASCII 0
#define JISX_0201_LATIN 1
#define JISX_0201_KANA 2
#define JISX_0208 3
#define JISX_0212 4
473
/*
 * Buffer-based decoder shared by the "JIS" and "ISO-2022-JP" encodings.
 *
 * Reads bytes from *in (at most *in_len of them) and writes decoded code
 * points to buf until either the input is exhausted or the output buffer is
 * full. Invalid input is reported in-band as MBFL_BAD_INPUT.
 *
 *   in, in_len - in/out: advanced past / reduced by the bytes consumed
 *   buf        - output array of code points
 *   bufsize    - capacity of buf; must be at least 3, because a single
 *                broken escape sequence can produce up to 3 outputs
 *   state      - in/out: currently selected character set (ASCII,
 *                JISX_0201_LATIN, JISX_0201_KANA, JISX_0208 or JISX_0212)
 *
 * Returns the number of code points written to buf.
 */
static size_t mb_iso2022jp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	ZEND_ASSERT(bufsize >= 3);

	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c == 0x1B) {
			/* ESC seen; this is an escape sequence */
			if ((e - p) < 2) {
				/* Fewer than 2 bytes follow the ESC: report one error and,
				 * if the next byte is a valid intermediate byte, skip it too */
				*out++ = MBFL_BAD_INPUT;
				if (p != e && (*p == '$' || *p == '('))
					p++;
				continue;
			}

			unsigned char c2 = *p++;
			if (c2 == '$') {
				unsigned char c3 = *p++;
				if (c3 == '@' || c3 == 'B') {
					*state = JISX_0208;
				} else if (c3 == '(') {
					if (p == e) {
						/* Input ends after "ESC $ (" */
						*out++ = MBFL_BAD_INPUT;
						break;
					}
					unsigned char c4 = *p++;
					if (c4 == '@' || c4 == 'B') {
						*state = JISX_0208;
					} else if (c4 == 'D') {
						*state = JISX_0212;
					} else {
						if ((limit - out) < 3) {
							/* Not enough room for the 3 outputs below: rewind to
							 * the ESC byte so the whole sequence is processed
							 * again on the next call */
							p -= 4;
							break;
						}
						*out++ = MBFL_BAD_INPUT;
						*out++ = '$';
						*out++ = '(';
						/* Un-read the final byte; it is decoded as ordinary input */
						p--;
					}
				} else {
					if ((limit - out) < 2) {
						/* Not enough room for the 2 outputs below: rewind to ESC */
						p -= 3;
						break;
					}
					*out++ = MBFL_BAD_INPUT;
					*out++ = '$';
					/* Un-read the final byte; it is decoded as ordinary input */
					p--;
				}
			} else if (c2 == '(') {
				unsigned char c3 = *p++;
				if (c3 == 'B' || c3 == 'H') {
					*state = ASCII;
				} else if (c3 == 'J') {
					*state = JISX_0201_LATIN;
				} else if (c3 == 'I') {
					*state = JISX_0201_KANA;
				} else {
					if ((limit - out) < 2) {
						/* Not enough room for the 2 outputs below: rewind to ESC */
						p -= 3;
						break;
					}
					*out++ = MBFL_BAD_INPUT;
					*out++ = '(';
					/* Un-read the final byte; it is decoded as ordinary input */
					p--;
				}
			} else {
				/* ESC followed by an unexpected byte: report the ESC as bad
				 * input and decode the following byte normally */
				*out++ = MBFL_BAD_INPUT;
				p--;
			}
		} else if (c == 0xE) {
			/* "Kana In" marker; this is just for JIS-7/8, but we also accept it for ISO-2022-JP */
			*state = JISX_0201_KANA;
		} else if (c == 0xF) {
			/* "Kana Out" marker */
			*state = ASCII;
		} else if (*state == JISX_0201_LATIN && c == 0x5C) { /* YEN SIGN */
			*out++ = 0xA5;
		} else if (*state == JISX_0201_LATIN && c == 0x7E) { /* OVER LINE */
			*out++ = 0x203E;
		} else if (*state == JISX_0201_KANA && c > 0x20 && c < 0x60) {
			/* 7-bit kana */
			*out++ = 0xFF40 + c;
		} else if (*state >= JISX_0208 && c > 0x20 && c < 0x7F) {
			/* First byte of a two-byte JIS X 0208 / JIS X 0212 character */
			if (p == e) {
				/* Input ends in the middle of the character */
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			if (c2 > 0x20 && c2 < 0x7F) {
				/* Index into the 94-column conversion table */
				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
				uint32_t w = 0;
				if (*state == JISX_0208) {
					if (s < jisx0208_ucs_table_size) {
						w = jisx0208_ucs_table[s];
					}
					if (!w) {
						w = MBFL_BAD_INPUT;
					}
				} else {
					if (s < jisx0212_ucs_table_size) {
						w = jisx0212_ucs_table[s];
					}
					if (!w) {
						w = MBFL_BAD_INPUT;
					}
				}
				*out++ = w;
			} else {
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (c < 0x80) {
			*out++ = c;
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* GR-invoked Kana; "GR" stands for "graphics right" and refers to bytes
			 * with the MSB set (in the context of ISO-2022 encoding).
			 *
			 * In this regard, Wikipedia states:
			 * "Other, older variants known as JIS7 and JIS8 build directly on the 7-bit and 8-bit
			 * encodings defined by JIS X 0201 and allow use of JIS X 0201 kana from G1 without
			 * escape sequences, using Shift Out and Shift In or setting the eighth bit
			 * (GR-invoked), respectively."
			 *
			 * Note that we support both the 'JIS7' use of 0xE/0xF Shift In/Shift Out codes
			 * and the 'JIS8' use of GR-invoked Kana */
			*out++ = 0xFEC0 + c;
		} else {
			*out++ = MBFL_BAD_INPUT;
		}
	}

	/* Report back how much input is left and where it starts */
	*in_len = e - p;
	*in = p;
	return out - buf;
}
612
/*
 * Buffer-based encoder: code points => ISO-2022-JP.
 *
 *   in, len - code points to convert
 *   buf     - output buffer; buf->state holds the character set currently
 *             selected in the output
 *   end     - when true, the output is returned to ASCII (ESC ( B) after the
 *             last code point if it is not already there
 *
 * Code points without an accepted mapping (no table entry and no fallback,
 * or a table result in 0x80..0x2120 or above 0x8080) are reported through
 * MB_CONVERT_ERROR.
 */
static void mb_wchar_to_iso2022jp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len) {
		len--; /* from here on, `len` counts the code points still to come */

		uint32_t cp = *in++;
		unsigned int code = 0;
		bool unmappable = false;

		if (cp >= ucs_a1_jis_table_min && cp < ucs_a1_jis_table_max) {
			code = ucs_a1_jis_table[cp - ucs_a1_jis_table_min];
		} else if (cp >= ucs_a2_jis_table_min && cp < ucs_a2_jis_table_max) {
			code = ucs_a2_jis_table[cp - ucs_a2_jis_table_min];
		} else if (cp >= ucs_i_jis_table_min && cp < ucs_i_jis_table_max) {
			code = ucs_i_jis_table[cp - ucs_i_jis_table_min];
		} else if (cp >= ucs_r_jis_table_min && cp < ucs_r_jis_table_max) {
			code = ucs_r_jis_table[cp - ucs_r_jis_table_min];
		}

		if (code != 0) {
			/* A table hit is only usable when it is ASCII or a two-byte code
			 * within 0x2121..0x8080 */
			unmappable = (code >= 0x80 && code < 0x2121) || (code > 0x8080);
		} else {
			/* No table entry: try the hand-picked fallback mappings */
			switch (cp) {
			case 0xA5:   code = 0x1005C; break; /* YEN SIGN */
			case 0xFF3C: code = 0x2140;  break; /* FULLWIDTH REVERSE SOLIDUS */
			case 0x2225: code = 0x2142;  break; /* PARALLEL TO */
			case 0xFF0D: code = 0x215D;  break; /* FULLWIDTH HYPHEN-MINUS */
			case 0xFFE0: code = 0x2171;  break; /* FULLWIDTH CENT SIGN */
			case 0xFFE1: code = 0x2172;  break; /* FULLWIDTH POUND SIGN */
			case 0xFFE2: code = 0x224C;  break; /* FULLWIDTH NOT SIGN */
			case 0:      break;                 /* NUL is written as-is */
			default:     unmappable = true; break;
			}
		}

		if (unmappable) {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_iso2022jp);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			continue;
		}

		if (code < 0x80) { /* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, code);
		} else if (code < 0x8080) { /* JIS X 0208 */
			if (buf->state != JISX_0208) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (code >> 8) & 0x7F, code & 0x7F);
		} else if (code < 0x10000) { /* JIS X 0212 */
			if (buf->state != JISX_0212) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'D');
				buf->state = JISX_0212;
			}
			out = mb_convert_buf_add2(out, (code >> 8) & 0x7F, code & 0x7F);
		} else { /* X 0201 Latin */
			if (buf->state != JISX_0201_LATIN) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, code & 0x7F);
		}
	}

	/* Leave the stream in ASCII at the end of the input */
	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
697
/*
 * Buffer-based encoder: code points => JIS.
 *
 *   in, len - code points to convert
 *   buf     - output buffer; buf->state holds the character set currently
 *             selected in the output
 *   end     - when true, the output is returned to ASCII (ESC ( B) after the
 *             last code point if it is not already there
 *
 * In addition to what mb_wchar_to_iso2022jp emits, this encoder maps
 * U+203E OVERLINE to JIS X 0201 latin and writes single-byte kana codes
 * (0xA1..0xDF) under ESC ( I.
 *
 * Code points with neither a table entry nor a fallback mapping are reported
 * through MB_CONVERT_ERROR.
 */
static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w == 0x203E) { /* OVERLINE */
			s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (s == 0) {
			/* No table entry: try the hand-picked fallback mappings */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x1005C;
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			} else if (w != 0) {
				/* Pass this function (not the ISO-2022-JP encoder) as the
				 * callback, so that whatever the error handler writes is
				 * encoded with the JIS rules used for the rest of the output */
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_jis);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
		}

		if (s < 0x80) { /* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA1 && s <= 0xDF) { /* X 0201 Kana */
			if (buf->state != JISX_0201_KANA) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			/* Written as a 7-bit byte under ESC ( I */
			out = mb_convert_buf_add(out, s & 0x7F);
		} else if (s < 0x8080) { /* JIS X 0208 */
			if (buf->state != JISX_0208) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else if (s < 0x10000) { /* JIS X 0212 */
			if (buf->state != JISX_0212) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'D');
				buf->state = JISX_0212;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else { /* X 0201 Latin */
			if (buf->state != JISX_0201_LATIN) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		}
	}

	/* Leave the stream in ASCII at the end of the input */
	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
787
/* Extra validator-only state used by mb_check_jis: kana mode entered with the
 * 0x0E "Kana In" byte (as opposed to JISX_0201_KANA, entered with ESC ( I).
 * In this state mb_check_jis rejects escape sequences and only 0x0F
 * ("Kana Out") returns to ASCII. */
#define JISX_0201_KANA_SO 5
789
mb_check_jis(unsigned char * in,size_t in_len)790 static bool mb_check_jis(unsigned char *in, size_t in_len)
791 {
792 unsigned char *p = in, *e = p + in_len;
793 unsigned int state = ASCII;
794
795 while (p < e) {
796 unsigned char c = *p++;
797 if (c == 0x1B) {
798 /* ESC seen; this is an escape sequence */
799 if (state == JISX_0201_KANA_SO) {
800 return false;
801 }
802 if ((e - p) < 2) {
803 return false;
804 }
805 unsigned char c2 = *p++;
806 if (c2 == '$') {
807 unsigned char c3 = *p++;
808 if (c3 == '@' || c3 == 'B') {
809 state = JISX_0208;
810 } else if (c3 == '(') {
811 if (p == e) {
812 return false;
813 }
814 unsigned char c4 = *p++;
815 if (c4 == '@' || c4 == 'B') {
816 state = JISX_0208;
817 } else if (c4 == 'D') {
818 state = JISX_0212;
819 } else {
820 return false;
821 }
822 } else {
823 return false;
824 }
825 } else if (c2 == '(') {
826 unsigned char c3 = *p++;
827 /* ESC ( H is treated as a sequence transitioning to ASCII for historical reasons.
828 * see https://github.com/php/php-src/pull/10828#issuecomment-1478342432. */
829 if (c3 == 'B' || c3 == 'H') {
830 state = ASCII;
831 } else if (c3 == 'J') {
832 state = JISX_0201_LATIN;
833 } else if (c3 == 'I') {
834 state = JISX_0201_KANA;
835 } else {
836 return false;
837 }
838 } else {
839 return false;
840 }
841 } else if (c == 0xE) {
842 /* "Kana In" marker */
843 if (state != ASCII) {
844 return false;
845 }
846 state = JISX_0201_KANA_SO;
847 } else if (c == 0xF) {
848 /* "Kana Out" marker */
849 if (state != JISX_0201_KANA_SO) {
850 return false;
851 }
852 state = ASCII;
853 } else if ((state == JISX_0208 || state == JISX_0212) && (c > 0x20 && c < 0x7F)) {
854 if (p == e) {
855 return false;
856 }
857 unsigned char c2 = *p++;
858 if (c2 > 0x20 && c2 < 0x7F) {
859 unsigned int s = (c - 0x21)*94 + c2 - 0x21;
860 if (state == JISX_0208) {
861 if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
862 continue;
863 }
864 } else {
865 if (s < jisx0212_ucs_table_size && jisx0212_ucs_table[s]) {
866 continue;
867 }
868 }
869 return false;
870 } else {
871 return false;
872 }
873 } else if (c < 0x80) {
874 continue;
875 } else if (c >= 0xA1 && c <= 0xDF) {
876 /* GR-invoked Kana */
877 continue;
878 } else {
879 return false;
880 }
881 }
882
883 return state == ASCII;
884 }
885
886
mb_check_iso2022jp(unsigned char * in,size_t in_len)887 static bool mb_check_iso2022jp(unsigned char *in, size_t in_len)
888 {
889 unsigned char *p = in, *e = p + in_len;
890 unsigned int state = ASCII;
891
892 while (p < e) {
893 unsigned char c = *p++;
894 if (c == 0x1B) {
895 /* ESC seen; this is an escape sequence */
896 if ((e - p) < 2) {
897 return false;
898 }
899 unsigned char c2 = *p++;
900 if (c2 == '$') {
901 unsigned char c3 = *p++;
902 if (c3 == '@' || c3 == 'B') {
903 state = JISX_0208;
904 } else {
905 return false;
906 }
907 } else if (c2 == '(') {
908 unsigned char c3 = *p++;
909 if (c3 == 'B') {
910 state = ASCII;
911 } else if (c3 == 'J') {
912 state = JISX_0201_LATIN;
913 } else {
914 return false;
915 }
916 } else {
917 return false;
918 }
919 } else if (c == 0xE || c == 0xF) {
920 /* "Kana In" or "Kana Out" marker; ISO-2022-JP is not accepted. */
921 return false;
922 } else if (state == JISX_0208 && (c > 0x20 && c < 0x7F)) {
923 if (p == e) {
924 return false;
925 }
926 unsigned char c2 = *p++;
927 if (c2 > 0x20 && c2 < 0x7F) {
928 unsigned int s = (c - 0x21)*94 + c2 - 0x21;
929 if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
930 continue;
931 }
932 return false;
933 } else {
934 return false;
935 }
936 } else if (c < 0x80) {
937 continue;
938 } else {
939 return false;
940 }
941 }
942
943 return state == ASCII;
944 }
945