1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
/*
 * The source code included in this file was separated from mbfilter_sjis.c
 * by rui hirokawa <hirokawa@php.net> on 15 aug 2011.
 */
28
29 /* Although the specification for Shift-JIS-2004 indicates that 0x5C and
30 * 0x7E should (respectively) represent a Yen sign and an overbar, feedback
31 * from Japanese PHP users indicates that they prefer 0x5C and 0x7E to be
32 * treated as equivalent to U+005C and U+007E. This is the historical
33 * behavior of mbstring, and promotes compatibility with other software
34 * which handles Shift-JIS and Shift-JIS-2004 text in this way. */
35
36 #include "mbfilter.h"
37 #include "mbfilter_sjis_2004.h"
38 #include "mbfilter_euc_jp_2004.h"
39 #include "mbfilter_iso2022jp_2004.h"
40
41 #include "unicode_table_jis2004.h"
42 #include "unicode_table_jis.h"
43
44 extern const unsigned char mblen_table_sjis_mobile[];
45 extern const unsigned char mblen_table_eucjp[];
46
47 static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
48 static void mb_wchar_to_sjis2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
49 static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
50 static void mb_wchar_to_eucjp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
51 static size_t mb_iso2022jp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
52 static void mb_wchar_to_iso2022jp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
53
54 extern int mbfl_bisec_srch(int w, const unsigned short *tbl, int n);
55 extern int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n);
56
/* Alternative names for SJIS-2004; the list is NULL-terminated. */
static const char *mbfl_encoding_sjis2004_aliases[] = {
	"SJIS2004",
	"Shift_JIS-2004",
	NULL
};

/* Alternative names for EUC-JP-2004; the list is NULL-terminated. */
static const char *mbfl_encoding_eucjp2004_aliases[] = {
	"EUC_JP-2004",
	NULL
};
59
60 const mbfl_encoding mbfl_encoding_sjis2004 = {
61 mbfl_no_encoding_sjis2004,
62 "SJIS-2004",
63 "Shift_JIS",
64 mbfl_encoding_sjis2004_aliases,
65 mblen_table_sjis_mobile, /* Leading byte values used for SJIS-2004 are the same as mobile SJIS variants */
66 MBFL_ENCTYPE_GL_UNSAFE,
67 &vtbl_sjis2004_wchar,
68 &vtbl_wchar_sjis2004,
69 mb_sjis2004_to_wchar,
70 mb_wchar_to_sjis2004,
71 NULL
72 };
73
74 const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = {
75 mbfl_no_encoding_sjis2004,
76 mbfl_no_encoding_wchar,
77 mbfl_filt_conv_common_ctor,
78 NULL,
79 mbfl_filt_conv_jis2004_wchar,
80 mbfl_filt_conv_jis2004_wchar_flush,
81 NULL,
82 };
83
84 const struct mbfl_convert_vtbl vtbl_wchar_sjis2004 = {
85 mbfl_no_encoding_wchar,
86 mbfl_no_encoding_sjis2004,
87 mbfl_filt_conv_common_ctor,
88 NULL,
89 mbfl_filt_conv_wchar_jis2004,
90 mbfl_filt_conv_wchar_jis2004_flush,
91 NULL,
92 };
93
94 const mbfl_encoding mbfl_encoding_eucjp2004 = {
95 mbfl_no_encoding_eucjp2004,
96 "EUC-JP-2004",
97 "EUC-JP",
98 mbfl_encoding_eucjp2004_aliases,
99 mblen_table_eucjp,
100 0,
101 &vtbl_eucjp2004_wchar,
102 &vtbl_wchar_eucjp2004,
103 mb_eucjp2004_to_wchar,
104 mb_wchar_to_eucjp2004,
105 NULL
106 };
107
108 const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = {
109 mbfl_no_encoding_eucjp2004,
110 mbfl_no_encoding_wchar,
111 mbfl_filt_conv_common_ctor,
112 NULL,
113 mbfl_filt_conv_jis2004_wchar,
114 mbfl_filt_conv_jis2004_wchar_flush,
115 NULL,
116 };
117
118 const struct mbfl_convert_vtbl vtbl_wchar_eucjp2004 = {
119 mbfl_no_encoding_wchar,
120 mbfl_no_encoding_eucjp2004,
121 mbfl_filt_conv_common_ctor,
122 NULL,
123 mbfl_filt_conv_wchar_jis2004,
124 mbfl_filt_conv_wchar_jis2004_flush,
125 NULL,
126 };
127
128 const mbfl_encoding mbfl_encoding_2022jp_2004 = {
129 mbfl_no_encoding_2022jp_2004,
130 "ISO-2022-JP-2004",
131 "ISO-2022-JP-2004",
132 NULL,
133 NULL,
134 MBFL_ENCTYPE_GL_UNSAFE,
135 &vtbl_2022jp_2004_wchar,
136 &vtbl_wchar_2022jp_2004,
137 mb_iso2022jp2004_to_wchar,
138 mb_wchar_to_iso2022jp2004,
139 NULL
140 };
141
142 const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = {
143 mbfl_no_encoding_2022jp_2004,
144 mbfl_no_encoding_wchar,
145 mbfl_filt_conv_common_ctor,
146 NULL,
147 mbfl_filt_conv_jis2004_wchar,
148 mbfl_filt_conv_jis2004_wchar_flush,
149 NULL,
150 };
151
152 const struct mbfl_convert_vtbl vtbl_wchar_2022jp_2004 = {
153 mbfl_no_encoding_wchar,
154 mbfl_no_encoding_2022jp_2004,
155 mbfl_filt_conv_common_ctor,
156 NULL,
157 mbfl_filt_conv_wchar_jis2004,
158 mbfl_filt_conv_wchar_jis2004_flush,
159 NULL,
160 };
161
/* Evaluate `statement` once; if the result is negative, return -1 from the
 * enclosing function. Used to propagate output-function failures. */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
163
/*
 * Convert a JIS row/cell pair (c1, c2) into a Shift-JIS lead/trail byte pair,
 * stored into the lvalues s1 and s2. c1 and c2 are evaluated more than once,
 * so they must be side-effect free and distinct from s1/s2.
 */
#define SJIS_ENCODE(c1,c2,s1,s2) \
	do { \
		s1 = c1; \
		s1 -= 1; \
		s1 >>= 1; \
		s1 += ((c1) < 0x5f) ? 0x71 : 0xb1; \
		s2 = c2; \
		if (((c1) & 1) == 0) { \
			/* even row: second half of the trail-byte range */ \
			s2 += 0x7e; \
		} else { \
			/* odd row: first half, skipping trail byte 0x7f */ \
			s2 += ((c2) < 0x60) ? 0x1f : 0x20; \
		} \
	} while (0)
184
/*
 * Convert a Shift-JIS lead/trail byte pair (c1, c2) into a JIS row/cell pair,
 * stored into the lvalues s1 and s2. The caller is expected to have validated
 * the byte ranges beforehand.
 */
#define SJIS_DECODE(c1,c2,s1,s2) \
	do { \
		s1 = c1; \
		s1 -= (s1 < 0xa0) ? 0x81 : 0xc1; \
		s1 <<= 1; \
		s1 += 0x21; \
		s2 = c2; \
		if (s2 >= 0x9f) { \
			/* upper half of the trail range maps to the following (even) row */ \
			s1++; \
			s2 -= 0x7e; \
		} else { \
			/* lower half: compensate for the skipped trail byte 0x7f */ \
			if (s2 < 0x7f) { \
				s2++; \
			} \
			s2 -= 0x20; \
		} \
	} while (0)
206
/*
 * Legacy byte-at-a-time decoder shared by SJIS-2004, EUC-JP-2004 and
 * ISO-2022-JP-2004; the source encoding is read from filter->from.
 *
 * c       next input byte
 * filter  conversion filter. The low nibble of filter->status is the parser
 *         state (0 = idle, 1 = expecting 2nd byte of a plane 1 / SJIS char,
 *         2 = after EUC 0x8E, 3 = after EUC 0x8F, 4 = expecting 2nd byte of a
 *         plane 2 char, 5 = expecting 2nd byte of a JIS X 0208 char,
 *         6-9 = inside an ISO-2022-JP-2004 escape sequence). For
 *         ISO-2022-JP-2004 the high nibble (0x80/0x90/0xa0) records the
 *         character set selected by the last escape sequence. filter->cache
 *         holds the pending lead byte.
 *
 * Decoded code points, or MBFL_BAD_INPUT for invalid sequences, are passed to
 * filter->output_function. Returns 0, or -1 as soon as the output function
 * returns a negative value.
 */
int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter)
{
	int k;
	int c1, c2, s, s1 = 0, s2 = 0, w = 0, w1;

	switch (filter->status & 0xf) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004 ||
					filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
				/* For SJIS-2004, 0x5C and 0x7E are deliberately passed through
				 * as U+005C and U+007E (not Yen sign / overline); see the
				 * policy note at the top of this file */
				CK((*filter->output_function)(c, filter->data));
			} else { /* ISO-2022-JP-2004 */
				if (c == 0x1b) {
					filter->status += 6;
				} else if ((filter->status == 0x80 || filter->status == 0x90 || filter->status == 0xa0)
						&& c > 0x20 && c < 0x7f) { /* kanji first char */
					filter->cache = c;
					if (filter->status == 0x90) {
						filter->status += 1; /* JIS X 0213 plane 1 */
					} else if (filter->status == 0xa0) {
						filter->status += 4; /* JIS X 0213 plane 2 */
					} else {
						filter->status += 5; /* JIS X 0208 */
					}
				} else {
					CK((*filter->output_function)(c, filter->data));
				}
			}
		} else {
			if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
				if (c > 0xa0 && c < 0xff) { /* X 0213 plane 1 first char */
					filter->status = 1;
					filter->cache = c;
				} else if (c == 0x8e) { /* kana first char */
					filter->cache = 0x8E; /* So error will be reported if input is truncated right here */
					filter->status = 2;
				} else if (c == 0x8f) { /* X 0213 plane 2 first char */
					filter->status = 3;
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				}
			} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
				if (c > 0xa0 && c < 0xe0) { /* kana */
					CK((*filter->output_function)(0xfec0 + c, filter->data));
				} else if (c > 0x80 && c < 0xfd && c != 0xa0) { /* kanji first char */
					filter->status = 1;
					filter->cache = c;
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				}
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;

	case 1: /* kanji second char */
		filter->status &= ~0xf;
		c1 = filter->cache;

		if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
			if (c > 0xa0 && c < 0xff) {
				s1 = c1 - 0x80;
				s2 = c - 0x80;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
			if (c >= 0x40 && c <= 0xfc && c != 0x7f) {
				SJIS_DECODE(c1, c, s1, s2);
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		} else { /* ISO-2022-JP-2004 */
			if (c >= 0x21 && c <= 0x7E) {
				s1 = c1;
				s2 = c;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		}
		w1 = (s1 << 8) | s2;

		/* conversion for combining characters: these JIS codes decode to a
		 * pair of code points; the first is emitted here, the second below */
		if ((w1 >= 0x2477 && w1 <= 0x247B) ||
				(w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 ||
				(w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
			k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
			if (k >= 0) {
				w = jisx0213_u2_tbl[2*k];
				CK((*filter->output_function)(w, filter->data));
				w = jisx0213_u2_tbl[2*k+1];
			}
		}

		/* conversion for BMP */
		if (w <= 0) {
			w1 = (s1 - 0x21)*94 + s2 - 0x21;
			if (w1 >= 0 && w1 < jisx0213_ucs_table_size) {
				w = jisx0213_ucs_table[w1];
			}
		}

		/* conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
		if (w <= 0) {
			w1 = (s1 << 8) | s2;
			k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				w = jisx0213_jis_u5_tbl[k] + 0x20000;
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2: /* got 0x8e: EUC-JP-2004 kana */
		filter->status = 0;
		if (c > 0xa0 && c < 0xe0) {
			w = 0xfec0 + c;
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 3: /* X 0213 plane 2 first char: EUC-JP-2004 (0x8f) */
		if (c == 0xA1 || (c >= 0xA3 && c <= 0xA5) || c == 0xA8 || (c >= 0xAC && c <= 0xAF) || (c >= 0xEE && c <= 0xFE)) {
			filter->cache = c - 0x80;
			filter->status++;
		} else {
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 4: /* X 0213 plane 2 second char: EUC-JP-2004, ISO-2022-JP-2004 */
		filter->status &= ~0xF;
		c1 = filter->cache;
		if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
			c2 = c - 0x80;
		} else {
			c2 = c;
		}

		if (c2 < 0x21 || c2 > 0x7E) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		s1 = c1 - 0x21;
		s2 = c2 - 0x21;

		if (((s1 >= 0 && s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) ||
				(s1 >= 77 && s1 < 94)) && s2 >= 0 && s2 < 94) {
			/* calc offset from ku */
			for (k = 0; k < jisx0213_p2_ofst_len; k++) {
				if (s1 == jisx0213_p2_ofst[k]) {
					break;
				}
			}
			k -= jisx0213_p2_ofst[k];

			/* check for japanese chars in BMP */
			s = (s1 + 94 + k)*94 + s2;
			ZEND_ASSERT(s < jisx0213_ucs_table_size);
			w = jisx0213_ucs_table[s];

			/* check for japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
			if (w <= 0) {
				w1 = ((c1 + k + 94) << 8) | c2;
				k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
				if (k >= 0) {
					w = jisx0213_jis_u5_tbl[k] + 0x20000;
				}
			}

			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}

			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 5: /* X 0208: ISO-2022-JP-2004 */
		filter->status &= ~0xf;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7f) {
			s = (c1 - 0x21)*94 + c - 0x21;
			if (s >= 0 && s < jisx0208_ucs_table_size) {
				w = jisx0208_ucs_table[s];
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}

		CK((*filter->output_function)(w, filter->data));
		break;

	/* ESC: ISO-2022-JP-2004
	 * (full status values 0x06, 0x16, 0x26, 0x86, 0x96, 0xa6) */
	case 6:
		if (c == '$') {
			filter->status++;
		} else if (c == '(') {
			filter->status += 3;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $: ISO-2022-JP-2004
	 * (full status values 0x07, 0x17, 0x27, 0x87, 0x97, 0xa7) */
	case 7:
		if (c == 'B') { /* JIS X 0208-1983 */
			filter->status = 0x80;
		} else if (c == '(') {
			filter->status++;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ (: ISO-2022-JP-2004
	 * (full status values 0x08, 0x18, 0x28, 0x88, 0x98, 0xa8) */
	case 8:
		if (c == 'Q') { /* JIS X 0213 plane 1 */
			filter->status = 0x90;
		} else if (c == 'P') { /* JIS X 0213 plane 2 */
			filter->status = 0xa0;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC (: ISO-2022-JP-2004
	 * (full status values 0x09, 0x19, 0x29, 0x89, 0x99) */
	case 9:
		if (c == 'B') {
			filter->status = 0;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
497
/*
 * Flush handler for the legacy JIS-2004 decoders.
 *
 * If the input ended in the middle of a multi-byte character or escape
 * sequence (low nibble of filter->status non-zero), MBFL_BAD_INPUT is emitted.
 * The status is then reset and the flush is forwarded to
 * filter->flush_function when one is set.
 *
 * Returns -1 if emitting the error marker fails, otherwise the result of the
 * forwarded flush, or 0 when there is nothing to forward to.
 */
int mbfl_filt_conv_jis2004_wchar_flush(mbfl_convert_filter *filter)
{
	const int pending = filter->status & 0xF;

	if (pending != 0) {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}
	return (*filter->flush_function)(filter->data);
}
511
/*
 * Legacy code-point-at-a-time encoder shared by SJIS-2004, EUC-JP-2004 and
 * ISO-2022-JP-2004; the target encoding is read from filter->to.
 *
 * c       Unicode code point to encode
 * filter  conversion filter. The low nibble of filter->status is 1 while the
 *         first half of a possible two-code-point combining sequence is held
 *         back (its table index is kept in filter->cache). For
 *         ISO-2022-JP-2004, bits 0xff00 record the character set currently
 *         designated in the output: 0 = ASCII, 0x200 = JIS X 0213 plane 1
 *         (ESC $ ( Q), 0x300 = JIS X 0213 plane 2 (ESC $ ( P).
 *
 * Returns 0, or -1 as soon as the output function returns a negative value.
 */
int mbfl_filt_conv_wchar_jis2004(int c, mbfl_convert_filter *filter)
{
	int k;
	int c1, c2, s1, s2;

retry:
	s1 = 0;
	/* check for 1st char of combining characters */
	if ((filter->status & 0xf) == 0 && (
			c == 0x00E6 ||
			(c >= 0x0254 && c <= 0x02E9) ||
			(c >= 0x304B && c <= 0x3053) ||
			(c >= 0x30AB && c <= 0x30C8) ||
			c == 0x31F7)) {
		for (k = 0; k < jisx0213_u2_tbl_len; k++) {
			if (c == jisx0213_u2_tbl[2*k]) {
				/* Hold this code point back until we see what follows it */
				filter->status++;
				filter->cache = k;
				return 0;
			}
		}
	}

	/* check for 2nd char of combining characters */
	if ((filter->status & 0xf) == 1 && filter->cache >= 0 && filter->cache < jisx0213_u2_tbl_len) {
		k = filter->cache;
		filter->status &= ~0xf;
		filter->cache = 0;

		c1 = jisx0213_u2_tbl[2*k];
		if ((c1 == 0x0254 || c1 == 0x028C || c1 == 0x0259 || c1 == 0x025A) && c == 0x0301) {
			k++;
		}
		if (c == jisx0213_u2_tbl[2*k+1]) {
			s1 = jisx0213_u2_key[k];
		} else { /* fallback */
			s1 = jisx0213_u2_fb_tbl[k];

			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
			} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
				s2 = (s1 & 0xff) + 0x80;
				s1 = ((s1 >> 8) & 0xff) + 0x80;
			} else {
				if (filter->status != 0x200) {
					CK((*filter->output_function)(0x1b, filter->data));
					CK((*filter->output_function)('$', filter->data));
					CK((*filter->output_function)('(', filter->data));
					CK((*filter->output_function)('Q', filter->data));
				}
				filter->status = 0x200;

				s2 = s1 & 0x7f;
				s1 = (s1 >> 8) & 0x7f;
			}

			/* Flush out cached data */
			CK((*filter->output_function)(s1, filter->data));
			CK((*filter->output_function)(s2, filter->data));
			/* The held-back code point has been written on its own; now
			 * process the current code point from scratch */
			goto retry;
		}
	}

	/* check for major japanese chars: U+4E00 - U+9FFF */
	if (s1 <= 0) {
		for (k = 0; k < uni2jis_tbl_len; k++) {
			if (c >= uni2jis_tbl_range[k][0] && c <= uni2jis_tbl_range[k][1]) {
				s1 = uni2jis_tbl[k][c-uni2jis_tbl_range[k][0]];
				break;
			}
		}
	}

	/* check for japanese chars in compressed mapping area: U+1E00 - U+4DBF */
	if (s1 <= 0 && c >= ucs_c1_jisx0213_min && c <= ucs_c1_jisx0213_max) {
		k = mbfl_bisec_srch(c, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
		if (k >= 0) {
			s1 = ucs_c1_jisx0213_ofst[k] + c - ucs_c1_jisx0213_tbl[2*k];
		}
	}

	/* check for japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
	if (s1 <= 0 && c >= jisx0213_u5_tbl_min && c <= jisx0213_u5_tbl_max) {
		k = mbfl_bisec_srch2(c - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
		if (k >= 0) {
			s1 = jisx0213_u5_jis_tbl[k];
		}
	}

	if (s1 <= 0) {
		/* CJK Compatibility Forms: U+FE30 - U+FE4F */
		if (c == 0xfe45) {
			s1 = 0x233e;
		} else if (c == 0xfe46) {
			s1 = 0x233d;
		} else if (c >= 0xf91d && c <= 0xf9dc) {
			/* CJK Compatibility Ideographs: U+F900 - U+F92A */
			k = mbfl_bisec_srch2(c, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
			if (k >= 0) {
				s1 = ucs_r2b_jisx0213_cmap_val[k];
			}
		}
	}

	/* Nothing found: only U+0000 legitimately encodes as 0 */
	if (s1 <= 0) {
		if (c == 0) {
			s1 = 0;
		} else {
			s1 = -1;
		}
	}

	if (s1 >= 0) {
		if (s1 < 0x80) { /* ASCII */
			if (filter->to->no_encoding == mbfl_no_encoding_2022jp_2004 && (filter->status & 0xff00)) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('B', filter->data));
			}
			filter->status = 0;
			CK((*filter->output_function)(s1, filter->data));
		} else if (s1 < 0x100) { /* latin or kana */
			if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
				CK((*filter->output_function)(0x8e, filter->data));
				CK((*filter->output_function)(s1, filter->data));
			} else if (filter->to->no_encoding == mbfl_no_encoding_sjis2004 && (s1 >= 0xA1 && s1 <= 0xDF)) {
				CK((*filter->output_function)(s1, filter->data));
			} else {
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		} else if (s1 < 0x7f00) { /* X 0213 plane 1 */
			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
			} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
				s2 = (s1 & 0xff) + 0x80;
				s1 = ((s1 >> 8) & 0xff) + 0x80;
			} else {
				if ((filter->status & 0xff00) != 0x200) {
					CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
					CK((*filter->output_function)('$', filter->data));
					CK((*filter->output_function)('(', filter->data));
					CK((*filter->output_function)('Q', filter->data));
				}
				filter->status = 0x200;
				s2 = s1 & 0xff;
				s1 = (s1 >> 8) & 0xff;
			}
			CK((*filter->output_function)(s1, filter->data));
			CK((*filter->output_function)(s2, filter->data));
		} else { /* X 0213 plane 2 */
			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
			} else {
				s2 = s1 & 0xff;
				k = ((s1 >> 8) & 0xff) - 0x7f;
				if (k >= 0 && k < jisx0213_p2_ofst_len) {
					s1 = jisx0213_p2_ofst[k] + 0x21;
				}
				if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
					s2 |= 0x80;
					s1 |= 0x80;
					CK((*filter->output_function)(0x8f, filter->data));
				} else {
					/* Plane 2 needs its own designation state, distinct from
					 * plane 1 (0x200); otherwise no escape sequence would be
					 * written when output switches between the two planes */
					if ((filter->status & 0xff00) != 0x300) {
						CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
						CK((*filter->output_function)('$', filter->data));
						CK((*filter->output_function)('(', filter->data));
						CK((*filter->output_function)('P', filter->data));
					}
					filter->status = 0x300;
				}
			}

			CK((*filter->output_function)(s1, filter->data));
			CK((*filter->output_function)(s2, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
700
/*
 * Flush handler for the legacy JIS-2004 encoders.
 *
 * If a code point is still being held back as the possible first half of a
 * combining sequence (low nibble of filter->status is 1, table index in
 * filter->cache), its stand-alone fallback encoding is written. For
 * ISO-2022-JP-2004 the output is then switched back to ASCII when another
 * character set is designated. Finally the status is reset and the flush is
 * forwarded to filter->flush_function when one is set.
 *
 * Returns -1 if an output call fails, otherwise the result of the forwarded
 * flush, or 0 when there is nothing to forward to.
 */
int mbfl_filt_conv_wchar_jis2004_flush(mbfl_convert_filter *filter)
{
	int k, c1, c2, s1, s2;

	k = filter->cache;
	filter->cache = 0;

	/* Only the low nibble is the "held back" marker; for ISO-2022-JP-2004 the
	 * upper bits carry the designated character set and must be ignored here.
	 * k indexes jisx0213_u2_fb_tbl, so it must be strictly below the length. */
	if ((filter->status & 0xf) == 1 && k >= 0 && k < jisx0213_u2_tbl_len) {
		s1 = jisx0213_u2_fb_tbl[k];

		if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
			c1 = (s1 >> 8) & 0xff;
			c2 = s1 & 0xff;
			SJIS_ENCODE(c1, c2, s1, s2);
		} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
			s2 = (s1 & 0xff) | 0x80;
			s1 = ((s1 >> 8) & 0xff) | 0x80;
		} else {
			s2 = s1 & 0x7f;
			s1 = (s1 >> 8) & 0x7f;
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)('$', filter->data));
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('Q', filter->data));
			}
			filter->status = 0x200;
		}

		CK((*filter->output_function)(s1, filter->data));
		CK((*filter->output_function)(s2, filter->data));
	}

	/* If we had switched to a different charset, go back to ASCII mode
	 * This makes it possible to concatenate arbitrary valid strings
	 * together and get a valid string */
	if (filter->status & 0xff00) {
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}

	filter->status = 0;

	if (filter->flush_function) {
		return (*filter->flush_function)(filter->data);
	}

	return 0;
}
751
mb_sjis2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)752 static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
753 {
754 unsigned char *p = *in, *e = p + *in_len;
755 uint32_t *out = buf, *limit = buf + bufsize - 1;
756
757 while (p < e && out < limit) {
758 unsigned char c = *p++;
759
760 if (c <= 0x7F) {
761 if (c == 0x5C) {
762 *out++ = 0xA5;
763 } else if (c == 0x7E) {
764 *out++ = 0x203E;
765 } else {
766 *out++ = c;
767 }
768 } else if (c >= 0xA1 && c <= 0xDF) {
769 *out++ = 0xFEC0 + c;
770 } else if (c > 0x80 && c < 0xFD && c != 0xA0) {
771 if (p == e) {
772 *out++ = MBFL_BAD_INPUT;
773 break;
774 }
775 unsigned char c2 = *p++;
776
777 if (c2 < 0x40 || c2 > 0xFC || c2 == 0x7F) {
778 *out++ = MBFL_BAD_INPUT;
779 continue;
780 }
781
782 unsigned int s1, s2;
783 SJIS_DECODE(c, c2, s1, s2);
784 unsigned int w1 = (s1 << 8) | s2, w = 0;
785
786 /* Conversion for combining characters */
787 if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
788 int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
789 if (k >= 0) {
790 *out++ = jisx0213_u2_tbl[2*k];
791 *out++ = jisx0213_u2_tbl[2*k+1];
792 continue;
793 }
794 }
795
796 /* Conversion for BMP */
797 w1 = (s1 - 0x21)*94 + s2 - 0x21;
798 if (w1 < jisx0213_ucs_table_size) {
799 w = jisx0213_ucs_table[w1];
800 }
801
802 /* Conversion for CJK Unified Ideographs extension B (U+2XXXX) */
803 if (!w) {
804 w1 = (s1 << 8) | s2;
805 int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
806 if (k >= 0) {
807 w = jisx0213_jis_u5_tbl[k] + 0x20000;
808 }
809 }
810
811 *out++ = w ? w : MBFL_BAD_INPUT;
812 } else {
813 *out++ = MBFL_BAD_INPUT;
814 }
815 }
816
817 *in_len = e - p;
818 *in = p;
819 return out - buf;
820 }
821
/*
 * Buffer-oriented SJIS-2004 encoder.
 *
 * in, len  code points to encode
 * buf      output buffer; buf->state carries a code point that was held back
 *          at the end of the previous call because it may be the first half
 *          of a two-code-point combining sequence
 * end      true when no further input will follow, so a held-back code point
 *          must be written out in its stand-alone (fallback) form
 *
 * Code points with no SJIS-2004 representation are reported through
 * MB_CONVERT_ERROR.
 */
static void mb_wchar_to_sjis2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;
	if (buf->state) {
		/* Resume with the code point held back by the previous call */
		w = buf->state;
		buf->state = 0;
		goto process_codepoint;
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		unsigned int jis = 0;

		/* Possible first half of a combining sequence? */
		if (w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7) {
			int k = 0;
			while (k < jisx0213_u2_tbl_len && w != jisx0213_u2_tbl[2*k]) {
				k++;
			}

			if (k < jisx0213_u2_tbl_len) {
				bool combined = false;

				if (len) {
					/* Peek at the following code point; consume it only if
					 * it completes the pair */
					uint32_t w2 = *in;
					if (w2 == 0x301 && (w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A)) {
						k++;
					}
					if (w2 == jisx0213_u2_tbl[2*k+1]) {
						in++;
						len--;
						jis = jisx0213_u2_key[k];
						combined = true;
					}
				} else if (!end) {
					/* More input may follow: hold this code point back */
					buf->state = w;
					MB_CONVERT_BUF_STORE(buf, out, limit);
					return;
				}

				if (!combined) {
					/* Fallback */
					jis = jisx0213_u2_fb_tbl[k];
				}
			}
		}

		/* Check for major Japanese chars: U+4E00-U+9FFF */
		if (jis == 0) {
			for (int r = 0; r < uni2jis_tbl_len; r++) {
				if (w >= uni2jis_tbl_range[r][0] && w <= uni2jis_tbl_range[r][1]) {
					jis = uni2jis_tbl[r][w - uni2jis_tbl_range[r][0]];
					break;
				}
			}
		}

		/* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */
		if (jis == 0 && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) {
			int idx = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (idx >= 0) {
				jis = ucs_c1_jisx0213_ofst[idx] + w - ucs_c1_jisx0213_tbl[2*idx];
			}
		}

		/* Check for Japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
		if (jis == 0 && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) {
			int idx = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (idx >= 0) {
				jis = jisx0213_u5_jis_tbl[idx];
			}
		}

		if (jis == 0) {
			if (w == 0xFE45) {
				/* CJK Compatibility Forms: U+FE30-U+FE4F */
				jis = 0x233E;
			} else if (w == 0xFE46) {
				jis = 0x233D;
			} else if (w >= 0xF91D && w <= 0xF9DC) {
				/* CJK Compatibility Ideographs: U+F900-U+F92A */
				int idx = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (idx >= 0) {
					jis = ucs_r2b_jisx0213_cmap_val[idx];
				}
			}
		}

		if (jis == 0 && w != 0) {
			/* No mapping (U+0000 itself legitimately encodes as byte 0) */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (jis <= 0xFF) {
			out = mb_convert_buf_add(out, jis);
		} else {
			unsigned int row = (jis >> 8) & 0xFF, cell = jis & 0xFF, lead, trail;
			SJIS_ENCODE(row, cell, lead, trail);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, lead, trail);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
924
mb_eucjp2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)925 static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
926 {
927 unsigned char *p = *in, *e = p + *in_len;
928 uint32_t *out = buf, *limit = buf + bufsize - 1;
929
930 while (p < e && out < limit) {
931 unsigned char c = *p++;
932
933 if (c <= 0x7F) {
934 *out++ = c;
935 } else if (c >= 0xA1 && c <= 0xFE) {
936 /* Kanji */
937 if (p == e) {
938 *out++ = MBFL_BAD_INPUT;
939 break;
940 }
941 unsigned char c2 = *p++;
942 if (c2 <= 0xA0 || c2 == 0xFF) {
943 *out++ = MBFL_BAD_INPUT;
944 continue;
945 }
946
947 unsigned int s1 = c - 0x80, s2 = c2 - 0x80;
948 unsigned int w1 = (s1 << 8) | s2, w = 0;
949
950 /* Conversion for combining characters */
951 if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
952 int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
953 if (k >= 0) {
954 *out++ = jisx0213_u2_tbl[2*k];
955 *out++ = jisx0213_u2_tbl[2*k+1];
956 continue;
957 }
958 }
959
960 /* Conversion for BMP */
961 w1 = (s1 - 0x21)*94 + s2 - 0x21;
962 if (w1 < jisx0213_ucs_table_size) {
963 w = jisx0213_ucs_table[w1];
964 }
965
966 /* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
967 if (!w) {
968 w1 = (s1 << 8) | s2;
969 int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
970 if (k >= 0) {
971 w = jisx0213_jis_u5_tbl[k] + 0x20000;
972 }
973 }
974
975 *out++ = w ? w : MBFL_BAD_INPUT;
976 } else if (c == 0x8E && p < e) {
977 /* Kana */
978 unsigned char c2 = *p++;
979 if (c2 >= 0xA1 && c2 <= 0xDF) {
980 *out++ = 0xFEC0 + c2;
981 } else {
982 *out++ = MBFL_BAD_INPUT;
983 }
984 } else if (c == 0x8F && p < e) {
985 unsigned char c2 = *p++;
986 if ((c2 == 0xA1 || (c2 >= 0xA3 && c2 <= 0xA5) || c2 == 0xA8 || (c2 >= 0xAC && c2 <= 0xAF) || (c2 >= 0xEE && c2 <= 0xFE)) && p < e) {
987 unsigned char c3 = *p++;
988
989 if (c3 < 0xA1 || c3 == 0xFF) {
990 *out++ = MBFL_BAD_INPUT;
991 continue;
992 }
993
994 unsigned int s1 = c2 - 0xA1, s2 = c3 - 0xA1;
995
996 if (((s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) || (s1 >= 77 && s1 < 94)) && s2 < 94) {
997 int k;
998 for (k = 0; k < jisx0213_p2_ofst_len; k++) {
999 if (s1 == jisx0213_p2_ofst[k]) {
1000 break;
1001 }
1002 }
1003 k -= jisx0213_p2_ofst[k];
1004
1005 /* Check for Japanese chars in BMP */
1006 unsigned int s = (s1 + 94 + k)*94 + s2;
1007 ZEND_ASSERT(s < jisx0213_ucs_table_size);
1008 unsigned int w = jisx0213_ucs_table[s];
1009
1010 /* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
1011 if (!w) {
1012 k = mbfl_bisec_srch2(((c2 - 0x80 + k + 94) << 8) | (c3 - 0x80), jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
1013 if (k >= 0) {
1014 w = jisx0213_jis_u5_tbl[k] + 0x20000;
1015 }
1016 }
1017
1018 *out++ = w ? w : MBFL_BAD_INPUT;
1019 } else {
1020 *out++ = MBFL_BAD_INPUT;
1021 }
1022 } else {
1023 *out++ = MBFL_BAD_INPUT;
1024 }
1025 } else {
1026 *out++ = MBFL_BAD_INPUT;
1027 }
1028 }
1029
1030 *in_len = e - p;
1031 *in = p;
1032 return out - buf;
1033 }
1034
/* Encode a run of Unicode codepoints as EUC-JP-2004 bytes, appending to `buf`.
 *
 * in, len: codepoints to convert
 * buf:     output buffer; `buf->state` holds a codepoint carried over between calls
 * end:     true if no more input will follow this chunk
 *
 * Some codepoints may be the first half of a two-codepoint sequence which maps
 * to a single JIS X 0213 character, so one codepoint of lookahead is needed.
 * If such a codepoint is the last one of a chunk and `end` is false, it is
 * parked in `buf->state` and converted first on the next call. Codepoints for
 * which no mapping is found are reported through MB_CONVERT_ERROR. */
static void mb_wchar_to_eucjp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t cp;
	if (buf->state) {
		/* Pick up the codepoint which was held back by the previous call */
		cp = buf->state;
		buf->state = 0;
		goto encode_codepoint;
	}

	while (len--) {
		cp = *in++;
encode_codepoint: ;
		unsigned int jis = 0;

		/* Might this be the leading codepoint of a two-codepoint sequence? */
		if (cp == 0xE6 || (cp >= 0x254 && cp <= 0x2E9) || (cp >= 0x304B && cp <= 0x3053) || (cp >= 0x30AB && cp <= 0x30C8) || cp == 0x31F7) {
			for (int i = 0; i < jisx0213_u2_tbl_len; i++) {
				if (cp != jisx0213_u2_tbl[2*i]) {
					continue;
				}

				if (len) {
					/* Peek at the following codepoint; only consume it on a match */
					uint32_t next = *in;
					if (next == 0x301 && (cp == 0x254 || cp == 0x28C || cp == 0x259 || cp == 0x25A)) {
						/* For these four leading codepoints, a following U+0301
						 * selects the next table entry */
						i++;
					}
					if (next == jisx0213_u2_tbl[2*i+1]) {
						in++;
						len--;
						jis = jisx0213_u2_key[i];
						break;
					}
				} else if (!end) {
					/* No lookahead available yet; wait for the next chunk */
					buf->state = cp;
					MB_CONVERT_BUF_STORE(buf, out, limit);
					return;
				}

				/* The pair did not match (or input ended): use the fallback
				 * mapping for the leading codepoint alone */
				jis = jisx0213_u2_fb_tbl[i];
				break;
			}
		}

		/* Directly indexed tables, one per codepoint range */
		if (!jis) {
			for (int i = 0; i < uni2jis_tbl_len; i++) {
				if (cp >= uni2jis_tbl_range[i][0] && cp <= uni2jis_tbl_range[i][1]) {
					jis = uni2jis_tbl[i][cp - uni2jis_tbl_range[i][0]];
					break;
				}
			}
		}

		/* Compressed mapping area, located by binary search */
		if (!jis && cp >= ucs_c1_jisx0213_min && cp <= ucs_c1_jisx0213_max) {
			int i = mbfl_bisec_srch(cp, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (i >= 0) {
				jis = ucs_c1_jisx0213_ofst[i] + cp - ucs_c1_jisx0213_tbl[2*i];
			}
		}

		/* Codepoints in the U+2XXXX plane (CJK Unified Ideographs ext. B) */
		if (!jis && cp >= jisx0213_u5_tbl_min && cp <= jisx0213_u5_tbl_max) {
			int i = mbfl_bisec_srch2(cp - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (i >= 0) {
				jis = jisx0213_u5_jis_tbl[i];
			}
		}

		if (!jis) {
			switch (cp) {
			case 0xFE45: /* CJK Compatibility Forms */
				jis = 0x233E;
				break;
			case 0xFE46:
				jis = 0x233D;
				break;
			default:
				/* CJK Compatibility Ideographs; only U+F91D-U+F9DC is searched */
				if (cp >= 0xF91D && cp <= 0xF9DC) {
					int i = mbfl_bisec_srch2(cp, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
					if (i >= 0) {
						jis = ucs_r2b_jisx0213_cmap_val[i];
					}
				}
				break;
			}
		}

		if (!jis && cp) {
			/* Nothing matched (U+0000 itself legitimately maps to zero) */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_eucjp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (jis <= 0x7F) {
			/* Single byte, emitted as is */
			out = mb_convert_buf_add(out, jis);
		} else if (jis <= 0xFF) {
			/* Single-byte value above 0x7F: emitted with a 0x8E prefix */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, 0x8E, jis);
		} else if (jis <= 0x7EFF) {
			/* Two bytes, each offset by 0x80 */
			unsigned int lead = ((jis >> 8) & 0xFF) + 0x80;
			unsigned int trail = (jis & 0xFF) + 0x80;
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, lead, trail);
		} else {
			/* Values above 0x7EFF: the high byte is translated through
			 * jisx0213_p2_ofst and the result is emitted with a 0x8F prefix */
			unsigned int trail = jis & 0xFF;
			int row = ((jis >> 8) & 0xFF) - 0x7F;
			ZEND_ASSERT(row < jisx0213_p2_ofst_len);
			unsigned int lead = jisx0213_p2_ofst[row] + 0x21;
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
			out = mb_convert_buf_add3(out, 0x8F, lead | 0x80, trail | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1146
/* Character set currently selected in an ISO-2022-JP-2004 stream. The decoder
 * keeps one of these in `*state`; the encoder keeps it in `buf->state`. */
#define ASCII 0           /* selected by ESC ( B */
#define JISX0208 1        /* selected by ESC $ B */
#define JISX0213_PLANE1 2 /* selected by ESC $ ( Q */
#define JISX0213_PLANE2 3 /* selected by ESC $ ( P */
1151
mb_iso2022jp2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)1152 static size_t mb_iso2022jp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
1153 {
1154 unsigned char *p = *in, *e = p + *in_len;
1155 uint32_t *out = buf, *limit = buf + bufsize - 1;
1156
1157 while (p < e && out < limit) {
1158 unsigned char c = *p++;
1159
1160 if (c <= 0x7F) {
1161 if (c == 0x1B) {
1162 if ((e - p) < 2) {
1163 *out++ = MBFL_BAD_INPUT;
1164 p = e;
1165 break;
1166 }
1167 unsigned char c2 = *p++;
1168 unsigned char c3 = *p++;
1169 if (c2 == '$') {
1170 if (c3 == 'B') {
1171 *state = JISX0208;
1172 } else if (c3 == '(') {
1173 if (p == e) {
1174 *out++ = MBFL_BAD_INPUT;
1175 break;
1176 }
1177 unsigned char c4 = *p++;
1178 if (c4 == 'Q') {
1179 *state = JISX0213_PLANE1;
1180 } else if (c4 == 'P') {
1181 *state = JISX0213_PLANE2;
1182 } else {
1183 *out++ = MBFL_BAD_INPUT;
1184 }
1185 } else {
1186 *out++ = MBFL_BAD_INPUT;
1187 }
1188 } else if (c2 == '(') {
1189 if (c3 == 'B') {
1190 *state = ASCII;
1191 } else {
1192 *out++ = MBFL_BAD_INPUT;
1193 }
1194 } else {
1195 p--;
1196 *out++ = MBFL_BAD_INPUT;
1197 }
1198 } else if (*state >= JISX0208 && c > 0x20 && c < 0x7F) {
1199 if (p == e) {
1200 *out++ = MBFL_BAD_INPUT;
1201 break;
1202 }
1203 unsigned char c2 = *p++;
1204 if (c2 < 0x21 || c2 > 0x7E) {
1205 *out++ = MBFL_BAD_INPUT;
1206 continue;
1207 }
1208
1209 if (*state == JISX0213_PLANE1) {
1210 unsigned int w1 = (c << 8) | c2;
1211
1212 /* Conversion for combining characters */
1213 if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
1214 int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
1215 if (k >= 0) {
1216 *out++ = jisx0213_u2_tbl[2*k];
1217 *out++ = jisx0213_u2_tbl[2*k+1];
1218 continue;
1219 }
1220 }
1221
1222 /* Conversion for BMP */
1223 uint32_t w = 0;
1224 w1 = (c - 0x21)*94 + c2 - 0x21;
1225 if (w1 < jisx0213_ucs_table_size) {
1226 w = jisx0213_ucs_table[w1];
1227 }
1228
1229 /* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
1230 if (!w) {
1231 int k = mbfl_bisec_srch2((c << 8) | c2, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
1232 if (k >= 0) {
1233 w = jisx0213_jis_u5_tbl[k] + 0x20000;
1234 }
1235 }
1236
1237 *out++ = w ? w : MBFL_BAD_INPUT;
1238 } else if (*state == JISX0213_PLANE2) {
1239
1240 unsigned int s1 = c - 0x21, s2 = c2 - 0x21;
1241
1242 if (((s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) || (s1 >= 77 && s1 < 94)) && s2 < 94) {
1243 int k;
1244 for (k = 0; k < jisx0213_p2_ofst_len; k++) {
1245 if (s1 == jisx0213_p2_ofst[k]) {
1246 break;
1247 }
1248 }
1249 k -= jisx0213_p2_ofst[k];
1250
1251 /* Check for Japanese chars in BMP */
1252 unsigned int s = (s1 + 94 + k)*94 + s2;
1253 ZEND_ASSERT(s < jisx0213_ucs_table_size);
1254 uint32_t w = jisx0213_ucs_table[s];
1255
1256 /* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
1257 if (!w) {
1258 k = mbfl_bisec_srch2(((c + k + 94) << 8) | c2, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
1259 if (k >= 0) {
1260 w = jisx0213_jis_u5_tbl[k] + 0x20000;
1261 }
1262 }
1263
1264 *out++ = w ? w : MBFL_BAD_INPUT;
1265 } else {
1266 *out++ = MBFL_BAD_INPUT;
1267 }
1268 } else { /* state == JISX0208 */
1269 unsigned int s = (c - 0x21)*94 + c2 - 0x21;
1270 uint32_t w = 0;
1271 if (s < jisx0208_ucs_table_size) {
1272 w = jisx0208_ucs_table[s];
1273 }
1274 *out++ = w ? w : MBFL_BAD_INPUT;
1275 }
1276 } else {
1277 *out++ = c;
1278 }
1279 } else {
1280 *out++ = MBFL_BAD_INPUT;
1281 }
1282 }
1283
1284 *in_len = e - p;
1285 *in = p;
1286 return out - buf;
1287 }
1288
/* Encode a run of Unicode codepoints as ISO-2022-JP-2004 bytes, appending to `buf`.
 *
 * in, len: codepoints to convert
 * buf:     output buffer. The low byte of `buf->state` is the character set
 *          currently selected in the output. If a codepoint had to be held
 *          back for lookahead, the bits above hold its index in
 *          jisx0213_u2_tbl, plus one.
 * end:     true if no more input will follow this chunk
 *
 * Escape sequences are emitted whenever the character set has to change, and
 * when `end` is true the output is switched back to ASCII. Codepoints for
 * which no usable mapping is found are reported through MB_CONVERT_ERROR. */
static void mb_wchar_to_iso2022jp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t cp;
	if (buf->state & 0xFF00) {
		/* Recover the codepoint which was held back by the previous call */
		int pending = (buf->state >> 8) - 1;
		cp = jisx0213_u2_tbl[2*pending];
		buf->state &= 0xFF;
		goto encode_codepoint;
	}

	while (len--) {
		cp = *in++;
encode_codepoint: ;
		unsigned int jis = 0;

		/* Might this be the leading codepoint of a two-codepoint sequence? */
		if (cp == 0xE6 || (cp >= 0x254 && cp <= 0x2E9) || (cp >= 0x304B && cp <= 0x3053) || (cp >= 0x30AB && cp <= 0x30C8) || cp == 0x31F7) {
			for (int i = 0; i < jisx0213_u2_tbl_len; i++) {
				if (cp != jisx0213_u2_tbl[2*i]) {
					continue;
				}

				if (len) {
					/* Peek at the following codepoint; only consume it on a match */
					uint32_t next = *in;
					if (next == 0x301 && (cp == 0x254 || cp == 0x28C || cp == 0x259 || cp == 0x25A)) {
						/* For these four leading codepoints, a following U+0301
						 * selects the next table entry */
						i++;
					}
					if (next == jisx0213_u2_tbl[2*i+1]) {
						in++;
						len--;
						jis = jisx0213_u2_key[i];
						break;
					}
				} else if (!end) {
					/* No lookahead available yet; remember the table index
					 * and wait for the next chunk */
					buf->state |= (i+1) << 8;
					MB_CONVERT_BUF_STORE(buf, out, limit);
					return;
				}

				/* The pair did not match (or input ended): use the fallback
				 * mapping for the leading codepoint alone */
				jis = jisx0213_u2_fb_tbl[i];
				break;
			}
		}

		/* Directly indexed tables, one per codepoint range */
		if (!jis) {
			for (int i = 0; i < uni2jis_tbl_len; i++) {
				if (cp >= uni2jis_tbl_range[i][0] && cp <= uni2jis_tbl_range[i][1]) {
					jis = uni2jis_tbl[i][cp - uni2jis_tbl_range[i][0]];
					break;
				}
			}
		}

		/* Compressed mapping area, located by binary search */
		if (!jis && cp >= ucs_c1_jisx0213_min && cp <= ucs_c1_jisx0213_max) {
			int i = mbfl_bisec_srch(cp, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (i >= 0) {
				jis = ucs_c1_jisx0213_ofst[i] + cp - ucs_c1_jisx0213_tbl[2*i];
			}
		}

		/* Codepoints in the U+2XXXX plane (CJK Unified Ideographs ext. B) */
		if (!jis && cp >= jisx0213_u5_tbl_min && cp <= jisx0213_u5_tbl_max) {
			int i = mbfl_bisec_srch2(cp - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (i >= 0) {
				jis = jisx0213_u5_jis_tbl[i];
			}
		}

		if (!jis) {
			switch (cp) {
			case 0xFE45: /* CJK Compatibility Forms */
				jis = 0x233E;
				break;
			case 0xFE46:
				jis = 0x233D;
				break;
			default:
				/* CJK Compatibility Ideographs; only U+F91D-U+F9DC is searched */
				if (cp >= 0xF91D && cp <= 0xF9DC) {
					int i = mbfl_bisec_srch2(cp, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
					if (i >= 0) {
						jis = ucs_r2b_jisx0213_cmap_val[i];
					}
				}
				break;
			}
		}

		if ((!jis && cp) || (jis > 0x7F && jis <= 0xFF)) {
			/* Either nothing matched, or the mapping is a single-byte value
			 * above 0x7F, which this encoder does not emit */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_iso2022jp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (jis <= 0x7F) {
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, jis);
		} else if (jis <= 0x7EFF) {
			if (buf->state != JISX0213_PLANE1) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'Q');
				buf->state = JISX0213_PLANE1;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (jis >> 8) & 0xFF, jis & 0xFF);
		} else {
			if (buf->state != JISX0213_PLANE2) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'P');
				buf->state = JISX0213_PLANE2;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			/* The high byte is translated through jisx0213_p2_ofst to get
			 * the first output byte */
			unsigned int trail = jis & 0xFF;
			int row = ((jis >> 8) & 0xFF) - 0x7F;
			ZEND_ASSERT(row < jisx0213_p2_ofst_len);
			unsigned int lead = jisx0213_p2_ofst[row] + 0x21;
			out = mb_convert_buf_add2(out, lead, trail);
		}
	}

	if (end && buf->state != ASCII) {
		/* Finish the output in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1421