1 /*
2 +----------------------------------------------------------------------+
3 | Zend Engine |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 1998-2013 Zend Technologies Ltd. (http://www.zend.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 2.00 of the Zend license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at |
10 | http://www.zend.com/license/2_00.txt. |
11 | If you did not receive a copy of the Zend license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@zend.com so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
15 | Authors: Masaki Fujimoto <fujimoto@php.net> |
16 | Rui Hirokawa <hirokawa@php.net> |
17 +----------------------------------------------------------------------+
18 */
19
20 /* $Id$ */
21
22 #include "zend.h"
23 #include "zend_compile.h"
24 #include "zend_operators.h"
25 #include "zend_multibyte.h"
26
27 #ifdef ZEND_MULTIBYTE
28 static size_t zend_multibyte_encoding_filter(unsigned char **to, size_t *to_length, const char *to_encoding, const unsigned char *from, size_t from_length, const char *from_encoding TSRMLS_DC);
29 size_t sjis_input_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC);
30 size_t sjis_output_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC);
31 static char* zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, size_t encoding_list_size);
32 static int zend_multibyte_parse_encoding_list(const char *encoding_list,
33 size_t encoding_list_size, zend_encoding ***result, size_t *result_size);
34 static zend_encoding *zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC);
35 static zend_encoding *zend_multibyte_detect_unicode(TSRMLS_D);
36 static zend_encoding *zend_multibyte_detect_utf_encoding(const unsigned char *script, size_t script_size TSRMLS_DC);
37
38 /*
39 * encodings
40 */
41 static const char *ucs2_aliases[] = {"ISO-10646-UCS-2", "UCS2" , "UNICODE", NULL};
42 static zend_encoding encoding_ucs2 = {
43 NULL,
44 NULL,
45 "UCS-2",
46 (const char *(*)[])&ucs2_aliases,
47 0
48 };
49
50 static zend_encoding encoding_ucs2be = {
51 NULL,
52 NULL,
53 "UCS-2BE",
54 NULL,
55 0
56 };
57
58 static zend_encoding encoding_ucs2le = {
59 NULL,
60 NULL,
61 "UCS-2LE",
62 NULL,
63 0
64 };
65
66 static const char *ucs4_aliases[] = {"ISO-10646-UCS-4", "UCS4", NULL};
67 static zend_encoding encoding_ucs4 = {
68 NULL,
69 NULL,
70 "UCS-4",
71 (const char *(*)[])&ucs4_aliases,
72 0
73 };
74
75 static zend_encoding encoding_ucs4be = {
76 NULL,
77 NULL,
78 "UCS-4BE",
79 NULL,
80 0
81 };
82
83 static zend_encoding encoding_ucs4le = {
84 NULL,
85 NULL,
86 "UCS-4LE",
87 NULL,
88 0
89 };
90
91 static const char *utf32_aliases[] = {"utf32", NULL};
92 static zend_encoding encoding_utf32 = {
93 NULL,
94 NULL,
95 "UTF-32",
96 (const char *(*)[])&utf32_aliases,
97 0
98 };
99
100 static zend_encoding encoding_utf32be = {
101 NULL,
102 NULL,
103 "UTF-32BE",
104 NULL,
105 0
106 };
107
108 static zend_encoding encoding_utf32le = {
109 NULL,
110 NULL,
111 "UTF-32LE",
112 NULL,
113 0
114 };
115
116 static const char *utf16_aliases[] = {"utf16", NULL};
117 static zend_encoding encoding_utf16 = {
118 NULL,
119 NULL,
120 "UTF-16",
121 (const char *(*)[])&utf16_aliases,
122 0
123 };
124
125 static zend_encoding encoding_utf16be = {
126 NULL,
127 NULL,
128 "UTF-16BE",
129 NULL,
130 0
131 };
132
133 static zend_encoding encoding_utf16le = {
134 NULL,
135 NULL,
136 "UTF-16LE",
137 NULL,
138 0
139 };
140
141 static const char *utf8_aliases[] = {"utf8", NULL};
142 static zend_encoding encoding_utf8 = {
143 NULL,
144 NULL,
145 "UTF-8",
146 (const char *(*)[])&utf8_aliases,
147 1
148 };
149
150 static const char *ascii_aliases[] = {"ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991", "US-ASCII", "ISO646-US", "us", "IBM367", "cp367", "csASCII", NULL};
151 static zend_encoding encoding_ascii = {
152 NULL,
153 NULL,
154 "ASCII",
155 (const char *(*)[])&ascii_aliases,
156 1
157 };
158
159 static const char *euc_jp_aliases[] = {"EUC", "EUC_JP", "eucJP", "x-euc-jp", NULL};
160 static zend_encoding encoding_euc_jp = {
161 NULL,
162 NULL,
163 "EUC-JP",
164 (const char *(*)[])&euc_jp_aliases,
165 1
166 };
167
168 static const char *sjis_aliases[] = {"x-sjis", "SJIS", "SHIFT-JIS", NULL};
169 static zend_encoding encoding_sjis = {
170 sjis_input_filter,
171 sjis_output_filter,
172 "Shift_JIS",
173 (const char *(*)[])&sjis_aliases,
174 0
175 };
176
177 static const char *eucjp_win_aliases[] = {"eucJP-open", NULL};
178 static zend_encoding encoding_eucjp_win = {
179 NULL,
180 NULL,
181 "eucJP-win",
182 (const char *(*)[])&eucjp_win_aliases,
183 1
184 };
185
186 static const char *sjis_win_aliases[] = {"SJIS-open", "MS_Kanji", "Windows-31J", "CP932", NULL};
187 static zend_encoding encoding_sjis_win = {
188 /* sjis-filters does not care about diffs of Shift_JIS and CP932 */
189 sjis_input_filter,
190 sjis_output_filter,
191 "SJIS-win",
192 (const char *(*)[])&sjis_win_aliases,
193 0
194 };
195
196 static const char *jis_aliases[] = {"ISO-2022-JP", NULL};
197 static zend_encoding encoding_jis = {
198 NULL,
199 NULL,
200 "JIS",
201 (const char *(*)[])&jis_aliases,
202 0
203 };
204
205 static const char *euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", "gb2312", NULL};
206 static zend_encoding encoding_euc_cn = {
207 NULL,
208 NULL,
209 "EUC-CN",
210 (const char *(*)[])&euc_cn_aliases,
211 1
212 };
213
214 static const char *cp936_aliases[] = {"CP-936", NULL};
215 static zend_encoding encoding_cp936 = {
216 NULL,
217 NULL,
218 "CP936",
219 (const char *(*)[])&cp936_aliases,
220 0
221 };
222
223 static const char *hz_aliases[] = {"HZ-GB-2312", NULL};
224 static zend_encoding encoding_hz = {
225 NULL,
226 NULL,
227 "HZ",
228 (const char *(*)[])&hz_aliases,
229 0
230 };
231
232 static const char *euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL};
233 static zend_encoding encoding_euc_tw = {
234 NULL,
235 NULL,
236 "EUC-TW",
237 (const char *(*)[])&euc_tw_aliases,
238 1
239 };
240
241 static const char *big5_aliases[] = {"BIG5", "CN-BIG5", "BIG-FIVE", "BIGFIVE", "CP950", NULL};
242 static zend_encoding encoding_big5 = {
243 NULL,
244 NULL,
245 "BIG-5",
246 (const char *(*)[])&big5_aliases,
247 0
248 };
249
250 static const char *euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL};
251 static zend_encoding encoding_euc_kr = {
252 NULL,
253 NULL,
254 "EUC-KR",
255 (const char *(*)[])&euc_kr_aliases,
256 1
257 };
258
259 static const char *uhc_aliases[] = {"CP949", NULL};
260 static zend_encoding encoding_uhc = {
261 NULL,
262 NULL,
263 "UHC",
264 (const char *(*)[])&uhc_aliases,
265 1
266 };
267
268 static zend_encoding encoding_2022kr = {
269 NULL,
270 NULL,
271 "ISO-2022-KR",
272 NULL,
273 0
274 };
275
276 static const char *cp1252_aliases[] = {"cp1252", NULL};
277 static zend_encoding encoding_cp1252 = {
278 NULL,
279 NULL,
280 "Windows-1252",
281 (const char *(*)[])&cp1252_aliases,
282 1
283 };
284
285 static const char *iso_8859_1_aliases[] = {"ISO_8859-1", "latin1", NULL};
286 static zend_encoding encoding_8859_1 = {
287 NULL,
288 NULL,
289 "ISO-8859-1",
290 (const char *(*)[])&iso_8859_1_aliases,
291 1
292 };
293
294 static const char *iso_8859_2_aliases[] = {"ISO_8859-2", "latin2", NULL};
295 static zend_encoding encoding_8859_2 = {
296 NULL,
297 NULL,
298 "ISO-8859-2",
299 (const char *(*)[])&iso_8859_2_aliases,
300 1
301 };
302
303 static const char *iso_8859_3_aliases[] = {"ISO_8859-3", "latin3", NULL};
304 static zend_encoding encoding_8859_3 = {
305 NULL,
306 NULL,
307 "ISO-8859-3",
308 (const char *(*)[])&iso_8859_3_aliases,
309 1
310 };
311
312 static const char *iso_8859_4_aliases[] = {"ISO_8859-4", "latin4", NULL};
313 static zend_encoding encoding_8859_4 = {
314 NULL,
315 NULL,
316 "ISO-8859-4",
317 (const char *(*)[])&iso_8859_4_aliases,
318 1
319 };
320
321 static const char *iso_8859_5_aliases[] = {"ISO_8859-5", "cyrillic", NULL};
322 static zend_encoding encoding_8859_5 = {
323 NULL,
324 NULL,
325 "ISO-8859-5",
326 (const char *(*)[])&iso_8859_5_aliases,
327 1
328 };
329
330 static const char *iso_8859_6_aliases[] = {"ISO_8859-6", "arabic", NULL};
331 static zend_encoding encoding_8859_6 = {
332 NULL,
333 NULL,
334 "ISO-8859-6",
335 (const char *(*)[])&iso_8859_6_aliases,
336 1
337 };
338
339 static const char *iso_8859_7_aliases[] = {"ISO_8859-7", "greek", NULL};
340 static zend_encoding encoding_8859_7 = {
341 NULL,
342 NULL,
343 "ISO-8859-7",
344 (const char *(*)[])&iso_8859_7_aliases,
345 1
346 };
347
348 static const char *iso_8859_8_aliases[] = {"ISO_8859-8", "hebrew", NULL};
349 static zend_encoding encoding_8859_8 = {
350 NULL,
351 NULL,
352 "ISO-8859-8",
353 (const char *(*)[])&iso_8859_8_aliases,
354 1
355 };
356
357 static const char *iso_8859_9_aliases[] = {"ISO_8859-9", "latin5", NULL};
358 static zend_encoding encoding_8859_9 = {
359 NULL,
360 NULL,
361 "ISO-8859-9",
362 (const char *(*)[])&iso_8859_9_aliases,
363 1
364 };
365
366 static const char *iso_8859_10_aliases[] = {"ISO_8859-10", "latin6", NULL};
367 static zend_encoding encoding_8859_10 = {
368 NULL,
369 NULL,
370 "ISO-8859-10",
371 (const char *(*)[])&iso_8859_10_aliases,
372 1
373 };
374
375 static const char *iso_8859_13_aliases[] = {"ISO_8859-13", NULL};
376 static zend_encoding encoding_8859_13 = {
377 NULL,
378 NULL,
379 "ISO-8859-13",
380 (const char *(*)[])&iso_8859_13_aliases,
381 1
382 };
383
384 static const char *iso_8859_14_aliases[] = {"ISO_8859-14", "latin8", NULL};
385 static zend_encoding encoding_8859_14 = {
386 NULL,
387 NULL,
388 "ISO-8859-14",
389 (const char *(*)[])&iso_8859_14_aliases,
390 1
391 };
392
393 static const char *iso_8859_15_aliases[] = {"ISO_8859-15", NULL};
394 static zend_encoding encoding_8859_15 = {
395 NULL,
396 NULL,
397 "ISO-8859-15",
398 (const char *(*)[])&iso_8859_15_aliases,
399 1
400 };
401
402 static const char *cp1251_aliases[] = {"CP1251", "CP-1251", "WINDOWS-1251", NULL};
403 static zend_encoding encoding_cp1251 = {
404 NULL,
405 NULL,
406 "Windows-1251",
407 (const char *(*)[])&cp1251_aliases,
408 1
409 };
410
411 static const char *cp866_aliases[] = {"CP866", "CP-866", "IBM-866", NULL};
412 static zend_encoding encoding_cp866 = {
413 NULL,
414 NULL,
415 "CP866",
416 (const char *(*)[])&cp866_aliases,
417 1
418 };
419
420 static const char *koi8r_aliases[] = {"KOI8-R", "KOI8R", NULL};
421 static zend_encoding encoding_koi8r = {
422 NULL,
423 NULL,
424 "KOI8-R",
425 (const char *(*)[])&koi8r_aliases,
426 1
427 };
428
429 static const char *koi8u_aliases[] = {"KOI8-U", "KOI8U", NULL};
430 static zend_encoding encoding_koi8u = {
431 NULL,
432 NULL,
433 "KOI8-U",
434 (const char *(*)[])&koi8u_aliases,
435 1
436 };
437
438 static const char *cp1254_aliases[] = {"cp1254", NULL};
439 static zend_encoding encoding_cp1254 = {
440 NULL,
441 NULL,
442 "Windows-1254",
443 (const char *(*)[])&cp1254_aliases,
444 1
445 };
446
447 static const char *armscii8_aliases[] = { "ArmSCII8", "ARMSCII-8", "ARMSCII8", NULL};
448 static zend_encoding encoding_armscii8 = {
449 NULL,
450 NULL,
451 "ArmSCII-8",
452 (const char *(*)[])&armscii8_aliases,
453 1
454 };
455
456 static const char *cp850_aliases[] = {"IBM850", NULL};
457 static zend_encoding encoding_cp850 = {
458 NULL,
459 NULL,
460 "CP850",
461 (const char *(*)[])&cp850_aliases,
462 1
463 };
464
465 static zend_encoding *zend_encoding_table[] = {
466 &encoding_ucs4,
467 &encoding_ucs4be,
468 &encoding_ucs4le,
469 &encoding_ucs2,
470 &encoding_ucs2be,
471 &encoding_ucs2le,
472 &encoding_utf32,
473 &encoding_utf32be,
474 &encoding_utf32le,
475 &encoding_utf16,
476 &encoding_utf16be,
477 &encoding_utf16le,
478 &encoding_utf8,
479 &encoding_ascii,
480 &encoding_euc_jp,
481 &encoding_sjis,
482 &encoding_eucjp_win,
483 &encoding_sjis_win,
484 &encoding_jis,
485 &encoding_cp1252,
486 &encoding_8859_1,
487 &encoding_8859_2,
488 &encoding_8859_3,
489 &encoding_8859_4,
490 &encoding_8859_5,
491 &encoding_8859_6,
492 &encoding_8859_7,
493 &encoding_8859_8,
494 &encoding_8859_9,
495 &encoding_8859_10,
496 &encoding_8859_13,
497 &encoding_8859_14,
498 &encoding_8859_15,
499 &encoding_euc_cn,
500 &encoding_cp936,
501 &encoding_hz,
502 &encoding_euc_tw,
503 &encoding_big5,
504 &encoding_euc_kr,
505 &encoding_uhc,
506 &encoding_2022kr,
507 &encoding_cp1251,
508 &encoding_cp866,
509 &encoding_koi8r,
510 &encoding_koi8u,
511 &encoding_armscii8,
512 &encoding_cp1254,
513 &encoding_cp850,
514 NULL
515 };
516
517
518
zend_multibyte_set_script_encoding(const char * encoding_list,size_t encoding_list_size TSRMLS_DC)519 ZEND_API int zend_multibyte_set_script_encoding(const char *encoding_list,
520 size_t encoding_list_size TSRMLS_DC)
521 {
522 if (CG(script_encoding_list)) {
523 efree(CG(script_encoding_list));
524 CG(script_encoding_list) = NULL;
525 }
526 CG(script_encoding_list_size) = 0;
527
528 if (!encoding_list) {
529 return 0;
530 }
531
532 zend_multibyte_parse_encoding_list(encoding_list, encoding_list_size, &(CG(script_encoding_list)), &(CG(script_encoding_list_size)));
533
534 return 0;
535 }
536
537
zend_multibyte_set_internal_encoding(const char * encoding_name TSRMLS_DC)538 ZEND_API int zend_multibyte_set_internal_encoding(const char *encoding_name TSRMLS_DC)
539 {
540 CG(internal_encoding) = zend_multibyte_fetch_encoding(encoding_name);
541 return 0;
542 }
543
zend_multibyte_set_functions(zend_encoding_detector encoding_detector,zend_encoding_converter encoding_converter,zend_encoding_oddlen encoding_oddlen TSRMLS_DC)544 ZEND_API int zend_multibyte_set_functions(zend_encoding_detector encoding_detector, zend_encoding_converter encoding_converter, zend_encoding_oddlen encoding_oddlen TSRMLS_DC)
545 {
546 CG(encoding_detector) = encoding_detector;
547 CG(encoding_converter) = encoding_converter;
548 CG(encoding_oddlen) = encoding_oddlen;
549 return 0;
550 }
551
552
zend_multibyte_set_filter(zend_encoding * onetime_encoding TSRMLS_DC)553 ZEND_API int zend_multibyte_set_filter(zend_encoding *onetime_encoding TSRMLS_DC)
554 {
555 LANG_SCNG(script_encoding) = zend_multibyte_find_script_encoding(onetime_encoding TSRMLS_CC);
556 LANG_SCNG(internal_encoding) = CG(internal_encoding);
557
558 /* judge input/output filter */
559 LANG_SCNG(input_filter) = NULL;
560 LANG_SCNG(output_filter) = NULL;
561
562 if (!LANG_SCNG(script_encoding)) {
563 return 0;
564 }
565
566 if (!LANG_SCNG(internal_encoding) || LANG_SCNG(script_encoding) == LANG_SCNG(internal_encoding)) {
567 /* if encoding specfic filters exist, use them */
568 if (LANG_SCNG(script_encoding)->input_filter && LANG_SCNG(script_encoding)->output_filter) {
569 LANG_SCNG(input_filter) = LANG_SCNG(script_encoding)->input_filter;
570 LANG_SCNG(output_filter) = LANG_SCNG(script_encoding)->output_filter;
571 return 0;
572 }
573
574 if (!LANG_SCNG(script_encoding)->compatible) {
575 /* and if not, work around w/ script_encoding -> utf-8 -> script_encoding conversion */
576 LANG_SCNG(internal_encoding) = LANG_SCNG(script_encoding);
577 LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
578 LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
579 return 0;
580 } else {
581 /* nothing to do in this case */
582 return 0;
583 }
584 }
585
586 /* LANG_SCNG(internal_encoding) cannot be NULL here */
587 if (LANG_SCNG(internal_encoding)->compatible) {
588 LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
589 return 0;
590 } else if (LANG_SCNG(script_encoding)->compatible) {
591 LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
592 return 0;
593 }
594
595 /* both script and internal encodings are incompatible w/ flex */
596 LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
597 LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
598
599 return 0;
600 }
601
602
zend_multibyte_fetch_encoding(const char * encoding_name)603 ZEND_API zend_encoding* zend_multibyte_fetch_encoding(const char *encoding_name)
604 {
605 int i, j;
606 zend_encoding *encoding;
607
608 if (!encoding_name) {
609 return NULL;
610 }
611
612 for (i = 0; (encoding = zend_encoding_table[i]) != NULL; i++) {
613 if (zend_binary_strcasecmp(encoding->name, strlen(encoding->name), encoding_name, strlen(encoding_name)) == 0) {
614 return encoding;
615 }
616 }
617
618 for (i = 0; (encoding = zend_encoding_table[i]) != NULL; i++) {
619 if (encoding->aliases != NULL) {
620 for (j = 0; (*encoding->aliases)[j] != NULL; j++) {
621 if (zend_binary_strcasecmp((*encoding->aliases)[j], strlen((*encoding->aliases)[j]), encoding_name, strlen(encoding_name)) == 0) {
622 return encoding;
623 }
624 }
625 }
626 }
627
628 return NULL;
629 }
630
631
zend_multibyte_script_encoding_filter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length TSRMLS_DC)632 ZEND_API size_t zend_multibyte_script_encoding_filter(unsigned char **to, size_t
633 *to_length, const unsigned char *from, size_t from_length TSRMLS_DC)
634 {
635 const char *name;
636
637 if (LANG_SCNG(internal_encoding) == NULL || LANG_SCNG(internal_encoding)->compatible == 0) {
638 name = "UTF-8";
639 } else {
640 name = LANG_SCNG(internal_encoding)->name;
641 }
642
643 return zend_multibyte_encoding_filter(to, to_length, name, from, from_length, LANG_SCNG(script_encoding)->name TSRMLS_CC);
644 }
645
zend_multibyte_internal_encoding_filter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length TSRMLS_DC)646 ZEND_API size_t zend_multibyte_internal_encoding_filter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length TSRMLS_DC)
647 {
648 const char *name;
649
650 if (LANG_SCNG(script_encoding)->compatible == 0) {
651 name = "UTF-8";
652 } else {
653 name = LANG_SCNG(script_encoding)->name;
654 }
655
656 return zend_multibyte_encoding_filter(to, to_length, LANG_SCNG(internal_encoding)->name, from, from_length, name TSRMLS_CC);
657 }
658
/*
 * Run the registered converter on `from`.
 *
 * If an "odd length" callback is registered, the byte count it reports is
 * subtracted from from_length before the conversion.
 *
 * Returns the number of input bytes handed to the converter, or 0 when no
 * converter is registered or the converter returns non-zero.
 */
static size_t zend_multibyte_encoding_filter(unsigned char **to, size_t *to_length, const char *to_encoding, const unsigned char *from, size_t from_length, const char *from_encoding TSRMLS_DC)
{
	if (CG(encoding_converter) == NULL) {
		return 0;
	}

	if (CG(encoding_oddlen) != NULL) {
		size_t trailing = CG(encoding_oddlen)(from, from_length, from_encoding TSRMLS_CC);

		/* subtracting zero is a no-op, so no separate "> 0" test is needed */
		from_length -= trailing;
	}

	if (CG(encoding_converter)(to, to_length, from, from_length, to_encoding, from_encoding TSRMLS_CC) == 0) {
		return from_length;
	}

	return 0;
}
680
681
682 /*
683 * Shift_JIS Input/Output Filter
684 */
/*
 * Byte classification used by sjis_input_filter(), indexed by the full
 * byte value (256 entries).  As consumed there, for bytes with the high
 * bit set:
 *   1     -> single byte, emitted behind an 0x8e prefix (0xa0-0xdf)
 *   2     -> lead byte of a two-byte kanji (0x80-0x9f, 0xe0-0xef)
 *   other -> lead byte routed to the user-defined-character path
 *            (0xf0-0xff hold 3, 4, 5 and 0; the filter does not tell
 *            these values apart)
 */
static const unsigned char table_sjis[] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xf0 */ 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 0, 0, 0
};
703
/*
 * Input filter: rewrite a Shift_JIS buffer as EUC-JP for the scanner.
 *
 * buf          receives a newly emalloc'ed, NUL-terminated buffer
 * length       receives the number of bytes written (without the NUL)
 * sjis         Shift_JIS input; processing stops at the first NUL byte or
 *              after sjis_length bytes, whichever comes first
 * sjis_length  number of readable bytes in sjis
 *
 * Returns *length, or 0 if the allocation fails.
 *
 * Sizing: a single high-bit byte classified as 1 by table_sjis becomes two
 * bytes (0x8e + byte) and a two-byte user-defined character becomes three,
 * so the output can be up to twice the input.  The buffer is therefore
 * sized 2 * sjis_length + 1.
 *
 * A lead byte whose trail byte is missing (end of input or NUL) is copied
 * through unchanged and ends the conversion.
 */
size_t sjis_input_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC)
{
	const unsigned char *p = sjis;
	const unsigned char *end = sjis + sjis_length;
	unsigned char *q;
	unsigned char c1, c2;

	*buf = (unsigned char*)emalloc(sjis_length * 2 + 1);
	if (!*buf) {
		return 0;
	}
	*length = 0;
	q = *buf;

	/* convert [SJIS -> EUC-JP] (for lex scan); bounds are checked before every read */
	while (p < end && *p) {
		if (!(*p & 0x80)) {
			/* 7 bit: copied as is */
			*q++ = *p++;
			continue;
		}

		if (table_sjis[*p] == 1) {
			/* 1 byte kana */
			*q++ = 0x8e;
			*q++ = *p++;
			continue;
		}

		/* everything below is a lead byte and needs a trail byte */
		if ((size_t)(end - p) < 2 || !p[1]) {
			*q++ = *p++;
			break;
		}

		c1 = *p++;
		c2 = *p++;

		if (table_sjis[c1] == 2) {
			/* 2 byte kanji code */
			c1 -= (c1 <= 0x9f) ? 0x71 : 0xb1;
		} else {
			/*
			 * user defined chars (ATTENTION)
			 *
			 * This is NOT a real conversion: these chars cannot be mapped
			 * 1 to 1 to EUC-JP, so they are parked in the 3 byte EUC-JP
			 * space purely so that sjis_output_filter() can restore them.
			 *
			 * MAP TO (EUC-JP):
			 *  type A: 0xeba1 - 0xf4fe
			 *  type B: 0xf5a1 - 0xfefe
			 *  type C: 0xa1a1 - 0xa6fe
			 */
			*q++ = 0x8f;
			c1 -= (c1 > 0xf9) ? (0x79+0x71) : (0x0a+0xb1);
		}

		/* row/cell arithmetic shared by both two-byte paths */
		c1 = (c1 << 1) + 1;
		if (c2 >= 0x9e) {
			c2 -= 0x7e;
			c1++;
		} else if (c2 > 0x7f) {
			c2 -= 0x20;
		} else {
			c2 -= 0x1f;
		}

		*q++ = c1 | 0x80;
		*q++ = c2 | 0x80;
	}
	*q = '\0';
	*length = q - *buf;

	return *length;
}
806
/*
 * Byte classification used by sjis_output_filter(), indexed by the full
 * byte value (256 entries).  The filter only tests for the value 2, which
 * marks the lead byte of a two-byte sequence (0xa1-0xfe).  0x8e also holds
 * 2 and 0x8f holds 3, but the filter compares those two bytes explicitly.
 * Every other byte is 1.
 */
static const unsigned char table_eucjp[] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xa0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
825
/*
 * Output filter: turn the EUC-JP text produced by sjis_input_filter()
 * back into Shift_JIS.
 *
 * sjis         receives a newly emalloc'ed, NUL-terminated buffer
 * sjis_length  receives the number of bytes written (without the NUL)
 * buf          EUC-JP input; processing stops at the first NUL byte or
 *              after `length` bytes, whichever comes first
 * length       number of readable bytes in buf
 *
 * Returns the number of input bytes consumed, or 0 when sjis or
 * sjis_length is NULL or the allocation fails.  A two- or three-byte
 * sequence that is cut off (end of input or NUL) is left unconsumed, so
 * the return value points at its first byte.
 *
 * The output never exceeds the input: 0x8e + byte shrinks to one byte,
 * two-byte sequences stay two bytes and 0x8f-prefixed sequences shrink
 * from three bytes to two.
 */
size_t sjis_output_filter(unsigned char **sjis, size_t *sjis_length, const unsigned char *buf, size_t length TSRMLS_DC)
{
	unsigned char c1, c2;
	unsigned char *p;
	const unsigned char *q, *end;
	size_t prefix;

	if (!sjis || !sjis_length) {
		return 0;
	}

	/* always Shift_JIS <= EUC-JP */
	*sjis = (unsigned char*)emalloc(length+1);
	if (!*sjis) {
		/* test the allocation itself; `sjis` was already validated above */
		return 0;
	}
	p = *sjis;
	q = buf;
	end = buf + length;

	/* restore converted strings [EUC-JP -> Shift_JIS] */
	while (q < end && *q) {
		if (!(*q & 0x80)) {
			*p++ = *q++;
			continue;
		}

		/* hankaku kana: drop the 0x8e prefix, keep the byte after it */
		if (*q == 0x8e) {
			q++;
			if (q < end && *q) {
				*p++ = *q++;
			}
			continue;
		}

		if (table_eucjp[*q] == 2) {
			/* 2 byte kanji code */
			prefix = 0;
		} else if (*q == 0x8f) {
			/* user defined char parked in 3 byte space by the input filter */
			prefix = 1;
		} else {
			/* some other chars (may not happen) */
			*p++ = *q++;
			continue;
		}

		/* the whole sequence must be present; otherwise leave it unconsumed */
		if ((size_t)(end - q) < prefix + 2 || !q[prefix] || !q[prefix + 1]) {
			break;
		}

		q += prefix;
		c1 = *q++ & 0x7f;
		c2 = *q++ & 0x7f;

		c2 += (c1 & 0x01) ? 0x1f : 0x7d;
		if (c2 >= 0x7f) {
			c2++;
		}
		c1 = ((c1 - 0x21) >> 1) + 0x81;
		if (c1 > 0x9f) {
			c1 += 0x40;
		}

		if (prefix) {
			/* shift the lead byte back into the user defined range */
			if (c1 >= 0x81 && c1 <= 0x9f) {
				c1 += 0x79;
			} else {
				c1 += 0x0a;
			}
		}

		*p++ = c1;
		*p++ = c2;
	}
	*p = '\0';
	*sjis_length = p - *sjis;

	return q-buf; /* return length we actually read */
}
927
928
/*
 * Join the names of the given encodings into one comma separated string.
 *
 * Entries whose name is NULL are skipped.  Returns a newly emalloc'ed,
 * NUL-terminated string the caller must efree(), or NULL when the list is
 * NULL/empty, no entry has a name, or the allocation fails.
 *
 * The required size is computed first so that a single allocation is
 * enough and each name is copied exactly once.
 */
static char *zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, size_t encoding_list_size)
{
	size_t i, name_len;
	size_t needed = 0, used = 0;
	int first = 1;
	const char *name;
	char *list;

	if (!encoding_list || !encoding_list_size) {
		return NULL;
	}

	/* every name costs its length plus one byte (separator or final NUL) */
	for (i = 0; i < encoding_list_size; i++) {
		name = encoding_list[i]->name;
		if (name) {
			needed += strlen(name) + 1;
		}
	}
	if (needed == 0) {
		return NULL;
	}

	list = (char*)emalloc(needed);
	if (!list) {
		return NULL;
	}

	for (i = 0; i < encoding_list_size; i++) {
		name = encoding_list[i]->name;
		if (!name) {
			continue;
		}
		if (!first) {
			list[used++] = ',';
		}
		first = 0;
		name_len = strlen(name);
		memcpy(list + used, name, name_len);
		used += name_len;
	}
	list[used] = '\0';

	return list;
}
961
962
/*
 * Parse a comma separated list of encoding names.
 *
 * encoding_list       the list; encoding_list_size bytes are read from it
 * result              receives a newly ecalloc'ed array of descriptors
 * result_size         receives the number of descriptors stored
 *
 * Spaces and tabs around each name are ignored; names that
 * zend_multibyte_fetch_encoding() does not know are skipped, so
 * *result_size may be smaller than the number of fields (even 0).
 *
 * Returns 0 on success.  Returns -1, leaving *result and *result_size
 * untouched, when encoding_list is NULL or empty or an allocation fails.
 */
static int zend_multibyte_parse_encoding_list(const char *encoding_list,
	size_t encoding_list_size, zend_encoding ***result, size_t *result_size)
{
	size_t capacity, found = 0;
	char *work, *endp, *token, *comma, *tail;
	zend_encoding **list, *encoding;

	if (encoding_list == NULL || encoding_list_size == 0) {
		return -1;
	}

	/* work on a private copy: separators get overwritten with NUL bytes */
	work = (char *)estrndup(encoding_list, encoding_list_size);
	if (work == NULL) {
		return -1;
	}
	endp = work + encoding_list_size;

	/* one slot per field: number of commas + 1 */
	capacity = 1;
	token = work;
	while ((comma = zend_memnstr(token, ",", 1, endp)) != NULL) {
		token = comma + 1;
		capacity++;
	}

	list = (zend_encoding**)ecalloc(capacity, sizeof(zend_encoding*));
	if (list == NULL) {
		efree(work);
		return -1;
	}

	token = work;
	do {
		comma = zend_memnstr(token, ",", 1, endp);
		tail = (comma != NULL) ? comma : endp;
		*tail = '\0';

		/* trim leading blanks */
		while (token < tail && (*token == ' ' || *token == '\t')) {
			token++;
		}
		/*
		 * Trim trailing blanks.  Comparing against token + 1 keeps every
		 * intermediate pointer inside the buffer, even for an empty field.
		 */
		while (tail > token + 1 && (tail[-1] == ' ' || tail[-1] == '\t')) {
			*--tail = '\0';
		}

		encoding = zend_multibyte_fetch_encoding(token);
		if (encoding) {
			list[found++] = encoding;
		}

		/* only step past a separator that actually exists */
		if (comma != NULL) {
			token = comma + 1;
		}
	} while (found < capacity && comma != NULL);

	*result = list;
	*result_size = found;
	efree(work);

	return 0;
}
1030
1031
/*
 * Decide which encoding the current script is written in.
 *
 * Order of precedence:
 *   1. onetime_encoding, when non-NULL
 *   2. BOM / wide-character detection, when CG(detect_unicode) is set
 *   3. the registered detector, when more than one script encoding is
 *      configured and a detector is installed
 *   4. the first configured script encoding
 *
 * Returns NULL when no script encoding is configured, or when the
 * detector was consulted and its answer is NULL or an unknown name.
 */
static zend_encoding* zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC)
{
	zend_encoding *detected;
	char *candidates, *detected_name;

	if (onetime_encoding != NULL) {
		return onetime_encoding;
	}

	if (CG(detect_unicode)) {
		/* a BOM or wide characters win over the 'script_encoding' option */
		detected = zend_multibyte_detect_unicode(TSRMLS_C);
		if (detected != NULL) {
			return detected;
		}
	}

	/* nothing configured: leave the script alone */
	if (CG(script_encoding_list) == NULL || CG(script_encoding_list_size) == 0) {
		return NULL;
	}

	/* a single candidate, or no detector to ask: take the first entry */
	if (CG(script_encoding_list_size) <= 1 || !CG(encoding_detector)) {
		return CG(script_encoding_list)[0];
	}

	/* several candidates: let the detector pick one by name */
	candidates = zend_multibyte_assemble_encoding_list(CG(script_encoding_list),
		CG(script_encoding_list_size));
	detected_name = CG(encoding_detector)(LANG_SCNG(script_org),
		LANG_SCNG(script_org_size), candidates TSRMLS_CC);
	if (candidates) {
		efree(candidates);
	}

	if (!detected_name) {
		return NULL;
	}

	detected = zend_multibyte_fetch_encoding(detected_name);
	efree(detected_name);

	return detected;
}
1076
1077
zend_multibyte_detect_unicode(TSRMLS_D)1078 static zend_encoding* zend_multibyte_detect_unicode(TSRMLS_D)
1079 {
1080 zend_encoding *script_encoding = NULL;
1081 int bom_size;
1082 unsigned char *script;
1083
1084 if (LANG_SCNG(script_org_size) < sizeof(BOM_UTF32_LE)-1) {
1085 return NULL;
1086 }
1087
1088 /* check out BOM */
1089 if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_BE, sizeof(BOM_UTF32_BE)-1)) {
1090 script_encoding = &encoding_utf32be;
1091 bom_size = sizeof(BOM_UTF32_BE)-1;
1092 } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_LE, sizeof(BOM_UTF32_LE)-1)) {
1093 script_encoding = &encoding_utf32le;
1094 bom_size = sizeof(BOM_UTF32_LE)-1;
1095 } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_BE, sizeof(BOM_UTF16_BE)-1)) {
1096 script_encoding = &encoding_utf16be;
1097 bom_size = sizeof(BOM_UTF16_BE)-1;
1098 } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_LE, sizeof(BOM_UTF16_LE)-1)) {
1099 script_encoding = &encoding_utf16le;
1100 bom_size = sizeof(BOM_UTF16_LE)-1;
1101 } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF8, sizeof(BOM_UTF8)-1)) {
1102 script_encoding = &encoding_utf8;
1103 bom_size = sizeof(BOM_UTF8)-1;
1104 }
1105
1106 if (script_encoding) {
1107 /* remove BOM */
1108 script = (unsigned char*)emalloc(LANG_SCNG(script_org_size)+1-bom_size);
1109 memcpy(script, LANG_SCNG(script_org)+bom_size, LANG_SCNG(script_org_size)+1-bom_size);
1110 efree(LANG_SCNG(script_org));
1111 LANG_SCNG(script_org) = script;
1112 LANG_SCNG(script_org_size) -= bom_size;
1113
1114 return script_encoding;
1115 }
1116
1117 /* script contains NULL bytes -> auto-detection */
1118 if (memchr(LANG_SCNG(script_org), 0, LANG_SCNG(script_org_size))) {
1119 /* make best effort if BOM is missing */
1120 return zend_multibyte_detect_utf_encoding(LANG_SCNG(script_org), LANG_SCNG(script_org_size) TSRMLS_CC);
1121 }
1122
1123 return NULL;
1124 }
1125
zend_multibyte_detect_utf_encoding(const unsigned char * script,size_t script_size TSRMLS_DC)1126 static zend_encoding *zend_multibyte_detect_utf_encoding(const unsigned char *script, size_t script_size TSRMLS_DC)
1127 {
1128 const unsigned char *p;
1129 int wchar_size = 2;
1130 int le = 0;
1131
1132 /* utf-16 or utf-32? */
1133 p = script;
1134 while ((p-script) < script_size) {
1135 p = memchr(p, 0, script_size-(p-script)-2);
1136 if (!p) {
1137 break;
1138 }
1139 if (*(p+1) == '\0' && *(p+2) == '\0') {
1140 wchar_size = 4;
1141 break;
1142 }
1143
1144 /* searching for UTF-32 specific byte orders, so this will do */
1145 p += 4;
1146 }
1147
1148 /* BE or LE? */
1149 p = script;
1150 while ((p-script) < script_size) {
1151 if (*p == '\0' && *(p+wchar_size-1) != '\0') {
1152 /* BE */
1153 le = 0;
1154 break;
1155 } else if (*p != '\0' && *(p+wchar_size-1) == '\0') {
1156 /* LE* */
1157 le = 1;
1158 break;
1159 }
1160 p += wchar_size;
1161 }
1162
1163 if (wchar_size == 2) {
1164 return le ? &encoding_utf16le : &encoding_utf16be;
1165 } else {
1166 return le ? &encoding_utf32le : &encoding_utf32be;
1167 }
1168
1169 return NULL;
1170 }
1171 #endif /* ZEND_MULTIBYTE */
1172
1173 /*
1174 * Local variables:
1175 * tab-width: 4
1176 * c-basic-offset: 4
1177 * End:
1178 * vim600: sw=4 ts=4 tw=78
1179 * vim<600: sw=4 ts=4 tw=78
1180 */
1181