xref: /PHP-5.3/Zend/zend_multibyte.c (revision 831fbcf3)
1 /*
2    +----------------------------------------------------------------------+
3    | Zend Engine                                                          |
4    +----------------------------------------------------------------------+
5    | Copyright (c) 1998-2013 Zend Technologies Ltd. (http://www.zend.com) |
6    +----------------------------------------------------------------------+
7    | This source file is subject to version 2.00 of the Zend license,     |
8    | that is bundled with this package in the file LICENSE, and is        |
9    | available through the world-wide-web at                              |
10    | http://www.zend.com/license/2_00.txt.                                |
11    | If you did not receive a copy of the Zend license and are unable to  |
12    | obtain it through the world-wide-web, please send a note to          |
13    | license@zend.com so we can mail you a copy immediately.              |
14    +----------------------------------------------------------------------+
15    | Authors: Masaki Fujimoto <fujimoto@php.net>                          |
16    |          Rui Hirokawa <hirokawa@php.net>                             |
17    +----------------------------------------------------------------------+
18 */
19 
20 /* $Id$ */
21 
22 #include "zend.h"
23 #include "zend_compile.h"
24 #include "zend_operators.h"
25 #include "zend_multibyte.h"
26 
27 #ifdef ZEND_MULTIBYTE
28 static size_t zend_multibyte_encoding_filter(unsigned char **to, size_t *to_length, const char *to_encoding, const unsigned char *from, size_t from_length, const char *from_encoding TSRMLS_DC);
29 size_t sjis_input_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC);
30 size_t sjis_output_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC);
31 static char* zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, size_t encoding_list_size);
32 static int zend_multibyte_parse_encoding_list(const char *encoding_list,
33 size_t encoding_list_size, zend_encoding ***result, size_t *result_size);
34 static zend_encoding *zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC);
35 static zend_encoding *zend_multibyte_detect_unicode(TSRMLS_D);
36 static zend_encoding *zend_multibyte_detect_utf_encoding(const unsigned char *script, size_t script_size TSRMLS_DC);
37 
38 /*
39  * encodings
40  */
41 static const char *ucs2_aliases[] = {"ISO-10646-UCS-2", "UCS2" , "UNICODE", NULL};
42 static zend_encoding encoding_ucs2 = {
43 	NULL,
44 	NULL,
45 	"UCS-2",
46 	(const char *(*)[])&ucs2_aliases,
47 	0
48 };
49 
50 static zend_encoding encoding_ucs2be = {
51 	NULL,
52 	NULL,
53 	"UCS-2BE",
54 	NULL,
55 	0
56 };
57 
58 static zend_encoding encoding_ucs2le = {
59 	NULL,
60 	NULL,
61 	"UCS-2LE",
62 	NULL,
63 	0
64 };
65 
66 static const char *ucs4_aliases[] = {"ISO-10646-UCS-4", "UCS4", NULL};
67 static zend_encoding encoding_ucs4 = {
68 	NULL,
69 	NULL,
70 	"UCS-4",
71 	(const char *(*)[])&ucs4_aliases,
72 	0
73 };
74 
75 static zend_encoding encoding_ucs4be = {
76 	NULL,
77 	NULL,
78 	"UCS-4BE",
79 	NULL,
80 	0
81 };
82 
83 static zend_encoding encoding_ucs4le = {
84 	NULL,
85 	NULL,
86 	"UCS-4LE",
87 	NULL,
88 	0
89 };
90 
91 static const char *utf32_aliases[] = {"utf32", NULL};
92 static zend_encoding encoding_utf32 = {
93 	NULL,
94 	NULL,
95 	"UTF-32",
96 	(const char *(*)[])&utf32_aliases,
97 	0
98 };
99 
100 static zend_encoding encoding_utf32be = {
101 	NULL,
102 	NULL,
103 	"UTF-32BE",
104 	NULL,
105 	0
106 };
107 
108 static zend_encoding encoding_utf32le = {
109 	NULL,
110 	NULL,
111 	"UTF-32LE",
112 	NULL,
113 	0
114 };
115 
116 static const char *utf16_aliases[] = {"utf16", NULL};
117 static zend_encoding encoding_utf16 = {
118 	NULL,
119 	NULL,
120 	"UTF-16",
121 	(const char *(*)[])&utf16_aliases,
122 	0
123 };
124 
125 static zend_encoding encoding_utf16be = {
126 	NULL,
127 	NULL,
128 	"UTF-16BE",
129 	NULL,
130 	0
131 };
132 
133 static zend_encoding encoding_utf16le = {
134 	NULL,
135 	NULL,
136 	"UTF-16LE",
137 	NULL,
138 	0
139 };
140 
141 static const char *utf8_aliases[] = {"utf8", NULL};
142 static zend_encoding encoding_utf8 = {
143 	NULL,
144 	NULL,
145 	"UTF-8",
146 	(const char *(*)[])&utf8_aliases,
147 	1
148 };
149 
150 static const char *ascii_aliases[] = {"ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991", "US-ASCII", "ISO646-US", "us", "IBM367", "cp367", "csASCII", NULL};
151 static zend_encoding encoding_ascii = {
152 	NULL,
153 	NULL,
154 	"ASCII",
155 	(const char *(*)[])&ascii_aliases,
156 	1
157 };
158 
159 static const char *euc_jp_aliases[] = {"EUC", "EUC_JP", "eucJP", "x-euc-jp", NULL};
160 static zend_encoding encoding_euc_jp = {
161 	NULL,
162 	NULL,
163 	"EUC-JP",
164 	(const char *(*)[])&euc_jp_aliases,
165 	1
166 };
167 
168 static const char *sjis_aliases[] = {"x-sjis", "SJIS", "SHIFT-JIS", NULL};
169 static zend_encoding encoding_sjis = {
170 	sjis_input_filter,
171 	sjis_output_filter,
172 	"Shift_JIS",
173 	(const char *(*)[])&sjis_aliases,
174 	0
175 };
176 
177 static const char *eucjp_win_aliases[] = {"eucJP-open", NULL};
178 static zend_encoding encoding_eucjp_win = {
179 	NULL,
180 	NULL,
181 	"eucJP-win",
182 	(const char *(*)[])&eucjp_win_aliases,
183 	1
184 };
185 
186 static const char *sjis_win_aliases[] = {"SJIS-open", "MS_Kanji", "Windows-31J", "CP932", NULL};
187 static zend_encoding encoding_sjis_win = {
188 	/* sjis-filters does not care about diffs of Shift_JIS and CP932 */
189 	sjis_input_filter,
190 	sjis_output_filter,
191 	"SJIS-win",
192 	(const char *(*)[])&sjis_win_aliases,
193 	0
194 };
195 
196 static const char *jis_aliases[] = {"ISO-2022-JP", NULL};
197 static zend_encoding encoding_jis = {
198 	NULL,
199 	NULL,
200 	"JIS",
201 	(const char *(*)[])&jis_aliases,
202 	0
203 };
204 
205 static const char *euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", "gb2312", NULL};
206 static zend_encoding encoding_euc_cn = {
207 	NULL,
208 	NULL,
209 	"EUC-CN",
210 	(const char *(*)[])&euc_cn_aliases,
211 	1
212 };
213 
214 static const char *cp936_aliases[] = {"CP-936", NULL};
215 static zend_encoding encoding_cp936 = {
216 	NULL,
217 	NULL,
218 	"CP936",
219 	(const char *(*)[])&cp936_aliases,
220 	0
221 };
222 
223 static const char *hz_aliases[] = {"HZ-GB-2312", NULL};
224 static zend_encoding encoding_hz = {
225 	NULL,
226 	NULL,
227 	"HZ",
228 	(const char *(*)[])&hz_aliases,
229 	0
230 };
231 
232 static const char *euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL};
233 static zend_encoding encoding_euc_tw = {
234 	NULL,
235 	NULL,
236 	"EUC-TW",
237 	(const char *(*)[])&euc_tw_aliases,
238 	1
239 };
240 
241 static const char *big5_aliases[] = {"BIG5", "CN-BIG5", "BIG-FIVE", "BIGFIVE", "CP950", NULL};
242 static zend_encoding encoding_big5 = {
243 	NULL,
244 	NULL,
245 	"BIG-5",
246 	(const char *(*)[])&big5_aliases,
247 	0
248 };
249 
250 static const char *euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL};
251 static zend_encoding encoding_euc_kr = {
252 	NULL,
253 	NULL,
254 	"EUC-KR",
255 	(const char *(*)[])&euc_kr_aliases,
256 	1
257 };
258 
259 static const char *uhc_aliases[] = {"CP949", NULL};
260 static zend_encoding encoding_uhc = {
261 	NULL,
262 	NULL,
263 	"UHC",
264 	(const char *(*)[])&uhc_aliases,
265 	1
266 };
267 
268 static zend_encoding encoding_2022kr = {
269 	NULL,
270 	NULL,
271 	"ISO-2022-KR",
272 	NULL,
273 	0
274 };
275 
276 static const char *cp1252_aliases[] = {"cp1252", NULL};
277 static zend_encoding encoding_cp1252 = {
278 	NULL,
279 	NULL,
280 	"Windows-1252",
281 	(const char *(*)[])&cp1252_aliases,
282 	1
283 };
284 
285 static const char *iso_8859_1_aliases[] = {"ISO_8859-1", "latin1", NULL};
286 static zend_encoding encoding_8859_1 = {
287 	NULL,
288 	NULL,
289 	"ISO-8859-1",
290 	(const char *(*)[])&iso_8859_1_aliases,
291 	1
292 };
293 
294 static const char *iso_8859_2_aliases[] = {"ISO_8859-2", "latin2", NULL};
295 static zend_encoding encoding_8859_2 = {
296 	NULL,
297 	NULL,
298 	"ISO-8859-2",
299 	(const char *(*)[])&iso_8859_2_aliases,
300 	1
301 };
302 
303 static const char *iso_8859_3_aliases[] = {"ISO_8859-3", "latin3", NULL};
304 static zend_encoding encoding_8859_3 = {
305 	NULL,
306 	NULL,
307 	"ISO-8859-3",
308 	(const char *(*)[])&iso_8859_3_aliases,
309 	1
310 };
311 
312 static const char *iso_8859_4_aliases[] = {"ISO_8859-4", "latin4", NULL};
313 static zend_encoding encoding_8859_4 = {
314 	NULL,
315 	NULL,
316 	"ISO-8859-4",
317 	(const char *(*)[])&iso_8859_4_aliases,
318 	1
319 };
320 
321 static const char *iso_8859_5_aliases[] = {"ISO_8859-5", "cyrillic", NULL};
322 static zend_encoding encoding_8859_5 = {
323 	NULL,
324 	NULL,
325 	"ISO-8859-5",
326 	(const char *(*)[])&iso_8859_5_aliases,
327 	1
328 };
329 
330 static const char *iso_8859_6_aliases[] = {"ISO_8859-6", "arabic", NULL};
331 static zend_encoding encoding_8859_6 = {
332 	NULL,
333 	NULL,
334 	"ISO-8859-6",
335 	(const char *(*)[])&iso_8859_6_aliases,
336 	1
337 };
338 
339 static const char *iso_8859_7_aliases[] = {"ISO_8859-7", "greek", NULL};
340 static zend_encoding encoding_8859_7 = {
341 	NULL,
342 	NULL,
343 	"ISO-8859-7",
344 	(const char *(*)[])&iso_8859_7_aliases,
345 	1
346 };
347 
348 static const char *iso_8859_8_aliases[] = {"ISO_8859-8", "hebrew", NULL};
349 static zend_encoding encoding_8859_8 = {
350 	NULL,
351 	NULL,
352 	"ISO-8859-8",
353 	(const char *(*)[])&iso_8859_8_aliases,
354 	1
355 };
356 
357 static const char *iso_8859_9_aliases[] = {"ISO_8859-9", "latin5", NULL};
358 static zend_encoding encoding_8859_9 = {
359 	NULL,
360 	NULL,
361 	"ISO-8859-9",
362 	(const char *(*)[])&iso_8859_9_aliases,
363 	1
364 };
365 
366 static const char *iso_8859_10_aliases[] = {"ISO_8859-10", "latin6", NULL};
367 static zend_encoding encoding_8859_10 = {
368 	NULL,
369 	NULL,
370 	"ISO-8859-10",
371 	(const char *(*)[])&iso_8859_10_aliases,
372 	1
373 };
374 
375 static const char *iso_8859_13_aliases[] = {"ISO_8859-13", NULL};
376 static zend_encoding encoding_8859_13 = {
377 	NULL,
378 	NULL,
379 	"ISO-8859-13",
380 	(const char *(*)[])&iso_8859_13_aliases,
381 	1
382 };
383 
384 static const char *iso_8859_14_aliases[] = {"ISO_8859-14", "latin8", NULL};
385 static zend_encoding encoding_8859_14 = {
386 	NULL,
387 	NULL,
388 	"ISO-8859-14",
389 	(const char *(*)[])&iso_8859_14_aliases,
390 	1
391 };
392 
393 static const char *iso_8859_15_aliases[] = {"ISO_8859-15", NULL};
394 static zend_encoding encoding_8859_15 = {
395 	NULL,
396 	NULL,
397 	"ISO-8859-15",
398 	(const char *(*)[])&iso_8859_15_aliases,
399 	1
400 };
401 
402 static const char *cp1251_aliases[] = {"CP1251", "CP-1251", "WINDOWS-1251", NULL};
403 static zend_encoding encoding_cp1251 = {
404 	NULL,
405 	NULL,
406 	"Windows-1251",
407 	(const char *(*)[])&cp1251_aliases,
408 	1
409 };
410 
411 static const char *cp866_aliases[] = {"CP866", "CP-866", "IBM-866", NULL};
412 static zend_encoding encoding_cp866 = {
413 	NULL,
414 	NULL,
415 	"CP866",
416 	(const char *(*)[])&cp866_aliases,
417 	1
418 };
419 
420 static const char *koi8r_aliases[] = {"KOI8-R", "KOI8R", NULL};
421 static zend_encoding encoding_koi8r = {
422 	NULL,
423 	NULL,
424 	"KOI8-R",
425 	(const char *(*)[])&koi8r_aliases,
426 	1
427 };
428 
429 static const char *koi8u_aliases[] = {"KOI8-U", "KOI8U", NULL};
430 static zend_encoding encoding_koi8u = {
431 	NULL,
432 	NULL,
433 	"KOI8-U",
434 	(const char *(*)[])&koi8u_aliases,
435 	1
436 };
437 
438 static const char *cp1254_aliases[] = {"cp1254", NULL};
439 static zend_encoding encoding_cp1254 = {
440 	NULL,
441 	NULL,
442 	"Windows-1254",
443 	(const char *(*)[])&cp1254_aliases,
444 	1
445 };
446 
447 static const char *armscii8_aliases[] = { "ArmSCII8", "ARMSCII-8", "ARMSCII8", NULL};
448 static zend_encoding encoding_armscii8 = {
449 	NULL,
450 	NULL,
451 	"ArmSCII-8",
452 	(const char *(*)[])&armscii8_aliases,
453 	1
454 };
455 
456 static const char *cp850_aliases[] = {"IBM850", NULL};
457 static zend_encoding encoding_cp850 = {
458 	NULL,
459 	NULL,
460 	"CP850",
461 	(const char *(*)[])&cp850_aliases,
462 	1
463 };
464 
465 static zend_encoding *zend_encoding_table[] = {
466 	&encoding_ucs4,
467 	&encoding_ucs4be,
468 	&encoding_ucs4le,
469 	&encoding_ucs2,
470 	&encoding_ucs2be,
471 	&encoding_ucs2le,
472 	&encoding_utf32,
473 	&encoding_utf32be,
474 	&encoding_utf32le,
475 	&encoding_utf16,
476 	&encoding_utf16be,
477 	&encoding_utf16le,
478 	&encoding_utf8,
479 	&encoding_ascii,
480 	&encoding_euc_jp,
481 	&encoding_sjis,
482 	&encoding_eucjp_win,
483 	&encoding_sjis_win,
484 	&encoding_jis,
485 	&encoding_cp1252,
486 	&encoding_8859_1,
487 	&encoding_8859_2,
488 	&encoding_8859_3,
489 	&encoding_8859_4,
490 	&encoding_8859_5,
491 	&encoding_8859_6,
492 	&encoding_8859_7,
493 	&encoding_8859_8,
494 	&encoding_8859_9,
495 	&encoding_8859_10,
496 	&encoding_8859_13,
497 	&encoding_8859_14,
498 	&encoding_8859_15,
499 	&encoding_euc_cn,
500 	&encoding_cp936,
501 	&encoding_hz,
502 	&encoding_euc_tw,
503 	&encoding_big5,
504 	&encoding_euc_kr,
505 	&encoding_uhc,
506 	&encoding_2022kr,
507 	&encoding_cp1251,
508 	&encoding_cp866,
509 	&encoding_koi8r,
510 	&encoding_koi8u,
511 	&encoding_armscii8,
512 	&encoding_cp1254,
513 	&encoding_cp850,
514 	NULL
515 };
516 
517 
518 
zend_multibyte_set_script_encoding(const char * encoding_list,size_t encoding_list_size TSRMLS_DC)519 ZEND_API int zend_multibyte_set_script_encoding(const char *encoding_list,
520 size_t encoding_list_size TSRMLS_DC)
521 {
522 	if (CG(script_encoding_list)) {
523 		efree(CG(script_encoding_list));
524 		CG(script_encoding_list) = NULL;
525 	}
526 	CG(script_encoding_list_size) = 0;
527 
528 	if (!encoding_list) {
529 		return 0;
530 	}
531 
532 	zend_multibyte_parse_encoding_list(encoding_list, encoding_list_size, &(CG(script_encoding_list)), &(CG(script_encoding_list_size)));
533 
534 	return 0;
535 }
536 
537 
zend_multibyte_set_internal_encoding(const char * encoding_name TSRMLS_DC)538 ZEND_API int zend_multibyte_set_internal_encoding(const char *encoding_name TSRMLS_DC)
539 {
540 	CG(internal_encoding) = zend_multibyte_fetch_encoding(encoding_name);
541 	return 0;
542 }
543 
zend_multibyte_set_functions(zend_encoding_detector encoding_detector,zend_encoding_converter encoding_converter,zend_encoding_oddlen encoding_oddlen TSRMLS_DC)544 ZEND_API int zend_multibyte_set_functions(zend_encoding_detector encoding_detector, zend_encoding_converter encoding_converter, zend_encoding_oddlen encoding_oddlen TSRMLS_DC)
545 {
546 	CG(encoding_detector) = encoding_detector;
547 	CG(encoding_converter) = encoding_converter;
548 	CG(encoding_oddlen) = encoding_oddlen;
549 	return 0;
550 }
551 
552 
zend_multibyte_set_filter(zend_encoding * onetime_encoding TSRMLS_DC)553 ZEND_API int zend_multibyte_set_filter(zend_encoding *onetime_encoding TSRMLS_DC)
554 {
555 	LANG_SCNG(script_encoding) = zend_multibyte_find_script_encoding(onetime_encoding TSRMLS_CC);
556 	LANG_SCNG(internal_encoding) = CG(internal_encoding);
557 
558 	/* judge input/output filter */
559 	LANG_SCNG(input_filter) = NULL;
560 	LANG_SCNG(output_filter) = NULL;
561 
562 	if (!LANG_SCNG(script_encoding)) {
563 		return 0;
564 	}
565 
566 	if (!LANG_SCNG(internal_encoding) || LANG_SCNG(script_encoding) == LANG_SCNG(internal_encoding)) {
567 		/* if encoding specfic filters exist, use them */
568 		if (LANG_SCNG(script_encoding)->input_filter && LANG_SCNG(script_encoding)->output_filter) {
569 			LANG_SCNG(input_filter) = LANG_SCNG(script_encoding)->input_filter;
570 			LANG_SCNG(output_filter) = LANG_SCNG(script_encoding)->output_filter;
571 			return 0;
572 		}
573 
574 		if (!LANG_SCNG(script_encoding)->compatible) {
575 			/* and if not, work around w/ script_encoding -> utf-8 -> script_encoding conversion */
576 			LANG_SCNG(internal_encoding) = LANG_SCNG(script_encoding);
577 			LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
578 			LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
579 			return 0;
580 		} else {
581 			/* nothing to do in this case */
582 			return 0;
583 		}
584 	}
585 
586 	/* LANG_SCNG(internal_encoding) cannot be NULL here */
587 	if (LANG_SCNG(internal_encoding)->compatible) {
588 		LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
589 		return 0;
590 	} else if (LANG_SCNG(script_encoding)->compatible) {
591 		LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
592 		return 0;
593 	}
594 
595 	/* both script and internal encodings are incompatible w/ flex */
596 	LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
597 	LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
598 
599 	return 0;
600 }
601 
602 
zend_multibyte_fetch_encoding(const char * encoding_name)603 ZEND_API zend_encoding* zend_multibyte_fetch_encoding(const char *encoding_name)
604 {
605 	int i, j;
606 	zend_encoding *encoding;
607 
608 	if (!encoding_name) {
609 		return NULL;
610 	}
611 
612 	for (i = 0; (encoding = zend_encoding_table[i]) != NULL; i++) {
613 		if (zend_binary_strcasecmp(encoding->name, strlen(encoding->name), encoding_name, strlen(encoding_name)) == 0) {
614 			return encoding;
615 		}
616 	}
617 
618 	for (i = 0; (encoding = zend_encoding_table[i]) != NULL; i++) {
619 		if (encoding->aliases != NULL) {
620 			for (j = 0; (*encoding->aliases)[j] != NULL; j++) {
621 				if (zend_binary_strcasecmp((*encoding->aliases)[j], strlen((*encoding->aliases)[j]), encoding_name, strlen(encoding_name)) == 0) {
622 					return encoding;
623 				}
624 			}
625 		}
626 	}
627 
628 	return NULL;
629 }
630 
631 
zend_multibyte_script_encoding_filter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length TSRMLS_DC)632 ZEND_API size_t zend_multibyte_script_encoding_filter(unsigned char **to, size_t
633 *to_length, const unsigned char *from, size_t from_length TSRMLS_DC)
634 {
635 	const char *name;
636 
637 	if (LANG_SCNG(internal_encoding) == NULL || LANG_SCNG(internal_encoding)->compatible == 0) {
638 		name = "UTF-8";
639 	} else {
640 		name = LANG_SCNG(internal_encoding)->name;
641 	}
642 
643 	return zend_multibyte_encoding_filter(to, to_length, name, from, from_length, LANG_SCNG(script_encoding)->name TSRMLS_CC);
644 }
645 
zend_multibyte_internal_encoding_filter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length TSRMLS_DC)646 ZEND_API size_t zend_multibyte_internal_encoding_filter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length TSRMLS_DC)
647 {
648 	const char *name;
649 
650 	if (LANG_SCNG(script_encoding)->compatible == 0) {
651 		name = "UTF-8";
652 	} else {
653 		name = LANG_SCNG(script_encoding)->name;
654 	}
655 
656 	return zend_multibyte_encoding_filter(to, to_length, LANG_SCNG(internal_encoding)->name, from, from_length, name TSRMLS_CC);
657 }
658 
/*
 * Run the registered converter callback on `from`.
 *
 * If an odd-length callback is registered, the number of bytes it reports is
 * cut off the end of the input before converting.
 *
 * Returns 0 when no converter is registered or the converter returns
 * non-zero; otherwise returns the (possibly shortened) input length that was
 * handed to the converter.
 */
static size_t zend_multibyte_encoding_filter(unsigned char **to, size_t *to_length, const char *to_encoding, const unsigned char *from, size_t from_length, const char *from_encoding TSRMLS_DC)
{
	if (CG(encoding_converter) == NULL) {
		return 0;
	}

	if (CG(encoding_oddlen) != NULL) {
		size_t dangling = CG(encoding_oddlen)(from, from_length, from_encoding TSRMLS_CC);

		if (dangling > 0) {
			from_length -= dangling;
		}
	}

	return (CG(encoding_converter)(to, to_length, from, from_length, to_encoding, from_encoding TSRMLS_CC) != 0)
		? 0
		: from_length;
}
680 
681 
682 /*
683  *	Shift_JIS Input/Output Filter
684  */
685 static const unsigned char table_sjis[] = { /* 0x80-0x9f,0xE0-0xEF */
686   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
687   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
688   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
689   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
690   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
691   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
692   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
693   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
694   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
695   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
696   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
697   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
698   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
699   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
700   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
701   3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 0, 0, 0
702 };
703 
sjis_input_filter(unsigned char ** buf,size_t * length,const unsigned char * sjis,size_t sjis_length TSRMLS_DC)704 size_t sjis_input_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC)
705 {
706 	const unsigned char *p;
707 	unsigned char *q;
708 	unsigned char  c1, c2;
709 
710 	*buf = (unsigned char*)emalloc(sjis_length * 3 / 2 + 1);
711 	if (!*buf)
712 		return 0;
713 	*length = 0;
714 
715 	p = sjis;
716 	q = *buf;
717 
718 	/* convert [SJIS -> EUC-JP] (for lex scan) -- some other better ways? */
719 	while (*p && (p - sjis) < sjis_length) {
720 		if (!(*p & 0x80)) {
721 			*q++ = *p++;
722 			continue;
723 		}
724 
725 		/* handling 8 bit code */
726 		if (table_sjis[*p] == 1) {
727 			/* 1 byte kana */
728 			*q++ = 0x8e;
729 			*q++ = *p++;
730 			continue;
731 		}
732 
733 		if (!*(p+1)) {
734 			*q++ = *p++;
735 			break;
736 		}
737 
738 		if (table_sjis[*p] == 2) {
739 			/* 2 byte kanji code */
740 			c1 = *p++;
741 			if (!*p || (p - sjis) >= sjis_length) {
742 				break;
743 			}
744 			c2 = *p++;
745 			c1 -= (c1 <= 0x9f) ? 0x71 : 0xb1;
746 			c1 = (c1 << 1) + 1;
747 			if (c2 >= 0x9e) {
748 				c2 -= 0x7e;
749 				c1++;
750 			} else if (c2 > 0x7f) {
751 				c2 -= 0x20;
752 			} else {
753 				c2 -= 0x1f;
754 			}
755 
756 			c1 |= 0x80;
757 			c2 |= 0x80;
758 
759 			*q++ = c1;
760 			*q++ = c2;
761 		} else {
762 			/*
763 			 * for user defined chars (ATTENTION)
764 			 *
765 			 * THESE ARE NOT CODE FOR CONVERSION! :-P
766 			 * (using *ILLEGALLY* 3byte EUC-JP space)
767 			 *
768 			 * we cannot perfectly (== 1 to 1)  convert these chars to EUC-JP.
769 			 * so, these code are for perfect RESTORING in sjis_output_filter()
770 			 */
771 			c1 = *p++;
772 			if (!*p || (p - sjis) >= sjis_length) {
773 				break;
774 			}
775 			c2 = *p++;
776 			*q++ = 0x8f;
777 			/*
778 			 * MAP TO (EUC-JP):
779 			 * type A: 0xeba1 - 0xf4fe
780 			 * type B: 0xf5a1 - 0xfefe
781 			 * type C: 0xa1a1 - 0xa6fe
782 			 */
783 			c1 -= (c1 > 0xf9) ? (0x79+0x71) : (0x0a+0xb1);
784 			c1 = (c1 << 1) + 1;
785 			if (c2 >= 0x9e) {
786 				c2 -= 0x7e;
787 				c1++;
788 			} else if (c2 > 0x7f) {
789 				c2 -= 0x20;
790 			} else {
791 				c2 -= 0x1f;
792 			}
793 
794 			c1 |= 0x80;
795 			c2 |= 0x80;
796 
797 			*q++ = c1;
798 			*q++ = c2;
799 		}
800 	}
801 	*q = '\0';
802 	*length = q - *buf;
803 
804 	return *length;
805 }
806 
807 static const unsigned char table_eucjp[] = { /* 0xA1-0xFE */
808   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
809   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
810   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
811   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
812   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
813   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
814   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
815   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
816   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
817   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
818   1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
819   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
820   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
821   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
822   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
823   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
824 };
825 
sjis_output_filter(unsigned char ** sjis,size_t * sjis_length,const unsigned char * buf,size_t length TSRMLS_DC)826 size_t sjis_output_filter(unsigned char **sjis, size_t *sjis_length, const unsigned char *buf, size_t length TSRMLS_DC)
827 {
828 	unsigned char c1, c2;
829 	unsigned char *p;
830 	const unsigned char *q;
831 
832 	if (!sjis || !sjis_length) {
833 		return 0;
834 	}
835 
836 	/* always Shift_JIS <= EUC-JP */
837 	*sjis = (unsigned char*)emalloc(length+1);
838 	if (!sjis) {
839 		return 0;
840 	}
841 	p = *sjis;
842 	q = buf;
843 
844 	/* restore converted strings [EUC-JP -> Shift_JIS] */
845 	while (*q && (q - buf) < length) {
846 		if (!(*q & 0x80)) {
847 			*p++ = *q++;
848 			continue;
849 		}
850 
851 		/* hankaku kana */
852 		if (*q == 0x8e) {
853 			q++;
854 			if (*q) {
855 				*p++ = *q++;
856 			}
857 			continue;
858 		}
859 
860 		/* 2 byte kanji code */
861 		if (table_eucjp[*q] == 2) {
862 			c1 = (*q++ & ~0x80) & 0xff;
863 			if (*q) {
864 				c2 = (*q++ & ~0x80) & 0xff;
865 			} else {
866 				q--;
867 				break;
868 			}
869 
870 			c2 += (c1 & 0x01) ? 0x1f : 0x7d;
871 			if (c2 >= 0x7f) {
872 				c2++;
873 			}
874 			c1 = ((c1 - 0x21) >> 1) + 0x81;
875 			if (c1 > 0x9f) {
876 				c1 += 0x40;
877 			}
878 
879 			*p++ = c1;
880 			*p++ = c2;
881 			continue;
882 		}
883 
884 		if (*q == 0x8f) {
885 			q++;
886 			if (*q) {
887 				c1 = (*q++ & ~0x80) & 0xff;
888 			} else {
889 				q--;
890 				break;
891 			}
892 			if (*q) {
893 				c2 = (*q++ & ~0x80) & 0xff;
894 			} else {
895 				q -= 2;
896 				break;
897 			}
898 
899 			c2 += (c1 & 0x01) ? 0x1f : 0x7d;
900 			if (c2 >= 0x7f) {
901 				c2++;
902 			}
903 			c1 = ((c1 - 0x21) >> 1) + 0x81;
904 			if (c1 > 0x9f) {
905 				c1 += 0x40;
906 			}
907 
908 			if (c1 >= 0x81 && c1 <= 0x9f) {
909 				c1 += 0x79;
910 			} else {
911 				c1 += 0x0a;
912 			}
913 
914 			*p++ = c1;
915 			*p++ = c2;
916 			continue;
917 		}
918 
919 		/* some other chars (may not happen) */
920 		*p++ = *q++;
921 	}
922 	*p = '\0';
923 	*sjis_length = p - *sjis;
924 
925 	return q-buf;	/* return length we actually read */
926 }
927 
928 
/*
 * Build a comma-separated string of the names of the given encodings.
 *
 * Entries whose name is NULL are skipped. Returns a newly emalloc'ed,
 * NUL-terminated string owned by the caller, or NULL when the list is
 * NULL/empty or no entry has a name.
 */
static char *zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, size_t encoding_list_size)
{
	size_t i, name_len, total = 0;
	int first = 1;
	const char *name;
	char *list, *cursor;

	if (!encoding_list || !encoding_list_size) {
		return NULL;
	}

	/*
	 * pass 1: size the result. Every name needs its own length plus one
	 * byte, which is either the separator before the next name or the
	 * final NUL.
	 */
	for (i = 0; i < encoding_list_size; i++) {
		name = encoding_list[i]->name;
		if (name) {
			total += strlen(name) + 1;
		}
	}
	if (total == 0) {
		return NULL;
	}

	list = (char*)emalloc(total);
	if (!list) {
		return NULL;
	}

	/* pass 2: copy the names, separated by commas */
	cursor = list;
	for (i = 0; i < encoding_list_size; i++) {
		name = encoding_list[i]->name;
		if (!name) {
			continue;
		}
		if (!first) {
			*cursor++ = ',';
		}
		first = 0;
		name_len = strlen(name);
		memcpy(cursor, name, name_len);
		cursor += name_len;
	}
	*cursor = '\0';

	return list;
}
961 
962 
/*
 * Parse a comma-separated list of encoding names.
 *
 * encoding_list / encoding_list_size - the text to parse (not modified; a
 *                                      private copy is tokenized)
 * result      - receives a newly ecalloc'ed array of encoding descriptors,
 *               owned by the caller
 * result_size - receives the number of descriptors stored in *result
 *
 * Spaces and tabs around each name are ignored; names that
 * zend_multibyte_fetch_encoding() does not know are silently dropped, so
 * *result_size may be smaller than the number of listed names (even 0).
 *
 * Returns 0 on success and -1 when the input is NULL/empty or an allocation
 * yields NULL; in the -1 case *result and *result_size are left untouched.
 */
static int zend_multibyte_parse_encoding_list(const char *encoding_list,
		size_t encoding_list_size, zend_encoding ***result, size_t *result_size)
{
	size_t capacity, found;
	char *work, *end, *token, *sep, *tail;
	zend_encoding **list, *encoding;

	if (encoding_list == NULL || encoding_list_size == 0) {
		return -1;
	}

	/* copy the encoding_list string for work */
	work = (char *)estrndup(encoding_list, encoding_list_size);
	if (work == NULL) {
		return -1;
	}
	end = work + encoding_list_size;

	/* count the number of listed encoding names: one more than the commas */
	capacity = 1;
	for (token = work; (sep = zend_memnstr(token, ",", 1, end)) != NULL; token = sep + 1) {
		capacity++;
	}

	/* make list */
	list = (zend_encoding**)ecalloc(capacity, sizeof(zend_encoding*));
	if (list == NULL) {
		efree(work);
		return -1;
	}

	found = 0;
	token = work;
	while (found < capacity) {
		sep = zend_memnstr(token, ",", 1, end);
		/* terminate the token at the comma, or at the end of the copy */
		tail = (sep != NULL) ? sep : end;
		*tail = '\0';

		/* trim leading blanks */
		while (token < tail && (*token == ' ' || *token == '\t')) {
			token++;
		}
		/* trim trailing blanks; tail never moves below token */
		while (tail > token && (tail[-1] == ' ' || tail[-1] == '\t')) {
			*--tail = '\0';
		}

		/* keep the name only if it is a known encoding */
		encoding = zend_multibyte_fetch_encoding(token);
		if (encoding) {
			list[found++] = encoding;
		}

		if (sep == NULL) {
			/* that was the last token */
			break;
		}
		token = sep + 1;
	}

	*result = list;
	*result_size = found;

	efree(work);
	return 0;
}
1030 
1031 
/*
 * Work out the encoding of the script being scanned. In priority order:
 *
 *  1. onetime_encoding, when given;
 *  2. BOM / wide-char detection, when CG(detect_unicode) is set and it
 *     finds something;
 *  3. the configured script encoding list:
 *       - more than one entry and a detector callback is registered: ask
 *         the detector to choose among the listed names (NULL if it cannot);
 *       - otherwise the first entry.
 *
 * Returns NULL when no script encoding list is configured.
 */
static zend_encoding* zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC)
{
	zend_encoding *found;
	char *candidates, *detected;

	/* onetime_encoding is prior to everything */
	if (onetime_encoding != NULL) {
		return onetime_encoding;
	}

	/* bom or wchar detection is prior to 'script_encoding' option */
	if (CG(detect_unicode)) {
		found = zend_multibyte_detect_unicode(TSRMLS_C);
		if (found != NULL) {
			return found;
		}
	}

	/* if no script_encoding specified, just leave alone */
	if (CG(script_encoding_list) == NULL || CG(script_encoding_list_size) == 0) {
		return NULL;
	}

	/* a single candidate, or no detector to choose with: take the first one */
	if (CG(script_encoding_list_size) <= 1 || !CG(encoding_detector)) {
		return CG(script_encoding_list)[0];
	}

	/* multiple encodings specified: detect automagically */
	candidates = zend_multibyte_assemble_encoding_list(CG(script_encoding_list),
			CG(script_encoding_list_size));
	detected = CG(encoding_detector)(LANG_SCNG(script_org),
			LANG_SCNG(script_org_size), candidates TSRMLS_CC);
	if (candidates) {
		efree(candidates);
	}

	if (!detected) {
		return NULL;
	}

	/* the detector's answer is a name: map it to a descriptor, then free it */
	found = zend_multibyte_fetch_encoding(detected);
	efree(detected);
	return found;
}
1076 
1077 
zend_multibyte_detect_unicode(TSRMLS_D)1078 static zend_encoding* zend_multibyte_detect_unicode(TSRMLS_D)
1079 {
1080 	zend_encoding *script_encoding = NULL;
1081 	int bom_size;
1082 	unsigned char *script;
1083 
1084 	if (LANG_SCNG(script_org_size) < sizeof(BOM_UTF32_LE)-1) {
1085 		return NULL;
1086 	}
1087 
1088 	/* check out BOM */
1089 	if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_BE, sizeof(BOM_UTF32_BE)-1)) {
1090 		script_encoding = &encoding_utf32be;
1091 		bom_size = sizeof(BOM_UTF32_BE)-1;
1092 	} else if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_LE, sizeof(BOM_UTF32_LE)-1)) {
1093 		script_encoding = &encoding_utf32le;
1094 		bom_size = sizeof(BOM_UTF32_LE)-1;
1095 	} else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_BE, sizeof(BOM_UTF16_BE)-1)) {
1096 		script_encoding = &encoding_utf16be;
1097 		bom_size = sizeof(BOM_UTF16_BE)-1;
1098 	} else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_LE, sizeof(BOM_UTF16_LE)-1)) {
1099 		script_encoding = &encoding_utf16le;
1100 		bom_size = sizeof(BOM_UTF16_LE)-1;
1101 	} else if (!memcmp(LANG_SCNG(script_org), BOM_UTF8, sizeof(BOM_UTF8)-1)) {
1102 		script_encoding = &encoding_utf8;
1103 		bom_size = sizeof(BOM_UTF8)-1;
1104 	}
1105 
1106 	if (script_encoding) {
1107 		/* remove BOM */
1108 		script = (unsigned char*)emalloc(LANG_SCNG(script_org_size)+1-bom_size);
1109 		memcpy(script, LANG_SCNG(script_org)+bom_size, LANG_SCNG(script_org_size)+1-bom_size);
1110 		efree(LANG_SCNG(script_org));
1111 		LANG_SCNG(script_org) = script;
1112 		LANG_SCNG(script_org_size) -= bom_size;
1113 
1114 		return script_encoding;
1115 	}
1116 
1117 	/* script contains NULL bytes -> auto-detection */
1118 	if (memchr(LANG_SCNG(script_org), 0, LANG_SCNG(script_org_size))) {
1119 		/* make best effort if BOM is missing */
1120 		return zend_multibyte_detect_utf_encoding(LANG_SCNG(script_org), LANG_SCNG(script_org_size) TSRMLS_CC);
1121 	}
1122 
1123 	return NULL;
1124 }
1125 
zend_multibyte_detect_utf_encoding(const unsigned char * script,size_t script_size TSRMLS_DC)1126 static zend_encoding *zend_multibyte_detect_utf_encoding(const unsigned char *script, size_t script_size TSRMLS_DC)
1127 {
1128 	const unsigned char *p;
1129 	int wchar_size = 2;
1130 	int le = 0;
1131 
1132 	/* utf-16 or utf-32? */
1133 	p = script;
1134 	while ((p-script) < script_size) {
1135 		p = memchr(p, 0, script_size-(p-script)-2);
1136 		if (!p) {
1137 			break;
1138 		}
1139 		if (*(p+1) == '\0' && *(p+2) == '\0') {
1140 			wchar_size = 4;
1141 			break;
1142 		}
1143 
1144 		/* searching for UTF-32 specific byte orders, so this will do */
1145 		p += 4;
1146 	}
1147 
1148 	/* BE or LE? */
1149 	p = script;
1150 	while ((p-script) < script_size) {
1151 		if (*p == '\0' && *(p+wchar_size-1) != '\0') {
1152 			/* BE */
1153 			le = 0;
1154 			break;
1155 		} else if (*p != '\0' && *(p+wchar_size-1) == '\0') {
1156 			/* LE* */
1157 			le = 1;
1158 			break;
1159 		}
1160 		p += wchar_size;
1161 	}
1162 
1163 	if (wchar_size == 2) {
1164 		return le ? &encoding_utf16le : &encoding_utf16be;
1165 	} else {
1166 		return le ? &encoding_utf32le : &encoding_utf32be;
1167 	}
1168 
1169 	return NULL;
1170 }
1171 #endif /* ZEND_MULTIBYTE */
1172 
1173 /*
1174  * Local variables:
1175  * tab-width: 4
1176  * c-basic-offset: 4
1177  * End:
1178  * vim600: sw=4 ts=4 tw=78
1179  * vim<600: sw=4 ts=4 tw=78
1180  */
1181