xref: /php-src/ext/mbstring/mbstring.c (revision b1954f5f)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp>              |
14    |         Rui Hirokawa <hirokawa@php.net>                              |
15    |         Hironori Sato <satoh@jpnnet.com>                             |
16    |         Shigeru Kanemoto <sgk@happysize.co.jp>                       |
17    +----------------------------------------------------------------------+
18 */
19 
20 /* {{{ includes */
21 #include "libmbfl/config.h"
22 #include "php.h"
23 #include "php_ini.h"
24 #include "php_variables.h"
25 #include "mbstring.h"
26 #include "ext/standard/php_string.h"
27 #include "ext/standard/php_mail.h"
28 #include "ext/standard/exec.h"
29 #include "ext/standard/url.h"
30 #include "main/php_output.h"
31 #include "ext/standard/info.h"
32 #include "ext/pcre/php_pcre.h"
33 
34 #include "libmbfl/mbfl/mbfilter_8bit.h"
35 #include "libmbfl/mbfl/mbfilter_pass.h"
36 #include "libmbfl/mbfl/mbfilter_wchar.h"
37 #include "libmbfl/mbfl/eaw_table.h"
38 #include "libmbfl/filters/mbfilter_base64.h"
39 #include "libmbfl/filters/mbfilter_qprint.h"
40 #include "libmbfl/filters/mbfilter_htmlent.h"
41 #include "libmbfl/filters/mbfilter_uuencode.h"
42 #include "libmbfl/filters/mbfilter_ucs4.h"
43 #include "libmbfl/filters/mbfilter_utf8.h"
44 #include "libmbfl/filters/mbfilter_singlebyte.h"
45 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
46 
47 #include "php_variables.h"
48 #include "php_globals.h"
49 #include "rfc1867.h"
50 #include "php_content_types.h"
51 #include "SAPI.h"
52 #include "php_unicode.h"
53 #include "TSRM.h"
54 
55 #include "mb_gpc.h"
56 
57 #ifdef HAVE_MBREGEX
58 # include "php_mbregex.h"
59 #endif
60 
61 #include "zend_multibyte.h"
62 #include "mbstring_arginfo.h"
63 /* }}} */
64 
65 /* {{{ prototypes */
66 ZEND_DECLARE_MODULE_GLOBALS(mbstring)
67 
68 static PHP_GINIT_FUNCTION(mbstring);
69 static PHP_GSHUTDOWN_FUNCTION(mbstring);
70 
71 static void php_mb_populate_current_detect_order_list(void);
72 
73 static int php_mb_encoding_translation(void);
74 
75 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);
76 
77 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
78 
79 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
80 
81 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
82 
83 static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);
84 
85 /* See mbfilter_cp5022x.c */
86 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
87 /* }}} */
88 
89 /* {{{ php_mb_default_identify_list */
90 typedef struct _php_mb_nls_ident_list {
91 	enum mbfl_no_language lang;
92 	const enum mbfl_no_encoding *list;
93 	size_t list_size;
94 } php_mb_nls_ident_list;
95 
96 static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
97 	mbfl_no_encoding_ascii,
98 	mbfl_no_encoding_jis,
99 	mbfl_no_encoding_utf8,
100 	mbfl_no_encoding_euc_jp,
101 	mbfl_no_encoding_sjis
102 };
103 
104 static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
105 	mbfl_no_encoding_ascii,
106 	mbfl_no_encoding_utf8,
107 	mbfl_no_encoding_euc_cn,
108 	mbfl_no_encoding_cp936
109 };
110 
111 static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
112 	mbfl_no_encoding_ascii,
113 	mbfl_no_encoding_utf8,
114 	mbfl_no_encoding_euc_tw,
115 	mbfl_no_encoding_big5
116 };
117 
118 static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
119 	mbfl_no_encoding_ascii,
120 	mbfl_no_encoding_utf8,
121 	mbfl_no_encoding_euc_kr,
122 	mbfl_no_encoding_uhc
123 };
124 
125 static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
126 	mbfl_no_encoding_ascii,
127 	mbfl_no_encoding_utf8,
128 	mbfl_no_encoding_koi8r,
129 	mbfl_no_encoding_cp1251,
130 	mbfl_no_encoding_cp866
131 };
132 
133 static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
134 	mbfl_no_encoding_ascii,
135 	mbfl_no_encoding_utf8,
136 	mbfl_no_encoding_armscii8
137 };
138 
139 static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
140 	mbfl_no_encoding_ascii,
141 	mbfl_no_encoding_utf8,
142 	mbfl_no_encoding_cp1254,
143 	mbfl_no_encoding_8859_9
144 };
145 
146 static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
147 	mbfl_no_encoding_ascii,
148 	mbfl_no_encoding_utf8,
149 	mbfl_no_encoding_koi8u
150 };
151 
152 static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
153 	mbfl_no_encoding_ascii,
154 	mbfl_no_encoding_utf8
155 };
156 
157 
158 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
159 	{ mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
160 	{ mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
161 	{ mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
162 	{ mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
163 	{ mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
164 	{ mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
165 	{ mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
166 	{ mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
167 	{ mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
168 };
169 
170 /* }}} */
171 
172 /* {{{ mbstring_deps[] */
173 static const zend_module_dep mbstring_deps[] = {
174 	ZEND_MOD_REQUIRED("pcre")
175 	ZEND_MOD_END
176 };
177 /* }}} */
178 
179 /* {{{ zend_module_entry mbstring_module_entry */
180 zend_module_entry mbstring_module_entry = {
181 	STANDARD_MODULE_HEADER_EX,
182 	NULL,
183 	mbstring_deps,
184 	"mbstring",
185 	ext_functions,
186 	PHP_MINIT(mbstring),
187 	PHP_MSHUTDOWN(mbstring),
188 	PHP_RINIT(mbstring),
189 	PHP_RSHUTDOWN(mbstring),
190 	PHP_MINFO(mbstring),
191 	PHP_MBSTRING_VERSION,
192 	PHP_MODULE_GLOBALS(mbstring),
193 	PHP_GINIT(mbstring),
194 	PHP_GSHUTDOWN(mbstring),
195 	NULL,
196 	STANDARD_MODULE_PROPERTIES_EX
197 };
198 /* }}} */
199 
200 /* {{{ static sapi_post_entry php_post_entries[] */
201 static const sapi_post_entry php_post_entries[] = {
202 	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data,	php_std_post_handler },
203 	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
204 	{ NULL, 0, NULL, NULL }
205 };
206 /* }}} */
207 
/* Emitted only when COMPILE_DL_MBSTRING is defined. */
#ifdef COMPILE_DL_MBSTRING
# ifdef ZTS
ZEND_TSRMLS_CACHE_DEFINE()
# endif
ZEND_GET_MODULE(mbstring)
#endif /* COMPILE_DL_MBSTRING */
214 
215 /* {{{ static sapi_post_entry mbstr_post_entries[] */
216 static const sapi_post_entry mbstr_post_entries[] = {
217 	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
218 	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
219 	{ NULL, 0, NULL, NULL }
220 };
221 /* }}} */
222 
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)223 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
224 	if (encoding_name) {
225 		const mbfl_encoding *encoding;
226 		zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
227 		if (last_encoding_name && (last_encoding_name == encoding_name
228 				|| zend_string_equals_ci(encoding_name, last_encoding_name))) {
229 			return MBSTRG(last_used_encoding);
230 		}
231 
232 		encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
233 		if (!encoding) {
234 			zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
235 			return NULL;
236 		} else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
237 			if (encoding == &mbfl_encoding_base64) {
238 				php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
239 			} else if (encoding == &mbfl_encoding_qprint) {
240 				php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
241 			} else if (encoding == &mbfl_encoding_html_ent) {
242 				php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
243 			} else if (encoding == &mbfl_encoding_uuencode) {
244 				php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
245 			}
246 		}
247 
248 		if (last_encoding_name) {
249 			zend_string_release(last_encoding_name);
250 		}
251 		MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
252 		MBSTRG(last_used_encoding) = encoding;
253 		return encoding;
254 	} else {
255 		return MBSTRG(current_internal_encoding);
256 	}
257 }
258 
php_mb_get_encoding_or_pass(const char * encoding_name)259 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name) {
260 	if (strcmp(encoding_name, "pass") == 0) {
261 		return &mbfl_encoding_pass;
262 	}
263 
264 	return mbfl_name2encoding(encoding_name);
265 }
266 
/* Counts the ',' bytes in the half-open range [p, end). */
static size_t count_commas(const char *p, const char *end) {
	size_t commas = 0;
	const char *hit = memchr(p, ',', end - p);

	while (hit != NULL) {
		commas++;
		/* Resume the search just past the comma that was found. */
		hit++;
		hit = memchr(hit, ',', end - hit);
	}
	return commas;
}
275 
276 /* {{{ static zend_result php_mb_parse_encoding_list()
277  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
278  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
279  */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num,bool allow_pass_encoding)280 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
281 	const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num,
282 	bool allow_pass_encoding)
283 {
284 	if (value == NULL || value_length == 0) {
285 		*return_list = NULL;
286 		*return_size = 0;
287 		return SUCCESS;
288 	} else {
289 		bool included_auto;
290 		size_t n, size;
291 		char *p1, *endp, *tmpstr;
292 		const mbfl_encoding **entry, **list;
293 
294 		/* copy the value string for work */
295 		if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
296 			tmpstr = (char *)estrndup(value+1, value_length-2);
297 			value_length -= 2;
298 		} else {
299 			tmpstr = (char *)estrndup(value, value_length);
300 		}
301 
302 		endp = tmpstr + value_length;
303 		size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
304 		list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
305 		entry = list;
306 		n = 0;
307 		included_auto = 0;
308 		p1 = tmpstr;
309 		while (1) {
310 			char *comma = memchr(p1, ',', endp - p1);
311 			char *p = comma ? comma : endp;
312 			*p = '\0';
313 			/* trim spaces */
314 			while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
315 				p1++;
316 			}
317 			p--;
318 			while (p > p1 && (*p == ' ' || *p == '\t')) {
319 				*p = '\0';
320 				p--;
321 			}
322 			/* convert to the encoding number and check encoding */
323 			if (strcasecmp(p1, "auto") == 0) {
324 				if (!included_auto) {
325 					const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
326 					const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
327 					size_t i;
328 					included_auto = 1;
329 					for (i = 0; i < identify_list_size; i++) {
330 						*entry++ = mbfl_no2encoding(*src++);
331 						n++;
332 					}
333 				}
334 			} else {
335 				const mbfl_encoding *encoding =
336 					allow_pass_encoding ? php_mb_get_encoding_or_pass(p1) : mbfl_name2encoding(p1);
337 				if (!encoding) {
338 					/* Called from an INI setting modification */
339 					if (arg_num == 0) {
340 						php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%s\"", p1);
341 					} else {
342 						zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", p1);
343 					}
344 					efree(tmpstr);
345 					pefree(ZEND_VOIDP(list), persistent);
346 					return FAILURE;
347 				}
348 
349 				*entry++ = encoding;
350 				n++;
351 			}
352 			if (n >= size || comma == NULL) {
353 				break;
354 			}
355 			p1 = comma + 1;
356 		}
357 		*return_list = list;
358 		*return_size = n;
359 		efree(tmpstr);
360 	}
361 
362 	return SUCCESS;
363 }
364 /* }}} */
365 
366 /* {{{ static int php_mb_parse_encoding_array()
367  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
368  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
369  */
php_mb_parse_encoding_array(HashTable * target_hash,const mbfl_encoding *** return_list,size_t * return_size,uint32_t arg_num)370 static int php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
371 	size_t *return_size, uint32_t arg_num)
372 {
373 	/* Allocate enough space to include the default detect order if "auto" is used. */
374 	size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
375 	const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
376 	const mbfl_encoding **entry = list;
377 	bool included_auto = 0;
378 	size_t n = 0;
379 	zval *hash_entry;
380 	ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
381 		zend_string *encoding_str = zval_try_get_string(hash_entry);
382 		if (UNEXPECTED(!encoding_str)) {
383 			efree(ZEND_VOIDP(list));
384 			return FAILURE;
385 		}
386 
387 		if (zend_string_equals_literal_ci(encoding_str, "auto")) {
388 			if (!included_auto) {
389 				const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
390 				const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
391 				size_t j;
392 
393 				included_auto = 1;
394 				for (j = 0; j < identify_list_size; j++) {
395 					*entry++ = mbfl_no2encoding(*src++);
396 					n++;
397 				}
398 			}
399 		} else {
400 			const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
401 			if (encoding) {
402 				*entry++ = encoding;
403 				n++;
404 			} else {
405 				zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
406 				zend_string_release(encoding_str);
407 				efree(ZEND_VOIDP(list));
408 				return FAILURE;
409 			}
410 		}
411 		zend_string_release(encoding_str);
412 	} ZEND_HASH_FOREACH_END();
413 	*return_list = list;
414 	*return_size = n;
415 	return SUCCESS;
416 }
417 /* }}} */
418 
419 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)420 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
421 {
422 	return (const zend_encoding*)mbfl_name2encoding(encoding_name);
423 }
424 
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)425 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
426 {
427 	return ((const mbfl_encoding *)encoding)->name;
428 }
429 
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)430 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
431 {
432 	const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
433 	return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
434 }
435 
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)436 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
437 {
438 	mbfl_string string;
439 
440 	if (!list) {
441 		list = (const zend_encoding **)MBSTRG(current_detect_order_list);
442 		list_size = MBSTRG(current_detect_order_list_size);
443 	}
444 
445 	mbfl_string_init(&string);
446 	string.val = (unsigned char *)arg_string;
447 	string.len = arg_length;
448 	return (const zend_encoding *) mbfl_identify_encoding(&string, (const mbfl_encoding **)list, list_size, 0);
449 }
450 
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)451 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
452 {
453 	mbfl_string string, result;
454 	mbfl_buffer_converter *convd;
455 
456 	/* new encoding */
457 	/* initialize string */
458 	string.encoding = (const mbfl_encoding*)encoding_from;
459 	string.val = (unsigned char*)from;
460 	string.len = from_length;
461 
462 	/* initialize converter */
463 	convd = mbfl_buffer_converter_new((const mbfl_encoding *)encoding_from, (const mbfl_encoding *)encoding_to, string.len);
464 	if (convd == NULL) {
465 		return (size_t) -1;
466 	}
467 
468 	mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
469 	mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
470 
471 	/* do it */
472 	size_t loc = mbfl_buffer_converter_feed(convd, &string);
473 
474 	mbfl_buffer_converter_flush(convd);
475 	mbfl_string_init(&result);
476 	if (!mbfl_buffer_converter_result(convd, &result)) {
477 		mbfl_buffer_converter_delete(convd);
478 		return (size_t)-1;
479 	}
480 
481 	*to = result.val;
482 	*to_length = result.len;
483 
484 	mbfl_buffer_converter_delete(convd);
485 
486 	return loc;
487 }
488 
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)489 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
490 {
491 	return php_mb_parse_encoding_list(
492 		encoding_list, encoding_list_len,
493 		(const mbfl_encoding ***)return_list, return_size,
494 		persistent, /* arg_num */ 0, /* allow_pass_encoding */ 1);
495 }
496 
php_mb_zend_internal_encoding_getter(void)497 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
498 {
499 	return (const zend_encoding *)MBSTRG(internal_encoding);
500 }
501 
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)502 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
503 {
504 	MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
505 	return SUCCESS;
506 }
507 
508 static zend_multibyte_functions php_mb_zend_multibyte_functions = {
509 	"mbstring",
510 	php_mb_zend_encoding_fetcher,
511 	php_mb_zend_encoding_name_getter,
512 	php_mb_zend_encoding_lexer_compatibility_checker,
513 	php_mb_zend_encoding_detector,
514 	php_mb_zend_encoding_converter,
515 	php_mb_zend_encoding_list_parser,
516 	php_mb_zend_internal_encoding_getter,
517 	php_mb_zend_internal_encoding_setter
518 };
519 /* }}} */
520 
521 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)522 static void *_php_mb_compile_regex(const char *pattern)
523 {
524 	pcre2_code *retval;
525 	PCRE2_SIZE err_offset;
526 	int errnum;
527 
528 	if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
529 			PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
530 		PCRE2_UCHAR err_str[128];
531 		pcre2_get_error_message(errnum, err_str, sizeof(err_str));
532 		php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
533 	}
534 	return retval;
535 }
536 /* }}} */
537 
538 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)539 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
540 {
541 	int res;
542 
543 	pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
544 	if (NULL == match_data) {
545 		pcre2_code_free(opaque);
546 		php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
547 		return FAILURE;
548 	}
549 	res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
550 	php_pcre_free_match_data(match_data);
551 
552 	return res;
553 }
554 /* }}} */
555 
556 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)557 static void _php_mb_free_regex(void *opaque)
558 {
559 	pcre2_code_free(opaque);
560 }
561 /* }}} */
562 
563 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)564 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
565 {
566 	size_t i;
567 
568 	*plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
569 	*plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
570 
571 	for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
572 		if (php_mb_default_identify_list[i].lang == lang) {
573 			*plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
574 			*plist_size = php_mb_default_identify_list[i].list_size;
575 			return 1;
576 		}
577 	}
578 	return 0;
579 }
580 /* }}} */
581 
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)582 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
583 {
584 	char *result = emalloc(len + 2);
585 	char *resp = result;
586 	size_t i;
587 
588 	for (i = 0; i < len && start[i] != quote; ++i) {
589 		if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
590 			*resp++ = start[++i];
591 		} else {
592 			size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
593 
594 			while (j-- > 0 && i < len) {
595 				*resp++ = start[i++];
596 			}
597 			--i;
598 		}
599 	}
600 
601 	*resp = '\0';
602 	return result;
603 }
604 
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)605 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
606 {
607 	char *pos = *line, quote;
608 	char *res;
609 
610 	while (*pos && *pos != stop) {
611 		if ((quote = *pos) == '"' || quote == '\'') {
612 			++pos;
613 			while (*pos && *pos != quote) {
614 				if (*pos == '\\' && pos[1] && pos[1] == quote) {
615 					pos += 2;
616 				} else {
617 					++pos;
618 				}
619 			}
620 			if (*pos) {
621 				++pos;
622 			}
623 		} else {
624 			pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
625 
626 		}
627 	}
628 	if (*pos == '\0') {
629 		res = estrdup(*line);
630 		*line += strlen(*line);
631 		return res;
632 	}
633 
634 	res = estrndup(*line, pos - *line);
635 
636 	while (*pos == stop) {
637 		pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
638 	}
639 
640 	*line = pos;
641 	return res;
642 }
643 /* }}} */
644 
/* Returns a newly allocated copy of the first word in `str`, skipping leading
 * whitespace.
 *
 * A word that opens with ' or " is extracted by php_mb_rfc1867_substring_conf()
 * using that quote character; otherwise the word runs up to the next whitespace
 * byte. An empty or all-whitespace input yields an empty string.
 */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	while (*str && isspace(*(unsigned char *)str)) {
		++str;
	}

	switch (*str) {
		case '\0':
			return estrdup("");

		case '"':
		case '\'': {
			const char quote = *str;
			char *body = str + 1;

			return php_mb_rfc1867_substring_conf(encoding, body, strlen(body), quote);
		}

		default: {
			char *word_end = str;

			while (*word_end && !isspace(*(unsigned char *)word_end)) {
				++word_end;
			}
			return php_mb_rfc1867_substring_conf(encoding, str, word_end - str, 0);
		}
	}
}
669 /* }}} */
670 
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)671 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
672 {
673 	char *s, *s2;
674 	const size_t filename_len = strlen(filename);
675 
676 	/* The \ check should technically be needed for win32 systems only where
677 	 * it is a valid path separator. However, IE in all it's wisdom always sends
678 	 * the full path of the file on the user's filesystem, which means that unless
679 	 * the user does basename() they get a bogus file name. Until IE's user base drops
680 	 * to nill or problem is fixed this code must remain enabled for all systems. */
681 	s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
682 	s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
683 
684 	if (s && s2) {
685 		if (s > s2) {
686 			return ++s;
687 		} else {
688 			return ++s2;
689 		}
690 	} else if (s) {
691 		return ++s;
692 	} else if (s2) {
693 		return ++s2;
694 	} else {
695 		return filename;
696 	}
697 }
698 /* }}} */
699 
700 /* {{{ php.ini directive handler */
701 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)702 static PHP_INI_MH(OnUpdate_mbstring_language)
703 {
704 	enum mbfl_no_language no_language;
705 
706 	no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
707 	if (no_language == mbfl_no_language_invalid) {
708 		MBSTRG(language) = mbfl_no_language_neutral;
709 		return FAILURE;
710 	}
711 	MBSTRG(language) = no_language;
712 	php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
713 	return SUCCESS;
714 }
715 /* }}} */
716 
717 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)718 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
719 {
720 	const mbfl_encoding **list;
721 	size_t size;
722 
723 	if (!new_value) {
724 		if (MBSTRG(detect_order_list)) {
725 			pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
726 		}
727 		MBSTRG(detect_order_list) = NULL;
728 		MBSTRG(detect_order_list_size) = 0;
729 		return SUCCESS;
730 	}
731 
732 	if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0, /* allow_pass_encoding */ 0) || size == 0) {
733 		return FAILURE;
734 	}
735 
736 	if (MBSTRG(detect_order_list)) {
737 		pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
738 	}
739 	MBSTRG(detect_order_list) = list;
740 	MBSTRG(detect_order_list_size) = size;
741 	return SUCCESS;
742 }
743 /* }}} */
744 
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)745 static int _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
746 	const mbfl_encoding **list;
747 	size_t size;
748 	if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0, /* allow_pass_encoding */ 1) || size == 0) {
749 		return FAILURE;
750 	}
751 	if (MBSTRG(http_input_list)) {
752 		pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
753 	}
754 	MBSTRG(http_input_list) = list;
755 	MBSTRG(http_input_list_size) = size;
756 	return SUCCESS;
757 }
758 
759 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)760 static PHP_INI_MH(OnUpdate_mbstring_http_input)
761 {
762 	if (new_value) {
763 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
764 	}
765 
766 	if (!new_value) {
767 		const char *encoding = php_get_input_encoding();
768 		MBSTRG(http_input_set) = 0;
769 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
770 		return SUCCESS;
771 	}
772 
773 	MBSTRG(http_input_set) = 1;
774 	return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
775 }
776 /* }}} */
777 
_php_mb_ini_mbstring_http_output_set(const char * new_value)778 static int _php_mb_ini_mbstring_http_output_set(const char *new_value) {
779 	const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value);
780 	if (!encoding) {
781 		return FAILURE;
782 	}
783 
784 	MBSTRG(http_output_encoding) = encoding;
785 	MBSTRG(current_http_output_encoding) = encoding;
786 	return SUCCESS;
787 }
788 
789 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)790 static PHP_INI_MH(OnUpdate_mbstring_http_output)
791 {
792 	if (new_value) {
793 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
794 	}
795 
796 	if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
797 		MBSTRG(http_output_set) = 0;
798 		_php_mb_ini_mbstring_http_output_set(php_get_output_encoding());
799 		return SUCCESS;
800 	}
801 
802 	MBSTRG(http_output_set) = 1;
803 	return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value));
804 }
805 /* }}} */
806 
807 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)808 static int _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
809 {
810 	const mbfl_encoding *encoding;
811 
812 	if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
813 		/* falls back to UTF-8 if an unknown encoding name is given */
814 		if (new_value) {
815 			php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
816 		}
817 		encoding = &mbfl_encoding_utf8;
818 	}
819 	MBSTRG(internal_encoding) = encoding;
820 	MBSTRG(current_internal_encoding) = encoding;
821 #ifdef HAVE_MBREGEX
822 	{
823 		const char *enc_name = new_value;
824 		if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
825 			/* falls back to UTF-8 if an unknown encoding name is given */
826 			enc_name = "UTF-8";
827 			php_mb_regex_set_default_mbctype(enc_name);
828 		}
829 		php_mb_regex_set_mbctype(new_value);
830 	}
831 #endif
832 	return SUCCESS;
833 }
834 /* }}} */
835 
836 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)837 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
838 {
839 	if (new_value) {
840 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
841 	}
842 
843 	if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
844 		return FAILURE;
845 	}
846 
847 	if (new_value && ZSTR_LEN(new_value)) {
848 		MBSTRG(internal_encoding_set) = 1;
849 		return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
850 	} else {
851 		const char *encoding = php_get_internal_encoding();
852 		MBSTRG(internal_encoding_set) = 0;
853 		return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
854 	}
855 }
856 /* }}} */
857 
858 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)859 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
860 {
861 	int c;
862 	char *endptr = NULL;
863 
864 	if (new_value != NULL) {
865 		if (zend_string_equals_literal_ci(new_value, "none")) {
866 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
867 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
868 		} else if (zend_string_equals_literal_ci(new_value, "long")) {
869 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
870 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
871 		} else if (zend_string_equals_literal_ci(new_value, "entity")) {
872 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
873 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
874 		} else {
875 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
876 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
877 			if (ZSTR_LEN(new_value) > 0) {
878 				c = strtol(ZSTR_VAL(new_value), &endptr, 0);
879 				if (*endptr == '\0') {
880 					MBSTRG(filter_illegal_substchar) = c;
881 					MBSTRG(current_filter_illegal_substchar) = c;
882 				}
883 			}
884 		}
885 	} else {
886 		MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
887 		MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
888 		MBSTRG(filter_illegal_substchar) = '?';
889 		MBSTRG(current_filter_illegal_substchar) = '?';
890 	}
891 
892 	return SUCCESS;
893 }
894 /* }}} */
895 
896 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)897 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
898 {
899 	if (new_value == NULL) {
900 		return FAILURE;
901 	}
902 
903 	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
904 
905 	if (MBSTRG(encoding_translation)) {
906 		sapi_unregister_post_entry(php_post_entries);
907 		sapi_register_post_entries(mbstr_post_entries);
908 	} else {
909 		sapi_unregister_post_entry(mbstr_post_entries);
910 		sapi_register_post_entries(php_post_entries);
911 	}
912 
913 	return SUCCESS;
914 }
915 /* }}} */
916 
/* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes) */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)918 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
919 {
920 	zend_string *tmp;
921 	void *re = NULL;
922 
923 	if (!new_value) {
924 		new_value = entry->orig_value;
925 	}
926 	tmp = php_trim(new_value, NULL, 0, 3);
927 
928 	if (ZSTR_LEN(tmp) > 0) {
929 		if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
930 			zend_string_release_ex(tmp, 0);
931 			return FAILURE;
932 		}
933 	}
934 
935 	if (MBSTRG(http_output_conv_mimetypes)) {
936 		_php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
937 	}
938 
939 	MBSTRG(http_output_conv_mimetypes) = re;
940 
941 	zend_string_release_ex(tmp, 0);
942 	return SUCCESS;
943 }
944 /* }}} */
945 /* }}} */
946 
947 /* {{{ php.ini directive registration */
948 PHP_INI_BEGIN()
949 	PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
950 	PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
951 	PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
952 	PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
953 	STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
954 	PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
955 
956 	STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
957 		PHP_INI_SYSTEM | PHP_INI_PERDIR,
958 		OnUpdate_mbstring_encoding_translation,
959 		encoding_translation, zend_mbstring_globals, mbstring_globals)
960 	PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
961 		"^(text/|application/xhtml\\+xml)",
962 		PHP_INI_ALL,
963 		OnUpdate_mbstring_http_output_conv_mimetypes)
964 
965 	STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
966 		PHP_INI_ALL,
967 		OnUpdateBool,
968 		strict_detection, zend_mbstring_globals, mbstring_globals)
969 #ifdef HAVE_MBREGEX
970 	STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
971 	STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
972 #endif
PHP_INI_END()973 PHP_INI_END()
974 /* }}} */
975 
976 static void mbstring_internal_encoding_changed_hook(void) {
977 	/* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
978 	if (!MBSTRG(internal_encoding_set)) {
979 		const char *encoding = php_get_internal_encoding();
980 		_php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
981 	}
982 
983 	if (!MBSTRG(http_output_set)) {
984 		const char *encoding = php_get_output_encoding();
985 		_php_mb_ini_mbstring_http_output_set(encoding);
986 	}
987 
988 	if (!MBSTRG(http_input_set)) {
989 		const char *encoding = php_get_input_encoding();
990 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
991 	}
992 }
993 
994 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)995 static PHP_GINIT_FUNCTION(mbstring)
996 {
997 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
998 ZEND_TSRMLS_CACHE_UPDATE();
999 #endif
1000 
1001 	mbstring_globals->language = mbfl_no_language_uni;
1002 	mbstring_globals->internal_encoding = NULL;
1003 	mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
1004 	mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
1005 	mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
1006 	mbstring_globals->http_input_identify = NULL;
1007 	mbstring_globals->http_input_identify_get = NULL;
1008 	mbstring_globals->http_input_identify_post = NULL;
1009 	mbstring_globals->http_input_identify_cookie = NULL;
1010 	mbstring_globals->http_input_identify_string = NULL;
1011 	mbstring_globals->http_input_list = NULL;
1012 	mbstring_globals->http_input_list_size = 0;
1013 	mbstring_globals->detect_order_list = NULL;
1014 	mbstring_globals->detect_order_list_size = 0;
1015 	mbstring_globals->current_detect_order_list = NULL;
1016 	mbstring_globals->current_detect_order_list_size = 0;
1017 	mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1018 	mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1019 	mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1020 	mbstring_globals->filter_illegal_substchar = '?';
1021 	mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1022 	mbstring_globals->current_filter_illegal_substchar = '?';
1023 	mbstring_globals->illegalchars = 0;
1024 	mbstring_globals->encoding_translation = 0;
1025 	mbstring_globals->strict_detection = 0;
1026 	mbstring_globals->outconv = NULL;
1027 	mbstring_globals->http_output_conv_mimetypes = NULL;
1028 #ifdef HAVE_MBREGEX
1029 	mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1030 #endif
1031 	mbstring_globals->last_used_encoding_name = NULL;
1032 	mbstring_globals->last_used_encoding = NULL;
1033 	mbstring_globals->internal_encoding_set = 0;
1034 	mbstring_globals->http_output_set = 0;
1035 	mbstring_globals->http_input_set = 0;
1036 }
1037 /* }}} */
1038 
1039 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1040 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1041 {
1042 	if (mbstring_globals->http_input_list) {
1043 		free(ZEND_VOIDP(mbstring_globals->http_input_list));
1044 	}
1045 	if (mbstring_globals->detect_order_list) {
1046 		free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1047 	}
1048 	if (mbstring_globals->http_output_conv_mimetypes) {
1049 		_php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1050 	}
1051 #ifdef HAVE_MBREGEX
1052 	php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1053 #endif
1054 }
1055 /* }}} */
1056 
1057 /* {{{ PHP_MINIT_FUNCTION(mbstring) */
PHP_MINIT_FUNCTION(mbstring)1058 PHP_MINIT_FUNCTION(mbstring)
1059 {
1060 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
1061 ZEND_TSRMLS_CACHE_UPDATE();
1062 #endif
1063 
1064 	REGISTER_INI_ENTRIES();
1065 
1066 	/* We assume that we're the only user of the hook. */
1067 	ZEND_ASSERT(php_internal_encoding_changed == NULL);
1068 	php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
1069 	mbstring_internal_encoding_changed_hook();
1070 
1071 	/* This is a global handler. Should not be set in a per-request handler. */
1072 	sapi_register_treat_data(mbstr_treat_data);
1073 
1074 	/* Post handlers are stored in the thread-local context. */
1075 	if (MBSTRG(encoding_translation)) {
1076 		sapi_register_post_entries(mbstr_post_entries);
1077 	}
1078 
1079 #ifdef HAVE_MBREGEX
1080 	PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1081 #endif
1082 
1083 	register_mbstring_symbols(module_number);
1084 
1085 	if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
1086 		return FAILURE;
1087 	}
1088 
1089 	php_rfc1867_set_multibyte_callbacks(
1090 		php_mb_encoding_translation,
1091 		php_mb_gpc_get_detect_order,
1092 		php_mb_gpc_set_input_encoding,
1093 		php_mb_rfc1867_getword,
1094 		php_mb_rfc1867_getword_conf,
1095 		php_mb_rfc1867_basename);
1096 
1097 	return SUCCESS;
1098 }
1099 /* }}} */
1100 
1101 /* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
PHP_MSHUTDOWN_FUNCTION(mbstring)1102 PHP_MSHUTDOWN_FUNCTION(mbstring)
1103 {
1104 	UNREGISTER_INI_ENTRIES();
1105 
1106 	zend_multibyte_restore_functions();
1107 
1108 #ifdef HAVE_MBREGEX
1109 	PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1110 #endif
1111 
1112 	php_internal_encoding_changed = NULL;
1113 
1114 	return SUCCESS;
1115 }
1116 /* }}} */
1117 
1118 /* {{{ PHP_RINIT_FUNCTION(mbstring) */
PHP_RINIT_FUNCTION(mbstring)1119 PHP_RINIT_FUNCTION(mbstring)
1120 {
1121 	MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
1122 	MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
1123 	MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
1124 	MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);
1125 
1126 	MBSTRG(illegalchars) = 0;
1127 
1128 	php_mb_populate_current_detect_order_list();
1129 
1130 #ifdef HAVE_MBREGEX
1131 	PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1132 #endif
1133 	zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));
1134 
1135 	return SUCCESS;
1136 }
1137 /* }}} */
1138 
1139 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1140 PHP_RSHUTDOWN_FUNCTION(mbstring)
1141 {
1142 	if (MBSTRG(current_detect_order_list) != NULL) {
1143 		efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1144 		MBSTRG(current_detect_order_list) = NULL;
1145 		MBSTRG(current_detect_order_list_size) = 0;
1146 	}
1147 	if (MBSTRG(outconv) != NULL) {
1148 		MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1149 		mbfl_buffer_converter_delete(MBSTRG(outconv));
1150 		MBSTRG(outconv) = NULL;
1151 	}
1152 
1153 	/* clear http input identification. */
1154 	MBSTRG(http_input_identify) = NULL;
1155 	MBSTRG(http_input_identify_post) = NULL;
1156 	MBSTRG(http_input_identify_get) = NULL;
1157 	MBSTRG(http_input_identify_cookie) = NULL;
1158 	MBSTRG(http_input_identify_string) = NULL;
1159 
1160 	if (MBSTRG(last_used_encoding_name)) {
1161 		zend_string_release(MBSTRG(last_used_encoding_name));
1162 		MBSTRG(last_used_encoding_name) = NULL;
1163 	}
1164 
1165 	MBSTRG(internal_encoding_set) = 0;
1166 	MBSTRG(http_output_set) = 0;
1167 	MBSTRG(http_input_set) = 0;
1168 
1169 #ifdef HAVE_MBREGEX
1170 	PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1171 #endif
1172 
1173 	return SUCCESS;
1174 }
1175 /* }}} */
1176 
1177 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1178 PHP_MINFO_FUNCTION(mbstring)
1179 {
1180 	php_info_print_table_start();
1181 	php_info_print_table_row(2, "Multibyte Support", "enabled");
1182 	php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1183 	php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1184 	{
1185 		char tmp[256];
1186 		snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1187 		php_info_print_table_row(2, "libmbfl version", tmp);
1188 	}
1189 	php_info_print_table_end();
1190 
1191 	php_info_print_table_start();
1192 	php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1193 	php_info_print_table_end();
1194 
1195 #ifdef HAVE_MBREGEX
1196 	PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1197 #endif
1198 
1199 	DISPLAY_INI_ENTRIES();
1200 }
1201 /* }}} */
1202 
1203 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1204 PHP_FUNCTION(mb_language)
1205 {
1206 	zend_string *name = NULL;
1207 
1208 	ZEND_PARSE_PARAMETERS_START(0, 1)
1209 		Z_PARAM_OPTIONAL
1210 		Z_PARAM_STR_OR_NULL(name)
1211 	ZEND_PARSE_PARAMETERS_END();
1212 
1213 	if (name == NULL) {
1214 		RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1215 	} else {
1216 		zend_string *ini_name = zend_string_init("mbstring.language", sizeof("mbstring.language") - 1, 0);
1217 		if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1218 			zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1219 			zend_string_release_ex(ini_name, 0);
1220 			RETURN_THROWS();
1221 		}
1222 		// TODO Make return void
1223 		RETVAL_TRUE;
1224 		zend_string_release_ex(ini_name, 0);
1225 	}
1226 }
1227 /* }}} */
1228 
1229 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1230 PHP_FUNCTION(mb_internal_encoding)
1231 {
1232 	char *name = NULL;
1233 	size_t name_len;
1234 	const mbfl_encoding *encoding;
1235 
1236 	ZEND_PARSE_PARAMETERS_START(0, 1)
1237 		Z_PARAM_OPTIONAL
1238 		Z_PARAM_STRING_OR_NULL(name, name_len)
1239 	ZEND_PARSE_PARAMETERS_END();
1240 
1241 	if (name == NULL) {
1242 		ZEND_ASSERT(MBSTRG(current_internal_encoding));
1243 		RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1244 	} else {
1245 		encoding = mbfl_name2encoding(name);
1246 		if (!encoding) {
1247 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1248 			RETURN_THROWS();
1249 		} else {
1250 			MBSTRG(current_internal_encoding) = encoding;
1251 			MBSTRG(internal_encoding_set) = 1;
1252 			/* TODO Return old encoding */
1253 			RETURN_TRUE;
1254 		}
1255 	}
1256 }
1257 /* }}} */
1258 
1259 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1260 PHP_FUNCTION(mb_http_input)
1261 {
1262 	char *type = NULL;
1263 	size_t type_len = 0, n;
1264 	const mbfl_encoding **entry;
1265 	const mbfl_encoding *encoding;
1266 
1267 	ZEND_PARSE_PARAMETERS_START(0, 1)
1268 		Z_PARAM_OPTIONAL
1269 		Z_PARAM_STRING_OR_NULL(type, type_len)
1270 	ZEND_PARSE_PARAMETERS_END();
1271 
1272 	if (type == NULL) {
1273 		encoding = MBSTRG(http_input_identify);
1274 	} else {
1275 		switch (*type) {
1276 		case 'G':
1277 		case 'g':
1278 			encoding = MBSTRG(http_input_identify_get);
1279 			break;
1280 		case 'P':
1281 		case 'p':
1282 			encoding = MBSTRG(http_input_identify_post);
1283 			break;
1284 		case 'C':
1285 		case 'c':
1286 			encoding = MBSTRG(http_input_identify_cookie);
1287 			break;
1288 		case 'S':
1289 		case 's':
1290 			encoding = MBSTRG(http_input_identify_string);
1291 			break;
1292 		case 'I':
1293 		case 'i':
1294 			entry = MBSTRG(http_input_list);
1295 			n = MBSTRG(http_input_list_size);
1296 			array_init(return_value);
1297 			for (size_t i = 0; i < n; i++, entry++) {
1298 				add_next_index_string(return_value, (*entry)->name);
1299 			}
1300 			return;
1301 		case 'L':
1302 		case 'l':
1303 			entry = MBSTRG(http_input_list);
1304 			n = MBSTRG(http_input_list_size);
1305 			if (n == 0) {
1306 				RETURN_FALSE;
1307 			}
1308 			// TODO Use smart_str instead.
1309 			mbfl_string result;
1310 			mbfl_memory_device device;
1311 			mbfl_memory_device_init(&device, n * 12, 0);
1312 			for (size_t i = 0; i < n; i++, entry++) {
1313 				mbfl_memory_device_strcat(&device, (*entry)->name);
1314 				mbfl_memory_device_output(',', &device);
1315 			}
1316 			mbfl_memory_device_unput(&device); /* Remove trailing comma */
1317 			mbfl_memory_device_result(&device, &result);
1318 			RETVAL_STRINGL((const char*)result.val, result.len);
1319 			mbfl_string_clear(&result);
1320 			return;
1321 		default:
1322 			zend_argument_value_error(1,
1323 				"must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1324 			RETURN_THROWS();
1325 		}
1326 	}
1327 
1328 	if (encoding) {
1329 		RETURN_STRING(encoding->name);
1330 	} else {
1331 		RETURN_FALSE;
1332 	}
1333 }
1334 /* }}} */
1335 
1336 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1337 PHP_FUNCTION(mb_http_output)
1338 {
1339 	char *name = NULL;
1340 	size_t name_len;
1341 
1342 	ZEND_PARSE_PARAMETERS_START(0, 1)
1343 		Z_PARAM_OPTIONAL
1344 		Z_PARAM_STRING_OR_NULL(name, name_len)
1345 	ZEND_PARSE_PARAMETERS_END();
1346 
1347 	if (name == NULL) {
1348 		ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1349 		RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1350 	} else {
1351 		const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name);
1352 		if (!encoding) {
1353 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1354 			RETURN_THROWS();
1355 		} else {
1356 			MBSTRG(http_output_set) = 1;
1357 			MBSTRG(current_http_output_encoding) = encoding;
1358 			/* TODO Return previous encoding? */
1359 			RETURN_TRUE;
1360 		}
1361 	}
1362 }
1363 /* }}} */
1364 
/* {{{ Sets the current detect_order or returns the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1366 PHP_FUNCTION(mb_detect_order)
1367 {
1368 	zend_string *order_str = NULL;
1369 	HashTable *order_ht = NULL;
1370 
1371 	ZEND_PARSE_PARAMETERS_START(0, 1)
1372 		Z_PARAM_OPTIONAL
1373 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1374 	ZEND_PARSE_PARAMETERS_END();
1375 
1376 	if (!order_str && !order_ht) {
1377 		size_t n = MBSTRG(current_detect_order_list_size);
1378 		const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1379 		array_init(return_value);
1380 		for (size_t i = 0; i < n; i++) {
1381 			add_next_index_string(return_value, (*entry)->name);
1382 			entry++;
1383 		}
1384 	} else {
1385 		const mbfl_encoding **list;
1386 		size_t size;
1387 		if (order_ht) {
1388 			if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1389 				RETURN_THROWS();
1390 			}
1391 		} else {
1392 			if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1, /* allow_pass_encoding */ 0)) {
1393 				RETURN_THROWS();
1394 			}
1395 		}
1396 
1397 		if (size == 0) {
1398 			efree(ZEND_VOIDP(list));
1399 			zend_argument_value_error(1, "must specify at least one encoding");
1400 			RETURN_THROWS();
1401 		}
1402 
1403 		if (MBSTRG(current_detect_order_list)) {
1404 			efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1405 		}
1406 		MBSTRG(current_detect_order_list) = list;
1407 		MBSTRG(current_detect_order_list_size) = size;
1408 		RETURN_TRUE;
1409 	}
1410 }
1411 /* }}} */
1412 
/* Checks whether cp may be used as a substitute character.
 *
 * Returns 1 when cp lies in [0, 0x10FFFF] and is not a surrogate
 * (0xD800..0xDFFF); returns 0 otherwise.
 *
 * As we do not know the target encoding of the conversion operation that is
 * going to use the substitution character, we cannot check whether the
 * codepoint is actually mapped in that encoding, so everything else is
 * accepted. */
static inline int php_mb_check_code_point(zend_long cp)
{
	/* Outside this range the value is not a Unicode code point at all. */
	const bool in_unicode_range = (cp >= 0 && cp < 0x110000);
	/* Surrogates are never valid on their own and only a single substitute
	 * character is allowed. */
	const bool is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);

	return (in_unicode_range && !is_surrogate) ? 1 : 0;
}
1431 
1432 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1433 PHP_FUNCTION(mb_substitute_character)
1434 {
1435 	zend_string *substitute_character = NULL;
1436 	zend_long substitute_codepoint;
1437 	bool substitute_is_null = 1;
1438 
1439 	ZEND_PARSE_PARAMETERS_START(0, 1)
1440 		Z_PARAM_OPTIONAL
1441 		Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1442 	ZEND_PARSE_PARAMETERS_END();
1443 
1444 	if (substitute_is_null) {
1445 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1446 			RETURN_STRING("none");
1447 		}
1448 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1449 			RETURN_STRING("long");
1450 		}
1451 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1452 			RETURN_STRING("entity");
1453 		}
1454 		RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1455 	}
1456 
1457 	if (substitute_character != NULL) {
1458 		if (zend_string_equals_literal_ci(substitute_character, "none")) {
1459 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1460 			RETURN_TRUE;
1461 		}
1462 		if (zend_string_equals_literal_ci(substitute_character, "long")) {
1463 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1464 			RETURN_TRUE;
1465 		}
1466 		if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1467 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1468 			RETURN_TRUE;
1469 		}
1470 		/* Invalid string value */
1471 		zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1472 		RETURN_THROWS();
1473 	}
1474 	/* Integer codepoint passed */
1475 	if (!php_mb_check_code_point(substitute_codepoint)) {
1476 		zend_argument_value_error(1, "is not a valid codepoint");
1477 		RETURN_THROWS();
1478 	}
1479 
1480 	MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1481 	MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1482 	RETURN_TRUE;
1483 }
1484 /* }}} */
1485 
1486 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1487 PHP_FUNCTION(mb_preferred_mime_name)
1488 {
1489 	enum mbfl_no_encoding no_encoding;
1490 	char *name = NULL;
1491 	size_t name_len;
1492 
1493 	ZEND_PARSE_PARAMETERS_START(1, 1)
1494 		Z_PARAM_STRING(name, name_len)
1495 	ZEND_PARSE_PARAMETERS_END();
1496 
1497 	no_encoding = mbfl_name2no_encoding(name);
1498 	if (no_encoding == mbfl_no_encoding_invalid) {
1499 		zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1500 		RETURN_THROWS();
1501 	}
1502 
1503 	const char *preferred_name = mbfl_no2preferred_mime_name(no_encoding);
1504 	if (preferred_name == NULL || *preferred_name == '\0') {
1505 		php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1506 		RETVAL_FALSE;
1507 	} else {
1508 		RETVAL_STRING((char *)preferred_name);
1509 	}
1510 }
1511 /* }}} */
1512 
1513 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1514 PHP_FUNCTION(mb_parse_str)
1515 {
1516 	zval *track_vars_array = NULL;
1517 	char *encstr;
1518 	size_t encstr_len;
1519 	php_mb_encoding_handler_info_t info;
1520 	const mbfl_encoding *detected;
1521 
1522 	ZEND_PARSE_PARAMETERS_START(2, 2)
1523 		Z_PARAM_STRING(encstr, encstr_len)
1524 		Z_PARAM_ZVAL(track_vars_array)
1525 	ZEND_PARSE_PARAMETERS_END();
1526 
1527 	track_vars_array = zend_try_array_init(track_vars_array);
1528 	if (!track_vars_array) {
1529 		RETURN_THROWS();
1530 	}
1531 
1532 	encstr = estrndup(encstr, encstr_len);
1533 
1534 	info.data_type              = PARSE_STRING;
1535 	info.separator              = PG(arg_separator).input;
1536 	info.report_errors          = true;
1537 	info.to_encoding            = MBSTRG(current_internal_encoding);
1538 	info.from_encodings         = MBSTRG(http_input_list);
1539 	info.num_from_encodings     = MBSTRG(http_input_list_size);
1540 
1541 	detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1542 
1543 	MBSTRG(http_input_identify) = detected;
1544 
1545 	RETVAL_BOOL(detected);
1546 
1547 	if (encstr != NULL) efree(encstr);
1548 }
1549 /* }}} */
1550 
1551 /* {{{ Returns string in output buffer converted to the http_output encoding */
PHP_FUNCTION(mb_output_handler)1552 PHP_FUNCTION(mb_output_handler)
1553 {
1554 	char *arg_string;
1555 	size_t arg_string_len;
1556 	zend_long arg_status;
1557 	mbfl_string string, result;
1558 	const char *charset;
1559 	char *p;
1560 	const mbfl_encoding *encoding;
1561 	int last_feed;
1562 	size_t len;
1563 	unsigned char send_text_mimetype = 0;
1564 	char *s, *mimetype = NULL;
1565 
1566 	ZEND_PARSE_PARAMETERS_START(2, 2)
1567 		Z_PARAM_STRING(arg_string, arg_string_len)
1568 		Z_PARAM_LONG(arg_status)
1569 	ZEND_PARSE_PARAMETERS_END();
1570 
1571 	encoding = MBSTRG(current_http_output_encoding);
1572 
1573 	/* start phase only */
1574 	if ((arg_status & PHP_OUTPUT_HANDLER_START) != 0) {
1575 		/* delete the converter just in case. */
1576 		if (MBSTRG(outconv)) {
1577 			MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1578 			mbfl_buffer_converter_delete(MBSTRG(outconv));
1579 			MBSTRG(outconv) = NULL;
1580 		}
1581 
1582 		if (encoding == &mbfl_encoding_pass) {
1583 			RETURN_STRINGL(arg_string, arg_string_len);
1584 		}
1585 
1586 		/* analyze mime type */
1587 		if (SG(sapi_headers).mimetype &&
1588 			_php_mb_match_regex(
1589 				MBSTRG(http_output_conv_mimetypes),
1590 				SG(sapi_headers).mimetype,
1591 				strlen(SG(sapi_headers).mimetype))) {
1592 			if ((s = strchr(SG(sapi_headers).mimetype,';')) == NULL) {
1593 				mimetype = estrdup(SG(sapi_headers).mimetype);
1594 			} else {
1595 				mimetype = estrndup(SG(sapi_headers).mimetype,s-SG(sapi_headers).mimetype);
1596 			}
1597 			send_text_mimetype = 1;
1598 		} else if (SG(sapi_headers).send_default_content_type) {
1599 			mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1600 		}
1601 
1602 		/* if content-type is not yet set, set it and activate the converter */
1603 		if (SG(sapi_headers).send_default_content_type || send_text_mimetype) {
1604 			charset = encoding->mime_name;
1605 			if (charset) {
1606 				len = spprintf( &p, 0, "Content-Type: %s; charset=%s",  mimetype, charset );
1607 				if (sapi_add_header(p, len, 0) != FAILURE) {
1608 					SG(sapi_headers).send_default_content_type = 0;
1609 				}
1610 			}
1611 			/* activate the converter */
1612 			MBSTRG(outconv) = mbfl_buffer_converter_new(MBSTRG(current_internal_encoding), encoding, 0);
1613 			if (send_text_mimetype){
1614 				efree(mimetype);
1615 			}
1616 		}
1617 	}
1618 
1619 	/* just return if the converter is not activated. */
1620 	if (MBSTRG(outconv) == NULL) {
1621 		RETURN_STRINGL(arg_string, arg_string_len);
1622 	}
1623 
1624 	/* flag */
1625 	last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1626 	/* mode */
1627 	mbfl_buffer_converter_illegal_mode(MBSTRG(outconv), MBSTRG(current_filter_illegal_mode));
1628 	mbfl_buffer_converter_illegal_substchar(MBSTRG(outconv), MBSTRG(current_filter_illegal_substchar));
1629 
1630 	/* feed the string */
1631 	mbfl_string_init(&string);
1632 	/* these are not needed. convd has encoding info.
1633 	string.encoding = MBSTRG(current_internal_encoding);
1634 	*/
1635 	string.val = (unsigned char *)arg_string;
1636 	string.len = arg_string_len;
1637 
1638 	mbfl_buffer_converter_feed(MBSTRG(outconv), &string);
1639 	if (last_feed) {
1640 		mbfl_buffer_converter_flush(MBSTRG(outconv));
1641 	}
1642 	/* get the converter output, and return it */
1643 	mbfl_buffer_converter_result(MBSTRG(outconv), &result);
1644 
1645 	// TODO: avoid reallocation ???
1646 	RETVAL_STRINGL((char *)result.val, result.len);		/* the string is already strdup()'ed */
1647 	efree(result.val);
1648 
1649 	/* delete the converter if it is the last feed. */
1650 	if (last_feed) {
1651 		MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1652 		mbfl_buffer_converter_delete(MBSTRG(outconv));
1653 		MBSTRG(outconv) = NULL;
1654 	}
1655 }
1656 /* }}} */
1657 
1658 /* {{{ Convert a multibyte string to an array. If split_length is specified,
1659  break the string down into chunks each split_length characters long. */
1660 
1661 /* structure to pass split params to the callback */
1662 struct mbfl_split_params {
1663 	zval *return_value; /* php function return value structure pointer */
1664 	mbfl_string *result_string; /* string to store result chunk */
1665 	size_t mb_chunk_length; /* actual chunk length in chars */
1666 	size_t split_length; /* split length in chars */
1667 	mbfl_convert_filter *next_filter; /* widechar to encoding converter */
1668 };
1669 
1670 /* callback function to fill split array */
mbfl_split_output(int c,void * data)1671 static int mbfl_split_output(int c, void *data)
1672 {
1673 	struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */
1674 
1675 	(*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */
1676 
1677 	if (params->split_length == ++params->mb_chunk_length) { /* if current chunk size reached defined chunk size or last char reached */
1678 		mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */
1679 		mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */
1680 		mbfl_string *chunk = params->result_string;
1681 		mbfl_memory_device_result(device, chunk); /* make chunk */
1682 		add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */
1683 		efree(chunk->val);
1684 		params->mb_chunk_length = 0; /* reset mb_chunk size */
1685 	}
1686 
1687 	return 0;
1688 }
1689 
PHP_FUNCTION(mb_str_split)1690 PHP_FUNCTION(mb_str_split)
1691 {
1692 	zend_string *str, *encoding = NULL;
1693 	size_t mb_len, chunks, chunk_len;
1694 	const char *p, *last; /* pointer for the string cursor and last string char */
1695 	mbfl_string string, result_string;
1696 	const mbfl_encoding *mbfl_encoding;
1697 	zend_long split_length = 1;
1698 
1699 	ZEND_PARSE_PARAMETERS_START(1, 3)
1700 		Z_PARAM_STR(str)
1701 		Z_PARAM_OPTIONAL
1702 		Z_PARAM_LONG(split_length)
1703 		Z_PARAM_STR_OR_NULL(encoding)
1704 	ZEND_PARSE_PARAMETERS_END();
1705 
1706 	if (split_length <= 0) {
1707 		zend_argument_value_error(2, "must be greater than 0");
1708 		RETURN_THROWS();
1709 	}
1710 
1711 	/* fill mbfl_string structure */
1712 	string.val = (unsigned char *) ZSTR_VAL(str);
1713 	string.len = ZSTR_LEN(str);
1714 	string.encoding = php_mb_get_encoding(encoding, 3);
1715 	if (!string.encoding) {
1716 		RETURN_THROWS();
1717 	}
1718 
1719 	if (ZSTR_LEN(str) == 0) {
1720 		RETURN_EMPTY_ARRAY();
1721 	}
1722 
1723 	p = ZSTR_VAL(str); /* string cursor pointer */
1724 	last = ZSTR_VAL(str) + ZSTR_LEN(str); /* last string char pointer */
1725 
1726 	mbfl_encoding = string.encoding;
1727 
1728 	/* first scenario: 1,2,4-bytes fixed width encodings (head part) */
1729 	if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
1730 		mb_len = string.len;
1731 		chunk_len = (size_t)split_length; /* chunk length in bytes */
1732 	} else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS2) { /* 2 bytes */
1733 		mb_len = string.len / 2;
1734 		chunk_len = split_length * 2;
1735 	} else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS4) { /* 4 bytes */
1736 		mb_len = string.len / 4;
1737 		chunk_len = split_length * 4;
1738 	} else if (mbfl_encoding->mblen_table != NULL) {
1739 		/* second scenario: variable width encodings with length table */
1740 		char unsigned const *mbtab = mbfl_encoding->mblen_table;
1741 
1742 		/* assume that we have 1-bytes characters */
1743 		array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1744 
1745 		while (p < last) { /* split cycle work until the cursor has reached the last byte */
1746 			char const *chunk_p = p; /* chunk first byte pointer */
1747 			chunk_len = 0; /* chunk length in bytes */
1748 			zend_long char_count;
1749 
1750 			for (char_count = 0; char_count < split_length && p < last; ++char_count) {
1751 				char unsigned const m = mbtab[*(const unsigned char *)p]; /* single character length table */
1752 				chunk_len += m;
1753 				p += m;
1754 			}
1755 			if (p >= last) chunk_len -= p - last; /* check if chunk is in bounds */
1756 			add_next_index_stringl(return_value, chunk_p, chunk_len);
1757 		}
1758 		return;
1759 	} else {
1760 		/* third scenario: other multibyte encodings */
1761 		mbfl_convert_filter *filter, *decoder;
1762 
1763 		/* assume that we have 1-bytes characters */
1764 		array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1765 
1766 		/* decoder filter to decode wchar to encoding */
1767 		mbfl_memory_device device;
1768 		mbfl_memory_device_init(&device, split_length + 1, 0);
1769 
1770 		decoder = mbfl_convert_filter_new(
1771 				&mbfl_encoding_wchar,
1772 				string.encoding,
1773 				mbfl_memory_device_output,
1774 				NULL,
1775 				&device);
1776 		/* assert that nothing is wrong with the decoder */
1777 		ZEND_ASSERT(decoder != NULL);
1778 
1779 		/* wchar filter */
1780 		mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */
1781 		struct mbfl_split_params params = { /* init callback function params structure */
1782 			.return_value = return_value,
1783 			.result_string = &result_string,
1784 			.mb_chunk_length = 0,
1785 			.split_length = (size_t)split_length,
1786 			.next_filter = decoder,
1787 		};
1788 
1789 		filter = mbfl_convert_filter_new(
1790 				string.encoding,
1791 				&mbfl_encoding_wchar,
1792 				mbfl_split_output,
1793 				NULL,
1794 				&params);
1795 		/* assert that nothing is wrong with the filter */
1796 		ZEND_ASSERT(filter != NULL);
1797 
1798 		while (p < last - 1) { /* cycle each byte except last with callback function */
1799 			(*filter->filter_function)(*p++, filter);
1800 		}
1801 		params.mb_chunk_length = split_length - 1; /* force to finish current chunk */
1802 		(*filter->filter_function)(*p++, filter); /* process last char */
1803 
1804 		mbfl_convert_filter_delete(decoder);
1805 		mbfl_convert_filter_delete(filter);
1806 		mbfl_memory_device_clear(&device);
1807 		return;
1808 	}
1809 
1810 	/* first scenario: 1,2,4-bytes fixed width encodings (tail part) */
1811 	chunks = (mb_len + split_length - 1) / split_length; /* (round up idiom) */
1812 	array_init_size(return_value, chunks);
1813 	if (chunks != 0) {
1814 		zend_long i;
1815 
1816 		for (i = 0; i < chunks - 1; p += chunk_len, ++i) {
1817 			add_next_index_stringl(return_value, p, chunk_len);
1818 		}
1819 		add_next_index_stringl(return_value, p, last - p);
1820 	}
1821 }
1822 /* }}} */
1823 
/* Count the characters of `string` as interpreted in `encoding`.
 * Fixed-width encodings are answered from the byte length alone, encodings with
 * a byte-length table are walked byte-wise, and all others are decoded with the
 * encoding's to_wchar() routine. */
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
{
	const size_t byte_len = ZSTR_LEN(string);

	if (encoding->flag & MBFL_ENCTYPE_SBCS) {
		return byte_len;
	}
	if (encoding->flag & MBFL_ENCTYPE_WCS2) {
		return byte_len / 2;
	}
	if (encoding->flag & MBFL_ENCTYPE_WCS4) {
		return byte_len / 4;
	}

	size_t count = 0;

	if (encoding->mblen_table) {
		/* the table maps a lead byte to the byte length of its character */
		const unsigned char *lengths = encoding->mblen_table;
		unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
		unsigned char *end = cursor + byte_len;

		for (; cursor < end; cursor += lengths[*cursor]) {
			count++;
		}
		return count;
	}

	/* generic path: decode in batches of up to 128 wide characters and add up the batch sizes */
	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(string);
	size_t in_len = byte_len;
	unsigned int state = 0;

	while (in_len) {
		count += encoding->to_wchar(&in, &in_len, wchar_buf, 128, &state);
	}

	return count;
}
1854 
1855 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1856 PHP_FUNCTION(mb_strlen)
1857 {
1858 	zend_string *string, *enc_name = NULL;
1859 
1860 	ZEND_PARSE_PARAMETERS_START(1, 2)
1861 		Z_PARAM_STR(string)
1862 		Z_PARAM_OPTIONAL
1863 		Z_PARAM_STR_OR_NULL(enc_name)
1864 	ZEND_PARSE_PARAMETERS_END();
1865 
1866 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1867 	if (!enc) {
1868 		RETURN_THROWS();
1869 	}
1870 
1871 	RETVAL_LONG(mb_get_strlen(string, enc));
1872 }
1873 /* }}} */
1874 
handle_strpos_error(size_t error)1875 static void handle_strpos_error(size_t error) {
1876 	switch (error) {
1877 	case MBFL_ERROR_NOT_FOUND:
1878 		break;
1879 	case MBFL_ERROR_ENCODING:
1880 		php_error_docref(NULL, E_WARNING, "Conversion error");
1881 		break;
1882 	case MBFL_ERROR_OFFSET:
1883 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1884 		break;
1885 	default:
1886 		zend_value_error("mb_strpos(): Unknown error");
1887 		break;
1888 	}
1889 }
1890 
1891 /* {{{ Find position of first occurrence of a string within another */
PHP_FUNCTION(mb_strpos)1892 PHP_FUNCTION(mb_strpos)
1893 {
1894 	int reverse = 0;
1895 	zend_long offset = 0;
1896 	char *haystack_val, *needle_val;
1897 	mbfl_string haystack, needle;
1898 	zend_string *enc_name = NULL;
1899 
1900 	ZEND_PARSE_PARAMETERS_START(2, 4)
1901 		Z_PARAM_STRING(haystack_val, haystack.len)
1902 		Z_PARAM_STRING(needle_val, needle.len)
1903 		Z_PARAM_OPTIONAL
1904 		Z_PARAM_LONG(offset)
1905 		Z_PARAM_STR_OR_NULL(enc_name)
1906 	ZEND_PARSE_PARAMETERS_END();
1907 
1908 	haystack.val = (unsigned char*)haystack_val;
1909 	needle.val = (unsigned char*)needle_val;
1910 
1911 	haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 4);
1912 	if (!haystack.encoding) {
1913 		RETURN_THROWS();
1914 	}
1915 
1916 	size_t n = mbfl_strpos(&haystack, &needle, offset, reverse);
1917 	if (!mbfl_is_error(n)) {
1918 		RETVAL_LONG(n);
1919 	} else {
1920 		handle_strpos_error(n);
1921 		RETVAL_FALSE;
1922 	}
1923 }
1924 /* }}} */
1925 
1926 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)1927 PHP_FUNCTION(mb_strrpos)
1928 {
1929 	mbfl_string haystack, needle;
1930 	char *haystack_val, *needle_val;
1931 	zend_string *enc_name = NULL;
1932 	zend_long offset = 0;
1933 
1934 	ZEND_PARSE_PARAMETERS_START(2, 4)
1935 		Z_PARAM_STRING(haystack_val, haystack.len)
1936 		Z_PARAM_STRING(needle_val, needle.len)
1937 		Z_PARAM_OPTIONAL
1938 		Z_PARAM_LONG(offset)
1939 		Z_PARAM_STR_OR_NULL(enc_name)
1940 	ZEND_PARSE_PARAMETERS_END();
1941 
1942 	haystack.val = (unsigned char*)haystack_val;
1943 	needle.val = (unsigned char*)needle_val;
1944 
1945 	haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 4);
1946 	if (!haystack.encoding) {
1947 		RETURN_THROWS();
1948 	}
1949 
1950 	size_t n = mbfl_strpos(&haystack, &needle, offset, 1);
1951 	if (!mbfl_is_error(n)) {
1952 		RETVAL_LONG(n);
1953 	} else {
1954 		handle_strpos_error(n);
1955 		RETVAL_FALSE;
1956 	}
1957 }
1958 /* }}} */
1959 
1960 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)1961 PHP_FUNCTION(mb_stripos)
1962 {
1963 	zend_long offset = 0;
1964 	mbfl_string haystack, needle;
1965 	char *haystack_val, *needle_val;
1966 	zend_string *from_encoding = NULL;
1967 
1968 	ZEND_PARSE_PARAMETERS_START(2, 4)
1969 		Z_PARAM_STRING(haystack_val, haystack.len)
1970 		Z_PARAM_STRING(needle_val, needle.len)
1971 		Z_PARAM_OPTIONAL
1972 		Z_PARAM_LONG(offset)
1973 		Z_PARAM_STR_OR_NULL(from_encoding)
1974 	ZEND_PARSE_PARAMETERS_END();
1975 
1976 	haystack.val = (unsigned char*)haystack_val;
1977 	needle.val = (unsigned char*)needle_val;
1978 
1979 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
1980 	if (!enc) {
1981 		RETURN_THROWS();
1982 	}
1983 
1984 	size_t n = php_mb_stripos(0, (char *)haystack.val, haystack.len, (char *)needle.val, needle.len, offset, enc);
1985 
1986 	if (!mbfl_is_error(n)) {
1987 		RETVAL_LONG(n);
1988 	} else {
1989 		handle_strpos_error(n);
1990 		RETVAL_FALSE;
1991 	}
1992 }
1993 /* }}} */
1994 
1995 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)1996 PHP_FUNCTION(mb_strripos)
1997 {
1998 	zend_long offset = 0;
1999 	mbfl_string haystack, needle;
2000 	char *haystack_val, *needle_val;
2001 	zend_string *from_encoding = NULL;
2002 
2003 	ZEND_PARSE_PARAMETERS_START(2, 4)
2004 		Z_PARAM_STRING(haystack_val, haystack.len)
2005 		Z_PARAM_STRING(needle_val, needle.len)
2006 		Z_PARAM_OPTIONAL
2007 		Z_PARAM_LONG(offset)
2008 		Z_PARAM_STR_OR_NULL(from_encoding)
2009 	ZEND_PARSE_PARAMETERS_END();
2010 
2011 	haystack.val = (unsigned char*)haystack_val;
2012 	needle.val = (unsigned char*)needle_val;
2013 
2014 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2015 	if (!enc) {
2016 		RETURN_THROWS();
2017 	}
2018 
2019 	size_t n = php_mb_stripos(1, (char *)haystack.val, haystack.len, (char *)needle.val, needle.len, offset, enc);
2020 
2021 	if (!mbfl_is_error(n)) {
2022 		RETVAL_LONG(n);
2023 	} else {
2024 		handle_strpos_error(n);
2025 		RETVAL_FALSE;
2026 	}
2027 }
2028 /* }}} */
2029 
2030 #define MB_STRSTR 1
2031 #define MB_STRRCHR 2
2032 #define MB_STRISTR 3
2033 #define MB_STRRICHR 4
2034 /* {{{ php_mb_strstr_variants */
php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS,unsigned int variant)2035 static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
2036 {
2037 	int reverse_mode = 0;
2038 	size_t n;
2039 	char *haystack_val, *needle_val;
2040 	mbfl_string haystack, needle, result, *ret = NULL;
2041 	zend_string *encoding_name = NULL;
2042 	bool part = 0;
2043 
2044 	ZEND_PARSE_PARAMETERS_START(2, 4)
2045 		Z_PARAM_STRING(haystack_val, haystack.len)
2046 		Z_PARAM_STRING(needle_val, needle.len)
2047 		Z_PARAM_OPTIONAL
2048 		Z_PARAM_BOOL(part)
2049 		Z_PARAM_STR_OR_NULL(encoding_name)
2050 	ZEND_PARSE_PARAMETERS_END();
2051 
2052 	haystack.val = (unsigned char*)haystack_val;
2053 	needle.val = (unsigned char*)needle_val;
2054 	haystack.encoding = needle.encoding = php_mb_get_encoding(encoding_name, 4);
2055 	if (!haystack.encoding) {
2056 		RETURN_THROWS();
2057 	}
2058 
2059 	if (variant == MB_STRRCHR || variant == MB_STRRICHR) { reverse_mode = 1; }
2060 
2061 	if (variant == MB_STRISTR || variant == MB_STRRICHR) {
2062 		n = php_mb_stripos(reverse_mode, (char *)haystack.val, haystack.len, (char *)needle.val,
2063 			needle.len, 0, needle.encoding);
2064 	} else {
2065 		n = mbfl_strpos(&haystack, &needle, 0, reverse_mode);
2066 	}
2067 
2068 	if (!mbfl_is_error(n)) {
2069 		if (part) {
2070 			ret = mbfl_substr(&haystack, &result, 0, n);
2071 			ZEND_ASSERT(ret != NULL);
2072 			// TODO: avoid reallocation ???
2073 			RETVAL_STRINGL((char *)ret->val, ret->len);
2074 			efree(ret->val);
2075 		} else {
2076 			ret = mbfl_substr(&haystack, &result, n, MBFL_SUBSTR_UNTIL_END);
2077 			ZEND_ASSERT(ret != NULL);
2078 			// TODO: avoid reallocation ???
2079 			RETVAL_STRINGL((char *)ret->val, ret->len);
2080 			efree(ret->val);
2081 		}
2082 	} else {
2083 		// FIXME use handle_strpos_error(n)
2084 		RETVAL_FALSE;
2085 	}
2086 }
2087 
2088 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(mb_strstr)2089 PHP_FUNCTION(mb_strstr)
2090 {
2091 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
2092 }
2093 /* }}} */
2094 
2095 /* {{{ Finds the last occurrence of a character in a string within another */
PHP_FUNCTION(mb_strrchr)2096 PHP_FUNCTION(mb_strrchr)
2097 {
2098 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
2099 }
2100 /* }}} */
2101 
2102 /* {{{ Finds first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stristr)2103 PHP_FUNCTION(mb_stristr)
2104 {
2105 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
2106 }
2107 /* }}} */
2108 
2109 /* {{{ Finds the last occurrence of a character in a string within another, case insensitive */
PHP_FUNCTION(mb_strrichr)2110 PHP_FUNCTION(mb_strrichr)
2111 {
2112 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
2113 }
2114 /* }}} */
2115 
2116 #undef MB_STRSTR
2117 #undef MB_STRRCHR
2118 #undef MB_STRISTR
2119 #undef MB_STRRICHR
2120 
2121 /* {{{ Count the number of substring occurrences */
PHP_FUNCTION(mb_substr_count)2122 PHP_FUNCTION(mb_substr_count)
2123 {
2124 	mbfl_string haystack, needle;
2125 	char *haystack_val, *needle_val;
2126 	zend_string *enc_name = NULL;
2127 
2128 	ZEND_PARSE_PARAMETERS_START(2, 3)
2129 		Z_PARAM_STRING(haystack_val, haystack.len)
2130 		Z_PARAM_STRING(needle_val, needle.len)
2131 		Z_PARAM_OPTIONAL
2132 		Z_PARAM_STR_OR_NULL(enc_name)
2133 	ZEND_PARSE_PARAMETERS_END();
2134 
2135 	haystack.val = (unsigned char*)haystack_val;
2136 	needle.val = (unsigned char*)needle_val;
2137 
2138 	if (needle.len == 0) {
2139 		zend_argument_value_error(2, "must not be empty");
2140 		RETURN_THROWS();
2141 	}
2142 
2143 	haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 3);
2144 	if (!haystack.encoding) {
2145 		RETURN_THROWS();
2146 	}
2147 
2148 	size_t n = mbfl_substr_count(&haystack, &needle);
2149 	/* An error can only occur if needle is empty,
2150 	 * an encoding error happens (which should not happen at this stage and is a bug)
2151 	 * or the haystack is more than sizeof(size_t) bytes
2152 	 * If one of these things occur this is a bug and should be flagged as such */
2153 	ZEND_ASSERT(!mbfl_is_error(n));
2154 	RETVAL_LONG(n);
2155 }
2156 /* }}} */
2157 
2158 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2159 PHP_FUNCTION(mb_substr)
2160 {
2161 	char *str;
2162 	zend_string *encoding = NULL;
2163 	zend_long from, len;
2164 	size_t real_from, real_len;
2165 	size_t str_len;
2166 	bool len_is_null = 1;
2167 	mbfl_string string, result, *ret;
2168 
2169 	ZEND_PARSE_PARAMETERS_START(2, 4)
2170 		Z_PARAM_STRING(str, str_len)
2171 		Z_PARAM_LONG(from)
2172 		Z_PARAM_OPTIONAL
2173 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2174 		Z_PARAM_STR_OR_NULL(encoding)
2175 	ZEND_PARSE_PARAMETERS_END();
2176 
2177 	string.encoding = php_mb_get_encoding(encoding, 4);
2178 	if (!string.encoding) {
2179 		RETURN_THROWS();
2180 	}
2181 
2182 	string.val = (unsigned char *)str;
2183 	string.len = str_len;
2184 
2185 	/* measures length */
2186 	size_t mblen = 0;
2187 	if (from < 0 || (!len_is_null && len < 0)) {
2188 		mblen = mbfl_strlen(&string);
2189 	}
2190 
2191 	/* if "from" position is negative, count start position from the end
2192 	 * of the string
2193 	 */
2194 	if (from >= 0) {
2195 		real_from = (size_t) from;
2196 	} else if (-from < mblen) {
2197 		real_from = mblen + from;
2198 	} else {
2199 		real_from = 0;
2200 	}
2201 
2202 	/* if "length" position is negative, set it to the length
2203 	 * needed to stop that many chars from the end of the string
2204 	 */
2205 	if (len_is_null) {
2206 		real_len = MBFL_SUBSTR_UNTIL_END;
2207 	} else if (len >= 0) {
2208 		real_len = (size_t) len;
2209 	} else if (real_from < mblen && -len < mblen - real_from) {
2210 		real_len = (mblen - real_from) + len;
2211 	} else {
2212 		real_len = 0;
2213 	}
2214 
2215 	ret = mbfl_substr(&string, &result, real_from, real_len);
2216 	ZEND_ASSERT(ret != NULL);
2217 
2218 	// TODO: avoid reallocation ???
2219 	RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2220 	efree(ret->val);
2221 }
2222 /* }}} */
2223 
2224 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2225 PHP_FUNCTION(mb_strcut)
2226 {
2227 	zend_string *encoding = NULL;
2228 	char *string_val;
2229 	zend_long from, len;
2230 	bool len_is_null = 1;
2231 	mbfl_string string, result, *ret;
2232 
2233 	ZEND_PARSE_PARAMETERS_START(2, 4)
2234 		Z_PARAM_STRING(string_val, string.len)
2235 		Z_PARAM_LONG(from)
2236 		Z_PARAM_OPTIONAL
2237 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2238 		Z_PARAM_STR_OR_NULL(encoding)
2239 	ZEND_PARSE_PARAMETERS_END();
2240 
2241 	string.val = (unsigned char*)string_val;
2242 	string.encoding = php_mb_get_encoding(encoding, 4);
2243 	if (!string.encoding) {
2244 		RETURN_THROWS();
2245 	}
2246 
2247 	if (len_is_null) {
2248 		len = string.len;
2249 	}
2250 
2251 	/* if "from" position is negative, count start position from the end
2252 	 * of the string
2253 	 */
2254 	if (from < 0) {
2255 		from = string.len + from;
2256 		if (from < 0) {
2257 			from = 0;
2258 		}
2259 	}
2260 
2261 	/* if "length" position is negative, set it to the length
2262 	 * needed to stop that many chars from the end of the string
2263 	 */
2264 	if (len < 0) {
2265 		len = (string.len - from) + len;
2266 		if (len < 0) {
2267 			len = 0;
2268 		}
2269 	}
2270 
2271 	if (from > string.len) {
2272 		RETURN_EMPTY_STRING();
2273 	}
2274 
2275 	ret = mbfl_strcut(&string, &result, from, len);
2276 	ZEND_ASSERT(ret != NULL);
2277 
2278 	// TODO: avoid reallocation ???
2279 	RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2280 	efree(ret->val);
2281 }
2282 /* }}} */
2283 
2284 /* Some East Asian characters, when printed at a terminal (or the like), require double
2285  * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2286 static size_t character_width(uint32_t c)
2287 {
2288 	if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2289 		return 1;
2290 	}
2291 
2292 	/* Do a binary search to see if we fall in any of the fullwidth ranges */
2293 	int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2294 	while (lo < hi) {
2295 		int probe = (lo + hi) / 2;
2296 		if (c < mbfl_eaw_table[probe].begin) {
2297 			hi = probe;
2298 		} else if (c > mbfl_eaw_table[probe].end) {
2299 			lo = probe + 1;
2300 		} else {
2301 			return 2;
2302 		}
2303 	}
2304 
2305 	return 1;
2306 }
2307 
/* Sum of character_width() over every codepoint of `string`, decoded with `enc` */
static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
{
	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(string);
	size_t in_len = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total = 0;

	/* decode in batches of up to 128 codepoints until the input is exhausted */
	while (in_len) {
		size_t produced = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(produced <= 128);

		/* NOTE: 'bad input' marker will be counted as 1 unit of width
		 * If text conversion is performed with an ordinary ASCII character as
		 * the 'replacement character', this will give us the correct display width. */
		for (size_t i = 0; i < produced; i++) {
			total += character_width(wchar_buf[i]);
		}
	}

	return total;
}
2330 
2331 /* Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2332 PHP_FUNCTION(mb_strwidth)
2333 {
2334 	zend_string *string, *enc_name = NULL;
2335 
2336 	ZEND_PARSE_PARAMETERS_START(1, 2)
2337 		Z_PARAM_STR(string)
2338 		Z_PARAM_OPTIONAL
2339 		Z_PARAM_STR_OR_NULL(enc_name)
2340 	ZEND_PARSE_PARAMETERS_END();
2341 
2342 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2343 	if (!enc) {
2344 		RETURN_THROWS();
2345 	}
2346 
2347 	RETVAL_LONG(mb_get_strwidth(string, enc));
2348 }
2349 
2350 /* Cut 'n' codepoints from beginning of string
2351  * Remove this once mb_substr is implemented using the new conversion filters */
mb_drop_chars(zend_string * input,const mbfl_encoding * enc,size_t n)2352 static zend_string* mb_drop_chars(zend_string *input, const mbfl_encoding *enc, size_t n)
2353 {
2354 	if (n >= ZSTR_LEN(input)) {
2355 		/* No supported text encoding decodes to more than one codepoint per byte
2356 		 * So if the number of codepoints to drop >= number of input bytes,
2357 		 * then definitely the output should be empty
2358 		 * This also guards `ZSTR_LEN(input) - n` (below) from underflow */
2359 		return zend_empty_string;
2360 	}
2361 
2362 	uint32_t wchar_buf[128];
2363 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
2364 	size_t in_len = ZSTR_LEN(input);
2365 	unsigned int state = 0;
2366 
2367 	mb_convert_buf buf;
2368 	mb_convert_buf_init(&buf, ZSTR_LEN(input) - n, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2369 
2370 	while (in_len) {
2371 		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2372 		ZEND_ASSERT(out_len <= 128);
2373 
2374 		if (n >= out_len) {
2375 			n -= out_len;
2376 		} else {
2377 			enc->from_wchar(wchar_buf + n, out_len - n, &buf, !in_len);
2378 			n = 0;
2379 		}
2380 	}
2381 
2382 	return mb_convert_buf_result(&buf);
2383 }
2384 
2385 /* Pick 'n' codepoints from beginning of string
2386  * Remove this once mb_substr is implemented using the new conversion filters */
mb_pick_chars(zend_string * input,const mbfl_encoding * enc,size_t n)2387 static zend_string* mb_pick_chars(zend_string *input, const mbfl_encoding *enc, size_t n)
2388 {
2389 	uint32_t wchar_buf[128];
2390 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
2391 	size_t in_len = ZSTR_LEN(input);
2392 	unsigned int state = 0;
2393 
2394 	mb_convert_buf buf;
2395 	mb_convert_buf_init(&buf, n, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2396 
2397 	while (in_len && n) {
2398 		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2399 		ZEND_ASSERT(out_len <= 128);
2400 
2401 		enc->from_wchar(wchar_buf, MIN(out_len, n), &buf, !in_len || out_len >= n);
2402 		n -= MIN(out_len, n);
2403 	}
2404 
2405 	return mb_convert_buf_result(&buf);
2406 }
2407 
mb_trim_string(zend_string * input,zend_string * marker,const mbfl_encoding * enc,unsigned int from,int width)2408 static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, unsigned int from, int width)
2409 {
2410 	uint32_t wchar_buf[128];
2411 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
2412 	size_t in_len = ZSTR_LEN(input);
2413 	unsigned int state = 0;
2414 	int remaining_width = width;
2415 	unsigned int to_skip = from;
2416 	size_t out_len = 0;
2417 	bool first_call = true, input_err = false;
2418 	mb_convert_buf buf;
2419 
2420 	while (in_len) {
2421 		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2422 		ZEND_ASSERT(out_len <= 128);
2423 
2424 		if (out_len <= to_skip) {
2425 			to_skip -= out_len;
2426 		} else {
2427 			for (int i = to_skip; i < out_len; i++) {
2428 				uint32_t w = wchar_buf[i];
2429 				input_err |= (w == MBFL_BAD_INPUT);
2430 				remaining_width -= character_width(w);
2431 				if (remaining_width < 0) {
2432 					/* We need to truncate string and append trim marker */
2433 					width -= mb_get_strwidth(marker, enc);
2434 					/* 'width' is now the amount we want to take from 'input' */
2435 					if (width <= 0) {
2436 						return zend_string_copy(marker);
2437 					}
2438 					mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2439 
2440 					if (first_call) {
2441 						/* We can use the buffer of wchars which we have right now;
2442 						 * no need to convert again */
2443 						goto dont_restart_conversion;
2444 					} else {
2445 						goto restart_conversion;
2446 					}
2447 				}
2448 			}
2449 			to_skip = 0;
2450 		}
2451 		first_call = false;
2452 	}
2453 
2454 	/* The input string fits in the requested width; we don't need to append the trim marker
2455 	 * However, if the string contains erroneous byte sequences, those should be converted
2456 	 * to error markers */
2457 	if (from == 0 && !input_err) {
2458 		/* This just increments the string's refcount; it doesn't really 'copy' it */
2459 		return zend_string_copy(input);
2460 	}
2461 	return mb_drop_chars(input, enc, from);
2462 
2463 	/* The input string is too wide; we need to build a new string which
2464 	 * includes some portion of the input string, with the trim marker
2465 	 * concatenated onto it */
2466 restart_conversion:
2467 	in = (unsigned char*)ZSTR_VAL(input);
2468 	in_len = ZSTR_LEN(input);
2469 	state = 0;
2470 
2471 	while (true) {
2472 		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2473 		ZEND_ASSERT(out_len <= 128);
2474 
2475 dont_restart_conversion:
2476 		if (out_len <= from) {
2477 			from -= out_len;
2478 		} else {
2479 			for (int i = from; i < out_len; i++) {
2480 				width -= character_width(wchar_buf[i]);
2481 				if (width < 0) {
2482 					enc->from_wchar(wchar_buf + from, i - from, &buf, true);
2483 					goto append_trim_marker;
2484 				}
2485 			}
2486 			ZEND_ASSERT(in_len > 0);
2487 			enc->from_wchar(wchar_buf + from, out_len - from, &buf, false);
2488 			from = 0;
2489 		}
2490 	}
2491 
2492 append_trim_marker:
2493 	if (ZSTR_LEN(marker) > 0) {
2494 		MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
2495 		memcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
2496 		buf.out += ZSTR_LEN(marker);
2497 	}
2498 
2499 	return mb_convert_buf_result(&buf);
2500 }
2501 
2502 /* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
PHP_FUNCTION(mb_strimwidth)2503 PHP_FUNCTION(mb_strimwidth)
2504 {
2505 	zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2506 	zend_long from, width;
2507 
2508 	ZEND_PARSE_PARAMETERS_START(3, 5)
2509 		Z_PARAM_STR(str)
2510 		Z_PARAM_LONG(from)
2511 		Z_PARAM_LONG(width)
2512 		Z_PARAM_OPTIONAL
2513 		Z_PARAM_STR(trimmarker)
2514 		Z_PARAM_STR_OR_NULL(encoding)
2515 	ZEND_PARSE_PARAMETERS_END();
2516 
2517 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2518 	if (!enc) {
2519 		RETURN_THROWS();
2520 	}
2521 
2522 	if (from != 0) {
2523 		size_t str_len = mb_get_strlen(str, enc);
2524 		if (from < 0) {
2525 			from += str_len;
2526 		}
2527 		if (from < 0 || from > str_len) {
2528 			zend_argument_value_error(2, "is out of range");
2529 			RETURN_THROWS();
2530 		}
2531 	}
2532 
2533 	if (width < 0) {
2534 		width += mb_get_strwidth(str, enc);
2535 
2536 		if (from > 0) {
2537 			zend_string *trimmed = mb_pick_chars(str, enc, from);
2538 			width -= mb_get_strwidth(trimmed, enc);
2539 			zend_string_free(trimmed);
2540 		}
2541 
2542 		if (width < 0) {
2543 			zend_argument_value_error(3, "is out of range");
2544 			RETURN_THROWS();
2545 		}
2546 	}
2547 
2548 	RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2549 }
2550 
2551 
2552 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2553 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2554 {
2555 	return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2556 			|| (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2557 			|| (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2558 			|| (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2559 }
2560 
2561 
2562 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)2563 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
2564 {
2565 	return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
2566 }
2567 
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2568 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2569 {
2570 	unsigned int num_errors = 0;
2571 	zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2572 	MBSTRG(illegalchars) += num_errors;
2573 	return result;
2574 }
2575 
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2576 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2577 {
2578 	const mbfl_encoding *from_encoding;
2579 
2580 	/* pre-conversion encoding */
2581 	ZEND_ASSERT(num_from_encodings >= 1);
2582 	if (num_from_encodings == 1) {
2583 		from_encoding = *from_encodings;
2584 	} else {
2585 		/* auto detect */
2586 		mbfl_string string;
2587 		mbfl_string_init(&string);
2588 		string.val = (unsigned char *)input;
2589 		string.len = length;
2590 		from_encoding = mbfl_identify_encoding(
2591 			&string, from_encodings, num_from_encodings, MBSTRG(strict_detection));
2592 		if (!from_encoding) {
2593 			php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2594 			return NULL;
2595 		}
2596 	}
2597 
2598 	return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2599 }
2600 
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2601 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2602 {
2603 	HashTable *output, *chash;
2604 	zend_long idx;
2605 	zend_string *key;
2606 	zval *entry, entry_tmp;
2607 
2608 	if (!input) {
2609 		return NULL;
2610 	}
2611 
2612 	if (GC_IS_RECURSIVE(input)) {
2613 		GC_UNPROTECT_RECURSION(input);
2614 		php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2615 		return NULL;
2616 	}
2617 	GC_TRY_PROTECT_RECURSION(input);
2618 	output = zend_new_array(zend_hash_num_elements(input));
2619 	ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2620 		/* convert key */
2621 		if (key) {
2622 			key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2623 		}
2624 		/* convert value */
2625 		ZEND_ASSERT(entry);
2626 try_again:
2627 		switch(Z_TYPE_P(entry)) {
2628 			case IS_STRING:
2629 				ZVAL_STR(&entry_tmp, php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings));
2630 				break;
2631 			case IS_NULL:
2632 			case IS_TRUE:
2633 			case IS_FALSE:
2634 			case IS_LONG:
2635 			case IS_DOUBLE:
2636 				ZVAL_COPY(&entry_tmp, entry);
2637 				break;
2638 			case IS_ARRAY:
2639 				chash = php_mb_convert_encoding_recursive(
2640 					Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2641 				if (chash) {
2642 					ZVAL_ARR(&entry_tmp, chash);
2643 				} else {
2644 					ZVAL_EMPTY_ARRAY(&entry_tmp);
2645 				}
2646 				break;
2647 			case IS_REFERENCE:
2648 				entry = Z_REFVAL_P(entry);
2649 				goto try_again;
2650 			case IS_OBJECT:
2651 			default:
2652 				if (key) {
2653 					zend_string_release(key);
2654 				}
2655 				php_error_docref(NULL, E_WARNING, "Object is not supported");
2656 				continue;
2657 		}
2658 		if (key) {
2659 			zend_hash_add(output, key, &entry_tmp);
2660 			zend_string_release(key);
2661 		} else {
2662 			zend_hash_index_add(output, idx, &entry_tmp);
2663 		}
2664 	} ZEND_HASH_FOREACH_END();
2665 	GC_TRY_UNPROTECT_RECURSION(input);
2666 
2667 	return output;
2668 }
2669 /* }}} */
2670 
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2671 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2672 {
2673 	/* mbstring supports some 'text encodings' which aren't really text encodings
2674 	 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2675 	 * These should never be returned by `mb_detect_encoding`. */
2676 	int shift = 0;
2677 	for (int i = 0; i < *size; i++) {
2678 		const mbfl_encoding *encoding = elist[i];
2679 		if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2680 			shift++; /* Remove this encoding from the list */
2681 		} else if (shift) {
2682 			elist[i - shift] = encoding;
2683 		}
2684 	}
2685 	*size -= shift;
2686 }
2687 
2688 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2689 PHP_FUNCTION(mb_convert_encoding)
2690 {
2691 	zend_string *to_encoding_name;
2692 	zend_string *input_str, *from_encodings_str = NULL;
2693 	HashTable *input_ht, *from_encodings_ht = NULL;
2694 	const mbfl_encoding **from_encodings;
2695 	size_t num_from_encodings;
2696 	bool free_from_encodings;
2697 
2698 	ZEND_PARSE_PARAMETERS_START(2, 3)
2699 		Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2700 		Z_PARAM_STR(to_encoding_name)
2701 		Z_PARAM_OPTIONAL
2702 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2703 	ZEND_PARSE_PARAMETERS_END();
2704 
2705 	const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2706 	if (!to_encoding) {
2707 		RETURN_THROWS();
2708 	}
2709 
2710 	if (from_encodings_ht) {
2711 		if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2712 			RETURN_THROWS();
2713 		}
2714 		free_from_encodings = 1;
2715 	} else if (from_encodings_str) {
2716 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2717 				&from_encodings, &num_from_encodings,
2718 				/* persistent */ 0, /* arg_num */ 3, /* allow_pass_encoding */ 0) == FAILURE) {
2719 			RETURN_THROWS();
2720 		}
2721 		free_from_encodings = 1;
2722 	} else {
2723 		from_encodings = &MBSTRG(current_internal_encoding);
2724 		num_from_encodings = 1;
2725 		free_from_encodings = 0;
2726 	}
2727 
2728 	if (num_from_encodings > 1) {
2729 		remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2730 	}
2731 
2732 	if (!num_from_encodings) {
2733 		efree(ZEND_VOIDP(from_encodings));
2734 		zend_argument_value_error(3, "must specify at least one encoding");
2735 		RETURN_THROWS();
2736 	}
2737 
2738 	if (input_str) {
2739 		zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2740 		if (ret != NULL) {
2741 			RETVAL_STR(ret);
2742 		} else {
2743 			RETVAL_FALSE;
2744 		}
2745 	} else {
2746 		HashTable *tmp;
2747 		tmp = php_mb_convert_encoding_recursive(
2748 			input_ht, to_encoding, from_encodings, num_from_encodings);
2749 		RETVAL_ARR(tmp);
2750 	}
2751 
2752 	if (free_from_encodings) {
2753 		efree(ZEND_VOIDP(from_encodings));
2754 	}
2755 }
2756 /* }}} */
2757 
mbstring_convert_case(php_case_mode case_mode,const char * str,size_t str_len,const mbfl_encoding * enc)2758 static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
2759 {
2760 	return php_unicode_convert_case(case_mode, str, str_len, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2761 }
2762 
PHP_FUNCTION(mb_convert_case)2763 PHP_FUNCTION(mb_convert_case)
2764 {
2765 	zend_string *str, *from_encoding = NULL;
2766 	zend_long case_mode = 0;
2767 
2768 	ZEND_PARSE_PARAMETERS_START(2, 3)
2769 		Z_PARAM_STR(str)
2770 		Z_PARAM_LONG(case_mode)
2771 		Z_PARAM_OPTIONAL
2772 		Z_PARAM_STR_OR_NULL(from_encoding)
2773 	ZEND_PARSE_PARAMETERS_END();
2774 
2775 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2776 	if (!enc) {
2777 		RETURN_THROWS();
2778 	}
2779 
2780 	if (case_mode < 0 || case_mode >= PHP_UNICODE_CASE_MODE_MAX) {
2781 		zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2782 		RETURN_THROWS();
2783 	}
2784 
2785 	RETURN_STR(mbstring_convert_case(case_mode, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2786 }
2787 
PHP_FUNCTION(mb_strtoupper)2788 PHP_FUNCTION(mb_strtoupper)
2789 {
2790 	zend_string *str, *from_encoding = NULL;
2791 
2792 	ZEND_PARSE_PARAMETERS_START(1, 2)
2793 		Z_PARAM_STR(str)
2794 		Z_PARAM_OPTIONAL
2795 		Z_PARAM_STR_OR_NULL(from_encoding)
2796 	ZEND_PARSE_PARAMETERS_END();
2797 
2798 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2799 	if (!enc) {
2800 		RETURN_THROWS();
2801 	}
2802 
2803 	RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_UPPER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2804 }
2805 
PHP_FUNCTION(mb_strtolower)2806 PHP_FUNCTION(mb_strtolower)
2807 {
2808 	zend_string *str, *from_encoding = NULL;
2809 
2810 	ZEND_PARSE_PARAMETERS_START(1, 2)
2811 		Z_PARAM_STR(str)
2812 		Z_PARAM_OPTIONAL
2813 		Z_PARAM_STR_OR_NULL(from_encoding)
2814 	ZEND_PARSE_PARAMETERS_END();
2815 
2816 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2817 	if (!enc) {
2818 		RETURN_THROWS();
2819 	}
2820 
2821 	RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2822 }
2823 
duplicate_elist(const mbfl_encoding ** elist,size_t size)2824 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
2825 {
2826 	const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
2827 	memcpy(ZEND_VOIDP(new_elist), elist, size * sizeof(mbfl_encoding*));
2828 	return new_elist;
2829 }
2830 
2831 /* {{{ Encodings of the given string is returned (as a string) */
PHP_FUNCTION(mb_detect_encoding)2832 PHP_FUNCTION(mb_detect_encoding)
2833 {
2834 	zend_string *str, *encoding_str = NULL;
2835 	HashTable *encoding_ht = NULL;
2836 	bool strict = false;
2837 
2838 	mbfl_string string;
2839 	const mbfl_encoding *ret;
2840 	const mbfl_encoding **elist;
2841 	size_t size;
2842 
2843 	ZEND_PARSE_PARAMETERS_START(1, 3)
2844 		Z_PARAM_STR(str)
2845 		Z_PARAM_OPTIONAL
2846 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
2847 		Z_PARAM_BOOL(strict)
2848 	ZEND_PARSE_PARAMETERS_END();
2849 
2850 	/* make encoding list */
2851 	if (encoding_ht) {
2852 		if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
2853 			RETURN_THROWS();
2854 		}
2855 	} else if (encoding_str) {
2856 		if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2, /* allow_pass_encoding */ 0)) {
2857 			RETURN_THROWS();
2858 		}
2859 	} else {
2860 		elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
2861 		size = MBSTRG(current_detect_order_list_size);
2862 	}
2863 
2864 	if (size == 0) {
2865 		efree(ZEND_VOIDP(elist));
2866 		zend_argument_value_error(2, "must specify at least one encoding");
2867 		RETURN_THROWS();
2868 	}
2869 
2870 	remove_non_encodings_from_elist(elist, &size);
2871 	if (size == 0) {
2872 		efree(ZEND_VOIDP(elist));
2873 		RETURN_FALSE;
2874 	}
2875 
2876 	if (ZEND_NUM_ARGS() < 3) {
2877 		strict = MBSTRG(strict_detection);
2878 	}
2879 
2880 	if (strict && size == 1) {
2881 		/* If there is only a single candidate encoding, mb_check_encoding is faster */
2882 		ret = (mb_check_str_encoding(str, *elist)) ? *elist : NULL;
2883 	} else {
2884 		mbfl_string_init(&string);
2885 		string.val = (unsigned char*)ZSTR_VAL(str);
2886 		string.len = ZSTR_LEN(str);
2887 		ret = mbfl_identify_encoding(&string, elist, size, strict);
2888 	}
2889 
2890 	efree(ZEND_VOIDP(elist));
2891 
2892 	if (ret == NULL) {
2893 		RETURN_FALSE;
2894 	}
2895 
2896 	RETVAL_STRING((char *)ret->name);
2897 }
2898 /* }}} */
2899 
2900 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)2901 PHP_FUNCTION(mb_list_encodings)
2902 {
2903 	ZEND_PARSE_PARAMETERS_NONE();
2904 
2905 	array_init(return_value);
2906 	for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
2907 		add_next_index_string(return_value, (*encodings)->name);
2908 	}
2909 }
2910 /* }}} */
2911 
2912 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)2913 PHP_FUNCTION(mb_encoding_aliases)
2914 {
2915 	const mbfl_encoding *encoding;
2916 	zend_string *encoding_name = NULL;
2917 
2918 	ZEND_PARSE_PARAMETERS_START(1, 1)
2919 		Z_PARAM_STR(encoding_name)
2920 	ZEND_PARSE_PARAMETERS_END();
2921 
2922 	encoding = php_mb_get_encoding(encoding_name, 1);
2923 	if (!encoding) {
2924 		RETURN_THROWS();
2925 	}
2926 
2927 	array_init(return_value);
2928 	if (encoding->aliases != NULL) {
2929 		for (const char **alias = encoding->aliases; *alias; ++alias) {
2930 			add_next_index_string(return_value, (char *)*alias);
2931 		}
2932 	}
2933 }
2934 /* }}} */
2935 
2936 /* {{{ Converts the string to MIME "encoded-word" in the format of =?charset?(B|Q)?encoded_string?= */
PHP_FUNCTION(mb_encode_mimeheader)2937 PHP_FUNCTION(mb_encode_mimeheader)
2938 {
2939 	const mbfl_encoding *charset, *transenc;
2940 	mbfl_string  string, result, *ret;
2941 	zend_string *charset_name = NULL;
2942 	char *trans_enc_name = NULL, *string_val;
2943 	size_t trans_enc_name_len;
2944 	char *linefeed = "\r\n";
2945 	size_t linefeed_len;
2946 	zend_long indent = 0;
2947 
2948 	string.encoding = MBSTRG(current_internal_encoding);
2949 
2950 	ZEND_PARSE_PARAMETERS_START(1, 5)
2951 		Z_PARAM_STRING(string_val, string.len)
2952 		Z_PARAM_OPTIONAL
2953 		Z_PARAM_STR(charset_name)
2954 		Z_PARAM_STRING(trans_enc_name, trans_enc_name_len)
2955 		Z_PARAM_STRING(linefeed, linefeed_len)
2956 		Z_PARAM_LONG(indent)
2957 	ZEND_PARSE_PARAMETERS_END();
2958 
2959 	string.val = (unsigned char*)string_val;
2960 	charset = &mbfl_encoding_pass;
2961 	transenc = &mbfl_encoding_base64;
2962 
2963 	if (charset_name != NULL) {
2964 		charset = php_mb_get_encoding(charset_name, 2);
2965 		if (!charset) {
2966 			RETURN_THROWS();
2967 		}
2968 	} else {
2969 		const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
2970 		if (lang != NULL) {
2971 			charset = mbfl_no2encoding(lang->mail_charset);
2972 			transenc = mbfl_no2encoding(lang->mail_header_encoding);
2973 		}
2974 	}
2975 
2976 	if (trans_enc_name != NULL) {
2977 		if (*trans_enc_name == 'B' || *trans_enc_name == 'b') {
2978 			transenc = &mbfl_encoding_base64;
2979 		} else if (*trans_enc_name == 'Q' || *trans_enc_name == 'q') {
2980 			transenc = &mbfl_encoding_qprint;
2981 		}
2982 	}
2983 
2984 	mbfl_string_init(&result);
2985 	ret = mbfl_mime_header_encode(&string, &result, charset, transenc, linefeed, indent);
2986 	ZEND_ASSERT(ret != NULL);
2987 	// TODO: avoid reallocation ???
2988 	RETVAL_STRINGL((char *)ret->val, ret->len);	/* the string is already strdup()'ed */
2989 	efree(ret->val);
2990 }
2991 /* }}} */
2992 
2993 /* {{{ Decodes the MIME "encoded-word" in the string */
PHP_FUNCTION(mb_decode_mimeheader)2994 PHP_FUNCTION(mb_decode_mimeheader)
2995 {
2996 	char *string_val;
2997 	mbfl_string string, result, *ret;
2998 
2999 	string.encoding = MBSTRG(current_internal_encoding);
3000 
3001 	ZEND_PARSE_PARAMETERS_START(1, 1)
3002 		Z_PARAM_STRING(string_val, string.len)
3003 	ZEND_PARSE_PARAMETERS_END();
3004 
3005 	string.val = (unsigned char*)string_val;
3006 	mbfl_string_init(&result);
3007 	ret = mbfl_mime_header_decode(&string, &result, MBSTRG(current_internal_encoding));
3008 	ZEND_ASSERT(ret != NULL);
3009 	// TODO: avoid reallocation ???
3010 	RETVAL_STRINGL((char *)ret->val, ret->len);	/* the string is already strdup()'ed */
3011 	efree(ret->val);
3012 }
3013 /* }}} */
3014 
/* Runs mb_convert_kana_codepoint() over the whole of `input` (text in
 * `encoding`) and returns the re-encoded result as a new string.
 * `mode` is passed through to mb_convert_kana_codepoint() unchanged.
 *
 * The input is decoded in chunks of at most 64 codepoints. Each codepoint is
 * converted together with the codepoint that follows it. The last codepoint of
 * a chunk therefore cannot be finished until the next chunk has been decoded,
 * so unless it was already consumed as the second half of a pair it is carried
 * over to slot 0 of the next chunk. */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* One input codepoint can yield up to two output codepoints, so the output
	 * buffer is twice the size of the input buffer and needs no bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	unsigned int buf_offset = 0; /* 1 when wchar_buf[0] holds a carried-over codepoint */
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		/* Decode behind any carried-over codepoint so it is not overwritten */
		size_t n_wchars = encoding->to_wchar(&in, &in_len, wchar_buf + buf_offset, 64 - buf_offset, &state);
		n_wchars += buf_offset;
		ZEND_ASSERT(n_wchars <= 64);

		if (n_wchars == 0) {
			continue;
		}

		uint32_t *out = converted_buf;
		const size_t last = n_wchars - 1;
		bool last_already_consumed = false;

		/* Convert every codepoint that has a successor in this chunk */
		for (size_t i = 0; i < last; i++) {
			uint32_t second = 0;
			bool consumed = false;
			*out++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
			if (second) {
				*out++ = second;
			}
			if (consumed) {
				/* The successor was used up too, so skip over it */
				i++;
				if (i == last) {
					last_already_consumed = true;
					break;
				}
			}
		}

		if (last_already_consumed) {
			/* Two codepoints were consumed at the very end of the chunk, so
			 * nothing is left to reprocess on the next iteration */
			buf_offset = 0;
		} else if (in_len) {
			/* More input follows: reprocess the last codepoint with its successor */
			wchar_buf[0] = wchar_buf[last];
			buf_offset = 1;
		} else {
			/* Final chunk: convert the last codepoint now, with no successor */
			uint32_t second = 0;
			*out++ = mb_convert_kana_codepoint(wchar_buf[last], 0, NULL, &second, mode);
			if (second) {
				*out++ = second;
			}
		}

		/* The last argument tells the encoder whether this is the final chunk */
		encoding->from_wchar(converted_buf, out - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf);
}
3079 
/* Option letters accepted by mb_convert_kana().
 * The letter stored at index i selects bit i of the option bit vector.
 * Indexes i and i + 8 hold the upper- and lower-case form of the same letter. */
char mb_convert_kana_flags[17] = {
	/* bits 0-3 */
	'A', 'R', 'N', 'S',
	/* bits 4-7 */
	'K', 'H', 'M', 'C',
	/* bits 8-11 */
	'a', 'r', 'n', 's',
	/* bits 12-15 */
	'k', 'h', 'm', 'c',
	/* bit 16 */
	'V'
};
3085 
3086 /* Conversion between full-width characters and half-width characters (Japanese) */
PHP_FUNCTION(mb_convert_kana)3087 PHP_FUNCTION(mb_convert_kana)
3088 {
3089 	unsigned int opt;
3090 	char *optstr = NULL;
3091 	size_t optstr_len;
3092 	zend_string *encname = NULL, *str;
3093 
3094 	ZEND_PARSE_PARAMETERS_START(1, 3)
3095 		Z_PARAM_STR(str)
3096 		Z_PARAM_OPTIONAL
3097 		Z_PARAM_STRING(optstr, optstr_len)
3098 		Z_PARAM_STR_OR_NULL(encname)
3099 	ZEND_PARSE_PARAMETERS_END();
3100 
3101 	if (optstr != NULL) {
3102 		char *p = optstr, *e = p + optstr_len;
3103 		opt = 0;
3104 next_option:
3105 		while (p < e) {
3106 			/* Walk through option string and convert to bit vector
3107 			 * See translit_kana_jisx0201_jisx0208.h for the values used */
3108 			char c = *p++;
3109 			if (c == 'A') {
3110 				opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
3111 			} else if (c == 'a') {
3112 				opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
3113 			} else {
3114 				for (int i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
3115 					if (c == mb_convert_kana_flags[i]) {
3116 						opt |= (1 << i);
3117 						goto next_option;
3118 					}
3119 				}
3120 
3121 				zend_argument_value_error(2, "contains invalid flag: '%c'", c);
3122 				RETURN_THROWS();
3123 			}
3124 		}
3125 
3126 		/* Check for illegal combinations of options */
3127 		if (((opt & 0xFF00) >> 8) & opt) {
3128 			/* It doesn't make sense to convert the same type of characters from halfwidth to
3129 			 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
3130 			 * FW hiragana to FW katakana and then back again. */
3131 			int badflag = ((opt & 0xFF00) >> 8) & opt, i;
3132 			for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
3133 			char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
3134 			if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
3135 				flag1 = 'A';
3136 			if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
3137 				flag2 = 'a';
3138 			zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
3139 			RETURN_THROWS();
3140 		}
3141 
3142 		if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
3143 			/* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
3144 			zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
3145 			RETURN_THROWS();
3146 		}
3147 
3148 		/* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
3149 		 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
3150 		 * more than one of these */
3151 		if (opt & MBFL_ZEN2HAN_HIRAGANA) {
3152 			if (opt & MBFL_ZEN2HAN_KATAKANA) {
3153 				zend_argument_value_error(2, "must not combine 'h' and 'k' flags");
3154 				RETURN_THROWS();
3155 			} else if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3156 				zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
3157 				RETURN_THROWS();
3158 			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3159 				zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
3160 				RETURN_THROWS();
3161 			}
3162 		} else if (opt & MBFL_ZEN2HAN_KATAKANA) {
3163 			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3164 				zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
3165 				RETURN_THROWS();
3166 			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3167 				zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
3168 				RETURN_THROWS();
3169 			}
3170 		}
3171 	} else {
3172 		opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
3173 	}
3174 
3175 	const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
3176 	if (!enc) {
3177 		RETURN_THROWS();
3178 	}
3179 
3180 	RETVAL_STR(jp_kana_convert(str, enc, opt));
3181 }
3182 
/* Feeds every string reachable from `var` into the detector `identd`,
 * descending into arrays and objects (via HASH_OF).
 *
 * Returns 1 as soon as mbfl_encoding_detector_feed() reports that detection is
 * complete, otherwise 0. If a container that is already being traversed is
 * met, `*recursion_error` is set to true and the walk stops with 0. */
static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, bool *recursion_error) /* {{{ */
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		mbfl_string string;
		string.val = (unsigned char *)Z_STRVAL_P(var);
		string.len = Z_STRLEN_P(var);
		return mbfl_encoding_detector_feed(identd, &string) ? 1 : 0; /* 1: complete detecting */
	}

	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Other value types carry no text to feed */
		return 0;
	}

	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			*recursion_error = true;
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	int complete = 0;
	HashTable *ht = HASH_OF(var);
	if (ht != NULL) {
		zval *entry;
		ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
			complete = mb_recursive_encoder_detector_feed(identd, entry, recursion_error) ? 1 : 0;
			/* Stop at the first element that finishes detection or reports recursion */
			if (complete || *recursion_error) {
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	/* Every way out of the walk above ends here, so the marker is always cleared */
	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}
	return complete;
} /* }}} */
3228 
/* Converts, in place, every string reachable from `var` from `from_encoding`
 * to `to_encoding`, descending into arrays (after separating them) and objects
 * (via HASH_OF).
 *
 * Returns true if a container that is already being traversed was met, in
 * which case the walk stops early; returns false otherwise. */
static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
{
	zval *orig_var = var;
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		zend_string *converted = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
		/* The original (not dereferenced) slot is destroyed and then overwritten
		 * with the converted string */
		zval_ptr_dtor(orig_var);
		ZVAL_STR(orig_var, converted);
		return false;
	}

	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Nothing to convert in other value types */
		return false;
	}

	if (Z_TYPE_P(var) == IS_ARRAY) {
		SEPARATE_ARRAY(var);
	}
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return true;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	bool recursion_found = false;
	HashTable *ht = HASH_OF(var);
	if (ht != NULL) {
		zval *entry;
		ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
			if (mb_recursive_convert_variable(entry, from_encoding, to_encoding)) {
				recursion_found = true;
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	/* Every way out of the walk above ends here, so the marker is always cleared */
	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return recursion_found;
}
3271 
3272 /* {{{ Converts the string resource in variables to desired encoding */
PHP_FUNCTION(mb_convert_variables)3273