xref: /PHP-8.2/ext/mbstring/mbstring.c (revision c34d4fbb)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp>              |
14    |         Rui Hirokawa <hirokawa@php.net>                              |
15    |         Hironori Sato <satoh@jpnnet.com>                             |
16    |         Shigeru Kanemoto <sgk@happysize.co.jp>                       |
17    +----------------------------------------------------------------------+
18 */
19 
20 /* {{{ includes */
21 #include "libmbfl/config.h"
22 #include "php.h"
23 #include "php_ini.h"
24 #include "php_variables.h"
25 #include "mbstring.h"
26 #include "ext/standard/php_string.h"
27 #include "ext/standard/php_mail.h"
28 #include "ext/standard/exec.h"
29 #include "ext/standard/url.h"
30 #include "main/php_output.h"
31 #include "ext/standard/info.h"
32 #include "ext/pcre/php_pcre.h"
33 
34 #include "libmbfl/mbfl/mbfilter_8bit.h"
35 #include "libmbfl/mbfl/mbfilter_pass.h"
36 #include "libmbfl/mbfl/mbfilter_wchar.h"
37 #include "libmbfl/mbfl/eaw_table.h"
38 #include "libmbfl/filters/mbfilter_base64.h"
39 #include "libmbfl/filters/mbfilter_qprint.h"
40 #include "libmbfl/filters/mbfilter_htmlent.h"
41 #include "libmbfl/filters/mbfilter_uuencode.h"
42 #include "libmbfl/filters/mbfilter_ucs4.h"
43 #include "libmbfl/filters/mbfilter_utf8.h"
44 #include "libmbfl/filters/mbfilter_singlebyte.h"
45 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
46 
47 #include "php_variables.h"
48 #include "php_globals.h"
49 #include "rfc1867.h"
50 #include "php_content_types.h"
51 #include "SAPI.h"
52 #include "php_unicode.h"
53 #include "TSRM.h"
54 
55 #include "mb_gpc.h"
56 
57 #ifdef HAVE_MBREGEX
58 # include "php_mbregex.h"
59 #endif
60 
61 #include "zend_multibyte.h"
62 #include "mbstring_arginfo.h"
63 /* }}} */
64 
65 /* {{{ prototypes */
66 ZEND_DECLARE_MODULE_GLOBALS(mbstring)
67 
68 static PHP_GINIT_FUNCTION(mbstring);
69 static PHP_GSHUTDOWN_FUNCTION(mbstring);
70 
71 static void php_mb_populate_current_detect_order_list(void);
72 
73 static int php_mb_encoding_translation(void);
74 
75 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);
76 
77 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
78 
79 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
80 
81 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
82 
83 /* See mbfilter_cp5022x.c */
84 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
85 /* }}} */
86 
87 /* {{{ php_mb_default_identify_list */
88 typedef struct _php_mb_nls_ident_list {
89 	enum mbfl_no_language lang;
90 	const enum mbfl_no_encoding *list;
91 	size_t list_size;
92 } php_mb_nls_ident_list;
93 
94 static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
95 	mbfl_no_encoding_ascii,
96 	mbfl_no_encoding_jis,
97 	mbfl_no_encoding_utf8,
98 	mbfl_no_encoding_euc_jp,
99 	mbfl_no_encoding_sjis
100 };
101 
102 static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
103 	mbfl_no_encoding_ascii,
104 	mbfl_no_encoding_utf8,
105 	mbfl_no_encoding_euc_cn,
106 	mbfl_no_encoding_cp936
107 };
108 
109 static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
110 	mbfl_no_encoding_ascii,
111 	mbfl_no_encoding_utf8,
112 	mbfl_no_encoding_euc_tw,
113 	mbfl_no_encoding_big5
114 };
115 
116 static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
117 	mbfl_no_encoding_ascii,
118 	mbfl_no_encoding_utf8,
119 	mbfl_no_encoding_euc_kr,
120 	mbfl_no_encoding_uhc
121 };
122 
123 static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
124 	mbfl_no_encoding_ascii,
125 	mbfl_no_encoding_utf8,
126 	mbfl_no_encoding_koi8r,
127 	mbfl_no_encoding_cp1251,
128 	mbfl_no_encoding_cp866
129 };
130 
131 static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
132 	mbfl_no_encoding_ascii,
133 	mbfl_no_encoding_utf8,
134 	mbfl_no_encoding_armscii8
135 };
136 
137 static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
138 	mbfl_no_encoding_ascii,
139 	mbfl_no_encoding_utf8,
140 	mbfl_no_encoding_cp1254,
141 	mbfl_no_encoding_8859_9
142 };
143 
144 static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
145 	mbfl_no_encoding_ascii,
146 	mbfl_no_encoding_utf8,
147 	mbfl_no_encoding_koi8u
148 };
149 
150 static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
151 	mbfl_no_encoding_ascii,
152 	mbfl_no_encoding_utf8
153 };
154 
155 
156 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
157 	{ mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
158 	{ mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
159 	{ mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
160 	{ mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
161 	{ mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
162 	{ mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
163 	{ mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
164 	{ mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
165 	{ mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
166 };
167 
168 /* }}} */
169 
170 /* {{{ mbstring_deps[] */
171 static const zend_module_dep mbstring_deps[] = {
172 	ZEND_MOD_REQUIRED("pcre")
173 	ZEND_MOD_END
174 };
175 /* }}} */
176 
177 /* {{{ zend_module_entry mbstring_module_entry */
178 zend_module_entry mbstring_module_entry = {
179 	STANDARD_MODULE_HEADER_EX,
180 	NULL,
181 	mbstring_deps,
182 	"mbstring",
183 	ext_functions,
184 	PHP_MINIT(mbstring),
185 	PHP_MSHUTDOWN(mbstring),
186 	PHP_RINIT(mbstring),
187 	PHP_RSHUTDOWN(mbstring),
188 	PHP_MINFO(mbstring),
189 	PHP_MBSTRING_VERSION,
190 	PHP_MODULE_GLOBALS(mbstring),
191 	PHP_GINIT(mbstring),
192 	PHP_GSHUTDOWN(mbstring),
193 	NULL,
194 	STANDARD_MODULE_PROPERTIES_EX
195 };
196 /* }}} */
197 
198 /* {{{ static sapi_post_entry php_post_entries[] */
199 static const sapi_post_entry php_post_entries[] = {
200 	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data,	php_std_post_handler },
201 	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
202 	{ NULL, 0, NULL, NULL }
203 };
204 /* }}} */
205 
/* Glue emitted only when COMPILE_DL_MBSTRING is defined. */
#if defined(COMPILE_DL_MBSTRING)
# if defined(ZTS)
ZEND_TSRMLS_CACHE_DEFINE()
# endif /* ZTS */
ZEND_GET_MODULE(mbstring)
#endif /* COMPILE_DL_MBSTRING */
212 
213 /* {{{ static sapi_post_entry mbstr_post_entries[] */
214 static const sapi_post_entry mbstr_post_entries[] = {
215 	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
216 	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
217 	{ NULL, 0, NULL, NULL }
218 };
219 /* }}} */
220 
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)221 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
222 	if (encoding_name) {
223 		const mbfl_encoding *encoding;
224 		zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
225 		if (last_encoding_name && (last_encoding_name == encoding_name
226 				|| zend_string_equals_ci(encoding_name, last_encoding_name))) {
227 			return MBSTRG(last_used_encoding);
228 		}
229 
230 		encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
231 		if (!encoding) {
232 			zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
233 			return NULL;
234 		} else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
235 			if (encoding == &mbfl_encoding_base64) {
236 				php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
237 			} else if (encoding == &mbfl_encoding_qprint) {
238 				php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
239 			} else if (encoding == &mbfl_encoding_html_ent) {
240 				php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
241 			} else if (encoding == &mbfl_encoding_uuencode) {
242 				php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
243 			}
244 		}
245 
246 		if (last_encoding_name) {
247 			zend_string_release(last_encoding_name);
248 		}
249 		MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
250 		MBSTRG(last_used_encoding) = encoding;
251 		return encoding;
252 	} else {
253 		return MBSTRG(current_internal_encoding);
254 	}
255 }
256 
php_mb_get_encoding_or_pass(const char * encoding_name)257 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name) {
258 	if (strcmp(encoding_name, "pass") == 0) {
259 		return &mbfl_encoding_pass;
260 	}
261 
262 	return mbfl_name2encoding(encoding_name);
263 }
264 
count_commas(const char * p,const char * end)265 static size_t count_commas(const char *p, const char *end) {
266 	size_t count = 0;
267 	while ((p = memchr(p, ',', end - p))) {
268 		count++;
269 		p++;
270 	}
271 	return count;
272 }
273 
274 /* {{{ static zend_result php_mb_parse_encoding_list()
275  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
276  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
277  */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num,bool allow_pass_encoding)278 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
279 	const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num,
280 	bool allow_pass_encoding)
281 {
282 	if (value == NULL || value_length == 0) {
283 		*return_list = NULL;
284 		*return_size = 0;
285 		return SUCCESS;
286 	} else {
287 		bool included_auto;
288 		size_t n, size;
289 		char *p1, *endp, *tmpstr;
290 		const mbfl_encoding **entry, **list;
291 
292 		/* copy the value string for work */
293 		if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
294 			tmpstr = (char *)estrndup(value+1, value_length-2);
295 			value_length -= 2;
296 		} else {
297 			tmpstr = (char *)estrndup(value, value_length);
298 		}
299 
300 		endp = tmpstr + value_length;
301 		size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
302 		list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
303 		entry = list;
304 		n = 0;
305 		included_auto = 0;
306 		p1 = tmpstr;
307 		while (1) {
308 			char *comma = memchr(p1, ',', endp - p1);
309 			char *p = comma ? comma : endp;
310 			*p = '\0';
311 			/* trim spaces */
312 			while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
313 				p1++;
314 			}
315 			p--;
316 			while (p > p1 && (*p == ' ' || *p == '\t')) {
317 				*p = '\0';
318 				p--;
319 			}
320 			/* convert to the encoding number and check encoding */
321 			if (strcasecmp(p1, "auto") == 0) {
322 				if (!included_auto) {
323 					const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
324 					const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
325 					size_t i;
326 					included_auto = 1;
327 					for (i = 0; i < identify_list_size; i++) {
328 						*entry++ = mbfl_no2encoding(*src++);
329 						n++;
330 					}
331 				}
332 			} else {
333 				const mbfl_encoding *encoding =
334 					allow_pass_encoding ? php_mb_get_encoding_or_pass(p1) : mbfl_name2encoding(p1);
335 				if (!encoding) {
336 					/* Called from an INI setting modification */
337 					if (arg_num == 0) {
338 						php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%s\"", p1);
339 					} else {
340 						zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", p1);
341 					}
342 					efree(tmpstr);
343 					pefree(ZEND_VOIDP(list), persistent);
344 					return FAILURE;
345 				}
346 
347 				*entry++ = encoding;
348 				n++;
349 			}
350 			if (n >= size || comma == NULL) {
351 				break;
352 			}
353 			p1 = comma + 1;
354 		}
355 		*return_list = list;
356 		*return_size = n;
357 		efree(tmpstr);
358 	}
359 
360 	return SUCCESS;
361 }
362 /* }}} */
363 
364 /* {{{ static int php_mb_parse_encoding_array()
365  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
366  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
367  */
/* Converts every value of `target_hash` to a string and resolves it to an
 * encoding. "auto" (case-insensitive) expands, at most once, to the default
 * detect order list. On success *return_list receives an ecalloc()ed array
 * and *return_size its length. On failure the array is freed and FAILURE is
 * returned; an unknown name additionally raises a ValueError for `arg_num`. */
static int php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
	size_t *return_size, uint32_t arg_num)
{
	/* Room for every element plus one expansion of the default detect order. */
	size_t capacity = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
	const mbfl_encoding **encodings = ecalloc(capacity, sizeof(mbfl_encoding*));
	size_t count = 0;
	bool auto_expanded = false;
	zval *element;

	ZEND_HASH_FOREACH_VAL(target_hash, element) {
		zend_string *name = zval_try_get_string(element);
		if (UNEXPECTED(name == NULL)) {
			efree(ZEND_VOIDP(encodings));
			return FAILURE;
		}

		if (!zend_string_equals_literal_ci(name, "auto")) {
			const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(name));
			if (encoding == NULL) {
				zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(name));
				zend_string_release(name);
				efree(ZEND_VOIDP(encodings));
				return FAILURE;
			}
			encodings[count++] = encoding;
		} else if (!auto_expanded) {
			const enum mbfl_no_encoding *defaults = MBSTRG(default_detect_order_list);
			const size_t defaults_len = MBSTRG(default_detect_order_list_size);

			auto_expanded = true;
			for (size_t k = 0; k < defaults_len; k++) {
				encodings[count++] = mbfl_no2encoding(defaults[k]);
			}
		}
		zend_string_release(name);
	} ZEND_HASH_FOREACH_END();

	*return_list = encodings;
	*return_size = count;
	return SUCCESS;
}
415 /* }}} */
416 
417 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)418 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
419 {
420 	return (const zend_encoding*)mbfl_name2encoding(encoding_name);
421 }
422 
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)423 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
424 {
425 	return ((const mbfl_encoding *)encoding)->name;
426 }
427 
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)428 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
429 {
430 	const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
431 	return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
432 }
433 
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)434 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
435 {
436 	mbfl_string string;
437 
438 	if (!list) {
439 		list = (const zend_encoding **)MBSTRG(current_detect_order_list);
440 		list_size = MBSTRG(current_detect_order_list_size);
441 	}
442 
443 	mbfl_string_init(&string);
444 	string.val = (unsigned char *)arg_string;
445 	string.len = arg_length;
446 	return (const zend_encoding *) mbfl_identify_encoding(&string, (const mbfl_encoding **)list, list_size, 0);
447 }
448 
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)449 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
450 {
451 	mbfl_string string, result;
452 	mbfl_buffer_converter *convd;
453 
454 	/* new encoding */
455 	/* initialize string */
456 	string.encoding = (const mbfl_encoding*)encoding_from;
457 	string.val = (unsigned char*)from;
458 	string.len = from_length;
459 
460 	/* initialize converter */
461 	convd = mbfl_buffer_converter_new((const mbfl_encoding *)encoding_from, (const mbfl_encoding *)encoding_to, string.len);
462 	if (convd == NULL) {
463 		return (size_t) -1;
464 	}
465 
466 	mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
467 	mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
468 
469 	/* do it */
470 	size_t loc = mbfl_buffer_converter_feed(convd, &string);
471 
472 	mbfl_buffer_converter_flush(convd);
473 	mbfl_string_init(&result);
474 	if (!mbfl_buffer_converter_result(convd, &result)) {
475 		mbfl_buffer_converter_delete(convd);
476 		return (size_t)-1;
477 	}
478 
479 	*to = result.val;
480 	*to_length = result.len;
481 
482 	mbfl_buffer_converter_delete(convd);
483 
484 	return loc;
485 }
486 
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)487 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
488 {
489 	return php_mb_parse_encoding_list(
490 		encoding_list, encoding_list_len,
491 		(const mbfl_encoding ***)return_list, return_size,
492 		persistent, /* arg_num */ 0, /* allow_pass_encoding */ 1);
493 }
494 
php_mb_zend_internal_encoding_getter(void)495 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
496 {
497 	return (const zend_encoding *)MBSTRG(internal_encoding);
498 }
499 
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)500 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
501 {
502 	MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
503 	return SUCCESS;
504 }
505 
506 static zend_multibyte_functions php_mb_zend_multibyte_functions = {
507 	"mbstring",
508 	php_mb_zend_encoding_fetcher,
509 	php_mb_zend_encoding_name_getter,
510 	php_mb_zend_encoding_lexer_compatibility_checker,
511 	php_mb_zend_encoding_detector,
512 	php_mb_zend_encoding_converter,
513 	php_mb_zend_encoding_list_parser,
514 	php_mb_zend_internal_encoding_getter,
515 	php_mb_zend_internal_encoding_setter
516 };
517 /* }}} */
518 
519 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)520 static void *_php_mb_compile_regex(const char *pattern)
521 {
522 	pcre2_code *retval;
523 	PCRE2_SIZE err_offset;
524 	int errnum;
525 
526 	if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
527 			PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
528 		PCRE2_UCHAR err_str[128];
529 		pcre2_get_error_message(errnum, err_str, sizeof(err_str));
530 		php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
531 	}
532 	return retval;
533 }
534 /* }}} */
535 
536 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)537 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
538 {
539 	int res;
540 
541 	pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
542 	if (NULL == match_data) {
543 		pcre2_code_free(opaque);
544 		php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
545 		return FAILURE;
546 	}
547 	res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
548 	php_pcre_free_match_data(match_data);
549 
550 	return res;
551 }
552 /* }}} */
553 
554 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)555 static void _php_mb_free_regex(void *opaque)
556 {
557 	pcre2_code_free(opaque);
558 }
559 /* }}} */
560 
561 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)562 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
563 {
564 	size_t i;
565 
566 	*plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
567 	*plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
568 
569 	for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
570 		if (php_mb_default_identify_list[i].lang == lang) {
571 			*plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
572 			*plist_size = php_mb_default_identify_list[i].list_size;
573 			return 1;
574 		}
575 	}
576 	return 0;
577 }
578 /* }}} */
579 
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)580 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
581 {
582 	char *result = emalloc(len + 2);
583 	char *resp = result;
584 	size_t i;
585 
586 	for (i = 0; i < len && start[i] != quote; ++i) {
587 		if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
588 			*resp++ = start[++i];
589 		} else {
590 			size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
591 
592 			while (j-- > 0 && i < len) {
593 				*resp++ = start[i++];
594 			}
595 			--i;
596 		}
597 	}
598 
599 	*resp = '\0';
600 	return result;
601 }
602 
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)603 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
604 {
605 	char *pos = *line, quote;
606 	char *res;
607 
608 	while (*pos && *pos != stop) {
609 		if ((quote = *pos) == '"' || quote == '\'') {
610 			++pos;
611 			while (*pos && *pos != quote) {
612 				if (*pos == '\\' && pos[1] && pos[1] == quote) {
613 					pos += 2;
614 				} else {
615 					++pos;
616 				}
617 			}
618 			if (*pos) {
619 				++pos;
620 			}
621 		} else {
622 			pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
623 
624 		}
625 	}
626 	if (*pos == '\0') {
627 		res = estrdup(*line);
628 		*line += strlen(*line);
629 		return res;
630 	}
631 
632 	res = estrndup(*line, pos - *line);
633 
634 	while (*pos == stop) {
635 		pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
636 	}
637 
638 	*line = pos;
639 	return res;
640 }
641 /* }}} */
642 
/* Returns a newly allocated copy of the first word of `str` after skipping
 * leading whitespace. A word that starts with ' or " runs to the matching
 * quote; otherwise it runs to the next whitespace. Blank input yields "". */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	char *word = str;

	while (*word != '\0' && isspace(*(unsigned char *)word)) {
		word++;
	}

	if (*word == '\0') {
		return estrdup("");
	}

	if (*word != '"' && *word != '\'') {
		char *word_end = word;

		while (*word_end != '\0' && !isspace(*(unsigned char *)word_end)) {
			word_end++;
		}
		return php_mb_rfc1867_substring_conf(encoding, word, word_end - word, 0);
	}

	const char quote = *word;

	word++; /* step over the opening quote */
	return php_mb_rfc1867_substring_conf(encoding, word, strlen(word), quote);
}
667 /* }}} */
668 
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)669 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
670 {
671 	char *s, *s2;
672 	const size_t filename_len = strlen(filename);
673 
674 	/* The \ check should technically be needed for win32 systems only where
675 	 * it is a valid path separator. However, IE in all it's wisdom always sends
676 	 * the full path of the file on the user's filesystem, which means that unless
677 	 * the user does basename() they get a bogus file name. Until IE's user base drops
678 	 * to nill or problem is fixed this code must remain enabled for all systems. */
679 	s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
680 	s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
681 
682 	if (s && s2) {
683 		if (s > s2) {
684 			return ++s;
685 		} else {
686 			return ++s2;
687 		}
688 	} else if (s) {
689 		return ++s;
690 	} else if (s2) {
691 		return ++s2;
692 	} else {
693 		return filename;
694 	}
695 }
696 /* }}} */
697 
698 /* {{{ php.ini directive handler */
699 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)700 static PHP_INI_MH(OnUpdate_mbstring_language)
701 {
702 	enum mbfl_no_language no_language;
703 
704 	no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
705 	if (no_language == mbfl_no_language_invalid) {
706 		MBSTRG(language) = mbfl_no_language_neutral;
707 		return FAILURE;
708 	}
709 	MBSTRG(language) = no_language;
710 	php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
711 	return SUCCESS;
712 }
713 /* }}} */
714 
715 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)716 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
717 {
718 	const mbfl_encoding **list;
719 	size_t size;
720 
721 	if (!new_value) {
722 		if (MBSTRG(detect_order_list)) {
723 			pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
724 		}
725 		MBSTRG(detect_order_list) = NULL;
726 		MBSTRG(detect_order_list_size) = 0;
727 		return SUCCESS;
728 	}
729 
730 	if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0, /* allow_pass_encoding */ 0) || size == 0) {
731 		return FAILURE;
732 	}
733 
734 	if (MBSTRG(detect_order_list)) {
735 		pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
736 	}
737 	MBSTRG(detect_order_list) = list;
738 	MBSTRG(detect_order_list_size) = size;
739 	return SUCCESS;
740 }
741 /* }}} */
742 
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)743 static int _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
744 	const mbfl_encoding **list;
745 	size_t size;
746 	if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0, /* allow_pass_encoding */ 1) || size == 0) {
747 		return FAILURE;
748 	}
749 	if (MBSTRG(http_input_list)) {
750 		pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
751 	}
752 	MBSTRG(http_input_list) = list;
753 	MBSTRG(http_input_list_size) = size;
754 	return SUCCESS;
755 }
756 
757 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)758 static PHP_INI_MH(OnUpdate_mbstring_http_input)
759 {
760 	if (new_value) {
761 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
762 	}
763 
764 	if (!new_value || !ZSTR_LEN(new_value)) {
765 		const char *encoding = php_get_input_encoding();
766 		MBSTRG(http_input_set) = 0;
767 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
768 		return SUCCESS;
769 	}
770 
771 	MBSTRG(http_input_set) = 1;
772 	return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
773 }
774 /* }}} */
775 
_php_mb_ini_mbstring_http_output_set(const char * new_value)776 static int _php_mb_ini_mbstring_http_output_set(const char *new_value) {
777 	const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value);
778 	if (!encoding) {
779 		return FAILURE;
780 	}
781 
782 	MBSTRG(http_output_encoding) = encoding;
783 	MBSTRG(current_http_output_encoding) = encoding;
784 	return SUCCESS;
785 }
786 
787 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)788 static PHP_INI_MH(OnUpdate_mbstring_http_output)
789 {
790 	if (new_value) {
791 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
792 	}
793 
794 	if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
795 		MBSTRG(http_output_set) = 0;
796 		_php_mb_ini_mbstring_http_output_set(php_get_output_encoding());
797 		return SUCCESS;
798 	}
799 
800 	MBSTRG(http_output_set) = 1;
801 	return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value));
802 }
803 /* }}} */
804 
805 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)806 static int _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
807 {
808 	const mbfl_encoding *encoding;
809 
810 	if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
811 		/* falls back to UTF-8 if an unknown encoding name is given */
812 		if (new_value) {
813 			php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
814 		}
815 		encoding = &mbfl_encoding_utf8;
816 	}
817 	MBSTRG(internal_encoding) = encoding;
818 	MBSTRG(current_internal_encoding) = encoding;
819 #ifdef HAVE_MBREGEX
820 	{
821 		const char *enc_name = new_value;
822 		if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
823 			/* falls back to UTF-8 if an unknown encoding name is given */
824 			enc_name = "UTF-8";
825 			php_mb_regex_set_default_mbctype(enc_name);
826 		}
827 		php_mb_regex_set_mbctype(new_value);
828 	}
829 #endif
830 	return SUCCESS;
831 }
832 /* }}} */
833 
834 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)835 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
836 {
837 	if (new_value) {
838 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
839 	}
840 
841 	if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
842 		return FAILURE;
843 	}
844 
845 	if (new_value && ZSTR_LEN(new_value)) {
846 		MBSTRG(internal_encoding_set) = 1;
847 		return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
848 	} else {
849 		const char *encoding = php_get_internal_encoding();
850 		MBSTRG(internal_encoding_set) = 0;
851 		return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
852 	}
853 }
854 /* }}} */
855 
856 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)857 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
858 {
859 	int c;
860 	char *endptr = NULL;
861 
862 	if (new_value != NULL) {
863 		if (zend_string_equals_literal_ci(new_value, "none")) {
864 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
865 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
866 		} else if (zend_string_equals_literal_ci(new_value, "long")) {
867 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
868 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
869 		} else if (zend_string_equals_literal_ci(new_value, "entity")) {
870 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
871 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
872 		} else {
873 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
874 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
875 			if (ZSTR_LEN(new_value) > 0) {
876 				c = strtol(ZSTR_VAL(new_value), &endptr, 0);
877 				if (*endptr == '\0') {
878 					MBSTRG(filter_illegal_substchar) = c;
879 					MBSTRG(current_filter_illegal_substchar) = c;
880 				}
881 			}
882 		}
883 	} else {
884 		MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
885 		MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
886 		MBSTRG(filter_illegal_substchar) = 0x3f;	/* '?' */
887 		MBSTRG(current_filter_illegal_substchar) = 0x3f;	/* '?' */
888 	}
889 
890 	return SUCCESS;
891 }
892 /* }}} */
893 
894 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)895 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
896 {
897 	if (new_value == NULL) {
898 		return FAILURE;
899 	}
900 
901 	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
902 
903 	if (MBSTRG(encoding_translation)) {
904 		sapi_unregister_post_entry(php_post_entries);
905 		sapi_register_post_entries(mbstr_post_entries);
906 	} else {
907 		sapi_unregister_post_entry(mbstr_post_entries);
908 		sapi_register_post_entries(php_post_entries);
909 	}
910 
911 	return SUCCESS;
912 }
913 /* }}} */
914 
/* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes) */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)916 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
917 {
918 	zend_string *tmp;
919 	void *re = NULL;
920 
921 	if (!new_value) {
922 		new_value = entry->orig_value;
923 	}
924 	tmp = php_trim(new_value, NULL, 0, 3);
925 
926 	if (ZSTR_LEN(tmp) > 0) {
927 		if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
928 			zend_string_release_ex(tmp, 0);
929 			return FAILURE;
930 		}
931 	}
932 
933 	if (MBSTRG(http_output_conv_mimetypes)) {
934 		_php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
935 	}
936 
937 	MBSTRG(http_output_conv_mimetypes) = re;
938 
939 	zend_string_release_ex(tmp, 0);
940 	return SUCCESS;
941 }
942 /* }}} */
943 /* }}} */
944 
945 /* {{{ php.ini directive registration */
946 PHP_INI_BEGIN()
947 	PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
948 	PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
949 	PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
950 	PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
951 	STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
952 	PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
953 
954 	STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
955 		PHP_INI_SYSTEM | PHP_INI_PERDIR,
956 		OnUpdate_mbstring_encoding_translation,
957 		encoding_translation, zend_mbstring_globals, mbstring_globals)
958 	PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
959 		"^(text/|application/xhtml\\+xml)",
960 		PHP_INI_ALL,
961 		OnUpdate_mbstring_http_output_conv_mimetypes)
962 
963 	STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
964 		PHP_INI_ALL,
965 		OnUpdateBool,
966 		strict_detection, zend_mbstring_globals, mbstring_globals)
967 #ifdef HAVE_MBREGEX
968 	STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
969 	STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
970 #endif
PHP_INI_END()971 PHP_INI_END()
972 /* }}} */
973 
974 static void mbstring_internal_encoding_changed_hook(void) {
975 	/* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
976 	if (!MBSTRG(internal_encoding_set)) {
977 		const char *encoding = php_get_internal_encoding();
978 		_php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
979 	}
980 
981 	if (!MBSTRG(http_output_set)) {
982 		const char *encoding = php_get_output_encoding();
983 		_php_mb_ini_mbstring_http_output_set(encoding);
984 	}
985 
986 	if (!MBSTRG(http_input_set)) {
987 		const char *encoding = php_get_input_encoding();
988 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
989 	}
990 }
991 
992 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)993 static PHP_GINIT_FUNCTION(mbstring)
994 {
995 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
996 ZEND_TSRMLS_CACHE_UPDATE();
997 #endif
998 
999 	mbstring_globals->language = mbfl_no_language_uni;
1000 	mbstring_globals->internal_encoding = NULL;
1001 	mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
1002 	mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
1003 	mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
1004 	mbstring_globals->http_input_identify = NULL;
1005 	mbstring_globals->http_input_identify_get = NULL;
1006 	mbstring_globals->http_input_identify_post = NULL;
1007 	mbstring_globals->http_input_identify_cookie = NULL;
1008 	mbstring_globals->http_input_identify_string = NULL;
1009 	mbstring_globals->http_input_list = NULL;
1010 	mbstring_globals->http_input_list_size = 0;
1011 	mbstring_globals->detect_order_list = NULL;
1012 	mbstring_globals->detect_order_list_size = 0;
1013 	mbstring_globals->current_detect_order_list = NULL;
1014 	mbstring_globals->current_detect_order_list_size = 0;
1015 	mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1016 	mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1017 	mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1018 	mbstring_globals->filter_illegal_substchar = 0x3f;	/* '?' */
1019 	mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1020 	mbstring_globals->current_filter_illegal_substchar = 0x3f;	/* '?' */
1021 	mbstring_globals->illegalchars = 0;
1022 	mbstring_globals->encoding_translation = 0;
1023 	mbstring_globals->strict_detection = 0;
1024 	mbstring_globals->outconv = NULL;
1025 	mbstring_globals->http_output_conv_mimetypes = NULL;
1026 #ifdef HAVE_MBREGEX
1027 	mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1028 #endif
1029 	mbstring_globals->last_used_encoding_name = NULL;
1030 	mbstring_globals->last_used_encoding = NULL;
1031 	mbstring_globals->internal_encoding_set = 0;
1032 	mbstring_globals->http_output_set = 0;
1033 	mbstring_globals->http_input_set = 0;
1034 }
1035 /* }}} */
1036 
1037 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1038 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1039 {
1040 	if (mbstring_globals->http_input_list) {
1041 		free(ZEND_VOIDP(mbstring_globals->http_input_list));
1042 	}
1043 	if (mbstring_globals->detect_order_list) {
1044 		free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1045 	}
1046 	if (mbstring_globals->http_output_conv_mimetypes) {
1047 		_php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1048 	}
1049 #ifdef HAVE_MBREGEX
1050 	php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1051 #endif
1052 }
1053 /* }}} */
1054 
1055 /* {{{ PHP_MINIT_FUNCTION(mbstring) */
PHP_MINIT_FUNCTION(mbstring)1056 PHP_MINIT_FUNCTION(mbstring)
1057 {
1058 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
1059 ZEND_TSRMLS_CACHE_UPDATE();
1060 #endif
1061 
1062 	REGISTER_INI_ENTRIES();
1063 
1064 	/* We assume that we're the only user of the hook. */
1065 	ZEND_ASSERT(php_internal_encoding_changed == NULL);
1066 	php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
1067 	mbstring_internal_encoding_changed_hook();
1068 
1069 	/* This is a global handler. Should not be set in a per-request handler. */
1070 	sapi_register_treat_data(mbstr_treat_data);
1071 
1072 	/* Post handlers are stored in the thread-local context. */
1073 	if (MBSTRG(encoding_translation)) {
1074 		sapi_register_post_entries(mbstr_post_entries);
1075 	}
1076 
1077 #ifdef HAVE_MBREGEX
1078 	PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1079 #endif
1080 
1081 	register_mbstring_symbols(module_number);
1082 
1083 	if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
1084 		return FAILURE;
1085 	}
1086 
1087 	php_rfc1867_set_multibyte_callbacks(
1088 		php_mb_encoding_translation,
1089 		php_mb_gpc_get_detect_order,
1090 		php_mb_gpc_set_input_encoding,
1091 		php_mb_rfc1867_getword,
1092 		php_mb_rfc1867_getword_conf,
1093 		php_mb_rfc1867_basename);
1094 
1095 	return SUCCESS;
1096 }
1097 /* }}} */
1098 
1099 /* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
PHP_MSHUTDOWN_FUNCTION(mbstring)1100 PHP_MSHUTDOWN_FUNCTION(mbstring)
1101 {
1102 	UNREGISTER_INI_ENTRIES();
1103 
1104 	zend_multibyte_restore_functions();
1105 
1106 #ifdef HAVE_MBREGEX
1107 	PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1108 #endif
1109 
1110 	php_internal_encoding_changed = NULL;
1111 
1112 	return SUCCESS;
1113 }
1114 /* }}} */
1115 
1116 /* {{{ PHP_RINIT_FUNCTION(mbstring) */
PHP_RINIT_FUNCTION(mbstring)1117 PHP_RINIT_FUNCTION(mbstring)
1118 {
1119 	MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
1120 	MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
1121 	MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
1122 	MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);
1123 
1124 	MBSTRG(illegalchars) = 0;
1125 
1126 	php_mb_populate_current_detect_order_list();
1127 
1128 #ifdef HAVE_MBREGEX
1129 	PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1130 #endif
1131 	zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));
1132 
1133 	return SUCCESS;
1134 }
1135 /* }}} */
1136 
1137 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1138 PHP_RSHUTDOWN_FUNCTION(mbstring)
1139 {
1140 	if (MBSTRG(current_detect_order_list) != NULL) {
1141 		efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1142 		MBSTRG(current_detect_order_list) = NULL;
1143 		MBSTRG(current_detect_order_list_size) = 0;
1144 	}
1145 	if (MBSTRG(outconv) != NULL) {
1146 		MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1147 		mbfl_buffer_converter_delete(MBSTRG(outconv));
1148 		MBSTRG(outconv) = NULL;
1149 	}
1150 
1151 	/* clear http input identification. */
1152 	MBSTRG(http_input_identify) = NULL;
1153 	MBSTRG(http_input_identify_post) = NULL;
1154 	MBSTRG(http_input_identify_get) = NULL;
1155 	MBSTRG(http_input_identify_cookie) = NULL;
1156 	MBSTRG(http_input_identify_string) = NULL;
1157 
1158 	if (MBSTRG(last_used_encoding_name)) {
1159 		zend_string_release(MBSTRG(last_used_encoding_name));
1160 		MBSTRG(last_used_encoding_name) = NULL;
1161 	}
1162 
1163 	MBSTRG(internal_encoding_set) = 0;
1164 	MBSTRG(http_output_set) = 0;
1165 	MBSTRG(http_input_set) = 0;
1166 
1167 #ifdef HAVE_MBREGEX
1168 	PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1169 #endif
1170 
1171 	return SUCCESS;
1172 }
1173 /* }}} */
1174 
1175 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1176 PHP_MINFO_FUNCTION(mbstring)
1177 {
1178 	php_info_print_table_start();
1179 	php_info_print_table_row(2, "Multibyte Support", "enabled");
1180 	php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1181 	php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1182 	{
1183 		char tmp[256];
1184 		snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1185 		php_info_print_table_row(2, "libmbfl version", tmp);
1186 	}
1187 	php_info_print_table_end();
1188 
1189 	php_info_print_table_start();
1190 	php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1191 	php_info_print_table_end();
1192 
1193 #ifdef HAVE_MBREGEX
1194 	PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1195 #endif
1196 
1197 	DISPLAY_INI_ENTRIES();
1198 }
1199 /* }}} */
1200 
1201 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1202 PHP_FUNCTION(mb_language)
1203 {
1204 	zend_string *name = NULL;
1205 
1206 	ZEND_PARSE_PARAMETERS_START(0, 1)
1207 		Z_PARAM_OPTIONAL
1208 		Z_PARAM_STR_OR_NULL(name)
1209 	ZEND_PARSE_PARAMETERS_END();
1210 
1211 	if (name == NULL) {
1212 		RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1213 	} else {
1214 		zend_string *ini_name = zend_string_init("mbstring.language", sizeof("mbstring.language") - 1, 0);
1215 		if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1216 			zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1217 			zend_string_release_ex(ini_name, 0);
1218 			RETURN_THROWS();
1219 		}
1220 		// TODO Make return void
1221 		RETVAL_TRUE;
1222 		zend_string_release_ex(ini_name, 0);
1223 	}
1224 }
1225 /* }}} */
1226 
1227 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1228 PHP_FUNCTION(mb_internal_encoding)
1229 {
1230 	char *name = NULL;
1231 	size_t name_len;
1232 	const mbfl_encoding *encoding;
1233 
1234 	ZEND_PARSE_PARAMETERS_START(0, 1)
1235 		Z_PARAM_OPTIONAL
1236 		Z_PARAM_STRING_OR_NULL(name, name_len)
1237 	ZEND_PARSE_PARAMETERS_END();
1238 
1239 	if (name == NULL) {
1240 		ZEND_ASSERT(MBSTRG(current_internal_encoding));
1241 		RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1242 	} else {
1243 		encoding = mbfl_name2encoding(name);
1244 		if (!encoding) {
1245 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1246 			RETURN_THROWS();
1247 		} else {
1248 			MBSTRG(current_internal_encoding) = encoding;
1249 			MBSTRG(internal_encoding_set) = 1;
1250 			/* TODO Return old encoding */
1251 			RETURN_TRUE;
1252 		}
1253 	}
1254 }
1255 /* }}} */
1256 
1257 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1258 PHP_FUNCTION(mb_http_input)
1259 {
1260 	char *type = NULL;
1261 	size_t type_len = 0, n;
1262 	const mbfl_encoding **entry;
1263 	const mbfl_encoding *encoding;
1264 
1265 	ZEND_PARSE_PARAMETERS_START(0, 1)
1266 		Z_PARAM_OPTIONAL
1267 		Z_PARAM_STRING_OR_NULL(type, type_len)
1268 	ZEND_PARSE_PARAMETERS_END();
1269 
1270 	if (type == NULL) {
1271 		encoding = MBSTRG(http_input_identify);
1272 	} else {
1273 		switch (*type) {
1274 		case 'G':
1275 		case 'g':
1276 			encoding = MBSTRG(http_input_identify_get);
1277 			break;
1278 		case 'P':
1279 		case 'p':
1280 			encoding = MBSTRG(http_input_identify_post);
1281 			break;
1282 		case 'C':
1283 		case 'c':
1284 			encoding = MBSTRG(http_input_identify_cookie);
1285 			break;
1286 		case 'S':
1287 		case 's':
1288 			encoding = MBSTRG(http_input_identify_string);
1289 			break;
1290 		case 'I':
1291 		case 'i':
1292 			entry = MBSTRG(http_input_list);
1293 			n = MBSTRG(http_input_list_size);
1294 			array_init(return_value);
1295 			for (size_t i = 0; i < n; i++, entry++) {
1296 				add_next_index_string(return_value, (*entry)->name);
1297 			}
1298 			return;
1299 		case 'L':
1300 		case 'l':
1301 			entry = MBSTRG(http_input_list);
1302 			n = MBSTRG(http_input_list_size);
1303 			if (n == 0) {
1304 				RETURN_FALSE;
1305 			}
1306 			// TODO Use smart_str instead.
1307 			mbfl_string result;
1308 			mbfl_memory_device device;
1309 			mbfl_memory_device_init(&device, n * 12, 0);
1310 			for (size_t i = 0; i < n; i++, entry++) {
1311 				mbfl_memory_device_strcat(&device, (*entry)->name);
1312 				mbfl_memory_device_output(',', &device);
1313 			}
1314 			mbfl_memory_device_unput(&device); /* Remove trailing comma */
1315 			mbfl_memory_device_result(&device, &result);
1316 			RETVAL_STRINGL((const char*)result.val, result.len);
1317 			mbfl_string_clear(&result);
1318 			return;
1319 		default:
1320 			zend_argument_value_error(1,
1321 				"must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1322 			RETURN_THROWS();
1323 		}
1324 	}
1325 
1326 	if (encoding) {
1327 		RETURN_STRING(encoding->name);
1328 	} else {
1329 		RETURN_FALSE;
1330 	}
1331 }
1332 /* }}} */
1333 
1334 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1335 PHP_FUNCTION(mb_http_output)
1336 {
1337 	char *name = NULL;
1338 	size_t name_len;
1339 
1340 	ZEND_PARSE_PARAMETERS_START(0, 1)
1341 		Z_PARAM_OPTIONAL
1342 		Z_PARAM_STRING_OR_NULL(name, name_len)
1343 	ZEND_PARSE_PARAMETERS_END();
1344 
1345 	if (name == NULL) {
1346 		ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1347 		RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1348 	} else {
1349 		const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name);
1350 		if (!encoding) {
1351 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1352 			RETURN_THROWS();
1353 		} else {
1354 			MBSTRG(http_output_set) = 1;
1355 			MBSTRG(current_http_output_encoding) = encoding;
1356 			/* TODO Return previous encoding? */
1357 			RETURN_TRUE;
1358 		}
1359 	}
1360 }
1361 /* }}} */
1362 
/* {{{ Sets the current detect_order or returns the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1364 PHP_FUNCTION(mb_detect_order)
1365 {
1366 	zend_string *order_str = NULL;
1367 	HashTable *order_ht = NULL;
1368 
1369 	ZEND_PARSE_PARAMETERS_START(0, 1)
1370 		Z_PARAM_OPTIONAL
1371 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1372 	ZEND_PARSE_PARAMETERS_END();
1373 
1374 	if (!order_str && !order_ht) {
1375 		size_t n = MBSTRG(current_detect_order_list_size);
1376 		const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1377 		array_init(return_value);
1378 		for (size_t i = 0; i < n; i++) {
1379 			add_next_index_string(return_value, (*entry)->name);
1380 			entry++;
1381 		}
1382 	} else {
1383 		const mbfl_encoding **list;
1384 		size_t size;
1385 		if (order_ht) {
1386 			if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1387 				RETURN_THROWS();
1388 			}
1389 		} else {
1390 			if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1, /* allow_pass_encoding */ 0)) {
1391 				RETURN_THROWS();
1392 			}
1393 		}
1394 
1395 		if (size == 0) {
1396 			efree(ZEND_VOIDP(list));
1397 			zend_argument_value_error(1, "must specify at least one encoding");
1398 			RETURN_THROWS();
1399 		}
1400 
1401 		if (MBSTRG(current_detect_order_list)) {
1402 			efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1403 		}
1404 		MBSTRG(current_detect_order_list) = list;
1405 		MBSTRG(current_detect_order_list_size) = size;
1406 		RETURN_TRUE;
1407 	}
1408 }
1409 /* }}} */
1410 
/* Returns 1 if `cp` is acceptable as a substitute character, 0 otherwise.
 *
 * Rejected are values outside the Unicode range (negative, or 0x110000 and
 * above) and surrogate code points (0xd800..0xdfff), which are never valid
 * on their own while only a single substitute character is allowed.
 *
 * As we do not know the target encoding of the conversion operation that is
 * going to use the substitution character, we cannot check whether the
 * codepoint is actually mapped in that encoding. Everything else is
 * therefore accepted. */
static inline int php_mb_check_code_point(zend_long cp)
{
	const int in_unicode_range = (cp >= 0 && cp < 0x110000);
	const int is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);

	return (in_unicode_range && !is_surrogate) ? 1 : 0;
}
1429 
1430 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1431 PHP_FUNCTION(mb_substitute_character)
1432 {
1433 	zend_string *substitute_character = NULL;
1434 	zend_long substitute_codepoint;
1435 	bool substitute_is_null = 1;
1436 
1437 	ZEND_PARSE_PARAMETERS_START(0, 1)
1438 		Z_PARAM_OPTIONAL
1439 		Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1440 	ZEND_PARSE_PARAMETERS_END();
1441 
1442 	if (substitute_is_null) {
1443 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1444 			RETURN_STRING("none");
1445 		}
1446 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1447 			RETURN_STRING("long");
1448 		}
1449 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1450 			RETURN_STRING("entity");
1451 		}
1452 		RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1453 	}
1454 
1455 	if (substitute_character != NULL) {
1456 		if (zend_string_equals_literal_ci(substitute_character, "none")) {
1457 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1458 			RETURN_TRUE;
1459 		}
1460 		if (zend_string_equals_literal_ci(substitute_character, "long")) {
1461 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1462 			RETURN_TRUE;
1463 		}
1464 		if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1465 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1466 			RETURN_TRUE;
1467 		}
1468 		/* Invalid string value */
1469 		zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1470 		RETURN_THROWS();
1471 	}
1472 	/* Integer codepoint passed */
1473 	if (!php_mb_check_code_point(substitute_codepoint)) {
1474 		zend_argument_value_error(1, "is not a valid codepoint");
1475 		RETURN_THROWS();
1476 	}
1477 
1478 	MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1479 	MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1480 	RETURN_TRUE;
1481 }
1482 /* }}} */
1483 
1484 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1485 PHP_FUNCTION(mb_preferred_mime_name)
1486 {
1487 	enum mbfl_no_encoding no_encoding;
1488 	char *name = NULL;
1489 	size_t name_len;
1490 
1491 	ZEND_PARSE_PARAMETERS_START(1, 1)
1492 		Z_PARAM_STRING(name, name_len)
1493 	ZEND_PARSE_PARAMETERS_END();
1494 
1495 	no_encoding = mbfl_name2no_encoding(name);
1496 	if (no_encoding == mbfl_no_encoding_invalid) {
1497 		zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1498 		RETURN_THROWS();
1499 	}
1500 
1501 	const char *preferred_name = mbfl_no2preferred_mime_name(no_encoding);
1502 	if (preferred_name == NULL || *preferred_name == '\0') {
1503 		php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1504 		RETVAL_FALSE;
1505 	} else {
1506 		RETVAL_STRING((char *)preferred_name);
1507 	}
1508 }
1509 /* }}} */
1510 
1511 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1512 PHP_FUNCTION(mb_parse_str)
1513 {
1514 	zval *track_vars_array = NULL;
1515 	char *encstr;
1516 	size_t encstr_len;
1517 	php_mb_encoding_handler_info_t info;
1518 	const mbfl_encoding *detected;
1519 
1520 	ZEND_PARSE_PARAMETERS_START(2, 2)
1521 		Z_PARAM_STRING(encstr, encstr_len)
1522 		Z_PARAM_ZVAL(track_vars_array)
1523 	ZEND_PARSE_PARAMETERS_END();
1524 
1525 	track_vars_array = zend_try_array_init(track_vars_array);
1526 	if (!track_vars_array) {
1527 		RETURN_THROWS();
1528 	}
1529 
1530 	encstr = estrndup(encstr, encstr_len);
1531 
1532 	info.data_type              = PARSE_STRING;
1533 	info.separator              = PG(arg_separator).input;
1534 	info.report_errors          = true;
1535 	info.to_encoding            = MBSTRG(current_internal_encoding);
1536 	info.from_encodings         = MBSTRG(http_input_list);
1537 	info.num_from_encodings     = MBSTRG(http_input_list_size);
1538 
1539 	detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1540 
1541 	MBSTRG(http_input_identify) = detected;
1542 
1543 	RETVAL_BOOL(detected);
1544 
1545 	if (encstr != NULL) efree(encstr);
1546 }
1547 /* }}} */
1548 
1549 /* {{{ Returns string in output buffer converted to the http_output encoding */
PHP_FUNCTION(mb_output_handler)1550 PHP_FUNCTION(mb_output_handler)
1551 {
1552 	char *arg_string;
1553 	size_t arg_string_len;
1554 	zend_long arg_status;
1555 	mbfl_string string, result;
1556 	const char *charset;
1557 	char *p;
1558 	const mbfl_encoding *encoding;
1559 	int last_feed;
1560 	size_t len;
1561 	unsigned char send_text_mimetype = 0;
1562 	char *s, *mimetype = NULL;
1563 
1564 	ZEND_PARSE_PARAMETERS_START(2, 2)
1565 		Z_PARAM_STRING(arg_string, arg_string_len)
1566 		Z_PARAM_LONG(arg_status)
1567 	ZEND_PARSE_PARAMETERS_END();
1568 
1569 	encoding = MBSTRG(current_http_output_encoding);
1570 
1571 	/* start phase only */
1572 	if ((arg_status & PHP_OUTPUT_HANDLER_START) != 0) {
1573 		/* delete the converter just in case. */
1574 		if (MBSTRG(outconv)) {
1575 			MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1576 			mbfl_buffer_converter_delete(MBSTRG(outconv));
1577 			MBSTRG(outconv) = NULL;
1578 		}
1579 
1580 		if (encoding == &mbfl_encoding_pass) {
1581 			RETURN_STRINGL(arg_string, arg_string_len);
1582 		}
1583 
1584 		/* analyze mime type */
1585 		if (SG(sapi_headers).mimetype &&
1586 			_php_mb_match_regex(
1587 				MBSTRG(http_output_conv_mimetypes),
1588 				SG(sapi_headers).mimetype,
1589 				strlen(SG(sapi_headers).mimetype))) {
1590 			if ((s = strchr(SG(sapi_headers).mimetype,';')) == NULL) {
1591 				mimetype = estrdup(SG(sapi_headers).mimetype);
1592 			} else {
1593 				mimetype = estrndup(SG(sapi_headers).mimetype,s-SG(sapi_headers).mimetype);
1594 			}
1595 			send_text_mimetype = 1;
1596 		} else if (SG(sapi_headers).send_default_content_type) {
1597 			mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1598 		}
1599 
1600 		/* if content-type is not yet set, set it and activate the converter */
1601 		if (SG(sapi_headers).send_default_content_type || send_text_mimetype) {
1602 			charset = encoding->mime_name;
1603 			if (charset) {
1604 				len = spprintf( &p, 0, "Content-Type: %s; charset=%s",  mimetype, charset );
1605 				if (sapi_add_header(p, len, 0) != FAILURE) {
1606 					SG(sapi_headers).send_default_content_type = 0;
1607 				}
1608 			}
1609 			/* activate the converter */
1610 			MBSTRG(outconv) = mbfl_buffer_converter_new(MBSTRG(current_internal_encoding), encoding, 0);
1611 			if (send_text_mimetype){
1612 				efree(mimetype);
1613 			}
1614 		}
1615 	}
1616 
1617 	/* just return if the converter is not activated. */
1618 	if (MBSTRG(outconv) == NULL) {
1619 		RETURN_STRINGL(arg_string, arg_string_len);
1620 	}
1621 
1622 	/* flag */
1623 	last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1624 	/* mode */
1625 	mbfl_buffer_converter_illegal_mode(MBSTRG(outconv), MBSTRG(current_filter_illegal_mode));
1626 	mbfl_buffer_converter_illegal_substchar(MBSTRG(outconv), MBSTRG(current_filter_illegal_substchar));
1627 
1628 	/* feed the string */
1629 	mbfl_string_init(&string);
1630 	/* these are not needed. convd has encoding info.
1631 	string.encoding = MBSTRG(current_internal_encoding);
1632 	*/
1633 	string.val = (unsigned char *)arg_string;
1634 	string.len = arg_string_len;
1635 
1636 	mbfl_buffer_converter_feed(MBSTRG(outconv), &string);
1637 	if (last_feed) {
1638 		mbfl_buffer_converter_flush(MBSTRG(outconv));
1639 	}
1640 	/* get the converter output, and return it */
1641 	mbfl_buffer_converter_result(MBSTRG(outconv), &result);
1642 
1643 	// TODO: avoid reallocation ???
1644 	RETVAL_STRINGL((char *)result.val, result.len);		/* the string is already strdup()'ed */
1645 	efree(result.val);
1646 
1647 	/* delete the converter if it is the last feed. */
1648 	if (last_feed) {
1649 		MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1650 		mbfl_buffer_converter_delete(MBSTRG(outconv));
1651 		MBSTRG(outconv) = NULL;
1652 	}
1653 }
1654 /* }}} */
1655 
1656 /* {{{ Convert a multibyte string to an array. If split_length is specified,
1657  break the string down into chunks each split_length characters long. */
1658 
1659 /* structure to pass split params to the callback */
1660 struct mbfl_split_params {
1661 	zval *return_value; /* php function return value structure pointer */
1662 	mbfl_string *result_string; /* string to store result chunk */
1663 	size_t mb_chunk_length; /* actual chunk length in chars */
1664 	size_t split_length; /* split length in chars */
1665 	mbfl_convert_filter *next_filter; /* widechar to encoding converter */
1666 };
1667 
1668 /* callback function to fill split array */
mbfl_split_output(int c,void * data)1669 static int mbfl_split_output(int c, void *data)
1670 {
1671 	struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */
1672 
1673 	(*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */
1674 
1675 	if (params->split_length == ++params->mb_chunk_length) { /* if current chunk size reached defined chunk size or last char reached */
1676 		mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */
1677 		mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */
1678 		mbfl_string *chunk = params->result_string;
1679 		mbfl_memory_device_result(device, chunk); /* make chunk */
1680 		add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */
1681 		efree(chunk->val);
1682 		params->mb_chunk_length = 0; /* reset mb_chunk size */
1683 	}
1684 
1685 	return 0;
1686 }
1687 
PHP_FUNCTION(mb_str_split)1688 PHP_FUNCTION(mb_str_split)
1689 {
1690 	zend_string *str, *encoding = NULL;
1691 	size_t mb_len, chunks, chunk_len;
1692 	const char *p, *last; /* pointer for the string cursor and last string char */
1693 	mbfl_string string, result_string;
1694 	const mbfl_encoding *mbfl_encoding;
1695 	zend_long split_length = 1;
1696 
1697 	ZEND_PARSE_PARAMETERS_START(1, 3)
1698 		Z_PARAM_STR(str)
1699 		Z_PARAM_OPTIONAL
1700 		Z_PARAM_LONG(split_length)
1701 		Z_PARAM_STR_OR_NULL(encoding)
1702 	ZEND_PARSE_PARAMETERS_END();
1703 
1704 	if (split_length <= 0) {
1705 		zend_argument_value_error(2, "must be greater than 0");
1706 		RETURN_THROWS();
1707 	}
1708 
1709 	/* fill mbfl_string structure */
1710 	string.val = (unsigned char *) ZSTR_VAL(str);
1711 	string.len = ZSTR_LEN(str);
1712 	string.encoding = php_mb_get_encoding(encoding, 3);
1713 	if (!string.encoding) {
1714 		RETURN_THROWS();
1715 	}
1716 
1717 	if (ZSTR_LEN(str) == 0) {
1718 		RETURN_EMPTY_ARRAY();
1719 	}
1720 
1721 	p = ZSTR_VAL(str); /* string cursor pointer */
1722 	last = ZSTR_VAL(str) + ZSTR_LEN(str); /* last string char pointer */
1723 
1724 	mbfl_encoding = string.encoding;
1725 
1726 	/* first scenario: 1,2,4-bytes fixed width encodings (head part) */
1727 	if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
1728 		mb_len = string.len;
1729 		chunk_len = (size_t)split_length; /* chunk length in bytes */
1730 	} else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS2) { /* 2 bytes */
1731 		mb_len = string.len / 2;
1732 		chunk_len = split_length * 2;
1733 	} else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS4) { /* 4 bytes */
1734 		mb_len = string.len / 4;
1735 		chunk_len = split_length * 4;
1736 	} else if (mbfl_encoding->mblen_table != NULL) {
1737 		/* second scenario: variable width encodings with length table */
1738 		char unsigned const *mbtab = mbfl_encoding->mblen_table;
1739 
1740 		/* assume that we have 1-bytes characters */
1741 		array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1742 
1743 		while (p < last) { /* split cycle work until the cursor has reached the last byte */
1744 			char const *chunk_p = p; /* chunk first byte pointer */
1745 			chunk_len = 0; /* chunk length in bytes */
1746 			zend_long char_count;
1747 
1748 			for (char_count = 0; char_count < split_length && p < last; ++char_count) {
1749 				char unsigned const m = mbtab[*(const unsigned char *)p]; /* single character length table */
1750 				chunk_len += m;
1751 				p += m;
1752 			}
1753 			if (p >= last) chunk_len -= p - last; /* check if chunk is in bounds */
1754 			add_next_index_stringl(return_value, chunk_p, chunk_len);
1755 		}
1756 		return;
1757 	} else {
1758 		/* third scenario: other multibyte encodings */
1759 		mbfl_convert_filter *filter, *decoder;
1760 
1761 		/* assume that we have 1-bytes characters */
1762 		array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1763 
1764 		/* decoder filter to decode wchar to encoding */
1765 		mbfl_memory_device device;
1766 		mbfl_memory_device_init(&device, split_length + 1, 0);
1767 
1768 		decoder = mbfl_convert_filter_new(
1769 				&mbfl_encoding_wchar,
1770 				string.encoding,
1771 				mbfl_memory_device_output,
1772 				NULL,
1773 				&device);
1774 		/* assert that nothing is wrong with the decoder */
1775 		ZEND_ASSERT(decoder != NULL);
1776 
1777 		/* wchar filter */
1778 		mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */
1779 		struct mbfl_split_params params = { /* init callback function params structure */
1780 			.return_value = return_value,
1781 			.result_string = &result_string,
1782 			.mb_chunk_length = 0,
1783 			.split_length = (size_t)split_length,
1784 			.next_filter = decoder,
1785 		};
1786 
1787 		filter = mbfl_convert_filter_new(
1788 				string.encoding,
1789 				&mbfl_encoding_wchar,
1790 				mbfl_split_output,
1791 				NULL,
1792 				&params);
1793 		/* assert that nothing is wrong with the filter */
1794 		ZEND_ASSERT(filter != NULL);
1795 
1796 		while (p < last - 1) { /* cycle each byte except last with callback function */
1797 			(*filter->filter_function)(*p++, filter);
1798 		}
1799 		params.mb_chunk_length = split_length - 1; /* force to finish current chunk */
1800 		(*filter->filter_function)(*p++, filter); /* process last char */
1801 
1802 		mbfl_convert_filter_delete(decoder);
1803 		mbfl_convert_filter_delete(filter);
1804 		mbfl_memory_device_clear(&device);
1805 		return;
1806 	}
1807 
1808 	/* first scenario: 1,2,4-bytes fixed width encodings (tail part) */
1809 	chunks = (mb_len + split_length - 1) / split_length; /* (round up idiom) */
1810 	array_init_size(return_value, chunks);
1811 	if (chunks != 0) {
1812 		zend_long i;
1813 
1814 		for (i = 0; i < chunks - 1; p += chunk_len, ++i) {
1815 			add_next_index_stringl(return_value, p, chunk_len);
1816 		}
1817 		add_next_index_stringl(return_value, p, last - p);
1818 	}
1819 }
1820 /* }}} */
1821 
/*
 * Count the characters of 'string' when interpreted in 'encoding'.
 *
 * Fixed-width encodings are answered from the byte length alone; encodings
 * with a byte-length table are walked using that table; all others are
 * decoded with the encoding's to_wchar function and the output is counted.
 */
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
{
	const size_t byte_len = ZSTR_LEN(string);

	/* fixed-width encodings: 1, 2 or 4 bytes per character */
	if (encoding->flag & MBFL_ENCTYPE_SBCS) {
		return byte_len;
	}
	if (encoding->flag & MBFL_ENCTYPE_WCS2) {
		return byte_len / 2;
	}
	if (encoding->flag & MBFL_ENCTYPE_WCS4) {
		return byte_len / 4;
	}

	size_t count = 0;

	if (encoding->mblen_table) {
		/* the table maps a leading byte to the byte length of its character */
		const unsigned char *table = encoding->mblen_table;
		const unsigned char *cur = (const unsigned char*)ZSTR_VAL(string);
		const unsigned char *end = cur + byte_len;

		for (; cur < end; cur += table[*cur]) {
			count++;
		}
		return count;
	}

	/* generic path: decode in batches of up to 128 wchars and add up the batch sizes */
	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(string);
	size_t remaining = byte_len;
	unsigned int state = 0;

	while (remaining) {
		count += encoding->to_wchar(&in, &remaining, wchar_buf, 128, &state);
	}

	return count;
}
1852 
1853 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1854 PHP_FUNCTION(mb_strlen)
1855 {
1856 	zend_string *string, *enc_name = NULL;
1857 
1858 	ZEND_PARSE_PARAMETERS_START(1, 2)
1859 		Z_PARAM_STR(string)
1860 		Z_PARAM_OPTIONAL
1861 		Z_PARAM_STR_OR_NULL(enc_name)
1862 	ZEND_PARSE_PARAMETERS_END();
1863 
1864 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1865 	if (!enc) {
1866 		RETURN_THROWS();
1867 	}
1868 
1869 	RETVAL_LONG(mb_get_strlen(string, enc));
1870 }
1871 /* }}} */
1872 
handle_strpos_error(size_t error)1873 static void handle_strpos_error(size_t error) {
1874 	switch (error) {
1875 	case MBFL_ERROR_NOT_FOUND:
1876 		break;
1877 	case MBFL_ERROR_ENCODING:
1878 		php_error_docref(NULL, E_WARNING, "Conversion error");
1879 		break;
1880 	case MBFL_ERROR_OFFSET:
1881 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1882 		break;
1883 	default:
1884 		zend_value_error("mb_strpos(): Unknown error");
1885 		break;
1886 	}
1887 }
1888 
1889 /* {{{ Find position of first occurrence of a string within another */
PHP_FUNCTION(mb_strpos)1890 PHP_FUNCTION(mb_strpos)
1891 {
1892 	int reverse = 0;
1893 	zend_long offset = 0;
1894 	char *haystack_val, *needle_val;
1895 	mbfl_string haystack, needle;
1896 	zend_string *enc_name = NULL;
1897 
1898 	ZEND_PARSE_PARAMETERS_START(2, 4)
1899 		Z_PARAM_STRING(haystack_val, haystack.len)
1900 		Z_PARAM_STRING(needle_val, needle.len)
1901 		Z_PARAM_OPTIONAL
1902 		Z_PARAM_LONG(offset)
1903 		Z_PARAM_STR_OR_NULL(enc_name)
1904 	ZEND_PARSE_PARAMETERS_END();
1905 
1906 	haystack.val = (unsigned char*)haystack_val;
1907 	needle.val = (unsigned char*)needle_val;
1908 
1909 	haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 4);
1910 	if (!haystack.encoding) {
1911 		RETURN_THROWS();
1912 	}
1913 
1914 	size_t n = mbfl_strpos(&haystack, &needle, offset, reverse);
1915 	if (!mbfl_is_error(n)) {
1916 		RETVAL_LONG(n);
1917 	} else {
1918 		handle_strpos_error(n);
1919 		RETVAL_FALSE;
1920 	}
1921 }
1922 /* }}} */
1923 
1924 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)1925 PHP_FUNCTION(mb_strrpos)
1926 {
1927 	mbfl_string haystack, needle;
1928 	char *haystack_val, *needle_val;
1929 	zend_string *enc_name = NULL;
1930 	zend_long offset = 0;
1931 
1932 	ZEND_PARSE_PARAMETERS_START(2, 4)
1933 		Z_PARAM_STRING(haystack_val, haystack.len)
1934 		Z_PARAM_STRING(needle_val, needle.len)
1935 		Z_PARAM_OPTIONAL
1936 		Z_PARAM_LONG(offset)
1937 		Z_PARAM_STR_OR_NULL(enc_name)
1938 	ZEND_PARSE_PARAMETERS_END();
1939 
1940 	haystack.val = (unsigned char*)haystack_val;
1941 	needle.val = (unsigned char*)needle_val;
1942 
1943 	haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 4);
1944 	if (!haystack.encoding) {
1945 		RETURN_THROWS();
1946 	}
1947 
1948 	size_t n = mbfl_strpos(&haystack, &needle, offset, 1);
1949 	if (!mbfl_is_error(n)) {
1950 		RETVAL_LONG(n);
1951 	} else {
1952 		handle_strpos_error(n);
1953 		RETVAL_FALSE;
1954 	}
1955 }
1956 /* }}} */
1957 
1958 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)1959 PHP_FUNCTION(mb_stripos)
1960 {
1961 	zend_long offset = 0;
1962 	mbfl_string haystack, needle;
1963 	char *haystack_val, *needle_val;
1964 	zend_string *from_encoding = NULL;
1965 
1966 	ZEND_PARSE_PARAMETERS_START(2, 4)
1967 		Z_PARAM_STRING(haystack_val, haystack.len)
1968 		Z_PARAM_STRING(needle_val, needle.len)
1969 		Z_PARAM_OPTIONAL
1970 		Z_PARAM_LONG(offset)
1971 		Z_PARAM_STR_OR_NULL(from_encoding)
1972 	ZEND_PARSE_PARAMETERS_END();
1973 
1974 	haystack.val = (unsigned char*)haystack_val;
1975 	needle.val = (unsigned char*)needle_val;
1976 
1977 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
1978 	if (!enc) {
1979 		RETURN_THROWS();
1980 	}
1981 
1982 	size_t n = php_mb_stripos(0, (char *)haystack.val, haystack.len, (char *)needle.val, needle.len, offset, enc);
1983 
1984 	if (!mbfl_is_error(n)) {
1985 		RETVAL_LONG(n);
1986 	} else {
1987 		handle_strpos_error(n);
1988 		RETVAL_FALSE;
1989 	}
1990 }
1991 /* }}} */
1992 
1993 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)1994 PHP_FUNCTION(mb_strripos)
1995 {
1996 	zend_long offset = 0;
1997 	mbfl_string haystack, needle;
1998 	char *haystack_val, *needle_val;
1999 	zend_string *from_encoding = NULL;
2000 
2001 	ZEND_PARSE_PARAMETERS_START(2, 4)
2002 		Z_PARAM_STRING(haystack_val, haystack.len)
2003 		Z_PARAM_STRING(needle_val, needle.len)
2004 		Z_PARAM_OPTIONAL
2005 		Z_PARAM_LONG(offset)
2006 		Z_PARAM_STR_OR_NULL(from_encoding)
2007 	ZEND_PARSE_PARAMETERS_END();
2008 
2009 	haystack.val = (unsigned char*)haystack_val;
2010 	needle.val = (unsigned char*)needle_val;
2011 
2012 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2013 	if (!enc) {
2014 		RETURN_THROWS();
2015 	}
2016 
2017 	size_t n = php_mb_stripos(1, (char *)haystack.val, haystack.len, (char *)needle.val, needle.len, offset, enc);
2018 
2019 	if (!mbfl_is_error(n)) {
2020 		RETVAL_LONG(n);
2021 	} else {
2022 		handle_strpos_error(n);
2023 		RETVAL_FALSE;
2024 	}
2025 }
2026 /* }}} */
2027 
2028 #define MB_STRSTR 1
2029 #define MB_STRRCHR 2
2030 #define MB_STRISTR 3
2031 #define MB_STRRICHR 4
2032 /* {{{ php_mb_strstr_variants */
php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS,unsigned int variant)2033 static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
2034 {
2035 	int reverse_mode = 0;
2036 	size_t n;
2037 	char *haystack_val, *needle_val;
2038 	mbfl_string haystack, needle, result, *ret = NULL;
2039 	zend_string *encoding_name = NULL;
2040 	bool part = 0;
2041 
2042 	ZEND_PARSE_PARAMETERS_START(2, 4)
2043 		Z_PARAM_STRING(haystack_val, haystack.len)
2044 		Z_PARAM_STRING(needle_val, needle.len)
2045 		Z_PARAM_OPTIONAL
2046 		Z_PARAM_BOOL(part)
2047 		Z_PARAM_STR_OR_NULL(encoding_name)
2048 	ZEND_PARSE_PARAMETERS_END();
2049 
2050 	haystack.val = (unsigned char*)haystack_val;
2051 	needle.val = (unsigned char*)needle_val;
2052 	haystack.encoding = needle.encoding = php_mb_get_encoding(encoding_name, 4);
2053 	if (!haystack.encoding) {
2054 		RETURN_THROWS();
2055 	}
2056 
2057 	if (variant == MB_STRRCHR || variant == MB_STRRICHR) { reverse_mode = 1; }
2058 
2059 	if (variant == MB_STRISTR || variant == MB_STRRICHR) {
2060 		n = php_mb_stripos(reverse_mode, (char *)haystack.val, haystack.len, (char *)needle.val,
2061 			needle.len, 0, needle.encoding);
2062 	} else {
2063 		n = mbfl_strpos(&haystack, &needle, 0, reverse_mode);
2064 	}
2065 
2066 	if (!mbfl_is_error(n)) {
2067 		if (part) {
2068 			ret = mbfl_substr(&haystack, &result, 0, n);
2069 			ZEND_ASSERT(ret != NULL);
2070 			// TODO: avoid reallocation ???
2071 			RETVAL_STRINGL((char *)ret->val, ret->len);
2072 			efree(ret->val);
2073 		} else {
2074 			ret = mbfl_substr(&haystack, &result, n, MBFL_SUBSTR_UNTIL_END);
2075 			ZEND_ASSERT(ret != NULL);
2076 			// TODO: avoid reallocation ???
2077 			RETVAL_STRINGL((char *)ret->val, ret->len);
2078 			efree(ret->val);
2079 		}
2080 	} else {
2081 		// FIXME use handle_strpos_error(n)
2082 		RETVAL_FALSE;
2083 	}
2084 }
2085 
2086 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(mb_strstr)2087 PHP_FUNCTION(mb_strstr)
2088 {
2089 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
2090 }
2091 /* }}} */
2092 
2093 /* {{{ Finds the last occurrence of a character in a string within another */
PHP_FUNCTION(mb_strrchr)2094 PHP_FUNCTION(mb_strrchr)
2095 {
2096 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
2097 }
2098 /* }}} */
2099 
2100 /* {{{ Finds first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stristr)2101 PHP_FUNCTION(mb_stristr)
2102 {
2103 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
2104 }
2105 /* }}} */
2106 
2107 /* {{{ Finds the last occurrence of a character in a string within another, case insensitive */
PHP_FUNCTION(mb_strrichr)2108 PHP_FUNCTION(mb_strrichr)
2109 {
2110 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
2111 }
2112 /* }}} */
2113 
2114 #undef MB_STRSTR
2115 #undef MB_STRRCHR
2116 #undef MB_STRISTR
2117 #undef MB_STRRICHR
2118 
2119 /* {{{ Count the number of substring occurrences */
PHP_FUNCTION(mb_substr_count)2120 PHP_FUNCTION(mb_substr_count)
2121 {
2122 	mbfl_string haystack, needle;
2123 	char *haystack_val, *needle_val;
2124 	zend_string *enc_name = NULL;
2125 
2126 	ZEND_PARSE_PARAMETERS_START(2, 3)
2127 		Z_PARAM_STRING(haystack_val, haystack.len)
2128 		Z_PARAM_STRING(needle_val, needle.len)
2129 		Z_PARAM_OPTIONAL
2130 		Z_PARAM_STR_OR_NULL(enc_name)
2131 	ZEND_PARSE_PARAMETERS_END();
2132 
2133 	haystack.val = (unsigned char*)haystack_val;
2134 	needle.val = (unsigned char*)needle_val;
2135 
2136 	if (needle.len == 0) {
2137 		zend_argument_value_error(2, "must not be empty");
2138 		RETURN_THROWS();
2139 	}
2140 
2141 	haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 3);
2142 	if (!haystack.encoding) {
2143 		RETURN_THROWS();
2144 	}
2145 
2146 	size_t n = mbfl_substr_count(&haystack, &needle);
2147 	/* An error can only occur if needle is empty,
2148 	 * an encoding error happens (which should not happen at this stage and is a bug)
2149 	 * or the haystack is more than sizeof(size_t) bytes
2150 	 * If one of these things occur this is a bug and should be flagged as such */
2151 	ZEND_ASSERT(!mbfl_is_error(n));
2152 	RETVAL_LONG(n);
2153 }
2154 /* }}} */
2155 
2156 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2157 PHP_FUNCTION(mb_substr)
2158 {
2159 	char *str;
2160 	zend_string *encoding = NULL;
2161 	zend_long from, len;
2162 	size_t real_from, real_len;
2163 	size_t str_len;
2164 	bool len_is_null = 1;
2165 	mbfl_string string, result, *ret;
2166 
2167 	ZEND_PARSE_PARAMETERS_START(2, 4)
2168 		Z_PARAM_STRING(str, str_len)
2169 		Z_PARAM_LONG(from)
2170 		Z_PARAM_OPTIONAL
2171 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2172 		Z_PARAM_STR_OR_NULL(encoding)
2173 	ZEND_PARSE_PARAMETERS_END();
2174 
2175 	if (from == ZEND_LONG_MIN) {
2176 		zend_argument_value_error(2, "must be between " ZEND_LONG_FMT " and " ZEND_LONG_FMT, (ZEND_LONG_MIN + 1), ZEND_LONG_MAX);
2177 		RETURN_THROWS();
2178 	}
2179 
2180 	if (!len_is_null && len == ZEND_LONG_MIN) {
2181 		zend_argument_value_error(3, "must be between " ZEND_LONG_FMT " and " ZEND_LONG_FMT, (ZEND_LONG_MIN + 1), ZEND_LONG_MAX);
2182 		RETURN_THROWS();
2183 	}
2184 
2185 	string.encoding = php_mb_get_encoding(encoding, 4);
2186 	if (!string.encoding) {
2187 		RETURN_THROWS();
2188 	}
2189 
2190 	string.val = (unsigned char *)str;
2191 	string.len = str_len;
2192 
2193 	/* measures length */
2194 	size_t mblen = 0;
2195 	if (from < 0 || (!len_is_null && len < 0)) {
2196 		mblen = mbfl_strlen(&string);
2197 	}
2198 
2199 	/* if "from" position is negative, count start position from the end
2200 	 * of the string
2201 	 */
2202 	if (from >= 0) {
2203 		real_from = (size_t) from;
2204 	} else if (-from < mblen) {
2205 		real_from = mblen + from;
2206 	} else {
2207 		real_from = 0;
2208 	}
2209 
2210 	/* if "length" position is negative, set it to the length
2211 	 * needed to stop that many chars from the end of the string
2212 	 */
2213 	if (len_is_null) {
2214 		real_len = MBFL_SUBSTR_UNTIL_END;
2215 	} else if (len >= 0) {
2216 		real_len = (size_t) len;
2217 	} else if (real_from < mblen && -len < mblen - real_from) {
2218 		real_len = (mblen - real_from) + len;
2219 	} else {
2220 		real_len = 0;
2221 	}
2222 
2223 	ret = mbfl_substr(&string, &result, real_from, real_len);
2224 	ZEND_ASSERT(ret != NULL);
2225 
2226 	// TODO: avoid reallocation ???
2227 	RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2228 	efree(ret->val);
2229 }
2230 /* }}} */
2231 
2232 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2233 PHP_FUNCTION(mb_strcut)
2234 {
2235 	zend_string *encoding = NULL;
2236 	char *string_val;
2237 	zend_long from, len;
2238 	bool len_is_null = 1;
2239 	mbfl_string string, result, *ret;
2240 
2241 	ZEND_PARSE_PARAMETERS_START(2, 4)
2242 		Z_PARAM_STRING(string_val, string.len)
2243 		Z_PARAM_LONG(from)
2244 		Z_PARAM_OPTIONAL
2245 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2246 		Z_PARAM_STR_OR_NULL(encoding)
2247 	ZEND_PARSE_PARAMETERS_END();
2248 
2249 	string.val = (unsigned char*)string_val;
2250 	string.encoding = php_mb_get_encoding(encoding, 4);
2251 	if (!string.encoding) {
2252 		RETURN_THROWS();
2253 	}
2254 
2255 	if (len_is_null) {
2256 		len = string.len;
2257 	}
2258 
2259 	/* if "from" position is negative, count start position from the end
2260 	 * of the string
2261 	 */
2262 	if (from < 0) {
2263 		from = string.len + from;
2264 		if (from < 0) {
2265 			from = 0;
2266 		}
2267 	}
2268 
2269 	/* if "length" position is negative, set it to the length
2270 	 * needed to stop that many chars from the end of the string
2271 	 */
2272 	if (len < 0) {
2273 		len = (string.len - from) + len;
2274 		if (len < 0) {
2275 			len = 0;
2276 		}
2277 	}
2278 
2279 	if (from > string.len) {
2280 		RETURN_EMPTY_STRING();
2281 	}
2282 
2283 	ret = mbfl_strcut(&string, &result, from, len);
2284 	ZEND_ASSERT(ret != NULL);
2285 
2286 	// TODO: avoid reallocation ???
2287 	RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2288 	efree(ret->val);
2289 }
2290 /* }}} */
2291 
2292 /* Some East Asian characters, when printed at a terminal (or the like), require double
2293  * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2294 static size_t character_width(uint32_t c)
2295 {
2296 	if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2297 		return 1;
2298 	}
2299 
2300 	/* Do a binary search to see if we fall in any of the fullwidth ranges */
2301 	int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2302 	while (lo < hi) {
2303 		int probe = (lo + hi) / 2;
2304 		if (c < mbfl_eaw_table[probe].begin) {
2305 			hi = probe;
2306 		} else if (c > mbfl_eaw_table[probe].end) {
2307 			lo = probe + 1;
2308 		} else {
2309 			return 2;
2310 		}
2311 	}
2312 
2313 	return 1;
2314 }
2315 
/*
 * Compute the display width of 'string' in encoding 'enc': the sum of
 * character_width() over every decoded codepoint.
 */
static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
{
	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(string);
	size_t in_len = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total = 0;

	/* decode in batches of up to 128 wchars */
	while (in_len) {
		const size_t decoded = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(decoded <= 128);

		/* NOTE: a 'bad input' marker counts as 1 unit of width. If conversion
		 * later replaces it with an ordinary ASCII character, that matches
		 * the real display width. */
		for (size_t i = 0; i < decoded; i++) {
			total += character_width(wchar_buf[i]);
		}
	}

	return total;
}
2338 
2339 /* Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2340 PHP_FUNCTION(mb_strwidth)
2341 {
2342 	zend_string *string, *enc_name = NULL;
2343 
2344 	ZEND_PARSE_PARAMETERS_START(1, 2)
2345 		Z_PARAM_STR(string)
2346 		Z_PARAM_OPTIONAL
2347 		Z_PARAM_STR_OR_NULL(enc_name)
2348 	ZEND_PARSE_PARAMETERS_END();
2349 
2350 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2351 	if (!enc) {
2352 		RETURN_THROWS();
2353 	}
2354 
2355 	RETVAL_LONG(mb_get_strwidth(string, enc));
2356 }
2357 
2358 /* Cut 'n' codepoints from beginning of string
2359  * Remove this once mb_substr is implemented using the new conversion filters */
mb_drop_chars(zend_string * input,const mbfl_encoding * enc,size_t n)2360 static zend_string* mb_drop_chars(zend_string *input, const mbfl_encoding *enc, size_t n)
2361 {
2362 	if (n >= ZSTR_LEN(input)) {
2363 		/* No supported text encoding decodes to more than one codepoint per byte
2364 		 * So if the number of codepoints to drop >= number of input bytes,
2365 		 * then definitely the output should be empty
2366 		 * This also guards `ZSTR_LEN(input) - n` (below) from underflow */
2367 		return zend_empty_string;
2368 	}
2369 
2370 	uint32_t wchar_buf[128];
2371 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
2372 	size_t in_len = ZSTR_LEN(input);
2373 	unsigned int state = 0;
2374 
2375 	mb_convert_buf buf;
2376 	mb_convert_buf_init(&buf, ZSTR_LEN(input) - n, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2377 
2378 	while (in_len) {
2379 		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2380 		ZEND_ASSERT(out_len <= 128);
2381 
2382 		if (n >= out_len) {
2383 			n -= out_len;
2384 		} else {
2385 			enc->from_wchar(wchar_buf + n, out_len - n, &buf, !in_len);
2386 			n = 0;
2387 		}
2388 	}
2389 
2390 	return mb_convert_buf_result(&buf);
2391 }
2392 
2393 /* Pick 'n' codepoints from beginning of string
2394  * Remove this once mb_substr is implemented using the new conversion filters */
mb_pick_chars(zend_string * input,const mbfl_encoding * enc,size_t n)2395 static zend_string* mb_pick_chars(zend_string *input, const mbfl_encoding *enc, size_t n)
2396 {
2397 	uint32_t wchar_buf[128];
2398 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
2399 	size_t in_len = ZSTR_LEN(input);
2400 	unsigned int state = 0;
2401 
2402 	mb_convert_buf buf;
2403 	mb_convert_buf_init(&buf, n, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2404 
2405 	while (in_len && n) {
2406 		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2407 		ZEND_ASSERT(out_len <= 128);
2408 
2409 		enc->from_wchar(wchar_buf, MIN(out_len, n), &buf, !in_len || out_len >= n);
2410 		n -= MIN(out_len, n);
2411 	}
2412 
2413 	return mb_convert_buf_result(&buf);
2414 }
2415 
mb_trim_string(zend_string * input,zend_string * marker,const mbfl_encoding * enc,unsigned int from,int width)2416 static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, unsigned int from, int width)
2417 {
2418 	uint32_t wchar_buf[128];
2419 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
2420 	size_t in_len = ZSTR_LEN(input);
2421 	unsigned int state = 0;
2422 	int remaining_width = width;
2423 	unsigned int to_skip = from;
2424 	size_t out_len = 0;
2425 	bool first_call = true, input_err = false;
2426 	mb_convert_buf buf;
2427 
2428 	while (in_len) {
2429 		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2430 		ZEND_ASSERT(out_len <= 128);
2431 
2432 		if (out_len <= to_skip) {
2433 			to_skip -= out_len;
2434 		} else {
2435 			for (int i = to_skip; i < out_len; i++) {
2436 				uint32_t w = wchar_buf[i];
2437 				input_err |= (w == MBFL_BAD_INPUT);
2438 				remaining_width -= character_width(w);
2439 				if (remaining_width < 0) {
2440 					/* We need to truncate string and append trim marker */
2441 					width -= mb_get_strwidth(marker, enc);
2442 					/* 'width' is now the amount we want to take from 'input' */
2443 					if (width <= 0) {
2444 						return zend_string_copy(marker);
2445 					}
2446 					mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2447 
2448 					if (first_call) {
2449 						/* We can use the buffer of wchars which we have right now;
2450 						 * no need to convert again */
2451 						goto dont_restart_conversion;
2452 					} else {
2453 						goto restart_conversion;
2454 					}
2455 				}
2456 			}
2457 			to_skip = 0;
2458 		}
2459 		first_call = false;
2460 	}
2461 
2462 	/* The input string fits in the requested width; we don't need to append the trim marker
2463 	 * However, if the string contains erroneous byte sequences, those should be converted
2464 	 * to error markers */
2465 	if (from == 0 && !input_err) {
2466 		/* This just increments the string's refcount; it doesn't really 'copy' it */
2467 		return zend_string_copy(input);
2468 	}
2469 	return mb_drop_chars(input, enc, from);
2470 
2471 	/* The input string is too wide; we need to build a new string which
2472 	 * includes some portion of the input string, with the trim marker
2473 	 * concatenated onto it */
2474 restart_conversion:
2475 	in = (unsigned char*)ZSTR_VAL(input);
2476 	in_len = ZSTR_LEN(input);
2477 	state = 0;
2478 
2479 	while (true) {
2480 		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2481 		ZEND_ASSERT(out_len <= 128);
2482 
2483 dont_restart_conversion:
2484 		if (out_len <= from) {
2485 			from -= out_len;
2486 		} else {
2487 			for (int i = from; i < out_len; i++) {
2488 				width -= character_width(wchar_buf[i]);
2489 				if (width < 0) {
2490 					enc->from_wchar(wchar_buf + from, i - from, &buf, true);
2491 					goto append_trim_marker;
2492 				}
2493 			}
2494 			ZEND_ASSERT(in_len > 0);
2495 			enc->from_wchar(wchar_buf + from, out_len - from, &buf, false);
2496 			from = 0;
2497 		}
2498 	}
2499 
2500 append_trim_marker:
2501 	if (ZSTR_LEN(marker) > 0) {
2502 		MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
2503 		memcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
2504 		buf.out += ZSTR_LEN(marker);
2505 	}
2506 
2507 	return mb_convert_buf_result(&buf);
2508 }
2509 
2510 /* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
PHP_FUNCTION(mb_strimwidth)2511 PHP_FUNCTION(mb_strimwidth)
2512 {
2513 	zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2514 	zend_long from, width;
2515 
2516 	ZEND_PARSE_PARAMETERS_START(3, 5)
2517 		Z_PARAM_STR(str)
2518 		Z_PARAM_LONG(from)
2519 		Z_PARAM_LONG(width)
2520 		Z_PARAM_OPTIONAL
2521 		Z_PARAM_STR(trimmarker)
2522 		Z_PARAM_STR_OR_NULL(encoding)
2523 	ZEND_PARSE_PARAMETERS_END();
2524 
2525 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2526 	if (!enc) {
2527 		RETURN_THROWS();
2528 	}
2529 
2530 	if (from != 0) {
2531 		size_t str_len = mb_get_strlen(str, enc);
2532 		if (from < 0) {
2533 			from += str_len;
2534 		}
2535 		if (from < 0 || from > str_len) {
2536 			zend_argument_value_error(2, "is out of range");
2537 			RETURN_THROWS();
2538 		}
2539 	}
2540 
2541 	if (width < 0) {
2542 		width += mb_get_strwidth(str, enc);
2543 
2544 		if (from > 0) {
2545 			zend_string *trimmed = mb_pick_chars(str, enc, from);
2546 			width -= mb_get_strwidth(trimmed, enc);
2547 			zend_string_free(trimmed);
2548 		}
2549 
2550 		if (width < 0) {
2551 			zend_argument_value_error(3, "is out of range");
2552 			RETURN_THROWS();
2553 		}
2554 	}
2555 
2556 	RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2557 }
2558 
2559 
2560 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2561 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2562 {
2563 	return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2564 			|| (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2565 			|| (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2566 			|| (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2567 }
2568 
2569 
2570 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)2571 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
2572 {
2573 	return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
2574 }
2575 
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2576 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2577 {
2578 	unsigned int num_errors = 0;
2579 	zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2580 	MBSTRG(illegalchars) += num_errors;
2581 	return result;
2582 }
2583 
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2584 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2585 {
2586 	const mbfl_encoding *from_encoding;
2587 
2588 	/* pre-conversion encoding */
2589 	ZEND_ASSERT(num_from_encodings >= 1);
2590 	if (num_from_encodings == 1) {
2591 		from_encoding = *from_encodings;
2592 	} else {
2593 		/* auto detect */
2594 		mbfl_string string;
2595 		mbfl_string_init(&string);
2596 		string.val = (unsigned char *)input;
2597 		string.len = length;
2598 		from_encoding = mbfl_identify_encoding(
2599 			&string, from_encodings, num_from_encodings, MBSTRG(strict_detection));
2600 		if (!from_encoding) {
2601 			php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2602 			return NULL;
2603 		}
2604 	}
2605 
2606 	return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2607 }
2608 
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2609 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2610 {
2611 	HashTable *output, *chash;
2612 	zend_long idx;
2613 	zend_string *key;
2614 	zval *entry, entry_tmp;
2615 
2616 	if (!input) {
2617 		return NULL;
2618 	}
2619 
2620 	if (GC_IS_RECURSIVE(input)) {
2621 		GC_UNPROTECT_RECURSION(input);
2622 		php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2623 		return NULL;
2624 	}
2625 	GC_TRY_PROTECT_RECURSION(input);
2626 	output = zend_new_array(zend_hash_num_elements(input));
2627 	ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2628 		/* convert key */
2629 		if (key) {
2630 			zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2631 			if (!converted_key) {
2632 				continue;
2633 			}
2634 			key = converted_key;
2635 		}
2636 		/* convert value */
2637 		ZEND_ASSERT(entry);
2638 try_again:
2639 		switch(Z_TYPE_P(entry)) {
2640 			case IS_STRING: {
2641 				zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2642 				if (!converted_key) {
2643 					if (key) {
2644 						zend_string_release(key);
2645 					}
2646 					continue;
2647 				}
2648 				ZVAL_STR(&entry_tmp, converted_key);
2649 				break;
2650 			}
2651 			case IS_NULL:
2652 			case IS_TRUE:
2653 			case IS_FALSE:
2654 			case IS_LONG:
2655 			case IS_DOUBLE:
2656 				ZVAL_COPY(&entry_tmp, entry);
2657 				break;
2658 			case IS_ARRAY:
2659 				chash = php_mb_convert_encoding_recursive(
2660 					Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2661 				if (chash) {
2662 					ZVAL_ARR(&entry_tmp, chash);
2663 				} else {
2664 					ZVAL_EMPTY_ARRAY(&entry_tmp);
2665 				}
2666 				break;
2667 			case IS_REFERENCE:
2668 				entry = Z_REFVAL_P(entry);
2669 				goto try_again;
2670 			case IS_OBJECT:
2671 			default:
2672 				if (key) {
2673 					zend_string_release(key);
2674 				}
2675 				php_error_docref(NULL, E_WARNING, "Object is not supported");
2676 				continue;
2677 		}
2678 		if (key) {
2679 			zend_hash_add(output, key, &entry_tmp);
2680 			zend_string_release(key);
2681 		} else {
2682 			zend_hash_index_add(output, idx, &entry_tmp);
2683 		}
2684 	} ZEND_HASH_FOREACH_END();
2685 	GC_TRY_UNPROTECT_RECURSION(input);
2686 
2687 	return output;
2688 }
2689 /* }}} */
2690 
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2691 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2692 {
2693 	/* mbstring supports some 'text encodings' which aren't really text encodings
2694 	 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2695 	 * These should never be returned by `mb_detect_encoding`. */
2696 	int shift = 0;
2697 	for (int i = 0; i < *size; i++) {
2698 		const mbfl_encoding *encoding = elist[i];
2699 		if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2700 			shift++; /* Remove this encoding from the list */
2701 		} else if (shift) {
2702 			elist[i - shift] = encoding;
2703 		}
2704 	}
2705 	*size -= shift;
2706 }
2707 
2708 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2709 PHP_FUNCTION(mb_convert_encoding)
2710 {
2711 	zend_string *to_encoding_name;
2712 	zend_string *input_str, *from_encodings_str = NULL;
2713 	HashTable *input_ht, *from_encodings_ht = NULL;
2714 	const mbfl_encoding **from_encodings;
2715 	size_t num_from_encodings;
2716 	bool free_from_encodings;
2717 
2718 	ZEND_PARSE_PARAMETERS_START(2, 3)
2719 		Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2720 		Z_PARAM_STR(to_encoding_name)
2721 		Z_PARAM_OPTIONAL
2722 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2723 	ZEND_PARSE_PARAMETERS_END();
2724 
2725 	const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2726 	if (!to_encoding) {
2727 		RETURN_THROWS();
2728 	}
2729 
2730 	if (from_encodings_ht) {
2731 		if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2732 			RETURN_THROWS();
2733 		}
2734 		free_from_encodings = 1;
2735 	} else if (from_encodings_str) {
2736 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2737 				&from_encodings, &num_from_encodings,
2738 				/* persistent */ 0, /* arg_num */ 3, /* allow_pass_encoding */ 0) == FAILURE) {
2739 			RETURN_THROWS();
2740 		}
2741 		free_from_encodings = 1;
2742 	} else {
2743 		from_encodings = &MBSTRG(current_internal_encoding);
2744 		num_from_encodings = 1;
2745 		free_from_encodings = 0;
2746 	}
2747 
2748 	if (num_from_encodings > 1) {
2749 		remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2750 	}
2751 
2752 	if (!num_from_encodings) {
2753 		efree(ZEND_VOIDP(from_encodings));
2754 		zend_argument_value_error(3, "must specify at least one encoding");
2755 		RETURN_THROWS();
2756 	}
2757 
2758 	if (input_str) {
2759 		zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2760 		if (ret != NULL) {
2761 			RETVAL_STR(ret);
2762 		} else {
2763 			RETVAL_FALSE;
2764 		}
2765 	} else {
2766 		HashTable *tmp;
2767 		tmp = php_mb_convert_encoding_recursive(
2768 			input_ht, to_encoding, from_encodings, num_from_encodings);
2769 		RETVAL_ARR(tmp);
2770 	}
2771 
2772 	if (free_from_encodings) {
2773 		efree(ZEND_VOIDP(from_encodings));
2774 	}
2775 }
2776 /* }}} */
2777 
mbstring_convert_case(int case_mode,const char * str,size_t str_len,size_t * ret_len,const mbfl_encoding * enc)2778 static char *mbstring_convert_case(
2779 		int case_mode, const char *str, size_t str_len, size_t *ret_len,
2780 		const mbfl_encoding *enc) {
2781 	return php_unicode_convert_case(
2782 		case_mode, str, str_len, ret_len, enc,
2783 		MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2784 }
2785 
2786 /* {{{ Returns a case-folded version of source_string */
PHP_FUNCTION(mb_convert_case)2787 PHP_FUNCTION(mb_convert_case)
2788 {
2789 	zend_string *from_encoding = NULL;
2790 	char *str;
2791 	size_t str_len, ret_len;
2792 	zend_long case_mode = 0;
2793 
2794 	ZEND_PARSE_PARAMETERS_START(2, 3)
2795 		Z_PARAM_STRING(str, str_len)
2796 		Z_PARAM_LONG(case_mode)
2797 		Z_PARAM_OPTIONAL
2798 		Z_PARAM_STR_OR_NULL(from_encoding)
2799 	ZEND_PARSE_PARAMETERS_END();
2800 
2801 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2802 	if (!enc) {
2803 		RETURN_THROWS();
2804 	}
2805 
2806 	if (case_mode < 0 || case_mode > PHP_UNICODE_CASE_MODE_MAX) {
2807 		zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2808 		RETURN_THROWS();
2809 	}
2810 
2811 	char *newstr = mbstring_convert_case(case_mode, str, str_len, &ret_len, enc);
2812 	/* If newstr is NULL something went wrong in mbfl and this is a bug */
2813 	ZEND_ASSERT(newstr != NULL);
2814 
2815 	// TODO: avoid reallocation ???
2816 	RETVAL_STRINGL(newstr, ret_len);
2817 	efree(newstr);
2818 }
2819 /* }}} */
2820 
2821 /* {{{ Returns a upper cased version of source_string */
PHP_FUNCTION(mb_strtoupper)2822 PHP_FUNCTION(mb_strtoupper)
2823 {
2824 	zend_string *from_encoding = NULL;
2825 	char *str;
2826 	size_t str_len, ret_len;
2827 
2828 	ZEND_PARSE_PARAMETERS_START(1, 2)
2829 		Z_PARAM_STRING(str, str_len)
2830 		Z_PARAM_OPTIONAL
2831 		Z_PARAM_STR_OR_NULL(from_encoding)
2832 	ZEND_PARSE_PARAMETERS_END();
2833 
2834 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2835 	if (!enc) {
2836 		RETURN_THROWS();
2837 	}
2838 
2839 	char *newstr = mbstring_convert_case(PHP_UNICODE_CASE_UPPER, str, str_len, &ret_len, enc);
2840 	/* If newstr is NULL something went wrong in mbfl and this is a bug */
2841 	ZEND_ASSERT(newstr != NULL);
2842 
2843 	// TODO: avoid reallocation ???
2844 	RETVAL_STRINGL(newstr, ret_len);
2845 	efree(newstr);
2846 }
2847 /* }}} */
2848 
2849 /* {{{ Returns a lower cased version of source_string */
PHP_FUNCTION(mb_strtolower)2850 PHP_FUNCTION(mb_strtolower)
2851 {
2852 	zend_string *from_encoding = NULL;
2853 	char *str;
2854 	size_t str_len;
2855 	char *newstr;
2856 	size_t ret_len;
2857 	const mbfl_encoding *enc;
2858 
2859 	ZEND_PARSE_PARAMETERS_START(1, 2)
2860 		Z_PARAM_STRING(str, str_len)
2861 		Z_PARAM_OPTIONAL
2862 		Z_PARAM_STR_OR_NULL(from_encoding)
2863 	ZEND_PARSE_PARAMETERS_END();
2864 
2865 	enc = php_mb_get_encoding(from_encoding, 2);
2866 	if (!enc) {
2867 		RETURN_THROWS();
2868 	}
2869 
2870 	newstr = mbstring_convert_case(PHP_UNICODE_CASE_LOWER, str, str_len, &ret_len, enc);
2871 	/* If newstr is NULL something went wrong in mbfl and this is a bug */
2872 	ZEND_ASSERT(newstr != NULL);
2873 
2874 	// TODO: avoid reallocation ???
2875 	RETVAL_STRINGL(newstr, ret_len);
2876 	efree(newstr);
2877 }
2878 /* }}} */
2879 
duplicate_elist(const mbfl_encoding ** elist,size_t size)2880 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
2881 {
2882 	const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
2883 	memcpy(ZEND_VOIDP(new_elist), elist, size * sizeof(mbfl_encoding*));
2884 	return new_elist;
2885 }
2886 
2887 /* {{{ Encodings of the given string is returned (as a string) */
PHP_FUNCTION(mb_detect_encoding)2888 PHP_FUNCTION(mb_detect_encoding)
2889 {
2890 	char *str;
2891 	size_t str_len;
2892 	zend_string *encoding_str = NULL;
2893 	HashTable *encoding_ht = NULL;
2894 	bool strict = 0;
2895 
2896 	mbfl_string string;
2897 	const mbfl_encoding *ret;
2898 	const mbfl_encoding **elist;
2899 	size_t size;
2900 
2901 	ZEND_PARSE_PARAMETERS_START(1, 3)
2902 		Z_PARAM_STRING(str, str_len)
2903 		Z_PARAM_OPTIONAL
2904 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
2905 		Z_PARAM_BOOL(strict)
2906 	ZEND_PARSE_PARAMETERS_END();
2907 
2908 	/* make encoding list */
2909 	if (encoding_ht) {
2910 		if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
2911 			RETURN_THROWS();
2912 		}
2913 	} else if (encoding_str) {
2914 		if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2, /* allow_pass_encoding */ 0)) {
2915 			RETURN_THROWS();
2916 		}
2917 	} else {
2918 		elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
2919 		size = MBSTRG(current_detect_order_list_size);
2920 	}
2921 
2922 	if (size == 0) {
2923 		efree(ZEND_VOIDP(elist));
2924 		zend_argument_value_error(2, "must specify at least one encoding");
2925 		RETURN_THROWS();
2926 	}
2927 
2928 	remove_non_encodings_from_elist(elist, &size);
2929 	if (size == 0) {
2930 		efree(ZEND_VOIDP(elist));
2931 		RETURN_FALSE;
2932 	}
2933 
2934 	if (ZEND_NUM_ARGS() < 3) {
2935 		strict = MBSTRG(strict_detection);
2936 	}
2937 
2938 	if (strict && size == 1) {
2939 		/* If there is only a single candidate encoding, mb_check_encoding is faster */
2940 		ret = (php_mb_check_encoding(str, str_len, *elist)) ? *elist : NULL;
2941 	} else {
2942 		mbfl_string_init(&string);
2943 		string.val = (unsigned char *)str;
2944 		string.len = str_len;
2945 		ret = mbfl_identify_encoding(&string, elist, size, strict);
2946 	}
2947 
2948 	efree(ZEND_VOIDP(elist));
2949 
2950 	if (ret == NULL) {
2951 		RETURN_FALSE;
2952 	}
2953 
2954 	RETVAL_STRING((char *)ret->name);
2955 }
2956 /* }}} */
2957 
2958 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)2959 PHP_FUNCTION(mb_list_encodings)
2960 {
2961 	ZEND_PARSE_PARAMETERS_NONE();
2962 
2963 	array_init(return_value);
2964 	for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
2965 		add_next_index_string(return_value, (*encodings)->name);
2966 	}
2967 }
2968 /* }}} */
2969 
2970 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)2971 PHP_FUNCTION(mb_encoding_aliases)
2972 {
2973 	const mbfl_encoding *encoding;
2974 	zend_string *encoding_name = NULL;
2975 
2976 	ZEND_PARSE_PARAMETERS_START(1, 1)
2977 		Z_PARAM_STR(encoding_name)
2978 	ZEND_PARSE_PARAMETERS_END();
2979 
2980 	encoding = php_mb_get_encoding(encoding_name, 1);
2981 	if (!encoding) {
2982 		RETURN_THROWS();
2983 	}
2984 
2985 	array_init(return_value);
2986 	if (encoding->aliases != NULL) {
2987 		for (const char **alias = encoding->aliases; *alias; ++alias) {
2988 			add_next_index_string(return_value, (char *)*alias);
2989 		}
2990 	}
2991 }
2992 /* }}} */
2993 
2994 /* {{{ Converts the string to MIME "encoded-word" in the format of =?charset?(B|Q)?encoded_string?= */
PHP_FUNCTION(mb_encode_mimeheader)2995 PHP_FUNCTION(mb_encode_mimeheader)
2996 {
2997 	const mbfl_encoding *charset, *transenc;
2998 	mbfl_string  string, result, *ret;
2999 	zend_string *charset_name = NULL;
3000 	char *trans_enc_name = NULL, *string_val;
3001 	size_t trans_enc_name_len;
3002 	char *linefeed = "\r\n";
3003 	size_t linefeed_len;
3004 	zend_long indent = 0;
3005 
3006 	string.encoding = MBSTRG(current_internal_encoding);
3007 
3008 	ZEND_PARSE_PARAMETERS_START(1, 5)
3009 		Z_PARAM_STRING(string_val, string.len)
3010 		Z_PARAM_OPTIONAL
3011 		Z_PARAM_STR(charset_name)
3012 		Z_PARAM_STRING(trans_enc_name, trans_enc_name_len)
3013 		Z_PARAM_STRING(linefeed, linefeed_len)
3014 		Z_PARAM_LONG(indent)
3015 	ZEND_PARSE_PARAMETERS_END();
3016 
3017 	string.val = (unsigned char*)string_val;
3018 	charset = &mbfl_encoding_pass;
3019 	transenc = &mbfl_encoding_base64;
3020 
3021 	if (charset_name != NULL) {
3022 		charset = php_mb_get_encoding(charset_name, 2);
3023 		if (!charset) {
3024 			RETURN_THROWS();
3025 		} else if (charset->mime_name == NULL || charset->mime_name[0] == '\0') {
3026 			zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
3027 			RETURN_THROWS();
3028 		}
3029 	} else {
3030 		const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
3031 		if (lang != NULL) {
3032 			charset = mbfl_no2encoding(lang->mail_charset);
3033 			transenc = mbfl_no2encoding(lang->mail_header_encoding);
3034 		}
3035 	}
3036 
3037 	if (trans_enc_name != NULL) {
3038 		if (*trans_enc_name == 'B' || *trans_enc_name == 'b') {
3039 			transenc = &mbfl_encoding_base64;
3040 		} else if (*trans_enc_name == 'Q' || *trans_enc_name == 'q') {
3041 			transenc = &mbfl_encoding_qprint;
3042 		}
3043 	}
3044 
3045 	mbfl_string_init(&result);
3046 	ret = mbfl_mime_header_encode(&string, &result, charset, transenc, linefeed, indent);
3047 	ZEND_ASSERT(ret != NULL);
3048 	// TODO: avoid reallocation ???
3049 	RETVAL_STRINGL((char *)ret->val, ret->len);	/* the string is already strdup()'ed */
3050 	efree(ret->val);
3051 }
3052 /* }}} */
3053 
3054 /* {{{ Decodes the MIME "encoded-word" in the string */
PHP_FUNCTION(mb_decode_mimeheader)3055 PHP_FUNCTION(mb_decode_mimeheader)
3056 {
3057 	char *string_val;
3058 	mbfl_string string, result, *ret;
3059 
3060 	string.encoding = MBSTRG(current_internal_encoding);
3061 
3062 	ZEND_PARSE_PARAMETERS_START(1, 1)
3063 		Z_PARAM_STRING(string_val, string.len)
3064 	ZEND_PARSE_PARAMETERS_END();
3065 
3066 	string.val = (unsigned char*)string_val;
3067 	mbfl_string_init(&result);
3068 	ret = mbfl_mime_header_decode(&string, &result, MBSTRG(current_internal_encoding));
3069 	ZEND_ASSERT(ret != NULL);
3070 	// TODO: avoid reallocation ???
3071 	RETVAL_STRINGL((char *)ret->val, ret->len);	/* the string is already strdup()'ed */
3072 	efree(ret->val);
3073 }
3074 /* }}} */
3075 
/* Apply the kana conversion selected by `mode` to `input` (a string in
 * `encoding`) and return the converted string in the same encoding.
 *
 * The input is decoded in batches of up to 64 codepoints. Each codepoint is
 * converted together with the one following it, because the converter may
 * consume both at once. The last codepoint of a batch has no successor yet,
 * so unless it was consumed as part of a pair it is carried over into slot 0
 * of the next batch; on the final batch it is converted on its own. */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion,
	 * so the output buffer is twice the size of the input buffer and no bounds
	 * checks are needed while filling it */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	unsigned int carried = 0; /* 1 if wchar_buf[0] holds a codepoint kept from the previous batch */
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		uint32_t *out = converted_buf;

		/* Decode after any carried-over codepoint so it is not overwritten */
		size_t count = encoding->to_wchar(&in, &in_len, wchar_buf + carried, 64 - carried, &state);
		count += carried;
		ZEND_ASSERT(count <= 64);

		if (count == 0) {
			continue;
		}

		const size_t last = count - 1;
		size_t i = 0;

		/* Convert every codepoint which has a successor in this batch */
		while (i < last) {
			uint32_t second = 0;
			bool consumed = false;
			*out++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
			if (second) {
				*out++ = second;
			}
			/* Step over the successor as well if it was consumed */
			i += consumed ? 2 : 1;
		}

		if (i > last) {
			/* The final codepoint of the batch was consumed as the second half
			 * of a pair, so nothing remains to reprocess next time */
			carried = 0;
		} else if (!in_len) {
			/* Final batch: convert the last codepoint without a successor */
			uint32_t second = 0;
			*out++ = mb_convert_kana_codepoint(wchar_buf[last], 0, NULL, &second, mode);
			if (second) {
				*out++ = second;
			}
		} else {
			/* Keep the last codepoint to pair it with the next batch */
			wchar_buf[0] = wchar_buf[last];
			carried = 1;
		}

		encoding->from_wchar(converted_buf, out - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf);
}
3140 
/* Option letters accepted by mb_convert_kana(). The position of a letter in
 * this table is the number of the bit it sets in the option bit vector: the
 * letter at index i maps to (1 << i). Indexes 0-7 hold the upper-case letters
 * and indexes 8-15 the matching lower-case letters, so a letter and its
 * counterpart are always 8 positions apart. */
char mb_convert_kana_flags[17] = {
	/*  0 -  3 */ 'A', 'R', 'N', 'S',
	/*  4 -  7 */ 'K', 'H', 'M', 'C',
	/*  8 - 11 */ 'a', 'r', 'n', 's',
	/* 12 - 15 */ 'k', 'h', 'm', 'c',
	/* 16      */ 'V'
};
3146 
3147 /* Conversion between full-width characters and half-width characters (Japanese) */
PHP_FUNCTION(mb_convert_kana)3148 PHP_FUNCTION(mb_convert_kana)
3149 {
3150 	unsigned int opt;
3151 	char *optstr = NULL;
3152 	size_t optstr_len;
3153 	zend_string *encname = NULL, *str;
3154 
3155 	ZEND_PARSE_PARAMETERS_START(1, 3)
3156 		Z_PARAM_STR(str)
3157 		Z_PARAM_OPTIONAL
3158 		Z_PARAM_STRING(optstr, optstr_len)
3159 		Z_PARAM_STR_OR_NULL(encname)
3160 	ZEND_PARSE_PARAMETERS_END();
3161 
3162 	if (optstr != NULL) {
3163 		char *p = optstr, *e = p + optstr_len;
3164 		opt = 0;
3165 next_option:
3166 		while (p < e) {
3167 			/* Walk through option string and convert to bit vector
3168 			 * See translit_kana_jisx0201_jisx0208.h for the values used */
3169 			char c = *p++;
3170 			if (c == 'A') {
3171 				opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
3172 			} else if (c == 'a') {
3173 				opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
3174 			} else {
3175 				for (int i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
3176 					if (c == mb_convert_kana_flags[i]) {
3177 						opt |= (1 << i);
3178 						goto next_option;
3179 					}
3180 				}
3181 
3182 				zend_argument_value_error(2, "contains invalid flag: '%c'", c);
3183 				RETURN_THROWS();
3184 			}
3185 		}
3186 
3187 		/* Check for illegal combinations of options */
3188 		if (((opt & 0xFF00) >> 8) & opt) {
3189 			/* It doesn't make sense to convert the same type of characters from halfwidth to
3190 			 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
3191 			 * FW hiragana to FW katakana and then back again. */
3192 			int badflag = ((opt & 0xFF00) >> 8) & opt, i;
3193 			for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
3194 			char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
3195 			if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
3196 				flag1 = 'A';
3197 			if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
3198 				flag2 = 'a';
3199 			zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
3200 			RETURN_THROWS();
3201 		}
3202 
3203 		if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
3204 			/* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
3205 			zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
3206 			RETURN_THROWS();
3207 		}
3208 
3209 		/* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
3210 		 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
3211 		 * more than one of these */
3212 		if (opt & MBFL_ZEN2HAN_HIRAGANA) {
3213 			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3214 				zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
3215 				RETURN_THROWS();
3216 			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3217 				zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
3218 				RETURN_THROWS();
3219 			}
3220 		} else if (opt & MBFL_ZEN2HAN_KATAKANA) {
3221 			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3222 				zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
3223 				RETURN_THROWS();
3224 			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3225 				zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
3226 				RETURN_THROWS();
3227 			}
3228 		}
3229 	} else {
3230 		opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
3231 	}
3232 
3233 	const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
3234 	if (!enc) {
3235 		RETURN_THROWS();
3236 	}
3237 
3238 	RETVAL_STR(jp_kana_convert(str, enc, opt));
3239 }
3240 
/* Feed every string reachable from `var` (looking through references and
 * descending into arrays and objects) to the encoding detector.
 *
 * Returns 1 as soon as mbfl_encoding_detector_feed() reports a non-zero
 * result, otherwise 0. When a container which is already being visited is
 * met, `*recursion_error` is set to 1 and 0 is returned; an error flagged by
 * a nested call also stops the traversal. */
static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, int *recursion_error) /* {{{ */
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		mbfl_string chunk;
		chunk.val = (unsigned char *)Z_STRVAL_P(var);
		chunk.len = Z_STRLEN_P(var);
		return mbfl_encoding_detector_feed(identd, &chunk) ? 1 : 0; /* 1: complete detecting */
	}

	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Scalars carry no text to examine */
		return 0;
	}

	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			*recursion_error = 1;
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	int finished = 0;
	HashTable *members = HASH_OF(var);
	if (members != NULL) {
		zval *member;
		ZEND_HASH_FOREACH_VAL_IND(members, member) {
			if (mb_recursive_encoder_detector_feed(identd, member, recursion_error)) {
				finished = 1;
				break;
			}
			if (*recursion_error) {
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	/* Single exit point for every path which took the recursion guard */
	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}
	return finished;
} /* }}} */
3286 
/* Convert, in place, every string reachable from `var` using `convd`,
 * descending into arrays (which are separated first) and objects.
 *
 * Returns 1 if a container which is already being visited was met (the
 * traversal stops there), otherwise 0. Strings for which the converter
 * returns no result are left untouched. */
static int mb_recursive_convert_variable(mbfl_buffer_converter *convd, zval *var) /* {{{ */
{
	/* Remember the slot itself: a converted string replaces the slot, which
	 * also drops a reference wrapper if there was one */
	zval *slot = var;
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		mbfl_string source, converted, *ret;
		source.val = (unsigned char *)Z_STRVAL_P(var);
		source.len = Z_STRLEN_P(var);
		ret = mbfl_buffer_converter_feed_result(convd, &source, &converted);
		if (ret != NULL) {
			zval_ptr_dtor(slot);
			/* ZVAL_STRINGL copies the bytes, so the converter's buffer is released afterwards */
			ZVAL_STRINGL(slot, (const char *) ret->val, ret->len);
			efree(ret->val);
		}
		return 0;
	}

	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		return 0;
	}

	if (Z_TYPE_P(var) == IS_ARRAY) {
		SEPARATE_ARRAY(var);
	}
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return 1;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	int recursion_found = 0;
	HashTable *members = HASH_OF(var);
	if (members != NULL) {
		zval *member;
		ZEND_HASH_FOREACH_VAL(members, member) {
			/* Can be a typed property declaration, in which case we need to remove the reference from the source list.
			 * Just using ZEND_TRY_ASSIGN_STRINGL is not sufficient because that would not unwrap the reference
			 * and change values through references (see bug #26639). */
			if (Z_TYPE_P(member) == IS_INDIRECT) {
				ZEND_ASSERT(Z_TYPE_P(var) == IS_OBJECT);

				member = Z_INDIRECT_P(member);
				if (Z_ISREF_P(member) && Z_TYPE_P(Z_REFVAL_P(member)) == IS_STRING) {
					zend_property_info *info = zend_get_typed_property_info_for_slot(Z_OBJ_P(var), member);
					if (info) {
						ZEND_REF_DEL_TYPE_SOURCE(Z_REF_P(member), info);
					}
				}
			}

			if (mb_recursive_convert_variable(convd, member)) {
				recursion_found = 1;
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	/* Single exit point for every path which took the recursion guard */
	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}
	return recursion_found;
} /* }}} */
3349 
3350 /* {{{ Converts the string resource in variables to desired encoding */
PHP_FUNCTION(mb_convert_variables)3351 PHP_FUNCTION(mb_convert_variables)
3352 {
3353 	zval *args;
3354 	zend_string *to_enc_str;
3355 	zend_string *from_enc_str;
3356 	HashTable *from_enc_ht;
3357 	mbfl_string string, result;
3358 	const mbfl_encoding *from_encoding, *to_encoding;
3359 	mbfl_encoding_detector *identd;
3360 	mbfl_buffer_converter *convd;
3361 	int n, argc;
3362 	size_t elistsz;
3363 	const mbfl_encoding **elist;
3364 	int recursion_error = 0;
3365 
3366 	ZEND_PARSE_PARAMETERS_START(3, -1)
3367 		Z_PARAM_STR(to_enc_str)
3368 		Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3369 		Z_PARAM_VARIADIC('+', args, argc)
3370 	ZEND_PARSE_PARAMETERS_END();
3371 
3372 	/* new encoding */
3373 	to_encoding = php_mb_get_encoding(to_enc_str, 1);
3374 	if (!to_encoding) {
3375 		RETURN_THROWS();
3376 	}
3377 
3378 	/* initialize string */
3379 	from_encoding = MBSTRG(current_internal_encoding);
3380 	mbfl_string_init_set(&string, from_encoding);
3381 	mbfl_string_init(&result);
3382 
3383 	/* pre-conversion encoding */
3384 	if (from_enc_ht) {
3385 		if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3386 			RETURN_THROWS();
3387 		}
3388 	} else {
3389 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2, /* allow_pass_encoding */ 0) == FAILURE) {
3390 			RETURN_THROWS();
3391 		}
3392 	}
3393 
3394 	if (elistsz == 0) {
3395 		efree(ZEND_VOIDP(elist));
3396 		zend_argument_value_error(2, "must specify at least one encoding");
3397 		RETURN_THROWS();
3398 	}
3399 
3400 	if (elistsz == 1) {
3401 		from_encoding = *elist;
3402 	} else {
3403 		/* auto detect */
3404 		from_encoding = NULL;
3405 		identd = mbfl_encoding_detector_new(elist, elistsz, MBSTRG(strict_detection));
3406 		if (identd != NULL) {
3407 			n = 0;
3408 			while (n < argc) {
3409 				if (mb_recursive_encoder_detector_feed(identd, &args[n], &recursion_error)) {
3410 					break;
3411 				}
3412 				n++;
3413 			}
3414 			from_encoding = mbfl_encoding_detector_judge(identd);
3415 			mbfl_encoding_detector_delete(identd);
3416 			if (recursion_error) {
3417 				efree(ZEND_VOIDP(elist));
3418 				php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3419 				RETURN_FALSE;
3420 			}
3421 		}
3422 
3423 		if (!from_encoding) {
3424 			php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3425 			efree(ZEND_VOIDP(elist));
3426 			RETURN_FALSE;
3427 		}
3428 	}
3429 
3430 	efree(ZEND_VOIDP(elist));
3431 
3432 	convd = mbfl_buffer_converter_new(from_encoding, to_encoding, 0);
3433 	/* If this assertion fails this means some memory allocation failure which is a bug */
3434 	ZEND_ASSERT(convd != NULL);
3435 
3436 	mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
3437 	mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
3438 
3439 	/* convert */
3440 	n = 0;
3441 	while (n < argc) {
3442 		zval *zv = &args[n];
3443 
3444 		ZVAL_DEREF(zv);
3445 		recursion_error = mb_recursive_convert_variable(convd, zv);
3446 		if (recursion_error) {
3447 			break;
3448 		}
3449 		n++;
3450 	}
3451 
3452 	MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd);
3453 	mbfl_buffer_converter_delete(convd);
3454 
3455 	if (recursion_error) {
3456 		php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3457 		RETURN_FALSE;
3458 	}
3459 
3460 	RETURN_STRING(from_encoding->name);
3461 }
3462 /* }}} */
3463 
3464 /* HTML numeric entities */
3465 
3466 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
/* Flatten a PHP array into an emalloc'ed array of uint32_t, one entry per
 * array element (each element is read with zval_get_long()).
 *
 * The element count must be a multiple of 4; otherwise a ValueError for
 * argument 2 is raised and NULL is returned. On success `*convmap_size`
 * receives the number of 4-element groups and the caller owns the returned
 * buffer. */
static uint32_t *make_conversion_map(HashTable *target_hash, int *convmap_size)
{
	int count = zend_hash_num_elements(target_hash);
	if (count % 4 != 0) {
		zend_argument_value_error(2, "must have a multiple of 4 elements");
		return NULL;
	}

	uint32_t *convmap = (uint32_t*)safe_emalloc(count, sizeof(uint32_t), 0);
	size_t filled = 0;
	zval *element;

	ZEND_HASH_FOREACH_VAL(target_hash, element) {
		convmap[filled++] = zval_get_long(element);
	} ZEND_HASH_FOREACH_END();

	*convmap_size = count / 4;
	return convmap;
}
3487 
/* Look `w` up in a conversion map of `mapsize` groups, each group being
 * { low, high, offset, mask }.
 *
 * For the first group whose inclusive range [low, high] contains `w`,
 * `(w + offset) & mask` is stored in `*retval` and true is returned.
 * If no group matches, `*retval` is left untouched and false is returned. */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, int mapsize, uint32_t *retval)
{
	for (int group = 0; group < mapsize; group++) {
		const uint32_t *range = convmap + (group * 4);

		if (w < range[0] || w > range[1]) {
			continue; /* outside this range, try the next one */
		}

		/* This wchar falls inside one of the ranges which should be
		 * converted to HTML entities */
		*retval = (w + range[2]) & range[3];
		return true;
	}

	/* None of the ranges matched */
	return false;
}
3509 
/* Return a copy of `input` (a string in `encoding`) in which every codepoint
 * matched by the conversion map is replaced by an HTML numeric entity:
 * "&#<decimal>;" or, when `hex` is true, "&#x<HEX>;" with upper-case digits.
 * The number written is the value produced by html_numeric_entity_convert().
 * Codepoints which match no range are copied through unchanged. */
static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, int mapsize, bool hex)
{
	/* Each wchar which we get from decoding the input string may become up to
	 * 13 wchars when we convert it to an HTML entity */
	uint32_t wchar_buf[32], converted_buf[32 * 13];
	unsigned char digits_buf[16]; /* Scratch space; digits are produced right to left */

	const char *digit_chars = hex ? "0123456789ABCDEF" : "0123456789";
	const uint32_t radix = hex ? 16 : 10;

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		/* Convert input string to wchars, up to 32 at a time */
		size_t decoded = encoding->to_wchar(&in, &in_len, wchar_buf, 32, &state);
		ZEND_ASSERT(decoded <= 32);
		uint32_t *out = converted_buf;

		for (size_t i = 0; i < decoded; i++) {
			uint32_t code;

			if (!html_numeric_entity_convert(wchar_buf[i], convmap, mapsize, &code)) {
				/* Not in any range: pass through as-is */
				*out++ = wchar_buf[i];
				continue;
			}

			*out++ = '&';
			*out++ = '#';
			if (hex) {
				*out++ = 'x';
			}

			/* Render the number; the do/while emits a single '0' for zero */
			unsigned char *first_digit = digits_buf + sizeof(digits_buf);
			do {
				*(--first_digit) = digit_chars[code % radix];
				code /= radix;
			} while (code > 0);

			while (first_digit < digits_buf + sizeof(digits_buf)) {
				*out++ = *first_digit++;
			}

			*out++ = ';';
		}

		ZEND_ASSERT(out <= converted_buf + sizeof(converted_buf)/sizeof(*converted_buf));
		encoding->from_wchar(converted_buf, out - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf);
}
3575 
3576 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)3577 PHP_FUNCTION(mb_encode_numericentity)
3578 {
3579 	zend_string *encoding = NULL, *str;
3580 	int mapsize;
3581 	HashTable *target_hash;
3582 	bool is_hex = false;
3583 
3584 	ZEND_PARSE_PARAMETERS_START(2, 4)
3585 		Z_PARAM_STR(str)
3586 		Z_PARAM_ARRAY_HT(target_hash)
3587 		Z_PARAM_OPTIONAL
3588 		Z_PARAM_STR_OR_NULL(encoding)
3589 		Z_PARAM_BOOL(is_hex)
3590 	ZEND_PARSE_PARAMETERS_END();
3591 
3592 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3593 	if (!enc) {
3594 		RETURN_THROWS();
3595 	}
3596 
3597 	uint32_t *convmap = make_conversion_map(target_hash, &mapsize);
3598 	if (convmap == NULL) {
3599 		RETURN_THROWS();
3600 	}
3601 
3602 	RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, mapsize, is_hex));
3603 	efree(convmap);
3604 }
3605 /* }}} */
3606 
/* Map the numeric value of an HTML entity back to a codepoint.
 *
 * `convmap` holds `mapsize` entries of four uint32_t each; only the first three
 * (low codepoint, high codepoint, offset) are consulted here. For the first entry
 * where `number - offset` lies within [low, high], that difference is stored in
 * `*retval` and true is returned. If no entry matches, `*retval` is left untouched
 * and false is returned. */
static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, int mapsize, uint32_t *retval)
{
	for (int i = 0; i < mapsize; i++) {
		const uint32_t *range = convmap + (i * 4);
		/* Unsigned subtraction: wraps around if `number` is below the offset */
		uint32_t candidate = number - range[2];

		if (candidate < range[0] || candidate > range[1]) {
			continue;
		}

		*retval = candidate;
		return true;
	}

	return false;
}
3624 
3625 #define DEC_ENTITY_MINLEN 3  /* For "&#" and 1 decimal digit */
3626 #define HEX_ENTITY_MINLEN 4  /* For "&#x" and 1 hexadecimal digit */
3627 #define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
3628 #define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
3629 
html_numeric_entity_decode(zend_string * input,const mbfl_encoding * encoding,uint32_t * convmap,int mapsize)3630 static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, int mapsize)
3631 {
3632 	uint32_t wchar_buf[128], converted_buf[128];
3633 
3634 	unsigned int state = 0;
3635 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
3636 	size_t in_len = ZSTR_LEN(input);
3637 
3638 	mb_convert_buf buf;
3639 	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
3640 
3641 	/* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
3642 	 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
3643 	 * 2nd 'converted' buffer.
3644 	 *
3645 	 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
3646 	 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
3647 	 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
3648 	 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
3649 	 * to store the next batch of wchars after it.
3650 	 *
3651 	 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
3652 	 * wchars from the 1st buffer to the 2nd one.
3653 	 *
3654 	 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
3655 	 * have to do bounds checks when writing wchars into it.
3656 	 */
3657 
3658 	unsigned int wchar_buf_offset = 0;
3659 
3660 	while (in_len) {
3661 		/* Leave space for sentinel at the end of the buffer */
3662 		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
3663 		out_len += wchar_buf_offset;
3664 		ZEND_ASSERT(out_len <= 127);
3665 		wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */
3666 
3667 		uint32_t *p, *converted;
3668 
3669 		/* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
3670 		 * be there (in `wchar_buf[0]`), so don't bother in that case */
3671 		if (wchar_buf_offset == 0) {
3672 			p = wchar_buf;
3673 			while (*p != '&')
3674 				p++;
3675 			if (p == wchar_buf + out_len) {
3676 				/* No HTML entities in this buffer */
3677 				encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
3678 				continue;
3679 			}
3680 
3681 			/* Copy over the prefix with no & which we already scanned */
3682 			memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
3683 			converted = converted_buf + (p - wchar_buf);
3684 		} else {
3685 			p = wchar_buf;
3686 			converted = converted_buf;
3687 		}
3688 
3689 found_ampersand:
3690 		ZEND_ASSERT(*p == '&');
3691 		uint32_t *p2 = p;
3692 
3693 		/* These tests can't overrun end of buffer, because we have a '&' sentinel there */
3694 		if (*++p2 == '#') {
3695 			if (*++p2 == 'x') {
3696 				/* Possible hex entity */
3697 				uint32_t w = *++p2;
3698 				while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
3699 					w = *++p2;
3700 				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
3701 					/* We hit the end of the buffer while reading digits, and
3702 					 * more wchars are still coming in the next buffer
3703 					 * Reprocess this identity on next iteration */
3704 					memmove(wchar_buf, p, (p2 - p) * 4);
3705 					wchar_buf_offset = p2 - p;
3706 					goto process_converted_wchars;
3707 				} else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
3708 					/* Invalid entity (too long or "&#x" only) */
3709 					memcpy(converted, p, (p2 - p) * 4);
3710 					converted += p2 - p;
3711 				} else {
3712 					/* Valid hexadecimal entity */
3713 					uint32_t value = 0, *p3 = p + 3;
3714 					while (p3 < p2) {
3715 						w = *p3++;
3716 						if (w <= '9') {
3717 							value = (value * 16) + (w - '0');
3718 						} else if (w >= 'a') {
3719 							value = (value * 16) + 10 + (w - 'a');
3720 						} else {
3721 							value = (value * 16) + 10 + (w - 'A');
3722 						}
3723 					}
3724 					if (html_numeric_entity_deconvert(value, convmap, mapsize, converted)) {
3725 						converted++;
3726 						if (*p2 == ';')
3727 							p2++;
3728 					} else {
3729 						memcpy(converted, p, (p2 - p) * 4);
3730 						converted += p2 - p;
3731 					}
3732 				}
3733 			} else {
3734 				/* Possible decimal entity */
3735 				uint32_t w = *p2;
3736 				while (w >= '0' && w <= '9')
3737 					w = *++p2;
3738 				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
3739 					/* The number of digits was legal (no more than 10 decimal digits)
3740 					 * Reprocess this identity on next iteration of main loop */
3741 					memmove(wchar_buf, p, (p2 - p) * 4);
3742 					wchar_buf_offset = p2 - p;
3743 					goto process_converted_wchars;
3744 				} else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
3745 					/* Invalid entity (too long or "&#" only) */
3746 					memcpy(converted, p, (p2 - p) * 4);
3747 					converted += p2 - p;
3748 				} else {
3749 					/* Valid decimal entity */
3750 					uint32_t value = 0, *p3 = p + 2;
3751 					while (p3 < p2) {
3752 						/* If unsigned integer overflow would occur in the below
3753 						 * multiplication by 10, this entity is no good
3754 						 * 0x19999999 is 1/10th of 0xFFFFFFFF */
3755 						if (value > 0x19999999) {
3756 							memcpy(converted, p, (p2 - p) * 4);
3757 							converted += p2 - p;
3758 							goto decimal_entity_too_big;
3759 						}
3760 						value = (value * 10) + (*p3++ - '0');
3761 					}
3762 					if (html_numeric_entity_deconvert(value, convmap, mapsize, converted)) {
3763 						converted++;
3764 						if (*p2 == ';')
3765 							p2++;
3766 					} else {
3767 						memcpy(converted, p, (p2 - p) * 4);
3768 						converted += p2 - p;
3769 					}
3770 				}
3771 			}
3772 		} else if ((p2 == wchar_buf + out_len) && in_len) {
3773 			/* Corner case: & at end of buffer */
3774 			wchar_buf[0] = '&';
3775 			wchar_buf_offset = 1;
3776 			goto process_converted_wchars;
3777 		} else {
3778 			*converted++ = '&';
3779 		}
3780 decimal_entity_too_big:
3781 
3782 		/* Starting to scan a new section of the wchar buffer
3783 		 * 'p2' is pointing at the next wchar which needs to be processed */
3784 		p = p2;
3785 		while (*p2 != '&')
3786 			p2++;
3787 
3788 		if (p2 > p) {
3789 			memcpy(converted, p, (p2 - p) * 4);
3790 			converted += p2 - p;
3791 			p = p2;
3792 		}
3793 
3794 		if (p < wchar_buf + out_len)
3795 			goto found_ampersand;
3796 
3797 		/* We do not have any wchars remaining at the end of this buffer which
3798 		 * we need to reprocess on the next call */
3799 		wchar_buf_offset = 0;
3800 process_converted_wchars:
3801 		ZEND_ASSERT(converted <= converted_buf + 128);
3802 		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
3803 	}
3804 
3805 	return mb_convert_buf_result(&buf);
3806 }
3807 
3808 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)3809 PHP_FUNCTION(mb_decode_numericentity)
3810 {
3811 	zend_string *encoding = NULL, *str;
3812 	int mapsize;
3813 	HashTable *target_hash;
3814 
3815 	ZEND_PARSE_PARAMETERS_START(2, 3)
3816 		Z_PARAM_STR(str)
3817 		Z_PARAM_ARRAY_HT(target_hash)
3818 		Z_PARAM_OPTIONAL
3819 		Z_PARAM_STR_OR_NULL(encoding)
3820 	ZEND_PARSE_PARAMETERS_END();
3821 
3822 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3823 	if (!enc) {
3824 		RETURN_THROWS();
3825 	}
3826 
3827 	uint32_t *convmap = make_conversion_map(target_hash, &mapsize);
3828 	if (convmap == NULL) {
3829 		RETURN_THROWS();
3830 	}
3831 
3832 	RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, mapsize));
3833 	efree(convmap);
3834 }
3835 /* }}} */
3836 
3837 /* {{{ Sends an email message with MIME scheme */
3838 #define CRLF "\r\n"
3839 
_php_mbstr_parse_mail_headers(HashTable * ht,const char * str,size_t str_len)3840 static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
3841 {
3842 	const char *ps;
3843 	size_t icnt;
3844 	int state = 0;
3845 	int crlf_state = -1;
3846 	char *token = NULL;
3847 	size_t token_pos = 0;
3848 	zend_string *fld_name, *fld_val;
3849 
3850 	ps = str;
3851 	icnt = str_len;
3852 	fld_name = fld_val = NULL;
3853 
3854 	/*
3855 	 *             C o n t e n t - T y p e :   t e x t / h t m l \r\n
3856 	 *             ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
3857 	 *      state  0            1           2          3
3858 	 *
3859 	 *             C o n t e n t - T y p e :   t e x t / h t m l \r\n
3860 	 *             ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
3861 	 * crlf_state -1                       0                     1 -1
3862 	 *
3863 	 */
3864 
3865 	while (icnt > 0) {
3866 		switch (*ps) {
3867 			case ':':
3868 				if (crlf_state == 1) {
3869 					token_pos++;
3870 				}
3871 
3872 				if (state == 0 || state == 1) {
3873 					if(token && token_pos > 0) {
3874 						fld_name = zend_string_init(token, token_pos, 0);
3875 					}
3876 					state = 2;
3877 				} else {
3878 					token_pos++;
3879 				}
3880 
3881 				crlf_state = 0;
3882 				break;
3883 
3884 			case '\n':
3885 				if (crlf_state == -1) {
3886 					goto out;
3887 				}
3888 				crlf_state = -1;
3889 				break;
3890 
3891 			case '\r':
3892 				if (crlf_state == 1) {
3893 					token_pos++;
3894 				} else {
3895 					crlf_state = 1;
3896 				}
3897 				break;
3898 
3899 			case ' ': case '\t':
3900 				if (crlf_state == -1) {
3901 					if (state == 3) {
3902 						/* continuing from the previous line */
3903 						state = 4;
3904 					} else {
3905 						/* simply skipping this new line */
3906 						state = 5;
3907 					}
3908 				} else {
3909 					if (crlf_state == 1) {
3910 						token_pos++;
3911 					}
3912 					if (state == 1 || state == 3) {
3913 						token_pos++;
3914 					}
3915 				}
3916 				crlf_state = 0;
3917 				break;
3918 
3919 			default:
3920 				switch (state) {
3921 					case 0:
3922 						token = (char*)ps;
3923 						token_pos = 0;
3924 						state = 1;
3925 						break;
3926 
3927 					case 2:
3928 						if (crlf_state != -1) {
3929 							token = (char*)ps;
3930 							token_pos = 0;
3931 
3932 							state = 3;
3933 							break;
3934 						}
3935 						ZEND_FALLTHROUGH;
3936 
3937 					case 3:
3938 						if (crlf_state == -1) {
3939 							if(token && token_pos > 0) {
3940 								fld_val = zend_string_init(token, token_pos, 0);
3941 							}
3942 
3943 							if (fld_name != NULL && fld_val != NULL) {
3944 								zval val;
3945 								zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
3946 								ZVAL_STR(&val, fld_val);
3947 
3948 								zend_hash_update(ht, fld_name, &val);
3949 
3950 								zend_string_release_ex(fld_name, 0);
3951 							}
3952 
3953 							fld_name = fld_val = NULL;
3954 							token = (char*)ps;
3955 							token_pos = 0;
3956 
3957 							state = 1;
3958 						}
3959 						break;
3960 
3961 					case 4:
3962 						token_pos++;
3963 						state = 3;
3964 						break;
3965 				}
3966 
3967 				if (crlf_state == 1) {
3968 					token_pos++;
3969 				}
3970 
3971 				token_pos++;
3972 
3973 				crlf_state = 0;
3974 				break;
3975 		}
3976 		ps++, icnt--;
3977 	}
3978 out:
3979 	if (state == 2) {
3980 		token = "";
3981 		token_pos = 0;
3982 
3983 		state = 3;
3984 	}
3985 	if (state == 3) {
3986 		if(token && token_pos > 0) {
3987 			fld_val = zend_string_init(token, token_pos, 0);
3988 		}
3989 		if (fld_name != NULL && fld_val != NULL) {
3990 			zval val;
3991 			zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
3992 			ZVAL_STR(&val, fld_val);
3993 			zend_hash_update(ht, fld_name, &val);
3994 
3995 			zend_string_release_ex(fld_name, 0);
3996 		}
3997 	}
3998 	return state;
3999 }
4000 
PHP_FUNCTION(mb_send_mail)4001 PHP_FUNCTION(mb_send_mail)
4002 {
4003 	char *to;
4004 	size_t to_len;
4005 	char *message;
4006 	size_t message_len;
4007 	char *subject;
4008 	size_t subject_len;
4009 	zend_string *extra_cmd = NULL;
4010 	HashTable *headers_ht = NULL;
4011 	zend_string *str_headers = NULL;
4012 	size_t n, i;
4013 	char *to_r = NULL;
4014 	char *force_extra_parameters = INI_STR("mail.force_extra_parameters");
4015 	struct {
4016 		int cnt_type:1;
4017 		int cnt_trans_enc:1;
4018 	} suppressed_hdrs = { 0, 0 };
4019 
4020 	char *message_buf = NULL, *subject_buf = NULL, *p;
4021 	mbfl_string orig_str, conv_str;
4022 	mbfl_string *pstr;	/* pointer to mbfl string for return value */
4023 	enum mbfl_no_encoding;
4024 	const mbfl_encoding *tran_cs,	/* transfer text charset */
4025 						*head_enc,	/* header transfer encoding */
4026 						*body_enc;	/* body transfer encoding */
4027 	mbfl_memory_device device;	/* automatic allocateable buffer for additional header */
4028 	const mbfl_language *lang;
4029 	int err = 0;
4030 	HashTable ht_headers;
4031 	zval *s;
4032 	extern void mbfl_memory_device_unput(mbfl_memory_device *device);
4033 
4034 	/* initialize */
4035 	mbfl_memory_device_init(&device, 0, 0);
4036 	mbfl_string_init(&orig_str);
4037 	mbfl_string_init(&conv_str);
4038 
4039 	/* character-set, transfer-encoding */
4040 	tran_cs = &mbfl_encoding_utf8;
4041 	head_enc = &mbfl_encoding_base64;
4042 	body_enc = &mbfl_encoding_base64;
4043 	lang = mbfl_no2language(MBSTRG(language));
4044 	if (lang != NULL) {
4045 		tran_cs = mbfl_no2encoding(lang->mail_charset);
4046 		head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4047 		body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4048 	}
4049 
4050 	ZEND_PARSE_PARAMETERS_START(3, 5)
4051 		Z_PARAM_PATH(to, to_len)
4052 		Z_PARAM_PATH(subject, subject_len)
4053 		Z_PARAM_PATH(message, message_len)
4054 		Z_PARAM_OPTIONAL
4055 		Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4056 		Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4057 	ZEND_PARSE_PARAMETERS_END();
4058 
4059 	if (str_headers) {
4060 		if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4061 			zend_argument_value_error(4, "must not contain any null bytes");
4062 			RETURN_THROWS();
4063 		}
4064 		str_headers = php_trim(str_headers, NULL, 0, 2);
4065 	} else if (headers_ht) {
4066 		str_headers = php_mail_build_headers(headers_ht);
4067 		if (EG(exception)) {
4068 			RETURN_THROWS();
4069 		}
4070 	}
4071 
4072 	zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4073 
4074 	if (str_headers != NULL) {
4075 		_php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4076 	}
4077 
4078 	if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4079 		char *tmp;
4080 		char *param_name;
4081 		char *charset = NULL;
4082 
4083 		ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4084 		p = strchr(Z_STRVAL_P(s), ';');
4085 
4086 		if (p != NULL) {
4087 			/* skipping the padded spaces */
4088 			do {
4089 				++p;
4090 			} while (*p == ' ' || *p == '\t');
4091 
4092 			if (*p != '\0') {
4093 				if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4094 					if (strcasecmp(param_name, "charset") == 0) {
4095 						const mbfl_encoding *_tran_cs = tran_cs;
4096 
4097 						charset = php_strtok_r(NULL, "= \"", &tmp);
4098 						if (charset != NULL) {
4099 							_tran_cs = mbfl_name2encoding(charset);
4100 						}
4101 
4102 						if (!_tran_cs) {
4103 							php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4104 							_tran_cs = &mbfl_encoding_ascii;
4105 						}
4106 						tran_cs = _tran_cs;
4107 					}
4108 				}
4109 			}
4110 		}
4111 		suppressed_hdrs.cnt_type = 1;
4112 	}
4113 
4114 	if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4115 		const mbfl_encoding *_body_enc;
4116 
4117 		ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4118 		_body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4119 		switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4120 			case mbfl_no_encoding_base64:
4121 			case mbfl_no_encoding_7bit:
4122 			case mbfl_no_encoding_8bit:
4123 				body_enc = _body_enc;
4124 				break;
4125 
4126 			default:
4127 				php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4128 				body_enc =	&mbfl_encoding_8bit;
4129 				break;
4130 		}
4131 		suppressed_hdrs.cnt_trans_enc = 1;
4132 	}
4133 
4134 	/* To: */
4135 	if (to_len > 0) {
4136 		to_r = estrndup(to, to_len);
4137 		for (; to_len; to_len--) {
4138 			if (!isspace((unsigned char) to_r[to_len - 1])) {
4139 				break;
4140 			}
4141 			to_r[to_len - 1] = '\0';
4142 		}
4143 		for (i = 0; to_r[i]; i++) {
4144 			if (iscntrl((unsigned char) to_r[i])) {
4145 				/* According to RFC 822, section 3.1.1 long headers may be separated into
4146 				 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4147 				 * To prevent these separators from being replaced with a space, we skip over them. */
4148 				if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4149 					i += 2;
4150 					while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4151 						i++;
4152 					}
4153 					continue;
4154 				}
4155 
4156 				to_r[i] = ' ';
4157 			}
4158 		}
4159 	} else {
4160 		to_r = to;
4161 	}
4162 
4163 	/* Subject: */
4164 	orig_str.val = (unsigned char *)subject;
4165 	orig_str.len = subject_len;
4166 	orig_str.encoding = MBSTRG(current_internal_encoding);
4167 	if (orig_str.encoding->no_encoding == mbfl_no_encoding_invalid
4168 			|| orig_str.encoding->no_encoding == mbfl_no_encoding_pass) {
4169 		orig_str.encoding = mbfl_identify_encoding(&orig_str, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
4170 	}
4171 	const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4172 	size_t line_sep_len = strlen(line_sep);
4173 	pstr = mbfl_mime_header_encode(&orig_str, &conv_str, tran_cs, head_enc, line_sep, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4174 	if (pstr != NULL) {
4175 		subject_buf = subject = (char *)pstr->val;
4176 	}
4177 
4178 	/* message body */
4179 	orig_str.val = (unsigned char *)message;
4180 	orig_str.len = message_len;
4181 	orig_str.encoding = MBSTRG(current_internal_encoding);
4182 
4183 	if (orig_str.encoding->no_encoding == mbfl_no_encoding_invalid
4184 			|| orig_str.encoding->no_encoding == mbfl_no_encoding_pass) {
4185 		orig_str.encoding = mbfl_identify_encoding(&orig_str, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
4186 	}
4187 
4188 	pstr = NULL;
4189 	{
4190 		mbfl_string tmpstr;
4191 
4192 		if (mbfl_convert_encoding(&orig_str, &tmpstr, tran_cs) != NULL) {
4193 			tmpstr.encoding = &mbfl_encoding_8bit;
4194 			pstr = mbfl_convert_encoding(&tmpstr, &conv_str, body_enc);
4195 			efree(tmpstr.val);
4196 		}
4197 	}
4198 	if (pstr != NULL) {
4199 		message_buf = message = (char *)pstr->val;
4200 	}
4201 
4202 	/* other headers */
4203 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4204 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4205 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4206 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4207 
4208 	if (str_headers != NULL && ZSTR_LEN(str_headers) > 0) {
4209 		p = ZSTR_VAL(str_headers);
4210 		n = ZSTR_LEN(str_headers);
4211 		mbfl_memory_device_strncat(&device, p, n);
4212 		if (n > 0 && p[n - 1] != '\n') {
4213 			mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4214 		}
4215 		zend_string_release_ex(str_headers, 0);
4216 	}
4217 
4218 	if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4219 		mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4220 		mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4221 	}
4222 
4223 	if (!suppressed_hdrs.cnt_type) {
4224 		mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4225 
4226 		p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4227 		if (p != NULL) {
4228 			mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4229 			mbfl_memory_device_strcat(&device, p);
4230 		}
4231 		mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4232 	}
4233 	if (!suppressed_hdrs.cnt_trans_enc) {
4234 		mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4235 		p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4236 		if (p == NULL) {
4237 			p = "7bit";
4238 		}
4239 		mbfl_memory_device_strcat(&device, p);
4240 		mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4241 	}
4242 
4243 	if (!PG(mail_mixed_lf_and_crlf)) {
4244 		mbfl_memory_device_unput(&device);
4245 	}
4246 	mbfl_memory_device_unput(&device);
4247 	mbfl_memory_device_output('\0', &device);
4248 	str_headers = zend_string_init((char *)device.buffer, strlen((char *)device.buffer), 0);
4249 
4250 	if (force_extra_parameters) {
4251 		extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4252 	} else if (extra_cmd) {
4253 		extra_cmd = php_escape_shell_cmd(ZSTR_VAL(extra_cmd));
4254 	}
4255 
4256 	if (!err && php_mail(to_r, subject, message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL)) {
4257 		RETVAL_TRUE;
4258 	} else {
4259 		RETVAL_FALSE;
4260 	}
4261 
4262 	if (extra_cmd) {
4263 		zend_string_release_ex(extra_cmd, 0);
4264 	}
4265 
4266 	if (to_r != to) {
4267 		efree(to_r);
4268 	}
4269 	if (subject_buf) {
4270 		efree((void *)subject_buf);
4271 	}
4272 	if (message_buf) {
4273 		efree((void *)message_buf);
4274 	}
4275 	mbfl_memory_device_clear(&device);
4276 	zend_hash_destroy(&ht_headers);
4277 	if (str_headers) {
4278 		zend_string_release_ex(str_headers, 0);
4279 	}
4280 }
4281 
4282 #undef CRLF
4283 #undef MAIL_ASCIIZ_CHECK_MBSTRING
4284 #undef PHP_MBSTR_MAIL_MIME_HEADER1
4285 #undef PHP_MBSTR_MAIL_MIME_HEADER2
4286 #undef PHP_MBSTR_MAIL_MIME_HEADER3
4287 #undef PHP_MBSTR_MAIL_MIME_HEADER4
4288 /* }}} */
4289 
4290 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)4291 PHP_FUNCTION(mb_get_info)
4292 {
4293 	zend_string *type = NULL;
4294 	size_t n;
4295 	char *name;
4296 	zval row;
4297 	const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
4298 	const mbfl_encoding **entry;
4299 
4300 	ZEND_PARSE_PARAMETERS_START(0, 1)
4301 		Z_PARAM_OPTIONAL
4302 		Z_PARAM_STR(type)
4303 	ZEND_PARSE_PARAMETERS_END();
4304 
4305 	if (!type || zend_string_equals_literal_ci(type, "all")) {
4306 		array_init(return_value);
4307 		if (MBSTRG(current_internal_encoding)) {
4308 			add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4309 		}
4310 		if (MBSTRG(http_input_identify)) {
4311 			add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4312 		}
4313 		if (MBSTRG(current_http_output_encoding)) {
4314 			add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4315 		}
4316 		if ((name = (char *)zend_ini_string("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)) != NULL) {
4317 			add_assoc_string(return_value, "http_output_conv_mimetypes", name);
4318 		}
4319 		if (lang != NULL) {
4320 			if ((name = (char *)mbfl_no_encoding2name(lang->mail_charset)) != NULL) {
4321 				add_assoc_string(return_value, "mail_charset", name);
4322 			}
4323 			if ((name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding)) != NULL) {
4324 				add_assoc_string(return_value, "mail_header_encoding", name);
4325 			}
4326 			if ((name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding)) != NULL) {
4327 				add_assoc_string(return_value, "mail_body_encoding", name);
4328 			}
4329 		}
4330 		add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4331 		if (MBSTRG(encoding_translation)) {
4332 			add_assoc_string(return_value, "encoding_translation", "On");
4333 		} else {
4334 			add_assoc_string(return_value, "encoding_translation", "Off");
4335 		}
4336 		if ((name = (char *)mbfl_no_language2name(MBSTRG(language))) != NULL) {
4337 			add_assoc_string(return_value, "language", name);
4338 		}
4339 		n = MBSTRG(current_detect_order_list_size);
4340 		entry = MBSTRG(current_detect_order_list);
4341 		if (n > 0) {
4342 			size_t i;
4343 			array_init(&row);
4344 			for (i = 0; i < n; i++) {
4345 				add_next_index_string(&row, (*entry)->name);
4346 				entry++;
4347 			}
4348 			add_assoc_zval(return_value, "detect_order", &row);
4349 		}
4350 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4351 			add_assoc_string(return_value, "substitute_character", "none");
4352 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4353 			add_assoc_string(return_value, "substitute_character", "long");
4354 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4355 			add_assoc_string(return_value, "substitute_character", "entity");
4356 		} else {
4357 			add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4358 		}
4359 		if (MBSTRG(strict_detection)) {
4360 			add_assoc_string(return_value, "strict_detection", "On");
4361 		} else {
4362 			add_assoc_string(return_value, "strict_detection", "Off");
4363 		}
4364 	} else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4365 		if (MBSTRG(current_internal_encoding)) {
4366 			RETVAL_STRING((char *)MBSTRG(current_internal_encoding)->name);
4367 		}
4368 	} else if (zend_string_equals_literal_ci(type, "http_input")) {
4369 		if (MBSTRG(http_input_identify)) {
4370 			RETVAL_STRING((char *)MBSTRG(http_input_identify)->name);
4371 		}
4372 	} else if (zend_string_equals_literal_ci(type, "http_output")) {
4373 		if (MBSTRG(current_http_output_encoding)) {
4374 			RETVAL_STRING((char *)MBSTRG(current_http_output_encoding)->name);
4375 		}
4376 	} else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4377 		if ((name = (char *)zend_ini_string("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)) != NULL) {
4378 			RETVAL_STRING(name);
4379 		}
4380 	} else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4381 		if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_charset)) != NULL) {
4382 			RETVAL_STRING(name);
4383 		}
4384 	} else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4385 		if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding)) != NULL) {
4386 			RETVAL_STRING(name);
4387 		}
4388 	} else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4389 		if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding)) != NULL) {
4390 			RETVAL_STRING(name);
4391 		}
4392 	} else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4393 		RETVAL_LONG(MBSTRG(illegalchars));
4394 	} else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4395 		if (MBSTRG(encoding_translation)) {
4396 			RETVAL_STRING("On");
4397 		} else {
4398 			RETVAL_STRING("Off");
4399 		}
4400 	} else if (zend_string_equals_literal_ci(type, "language")) {
4401 		if ((name = (char *)mbfl_no_language2name(MBSTRG(language))) != NULL) {
4402 			RETVAL_STRING(name);
4403 		}
4404 	} else if (zend_string_equals_literal_ci(type, "detect_order")) {
4405 		n = MBSTRG(current_detect_order_list_size);
4406 		entry = MBSTRG(current_detect_order_list);
4407 		if (n > 0) {
4408 			size_t i;
4409 			array_init(return_value);
4410 			for (i = 0; i < n; i++) {
4411 				add_next_index_string(return_value, (*entry)->name);
4412 				entry++;
4413 			}
4414 		}
4415 	} else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4416 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4417 			RETVAL_STRING("none");
4418 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4419 			RETVAL_STRING("long");
4420 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4421 			RETVAL_STRING("entity");
4422 		} else {
4423 			RETVAL_LONG(MBSTRG(current_filter_illegal_substchar));
4424 		}
4425 	} else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4426 		if (MBSTRG(strict_detection)) {
4427 			RETVAL_STRING("On");
4428 		} else {
4429 			RETVAL_STRING("Off");
4430 		}
4431 	} else {
4432 		// TODO Convert to ValueError
4433 		RETURN_FALSE;
4434 	}
4435 }
4436 /* }}} */
4437 
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)4438 MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4439 {
4440 	uint32_t wchar_buf[128];
4441 	unsigned char *in = (unsigned char*)input;
4442 	unsigned int state = 0;
4443 
4444 	if (encoding->check != NULL) {
4445 		return encoding->check(in, length);
4446 	}
4447 
4448 	/* If the input string is not encoded in the given encoding, there is a significant chance
4449 	 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4450 	 * buffer of 128 codepoints, convert and check just a few codepoints first */
4451 	size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4452 	ZEND_ASSERT(out_len <= 8);
4453 	for (int i = 0; i < out_len; i++) {
4454 		if (wchar_buf[i] == MBFL_BAD_INPUT) {
4455 			return 0;
4456 		}
4457 	}
4458 
4459 	while (length) {
4460 		out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4461 		ZEND_ASSERT(out_len <= 128);
4462 		for (int i = 0; i < out_len; i++) {
4463 			if (wchar_buf[i] == MBFL_BAD_INPUT) {
4464 				return 0;
4465 			}
4466 		}
4467 	}
4468 
4469 	return 1;
4470 }
4471 
/* Check every string key and every value of `vars` against `encoding`, descending
 * into nested arrays.
 *
 * String keys and string values are validated with php_mb_check_encoding(); nested
 * arrays are validated recursively. Integer, float, null and boolean values are
 * accepted as they are; a value of any other type makes the array invalid.
 *
 * If `vars` is already marked as being visited, a warning is emitted and 0 is returned.
 *
 * Returns 1 if everything is valid, 0 otherwise. */
static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	zend_long idx;
	zend_string *key;
	zval *entry;
	int valid = 1;

	(void)(idx); /* Suppress spurious compiler warning that `idx` is not used */

	if (GC_IS_RECURSIVE(vars)) {
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return 0;
	}
	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
		ZVAL_DEREF(entry);
		if (key && !php_mb_check_encoding(ZSTR_VAL(key), ZSTR_LEN(key), encoding)) {
			valid = 0;
			break;
		}
		switch (Z_TYPE_P(entry)) {
			case IS_STRING:
				if (!php_mb_check_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), encoding)) {
					valid = 0;
				}
				break;
			case IS_ARRAY:
				if (!php_mb_check_encoding_recursive(Z_ARRVAL_P(entry), encoding)) {
					valid = 0;
				}
				break;
			case IS_LONG:
			case IS_DOUBLE:
			case IS_NULL:
			case IS_TRUE:
			case IS_FALSE:
				break;
			default:
				/* Other types are error. */
				valid = 0;
				break;
		}
		if (!valid) {
			/* A `break` inside the switch only leaves the switch, so the loop has to be
			 * left here; the result cannot become valid again, so stop scanning. */
			break;
		}
	} ZEND_HASH_FOREACH_END();
	/* Always reached after the loop, so the recursion guard is released on every path */
	GC_TRY_UNPROTECT_RECURSION(vars);
	return valid;
}
4522 
4523 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)4524 PHP_FUNCTION(mb_check_encoding)
4525 {
4526 	zend_string *input_str = NULL, *enc = NULL;
4527 	HashTable *input_ht = NULL;
4528 	const mbfl_encoding *encoding;
4529 
4530 	ZEND_PARSE_PARAMETERS_START(0, 2)
4531 		Z_PARAM_OPTIONAL
4532 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
4533 		Z_PARAM_STR_OR_NULL(enc)
4534 	ZEND_PARSE_PARAMETERS_END();
4535 
4536 	encoding = php_mb_get_encoding(enc, 2);
4537 	if (!encoding) {
4538 		RETURN_THROWS();
4539 	}
4540 
4541 	if (input_ht) {
4542 		RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
4543 	} else if (input_str) {
4544 		RETURN_BOOL(php_mb_check_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), encoding));
4545 	} else {
4546 		php_error_docref(NULL, E_DEPRECATED,
4547 			"Calling mb_check_encoding() without argument is deprecated");
4548 
4549 		/* FIXME: Actually check all inputs, except $_FILES file content. */
4550 		RETURN_BOOL(MBSTRG(illegalchars) == 0);
4551 	}
4552 }
4553 /* }}} */
4554 
4555 
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)4556 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
4557 	const uint32_t enc_name_arg_num)
4558 {
4559 	const mbfl_encoding *enc;
4560 	enum mbfl_no_encoding no_enc;
4561 
4562 	ZEND_ASSERT(str_len > 0);
4563 
4564 	enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
4565 	if (!enc) {
4566 		return -2;
4567 	}
4568 
4569 	no_enc = enc->no_encoding;
4570 	if (php_mb_is_unsupported_no_encoding(no_enc)) {
4571 		zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
4572 		return -2;
4573 	}
4574 
4575 	/* Some legacy text encodings have a minimum required wchar buffer size;
4576 	 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
4577 	uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
4578 	unsigned int state = 0;
4579 	size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
4580 	ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
4581 
4582 	if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
4583 		return -1;
4584 	}
4585 	return wchar_buf[0];
4586 }
4587 
4588 
4589 /* {{{ */
PHP_FUNCTION(mb_ord)4590 PHP_FUNCTION(mb_ord)
4591 {
4592 	char *str;
4593 	size_t str_len;
4594 	zend_string *enc = NULL;
4595 	zend_long cp;
4596 
4597 	ZEND_PARSE_PARAMETERS_START(1, 2)
4598 		Z_PARAM_STRING(str, str_len)
4599 		Z_PARAM_OPTIONAL
4600 		Z_PARAM_STR_OR_NULL(enc)
4601 	ZEND_PARSE_PARAMETERS_END();
4602 
4603 	if (str_len == 0) {
4604 		zend_argument_value_error(1, "must not be empty");
4605 		RETURN_THROWS();
4606 	}
4607 
4608 	cp = php_mb_ord(str, str_len, enc, 2);
4609 
4610 	if (0 > cp) {
4611 		if (cp == -2) {
4612 			RETURN_THROWS();
4613 		}
4614 		RETURN_FALSE;
4615 	}
4616 
4617 	RETURN_LONG(cp);
4618 }
4619 /* }}} */
4620 
4621 
/* Build a string holding the single codepoint `cp` in the encoding named by `enc_name`.
 *
 * Returns NULL if the encoding cannot be resolved, if mb_chr() does not support it
 * (a ValueError is raised in that case), if `cp` is outside 0..0x10ffff, if the
 * target is UTF-8 and `cp` lies in 0xd800..0xdfff, or if converting the codepoint
 * to the target encoding reports illegal characters. */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *encoding = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (encoding == NULL) {
		return NULL;
	}

	const enum mbfl_no_encoding no_enc = encoding->no_encoding;
	if (php_mb_is_unsupported_no_encoding(no_enc)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", encoding->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	if (php_mb_is_no_encoding_utf8(no_enc)) {
		/* UTF-8 is produced directly, without going through the converter */
		if (cp >= 0xd800 && cp <= 0xdfff) {
			return NULL;
		}

		if (cp < 0x80) {
			return ZSTR_CHAR(cp);
		}

		size_t len;
		unsigned char lead;
		if (cp < 0x800) {
			len = 2;
			lead = 0xc0;
		} else if (cp < 0x10000) {
			len = 3;
			lead = 0xe0;
		} else {
			len = 4;
			lead = 0xf0;
		}

		zend_string *utf8 = zend_string_alloc(len, 0);
		char *out = ZSTR_VAL(utf8);
		zend_long rest = cp;

		/* Fill the continuation bytes from the last one backwards, six bits at a time;
		 * whatever remains afterwards belongs in the lead byte */
		for (size_t i = len - 1; i > 0; i--) {
			out[i] = 0x80 | (rest & 0x3f);
			rest >>= 6;
		}
		out[0] = lead | rest;
		out[len] = 0;

		return utf8;
	}

	/* For every other encoding, serialize the codepoint as four big-endian bytes
	 * and convert from UCS-4BE to the target encoding */
	char ucs4be[4];
	ucs4be[0] = (cp >> 24) & 0xff;
	ucs4be[1] = (cp >> 16) & 0xff;
	ucs4be[2] = (cp >>  8) & 0xff;
	ucs4be[3] = cp & 0xff;

	/* Zero the illegal-character counter around the conversion so that a non-zero
	 * value afterwards can only come from this conversion; the caller's count is
	 * put back afterwards */
	long saved_illegalchars = MBSTRG(illegalchars);
	MBSTRG(illegalchars) = 0;
	zend_string *converted = php_mb_convert_encoding_ex(ucs4be, 4, encoding, &mbfl_encoding_ucs4be);
	const int conversion_failed = (MBSTRG(illegalchars) != 0);
	MBSTRG(illegalchars) = saved_illegalchars;

	if (conversion_failed) {
		zend_string_release(converted);
		return NULL;
	}
	return converted;
}
4691 
4692 
4693 /* {{{ */
PHP_FUNCTION(mb_chr)4694 PHP_FUNCTION(mb_chr)
4695 {
4696 	zend_long cp;
4697 	zend_string *enc = NULL;
4698 
4699 	ZEND_PARSE_PARAMETERS_START(1, 2)
4700 		Z_PARAM_LONG(cp)
4701 		Z_PARAM_OPTIONAL
4702 		Z_PARAM_STR_OR_NULL(enc)
4703 	ZEND_PARSE_PARAMETERS_END();
4704 
4705 	zend_string* ret = php_mb_chr(cp, enc, 2);
4706 	if (ret == NULL) {
4707 		RETURN_FALSE;
4708 	}
4709 
4710 	RETURN_STR(ret);
4711 }
4712 /* }}} */
4713 
4714 /* {{{ */
PHP_FUNCTION(mb_scrub)4715 PHP_FUNCTION(mb_scrub)
4716 {
4717 	char* str;
4718 	size_t str_len;
4719 	zend_string *enc_name = NULL;
4720 
4721 	ZEND_PARSE_PARAMETERS_START(1, 2)
4722 		Z_PARAM_STRING(str, str_len)
4723 		Z_PARAM_OPTIONAL
4724 		Z_PARAM_STR_OR_NULL(enc_name)
4725 	ZEND_PARSE_PARAMETERS_END();
4726 
4727 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
4728 	if (!enc) {
4729 		RETURN_THROWS();
4730 	}
4731 
4732 	RETURN_STR(php_mb_convert_encoding_ex(str, str_len, enc, enc));
4733 }
4734 /* }}} */
4735 
4736 
4737 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)4738 static void php_mb_populate_current_detect_order_list(void)
4739 {
4740 	const mbfl_encoding **entry = 0;
4741 	size_t nentries;
4742 
4743 	if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
4744 		nentries = MBSTRG(detect_order_list_size);
4745 		entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
4746 		memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
4747 	} else {
4748 		const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
4749 		size_t i;
4750 		nentries = MBSTRG(default_detect_order_list_size);
4751 		entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
4752 		for (i = 0; i < nentries; i++) {
4753 			entry[i] = mbfl_no2encoding(src[i]);
4754 		}
4755 	}
4756 	MBSTRG(current_detect_order_list) = entry;
4757 	MBSTRG(current_detect_order_list_size) = nentries;
4758 }
4759 /* }}} */
4760 
4761 /* {{{ static int php_mb_encoding_translation() */
php_mb_encoding_translation(void)4762 static int php_mb_encoding_translation(void)
4763 {
4764 	return MBSTRG(encoding_translation);
4765 }
4766 /* }}} */
4767 
php_mb_mbchar_bytes(const char * s,const mbfl_encoding * enc)4768 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
4769 {
4770 	if (enc) {
4771 		if (enc->mblen_table) {
4772 			if (s) {
4773 				return enc->mblen_table[*(unsigned char *)s];
4774 			}
4775 		} else if (enc->flag & MBFL_ENCTYPE_WCS2) {
4776 			return 2;
4777 		} else if (enc->flag & MBFL_ENCTYPE_WCS4) {
4778 			return 4;
4779 		}
4780 	}
4781 	return 1;
4782 }
4783 
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)4784 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
4785 {
4786 	const char *p = s;
4787 	char *last=NULL;
4788 
4789 	if (nbytes == (size_t)-1) {
4790 		size_t nb = 0;
4791 
4792 		while (*p != '\0') {
4793 			if (nb == 0) {
4794 				if ((unsigned char)*p == (unsigned char)c) {
4795 					last = (char *)p;
4796 				}
4797 				nb = php_mb_mbchar_bytes(p, enc);
4798 				if (nb == 0) {
4799 					return NULL; /* something is going wrong! */
4800 				}
4801 			}
4802 			--nb;
4803 			++p;
4804 		}
4805 	} else {
4806 		size_t bcnt = nbytes;
4807 		size_t nbytes_char;
4808 		while (bcnt > 0) {
4809 			if ((unsigned char)*p == (unsigned char)c) {
4810 				last = (char *)p;
4811 			}
4812 			nbytes_char = php_mb_mbchar_bytes(p, enc);
4813 			if (bcnt < nbytes_char) {
4814 				return NULL;
4815 			}
4816 			p += nbytes_char;
4817 			bcnt -= nbytes_char;
4818 		}
4819 	}
4820 	return last;
4821 }
4822 
4823 /* {{{ MBSTRING_API int php_mb_stripos() */
php_mb_stripos(int mode,const char * old_haystack,size_t old_haystack_len,const char * old_needle,size_t old_needle_len,zend_long offset,const mbfl_encoding * enc)4824 MBSTRING_API size_t php_mb_stripos(int mode, const char *old_haystack, size_t old_haystack_len, const char *old_needle, size_t old_needle_len, zend_long offset, const mbfl_encoding *enc)
4825 {
4826 	size_t n = (size_t) -1;
4827 	mbfl_string haystack, needle;
4828 
4829 	mbfl_string_init_set(&haystack, enc);
4830 	mbfl_string_init_set(&needle, enc);
4831 
4832 	do {
4833 		/* We're using simple case-folding here, because we'd have to deal with remapping of
4834 		 * offsets otherwise. */
4835 
4836 		size_t len = 0;
4837 		haystack.val = (unsigned char *)mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char *)old_haystack, old_haystack_len, &len, enc);
4838 		haystack.len = len;
4839 
4840 		if (!haystack.val) {
4841 			break;
4842 		}
4843 
4844 		if (haystack.len == 0) {
4845 			break;
4846 		}
4847 
4848 		needle.val = (unsigned char *)mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char *)old_needle, old_needle_len, &len, enc);
4849 		needle.len = len;
4850 
4851 		if (!needle.val) {
4852 			break;
4853 		}
4854 
4855 		n = mbfl_strpos(&haystack, &needle, offset, mode);
4856 	} while(0);
4857 
4858 	if (haystack.val) {
4859 		efree(haystack.val);
4860 	}
4861 
4862 	if (needle.val) {
4863 		efree(needle.val);
4864 	}
4865 
4866 	return n;
4867 }
4868 /* }}} */
4869 
php_mb_gpc_get_detect_order(const zend_encoding *** list,size_t * list_size)4870 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
4871 {
4872 	*list = (const zend_encoding **)MBSTRG(http_input_list);
4873 	*list_size = MBSTRG(http_input_list_size);
4874 }
4875 /* }}} */
4876 
php_mb_gpc_set_input_encoding(const zend_encoding * encoding)4877 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
4878 {
4879 	MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
4880 }
4881 /* }}} */
4882