xref: /PHP-8.2/ext/mbstring/mbstring.c (revision 8128d17c)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp>              |
14    |         Rui Hirokawa <hirokawa@php.net>                              |
15    |         Hironori Sato <satoh@jpnnet.com>                             |
16    |         Shigeru Kanemoto <sgk@happysize.co.jp>                       |
17    +----------------------------------------------------------------------+
18 */
19 
20 /* {{{ includes */
21 #include <limits.h>
22 
23 #include "libmbfl/config.h"
24 #include "php.h"
25 #include "php_ini.h"
26 #include "php_variables.h"
27 #include "mbstring.h"
28 #include "ext/standard/php_string.h"
29 #include "ext/standard/php_mail.h"
30 #include "ext/standard/exec.h"
31 #include "ext/standard/url.h"
32 #include "main/php_output.h"
33 #include "ext/standard/info.h"
34 #include "ext/pcre/php_pcre.h"
35 
36 #include "libmbfl/mbfl/mbfilter_8bit.h"
37 #include "libmbfl/mbfl/mbfilter_pass.h"
38 #include "libmbfl/mbfl/mbfilter_wchar.h"
39 #include "libmbfl/mbfl/eaw_table.h"
40 #include "libmbfl/filters/mbfilter_base64.h"
41 #include "libmbfl/filters/mbfilter_cjk.h"
42 #include "libmbfl/filters/mbfilter_qprint.h"
43 #include "libmbfl/filters/mbfilter_htmlent.h"
44 #include "libmbfl/filters/mbfilter_uuencode.h"
45 #include "libmbfl/filters/mbfilter_ucs4.h"
46 #include "libmbfl/filters/mbfilter_utf8.h"
47 #include "libmbfl/filters/mbfilter_utf16.h"
48 #include "libmbfl/filters/mbfilter_singlebyte.h"
49 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
50 #include "libmbfl/filters/unicode_prop.h"
51 
52 #include "php_variables.h"
53 #include "php_globals.h"
54 #include "rfc1867.h"
55 #include "php_content_types.h"
56 #include "SAPI.h"
57 #include "php_unicode.h"
58 #include "TSRM.h"
59 
60 #include "mb_gpc.h"
61 
62 #ifdef HAVE_MBREGEX
63 # include "php_mbregex.h"
64 #endif
65 
66 #include "zend_smart_str.h"
67 #include "zend_multibyte.h"
68 #include "mbstring_arginfo.h"
69 
70 #include "rare_cp_bitvec.h"
71 
72 /* }}} */
73 
74 /* {{{ prototypes */
75 ZEND_DECLARE_MODULE_GLOBALS(mbstring)
76 
77 static PHP_GINIT_FUNCTION(mbstring);
78 static PHP_GSHUTDOWN_FUNCTION(mbstring);
79 
80 static void php_mb_populate_current_detect_order_list(void);
81 
82 static int php_mb_encoding_translation(void);
83 
84 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);
85 
86 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
87 
88 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
89 
90 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
91 
92 static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);
93 
94 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);
95 
96 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);
97 
98 /* See mbfilter_cp5022x.c */
99 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
100 /* }}} */
101 
102 /* {{{ php_mb_default_identify_list */
103 typedef struct _php_mb_nls_ident_list {
104 	enum mbfl_no_language lang;
105 	const enum mbfl_no_encoding *list;
106 	size_t list_size;
107 } php_mb_nls_ident_list;
108 
109 static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
110 	mbfl_no_encoding_ascii,
111 	mbfl_no_encoding_jis,
112 	mbfl_no_encoding_utf8,
113 	mbfl_no_encoding_euc_jp,
114 	mbfl_no_encoding_sjis
115 };
116 
117 static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
118 	mbfl_no_encoding_ascii,
119 	mbfl_no_encoding_utf8,
120 	mbfl_no_encoding_euc_cn,
121 	mbfl_no_encoding_cp936
122 };
123 
124 static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
125 	mbfl_no_encoding_ascii,
126 	mbfl_no_encoding_utf8,
127 	mbfl_no_encoding_euc_tw,
128 	mbfl_no_encoding_big5
129 };
130 
131 static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
132 	mbfl_no_encoding_ascii,
133 	mbfl_no_encoding_utf8,
134 	mbfl_no_encoding_euc_kr,
135 	mbfl_no_encoding_uhc
136 };
137 
138 static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
139 	mbfl_no_encoding_ascii,
140 	mbfl_no_encoding_utf8,
141 	mbfl_no_encoding_koi8r,
142 	mbfl_no_encoding_cp1251,
143 	mbfl_no_encoding_cp866
144 };
145 
146 static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
147 	mbfl_no_encoding_ascii,
148 	mbfl_no_encoding_utf8,
149 	mbfl_no_encoding_armscii8
150 };
151 
152 static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
153 	mbfl_no_encoding_ascii,
154 	mbfl_no_encoding_utf8,
155 	mbfl_no_encoding_cp1254,
156 	mbfl_no_encoding_8859_9
157 };
158 
159 static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
160 	mbfl_no_encoding_ascii,
161 	mbfl_no_encoding_utf8,
162 	mbfl_no_encoding_koi8u
163 };
164 
165 static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
166 	mbfl_no_encoding_ascii,
167 	mbfl_no_encoding_utf8
168 };
169 
170 
171 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
172 	{ mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
173 	{ mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
174 	{ mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
175 	{ mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
176 	{ mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
177 	{ mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
178 	{ mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
179 	{ mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
180 	{ mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
181 };
182 
183 /* }}} */
184 
185 /* {{{ mbstring_deps[] */
/* Module dependency table: mbstring declares "pcre" as a required module.
 * This file calls into PCRE2 (pcre2_compile, pcre2_match) through
 * ext/pcre/php_pcre.h. */
186 static const zend_module_dep mbstring_deps[] = {
187 	ZEND_MOD_REQUIRED("pcre")
188 	ZEND_MOD_END
189 };
190 /* }}} */
191 
192 /* {{{ zend_module_entry mbstring_module_entry */
/* Module descriptor for mbstring. It wires together the dependency table,
 * the function table (ext_functions), the module/request startup and
 * shutdown hooks, the info hook, the version string, and the per-module
 * globals with their GINIT/GSHUTDOWN callbacks. */
193 zend_module_entry mbstring_module_entry = {
194 	STANDARD_MODULE_HEADER_EX,
195 	NULL,
196 	mbstring_deps,
197 	"mbstring",
198 	ext_functions,
199 	PHP_MINIT(mbstring),
200 	PHP_MSHUTDOWN(mbstring),
201 	PHP_RINIT(mbstring),
202 	PHP_RSHUTDOWN(mbstring),
203 	PHP_MINFO(mbstring),
204 	PHP_MBSTRING_VERSION,
205 	PHP_MODULE_GLOBALS(mbstring),
206 	PHP_GINIT(mbstring),
207 	PHP_GSHUTDOWN(mbstring),
208 	NULL,
209 	STANDARD_MODULE_PROPERTIES_EX
210 };
211 /* }}} */
212 
213 /* {{{ static sapi_post_entry php_post_entries[] */
/* POST handler table that pairs the default form content type with
 * php_std_post_handler and the multipart content type with
 * rfc1867_post_handler. The all-NULL row terminates the table. */
214 static const sapi_post_entry php_post_entries[] = {
215 	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data,	php_std_post_handler },
216 	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
217 	{ NULL, 0, NULL, NULL }
218 };
219 /* }}} */
220 
/* Only compiled when COMPILE_DL_MBSTRING is defined: expands
 * ZEND_GET_MODULE(mbstring), and additionally ZEND_TSRMLS_CACHE_DEFINE()
 * when ZTS is defined.
 * NOTE(review): presumably this is the shared-object build path that exports
 * the module lookup entry point; confirm against the build configuration. */
221 #ifdef COMPILE_DL_MBSTRING
222 #ifdef ZTS
223 ZEND_TSRMLS_CACHE_DEFINE()
224 #endif
225 ZEND_GET_MODULE(mbstring)
226 #endif
227 
228 /* {{{ static sapi_post_entry mbstr_post_entries[] */
/* mbstring's variant of the POST handler table: identical to
 * php_post_entries except that the default form content type is handled by
 * php_mb_post_handler. The all-NULL row terminates the table. */
229 static const sapi_post_entry mbstr_post_entries[] = {
230 	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
231 	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
232 	{ NULL, 0, NULL, NULL }
233 };
234 /* }}} */
235 
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)236 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
237 	if (encoding_name) {
238 		const mbfl_encoding *encoding;
239 		zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
240 		if (last_encoding_name && (last_encoding_name == encoding_name
241 				|| zend_string_equals_ci(encoding_name, last_encoding_name))) {
242 			return MBSTRG(last_used_encoding);
243 		}
244 
245 		encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
246 		if (!encoding) {
247 			zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
248 			return NULL;
249 		} else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
250 			if (encoding == &mbfl_encoding_base64) {
251 				php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
252 			} else if (encoding == &mbfl_encoding_qprint) {
253 				php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
254 			} else if (encoding == &mbfl_encoding_html_ent) {
255 				php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
256 			} else if (encoding == &mbfl_encoding_uuencode) {
257 				php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
258 			}
259 		}
260 
261 		if (last_encoding_name) {
262 			zend_string_release(last_encoding_name);
263 		}
264 		MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
265 		MBSTRG(last_used_encoding) = encoding;
266 		return encoding;
267 	} else {
268 		return MBSTRG(current_internal_encoding);
269 	}
270 }
271 
php_mb_get_encoding_or_pass(const char * encoding_name)272 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name) {
273 	if (strcmp(encoding_name, "pass") == 0) {
274 		return &mbfl_encoding_pass;
275 	}
276 
277 	return mbfl_name2encoding(encoding_name);
278 }
279 
count_commas(const char * p,const char * end)280 static size_t count_commas(const char *p, const char *end) {
281 	size_t count = 0;
282 	while ((p = memchr(p, ',', end - p))) {
283 		count++;
284 		p++;
285 	}
286 	return count;
287 }
288 
289 /* {{{ static zend_result php_mb_parse_encoding_list()
290  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
291  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
292  */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num)293 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
294 	const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num)
295 {
296 	if (value == NULL || value_length == 0) {
297 		*return_list = NULL;
298 		*return_size = 0;
299 		return SUCCESS;
300 	} else {
301 		bool included_auto;
302 		size_t n, size;
303 		const char *p1, *endp, *tmpstr;
304 		const mbfl_encoding **entry, **list;
305 
306 		if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
307 			tmpstr = value + 1;
308 			value_length -= 2;
309 		} else {
310 			tmpstr = value;
311 		}
312 
313 		endp = tmpstr + value_length;
314 		size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
315 		list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
316 		entry = list;
317 		n = 0;
318 		included_auto = 0;
319 		p1 = tmpstr;
320 		while (1) {
321 			const char *comma = memchr(p1, ',', endp - p1);
322 			const char *p = comma ? comma : endp;
323 			/* trim spaces */
324 			while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
325 				p1++;
326 			}
327 			p--;
328 			while (p > p1 && (*p == ' ' || *p == '\t')) {
329 				p--;
330 			}
331 			size_t p1_length = p - p1 + 1;
332 			/* convert to the encoding number and check encoding */
333 			if (strncasecmp(p1, "auto", p1_length) == 0) {
334 				if (!included_auto) {
335 					const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
336 					const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
337 					size_t i;
338 					included_auto = 1;
339 					for (i = 0; i < identify_list_size; i++) {
340 						*entry++ = mbfl_no2encoding(*src++);
341 						n++;
342 					}
343 				}
344 			} else {
345 				const mbfl_encoding *encoding = mbfl_name2encoding_ex(p1, p1_length);
346 				if (!encoding) {
347 					/* Called from an INI setting modification */
348 					if (arg_num == 0) {
349 						php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%.*s\"", (int) p1_length, p1);
350 					} else {
351 						zend_argument_value_error(arg_num, "contains invalid encoding \"%.*s\"", (int) p1_length, p1);
352 					}
353 					pefree(ZEND_VOIDP(list), persistent);
354 					return FAILURE;
355 				}
356 
357 				*entry++ = encoding;
358 				n++;
359 			}
360 			if (n >= size || comma == NULL) {
361 				break;
362 			}
363 			p1 = comma + 1;
364 		}
365 		*return_list = list;
366 		*return_size = n;
367 	}
368 
369 	return SUCCESS;
370 }
371 /* }}} */
372 
373 /* {{{
374  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
375  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
376  */
php_mb_parse_encoding_array(HashTable * target_hash,const mbfl_encoding *** return_list,size_t * return_size,uint32_t arg_num)377 static zend_result php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
378 	size_t *return_size, uint32_t arg_num)
379 {
380 	/* Allocate enough space to include the default detect order if "auto" is used. */
381 	size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
382 	const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
383 	const mbfl_encoding **entry = list;
384 	bool included_auto = 0;
385 	size_t n = 0;
386 	zval *hash_entry;
387 	ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
388 		zend_string *encoding_str = zval_try_get_string(hash_entry);
389 		if (UNEXPECTED(!encoding_str)) {
390 			efree(ZEND_VOIDP(list));
391 			return FAILURE;
392 		}
393 
394 		if (zend_string_equals_literal_ci(encoding_str, "auto")) {
395 			if (!included_auto) {
396 				const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
397 				const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
398 				size_t j;
399 
400 				included_auto = 1;
401 				for (j = 0; j < identify_list_size; j++) {
402 					*entry++ = mbfl_no2encoding(*src++);
403 					n++;
404 				}
405 			}
406 		} else {
407 			const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
408 			if (encoding) {
409 				*entry++ = encoding;
410 				n++;
411 			} else {
412 				zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
413 				zend_string_release(encoding_str);
414 				efree(ZEND_VOIDP(list));
415 				return FAILURE;
416 			}
417 		}
418 		zend_string_release(encoding_str);
419 	} ZEND_HASH_FOREACH_END();
420 	*return_list = list;
421 	*return_size = n;
422 	return SUCCESS;
423 }
424 /* }}} */
425 
426 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)427 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
428 {
429 	return (const zend_encoding*)mbfl_name2encoding(encoding_name);
430 }
431 
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)432 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
433 {
434 	return ((const mbfl_encoding *)encoding)->name;
435 }
436 
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)437 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
438 {
439 	const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
440 	return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
441 }
442 
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)443 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
444 {
445 	if (!list) {
446 		list = (const zend_encoding**)MBSTRG(current_detect_order_list);
447 		list_size = MBSTRG(current_detect_order_list_size);
448 	}
449 	if (list_size == 1 && ((mbfl_encoding*)*list) == &mbfl_encoding_pass) {
450 		/* Emulate behavior of previous implementation; it would never return "pass"
451 		 * from an encoding auto-detection operation */
452 		return NULL;
453 	}
454 	return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding**)list, list_size, false, false);
455 }
456 
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)457 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
458 {
459 	unsigned int num_errors = 0;
460 	zend_string *result = mb_fast_convert((unsigned char*)from, from_length, (const mbfl_encoding*)encoding_from, (const mbfl_encoding*)encoding_to, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
461 
462 	*to_length = ZSTR_LEN(result);
463 	*to = emalloc(ZSTR_LEN(result) + 1); /* Include terminating null byte */
464 	memcpy(*to, ZSTR_VAL(result), ZSTR_LEN(result) + 1);
465 	zend_string_free(result);
466 
467 	return from_length;
468 }
469 
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)470 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
471 {
472 	return php_mb_parse_encoding_list(
473 		encoding_list, encoding_list_len,
474 		(const mbfl_encoding ***)return_list, return_size,
475 		persistent, /* arg_num */ 0);
476 }
477 
php_mb_zend_internal_encoding_getter(void)478 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
479 {
480 	return (const zend_encoding *)MBSTRG(internal_encoding);
481 }
482 
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)483 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
484 {
485 	MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
486 	return SUCCESS;
487 }
488 
/* Table of the zend_multibyte hooks implemented above, labelled "mbstring".
 * Entries are positional: fetcher, name getter, lexer compatibility checker,
 * detector, converter, list parser, internal encoding getter and setter. */
489 static zend_multibyte_functions php_mb_zend_multibyte_functions = {
490 	"mbstring",
491 	php_mb_zend_encoding_fetcher,
492 	php_mb_zend_encoding_name_getter,
493 	php_mb_zend_encoding_lexer_compatibility_checker,
494 	php_mb_zend_encoding_detector,
495 	php_mb_zend_encoding_converter,
496 	php_mb_zend_encoding_list_parser,
497 	php_mb_zend_internal_encoding_getter,
498 	php_mb_zend_internal_encoding_setter
499 };
500 /* }}} */
501 
502 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)503 static void *_php_mb_compile_regex(const char *pattern)
504 {
505 	pcre2_code *retval;
506 	PCRE2_SIZE err_offset;
507 	int errnum;
508 
509 	if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
510 			PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
511 		PCRE2_UCHAR err_str[128];
512 		pcre2_get_error_message(errnum, err_str, sizeof(err_str));
513 		php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
514 	}
515 	return retval;
516 }
517 /* }}} */
518 
519 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)520 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
521 {
522 	int res;
523 
524 	pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
525 	if (NULL == match_data) {
526 		pcre2_code_free(opaque);
527 		php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
528 		return FAILURE;
529 	}
530 	res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
531 	php_pcre_free_match_data(match_data);
532 
533 	return res;
534 }
535 /* }}} */
536 
537 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)538 static void _php_mb_free_regex(void *opaque)
539 {
540 	pcre2_code_free(opaque);
541 }
542 /* }}} */
543 
544 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)545 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
546 {
547 	size_t i;
548 
549 	*plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
550 	*plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
551 
552 	for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
553 		if (php_mb_default_identify_list[i].lang == lang) {
554 			*plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
555 			*plist_size = php_mb_default_identify_list[i].list_size;
556 			return 1;
557 		}
558 	}
559 	return 0;
560 }
561 /* }}} */
562 
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)563 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
564 {
565 	char *result = emalloc(len + 2);
566 	char *resp = result;
567 	size_t i;
568 
569 	for (i = 0; i < len && start[i] != quote; ++i) {
570 		if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
571 			*resp++ = start[++i];
572 		} else {
573 			size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
574 
575 			while (j-- > 0 && i < len) {
576 				*resp++ = start[i++];
577 			}
578 			--i;
579 		}
580 	}
581 
582 	*resp = '\0';
583 	return result;
584 }
585 
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)586 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
587 {
588 	char *pos = *line, quote;
589 	char *res;
590 
591 	while (*pos && *pos != stop) {
592 		if ((quote = *pos) == '"' || quote == '\'') {
593 			++pos;
594 			while (*pos && *pos != quote) {
595 				if (*pos == '\\' && pos[1] && pos[1] == quote) {
596 					pos += 2;
597 				} else {
598 					++pos;
599 				}
600 			}
601 			if (*pos) {
602 				++pos;
603 			}
604 		} else {
605 			pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
606 
607 		}
608 	}
609 	if (*pos == '\0') {
610 		res = estrdup(*line);
611 		*line += strlen(*line);
612 		return res;
613 	}
614 
615 	res = estrndup(*line, pos - *line);
616 
617 	while (*pos == stop) {
618 		pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
619 	}
620 
621 	*line = pos;
622 	return res;
623 }
624 /* }}} */
625 
/* Extract one configuration word from `str` as a new string.
 *
 * Leading whitespace is skipped. An empty remainder yields "". If the word
 * starts with a single or double quote, everything after that quote is
 * handed to php_mb_rfc1867_substring_conf() with the quote character;
 * otherwise the run up to the next whitespace is handed over with no quote. */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	for (; *str && isspace(*(unsigned char *)str); ++str) {
		/* skip leading whitespace */
	}

	if (*str == '\0') {
		return estrdup("");
	}

	const char first = *str;
	if (first == '"' || first == '\'') {
		char *quoted = str + 1;
		return php_mb_rfc1867_substring_conf(encoding, quoted, strlen(quoted), first);
	}

	size_t word_len = 0;
	while (str[word_len] && !isspace((unsigned char)str[word_len])) {
		++word_len;
	}
	return php_mb_rfc1867_substring_conf(encoding, str, word_len, 0);
}
650 /* }}} */
651 
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)652 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
653 {
654 	char *s, *s2;
655 	const size_t filename_len = strlen(filename);
656 
657 	/* The \ check should technically be needed for win32 systems only where
658 	 * it is a valid path separator. However, IE in all it's wisdom always sends
659 	 * the full path of the file on the user's filesystem, which means that unless
660 	 * the user does basename() they get a bogus file name. Until IE's user base drops
661 	 * to nill or problem is fixed this code must remain enabled for all systems. */
662 	s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
663 	s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
664 
665 	if (s && s2) {
666 		if (s > s2) {
667 			return ++s;
668 		} else {
669 			return ++s2;
670 		}
671 	} else if (s) {
672 		return ++s;
673 	} else if (s2) {
674 		return ++s2;
675 	} else {
676 		return filename;
677 	}
678 }
679 /* }}} */
680 
681 /* {{{ php.ini directive handler */
682 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)683 static PHP_INI_MH(OnUpdate_mbstring_language)
684 {
685 	enum mbfl_no_language no_language;
686 
687 	no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
688 	if (no_language == mbfl_no_language_invalid) {
689 		MBSTRG(language) = mbfl_no_language_neutral;
690 		return FAILURE;
691 	}
692 	MBSTRG(language) = no_language;
693 	php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
694 	return SUCCESS;
695 }
696 /* }}} */
697 
698 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)699 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
700 {
701 	const mbfl_encoding **list;
702 	size_t size;
703 
704 	if (!new_value) {
705 		if (MBSTRG(detect_order_list)) {
706 			pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
707 		}
708 		MBSTRG(detect_order_list) = NULL;
709 		MBSTRG(detect_order_list_size) = 0;
710 		return SUCCESS;
711 	}
712 
713 	if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
714 		return FAILURE;
715 	}
716 
717 	if (MBSTRG(detect_order_list)) {
718 		pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
719 	}
720 	MBSTRG(detect_order_list) = list;
721 	MBSTRG(detect_order_list_size) = size;
722 	return SUCCESS;
723 }
724 /* }}} */
725 
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)726 static zend_result _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
727 	const mbfl_encoding **list;
728 	size_t size;
729 	if (new_value_length == 4 && strncmp(new_value, "pass", 4) == 0) {
730 		list = (const mbfl_encoding**)pecalloc(1, sizeof(mbfl_encoding*), 1);
731 		*list = &mbfl_encoding_pass;
732 		size = 1;
733 	} else if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
734 		return FAILURE;
735 	}
736 	if (MBSTRG(http_input_list)) {
737 		pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
738 	}
739 	MBSTRG(http_input_list) = list;
740 	MBSTRG(http_input_list_size) = size;
741 	return SUCCESS;
742 }
743 
744 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)745 static PHP_INI_MH(OnUpdate_mbstring_http_input)
746 {
747 	if (new_value) {
748 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
749 	}
750 
751 	if (!new_value || !ZSTR_LEN(new_value)) {
752 		const char *encoding = php_get_input_encoding();
753 		MBSTRG(http_input_set) = 0;
754 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
755 		return SUCCESS;
756 	}
757 
758 	MBSTRG(http_input_set) = 1;
759 	return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
760 }
761 /* }}} */
762 
_php_mb_ini_mbstring_http_output_set(const char * new_value)763 static zend_result _php_mb_ini_mbstring_http_output_set(const char *new_value) {
764 	const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value);
765 	if (!encoding) {
766 		return FAILURE;
767 	}
768 
769 	MBSTRG(http_output_encoding) = encoding;
770 	MBSTRG(current_http_output_encoding) = encoding;
771 	return SUCCESS;
772 }
773 
774 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)775 static PHP_INI_MH(OnUpdate_mbstring_http_output)
776 {
777 	if (new_value) {
778 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
779 	}
780 
781 	if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
782 		MBSTRG(http_output_set) = 0;
783 		_php_mb_ini_mbstring_http_output_set(php_get_output_encoding());
784 		return SUCCESS;
785 	}
786 
787 	MBSTRG(http_output_set) = 1;
788 	return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value));
789 }
790 /* }}} */
791 
792 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)793 static zend_result _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
794 {
795 	const mbfl_encoding *encoding;
796 
797 	if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
798 		/* falls back to UTF-8 if an unknown encoding name is given */
799 		if (new_value) {
800 			php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
801 		}
802 		encoding = &mbfl_encoding_utf8;
803 	}
804 	MBSTRG(internal_encoding) = encoding;
805 	MBSTRG(current_internal_encoding) = encoding;
806 #ifdef HAVE_MBREGEX
807 	{
808 		const char *enc_name = new_value;
809 		if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
810 			/* falls back to UTF-8 if an unknown encoding name is given */
811 			enc_name = "UTF-8";
812 			php_mb_regex_set_default_mbctype(enc_name);
813 		}
814 		php_mb_regex_set_mbctype(new_value);
815 	}
816 #endif
817 	return SUCCESS;
818 }
819 /* }}} */
820 
821 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)822 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
823 {
824 	if (new_value) {
825 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
826 	}
827 
828 	if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
829 		return FAILURE;
830 	}
831 
832 	if (new_value && ZSTR_LEN(new_value)) {
833 		MBSTRG(internal_encoding_set) = 1;
834 		return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
835 	} else {
836 		const char *encoding = php_get_internal_encoding();
837 		MBSTRG(internal_encoding_set) = 0;
838 		return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
839 	}
840 }
841 /* }}} */
842 
843 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)844 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
845 {
846 	if (new_value != NULL) {
847 		if (zend_string_equals_literal_ci(new_value, "none")) {
848 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
849 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
850 		} else if (zend_string_equals_literal_ci(new_value, "long")) {
851 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
852 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
853 		} else if (zend_string_equals_literal_ci(new_value, "entity")) {
854 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
855 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
856 		} else {
857 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
858 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
859 			if (ZSTR_LEN(new_value) > 0) {
860 				char *endptr = NULL;
861 				int c = strtol(ZSTR_VAL(new_value), &endptr, 0);
862 
863 				if (*endptr == '\0') {
864 					MBSTRG(filter_illegal_substchar) = c;
865 					MBSTRG(current_filter_illegal_substchar) = c;
866 				}
867 			}
868 		}
869 	} else {
870 		MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
871 		MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
872 		MBSTRG(filter_illegal_substchar) = '?';
873 		MBSTRG(current_filter_illegal_substchar) = '?';
874 	}
875 
876 	return SUCCESS;
877 }
878 /* }}} */
879 
880 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)881 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
882 {
883 	if (new_value == NULL) {
884 		return FAILURE;
885 	}
886 
887 	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
888 
889 	if (MBSTRG(encoding_translation)) {
890 		sapi_unregister_post_entry(php_post_entries);
891 		sapi_register_post_entries(mbstr_post_entries);
892 	} else {
893 		sapi_unregister_post_entry(mbstr_post_entries);
894 		sapi_register_post_entries(php_post_entries);
895 	}
896 
897 	return SUCCESS;
898 }
899 /* }}} */
900 
/* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes) */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)902 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
903 {
904 	zend_string *tmp;
905 	void *re = NULL;
906 
907 	if (!new_value) {
908 		new_value = entry->orig_value;
909 	}
910 	tmp = php_trim(new_value, NULL, 0, 3);
911 
912 	if (ZSTR_LEN(tmp) > 0) {
913 		if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
914 			zend_string_release_ex(tmp, 0);
915 			return FAILURE;
916 		}
917 	}
918 
919 	if (MBSTRG(http_output_conv_mimetypes)) {
920 		_php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
921 	}
922 
923 	MBSTRG(http_output_conv_mimetypes) = re;
924 
925 	zend_string_release_ex(tmp, 0);
926 	return SUCCESS;
927 }
928 /* }}} */
929 /* }}} */
930 
931 /* {{{ php.ini directive registration */
932 PHP_INI_BEGIN()
933 	PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
934 	PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
935 	PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
936 	PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
937 	STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
938 	PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
939 
940 	STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
941 		PHP_INI_SYSTEM | PHP_INI_PERDIR,
942 		OnUpdate_mbstring_encoding_translation,
943 		encoding_translation, zend_mbstring_globals, mbstring_globals)
944 	PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
945 		"^(text/|application/xhtml\\+xml)",
946 		PHP_INI_ALL,
947 		OnUpdate_mbstring_http_output_conv_mimetypes)
948 
949 	STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
950 		PHP_INI_ALL,
951 		OnUpdateBool,
952 		strict_detection, zend_mbstring_globals, mbstring_globals)
953 #ifdef HAVE_MBREGEX
954 	STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
955 	STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
956 #endif
PHP_INI_END()957 PHP_INI_END()
958 /* }}} */
959 
960 static void mbstring_internal_encoding_changed_hook(void) {
961 	/* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
962 	if (!MBSTRG(internal_encoding_set)) {
963 		const char *encoding = php_get_internal_encoding();
964 		_php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
965 	}
966 
967 	if (!MBSTRG(http_output_set)) {
968 		const char *encoding = php_get_output_encoding();
969 		_php_mb_ini_mbstring_http_output_set(encoding);
970 	}
971 
972 	if (!MBSTRG(http_input_set)) {
973 		const char *encoding = php_get_input_encoding();
974 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
975 	}
976 }
977 
978 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)979 static PHP_GINIT_FUNCTION(mbstring)
980 {
981 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
982 ZEND_TSRMLS_CACHE_UPDATE();
983 #endif
984 
985 	mbstring_globals->language = mbfl_no_language_uni;
986 	mbstring_globals->internal_encoding = NULL;
987 	mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
988 	mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
989 	mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
990 	mbstring_globals->http_input_identify = NULL;
991 	mbstring_globals->http_input_identify_get = NULL;
992 	mbstring_globals->http_input_identify_post = NULL;
993 	mbstring_globals->http_input_identify_cookie = NULL;
994 	mbstring_globals->http_input_identify_string = NULL;
995 	mbstring_globals->http_input_list = NULL;
996 	mbstring_globals->http_input_list_size = 0;
997 	mbstring_globals->detect_order_list = NULL;
998 	mbstring_globals->detect_order_list_size = 0;
999 	mbstring_globals->current_detect_order_list = NULL;
1000 	mbstring_globals->current_detect_order_list_size = 0;
1001 	mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1002 	mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1003 	mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1004 	mbstring_globals->filter_illegal_substchar = '?';
1005 	mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1006 	mbstring_globals->current_filter_illegal_substchar = '?';
1007 	mbstring_globals->illegalchars = 0;
1008 	mbstring_globals->encoding_translation = 0;
1009 	mbstring_globals->strict_detection = 0;
1010 	mbstring_globals->outconv_enabled = false;
1011 	mbstring_globals->outconv_state = 0;
1012 	mbstring_globals->http_output_conv_mimetypes = NULL;
1013 #ifdef HAVE_MBREGEX
1014 	mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1015 #endif
1016 	mbstring_globals->last_used_encoding_name = NULL;
1017 	mbstring_globals->last_used_encoding = NULL;
1018 	mbstring_globals->internal_encoding_set = 0;
1019 	mbstring_globals->http_output_set = 0;
1020 	mbstring_globals->http_input_set = 0;
1021 	mbstring_globals->all_encodings_list = NULL;
1022 }
1023 /* }}} */
1024 
1025 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1026 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1027 {
1028 	if (mbstring_globals->http_input_list) {
1029 		free(ZEND_VOIDP(mbstring_globals->http_input_list));
1030 	}
1031 	if (mbstring_globals->detect_order_list) {
1032 		free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1033 	}
1034 	if (mbstring_globals->http_output_conv_mimetypes) {
1035 		_php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1036 	}
1037 #ifdef HAVE_MBREGEX
1038 	php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1039 #endif
1040 }
1041 /* }}} */
1042 
1043 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1044 static void init_check_utf8(void);
1045 #endif
1046 
1047 /* {{{ PHP_MINIT_FUNCTION(mbstring) */
PHP_MINIT_FUNCTION(mbstring)1048 PHP_MINIT_FUNCTION(mbstring)
1049 {
1050 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
1051 ZEND_TSRMLS_CACHE_UPDATE();
1052 #endif
1053 
1054 	REGISTER_INI_ENTRIES();
1055 
1056 	/* We assume that we're the only user of the hook. */
1057 	ZEND_ASSERT(php_internal_encoding_changed == NULL);
1058 	php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
1059 	mbstring_internal_encoding_changed_hook();
1060 
1061 	/* This is a global handler. Should not be set in a per-request handler. */
1062 	sapi_register_treat_data(mbstr_treat_data);
1063 
1064 	/* Post handlers are stored in the thread-local context. */
1065 	if (MBSTRG(encoding_translation)) {
1066 		sapi_register_post_entries(mbstr_post_entries);
1067 	}
1068 
1069 #ifdef HAVE_MBREGEX
1070 	PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1071 #endif
1072 
1073 	register_mbstring_symbols(module_number);
1074 
1075 	if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
1076 		return FAILURE;
1077 	}
1078 
1079 	php_rfc1867_set_multibyte_callbacks(
1080 		php_mb_encoding_translation,
1081 		php_mb_gpc_get_detect_order,
1082 		php_mb_gpc_set_input_encoding,
1083 		php_mb_rfc1867_getword,
1084 		php_mb_rfc1867_getword_conf,
1085 		php_mb_rfc1867_basename);
1086 
1087 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1088 	init_check_utf8();
1089 	init_convert_utf16();
1090 #endif
1091 
1092 	return SUCCESS;
1093 }
1094 /* }}} */
1095 
1096 /* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
PHP_MSHUTDOWN_FUNCTION(mbstring)1097 PHP_MSHUTDOWN_FUNCTION(mbstring)
1098 {
1099 	UNREGISTER_INI_ENTRIES();
1100 
1101 	zend_multibyte_restore_functions();
1102 
1103 #ifdef HAVE_MBREGEX
1104 	PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1105 #endif
1106 
1107 	php_internal_encoding_changed = NULL;
1108 
1109 	return SUCCESS;
1110 }
1111 /* }}} */
1112 
1113 /* {{{ PHP_RINIT_FUNCTION(mbstring) */
PHP_RINIT_FUNCTION(mbstring)1114 PHP_RINIT_FUNCTION(mbstring)
1115 {
1116 	MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
1117 	MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
1118 	MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
1119 	MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);
1120 
1121 	MBSTRG(illegalchars) = 0;
1122 
1123 	php_mb_populate_current_detect_order_list();
1124 
1125 #ifdef HAVE_MBREGEX
1126 	PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1127 #endif
1128 	zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));
1129 
1130 	return SUCCESS;
1131 }
1132 /* }}} */
1133 
1134 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1135 PHP_RSHUTDOWN_FUNCTION(mbstring)
1136 {
1137 	if (MBSTRG(current_detect_order_list) != NULL) {
1138 		efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1139 		MBSTRG(current_detect_order_list) = NULL;
1140 		MBSTRG(current_detect_order_list_size) = 0;
1141 	}
1142 
1143 	/* clear http input identification. */
1144 	MBSTRG(http_input_identify) = NULL;
1145 	MBSTRG(http_input_identify_post) = NULL;
1146 	MBSTRG(http_input_identify_get) = NULL;
1147 	MBSTRG(http_input_identify_cookie) = NULL;
1148 	MBSTRG(http_input_identify_string) = NULL;
1149 
1150 	if (MBSTRG(last_used_encoding_name)) {
1151 		zend_string_release(MBSTRG(last_used_encoding_name));
1152 		MBSTRG(last_used_encoding_name) = NULL;
1153 	}
1154 
1155 	MBSTRG(internal_encoding_set) = 0;
1156 	MBSTRG(http_output_set) = 0;
1157 	MBSTRG(http_input_set) = 0;
1158 
1159 	MBSTRG(outconv_enabled) = false;
1160 	MBSTRG(outconv_state) = 0;
1161 
1162 	if (MBSTRG(all_encodings_list)) {
1163 		GC_DELREF(MBSTRG(all_encodings_list));
1164 		zend_array_destroy(MBSTRG(all_encodings_list));
1165 		MBSTRG(all_encodings_list) = NULL;
1166 	}
1167 
1168 #ifdef HAVE_MBREGEX
1169 	PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1170 #endif
1171 
1172 	return SUCCESS;
1173 }
1174 /* }}} */
1175 
1176 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1177 PHP_MINFO_FUNCTION(mbstring)
1178 {
1179 	php_info_print_table_start();
1180 	php_info_print_table_row(2, "Multibyte Support", "enabled");
1181 	php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1182 	php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1183 	{
1184 		char tmp[256];
1185 		snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1186 		php_info_print_table_row(2, "libmbfl version", tmp);
1187 	}
1188 	php_info_print_table_end();
1189 
1190 	php_info_print_table_start();
1191 	php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1192 	php_info_print_table_end();
1193 
1194 #ifdef HAVE_MBREGEX
1195 	PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1196 #endif
1197 
1198 	DISPLAY_INI_ENTRIES();
1199 }
1200 /* }}} */
1201 
1202 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1203 PHP_FUNCTION(mb_language)
1204 {
1205 	zend_string *name = NULL;
1206 
1207 	ZEND_PARSE_PARAMETERS_START(0, 1)
1208 		Z_PARAM_OPTIONAL
1209 		Z_PARAM_STR_OR_NULL(name)
1210 	ZEND_PARSE_PARAMETERS_END();
1211 
1212 	if (name == NULL) {
1213 		RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1214 	} else {
1215 		zend_string *ini_name = ZSTR_INIT_LITERAL("mbstring.language", 0);
1216 		if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1217 			zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1218 			zend_string_release_ex(ini_name, 0);
1219 			RETURN_THROWS();
1220 		}
1221 		// TODO Make return void
1222 		RETVAL_TRUE;
1223 		zend_string_release_ex(ini_name, 0);
1224 	}
1225 }
1226 /* }}} */
1227 
1228 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1229 PHP_FUNCTION(mb_internal_encoding)
1230 {
1231 	char *name = NULL;
1232 	size_t name_len;
1233 	const mbfl_encoding *encoding;
1234 
1235 	ZEND_PARSE_PARAMETERS_START(0, 1)
1236 		Z_PARAM_OPTIONAL
1237 		Z_PARAM_STRING_OR_NULL(name, name_len)
1238 	ZEND_PARSE_PARAMETERS_END();
1239 
1240 	if (name == NULL) {
1241 		ZEND_ASSERT(MBSTRG(current_internal_encoding));
1242 		RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1243 	} else {
1244 		encoding = mbfl_name2encoding(name);
1245 		if (!encoding) {
1246 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1247 			RETURN_THROWS();
1248 		} else {
1249 			MBSTRG(current_internal_encoding) = encoding;
1250 			MBSTRG(internal_encoding_set) = 1;
1251 			/* TODO Return old encoding */
1252 			RETURN_TRUE;
1253 		}
1254 	}
1255 }
1256 /* }}} */
1257 
1258 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1259 PHP_FUNCTION(mb_http_input)
1260 {
1261 	char *type = NULL;
1262 	size_t type_len = 0, n;
1263 	const mbfl_encoding **entry;
1264 	const mbfl_encoding *encoding;
1265 
1266 	ZEND_PARSE_PARAMETERS_START(0, 1)
1267 		Z_PARAM_OPTIONAL
1268 		Z_PARAM_STRING_OR_NULL(type, type_len)
1269 	ZEND_PARSE_PARAMETERS_END();
1270 
1271 	if (type == NULL) {
1272 		encoding = MBSTRG(http_input_identify);
1273 	} else if (type_len != 1) {
1274 		zend_argument_value_error(1,
1275 			"must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1276 		RETURN_THROWS();
1277 	} else {
1278 		switch (*type) {
1279 		case 'G':
1280 		case 'g':
1281 			encoding = MBSTRG(http_input_identify_get);
1282 			break;
1283 		case 'P':
1284 		case 'p':
1285 			encoding = MBSTRG(http_input_identify_post);
1286 			break;
1287 		case 'C':
1288 		case 'c':
1289 			encoding = MBSTRG(http_input_identify_cookie);
1290 			break;
1291 		case 'S':
1292 		case 's':
1293 			encoding = MBSTRG(http_input_identify_string);
1294 			break;
1295 		case 'I':
1296 		case 'i':
1297 			entry = MBSTRG(http_input_list);
1298 			n = MBSTRG(http_input_list_size);
1299 			array_init(return_value);
1300 			for (size_t i = 0; i < n; i++, entry++) {
1301 				add_next_index_string(return_value, (*entry)->name);
1302 			}
1303 			return;
1304 		case 'L':
1305 		case 'l':
1306 			entry = MBSTRG(http_input_list);
1307 			n = MBSTRG(http_input_list_size);
1308 			if (n == 0) {
1309 				RETURN_FALSE;
1310 			}
1311 
1312 			smart_str result = {0};
1313 			for (size_t i = 0; i < n; i++, entry++) {
1314 				if (i > 0) {
1315 					smart_str_appendc(&result, ',');
1316 				}
1317 				smart_str_appends(&result, (*entry)->name);
1318 			}
1319 			RETURN_STR(smart_str_extract(&result));
1320 		default:
1321 			zend_argument_value_error(1,
1322 				"must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1323 			RETURN_THROWS();
1324 		}
1325 	}
1326 
1327 	if (encoding) {
1328 		RETURN_STRING(encoding->name);
1329 	} else {
1330 		RETURN_FALSE;
1331 	}
1332 }
1333 /* }}} */
1334 
1335 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1336 PHP_FUNCTION(mb_http_output)
1337 {
1338 	char *name = NULL;
1339 	size_t name_len;
1340 
1341 	ZEND_PARSE_PARAMETERS_START(0, 1)
1342 		Z_PARAM_OPTIONAL
1343 		Z_PARAM_STRING_OR_NULL(name, name_len)
1344 	ZEND_PARSE_PARAMETERS_END();
1345 
1346 	if (name == NULL) {
1347 		ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1348 		RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1349 	} else {
1350 		const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name);
1351 		if (!encoding) {
1352 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1353 			RETURN_THROWS();
1354 		} else {
1355 			MBSTRG(http_output_set) = 1;
1356 			MBSTRG(current_http_output_encoding) = encoding;
1357 			/* TODO Return previous encoding? */
1358 			RETURN_TRUE;
1359 		}
1360 	}
1361 }
1362 /* }}} */
1363 
1364 /* {{{ Sets the current detect_order or Return the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1365 PHP_FUNCTION(mb_detect_order)
1366 {
1367 	zend_string *order_str = NULL;
1368 	HashTable *order_ht = NULL;
1369 
1370 	ZEND_PARSE_PARAMETERS_START(0, 1)
1371 		Z_PARAM_OPTIONAL
1372 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1373 	ZEND_PARSE_PARAMETERS_END();
1374 
1375 	if (!order_str && !order_ht) {
1376 		size_t n = MBSTRG(current_detect_order_list_size);
1377 		const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1378 		array_init(return_value);
1379 		for (size_t i = 0; i < n; i++) {
1380 			add_next_index_string(return_value, (*entry)->name);
1381 			entry++;
1382 		}
1383 	} else {
1384 		const mbfl_encoding **list;
1385 		size_t size;
1386 		if (order_ht) {
1387 			if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1388 				RETURN_THROWS();
1389 			}
1390 		} else {
1391 			if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1)) {
1392 				RETURN_THROWS();
1393 			}
1394 		}
1395 
1396 		if (size == 0) {
1397 			efree(ZEND_VOIDP(list));
1398 			zend_argument_value_error(1, "must specify at least one encoding");
1399 			RETURN_THROWS();
1400 		}
1401 
1402 		if (MBSTRG(current_detect_order_list)) {
1403 			efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1404 		}
1405 		MBSTRG(current_detect_order_list) = list;
1406 		MBSTRG(current_detect_order_list_size) = size;
1407 		RETURN_TRUE;
1408 	}
1409 }
1410 /* }}} */
1411 
/*
 * Decide whether cp may be used as a substitute character.
 *
 * Returns true for any value in the Unicode range 0..0x10FFFF that is not a
 * surrogate (0xD800..0xDFFF), false otherwise.
 */
static inline bool php_mb_check_code_point(zend_long cp)
{
	const bool in_unicode_range = (cp >= 0 && cp < 0x110000);

	/* Surrogates are never valid on their own, and only a single
	 * substitute character is allowed. */
	const bool is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);

	/* The target encoding of the conversion that will use the substitute
	 * character is not known here, so whether the codepoint is actually
	 * mapped in that encoding cannot be checked; everything else is
	 * accepted. */
	return in_unicode_range && !is_surrogate;
}
1430 
1431 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1432 PHP_FUNCTION(mb_substitute_character)
1433 {
1434 	zend_string *substitute_character = NULL;
1435 	zend_long substitute_codepoint;
1436 	bool substitute_is_null = 1;
1437 
1438 	ZEND_PARSE_PARAMETERS_START(0, 1)
1439 		Z_PARAM_OPTIONAL
1440 		Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1441 	ZEND_PARSE_PARAMETERS_END();
1442 
1443 	if (substitute_is_null) {
1444 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1445 			RETURN_STRING("none");
1446 		}
1447 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1448 			RETURN_STRING("long");
1449 		}
1450 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1451 			RETURN_STRING("entity");
1452 		}
1453 		RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1454 	}
1455 
1456 	if (substitute_character != NULL) {
1457 		if (zend_string_equals_literal_ci(substitute_character, "none")) {
1458 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1459 			RETURN_TRUE;
1460 		}
1461 		if (zend_string_equals_literal_ci(substitute_character, "long")) {
1462 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1463 			RETURN_TRUE;
1464 		}
1465 		if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1466 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1467 			RETURN_TRUE;
1468 		}
1469 		/* Invalid string value */
1470 		zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1471 		RETURN_THROWS();
1472 	}
1473 	/* Integer codepoint passed */
1474 	if (!php_mb_check_code_point(substitute_codepoint)) {
1475 		zend_argument_value_error(1, "is not a valid codepoint");
1476 		RETURN_THROWS();
1477 	}
1478 
1479 	MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1480 	MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1481 	RETURN_TRUE;
1482 }
1483 /* }}} */
1484 
1485 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1486 PHP_FUNCTION(mb_preferred_mime_name)
1487 {
1488 	char *name = NULL;
1489 	size_t name_len;
1490 
1491 	ZEND_PARSE_PARAMETERS_START(1, 1)
1492 		Z_PARAM_STRING(name, name_len)
1493 	ZEND_PARSE_PARAMETERS_END();
1494 
1495 	const mbfl_encoding *enc = mbfl_name2encoding(name);
1496 	if (enc == NULL) {
1497 		zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1498 		RETURN_THROWS();
1499 	}
1500 
1501 	const char *preferred_name = mbfl_encoding_preferred_mime_name(enc);
1502 	if (preferred_name == NULL || *preferred_name == '\0') {
1503 		php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1504 		RETVAL_FALSE;
1505 	} else {
1506 		RETVAL_STRING((char *)preferred_name);
1507 	}
1508 }
1509 /* }}} */
1510 
1511 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1512 PHP_FUNCTION(mb_parse_str)
1513 {
1514 	zval *track_vars_array = NULL;
1515 	char *encstr;
1516 	size_t encstr_len;
1517 	php_mb_encoding_handler_info_t info;
1518 	const mbfl_encoding *detected;
1519 
1520 	ZEND_PARSE_PARAMETERS_START(2, 2)
1521 		Z_PARAM_STRING(encstr, encstr_len)
1522 		Z_PARAM_ZVAL(track_vars_array)
1523 	ZEND_PARSE_PARAMETERS_END();
1524 
1525 	track_vars_array = zend_try_array_init(track_vars_array);
1526 	if (!track_vars_array) {
1527 		RETURN_THROWS();
1528 	}
1529 
1530 	encstr = estrndup(encstr, encstr_len);
1531 
1532 	info.data_type              = PARSE_STRING;
1533 	info.separator              = PG(arg_separator).input;
1534 	info.report_errors          = true;
1535 	info.to_encoding            = MBSTRG(current_internal_encoding);
1536 	info.from_encodings         = MBSTRG(http_input_list);
1537 	info.num_from_encodings     = MBSTRG(http_input_list_size);
1538 
1539 	detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1540 
1541 	MBSTRG(http_input_identify) = detected;
1542 
1543 	RETVAL_BOOL(detected);
1544 
1545 	if (encstr != NULL) efree(encstr);
1546 }
1547 /* }}} */
1548 
PHP_FUNCTION(mb_output_handler)1549 PHP_FUNCTION(mb_output_handler)
1550 {
1551 	zend_string *str;
1552 	zend_long arg_status;
1553 
1554 	ZEND_PARSE_PARAMETERS_START(2, 2)
1555 		Z_PARAM_STR(str)
1556 		Z_PARAM_LONG(arg_status)
1557 	ZEND_PARSE_PARAMETERS_END();
1558 
1559 	const mbfl_encoding *encoding = MBSTRG(current_http_output_encoding);
1560 	if (encoding == &mbfl_encoding_pass) {
1561 		RETURN_STR_COPY(str);
1562 	}
1563 
1564 	if (arg_status & PHP_OUTPUT_HANDLER_START) {
1565 		bool free_mimetype = false;
1566 		char *mimetype = NULL;
1567 
1568 		/* Analyze mime type */
1569 		if (SG(sapi_headers).mimetype && _php_mb_match_regex(MBSTRG(http_output_conv_mimetypes), SG(sapi_headers).mimetype, strlen(SG(sapi_headers).mimetype))) {
1570 			char *s;
1571 			if ((s = strchr(SG(sapi_headers).mimetype, ';')) == NULL) {
1572 				mimetype = estrdup(SG(sapi_headers).mimetype);
1573 			} else {
1574 				mimetype = estrndup(SG(sapi_headers).mimetype, s - SG(sapi_headers).mimetype);
1575 			}
1576 			free_mimetype = true;
1577 		} else if (SG(sapi_headers).send_default_content_type) {
1578 			mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1579 		}
1580 
1581 		/* If content-type is not yet set, set it and enable conversion */
1582 		if (SG(sapi_headers).send_default_content_type || free_mimetype) {
1583 			const char *charset = encoding->mime_name;
1584 			if (charset) {
1585 				char *p;
1586 				size_t len = spprintf(&p, 0, "Content-Type: %s; charset=%s",  mimetype, charset);
1587 				if (sapi_add_header(p, len, 0) != FAILURE) {
1588 					SG(sapi_headers).send_default_content_type = 0;
1589 				}
1590 			}
1591 
1592 			MBSTRG(outconv_enabled) = true;
1593 		}
1594 
1595 		if (free_mimetype) {
1596 			efree(mimetype);
1597 		}
1598 	}
1599 
1600 	if (!MBSTRG(outconv_enabled)) {
1601 		RETURN_STR_COPY(str);
1602 	}
1603 
1604 	mb_convert_buf buf;
1605 	mb_convert_buf_init(&buf, ZSTR_LEN(str), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1606 
1607 	uint32_t wchar_buf[128];
1608 	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
1609 	size_t in_len = ZSTR_LEN(str);
1610 	bool last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1611 
1612 	while (in_len) {
1613 		size_t out_len = MBSTRG(current_internal_encoding)->to_wchar(&in, &in_len, wchar_buf, 128, &MBSTRG(outconv_state));
1614 		ZEND_ASSERT(out_len <= 128);
1615 		encoding->from_wchar(wchar_buf, out_len, &buf, !in_len && last_feed);
1616 	}
1617 
1618 	MBSTRG(illegalchars) += buf.errors;
1619 	RETVAL_STR(mb_convert_buf_result_raw(&buf));
1620 
1621 	if (last_feed) {
1622 		MBSTRG(outconv_enabled) = false;
1623 		MBSTRG(outconv_state) = 0;
1624 	}
1625 }
1626 
PHP_FUNCTION(mb_str_split)1627 PHP_FUNCTION(mb_str_split)
1628 {
1629 	zend_string *str, *encoding = NULL;
1630 	zend_long split_len = 1;
1631 
1632 	ZEND_PARSE_PARAMETERS_START(1, 3)
1633 		Z_PARAM_STR(str)
1634 		Z_PARAM_OPTIONAL
1635 		Z_PARAM_LONG(split_len)
1636 		Z_PARAM_STR_OR_NULL(encoding)
1637 	ZEND_PARSE_PARAMETERS_END();
1638 
1639 	if (split_len <= 0) {
1640 		zend_argument_value_error(2, "must be greater than 0");
1641 		RETURN_THROWS();
1642 	} else if (split_len > UINT_MAX / 4) {
1643 		zend_argument_value_error(2, "is too large");
1644 		RETURN_THROWS();
1645 	}
1646 
1647 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
1648 	if (!enc) {
1649 		RETURN_THROWS();
1650 	}
1651 
1652 	if (ZSTR_LEN(str) == 0) {
1653 		RETURN_EMPTY_ARRAY();
1654 	}
1655 
1656 	unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
1657 
1658 	unsigned int char_len = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
1659 	if (char_len) {
1660 		unsigned int chunk_len = char_len * split_len;
1661 		unsigned int chunks = ((ZSTR_LEN(str) / chunk_len) + split_len - 1) / split_len; /* round up */
1662 		array_init_size(return_value, chunks);
1663 		while (p < e) {
1664 			add_next_index_stringl(return_value, (const char*)p, MIN(chunk_len, e - p));
1665 			p += chunk_len;
1666 		}
1667 	} else if (enc->mblen_table) {
1668 		unsigned char const *mbtab = enc->mblen_table;
1669 
1670 		/* Assume that we have 1-byte characters */
1671 		array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1672 
1673 		while (p < e) {
1674 			unsigned char *chunk = p; /* start of chunk */
1675 
1676 			for (int char_count = 0; char_count < split_len && p < e; char_count++) {
1677 				p += mbtab[*p];
1678 			}
1679 			if (p > e) {
1680 				p = e; /* ensure chunk is in bounds */
1681 			}
1682 			add_next_index_stringl(return_value, (const char*)chunk, p - chunk);
1683 		}
1684 	} else {
1685 		/* Assume that we have 1-byte characters */
1686 		array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1687 
1688 		uint32_t wchar_buf[128];
1689 		size_t in_len = ZSTR_LEN(str);
1690 		unsigned int state = 0, char_count = 0;
1691 
1692 		mb_convert_buf buf;
1693 
1694 		while (in_len) {
1695 			size_t out_len = enc->to_wchar(&p, &in_len, wchar_buf, 128, &state);
1696 			ZEND_ASSERT(out_len <= 128);
1697 			size_t i = 0;
1698 
1699 			/* Is there some output remaining from the previous iteration? */
1700 			if (char_count) {
1701 				if (out_len >= split_len - char_count) {
1702 					/* Finish off an incomplete chunk from previous iteration
1703 					 * ('buf' was already initialized; we don't need to do it again) */
1704 					enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
1705 					i += split_len - char_count;
1706 					char_count = 0;
1707 					add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1708 				} else {
1709 					/* Output from this iteration is not enough to finish the next chunk;
1710 					 * output what we can, and leave 'buf' to be used again on next iteration */
1711 					enc->from_wchar(wchar_buf, out_len, &buf, !in_len);
1712 					char_count += out_len;
1713 					continue;
1714 				}
1715 			}
1716 
1717 			while (i < out_len) {
1718 				/* Prepare for the next chunk */
1719 				mb_convert_buf_init(&buf, split_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1720 
1721 				if (out_len - i >= split_len) {
1722 					enc->from_wchar(wchar_buf + i, split_len, &buf, true);
1723 					i += split_len;
1724 					add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1725 				} else {
1726 					/* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
1727 					 * leave them for the next iteration */
1728 					enc->from_wchar(wchar_buf + i, out_len - i, &buf, !in_len);
1729 					char_count = out_len - i;
1730 					break;
1731 				}
1732 			}
1733 		}
1734 
1735 		if (char_count) {
1736 			/* The main loop above has finished processing the input string, but
1737 			 * has left a partial chunk in 'buf' */
1738 			add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1739 		}
1740 	}
1741 }
1742 
#ifdef __SSE2__
/* Horizontal sum of the sixteen unsigned bytes held in a 128-bit XMM register;
 * the total is returned in an ordinary scalar.
 *
 * Technique by StackOverflow user 'Paul R' (https://stackoverflow.com/users/253056/paul-r)
 * From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector */
static inline uint32_t _mm_sum_epu8(const __m128i v)
{
	/* There is no dedicated "add up all the bytes" instruction. _mm_sad_epu8 computes
	 * the absolute differences between corresponding bytes of its two operands and adds
	 * them up separately for the lower and the upper 8 bytes, leaving two 16-bit sums
	 * (one at the bottom of each 64-bit half of the result).
	 * Taking the differences against an all-zero register makes each "difference"
	 * equal to the byte value itself. */
	const __m128i zero = _mm_setzero_si128();
	const __m128i half_sums = _mm_sad_epu8(v, zero);

	/* Pull the two partial sums out into scalar registers and combine them:
	 * _mm_cvtsi128_si32 reads the sum of the lower 8 bytes,
	 * _mm_extract_epi16(…, 4) reads the sum of the upper 8 bytes */
	const uint32_t lower_sum = _mm_cvtsi128_si32(half_sums);
	const uint32_t upper_sum = _mm_extract_epi16(half_sums, 4);

	return lower_sum + upper_sum;
}
#endif
1764 
/* Count the codepoints in `len` bytes starting at `p`.
 *
 * The input is assumed to be valid UTF-8. In UTF-8 every byte starts a new codepoint
 * except for continuation bytes (0x80-0xBF; as signed 8-bit integers, exactly the
 * values below -64). So the codepoint count is the byte count minus the number of
 * continuation bytes. */
static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
{
	unsigned char *end = p + len;

#ifdef __SSE2__
	if (len >= sizeof(__m128i)) {
		/* Highest address at which a whole 16-byte load still lies inside the string */
		unsigned char *last_full_block = end - sizeof(__m128i);

		const __m128i cont_threshold = _mm_set1_epi8(-64);
		const __m128i ones = _mm_set1_epi8(1);
		/* Sixteen independent 8-bit tallies of continuation bytes, one per byte lane */
		__m128i lane_tallies = _mm_setzero_si128();
		unsigned char blocks_before_fold = 255;

		do {
			__m128i block = _mm_loadu_si128((__m128i*)p); /* unaligned 16-byte load */
			__m128i is_continuation = _mm_cmplt_epi8(block, cont_threshold); /* 0xFF in matching lanes */
			lane_tallies = _mm_add_epi8(lane_tallies, _mm_and_si128(is_continuation, ones));
			p += sizeof(__m128i);

			/* A lane can hold at most 255, and grows by at most 1 per block; so after
			 * 255 blocks the tallies are subtracted from `len` and started afresh */
			blocks_before_fold--;
			if (blocks_before_fold == 0) {
				len -= _mm_sum_epu8(lane_tallies);
				lane_tallies = _mm_setzero_si128();
				blocks_before_fold = 255;
			}
		} while (p <= last_full_block);

		/* Account for whatever is still sitting in the lane tallies */
		len -= _mm_sum_epu8(lane_tallies);
	}
#endif

	/* Scalar pass over the bytes not covered above (0-15 of them when SSE2 is used) */
	for (; p < end; p++) {
		if ((*p & 0xC0) == 0x80) {
			len--;
		}
	}

	return len;
}
1814 
/* Return the number of codepoints in `string` when interpreted as `encoding`.
 *
 * Three strategies are tried, cheapest first:
 *  1. fixed-width encodings: divide the byte length by the bytes-per-codepoint value
 *     carried in the encoding flags
 *  2. UTF-8 strings already flagged as valid: count non-continuation bytes
 *  3. anything else: decode the string chunk by chunk and count the codepoints */
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
{
	unsigned int bytes_per_char = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (bytes_per_char != 0) {
		return ZSTR_LEN(string) / bytes_per_char;
	}

	if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && ZSTR_IS_VALID_UTF8(string)) {
		return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
	}

	/* Generic path; the decoded codepoints themselves are discarded, only the
	 * count returned by each to_wchar call matters */
	uint32_t wchar_buf[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t codepoints = 0;

	while (bytes_left > 0) {
		codepoints += encoding->to_wchar(&cursor, &bytes_left, wchar_buf, 128, &state);
	}

	return codepoints;
}
1836 
1837 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1838 PHP_FUNCTION(mb_strlen)
1839 {
1840 	zend_string *string, *enc_name = NULL;
1841 
1842 	ZEND_PARSE_PARAMETERS_START(1, 2)
1843 		Z_PARAM_STR(string)
1844 		Z_PARAM_OPTIONAL
1845 		Z_PARAM_STR_OR_NULL(enc_name)
1846 	ZEND_PARSE_PARAMETERS_END();
1847 
1848 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1849 	if (!enc) {
1850 		RETURN_THROWS();
1851 	}
1852 
1853 	RETVAL_LONG(mb_get_strlen(string, enc));
1854 }
1855 /* }}} */
1856 
1857 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)1858 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
1859 {
1860 	return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
1861 }
1862 
offset_to_pointer_utf8(unsigned char * str,unsigned char * end,ssize_t offset)1863 static unsigned char* offset_to_pointer_utf8(unsigned char *str, unsigned char *end, ssize_t offset) {
1864 	if (offset < 0) {
1865 		unsigned char *pos = end;
1866 		while (offset < 0) {
1867 			if (pos <= str) {
1868 				return NULL;
1869 			}
1870 
1871 			unsigned char c = *--pos;
1872 			if (c < 0x80 || (c & 0xC0) != 0x80) {
1873 				offset++;
1874 			}
1875 		}
1876 		return pos;
1877 	} else {
1878 		const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
1879 		unsigned char *pos = str;
1880 		while (offset-- > 0) {
1881 			if (pos >= end) {
1882 				return NULL;
1883 			}
1884 			pos += u8_tbl[*pos];
1885 		}
1886 		return pos;
1887 	}
1888 }
1889 
/* Inverse of offset_to_pointer_utf8 for non-negative offsets: how many characters
 * lie between `start` and `pos`? (`pos` must not precede `start`) */
static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) {
	size_t byte_distance = pos - start;
	return mb_fast_strlen_utf8(start, byte_distance);
}
1893 
/* Find `needle` in `haystack` (both encoded as `enc`) and return its position,
 * counted in characters from the start of `haystack`.
 *
 * The search itself is a byte search on UTF-8 text: unless `enc` is already a UTF-8
 * variant, temporary UTF-8 copies of both strings are created (and freed on exit).
 *
 * offset:  character position where searching starts; negative values count from
 *          the end of the haystack
 * reverse: false = first occurrence at/after `offset`; true = last occurrence
 *
 * Returns MBFL_ERROR_OFFSET if `offset` lies outside the haystack and
 * MBFL_ERROR_NOT_FOUND if there is no match. */
static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse)
{
	size_t result;
	zend_string *haystack_u8 = NULL, *needle_u8 = NULL;
	unsigned char *offset_pointer;

	if (!php_mb_is_no_encoding_utf8(enc->no_encoding)) {
		unsigned int num_errors = 0;
		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
	} else {
		haystack_u8 = haystack;
		needle_u8 = needle;
	}

	offset_pointer = offset_to_pointer_utf8((unsigned char*)ZSTR_VAL(haystack_u8), (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8), offset);
	if (!offset_pointer) {
		result = MBFL_ERROR_OFFSET;
		goto out;
	}

	result = MBFL_ERROR_NOT_FOUND;
	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
		goto out;
	}

	const char *found_pos;
	if (!reverse) {
		/* Forward search over [offset_pointer, end of haystack) */
		found_pos = zend_memnstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8));
	} else if (offset >= 0) {
		/* Backward search over [offset_pointer, end of haystack) */
		found_pos = zend_memnrstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8));
	} else {
		/* Backward search with a negative offset: the searched region is extended past
		 * `offset_pointer` by the length of the needle in characters (capped at the end
		 * of the haystack). That length must be measured on the UTF-8 form of the needle:
		 * the haystack being walked is UTF-8, and the original needle may be in an
		 * entirely different encoding, in which case counting its bytes as UTF-8 gives
		 * a meaningless number */
		size_t needle_len = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(needle_u8), (unsigned char*)ZSTR_VAL(needle_u8) + ZSTR_LEN(needle_u8));
		offset_pointer = offset_to_pointer_utf8(offset_pointer, (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8), needle_len);
		if (!offset_pointer) {
			offset_pointer = (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8);
		}

		found_pos = zend_memnrstr(ZSTR_VAL(haystack_u8), ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)offset_pointer);
	}

	if (found_pos) {
		/* Convert the byte position of the match back to a character position */
		result = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(haystack_u8), (unsigned char*)found_pos);
	}

out:
	/* Only free the UTF-8 strings if they are temporary copies made above */
	if (haystack_u8 != haystack) {
		zend_string_free(haystack_u8);
	}
	if (needle_u8 != needle) {
		zend_string_free(needle_u8);
	}
	return result;
}
1948 
handle_strpos_error(size_t error)1949 static void handle_strpos_error(size_t error) {
1950 	switch (error) {
1951 	case MBFL_ERROR_NOT_FOUND:
1952 		break;
1953 	case MBFL_ERROR_ENCODING:
1954 		php_error_docref(NULL, E_WARNING, "Conversion error");
1955 		break;
1956 	case MBFL_ERROR_OFFSET:
1957 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1958 		break;
1959 	default:
1960 		zend_value_error("mb_strpos(): Unknown error");
1961 		break;
1962 	}
1963 }
1964 
PHP_FUNCTION(mb_strpos)1965 PHP_FUNCTION(mb_strpos)
1966 {
1967 	zend_long offset = 0;
1968 	zend_string *needle, *haystack;
1969 	zend_string *enc_name = NULL;
1970 
1971 	ZEND_PARSE_PARAMETERS_START(2, 4)
1972 		Z_PARAM_STR(haystack)
1973 		Z_PARAM_STR(needle)
1974 		Z_PARAM_OPTIONAL
1975 		Z_PARAM_LONG(offset)
1976 		Z_PARAM_STR_OR_NULL(enc_name)
1977 	ZEND_PARSE_PARAMETERS_END();
1978 
1979 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
1980 	if (!enc) {
1981 		RETURN_THROWS();
1982 	}
1983 
1984 	size_t n = mb_find_strpos(haystack, needle, enc, offset, false);
1985 	if (!mbfl_is_error(n)) {
1986 		RETVAL_LONG(n);
1987 	} else {
1988 		handle_strpos_error(n);
1989 		RETVAL_FALSE;
1990 	}
1991 }
1992 
1993 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)1994 PHP_FUNCTION(mb_strrpos)
1995 {
1996 	zend_long offset = 0;
1997 	zend_string *needle, *haystack;
1998 	zend_string *enc_name = NULL;
1999 
2000 	ZEND_PARSE_PARAMETERS_START(2, 4)
2001 		Z_PARAM_STR(haystack)
2002 		Z_PARAM_STR(needle)
2003 		Z_PARAM_OPTIONAL
2004 		Z_PARAM_LONG(offset)
2005 		Z_PARAM_STR_OR_NULL(enc_name)
2006 	ZEND_PARSE_PARAMETERS_END();
2007 
2008 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
2009 	if (!enc) {
2010 		RETURN_THROWS();
2011 	}
2012 
2013 	size_t n = mb_find_strpos(haystack, needle, enc, offset, true);
2014 	if (!mbfl_is_error(n)) {
2015 		RETVAL_LONG(n);
2016 	} else {
2017 		handle_strpos_error(n);
2018 		RETVAL_FALSE;
2019 	}
2020 }
2021 /* }}} */
2022 
2023 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)2024 PHP_FUNCTION(mb_stripos)
2025 {
2026 	zend_long offset = 0;
2027 	zend_string *haystack, *needle;
2028 	zend_string *from_encoding = NULL;
2029 
2030 	ZEND_PARSE_PARAMETERS_START(2, 4)
2031 		Z_PARAM_STR(haystack)
2032 		Z_PARAM_STR(needle)
2033 		Z_PARAM_OPTIONAL
2034 		Z_PARAM_LONG(offset)
2035 		Z_PARAM_STR_OR_NULL(from_encoding)
2036 	ZEND_PARSE_PARAMETERS_END();
2037 
2038 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2039 	if (!enc) {
2040 		RETURN_THROWS();
2041 	}
2042 
2043 	size_t n = php_mb_stripos(false, haystack, needle, offset, enc);
2044 
2045 	if (!mbfl_is_error(n)) {
2046 		RETVAL_LONG(n);
2047 	} else {
2048 		handle_strpos_error(n);
2049 		RETVAL_FALSE;
2050 	}
2051 }
2052 /* }}} */
2053 
2054 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)2055 PHP_FUNCTION(mb_strripos)
2056 {
2057 	zend_long offset = 0;
2058 	zend_string *haystack, *needle;
2059 	zend_string *from_encoding = NULL;
2060 
2061 	ZEND_PARSE_PARAMETERS_START(2, 4)
2062 		Z_PARAM_STR(haystack)
2063 		Z_PARAM_STR(needle)
2064 		Z_PARAM_OPTIONAL
2065 		Z_PARAM_LONG(offset)
2066 		Z_PARAM_STR_OR_NULL(from_encoding)
2067 	ZEND_PARSE_PARAMETERS_END();
2068 
2069 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2070 	if (!enc) {
2071 		RETURN_THROWS();
2072 	}
2073 
2074 	size_t n = php_mb_stripos(true, haystack, needle, offset, enc);
2075 
2076 	if (!mbfl_is_error(n)) {
2077 		RETVAL_LONG(n);
2078 	} else {
2079 		handle_strpos_error(n);
2080 		RETVAL_FALSE;
2081 	}
2082 }
2083 /* }}} */
2084 
mb_get_substr_slow(unsigned char * in,size_t in_len,size_t from,size_t len,const mbfl_encoding * enc)2085 static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t from, size_t len, const mbfl_encoding *enc)
2086 {
2087 	uint32_t wchar_buf[128];
2088 	unsigned int state = 0;
2089 
2090 	mb_convert_buf buf;
2091 	mb_convert_buf_init(&buf, MIN(len, in_len - from), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2092 
2093 	while (in_len && len) {
2094 		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2095 		ZEND_ASSERT(out_len <= 128);
2096 
2097 		if (from >= out_len) {
2098 			from -= out_len;
2099 		} else {
2100 			size_t needed_codepoints = MIN(out_len - from, len);
2101 			enc->from_wchar(wchar_buf + from, needed_codepoints, &buf, !in_len || out_len >= len);
2102 			from = 0;
2103 			len -= needed_codepoints;
2104 		}
2105 	}
2106 
2107 	return mb_convert_buf_result(&buf, enc);
2108 }
2109 
/* Return up to `len` codepoints of `input` (encoded as `enc`), after skipping the
 * first `from` codepoints.
 *
 * Fixed-width encodings are handled by plain byte slicing; everything else goes
 * through mb_get_substr_slow. */
static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
		/* Other than MacJapanese, no supported text encoding decodes to
		 * more than one codepoint per byte
		 * So if the number of codepoints to skip >= number of input bytes,
		 * then definitely the output should be empty */
		return zend_empty_string;
	}

	/* Does each codepoint have a fixed byte width? */
	unsigned int flag = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (flag) {
		/* The value of the flag is the byte width of one codepoint (2 if each codepoint
		 * takes 2 bytes, or 4 if 4 bytes).
		 * All range checks are done in units of codepoints, *before* multiplying, so
		 * that huge values of `from` or `len` cannot wrap around in size_t */

		/* Same as `from * flag >= in_len`, without the multiplication */
		if (from >= in_len || from > (in_len - 1) / flag) {
			return zend_empty_string;
		}
		size_t from_bytes = from * flag;
		in += from_bytes;
		in_len -= from_bytes;

		/* Same as `min(len * flag, in_len)`, without the multiplication overflowing;
		 * if the request reaches the end of the input, all remaining bytes are taken */
		size_t out_bytes = (len > in_len / flag) ? in_len : len * flag;
		return zend_string_init_fast((const char*)in, out_bytes);
	}

	return mb_get_substr_slow(in, in_len, from, len, enc);
}
2142 
2143 #define MB_STRSTR 1
2144 #define MB_STRRCHR 2
2145 #define MB_STRISTR 3
2146 #define MB_STRRICHR 4
2147 
/* Shared implementation of mb_strstr, mb_strrchr, mb_stristr and mb_strrichr.
 * `variant` (one of the MB_STR* constants) selects the search direction and whether
 * the comparison ignores case. Depending on the boolean third argument, the result
 * is the part of the haystack before the match, or the part starting at the match;
 * false if the needle was not found. */
static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
{
	zend_string *haystack, *needle;
	zend_string *encoding_name = NULL;
	bool part = false;

	ZEND_PARSE_PARAMETERS_START(2, 4)
		Z_PARAM_STR(haystack)
		Z_PARAM_STR(needle)
		Z_PARAM_OPTIONAL
		Z_PARAM_BOOL(part)
		Z_PARAM_STR_OR_NULL(encoding_name)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding_name, 4);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	/* The "r" variants look for the last occurrence, the "i" variants ignore case */
	const bool reverse_mode = (variant == MB_STRRCHR || variant == MB_STRRICHR);
	const bool ignore_case = (variant == MB_STRISTR || variant == MB_STRRICHR);

	size_t pos;
	if (ignore_case) {
		pos = php_mb_stripos(reverse_mode, haystack, needle, 0, enc);
	} else {
		pos = mb_find_strpos(haystack, needle, enc, 0, reverse_mode);
	}

	if (mbfl_is_error(pos)) {
		// FIXME use handle_strpos_error(n)
		RETURN_FALSE;
	}

	if (part) {
		/* Everything in front of the match */
		RETURN_STR(mb_get_substr(haystack, 0, pos, enc));
	}

	/* From the match to the end of the haystack */
	RETURN_STR(mb_get_substr(haystack, pos, MBFL_SUBSTR_UNTIL_END, enc));
}
2189 
2190 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(mb_strstr)2191 PHP_FUNCTION(mb_strstr)
2192 {
2193 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
2194 }
2195 /* }}} */
2196 
2197 /* {{{ Finds the last occurrence of a character in a string within another */
PHP_FUNCTION(mb_strrchr)2198 PHP_FUNCTION(mb_strrchr)
2199 {
2200 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
2201 }
2202 /* }}} */
2203 
2204 /* {{{ Finds first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stristr)2205 PHP_FUNCTION(mb_stristr)
2206 {
2207 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
2208 }
2209 /* }}} */
2210 
2211 /* {{{ Finds the last occurrence of a character in a string within another, case insensitive */
PHP_FUNCTION(mb_strrichr)2212 PHP_FUNCTION(mb_strrichr)
2213 {
2214 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
2215 }
2216 /* }}} */
2217 
2218 #undef MB_STRSTR
2219 #undef MB_STRRCHR
2220 #undef MB_STRISTR
2221 #undef MB_STRRICHR
2222 
PHP_FUNCTION(mb_substr_count)2223 PHP_FUNCTION(mb_substr_count)
2224 {
2225 	zend_string *haystack, *needle, *enc_name = NULL, *haystack_u8 = NULL, *needle_u8 = NULL;
2226 
2227 	ZEND_PARSE_PARAMETERS_START(2, 3)
2228 		Z_PARAM_STR(haystack)
2229 		Z_PARAM_STR(needle)
2230 		Z_PARAM_OPTIONAL
2231 		Z_PARAM_STR_OR_NULL(enc_name)
2232 	ZEND_PARSE_PARAMETERS_END();
2233 
2234 	if (ZSTR_LEN(needle) == 0) {
2235 		zend_argument_value_error(2, "must not be empty");
2236 		RETURN_THROWS();
2237 	}
2238 
2239 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
2240 	if (!enc) {
2241 		RETURN_THROWS();
2242 	}
2243 
2244 	if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
2245 		/* No need to do any conversion if haystack/needle are already known-valid UTF-8
2246 		 * (If they are not valid, then not passing them through conversion filters could affect output) */
2247 		if (ZSTR_IS_VALID_UTF8(haystack)) {
2248 			haystack_u8 = haystack;
2249 		} else {
2250 			unsigned int num_errors = 0;
2251 			haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2252 			if (!num_errors && !ZSTR_IS_INTERNED(haystack)) {
2253 				GC_ADD_FLAGS(haystack, IS_STR_VALID_UTF8);
2254 			}
2255 		}
2256 
2257 		if (ZSTR_IS_VALID_UTF8(needle)) {
2258 			needle_u8 = needle;
2259 		} else {
2260 			unsigned int num_errors = 0;
2261 			needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2262 			if (!num_errors && !ZSTR_IS_INTERNED(needle)) {
2263 				GC_ADD_FLAGS(needle, IS_STR_VALID_UTF8);
2264 			}
2265 		}
2266 	} else {
2267 		unsigned int num_errors = 0;
2268 		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2269 		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2270 		/* A string with >0 bytes may convert to 0 codepoints; for example, the contents
2271 		 * may be only escape sequences */
2272 		if (ZSTR_LEN(needle_u8) == 0) {
2273 			zend_string_free(haystack_u8);
2274 			zend_string_free(needle_u8);
2275 			zend_argument_value_error(2, "must not be empty");
2276 			RETURN_THROWS();
2277 		}
2278 	}
2279 
2280 	size_t result = 0;
2281 
2282 	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
2283 		goto out;
2284 	}
2285 
2286 	const char *p = ZSTR_VAL(haystack_u8), *e = p + ZSTR_LEN(haystack_u8);
2287 	while (true) {
2288 		p = zend_memnstr(p, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), e);
2289 		if (!p) {
2290 			break;
2291 		}
2292 		p += ZSTR_LEN(needle_u8);
2293 		result++;
2294 	}
2295 
2296 out:
2297 	if (haystack_u8 != haystack) {
2298 		zend_string_free(haystack_u8);
2299 	}
2300 	if (needle_u8 != needle) {
2301 		zend_string_free(needle_u8);
2302 	}
2303 
2304 	RETVAL_LONG(result);
2305 }
2306 
2307 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2308 PHP_FUNCTION(mb_substr)
2309 {
2310 	zend_string *str, *encoding = NULL;
2311 	zend_long from, len;
2312 	size_t real_from, real_len;
2313 	bool len_is_null = true;
2314 
2315 	ZEND_PARSE_PARAMETERS_START(2, 4)
2316 		Z_PARAM_STR(str)
2317 		Z_PARAM_LONG(from)
2318 		Z_PARAM_OPTIONAL
2319 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2320 		Z_PARAM_STR_OR_NULL(encoding)
2321 	ZEND_PARSE_PARAMETERS_END();
2322 
2323 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2324 	if (!enc) {
2325 		RETURN_THROWS();
2326 	}
2327 
2328 	size_t mblen = 0;
2329 	if (from < 0 || (!len_is_null && len < 0)) {
2330 		mblen = mb_get_strlen(str, enc);
2331 	}
2332 
2333 	/* if "from" position is negative, count start position from the end
2334 	 * of the string */
2335 	if (from >= 0) {
2336 		real_from = (size_t) from;
2337 	} else if (-from < mblen) {
2338 		real_from = mblen + from;
2339 	} else {
2340 		real_from = 0;
2341 	}
2342 
2343 	/* if "length" position is negative, set it to the length
2344 	 * needed to stop that many chars from the end of the string */
2345 	if (len_is_null) {
2346 		real_len = MBFL_SUBSTR_UNTIL_END;
2347 	} else if (len >= 0) {
2348 		real_len = (size_t) len;
2349 	} else if (real_from < mblen && -len < mblen - real_from) {
2350 		real_len = (mblen - real_from) + len;
2351 	} else {
2352 		real_len = 0;
2353 	}
2354 
2355 	RETVAL_STR(mb_get_substr(str, real_from, real_len, enc));
2356 }
2357 /* }}} */
2358 
2359 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2360 PHP_FUNCTION(mb_strcut)
2361 {
2362 	zend_string *encoding = NULL;
2363 	char *string_val;
2364 	zend_long from, len;
2365 	bool len_is_null = true;
2366 	mbfl_string string, result, *ret;
2367 
2368 	ZEND_PARSE_PARAMETERS_START(2, 4)
2369 		Z_PARAM_STRING(string_val, string.len)
2370 		Z_PARAM_LONG(from)
2371 		Z_PARAM_OPTIONAL
2372 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2373 		Z_PARAM_STR_OR_NULL(encoding)
2374 	ZEND_PARSE_PARAMETERS_END();
2375 
2376 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2377 	if (!enc) {
2378 		RETURN_THROWS();
2379 	}
2380 
2381 	string.val = (unsigned char*)string_val;
2382 	string.encoding = enc;
2383 
2384 	if (len_is_null) {
2385 		len = string.len;
2386 	}
2387 
2388 	/* if "from" position is negative, count start position from the end
2389 	 * of the string */
2390 	if (from < 0) {
2391 		from = string.len + from;
2392 		if (from < 0) {
2393 			from = 0;
2394 		}
2395 	}
2396 
2397 	/* if "length" position is negative, set it to the length
2398 	 * needed to stop that many chars from the end of the string */
2399 	if (len < 0) {
2400 		len = (string.len - from) + len;
2401 		if (len < 0) {
2402 			len = 0;
2403 		}
2404 	}
2405 
2406 	if (from > string.len || len == 0) {
2407 		RETURN_EMPTY_STRING();
2408 	}
2409 
2410 	if (enc->cut) {
2411 		RETURN_STR(enc->cut(string.val, from, len, string.val + string.len));
2412 	}
2413 
2414 	unsigned int char_len = string.encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
2415 	if (char_len) {
2416 		/* Round `from` down to a multiple of `char_len`; works because `char_len` is a power of 2 */
2417 		from &= -char_len;
2418 		if (len > string.len - from) {
2419 			len = string.len - from;
2420 		}
2421 		RETURN_STR(zend_string_init_fast((const char*)(string.val + from), len & -char_len));
2422 	}
2423 
2424 	if (enc->mblen_table) {
2425 		const unsigned char *mbtab = enc->mblen_table;
2426 		const unsigned char *p, *q, *end;
2427 		int m = 0;
2428 		/* Search for start position */
2429 		for (p = (const unsigned char*)string.val, q = p + from; p < q; p += (m = mbtab[*p]));
2430 		if (p > q) {
2431 			p -= m;
2432 		}
2433 		const unsigned char *start = p;
2434 		/* Search for end position */
2435 		if (len >= string.len - (start - (const unsigned char*)string.val)) {
2436 			end = (const unsigned char*)(string.val + string.len);
2437 		} else {
2438 			for (q = p + len; p < q; p += (m = mbtab[*p]));
2439 			if (p > q) {
2440 				p -= m;
2441 			}
2442 			end = p;
2443 		}
2444 		RETURN_STR(zend_string_init_fast((const char*)start, end - start));
2445 	}
2446 
2447 	ret = mbfl_strcut(&string, &result, from, len);
2448 	ZEND_ASSERT(ret != NULL);
2449 	RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2450 	efree(ret->val);
2451 }
2452 /* }}} */
2453 
2454 /* Some East Asian characters, when printed at a terminal (or the like), require double
2455  * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2456 static size_t character_width(uint32_t c)
2457 {
2458 	if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2459 		return 1;
2460 	}
2461 
2462 	/* Do a binary search to see if we fall in any of the fullwidth ranges */
2463 	unsigned int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2464 	while (lo < hi) {
2465 		unsigned int probe = (lo + hi) / 2;
2466 		if (c < mbfl_eaw_table[probe].begin) {
2467 			hi = probe;
2468 		} else if (c > mbfl_eaw_table[probe].end) {
2469 			lo = probe + 1;
2470 		} else {
2471 			return 2;
2472 		}
2473 	}
2474 
2475 	return 1;
2476 }
2477 
/* Display width of `string` (encoded as `enc`): the sum of character_width()
 * over all codepoints which the string decodes to. */
static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
{
	uint32_t wchar_buf[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total_width = 0;

	while (bytes_left) {
		size_t decoded = enc->to_wchar(&cursor, &bytes_left, wchar_buf, 128, &state);
		ZEND_ASSERT(decoded <= 128);

		/* NOTE: the 'bad input' marker is counted like any other codepoint, as 1 unit
		 * of width. If text conversion is performed with an ordinary ASCII character
		 * as the 'replacement character', that matches the displayed width. */
		for (size_t i = 0; i < decoded; i++) {
			total_width += character_width(wchar_buf[i]);
		}
	}

	return total_width;
}
2500 
2501 /* Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2502 PHP_FUNCTION(mb_strwidth)
2503 {
2504 	zend_string *string, *enc_name = NULL;
2505 
2506 	ZEND_PARSE_PARAMETERS_START(1, 2)
2507 		Z_PARAM_STR(string)
2508 		Z_PARAM_OPTIONAL
2509 		Z_PARAM_STR_OR_NULL(enc_name)
2510 	ZEND_PARSE_PARAMETERS_END();
2511 
2512 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2513 	if (!enc) {
2514 		RETURN_THROWS();
2515 	}
2516 
2517 	RETVAL_LONG(mb_get_strwidth(string, enc));
2518 }
2519 
/* Limit the display width of a string.
 *
 * input:  the string, encoded as `enc`
 * marker: appended to the result if (and only if) the string had to be shortened
 * from:   number of leading codepoints of `input` to skip
 * width:  maximum display width of the result, trim marker included
 *
 * Pass 1 measures the string. If it fits, the (sub)string is returned without
 * re-encoding, unless it contains invalid byte sequences. Otherwise pass 2 builds a
 * new string from as much of the input as fits beside the marker, plus the marker. */
static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, size_t from, size_t width)
{
	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);
	unsigned int state = 0;
	size_t width_left = width;
	size_t to_skip = from;
	size_t out_len = 0;
	size_t marker_width;
	bool on_first_chunk = true, saw_bad_input = false, chunk_ready;
	mb_convert_buf buf;

	/* Pass 1: does everything after the first `from` codepoints fit in `width`? */
	while (in_len) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

		if (out_len <= to_skip) {
			to_skip -= out_len;
		} else {
			for (size_t i = to_skip; i < out_len; i++) {
				uint32_t w = wchar_buf[i];
				size_t w_width = character_width(w);

				saw_bad_input |= (w == MBFL_BAD_INPUT);

				if (width_left < w_width) {
					goto too_wide;
				}
				width_left -= w_width;
			}
			to_skip = 0;
		}
		on_first_chunk = false;
	}

	/* The input string fits in the requested width, so no trim marker is needed.
	 * However, erroneous byte sequences should still come out as error markers.
	 * `mb_get_substr` picks out a substring in the fastest way possible, which may not
	 * involve converting such sequences; so it is only used for clean input */
	if (saw_bad_input) {
		return mb_get_substr_slow((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), from, MBFL_SUBSTR_UNTIL_END, enc);
	}
	if (from == 0) {
		/* Only bumps the refcount; the string data is not duplicated */
		return zend_string_copy(input);
	}
	return mb_get_substr(input, from, MBFL_SUBSTR_UNTIL_END, enc);

too_wide:
	/* The input string is too wide: the result will be some portion of the input with
	 * the trim marker concatenated onto it */
	marker_width = mb_get_strwidth(marker, enc);

	/* The trim marker alone already uses up the requested width */
	if (width <= marker_width) {
		return zend_string_copy(marker);
	}

	/* From here on, 'width' is the amount we may take from 'input' */
	width -= marker_width;
	mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	/* If pass 1 stopped inside the very first chunk, the codepoints in `wchar_buf` are
	 * exactly what pass 2 needs first, so they are reused. Otherwise decoding restarts
	 * from the beginning of the input. */
	chunk_ready = on_first_chunk;
	if (!chunk_ready) {
		in = (unsigned char*)ZSTR_VAL(input);
		in_len = ZSTR_LEN(input);
		state = 0;
	}

	/* Pass 2: copy codepoints to the output until the next one would not fit */
	for (;;) {
		if (chunk_ready) {
			chunk_ready = false;
		} else {
			out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
			ZEND_ASSERT(out_len <= 128);
		}

		if (out_len <= from) {
			/* Still within the codepoints which are to be skipped */
			from -= out_len;
			continue;
		}

		/* Find the first codepoint of this chunk which does not fit any more */
		size_t cut = from;
		while (cut < out_len) {
			size_t w_width = character_width(wchar_buf[cut]);
			if (width < w_width) {
				break;
			}
			width -= w_width;
			cut++;
		}

		if (cut < out_len) {
			/* Output ends here; emit what fits and tell the encoder that this is all */
			enc->from_wchar(wchar_buf + from, cut - from, &buf, true);
			break;
		}

		/* The whole chunk fits; since the string as a whole does not, more must follow */
		ZEND_ASSERT(in_len > 0);
		enc->from_wchar(wchar_buf + from, out_len - from, &buf, false);
		from = 0;
	}

	if (ZSTR_LEN(marker) > 0) {
		MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
		buf.out = zend_mempcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
	}

	/* Even if `enc` is UTF-8, don't mark the output string as valid UTF-8, because
	 * we have no guarantee that the trim marker string is valid UTF-8 */
	return mb_convert_buf_result_raw(&buf);
}
2630 
2631 /* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
PHP_FUNCTION(mb_strimwidth)2632 PHP_FUNCTION(mb_strimwidth)
2633 {
2634 	zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2635 	zend_long from, width;
2636 
2637 	ZEND_PARSE_PARAMETERS_START(3, 5)
2638 		Z_PARAM_STR(str)
2639 		Z_PARAM_LONG(from)
2640 		Z_PARAM_LONG(width)
2641 		Z_PARAM_OPTIONAL
2642 		Z_PARAM_STR(trimmarker)
2643 		Z_PARAM_STR_OR_NULL(encoding)
2644 	ZEND_PARSE_PARAMETERS_END();
2645 
2646 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2647 	if (!enc) {
2648 		RETURN_THROWS();
2649 	}
2650 
2651 	if (from != 0) {
2652 		size_t str_len = mb_get_strlen(str, enc);
2653 		if (from < 0) {
2654 			from += str_len;
2655 		}
2656 		if (from < 0 || from > str_len) {
2657 			zend_argument_value_error(2, "is out of range");
2658 			RETURN_THROWS();
2659 		}
2660 	}
2661 
2662 	if (width < 0) {
2663 		php_error_docref(NULL, E_DEPRECATED,
2664 			"passing a negative integer to argument #3 ($width) is deprecated");
2665 		width += mb_get_strwidth(str, enc);
2666 
2667 		if (from > 0) {
2668 			zend_string *trimmed = mb_get_substr(str, 0, from, enc);
2669 			width -= mb_get_strwidth(trimmed, enc);
2670 			zend_string_free(trimmed);
2671 		}
2672 
2673 		if (width < 0) {
2674 			zend_argument_value_error(3, "is out of range");
2675 			RETURN_THROWS();
2676 		}
2677 	}
2678 
2679 	RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2680 }
2681 
2682 
2683 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2684 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2685 {
2686 	return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2687 			|| (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2688 			|| (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2689 			|| (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2690 }
2691 
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2692 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2693 {
2694 	unsigned int num_errors = 0;
2695 	zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2696 	MBSTRG(illegalchars) += num_errors;
2697 	return result;
2698 }
2699 
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2700 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2701 {
2702 	const mbfl_encoding *from_encoding;
2703 
2704 	/* pre-conversion encoding */
2705 	ZEND_ASSERT(num_from_encodings >= 1);
2706 	if (num_from_encodings == 1) {
2707 		from_encoding = *from_encodings;
2708 	} else {
2709 		/* auto detect */
2710 		from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true);
2711 		if (!from_encoding) {
2712 			php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2713 			return NULL;
2714 		}
2715 	}
2716 
2717 	return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2718 }
2719 
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2720 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2721 {
2722 	HashTable *output, *chash;
2723 	zend_long idx;
2724 	zend_string *key;
2725 	zval *entry, entry_tmp;
2726 
2727 	if (!input) {
2728 		return NULL;
2729 	}
2730 
2731 	if (GC_IS_RECURSIVE(input)) {
2732 		GC_UNPROTECT_RECURSION(input);
2733 		php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2734 		return NULL;
2735 	}
2736 	GC_TRY_PROTECT_RECURSION(input);
2737 	output = zend_new_array(zend_hash_num_elements(input));
2738 	ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2739 		/* convert key */
2740 		if (key) {
2741 			zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2742 			if (!converted_key) {
2743 				continue;
2744 			}
2745 			key = converted_key;
2746 		}
2747 		/* convert value */
2748 		ZEND_ASSERT(entry);
2749 try_again:
2750 		switch(Z_TYPE_P(entry)) {
2751 			case IS_STRING: {
2752 				zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2753 				if (!converted_key) {
2754 					if (key) {
2755 						zend_string_release(key);
2756 					}
2757 					continue;
2758 				}
2759 				ZVAL_STR(&entry_tmp, converted_key);
2760 				break;
2761 			}
2762 			case IS_NULL:
2763 			case IS_TRUE:
2764 			case IS_FALSE:
2765 			case IS_LONG:
2766 			case IS_DOUBLE:
2767 				ZVAL_COPY(&entry_tmp, entry);
2768 				break;
2769 			case IS_ARRAY:
2770 				chash = php_mb_convert_encoding_recursive(
2771 					Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2772 				if (chash) {
2773 					ZVAL_ARR(&entry_tmp, chash);
2774 				} else {
2775 					ZVAL_EMPTY_ARRAY(&entry_tmp);
2776 				}
2777 				break;
2778 			case IS_REFERENCE:
2779 				entry = Z_REFVAL_P(entry);
2780 				goto try_again;
2781 			case IS_OBJECT:
2782 			default:
2783 				if (key) {
2784 					zend_string_release(key);
2785 				}
2786 				php_error_docref(NULL, E_WARNING, "Object is not supported");
2787 				continue;
2788 		}
2789 		if (key) {
2790 			zend_hash_add(output, key, &entry_tmp);
2791 			zend_string_release(key);
2792 		} else {
2793 			zend_hash_index_add(output, idx, &entry_tmp);
2794 		}
2795 	} ZEND_HASH_FOREACH_END();
2796 	GC_TRY_UNPROTECT_RECURSION(input);
2797 
2798 	return output;
2799 }
2800 /* }}} */
2801 
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2802 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2803 {
2804 	/* mbstring supports some 'text encodings' which aren't really text encodings
2805 	 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2806 	 * These should never be returned by `mb_detect_encoding`. */
2807 	unsigned int shift = 0;
2808 	for (unsigned int i = 0; i < *size; i++) {
2809 		const mbfl_encoding *encoding = elist[i];
2810 		if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2811 			shift++; /* Remove this encoding from the list */
2812 		} else if (shift) {
2813 			elist[i - shift] = encoding;
2814 		}
2815 	}
2816 	*size -= shift;
2817 }
2818 
2819 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2820 PHP_FUNCTION(mb_convert_encoding)
2821 {
2822 	zend_string *to_encoding_name;
2823 	zend_string *input_str, *from_encodings_str = NULL;
2824 	HashTable *input_ht, *from_encodings_ht = NULL;
2825 	const mbfl_encoding **from_encodings;
2826 	size_t num_from_encodings;
2827 	bool free_from_encodings = false;
2828 
2829 	ZEND_PARSE_PARAMETERS_START(2, 3)
2830 		Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2831 		Z_PARAM_STR(to_encoding_name)
2832 		Z_PARAM_OPTIONAL
2833 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2834 	ZEND_PARSE_PARAMETERS_END();
2835 
2836 	const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2837 	if (!to_encoding) {
2838 		RETURN_THROWS();
2839 	}
2840 
2841 	if (from_encodings_ht) {
2842 		if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2843 			RETURN_THROWS();
2844 		}
2845 		free_from_encodings = true;
2846 	} else if (from_encodings_str) {
2847 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2848 				&from_encodings, &num_from_encodings,
2849 				/* persistent */ 0, /* arg_num */ 3) == FAILURE) {
2850 			RETURN_THROWS();
2851 		}
2852 		free_from_encodings = true;
2853 	} else {
2854 		from_encodings = &MBSTRG(current_internal_encoding);
2855 		num_from_encodings = 1;
2856 	}
2857 
2858 	if (num_from_encodings > 1) {
2859 		remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2860 	}
2861 
2862 	if (!num_from_encodings) {
2863 		efree(ZEND_VOIDP(from_encodings));
2864 		zend_argument_value_error(3, "must specify at least one encoding");
2865 		RETURN_THROWS();
2866 	}
2867 
2868 	if (input_str) {
2869 		zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2870 		if (ret != NULL) {
2871 			RETVAL_STR(ret);
2872 		} else {
2873 			RETVAL_FALSE;
2874 		}
2875 	} else {
2876 		HashTable *tmp;
2877 		tmp = php_mb_convert_encoding_recursive(
2878 			input_ht, to_encoding, from_encodings, num_from_encodings);
2879 		RETVAL_ARR(tmp);
2880 	}
2881 
2882 	if (free_from_encodings) {
2883 		efree(ZEND_VOIDP(from_encodings));
2884 	}
2885 }
2886 /* }}} */
2887 
mbstring_convert_case(php_case_mode case_mode,const char * str,size_t str_len,const mbfl_encoding * enc)2888 static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
2889 {
2890 	return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2891 }
2892 
PHP_FUNCTION(mb_convert_case)2893 PHP_FUNCTION(mb_convert_case)
2894 {
2895 	zend_string *str, *from_encoding = NULL;
2896 	zend_long case_mode = 0;
2897 
2898 	ZEND_PARSE_PARAMETERS_START(2, 3)
2899 		Z_PARAM_STR(str)
2900 		Z_PARAM_LONG(case_mode)
2901 		Z_PARAM_OPTIONAL
2902 		Z_PARAM_STR_OR_NULL(from_encoding)
2903 	ZEND_PARSE_PARAMETERS_END();
2904 
2905 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2906 	if (!enc) {
2907 		RETURN_THROWS();
2908 	}
2909 
2910 	if (case_mode < 0 || case_mode >= PHP_UNICODE_CASE_MODE_MAX) {
2911 		zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2912 		RETURN_THROWS();
2913 	}
2914 
2915 	RETURN_STR(mbstring_convert_case(case_mode, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2916 }
2917 
PHP_FUNCTION(mb_strtoupper)2918 PHP_FUNCTION(mb_strtoupper)
2919 {
2920 	zend_string *str, *from_encoding = NULL;
2921 
2922 	ZEND_PARSE_PARAMETERS_START(1, 2)
2923 		Z_PARAM_STR(str)
2924 		Z_PARAM_OPTIONAL
2925 		Z_PARAM_STR_OR_NULL(from_encoding)
2926 	ZEND_PARSE_PARAMETERS_END();
2927 
2928 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2929 	if (!enc) {
2930 		RETURN_THROWS();
2931 	}
2932 
2933 	RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_UPPER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2934 }
2935 
PHP_FUNCTION(mb_strtolower)2936 PHP_FUNCTION(mb_strtolower)
2937 {
2938 	zend_string *str, *from_encoding = NULL;
2939 
2940 	ZEND_PARSE_PARAMETERS_START(1, 2)
2941 		Z_PARAM_STR(str)
2942 		Z_PARAM_OPTIONAL
2943 		Z_PARAM_STR_OR_NULL(from_encoding)
2944 	ZEND_PARSE_PARAMETERS_END();
2945 
2946 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2947 	if (!enc) {
2948 		RETURN_THROWS();
2949 	}
2950 
2951 	RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2952 }
2953 
/* Which end(s) of a string to trim. The values are bit flags, so that code can
 * test `mode & MB_LTRIM` / `mode & MB_RTRIM` independently. */
typedef enum {
	MB_LTRIM     = 1 << 0,               /* trim the start of the string */
	MB_RTRIM     = 1 << 1,               /* trim the end of the string */
	MB_BOTH_TRIM = MB_LTRIM | MB_RTRIM   /* trim both ends */
} mb_trim_mode;
2959 
/* Is codepoint `w` one of the characters to be trimmed?
 * If `ht` is non-NULL, `w` is looked up as an integer key in that hash table and
 * `default_chars` is ignored. Otherwise, the `default_chars_length` entries of
 * `default_chars` are searched linearly. */
static bool is_trim_wchar(uint32_t w, const HashTable *ht, const uint32_t *default_chars, size_t default_chars_length)
{
	if (ht != NULL) {
		return zend_hash_index_exists(ht, w);
	}

	size_t pos = 0;
	while (pos < default_chars_length && default_chars[pos] != w) {
		pos++;
	}
	/* We stopped before the end of the list only if a match was found */
	return pos < default_chars_length;
}
2973 
/* Strip trim characters from the start and/or end of `str` (as selected by `mode`)
 * and return the result.
 *
 * The set of trim characters is given either by `what_ht` or, if `what_ht` is NULL,
 * by the `default_chars_length` entries of `default_chars` (see `is_trim_wchar`).
 *
 * The string is decoded in batches of up to 128 codepoints, while counting:
 * - `left`: length of the leading run of trim characters (only counted while the
 *   MB_LTRIM bit is still set in `mode`; the bit is cleared at the first
 *   non-trim character),
 * - `right`: length of the trailing run of trim characters (reset to zero each
 *   time a non-trim character is seen, if MB_RTRIM is set),
 * - `total_len`: total number of decoded codepoints.
 *
 * If there is nothing to trim, `str` itself is returned with its refcount
 * incremented. */
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, const uint32_t *default_chars, size_t default_chars_length, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
	uint32_t wchar_buf[128];
	size_t in_len = ZSTR_LEN(str);
	size_t out_len = 0;
	unsigned int state = 0;
	size_t left = 0;
	size_t right = 0;
	size_t total_len = 0;

	while (in_len) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);
		total_len += out_len;

		for (size_t i = 0; i < out_len; i++) {
			uint32_t w = wchar_buf[i];
			if (is_trim_wchar(w, what_ht, default_chars, default_chars_length)) {
				if (mode & MB_LTRIM) {
					left += 1;
				}
				if (mode & MB_RTRIM) {
					right += 1;
				}
			} else {
				/* The leading run of trim characters has ended */
				mode &= ~MB_LTRIM;
				if (mode & MB_RTRIM) {
					right = 0;
				}
			}
		}
	}

	if (left == 0 && right == 0) {
		return zend_string_copy(str);
	}

	/* If the string consists entirely of trim characters, nothing remains.
	 * When trimming both ends, every character has been counted in both `left`
	 * and `right`, so `total_len - (right + left)` would wrap around to a huge
	 * unsigned value; don't compute a substring length in that case. */
	if (left + right >= total_len) {
		return zend_empty_string;
	}

	return mb_get_substr(str, left, total_len - (right + left), enc);
}
3013 
/* Trim `str` using the built-in list of trim codepoints (used when the caller did
 * not supply a list of characters to trim). A temporary hash table keyed by
 * codepoint is built for the duration of the call. */
static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
{
	static const uint32_t default_trim_codepoints[] = {
		0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
		0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
		0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
		0x85, 0x180E
	};
	const size_t num_codepoints = sizeof(default_trim_codepoints) / sizeof(default_trim_codepoints[0]);

	/* Only the presence of a key matters; every key maps to the same dummy value */
	zval dummy;
	ZVAL_TRUE(&dummy);

	HashTable codepoint_set;
	zend_hash_init(&codepoint_set, num_codepoints, NULL, NULL, false);

	size_t i = 0;
	while (i < num_codepoints) {
		zend_hash_index_add_new(&codepoint_set, default_trim_codepoints[i], &dummy);
		i++;
	}

	zend_string *trimmed = trim_each_wchar(str, &codepoint_set, NULL, 0, mode, enc);
	zend_hash_destroy(&codepoint_set);

	return trimmed;
}
3038 
/* Trim `str`, treating each codepoint of `what` (decoded using `enc`) as a
 * character to be trimmed.
 *
 * - If `what` is empty, `str` is returned unchanged (refcount incremented).
 * - If all of `what` decodes to at most 4 codepoints, they are passed to
 *   `trim_each_wchar` as a plain array, which is searched linearly.
 * - Otherwise, a temporary hash table keyed by codepoint is built and used. */
static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *what_in = (unsigned char*)ZSTR_VAL(what);
	uint32_t what_wchar_buf[128];
	size_t what_out_len = 0;
	unsigned int state = 0;
	size_t what_len = ZSTR_LEN(what);
	HashTable what_ht;
	zval val;
	bool hash_initialized = false;

	while (what_len) {
		what_out_len = enc->to_wchar(&what_in, &what_len, what_wchar_buf, 128, &state);
		ZEND_ASSERT(what_out_len <= 128);

		if (!hash_initialized) {
			/* Take the array shortcut only if the whole of `what` has been decoded;
			 * otherwise the characters not decoded yet would be silently ignored */
			if (what_out_len <= 4 && what_len == 0) {
				return trim_each_wchar(str, NULL, what_wchar_buf, what_out_len, mode, enc);
			}

			hash_initialized = true;
			ZVAL_TRUE(&val);
			/* Size hint: codepoints decoded so far, plus the number of bytes still
			 * to be decoded. `what_len` alone is the *remaining* byte count, which
			 * is zero whenever `what` was fully decoded by the first batch. */
			zend_hash_init(&what_ht, what_out_len + what_len, NULL, NULL, false);
		}

		/* A repeated codepoint makes the add fail; that is harmless here */
		for (size_t i = 0; i < what_out_len; i++) {
			zend_hash_index_add(&what_ht, what_wchar_buf[i], &val);
		}
	}

	if (UNEXPECTED(!hash_initialized)) {
		/* This is only possible if what is empty */
		return zend_string_copy(str);
	}

	zend_string *retval = trim_each_wchar(str, &what_ht, NULL, 0, mode, enc);
	zend_hash_destroy(&what_ht);

	return retval;
}
3078 
/* Shared implementation of mb_trim, mb_ltrim and mb_rtrim; `mode` selects which
 * end(s) of the string are trimmed.
 * Arguments: string, optional string of characters to trim, optional encoding
 * name. If the characters to trim are not given, the built-in default list is
 * used. */
static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
{
	zend_string *str;
	zend_string *what = NULL;
	zend_string *encoding = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 3)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR(what)
		Z_PARAM_STR_OR_NULL(encoding)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	zend_string *trimmed = (what != NULL)
		? mb_trim_what_chars(str, what, mode, enc)
		: mb_trim_default_chars(str, mode, enc);
	RETURN_STR(trimmed);
}
3103 
PHP_FUNCTION(mb_trim)3104 PHP_FUNCTION(mb_trim)
3105 {
3106 	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_BOTH_TRIM);
3107 }
3108 
PHP_FUNCTION(mb_ltrim)3109 PHP_FUNCTION(mb_ltrim)
3110 {
3111 	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_LTRIM);
3112 }
3113 
PHP_FUNCTION(mb_rtrim)3114 PHP_FUNCTION(mb_rtrim)
3115 {
3116 	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
3117 }
3118 
duplicate_elist(const mbfl_encoding ** elist,size_t size)3119 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
3120 {
3121 	const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
3122 	memcpy(ZEND_VOIDP(new_elist), elist, size * sizeof(mbfl_encoding*));
3123 	return new_elist;
3124 }
3125 
estimate_demerits(uint32_t w)3126 static unsigned int estimate_demerits(uint32_t w)
3127 {
3128 	/* Receive wchars decoded from input string using candidate encoding.
3129 	 * Give the candidate many 'demerits' for each 'rare' codepoint found,
3130 	 * a smaller number for each ASCII punctuation character, and 1 for
3131 	 * all other codepoints.
3132 	 *
3133 	 * The 'common' codepoints should cover the vast majority of
3134 	 * codepoints we are likely to see in practice, while only covering
3135 	 * a small minority of the entire Unicode encoding space. Why?
3136 	 * Well, if the test string happens to be valid in an incorrect
3137 	 * candidate encoding, the bogus codepoints which it decodes to will
3138 	 * be more or less random. By treating the majority of codepoints as
3139 	 * 'rare', we ensure that in almost all such cases, the bogus
3140 	 * codepoints will include plenty of 'rares', thus giving the
3141 	 * incorrect candidate encoding lots of demerits. See
3142 	 * common_codepoints.txt for the actual list used.
3143 	 *
3144 	 * So, why give extra demerits for ASCII punctuation characters? It's
3145 	 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
3146 	 * which deliberately only use bytes in the ASCII range. When
3147 	 * misinterpreted as ASCII/UTF-8, strings in these encodings will
3148 	 * have an unusually high number of ASCII punctuation characters.
3149 	 * So giving extra demerits for such characters will improve
3150 	 * detection accuracy for UTF-7 and similar encodings.
3151 	 *
3152 	 * Finally, why 1 demerit for all other characters? That penalizes
3153 	 * long strings, meaning we will tend to choose a candidate encoding
3154 	 * in which the test string decodes to a smaller number of
3155 	 * codepoints. That prevents single-byte encodings in which almost
3156 	 * every possible input byte decodes to a 'common' codepoint from
3157 	 * being favored too much. */
3158 	if (w > 0xFFFF) {
3159 		return 40;
3160 	} else if (w >= 0x21 && w <= 0x2F) {
3161 		return 6;
3162 	} else if ((rare_codepoint_bitvec[w >> 5] >> (w & 0x1F)) & 1) {
3163 		return 30;
3164 	} else {
3165 		return 1;
3166 	}
3167 	return 0;
3168 }
3169 
3170 struct candidate {
3171 	const mbfl_encoding *enc;
3172 	const unsigned char *in;
3173 	size_t in_len;
3174 	uint64_t demerits; /* Wide bit size to prevent overflow */
3175 	unsigned int state;
3176 	float multiplier;
3177 };
3178 
init_candidate_array(struct candidate * array,size_t length,const mbfl_encoding ** encodings,const unsigned char ** in,size_t * in_len,size_t n,bool strict,bool order_significant)3179 static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant)
3180 {
3181 	size_t j = 0;
3182 
3183 	for (size_t i = 0; i < length; i++) {
3184 		const mbfl_encoding *enc = encodings[i];
3185 
3186 		array[j].enc = enc;
3187 		array[j].state = 0;
3188 		array[j].demerits = 0;
3189 
3190 		/* If any candidate encodings have specialized validation functions, use them
3191 		 * to eliminate as many candidates as possible */
3192 		if (enc->check != NULL) {
3193 			for (size_t k = 0; k < n; k++) {
3194 				if (!enc->check((unsigned char*)in[k], in_len[k])) {
3195 					if (strict) {
3196 						goto skip_to_next;
3197 					} else {
3198 						array[j].demerits += 500;
3199 					}
3200 				}
3201 			}
3202 		}
3203 
3204 		/* This multiplier can optionally be used to make candidate encodings listed
3205 		 * first more likely to be chosen. It is a weight factor which multiplies
3206 		 * the number of demerits counted for each candidate. */
3207 		array[j].multiplier = order_significant ? 1.0 + ((0.3 * i) / length) : 1.0;
3208 		j++;
3209 skip_to_next: ;
3210 	}
3211 
3212 	return j;
3213 }
3214 
start_string(struct candidate * array,size_t length,const unsigned char * in,size_t in_len)3215 static void start_string(struct candidate *array, size_t length, const unsigned char *in, size_t in_len)
3216 {
3217 	for (size_t i = 0; i < length; i++) {
3218 		const mbfl_encoding *enc = array[i].enc;
3219 
3220 		array[i].in = in;
3221 		array[i].in_len = in_len;
3222 
3223 		/* Skip byte order mark for UTF-8, UTF-16BE, or UTF-16LE */
3224 		if (enc == &mbfl_encoding_utf8) {
3225 			if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
3226 				array[i].in_len -= 3;
3227 				array[i].in += 3;
3228 			}
3229 		} else if (enc == &mbfl_encoding_utf16be) {
3230 			if (in_len >= 2 && in[0] == 0xFE && in[1] == 0xFF) {
3231 				array[i].in_len -= 2;
3232 				array[i].in += 2;
3233 			}
3234 		} else if (enc == &mbfl_encoding_utf16le) {
3235 			if (in_len >= 2 && in[0] == 0xFF && in[1] == 0xFE) {
3236 				array[i].in_len -= 2;
3237 				array[i].in += 2;
3238 			}
3239 		}
3240 	}
3241 }
3242 
count_demerits(struct candidate * array,size_t length,bool strict)3243 static size_t count_demerits(struct candidate *array, size_t length, bool strict)
3244 {
3245 	uint32_t wchar_buf[128];
3246 	unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */
3247 
3248 	for (size_t i = 0; i < length; i++) {
3249 		if (array[i].in_len == 0) {
3250 			finished++;
3251 		}
3252 	}
3253 
3254 	while ((strict || length > 1) && finished < length) {
3255 		/* Iterate in reverse order to avoid moving candidates that can be eliminated. */
3256 		for (size_t i = length - 1; i != (size_t)-1; i--) {
3257 			/* Do we still have more input to process for this candidate encoding? */
3258 			if (array[i].in_len) {
3259 				const mbfl_encoding *enc = array[i].enc;
3260 				size_t out_len = enc->to_wchar((unsigned char**)&array[i].in, &array[i].in_len, wchar_buf, 128, &array[i].state);
3261 				ZEND_ASSERT(out_len <= 128);
3262 				/* Check this batch of decoded codepoints; are there any error markers?
3263 				 * Also sum up the number of demerits */
3264 				while (out_len) {
3265 					uint32_t w = wchar_buf[--out_len];
3266 					if (w == MBFL_BAD_INPUT) {
3267 						if (strict) {
3268 							/* This candidate encoding is not valid, eliminate it from consideration */
3269 							length--;
3270 							if (i < length) {
3271 								/* The eliminated candidate was the last valid one in the list */
3272 								memmove(&array[i], &array[i+1], (length - i) * sizeof(struct candidate));
3273 							}
3274 							goto try_next_encoding;
3275 						} else {
3276 							array[i].demerits += 1000;
3277 						}
3278 					} else {
3279 						array[i].demerits += estimate_demerits(w);
3280 					}
3281 				}
3282 				if (array[i].in_len == 0) {
3283 					finished++;
3284 				}
3285 			}
3286 try_next_encoding:;
3287 		}
3288 	}
3289 
3290 	for (size_t i = 0; i < length; i++) {
3291 		array[i].demerits *= array[i].multiplier;
3292 	}
3293 
3294 	return length;
3295 }
3296 
mb_guess_encoding_for_strings(const unsigned char ** strings,size_t * str_lengths,size_t n,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3297 MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3298 {
3299 	if (elist_size == 0) {
3300 		return NULL;
3301 	}
3302 	if (elist_size == 1) {
3303 		if (strict) {
3304 			while (n--) {
3305 				if (!php_mb_check_encoding((const char*)strings[n], str_lengths[n], *elist)) {
3306 					return NULL;
3307 				}
3308 			}
3309 		}
3310 		return *elist;
3311 	}
3312 	if (n == 1 && *str_lengths == 0) {
3313 		return *elist;
3314 	}
3315 
3316 	/* Allocate on stack; when we return, this array is automatically freed */
3317 	struct candidate *array = alloca(elist_size * sizeof(struct candidate));
3318 	elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant);
3319 
3320 	while (n--) {
3321 		start_string(array, elist_size, strings[n], str_lengths[n]);
3322 		elist_size = count_demerits(array, elist_size, strict);
3323 		if (elist_size == 0) {
3324 			/* All candidates were eliminated */
3325 			return NULL;
3326 		}
3327 	}
3328 
3329 	/* See which remaining candidate encoding has the least demerits */
3330 	unsigned int best = 0;
3331 	for (unsigned int i = 1; i < elist_size; i++) {
3332 		if (array[i].demerits < array[best].demerits) {
3333 			best = i;
3334 		}
3335 	}
3336 	return array[best].enc;
3337 }
3338 
3339 /* When doing 'strict' detection, any string which is invalid in the candidate encoding
3340  * is rejected. With non-strict detection, we just continue, but apply demerits for
3341  * each invalid byte sequence */
mb_guess_encoding(unsigned char * in,size_t in_len,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3342 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3343 {
3344 	return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant);
3345 }
3346 
3347 /* {{{ Encodings of the given string is returned (as a string) */
PHP_FUNCTION(mb_detect_encoding)3348 PHP_FUNCTION(mb_detect_encoding)
3349 {
3350 	zend_string *str, *encoding_str = NULL;
3351 	HashTable *encoding_ht = NULL;
3352 	bool strict = false;
3353 	const mbfl_encoding *ret, **elist;
3354 	size_t size;
3355 
3356 	ZEND_PARSE_PARAMETERS_START(1, 3)
3357 		Z_PARAM_STR(str)
3358 		Z_PARAM_OPTIONAL
3359 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
3360 		Z_PARAM_BOOL(strict)
3361 	ZEND_PARSE_PARAMETERS_END();
3362 
3363 	/* Should we pay attention to the order of the provided candidate encodings and prefer
3364 	 * the earlier ones (if more than one candidate encoding matches)?
3365 	 * If the entire list of supported encodings returned by `mb_list_encodings` is passed
3366 	 * in, then don't treat the order as significant */
3367 	bool order_significant = true;
3368 
3369 	/* make encoding list */
3370 	if (encoding_ht) {
3371 		if (encoding_ht == MBSTRG(all_encodings_list)) {
3372 			order_significant = false;
3373 		}
3374 		if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
3375 			RETURN_THROWS();
3376 		}
3377 	} else if (encoding_str) {
3378 		if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2)) {
3379 			RETURN_THROWS();
3380 		}
3381 	} else {
3382 		elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
3383 		size = MBSTRG(current_detect_order_list_size);
3384 	}
3385 
3386 	if (size == 0) {
3387 		efree(ZEND_VOIDP(elist));
3388 		zend_argument_value_error(2, "must specify at least one encoding");
3389 		RETURN_THROWS();
3390 	}
3391 
3392 	remove_non_encodings_from_elist(elist, &size);
3393 	if (size == 0) {
3394 		efree(ZEND_VOIDP(elist));
3395 		RETURN_FALSE;
3396 	}
3397 
3398 	if (ZEND_NUM_ARGS() < 3) {
3399 		strict = MBSTRG(strict_detection);
3400 	}
3401 
3402 	if (size == 1 && *elist == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
3403 		ret = &mbfl_encoding_utf8;
3404 	} else {
3405 		ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant);
3406 	}
3407 
3408 	efree(ZEND_VOIDP(elist));
3409 
3410 	if (ret == NULL) {
3411 		RETURN_FALSE;
3412 	}
3413 
3414 	RETVAL_STRING((char *)ret->name);
3415 }
3416 /* }}} */
3417 
3418 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)3419 PHP_FUNCTION(mb_list_encodings)
3420 {
3421 	ZEND_PARSE_PARAMETERS_NONE();
3422 
3423 	if (MBSTRG(all_encodings_list) == NULL) {
3424 		/* Initialize shared array of supported encoding names
3425 		 * This is done so that we can check if `mb_list_encodings()` is being
3426 		 * passed to other mbstring functions using a cheap pointer equality check */
3427 		HashTable *array = emalloc(sizeof(HashTable));
3428 		zend_hash_init(array, 80, NULL, zval_ptr_dtor_str, false);
3429 		for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
3430 			zval tmp;
3431 			ZVAL_STRING(&tmp, (*encodings)->name);
3432 			zend_hash_next_index_insert(array, &tmp);
3433 		}
3434 		MBSTRG(all_encodings_list) = array;
3435 	}
3436 
3437 	GC_ADDREF(MBSTRG(all_encodings_list));
3438 	RETURN_ARR(MBSTRG(all_encodings_list));
3439 }
3440 /* }}} */
3441 
3442 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)3443 PHP_FUNCTION(mb_encoding_aliases)
3444 {
3445 	const mbfl_encoding *encoding;
3446 	zend_string *encoding_name = NULL;
3447 
3448 	ZEND_PARSE_PARAMETERS_START(1, 1)
3449 		Z_PARAM_STR(encoding_name)
3450 	ZEND_PARSE_PARAMETERS_END();
3451 
3452 	encoding = php_mb_get_encoding(encoding_name, 1);
3453 	if (!encoding) {
3454 		RETURN_THROWS();
3455 	}
3456 
3457 	array_init(return_value);
3458 	if (encoding->aliases != NULL) {
3459 		for (const char **alias = encoding->aliases; *alias; ++alias) {
3460 			add_next_index_string(return_value, (char *)*alias);
3461 		}
3462 	}
3463 }
3464 /* }}} */
3465 
/* Perform kana conversion (as selected by the `mode` flags) on `input`, which is
 * both decoded from and re-encoded to `encoding`.
 *
 * The input is decoded in batches of up to 64 codepoints. Each codepoint is passed
 * to `mb_convert_kana_codepoint` together with the codepoint which follows it; the
 * conversion may consume that following codepoint as well, and may emit a second
 * output codepoint. Since the final codepoint of a batch has no successor yet, it
 * is carried over to the start of the next batch, unless it was already consumed
 * or there is no more input. */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion...
	 * if we are converting zenkaku kana to hankaku kana
	 * Make the buffer for converted kana big enough that we never need to
	 * perform bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	unsigned int carried = 0; /* Number of codepoints (0 or 1) held over in wchar_buf[0] */
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		uint32_t *converted = converted_buf;

		/* If one codepoint has been left in wchar_buf[0] to be reprocessed from the
		 * previous iteration, don't overwrite it */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + carried, 64 - carried, &state);
		out_len += carried;
		ZEND_ASSERT(out_len <= 64);

		if (!out_len) {
			continue;
		}

		const size_t last = out_len - 1;
		bool last_consumed = false;

		for (size_t i = 0; i < last; i++) {
			uint32_t second = 0;
			bool consumed = false;

			*converted++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
			if (second) {
				*converted++ = second;
			}
			if (consumed) {
				/* The following codepoint was used up as well; step over it */
				i++;
				if (i == last) {
					last_consumed = true;
					break;
				}
			}
		}

		if (last_consumed) {
			/* We consumed two codepoints at the very end of the wchar buffer
			 * So there is nothing remaining to reprocess on the next iteration */
			carried = 0;
		} else if (!in_len) {
			/* This is the last iteration, so we need to process the final codepoint now */
			uint32_t second = 0;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[last], 0, NULL, &second, mode);
			if (second) {
				*converted++ = second;
			}
		} else {
			/* Reprocess the last codepoint on the next iteration */
			wchar_buf[0] = wchar_buf[last];
			carried = 1;
		}

		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3530 
/* Flag characters understood by mb_convert_kana().
 * The flag at index `i` selects bit `1 << i` of the option bit vector.
 * When mb_convert_kana() reports a conflicting pair of flags, it names the
 * entries at index `i` and `i + 8` (the upper- and lower-case form of the
 * same letter). */
char mb_convert_kana_flags[17] = {
	'A', 'R', 'N', 'S', 'K', 'H', 'M', 'C',
	'a', 'r', 'n', 's', 'k', 'h', 'm', 'c',
	'V'
};
3536 
3537 /* Conversion between full-width characters and half-width characters (Japanese) */
PHP_FUNCTION(mb_convert_kana)3538 PHP_FUNCTION(mb_convert_kana)
3539 {
3540 	unsigned int opt;
3541 	char *optstr = NULL;
3542 	size_t optstr_len;
3543 	zend_string *encname = NULL, *str;
3544 
3545 	ZEND_PARSE_PARAMETERS_START(1, 3)
3546 		Z_PARAM_STR(str)
3547 		Z_PARAM_OPTIONAL
3548 		Z_PARAM_STRING(optstr, optstr_len)
3549 		Z_PARAM_STR_OR_NULL(encname)
3550 	ZEND_PARSE_PARAMETERS_END();
3551 
3552 	if (optstr != NULL) {
3553 		char *p = optstr, *e = p + optstr_len;
3554 		opt = 0;
3555 next_option:
3556 		while (p < e) {
3557 			/* Walk through option string and convert to bit vector
3558 			 * See translit_kana_jisx0201_jisx0208.h for the values used */
3559 			char c = *p++;
3560 			if (c == 'A') {
3561 				opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
3562 			} else if (c == 'a') {
3563 				opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
3564 			} else {
3565 				for (size_t i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
3566 					if (c == mb_convert_kana_flags[i]) {
3567 						opt |= (1 << i);
3568 						goto next_option;
3569 					}
3570 				}
3571 
3572 				zend_argument_value_error(2, "contains invalid flag: '%c'", c);
3573 				RETURN_THROWS();
3574 			}
3575 		}
3576 
3577 		/* Check for illegal combinations of options */
3578 		if (((opt & 0xFF00) >> 8) & opt) {
3579 			/* It doesn't make sense to convert the same type of characters from halfwidth to
3580 			 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
3581 			 * FW hiragana to FW katakana and then back again. */
3582 			int badflag = ((opt & 0xFF00) >> 8) & opt, i;
3583 			for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
3584 			char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
3585 			if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
3586 				flag1 = 'A';
3587 			if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
3588 				flag2 = 'a';
3589 			zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
3590 			RETURN_THROWS();
3591 		}
3592 
3593 		if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
3594 			/* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
3595 			zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
3596 			RETURN_THROWS();
3597 		}
3598 
3599 		/* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
3600 		 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
3601 		 * more than one of these */
3602 		if (opt & MBFL_ZEN2HAN_HIRAGANA) {
3603 			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3604 				zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
3605 				RETURN_THROWS();
3606 			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3607 				zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
3608 				RETURN_THROWS();
3609 			}
3610 		} else if (opt & MBFL_ZEN2HAN_KATAKANA) {
3611 			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3612 				zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
3613 				RETURN_THROWS();
3614 			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3615 				zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
3616 				RETURN_THROWS();
3617 			}
3618 		}
3619 	} else {
3620 		opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
3621 	}
3622 
3623 	const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
3624 	if (!enc) {
3625 		RETURN_THROWS();
3626 	}
3627 
3628 	RETVAL_STR(jp_kana_convert(str, enc, opt));
3629 }
3630 
/* Count the string values reachable from `var`, descending into arrays and
 * objects. A container which is already marked as being visited contributes
 * nothing. */
static unsigned int mb_recursive_count_strings(zval *var)
{
	ZVAL_DEREF(var);

	switch (Z_TYPE_P(var)) {
		case IS_STRING:
			return 1;
		case IS_ARRAY:
		case IS_OBJECT:
			break;
		default:
			/* Other types hold no strings */
			return 0;
	}

	/* Mark refcounted containers while we are inside them */
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	unsigned int total = 0;
	HashTable *children = HASH_OF(var);
	if (children != NULL) {
		zval *child;
		ZEND_HASH_FOREACH_VAL_IND(children, child) {
			total += mb_recursive_count_strings(child);
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return total;
}
3661 
mb_recursive_find_strings(zval * var,const unsigned char ** val_list,size_t * len_list,unsigned int * count)3662 static bool mb_recursive_find_strings(zval *var, const unsigned char **val_list, size_t *len_list, unsigned int *count)
3663 {
3664 	ZVAL_DEREF(var);
3665 
3666 	if (Z_TYPE_P(var) == IS_STRING) {
3667 		val_list[*count] = (const unsigned char*)Z_STRVAL_P(var);
3668 		len_list[*count] = Z_STRLEN_P(var);
3669 		(*count)++;
3670 	} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
3671 		if (Z_REFCOUNTED_P(var)) {
3672 			if (Z_IS_RECURSIVE_P(var)) {
3673 				return true;
3674 			}
3675 			Z_PROTECT_RECURSION_P(var);
3676 		}
3677 
3678 		HashTable *ht = HASH_OF(var);
3679 		if (ht != NULL) {
3680 			zval *entry;
3681 			ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
3682 				if (mb_recursive_find_strings(entry, val_list, len_list, count)) {
3683 					if (Z_REFCOUNTED_P(var)) {
3684 						Z_UNPROTECT_RECURSION_P(var);
3685 						return true;
3686 					}
3687 				}
3688 			} ZEND_HASH_FOREACH_END();
3689 		}
3690 
3691 		if (Z_REFCOUNTED_P(var)) {
3692 			Z_UNPROTECT_RECURSION_P(var);
3693 		}
3694 	}
3695 
3696 	return false;
3697 }
3698 
mb_recursive_convert_variable(zval * var,const mbfl_encoding * from_encoding,const mbfl_encoding * to_encoding)3699 static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
3700 {
3701 	zval *entry, *orig_var;
3702 
3703 	orig_var = var;
3704 	ZVAL_DEREF(var);
3705 
3706 	if (Z_TYPE_P(var) == IS_STRING) {
3707 		zend_string *ret = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
3708 		zval_ptr_dtor(orig_var);
3709 		ZVAL_STR(orig_var, ret);
3710 	} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
3711 		if (Z_TYPE_P(var) == IS_ARRAY) {
3712 			SEPARATE_ARRAY(var);
3713 		}
3714 		if (Z_REFCOUNTED_P(var)) {
3715 			if (Z_IS_RECURSIVE_P(var)) {
3716 				return true;
3717 			}
3718 			Z_PROTECT_RECURSION_P(var);
3719 		}
3720 
3721 		HashTable *ht = HASH_OF(var);
3722 		if (ht != NULL) {
3723 			ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
3724 				if (mb_recursive_convert_variable(entry, from_encoding, to_encoding)) {
3725 					if (Z_REFCOUNTED_P(var)) {
3726 						Z_UNPROTECT_RECURSION_P(var);
3727 					}
3728 					return true;
3729 				}
3730 			} ZEND_HASH_FOREACH_END();
3731 		}
3732 
3733 		if (Z_REFCOUNTED_P(var)) {
3734 			Z_UNPROTECT_RECURSION_P(var);
3735 		}
3736 	}
3737 
3738 	return false;
3739 }
3740 
PHP_FUNCTION(mb_convert_variables)3741 PHP_FUNCTION(mb_convert_variables)
3742 {
3743 	zval *args;
3744 	zend_string *to_enc_str;
3745 	zend_string *from_enc_str;
3746 	HashTable *from_enc_ht;
3747 	const mbfl_encoding *from_encoding, *to_encoding;
3748 	uint32_t argc;
3749 	size_t elistsz;
3750 	const mbfl_encoding **elist;
3751 
3752 	ZEND_PARSE_PARAMETERS_START(3, -1)
3753 		Z_PARAM_STR(to_enc_str)
3754 		Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3755 		Z_PARAM_VARIADIC('+', args, argc)
3756 	ZEND_PARSE_PARAMETERS_END();
3757 
3758 	/* new encoding */
3759 	to_encoding = php_mb_get_encoding(to_enc_str, 1);
3760 	if (!to_encoding) {
3761 		RETURN_THROWS();
3762 	}
3763 
3764 	from_encoding = MBSTRG(current_internal_encoding);
3765 
3766 	bool order_significant = true;
3767 
3768 	/* pre-conversion encoding */
3769 	if (from_enc_ht) {
3770 		if (from_enc_ht == MBSTRG(all_encodings_list)) {
3771 			/* If entire list of supported encodings returned by `mb_list_encodings` is passed
3772 			 * in, then don't treat the order of the list as significant */
3773 			order_significant = false;
3774 		}
3775 		if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3776 			RETURN_THROWS();
3777 		}
3778 	} else {
3779 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2) == FAILURE) {
3780 			RETURN_THROWS();
3781 		}
3782 	}
3783 
3784 	if (elistsz == 0) {
3785 		efree(ZEND_VOIDP(elist));
3786 		zend_argument_value_error(2, "must specify at least one encoding");
3787 		RETURN_THROWS();
3788 	}
3789 
3790 	if (elistsz == 1) {
3791 		from_encoding = *elist;
3792 	} else {
3793 		/* auto detect */
3794 		unsigned int num = 0;
3795 		for (size_t n = 0; n < argc; n++) {
3796 			zval *zv = &args[n];
3797 			num += mb_recursive_count_strings(zv);
3798 		}
3799 		const unsigned char **val_list = (const unsigned char**)ecalloc(num, sizeof(char *));
3800 		size_t *len_list = (size_t*)ecalloc(num, sizeof(size_t));
3801 		unsigned int i = 0;
3802 		for (size_t n = 0; n < argc; n++) {
3803 			zval *zv = &args[n];
3804 			if (mb_recursive_find_strings(zv, val_list, len_list, &i)) {
3805 				efree(ZEND_VOIDP(elist));
3806 				efree(ZEND_VOIDP(val_list));
3807 				efree(len_list);
3808 				php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3809 				RETURN_FALSE;
3810 			}
3811 		}
3812 		from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant);
3813 		efree(ZEND_VOIDP(val_list));
3814 		efree(len_list);
3815 		if (!from_encoding) {
3816 			php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3817 			efree(ZEND_VOIDP(elist));
3818 			RETURN_FALSE;
3819 		}
3820 
3821 	}
3822 
3823 	efree(ZEND_VOIDP(elist));
3824 
3825 	/* convert */
3826 	for (size_t n = 0; n < argc; n++) {
3827 		zval *zv = &args[n];
3828 		ZVAL_DEREF(zv);
3829 		if (mb_recursive_convert_variable(zv, from_encoding, to_encoding)) {
3830 			php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3831 			RETURN_FALSE;
3832 		}
3833 	}
3834 
3835 	RETURN_STRING(from_encoding->name);
3836 }
3837 
3838 /* HTML numeric entities */
3839 
3840 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
make_conversion_map(HashTable * target_hash,size_t * conversion_map_size)3841 static uint32_t *make_conversion_map(HashTable *target_hash, size_t *conversion_map_size)
3842 {
3843 	zval *hash_entry;
3844 
3845 	size_t n_elems = *conversion_map_size = zend_hash_num_elements(target_hash);
3846 	if (n_elems % 4 != 0) {
3847 		zend_argument_value_error(2, "must have a multiple of 4 elements");
3848 		return NULL;
3849 	}
3850 
3851 	uint32_t *convmap = (uint32_t*)safe_emalloc(n_elems, sizeof(uint32_t), 0);
3852 	uint32_t *mapelm = convmap;
3853 
3854 	ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
3855 		bool failed = true;
3856 		zend_long tmp = zval_try_get_long(hash_entry, &failed);
3857 		if (failed) {
3858 			efree(convmap);
3859 			zend_argument_value_error(2, "must only be composed of values of type int");
3860 			return NULL;
3861 		}
3862 		*mapelm++ = tmp;
3863 	} ZEND_HASH_FOREACH_END();
3864 
3865 	return convmap;
3866 }
3867 
/* Decide whether codepoint `w` should be written as an HTML numeric entity.
 *
 * `convmap` holds `conversion_map_size` values, read in groups of four:
 * { low, high, offset, mask }. The first group with low <= w <= high wins;
 * `(w + offset) & mask` is stored in `*retval` and true is returned.
 * When no group matches, `*retval` is left alone and false is returned. */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t idx = 0; idx < conversion_map_size; idx += 4) {
		const uint32_t low    = convmap[idx];
		const uint32_t high   = convmap[idx + 1];
		const uint32_t offset = convmap[idx + 2];
		const uint32_t mask   = convmap[idx + 3];

		if (w < low || w > high) {
			/* Outside this range; try the next one */
			continue;
		}

		*retval = (w + offset) & mask;
		return true;
	}

	return false;
}
3889 
/* Return a copy of `input` in which every codepoint selected by `convmap` is
 * replaced with an HTML numeric entity ("&#NNN;", or "&#xHHH;" when `hex` is
 * true). The input is decoded and the output re-encoded using `encoding`. */
static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size, bool hex)
{
	/* One decoded wchar may grow into as many as 13 wchars once it is
	 * written as an entity, so size the output buffer for the worst case */
	uint32_t decoded[32], expanded[32 * 13];
	/* Scratch space in which the digits are produced back to front */
	unsigned char digits[16];
	unsigned char *const digits_end = digits + sizeof(digits);
	const uint32_t base = hex ? 16 : 10;

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		/* Decode the next batch of at most 32 wchars */
		size_t n_decoded = encoding->to_wchar(&in, &in_len, decoded, 32, &state);
		ZEND_ASSERT(n_decoded <= 32);
		uint32_t *out = expanded;

		for (const uint32_t *src = decoded; src < decoded + n_decoded; src++) {
			uint32_t code = *src;

			if (!html_numeric_entity_convert(code, convmap, conversion_map_size, &code)) {
				/* Not in any of the ranges; pass through untouched */
				*out++ = code;
				continue;
			}

			*out++ = '&';
			*out++ = '#';
			if (hex) {
				*out++ = 'x';
			}

			/* Produce digits from least to most significant, then copy them
			 * out in reading order. Zero yields the single digit '0'. */
			unsigned char *digit = digits_end;
			do {
				*(--digit) = "0123456789ABCDEF"[code % base];
				code /= base;
			} while (code > 0);
			while (digit < digits_end) {
				*out++ = *digit++;
			}

			*out++ = ';';
		}

		ZEND_ASSERT(out <= expanded + sizeof(expanded)/sizeof(*expanded));
		encoding->from_wchar(expanded, out - expanded, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3955 
3956 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)3957 PHP_FUNCTION(mb_encode_numericentity)
3958 {
3959 	zend_string *encoding = NULL, *str;
3960 	size_t conversion_map_size;
3961 	HashTable *target_hash;
3962 	bool is_hex = false;
3963 
3964 	ZEND_PARSE_PARAMETERS_START(2, 4)
3965 		Z_PARAM_STR(str)
3966 		Z_PARAM_ARRAY_HT(target_hash)
3967 		Z_PARAM_OPTIONAL
3968 		Z_PARAM_STR_OR_NULL(encoding)
3969 		Z_PARAM_BOOL(is_hex)
3970 	ZEND_PARSE_PARAMETERS_END();
3971 
3972 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3973 	if (!enc) {
3974 		RETURN_THROWS();
3975 	}
3976 
3977 	uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
3978 	if (convmap == NULL) {
3979 		RETURN_THROWS();
3980 	}
3981 
3982 	RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, conversion_map_size, is_hex));
3983 	efree(convmap);
3984 }
3985 /* }}} */
3986 
/* Map the numeric value of an HTML entity back to a codepoint.
 *
 * `convmap` holds `conversion_map_size` values, read in groups of four:
 * { low, high, offset, mask }; the mask is not used here. For the first group
 * where `number - offset` (unsigned arithmetic) lies within [low, high], that
 * difference is stored in `*retval` and true is returned. Otherwise `*retval`
 * is left alone and false is returned. */
static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t idx = 0; idx < conversion_map_size; idx += 4) {
		const uint32_t low       = convmap[idx];
		const uint32_t high      = convmap[idx + 1];
		const uint32_t candidate = number - convmap[idx + 2];

		if (candidate < low || candidate > high) {
			continue;
		}

		*retval = candidate;
		return true;
	}

	return false;
}
4004 
/* Length limits, in wchars, used by html_numeric_entity_decode() to accept or
 * reject a candidate entity. The length is measured from the '&' up to and
 * including the last digit; a trailing ';' is not counted. */
#define DEC_ENTITY_MINLEN 3  /* For "&#" and 1 decimal digit */
#define HEX_ENTITY_MINLEN 4  /* For "&#x" and 1 hexadecimal digit */
#define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
#define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
4009 
/* Return a copy of `input` in which HTML numeric entities ("&#NNN;" and
 * "&#xHHH;", the trailing ';' being optional) are replaced with the codepoint
 * they stand for, provided html_numeric_entity_deconvert() accepts the value
 * for the given `convmap`. Entities which are malformed, too long, or not
 * accepted by the conversion map are copied through unchanged.
 * The input is decoded and the output re-encoded using `encoding`. */
static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size)
{
	uint32_t wchar_buf[128], converted_buf[128];

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	/* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
	 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
	 * 2nd 'converted' buffer.
	 *
	 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
	 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
	 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
	 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
	 * to store the next batch of wchars after it.
	 *
	 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
	 * wchars from the 1st buffer to the 2nd one.
	 *
	 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
	 * have to do bounds checks when writing wchars into it.
	 */

	/* Number of wchars at the start of wchar_buf which were carried over from
	 * the previous iteration (the beginning of a possibly truncated entity) */
	unsigned int wchar_buf_offset = 0;

	while (in_len) {
		/* Leave space for sentinel at the end of the buffer */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
		out_len += wchar_buf_offset;
		ZEND_ASSERT(out_len <= 127);
		wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */

		/* `p` is the read position in wchar_buf, `converted` the write position in converted_buf */
		uint32_t *p, *converted;

		/* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
		 * be there (in `wchar_buf[0]`), so don't bother in that case */
		if (wchar_buf_offset == 0) {
			p = wchar_buf;
			while (*p != '&')
				p++;
			if (p == wchar_buf + out_len) {
				/* No HTML entities in this buffer */
				encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
				continue;
			}

			/* Copy over the prefix with no & which we already scanned */
			memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
			converted = converted_buf + (p - wchar_buf);
		} else {
			p = wchar_buf;
			converted = converted_buf;
		}

found_ampersand:
		ZEND_ASSERT(*p == '&');
		/* `p2` scans ahead from the '&' to find where the candidate entity ends */
		uint32_t *p2 = p;

		/* These tests can't overrun end of buffer, because we have a '&' sentinel there */
		if (*++p2 == '#') {
			if (*++p2 == 'x') {
				/* Possible hex entity */
				uint32_t w = *++p2;
				while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
					w = *++p2;
				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
					/* We hit the end of the buffer while reading digits, and
					 * more wchars are still coming in the next buffer
					 * Reprocess this identity on next iteration */
					memmove(wchar_buf, p, (p2 - p) * 4);
					wchar_buf_offset = p2 - p;
					goto process_converted_wchars;
				} else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
					/* Invalid entity (too long or "&#x" only) */
					memcpy(converted, p, (p2 - p) * 4);
					converted += p2 - p;
				} else {
					/* Valid hexadecimal entity */
					/* Digits start after the 3-wchar "&#x" prefix */
					uint32_t value = 0, *p3 = p + 3;
					while (p3 < p2) {
						w = *p3++;
						if (w <= '9') {
							value = (value * 16) + (w - '0');
						} else if (w >= 'a') {
							value = (value * 16) + 10 + (w - 'a');
						} else {
							value = (value * 16) + 10 + (w - 'A');
						}
					}
					if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
						converted++;
						/* Swallow the terminating ';' if there is one */
						if (*p2 == ';')
							p2++;
					} else {
						/* Value not covered by the conversion map; keep the entity text as is */
						memcpy(converted, p, (p2 - p) * 4);
						converted += p2 - p;
					}
				}
			} else {
				/* Possible decimal entity */
				uint32_t w = *p2;
				while (w >= '0' && w <= '9')
					w = *++p2;
				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
					/* The number of digits was legal (no more than 10 decimal digits)
					 * Reprocess this identity on next iteration of main loop */
					memmove(wchar_buf, p, (p2 - p) * 4);
					wchar_buf_offset = p2 - p;
					goto process_converted_wchars;
				} else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
					/* Invalid entity (too long or "&#" only) */
					memcpy(converted, p, (p2 - p) * 4);
					converted += p2 - p;
				} else {
					/* Valid decimal entity */
					/* Digits start after the 2-wchar "&#" prefix */
					uint32_t value = 0, *p3 = p + 2;
					while (p3 < p2) {
						/* If unsigned integer overflow would occur in the below
						 * multiplication by 10, this entity is no good
						 * 0x19999999 is 1/10th of 0xFFFFFFFF */
						if (value > 0x19999999) {
							memcpy(converted, p, (p2 - p) * 4);
							converted += p2 - p;
							goto decimal_entity_too_big;
						}
						value = (value * 10) + (*p3++ - '0');
					}
					if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
						converted++;
						/* Swallow the terminating ';' if there is one */
						if (*p2 == ';')
							p2++;
					} else {
						/* Value not covered by the conversion map; keep the entity text as is */
						memcpy(converted, p, (p2 - p) * 4);
						converted += p2 - p;
					}
				}
			}
		} else if ((p2 == wchar_buf + out_len) && in_len) {
			/* Corner case: & at end of buffer */
			wchar_buf[0] = '&';
			wchar_buf_offset = 1;
			goto process_converted_wchars;
		} else {
			/* A lone '&' which does not start an entity */
			*converted++ = '&';
		}
decimal_entity_too_big:

		/* Starting to scan a new section of the wchar buffer
		 * 'p2' is pointing at the next wchar which needs to be processed */
		p = p2;
		while (*p2 != '&')
			p2++;

		/* Copy the run of ordinary wchars up to the next '&' (or the sentinel) */
		if (p2 > p) {
			memcpy(converted, p, (p2 - p) * 4);
			converted += p2 - p;
			p = p2;
		}

		/* A '&' before the sentinel position is real input; handle it */
		if (p < wchar_buf + out_len)
			goto found_ampersand;

		/* We do not have any wchars remaining at the end of this buffer which
		 * we need to reprocess on the next call */
		wchar_buf_offset = 0;
process_converted_wchars:
		ZEND_ASSERT(converted <= converted_buf + 128);
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
4187 
4188 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)4189 PHP_FUNCTION(mb_decode_numericentity)
4190 {
4191 	zend_string *encoding = NULL, *str;
4192 	size_t conversion_map_size;
4193 	HashTable *target_hash;
4194 
4195 	ZEND_PARSE_PARAMETERS_START(2, 3)
4196 		Z_PARAM_STR(str)
4197 		Z_PARAM_ARRAY_HT(target_hash)
4198 		Z_PARAM_OPTIONAL
4199 		Z_PARAM_STR_OR_NULL(encoding)
4200 	ZEND_PARSE_PARAMETERS_END();
4201 
4202 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4203 	if (!enc) {
4204 		RETURN_THROWS();
4205 	}
4206 
4207 	uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4208 	if (convmap == NULL) {
4209 		RETURN_THROWS();
4210 	}
4211 
4212 	RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, conversion_map_size));
4213 	efree(convmap);
4214 }
4215 /* }}} */
4216 
4217 /* {{{ Sends an email message with MIME scheme */
4218 #define CRLF "\r\n"
4219 
_php_mbstr_parse_mail_headers(HashTable * ht,const char * str,size_t str_len)4220 static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
4221 {
4222 	const char *ps;
4223 	size_t icnt;
4224 	int state = 0;
4225 	int crlf_state = -1;
4226 	char *token = NULL;
4227 	size_t token_pos = 0;
4228 	zend_string *fld_name, *fld_val;
4229 
4230 	ps = str;
4231 	icnt = str_len;
4232 	fld_name = fld_val = NULL;
4233 
4234 	/*
4235 	 *             C o n t e n t - T y p e :   t e x t / h t m l \r\n
4236 	 *             ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
4237 	 *      state  0            1           2          3
4238 	 *
4239 	 *             C o n t e n t - T y p e :   t e x t / h t m l \r\n
4240 	 *             ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
4241 	 * crlf_state -1                       0                     1 -1
4242 	 *
4243 	 */
4244 
4245 	while (icnt > 0) {
4246 		switch (*ps) {
4247 			case ':':
4248 				if (crlf_state == 1) {
4249 					token_pos++;
4250 				}
4251 
4252 				if (state == 0 || state == 1) {
4253 					if(token && token_pos > 0) {
4254 						fld_name = zend_string_init(token, token_pos, 0);
4255 					}
4256 					state = 2;
4257 				} else {
4258 					token_pos++;
4259 				}
4260 
4261 				crlf_state = 0;
4262 				break;
4263 
4264 			case '\n':
4265 				if (crlf_state == -1) {
4266 					goto out;
4267 				}
4268 				crlf_state = -1;
4269 				break;
4270 
4271 			case '\r':
4272 				if (crlf_state == 1) {
4273 					token_pos++;
4274 				} else {
4275 					crlf_state = 1;
4276 				}
4277 				break;
4278 
4279 			case ' ': case '\t':
4280 				if (crlf_state == -1) {
4281 					if (state == 3) {
4282 						/* continuing from the previous line */
4283 						state = 4;
4284 					} else {
4285 						/* simply skipping this new line */
4286 						state = 5;
4287 					}
4288 				} else {
4289 					if (crlf_state == 1) {
4290 						token_pos++;
4291 					}
4292 					if (state == 1 || state == 3) {
4293 						token_pos++;
4294 					}
4295 				}
4296 				crlf_state = 0;
4297 				break;
4298 
4299 			default:
4300 				switch (state) {
4301 					case 0:
4302 						token = (char*)ps;
4303 						token_pos = 0;
4304 						state = 1;
4305 						break;
4306 
4307 					case 2:
4308 						if (crlf_state != -1) {
4309 							token = (char*)ps;
4310 							token_pos = 0;
4311 
4312 							state = 3;
4313 							break;
4314 						}
4315 						ZEND_FALLTHROUGH;
4316 
4317 					case 3:
4318 						if (crlf_state == -1) {
4319 							if(token && token_pos > 0) {
4320 								fld_val = zend_string_init(token, token_pos, 0);
4321 							}
4322 
4323 							if (fld_name != NULL && fld_val != NULL) {
4324 								zval val;
4325 								zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4326 								ZVAL_STR(&val, fld_val);
4327 
4328 								zend_hash_update(ht, fld_name, &val);
4329 
4330 								zend_string_release_ex(fld_name, 0);
4331 							}
4332 
4333 							fld_name = fld_val = NULL;
4334 							token = (char*)ps;
4335 							token_pos = 0;
4336 
4337 							state = 1;
4338 						}
4339 						break;
4340 
4341 					case 4:
4342 						token_pos++;
4343 						state = 3;
4344 						break;
4345 				}
4346 
4347 				if (crlf_state == 1) {
4348 					token_pos++;
4349 				}
4350 
4351 				token_pos++;
4352 
4353 				crlf_state = 0;
4354 				break;
4355 		}
4356 		ps++, icnt--;
4357 	}
4358 out:
4359 	if (state == 2) {
4360 		token = "";
4361 		token_pos = 0;
4362 
4363 		state = 3;
4364 	}
4365 	if (state == 3) {
4366 		if(token && token_pos > 0) {
4367 			fld_val = zend_string_init(token, token_pos, 0);
4368 		}
4369 		if (fld_name != NULL && fld_val != NULL) {
4370 			zval val;
4371 			zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4372 			ZVAL_STR(&val, fld_val);
4373 			zend_hash_update(ht, fld_name, &val);
4374 
4375 			zend_string_release_ex(fld_name, 0);
4376 		}
4377 	}
4378 	return state;
4379 }
4380 
PHP_FUNCTION(mb_send_mail)4381 PHP_FUNCTION(mb_send_mail)
4382 {
4383 	char *to;
4384 	size_t to_len;
4385 	char *message;
4386 	size_t message_len;
4387 	zend_string *subject;
4388 	zend_string *extra_cmd = NULL;
4389 	HashTable *headers_ht = NULL;
4390 	zend_string *str_headers = NULL;
4391 	size_t i;
4392 	char *to_r = NULL;
4393 	char *force_extra_parameters = INI_STR("mail.force_extra_parameters");
4394 	bool suppress_content_type = false;
4395 	bool suppress_content_transfer_encoding = false;
4396 
4397 	char *p;
4398 	enum mbfl_no_encoding;
4399 	const mbfl_encoding *tran_cs,	/* transfer text charset */
4400 						*head_enc,	/* header transfer encoding */
4401 						*body_enc;	/* body transfer encoding */
4402 	const mbfl_language *lang;
4403 	HashTable ht_headers;
4404 	zval *s;
4405 
4406 	/* character-set, transfer-encoding */
4407 	tran_cs = &mbfl_encoding_utf8;
4408 	head_enc = &mbfl_encoding_base64;
4409 	body_enc = &mbfl_encoding_base64;
4410 	lang = mbfl_no2language(MBSTRG(language));
4411 	if (lang != NULL) {
4412 		tran_cs = mbfl_no2encoding(lang->mail_charset);
4413 		head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4414 		body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4415 	}
4416 
4417 	ZEND_PARSE_PARAMETERS_START(3, 5)
4418 		Z_PARAM_PATH(to, to_len)
4419 		Z_PARAM_PATH_STR(subject)
4420 		Z_PARAM_PATH(message, message_len)
4421 		Z_PARAM_OPTIONAL
4422 		Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4423 		Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4424 	ZEND_PARSE_PARAMETERS_END();
4425 
4426 	if (str_headers) {
4427 		if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4428 			zend_argument_value_error(4, "must not contain any null bytes");
4429 			RETURN_THROWS();
4430 		}
4431 		str_headers = php_trim(str_headers, NULL, 0, 2);
4432 	} else if (headers_ht) {
4433 		str_headers = php_mail_build_headers(headers_ht);
4434 		if (EG(exception)) {
4435 			RETURN_THROWS();
4436 		}
4437 	}
4438 
4439 	zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4440 
4441 	if (str_headers != NULL) {
4442 		_php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4443 	}
4444 
4445 	if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4446 		char *tmp;
4447 		char *param_name;
4448 		char *charset = NULL;
4449 
4450 		ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4451 		p = strchr(Z_STRVAL_P(s), ';');
4452 
4453 		if (p != NULL) {
4454 			/* skipping the padded spaces */
4455 			do {
4456 				++p;
4457 			} while (*p == ' ' || *p == '\t');
4458 
4459 			if (*p != '\0') {
4460 				if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4461 					if (strcasecmp(param_name, "charset") == 0) {
4462 						const mbfl_encoding *_tran_cs = tran_cs;
4463 
4464 						charset = php_strtok_r(NULL, "= \"", &tmp);
4465 						if (charset != NULL) {
4466 							_tran_cs = mbfl_name2encoding(charset);
4467 						}
4468 
4469 						if (!_tran_cs) {
4470 							php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4471 							_tran_cs = &mbfl_encoding_ascii;
4472 						}
4473 						tran_cs = _tran_cs;
4474 					}
4475 				}
4476 			}
4477 		}
4478 		suppress_content_type = true;
4479 	}
4480 
4481 	if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4482 		const mbfl_encoding *_body_enc;
4483 
4484 		ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4485 		_body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4486 		switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4487 			case mbfl_no_encoding_base64:
4488 			case mbfl_no_encoding_7bit:
4489 			case mbfl_no_encoding_8bit:
4490 				body_enc = _body_enc;
4491 				break;
4492 
4493 			default:
4494 				php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4495 				body_enc =	&mbfl_encoding_8bit;
4496 				break;
4497 		}
4498 		suppress_content_transfer_encoding = true;
4499 	}
4500 
4501 	/* To: */
4502 	if (to_len > 0) {
4503 		to_r = estrndup(to, to_len);
4504 		for (; to_len; to_len--) {
4505 			if (!isspace((unsigned char) to_r[to_len - 1])) {
4506 				break;
4507 			}
4508 			to_r[to_len - 1] = '\0';
4509 		}
4510 		for (i = 0; to_r[i]; i++) {
4511 			if (iscntrl((unsigned char) to_r[i])) {
4512 				/* According to RFC 822, section 3.1.1 long headers may be separated into
4513 				 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4514 				 * To prevent these separators from being replaced with a space, we skip over them. */
4515 				if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4516 					i += 2;
4517 					while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4518 						i++;
4519 					}
4520 					continue;
4521 				}
4522 
4523 				to_r[i] = ' ';
4524 			}
4525 		}
4526 	} else {
4527 		to_r = to;
4528 	}
4529 
4530 	/* Subject: */
4531 	const mbfl_encoding *enc = MBSTRG(current_internal_encoding);
4532 	if (enc == &mbfl_encoding_pass) {
4533 		enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4534 	}
4535 	const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4536 	size_t line_sep_len = strlen(line_sep);
4537 
4538 	subject = mb_mime_header_encode(subject, enc, tran_cs, head_enc == &mbfl_encoding_base64, (char*)line_sep, line_sep_len, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4539 
4540 	/* message body */
4541 	const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding);
4542 	if (msg_enc == &mbfl_encoding_pass) {
4543 		msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4544 	}
4545 
4546 	unsigned int num_errors = 0;
4547 	zend_string *tmpstr = mb_fast_convert((unsigned char*)message, message_len, msg_enc, tran_cs, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4548 	zend_string *conv = mb_fast_convert((unsigned char*)ZSTR_VAL(tmpstr), ZSTR_LEN(tmpstr), &mbfl_encoding_8bit, body_enc, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4549 	zend_string_free(tmpstr);
4550 	message = ZSTR_VAL(conv);
4551 
4552 	/* other headers */
4553 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4554 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4555 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4556 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4557 
4558 	smart_str str = {0};
4559 	bool empty = true;
4560 
4561 	if (str_headers != NULL) {
4562 		/* Strip trailing CRLF from `str_headers`; we will add CRLF back if necessary */
4563 		size_t len = ZSTR_LEN(str_headers);
4564 		if (ZSTR_VAL(str_headers)[len-1] == '\n') {
4565 			len--;
4566 		}
4567 		if (ZSTR_VAL(str_headers)[len-1] == '\r') {
4568 			len--;
4569 		}
4570 		smart_str_appendl(&str, ZSTR_VAL(str_headers), len);
4571 		empty = false;
4572 		zend_string_release_ex(str_headers, 0);
4573 	}
4574 
4575 	if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4576 		if (!empty) {
4577 			smart_str_appendl(&str, line_sep, line_sep_len);
4578 		}
4579 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4580 		empty = false;
4581 	}
4582 
4583 	if (!suppress_content_type) {
4584 		if (!empty) {
4585 			smart_str_appendl(&str, line_sep, line_sep_len);
4586 		}
4587 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4588 
4589 		p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4590 		if (p != NULL) {
4591 			smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4592 			smart_str_appends(&str, p);
4593 		}
4594 		empty = false;
4595 	}
4596 
4597 	if (!suppress_content_transfer_encoding) {
4598 		if (!empty) {
4599 			smart_str_appendl(&str, line_sep, line_sep_len);
4600 		}
4601 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4602 		p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4603 		if (p == NULL) {
4604 			p = "7bit";
4605 		}
4606 		smart_str_appends(&str, p);
4607 	}
4608 
4609 	str_headers = smart_str_extract(&str);
4610 
4611 	if (force_extra_parameters) {
4612 		extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4613 	} else if (extra_cmd) {
4614 		extra_cmd = php_escape_shell_cmd(ZSTR_VAL(extra_cmd));
4615 	}
4616 
4617 	RETVAL_BOOL(php_mail(to_r, ZSTR_VAL(subject), message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL));
4618 
4619 	if (extra_cmd) {
4620 		zend_string_release_ex(extra_cmd, 0);
4621 	}
4622 	if (to_r != to) {
4623 		efree(to_r);
4624 	}
4625 	zend_string_release(subject);
4626 	zend_string_free(conv);
4627 	zend_hash_destroy(&ht_headers);
4628 	if (str_headers) {
4629 		zend_string_release_ex(str_headers, 0);
4630 	}
4631 }
4632 
4633 #undef CRLF
4634 #undef MAIL_ASCIIZ_CHECK_MBSTRING
4635 #undef PHP_MBSTR_MAIL_MIME_HEADER1
4636 #undef PHP_MBSTR_MAIL_MIME_HEADER2
4637 #undef PHP_MBSTR_MAIL_MIME_HEADER3
4638 #undef PHP_MBSTR_MAIL_MIME_HEADER4
4639 /* }}} */
4640 
4641 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)4642 PHP_FUNCTION(mb_get_info)
4643 {
4644 	zend_string *type = NULL;
4645 	size_t n;
4646 	char *name;
4647 	zval row;
4648 	const mbfl_encoding **entry;
4649 	const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
4650 
4651 	ZEND_ASSERT(lang);
4652 
4653 	ZEND_PARSE_PARAMETERS_START(0, 1)
4654 		Z_PARAM_OPTIONAL
4655 		Z_PARAM_STR(type)
4656 	ZEND_PARSE_PARAMETERS_END();
4657 
4658 	if (!type || zend_string_equals_literal_ci(type, "all")) {
4659 		array_init(return_value);
4660 		if (MBSTRG(current_internal_encoding)) {
4661 			add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4662 		}
4663 		if (MBSTRG(http_input_identify)) {
4664 			add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4665 		}
4666 		if (MBSTRG(current_http_output_encoding)) {
4667 			add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4668 		}
4669 
4670 		add_assoc_str(return_value, "http_output_conv_mimetypes",
4671 			zend_ini_str("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)
4672 		);
4673 
4674 		name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4675 		add_assoc_string(return_value, "mail_charset", name);
4676 
4677 		name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4678 		add_assoc_string(return_value, "mail_header_encoding", name);
4679 
4680 		name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4681 		add_assoc_string(return_value, "mail_body_encoding", name);
4682 
4683 		add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4684 
4685 		if (MBSTRG(encoding_translation)) {
4686 			add_assoc_string(return_value, "encoding_translation", "On");
4687 		} else {
4688 			add_assoc_string(return_value, "encoding_translation", "Off");
4689 		}
4690 
4691 		name = (char *)mbfl_no_language2name(MBSTRG(language));
4692 		add_assoc_string(return_value, "language", name);
4693 
4694 		// TODO Seems to always have one entry at least?
4695 		n = MBSTRG(current_detect_order_list_size);
4696 		entry = MBSTRG(current_detect_order_list);
4697 		if (n > 0) {
4698 			size_t i;
4699 			array_init(&row);
4700 			for (i = 0; i < n; i++) {
4701 				add_next_index_string(&row, (*entry)->name);
4702 				entry++;
4703 			}
4704 			add_assoc_zval(return_value, "detect_order", &row);
4705 		}
4706 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4707 			add_assoc_string(return_value, "substitute_character", "none");
4708 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4709 			add_assoc_string(return_value, "substitute_character", "long");
4710 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4711 			add_assoc_string(return_value, "substitute_character", "entity");
4712 		} else {
4713 			add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4714 		}
4715 		if (MBSTRG(strict_detection)) {
4716 			add_assoc_string(return_value, "strict_detection", "On");
4717 		} else {
4718 			add_assoc_string(return_value, "strict_detection", "Off");
4719 		}
4720 	} else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4721 		ZEND_ASSERT(MBSTRG(current_internal_encoding));
4722 		RETURN_STRING((char *)MBSTRG(current_internal_encoding)->name);
4723 	} else if (zend_string_equals_literal_ci(type, "http_input")) {
4724 		if (MBSTRG(http_input_identify)) {
4725 			RETURN_STRING((char *)MBSTRG(http_input_identify)->name);
4726 		}
4727 		RETURN_NULL();
4728 	} else if (zend_string_equals_literal_ci(type, "http_output")) {
4729 		ZEND_ASSERT(MBSTRG(current_http_output_encoding));
4730 		RETURN_STRING((char *)MBSTRG(current_http_output_encoding)->name);
4731 	} else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4732 		RETURN_STR(
4733 			zend_ini_str(
4734 				"mbstring.http_output_conv_mimetypes",
4735 				sizeof("mbstring.http_output_conv_mimetypes") - 1,
4736 				false
4737 			)
4738 		);
4739 	} else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4740 		name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4741 		RETURN_STRING(name);
4742 	} else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4743 		name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4744 		RETURN_STRING(name);
4745 	} else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4746 		name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4747 		RETURN_STRING(name);
4748 	} else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4749 		RETURN_LONG(MBSTRG(illegalchars));
4750 	} else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4751 		if (MBSTRG(encoding_translation)) {
4752 			RETURN_STRING("On");
4753 		} else {
4754 			RETURN_STRING("Off");
4755 		}
4756 	} else if (zend_string_equals_literal_ci(type, "language")) {
4757 		name = (char *)mbfl_no_language2name(MBSTRG(language));
4758 		RETURN_STRING(name);
4759 	} else if (zend_string_equals_literal_ci(type, "detect_order")) {
4760 		// TODO Seems to always have one entry at least?
4761 		n = MBSTRG(current_detect_order_list_size);
4762 		entry = MBSTRG(current_detect_order_list);
4763 		if (n > 0) {
4764 			size_t i;
4765 			array_init(return_value);
4766 			for (i = 0; i < n; i++) {
4767 				add_next_index_string(return_value, (*entry)->name);
4768 				entry++;
4769 			}
4770 		}
4771 	} else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4772 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4773 			RETURN_STRING("none");
4774 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4775 			RETURN_STRING("long");
4776 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4777 			RETURN_STRING("entity");
4778 		} else {
4779 			RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
4780 		}
4781 	} else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4782 		if (MBSTRG(strict_detection)) {
4783 			RETURN_STRING("On");
4784 		} else {
4785 			RETURN_STRING("Off");
4786 		}
4787 	} else {
4788 		php_error_docref(NULL, E_WARNING, "argument #1 ($type) must be a valid type");
4789 		RETURN_FALSE;
4790 	}
4791 }
4792 /* }}} */
4793 
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)4794 MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4795 {
4796 	uint32_t wchar_buf[128];
4797 	unsigned char *in = (unsigned char*)input;
4798 	unsigned int state = 0;
4799 
4800 	if (encoding->check != NULL) {
4801 		return encoding->check(in, length);
4802 	}
4803 
4804 	/* If the input string is not encoded in the given encoding, there is a significant chance
4805 	 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4806 	 * buffer of 128 codepoints, convert and check just a few codepoints first */
4807 	size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4808 	ZEND_ASSERT(out_len <= 8);
4809 	for (unsigned int i = 0; i < out_len; i++) {
4810 		if (wchar_buf[i] == MBFL_BAD_INPUT) {
4811 			return false;
4812 		}
4813 	}
4814 
4815 	while (length) {
4816 		out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4817 		ZEND_ASSERT(out_len <= 128);
4818 		for (unsigned int i = 0; i < out_len; i++) {
4819 			if (wchar_buf[i] == MBFL_BAD_INPUT) {
4820 				return false;
4821 			}
4822 		}
4823 	}
4824 
4825 	return true;
4826 }
4827 
4828 /* MSVC 32-bit has issues with 64-bit intrinsics.
4829  * (Bad 7/8-byte UTF-8 strings would be wrongly passed through as 'valid')
4830  * It seems this is caused by a bug in MS Visual C++
4831  * Ref: https://stackoverflow.com/questions/37509129/potential-bug-in-visual-studio-c-compiler-or-in-intel-intrinsics-avx2-mm256-s */
4832 #if defined(PHP_WIN32) && !defined(__clang__) && defined(_MSC_VER) && defined(_M_IX86)
4833 # define MBSTRING_BROKEN_X86_MSVC_INTRINSICS
4834 #endif
4835 
4836 /* If we are building an AVX2-only binary, don't compile the next function */
4837 #ifndef ZEND_INTRIN_AVX2_NATIVE
4838 
4839 /* SSE2-based function for validating UTF-8 strings
4840  * A faster implementation which uses AVX2 instructions follows */
mb_fast_check_utf8_default(zend_string * str)4841 static bool mb_fast_check_utf8_default(zend_string *str)
4842 {
	/* Returns true if the content of `str` is valid UTF-8, false otherwise.
	 *
	 * Two implementations are selected at compile time: when __SSE2__ is defined,
	 * the string is checked in 16-byte blocks using SSE2 intrinsics; otherwise a
	 * scalar byte-by-byte validator is used.
	 *
	 * NOTE(review): the comments in the SSE2 part appear to describe shift
	 * directions in memory (byte address) order, so `_mm_slli_si128` is called
	 * a "right" shift and `_mm_srli_si128` (PSRLDQ) a "left" shift. */
4843 	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
4844 # ifdef __SSE2__
4845 	/* `e` points 1 byte past the last full 16-byte block of string content
4846 	 * Note that we include the terminating null byte which is included in each zend_string
4847 	 * as part of the content to check; this ensures that multi-byte characters which are
4848 	 * truncated abruptly at the end of the string will be detected as invalid */
4849 	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m128i) - 1));
4850 
4851 	/* For checking for illegal bytes 0xF5-FF */
4852 	const __m128i over_f5 = _mm_set1_epi8(-117);
4853 	/* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
4854 	const __m128i over_9f = _mm_set1_epi8(-97);
4855 	/* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
4856 	const __m128i over_8f = _mm_set1_epi8(-113);
4857 	/* For checking for illegal bytes 0xC0-C1 */
4858 	const __m128i find_c0 = _mm_set1_epi8(-64);
4859 	const __m128i c0_to_c1 = _mm_set1_epi8(-126);
4860 	/* For checking structure of continuation bytes */
4861 	const __m128i find_e0 = _mm_set1_epi8(-32);
4862 	const __m128i find_f0 = _mm_set1_epi8(-16);
4863 
	/* `last_block` holds the previously checked 16 bytes (all zero before the
	 * first block); it supplies the bytes which precede the current block */
4864 	__m128i last_block = _mm_setzero_si128();
4865 	__m128i operand;
4866 
4867 	while (p < e) {
4868 		operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */
4869 
		/* This label is also the entry point for the final partial block, which
		 * is assembled in `operand` by the switch statement further below */
4870 check_operand:
4871 		/* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
4872 		if (!_mm_movemask_epi8(operand)) {
4873 			/* Even if this block only contains single-byte characters, there may have been a
4874 			 * multi-byte character at the end of the previous block, which was supposed to
4875 			 * have continuation bytes in this block
4876 			 * This bitmask will pick out a 2/3/4-byte character starting from the last byte of
4877 			 * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
4878 			 * from the 3rd last */
4879 			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
4880 			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
4881 			if (_mm_movemask_epi8(bad)) {
4882 				return false;
4883 			}
4884 
4885 			/* Consume as many full blocks of single-byte characters as we can */
4886 			while (true) {
4887 				p += sizeof(__m128i);
4888 				if (p >= e) {
4889 					goto finish_up_remaining_bytes;
4890 				}
4891 				operand = _mm_loadu_si128((__m128i*)p);
4892 				if (_mm_movemask_epi8(operand)) {
4893 					break;
4894 				}
4895 			}
4896 		}
4897 
4898 		/* Check for >= 0xF5, which are illegal byte values in UTF-8
4899 		 * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
4900 		 * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
4901 		 * Then a single signed compare will pick out any bad bytes
4902 		 * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
4903 		__m128i bad = _mm_cmplt_epi8(_mm_add_epi8(operand, over_f5), over_f5);
4904 
4905 		/* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
4906 		 * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
4907 		 * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
4908 		 * We can check for both problems at once by generating a vector where each byte < 0xA0
4909 		 * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
4910 		 * Shift the original block right by one byte, and compare the shifted block with the bitmask */
4911 		__m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15));
4912 		__m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f)));
4913 		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask1));
4914 
4915 		/* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
4916 		 * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
4917 		 * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
4918 		 * Build the bitmask and compare it with the shifted block */
4919 		__m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f)));
4920 		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask2));
4921 
4922 		/* Check for overlong 2-byte code units
4923 		 * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
4924 		 * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
4925 		 * byte range, do a signed compare to pick out any bad bytes */
4926 		bad = _mm_or_si128(bad, _mm_cmplt_epi8(_mm_add_epi8(operand, find_c0), c0_to_c1));
4927 
4928 		/* Check structure of continuation bytes
4929 		 * A UTF-8 byte should be a continuation byte if, and only if, it is:
4930 		 * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
4931 		 * 2) 2 bytes after the start of a 3-byte or 4-byte character
4932 		 * 3) 3 bytes after the start of a 4-byte character
4933 		 * We build 3 bitmasks with 0xFF in each such position, and OR them together to
4934 		 * get a single bitmask with 0xFF in each position where a continuation byte should be */
4935 		__m128i cont_mask = _mm_cmpeq_epi8(_mm_and_si128(operand2, find_c0), find_c0);
4936 		__m128i operand3 = _mm_or_si128(_mm_slli_si128(operand, 2), _mm_srli_si128(last_block, 14));
4937 		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand3, find_e0), find_e0));
4938 		__m128i operand4 = _mm_or_si128(_mm_slli_si128(operand, 3), _mm_srli_si128(last_block, 13));
4939 		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand4, find_f0), find_f0));
4940 
4941 		/* Now, use a signed comparison to get another bitmask with 0xFF in each position where
4942 		 * a continuation byte actually is
4943 		 * XOR those two bitmasks together; if everything is good, the result should be zero
4944 		 * However, if a byte which should have been a continuation wasn't, or if a byte which
4945 		 * shouldn't have been a continuation was, we will get 0xFF in that position */
4946 		__m128i continuation = _mm_cmplt_epi8(operand, find_c0);
4947 		bad = _mm_or_si128(bad, _mm_xor_si128(continuation, cont_mask));
4948 
4949 		/* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
4950 		 * If that value is non-zero, then we found a bad byte somewhere! */
4951 		if (_mm_movemask_epi8(bad)) {
4952 			return false;
4953 		}
4954 
4955 		last_block = operand;
4956 		p += sizeof(__m128i);
4957 	}
4958 
4959 finish_up_remaining_bytes:
4960 	/* Finish up 1-15 remaining bytes */
	/* `p == e` holds only the first time this point is reached. After the final
	 * partial block has been sent through `check_operand`, `p` has been advanced
	 * past `e`, so this branch is skipped and the function returns true below */
4961 	if (p == e) {
4962 		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m128i) - 1); /* Not including terminating null */
4963 
4964 		/* Crazy hack here for cases where 9 or more bytes are remaining...
4965 		 * We want to use the above vectorized code to check a block of less than 16 bytes,
4966 		 * but there is no good way to read a variable number of bytes into an XMM register
4967 		 * However, we know that these bytes are part of a zend_string, and a zend_string has some
4968 		 * 'header' fields which occupy the memory just before its content
4969 		 * And, those header fields occupy more than 16 bytes...
4970 		 * So if we go back 16 bytes from the end of the zend_string content, and load 16 bytes from there,
4971 		 * we may pick up some 'junk' bytes from the zend_string header fields, but we will get the 1-15
4972 		 * bytes we wanted in the tail end of our XMM register, and this will never cause a segfault.
4973 		 * Then, we do a left shift to get rid of the unwanted bytes
4974 		 * Conveniently, the same left shift also zero-fills the tail end of the XMM register
4975 		 *
4976 		 * The following `switch` looks useless, but it's not
4977 		 * The PSRLDQ instruction used for the 128-bit left shift requires an immediate (literal)
4978 		 * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
4979 		 */
		/* Cases 5 and 6 below use the same "load from before `p`, then shift" technique
		 * as cases 9-14. The remaining non-trivial cases read 2, 4 or 8 bytes directly
		 * from `p`; each of these reads covers the remaining bytes plus at least the
		 * terminating null byte */
4980 		switch (remaining_bytes) {
4981 		case 0: ;
4982 			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
4983 			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
4984 			return _mm_movemask_epi8(bad) == 0;
4985 		case 1:
4986 		case 2:
4987 			operand = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, *((uint16_t*)p));
4988 			goto check_operand;
4989 		case 3:
4990 		case 4:
4991 			operand = _mm_set_epi32(0, 0, 0, *((uint32_t*)p));
4992 			goto check_operand;
4993 		case 5:
4994 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10);
4995 			goto check_operand;
4996 		case 6:
4997 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9);
4998 			goto check_operand;
4999 		case 7:
5000 		case 8:
5001 #ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
5002 			operand = _mm_set_epi32(0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
5003 #else
5004 			operand = _mm_set_epi64x(0, *((uint64_t*)p));
5005 #endif
5006 			goto check_operand;
5007 		case 9:
5008 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6);
5009 			goto check_operand;
5010 		case 10:
5011 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5);
5012 			goto check_operand;
5013 		case 11:
5014 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4);
5015 			goto check_operand;
5016 		case 12:
5017 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3);
5018 			goto check_operand;
5019 		case 13:
5020 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2);
5021 			goto check_operand;
5022 		case 14:
5023 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1);
5024 			goto check_operand;
5025 		case 15:
5026 			/* No trailing bytes are left which need to be checked
5027 			 * We get 15 because we did not include the terminating null when
5028 			 * calculating `remaining_bytes`, so the value wraps around */
5029 			return true;
5030 		}
5031 
5032 		ZEND_UNREACHABLE();
5033 	}
5034 
5035 	return true;
5036 # else
5037 	/* This UTF-8 validation function is derived from PCRE2 */
5038 	size_t length = ZSTR_LEN(str);
5039 	/* Table of the number of extra bytes, indexed by the first byte masked with
5040 	0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
5041 	static const uint8_t utf8_table[] = {
5042 		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
5043 		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
5044 		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
5045 		3,3,3,3,3,3,3,3
5046 	};
5047 
	/* `length` counts the bytes which have not been consumed yet; `p` is advanced
	 * by the loop header and also inside the body for each continuation byte */
5048 	for (; length > 0; p++) {
5049 		uint32_t d;
5050 		unsigned char c = *p;
5051 		length--;
5052 
5053 		if (c < 128) {
5054 			/* ASCII character */
5055 			continue;
5056 		}
5057 
5058 		if (c < 0xc0) {
5059 			/* Isolated 10xx xxxx byte */
5060 			return false;
5061 		}
5062 
		/* Bytes 0xF5-0xFF are rejected here, so the table lookup below only sees 0xC0-0xF4 */
5063 		if (c >= 0xf5) {
5064 			return false;
5065 		}
5066 
5067 		uint32_t ab = utf8_table[c & 0x3f]; /* Number of additional bytes (1-3) */
5068 		if (length < ab) {
5069 			/* Missing bytes */
5070 			return false;
5071 		}
5072 		length -= ab;
5073 
5074 		/* Check top bits in the second byte */
5075 		if (((d = *(++p)) & 0xc0) != 0x80) {
5076 			return false;
5077 		}
5078 
5079 		/* For each length, check that the remaining bytes start with the 0x80 bit
5080 		 * set and not the 0x40 bit. Then check for an overlong sequence, and for the
5081 		 * excluded range 0xd800 to 0xdfff. */
5082 		switch (ab) {
5083 		case 1:
5084 			/* 2-byte character. No further bytes to check for 0x80. Check first byte
5085 			 * for xx00 000x (overlong sequence). */
5086 			if ((c & 0x3e) == 0) {
5087 				return false;
5088 			}
5089 			break;
5090 
5091 		case 2:
5092 			/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes for
5093 			 * 1110 0000, xx0x xxxx (overlong sequence) or 1110 1101, 1010 xxxx (0xd800-0xdfff) */
5094 			if ((*(++p) & 0xc0) != 0x80 || (c == 0xe0 && (d & 0x20) == 0) || (c == 0xed && d >= 0xa0)) {
5095 				return false;
5096 			}
5097 			break;
5098 
5099 		case 3:
5100 			/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
5101 			 * bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
5102 			 * character greater than 0x0010ffff (f4 8f bf bf) */
5103 			if ((*(++p) & 0xc0) != 0x80 || (*(++p) & 0xc0) != 0x80 || (c == 0xf0 && (d & 0x30) == 0) || (c > 0xf4 || (c == 0xf4 && d > 0x8f))) {
5104 				return false;
5105 			}
5106 			break;
5107 
5108 			EMPTY_SWITCH_DEFAULT_CASE();
5109 		}
5110 	}
5111 
5112 	return true;
5113 # endif
5114 }
5115 
5116 #endif /* #ifndef ZEND_INTRIN_AVX2_NATIVE */
5117 
5118 #ifdef ZEND_INTRIN_AVX2_NATIVE
5119 
5120 /* We are building AVX2-only binary */
5121 # include <immintrin.h>
5122 # define mb_fast_check_utf8 mb_fast_check_utf8_avx2
5123 
5124 #elif defined(ZEND_INTRIN_AVX2_RESOLVER)
5125 
5126 /* We are building binary which works with or without AVX2; whether or not to use
5127  * AVX2-accelerated functions will be determined at runtime */
5128 # include <immintrin.h>
5129 # include "Zend/zend_cpuinfo.h"
5130 
5131 # ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
5132 /* Dynamic linker will decide whether or not to use AVX2-based functions and
5133  * resolve symbols accordingly */
5134 
5135 ZEND_INTRIN_AVX2_FUNC_DECL(bool mb_fast_check_utf8_avx2(zend_string *str));
5136 
5137 bool mb_fast_check_utf8(zend_string *str) __attribute__((ifunc("resolve_check_utf8")));
5138 
5139 typedef bool (*check_utf8_func_t)(zend_string*);
5140 
5141 ZEND_NO_SANITIZE_ADDRESS
5142 ZEND_ATTRIBUTE_UNUSED
resolve_check_utf8(void)5143 static check_utf8_func_t resolve_check_utf8(void)
5144 {
5145 	if (zend_cpu_supports_avx2()) {
5146 		return mb_fast_check_utf8_avx2;
5147 	}
5148 	return mb_fast_check_utf8_default;
5149 }
5150 
5151 # else /* ZEND_INTRIN_AVX2_FUNC_PTR */
5152 /* We are compiling for a target where the dynamic linker will not be able to
5153  * resolve symbols according to whether the host supports AVX2 or not; so instead,
5154  * we can make calls go through a function pointer and set the function pointer
5155  * on module load */
5156 
5157 #ifdef HAVE_FUNC_ATTRIBUTE_TARGET
5158 static bool mb_fast_check_utf8_avx2(zend_string *str) __attribute__((target("avx2")));
5159 #else
5160 static bool mb_fast_check_utf8_avx2(zend_string *str);
5161 #endif
5162 
5163 static bool (*check_utf8_ptr)(zend_string *str) = NULL;
5164 
/* Validate `str` through whichever implementation `init_check_utf8` stored in
 * `check_utf8_ptr`; the pointer is NULL until then */
static bool mb_fast_check_utf8(zend_string *str)
{
	bool is_valid = check_utf8_ptr(str);
	return is_valid;
}
5169 
init_check_utf8(void)5170 static void init_check_utf8(void)
5171 {
5172 	if (zend_cpu_supports_avx2()) {
5173 		check_utf8_ptr = mb_fast_check_utf8_avx2;
5174 	} else {
5175 		check_utf8_ptr = mb_fast_check_utf8_default;
5176 	}
5177 }
5178 # endif
5179 
5180 #else
5181 
5182 /* No AVX2 support */
5183 #define mb_fast_check_utf8 mb_fast_check_utf8_default
5184 
5185 #endif
5186 
5187 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
5188 
/* GCC prior to version 8 does not define all intrinsics. See GH-11514.
 * Use a workaround from https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values */
#if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 8
# define _mm256_set_m128i(v0, v1)  _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
#endif

/* Build a 32-byte value consisting of the last `shift` bytes of `hi` followed
 * by the first `32 - shift` bytes of `lo`.
 * This is used to take some number of trailing bytes from the previous 32-byte
 * block (`hi`) followed by some number of leading bytes from the current
 * 32-byte block (`lo`).
 *
 * _mm256_alignr_epi8 (VPALIGNR) is used to shift out bytes from a 256-bit
 * YMM register while shifting in bytes from another YMM register... but
 * it works separately on respective 128-bit halves of the YMM registers,
 * which is not what we want.
 * To make it work as desired, we first do _mm256_permute2x128_si256
 * (VPERM2I128, selector 33 == 0x21) to combine the high 128 bits of the
 * previous block and the low 128 bits of the current block in one YMM register.
 * Then VPALIGNR will do what is needed.
 *
 * `shift` becomes part of an immediate operand, so it must be a constant
 * expression; it is parenthesized so that compound expressions expand correctly. */
#define _mm256_shift_epi8(hi, lo, shift) _mm256_alignr_epi8(lo, _mm256_permute2x128_si256(hi, lo, 33), 16 - (shift))
5209 
5210 /* AVX2-based UTF-8 validation function; validates text in 32-byte chunks
5211  *
5212  * Some parts of this function are the same as `mb_fast_check_utf8`; code comments
5213  * are not repeated, so consult `mb_fast_check_utf8` for information on uncommented
5214  * sections. */
5215 #ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
mb_fast_check_utf8_avx2(zend_string * str)5216 ZEND_API bool mb_fast_check_utf8_avx2(zend_string *str)
5217 #else
5218 static bool mb_fast_check_utf8_avx2(zend_string *str)
5219 #endif
5220 {
5221 	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
5222 	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m256i) - 1));
5223 
5224 	/* The algorithm used here for UTF-8 validation is partially adapted from the
5225 	 * paper "Validating UTF-8 In Less Than One Instruction Per Byte", by John Keiser
5226 	 * and Daniel Lemire.
5227 	 * Ref: https://arxiv.org/pdf/2010.03090.pdf
5228 	 *
5229 	 * Most types of invalid UTF-8 text can be detected by examining pairs of
5230 	 * successive bytes. Specifically:
5231 	 *
5232 	 * • Overlong 2-byte code units start with 0xC0 or 0xC1.
5233 	 *   No valid UTF-8 string ever uses these byte values.
5234 	 * • Overlong 3-byte code units start with 0xE0, followed by a byte < 0xA0.
5235 	 * • Overlong 4-byte code units start with 0xF0, followed by a byte < 0x90.
5236 	 * • 5-byte or 6-byte code units, which should never be used, start with
5237 	 *   0xF8-FE.
5238 	 * • A codepoint value higher than U+10FFFF, which is the highest value for
5239 	 *   any Unicode codepoint, would either start with 0xF4, followed by a
5240 	 *   byte >= 0x90, or else would start with 0xF5-F7, followed by any value.
5241 	 * • A codepoint value from U+D800-DFFF, which are reserved and should never
5242 	 *   be used, would start with 0xED, followed by a byte >= 0xA0.
5243 	 * • The byte value 0xFF is also illegal and is never used in valid UTF-8.
5244 	 *
5245 	 * To detect all these problems, for each pair of successive bytes, we do
5246 	 * table lookups using the high nibble of the first byte, the low nibble of
5247 	 * the first byte, and the high nibble of the second byte. Each table lookup
5248 	 * retrieves a bitmask, in which each 1 bit indicates a possible invalid
5249 	 * combination; AND those three bitmasks together, and any 1 bit in the result
5250 	 * will indicate an actual invalid byte combination was found.
5251 	 */
5252 
5253 #define BAD_BYTE 0x1
5254 #define OVERLONG_2BYTE 0x2
5255 #define _1BYTE (BAD_BYTE | OVERLONG_2BYTE)
5256 #define OVERLONG_3BYTE 0x4
5257 #define SURROGATE 0x8
5258 #define OVERLONG_4BYTE 0x10
5259 #define INVALID_CP 0x20
5260 
5261 	/* Each of these are 16-entry tables, repeated twice; this is required by the
5262 	 * VPSHUFB instruction which we use to perform 32 table lookups in parallel
5263 	 * The first entry is for 0xF, the second is for 0xE, and so on down to 0x0
5264 	 *
5265 	 * So, for example, notice that the 4th entry in the 1st table is OVERLONG_2BYTE;
5266 	 * that means that high nibble 0xC is consistent with the byte pair being part of
5267 	 * an overlong 2-byte code unit */
5268 	const __m256i bad_hi_nibble2 = _mm256_set_epi8(
5269 		BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5270 		0, 0, 0, 0,
5271 		0, 0, 0, 0,
5272 		0, 0, 0, 0,
5273 		BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5274 		0, 0, 0, 0,
5275 		0, 0, 0, 0,
5276 		0, 0, 0, 0);
5277 	const __m256i bad_lo_nibble2 = _mm256_set_epi8(
5278 		BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5279 		BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5280 		BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5281 		0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5282 		BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5283 		BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5284 		BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5285 		0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5286 	const __m256i bad_hi_nibble = _mm256_set_epi8(
5287 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5288 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5289 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5290 		_1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5291 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5292 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5293 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5294 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5295 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5296 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5297 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5298 		_1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5299 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5300 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5301 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5302 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5303 
5304 	const __m256i find_continuation = _mm256_set1_epi8(-64);
5305 	const __m256i _b = _mm256_set1_epi8(0xB);
5306 	const __m256i _d = _mm256_set1_epi8(0xD);
5307 	const __m256i _f = _mm256_set1_epi8(0xF);
5308 
5309 	__m256i last_hi_nibbles = _mm256_setzero_si256(), last_lo_nibbles = _mm256_setzero_si256();
5310 	__m256i operand;
5311 
5312 	while (p < e) {
5313 		operand = _mm256_loadu_si256((__m256i*)p);
5314 
5315 check_operand:
5316 		if (!_mm256_movemask_epi8(operand)) {
5317 			/* Entire 32-byte block is ASCII characters; the only thing we need to validate is that
5318 			 * the previous block didn't end with an incomplete multi-byte character
5319 			 * (This will also confirm that the previous block didn't end with a bad byte like 0xFF) */
5320 			__m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5321 			__m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5322 			if (_mm256_movemask_epi8(bad)) {
5323 				return false;
5324 			}
5325 
5326 			/* Consume as many full blocks of single-byte characters as we can */
5327 			while (true) {
5328 				p += sizeof(__m256i);
5329 				if (p >= e) {
5330 					goto finish_up_remaining_bytes;
5331 				}
5332 				operand = _mm256_loadu_si256((__m256i*)p);
5333 				if (_mm256_movemask_epi8(operand)) {
5334 					break;
5335 				}
5336 			}
5337 		}
5338 
5339 		__m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(operand, 4), _f);
5340 		__m256i lo_nibbles = _mm256_and_si256(operand, _f);
5341 
5342 		__m256i lo_nibbles2 = _mm256_shift_epi8(last_lo_nibbles, lo_nibbles, 1);
5343 		__m256i hi_nibbles2 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 1);
5344 
5345 		/* Do parallel table lookups in all 3 tables */
5346 		__m256i bad = _mm256_cmpgt_epi8(
5347 			_mm256_and_si256(
5348 				_mm256_and_si256(
5349 					_mm256_shuffle_epi8(bad_lo_nibble2, lo_nibbles2),
5350 					_mm256_shuffle_epi8(bad_hi_nibble2, hi_nibbles2)),
5351 				_mm256_shuffle_epi8(bad_hi_nibble, hi_nibbles)),
5352 			_mm256_setzero_si256());
5353 
5354 		__m256i cont_mask = _mm256_cmpgt_epi8(hi_nibbles2, _b);
5355 		__m256i hi_nibbles3 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 2);
5356 		cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpgt_epi8(hi_nibbles3, _d));
5357 		__m256i hi_nibbles4 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 3);
5358 		cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpeq_epi8(hi_nibbles4, _f));
5359 
5360 		__m256i continuation = _mm256_cmpgt_epi8(find_continuation, operand);
5361 		bad = _mm256_or_si256(bad, _mm256_xor_si256(continuation, cont_mask));
5362 
5363 		if (_mm256_movemask_epi8(bad)) {
5364 			return false;
5365 		}
5366 
5367 		last_hi_nibbles = hi_nibbles;
5368 		last_lo_nibbles = lo_nibbles;
5369 		p += sizeof(__m256i);
5370 	}
5371 
5372 finish_up_remaining_bytes:
5373 	if (p == e) {
5374 		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m256i) - 1); /* Not including terminating null */
5375 
5376 		switch (remaining_bytes) {
5377 		case 0: ;
5378 			/* No actual data bytes are remaining */
5379 			__m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5380 			__m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5381 			return _mm256_movemask_epi8(bad) == 0;
5382 		case 1:
5383 		case 2:
5384 			operand = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, *((int16_t*)p));
5385 			goto check_operand;
5386 		case 3:
5387 		case 4:
5388 			operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, *((int32_t*)p));
5389 			goto check_operand;
5390 		case 5:
5391 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10));
5392 			goto check_operand;
5393 		case 6:
5394 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9));
5395 			goto check_operand;
5396 		case 7:
5397 		case 8:
5398 #ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
5399 			operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
5400 #else
5401 			operand = _mm256_set_epi64x(0, 0, 0, *((int64_t*)p));
5402 #endif
5403 			goto check_operand;
5404 		case 9:
5405 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6));
5406 			goto check_operand;
5407 		case 10:
5408 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5));
5409 			goto check_operand;
5410 		case 11:
5411 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4));
5412 			goto check_operand;
5413 		case 12:
5414 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3));
5415 			goto check_operand;
5416 		case 13:
5417 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2));
5418 			goto check_operand;
5419 		case 14:
5420 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1));
5421 			goto check_operand;
5422 		case 15:
5423 		case 16:
5424 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_loadu_si128((__m128i*)p));
5425 			goto check_operand;
5426 		case 17:
5427 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 2)), 14), _mm_loadu_si128((__m128i*)p));
5428 			goto check_operand;
5429 		case 18:
5430 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 3)), 13), _mm_loadu_si128((__m128i*)p));
5431 			goto check_operand;
5432 		case 19:
5433 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 4)), 12), _mm_loadu_si128((__m128i*)p));
5434 			goto check_operand;
5435 		case 20:
5436 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 5)), 11), _mm_loadu_si128((__m128i*)p));
5437 			goto check_operand;
5438 		case 21:
5439 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 6)), 10), _mm_loadu_si128((__m128i*)p));
5440 			goto check_operand;
5441 		case 22:
5442 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 7)), 9), _mm_loadu_si128((__m128i*)p));
5443 			goto check_operand;
5444 		case 23:
5445 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 8)), 8), _mm_loadu_si128((__m128i*)p));
5446 			goto check_operand;
5447 		case 24:
5448 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 9)), 7), _mm_loadu_si128((__m128i*)p));
5449 			goto check_operand;
5450 		case 25:
5451 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 10)), 6), _mm_loadu_si128((__m128i*)p));
5452 			goto check_operand;
5453 		case 26:
5454 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 11)), 5), _mm_loadu_si128((__m128i*)p));
5455 			goto check_operand;
5456 		case 27:
5457 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 12)), 4), _mm_loadu_si128((__m128i*)p));
5458 			goto check_operand;
5459 		case 28:
5460 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 13)), 3), _mm_loadu_si128((__m128i*)p));
5461 			goto check_operand;
5462 		case 29:
5463 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 14)), 2), _mm_loadu_si128((__m128i*)p));
5464 			goto check_operand;
5465 		case 30:
5466 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 15)), 1), _mm_loadu_si128((__m128i*)p));
5467 			goto check_operand;
5468 		case 31:
5469 			return true;
5470 		}
5471 
5472 		ZEND_UNREACHABLE();
5473 	}
5474 
5475 	return true;
5476 }
5477 
5478 #endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
5479 
/* Report whether `str` is valid in `encoding`.
 *
 * For UTF-8 the result is taken from the string's "valid UTF-8" flag when that
 * is already set; otherwise mb_fast_check_utf8() is run and, on success, the
 * flag is added to non-interned strings. All other encodings are delegated to
 * php_mb_check_encoding(). */
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
{
	if (encoding != &mbfl_encoding_utf8) {
		return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
	}

	/* Flag already present: no need to scan the bytes */
	if (ZSTR_IS_VALID_UTF8(str)) {
		return true;
	}

	if (!mb_fast_check_utf8(str)) {
		return false;
	}

	/* Record the successful check; interned strings are left untouched */
	if (!ZSTR_IS_INTERNED(str)) {
		GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
	}
	return true;
}
5495 
/* Check every string key and every value of `vars` against `encoding`,
 * descending into nested arrays.
 *
 * Returns true only if all string keys and string values are valid and every
 * value is a string, array, integer, float, null or boolean. Returns false
 * (after emitting an E_WARNING) if `vars` is already flagged as being
 * traversed, i.e. the array refers back to itself.
 *
 * Iteration stops at the first failure. */
static bool php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	zend_long idx;
	zend_string *key;
	zval *entry;
	bool valid = true;

	(void)(idx); /* Suppress spurious compiler warning that `idx` is not used */

	if (GC_IS_RECURSIVE(vars)) {
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return false;
	}
	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
		ZVAL_DEREF(entry);
		if (key && !mb_check_str_encoding(key, encoding)) {
			valid = false;
			break;
		}
		/* `valid` is always true at this point, so plain assignment is enough */
		switch (Z_TYPE_P(entry)) {
			case IS_STRING:
				valid = mb_check_str_encoding(Z_STR_P(entry), encoding);
				break;
			case IS_ARRAY:
				valid = php_mb_check_encoding_recursive(Z_ARRVAL_P(entry), encoding);
				break;
			case IS_LONG:
			case IS_DOUBLE:
			case IS_NULL:
			case IS_TRUE:
			case IS_FALSE:
				break;
			default:
				/* Other types are error. */
				valid = false;
				break;
		}
		/* A `break` inside the switch only leaves the switch; leave the loop
		 * explicitly so that no further entries are scanned after a failure */
		if (!valid) {
			break;
		}
	} ZEND_HASH_FOREACH_END();
	GC_TRY_UNPROTECT_RECURSION(vars);
	return valid;
}
5546 
5547 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)5548 PHP_FUNCTION(mb_check_encoding)
5549 {
5550 	zend_string *input_str = NULL, *enc = NULL;
5551 	HashTable *input_ht = NULL;
5552 	const mbfl_encoding *encoding;
5553 
5554 	ZEND_PARSE_PARAMETERS_START(0, 2)
5555 		Z_PARAM_OPTIONAL
5556 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
5557 		Z_PARAM_STR_OR_NULL(enc)
5558 	ZEND_PARSE_PARAMETERS_END();
5559 
5560 	encoding = php_mb_get_encoding(enc, 2);
5561 	if (!encoding) {
5562 		RETURN_THROWS();
5563 	}
5564 
5565 	if (input_ht) {
5566 		RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
5567 	} else if (input_str) {
5568 		RETURN_BOOL(mb_check_str_encoding(input_str, encoding));
5569 	} else {
5570 		php_error_docref(NULL, E_DEPRECATED,
5571 			"Calling mb_check_encoding() without argument is deprecated");
5572 
5573 		/* FIXME: Actually check all inputs, except $_FILES file content. */
5574 		RETURN_BOOL(MBSTRG(illegalchars) == 0);
5575 	}
5576 }
5577 /* }}} */
5578 
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)5579 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
5580 	const uint32_t enc_name_arg_num)
5581 {
5582 	const mbfl_encoding *enc;
5583 	enum mbfl_no_encoding no_enc;
5584 
5585 	ZEND_ASSERT(str_len > 0);
5586 
5587 	enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
5588 	if (!enc) {
5589 		return -2;
5590 	}
5591 
5592 	no_enc = enc->no_encoding;
5593 	if (php_mb_is_unsupported_no_encoding(no_enc)) {
5594 		zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
5595 		return -2;
5596 	}
5597 
5598 	/* Some legacy text encodings have a minimum required wchar buffer size;
5599 	 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
5600 	uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
5601 	unsigned int state = 0;
5602 	size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
5603 	ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
5604 
5605 	if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
5606 		return -1;
5607 	}
5608 	return wchar_buf[0];
5609 }
5610 
5611 /* {{{ */
PHP_FUNCTION(mb_ord)5612 PHP_FUNCTION(mb_ord)
5613 {
5614 	char *str;
5615 	size_t str_len;
5616 	zend_string *enc = NULL;
5617 	zend_long cp;
5618 
5619 	ZEND_PARSE_PARAMETERS_START(1, 2)
5620 		Z_PARAM_STRING(str, str_len)
5621 		Z_PARAM_OPTIONAL
5622 		Z_PARAM_STR_OR_NULL(enc)
5623 	ZEND_PARSE_PARAMETERS_END();
5624 
5625 	if (str_len == 0) {
5626 		zend_argument_value_error(1, "must not be empty");
5627 		RETURN_THROWS();
5628 	}
5629 
5630 	cp = php_mb_ord(str, str_len, enc, 2);
5631 
5632 	if (0 > cp) {
5633 		if (cp == -2) {
5634 			RETURN_THROWS();
5635 		}
5636 		RETURN_FALSE;
5637 	}
5638 
5639 	RETURN_LONG(cp);
5640 }
5641 /* }}} */
5642 
/* Encode code point `cp` in the encoding named by `enc_name`.
 *
 * Returns a string holding the encoded character, or NULL when the encoding
 * lookup fails, the encoding is rejected by php_mb_is_unsupported_no_encoding()
 * (a value error is raised here), `cp` lies outside 0..0x10FFFF, `cp` is in
 * 0xD800..0xDFFF for UTF-8, or the conversion from UCS-4BE reports illegal
 * characters. `enc_name_arg_num` is passed on to php_mb_get_encoding(). */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (enc == NULL) {
		return NULL;
	}

	enum mbfl_no_encoding no_enc = enc->no_encoding;
	if (php_mb_is_unsupported_no_encoding(no_enc)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", enc->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	zend_string *ret;

	if (php_mb_is_no_encoding_utf8(no_enc)) {
		/* UTF-8 is produced directly, without going through a converter */
		if (cp >= 0xd800 && cp <= 0xdfff) {
			return NULL;
		}

		if (cp < 0x80) {
			return ZSTR_CHAR(cp);
		}

		size_t nbytes = (cp < 0x800) ? 2 : ((cp < 0x10000) ? 3 : 4);
		ret = zend_string_alloc(nbytes, 0);
		unsigned char *out = (unsigned char*)ZSTR_VAL(ret);

		/* Lead byte and middle bytes depend on the length... */
		switch (nbytes) {
			case 2:
				out[0] = 0xc0 | (cp >> 6);
				break;
			case 3:
				out[0] = 0xe0 | (cp >> 12);
				out[1] = 0x80 | ((cp >> 6) & 0x3f);
				break;
			default:
				out[0] = 0xf0 | (cp >> 18);
				out[1] = 0x80 | ((cp >> 12) & 0x3f);
				out[2] = 0x80 | ((cp >> 6) & 0x3f);
				break;
		}
		/* ...while the final byte always carries the low six bits */
		out[nbytes - 1] = 0x80 | (cp & 0x3f);
		out[nbytes] = 0;

		return ret;
	}

	/* Every other encoding: write `cp` as big-endian UCS-4 and convert that */
	char buf[4];
	buf[3] = cp & 0xff;
	buf[2] = (cp >>  8) & 0xff;
	buf[1] = (cp >> 16) & 0xff;
	buf[0] = (cp >> 24) & 0xff;

	/* Use the illegal-character counter to learn whether this one conversion
	 * failed, then put the previous count back */
	long saved_illegalchars = MBSTRG(illegalchars);
	MBSTRG(illegalchars) = 0;
	ret = php_mb_convert_encoding_ex(buf, 4, enc, &mbfl_encoding_ucs4be);

	if (MBSTRG(illegalchars) != 0) {
		zend_string_release(ret);
		ret = NULL;
	}

	MBSTRG(illegalchars) = saved_illegalchars;
	return ret;
}
5712 
5713 /* {{{ */
PHP_FUNCTION(mb_chr)5714 PHP_FUNCTION(mb_chr)
5715 {
5716 	zend_long cp;
5717 	zend_string *enc = NULL;
5718 
5719 	ZEND_PARSE_PARAMETERS_START(1, 2)
5720 		Z_PARAM_LONG(cp)
5721 		Z_PARAM_OPTIONAL
5722 		Z_PARAM_STR_OR_NULL(enc)
5723 	ZEND_PARSE_PARAMETERS_END();
5724 
5725 	zend_string* ret = php_mb_chr(cp, enc, 2);
5726 	if (ret == NULL) {
5727 		RETURN_FALSE;
5728 	}
5729 
5730 	RETURN_STR(ret);
5731 }
5732 /* }}} */
5733 
PHP_FUNCTION(mb_str_pad)5734 PHP_FUNCTION(mb_str_pad)
5735 {
5736 	zend_string *input, *encoding_str = NULL, *pad = ZSTR_CHAR(' ');
5737 	zend_long pad_to_length;
5738 	zend_long pad_type_val = PHP_STR_PAD_RIGHT;
5739 
5740 	ZEND_PARSE_PARAMETERS_START(2, 5)
5741 		Z_PARAM_STR(input)
5742 		Z_PARAM_LONG(pad_to_length)
5743 		Z_PARAM_OPTIONAL
5744 		Z_PARAM_STR(pad)
5745 		Z_PARAM_LONG(pad_type_val)
5746 		Z_PARAM_STR_OR_NULL(encoding_str)
5747 	ZEND_PARSE_PARAMETERS_END();
5748 
5749 	const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
5750 	if (!encoding) {
5751 		RETURN_THROWS();
5752 	}
5753 
5754 	size_t input_length = mb_get_strlen(input, encoding);
5755 
5756 	/* If resulting string turns out to be shorter than input string,
5757 	   we simply copy the input and return. */
5758 	if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
5759 		RETURN_STR_COPY(input);
5760 	}
5761 
5762 	if (ZSTR_LEN(pad) == 0) {
5763 		zend_argument_value_error(3, "must be a non-empty string");
5764 		RETURN_THROWS();
5765 	}
5766 
5767 	if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
5768 		zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
5769 		RETURN_THROWS();
5770 	}
5771 
5772 	size_t pad_length = mb_get_strlen(pad, encoding);
5773 
5774 	size_t num_mb_pad_chars = pad_to_length - input_length;
5775 
5776 	/* We need to figure out the left/right padding lengths. */
5777 	size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
5778 	switch (pad_type_val) {
5779 		case PHP_STR_PAD_RIGHT:
5780 			right_pad = num_mb_pad_chars;
5781 			break;
5782 
5783 		case PHP_STR_PAD_LEFT:
5784 			left_pad = num_mb_pad_chars;
5785 			break;
5786 
5787 		case PHP_STR_PAD_BOTH:
5788 			left_pad = num_mb_pad_chars / 2;
5789 			right_pad = num_mb_pad_chars - left_pad;
5790 			break;
5791 	}
5792 
5793 	/* How many full block copies need to happen, and how many characters are then left over? */
5794 	size_t full_left_pad_copies = left_pad / pad_length;
5795 	size_t full_right_pad_copies = right_pad / pad_length;
5796 	size_t remaining_left_pad_chars = left_pad % pad_length;
5797 	size_t remaining_right_pad_chars = right_pad % pad_length;
5798 
5799 	if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
5800 		goto overflow_no_release;
5801 	}
5802 
5803 	/* Compute the number of bytes required for the padding */
5804 	size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
5805 	size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);
5806 
5807 	/* No special fast-path handling necessary for zero-length pads because these functions will not
5808 	 * allocate memory in case a zero-length pad is required. */
5809 	zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
5810 	zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);
5811 
5812 	if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
5813 		|| full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
5814 		goto overflow;
5815 	}
5816 
5817 	size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
5818 	size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);
5819 
5820 	if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
5821 		|| ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
5822 		goto overflow;
5823 	}
5824 
5825 	zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
5826 	char *buffer = ZSTR_VAL(result);
5827 
5828 	/* First we pad the left. */
5829 	for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5830 		memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5831 	}
5832 	memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
5833 	buffer += ZSTR_LEN(remaining_left_pad_str);
5834 
5835 	/* Then we copy the input string. */
5836 	memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
5837 	buffer += ZSTR_LEN(input);
5838 
5839 	/* Finally, we pad on the right. */
5840 	for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5841 		memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5842 	}
5843 	memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));
5844 
5845 	ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';
5846 
5847 	zend_string_release_ex(remaining_left_pad_str, false);
5848 	zend_string_release_ex(remaining_right_pad_str, false);
5849 
5850 	RETURN_NEW_STR(result);
5851 
5852 overflow:
5853 	zend_string_release_ex(remaining_left_pad_str, false);
5854 	zend_string_release_ex(remaining_right_pad_str, false);
5855 overflow_no_release:
5856 	zend_throw_error(NULL, "String size overflow");
5857 	RETURN_THROWS();
5858 }
5859 
5860 /* {{{ */
PHP_FUNCTION(mb_scrub)5861 PHP_FUNCTION(mb_scrub)
5862 {
5863 	zend_string *str, *enc_name = NULL;
5864 
5865 	ZEND_PARSE_PARAMETERS_START(1, 2)
5866 		Z_PARAM_STR(str)
5867 		Z_PARAM_OPTIONAL
5868 		Z_PARAM_STR_OR_NULL(enc_name)
5869 	ZEND_PARSE_PARAMETERS_END();
5870 
5871 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
5872 	if (!enc) {
5873 		RETURN_THROWS();
5874 	}
5875 
5876 	if (enc == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
5877 		/* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
5878 		RETURN_STR_COPY(str);
5879 	}
5880 
5881 	RETURN_STR(php_mb_convert_encoding_ex(ZSTR_VAL(str), ZSTR_LEN(str), enc, enc));
5882 }
5883 /* }}} */
5884 
5885 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)5886 static void php_mb_populate_current_detect_order_list(void)
5887 {
5888 	const mbfl_encoding **entry = 0;
5889 	size_t nentries;
5890 
5891 	if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
5892 		nentries = MBSTRG(detect_order_list_size);
5893 		entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5894 		memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
5895 	} else {
5896 		const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
5897 		size_t i;
5898 		nentries = MBSTRG(default_detect_order_list_size);
5899 		entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5900 		for (i = 0; i < nentries; i++) {
5901 			entry[i] = mbfl_no2encoding(src[i]);
5902 		}
5903 	}
5904 	MBSTRG(current_detect_order_list) = entry;
5905 	MBSTRG(current_detect_order_list_size) = nentries;
5906 }
5907 /* }}} */
5908 
5909 /* {{{ static int php_mb_encoding_translation() */
php_mb_encoding_translation(void)5910 static int php_mb_encoding_translation(void)
5911 {
5912 	return MBSTRG(encoding_translation);
5913 }
5914 /* }}} */
5915 
php_mb_mbchar_bytes(const char * s,const mbfl_encoding * enc)5916 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
5917 {
5918 	if (enc) {
5919 		if (enc->mblen_table) {
5920 			if (s) {
5921 				return enc->mblen_table[*(unsigned char *)s];
5922 			}
5923 		} else if (enc->flag & MBFL_ENCTYPE_WCS2) {
5924 			return 2;
5925 		} else if (enc->flag & MBFL_ENCTYPE_WCS4) {
5926 			return 4;
5927 		}
5928 	}
5929 	return 1;
5930 }
5931 
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)5932 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
5933 {
5934 	const char *p = s;
5935 	char *last=NULL;
5936 
5937 	if (nbytes == (size_t)-1) {
5938 		size_t nb = 0;
5939 
5940 		while (*p != '\0') {
5941 			if (nb == 0) {
5942 				if ((unsigned char)*p == (unsigned char)c) {
5943 					last = (char *)p;
5944 				}
5945 				nb = php_mb_mbchar_bytes(p, enc);
5946 				if (nb == 0) {
5947 					return NULL; /* something is going wrong! */
5948 				}
5949 			}
5950 			--nb;
5951 			++p;
5952 		}
5953 	} else {
5954 		size_t bcnt = nbytes;
5955 		size_t nbytes_char;
5956 		while (bcnt > 0) {
5957 			if ((unsigned char)*p == (unsigned char)c) {
5958 				last = (char *)p;
5959 			}
5960 			nbytes_char = php_mb_mbchar_bytes(p, enc);
5961 			if (bcnt < nbytes_char) {
5962 				return NULL;
5963 			}
5964 			p += nbytes_char;
5965 			bcnt -= nbytes_char;
5966 		}
5967 	}
5968 	return last;
5969 }
5970 
php_mb_stripos(bool mode,zend_string * haystack,zend_string * needle,zend_long offset,const mbfl_encoding * enc)5971 MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc)
5972 {
5973 	/* We're using simple case-folding here, because we'd have to deal with remapping of
5974 	 * offsets otherwise. */
5975 	zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
5976 	zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
5977 
5978 	size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
5979 
5980 	zend_string_free(haystack_conv);
5981 	zend_string_free(needle_conv);
5982 
5983 	return n;
5984 }
5985 
php_mb_gpc_get_detect_order(const zend_encoding *** list,size_t * list_size)5986 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
5987 {
5988 	*list = (const zend_encoding **)MBSTRG(http_input_list);
5989 	*list_size = MBSTRG(http_input_list_size);
5990 }
5991 /* }}} */
5992 
php_mb_gpc_set_input_encoding(const zend_encoding * encoding)5993 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
5994 {
5995 	MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
5996 }
5997 /* }}} */
5998 
/* Base64 alphabet, indexed by 6-bit value.
 * The implicit NUL terminator of the string literal provides the trailing zero
 * byte, so the array has 65 elements. (Assumes an ASCII execution character set.) */
static const unsigned char base64_table[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+/";
6011 
/* Compute how many bytes the current contents of `tmpbuf` will occupy once
 * transfer-encoded: Base64 if `base64` is true, QPrint otherwise. */
static size_t transfer_encoded_size(mb_convert_buf *tmpbuf, bool base64)
{
	if (base64) {
		/* Every (possibly partial) group of 3 bytes becomes 4 characters */
		return ((mb_convert_buf_len(tmpbuf) + 2) / 3) * 4;
	}

	size_t total = 0;
	const unsigned char *cur = (unsigned char*)ZSTR_VAL(tmpbuf->str);
	const unsigned char *end = tmpbuf->out;

	for (; cur < end; cur++) {
		unsigned char byte = *cur;
		/* The lookup table is consulted only for bytes <= 0x7F */
		bool escaped = byte > 0x7F || byte == '=' || mime_char_needs_qencode[byte];
		/* An escaped byte is written as "=XX" */
		total += escaped ? 3 : 1;
	}
	return total;
}
6026 
/* Transfer-encode all bytes currently held in `tmpbuf` (Base64 if `base64` is
 * true, QPrint otherwise), append the result to `outbuf`, and reset `tmpbuf`
 * to position 0. */
static void transfer_encode_mime_bytes(mb_convert_buf *tmpbuf, mb_convert_buf *outbuf, bool base64)
{
	static const char hex_digits[] = "0123456789ABCDEF";

	unsigned char *dst, *dst_limit;
	MB_CONVERT_BUF_LOAD(outbuf, dst, dst_limit);
	unsigned char *src = (unsigned char*)ZSTR_VAL(tmpbuf->str);
	unsigned char *src_end = tmpbuf->out;

	if (!base64) {
		/* Worst case: every input byte is escaped as "=XX" */
		MB_CONVERT_BUF_ENSURE(outbuf, dst, dst_limit, (src_end - src) * 3);
		for (; src < src_end; src++) {
			unsigned char byte = *src;
			if (byte > 0x7F || byte == '=' || mime_char_needs_qencode[byte]) {
				dst = mb_convert_buf_add3(dst, '=', hex_digits[(byte >> 4) & 0xF], hex_digits[byte & 0xF]);
			} else {
				dst = mb_convert_buf_add(dst, byte);
			}
		}
	} else {
		MB_CONVERT_BUF_ENSURE(outbuf, dst, dst_limit, ((src_end - src) + 2) / 3 * 4);

		/* Whole groups of three input bytes become four output characters */
		for (; (src_end - src) >= 3; src += 3) {
			uint32_t bits = (src[0] << 16) | (src[1] << 8) | src[2];
			dst = mb_convert_buf_add4(dst,
				base64_table[(bits >> 18) & 0x3F],
				base64_table[(bits >> 12) & 0x3F],
				base64_table[(bits >> 6) & 0x3F],
				base64_table[bits & 0x3F]);
		}

		/* One or two leftover bytes are padded out with '=' */
		switch (src_end - src) {
			case 1: {
				uint32_t bits = src[0];
				dst = mb_convert_buf_add4(dst, base64_table[(bits >> 2) & 0x3F], base64_table[(bits & 0x3) << 4], '=', '=');
				break;
			}
			case 2: {
				uint32_t bits = (src[0] << 8) | src[1];
				dst = mb_convert_buf_add4(dst, base64_table[(bits >> 10) & 0x3F], base64_table[(bits >> 4) & 0x3F], base64_table[(bits & 0xF) << 2], '=');
				break;
			}
			default:
				break;
		}
	}

	mb_convert_buf_reset(tmpbuf, 0);
	MB_CONVERT_BUF_STORE(outbuf, dst, dst_limit);
}
6072 
/* Encode `input` (text in encoding `incode`) for use as a MIME header value.
 *
 * - An empty input yields zend_empty_string.
 * - If the input consists only of optional leading spaces followed by printable
 *   ASCII other than '=', '?' and '_', `input` itself is returned with its
 *   refcount incremented.
 * - Otherwise a leading run of space-delimited ASCII 'words' is copied through
 *   unchanged, and from the first word which cannot be, everything up to the end
 *   of the string is converted to `outcode` and emitted as a series of
 *   "=?<mime_name>?B?...?=" (Base64, `base64` true) or "=?<mime_name>?Q?...?="
 *   (QPrint, `base64` false) encoded words.
 *
 * `linefeed` (limited to 8 bytes and cut at the first NUL byte) followed by a
 * single space is inserted between output lines. `indent` is counted against the
 * length of the first output line only; values outside 0..73 are treated as 0.
 * `outcode->mime_name` is passed to strlen() and must therefore be a non-NULL
 * string. */
static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	if (!in_len) {
		return zend_empty_string;
	}

	if (indent < 0 || indent >= 74) {
		indent = 0;
	}

	if (linefeed_len > 8) {
		linefeed_len = 8;
	}
	/* Maintain legacy behavior as regards embedded NUL (zero) bytes in linefeed string */
	for (size_t i = 0; i < linefeed_len; i++) {
		if (linefeed[i] == '\0') {
			linefeed_len = i;
			break;
		}
	}

	unsigned int state = 0;
	/* wchar_buf should be big enough that when it is full, we definitely have enough
	 * wchars to fill an entire line of output */
	uint32_t wchar_buf[80];
	uint32_t *p, *e;
	/* What part of wchar_buf is filled with still-unprocessed data which should not
	 * be overwritten? */
	unsigned int offset = 0;
	size_t line_start = 0;

	/* If the entire input string is ASCII with no spaces (except possibly leading
	 * spaces), just pass it through unchanged */
	bool checking_leading_spaces = true;
	while (in_len) {
		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf, 80, &state);
		p = wchar_buf;
		e = wchar_buf + out_len;

		while (p < e) {
			uint32_t w = *p++;
			if (checking_leading_spaces) {
				if (w == ' ') {
					continue;
				} else {
					checking_leading_spaces = false;
				}
			}
			if (w < 0x21 || w > 0x7E || w == '=' || w == '?' || w == '_') {
				/* We cannot simply pass input string through unchanged; start again */
				in = (unsigned char*)ZSTR_VAL(input);
				in_len = ZSTR_LEN(input);
				goto no_passthrough;
			}
		}
	}

	return zend_string_copy(input); /* This just increments refcount */

no_passthrough: ;

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	/* Encode some prefix of the input string as plain ASCII if possible
	 * If we find it necessary to switch to Base64/QPrint encoding, we will
	 * do so all the way to the end of the string */
	while (in_len) {
		/* Decode part of the input string, refill wchar_buf */
		ZEND_ASSERT(offset < 80);
		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, 80 - offset, &state);
		ZEND_ASSERT(out_len <= 80 - offset);
		p = wchar_buf;
		e = wchar_buf + offset + out_len;
		/* ASCII output is broken into space-delimited 'words'
		 * If we find a non-ASCII character in the middle of a word, we will
		 * transfer-encode the entire word */
		uint32_t *word_start = p;

		/* Don't consider adding line feed for spaces at the beginning of a word */
		while (p < e && *p == ' ' && (p - word_start) <= 74) {
			p++;
		}

		while (p < e) {
			uint32_t w = *p++;

			/* If the still-unfinished 'word' occupies all 80 slots of wchar_buf while
			 * more input remains, carrying it over to the next iteration would make
			 * 'offset' reach 80: there would be no free slot left to decode into and
			 * this loop could never make progress. Such a word is already longer than
			 * the 74 wchars allowed for a plain ASCII word, so switch to MIME encoding.
			 * NOTE(review): this keeps the 'offset < 80' invariant asserted above;
			 * confirm whether to_wchar implementations need more than one free slot. */
			bool word_fills_buf = (p == e && in_len && (e - word_start) >= 80);

			if (w < 0x20 || w > 0x7E || w == '?' || w == '=' || w == '_' || (w == ' ' && (p - word_start) > 74) || word_fills_buf) {
				/* Non-ASCII character (or line too long); switch to Base64/QPrint encoding
				 * If we are already too far along on a line to include Base64/QPrint encoded data
				 * on the same line (without overrunning max line length), then add a line feed
				 * right now */
				if (mb_convert_buf_len(&buf) - line_start + indent + strlen(outcode->mime_name) > 55) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
					buf.out = mb_convert_buf_add(buf.out, ' ');
					indent = 0;
					line_start = mb_convert_buf_len(&buf);
				} else if (mb_convert_buf_len(&buf) > 0) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 1);
					buf.out = mb_convert_buf_add(buf.out, ' ');
				}
				p = word_start; /* Back up to where MIME encoding of input chars should start */
				goto mime_encoding_needed;
			} else if (w == ' ') {
				/* When we see a space, check whether we should insert a line break */
				if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 75) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
					buf.out = mb_convert_buf_add(buf.out, ' ');
					indent = 0;
					line_start = mb_convert_buf_len(&buf);
				} else if (mb_convert_buf_len(&buf) > 0) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
					buf.out = mb_convert_buf_add(buf.out, ' ');
				}
				/* Output one (space-delimited) word as plain ASCII */
				while (word_start < p-1) {
					buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
				}
				word_start++;
				while (p < e && *p == ' ') {
					p++;
				}
			}
		}

		if (in_len) {
			/* Copy chars which are part of an incomplete 'word' to the beginning
			 * of wchar_buf and reprocess them on the next iteration */
			offset = e - word_start;
			if (offset) {
				memmove(wchar_buf, word_start, offset * sizeof(uint32_t));
			}
		} else {
			/* We have reached the end of the input string while still in 'ASCII mode';
			 * process any trailing ASCII chars which were not followed by a space */
			if (word_start < e && mb_convert_buf_len(&buf) > 0) {
				/* The whole input string was not just one big ASCII 'word' with no spaces
				 * consider adding a line feed if necessary to prevent output lines from
				 * being too long */
				if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 74) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
					buf.out = mb_convert_buf_add(buf.out, ' ');
				} else {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
					buf.out = mb_convert_buf_add(buf.out, ' ');
				}
			}
			while (word_start < e) {
				buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
			}
		}
	}

	/* Ensure output string is marked as valid UTF-8 (ASCII strings are always 'valid UTF-8') */
	return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);

mime_encoding_needed: ;

	/* We will generate the output line by line, first converting wchars to bytes
	 * in the requested output encoding, then transfer-encoding those bytes as
	 * Base64 or QPrint
	 * 'tmpbuf' will receive the bytes which need to be transfer-encoded before
	 * sending them to 'buf' */
	mb_convert_buf tmpbuf;
	mb_convert_buf_init(&tmpbuf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	/* Do we need to refill wchar_buf to make sure we don't run out of wchars
	 * in the middle of a line? */
	if (p == wchar_buf) {
		goto start_new_line;
	}
	offset = e - p;
	memmove(wchar_buf, p, offset * sizeof(uint32_t));

	while(true) {
refill_wchar_buf: ;
		ZEND_ASSERT(offset < 80);
		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, 80 - offset, &state);
		ZEND_ASSERT(out_len <= 80 - offset);
		p = wchar_buf;
		e = wchar_buf + offset + out_len;

start_new_line: ;
		MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, strlen(outcode->mime_name) + 5);
		buf.out = mb_convert_buf_add2(buf.out, '=', '?');
		buf.out = mb_convert_buf_appends(buf.out, outcode->mime_name);
		buf.out = mb_convert_buf_add3(buf.out, '?', base64 ? 'B' : 'Q', '?');

		/* How many wchars should we try converting to Base64/QPrint-encoded bytes?
		 * We do something like a 'binary search' to find the greatest number which
		 * can be included on this line without exceeding max line length */
		unsigned int n = 12;
		size_t space_available = 73 - indent - (mb_convert_buf_len(&buf) - line_start);

		while (true) {
			ZEND_ASSERT(p < e);

			/* Remember where we were in process of generating output, so we can back
			 * up if necessary */
			size_t tmppos = mb_convert_buf_len(&tmpbuf);
			unsigned int tmpstate = tmpbuf.state;

			/* Try encoding 'n' wchars in output text encoding and sending output
			 * bytes to 'tmpbuf'. Hopefully this is not too many to fit on the
			 * current line. */
			n = MIN(n, e - p);
			outcode->from_wchar(p, n, &tmpbuf, false);

			/* For some output text encodings, there may be a few ending bytes
			 * which need to be emitted to output before we break a line.
			 * Again, remember where we were so we can back up */
			size_t tmppos2 = mb_convert_buf_len(&tmpbuf);
			unsigned int tmpstate2 = tmpbuf.state;
			outcode->from_wchar(NULL, 0, &tmpbuf, true);

			if (transfer_encoded_size(&tmpbuf, base64) <= space_available || (n == 1 && tmppos == 0)) {
				/* If we convert 'n' more wchars on the current line, it will not
				 * overflow the maximum line length */
				p += n;

				if (p == e) {
					/* We are done; we shouldn't reach here if there is more remaining
					 * of the input string which needs to be processed */
					ZEND_ASSERT(!in_len);
					transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 2);
					buf.out = mb_convert_buf_add2(buf.out, '?', '=');
					mb_convert_buf_free(&tmpbuf);
					return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
				} else {
					/* It's possible that more chars might fit on the current line,
					 * so back up to where we were before emitting any ending bytes */
					mb_convert_buf_reset(&tmpbuf, tmppos2);
					tmpbuf.state = tmpstate2;
				}
			} else {
				/* Converting 'n' more wchars on this line would be too much.
				 * Back up to where we were before we tried that. */
				mb_convert_buf_reset(&tmpbuf, tmppos);
				tmpbuf.state = tmpstate;

				if (n == 1) {
					/* We have found the exact number of chars which will fit on the
					 * current line. Finish up and move to a new line. */
					outcode->from_wchar(NULL, 0, &tmpbuf, true);
					transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
					tmpbuf.state = 0;

					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 3 + linefeed_len);
					buf.out = mb_convert_buf_add2(buf.out, '?', '=');

					indent = 0; /* Indent argument must only affect the first line */

					if (in_len) {
						/* We still have more of input string remaining to decode */
						buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
						buf.out = mb_convert_buf_add(buf.out, ' ');
						line_start = mb_convert_buf_len(&buf);
						/* Copy remaining wchars to beginning of buffer so they will be
						 * processed on the next iteration of the outer loop */
						offset = e - p;
						memmove(wchar_buf, p, offset * sizeof(uint32_t));
						goto refill_wchar_buf;
					} else if (p < e) {
						/* Input string is finished, but we still have trailing wchars
						 * remaining to be processed in wchar_buf */
						buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
						buf.out = mb_convert_buf_add(buf.out, ' ');
						line_start = mb_convert_buf_len(&buf);
						goto start_new_line;
					} else {
						/* We are done! */
						mb_convert_buf_free(&tmpbuf);
						return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
					}
				} else {
					/* Try a smaller number of wchars */
					n = MAX(n >> 1, 1);
				}
			}
		}
	}
}
6362 
PHP_FUNCTION(mb_encode_mimeheader)6363 PHP_FUNCTION(mb_encode_mimeheader)
6364 {
6365 	const mbfl_encoding *charset = &mbfl_encoding_pass;
6366 	zend_string *str, *charset_name = NULL, *transenc_name = NULL;
6367 	char *linefeed = "\r\n";
6368 	size_t linefeed_len = 2;
6369 	zend_long indent = 0;
6370 	bool base64 = true;
6371 
6372 	ZEND_PARSE_PARAMETERS_START(1, 5)
6373 		Z_PARAM_STR(str)
6374 		Z_PARAM_OPTIONAL
6375 		Z_PARAM_STR(charset_name)
6376 		Z_PARAM_STR(transenc_name)
6377 		Z_PARAM_STRING(linefeed, linefeed_len)
6378 		Z_PARAM_LONG(indent)
6379 	ZEND_PARSE_PARAMETERS_END();
6380 
6381 	if (charset_name != NULL) {
6382 		charset = php_mb_get_encoding(charset_name, 2);
6383 		if (!charset) {
6384 			RETURN_THROWS();
6385 		} else if (charset->mime_name == NULL || charset->mime_name[0] == '\0') {
6386 			zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
6387 			RETURN_THROWS();
6388 		}
6389 	} else {
6390 		const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
6391 		if (lang != NULL) {
6392 			charset = mbfl_no2encoding(lang->mail_charset);
6393 			const mbfl_encoding *transenc = mbfl_no2encoding(lang->mail_header_encoding);
6394 			char t = transenc->name[0];
6395 			if (t == 'Q' || t == 'q') {
6396 				base64 = false;
6397 			}
6398 		}
6399 	}
6400 
6401 	if (transenc_name != NULL && ZSTR_LEN(transenc_name) > 0) {
6402 		char t = ZSTR_VAL(transenc_name)[0];
6403 		if (t == 'Q' || t == 'q') {
6404 			base64 = false;
6405 		}
6406 	}
6407 
6408 	RETURN_STR(mb_mime_header_encode(str, MBSTRG(current_internal_encoding), charset, base64, linefeed, linefeed_len, indent));
6409 }
6410 
/* Map one Base64 alphabet character to its 6-bit value (0..63).
 * Returns -1 for any byte which is not part of the alphabet (including '='). */
static int8_t decode_base64(unsigned char c)
{
	switch (c) {
		case '+':
			return 62;
		case '/':
			return 63;
		default:
			break;
	}

	if ('A' <= c && c <= 'Z') {
		return c - 'A';
	}
	if ('a' <= c && c <= 'z') {
		return 26 + (c - 'a');
	}
	if ('0' <= c && c <= '9') {
		return 52 + (c - '0');
	}
	return -1;
}
6426 
/* Value of each byte when read as a hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f'),
 * or -1 if the byte is not a hexadecimal digit. Indexed by byte value (0..255). */
static int8_t qprint_map[] = {
	/* 0x00 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x08 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x10 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x18 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x20 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x28 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x30 */  0,  1,  2,  3,  4,  5,  6,  7, /* '0'..'7' */
	/* 0x38 */  8,  9, -1, -1, -1, -1, -1, -1, /* '8', '9' */
	/* 0x40 */ -1, 10, 11, 12, 13, 14, 15, -1, /* 'A'..'F' */
	/* 0x48 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x50 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x58 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x60 */ -1, 10, 11, 12, 13, 14, 15, -1, /* 'a'..'f' */
	/* 0x68 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x70 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x78 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x80 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x88 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x90 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x98 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xA0 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xA8 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xB0 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xB8 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xC0 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xC8 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xD0 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xD8 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xE0 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xE8 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xF0 */ -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xF8 */ -1, -1, -1, -1, -1, -1, -1, -1
};
6445 
6446 /* Decode MIME encoded word as defined in RFC 2047 */
mime_header_decode_encoded_word(unsigned char * p,unsigned char * e,const mbfl_encoding * outcode,mb_convert_buf * outbuf,unsigned int * state)6447 static unsigned char* mime_header_decode_encoded_word(unsigned char *p, unsigned char *e, const mbfl_encoding *outcode, mb_convert_buf *outbuf, unsigned int *state)
6448 {
6449 	if ((e - p) < 6) {
6450 		return NULL;
6451 	}
6452 
6453 	ZEND_ASSERT(p[0] == '=');
6454 	ZEND_ASSERT(p[1] == '?');
6455 	p += 2;
6456 
6457 	unsigned char *charset = p;
6458 	unsigned char *charset_end = memchr(charset, '?', e - charset);
6459 	if (charset_end == NULL) {
6460 		return NULL;
6461 	}
6462 
6463 	unsigned char *encoding = charset_end + 1;
6464 	p = encoding + 1;
6465 	if (p >= e || *p++ != '?') {
6466 		return NULL;
6467 	}
6468 
6469 	char *charset_name = estrndup((const char*)charset, charset_end - charset);
6470 	const mbfl_encoding *incode = mbfl_name2encoding(charset_name);
6471 	efree(charset_name);
6472 	if (incode == NULL) {
6473 		return NULL;
6474 	}
6475 
6476 	unsigned char *end_marker = (unsigned char*)zend_memnstr((const char*)p, "?=", 2, (const char*)e);
6477 	if (end_marker) {
6478 		e = end_marker;
6479 	} else if (p < e && *(e-1) == '?') {
6480 		/* If encoded word is not properly terminated, but last byte is '?',
6481 		 * take that as a terminator (legacy behavior) */
6482 		e--;
6483 	}
6484 
6485 	unsigned char *buf = emalloc(e - p), *bufp = buf;
6486 	if (*encoding == 'Q' || *encoding == 'q') {
6487 		/* Fill `buf` with bytes from decoding QPrint */
6488 		while (p < e) {
6489 			unsigned char c = *p++;
6490 			if (c == '_') {
6491 				*bufp++ = ' ';
6492 				continue;
6493 			} else if (c == '=' && (e - p) >= 2) {
6494 				unsigned char c2 = *p++;
6495 				unsigned char c3 = *p++;
6496 				if (qprint_map[c2] >= 0 && qprint_map[c3] >= 0) {
6497 					*bufp++ = (qprint_map[c2] << 4) | (qprint_map[c3] & 0xF);
6498 					continue;
6499 				} else if (c2 == '\r') {
6500 					if (c3 != '\n') {
6501 						p--;
6502 					}
6503 					continue;
6504 				} else if (c2 == '\n') {
6505 					p--;
6506 					continue;
6507 				}
6508 			}
6509 			*bufp++ = c;
6510 		}
6511 	} else if (*encoding == 'B' || *encoding == 'b') {
6512 		/* Fill `buf` with bytes from decoding Base64 */
6513 		unsigned int bits = 0, cache = 0;
6514 		while (p < e) {
6515 			unsigned char c = *p++;
6516 			if (c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '=') {
6517 				continue;
6518 			}
6519 			int8_t decoded = decode_base64(c);
6520 			if (decoded == -1) {
6521 				*bufp++ = '?';
6522 				continue;
6523 			}
6524 			bits += 6;
6525 			cache = (cache << 6) | (decoded & 0x3F);
6526 			if (bits == 24) {
6527 				*bufp++ = (cache >> 16) & 0xFF;
6528 				*bufp++ = (cache >> 8) & 0xFF;
6529 				*bufp++ = cache & 0xFF;
6530 				bits = cache = 0;
6531 			}
6532 		}
6533 		if (bits == 18) {
6534 			*bufp++ = (cache >> 10) & 0xFF;
6535 			*bufp++ = (cache >> 2) & 0xFF;
6536 		} else if (bits == 12) {
6537 			*bufp++ = (cache >> 4) & 0xFF;
6538 		}
6539 	} else {
6540 		efree(buf);
6541 		return NULL;
6542 	}
6543 
6544 	size_t in_len = bufp - buf;
6545 	uint32_t wchar_buf[128];
6546 
6547 	bufp = buf;
6548 	while (in_len) {
6549 		size_t out_len = incode->to_wchar(&bufp, &in_len, wchar_buf, 128, state);
6550 		ZEND_ASSERT(out_len <= 128);
6551 		outcode->from_wchar(wchar_buf, out_len, outbuf, false);
6552 	}
6553 
6554 	efree(buf);
6555 	return e + 2;
6556 }
6557 
/* Decode a MIME header value: every valid RFC 2047 encoded word in `input` is
 * decoded and converted to `outcode`; all other text is treated as ASCII and
 * converted to `outcode` as well. A run of whitespace which starts with CR or LF
 * is collapsed into a single space, except between two adjacent encoded words
 * and at the very end of the input, where it is dropped.
 * Returns a new string in encoding `outcode`. */
static zend_string* mb_mime_header_decode(zend_string *input, const mbfl_encoding *outcode)
{
	unsigned char *p = (unsigned char*)ZSTR_VAL(input), *e = p + ZSTR_LEN(input);
	unsigned int state = 0;
	bool space_pending = false;

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, ZSTR_LEN(input), '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	while (p < e) {
		unsigned char c = *p;

		/* Check the remaining length first, so that p[1] is only examined when it
		 * is known to lie inside the input */
		if (c == '=' && (e - p) >= 6 && *(p + 1) == '?') {
			/* Does this look like a MIME encoded word? If so, try to decode it as one */
			unsigned char *incode_end = memchr(p + 2, '?', e - p - 2);
			if (incode_end && (e - incode_end) >= 3) {
				unsigned char *temp = mime_header_decode_encoded_word(p, e, outcode, &buf, &state);
				if (temp) {
					p = temp;
					/* A space which was pending from the whitespace before this encoded
					 * word must not be output, since it turned out to be followed by a
					 * valid encoded word */
					space_pending = false;
					/* Decoding of MIME encoded word was successful;
					 * Try to collapse a run of whitespace */
					if (p < e && (*p == '\n' || *p == '\r')) {
						do {
							p++;
						} while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
						/* We will only actually output a space if this is not immediately followed
						 * by another valid encoded word */
						space_pending = true;
					}
					continue;
				}
			}
		}

		if (space_pending) {
			uint32_t space = ' ';
			outcode->from_wchar(&space, 1, &buf, false);
			space_pending = false;
		}

		/* Consume a run of plain ASCII characters */
		if (c != '\n' && c != '\r') {
			/* The run always includes the current byte (even if it is '='), and
			 * extends up to the next '=', CR, LF, or the end of the input */
			unsigned char *end = p + 1;
			while (end < e && (*end != '=' && *end != '\n' && *end != '\r')) {
				end++;
			}
			uint32_t wchar_buf[128];
			size_t in_len = end - p;
			while (in_len) {
				size_t out_len = mbfl_encoding_ascii.to_wchar(&p, &in_len, wchar_buf, 128, &state);
				ZEND_ASSERT(out_len <= 128);
				outcode->from_wchar(wchar_buf, out_len, &buf, false);
			}
		}
		/* Collapse a run of whitespace into a single space */
		if (p < e && (*p == '\n' || *p == '\r')) {
			do {
				p++;
			} while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
			if (p < e) {
				/* Emulating legacy behavior of mb_decode_mimeheader here;
				 * a run of whitespace is not converted to a space at the very
				 * end of the input string */
				uint32_t space = ' ';
				outcode->from_wchar(&space, 1, &buf, false);
			}
		}
	}

	/* Let the output encoding emit any ending bytes it needs */
	outcode->from_wchar(NULL, 0, &buf, true);

	return mb_convert_buf_result(&buf, outcode);
}
6631 
PHP_FUNCTION(mb_decode_mimeheader)6632 PHP_FUNCTION(mb_decode_mimeheader)
6633 {
6634 	zend_string *str;
6635 
6636 	ZEND_PARSE_PARAMETERS_START(1, 1)
6637 		Z_PARAM_STR(str)
6638 	ZEND_PARSE_PARAMETERS_END();
6639 
6640 	RETURN_STR(mb_mime_header_decode(str, MBSTRG(current_internal_encoding)));
6641 }
6642