xref: /php-src/ext/mbstring/mbstring.c (revision c96e8946)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp>              |
14    |         Rui Hirokawa <hirokawa@php.net>                              |
15    |         Hironori Sato <satoh@jpnnet.com>                             |
16    |         Shigeru Kanemoto <sgk@happysize.co.jp>                       |
17    +----------------------------------------------------------------------+
18 */
19 
20 /* {{{ includes */
21 #include <limits.h>
22 
23 #include "libmbfl/config.h"
24 #include "php.h"
25 #include "php_ini.h"
26 #include "php_variables.h"
27 #include "mbstring.h"
28 #include "ext/standard/php_string.h"
29 #include "ext/standard/php_mail.h"
30 #include "ext/standard/exec.h"
31 #include "ext/standard/url.h"
32 #include "main/php_output.h"
33 #include "ext/standard/info.h"
34 #include "ext/pcre/php_pcre.h"
35 
36 #include "libmbfl/mbfl/mbfilter_8bit.h"
37 #include "libmbfl/mbfl/mbfilter_pass.h"
38 #include "libmbfl/mbfl/mbfilter_wchar.h"
39 #include "libmbfl/mbfl/eaw_table.h"
40 #include "libmbfl/filters/mbfilter_base64.h"
41 #include "libmbfl/filters/mbfilter_cjk.h"
42 #include "libmbfl/filters/mbfilter_qprint.h"
43 #include "libmbfl/filters/mbfilter_htmlent.h"
44 #include "libmbfl/filters/mbfilter_uuencode.h"
45 #include "libmbfl/filters/mbfilter_ucs4.h"
46 #include "libmbfl/filters/mbfilter_utf8.h"
47 #include "libmbfl/filters/mbfilter_utf16.h"
48 #include "libmbfl/filters/mbfilter_singlebyte.h"
49 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
50 #include "libmbfl/filters/unicode_prop.h"
51 
52 #include "php_variables.h"
53 #include "php_globals.h"
54 #include "rfc1867.h"
55 #include "php_content_types.h"
56 #include "SAPI.h"
57 #include "php_unicode.h"
58 #include "TSRM.h"
59 
60 #include "mb_gpc.h"
61 
62 #ifdef HAVE_MBREGEX
63 # include "php_mbregex.h"
64 #endif
65 
66 #include "zend_smart_str.h"
67 #include "zend_multibyte.h"
68 #include "mbstring_arginfo.h"
69 
70 #include "rare_cp_bitvec.h"
71 
72 /* }}} */
73 
74 /* {{{ prototypes */
/* Module globals for this extension; they are read and written through MBSTRG() below. */
75 ZEND_DECLARE_MODULE_GLOBALS(mbstring)
76 
/* Globals init/shutdown hooks; both are referenced from mbstring_module_entry. */
77 static PHP_GINIT_FUNCTION(mbstring);
78 static PHP_GSHUTDOWN_FUNCTION(mbstring);
79 
/* Forward declarations of helpers whose definitions are not in this part of the file. */
80 static void php_mb_populate_current_detect_order_list(void);
81 
82 static int php_mb_encoding_translation(void);
83 
84 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);
85 
86 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
87 
88 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
89 
90 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
91 
92 static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);
93 
/* Used by php_mb_zend_encoding_detector() to pick an encoding from a candidate list. */
94 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);
95 
96 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);
97 
98 /* See mbfilter_cp5022x.c */
99 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
100 /* }}} */
101 
102 /* {{{ php_mb_default_identify_list */
/* One row of php_mb_default_identify_list: a language and the encodings listed for it. */
103 typedef struct _php_mb_nls_ident_list {
104 	enum mbfl_no_language lang; /* language this row applies to */
105 	const enum mbfl_no_encoding *list; /* encodings listed for that language */
106 	size_t list_size; /* number of elements in list */
107 } php_mb_nls_ident_list;
108 
/* Encodings listed for mbfl_no_language_japanese in php_mb_default_identify_list. */
109 static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
110 	mbfl_no_encoding_ascii,
111 	mbfl_no_encoding_jis,
112 	mbfl_no_encoding_utf8,
113 	mbfl_no_encoding_euc_jp,
114 	mbfl_no_encoding_sjis
115 };
116 
/* Encodings listed for mbfl_no_language_simplified_chinese in php_mb_default_identify_list. */
117 static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
118 	mbfl_no_encoding_ascii,
119 	mbfl_no_encoding_utf8,
120 	mbfl_no_encoding_euc_cn,
121 	mbfl_no_encoding_cp936
122 };
123 
/* Encodings listed for mbfl_no_language_traditional_chinese in php_mb_default_identify_list. */
124 static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
125 	mbfl_no_encoding_ascii,
126 	mbfl_no_encoding_utf8,
127 	mbfl_no_encoding_euc_tw,
128 	mbfl_no_encoding_big5
129 };
130 
/* Encodings listed for mbfl_no_language_korean in php_mb_default_identify_list. */
131 static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
132 	mbfl_no_encoding_ascii,
133 	mbfl_no_encoding_utf8,
134 	mbfl_no_encoding_euc_kr,
135 	mbfl_no_encoding_uhc
136 };
137 
/* Encodings listed for mbfl_no_language_russian in php_mb_default_identify_list. */
138 static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
139 	mbfl_no_encoding_ascii,
140 	mbfl_no_encoding_utf8,
141 	mbfl_no_encoding_koi8r,
142 	mbfl_no_encoding_cp1251,
143 	mbfl_no_encoding_cp866
144 };
145 
/* Encodings listed for mbfl_no_language_armenian in php_mb_default_identify_list. */
146 static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
147 	mbfl_no_encoding_ascii,
148 	mbfl_no_encoding_utf8,
149 	mbfl_no_encoding_armscii8
150 };
151 
/* Encodings listed for mbfl_no_language_turkish in php_mb_default_identify_list. */
152 static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
153 	mbfl_no_encoding_ascii,
154 	mbfl_no_encoding_utf8,
155 	mbfl_no_encoding_cp1254,
156 	mbfl_no_encoding_8859_9
157 };
158 
/* Encodings listed for mbfl_no_language_ukrainian in php_mb_default_identify_list. */
159 static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
160 	mbfl_no_encoding_ascii,
161 	mbfl_no_encoding_utf8,
162 	mbfl_no_encoding_koi8u
163 };
164 
/* Encodings listed for mbfl_no_language_neutral; php_mb_nls_get_default_detect_order_list()
 * also hands this list out when the requested language has no row of its own. */
165 static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
166 	mbfl_no_encoding_ascii,
167 	mbfl_no_encoding_utf8
168 };
169 
170 
/* Language -> encoding-list table searched linearly by php_mb_nls_get_default_detect_order_list().
 * Each row stores the list pointer together with its element count (sizeof array / sizeof element). */
171 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
172 	{ mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
173 	{ mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
174 	{ mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
175 	{ mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
176 	{ mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
177 	{ mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
178 	{ mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
179 	{ mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
180 	{ mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
181 };
182 
183 /* }}} */
184 
185 /* {{{ mbstring_deps[] */
/* Module dependency list referenced from mbstring_module_entry: "pcre" is required.
 * The regex helpers in this file call pcre2_* and php_pcre_* functions. */
186 static const zend_module_dep mbstring_deps[] = {
187 	ZEND_MOD_REQUIRED("pcre")
188 	ZEND_MOD_END
189 };
190 /* }}} */
191 
192 /* {{{ zend_module_entry mbstring_module_entry */
/* Module descriptor for the "mbstring" extension: it names the dependency list, the
 * function table, the lifecycle hooks and the globals init/shutdown hooks.
 * Initializers are positional, so their order must not change. */
193 zend_module_entry mbstring_module_entry = {
194 	STANDARD_MODULE_HEADER_EX,
195 	NULL,
196 	mbstring_deps, /* dependency list defined above */
197 	"mbstring", /* extension name */
198 	ext_functions, /* NOTE(review): presumably declared in mbstring_arginfo.h -- confirm */
199 	PHP_MINIT(mbstring),
200 	PHP_MSHUTDOWN(mbstring),
201 	PHP_RINIT(mbstring),
202 	PHP_RSHUTDOWN(mbstring),
203 	PHP_MINFO(mbstring),
204 	PHP_MBSTRING_VERSION,
205 	PHP_MODULE_GLOBALS(mbstring),
206 	PHP_GINIT(mbstring), /* prototype declared near the top of this file */
207 	PHP_GSHUTDOWN(mbstring), /* prototype declared near the top of this file */
208 	NULL,
209 	STANDARD_MODULE_PROPERTIES_EX
210 };
211 /* }}} */
212 
213 /* {{{ static sapi_post_entry php_post_entries[] */
/* POST content-type table using php_std_post_handler for form data.
 * Each row: content type, its length without the NUL, a reader callback, a handler callback.
 * The all-NULL row terminates the table. */
214 static const sapi_post_entry php_post_entries[] = {
215 	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data,	php_std_post_handler },
216 	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
217 	{ NULL, 0, NULL, NULL }
218 };
219 /* }}} */
220 
/* Only compiled when COMPILE_DL_MBSTRING is defined: emits ZEND_GET_MODULE(mbstring),
 * preceded by the TSRMLS cache definition when ZTS is also defined. */
221 #ifdef COMPILE_DL_MBSTRING
222 #ifdef ZTS
223 ZEND_TSRMLS_CACHE_DEFINE()
224 #endif
225 ZEND_GET_MODULE(mbstring)
226 #endif
227 
228 /* {{{ static sapi_post_entry mbstr_post_entries[] */
/* Same layout as php_post_entries, but form data is handled by php_mb_post_handler
 * instead of php_std_post_handler. The all-NULL row terminates the table. */
229 static const sapi_post_entry mbstr_post_entries[] = {
230 	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
231 	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
232 	{ NULL, 0, NULL, NULL }
233 };
234 /* }}} */
235 
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)236 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
237 	if (encoding_name) {
238 		const mbfl_encoding *encoding;
239 		zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
240 		if (last_encoding_name && (last_encoding_name == encoding_name
241 				|| zend_string_equals_ci(encoding_name, last_encoding_name))) {
242 			return MBSTRG(last_used_encoding);
243 		}
244 
245 		encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
246 		if (!encoding) {
247 			zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
248 			return NULL;
249 		} else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
250 			if (encoding == &mbfl_encoding_base64) {
251 				php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
252 			} else if (encoding == &mbfl_encoding_qprint) {
253 				php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
254 			} else if (encoding == &mbfl_encoding_html_ent) {
255 				php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
256 			} else if (encoding == &mbfl_encoding_uuencode) {
257 				php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
258 			}
259 		}
260 
261 		if (last_encoding_name) {
262 			zend_string_release(last_encoding_name);
263 		}
264 		MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
265 		MBSTRG(last_used_encoding) = encoding;
266 		return encoding;
267 	} else {
268 		return MBSTRG(current_internal_encoding);
269 	}
270 }
271 
php_mb_get_encoding_or_pass(const char * encoding_name,size_t encoding_name_len)272 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name, size_t encoding_name_len) {
273 	if (strncmp(encoding_name, "pass", encoding_name_len) == 0) {
274 		return &mbfl_encoding_pass;
275 	}
276 
277 	return mbfl_name2encoding_ex(encoding_name, encoding_name_len);
278 }
279 
/* Count the ',' bytes in the half-open range [p, end). */
static size_t count_commas(const char *p, const char *end) {
	size_t total = 0;
	const char *hit = memchr(p, ',', end - p);

	while (hit != NULL) {
		total++;
		/* Resume the search just past the comma that was found. */
		hit = memchr(hit + 1, ',', end - (hit + 1));
	}
	return total;
}
288 
289 /* {{{ static zend_result php_mb_parse_encoding_list()
290  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
291  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
292  */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num)293 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
294 	const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num)
295 {
296 	if (value == NULL || value_length == 0) {
297 		*return_list = NULL;
298 		*return_size = 0;
299 		return SUCCESS;
300 	} else {
301 		bool included_auto;
302 		size_t n, size;
303 		const char *p1, *endp, *tmpstr;
304 		const mbfl_encoding **entry, **list;
305 
306 		if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
307 			tmpstr = value + 1;
308 			value_length -= 2;
309 		} else {
310 			tmpstr = value;
311 		}
312 
313 		endp = tmpstr + value_length;
314 		size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
315 		list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
316 		entry = list;
317 		n = 0;
318 		included_auto = 0;
319 		p1 = tmpstr;
320 		while (1) {
321 			const char *comma = memchr(p1, ',', endp - p1);
322 			const char *p = comma ? comma : endp;
323 			/* trim spaces */
324 			while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
325 				p1++;
326 			}
327 			p--;
328 			while (p > p1 && (*p == ' ' || *p == '\t')) {
329 				p--;
330 			}
331 			size_t p1_length = p - p1 + 1;
332 			/* convert to the encoding number and check encoding */
333 			if (strncasecmp(p1, "auto", p1_length) == 0) {
334 				if (!included_auto) {
335 					const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
336 					const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
337 					size_t i;
338 					included_auto = 1;
339 					for (i = 0; i < identify_list_size; i++) {
340 						*entry++ = mbfl_no2encoding(*src++);
341 						n++;
342 					}
343 				}
344 			} else {
345 				const mbfl_encoding *encoding = mbfl_name2encoding_ex(p1, p1_length);
346 				if (!encoding) {
347 					/* Called from an INI setting modification */
348 					if (arg_num == 0) {
349 						php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%.*s\"", (int) p1_length, p1);
350 					} else {
351 						zend_argument_value_error(arg_num, "contains invalid encoding \"%.*s\"", (int) p1_length, p1);
352 					}
353 					pefree(ZEND_VOIDP(list), persistent);
354 					return FAILURE;
355 				}
356 
357 				*entry++ = encoding;
358 				n++;
359 			}
360 			if (n >= size || comma == NULL) {
361 				break;
362 			}
363 			p1 = comma + 1;
364 		}
365 		*return_list = list;
366 		*return_size = n;
367 	}
368 
369 	return SUCCESS;
370 }
371 /* }}} */
372 
373 /* {{{
374  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
375  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
376  */
php_mb_parse_encoding_array(HashTable * target_hash,const mbfl_encoding *** return_list,size_t * return_size,uint32_t arg_num)377 static zend_result php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
378 	size_t *return_size, uint32_t arg_num)
379 {
380 	/* Allocate enough space to include the default detect order if "auto" is used. */
381 	size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
382 	const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
383 	const mbfl_encoding **entry = list;
384 	bool included_auto = 0;
385 	size_t n = 0;
386 	zval *hash_entry;
387 	ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
388 		zend_string *encoding_str = zval_try_get_string(hash_entry);
389 		if (UNEXPECTED(!encoding_str)) {
390 			efree(ZEND_VOIDP(list));
391 			return FAILURE;
392 		}
393 
394 		if (zend_string_equals_literal_ci(encoding_str, "auto")) {
395 			if (!included_auto) {
396 				const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
397 				const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
398 				size_t j;
399 
400 				included_auto = 1;
401 				for (j = 0; j < identify_list_size; j++) {
402 					*entry++ = mbfl_no2encoding(*src++);
403 					n++;
404 				}
405 			}
406 		} else {
407 			const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
408 			if (encoding) {
409 				*entry++ = encoding;
410 				n++;
411 			} else {
412 				zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
413 				zend_string_release(encoding_str);
414 				efree(ZEND_VOIDP(list));
415 				return FAILURE;
416 			}
417 		}
418 		zend_string_release(encoding_str);
419 	} ZEND_HASH_FOREACH_END();
420 	*return_list = list;
421 	*return_size = n;
422 	return SUCCESS;
423 }
424 /* }}} */
425 
426 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)427 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
428 {
429 	return (const zend_encoding*)mbfl_name2encoding(encoding_name);
430 }
431 
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)432 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
433 {
434 	return ((const mbfl_encoding *)encoding)->name;
435 }
436 
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)437 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
438 {
439 	const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
440 	return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
441 }
442 
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)443 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
444 {
445 	if (!list) {
446 		list = (const zend_encoding**)MBSTRG(current_detect_order_list);
447 		list_size = MBSTRG(current_detect_order_list_size);
448 	}
449 	if (list_size == 1 && ((mbfl_encoding*)*list) == &mbfl_encoding_pass) {
450 		/* Emulate behavior of previous implementation; it would never return "pass"
451 		 * from an encoding auto-detection operation */
452 		return NULL;
453 	}
454 	return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding**)list, list_size, false, false);
455 }
456 
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)457 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
458 {
459 	unsigned int num_errors = 0;
460 	zend_string *result = mb_fast_convert((unsigned char*)from, from_length, (const mbfl_encoding*)encoding_from, (const mbfl_encoding*)encoding_to, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
461 
462 	*to_length = ZSTR_LEN(result);
463 	*to = emalloc(ZSTR_LEN(result) + 1); /* Include terminating null byte */
464 	memcpy(*to, ZSTR_VAL(result), ZSTR_LEN(result) + 1);
465 	zend_string_free(result);
466 
467 	return from_length;
468 }
469 
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)470 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
471 {
472 	return php_mb_parse_encoding_list(
473 		encoding_list, encoding_list_len,
474 		(const mbfl_encoding ***)return_list, return_size,
475 		persistent, /* arg_num */ 0);
476 }
477 
php_mb_zend_internal_encoding_getter(void)478 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
479 {
480 	return (const zend_encoding *)MBSTRG(internal_encoding);
481 }
482 
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)483 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
484 {
485 	MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
486 	return SUCCESS;
487 }
488 
/* Callback table bundling the zend_multibyte handlers defined above under the
 * provider name "mbstring". Initializers are positional.
 * NOTE(review): presumably registered with the engine elsewhere in this file -- confirm. */
489 static zend_multibyte_functions php_mb_zend_multibyte_functions = {
490 	"mbstring",
491 	php_mb_zend_encoding_fetcher, /* name -> encoding */
492 	php_mb_zend_encoding_name_getter, /* encoding -> name */
493 	php_mb_zend_encoding_lexer_compatibility_checker, /* GL_UNSAFE flag test */
494 	php_mb_zend_encoding_detector, /* guess encoding of a byte string */
495 	php_mb_zend_encoding_converter, /* convert between two encodings */
496 	php_mb_zend_encoding_list_parser, /* parse comma-separated encoding list */
497 	php_mb_zend_internal_encoding_getter,
498 	php_mb_zend_internal_encoding_setter
499 };
500 /* }}} */
501 
502 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)503 static void *_php_mb_compile_regex(const char *pattern)
504 {
505 	pcre2_code *retval;
506 	PCRE2_SIZE err_offset;
507 	int errnum;
508 
509 	if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
510 			PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
511 		PCRE2_UCHAR err_str[128];
512 		pcre2_get_error_message(errnum, err_str, sizeof(err_str));
513 		php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
514 	}
515 	return retval;
516 }
517 /* }}} */
518 
519 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)520 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
521 {
522 	int res;
523 
524 	pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
525 	if (NULL == match_data) {
526 		pcre2_code_free(opaque);
527 		php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
528 		return FAILURE;
529 	}
530 	res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
531 	php_pcre_free_match_data(match_data);
532 
533 	return res;
534 }
535 /* }}} */
536 
537 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)538 static void _php_mb_free_regex(void *opaque)
539 {
540 	pcre2_code_free(opaque);
541 }
542 /* }}} */
543 
544 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)545 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
546 {
547 	size_t i;
548 
549 	*plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
550 	*plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
551 
552 	for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
553 		if (php_mb_default_identify_list[i].lang == lang) {
554 			*plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
555 			*plist_size = php_mb_default_identify_list[i].list_size;
556 			return 1;
557 		}
558 	}
559 	return 0;
560 }
561 /* }}} */
562 
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)563 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
564 {
565 	char *result = emalloc(len + 2);
566 	char *resp = result;
567 	size_t i;
568 
569 	for (i = 0; i < len && start[i] != quote; ++i) {
570 		if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
571 			*resp++ = start[++i];
572 		} else {
573 			size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
574 
575 			while (j-- > 0 && i < len) {
576 				*resp++ = start[i++];
577 			}
578 			--i;
579 		}
580 	}
581 
582 	*resp = '\0';
583 	return result;
584 }
585 
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)586 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
587 {
588 	char *pos = *line, quote;
589 	char *res;
590 
591 	while (*pos && *pos != stop) {
592 		if ((quote = *pos) == '"' || quote == '\'') {
593 			++pos;
594 			while (*pos && *pos != quote) {
595 				if (*pos == '\\' && pos[1] && pos[1] == quote) {
596 					pos += 2;
597 				} else {
598 					++pos;
599 				}
600 			}
601 			if (*pos) {
602 				++pos;
603 			}
604 		} else {
605 			pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
606 
607 		}
608 	}
609 	if (*pos == '\0') {
610 		res = estrdup(*line);
611 		*line += strlen(*line);
612 		return res;
613 	}
614 
615 	res = estrndup(*line, pos - *line);
616 
617 	while (*pos == stop) {
618 		pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
619 	}
620 
621 	*line = pos;
622 	return res;
623 }
624 /* }}} */
625 
/* Extract one configuration-style word from `str`.
 *
 * Leading whitespace is skipped. If the word starts with a single or double quote,
 * the text after that quote is handed to php_mb_rfc1867_substring_conf() with the
 * quote as terminator. Otherwise the word runs up to the next whitespace byte or the
 * end of the string. Returns a newly allocated string ("" when nothing is left). */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	for (; *str != '\0' && isspace(*(unsigned char *)str); ++str) {
		/* skip leading whitespace */
	}

	if (*str == '\0') {
		return estrdup("");
	}

	const char first = *str;
	if (first == '"' || first == '\'') {
		char *quoted_body = str + 1;

		return php_mb_rfc1867_substring_conf(encoding, quoted_body, strlen(quoted_body), first);
	}

	size_t word_len = 0;
	while (str[word_len] != '\0' && !isspace((unsigned char)str[word_len])) {
		word_len++;
	}
	return php_mb_rfc1867_substring_conf(encoding, str, word_len, 0);
}
650 /* }}} */
651 
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)652 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
653 {
654 	char *s, *s2;
655 	const size_t filename_len = strlen(filename);
656 
657 	/* The \ check should technically be needed for win32 systems only where
658 	 * it is a valid path separator. However, IE in all it's wisdom always sends
659 	 * the full path of the file on the user's filesystem, which means that unless
660 	 * the user does basename() they get a bogus file name. Until IE's user base drops
661 	 * to nill or problem is fixed this code must remain enabled for all systems. */
662 	s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
663 	s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
664 
665 	if (s && s2) {
666 		if (s > s2) {
667 			return ++s;
668 		} else {
669 			return ++s2;
670 		}
671 	} else if (s) {
672 		return ++s;
673 	} else if (s2) {
674 		return ++s2;
675 	} else {
676 		return filename;
677 	}
678 }
679 /* }}} */
680 
681 /* {{{ php.ini directive handler */
682 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)683 static PHP_INI_MH(OnUpdate_mbstring_language)
684 {
685 	enum mbfl_no_language no_language;
686 
687 	no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
688 	if (no_language == mbfl_no_language_invalid) {
689 		MBSTRG(language) = mbfl_no_language_neutral;
690 		return FAILURE;
691 	}
692 	MBSTRG(language) = no_language;
693 	php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
694 	return SUCCESS;
695 }
696 /* }}} */
697 
698 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)699 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
700 {
701 	const mbfl_encoding **list;
702 	size_t size;
703 
704 	if (!new_value) {
705 		if (MBSTRG(detect_order_list)) {
706 			pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
707 		}
708 		MBSTRG(detect_order_list) = NULL;
709 		MBSTRG(detect_order_list_size) = 0;
710 		return SUCCESS;
711 	}
712 
713 	if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
714 		return FAILURE;
715 	}
716 
717 	if (MBSTRG(detect_order_list)) {
718 		pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
719 	}
720 	MBSTRG(detect_order_list) = list;
721 	MBSTRG(detect_order_list_size) = size;
722 	return SUCCESS;
723 }
724 /* }}} */
725 
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)726 static zend_result _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
727 	const mbfl_encoding **list;
728 	size_t size;
729 	if (new_value_length == 4 && strncmp(new_value, "pass", 4) == 0) {
730 		list = (const mbfl_encoding**)pecalloc(1, sizeof(mbfl_encoding*), 1);
731 		*list = &mbfl_encoding_pass;
732 		size = 1;
733 	} else if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
734 		return FAILURE;
735 	}
736 	if (MBSTRG(http_input_list)) {
737 		pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
738 	}
739 	MBSTRG(http_input_list) = list;
740 	MBSTRG(http_input_list_size) = size;
741 	return SUCCESS;
742 }
743 
744 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)745 static PHP_INI_MH(OnUpdate_mbstring_http_input)
746 {
747 	if (new_value) {
748 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
749 	}
750 
751 	if (!new_value || !ZSTR_LEN(new_value)) {
752 		const char *encoding = php_get_input_encoding();
753 		MBSTRG(http_input_set) = 0;
754 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
755 		return SUCCESS;
756 	}
757 
758 	MBSTRG(http_input_set) = 1;
759 	return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
760 }
761 /* }}} */
762 
_php_mb_ini_mbstring_http_output_set(const char * new_value,size_t length)763 static zend_result _php_mb_ini_mbstring_http_output_set(const char *new_value, size_t length) {
764 	const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value, length);
765 	if (!encoding) {
766 		return FAILURE;
767 	}
768 
769 	MBSTRG(http_output_encoding) = encoding;
770 	MBSTRG(current_http_output_encoding) = encoding;
771 	return SUCCESS;
772 }
773 
774 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)775 static PHP_INI_MH(OnUpdate_mbstring_http_output)
776 {
777 	if (new_value) {
778 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
779 	}
780 
781 	if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
782 		const char *encoding = php_get_output_encoding();
783 		MBSTRG(http_output_set) = 0;
784 		_php_mb_ini_mbstring_http_output_set(encoding, strlen(encoding));
785 		return SUCCESS;
786 	}
787 
788 	MBSTRG(http_output_set) = 1;
789 	return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
790 }
791 /* }}} */
792 
793 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)794 static zend_result _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
795 {
796 	const mbfl_encoding *encoding;
797 
798 	if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
799 		/* falls back to UTF-8 if an unknown encoding name is given */
800 		if (new_value) {
801 			php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
802 		}
803 		encoding = &mbfl_encoding_utf8;
804 	}
805 	MBSTRG(internal_encoding) = encoding;
806 	MBSTRG(current_internal_encoding) = encoding;
807 #ifdef HAVE_MBREGEX
808 	{
809 		const char *enc_name = new_value;
810 		if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
811 			/* falls back to UTF-8 if an unknown encoding name is given */
812 			enc_name = "UTF-8";
813 			php_mb_regex_set_default_mbctype(enc_name);
814 		}
815 		php_mb_regex_set_mbctype(new_value);
816 	}
817 #endif
818 	return SUCCESS;
819 }
820 /* }}} */
821 
822 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)823 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
824 {
825 	if (new_value) {
826 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
827 	}
828 
829 	if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
830 		return FAILURE;
831 	}
832 
833 	if (new_value && ZSTR_LEN(new_value)) {
834 		MBSTRG(internal_encoding_set) = 1;
835 		return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
836 	} else {
837 		const char *encoding = php_get_internal_encoding();
838 		MBSTRG(internal_encoding_set) = 0;
839 		return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
840 	}
841 }
842 /* }}} */
843 
844 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)845 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
846 {
847 	if (new_value != NULL) {
848 		if (zend_string_equals_literal_ci(new_value, "none")) {
849 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
850 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
851 		} else if (zend_string_equals_literal_ci(new_value, "long")) {
852 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
853 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
854 		} else if (zend_string_equals_literal_ci(new_value, "entity")) {
855 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
856 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
857 		} else {
858 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
859 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
860 			if (ZSTR_LEN(new_value) > 0) {
861 				char *endptr = NULL;
862 				int c = strtol(ZSTR_VAL(new_value), &endptr, 0);
863 
864 				if (*endptr == '\0') {
865 					MBSTRG(filter_illegal_substchar) = c;
866 					MBSTRG(current_filter_illegal_substchar) = c;
867 				}
868 			}
869 		}
870 	} else {
871 		MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
872 		MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
873 		MBSTRG(filter_illegal_substchar) = '?';
874 		MBSTRG(current_filter_illegal_substchar) = '?';
875 	}
876 
877 	return SUCCESS;
878 }
879 /* }}} */
880 
881 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)882 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
883 {
884 	if (new_value == NULL) {
885 		return FAILURE;
886 	}
887 
888 	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
889 
890 	if (MBSTRG(encoding_translation)) {
891 		sapi_unregister_post_entry(php_post_entries);
892 		sapi_register_post_entries(mbstr_post_entries);
893 	} else {
894 		sapi_unregister_post_entry(mbstr_post_entries);
895 		sapi_register_post_entries(php_post_entries);
896 	}
897 
898 	return SUCCESS;
899 }
900 /* }}} */
901 
/* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes) */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)903 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
904 {
905 	zend_string *tmp;
906 	void *re = NULL;
907 
908 	if (!new_value) {
909 		new_value = entry->orig_value;
910 	}
911 	tmp = php_trim(new_value, NULL, 0, 3);
912 
913 	if (ZSTR_LEN(tmp) > 0) {
914 		if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
915 			zend_string_release_ex(tmp, 0);
916 			return FAILURE;
917 		}
918 	}
919 
920 	if (MBSTRG(http_output_conv_mimetypes)) {
921 		_php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
922 	}
923 
924 	MBSTRG(http_output_conv_mimetypes) = re;
925 
926 	zend_string_release_ex(tmp, 0);
927 	return SUCCESS;
928 }
929 /* }}} */
930 /* }}} */
931 
932 /* {{{ php.ini directive registration */
933 PHP_INI_BEGIN()
934 	PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
935 	PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
936 	PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
937 	PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
938 	STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
939 	PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
940 
941 	STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
942 		PHP_INI_SYSTEM | PHP_INI_PERDIR,
943 		OnUpdate_mbstring_encoding_translation,
944 		encoding_translation, zend_mbstring_globals, mbstring_globals)
945 	PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
946 		"^(text/|application/xhtml\\+xml)",
947 		PHP_INI_ALL,
948 		OnUpdate_mbstring_http_output_conv_mimetypes)
949 
950 	STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
951 		PHP_INI_ALL,
952 		OnUpdateBool,
953 		strict_detection, zend_mbstring_globals, mbstring_globals)
954 #ifdef HAVE_MBREGEX
955 	STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
956 	STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
957 #endif
PHP_INI_END()958 PHP_INI_END()
959 /* }}} */
960 
961 static void mbstring_internal_encoding_changed_hook(void) {
962 	/* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
963 	if (!MBSTRG(internal_encoding_set)) {
964 		const char *encoding = php_get_internal_encoding();
965 		_php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
966 	}
967 
968 	if (!MBSTRG(http_output_set)) {
969 		const char *encoding = php_get_output_encoding();
970 		_php_mb_ini_mbstring_http_output_set(encoding, strlen(encoding));
971 	}
972 
973 	if (!MBSTRG(http_input_set)) {
974 		const char *encoding = php_get_input_encoding();
975 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
976 	}
977 }
978 
979 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)980 static PHP_GINIT_FUNCTION(mbstring)
981 {
982 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
983 ZEND_TSRMLS_CACHE_UPDATE();
984 #endif
985 
986 	mbstring_globals->language = mbfl_no_language_uni;
987 	mbstring_globals->internal_encoding = NULL;
988 	mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
989 	mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
990 	mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
991 	mbstring_globals->http_input_identify = NULL;
992 	mbstring_globals->http_input_identify_get = NULL;
993 	mbstring_globals->http_input_identify_post = NULL;
994 	mbstring_globals->http_input_identify_cookie = NULL;
995 	mbstring_globals->http_input_identify_string = NULL;
996 	mbstring_globals->http_input_list = NULL;
997 	mbstring_globals->http_input_list_size = 0;
998 	mbstring_globals->detect_order_list = NULL;
999 	mbstring_globals->detect_order_list_size = 0;
1000 	mbstring_globals->current_detect_order_list = NULL;
1001 	mbstring_globals->current_detect_order_list_size = 0;
1002 	mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1003 	mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1004 	mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1005 	mbstring_globals->filter_illegal_substchar = '?';
1006 	mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1007 	mbstring_globals->current_filter_illegal_substchar = '?';
1008 	mbstring_globals->illegalchars = 0;
1009 	mbstring_globals->encoding_translation = 0;
1010 	mbstring_globals->strict_detection = 0;
1011 	mbstring_globals->outconv_enabled = false;
1012 	mbstring_globals->outconv_state = 0;
1013 	mbstring_globals->http_output_conv_mimetypes = NULL;
1014 #ifdef HAVE_MBREGEX
1015 	mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1016 #endif
1017 	mbstring_globals->last_used_encoding_name = NULL;
1018 	mbstring_globals->last_used_encoding = NULL;
1019 	mbstring_globals->internal_encoding_set = 0;
1020 	mbstring_globals->http_output_set = 0;
1021 	mbstring_globals->http_input_set = 0;
1022 	mbstring_globals->all_encodings_list = NULL;
1023 }
1024 /* }}} */
1025 
1026 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1027 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1028 {
1029 	if (mbstring_globals->http_input_list) {
1030 		free(ZEND_VOIDP(mbstring_globals->http_input_list));
1031 	}
1032 	if (mbstring_globals->detect_order_list) {
1033 		free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1034 	}
1035 	if (mbstring_globals->http_output_conv_mimetypes) {
1036 		_php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1037 	}
1038 #ifdef HAVE_MBREGEX
1039 	php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1040 #endif
1041 }
1042 /* }}} */
1043 
1044 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1045 static void init_check_utf8(void);
1046 #endif
1047 
1048 /* {{{ PHP_MINIT_FUNCTION(mbstring) */
PHP_MINIT_FUNCTION(mbstring)1049 PHP_MINIT_FUNCTION(mbstring)
1050 {
1051 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
1052 ZEND_TSRMLS_CACHE_UPDATE();
1053 #endif
1054 
1055 	REGISTER_INI_ENTRIES();
1056 
1057 	/* We assume that we're the only user of the hook. */
1058 	ZEND_ASSERT(php_internal_encoding_changed == NULL);
1059 	php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
1060 	mbstring_internal_encoding_changed_hook();
1061 
1062 	/* This is a global handler. Should not be set in a per-request handler. */
1063 	sapi_register_treat_data(mbstr_treat_data);
1064 
1065 	/* Post handlers are stored in the thread-local context. */
1066 	if (MBSTRG(encoding_translation)) {
1067 		sapi_register_post_entries(mbstr_post_entries);
1068 	}
1069 
1070 #ifdef HAVE_MBREGEX
1071 	PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1072 #endif
1073 
1074 	register_mbstring_symbols(module_number);
1075 
1076 	if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
1077 		return FAILURE;
1078 	}
1079 
1080 	php_rfc1867_set_multibyte_callbacks(
1081 		php_mb_encoding_translation,
1082 		php_mb_gpc_get_detect_order,
1083 		php_mb_gpc_set_input_encoding,
1084 		php_mb_rfc1867_getword,
1085 		php_mb_rfc1867_getword_conf,
1086 		php_mb_rfc1867_basename);
1087 
1088 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1089 	init_check_utf8();
1090 	init_convert_utf16();
1091 #endif
1092 
1093 	return SUCCESS;
1094 }
1095 /* }}} */
1096 
1097 /* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
PHP_MSHUTDOWN_FUNCTION(mbstring)1098 PHP_MSHUTDOWN_FUNCTION(mbstring)
1099 {
1100 	UNREGISTER_INI_ENTRIES();
1101 
1102 	zend_multibyte_restore_functions();
1103 
1104 #ifdef HAVE_MBREGEX
1105 	PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1106 #endif
1107 
1108 	php_internal_encoding_changed = NULL;
1109 
1110 	return SUCCESS;
1111 }
1112 /* }}} */
1113 
1114 /* {{{ PHP_RINIT_FUNCTION(mbstring) */
PHP_RINIT_FUNCTION(mbstring)1115 PHP_RINIT_FUNCTION(mbstring)
1116 {
1117 	MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
1118 	MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
1119 	MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
1120 	MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);
1121 
1122 	MBSTRG(illegalchars) = 0;
1123 
1124 	php_mb_populate_current_detect_order_list();
1125 
1126 #ifdef HAVE_MBREGEX
1127 	PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1128 #endif
1129 	zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));
1130 
1131 	return SUCCESS;
1132 }
1133 /* }}} */
1134 
1135 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1136 PHP_RSHUTDOWN_FUNCTION(mbstring)
1137 {
1138 	if (MBSTRG(current_detect_order_list) != NULL) {
1139 		efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1140 		MBSTRG(current_detect_order_list) = NULL;
1141 		MBSTRG(current_detect_order_list_size) = 0;
1142 	}
1143 
1144 	/* clear http input identification. */
1145 	MBSTRG(http_input_identify) = NULL;
1146 	MBSTRG(http_input_identify_post) = NULL;
1147 	MBSTRG(http_input_identify_get) = NULL;
1148 	MBSTRG(http_input_identify_cookie) = NULL;
1149 	MBSTRG(http_input_identify_string) = NULL;
1150 
1151 	if (MBSTRG(last_used_encoding_name)) {
1152 		zend_string_release(MBSTRG(last_used_encoding_name));
1153 		MBSTRG(last_used_encoding_name) = NULL;
1154 	}
1155 
1156 	MBSTRG(internal_encoding_set) = 0;
1157 	MBSTRG(http_output_set) = 0;
1158 	MBSTRG(http_input_set) = 0;
1159 
1160 	MBSTRG(outconv_enabled) = false;
1161 	MBSTRG(outconv_state) = 0;
1162 
1163 	if (MBSTRG(all_encodings_list)) {
1164 		GC_DELREF(MBSTRG(all_encodings_list));
1165 		zend_array_destroy(MBSTRG(all_encodings_list));
1166 		MBSTRG(all_encodings_list) = NULL;
1167 	}
1168 
1169 #ifdef HAVE_MBREGEX
1170 	PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1171 #endif
1172 
1173 	return SUCCESS;
1174 }
1175 /* }}} */
1176 
1177 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1178 PHP_MINFO_FUNCTION(mbstring)
1179 {
1180 	php_info_print_table_start();
1181 	php_info_print_table_row(2, "Multibyte Support", "enabled");
1182 	php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1183 	php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1184 	{
1185 		char tmp[256];
1186 		snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1187 		php_info_print_table_row(2, "libmbfl version", tmp);
1188 	}
1189 	php_info_print_table_end();
1190 
1191 	php_info_print_table_start();
1192 	php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1193 	php_info_print_table_end();
1194 
1195 #ifdef HAVE_MBREGEX
1196 	PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1197 #endif
1198 
1199 	DISPLAY_INI_ENTRIES();
1200 }
1201 /* }}} */
1202 
1203 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1204 PHP_FUNCTION(mb_language)
1205 {
1206 	zend_string *name = NULL;
1207 
1208 	ZEND_PARSE_PARAMETERS_START(0, 1)
1209 		Z_PARAM_OPTIONAL
1210 		Z_PARAM_STR_OR_NULL(name)
1211 	ZEND_PARSE_PARAMETERS_END();
1212 
1213 	if (name == NULL) {
1214 		RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1215 	} else {
1216 		zend_string *ini_name = ZSTR_INIT_LITERAL("mbstring.language", 0);
1217 		if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1218 			zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1219 			zend_string_release_ex(ini_name, 0);
1220 			RETURN_THROWS();
1221 		}
1222 		// TODO Make return void
1223 		RETVAL_TRUE;
1224 		zend_string_release_ex(ini_name, 0);
1225 	}
1226 }
1227 /* }}} */
1228 
1229 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1230 PHP_FUNCTION(mb_internal_encoding)
1231 {
1232 	char *name = NULL;
1233 	size_t name_len;
1234 	const mbfl_encoding *encoding;
1235 
1236 	ZEND_PARSE_PARAMETERS_START(0, 1)
1237 		Z_PARAM_OPTIONAL
1238 		Z_PARAM_STRING_OR_NULL(name, name_len)
1239 	ZEND_PARSE_PARAMETERS_END();
1240 
1241 	if (name == NULL) {
1242 		ZEND_ASSERT(MBSTRG(current_internal_encoding));
1243 		RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1244 	} else {
1245 		encoding = mbfl_name2encoding(name);
1246 		if (!encoding) {
1247 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1248 			RETURN_THROWS();
1249 		} else {
1250 			MBSTRG(current_internal_encoding) = encoding;
1251 			MBSTRG(internal_encoding_set) = 1;
1252 			/* TODO Return old encoding */
1253 			RETURN_TRUE;
1254 		}
1255 	}
1256 }
1257 /* }}} */
1258 
1259 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1260 PHP_FUNCTION(mb_http_input)
1261 {
1262 	char *type = NULL;
1263 	size_t type_len = 0, n;
1264 	const mbfl_encoding **entry;
1265 	const mbfl_encoding *encoding;
1266 
1267 	ZEND_PARSE_PARAMETERS_START(0, 1)
1268 		Z_PARAM_OPTIONAL
1269 		Z_PARAM_STRING_OR_NULL(type, type_len)
1270 	ZEND_PARSE_PARAMETERS_END();
1271 
1272 	if (type == NULL) {
1273 		encoding = MBSTRG(http_input_identify);
1274 	} else if (type_len != 1) {
1275 		zend_argument_value_error(1,
1276 			"must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1277 		RETURN_THROWS();
1278 	} else {
1279 		switch (*type) {
1280 		case 'G':
1281 		case 'g':
1282 			encoding = MBSTRG(http_input_identify_get);
1283 			break;
1284 		case 'P':
1285 		case 'p':
1286 			encoding = MBSTRG(http_input_identify_post);
1287 			break;
1288 		case 'C':
1289 		case 'c':
1290 			encoding = MBSTRG(http_input_identify_cookie);
1291 			break;
1292 		case 'S':
1293 		case 's':
1294 			encoding = MBSTRG(http_input_identify_string);
1295 			break;
1296 		case 'I':
1297 		case 'i':
1298 			entry = MBSTRG(http_input_list);
1299 			n = MBSTRG(http_input_list_size);
1300 			array_init(return_value);
1301 			for (size_t i = 0; i < n; i++, entry++) {
1302 				add_next_index_string(return_value, (*entry)->name);
1303 			}
1304 			return;
1305 		case 'L':
1306 		case 'l':
1307 			entry = MBSTRG(http_input_list);
1308 			n = MBSTRG(http_input_list_size);
1309 			if (n == 0) {
1310 				RETURN_FALSE;
1311 			}
1312 
1313 			smart_str result = {0};
1314 			for (size_t i = 0; i < n; i++, entry++) {
1315 				if (i > 0) {
1316 					smart_str_appendc(&result, ',');
1317 				}
1318 				smart_str_appends(&result, (*entry)->name);
1319 			}
1320 			RETURN_STR(smart_str_extract(&result));
1321 		default:
1322 			zend_argument_value_error(1,
1323 				"must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1324 			RETURN_THROWS();
1325 		}
1326 	}
1327 
1328 	if (encoding) {
1329 		RETURN_STRING(encoding->name);
1330 	} else {
1331 		RETURN_FALSE;
1332 	}
1333 }
1334 /* }}} */
1335 
1336 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1337 PHP_FUNCTION(mb_http_output)
1338 {
1339 	char *name = NULL;
1340 	size_t name_len;
1341 
1342 	ZEND_PARSE_PARAMETERS_START(0, 1)
1343 		Z_PARAM_OPTIONAL
1344 		Z_PARAM_PATH_OR_NULL(name, name_len) /* For null byte check */
1345 	ZEND_PARSE_PARAMETERS_END();
1346 
1347 	if (name == NULL) {
1348 		ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1349 		RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1350 	} else {
1351 		const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name, name_len);
1352 		if (!encoding) {
1353 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1354 			RETURN_THROWS();
1355 		} else {
1356 			MBSTRG(http_output_set) = 1;
1357 			MBSTRG(current_http_output_encoding) = encoding;
1358 			/* TODO Return previous encoding? */
1359 			RETURN_TRUE;
1360 		}
1361 	}
1362 }
1363 /* }}} */
1364 
1365 /* {{{ Sets the current detect_order or Return the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1366 PHP_FUNCTION(mb_detect_order)
1367 {
1368 	zend_string *order_str = NULL;
1369 	HashTable *order_ht = NULL;
1370 
1371 	ZEND_PARSE_PARAMETERS_START(0, 1)
1372 		Z_PARAM_OPTIONAL
1373 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1374 	ZEND_PARSE_PARAMETERS_END();
1375 
1376 	if (!order_str && !order_ht) {
1377 		size_t n = MBSTRG(current_detect_order_list_size);
1378 		const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1379 		array_init(return_value);
1380 		for (size_t i = 0; i < n; i++) {
1381 			add_next_index_string(return_value, (*entry)->name);
1382 			entry++;
1383 		}
1384 	} else {
1385 		const mbfl_encoding **list;
1386 		size_t size;
1387 		if (order_ht) {
1388 			if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1389 				RETURN_THROWS();
1390 			}
1391 		} else {
1392 			if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1)) {
1393 				RETURN_THROWS();
1394 			}
1395 		}
1396 
1397 		if (size == 0) {
1398 			efree(ZEND_VOIDP(list));
1399 			zend_argument_value_error(1, "must specify at least one encoding");
1400 			RETURN_THROWS();
1401 		}
1402 
1403 		if (MBSTRG(current_detect_order_list)) {
1404 			efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1405 		}
1406 		MBSTRG(current_detect_order_list) = list;
1407 		MBSTRG(current_detect_order_list_size) = size;
1408 		RETURN_TRUE;
1409 	}
1410 }
1411 /* }}} */
1412 
/* Returns true when `cp` may be used as a substitute character: it must lie in
 * 0..0x10FFFF and must not be a surrogate (0xD800..0xDFFF). Surrogates are
 * never valid on their own and only a single substitute character is allowed.
 *
 * As we do not know the target encoding of the conversion operation that is
 * going to use the substitution character, we cannot check whether the
 * codepoint is actually mapped in that encoding, so everything else is
 * accepted. */
static inline bool php_mb_check_code_point(zend_long cp)
{
	const bool in_unicode_range = (cp >= 0 && cp < 0x110000);
	const bool is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);

	return in_unicode_range && !is_surrogate;
}
1432 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1433 PHP_FUNCTION(mb_substitute_character)
1434 {
1435 	zend_string *substitute_character = NULL;
1436 	zend_long substitute_codepoint;
1437 	bool substitute_is_null = 1;
1438 
1439 	ZEND_PARSE_PARAMETERS_START(0, 1)
1440 		Z_PARAM_OPTIONAL
1441 		Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1442 	ZEND_PARSE_PARAMETERS_END();
1443 
1444 	if (substitute_is_null) {
1445 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1446 			RETURN_STRING("none");
1447 		}
1448 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1449 			RETURN_STRING("long");
1450 		}
1451 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1452 			RETURN_STRING("entity");
1453 		}
1454 		RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1455 	}
1456 
1457 	if (substitute_character != NULL) {
1458 		if (zend_string_equals_literal_ci(substitute_character, "none")) {
1459 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1460 			RETURN_TRUE;
1461 		}
1462 		if (zend_string_equals_literal_ci(substitute_character, "long")) {
1463 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1464 			RETURN_TRUE;
1465 		}
1466 		if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1467 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1468 			RETURN_TRUE;
1469 		}
1470 		/* Invalid string value */
1471 		zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1472 		RETURN_THROWS();
1473 	}
1474 	/* Integer codepoint passed */
1475 	if (!php_mb_check_code_point(substitute_codepoint)) {
1476 		zend_argument_value_error(1, "is not a valid codepoint");
1477 		RETURN_THROWS();
1478 	}
1479 
1480 	MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1481 	MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1482 	RETURN_TRUE;
1483 }
1484 /* }}} */
1485 
1486 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1487 PHP_FUNCTION(mb_preferred_mime_name)
1488 {
1489 	char *name = NULL;
1490 	size_t name_len;
1491 
1492 	ZEND_PARSE_PARAMETERS_START(1, 1)
1493 		Z_PARAM_STRING(name, name_len)
1494 	ZEND_PARSE_PARAMETERS_END();
1495 
1496 	const mbfl_encoding *enc = mbfl_name2encoding(name);
1497 	if (enc == NULL) {
1498 		zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1499 		RETURN_THROWS();
1500 	}
1501 
1502 	const char *preferred_name = mbfl_encoding_preferred_mime_name(enc);
1503 	if (preferred_name == NULL || *preferred_name == '\0') {
1504 		php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1505 		RETVAL_FALSE;
1506 	} else {
1507 		RETVAL_STRING((char *)preferred_name);
1508 	}
1509 }
1510 /* }}} */
1511 
1512 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1513 PHP_FUNCTION(mb_parse_str)
1514 {
1515 	zval *track_vars_array = NULL;
1516 	char *encstr;
1517 	size_t encstr_len;
1518 	php_mb_encoding_handler_info_t info;
1519 	const mbfl_encoding *detected;
1520 
1521 	ZEND_PARSE_PARAMETERS_START(2, 2)
1522 		Z_PARAM_STRING(encstr, encstr_len)
1523 		Z_PARAM_ZVAL(track_vars_array)
1524 	ZEND_PARSE_PARAMETERS_END();
1525 
1526 	track_vars_array = zend_try_array_init(track_vars_array);
1527 	if (!track_vars_array) {
1528 		RETURN_THROWS();
1529 	}
1530 
1531 	encstr = estrndup(encstr, encstr_len);
1532 
1533 	info.data_type              = PARSE_STRING;
1534 	info.separator              = PG(arg_separator).input;
1535 	info.report_errors          = true;
1536 	info.to_encoding            = MBSTRG(current_internal_encoding);
1537 	info.from_encodings         = MBSTRG(http_input_list);
1538 	info.num_from_encodings     = MBSTRG(http_input_list_size);
1539 
1540 	detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1541 
1542 	MBSTRG(http_input_identify) = detected;
1543 
1544 	RETVAL_BOOL(detected);
1545 
1546 	if (encstr != NULL) efree(encstr);
1547 }
1548 /* }}} */
1549 
PHP_FUNCTION(mb_output_handler)1550 PHP_FUNCTION(mb_output_handler)
1551 {
1552 	zend_string *str;
1553 	zend_long arg_status;
1554 
1555 	ZEND_PARSE_PARAMETERS_START(2, 2)
1556 		Z_PARAM_STR(str)
1557 		Z_PARAM_LONG(arg_status)
1558 	ZEND_PARSE_PARAMETERS_END();
1559 
1560 	const mbfl_encoding *encoding = MBSTRG(current_http_output_encoding);
1561 	if (encoding == &mbfl_encoding_pass) {
1562 		RETURN_STR_COPY(str);
1563 	}
1564 
1565 	if (arg_status & PHP_OUTPUT_HANDLER_START) {
1566 		bool free_mimetype = false;
1567 		char *mimetype = NULL;
1568 
1569 		/* Analyze mime type */
1570 		if (SG(sapi_headers).mimetype && _php_mb_match_regex(MBSTRG(http_output_conv_mimetypes), SG(sapi_headers).mimetype, strlen(SG(sapi_headers).mimetype))) {
1571 			char *s;
1572 			if ((s = strchr(SG(sapi_headers).mimetype, ';')) == NULL) {
1573 				mimetype = estrdup(SG(sapi_headers).mimetype);
1574 			} else {
1575 				mimetype = estrndup(SG(sapi_headers).mimetype, s - SG(sapi_headers).mimetype);
1576 			}
1577 			free_mimetype = true;
1578 		} else if (SG(sapi_headers).send_default_content_type) {
1579 			mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1580 		}
1581 
1582 		/* If content-type is not yet set, set it and enable conversion */
1583 		if (SG(sapi_headers).send_default_content_type || free_mimetype) {
1584 			const char *charset = encoding->mime_name;
1585 			if (charset) {
1586 				char *p;
1587 				size_t len = spprintf(&p, 0, "Content-Type: %s; charset=%s",  mimetype, charset);
1588 				if (sapi_add_header(p, len, 0) != FAILURE) {
1589 					SG(sapi_headers).send_default_content_type = 0;
1590 				}
1591 			}
1592 
1593 			MBSTRG(outconv_enabled) = true;
1594 		}
1595 
1596 		if (free_mimetype) {
1597 			efree(mimetype);
1598 		}
1599 	}
1600 
1601 	if (!MBSTRG(outconv_enabled)) {
1602 		RETURN_STR_COPY(str);
1603 	}
1604 
1605 	mb_convert_buf buf;
1606 	mb_convert_buf_init(&buf, ZSTR_LEN(str), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1607 
1608 	uint32_t wchar_buf[128];
1609 	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
1610 	size_t in_len = ZSTR_LEN(str);
1611 	bool last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1612 
1613 	while (in_len) {
1614 		size_t out_len = MBSTRG(current_internal_encoding)->to_wchar(&in, &in_len, wchar_buf, 128, &MBSTRG(outconv_state));
1615 		ZEND_ASSERT(out_len <= 128);
1616 		encoding->from_wchar(wchar_buf, out_len, &buf, !in_len && last_feed);
1617 	}
1618 
1619 	MBSTRG(illegalchars) += buf.errors;
1620 	RETVAL_STR(mb_convert_buf_result_raw(&buf));
1621 
1622 	if (last_feed) {
1623 		MBSTRG(outconv_enabled) = false;
1624 		MBSTRG(outconv_state) = 0;
1625 	}
1626 }
1627 
PHP_FUNCTION(mb_str_split)1628 PHP_FUNCTION(mb_str_split)
1629 {
1630 	zend_string *str, *encoding = NULL;
1631 	zend_long split_len = 1;
1632 
1633 	ZEND_PARSE_PARAMETERS_START(1, 3)
1634 		Z_PARAM_STR(str)
1635 		Z_PARAM_OPTIONAL
1636 		Z_PARAM_LONG(split_len)
1637 		Z_PARAM_STR_OR_NULL(encoding)
1638 	ZEND_PARSE_PARAMETERS_END();
1639 
1640 	if (split_len <= 0) {
1641 		zend_argument_value_error(2, "must be greater than 0");
1642 		RETURN_THROWS();
1643 	} else if (split_len > UINT_MAX / 4) {
1644 		zend_argument_value_error(2, "is too large");
1645 		RETURN_THROWS();
1646 	}
1647 
1648 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
1649 	if (!enc) {
1650 		RETURN_THROWS();
1651 	}
1652 
1653 	if (ZSTR_LEN(str) == 0) {
1654 		RETURN_EMPTY_ARRAY();
1655 	}
1656 
1657 	unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
1658 
1659 	unsigned int char_len = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
1660 	if (char_len) {
1661 		unsigned int chunk_len = char_len * split_len;
1662 		unsigned int chunks = ((ZSTR_LEN(str) / chunk_len) + split_len - 1) / split_len; /* round up */
1663 		array_init_size(return_value, chunks);
1664 		while (p < e) {
1665 			add_next_index_stringl(return_value, (const char*)p, MIN(chunk_len, e - p));
1666 			p += chunk_len;
1667 		}
1668 	} else if (enc->mblen_table) {
1669 		unsigned char const *mbtab = enc->mblen_table;
1670 
1671 		/* Assume that we have 1-byte characters */
1672 		array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1673 
1674 		while (p < e) {
1675 			unsigned char *chunk = p; /* start of chunk */
1676 
1677 			for (int char_count = 0; char_count < split_len && p < e; char_count++) {
1678 				p += mbtab[*p];
1679 			}
1680 			if (p > e) {
1681 				p = e; /* ensure chunk is in bounds */
1682 			}
1683 			add_next_index_stringl(return_value, (const char*)chunk, p - chunk);
1684 		}
1685 	} else {
1686 		/* Assume that we have 1-byte characters */
1687 		array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1688 
1689 		uint32_t wchar_buf[128];
1690 		size_t in_len = ZSTR_LEN(str);
1691 		unsigned int state = 0, char_count = 0;
1692 
1693 		mb_convert_buf buf;
1694 
1695 		while (in_len) {
1696 			size_t out_len = enc->to_wchar(&p, &in_len, wchar_buf, 128, &state);
1697 			ZEND_ASSERT(out_len <= 128);
1698 			size_t i = 0;
1699 
1700 			/* Is there some output remaining from the previous iteration? */
1701 			if (char_count) {
1702 				if (out_len >= split_len - char_count) {
1703 					/* Finish off an incomplete chunk from previous iteration
1704 					 * ('buf' was already initialized; we don't need to do it again) */
1705 					enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
1706 					i += split_len - char_count;
1707 					char_count = 0;
1708 					add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1709 				} else {
1710 					/* Output from this iteration is not enough to finish the next chunk;
1711 					 * output what we can, and leave 'buf' to be used again on next iteration */
1712 					enc->from_wchar(wchar_buf, out_len, &buf, !in_len);
1713 					char_count += out_len;
1714 					continue;
1715 				}
1716 			}
1717 
1718 			while (i < out_len) {
1719 				/* Prepare for the next chunk */
1720 				mb_convert_buf_init(&buf, split_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1721 
1722 				if (out_len - i >= split_len) {
1723 					enc->from_wchar(wchar_buf + i, split_len, &buf, true);
1724 					i += split_len;
1725 					add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1726 				} else {
1727 					/* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
1728 					 * leave them for the next iteration */
1729 					enc->from_wchar(wchar_buf + i, out_len - i, &buf, !in_len);
1730 					char_count = out_len - i;
1731 					break;
1732 				}
1733 			}
1734 		}
1735 
1736 		if (char_count) {
1737 			/* The main loop above has finished processing the input string, but
1738 			 * has left a partial chunk in 'buf' */
1739 			add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1740 		}
1741 	}
1742 }
1743 
1744 #ifdef __SSE2__
/* Horizontal sum: treat each of the 16 bytes in a 128-bit XMM register as an unsigned
 * 8-bit integer, add them all up, and return the total in an ordinary scalar register
 *
 * Thanks to StackOverflow user 'Paul R' (https://stackoverflow.com/users/253056/paul-r)
 * From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector */
static inline uint32_t _mm_sum_epu8(const __m128i v)
{
	/* There is no dedicated instruction which adds up the bytes of a single register.
	 * _mm_sad_epu8 takes the absolute differences between corresponding bytes of two
	 * registers and sums them separately for the lower and the upper 8 bytes, leaving
	 * each partial sum as a 16-bit integer at the bottom of the corresponding 64-bit
	 * half of the result. With an all-zero second operand, the "differences" are
	 * simply the byte values of `v`. */
	const __m128i zero = _mm_setzero_si128();
	const __m128i partial_sums = _mm_sad_epu8(v, zero);

	/* Sum of bytes 0-7: lowest 32 bits of the register */
	uint32_t lower_sum = _mm_cvtsi128_si32(partial_sums);
	/* Sum of bytes 8-15: 16-bit lane number 4, i.e. the bottom of the upper half */
	uint32_t upper_sum = _mm_extract_epi16(partial_sums, 4);

	return lower_sum + upper_sum;
}
1764 #endif
1765 
1766 /* This assumes that `string` is valid UTF-8
1767  * In UTF-8, the only bytes which do not start a new codepoint are 0x80-0xBF (continuation bytes)
1768  * Interpreted as signed integers, those are all byte values less than -64
1769  * A fast way to get the length of a UTF-8 string is to start with its byte length,
1770  * then subtract off the number of continuation bytes */
mb_fast_strlen_utf8(unsigned char * p,size_t len)1771 static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
1772 {
1773 	unsigned char *e = p + len;
1774 
1775 #ifdef __SSE2__
1776 	if (len >= sizeof(__m128i)) {
1777 		e -= sizeof(__m128i);
1778 
1779 		const __m128i threshold = _mm_set1_epi8(-64);
1780 		const __m128i delta = _mm_set1_epi8(1);
1781 		__m128i counter = _mm_setzero_si128(); /* Vector of 16 continuation-byte counters */
1782 
1783 		unsigned char reset_counter = 255;
1784 		do {
1785 			__m128i operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */
1786 			__m128i lt = _mm_cmplt_epi8(operand, threshold); /* Find all which are continuation bytes */
1787 			counter = _mm_add_epi8(counter, _mm_and_si128(lt, delta)); /* Update the 16 counters */
1788 
1789 			/* The counters can only go up to 255, so every 255 iterations, fold them into `len`
1790 			 * and reset them to zero */
1791 			if (--reset_counter == 0) {
1792 				len -= _mm_sum_epu8(counter);
1793 				counter = _mm_setzero_si128();
1794 				reset_counter = 255;
1795 			}
1796 
1797 			p += sizeof(__m128i);
1798 		} while (p <= e);
1799 
1800 		e += sizeof(__m128i);
1801 		len -= _mm_sum_epu8(counter); /* Fold in any remaining non-zero values in the 16 counters */
1802 	}
1803 #endif
1804 
1805 	/* Check for continuation bytes in the 0-15 remaining bytes at the end of the string */
1806 	while (p < e) {
1807 		signed char c = *p++;
1808 		if (c < -64) {
1809 			len--;
1810 		}
1811 	}
1812 
1813 	return len;
1814 }
1815 
/* Return the number of codepoints which `string` decodes to when interpreted in `encoding` */
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
{
	/* For fixed-width encodings the masked flag bits are non-zero, and their value
	 * is used directly as the number of bytes per character */
	unsigned int bytes_per_char = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (bytes_per_char) {
		return ZSTR_LEN(string) / bytes_per_char;
	}

	/* Strings already flagged as valid UTF-8 can use the fast counting routine */
	if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && ZSTR_IS_VALID_UTF8(string)) {
		return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
	}

	/* General case: decode the input in batches of up to 128 codepoints and add up
	 * the batch sizes; the decoded codepoints themselves are not needed */
	uint32_t wchar_buf[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t num_codepoints = 0;

	while (bytes_left > 0) {
		num_codepoints += encoding->to_wchar(&cursor, &bytes_left, wchar_buf, 128, &state);
	}

	return num_codepoints;
}
1837 
1838 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1839 PHP_FUNCTION(mb_strlen)
1840 {
1841 	zend_string *string, *enc_name = NULL;
1842 
1843 	ZEND_PARSE_PARAMETERS_START(1, 2)
1844 		Z_PARAM_STR(string)
1845 		Z_PARAM_OPTIONAL
1846 		Z_PARAM_STR_OR_NULL(enc_name)
1847 	ZEND_PARSE_PARAMETERS_END();
1848 
1849 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1850 	if (!enc) {
1851 		RETURN_THROWS();
1852 	}
1853 
1854 	RETVAL_LONG(mb_get_strlen(string, enc));
1855 }
1856 /* }}} */
1857 
1858 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)1859 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
1860 {
1861 	return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
1862 }
1863 
offset_to_pointer_utf8(unsigned char * str,unsigned char * end,ssize_t offset)1864 static unsigned char* offset_to_pointer_utf8(unsigned char *str, unsigned char *end, ssize_t offset) {
1865 	if (offset < 0) {
1866 		unsigned char *pos = end;
1867 		while (offset < 0) {
1868 			if (pos <= str) {
1869 				return NULL;
1870 			}
1871 
1872 			unsigned char c = *--pos;
1873 			if (c < 0x80 || (c & 0xC0) != 0x80) {
1874 				offset++;
1875 			}
1876 		}
1877 		return pos;
1878 	} else {
1879 		const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
1880 		unsigned char *pos = str;
1881 		while (offset-- > 0) {
1882 			if (pos >= end) {
1883 				return NULL;
1884 			}
1885 			pos += u8_tbl[*pos];
1886 		}
1887 		return pos;
1888 	}
1889 }
1890 
/* Convert the byte pointer `pos` into a codepoint offset relative to `start`, by
 * counting the codepoints in [start, pos) with mb_fast_strlen_utf8 */
static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) {
	size_t byte_offset = pos - start;
	return mb_fast_strlen_utf8(start, byte_offset);
}
1894 
/* Find the position (in codepoints) of `needle` within `haystack`
 *
 * Both strings are interpreted in `enc`. Unless `enc` is a UTF-8 encoding, both are
 * first converted to UTF-8; the search itself is a byte-wise search on UTF-8 text.
 *
 * offset:  codepoint offset at which the search starts; a negative value is counted
 *          from the end of the haystack
 * reverse: find the last occurrence rather than the first
 *
 * Returns the codepoint offset of the match, MBFL_ERROR_OFFSET if `offset` lies outside
 * the haystack, or MBFL_ERROR_NOT_FOUND if there is no match. */
static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse)
{
	size_t result;
	zend_string *haystack_u8 = NULL, *needle_u8 = NULL;
	unsigned char *offset_pointer;

	if (!php_mb_is_no_encoding_utf8(enc->no_encoding)) {
		unsigned int num_errors = 0;
		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
	} else {
		/* Already UTF-8; search the caller's strings directly */
		haystack_u8 = haystack;
		needle_u8 = needle;
	}

	unsigned char *haystack_start = (unsigned char*)ZSTR_VAL(haystack_u8);
	unsigned char *haystack_end = haystack_start + ZSTR_LEN(haystack_u8);

	offset_pointer = offset_to_pointer_utf8(haystack_start, haystack_end, offset);
	if (!offset_pointer) {
		result = MBFL_ERROR_OFFSET;
		goto out;
	}

	result = MBFL_ERROR_NOT_FOUND;
	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
		goto out;
	}

	const char *found_pos;
	if (!reverse) {
		/* First occurrence in [offset, end) */
		found_pos = zend_memnstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)haystack_end);
	} else if (offset >= 0) {
		/* Last occurrence in [offset, end) */
		found_pos = zend_memnrstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)haystack_end);
	} else {
		/* Negative offset with reverse search: the match must start at or before the
		 * position given by `offset`, so the searched region ends one needle length
		 * (in codepoints) after that position.
		 * The needle must be measured in its UTF-8 form: the haystack pointer being
		 * advanced is UTF-8, and counting the raw bytes of a needle in some other
		 * encoding as if they were UTF-8 would give the wrong number of codepoints. */
		size_t needle_len = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(needle_u8), (unsigned char*)ZSTR_VAL(needle_u8) + ZSTR_LEN(needle_u8));
		offset_pointer = offset_to_pointer_utf8(offset_pointer, haystack_end, needle_len);
		if (!offset_pointer) {
			/* Fewer than `needle_len` codepoints remain; search the whole haystack */
			offset_pointer = haystack_end;
		}

		found_pos = zend_memnrstr((const char*)haystack_start, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)offset_pointer);
	}

	if (found_pos) {
		/* Convert the byte position of the match back into a codepoint offset */
		result = pointer_to_offset_utf8(haystack_start, (unsigned char*)found_pos);
	}

out:
	/* Release the UTF-8 copies, if any were created above */
	if (haystack_u8 != haystack) {
		zend_string_free(haystack_u8);
	}
	if (needle_u8 != needle) {
		zend_string_free(needle_u8);
	}
	return result;
}
1949 
handle_strpos_error(size_t error)1950 static void handle_strpos_error(size_t error) {
1951 	switch (error) {
1952 	case MBFL_ERROR_NOT_FOUND:
1953 		break;
1954 	case MBFL_ERROR_ENCODING:
1955 		php_error_docref(NULL, E_WARNING, "Conversion error");
1956 		break;
1957 	case MBFL_ERROR_OFFSET:
1958 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1959 		break;
1960 	default:
1961 		zend_value_error("mb_strpos(): Unknown error");
1962 		break;
1963 	}
1964 }
1965 
PHP_FUNCTION(mb_strpos)1966 PHP_FUNCTION(mb_strpos)
1967 {
1968 	zend_long offset = 0;
1969 	zend_string *needle, *haystack;
1970 	zend_string *enc_name = NULL;
1971 
1972 	ZEND_PARSE_PARAMETERS_START(2, 4)
1973 		Z_PARAM_STR(haystack)
1974 		Z_PARAM_STR(needle)
1975 		Z_PARAM_OPTIONAL
1976 		Z_PARAM_LONG(offset)
1977 		Z_PARAM_STR_OR_NULL(enc_name)
1978 	ZEND_PARSE_PARAMETERS_END();
1979 
1980 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
1981 	if (!enc) {
1982 		RETURN_THROWS();
1983 	}
1984 
1985 	size_t n = mb_find_strpos(haystack, needle, enc, offset, false);
1986 	if (!mbfl_is_error(n)) {
1987 		RETVAL_LONG(n);
1988 	} else {
1989 		handle_strpos_error(n);
1990 		RETVAL_FALSE;
1991 	}
1992 }
1993 
1994 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)1995 PHP_FUNCTION(mb_strrpos)
1996 {
1997 	zend_long offset = 0;
1998 	zend_string *needle, *haystack;
1999 	zend_string *enc_name = NULL;
2000 
2001 	ZEND_PARSE_PARAMETERS_START(2, 4)
2002 		Z_PARAM_STR(haystack)
2003 		Z_PARAM_STR(needle)
2004 		Z_PARAM_OPTIONAL
2005 		Z_PARAM_LONG(offset)
2006 		Z_PARAM_STR_OR_NULL(enc_name)
2007 	ZEND_PARSE_PARAMETERS_END();
2008 
2009 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
2010 	if (!enc) {
2011 		RETURN_THROWS();
2012 	}
2013 
2014 	size_t n = mb_find_strpos(haystack, needle, enc, offset, true);
2015 	if (!mbfl_is_error(n)) {
2016 		RETVAL_LONG(n);
2017 	} else {
2018 		handle_strpos_error(n);
2019 		RETVAL_FALSE;
2020 	}
2021 }
2022 /* }}} */
2023 
2024 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)2025 PHP_FUNCTION(mb_stripos)
2026 {
2027 	zend_long offset = 0;
2028 	zend_string *haystack, *needle;
2029 	zend_string *from_encoding = NULL;
2030 
2031 	ZEND_PARSE_PARAMETERS_START(2, 4)
2032 		Z_PARAM_STR(haystack)
2033 		Z_PARAM_STR(needle)
2034 		Z_PARAM_OPTIONAL
2035 		Z_PARAM_LONG(offset)
2036 		Z_PARAM_STR_OR_NULL(from_encoding)
2037 	ZEND_PARSE_PARAMETERS_END();
2038 
2039 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2040 	if (!enc) {
2041 		RETURN_THROWS();
2042 	}
2043 
2044 	size_t n = php_mb_stripos(false, haystack, needle, offset, enc);
2045 
2046 	if (!mbfl_is_error(n)) {
2047 		RETVAL_LONG(n);
2048 	} else {
2049 		handle_strpos_error(n);
2050 		RETVAL_FALSE;
2051 	}
2052 }
2053 /* }}} */
2054 
2055 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)2056 PHP_FUNCTION(mb_strripos)
2057 {
2058 	zend_long offset = 0;
2059 	zend_string *haystack, *needle;
2060 	zend_string *from_encoding = NULL;
2061 
2062 	ZEND_PARSE_PARAMETERS_START(2, 4)
2063 		Z_PARAM_STR(haystack)
2064 		Z_PARAM_STR(needle)
2065 		Z_PARAM_OPTIONAL
2066 		Z_PARAM_LONG(offset)
2067 		Z_PARAM_STR_OR_NULL(from_encoding)
2068 	ZEND_PARSE_PARAMETERS_END();
2069 
2070 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2071 	if (!enc) {
2072 		RETURN_THROWS();
2073 	}
2074 
2075 	size_t n = php_mb_stripos(true, haystack, needle, offset, enc);
2076 
2077 	if (!mbfl_is_error(n)) {
2078 		RETVAL_LONG(n);
2079 	} else {
2080 		handle_strpos_error(n);
2081 		RETVAL_FALSE;
2082 	}
2083 }
2084 /* }}} */
2085 
mb_get_substr_slow(unsigned char * in,size_t in_len,size_t from,size_t len,const mbfl_encoding * enc)2086 static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t from, size_t len, const mbfl_encoding *enc)
2087 {
2088 	uint32_t wchar_buf[128];
2089 	unsigned int state = 0;
2090 
2091 	mb_convert_buf buf;
2092 	mb_convert_buf_init(&buf, MIN(len, in_len - from), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2093 
2094 	while (in_len && len) {
2095 		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2096 		ZEND_ASSERT(out_len <= 128);
2097 
2098 		if (from >= out_len) {
2099 			from -= out_len;
2100 		} else {
2101 			size_t needed_codepoints = MIN(out_len - from, len);
2102 			enc->from_wchar(wchar_buf + from, needed_codepoints, &buf, !in_len || out_len >= len);
2103 			from = 0;
2104 			len -= needed_codepoints;
2105 		}
2106 	}
2107 
2108 	return mb_convert_buf_result(&buf, enc);
2109 }
2110 
/* Return the substring of `input` which starts `from` codepoints in and is at most
 * `len` codepoints long, with `input` interpreted in `enc`
 *
 * Fixed-width encodings are sliced directly by byte position; everything else goes
 * through mb_get_substr_slow. */
static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
		/* Other than MacJapanese, no supported text encoding decodes to
		 * more than one codepoint per byte
		 * So if the number of codepoints to skip >= number of input bytes,
		 * then definitely the output should be empty */
		return zend_empty_string;
	}

	/* Does each codepoint have a fixed byte width? */
	unsigned int flag = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (flag) {
		/* The value of the flag is 2 if each codepoint takes 2 bytes, or 4 if 4 bytes
		 * `from` is known to be less than `in_len` here (see the check above) */
		from *= flag;
		if (from >= in_len) {
			return zend_empty_string;
		}
		in += from;
		in_len -= from;

		/* Convert `len` to bytes, capped at what is left of the input. Compare by
		 * division rather than computing `len * flag` first: for a very large `len`
		 * the product could wrap around (even to zero) and cut the result short. */
		size_t byte_len = (len > in_len / flag) ? in_len : len * flag;
		return zend_string_init_fast((const char*)in, byte_len);
	}

	return mb_get_substr_slow(in, in_len, from, len, enc);
}
2143 
2144 #define MB_STRSTR 1
2145 #define MB_STRRCHR 2
2146 #define MB_STRISTR 3
2147 #define MB_STRRICHR 4
2148 
/* Shared implementation of mb_strstr, mb_strrchr, mb_stristr and mb_strrichr
 * `variant` is one of the MB_STR* constants and selects the search direction and
 * whether the search is case-insensitive */
static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
{
	/* Arguments: haystack, needle, optional "return the part before the match" flag,
	 * optional (nullable) encoding name */
	zend_string *haystack, *needle;
	zend_string *encoding_name = NULL;
	bool part = false;

	ZEND_PARSE_PARAMETERS_START(2, 4)
		Z_PARAM_STR(haystack)
		Z_PARAM_STR(needle)
		Z_PARAM_OPTIONAL
		Z_PARAM_BOOL(part)
		Z_PARAM_STR_OR_NULL(encoding_name)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding_name, 4);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	const bool search_from_end = (variant == MB_STRRCHR || variant == MB_STRRICHR);
	const bool ignore_case = (variant == MB_STRISTR || variant == MB_STRRICHR);

	size_t pos;
	if (ignore_case) {
		pos = php_mb_stripos(search_from_end, haystack, needle, 0, enc);
	} else {
		pos = mb_find_strpos(haystack, needle, enc, 0, search_from_end);
	}

	if (mbfl_is_error(pos)) {
		// FIXME use handle_strpos_error(pos)
		RETURN_FALSE;
	}

	if (part) {
		/* Everything before the match */
		RETURN_STR(mb_get_substr(haystack, 0, pos, enc));
	}
	/* The match and everything after it */
	RETURN_STR(mb_get_substr(haystack, pos, MBFL_SUBSTR_UNTIL_END, enc));
}
2190 
2191 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(mb_strstr)
{
	/* Forward, case-sensitive search; argument parsing and the result are handled
	 * by the shared implementation */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
}
2196 /* }}} */
2197 
2198 /* {{{ Finds the last occurrence of a character in a string within another */
PHP_FUNCTION(mb_strrchr)
{
	/* Reverse, case-sensitive search; argument parsing and the result are handled
	 * by the shared implementation */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
}
2203 /* }}} */
2204 
2205 /* {{{ Finds first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stristr)
{
	/* Forward, case-insensitive search; argument parsing and the result are handled
	 * by the shared implementation */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
}
2210 /* }}} */
2211 
2212 /* {{{ Finds the last occurrence of a character in a string within another, case insensitive */
PHP_FUNCTION(mb_strrichr)
{
	/* Reverse, case-insensitive search; argument parsing and the result are handled
	 * by the shared implementation */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
}
2217 /* }}} */
2218 
2219 #undef MB_STRSTR
2220 #undef MB_STRRCHR
2221 #undef MB_STRISTR
2222 #undef MB_STRRICHR
2223 
PHP_FUNCTION(mb_substr_count)2224 PHP_FUNCTION(mb_substr_count)
2225 {
2226 	zend_string *haystack, *needle, *enc_name = NULL, *haystack_u8 = NULL, *needle_u8 = NULL;
2227 
2228 	ZEND_PARSE_PARAMETERS_START(2, 3)
2229 		Z_PARAM_STR(haystack)
2230 		Z_PARAM_STR(needle)
2231 		Z_PARAM_OPTIONAL
2232 		Z_PARAM_STR_OR_NULL(enc_name)
2233 	ZEND_PARSE_PARAMETERS_END();
2234 
2235 	if (ZSTR_LEN(needle) == 0) {
2236 		zend_argument_value_error(2, "must not be empty");
2237 		RETURN_THROWS();
2238 	}
2239 
2240 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
2241 	if (!enc) {
2242 		RETURN_THROWS();
2243 	}
2244 
2245 	if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
2246 		/* No need to do any conversion if haystack/needle are already known-valid UTF-8
2247 		 * (If they are not valid, then not passing them through conversion filters could affect output) */
2248 		if (ZSTR_IS_VALID_UTF8(haystack)) {
2249 			haystack_u8 = haystack;
2250 		} else {
2251 			unsigned int num_errors = 0;
2252 			haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2253 			if (!num_errors && !ZSTR_IS_INTERNED(haystack)) {
2254 				GC_ADD_FLAGS(haystack, IS_STR_VALID_UTF8);
2255 			}
2256 		}
2257 
2258 		if (ZSTR_IS_VALID_UTF8(needle)) {
2259 			needle_u8 = needle;
2260 		} else {
2261 			unsigned int num_errors = 0;
2262 			needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2263 			if (!num_errors && !ZSTR_IS_INTERNED(needle)) {
2264 				GC_ADD_FLAGS(needle, IS_STR_VALID_UTF8);
2265 			}
2266 		}
2267 	} else {
2268 		unsigned int num_errors = 0;
2269 		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2270 		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2271 		/* A string with >0 bytes may convert to 0 codepoints; for example, the contents
2272 		 * may be only escape sequences */
2273 		if (ZSTR_LEN(needle_u8) == 0) {
2274 			zend_string_free(haystack_u8);
2275 			zend_string_free(needle_u8);
2276 			zend_argument_value_error(2, "must not be empty");
2277 			RETURN_THROWS();
2278 		}
2279 	}
2280 
2281 	size_t result = 0;
2282 
2283 	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
2284 		goto out;
2285 	}
2286 
2287 	const char *p = ZSTR_VAL(haystack_u8), *e = p + ZSTR_LEN(haystack_u8);
2288 	while (true) {
2289 		p = zend_memnstr(p, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), e);
2290 		if (!p) {
2291 			break;
2292 		}
2293 		p += ZSTR_LEN(needle_u8);
2294 		result++;
2295 	}
2296 
2297 out:
2298 	if (haystack_u8 != haystack) {
2299 		zend_string_free(haystack_u8);
2300 	}
2301 	if (needle_u8 != needle) {
2302 		zend_string_free(needle_u8);
2303 	}
2304 
2305 	RETVAL_LONG(result);
2306 }
2307 
2308 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2309 PHP_FUNCTION(mb_substr)
2310 {
2311 	zend_string *str, *encoding = NULL;
2312 	zend_long from, len;
2313 	size_t real_from, real_len;
2314 	bool len_is_null = true;
2315 
2316 	ZEND_PARSE_PARAMETERS_START(2, 4)
2317 		Z_PARAM_STR(str)
2318 		Z_PARAM_LONG(from)
2319 		Z_PARAM_OPTIONAL
2320 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2321 		Z_PARAM_STR_OR_NULL(encoding)
2322 	ZEND_PARSE_PARAMETERS_END();
2323 
2324 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2325 	if (!enc) {
2326 		RETURN_THROWS();
2327 	}
2328 
2329 	size_t mblen = 0;
2330 	if (from < 0 || (!len_is_null && len < 0)) {
2331 		mblen = mb_get_strlen(str, enc);
2332 	}
2333 
2334 	/* if "from" position is negative, count start position from the end
2335 	 * of the string */
2336 	if (from >= 0) {
2337 		real_from = (size_t) from;
2338 	} else if (-from < mblen) {
2339 		real_from = mblen + from;
2340 	} else {
2341 		real_from = 0;
2342 	}
2343 
2344 	/* if "length" position is negative, set it to the length
2345 	 * needed to stop that many chars from the end of the string */
2346 	if (len_is_null) {
2347 		real_len = MBFL_SUBSTR_UNTIL_END;
2348 	} else if (len >= 0) {
2349 		real_len = (size_t) len;
2350 	} else if (real_from < mblen && -len < mblen - real_from) {
2351 		real_len = (mblen - real_from) + len;
2352 	} else {
2353 		real_len = 0;
2354 	}
2355 
2356 	RETVAL_STR(mb_get_substr(str, real_from, real_len, enc));
2357 }
2358 /* }}} */
2359 
2360 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2361 PHP_FUNCTION(mb_strcut)
2362 {
2363 	zend_string *encoding = NULL;
2364 	char *string_val;
2365 	zend_long from, len;
2366 	bool len_is_null = true;
2367 	mbfl_string string, result, *ret;
2368 
2369 	ZEND_PARSE_PARAMETERS_START(2, 4)
2370 		Z_PARAM_STRING(string_val, string.len)
2371 		Z_PARAM_LONG(from)
2372 		Z_PARAM_OPTIONAL
2373 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2374 		Z_PARAM_STR_OR_NULL(encoding)
2375 	ZEND_PARSE_PARAMETERS_END();
2376 
2377 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2378 	if (!enc) {
2379 		RETURN_THROWS();
2380 	}
2381 
2382 	string.val = (unsigned char*)string_val;
2383 	string.encoding = enc;
2384 
2385 	if (len_is_null) {
2386 		len = string.len;
2387 	}
2388 
2389 	/* if "from" position is negative, count start position from the end
2390 	 * of the string */
2391 	if (from < 0) {
2392 		from = string.len + from;
2393 		if (from < 0) {
2394 			from = 0;
2395 		}
2396 	}
2397 
2398 	/* if "length" position is negative, set it to the length
2399 	 * needed to stop that many chars from the end of the string */
2400 	if (len < 0) {
2401 		len = (string.len - from) + len;
2402 		if (len < 0) {
2403 			len = 0;
2404 		}
2405 	}
2406 
2407 	if (from > string.len || len == 0) {
2408 		RETURN_EMPTY_STRING();
2409 	}
2410 
2411 	if (enc->cut) {
2412 		RETURN_STR(enc->cut(string.val, from, len, string.val + string.len));
2413 	}
2414 
2415 	unsigned int char_len = string.encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
2416 	if (char_len) {
2417 		/* Round `from` down to a multiple of `char_len`; works because `char_len` is a power of 2 */
2418 		from &= -char_len;
2419 		if (len > string.len - from) {
2420 			len = string.len - from;
2421 		}
2422 		RETURN_STR(zend_string_init_fast((const char*)(string.val + from), len & -char_len));
2423 	}
2424 
2425 	if (enc->mblen_table) {
2426 		const unsigned char *mbtab = enc->mblen_table;
2427 		const unsigned char *p, *q, *end;
2428 		int m = 0;
2429 		/* Search for start position */
2430 		for (p = (const unsigned char*)string.val, q = p + from; p < q; p += (m = mbtab[*p]));
2431 		if (p > q) {
2432 			p -= m;
2433 		}
2434 		const unsigned char *start = p;
2435 		/* Search for end position */
2436 		if (len >= string.len - (start - (const unsigned char*)string.val)) {
2437 			end = (const unsigned char*)(string.val + string.len);
2438 		} else {
2439 			for (q = p + len; p < q; p += (m = mbtab[*p]));
2440 			if (p > q) {
2441 				p -= m;
2442 			}
2443 			end = p;
2444 		}
2445 		RETURN_STR(zend_string_init_fast((const char*)start, end - start));
2446 	}
2447 
2448 	ret = mbfl_strcut(&string, &result, from, len);
2449 	ZEND_ASSERT(ret != NULL);
2450 	RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2451 	efree(ret->val);
2452 }
2453 /* }}} */
2454 
2455 /* Some East Asian characters, when printed at a terminal (or the like), require double
2456  * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2457 static size_t character_width(uint32_t c)
2458 {
2459 	if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2460 		return 1;
2461 	}
2462 
2463 	/* Do a binary search to see if we fall in any of the fullwidth ranges */
2464 	unsigned int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2465 	while (lo < hi) {
2466 		unsigned int probe = (lo + hi) / 2;
2467 		if (c < mbfl_eaw_table[probe].begin) {
2468 			hi = probe;
2469 		} else if (c > mbfl_eaw_table[probe].end) {
2470 			lo = probe + 1;
2471 		} else {
2472 			return 2;
2473 		}
2474 	}
2475 
2476 	return 1;
2477 }
2478 
/* Return the total display width of `string` (interpreted in `enc`): the sum of
 * character_width() over all codepoints which it decodes to */
static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
{
	uint32_t wchar_buf[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total_width = 0;

	while (bytes_left > 0) {
		/* Decode the next batch of up to 128 codepoints */
		size_t batch_len = enc->to_wchar(&cursor, &bytes_left, wchar_buf, 128, &state);
		ZEND_ASSERT(batch_len <= 128);

		/* NOTE: a 'bad input' marker is counted as 1 unit of width
		 * If text conversion is performed with an ordinary ASCII character as the
		 * 'replacement character', this gives the correct display width. */
		for (size_t i = 0; i < batch_len; i++) {
			total_width += character_width(wchar_buf[i]);
		}
	}

	return total_width;
}
2501 
2502 /* Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2503 PHP_FUNCTION(mb_strwidth)
2504 {
2505 	zend_string *string, *enc_name = NULL;
2506 
2507 	ZEND_PARSE_PARAMETERS_START(1, 2)
2508 		Z_PARAM_STR(string)
2509 		Z_PARAM_OPTIONAL
2510 		Z_PARAM_STR_OR_NULL(enc_name)
2511 	ZEND_PARSE_PARAMETERS_END();
2512 
2513 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2514 	if (!enc) {
2515 		RETURN_THROWS();
2516 	}
2517 
2518 	RETVAL_LONG(mb_get_strwidth(string, enc));
2519 }
2520 
/* Trim `input` (interpreted in `enc`) to a display width of at most `width`, starting
 * `from` codepoints into the string
 *
 * If everything from `from` onward fits, the result is that part of the input, without
 * the marker. Otherwise the result is as much of the input as fits in `width` minus the
 * display width of `marker`, followed by the bytes of `marker`; if the marker alone is
 * at least as wide as `width`, the result is just the marker.
 *
 * The function works in up to two passes: the first only measures, to find out whether
 * truncation is needed; the second builds the truncated output. */
static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, size_t from, size_t width)
{
	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);
	unsigned int state = 0;
	size_t remaining_width = width;
	size_t to_skip = from;
	size_t out_len = 0;
	/* first_call: still processing the first batch of decoded codepoints
	 * input_err: an error marker was seen among the measured codepoints */
	bool first_call = true, input_err = false;
	mb_convert_buf buf;

	/* First pass: measure the codepoints after the first `from`, until either the
	 * input ends or a codepoint no longer fits in the requested width */
	while (in_len) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

		if (out_len <= to_skip) {
			/* The whole batch lies before the starting position */
			to_skip -= out_len;
		} else {
			for (size_t i = to_skip; i < out_len; i++) {
				uint32_t w = wchar_buf[i];
				size_t current_w_width = character_width(w);

				input_err |= (w == MBFL_BAD_INPUT);

				if (remaining_width < current_w_width) {
					/* This codepoint does not fit; the string has to be truncated */
					size_t marker_width = mb_get_strwidth(marker, enc);

					/* The trim marker is larger than the desired string width */
					if (width <= marker_width) {
						return zend_string_copy(marker);
					}

					/* We need to truncate string and append trim marker */
					width -= marker_width;
					/* 'width' is now the amount we want to take from 'input' */
					mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

					if (first_call) {
						/* We can use the buffer of wchars which we have right now;
						 * no need to convert again */
						goto dont_restart_conversion;
					} else {
						goto restart_conversion;
					}
				}
				remaining_width -= current_w_width;
			}
			to_skip = 0;
		}
		first_call = false;
	}

	/* The input string fits in the requested width; we don't need to append the trim marker
	 * However, if the string contains erroneous byte sequences, those should be converted
	 * to error markers */
	if (!input_err) {
		if (from == 0) {
			/* This just increments the string's refcount; it doesn't really 'copy' it */
			return zend_string_copy(input);
		} else {
			return mb_get_substr(input, from, MBFL_SUBSTR_UNTIL_END, enc);
		}
	} else {
		/* We can't use `mb_get_substr`, because it uses the fastest method possible of
		 * picking out a substring, which may not include converting erroneous byte
		 * sequences to error markers */
		return mb_get_substr_slow((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), from, MBFL_SUBSTR_UNTIL_END, enc);
	}

	/* The input string is too wide; we need to build a new string which
	 * includes some portion of the input string, with the trim marker
	 * concatenated onto it
	 * (The code below is only reached through the two gotos in the first pass) */
restart_conversion:
	/* Start decoding again from the very beginning of the input */
	in = (unsigned char*)ZSTR_VAL(input);
	in_len = ZSTR_LEN(input);
	state = 0;

	/* Second pass: re-encode codepoints into 'buf', starting after the first `from`,
	 * until the reduced 'width' is used up; this loop is only left via the goto */
	while (true) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

		/* Entry point when the first batch of decoded codepoints is still in
		 * 'wchar_buf' and can be reused as it is */
dont_restart_conversion:
		if (out_len <= from) {
			from -= out_len;
		} else {
			for (size_t i = from; i < out_len; i++) {
				size_t current_wchar_char_width = character_width(wchar_buf[i]);
				if (width < current_wchar_char_width) {
					/* Codepoint i does not fit any more; emit the ones before it
					 * as the final piece of output */
					enc->from_wchar(wchar_buf + from, i - from, &buf, true);
					goto append_trim_marker;
				}
				width -= current_wchar_char_width;
			}
			/* The whole batch fits, so the truncation point must lie in input which
			 * has not been decoded yet */
			ZEND_ASSERT(in_len > 0);
			enc->from_wchar(wchar_buf + from, out_len - from, &buf, false);
			from = 0;
		}
	}

append_trim_marker:
	/* The bytes of the marker are copied to the output unchanged */
	if (ZSTR_LEN(marker) > 0) {
		MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
		buf.out = zend_mempcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
	}

	/* Even if `enc` is UTF-8, don't mark the output string as valid UTF-8, because
	 * we have no guarantee that the trim marker string is valid UTF-8 */
	return mb_convert_buf_result_raw(&buf);
}
2631 
2632 /* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
PHP_FUNCTION(mb_strimwidth)2633 PHP_FUNCTION(mb_strimwidth)
2634 {
2635 	zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2636 	zend_long from, width;
2637 
2638 	ZEND_PARSE_PARAMETERS_START(3, 5)
2639 		Z_PARAM_STR(str)
2640 		Z_PARAM_LONG(from)
2641 		Z_PARAM_LONG(width)
2642 		Z_PARAM_OPTIONAL
2643 		Z_PARAM_STR(trimmarker)
2644 		Z_PARAM_STR_OR_NULL(encoding)
2645 	ZEND_PARSE_PARAMETERS_END();
2646 
2647 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2648 	if (!enc) {
2649 		RETURN_THROWS();
2650 	}
2651 
2652 	if (from != 0) {
2653 		size_t str_len = mb_get_strlen(str, enc);
2654 		if (from < 0) {
2655 			from += str_len;
2656 		}
2657 		if (from < 0 || from > str_len) {
2658 			zend_argument_value_error(2, "is out of range");
2659 			RETURN_THROWS();
2660 		}
2661 	}
2662 
2663 	if (width < 0) {
2664 		php_error_docref(NULL, E_DEPRECATED,
2665 			"passing a negative integer to argument #3 ($width) is deprecated");
2666 		width += mb_get_strwidth(str, enc);
2667 
2668 		if (from > 0) {
2669 			zend_string *trimmed = mb_get_substr(str, 0, from, enc);
2670 			width -= mb_get_strwidth(trimmed, enc);
2671 			zend_string_free(trimmed);
2672 		}
2673 
2674 		if (width < 0) {
2675 			zend_argument_value_error(3, "is out of range");
2676 			RETURN_THROWS();
2677 		}
2678 	}
2679 
2680 	RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2681 }
2682 
2683 
2684 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2685 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2686 {
2687 	return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2688 			|| (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2689 			|| (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2690 			|| (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2691 }
2692 
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2693 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2694 {
2695 	unsigned int num_errors = 0;
2696 	zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2697 	MBSTRG(illegalchars) += num_errors;
2698 	return result;
2699 }
2700 
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2701 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2702 {
2703 	const mbfl_encoding *from_encoding;
2704 
2705 	/* pre-conversion encoding */
2706 	ZEND_ASSERT(num_from_encodings >= 1);
2707 	if (num_from_encodings == 1) {
2708 		from_encoding = *from_encodings;
2709 	} else {
2710 		/* auto detect */
2711 		from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true);
2712 		if (!from_encoding) {
2713 			php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2714 			return NULL;
2715 		}
2716 	}
2717 
2718 	return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2719 }
2720 
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2721 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2722 {
2723 	HashTable *output, *chash;
2724 	zend_long idx;
2725 	zend_string *key;
2726 	zval *entry, entry_tmp;
2727 
2728 	if (!input) {
2729 		return NULL;
2730 	}
2731 
2732 	if (GC_IS_RECURSIVE(input)) {
2733 		GC_UNPROTECT_RECURSION(input);
2734 		php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2735 		return NULL;
2736 	}
2737 	GC_TRY_PROTECT_RECURSION(input);
2738 	output = zend_new_array(zend_hash_num_elements(input));
2739 	ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2740 		/* convert key */
2741 		if (key) {
2742 			zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2743 			if (!converted_key) {
2744 				continue;
2745 			}
2746 			key = converted_key;
2747 		}
2748 		/* convert value */
2749 		ZEND_ASSERT(entry);
2750 try_again:
2751 		switch(Z_TYPE_P(entry)) {
2752 			case IS_STRING: {
2753 				zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2754 				if (!converted_key) {
2755 					if (key) {
2756 						zend_string_release(key);
2757 					}
2758 					continue;
2759 				}
2760 				ZVAL_STR(&entry_tmp, converted_key);
2761 				break;
2762 			}
2763 			case IS_NULL:
2764 			case IS_TRUE:
2765 			case IS_FALSE:
2766 			case IS_LONG:
2767 			case IS_DOUBLE:
2768 				ZVAL_COPY(&entry_tmp, entry);
2769 				break;
2770 			case IS_ARRAY:
2771 				chash = php_mb_convert_encoding_recursive(
2772 					Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2773 				if (chash) {
2774 					ZVAL_ARR(&entry_tmp, chash);
2775 				} else {
2776 					ZVAL_EMPTY_ARRAY(&entry_tmp);
2777 				}
2778 				break;
2779 			case IS_REFERENCE:
2780 				entry = Z_REFVAL_P(entry);
2781 				goto try_again;
2782 			case IS_OBJECT:
2783 			default:
2784 				if (key) {
2785 					zend_string_release(key);
2786 				}
2787 				php_error_docref(NULL, E_WARNING, "Object is not supported");
2788 				continue;
2789 		}
2790 		if (key) {
2791 			zend_hash_add(output, key, &entry_tmp);
2792 			zend_string_release(key);
2793 		} else {
2794 			zend_hash_index_add(output, idx, &entry_tmp);
2795 		}
2796 	} ZEND_HASH_FOREACH_END();
2797 	GC_TRY_UNPROTECT_RECURSION(input);
2798 
2799 	return output;
2800 }
2801 /* }}} */
2802 
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2803 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2804 {
2805 	/* mbstring supports some 'text encodings' which aren't really text encodings
2806 	 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2807 	 * These should never be returned by `mb_detect_encoding`. */
2808 	unsigned int shift = 0;
2809 	for (unsigned int i = 0; i < *size; i++) {
2810 		const mbfl_encoding *encoding = elist[i];
2811 		if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2812 			shift++; /* Remove this encoding from the list */
2813 		} else if (shift) {
2814 			elist[i - shift] = encoding;
2815 		}
2816 	}
2817 	*size -= shift;
2818 }
2819 
2820 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2821 PHP_FUNCTION(mb_convert_encoding)
2822 {
2823 	zend_string *to_encoding_name;
2824 	zend_string *input_str, *from_encodings_str = NULL;
2825 	HashTable *input_ht, *from_encodings_ht = NULL;
2826 	const mbfl_encoding **from_encodings;
2827 	size_t num_from_encodings;
2828 	bool free_from_encodings = false;
2829 
2830 	ZEND_PARSE_PARAMETERS_START(2, 3)
2831 		Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2832 		Z_PARAM_STR(to_encoding_name)
2833 		Z_PARAM_OPTIONAL
2834 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2835 	ZEND_PARSE_PARAMETERS_END();
2836 
2837 	const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2838 	if (!to_encoding) {
2839 		RETURN_THROWS();
2840 	}
2841 
2842 	if (from_encodings_ht) {
2843 		if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2844 			RETURN_THROWS();
2845 		}
2846 		free_from_encodings = true;
2847 	} else if (from_encodings_str) {
2848 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2849 				&from_encodings, &num_from_encodings,
2850 				/* persistent */ 0, /* arg_num */ 3) == FAILURE) {
2851 			RETURN_THROWS();
2852 		}
2853 		free_from_encodings = true;
2854 	} else {
2855 		from_encodings = &MBSTRG(current_internal_encoding);
2856 		num_from_encodings = 1;
2857 	}
2858 
2859 	if (num_from_encodings > 1) {
2860 		remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2861 	}
2862 
2863 	if (!num_from_encodings) {
2864 		efree(ZEND_VOIDP(from_encodings));
2865 		zend_argument_value_error(3, "must specify at least one encoding");
2866 		RETURN_THROWS();
2867 	}
2868 
2869 	if (input_str) {
2870 		zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2871 		if (ret != NULL) {
2872 			RETVAL_STR(ret);
2873 		} else {
2874 			RETVAL_FALSE;
2875 		}
2876 	} else {
2877 		HashTable *tmp;
2878 		tmp = php_mb_convert_encoding_recursive(
2879 			input_ht, to_encoding, from_encodings, num_from_encodings);
2880 		RETVAL_ARR(tmp);
2881 	}
2882 
2883 	if (free_from_encodings) {
2884 		efree(ZEND_VOIDP(from_encodings));
2885 	}
2886 }
2887 /* }}} */
2888 
mbstring_convert_case(php_case_mode case_mode,const char * str,size_t str_len,const mbfl_encoding * enc)2889 static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
2890 {
2891 	return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2892 }
2893 
PHP_FUNCTION(mb_convert_case)2894 PHP_FUNCTION(mb_convert_case)
2895 {
2896 	zend_string *str, *from_encoding = NULL;
2897 	zend_long case_mode = 0;
2898 
2899 	ZEND_PARSE_PARAMETERS_START(2, 3)
2900 		Z_PARAM_STR(str)
2901 		Z_PARAM_LONG(case_mode)
2902 		Z_PARAM_OPTIONAL
2903 		Z_PARAM_STR_OR_NULL(from_encoding)
2904 	ZEND_PARSE_PARAMETERS_END();
2905 
2906 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2907 	if (!enc) {
2908 		RETURN_THROWS();
2909 	}
2910 
2911 	if (case_mode < 0 || case_mode >= PHP_UNICODE_CASE_MODE_MAX) {
2912 		zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2913 		RETURN_THROWS();
2914 	}
2915 
2916 	RETURN_STR(mbstring_convert_case(case_mode, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2917 }
2918 
PHP_FUNCTION(mb_strtoupper)2919 PHP_FUNCTION(mb_strtoupper)
2920 {
2921 	zend_string *str, *from_encoding = NULL;
2922 
2923 	ZEND_PARSE_PARAMETERS_START(1, 2)
2924 		Z_PARAM_STR(str)
2925 		Z_PARAM_OPTIONAL
2926 		Z_PARAM_STR_OR_NULL(from_encoding)
2927 	ZEND_PARSE_PARAMETERS_END();
2928 
2929 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2930 	if (!enc) {
2931 		RETURN_THROWS();
2932 	}
2933 
2934 	RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_UPPER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2935 }
2936 
PHP_FUNCTION(mb_strtolower)2937 PHP_FUNCTION(mb_strtolower)
2938 {
2939 	zend_string *str, *from_encoding = NULL;
2940 
2941 	ZEND_PARSE_PARAMETERS_START(1, 2)
2942 		Z_PARAM_STR(str)
2943 		Z_PARAM_OPTIONAL
2944 		Z_PARAM_STR_OR_NULL(from_encoding)
2945 	ZEND_PARSE_PARAMETERS_END();
2946 
2947 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2948 	if (!enc) {
2949 		RETURN_THROWS();
2950 	}
2951 
2952 	RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2953 }
2954 
/* Shared implementation of mb_ucfirst and mb_lcfirst: apply case conversion
 * `mode` to the first character of the string only. If that leaves the first
 * character unchanged, the input string itself is returned (refcount bumped). */
static void php_mb_ulcfirst(INTERNAL_FUNCTION_PARAMETERS, php_case_mode mode)
{
	zend_string *str;
	zend_string *from_encoding = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 2)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR_OR_NULL(from_encoding)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	/* Case-convert just the first character */
	zend_string *first = mb_get_substr(str, 0, 1, enc);
	zend_string *head = mbstring_convert_case(mode, ZSTR_VAL(first), ZSTR_LEN(first), enc);
	bool unchanged = zend_string_equals(first, head);
	zend_string_release_ex(first, false);

	if (unchanged) {
		zend_string_release_ex(head, false);
		RETURN_STR(zend_string_copy(str));
	}

	/* Glue the converted first character onto the untouched rest of the string */
	zend_string *rest = mb_get_substr(str, 1, MBFL_SUBSTR_UNTIL_END, enc);
	zend_string *joined = zend_string_concat2(ZSTR_VAL(head), ZSTR_LEN(head), ZSTR_VAL(rest), ZSTR_LEN(rest));

	zend_string_release_ex(head, false);
	zend_string_release_ex(rest, false);

	RETVAL_STR(joined);
}
2988 
PHP_FUNCTION(mb_ucfirst)2989 PHP_FUNCTION(mb_ucfirst)
2990 {
2991 	php_mb_ulcfirst(INTERNAL_FUNCTION_PARAM_PASSTHRU, PHP_UNICODE_CASE_TITLE);
2992 }
2993 
PHP_FUNCTION(mb_lcfirst)2994 PHP_FUNCTION(mb_lcfirst)
2995 {
2996 	php_mb_ulcfirst(INTERNAL_FUNCTION_PARAM_PASSTHRU, PHP_UNICODE_CASE_LOWER);
2997 }
2998 
/* Bit flags selecting which end(s) of a string should be trimmed */
typedef enum {
	MB_LTRIM     = 1 << 0,              /* trim the start of the string */
	MB_RTRIM     = 1 << 1,              /* trim the end of the string */
	MB_BOTH_TRIM = MB_LTRIM | MB_RTRIM  /* trim both ends */
} mb_trim_mode;
3004 
/* Is codepoint `w` one of the characters to be trimmed?
 * If `ht` is given, `w` is looked up as an integer key in it; otherwise the
 * `default_chars` array (of `default_chars_length` entries) is searched. */
static bool is_trim_wchar(uint32_t w, const HashTable *ht, const uint32_t *default_chars, size_t default_chars_length)
{
	if (ht != NULL) {
		return zend_hash_index_exists(ht, w);
	}

	for (size_t remaining = default_chars_length; remaining > 0; remaining--) {
		if (default_chars[remaining - 1] == w) {
			return true;
		}
	}
	return false;
}
3018 
/* Decode `str` to codepoints and strip leading and/or trailing trim characters,
 * as selected by `mode`. Trim characters are defined either by `what_ht` or,
 * if that is NULL, by the `default_chars` array (see `is_trim_wchar`).
 *
 * Returns `str` itself (refcount bumped) if nothing needs to be trimmed, the
 * empty string if everything is trimmed away, or else a new substring. */
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, const uint32_t *default_chars, size_t default_chars_length, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
	uint32_t wchar_buf[128];
	size_t in_len = ZSTR_LEN(str);
	size_t out_len = 0;
	unsigned int state = 0;
	size_t left = 0;      /* Number of leading codepoints to strip */
	size_t right = 0;     /* Length of the current run of trailing trim codepoints */
	size_t total_len = 0; /* Total number of codepoints decoded */

	while (in_len) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);
		total_len += out_len;

		for (size_t i = 0; i < out_len; i++) {
			uint32_t w = wchar_buf[i];
			if (is_trim_wchar(w, what_ht, default_chars, default_chars_length)) {
				if (mode & MB_LTRIM) {
					left += 1;
				}
				if (mode & MB_RTRIM) {
					right += 1;
				}
			} else {
				/* First non-trim codepoint ends the leading run for good... */
				mode &= ~MB_LTRIM;
				/* ...and any trim codepoints seen so far were not trailing ones */
				if (mode & MB_RTRIM) {
					right = 0;
				}
			}
		}
	}

	if (left == 0 && right == 0) {
		return zend_string_copy(str);
	}

	/* If every codepoint is a trim character, then `left + right` can be as large
	 * as `2 * total_len` (when trimming both ends), and the subtraction below
	 * would wrap around. Nothing remains in that case, so return an empty string. */
	if (left + right >= total_len) {
		return ZSTR_EMPTY_ALLOC();
	}

	return mb_get_substr(str, left, total_len - (right + left), enc);
}
3058 
/* Trim `str` using the built-in default set of trim characters.
 * The set is loaded into a temporary hash table, which is used for lookups by
 * `trim_each_wchar` and destroyed before returning. */
static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
{
	/* Constant table; `static` so that it is not rebuilt on the stack on every call */
	static const uint32_t trim_default_chars[] = {
		0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
		0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
		0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
		0x85, 0x180E
	};
	const size_t trim_default_chars_length = sizeof(trim_default_chars) / sizeof(trim_default_chars[0]);

	HashTable what_ht;
	zval val;
	ZVAL_TRUE(&val);

	zend_hash_init(&what_ht, trim_default_chars_length, NULL, NULL, false);

	/* Only the keys matter; every entry maps to the same dummy `true` value */
	for (size_t i = 0; i < trim_default_chars_length; i++) {
		zend_hash_index_add_new(&what_ht, trim_default_chars[i], &val);
	}
	zend_string* retval = trim_each_wchar(str, &what_ht, NULL, 0, mode, enc);
	zend_hash_destroy(&what_ht);

	return retval;
}
3083 
/* Trim `str` using the characters contained in `what` (decoded using `enc`).
 * If the first decoded batch of `what` has at most 4 codepoints, they are
 * searched linearly; otherwise all codepoints of `what` are collected in a hash
 * table, which is then used for lookups. */
static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *what_in = (unsigned char*)ZSTR_VAL(what);
	size_t what_len = ZSTR_LEN(what);
	uint32_t what_wchar_buf[128];
	size_t what_out_len = 0;
	unsigned int state = 0;
	HashTable what_ht;
	zval val;
	bool hash_initialized = false;

	while (what_len) {
		what_out_len = enc->to_wchar(&what_in, &what_len, what_wchar_buf, 128, &state);
		ZEND_ASSERT(what_out_len <= 128);

		if (!hash_initialized) {
			if (what_out_len <= 4) {
				/* Few enough characters that a hash table is not used at all */
				return trim_each_wchar(str, NULL, what_wchar_buf, what_out_len, mode, enc);
			}
			hash_initialized = true;
			ZVAL_TRUE(&val);
			zend_hash_init(&what_ht, what_len, NULL, NULL, false);
		}

		/* Only the keys matter; every entry maps to the same dummy `true` value */
		for (size_t i = 0; i < what_out_len; i++) {
			zend_hash_index_add(&what_ht, what_wchar_buf[i], &val);
		}
	}

	if (UNEXPECTED(!hash_initialized)) {
		/* This is only possible if what is empty */
		return zend_string_copy(str);
	}

	zend_string *trimmed = trim_each_wchar(str, &what_ht, NULL, 0, mode, enc);
	zend_hash_destroy(&what_ht);

	return trimmed;
}
3123 
/* Shared implementation of mb_trim, mb_ltrim and mb_rtrim; `mode` selects which
 * end(s) of the string are trimmed. If no set of characters is passed, the
 * default set is used. */
static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
{
	zend_string *str;
	zend_string *what = NULL;
	zend_string *encoding = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 3)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR_OR_NULL(what)
		Z_PARAM_STR_OR_NULL(encoding)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	zend_string *trimmed;
	if (what == NULL) {
		trimmed = mb_trim_default_chars(str, mode, enc);
	} else {
		trimmed = mb_trim_what_chars(str, what, mode, enc);
	}
	RETURN_STR(trimmed);
}
3148 
PHP_FUNCTION(mb_trim)3149 PHP_FUNCTION(mb_trim)
3150 {
3151 	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_BOTH_TRIM);
3152 }
3153 
PHP_FUNCTION(mb_ltrim)3154 PHP_FUNCTION(mb_ltrim)
3155 {
3156 	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_LTRIM);
3157 }
3158 
PHP_FUNCTION(mb_rtrim)3159 PHP_FUNCTION(mb_rtrim)
3160 {
3161 	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
3162 }
3163 
duplicate_elist(const mbfl_encoding ** elist,size_t size)3164 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
3165 {
3166 	const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
3167 	memcpy(ZEND_VOIDP(new_elist), elist, size * sizeof(mbfl_encoding*));
3168 	return new_elist;
3169 }
3170 
estimate_demerits(uint32_t w)3171 static unsigned int estimate_demerits(uint32_t w)
3172 {
3173 	/* Receive wchars decoded from input string using candidate encoding.
3174 	 * Give the candidate many 'demerits' for each 'rare' codepoint found,
3175 	 * a smaller number for each ASCII punctuation character, and 1 for
3176 	 * all other codepoints.
3177 	 *
3178 	 * The 'common' codepoints should cover the vast majority of
3179 	 * codepoints we are likely to see in practice, while only covering
3180 	 * a small minority of the entire Unicode encoding space. Why?
3181 	 * Well, if the test string happens to be valid in an incorrect
3182 	 * candidate encoding, the bogus codepoints which it decodes to will
3183 	 * be more or less random. By treating the majority of codepoints as
3184 	 * 'rare', we ensure that in almost all such cases, the bogus
3185 	 * codepoints will include plenty of 'rares', thus giving the
3186 	 * incorrect candidate encoding lots of demerits. See
3187 	 * common_codepoints.txt for the actual list used.
3188 	 *
3189 	 * So, why give extra demerits for ASCII punctuation characters? It's
3190 	 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
3191 	 * which deliberately only use bytes in the ASCII range. When
3192 	 * misinterpreted as ASCII/UTF-8, strings in these encodings will
3193 	 * have an unusually high number of ASCII punctuation characters.
3194 	 * So giving extra demerits for such characters will improve
3195 	 * detection accuracy for UTF-7 and similar encodings.
3196 	 *
3197 	 * Finally, why 1 demerit for all other characters? That penalizes
3198 	 * long strings, meaning we will tend to choose a candidate encoding
3199 	 * in which the test string decodes to a smaller number of
3200 	 * codepoints. That prevents single-byte encodings in which almost
3201 	 * every possible input byte decodes to a 'common' codepoint from
3202 	 * being favored too much. */
3203 	if (w > 0xFFFF) {
3204 		return 40;
3205 	} else if (w >= 0x21 && w <= 0x2F) {
3206 		return 6;
3207 	} else if ((rare_codepoint_bitvec[w >> 5] >> (w & 0x1F)) & 1) {
3208 		return 30;
3209 	} else {
3210 		return 1;
3211 	}
3212 	return 0;
3213 }
3214 
3215 struct candidate {
3216 	const mbfl_encoding *enc;
3217 	const unsigned char *in;
3218 	size_t in_len;
3219 	uint64_t demerits; /* Wide bit size to prevent overflow */
3220 	unsigned int state;
3221 	float multiplier;
3222 };
3223 
init_candidate_array(struct candidate * array,size_t length,const mbfl_encoding ** encodings,const unsigned char ** in,size_t * in_len,size_t n,bool strict,bool order_significant)3224 static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant)
3225 {
3226 	size_t j = 0;
3227 
3228 	for (size_t i = 0; i < length; i++) {
3229 		const mbfl_encoding *enc = encodings[i];
3230 
3231 		array[j].enc = enc;
3232 		array[j].state = 0;
3233 		array[j].demerits = 0;
3234 
3235 		/* If any candidate encodings have specialized validation functions, use them
3236 		 * to eliminate as many candidates as possible */
3237 		if (enc->check != NULL) {
3238 			for (size_t k = 0; k < n; k++) {
3239 				if (!enc->check((unsigned char*)in[k], in_len[k])) {
3240 					if (strict) {
3241 						goto skip_to_next;
3242 					} else {
3243 						array[j].demerits += 500;
3244 					}
3245 				}
3246 			}
3247 		}
3248 
3249 		/* This multiplier can optionally be used to make candidate encodings listed
3250 		 * first more likely to be chosen. It is a weight factor which multiplies
3251 		 * the number of demerits counted for each candidate. */
3252 		array[j].multiplier = order_significant ? 1.0 + ((0.3 * i) / length) : 1.0;
3253 		j++;
3254 skip_to_next: ;
3255 	}
3256 
3257 	return j;
3258 }
3259 
start_string(struct candidate * array,size_t length,const unsigned char * in,size_t in_len)3260 static void start_string(struct candidate *array, size_t length, const unsigned char *in, size_t in_len)
3261 {
3262 	for (size_t i = 0; i < length; i++) {
3263 		const mbfl_encoding *enc = array[i].enc;
3264 
3265 		array[i].in = in;
3266 		array[i].in_len = in_len;
3267 
3268 		/* Skip byte order mark for UTF-8, UTF-16BE, or UTF-16LE */
3269 		if (enc == &mbfl_encoding_utf8) {
3270 			if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
3271 				array[i].in_len -= 3;
3272 				array[i].in += 3;
3273 			}
3274 		} else if (enc == &mbfl_encoding_utf16be) {
3275 			if (in_len >= 2 && in[0] == 0xFE && in[1] == 0xFF) {
3276 				array[i].in_len -= 2;
3277 				array[i].in += 2;
3278 			}
3279 		} else if (enc == &mbfl_encoding_utf16le) {
3280 			if (in_len >= 2 && in[0] == 0xFF && in[1] == 0xFE) {
3281 				array[i].in_len -= 2;
3282 				array[i].in += 2;
3283 			}
3284 		}
3285 	}
3286 }
3287 
count_demerits(struct candidate * array,size_t length,bool strict)3288 static size_t count_demerits(struct candidate *array, size_t length, bool strict)
3289 {
3290 	uint32_t wchar_buf[128];
3291 	unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */
3292 
3293 	for (size_t i = 0; i < length; i++) {
3294 		if (array[i].in_len == 0) {
3295 			finished++;
3296 		}
3297 	}
3298 
3299 	while ((strict || length > 1) && finished < length) {
3300 		/* Iterate in reverse order to avoid moving candidates that can be eliminated. */
3301 		for (size_t i = length - 1; i != (size_t)-1; i--) {
3302 			/* Do we still have more input to process for this candidate encoding? */
3303 			if (array[i].in_len) {
3304 				const mbfl_encoding *enc = array[i].enc;
3305 				size_t out_len = enc->to_wchar((unsigned char**)&array[i].in, &array[i].in_len, wchar_buf, 128, &array[i].state);
3306 				ZEND_ASSERT(out_len <= 128);
3307 				/* Check this batch of decoded codepoints; are there any error markers?
3308 				 * Also sum up the number of demerits */
3309 				while (out_len) {
3310 					uint32_t w = wchar_buf[--out_len];
3311 					if (w == MBFL_BAD_INPUT) {
3312 						if (strict) {
3313 							/* This candidate encoding is not valid, eliminate it from consideration */
3314 							length--;
3315 							if (i < length) {
3316 								/* The eliminated candidate was the last valid one in the list */
3317 								memmove(&array[i], &array[i+1], (length - i) * sizeof(struct candidate));
3318 							}
3319 							goto try_next_encoding;
3320 						} else {
3321 							array[i].demerits += 1000;
3322 						}
3323 					} else {
3324 						array[i].demerits += estimate_demerits(w);
3325 					}
3326 				}
3327 				if (array[i].in_len == 0) {
3328 					finished++;
3329 				}
3330 			}
3331 try_next_encoding:;
3332 		}
3333 	}
3334 
3335 	for (size_t i = 0; i < length; i++) {
3336 		array[i].demerits *= array[i].multiplier;
3337 	}
3338 
3339 	return length;
3340 }
3341 
mb_guess_encoding_for_strings(const unsigned char ** strings,size_t * str_lengths,size_t n,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3342 MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3343 {
3344 	if (elist_size == 0) {
3345 		return NULL;
3346 	}
3347 	if (elist_size == 1) {
3348 		if (strict) {
3349 			while (n--) {
3350 				if (!php_mb_check_encoding((const char*)strings[n], str_lengths[n], *elist)) {
3351 					return NULL;
3352 				}
3353 			}
3354 		}
3355 		return *elist;
3356 	}
3357 	if (n == 1 && *str_lengths == 0) {
3358 		return *elist;
3359 	}
3360 
3361 	/* Allocate on stack; when we return, this array is automatically freed */
3362 	struct candidate *array = alloca(elist_size * sizeof(struct candidate));
3363 	elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant);
3364 
3365 	while (n--) {
3366 		start_string(array, elist_size, strings[n], str_lengths[n]);
3367 		elist_size = count_demerits(array, elist_size, strict);
3368 		if (elist_size == 0) {
3369 			/* All candidates were eliminated */
3370 			return NULL;
3371 		}
3372 	}
3373 
3374 	/* See which remaining candidate encoding has the least demerits */
3375 	unsigned int best = 0;
3376 	for (unsigned int i = 1; i < elist_size; i++) {
3377 		if (array[i].demerits < array[best].demerits) {
3378 			best = i;
3379 		}
3380 	}
3381 	return array[best].enc;
3382 }
3383 
3384 /* When doing 'strict' detection, any string which is invalid in the candidate encoding
3385  * is rejected. With non-strict detection, we just continue, but apply demerits for
3386  * each invalid byte sequence */
mb_guess_encoding(unsigned char * in,size_t in_len,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3387 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3388 {
3389 	return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant);
3390 }
3391 
/* {{{ Returns the detected encoding of the given string (as a string) */
PHP_FUNCTION(mb_detect_encoding)3393 PHP_FUNCTION(mb_detect_encoding)
3394 {
3395 	zend_string *str, *encoding_str = NULL;
3396 	HashTable *encoding_ht = NULL;
3397 	bool strict = false;
3398 	const mbfl_encoding *ret, **elist;
3399 	size_t size;
3400 
3401 	ZEND_PARSE_PARAMETERS_START(1, 3)
3402 		Z_PARAM_STR(str)
3403 		Z_PARAM_OPTIONAL
3404 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
3405 		Z_PARAM_BOOL(strict)
3406 	ZEND_PARSE_PARAMETERS_END();
3407 
3408 	/* Should we pay attention to the order of the provided candidate encodings and prefer
3409 	 * the earlier ones (if more than one candidate encoding matches)?
3410 	 * If the entire list of supported encodings returned by `mb_list_encodings` is passed
3411 	 * in, then don't treat the order as significant */
3412 	bool order_significant = true;
3413 
3414 	/* make encoding list */
3415 	if (encoding_ht) {
3416 		if (encoding_ht == MBSTRG(all_encodings_list)) {
3417 			order_significant = false;
3418 		}
3419 		if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
3420 			RETURN_THROWS();
3421 		}
3422 	} else if (encoding_str) {
3423 		if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2)) {
3424 			RETURN_THROWS();
3425 		}
3426 	} else {
3427 		elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
3428 		size = MBSTRG(current_detect_order_list_size);
3429 	}
3430 
3431 	if (size == 0) {
3432 		efree(ZEND_VOIDP(elist));
3433 		zend_argument_value_error(2, "must specify at least one encoding");
3434 		RETURN_THROWS();
3435 	}
3436 
3437 	remove_non_encodings_from_elist(elist, &size);
3438 	if (size == 0) {
3439 		efree(ZEND_VOIDP(elist));
3440 		RETURN_FALSE;
3441 	}
3442 
3443 	if (ZEND_NUM_ARGS() < 3) {
3444 		strict = MBSTRG(strict_detection);
3445 	}
3446 
3447 	if (size == 1 && *elist == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
3448 		ret = &mbfl_encoding_utf8;
3449 	} else {
3450 		ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant);
3451 	}
3452 
3453 	efree(ZEND_VOIDP(elist));
3454 
3455 	if (ret == NULL) {
3456 		RETURN_FALSE;
3457 	}
3458 
3459 	RETVAL_STRING((char *)ret->name);
3460 }
3461 /* }}} */
3462 
3463 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)3464 PHP_FUNCTION(mb_list_encodings)
3465 {
3466 	ZEND_PARSE_PARAMETERS_NONE();
3467 
3468 	if (MBSTRG(all_encodings_list) == NULL) {
3469 		/* Initialize shared array of supported encoding names
3470 		 * This is done so that we can check if `mb_list_encodings()` is being
3471 		 * passed to other mbstring functions using a cheap pointer equality check */
3472 		HashTable *array = emalloc(sizeof(HashTable));
3473 		zend_hash_init(array, 80, NULL, zval_ptr_dtor_str, false);
3474 		for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
3475 			zval tmp;
3476 			ZVAL_STRING(&tmp, (*encodings)->name);
3477 			zend_hash_next_index_insert(array, &tmp);
3478 		}
3479 		MBSTRG(all_encodings_list) = array;
3480 	}
3481 
3482 	GC_ADDREF(MBSTRG(all_encodings_list));
3483 	RETURN_ARR(MBSTRG(all_encodings_list));
3484 }
3485 /* }}} */
3486 
3487 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)3488 PHP_FUNCTION(mb_encoding_aliases)
3489 {
3490 	const mbfl_encoding *encoding;
3491 	zend_string *encoding_name = NULL;
3492 
3493 	ZEND_PARSE_PARAMETERS_START(1, 1)
3494 		Z_PARAM_STR(encoding_name)
3495 	ZEND_PARSE_PARAMETERS_END();
3496 
3497 	encoding = php_mb_get_encoding(encoding_name, 1);
3498 	if (!encoding) {
3499 		RETURN_THROWS();
3500 	}
3501 
3502 	array_init(return_value);
3503 	if (encoding->aliases != NULL) {
3504 		for (const char **alias = encoding->aliases; *alias; ++alias) {
3505 			add_next_index_string(return_value, (char *)*alias);
3506 		}
3507 	}
3508 }
3509 /* }}} */
3510 
/* Apply the kana/width conversion selected by `mode` to `input`.
 *
 * The input is decoded to wchars with encoding->to_wchar in batches of at
 * most 64, each codepoint is passed through mb_convert_kana_codepoint()
 * together with the codepoint following it, and the result is re-encoded with
 * encoding->from_wchar. Returns the string built by mb_convert_buf_result(). */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* One input wchar can turn into two output wchars (this happens when
	 * converting zenkaku kana to hankaku kana), so the output buffer is twice
	 * the size of the input buffer and writes to it need no bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	/* 1 when wchar_buf[0] holds a codepoint carried over from the previous batch */
	unsigned int carried = 0;
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		uint32_t *converted = converted_buf;

		/* Decode after the carried-over codepoint (if any) so it is not overwritten */
		size_t n_wchars = encoding->to_wchar(&in, &in_len, wchar_buf + carried, 64 - carried, &state);
		n_wchars += carried;
		ZEND_ASSERT(n_wchars <= 64);

		if (n_wchars == 0) {
			continue;
		}

		/* Convert every codepoint which has a successor inside this batch.
		 * When the converter reports that it also consumed the successor,
		 * step over both codepoints. */
		const size_t last = n_wchars - 1;
		size_t pos = 0;
		while (pos < last) {
			uint32_t extra = 0;
			bool swallowed_next = false;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[pos], wchar_buf[pos + 1], &swallowed_next, &extra, mode);
			if (extra) {
				*converted++ = extra;
			}
			pos += swallowed_next ? 2 : 1;
		}

		if (pos > last) {
			/* The final two codepoints of the batch were consumed as a pair,
			 * so nothing is left over for the next batch */
			carried = 0;
		} else if (!in_len) {
			/* No more input will arrive: convert the final codepoint now,
			 * without any successor */
			uint32_t extra = 0;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[last], 0, NULL, &extra, mode);
			if (extra) {
				*converted++ = extra;
			}
		} else {
			/* Hold the final codepoint back so it can be paired with the
			 * first codepoint of the next batch */
			wchar_buf[0] = wchar_buf[last];
			carried = 1;
		}

		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3575 
/* Option letters understood by mb_convert_kana().
 * The letter stored at index i selects bit (1 << i) of the option bit vector.
 * Indexes 0-7 and 8-15 are paired: mb_convert_kana() refuses to combine the
 * letter at index i with the letter at index i + 8. */
char mb_convert_kana_flags[17] = {
	/* indexes 0-7 */
	'A', 'R', 'N', 'S',
	'K', 'H', 'M', 'C',
	/* indexes 8-15 */
	'a', 'r', 'n', 's',
	'k', 'h', 'm', 'c',
	/* index 16 */
	'V'
};
3581 
3582 /* Conversion between full-width characters and half-width characters (Japanese) */
PHP_FUNCTION(mb_convert_kana)3583 PHP_FUNCTION(mb_convert_kana)
3584 {
3585 	unsigned int opt;
3586 	char *optstr = NULL;
3587 	size_t optstr_len;
3588 	zend_string *encname = NULL, *str;
3589 
3590 	ZEND_PARSE_PARAMETERS_START(1, 3)
3591 		Z_PARAM_STR(str)
3592 		Z_PARAM_OPTIONAL
3593 		Z_PARAM_STRING(optstr, optstr_len)
3594 		Z_PARAM_STR_OR_NULL(encname)
3595 	ZEND_PARSE_PARAMETERS_END();
3596 
3597 	if (optstr != NULL) {
3598 		char *p = optstr, *e = p + optstr_len;
3599 		opt = 0;
3600 next_option:
3601 		while (p < e) {
3602 			/* Walk through option string and convert to bit vector
3603 			 * See translit_kana_jisx0201_jisx0208.h for the values used */
3604 			char c = *p++;
3605 			if (c == 'A') {
3606 				opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
3607 			} else if (c == 'a') {
3608 				opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
3609 			} else {
3610 				for (size_t i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
3611 					if (c == mb_convert_kana_flags[i]) {
3612 						opt |= (1 << i);
3613 						goto next_option;
3614 					}
3615 				}
3616 
3617 				zend_argument_value_error(2, "contains invalid flag: '%c'", c);
3618 				RETURN_THROWS();
3619 			}
3620 		}
3621 
3622 		/* Check for illegal combinations of options */
3623 		if (((opt & 0xFF00) >> 8) & opt) {
3624 			/* It doesn't make sense to convert the same type of characters from halfwidth to
3625 			 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
3626 			 * FW hiragana to FW katakana and then back again. */
3627 			int badflag = ((opt & 0xFF00) >> 8) & opt, i;
3628 			for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
3629 			char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
3630 			if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
3631 				flag1 = 'A';
3632 			if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
3633 				flag2 = 'a';
3634 			zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
3635 			RETURN_THROWS();
3636 		}
3637 
3638 		if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
3639 			/* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
3640 			zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
3641 			RETURN_THROWS();
3642 		}
3643 
3644 		/* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
3645 		 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
3646 		 * more than one of these */
3647 		if (opt & MBFL_ZEN2HAN_HIRAGANA) {
3648 			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3649 				zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
3650 				RETURN_THROWS();
3651 			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3652 				zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
3653 				RETURN_THROWS();
3654 			}
3655 		} else if (opt & MBFL_ZEN2HAN_KATAKANA) {
3656 			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3657 				zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
3658 				RETURN_THROWS();
3659 			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3660 				zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
3661 				RETURN_THROWS();
3662 			}
3663 		}
3664 	} else {
3665 		opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
3666 	}
3667 
3668 	const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
3669 	if (!enc) {
3670 		RETURN_THROWS();
3671 	}
3672 
3673 	RETVAL_STR(jp_kana_convert(str, enc, opt));
3674 }
3675 
/* Count the string values reachable from `var`, descending into arrays and
 * into the hash tables of objects. A container which is already marked as
 * being visited is not descended into again and adds nothing to the count. */
static unsigned int mb_recursive_count_strings(zval *var)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		return 1;
	}
	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Neither a string nor a container: nothing to count */
		return 0;
	}

	/* Mark the container while we are inside it, so that a cycle leading
	 * back here is detected */
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	unsigned int total = 0;
	HashTable *members = HASH_OF(var);
	if (members != NULL) {
		zval *member;
		ZEND_HASH_FOREACH_VAL_IND(members, member) {
			total += mb_recursive_count_strings(member);
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return total;
}
3706 
/* Record the address and length of every string reachable from `var`,
 * descending into arrays and into the hash tables of objects.
 *
 * Entries are written to val_list/len_list starting at index *count, and
 * *count is advanced past them. No bounds checking is done here, so the
 * caller must provide lists large enough for every string that will be found.
 *
 * Returns true if a container which is already being visited was reached
 * (i.e. the structure is recursive), false otherwise. */
static bool mb_recursive_find_strings(zval *var, const unsigned char **val_list, size_t *len_list, unsigned int *count)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		val_list[*count] = (const unsigned char*)Z_STRVAL_P(var);
		len_list[*count] = Z_STRLEN_P(var);
		(*count)++;
	} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
		if (Z_REFCOUNTED_P(var)) {
			if (Z_IS_RECURSIVE_P(var)) {
				return true;
			}
			Z_PROTECT_RECURSION_P(var);
		}

		HashTable *ht = HASH_OF(var);
		if (ht != NULL) {
			zval *entry;
			ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
				if (mb_recursive_find_strings(entry, val_list, len_list, count)) {
					/* Remove our own recursion mark (if we set one), then
					 * always pass the failure up to the caller. The failure
					 * must be propagated whether or not this container is
					 * refcounted, as mb_recursive_convert_variable does. */
					if (Z_REFCOUNTED_P(var)) {
						Z_UNPROTECT_RECURSION_P(var);
					}
					return true;
				}
			} ZEND_HASH_FOREACH_END();
		}

		if (Z_REFCOUNTED_P(var)) {
			Z_UNPROTECT_RECURSION_P(var);
		}
	}

	return false;
}
3743 
/* Convert every string reachable from `var` from `from_encoding` to
 * `to_encoding`, in place, descending into arrays and into the hash tables
 * of objects.
 *
 * Returns true if a container which is already being visited was reached
 * (i.e. the structure is recursive), false otherwise. */
static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
{
	/* Remember the slot as it was passed in; `var` is dereferenced below */
	zval *slot = var;
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		/* Replace whatever the slot held with the converted string */
		zend_string *converted = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
		zval_ptr_dtor(slot);
		ZVAL_STR(slot, converted);
		return false;
	}

	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Other scalar types are left untouched */
		return false;
	}

	if (Z_TYPE_P(var) == IS_ARRAY) {
		/* We are about to write into the array's elements */
		SEPARATE_ARRAY(var);
	}

	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return true;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	HashTable *members = HASH_OF(var);
	if (members != NULL) {
		zval *member;
		ZEND_HASH_FOREACH_VAL_IND(members, member) {
			if (mb_recursive_convert_variable(member, from_encoding, to_encoding)) {
				/* Drop our recursion mark before reporting the failure */
				if (Z_REFCOUNTED_P(var)) {
					Z_UNPROTECT_RECURSION_P(var);
				}
				return true;
			}
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return false;
}
3785 
PHP_FUNCTION(mb_convert_variables)3786 PHP_FUNCTION(mb_convert_variables)
3787 {
3788 	zval *args;
3789 	zend_string *to_enc_str;
3790 	zend_string *from_enc_str;
3791 	HashTable *from_enc_ht;
3792 	const mbfl_encoding *from_encoding, *to_encoding;
3793 	uint32_t argc;
3794 	size_t elistsz;
3795 	const mbfl_encoding **elist;
3796 
3797 	ZEND_PARSE_PARAMETERS_START(3, -1)
3798 		Z_PARAM_STR(to_enc_str)
3799 		Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3800 		Z_PARAM_VARIADIC('+', args, argc)
3801 	ZEND_PARSE_PARAMETERS_END();
3802 
3803 	/* new encoding */
3804 	to_encoding = php_mb_get_encoding(to_enc_str, 1);
3805 	if (!to_encoding) {
3806 		RETURN_THROWS();
3807 	}
3808 
3809 	from_encoding = MBSTRG(current_internal_encoding);
3810 
3811 	bool order_significant = true;
3812 
3813 	/* pre-conversion encoding */
3814 	if (from_enc_ht) {
3815 		if (from_enc_ht == MBSTRG(all_encodings_list)) {
3816 			/* If entire list of supported encodings returned by `mb_list_encodings` is passed
3817 			 * in, then don't treat the order of the list as significant */
3818 			order_significant = false;
3819 		}
3820 		if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3821 			RETURN_THROWS();
3822 		}
3823 	} else {
3824 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2) == FAILURE) {
3825 			RETURN_THROWS();
3826 		}
3827 	}
3828 
3829 	if (elistsz == 0) {
3830 		efree(ZEND_VOIDP(elist));
3831 		zend_argument_value_error(2, "must specify at least one encoding");
3832 		RETURN_THROWS();
3833 	}
3834 
3835 	if (elistsz == 1) {
3836 		from_encoding = *elist;
3837 	} else {
3838 		/* auto detect */
3839 		unsigned int num = 0;
3840 		for (size_t n = 0; n < argc; n++) {
3841 			zval *zv = &args[n];
3842 			num += mb_recursive_count_strings(zv);
3843 		}
3844 		const unsigned char **val_list = (const unsigned char**)ecalloc(num, sizeof(char *));
3845 		size_t *len_list = (size_t*)ecalloc(num, sizeof(size_t));
3846 		unsigned int i = 0;
3847 		for (size_t n = 0; n < argc; n++) {
3848 			zval *zv = &args[n];
3849 			if (mb_recursive_find_strings(zv, val_list, len_list, &i)) {
3850 				efree(ZEND_VOIDP(elist));
3851 				efree(ZEND_VOIDP(val_list));
3852 				efree(len_list);
3853 				php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3854 				RETURN_FALSE;
3855 			}
3856 		}
3857 		from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant);
3858 		efree(ZEND_VOIDP(val_list));
3859 		efree(len_list);
3860 		if (!from_encoding) {
3861 			php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3862 			efree(ZEND_VOIDP(elist));
3863 			RETURN_FALSE;
3864 		}
3865 
3866 	}
3867 
3868 	efree(ZEND_VOIDP(elist));
3869 
3870 	/* convert */
3871 	for (size_t n = 0; n < argc; n++) {
3872 		zval *zv = &args[n];
3873 		ZVAL_DEREF(zv);
3874 		if (mb_recursive_convert_variable(zv, from_encoding, to_encoding)) {
3875 			php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3876 			RETURN_FALSE;
3877 		}
3878 	}
3879 
3880 	RETURN_STRING(from_encoding->name);
3881 }
3882 
3883 /* HTML numeric entities */
3884 
3885 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
make_conversion_map(HashTable * target_hash,size_t * conversion_map_size)3886 static uint32_t *make_conversion_map(HashTable *target_hash, size_t *conversion_map_size)
3887 {
3888 	zval *hash_entry;
3889 
3890 	size_t n_elems = *conversion_map_size = zend_hash_num_elements(target_hash);
3891 	if (n_elems % 4 != 0) {
3892 		zend_argument_value_error(2, "must have a multiple of 4 elements");
3893 		return NULL;
3894 	}
3895 
3896 	uint32_t *convmap = (uint32_t*)safe_emalloc(n_elems, sizeof(uint32_t), 0);
3897 	uint32_t *mapelm = convmap;
3898 
3899 	ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
3900 		bool failed = true;
3901 		zend_long tmp = zval_try_get_long(hash_entry, &failed);
3902 		if (failed) {
3903 			efree(convmap);
3904 			zend_argument_value_error(2, "must only be composed of values of type int");
3905 			return NULL;
3906 		}
3907 		*mapelm++ = tmp;
3908 	} ZEND_HASH_FOREACH_END();
3909 
3910 	return convmap;
3911 }
3912 
/* Look up wchar `w` in the conversion map.
 *
 * The map is a flat array of `conversion_map_size` values, read four at a
 * time as (low codepoint, high codepoint, offset, mask). For the first group
 * whose inclusive range contains `w`, `(w + offset) & mask` is stored in
 * *retval and true is returned. If no range contains `w`, *retval is left
 * untouched and false is returned. */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	const uint32_t *entry = convmap;
	const uint32_t *const map_end = convmap + conversion_map_size;

	while (entry < map_end) {
		const uint32_t range_lo = entry[0];
		const uint32_t range_hi = entry[1];

		if (range_lo <= w && w <= range_hi) {
			/* `w` is one of the wchars which should become an HTML entity */
			const uint32_t offset = entry[2];
			const uint32_t mask = entry[3];
			*retval = (w + offset) & mask;
			return true;
		}

		entry += 4;
	}

	/* `w` is outside every range in the map */
	return false;
}
3934 
/* Replace every character of `input` which falls in one of the conversion
 * map's ranges with an HTML numeric entity: "&#NNN;", or "&#xHHH;" with
 * uppercase hex digits when `hex` is true. All other characters pass through
 * unchanged. The input is decoded with encoding->to_wchar and the result is
 * re-encoded with encoding->from_wchar; the string built by
 * mb_convert_buf_result() is returned. */
static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size, bool hex)
{
	/* A single decoded wchar can grow to as many as 13 wchars once it has
	 * been turned into an entity, so size the output buffer accordingly */
	uint32_t wchar_buf[32], converted_buf[32 * 13];
	/* Scratch space in which the digits of one entity are produced, back to front */
	unsigned char entity[16];
	unsigned char *const entity_end = entity + sizeof(entity);
	const uint32_t radix = hex ? 16 : 10;

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		/* Decode the next batch of at most 32 wchars */
		size_t n_wchars = encoding->to_wchar(&in, &in_len, wchar_buf, 32, &state);
		ZEND_ASSERT(n_wchars <= 32);
		uint32_t *out = converted_buf;

		for (const uint32_t *src = wchar_buf, *src_end = wchar_buf + n_wchars; src < src_end; src++) {
			uint32_t w = *src;

			if (!html_numeric_entity_convert(w, convmap, conversion_map_size, &w)) {
				/* Not covered by the map: copy through as-is */
				*out++ = w;
				continue;
			}

			/* `w` now holds the number to print inside the entity */
			*out++ = '&';
			*out++ = '#';
			if (hex) {
				*out++ = 'x';
			}

			/* Produce the digits from least to most significant; zero
			 * yields the single digit '0' */
			unsigned char *digit = entity_end;
			do {
				*(--digit) = "0123456789ABCDEF"[w % radix];
				w /= radix;
			} while (w > 0);

			while (digit < entity_end) {
				*out++ = *digit++;
			}

			*out++ = ';';
		}

		ZEND_ASSERT(out <= converted_buf + sizeof(converted_buf)/sizeof(*converted_buf));
		encoding->from_wchar(converted_buf, out - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
4000 
4001 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)4002 PHP_FUNCTION(mb_encode_numericentity)
4003 {
4004 	zend_string *encoding = NULL, *str;
4005 	size_t conversion_map_size;
4006 	HashTable *target_hash;
4007 	bool is_hex = false;
4008 
4009 	ZEND_PARSE_PARAMETERS_START(2, 4)
4010 		Z_PARAM_STR(str)
4011 		Z_PARAM_ARRAY_HT(target_hash)
4012 		Z_PARAM_OPTIONAL
4013 		Z_PARAM_STR_OR_NULL(encoding)
4014 		Z_PARAM_BOOL(is_hex)
4015 	ZEND_PARSE_PARAMETERS_END();
4016 
4017 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4018 	if (!enc) {
4019 		RETURN_THROWS();
4020 	}
4021 
4022 	uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4023 	if (convmap == NULL) {
4024 		RETURN_THROWS();
4025 	}
4026 
4027 	RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, conversion_map_size, is_hex));
4028 	efree(convmap);
4029 }
4030 /* }}} */
4031 
/* Map the number found inside an HTML numeric entity back to a codepoint.
 *
 * The map is a flat array of `conversion_map_size` values, read four at a
 * time; the first three of each group are (low codepoint, high codepoint,
 * offset) and the fourth is not used here. For the first group where
 * `number - offset` (unsigned arithmetic) lies in the inclusive range, that
 * value is stored in *retval and true is returned. Otherwise *retval is left
 * untouched and false is returned. */
static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	const uint32_t *entry = convmap;
	const uint32_t *const map_end = convmap + conversion_map_size;

	while (entry < map_end) {
		const uint32_t range_lo = entry[0];
		const uint32_t range_hi = entry[1];
		const uint32_t candidate = number - entry[2];

		if (range_lo <= candidate && candidate <= range_hi) {
			*retval = candidate;
			return true;
		}

		entry += 4;
	}

	return false;
}
4049 
4050 #define DEC_ENTITY_MINLEN 3  /* For "&#" and 1 decimal digit */
4051 #define HEX_ENTITY_MINLEN 4  /* For "&#x" and 1 hexadecimal digit */
4052 #define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
4053 #define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
4054 
/* Replace HTML numeric entities in `input` with the characters they stand for.
 *
 * Recognised forms are "&#" followed by decimal digits and "&#x" followed by
 * hexadecimal digits (either case); a ';' directly after the digits is
 * consumed as part of the entity when present, but is not required.
 * An entity is only replaced if html_numeric_entity_deconvert() maps its value
 * to a codepoint using `convmap`; otherwise (and also when it has no digits,
 * too many digits, or a decimal value too large for 32 bits) its text is
 * copied to the output unchanged.
 *
 * input:               string to decode, in `encoding`
 * encoding:            supplies the to_wchar/from_wchar conversion routines
 * convmap:             conversion map, as built by make_conversion_map()
 * conversion_map_size: number of uint32_t values in `convmap`
 *
 * Returns the string built by mb_convert_buf_result(). */
html_numeric_entity_decode(zend_string * input,const mbfl_encoding * encoding,uint32_t * convmap,size_t conversion_map_size)4055 static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size)
4056 {
4057 	uint32_t wchar_buf[128], converted_buf[128];
4058 
4059 	unsigned int state = 0;
4060 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
4061 	size_t in_len = ZSTR_LEN(input);
4062 
4063 	mb_convert_buf buf;
4064 	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
4065 
4066 	/* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
4067 	 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
4068 	 * 2nd 'converted' buffer.
4069 	 *
4070 	 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
4071 	 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
4072 	 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
4073 	 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
4074 	 * to store the next batch of wchars after it.
4075 	 *
4076 	 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
4077 	 * wchars from the 1st buffer to the 2nd one.
4078 	 *
4079 	 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
4080 	 * have to do bounds checks when writing wchars into it.
4081 	 */
4082 
	/* Number of wchars at the start of `wchar_buf` which were carried over from
	 * the previous iteration because they may be the beginning of an entity */
4083 	unsigned int wchar_buf_offset = 0;
4084 
4085 	while (in_len) {
4086 		/* Leave space for sentinel at the end of the buffer */
4087 		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
4088 		out_len += wchar_buf_offset;
4089 		ZEND_ASSERT(out_len <= 127);
4090 		wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */
4091 
		/* `p` is the read position in `wchar_buf`; `converted` is the write
		 * position in `converted_buf` */
4092 		uint32_t *p, *converted;
4093 
4094 		/* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
4095 		 * be there (in `wchar_buf[0]`), so don't bother in that case */
4096 		if (wchar_buf_offset == 0) {
4097 			p = wchar_buf;
4098 			while (*p != '&')
4099 				p++;
4100 			if (p == wchar_buf + out_len) {
4101 				/* No HTML entities in this buffer */
4102 				encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
4103 				continue;
4104 			}
4105 
4106 			/* Copy over the prefix with no & which we already scanned */
4107 			memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
4108 			converted = converted_buf + (p - wchar_buf);
4109 		} else {
4110 			p = wchar_buf;
4111 			converted = converted_buf;
4112 		}
4113 
4114 found_ampersand:
4115 		ZEND_ASSERT(*p == '&');
		/* `p` stays on the '&'; `p2` scans ahead over the candidate entity */
4116 		uint32_t *p2 = p;
4117 
4118 		/* These tests can't overrun end of buffer, because we have a '&' sentinel there */
4119 		if (*++p2 == '#') {
4120 			if (*++p2 == 'x') {
4121 				/* Possible hex entity */
4122 				uint32_t w = *++p2;
4123 				while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
4124 					w = *++p2;
4125 				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
4126 					/* We hit the end of the buffer while reading digits, and
4127 					 * more wchars are still coming in the next buffer
4128 					 * Reprocess this entity on next iteration */
4129 					memmove(wchar_buf, p, (p2 - p) * 4);
4130 					wchar_buf_offset = p2 - p;
4131 					goto process_converted_wchars;
4132 				} else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
4133 					/* Invalid entity (too long or "&#x" only) */
4134 					memcpy(converted, p, (p2 - p) * 4);
4135 					converted += p2 - p;
4136 				} else {
4137 					/* Valid hexadecimal entity */
					/* Skip "&#x"; the digits are between p3 and p2 */
4138 					uint32_t value = 0, *p3 = p + 3;
4139 					while (p3 < p2) {
4140 						w = *p3++;
4141 						if (w <= '9') {
4142 							value = (value * 16) + (w - '0');
4143 						} else if (w >= 'a') {
4144 							value = (value * 16) + 10 + (w - 'a');
4145 						} else {
4146 							value = (value * 16) + 10 + (w - 'A');
4147 						}
4148 					}
4149 					if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
4150 						converted++;
						/* A ';' terminating the entity is consumed along with it */
4151 						if (*p2 == ';')
4152 							p2++;
4153 					} else {
						/* Value is not covered by the conversion map; keep the text as-is */
4154 						memcpy(converted, p, (p2 - p) * 4);
4155 						converted += p2 - p;
4156 					}
4157 				}
4158 			} else {
4159 				/* Possible decimal entity */
4160 				uint32_t w = *p2;
4161 				while (w >= '0' && w <= '9')
4162 					w = *++p2;
4163 				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
4164 					/* The number of digits was legal (no more than 10 decimal digits)
4165 					 * Reprocess this entity on next iteration of main loop */
4166 					memmove(wchar_buf, p, (p2 - p) * 4);
4167 					wchar_buf_offset = p2 - p;
4168 					goto process_converted_wchars;
4169 				} else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
4170 					/* Invalid entity (too long or "&#" only) */
4171 					memcpy(converted, p, (p2 - p) * 4);
4172 					converted += p2 - p;
4173 				} else {
4174 					/* Valid decimal entity */
					/* Skip "&#"; the digits are between p3 and p2 */
4175 					uint32_t value = 0, *p3 = p + 2;
4176 					while (p3 < p2) {
4177 						/* If unsigned integer overflow would occur in the below
4178 						 * multiplication by 10, this entity is no good
4179 						 * 0x19999999 is 1/10th of 0xFFFFFFFF */
4180 						if (value > 0x19999999) {
4181 							memcpy(converted, p, (p2 - p) * 4);
4182 							converted += p2 - p;
4183 							goto decimal_entity_too_big;
4184 						}
4185 						value = (value * 10) + (*p3++ - '0');
4186 					}
4187 					if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
4188 						converted++;
						/* A ';' terminating the entity is consumed along with it */
4189 						if (*p2 == ';')
4190 							p2++;
4191 					} else {
						/* Value is not covered by the conversion map; keep the text as-is */
4192 						memcpy(converted, p, (p2 - p) * 4);
4193 						converted += p2 - p;
4194 					}
4195 				}
4196 			}
4197 		} else if ((p2 == wchar_buf + out_len) && in_len) {
4198 			/* Corner case: & at end of buffer */
4199 			wchar_buf[0] = '&';
4200 			wchar_buf_offset = 1;
4201 			goto process_converted_wchars;
4202 		} else {
			/* A lone '&' which does not start an entity; `p2` already points just past it */
4203 			*converted++ = '&';
4204 		}
4205 decimal_entity_too_big:
4206 
4207 		/* Starting to scan a new section of the wchar buffer
4208 		 * 'p2' is pointing at the next wchar which needs to be processed */
4209 		p = p2;
4210 		while (*p2 != '&')
4211 			p2++;
4212 
4213 		if (p2 > p) {
4214 			memcpy(converted, p, (p2 - p) * 4);
4215 			converted += p2 - p;
4216 			p = p2;
4217 		}
4218 
		/* If we stopped on a real '&' rather than on the sentinel, handle it */
4219 		if (p < wchar_buf + out_len)
4220 			goto found_ampersand;
4221 
4222 		/* We do not have any wchars remaining at the end of this buffer which
4223 		 * we need to reprocess on the next call */
4224 		wchar_buf_offset = 0;
4225 process_converted_wchars:
4226 		ZEND_ASSERT(converted <= converted_buf + 128);
4227 		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
4228 	}
4229 
4230 	return mb_convert_buf_result(&buf, encoding);
4231 }
4232 
4233 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)4234 PHP_FUNCTION(mb_decode_numericentity)
4235 {
4236 	zend_string *encoding = NULL, *str;
4237 	size_t conversion_map_size;
4238 	HashTable *target_hash;
4239 
4240 	ZEND_PARSE_PARAMETERS_START(2, 3)
4241 		Z_PARAM_STR(str)
4242 		Z_PARAM_ARRAY_HT(target_hash)
4243 		Z_PARAM_OPTIONAL
4244 		Z_PARAM_STR_OR_NULL(encoding)
4245 	ZEND_PARSE_PARAMETERS_END();
4246 
4247 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4248 	if (!enc) {
4249 		RETURN_THROWS();
4250 	}
4251 
4252 	uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4253 	if (convmap == NULL) {
4254 		RETURN_THROWS();
4255 	}
4256 
4257 	RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, conversion_map_size));
4258 	efree(convmap);
4259 }
4260 /* }}} */
4261 
4262 /* {{{ Sends an email message with MIME scheme */
4263 #define CRLF "\r\n"
4264 
_php_mbstr_parse_mail_headers(HashTable * ht,const char * str,size_t str_len)4265 static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
4266 {
4267 	const char *ps;
4268 	size_t icnt;
4269 	int state = 0;
4270 	int crlf_state = -1;
4271 	char *token = NULL;
4272 	size_t token_pos = 0;
4273 	zend_string *fld_name, *fld_val;
4274 
4275 	ps = str;
4276 	icnt = str_len;
4277 	fld_name = fld_val = NULL;
4278 
4279 	/*
4280 	 *             C o n t e n t - T y p e :   t e x t / h t m l \r\n
4281 	 *             ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
4282 	 *      state  0            1           2          3
4283 	 *
4284 	 *             C o n t e n t - T y p e :   t e x t / h t m l \r\n
4285 	 *             ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
4286 	 * crlf_state -1                       0                     1 -1
4287 	 *
4288 	 */
4289 
4290 	while (icnt > 0) {
4291 		switch (*ps) {
4292 			case ':':
4293 				if (crlf_state == 1) {
4294 					token_pos++;
4295 				}
4296 
4297 				if (state == 0 || state == 1) {
4298 					if(token && token_pos > 0) {
4299 						fld_name = zend_string_init(token, token_pos, 0);
4300 					}
4301 					state = 2;
4302 				} else {
4303 					token_pos++;
4304 				}
4305 
4306 				crlf_state = 0;
4307 				break;
4308 
4309 			case '\n':
4310 				if (crlf_state == -1) {
4311 					goto out;
4312 				}
4313 				crlf_state = -1;
4314 				break;
4315 
4316 			case '\r':
4317 				if (crlf_state == 1) {
4318 					token_pos++;
4319 				} else {
4320 					crlf_state = 1;
4321 				}
4322 				break;
4323 
4324 			case ' ': case '\t':
4325 				if (crlf_state == -1) {
4326 					if (state == 3) {
4327 						/* continuing from the previous line */
4328 						state = 4;
4329 					} else {
4330 						/* simply skipping this new line */
4331 						state = 5;
4332 					}
4333 				} else {
4334 					if (crlf_state == 1) {
4335 						token_pos++;
4336 					}
4337 					if (state == 1 || state == 3) {
4338 						token_pos++;
4339 					}
4340 				}
4341 				crlf_state = 0;
4342 				break;
4343 
4344 			default:
4345 				switch (state) {
4346 					case 0:
4347 						token = (char*)ps;
4348 						token_pos = 0;
4349 						state = 1;
4350 						break;
4351 
4352 					case 2:
4353 						if (crlf_state != -1) {
4354 							token = (char*)ps;
4355 							token_pos = 0;
4356 
4357 							state = 3;
4358 							break;
4359 						}
4360 						ZEND_FALLTHROUGH;
4361 
4362 					case 3:
4363 						if (crlf_state == -1) {
4364 							if(token && token_pos > 0) {
4365 								fld_val = zend_string_init(token, token_pos, 0);
4366 							}
4367 
4368 							if (fld_name != NULL && fld_val != NULL) {
4369 								zval val;
4370 								zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4371 								ZVAL_STR(&val, fld_val);
4372 
4373 								zend_hash_update(ht, fld_name, &val);
4374 
4375 								zend_string_release_ex(fld_name, 0);
4376 							}
4377 
4378 							fld_name = fld_val = NULL;
4379 							token = (char*)ps;
4380 							token_pos = 0;
4381 
4382 							state = 1;
4383 						}
4384 						break;
4385 
4386 					case 4:
4387 						token_pos++;
4388 						state = 3;
4389 						break;
4390 				}
4391 
4392 				if (crlf_state == 1) {
4393 					token_pos++;
4394 				}
4395 
4396 				token_pos++;
4397 
4398 				crlf_state = 0;
4399 				break;
4400 		}
4401 		ps++, icnt--;
4402 	}
4403 out:
4404 	if (state == 2) {
4405 		token = "";
4406 		token_pos = 0;
4407 
4408 		state = 3;
4409 	}
4410 	if (state == 3) {
4411 		if(token && token_pos > 0) {
4412 			fld_val = zend_string_init(token, token_pos, 0);
4413 		}
4414 		if (fld_name != NULL && fld_val != NULL) {
4415 			zval val;
4416 			zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4417 			ZVAL_STR(&val, fld_val);
4418 			zend_hash_update(ht, fld_name, &val);
4419 
4420 			zend_string_release_ex(fld_name, 0);
4421 		}
4422 	}
4423 	return state;
4424 }
4425 
PHP_FUNCTION(mb_send_mail)4426 PHP_FUNCTION(mb_send_mail)
4427 {
4428 	char *to;
4429 	size_t to_len;
4430 	char *message;
4431 	size_t message_len;
4432 	zend_string *subject;
4433 	zend_string *extra_cmd = NULL;
4434 	HashTable *headers_ht = NULL;
4435 	zend_string *str_headers = NULL;
4436 	size_t i;
4437 	char *to_r = NULL;
4438 	char *force_extra_parameters = INI_STR("mail.force_extra_parameters");
4439 	bool suppress_content_type = false;
4440 	bool suppress_content_transfer_encoding = false;
4441 
4442 	char *p;
4443 	enum mbfl_no_encoding;
4444 	const mbfl_encoding *tran_cs,	/* transfer text charset */
4445 						*head_enc,	/* header transfer encoding */
4446 						*body_enc;	/* body transfer encoding */
4447 	const mbfl_language *lang;
4448 	HashTable ht_headers;
4449 	zval *s;
4450 
4451 	/* character-set, transfer-encoding */
4452 	tran_cs = &mbfl_encoding_utf8;
4453 	head_enc = &mbfl_encoding_base64;
4454 	body_enc = &mbfl_encoding_base64;
4455 	lang = mbfl_no2language(MBSTRG(language));
4456 	if (lang != NULL) {
4457 		tran_cs = mbfl_no2encoding(lang->mail_charset);
4458 		head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4459 		body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4460 	}
4461 
4462 	ZEND_PARSE_PARAMETERS_START(3, 5)
4463 		Z_PARAM_PATH(to, to_len)
4464 		Z_PARAM_PATH_STR(subject)
4465 		Z_PARAM_PATH(message, message_len)
4466 		Z_PARAM_OPTIONAL
4467 		Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4468 		Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4469 	ZEND_PARSE_PARAMETERS_END();
4470 
4471 	if (str_headers) {
4472 		if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4473 			zend_argument_value_error(4, "must not contain any null bytes");
4474 			RETURN_THROWS();
4475 		}
4476 		str_headers = php_trim(str_headers, NULL, 0, 2);
4477 	} else if (headers_ht) {
4478 		str_headers = php_mail_build_headers(headers_ht);
4479 		if (EG(exception)) {
4480 			RETURN_THROWS();
4481 		}
4482 	}
4483 
4484 	zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4485 
4486 	if (str_headers != NULL) {
4487 		_php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4488 	}
4489 
4490 	if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4491 		char *tmp;
4492 		char *param_name;
4493 		char *charset = NULL;
4494 
4495 		ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4496 		p = strchr(Z_STRVAL_P(s), ';');
4497 
4498 		if (p != NULL) {
4499 			/* skipping the padded spaces */
4500 			do {
4501 				++p;
4502 			} while (*p == ' ' || *p == '\t');
4503 
4504 			if (*p != '\0') {
4505 				if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4506 					if (strcasecmp(param_name, "charset") == 0) {
4507 						const mbfl_encoding *_tran_cs = tran_cs;
4508 
4509 						charset = php_strtok_r(NULL, "= \"", &tmp);
4510 						if (charset != NULL) {
4511 							_tran_cs = mbfl_name2encoding(charset);
4512 						}
4513 
4514 						if (!_tran_cs) {
4515 							php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4516 							_tran_cs = &mbfl_encoding_ascii;
4517 						}
4518 						tran_cs = _tran_cs;
4519 					}
4520 				}
4521 			}
4522 		}
4523 		suppress_content_type = true;
4524 	}
4525 
4526 	if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4527 		const mbfl_encoding *_body_enc;
4528 
4529 		ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4530 		_body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4531 		switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4532 			case mbfl_no_encoding_base64:
4533 			case mbfl_no_encoding_7bit:
4534 			case mbfl_no_encoding_8bit:
4535 				body_enc = _body_enc;
4536 				break;
4537 
4538 			default:
4539 				php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4540 				body_enc =	&mbfl_encoding_8bit;
4541 				break;
4542 		}
4543 		suppress_content_transfer_encoding = true;
4544 	}
4545 
4546 	/* To: */
4547 	if (to_len > 0) {
4548 		to_r = estrndup(to, to_len);
4549 		for (; to_len; to_len--) {
4550 			if (!isspace((unsigned char) to_r[to_len - 1])) {
4551 				break;
4552 			}
4553 			to_r[to_len - 1] = '\0';
4554 		}
4555 		for (i = 0; to_r[i]; i++) {
4556 			if (iscntrl((unsigned char) to_r[i])) {
4557 				/* According to RFC 822, section 3.1.1 long headers may be separated into
4558 				 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4559 				 * To prevent these separators from being replaced with a space, we skip over them. */
4560 				if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4561 					i += 2;
4562 					while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4563 						i++;
4564 					}
4565 					continue;
4566 				}
4567 
4568 				to_r[i] = ' ';
4569 			}
4570 		}
4571 	} else {
4572 		to_r = to;
4573 	}
4574 
4575 	/* Subject: */
4576 	const mbfl_encoding *enc = MBSTRG(current_internal_encoding);
4577 	if (enc == &mbfl_encoding_pass) {
4578 		enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4579 	}
4580 	const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4581 	size_t line_sep_len = strlen(line_sep);
4582 
4583 	subject = mb_mime_header_encode(subject, enc, tran_cs, head_enc == &mbfl_encoding_base64, (char*)line_sep, line_sep_len, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4584 
4585 	/* message body */
4586 	const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding);
4587 	if (msg_enc == &mbfl_encoding_pass) {
4588 		msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4589 	}
4590 
4591 	unsigned int num_errors = 0;
4592 	zend_string *tmpstr = mb_fast_convert((unsigned char*)message, message_len, msg_enc, tran_cs, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4593 	zend_string *conv = mb_fast_convert((unsigned char*)ZSTR_VAL(tmpstr), ZSTR_LEN(tmpstr), &mbfl_encoding_8bit, body_enc, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4594 	zend_string_free(tmpstr);
4595 	message = ZSTR_VAL(conv);
4596 
4597 	/* other headers */
4598 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4599 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4600 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4601 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4602 
4603 	smart_str str = {0};
4604 	bool empty = true;
4605 
4606 	if (str_headers != NULL) {
4607 		/* Strip trailing CRLF from `str_headers`; we will add CRLF back if necessary */
4608 		size_t len = ZSTR_LEN(str_headers);
4609 		if (ZSTR_VAL(str_headers)[len-1] == '\n') {
4610 			len--;
4611 		}
4612 		if (ZSTR_VAL(str_headers)[len-1] == '\r') {
4613 			len--;
4614 		}
4615 		smart_str_appendl(&str, ZSTR_VAL(str_headers), len);
4616 		empty = false;
4617 		zend_string_release_ex(str_headers, 0);
4618 	}
4619 
4620 	if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4621 		if (!empty) {
4622 			smart_str_appendl(&str, line_sep, line_sep_len);
4623 		}
4624 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4625 		empty = false;
4626 	}
4627 
4628 	if (!suppress_content_type) {
4629 		if (!empty) {
4630 			smart_str_appendl(&str, line_sep, line_sep_len);
4631 		}
4632 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4633 
4634 		p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4635 		if (p != NULL) {
4636 			smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4637 			smart_str_appends(&str, p);
4638 		}
4639 		empty = false;
4640 	}
4641 
4642 	if (!suppress_content_transfer_encoding) {
4643 		if (!empty) {
4644 			smart_str_appendl(&str, line_sep, line_sep_len);
4645 		}
4646 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4647 		p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4648 		if (p == NULL) {
4649 			p = "7bit";
4650 		}
4651 		smart_str_appends(&str, p);
4652 	}
4653 
4654 	str_headers = smart_str_extract(&str);
4655 
4656 	if (force_extra_parameters) {
4657 		extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4658 	} else if (extra_cmd) {
4659 		extra_cmd = php_escape_shell_cmd(ZSTR_VAL(extra_cmd));
4660 	}
4661 
4662 	RETVAL_BOOL(php_mail(to_r, ZSTR_VAL(subject), message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL));
4663 
4664 	if (extra_cmd) {
4665 		zend_string_release_ex(extra_cmd, 0);
4666 	}
4667 	if (to_r != to) {
4668 		efree(to_r);
4669 	}
4670 	zend_string_release(subject);
4671 	zend_string_free(conv);
4672 	zend_hash_destroy(&ht_headers);
4673 	if (str_headers) {
4674 		zend_string_release_ex(str_headers, 0);
4675 	}
4676 }
4677 
4678 #undef CRLF
4679 #undef MAIL_ASCIIZ_CHECK_MBSTRING
4680 #undef PHP_MBSTR_MAIL_MIME_HEADER1
4681 #undef PHP_MBSTR_MAIL_MIME_HEADER2
4682 #undef PHP_MBSTR_MAIL_MIME_HEADER3
4683 #undef PHP_MBSTR_MAIL_MIME_HEADER4
4684 /* }}} */
4685 
4686 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)4687 PHP_FUNCTION(mb_get_info)
4688 {
4689 	zend_string *type = NULL;
4690 	size_t n;
4691 	char *name;
4692 	zval row;
4693 	const mbfl_encoding **entry;
4694 	const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
4695 
4696 	ZEND_ASSERT(lang);
4697 
4698 	ZEND_PARSE_PARAMETERS_START(0, 1)
4699 		Z_PARAM_OPTIONAL
4700 		Z_PARAM_STR(type)
4701 	ZEND_PARSE_PARAMETERS_END();
4702 
4703 	if (!type || zend_string_equals_literal_ci(type, "all")) {
4704 		array_init(return_value);
4705 		if (MBSTRG(current_internal_encoding)) {
4706 			add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4707 		}
4708 		if (MBSTRG(http_input_identify)) {
4709 			add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4710 		}
4711 		if (MBSTRG(current_http_output_encoding)) {
4712 			add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4713 		}
4714 
4715 		add_assoc_str(return_value, "http_output_conv_mimetypes",
4716 			zend_ini_str("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)
4717 		);
4718 
4719 		name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4720 		add_assoc_string(return_value, "mail_charset", name);
4721 
4722 		name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4723 		add_assoc_string(return_value, "mail_header_encoding", name);
4724 
4725 		name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4726 		add_assoc_string(return_value, "mail_body_encoding", name);
4727 
4728 		add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4729 
4730 		if (MBSTRG(encoding_translation)) {
4731 			add_assoc_string(return_value, "encoding_translation", "On");
4732 		} else {
4733 			add_assoc_string(return_value, "encoding_translation", "Off");
4734 		}
4735 
4736 		name = (char *)mbfl_no_language2name(MBSTRG(language));
4737 		add_assoc_string(return_value, "language", name);
4738 
4739 		// TODO Seems to always have one entry at least?
4740 		n = MBSTRG(current_detect_order_list_size);
4741 		entry = MBSTRG(current_detect_order_list);
4742 		if (n > 0) {
4743 			size_t i;
4744 			array_init(&row);
4745 			for (i = 0; i < n; i++) {
4746 				add_next_index_string(&row, (*entry)->name);
4747 				entry++;
4748 			}
4749 			add_assoc_zval(return_value, "detect_order", &row);
4750 		}
4751 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4752 			add_assoc_string(return_value, "substitute_character", "none");
4753 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4754 			add_assoc_string(return_value, "substitute_character", "long");
4755 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4756 			add_assoc_string(return_value, "substitute_character", "entity");
4757 		} else {
4758 			add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4759 		}
4760 		if (MBSTRG(strict_detection)) {
4761 			add_assoc_string(return_value, "strict_detection", "On");
4762 		} else {
4763 			add_assoc_string(return_value, "strict_detection", "Off");
4764 		}
4765 	} else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4766 		ZEND_ASSERT(MBSTRG(current_internal_encoding));
4767 		RETURN_STRING((char *)MBSTRG(current_internal_encoding)->name);
4768 	} else if (zend_string_equals_literal_ci(type, "http_input")) {
4769 		if (MBSTRG(http_input_identify)) {
4770 			RETURN_STRING((char *)MBSTRG(http_input_identify)->name);
4771 		}
4772 		RETURN_NULL();
4773 	} else if (zend_string_equals_literal_ci(type, "http_output")) {
4774 		ZEND_ASSERT(MBSTRG(current_http_output_encoding));
4775 		RETURN_STRING((char *)MBSTRG(current_http_output_encoding)->name);
4776 	} else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4777 		RETURN_STR(
4778 			zend_ini_str(
4779 				"mbstring.http_output_conv_mimetypes",
4780 				sizeof("mbstring.http_output_conv_mimetypes") - 1,
4781 				false
4782 			)
4783 		);
4784 	} else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4785 		name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4786 		RETURN_STRING(name);
4787 	} else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4788 		name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4789 		RETURN_STRING(name);
4790 	} else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4791 		name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4792 		RETURN_STRING(name);
4793 	} else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4794 		RETURN_LONG(MBSTRG(illegalchars));
4795 	} else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4796 		if (MBSTRG(encoding_translation)) {
4797 			RETURN_STRING("On");
4798 		} else {
4799 			RETURN_STRING("Off");
4800 		}
4801 	} else if (zend_string_equals_literal_ci(type, "language")) {
4802 		name = (char *)mbfl_no_language2name(MBSTRG(language));
4803 		RETURN_STRING(name);
4804 	} else if (zend_string_equals_literal_ci(type, "detect_order")) {
4805 		// TODO Seems to always have one entry at least?
4806 		n = MBSTRG(current_detect_order_list_size);
4807 		entry = MBSTRG(current_detect_order_list);
4808 		if (n > 0) {
4809 			size_t i;
4810 			array_init(return_value);
4811 			for (i = 0; i < n; i++) {
4812 				add_next_index_string(return_value, (*entry)->name);
4813 				entry++;
4814 			}
4815 		}
4816 	} else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4817 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4818 			RETURN_STRING("none");
4819 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4820 			RETURN_STRING("long");
4821 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4822 			RETURN_STRING("entity");
4823 		} else {
4824 			RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
4825 		}
4826 	} else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4827 		if (MBSTRG(strict_detection)) {
4828 			RETURN_STRING("On");
4829 		} else {
4830 			RETURN_STRING("Off");
4831 		}
4832 	} else {
4833 		php_error_docref(NULL, E_WARNING, "argument #1 ($type) must be a valid type");
4834 		RETURN_FALSE;
4835 	}
4836 }
4837 /* }}} */
4838 
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)4839 MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4840 {
4841 	uint32_t wchar_buf[128];
4842 	unsigned char *in = (unsigned char*)input;
4843 	unsigned int state = 0;
4844 
4845 	if (encoding->check != NULL) {
4846 		return encoding->check(in, length);
4847 	}
4848 
4849 	/* If the input string is not encoded in the given encoding, there is a significant chance
4850 	 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4851 	 * buffer of 128 codepoints, convert and check just a few codepoints first */
4852 	size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4853 	ZEND_ASSERT(out_len <= 8);
4854 	for (unsigned int i = 0; i < out_len; i++) {
4855 		if (wchar_buf[i] == MBFL_BAD_INPUT) {
4856 			return false;
4857 		}
4858 	}
4859 
4860 	while (length) {
4861 		out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4862 		ZEND_ASSERT(out_len <= 128);
4863 		for (unsigned int i = 0; i < out_len; i++) {
4864 			if (wchar_buf[i] == MBFL_BAD_INPUT) {
4865 				return false;
4866 			}
4867 		}
4868 	}
4869 
4870 	return true;
4871 }
4872 
4873 /* MSVC 32-bit has issues with 64-bit intrinsics.
4874  * (Bad 7/8-byte UTF-8 strings would be wrongly passed through as 'valid')
4875  * It seems this is caused by a bug in MS Visual C++
4876  * Ref: https://stackoverflow.com/questions/37509129/potential-bug-in-visual-studio-c-compiler-or-in-intel-intrinsics-avx2-mm256-s */
4877 #if defined(PHP_WIN32) && !defined(__clang__) && defined(_MSC_VER) && defined(_M_IX86)
4878 # define MBSTRING_BROKEN_X86_MSVC_INTRINSICS
4879 #endif
4880 
4881 /* If we are building an AVX2-only binary, don't compile the next function */
4882 #ifndef ZEND_INTRIN_AVX2_NATIVE
4883 
4884 /* SSE2-based function for validating UTF-8 strings
4885  * A faster implementation which uses AVX2 instructions follows */
static bool mb_fast_check_utf8_default(zend_string *str)
{
	/* Returns true if the content of `str` is valid UTF-8, false otherwise
	 * When SSE2 is available at compile time, the string is checked in 16-byte blocks;
	 * otherwise a scalar, byte-by-byte validator is used */
	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
# ifdef __SSE2__
	/* `e` points 1 byte past the last full 16-byte block of string content
	 * Note that we include the terminating null byte which is included in each zend_string
	 * as part of the content to check; this ensures that multi-byte characters which are
	 * truncated abruptly at the end of the string will be detected as invalid */
	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m128i) - 1));

	/* For checking for illegal bytes 0xF5-FF */
	const __m128i over_f5 = _mm_set1_epi8(-117);
	/* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
	const __m128i over_9f = _mm_set1_epi8(-97);
	/* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
	const __m128i over_8f = _mm_set1_epi8(-113);
	/* For checking for illegal bytes 0xC0-C1 */
	const __m128i find_c0 = _mm_set1_epi8(-64);
	const __m128i c0_to_c1 = _mm_set1_epi8(-126);
	/* For checking structure of continuation bytes */
	const __m128i find_e0 = _mm_set1_epi8(-32);
	const __m128i find_f0 = _mm_set1_epi8(-16);

	__m128i last_block = _mm_setzero_si128();
	__m128i operand;

	while (p < e) {
		operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */

check_operand:
		/* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
		if (!_mm_movemask_epi8(operand)) {
			/* Even if this block only contains single-byte characters, there may have been a
			 * multi-byte character at the end of the previous block, which was supposed to
			 * have continuation bytes in this block
			 * This bitmask will pick out a 2/3/4-byte character starting from the last byte of
			 * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
			 * from the 3rd last */
			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
			if (_mm_movemask_epi8(bad)) {
				return false;
			}

			/* Consume as many full blocks of single-byte characters as we can */
			while (true) {
				p += sizeof(__m128i);
				if (p >= e) {
					goto finish_up_remaining_bytes;
				}
				operand = _mm_loadu_si128((__m128i*)p);
				if (_mm_movemask_epi8(operand)) {
					break;
				}
			}
		}

		/* Check for >= 0xF5, which are illegal byte values in UTF-8
		 * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
		 * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
		 * Then a single signed compare will pick out any bad bytes
		 * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
		__m128i bad = _mm_cmplt_epi8(_mm_add_epi8(operand, over_f5), over_f5);

		/* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
		 * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
		 * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
		 * We can check for both problems at once by generating a vector where each byte < 0xA0
		 * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
		 * Shift the original block right by one byte, and compare the shifted block with the bitmask */
		__m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15));
		__m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f)));
		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask1));

		/* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
		 * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
		 * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
		 * Build the bitmask and compare it with the shifted block */
		__m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f)));
		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask2));

		/* Check for overlong 2-byte code units
		 * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
		 * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
		 * byte range, do a signed compare to pick out any bad bytes */
		bad = _mm_or_si128(bad, _mm_cmplt_epi8(_mm_add_epi8(operand, find_c0), c0_to_c1));

		/* Check structure of continuation bytes
		 * A UTF-8 byte should be a continuation byte if, and only if, it is:
		 * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
		 * 2) 2 bytes after the start of a 3-byte or 4-byte character
		 * 3) 3 bytes after the start of a 4-byte character
		 * We build 3 bitmasks with 0xFF in each such position, and OR them together to
		 * get a single bitmask with 0xFF in each position where a continuation byte should be */
		__m128i cont_mask = _mm_cmpeq_epi8(_mm_and_si128(operand2, find_c0), find_c0);
		__m128i operand3 = _mm_or_si128(_mm_slli_si128(operand, 2), _mm_srli_si128(last_block, 14));
		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand3, find_e0), find_e0));
		__m128i operand4 = _mm_or_si128(_mm_slli_si128(operand, 3), _mm_srli_si128(last_block, 13));
		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand4, find_f0), find_f0));

		/* Now, use a signed comparison to get another bitmask with 0xFF in each position where
		 * a continuation byte actually is
		 * XOR those two bitmasks together; if everything is good, the result should be zero
		 * However, if a byte which should have been a continuation wasn't, or if a byte which
		 * shouldn't have been a continuation was, we will get 0xFF in that position */
		__m128i continuation = _mm_cmplt_epi8(operand, find_c0);
		bad = _mm_or_si128(bad, _mm_xor_si128(continuation, cont_mask));

		/* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
		 * If that value is non-zero, then we found a bad byte somewhere! */
		if (_mm_movemask_epi8(bad)) {
			return false;
		}

		last_block = operand;
		p += sizeof(__m128i);
	}

finish_up_remaining_bytes:
	/* Finish up 1-15 remaining bytes */
	if (p == e) {
		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m128i) - 1); /* Not including terminating null */

		/* We want to use the above vectorized code to check a block of less than 16 bytes,
		 * but there is no good way to read a variable number of bytes into an XMM register
		 *
		 * When 2, 4 or 8 bytes are enough to cover the remaining bytes, they are copied into
		 * an integer of that size (via memcpy, so that the string bytes are never accessed
		 * through a pointer of a different type) and the rest of the register is zero-filled
		 * Since the string content is followed by a terminating null, reading one byte more
		 * than `remaining_bytes` stays within the zend_string
		 *
		 * Crazy hack for the other cases (5, 6, and 9 or more bytes remaining)...
		 * We know that these bytes are part of a zend_string, and a zend_string has some
		 * 'header' fields which occupy the memory just before its content
		 * And, those header fields occupy more than 16 bytes...
		 * So if we go back from `p` and load 16 bytes which end just after the terminating null,
		 * we may pick up some 'junk' bytes from the zend_string header fields, but we will get the
		 * bytes we wanted in the tail end of our XMM register, and this will never cause a segfault.
		 * Then, we do a byte-wise shift (_mm_srli_si128) to get rid of the unwanted bytes and
		 * move the wanted ones to the start of the register
		 * Conveniently, the same shift also zero-fills the tail end of the XMM register
		 *
		 * The following `switch` looks useless, but it's not
		 * The PSRLDQ instruction used for the 128-bit shift requires an immediate (literal)
		 * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
		 */
		switch (remaining_bytes) {
		case 0: {
			/* The string ended exactly on a block boundary; make sure that the last block
			 * did not end with the start of a multi-byte character */
			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
			return _mm_movemask_epi8(bad) == 0;
		}
		case 1:
		case 2: {
			uint16_t tail;
			memcpy(&tail, p, sizeof(tail));
			operand = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, tail);
			goto check_operand;
		}
		case 3:
		case 4: {
			uint32_t tail;
			memcpy(&tail, p, sizeof(tail));
			operand = _mm_set_epi32(0, 0, 0, tail);
			goto check_operand;
		}
		case 5:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10);
			goto check_operand;
		case 6:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9);
			goto check_operand;
		case 7:
		case 8: {
#ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
			int32_t tail[2];
			memcpy(tail, p, sizeof(tail));
			operand = _mm_set_epi32(0, 0, tail[1], tail[0]);
#else
			uint64_t tail;
			memcpy(&tail, p, sizeof(tail));
			operand = _mm_set_epi64x(0, tail);
#endif
			goto check_operand;
		}
		case 9:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6);
			goto check_operand;
		case 10:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5);
			goto check_operand;
		case 11:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4);
			goto check_operand;
		case 12:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3);
			goto check_operand;
		case 13:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2);
			goto check_operand;
		case 14:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1);
			goto check_operand;
		case 15:
			/* No trailing bytes are left which need to be checked
			 * We get 15 because we did not include the terminating null when
			 * calculating `remaining_bytes`, so the value wraps around */
			return true;
		}

		ZEND_UNREACHABLE();
	}

	return true;
# else
	/* This UTF-8 validation function is derived from PCRE2 */
	size_t length = ZSTR_LEN(str);
	/* Table of the number of extra bytes, indexed by the first byte masked with
	0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
	static const uint8_t utf8_table[] = {
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
		3,3,3,3,3,3,3,3
	};

	for (; length > 0; p++) {
		uint32_t d;
		unsigned char c = *p;
		length--;

		if (c < 128) {
			/* ASCII character */
			continue;
		}

		if (c < 0xc0) {
			/* Isolated 10xx xxxx byte */
			return false;
		}

		if (c >= 0xf5) {
			/* Byte values 0xF5-FF never appear in UTF-8 */
			return false;
		}

		uint32_t ab = utf8_table[c & 0x3f]; /* Number of additional bytes (1-3) */
		if (length < ab) {
			/* Missing bytes */
			return false;
		}
		length -= ab;

		/* Check top bits in the second byte */
		if (((d = *(++p)) & 0xc0) != 0x80) {
			return false;
		}

		/* For each length, check that the remaining bytes start with the 0x80 bit
		 * set and not the 0x40 bit. Then check for an overlong sequence, and for the
		 * excluded range 0xd800 to 0xdfff. */
		switch (ab) {
		case 1:
			/* 2-byte character. No further bytes to check for 0x80. Check first byte
			 * for xx00 000x (overlong sequence). */
			if ((c & 0x3e) == 0) {
				return false;
			}
			break;

		case 2:
			/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes for
			 * 1110 0000, xx0x xxxx (overlong sequence) or 1110 1101, 1010 xxxx (0xd800-0xdfff) */
			if ((*(++p) & 0xc0) != 0x80 || (c == 0xe0 && (d & 0x20) == 0) || (c == 0xed && d >= 0xa0)) {
				return false;
			}
			break;

		case 3:
			/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
			 * bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
			 * character greater than 0x0010ffff (f4 8f bf bf)
			 * A first byte greater than 0xf4 was already rejected above, so only
			 * 0xf4 itself needs to be examined here */
			if ((*(++p) & 0xc0) != 0x80 || (*(++p) & 0xc0) != 0x80 || (c == 0xf0 && (d & 0x30) == 0) || (c == 0xf4 && d > 0x8f)) {
				return false;
			}
			break;

			EMPTY_SWITCH_DEFAULT_CASE();
		}
	}

	return true;
# endif
}
5160 
5161 #endif /* #ifndef ZEND_INTRIN_AVX2_NATIVE */
5162 
5163 #ifdef ZEND_INTRIN_AVX2_NATIVE
5164 
5165 /* We are building AVX2-only binary */
5166 # include <immintrin.h>
5167 # define mb_fast_check_utf8 mb_fast_check_utf8_avx2
5168 
5169 #elif defined(ZEND_INTRIN_AVX2_RESOLVER)
5170 
5171 /* We are building binary which works with or without AVX2; whether or not to use
5172  * AVX2-accelerated functions will be determined at runtime */
5173 # include <immintrin.h>
5174 # include "Zend/zend_cpuinfo.h"
5175 
5176 # ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
5177 /* Dynamic linker will decide whether or not to use AVX2-based functions and
5178  * resolve symbols accordingly */
5179 
5180 ZEND_INTRIN_AVX2_FUNC_DECL(bool mb_fast_check_utf8_avx2(zend_string *str));
5181 
5182 bool mb_fast_check_utf8(zend_string *str) __attribute__((ifunc("resolve_check_utf8")));
5183 
5184 typedef bool (*check_utf8_func_t)(zend_string*);
5185 
5186 ZEND_NO_SANITIZE_ADDRESS
5187 ZEND_ATTRIBUTE_UNUSED
resolve_check_utf8(void)5188 static check_utf8_func_t resolve_check_utf8(void)
5189 {
5190 	if (zend_cpu_supports_avx2()) {
5191 		return mb_fast_check_utf8_avx2;
5192 	}
5193 	return mb_fast_check_utf8_default;
5194 }
5195 
5196 # else /* ZEND_INTRIN_AVX2_FUNC_PTR */
5197 /* We are compiling for a target where the dynamic linker will not be able to
5198  * resolve symbols according to whether the host supports AVX2 or not; so instead,
5199  * we can make calls go through a function pointer and set the function pointer
5200  * on module load */
5201 
5202 #ifdef HAVE_FUNC_ATTRIBUTE_TARGET
5203 static bool mb_fast_check_utf8_avx2(zend_string *str) __attribute__((target("avx2")));
5204 #else
5205 static bool mb_fast_check_utf8_avx2(zend_string *str);
5206 #endif
5207 
5208 static bool (*check_utf8_ptr)(zend_string *str) = NULL;
5209 
/* Forward the call through `check_utf8_ptr`, which is NULL until
 * init_check_utf8() has stored an implementation in it. */
static bool mb_fast_check_utf8(zend_string *str)
{
	return (*check_utf8_ptr)(str);
}
5214 
init_check_utf8(void)5215 static void init_check_utf8(void)
5216 {
5217 	if (zend_cpu_supports_avx2()) {
5218 		check_utf8_ptr = mb_fast_check_utf8_avx2;
5219 	} else {
5220 		check_utf8_ptr = mb_fast_check_utf8_default;
5221 	}
5222 }
5223 # endif
5224 
5225 #else
5226 
5227 /* No AVX2 support */
5228 #define mb_fast_check_utf8 mb_fast_check_utf8_default
5229 
5230 #endif
5231 
5232 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
5233 
5234 /* GCC prior to version 8 does not define all intrinsics. See GH-11514.
5235  * Use a workaround from https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values */
5236 #if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 8
5237 # define _mm256_set_m128i(v0, v1)  _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
5238 #endif
5239 
5240 /* Take (256-bit) `hi` and `lo` as a 512-bit value, shift down by some
5241  * number of bytes, then take the low 256 bits
5242  * This is used to take some number of trailing bytes from the previous 32-byte
5243  * block followed by some number of leading bytes from the current 32-byte block
5244  *
5245  * _mm256_alignr_epi8 (VPALIGNR) is used to shift out bytes from a 256-bit
5246  * YMM register while shifting in bytes from another YMM register... but
5247  * it works separately on respective 128-bit halves of the YMM registers,
5248  * which is not what we want.
5249  * To make it work as desired, we first do _mm256_permute2x128_si256
5250  * (VPERM2I128) to combine the low 128 bits from the previous block and
5251  * the high 128 bits of the current block in one YMM register.
5252  * Then VPALIGNR will do what is needed. */
5253 #define _mm256_shift_epi8(hi, lo, shift) _mm256_alignr_epi8(lo, _mm256_permute2x128_si256(hi, lo, 33), 16 - shift)
5254 
5255 /* AVX2-based UTF-8 validation function; validates text in 32-byte chunks
5256  *
5257  * Some parts of this function are the same as `mb_fast_check_utf8`; code comments
5258  * are not repeated, so consult `mb_fast_check_utf8` for information on uncommented
5259  * sections. */
/* Returns true when the bytes of `str` pass every check below, and false as soon
 * as any 32-byte block fails one. Whole 32-byte blocks are handled by the main
 * loop; whatever is left over is zero-padded into one final block (see
 * `finish_up_remaining_bytes`) and sent through the same checks. */
5260 #ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
mb_fast_check_utf8_avx2(zend_string * str)5261 ZEND_API bool mb_fast_check_utf8_avx2(zend_string *str)
5262 #else
5263 static bool mb_fast_check_utf8_avx2(zend_string *str)
5264 #endif
5265 {
5266 	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
	/* `e` is `p` plus (length + 1) rounded down to a multiple of 32, so the main loop
	 * only ever loads whole 32-byte blocks. The "+ 1" counts the terminating null,
	 * which is why a 31-byte tail is consumed by the main loop as a full block. */
5267 	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m256i) - 1));
5268 
5269 	/* The algorithm used here for UTF-8 validation is partially adapted from the
5270 	 * paper "Validating UTF-8 In Less Than One Instruction Per Byte", by John Keiser
5271 	 * and Daniel Lemire.
5272 	 * Ref: https://arxiv.org/pdf/2010.03090.pdf
5273 	 *
5274 	 * Most types of invalid UTF-8 text can be detected by examining pairs of
5275 	 * successive bytes. Specifically:
5276 	 *
5277 	 * • Overlong 2-byte code units start with 0xC0 or 0xC1.
5278 	 *   No valid UTF-8 string ever uses these byte values.
5279 	 * • Overlong 3-byte code units start with 0xE0, followed by a byte < 0xA0.
5280 	 * • Overlong 4-byte code units start with 0xF0, followed by a byte < 0x90.
5281 	 * • 5-byte or 6-byte code units, which should never be used, start with
5282 	 *   0xF8-FE.
5283 	 * • A codepoint value higher than U+10FFFF, which is the highest value for
5284 	 *   any Unicode codepoint, would either start with 0xF4, followed by a
5285 	 *   byte >= 0x90, or else would start with 0xF5-F7, followed by any value.
5286 	 * • A codepoint value from U+D800-DFFF, which are reserved and should never
5287 	 *   be used, would start with 0xED, followed by a byte >= 0xA0.
5288 	 * • The byte value 0xFF is also illegal and is never used in valid UTF-8.
5289 	 *
5290 	 * To detect all these problems, for each pair of successive bytes, we do
5291 	 * table lookups using the high nibble of the first byte, the low nibble of
5292 	 * the first byte, and the high nibble of the second byte. Each table lookup
5293 	 * retrieves a bitmask, in which each 1 bit indicates a possible invalid
5294 	 * combination; AND those three bitmasks together, and any 1 bit in the result
5295 	 * will indicate an actual invalid byte combination was found.
5296 	 */
5297 
	/* One bit per error class; the three lookup tables below are built from these */
5298 #define BAD_BYTE 0x1
5299 #define OVERLONG_2BYTE 0x2
5300 #define _1BYTE (BAD_BYTE | OVERLONG_2BYTE)
5301 #define OVERLONG_3BYTE 0x4
5302 #define SURROGATE 0x8
5303 #define OVERLONG_4BYTE 0x10
5304 #define INVALID_CP 0x20
5305 
5306 	/* Each of these are 16-entry tables, repeated twice; this is required by the
5307 	 * VPSHUFB instruction which we use to perform 32 table lookups in parallel
5308 	 * The first entry is for 0xF, the second is for 0xE, and so on down to 0x0
5309 	 *
5310 	 * So, for example, notice that the 4th entry in the 1st table is OVERLONG_2BYTE;
5311 	 * that means that high nibble 0xC is consistent with the byte pair being part of
5312 	 * an overlong 2-byte code unit */
	/* Indexed by the high nibble of the FIRST byte of each pair (see the lookup below) */
5313 	const __m256i bad_hi_nibble2 = _mm256_set_epi8(
5314 		BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5315 		0, 0, 0, 0,
5316 		0, 0, 0, 0,
5317 		0, 0, 0, 0,
5318 		BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5319 		0, 0, 0, 0,
5320 		0, 0, 0, 0,
5321 		0, 0, 0, 0);
	/* Indexed by the low nibble of the FIRST byte of each pair */
5322 	const __m256i bad_lo_nibble2 = _mm256_set_epi8(
5323 		BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5324 		BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5325 		BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5326 		0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5327 		BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5328 		BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5329 		BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5330 		0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
	/* Indexed by the high nibble of the SECOND byte of each pair */
5331 	const __m256i bad_hi_nibble = _mm256_set_epi8(
5332 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5333 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5334 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5335 		_1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5336 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5337 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5338 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5339 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5340 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5341 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5342 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5343 		_1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5344 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5345 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5346 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5347 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5348 
	/* -64 is 0xC0 as a signed byte; a signed "less than -64" test therefore matches
	 * exactly the byte values 0x80-0xBF */
5349 	const __m256i find_continuation = _mm256_set1_epi8(-64);
5350 	const __m256i _b = _mm256_set1_epi8(0xB);
5351 	const __m256i _d = _mm256_set1_epi8(0xD);
5352 	const __m256i _f = _mm256_set1_epi8(0xF);
5353 
	/* Nibbles of the previously processed block; all zero before the first block */
5354 	__m256i last_hi_nibbles = _mm256_setzero_si256(), last_lo_nibbles = _mm256_setzero_si256();
5355 	__m256i operand;
5356 
5357 	while (p < e) {
5358 		operand = _mm256_loadu_si256((__m256i*)p);
5359 
	/* Also entered by `goto` from the tail handling at the bottom of the function,
	 * with `operand` holding the zero-padded leftover bytes */
5360 check_operand:
5361 		if (!_mm256_movemask_epi8(operand)) {
5362 			/* Entire 32-byte block is ASCII characters; the only thing we need to validate is that
5363 			 * the previous block didn't end with an incomplete multi-byte character
5364 			 * (This will also confirm that the previous block didn't end with a bad byte like 0xFF) */
			/* _mm256_set_epi8 takes the highest byte first, so the three real thresholds apply
			 * to the last three bytes of the previous block: the final byte must not have a
			 * high nibble above 0xB, the one before it not above 0xD, and the one before that
			 * not above 0xE. A nibble can never exceed 127, so all other positions are ignored. */
5365 			__m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5366 			__m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5367 			if (_mm256_movemask_epi8(bad)) {
5368 				return false;
5369 			}
5370 
5371 			/* Consume as many full blocks of single-byte characters as we can */
5372 			while (true) {
5373 				p += sizeof(__m256i);
5374 				if (p >= e) {
5375 					goto finish_up_remaining_bytes;
5376 				}
5377 				operand = _mm256_loadu_si256((__m256i*)p);
5378 				if (_mm256_movemask_epi8(operand)) {
5379 					break;
5380 				}
5381 			}
5382 		}
5383 
		/* Split every byte into its two nibbles. The shift works on 16-bit lanes, so the
		 * AND with 0xF also discards the bits shifted in from the neighbouring byte. */
5384 		__m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(operand, 4), _f);
5385 		__m256i lo_nibbles = _mm256_and_si256(operand, _f);
5386 
		/* Nibbles of the byte immediately before each position; for position 0 that is
		 * the last byte of the previous block */
5387 		__m256i lo_nibbles2 = _mm256_shift_epi8(last_lo_nibbles, lo_nibbles, 1);
5388 		__m256i hi_nibbles2 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 1);
5389 
5390 		/* Do parallel table lookups in all 3 tables */
5391 		__m256i bad = _mm256_cmpgt_epi8(
5392 			_mm256_and_si256(
5393 				_mm256_and_si256(
5394 					_mm256_shuffle_epi8(bad_lo_nibble2, lo_nibbles2),
5395 					_mm256_shuffle_epi8(bad_hi_nibble2, hi_nibbles2)),
5396 				_mm256_shuffle_epi8(bad_hi_nibble, hi_nibbles)),
5397 			_mm256_setzero_si256());
5398 
		/* `cont_mask` marks the positions where a continuation byte is required: the byte
		 * 1 back has high nibble above 0xB, or the byte 2 back has high nibble above 0xD,
		 * or the byte 3 back has high nibble 0xF */
5399 		__m256i cont_mask = _mm256_cmpgt_epi8(hi_nibbles2, _b);
5400 		__m256i hi_nibbles3 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 2);
5401 		cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpgt_epi8(hi_nibbles3, _d));
5402 		__m256i hi_nibbles4 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 3);
5403 		cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpeq_epi8(hi_nibbles4, _f));
5404 
		/* `continuation` marks the positions that actually hold a byte in 0x80-0xBF; any
		 * position where "required" and "actual" disagree (XOR) is an error */
5405 		__m256i continuation = _mm256_cmpgt_epi8(find_continuation, operand);
5406 		bad = _mm256_or_si256(bad, _mm256_xor_si256(continuation, cont_mask));
5407 
5408 		if (_mm256_movemask_epi8(bad)) {
5409 			return false;
5410 		}
5411 
5412 		last_hi_nibbles = hi_nibbles;
5413 		last_lo_nibbles = lo_nibbles;
5414 		p += sizeof(__m256i);
5415 	}
5416 
	/* `p == e` here means the whole blocks are exhausted and the tail has not been
	 * looked at yet. Once the tail has been sent through `check_operand`, `p` has been
	 * advanced past `e`, so control falls through to the final `return true`. */
5417 finish_up_remaining_bytes:
5418 	if (p == e) {
5419 		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m256i) - 1); /* Not including terminating null */
5420 
		/* Each case builds a 32-byte operand holding the leftover bytes in its low
		 * positions and zeroes above them. The 16-byte loads at `p - k` start before `p`
		 * and shift the k leading bytes out again; the loads at `p + k` fetch the bytes
		 * from offset 16 onward and shift them down to the bottom of the upper half.
		 * In the cases using 16-byte loads, the highest address read is the terminating null.
		 * NOTE(review): the `p - k` loads presumably rely on readable memory (the
		 * zend_string header) preceding the character data when `p` is still at the
		 * start of the string — confirm. */
5421 		switch (remaining_bytes) {
5422 		case 0: ;
5423 			/* No actual data bytes are remaining */
5424 			__m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5425 			__m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5426 			return _mm256_movemask_epi8(bad) == 0;
5427 		case 1:
5428 		case 2:
5429 			operand = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, *((int16_t*)p));
5430 			goto check_operand;
5431 		case 3:
5432 		case 4:
5433 			operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, *((int32_t*)p));
5434 			goto check_operand;
5435 		case 5:
5436 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10));
5437 			goto check_operand;
5438 		case 6:
5439 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9));
5440 			goto check_operand;
5441 		case 7:
5442 		case 8:
5443 #ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
5444 			operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
5445 #else
5446 			operand = _mm256_set_epi64x(0, 0, 0, *((int64_t*)p));
5447 #endif
5448 			goto check_operand;
5449 		case 9:
5450 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6));
5451 			goto check_operand;
5452 		case 10:
5453 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5));
5454 			goto check_operand;
5455 		case 11:
5456 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4));
5457 			goto check_operand;
5458 		case 12:
5459 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3));
5460 			goto check_operand;
5461 		case 13:
5462 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2));
5463 			goto check_operand;
5464 		case 14:
5465 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1));
5466 			goto check_operand;
5467 		case 15:
5468 		case 16:
5469 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_loadu_si128((__m128i*)p));
5470 			goto check_operand;
5471 		case 17:
5472 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 2)), 14), _mm_loadu_si128((__m128i*)p));
5473 			goto check_operand;
5474 		case 18:
5475 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 3)), 13), _mm_loadu_si128((__m128i*)p));
5476 			goto check_operand;
5477 		case 19:
5478 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 4)), 12), _mm_loadu_si128((__m128i*)p));
5479 			goto check_operand;
5480 		case 20:
5481 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 5)), 11), _mm_loadu_si128((__m128i*)p));
5482 			goto check_operand;
5483 		case 21:
5484 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 6)), 10), _mm_loadu_si128((__m128i*)p));
5485 			goto check_operand;
5486 		case 22:
5487 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 7)), 9), _mm_loadu_si128((__m128i*)p));
5488 			goto check_operand;
5489 		case 23:
5490 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 8)), 8), _mm_loadu_si128((__m128i*)p));
5491 			goto check_operand;
5492 		case 24:
5493 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 9)), 7), _mm_loadu_si128((__m128i*)p));
5494 			goto check_operand;
5495 		case 25:
5496 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 10)), 6), _mm_loadu_si128((__m128i*)p));
5497 			goto check_operand;
5498 		case 26:
5499 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 11)), 5), _mm_loadu_si128((__m128i*)p));
5500 			goto check_operand;
5501 		case 27:
5502 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 12)), 4), _mm_loadu_si128((__m128i*)p));
5503 			goto check_operand;
5504 		case 28:
5505 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 13)), 3), _mm_loadu_si128((__m128i*)p));
5506 			goto check_operand;
5507 		case 29:
5508 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 14)), 2), _mm_loadu_si128((__m128i*)p));
5509 			goto check_operand;
5510 		case 30:
5511 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 15)), 1), _mm_loadu_si128((__m128i*)p));
5512 			goto check_operand;
		/* A 31-byte tail plus its terminating null is exactly one block, which the main
		 * loop has already validated (see the computation of `e`) */
5513 		case 31:
5514 			return true;
5515 		}
5516 
5517 		ZEND_UNREACHABLE();
5518 	}
5519 
5520 	return true;
5521 }
5522 
5523 #endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
5524 
/* Report whether `str` is valid text in `encoding`.
 *
 * Any encoding other than UTF-8 is delegated to php_mb_check_encoding().
 * For UTF-8, a string already flagged as valid is accepted immediately;
 * otherwise it is checked with mb_fast_check_utf8() and, when it passes and
 * is not an interned string, IS_STR_VALID_UTF8 is set on it. */
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
{
	if (encoding != &mbfl_encoding_utf8) {
		return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
	}

	if (ZSTR_IS_VALID_UTF8(str)) {
		return true;
	}

	if (!mb_fast_check_utf8(str)) {
		return false;
	}

	/* Record the positive result on the string itself */
	if (!ZSTR_IS_INTERNED(str)) {
		GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
	}
	return true;
}
5540 
/* Check every string key and every value of `vars` against `encoding`,
 * descending into nested arrays.
 *
 * Returns true only if all string keys and string values are valid and every
 * value is a string, array, integer, float, null or boolean. Any other value
 * type makes the result false. An array that is already being visited (a
 * circular reference) raises an E_WARNING and yields false.
 *
 * Scanning stops at the first invalid key or value: the result cannot change
 * after that, so the remaining elements are neither validated nor recursed
 * into. */
static bool php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	zend_long idx;
	zend_string *key;
	zval *entry;
	bool valid = true;

	(void)(idx); /* Suppress spurious compiler warning that `idx` is not used */

	if (GC_IS_RECURSIVE(vars)) {
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return false;
	}
	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
		ZVAL_DEREF(entry);
		if (key && !mb_check_str_encoding(key, encoding)) {
			valid = false;
			break;
		}
		/* `valid` is always true on entry to the switch, because the loop is left as
		 * soon as it becomes false */
		switch (Z_TYPE_P(entry)) {
			case IS_STRING:
				valid = mb_check_str_encoding(Z_STR_P(entry), encoding);
				break;
			case IS_ARRAY:
				valid = php_mb_check_encoding_recursive(Z_ARRVAL_P(entry), encoding);
				break;
			case IS_LONG:
			case IS_DOUBLE:
			case IS_NULL:
			case IS_TRUE:
			case IS_FALSE:
				break;
			default:
				/* Other types are error. */
				valid = false;
				break;
		}
		/* A `break` inside the switch only leaves the switch, so the loop has to be
		 * left explicitly here */
		if (!valid) {
			break;
		}
	} ZEND_HASH_FOREACH_END();
	/* Always reached after the protection was taken, whichever way the loop ended */
	GC_TRY_UNPROTECT_RECURSION(vars);
	return valid;
}
5591 
5592 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)5593 PHP_FUNCTION(mb_check_encoding)
5594 {
5595 	zend_string *input_str = NULL, *enc = NULL;
5596 	HashTable *input_ht = NULL;
5597 	const mbfl_encoding *encoding;
5598 
5599 	ZEND_PARSE_PARAMETERS_START(0, 2)
5600 		Z_PARAM_OPTIONAL
5601 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
5602 		Z_PARAM_STR_OR_NULL(enc)
5603 	ZEND_PARSE_PARAMETERS_END();
5604 
5605 	encoding = php_mb_get_encoding(enc, 2);
5606 	if (!encoding) {
5607 		RETURN_THROWS();
5608 	}
5609 
5610 	if (input_ht) {
5611 		RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
5612 	} else if (input_str) {
5613 		RETURN_BOOL(mb_check_str_encoding(input_str, encoding));
5614 	} else {
5615 		php_error_docref(NULL, E_DEPRECATED,
5616 			"Calling mb_check_encoding() without argument is deprecated");
5617 
5618 		/* FIXME: Actually check all inputs, except $_FILES file content. */
5619 		RETURN_BOOL(MBSTRG(illegalchars) == 0);
5620 	}
5621 }
5622 /* }}} */
5623 
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)5624 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
5625 	const uint32_t enc_name_arg_num)
5626 {
5627 	const mbfl_encoding *enc;
5628 	enum mbfl_no_encoding no_enc;
5629 
5630 	ZEND_ASSERT(str_len > 0);
5631 
5632 	enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
5633 	if (!enc) {
5634 		return -2;
5635 	}
5636 
5637 	no_enc = enc->no_encoding;
5638 	if (php_mb_is_unsupported_no_encoding(no_enc)) {
5639 		zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
5640 		return -2;
5641 	}
5642 
5643 	/* Some legacy text encodings have a minimum required wchar buffer size;
5644 	 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
5645 	uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
5646 	unsigned int state = 0;
5647 	size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
5648 	ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
5649 
5650 	if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
5651 		return -1;
5652 	}
5653 	return wchar_buf[0];
5654 }
5655 
5656 /* {{{ */
PHP_FUNCTION(mb_ord)5657 PHP_FUNCTION(mb_ord)
5658 {
5659 	char *str;
5660 	size_t str_len;
5661 	zend_string *enc = NULL;
5662 	zend_long cp;
5663 
5664 	ZEND_PARSE_PARAMETERS_START(1, 2)
5665 		Z_PARAM_STRING(str, str_len)
5666 		Z_PARAM_OPTIONAL
5667 		Z_PARAM_STR_OR_NULL(enc)
5668 	ZEND_PARSE_PARAMETERS_END();
5669 
5670 	if (str_len == 0) {
5671 		zend_argument_value_error(1, "must not be empty");
5672 		RETURN_THROWS();
5673 	}
5674 
5675 	cp = php_mb_ord(str, str_len, enc, 2);
5676 
5677 	if (0 > cp) {
5678 		if (cp == -2) {
5679 			RETURN_THROWS();
5680 		}
5681 		RETURN_FALSE;
5682 	}
5683 
5684 	RETURN_LONG(cp);
5685 }
5686 /* }}} */
5687 
/* Build a string holding the single code point `cp` in the encoding named by
 * `enc_name`.
 *
 * Returns NULL when the encoding lookup fails, when the encoding is not
 * supported by mb_chr() (a ValueError is raised for that case), when `cp` is
 * outside 0..0x10FFFF, when the encoding is UTF-8 and `cp` lies in
 * 0xD800..0xDFFF, or when the conversion reports illegal characters.
 * `enc_name_arg_num` is passed on to php_mb_get_encoding(). */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (enc == NULL) {
		return NULL;
	}

	const enum mbfl_no_encoding no_enc = enc->no_encoding;
	if (php_mb_is_unsupported_no_encoding(no_enc)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", enc->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	if (php_mb_is_no_encoding_utf8(no_enc)) {
		/* UTF-8 is encoded directly, without going through the converter */
		if (cp >= 0xd800 && cp <= 0xdfff) {
			return NULL;
		}
		if (cp < 0x80) {
			return ZSTR_CHAR(cp);
		}

		size_t nbytes;
		zend_long lead_bits;
		if (cp < 0x800) {
			nbytes = 2;
			lead_bits = 0xc0;
		} else if (cp < 0x10000) {
			nbytes = 3;
			lead_bits = 0xe0;
		} else {
			nbytes = 4;
			lead_bits = 0xf0;
		}

		zend_string *utf8 = zend_string_alloc(nbytes, 0);
		char *out = ZSTR_VAL(utf8);

		/* Fill the continuation bytes from the last one backwards, taking six bits of
		 * the code point each time; what remains goes into the lead byte */
		out[nbytes] = 0;
		for (size_t i = nbytes - 1; i > 0; i--) {
			out[i] = 0x80 | (cp & 0x3f);
			cp >>= 6;
		}
		out[0] = lead_bits | cp;

		return utf8;
	}

	/* Every other encoding: write the code point as 4 big-endian bytes and convert
	 * that from UCS-4BE to the target encoding */
	char buf[4];
	for (int i = 0; i < 4; i++) {
		buf[i] = (cp >> (24 - 8 * i)) & 0xff;
	}

	/* Zero the illegal-character counter around the conversion so that a non-zero
	 * value afterwards can only come from this conversion; restore it before returning */
	long orig_illegalchars = MBSTRG(illegalchars);
	MBSTRG(illegalchars) = 0;
	zend_string *converted = php_mb_convert_encoding_ex(buf, 4, enc, &mbfl_encoding_ucs4be);

	if (MBSTRG(illegalchars) != 0) {
		zend_string_release(converted);
		converted = NULL;
	}

	MBSTRG(illegalchars) = orig_illegalchars;
	return converted;
}
5757 
5758 /* {{{ */
PHP_FUNCTION(mb_chr)5759 PHP_FUNCTION(mb_chr)
5760 {
5761 	zend_long cp;
5762 	zend_string *enc = NULL;
5763 
5764 	ZEND_PARSE_PARAMETERS_START(1, 2)
5765 		Z_PARAM_LONG(cp)
5766 		Z_PARAM_OPTIONAL
5767 		Z_PARAM_STR_OR_NULL(enc)
5768 	ZEND_PARSE_PARAMETERS_END();
5769 
5770 	zend_string* ret = php_mb_chr(cp, enc, 2);
5771 	if (ret == NULL) {
5772 		RETURN_FALSE;
5773 	}
5774 
5775 	RETURN_STR(ret);
5776 }
5777 /* }}} */
5778 
PHP_FUNCTION(mb_str_pad)5779 PHP_FUNCTION(mb_str_pad)
5780 {
5781 	zend_string *input, *encoding_str = NULL, *pad = ZSTR_CHAR(' ');
5782 	zend_long pad_to_length;
5783 	zend_long pad_type_val = PHP_STR_PAD_RIGHT;
5784 
5785 	ZEND_PARSE_PARAMETERS_START(2, 5)
5786 		Z_PARAM_STR(input)
5787 		Z_PARAM_LONG(pad_to_length)
5788 		Z_PARAM_OPTIONAL
5789 		Z_PARAM_STR(pad)
5790 		Z_PARAM_LONG(pad_type_val)
5791 		Z_PARAM_STR_OR_NULL(encoding_str)
5792 	ZEND_PARSE_PARAMETERS_END();
5793 
5794 	const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
5795 	if (!encoding) {
5796 		RETURN_THROWS();
5797 	}
5798 
5799 	size_t input_length = mb_get_strlen(input, encoding);
5800 
5801 	/* If resulting string turns out to be shorter than input string,
5802 	   we simply copy the input and return. */
5803 	if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
5804 		RETURN_STR_COPY(input);
5805 	}
5806 
5807 	if (ZSTR_LEN(pad) == 0) {
5808 		zend_argument_value_error(3, "must be a non-empty string");
5809 		RETURN_THROWS();
5810 	}
5811 
5812 	if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
5813 		zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
5814 		RETURN_THROWS();
5815 	}
5816 
5817 	size_t pad_length = mb_get_strlen(pad, encoding);
5818 
5819 	size_t num_mb_pad_chars = pad_to_length - input_length;
5820 
5821 	/* We need to figure out the left/right padding lengths. */
5822 	size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
5823 	switch (pad_type_val) {
5824 		case PHP_STR_PAD_RIGHT:
5825 			right_pad = num_mb_pad_chars;
5826 			break;
5827 
5828 		case PHP_STR_PAD_LEFT:
5829 			left_pad = num_mb_pad_chars;
5830 			break;
5831 
5832 		case PHP_STR_PAD_BOTH:
5833 			left_pad = num_mb_pad_chars / 2;
5834 			right_pad = num_mb_pad_chars - left_pad;
5835 			break;
5836 	}
5837 
5838 	/* How many full block copies need to happen, and how many characters are then left over? */
5839 	size_t full_left_pad_copies = left_pad / pad_length;
5840 	size_t full_right_pad_copies = right_pad / pad_length;
5841 	size_t remaining_left_pad_chars = left_pad % pad_length;
5842 	size_t remaining_right_pad_chars = right_pad % pad_length;
5843 
5844 	if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
5845 		goto overflow_no_release;
5846 	}
5847 
5848 	/* Compute the number of bytes required for the padding */
5849 	size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
5850 	size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);
5851 
5852 	/* No special fast-path handling necessary for zero-length pads because these functions will not
5853 	 * allocate memory in case a zero-length pad is required. */
5854 	zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
5855 	zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);
5856 
5857 	if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
5858 		|| full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
5859 		goto overflow;
5860 	}
5861 
5862 	size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
5863 	size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);
5864 
5865 	if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
5866 		|| ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
5867 		goto overflow;
5868 	}
5869 
5870 	zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
5871 	char *buffer = ZSTR_VAL(result);
5872 
5873 	/* First we pad the left. */
5874 	for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5875 		memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5876 	}
5877 	memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
5878 	buffer += ZSTR_LEN(remaining_left_pad_str);
5879 
5880 	/* Then we copy the input string. */
5881 	memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
5882 	buffer += ZSTR_LEN(input);
5883 
5884 	/* Finally, we pad on the right. */
5885 	for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5886 		memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5887 	}
5888 	memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));
5889 
5890 	ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';
5891 
5892 	zend_string_release_ex(remaining_left_pad_str, false);
5893 	zend_string_release_ex(remaining_right_pad_str, false);
5894 
5895 	RETURN_NEW_STR(result);
5896 
5897 overflow:
5898 	zend_string_release_ex(remaining_left_pad_str, false);
5899 	zend_string_release_ex(remaining_right_pad_str, false);
5900 overflow_no_release:
5901 	zend_throw_error(NULL, "String size overflow");
5902 	RETURN_THROWS();
5903 }
5904 
5905 /* {{{ */
PHP_FUNCTION(mb_scrub)5906 PHP_FUNCTION(mb_scrub)
5907 {
5908 	zend_string *str, *enc_name = NULL;
5909 
5910 	ZEND_PARSE_PARAMETERS_START(1, 2)
5911 		Z_PARAM_STR(str)
5912 		Z_PARAM_OPTIONAL
5913 		Z_PARAM_STR_OR_NULL(enc_name)
5914 	ZEND_PARSE_PARAMETERS_END();
5915 
5916 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
5917 	if (!enc) {
5918 		RETURN_THROWS();
5919 	}
5920 
5921 	if (enc == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
5922 		/* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
5923 		RETURN_STR_COPY(str);
5924 	}
5925 
5926 	RETURN_STR(php_mb_convert_encoding_ex(ZSTR_VAL(str), ZSTR_LEN(str), enc, enc));
5927 }
5928 /* }}} */
5929 
5930 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)5931 static void php_mb_populate_current_detect_order_list(void)
5932 {
5933 	const mbfl_encoding **entry = 0;
5934 	size_t nentries;
5935 
5936 	if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
5937 		nentries = MBSTRG(detect_order_list_size);
5938 		entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5939 		memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
5940 	} else {
5941 		const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
5942 		size_t i;
5943 		nentries = MBSTRG(default_detect_order_list_size);
5944 		entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5945 		for (i = 0; i < nentries; i++) {
5946 			entry[i] = mbfl_no2encoding(src[i]);
5947 		}
5948 	}
5949 	MBSTRG(current_detect_order_list) = entry;
5950 	MBSTRG(current_detect_order_list_size) = nentries;
5951 }
5952 /* }}} */
5953 
5954 /* {{{ static int php_mb_encoding_translation() */
php_mb_encoding_translation(void)5955 static int php_mb_encoding_translation(void)
5956 {
5957 	return MBSTRG(encoding_translation);
5958 }
5959 /* }}} */
5960 
php_mb_mbchar_bytes(const char * s,const mbfl_encoding * enc)5961 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
5962 {
5963 	if (enc) {
5964 		if (enc->mblen_table) {
5965 			if (s) {
5966 				return enc->mblen_table[*(unsigned char *)s];
5967 			}
5968 		} else if (enc->flag & MBFL_ENCTYPE_WCS2) {
5969 			return 2;
5970 		} else if (enc->flag & MBFL_ENCTYPE_WCS4) {
5971 			return 4;
5972 		}
5973 	}
5974 	return 1;
5975 }
5976 
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)5977 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
5978 {
5979 	const char *p = s;
5980 	char *last=NULL;
5981 
5982 	if (nbytes == (size_t)-1) {
5983 		size_t nb = 0;
5984 
5985 		while (*p != '\0') {
5986 			if (nb == 0) {
5987 				if ((unsigned char)*p == (unsigned char)c) {
5988 					last = (char *)p;
5989 				}
5990 				nb = php_mb_mbchar_bytes(p, enc);
5991 				if (nb == 0) {
5992 					return NULL; /* something is going wrong! */
5993 				}
5994 			}
5995 			--nb;
5996 			++p;
5997 		}
5998 	} else {
5999 		size_t bcnt = nbytes;
6000 		size_t nbytes_char;
6001 		while (bcnt > 0) {
6002 			if ((unsigned char)*p == (unsigned char)c) {
6003 				last = (char *)p;
6004 			}
6005 			nbytes_char = php_mb_mbchar_bytes(p, enc);
6006 			if (bcnt < nbytes_char) {
6007 				return NULL;
6008 			}
6009 			p += nbytes_char;
6010 			bcnt -= nbytes_char;
6011 		}
6012 	}
6013 	return last;
6014 }
6015 
php_mb_stripos(bool mode,zend_string * haystack,zend_string * needle,zend_long offset,const mbfl_encoding * enc)6016 MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc)
6017 {
6018 	/* We're using simple case-folding here, because we'd have to deal with remapping of
6019 	 * offsets otherwise. */
6020 	zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
6021 	zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
6022 
6023 	size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
6024 
6025 	zend_string_free(haystack_conv);
6026 	zend_string_free(needle_conv);
6027 
6028 	return n;
6029 }
6030 
php_mb_gpc_get_detect_order(const zend_encoding *** list,size_t * list_size)6031 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
6032 {
6033 	*list = (const zend_encoding **)MBSTRG(http_input_list);
6034 	*list_size = MBSTRG(http_input_list_size);
6035 }
6036 /* }}} */
6037 
php_mb_gpc_set_input_encoding(const zend_encoding * encoding)6038 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
6039 {
6040 	MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
6041 }
6042 /* }}} */
6043 
/* Base64 alphabet, indexed by 6-bit value, given as ASCII codes.
 * The 64 alphabet entries are followed by one trailing zero byte. */
static const unsigned char base64_table[] = {
	/* 'A'..'P' */
	0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
	/* 'Q'..'Z', 'a'..'f' */
	0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66,
	/* 'g'..'v' */
	0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
	/* 'w'..'z', '0'..'9', '+', '/' */
	0x77, 0x78, 0x79, 0x7a, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2b, 0x2f,
	/* '\0' */
	0x00
};
6056 
/* Compute how many bytes the current contents of `tmpbuf` will occupy once
 * transfer-encoded.
 *
 * base64 == true:  4 output bytes for every (started) group of 3 input bytes.
 * base64 == false: 3 output bytes for every byte which is above 0x7F, is '=',
 *                  or is flagged in mime_char_needs_qencode; 1 byte otherwise. */
static size_t transfer_encoded_size(mb_convert_buf *tmpbuf, bool base64)
{
	if (base64) {
		return ((mb_convert_buf_len(tmpbuf) + 2) / 3) * 4;
	}

	size_t total = 0;
	unsigned char *cur = (unsigned char*)ZSTR_VAL(tmpbuf->str);

	for (; cur < tmpbuf->out; cur++) {
		unsigned char c = *cur;

		if (c > 0x7F || c == '=' || mime_char_needs_qencode[c]) {
			total += 3; /* '=' followed by two hex digits */
		} else {
			total += 1;
		}
	}
	return total;
}
6071 
/* Transfer-encode all bytes currently held in `tmpbuf` and append the result
 * to `outbuf`; `tmpbuf` is then reset to empty.
 *
 * base64 == true:  Base64, padded with '=' to a multiple of 4 output bytes.
 * base64 == false: bytes above 0x7F, '=' and bytes flagged in
 *                  mime_char_needs_qencode become "=XX" (uppercase hex);
 *                  all other bytes are copied unchanged. */
static void transfer_encode_mime_bytes(mb_convert_buf *tmpbuf, mb_convert_buf *outbuf, bool base64)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(outbuf, out, limit);
	unsigned char *src = (unsigned char*)ZSTR_VAL(tmpbuf->str), *src_end = tmpbuf->out;

	if (!base64) {
		static const char hex_digits[] = "0123456789ABCDEF";

		/* Worst case: every input byte expands to three output bytes */
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, (src_end - src) * 3);
		for (; src < src_end; src++) {
			unsigned char c = *src;

			if (c > 0x7F || c == '=' || mime_char_needs_qencode[c]) {
				out = mb_convert_buf_add3(out, '=', hex_digits[(c >> 4) & 0xF], hex_digits[c & 0xF]);
			} else {
				out = mb_convert_buf_add(out, c);
			}
		}
	} else {
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, ((src_end - src) + 2) / 3 * 4);

		/* Full groups: 3 input bytes -> 4 output characters */
		for (; (src_end - src) >= 3; src += 3) {
			uint32_t bits = (src[0] << 16) | (src[1] << 8) | src[2];

			out = mb_convert_buf_add4(out,
				base64_table[(bits >> 18) & 0x3F],
				base64_table[(bits >> 12) & 0x3F],
				base64_table[(bits >> 6) & 0x3F],
				base64_table[bits & 0x3F]);
		}

		/* Trailing partial group: 1 or 2 leftover bytes, padded with '=' */
		switch (src_end - src) {
			case 1: {
				uint32_t bits = src[0];

				out = mb_convert_buf_add4(out, base64_table[(bits >> 2) & 0x3F], base64_table[(bits & 0x3) << 4], '=', '=');
				break;
			}
			case 2: {
				uint32_t bits = (src[0] << 8) | src[1];

				out = mb_convert_buf_add4(out, base64_table[(bits >> 10) & 0x3F], base64_table[(bits >> 4) & 0x3F], base64_table[(bits & 0xF) << 2], '=');
				break;
			}
			default:
				break;
		}
	}

	mb_convert_buf_reset(tmpbuf, 0);
	MB_CONVERT_BUF_STORE(outbuf, out, limit);
}
6117 
6118 #define MBSTRING_HEADER_ENC_WCHAR_BUFSIZE 90
6119 
mb_mime_header_encode(zend_string * input,const mbfl_encoding * incode,const mbfl_encoding * outcode,bool base64,char * linefeed,size_t linefeed_len,zend_long indent)6120 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent)
6121 {
6122 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
6123 	size_t in_len = ZSTR_LEN(input);
6124 
6125 	ZEND_ASSERT(outcode->mime_name != NULL);
6126 	ZEND_ASSERT(outcode->mime_name[0] != '\0');
6127 
6128 	if (!in_len) {
6129 		return zend_empty_string;
6130 	}
6131 
6132 	if (indent < 0 || indent >= 74) {
6133 		indent = 0;
6134 	}
6135 
6136 	if (linefeed_len > 8) {
6137 		linefeed_len = 8;
6138 	}
6139 	/* Maintain legacy behavior as regards embedded NUL (zero) bytes in linefeed string */
6140 	for (size_t i = 0; i < linefeed_len; i++) {
6141 		if (linefeed[i] == '\0') {
6142 			linefeed_len = i;
6143 			break;
6144 		}
6145 	}
6146 
6147 	unsigned int state = 0;
6148 	/* wchar_buf should be big enough that when it is full, we definitely have enough
6149 	 * wchars to fill an entire line of output */
6150 	uint32_t wchar_buf[MBSTRING_HEADER_ENC_WCHAR_BUFSIZE];
6151 	uint32_t *p, *e;
6152 	/* What part of wchar_buf is filled with still-unprocessed data which should not
6153 	 * be overwritten? */
6154 	unsigned int offset = 0;
6155 	size_t line_start = 0;
6156 
6157 	/* If the entire input string is ASCII with no spaces (except possibly leading
6158 	 * spaces), just pass it through unchanged */
6159 	bool checking_leading_spaces = true;
6160 	while (in_len) {
6161 		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE, &state);
6162 		p = wchar_buf;
6163 		e = wchar_buf + out_len;
6164 
6165 		while (p < e) {
6166 			uint32_t w = *p++;
6167 			if (checking_leading_spaces) {
6168 				if (w == ' ') {
6169 					continue;
6170 				} else {
6171 					checking_leading_spaces = false;
6172 				}
6173 			}
6174 			if (w < 0x21 || w > 0x7E || w == '=' || w == '?' || w == '_') {
6175 				/* We cannot simply pass input string through unchanged; start again */
6176 				in = (unsigned char*)ZSTR_VAL(input);
6177 				in_len = ZSTR_LEN(input);
6178 				goto no_passthrough;
6179 			}
6180 		}
6181 	}
6182 
6183 	return zend_string_copy(input); /* This just increments refcount */
6184 
6185 no_passthrough: ;
6186 
6187 	mb_convert_buf buf;
6188 	mb_convert_buf_init(&buf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6189 
6190 	/* Encode some prefix of the input string as plain ASCII if possible
6191 	 * If we find it necessary to switch to Base64/QPrint encoding, we will
6192 	 * do so all the way to the end of the string */
6193 	while (in_len) {
6194 		/* Decode part of the input string, refill wchar_buf */
6195 		ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
6196 		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
6197 		ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
6198 		p = wchar_buf;
6199 		e = wchar_buf + offset + out_len;
6200 		/* ASCII output is broken into space-delimited 'words'
6201 		 * If we find a non-ASCII character in the middle of a word, we will
6202 		 * transfer-encode the entire word */
6203 		uint32_t *word_start = p;
6204 
6205 		/* Don't consider adding line feed for spaces at the beginning of a word */
6206 		while (p < e && *p == ' ' && (p - word_start) <= 74) {
6207 			p++;
6208 		}
6209 
6210 		while (p < e) {
6211 			uint32_t w = *p++;
6212 
6213 			if (w < 0x20 || w > 0x7E || w == '?' || w == '=' || w == '_' || (w == ' ' && (p - word_start) > 74)) {
6214 				/* Non-ASCII character (or line too long); switch to Base64/QPrint encoding
6215 				 * If we are already too far along on a line to include Base64/QPrint encoded data
6216 				 * on the same line (without overrunning max line length), then add a line feed
6217 				 * right now */
6218 feed_and_mime_encode:
6219 				if (mb_convert_buf_len(&buf) - line_start + indent + strlen(outcode->mime_name) > 55) {
6220 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6221 					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6222 					buf.out = mb_convert_buf_add(buf.out, ' ');
6223 					indent = 0;
6224 					line_start = mb_convert_buf_len(&buf);
6225 				} else if (mb_convert_buf_len(&buf) > 0) {
6226 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 1);
6227 					buf.out = mb_convert_buf_add(buf.out, ' ');
6228 				}
6229 				p = word_start; /* Back up to where MIME encoding of input chars should start */
6230 				goto mime_encoding_needed;
6231 			} else if (w == ' ') {
6232 				/* When we see a space, check whether we should insert a line break */
6233 				if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 75) {
6234 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6235 					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6236 					buf.out = mb_convert_buf_add(buf.out, ' ');
6237 					indent = 0;
6238 					line_start = mb_convert_buf_len(&buf);
6239 				} else if (mb_convert_buf_len(&buf) > 0) {
6240 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6241 					buf.out = mb_convert_buf_add(buf.out, ' ');
6242 				}
6243 				/* Output one (space-delimited) word as plain ASCII */
6244 				while (word_start < p-1) {
6245 					buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6246 				}
6247 				word_start++;
6248 				while (p < e && *p == ' ') {
6249 					p++;
6250 				}
6251 			}
6252 		}
6253 
6254 		if (in_len) {
6255 			/* Copy chars which are part of an incomplete 'word' to the beginning
6256 			 * of wchar_buf and reprocess them on the next iteration.
6257 			 * But first make sure that the incomplete 'word' isn't so big that
6258 			 * there will be no space to add any more decoded wchars in the buffer
6259 			 * (which could lead to an infinite loop) */
6260 			if ((word_start - wchar_buf) < MBSTRING_MIN_WCHAR_BUFSIZE) {
6261 				goto feed_and_mime_encode;
6262 			}
6263 			offset = e - word_start;
6264 			if (offset) {
6265 				memmove(wchar_buf, word_start, offset * sizeof(uint32_t));
6266 			}
6267 		} else {
6268 			/* We have reached the end of the input string while still in 'ASCII mode';
6269 			 * process any trailing ASCII chars which were not followed by a space */
6270 			if (word_start < e && mb_convert_buf_len(&buf) > 0) {
6271 				/* The whole input string was not just one big ASCII 'word' with no spaces
6272 				 * consider adding a line feed if necessary to prevent output lines from
6273 				 * being too long */
6274 				if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 74) {
6275 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6276 					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6277 					buf.out = mb_convert_buf_add(buf.out, ' ');
6278 				} else {
6279 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6280 					buf.out = mb_convert_buf_add(buf.out, ' ');
6281 				}
6282 			}
6283 			while (word_start < e) {
6284 				buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6285 			}
6286 		}
6287 	}
6288 
6289 	/* Ensure output string is marked as valid UTF-8 (ASCII strings are always 'valid UTF-8') */
6290 	return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6291 
6292 mime_encoding_needed: ;
6293 
6294 	/* We will generate the output line by line, first converting wchars to bytes
6295 	 * in the requested output encoding, then transfer-encoding those bytes as
6296 	 * Base64 or QPrint
6297 	 * 'tmpbuf' will receive the bytes which need to be transfer-encoded before
6298 	 * sending them to 'buf' */
6299 	mb_convert_buf tmpbuf;
6300 	mb_convert_buf_init(&tmpbuf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6301 
6302 	/* Do we need to refill wchar_buf to make sure we don't run out of wchars
6303 	 * in the middle of a line? */
6304 	offset = e - p;
6305 	if (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset < MBSTRING_MIN_WCHAR_BUFSIZE) {
6306 		goto start_new_line;
6307 	}
6308 	memmove(wchar_buf, p, offset * sizeof(uint32_t));
6309 
6310 	while(true) {
6311 refill_wchar_buf: ;
6312 		ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
6313 		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
6314 		ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
6315 		p = wchar_buf;
6316 		e = wchar_buf + offset + out_len;
6317 
6318 start_new_line: ;
6319 		MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, strlen(outcode->mime_name) + 5);
6320 		buf.out = mb_convert_buf_add2(buf.out, '=', '?');
6321 		buf.out = mb_convert_buf_appends(buf.out, outcode->mime_name);
6322 		buf.out = mb_convert_buf_add3(buf.out, '?', base64 ? 'B' : 'Q', '?');
6323 
6324 		/* How many wchars should we try converting to Base64/QPrint-encoded bytes?
6325 		 * We do something like a 'binary search' to find the greatest number which
6326 		 * can be included on this line without exceeding max line length */
6327 		unsigned int n = 12;
6328 		size_t space_available = 73 - indent - (mb_convert_buf_len(&buf) - line_start);
6329 
6330 		while (true) {
6331 			ZEND_ASSERT(p < e);
6332 
6333 			/* Remember where we were in process of generating output, so we can back
6334 			 * up if necessary */
6335 			size_t tmppos = mb_convert_buf_len(&tmpbuf);
6336 			unsigned int tmpstate = tmpbuf.state;
6337 
6338 			/* Try encoding 'n' wchars in output text encoding and sending output
6339 			 * bytes to 'tmpbuf'. Hopefully this is not too many to fit on the
6340 			 * current line. */
6341 			n = MIN(n, e - p);
6342 			outcode->from_wchar(p, n, &tmpbuf, false);
6343 
6344 			/* For some output text encodings, there may be a few ending bytes
6345 			 * which need to be emitted to output before we break a line.
6346 			 * Again, remember where we were so we can back up */
6347 			size_t tmppos2 = mb_convert_buf_len(&tmpbuf);
6348 			unsigned int tmpstate2 = tmpbuf.state;
6349 			outcode->from_wchar(NULL, 0, &tmpbuf, true);
6350 
6351 			if (transfer_encoded_size(&tmpbuf, base64) <= space_available || (n == 1 && tmppos == 0)) {
6352 				/* If we convert 'n' more wchars on the current line, it will not
6353 				 * overflow the maximum line length */
6354 				p += n;
6355 
6356 				if (p == e) {
6357 					/* We are done; we shouldn't reach here if there is more remaining
6358 					 * of the input string which needs to be processed */
6359 					ZEND_ASSERT(!in_len);
6360 					transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6361 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 2);
6362 					buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6363 					mb_convert_buf_free(&tmpbuf);
6364 					return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6365 				} else {
6366 					/* It's possible that more chars might fit on the current line,
6367 					 * so back up to where we were before emitting any ending bytes */
6368 					mb_convert_buf_reset(&tmpbuf, tmppos2);
6369 					tmpbuf.state = tmpstate2;
6370 				}
6371 			} else {
6372 				/* Converting 'n' more wchars on this line would be too much.
6373 				 * Back up to where we were before we tried that. */
6374 				mb_convert_buf_reset(&tmpbuf, tmppos);
6375 				tmpbuf.state = tmpstate;
6376 
6377 				if (n == 1) {
6378 					/* We have found the exact number of chars which will fit on the
6379 					 * current line. Finish up and move to a new line. */
6380 					outcode->from_wchar(NULL, 0, &tmpbuf, true);
6381 					transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6382 					tmpbuf.state = 0;
6383 
6384 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 3 + linefeed_len);
6385 					buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6386 
6387 					indent = 0; /* Indent argument must only affect the first line */
6388 
6389 					if (in_len || p < e) {
6390 						/* We still have more input to process */
6391 						buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6392 						buf.out = mb_convert_buf_add(buf.out, ' ');
6393 						line_start = mb_convert_buf_len(&buf);
6394 						offset = e - p;
6395 						if (in_len && (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset >= MBSTRING_MIN_WCHAR_BUFSIZE)) {
6396 							/* Copy any remaining wchars to beginning of buffer and refill
6397 							 * the rest of the buffer */
6398 							memmove(wchar_buf, p, offset * sizeof(uint32_t));
6399 							goto refill_wchar_buf;
6400 						}
6401 						goto start_new_line;
6402 					} else {
6403 						/* We are done! */
6404 						mb_convert_buf_free(&tmpbuf);
6405 						return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6406 					}
6407 				} else {
6408 					/* Try a smaller number of wchars */
6409 					n = MAX(n >> 1, 1);
6410 				}
6411 			}
6412 		}
6413 	}
6414 }
6415 
PHP_FUNCTION(mb_encode_mimeheader)6416 PHP_FUNCTION(mb_encode_mimeheader)
6417 {
6418 	const mbfl_encoding *charset = &mbfl_encoding_pass;
6419 	zend_string *str, *charset_name = NULL, *transenc_name = NULL;
6420 	char *linefeed = "\r\n";
6421 	size_t linefeed_len = 2;
6422 	zend_long indent = 0;
6423 	bool base64 = true;
6424 
6425 	ZEND_PARSE_PARAMETERS_START(1, 5)
6426 		Z_PARAM_STR(str)
6427 		Z_PARAM_OPTIONAL
6428 		Z_PARAM_STR(charset_name)
6429 		Z_PARAM_STR(transenc_name)
6430 		Z_PARAM_STRING(linefeed, linefeed_len)
6431 		Z_PARAM_LONG(indent)
6432 	ZEND_PARSE_PARAMETERS_END();
6433 
6434 	if (charset_name != NULL) {
6435 		charset = php_mb_get_encoding(charset_name, 2);
6436 		if (!charset) {
6437 			RETURN_THROWS();
6438 		} else if (charset->mime_name == NULL || charset->mime_name[0] == '\0' || charset == &mbfl_encoding_qprint) {
6439 			zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
6440 			RETURN_THROWS();
6441 		}
6442 	} else {
6443 		const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
6444 		if (lang != NULL) {
6445 			charset = mbfl_no2encoding(lang->mail_charset);
6446 			const mbfl_encoding *transenc = mbfl_no2encoding(lang->mail_header_encoding);
6447 			char t = transenc->name[0];
6448 			if (t == 'Q' || t == 'q') {
6449 				base64 = false;
6450 			}
6451 		}
6452 	}
6453 
6454 	if (transenc_name != NULL && ZSTR_LEN(transenc_name) > 0) {
6455 		char t = ZSTR_VAL(transenc_name)[0];
6456 		if (t == 'Q' || t == 'q') {
6457 			base64 = false;
6458 		}
6459 	}
6460 
6461 	RETURN_STR(mb_mime_header_encode(str, MBSTRG(current_internal_encoding), charset, base64, linefeed, linefeed_len, indent));
6462 }
6463 
/* Map one Base64 alphabet character to its 6-bit value:
 * 'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62, '/' -> 63.
 * Returns -1 for any other byte (including the padding character '='). */
static int8_t decode_base64(unsigned char c)
{
	switch (c) {
		case '+':
			return 62;
		case '/':
			return 63;
		default:
			break;
	}

	/* An unsigned difference below the range size means 'c' lies inside the range */
	if ((unsigned int)(c - 'A') < 26) {
		return (int8_t)(c - 'A');
	}
	if ((unsigned int)(c - 'a') < 26) {
		return (int8_t)(c - 'a' + 26);
	}
	if ((unsigned int)(c - '0') < 10) {
		return (int8_t)(c - '0' + 52);
	}
	return -1;
}
6479 
/* Value of each byte when read as a hexadecimal digit ('0'-'9', 'A'-'F',
 * 'a'-'f'), or -1 if the byte is not a hexadecimal digit.
 * The table is only ever read, so it is declared const. */
static const int8_t qprint_map[256] = {
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
6498 
6499 /* Decode MIME encoded word as defined in RFC 2047 */
mime_header_decode_encoded_word(unsigned char * p,unsigned char * e,const mbfl_encoding * outcode,mb_convert_buf * outbuf,unsigned int * state)6500 static unsigned char* mime_header_decode_encoded_word(unsigned char *p, unsigned char *e, const mbfl_encoding *outcode, mb_convert_buf *outbuf, unsigned int *state)
6501 {
6502 	if ((e - p) < 6) {
6503 		return NULL;
6504 	}
6505 
6506 	ZEND_ASSERT(p[0] == '=');
6507 	ZEND_ASSERT(p[1] == '?');
6508 	p += 2;
6509 
6510 	unsigned char *charset = p;
6511 	unsigned char *charset_end = memchr(charset, '?', e - charset);
6512 	if (charset_end == NULL) {
6513 		return NULL;
6514 	}
6515 
6516 	unsigned char *encoding = charset_end + 1;
6517 	p = encoding + 1;
6518 	if (p >= e || *p++ != '?') {
6519 		return NULL;
6520 	}
6521 
6522 	char *charset_name = estrndup((const char*)charset, charset_end - charset);
6523 	const mbfl_encoding *incode = mbfl_name2encoding(charset_name);
6524 	efree(charset_name);
6525 	if (incode == NULL) {
6526 		return NULL;
6527 	}
6528 
6529 	unsigned char *end_marker = (unsigned char*)zend_memnstr((const char*)p, "?=", 2, (const char*)e);
6530 	if (end_marker) {
6531 		e = end_marker;
6532 	} else if (p < e && *(e-1) == '?') {
6533 		/* If encoded word is not properly terminated, but last byte is '?',
6534 		 * take that as a terminator (legacy behavior) */
6535 		e--;
6536 	}
6537 
6538 	unsigned char *buf = emalloc(e - p), *bufp = buf;
6539 	if (*encoding == 'Q' || *encoding == 'q') {
6540 		/* Fill `buf` with bytes from decoding QPrint */
6541 		while (p < e) {
6542 			unsigned char c = *p++;
6543 			if (c == '_') {
6544 				*bufp++ = ' ';
6545 				continue;
6546 			} else if (c == '=' && (e - p) >= 2) {
6547 				unsigned char c2 = *p++;
6548 				unsigned char c3 = *p++;
6549 				if (qprint_map[c2] >= 0 && qprint_map[c3] >= 0) {
6550 					*bufp++ = (qprint_map[c2] << 4) | (qprint_map[c3] & 0xF);
6551 					continue;
6552 				} else if (c2 == '\r') {
6553 					if (c3 != '\n') {
6554 						p--;
6555 					}
6556 					continue;
6557 				} else if (c2 == '\n') {
6558 					p--;
6559 					continue;
6560 				}
6561 			}
6562 			*bufp++ = c;
6563 		}
6564 	} else if (*encoding == 'B' || *encoding == 'b') {
6565 		/* Fill `buf` with bytes from decoding Base64 */
6566 		unsigned int bits = 0, cache = 0;
6567 		while (p < e) {
6568 			unsigned char c = *p++;
6569 			if (c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '=') {
6570 				continue;
6571 			}
6572 			int8_t decoded = decode_base64(c);
6573 			if (decoded == -1) {
6574 				*bufp++ = '?';
6575 				continue;
6576 			}
6577 			bits += 6;
6578 			cache = (cache << 6) | (decoded & 0x3F);
6579 			if (bits == 24) {
6580 				*bufp++ = (cache >> 16) & 0xFF;
6581 				*bufp++ = (cache >> 8) & 0xFF;
6582 				*bufp++ = cache & 0xFF;
6583 				bits = cache = 0;
6584 			}
6585 		}
6586 		if (bits == 18) {
6587 			*bufp++ = (cache >> 10) & 0xFF;
6588 			*bufp++ = (cache >> 2) & 0xFF;
6589 		} else if (bits == 12) {
6590 			*bufp++ = (cache >> 4) & 0xFF;
6591 		}
6592 	} else {
6593 		efree(buf);
6594 		return NULL;
6595 	}
6596 
6597 	size_t in_len = bufp - buf;
6598 	uint32_t wchar_buf[128];
6599 
6600 	bufp = buf;
6601 	while (in_len) {
6602 		size_t out_len = incode->to_wchar(&bufp, &in_len, wchar_buf, 128, state);
6603 		ZEND_ASSERT(out_len <= 128);
6604 		outcode->from_wchar(wchar_buf, out_len, outbuf, false);
6605 	}
6606 
6607 	efree(buf);
6608 	return e + 2;
6609 }
6610 
/* Decode a MIME header value into `outcode`.
 *
 * Anything which looks like an encoded word ("=?...") is offered to
 * mime_header_decode_encoded_word(); if that fails, the text is treated as
 * plain ASCII. A run of whitespace which starts with CR or LF is collapsed
 * into a single space, except that no space is produced for such a run at the
 * very end of the input, and the space after an encoded word is only produced
 * once plain text follows. */
static zend_string* mb_mime_header_decode(zend_string *input, const mbfl_encoding *outcode)
{
	unsigned char *cur = (unsigned char*)ZSTR_VAL(input);
	unsigned char *end = cur + ZSTR_LEN(input);
	unsigned int state = 0;
	bool space_pending = false;
	mb_convert_buf buf;

	mb_convert_buf_init(&buf, ZSTR_LEN(input), '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	while (cur < end) {
		unsigned char c = *cur;

		/* Does this look like a MIME encoded word? If so, try to decode it as one */
		if (c == '=' && (end - cur) >= 6 && cur[1] == '?') {
			unsigned char *incode_end = memchr(cur + 2, '?', end - cur - 2);
			if (incode_end != NULL && (end - incode_end) >= 3) {
				unsigned char *after_word = mime_header_decode_encoded_word(cur, end, outcode, &buf, &state);
				if (after_word != NULL) {
					cur = after_word;
					/* Decoding of MIME encoded word was successful;
					 * Try to collapse a run of whitespace */
					if (cur < end && (*cur == '\n' || *cur == '\r')) {
						for (cur++; cur < end && (*cur == '\n' || *cur == '\r' || *cur == '\t' || *cur == ' '); cur++) {
							/* skip */
						}
						/* We will only actually output a space if this is not immediately followed
						 * by another valid encoded word */
						space_pending = true;
					}
					continue;
				}
			}
		}

		if (space_pending) {
			uint32_t space = ' ';
			outcode->from_wchar(&space, 1, &buf, false);
			space_pending = false;
		}

		/* Consume a run of plain ASCII characters; the run always includes the
		 * current byte and stops before the next '=', LF or CR */
		if (c != '\n' && c != '\r') {
			unsigned char *run_end = cur + 1;
			while (run_end < end && *run_end != '=' && *run_end != '\n' && *run_end != '\r') {
				run_end++;
			}

			uint32_t wchar_buf[128];
			size_t run_len = run_end - cur;
			while (run_len) {
				/* to_wchar advances `cur` as it consumes input */
				size_t out_len = mbfl_encoding_ascii.to_wchar(&cur, &run_len, wchar_buf, 128, &state);
				ZEND_ASSERT(out_len <= 128);
				outcode->from_wchar(wchar_buf, out_len, &buf, false);
			}
		}

		/* Collapse a run of whitespace into a single space */
		if (cur < end && (*cur == '\n' || *cur == '\r')) {
			for (cur++; cur < end && (*cur == '\n' || *cur == '\r' || *cur == '\t' || *cur == ' '); cur++) {
				/* skip */
			}
			if (cur < end) {
				/* Emulating legacy behavior of mb_decode_mimeheader here;
				 * a run of whitespace is not converted to a space at the very
				 * end of the input string */
				uint32_t space = ' ';
				outcode->from_wchar(&space, 1, &buf, false);
			}
		}
	}

	/* Let the output encoder emit any ending bytes */
	outcode->from_wchar(NULL, 0, &buf, true);

	return mb_convert_buf_result(&buf, outcode);
}
6684 
PHP_FUNCTION(mb_decode_mimeheader)6685 PHP_FUNCTION(mb_decode_mimeheader)
6686 {
6687 	zend_string *str;
6688 
6689 	ZEND_PARSE_PARAMETERS_START(1, 1)
6690 		Z_PARAM_STR(str)
6691 	ZEND_PARSE_PARAMETERS_END();
6692 
6693 	RETURN_STR(mb_mime_header_decode(str, MBSTRG(current_internal_encoding)));
6694 }
6695