xref: /php-src/ext/mbstring/mbstring.c (revision 5853cdb7)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp>              |
14    |         Rui Hirokawa <hirokawa@php.net>                              |
15    |         Hironori Sato <satoh@jpnnet.com>                             |
16    |         Shigeru Kanemoto <sgk@happysize.co.jp>                       |
17    +----------------------------------------------------------------------+
18 */
19 
20 /* {{{ includes */
21 #include <limits.h>
22 
23 #include "libmbfl/config.h"
24 #include "php.h"
25 #include "php_ini.h"
26 #include "php_variables.h"
27 #include "mbstring.h"
28 #include "ext/standard/php_string.h"
29 #include "ext/standard/php_mail.h"
30 #include "ext/standard/exec.h"
31 #include "main/php_output.h"
32 #include "ext/standard/info.h"
33 #include "ext/pcre/php_pcre.h"
34 
35 #include "libmbfl/mbfl/mbfilter_8bit.h"
36 #include "libmbfl/mbfl/mbfilter_pass.h"
37 #include "libmbfl/mbfl/eaw_table.h"
38 #include "libmbfl/filters/mbfilter_base64.h"
39 #include "libmbfl/filters/mbfilter_cjk.h"
40 #include "libmbfl/filters/mbfilter_qprint.h"
41 #include "libmbfl/filters/mbfilter_htmlent.h"
42 #include "libmbfl/filters/mbfilter_uuencode.h"
43 #include "libmbfl/filters/mbfilter_ucs4.h"
44 #include "libmbfl/filters/mbfilter_utf16.h"
45 #include "libmbfl/filters/mbfilter_singlebyte.h"
46 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
47 #include "libmbfl/filters/unicode_prop.h"
48 
49 #include "php_globals.h"
50 #include "rfc1867.h"
51 #include "php_content_types.h"
52 #include "SAPI.h"
53 #include "php_unicode.h"
54 #include "TSRM.h"
55 
56 #include "mb_gpc.h"
57 
58 #ifdef HAVE_MBREGEX
59 # include "php_mbregex.h"
60 #endif
61 
62 #include "zend_smart_str.h"
63 #include "zend_multibyte.h"
64 #include "mbstring_arginfo.h"
65 
66 #include "rare_cp_bitvec.h"
67 
68 #ifdef __SSE2__
69 #include <emmintrin.h>
70 #endif
71 
72 #ifdef __SSE3__
73 #include <immintrin.h>
74 #include <pmmintrin.h>
75 #endif
76 
77 /* }}} */
78 
79 /* {{{ prototypes */
80 ZEND_DECLARE_MODULE_GLOBALS(mbstring)
81 
82 static PHP_GINIT_FUNCTION(mbstring);
83 static PHP_GSHUTDOWN_FUNCTION(mbstring);
84 
85 static void php_mb_populate_current_detect_order_list(void);
86 
87 static int php_mb_encoding_translation(void);
88 
89 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);
90 
91 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
92 
93 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
94 
95 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
96 
97 static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);
98 
99 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);
100 
101 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);
102 
103 /* See mbfilter_cp5022x.c */
104 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
105 /* }}} */
106 
107 /* {{{ php_mb_default_identify_list */
108 typedef struct _php_mb_nls_ident_list {
109 	enum mbfl_no_language lang;
110 	const enum mbfl_no_encoding *list;
111 	size_t list_size;
112 } php_mb_nls_ident_list;
113 
114 static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
115 	mbfl_no_encoding_ascii,
116 	mbfl_no_encoding_jis,
117 	mbfl_no_encoding_utf8,
118 	mbfl_no_encoding_euc_jp,
119 	mbfl_no_encoding_sjis
120 };
121 
122 static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
123 	mbfl_no_encoding_ascii,
124 	mbfl_no_encoding_utf8,
125 	mbfl_no_encoding_euc_cn,
126 	mbfl_no_encoding_cp936
127 };
128 
129 static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
130 	mbfl_no_encoding_ascii,
131 	mbfl_no_encoding_utf8,
132 	mbfl_no_encoding_euc_tw,
133 	mbfl_no_encoding_big5
134 };
135 
136 static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
137 	mbfl_no_encoding_ascii,
138 	mbfl_no_encoding_utf8,
139 	mbfl_no_encoding_euc_kr,
140 	mbfl_no_encoding_uhc
141 };
142 
143 static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
144 	mbfl_no_encoding_ascii,
145 	mbfl_no_encoding_utf8,
146 	mbfl_no_encoding_koi8r,
147 	mbfl_no_encoding_cp1251,
148 	mbfl_no_encoding_cp866
149 };
150 
151 static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
152 	mbfl_no_encoding_ascii,
153 	mbfl_no_encoding_utf8,
154 	mbfl_no_encoding_armscii8
155 };
156 
157 static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
158 	mbfl_no_encoding_ascii,
159 	mbfl_no_encoding_utf8,
160 	mbfl_no_encoding_cp1254,
161 	mbfl_no_encoding_8859_9
162 };
163 
164 static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
165 	mbfl_no_encoding_ascii,
166 	mbfl_no_encoding_utf8,
167 	mbfl_no_encoding_koi8u
168 };
169 
170 static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
171 	mbfl_no_encoding_ascii,
172 	mbfl_no_encoding_utf8
173 };
174 
175 
176 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
177 	{ mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
178 	{ mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
179 	{ mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
180 	{ mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
181 	{ mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
182 	{ mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
183 	{ mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
184 	{ mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
185 	{ mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
186 };
187 
188 /* }}} */
189 
190 /* {{{ mbstring_deps[] */
/* Module dependency list referenced from mbstring_module_entry: "pcre" is
 * declared as required. */
static const zend_module_dep mbstring_deps[] = {
	ZEND_MOD_REQUIRED("pcre")
	ZEND_MOD_END
};
195 /* }}} */
196 
197 /* {{{ zend_module_entry mbstring_module_entry */
/* Module descriptor for the "mbstring" extension.  It references the
 * dependency list (mbstring_deps), the function table (ext_functions), the
 * MINIT/MSHUTDOWN/RINIT/RSHUTDOWN/MINFO callbacks, the version string, and the
 * module-globals constructor/destructor (GINIT/GSHUTDOWN).  The initializer is
 * positional, so the order of the entries must not be changed. */
zend_module_entry mbstring_module_entry = {
	STANDARD_MODULE_HEADER_EX,
	NULL,
	mbstring_deps,
	"mbstring",
	ext_functions,
	PHP_MINIT(mbstring),
	PHP_MSHUTDOWN(mbstring),
	PHP_RINIT(mbstring),
	PHP_RSHUTDOWN(mbstring),
	PHP_MINFO(mbstring),
	PHP_MBSTRING_VERSION,
	PHP_MODULE_GLOBALS(mbstring),
	PHP_GINIT(mbstring),
	PHP_GSHUTDOWN(mbstring),
	NULL,
	STANDARD_MODULE_PROPERTIES_EX
};
216 /* }}} */
217 
218 /* {{{ static sapi_post_entry php_post_entries[] */
/* POST content-type table using the standard handlers: the default content
 * type is read with sapi_read_standard_form_data and handled by
 * php_std_post_handler; multipart data goes to rfc1867_post_handler.
 * The all-NULL row terminates the table. */
static const sapi_post_entry php_post_entries[] = {
	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data,	php_std_post_handler },
	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
	{ NULL, 0, NULL, NULL }
};
224 /* }}} */
225 
/* Only compiled when COMPILE_DL_MBSTRING is defined; under ZTS the TSRM
 * cache is defined here as well, before the module getter for "mbstring". */
#ifdef COMPILE_DL_MBSTRING
#ifdef ZTS
ZEND_TSRMLS_CACHE_DEFINE()
#endif
ZEND_GET_MODULE(mbstring)
#endif
232 
233 /* {{{ static sapi_post_entry mbstr_post_entries[] */
/* POST content-type table with mbstring's handler: identical to
 * php_post_entries except that the default content type is handled by
 * php_mb_post_handler.  The all-NULL row terminates the table. */
static const sapi_post_entry mbstr_post_entries[] = {
	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
	{ NULL, 0, NULL, NULL }
};
239 /* }}} */
240 
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)241 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
242 	if (encoding_name) {
243 		const mbfl_encoding *encoding;
244 		zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
245 		if (last_encoding_name && (last_encoding_name == encoding_name
246 				|| zend_string_equals_ci(encoding_name, last_encoding_name))) {
247 			return MBSTRG(last_used_encoding);
248 		}
249 
250 		encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
251 		if (!encoding) {
252 			zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
253 			return NULL;
254 		} else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
255 			if (encoding == &mbfl_encoding_base64) {
256 				php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
257 			} else if (encoding == &mbfl_encoding_qprint) {
258 				php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
259 			} else if (encoding == &mbfl_encoding_html_ent) {
260 				php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
261 			} else if (encoding == &mbfl_encoding_uuencode) {
262 				php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
263 			}
264 		}
265 
266 		if (last_encoding_name) {
267 			zend_string_release(last_encoding_name);
268 		}
269 		MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
270 		MBSTRG(last_used_encoding) = encoding;
271 		return encoding;
272 	} else {
273 		return MBSTRG(current_internal_encoding);
274 	}
275 }
276 
php_mb_get_encoding_or_pass(const char * encoding_name,size_t encoding_name_len)277 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name, size_t encoding_name_len) {
278 	if (strncmp(encoding_name, "pass", encoding_name_len) == 0) {
279 		return &mbfl_encoding_pass;
280 	}
281 
282 	return mbfl_name2encoding_ex(encoding_name, encoding_name_len);
283 }
284 
/* Count the ',' bytes in the half-open byte range [p, end). */
static size_t count_commas(const char *p, const char *end) {
	size_t total = 0;
	const char *hit = memchr(p, ',', end - p);

	while (hit != NULL) {
		total++;
		/* resume the search just past the comma that was found */
		hit++;
		hit = memchr(hit, ',', end - hit);
	}
	return total;
}
293 
294 /* {{{ static zend_result php_mb_parse_encoding_list()
295  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
296  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
297  */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num)298 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
299 	const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num)
300 {
301 	if (value == NULL || value_length == 0) {
302 		*return_list = NULL;
303 		*return_size = 0;
304 		return SUCCESS;
305 	} else {
306 		bool included_auto;
307 		size_t n, size;
308 		const char *p1, *endp, *tmpstr;
309 		const mbfl_encoding **entry, **list;
310 
311 		if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
312 			tmpstr = value + 1;
313 			value_length -= 2;
314 		} else {
315 			tmpstr = value;
316 		}
317 
318 		endp = tmpstr + value_length;
319 		size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
320 		list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
321 		entry = list;
322 		n = 0;
323 		included_auto = 0;
324 		p1 = tmpstr;
325 		while (1) {
326 			const char *comma = memchr(p1, ',', endp - p1);
327 			const char *p = comma ? comma : endp;
328 			/* trim spaces */
329 			while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
330 				p1++;
331 			}
332 			p--;
333 			while (p > p1 && (*p == ' ' || *p == '\t')) {
334 				p--;
335 			}
336 			size_t p1_length = p - p1 + 1;
337 			/* convert to the encoding number and check encoding */
338 			if (strncasecmp(p1, "auto", p1_length) == 0) {
339 				if (!included_auto) {
340 					const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
341 					const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
342 					size_t i;
343 					included_auto = 1;
344 					for (i = 0; i < identify_list_size; i++) {
345 						*entry++ = mbfl_no2encoding(*src++);
346 						n++;
347 					}
348 				}
349 			} else {
350 				const mbfl_encoding *encoding = mbfl_name2encoding_ex(p1, p1_length);
351 				if (!encoding) {
352 					/* Called from an INI setting modification */
353 					if (arg_num == 0) {
354 						php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%.*s\"", (int) p1_length, p1);
355 					} else {
356 						zend_argument_value_error(arg_num, "contains invalid encoding \"%.*s\"", (int) p1_length, p1);
357 					}
358 					pefree(ZEND_VOIDP(list), persistent);
359 					return FAILURE;
360 				}
361 
362 				*entry++ = encoding;
363 				n++;
364 			}
365 			if (n >= size || comma == NULL) {
366 				break;
367 			}
368 			p1 = comma + 1;
369 		}
370 		*return_list = list;
371 		*return_size = n;
372 	}
373 
374 	return SUCCESS;
375 }
376 /* }}} */
377 
378 /* {{{
379  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
380  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
381  */
php_mb_parse_encoding_array(HashTable * target_hash,const mbfl_encoding *** return_list,size_t * return_size,uint32_t arg_num)382 static zend_result php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
383 	size_t *return_size, uint32_t arg_num)
384 {
385 	/* Allocate enough space to include the default detect order if "auto" is used. */
386 	size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
387 	const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
388 	const mbfl_encoding **entry = list;
389 	bool included_auto = 0;
390 	size_t n = 0;
391 	zval *hash_entry;
392 	ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
393 		zend_string *encoding_str = zval_try_get_string(hash_entry);
394 		if (UNEXPECTED(!encoding_str)) {
395 			efree(ZEND_VOIDP(list));
396 			return FAILURE;
397 		}
398 
399 		if (zend_string_equals_literal_ci(encoding_str, "auto")) {
400 			if (!included_auto) {
401 				const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
402 				const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
403 				size_t j;
404 
405 				included_auto = 1;
406 				for (j = 0; j < identify_list_size; j++) {
407 					*entry++ = mbfl_no2encoding(*src++);
408 					n++;
409 				}
410 			}
411 		} else {
412 			const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
413 			if (encoding) {
414 				*entry++ = encoding;
415 				n++;
416 			} else {
417 				zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
418 				zend_string_release(encoding_str);
419 				efree(ZEND_VOIDP(list));
420 				return FAILURE;
421 			}
422 		}
423 		zend_string_release(encoding_str);
424 	} ZEND_HASH_FOREACH_END();
425 	*return_list = list;
426 	*return_size = n;
427 	return SUCCESS;
428 }
429 /* }}} */
430 
431 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)432 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
433 {
434 	return (const zend_encoding*)mbfl_name2encoding(encoding_name);
435 }
436 
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)437 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
438 {
439 	return ((const mbfl_encoding *)encoding)->name;
440 }
441 
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)442 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
443 {
444 	const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
445 	return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
446 }
447 
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)448 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
449 {
450 	if (!list) {
451 		list = (const zend_encoding**)MBSTRG(current_detect_order_list);
452 		list_size = MBSTRG(current_detect_order_list_size);
453 	}
454 	if (list_size == 1 && ((mbfl_encoding*)*list) == &mbfl_encoding_pass) {
455 		/* Emulate behavior of previous implementation; it would never return "pass"
456 		 * from an encoding auto-detection operation */
457 		return NULL;
458 	}
459 	return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding**)list, list_size, false, false);
460 }
461 
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)462 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
463 {
464 	unsigned int num_errors = 0;
465 	zend_string *result = mb_fast_convert((unsigned char*)from, from_length, (const mbfl_encoding*)encoding_from, (const mbfl_encoding*)encoding_to, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
466 
467 	*to_length = ZSTR_LEN(result);
468 	*to = emalloc(ZSTR_LEN(result) + 1); /* Include terminating null byte */
469 	memcpy(*to, ZSTR_VAL(result), ZSTR_LEN(result) + 1);
470 	zend_string_free(result);
471 
472 	return from_length;
473 }
474 
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)475 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
476 {
477 	return php_mb_parse_encoding_list(
478 		encoding_list, encoding_list_len,
479 		(const mbfl_encoding ***)return_list, return_size,
480 		persistent, /* arg_num */ 0);
481 }
482 
php_mb_zend_internal_encoding_getter(void)483 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
484 {
485 	return (const zend_encoding *)MBSTRG(internal_encoding);
486 }
487 
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)488 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
489 {
490 	MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
491 	return SUCCESS;
492 }
493 
/* Table of the zend_multibyte hooks implemented above, labelled "mbstring".
 * The initializer is positional, so the order of the entries must match the
 * field order of zend_multibyte_functions. */
static zend_multibyte_functions php_mb_zend_multibyte_functions = {
	"mbstring",
	php_mb_zend_encoding_fetcher,
	php_mb_zend_encoding_name_getter,
	php_mb_zend_encoding_lexer_compatibility_checker,
	php_mb_zend_encoding_detector,
	php_mb_zend_encoding_converter,
	php_mb_zend_encoding_list_parser,
	php_mb_zend_internal_encoding_getter,
	php_mb_zend_internal_encoding_setter
};
505 /* }}} */
506 
507 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)508 static void *_php_mb_compile_regex(const char *pattern)
509 {
510 	pcre2_code *retval;
511 	PCRE2_SIZE err_offset;
512 	int errnum;
513 
514 	if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
515 			PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
516 		PCRE2_UCHAR err_str[128];
517 		pcre2_get_error_message(errnum, err_str, sizeof(err_str));
518 		php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
519 	}
520 	return retval;
521 }
522 /* }}} */
523 
524 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)525 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
526 {
527 	int res;
528 
529 	pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
530 	if (NULL == match_data) {
531 		pcre2_code_free(opaque);
532 		php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
533 		return FAILURE;
534 	}
535 	res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
536 	php_pcre_free_match_data(match_data);
537 
538 	return res;
539 }
540 /* }}} */
541 
542 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)543 static void _php_mb_free_regex(void *opaque)
544 {
545 	pcre2_code_free(opaque);
546 }
547 /* }}} */
548 
549 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)550 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
551 {
552 	size_t i;
553 
554 	*plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
555 	*plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
556 
557 	for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
558 		if (php_mb_default_identify_list[i].lang == lang) {
559 			*plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
560 			*plist_size = php_mb_default_identify_list[i].list_size;
561 			return 1;
562 		}
563 	}
564 	return 0;
565 }
566 /* }}} */
567 
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)568 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
569 {
570 	char *result = emalloc(len + 2);
571 	char *resp = result;
572 	size_t i;
573 
574 	for (i = 0; i < len && start[i] != quote; ++i) {
575 		if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
576 			*resp++ = start[++i];
577 		} else {
578 			size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
579 
580 			while (j-- > 0 && i < len) {
581 				*resp++ = start[i++];
582 			}
583 			--i;
584 		}
585 	}
586 
587 	*resp = '\0';
588 	return result;
589 }
590 
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)591 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
592 {
593 	char *pos = *line, quote;
594 	char *res;
595 
596 	while (*pos && *pos != stop) {
597 		if ((quote = *pos) == '"' || quote == '\'') {
598 			++pos;
599 			while (*pos && *pos != quote) {
600 				if (*pos == '\\' && pos[1] && pos[1] == quote) {
601 					pos += 2;
602 				} else {
603 					++pos;
604 				}
605 			}
606 			if (*pos) {
607 				++pos;
608 			}
609 		} else {
610 			pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
611 
612 		}
613 	}
614 	if (*pos == '\0') {
615 		res = estrdup(*line);
616 		*line += strlen(*line);
617 		return res;
618 	}
619 
620 	res = estrndup(*line, pos - *line);
621 
622 	while (*pos == stop) {
623 		pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
624 	}
625 
626 	*line = pos;
627 	return res;
628 }
629 /* }}} */
630 
/* Extract one configuration word from `str` after skipping leading
 * whitespace.  A word that starts with ' or " runs to the matching quote;
 * otherwise it runs to the next whitespace byte.  Returns a new emalloc'ed
 * string, which is empty when only whitespace remains. */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	for (; *str && isspace(*(unsigned char *)str); ++str) {
		/* skip leading whitespace */
	}

	if (*str == '\0') {
		return estrdup("");
	}

	if (*str != '"' && *str != '\'') {
		size_t word_len = 0;

		while (str[word_len] && !isspace((unsigned char)str[word_len])) {
			++word_len;
		}
		return php_mb_rfc1867_substring_conf(encoding, str, word_len, 0);
	}

	const char quote = *str;
	char *body = str + 1;

	return php_mb_rfc1867_substring_conf(encoding, body, strlen(body), quote);
}
655 /* }}} */
656 
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)657 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
658 {
659 	char *s, *s2;
660 	const size_t filename_len = strlen(filename);
661 
662 	/* The \ check should technically be needed for win32 systems only where
663 	 * it is a valid path separator. However, IE in all it's wisdom always sends
664 	 * the full path of the file on the user's filesystem, which means that unless
665 	 * the user does basename() they get a bogus file name. Until IE's user base drops
666 	 * to nill or problem is fixed this code must remain enabled for all systems. */
667 	s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
668 	s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
669 
670 	if (s && s2) {
671 		if (s > s2) {
672 			return ++s;
673 		} else {
674 			return ++s2;
675 		}
676 	} else if (s) {
677 		return ++s;
678 	} else if (s2) {
679 		return ++s2;
680 	} else {
681 		return filename;
682 	}
683 }
684 /* }}} */
685 
686 /* {{{ php.ini directive handler */
687 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)688 static PHP_INI_MH(OnUpdate_mbstring_language)
689 {
690 	enum mbfl_no_language no_language;
691 
692 	no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
693 	if (no_language == mbfl_no_language_invalid) {
694 		MBSTRG(language) = mbfl_no_language_neutral;
695 		return FAILURE;
696 	}
697 	MBSTRG(language) = no_language;
698 	php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
699 	return SUCCESS;
700 }
701 /* }}} */
702 
703 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)704 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
705 {
706 	const mbfl_encoding **list;
707 	size_t size;
708 
709 	if (!new_value) {
710 		if (MBSTRG(detect_order_list)) {
711 			pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
712 		}
713 		MBSTRG(detect_order_list) = NULL;
714 		MBSTRG(detect_order_list_size) = 0;
715 		return SUCCESS;
716 	}
717 
718 	if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
719 		return FAILURE;
720 	}
721 
722 	if (MBSTRG(detect_order_list)) {
723 		pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
724 	}
725 	MBSTRG(detect_order_list) = list;
726 	MBSTRG(detect_order_list_size) = size;
727 	return SUCCESS;
728 }
729 /* }}} */
730 
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)731 static zend_result _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
732 	const mbfl_encoding **list;
733 	size_t size;
734 	if (new_value_length == 4 && strncmp(new_value, "pass", 4) == 0) {
735 		list = (const mbfl_encoding**)pecalloc(1, sizeof(mbfl_encoding*), 1);
736 		*list = &mbfl_encoding_pass;
737 		size = 1;
738 	} else if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
739 		return FAILURE;
740 	}
741 	if (MBSTRG(http_input_list)) {
742 		pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
743 	}
744 	MBSTRG(http_input_list) = list;
745 	MBSTRG(http_input_list_size) = size;
746 	return SUCCESS;
747 }
748 
749 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)750 static PHP_INI_MH(OnUpdate_mbstring_http_input)
751 {
752 	if (new_value) {
753 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
754 	}
755 
756 	if (!new_value || !ZSTR_LEN(new_value)) {
757 		const char *encoding = php_get_input_encoding();
758 		MBSTRG(http_input_set) = 0;
759 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
760 		return SUCCESS;
761 	}
762 
763 	MBSTRG(http_input_set) = 1;
764 	return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
765 }
766 /* }}} */
767 
_php_mb_ini_mbstring_http_output_set(const char * new_value,size_t length)768 static zend_result _php_mb_ini_mbstring_http_output_set(const char *new_value, size_t length) {
769 	const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value, length);
770 	if (!encoding) {
771 		return FAILURE;
772 	}
773 
774 	MBSTRG(http_output_encoding) = encoding;
775 	MBSTRG(current_http_output_encoding) = encoding;
776 	return SUCCESS;
777 }
778 
779 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)780 static PHP_INI_MH(OnUpdate_mbstring_http_output)
781 {
782 	if (new_value) {
783 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
784 	}
785 
786 	if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
787 		const char *encoding = php_get_output_encoding();
788 		MBSTRG(http_output_set) = 0;
789 		_php_mb_ini_mbstring_http_output_set(encoding, strlen(encoding));
790 		return SUCCESS;
791 	}
792 
793 	MBSTRG(http_output_set) = 1;
794 	return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
795 }
796 /* }}} */
797 
798 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)799 static zend_result _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
800 {
801 	const mbfl_encoding *encoding;
802 
803 	if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
804 		/* falls back to UTF-8 if an unknown encoding name is given */
805 		if (new_value) {
806 			php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
807 		}
808 		encoding = &mbfl_encoding_utf8;
809 	}
810 	MBSTRG(internal_encoding) = encoding;
811 	MBSTRG(current_internal_encoding) = encoding;
812 #ifdef HAVE_MBREGEX
813 	{
814 		const char *enc_name = new_value;
815 		if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
816 			/* falls back to UTF-8 if an unknown encoding name is given */
817 			enc_name = "UTF-8";
818 			php_mb_regex_set_default_mbctype(enc_name);
819 		}
820 		php_mb_regex_set_mbctype(new_value);
821 	}
822 #endif
823 	return SUCCESS;
824 }
825 /* }}} */
826 
827 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)828 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
829 {
830 	if (new_value) {
831 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
832 	}
833 
834 	if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
835 		return FAILURE;
836 	}
837 
838 	if (new_value && ZSTR_LEN(new_value)) {
839 		MBSTRG(internal_encoding_set) = 1;
840 		return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
841 	} else {
842 		const char *encoding = php_get_internal_encoding();
843 		MBSTRG(internal_encoding_set) = 0;
844 		return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
845 	}
846 }
847 /* }}} */
848 
849 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)850 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
851 {
852 	if (new_value != NULL) {
853 		if (zend_string_equals_literal_ci(new_value, "none")) {
854 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
855 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
856 		} else if (zend_string_equals_literal_ci(new_value, "long")) {
857 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
858 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
859 		} else if (zend_string_equals_literal_ci(new_value, "entity")) {
860 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
861 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
862 		} else {
863 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
864 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
865 			if (ZSTR_LEN(new_value) > 0) {
866 				char *endptr = NULL;
867 				int c = strtol(ZSTR_VAL(new_value), &endptr, 0);
868 
869 				if (*endptr == '\0') {
870 					MBSTRG(filter_illegal_substchar) = c;
871 					MBSTRG(current_filter_illegal_substchar) = c;
872 				}
873 			}
874 		}
875 	} else {
876 		MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
877 		MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
878 		MBSTRG(filter_illegal_substchar) = '?';
879 		MBSTRG(current_filter_illegal_substchar) = '?';
880 	}
881 
882 	return SUCCESS;
883 }
884 /* }}} */
885 
886 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)887 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
888 {
889 	if (new_value == NULL) {
890 		return FAILURE;
891 	}
892 
893 	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
894 
895 	if (MBSTRG(encoding_translation)) {
896 		sapi_unregister_post_entry(php_post_entries);
897 		sapi_register_post_entries(mbstr_post_entries);
898 	} else {
899 		sapi_unregister_post_entry(mbstr_post_entries);
900 		sapi_register_post_entries(php_post_entries);
901 	}
902 
903 	return SUCCESS;
904 }
905 /* }}} */
906 
907 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)908 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
909 {
910 	zend_string *tmp;
911 	void *re = NULL;
912 
913 	if (!new_value) {
914 		new_value = entry->orig_value;
915 	}
916 	tmp = php_trim(new_value, NULL, 0, 3);
917 
918 	if (ZSTR_LEN(tmp) > 0) {
919 		if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
920 			zend_string_release_ex(tmp, 0);
921 			return FAILURE;
922 		}
923 	}
924 
925 	if (MBSTRG(http_output_conv_mimetypes)) {
926 		_php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
927 	}
928 
929 	MBSTRG(http_output_conv_mimetypes) = re;
930 
931 	zend_string_release_ex(tmp, 0);
932 	return SUCCESS;
933 }
934 /* }}} */
935 /* }}} */
936 
937 /* {{{ php.ini directive registration */
938 PHP_INI_BEGIN()
939 	PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
940 	PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
941 	PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
942 	PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
943 	STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
944 	PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
945 
946 	STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
947 		PHP_INI_SYSTEM | PHP_INI_PERDIR,
948 		OnUpdate_mbstring_encoding_translation,
949 		encoding_translation, zend_mbstring_globals, mbstring_globals)
950 	PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
951 		"^(text/|application/xhtml\\+xml)",
952 		PHP_INI_ALL,
953 		OnUpdate_mbstring_http_output_conv_mimetypes)
954 
955 	STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
956 		PHP_INI_ALL,
957 		OnUpdateBool,
958 		strict_detection, zend_mbstring_globals, mbstring_globals)
959 #ifdef HAVE_MBREGEX
960 	STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
961 	STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
962 #endif
PHP_INI_END()963 PHP_INI_END()
964 /* }}} */
965 
966 static void mbstring_internal_encoding_changed_hook(void) {
967 	/* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
968 	if (!MBSTRG(internal_encoding_set)) {
969 		const char *encoding = php_get_internal_encoding();
970 		_php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
971 	}
972 
973 	if (!MBSTRG(http_output_set)) {
974 		const char *encoding = php_get_output_encoding();
975 		_php_mb_ini_mbstring_http_output_set(encoding, strlen(encoding));
976 	}
977 
978 	if (!MBSTRG(http_input_set)) {
979 		const char *encoding = php_get_input_encoding();
980 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
981 	}
982 }
983 
984 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)985 static PHP_GINIT_FUNCTION(mbstring)
986 {
987 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
988 ZEND_TSRMLS_CACHE_UPDATE();
989 #endif
990 
991 	mbstring_globals->language = mbfl_no_language_uni;
992 	mbstring_globals->internal_encoding = NULL;
993 	mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
994 	mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
995 	mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
996 	mbstring_globals->http_input_identify = NULL;
997 	mbstring_globals->http_input_identify_get = NULL;
998 	mbstring_globals->http_input_identify_post = NULL;
999 	mbstring_globals->http_input_identify_cookie = NULL;
1000 	mbstring_globals->http_input_identify_string = NULL;
1001 	mbstring_globals->http_input_list = NULL;
1002 	mbstring_globals->http_input_list_size = 0;
1003 	mbstring_globals->detect_order_list = NULL;
1004 	mbstring_globals->detect_order_list_size = 0;
1005 	mbstring_globals->current_detect_order_list = NULL;
1006 	mbstring_globals->current_detect_order_list_size = 0;
1007 	mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1008 	mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1009 	mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1010 	mbstring_globals->filter_illegal_substchar = '?';
1011 	mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1012 	mbstring_globals->current_filter_illegal_substchar = '?';
1013 	mbstring_globals->illegalchars = 0;
1014 	mbstring_globals->encoding_translation = 0;
1015 	mbstring_globals->strict_detection = 0;
1016 	mbstring_globals->outconv_enabled = false;
1017 	mbstring_globals->outconv_state = 0;
1018 	mbstring_globals->http_output_conv_mimetypes = NULL;
1019 #ifdef HAVE_MBREGEX
1020 	mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1021 #endif
1022 	mbstring_globals->last_used_encoding_name = NULL;
1023 	mbstring_globals->last_used_encoding = NULL;
1024 	mbstring_globals->internal_encoding_set = 0;
1025 	mbstring_globals->http_output_set = 0;
1026 	mbstring_globals->http_input_set = 0;
1027 	mbstring_globals->all_encodings_list = NULL;
1028 }
1029 /* }}} */
1030 
1031 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1032 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1033 {
1034 	if (mbstring_globals->http_input_list) {
1035 		free(ZEND_VOIDP(mbstring_globals->http_input_list));
1036 	}
1037 	if (mbstring_globals->detect_order_list) {
1038 		free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1039 	}
1040 	if (mbstring_globals->http_output_conv_mimetypes) {
1041 		_php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1042 	}
1043 #ifdef HAVE_MBREGEX
1044 	php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1045 #endif
1046 }
1047 /* }}} */
1048 
#if defined(ZEND_INTRIN_AVX2_FUNC_PTR)
/* Forward declaration; the function is called during module startup. */
static void init_check_utf8(void);
#endif
1052 
1053 /* {{{ PHP_MINIT_FUNCTION(mbstring) */
PHP_MINIT_FUNCTION(mbstring)1054 PHP_MINIT_FUNCTION(mbstring)
1055 {
1056 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
1057 ZEND_TSRMLS_CACHE_UPDATE();
1058 #endif
1059 
1060 	REGISTER_INI_ENTRIES();
1061 
1062 	/* We assume that we're the only user of the hook. */
1063 	ZEND_ASSERT(php_internal_encoding_changed == NULL);
1064 	php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
1065 	mbstring_internal_encoding_changed_hook();
1066 
1067 	/* This is a global handler. Should not be set in a per-request handler. */
1068 	sapi_register_treat_data(mbstr_treat_data);
1069 
1070 	/* Post handlers are stored in the thread-local context. */
1071 	if (MBSTRG(encoding_translation)) {
1072 		sapi_register_post_entries(mbstr_post_entries);
1073 	}
1074 
1075 #ifdef HAVE_MBREGEX
1076 	PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1077 #endif
1078 
1079 	register_mbstring_symbols(module_number);
1080 
1081 	if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
1082 		return FAILURE;
1083 	}
1084 
1085 	php_rfc1867_set_multibyte_callbacks(
1086 		php_mb_encoding_translation,
1087 		php_mb_gpc_get_detect_order,
1088 		php_mb_gpc_set_input_encoding,
1089 		php_mb_rfc1867_getword,
1090 		php_mb_rfc1867_getword_conf,
1091 		php_mb_rfc1867_basename);
1092 
1093 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1094 	init_check_utf8();
1095 	init_convert_utf16();
1096 #endif
1097 
1098 	return SUCCESS;
1099 }
1100 /* }}} */
1101 
1102 /* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
PHP_MSHUTDOWN_FUNCTION(mbstring)1103 PHP_MSHUTDOWN_FUNCTION(mbstring)
1104 {
1105 	UNREGISTER_INI_ENTRIES();
1106 
1107 	zend_multibyte_restore_functions();
1108 
1109 #ifdef HAVE_MBREGEX
1110 	PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1111 #endif
1112 
1113 	php_internal_encoding_changed = NULL;
1114 
1115 	return SUCCESS;
1116 }
1117 /* }}} */
1118 
1119 /* {{{ PHP_RINIT_FUNCTION(mbstring) */
PHP_RINIT_FUNCTION(mbstring)1120 PHP_RINIT_FUNCTION(mbstring)
1121 {
1122 	MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
1123 	MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
1124 	MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
1125 	MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);
1126 
1127 	MBSTRG(illegalchars) = 0;
1128 
1129 	php_mb_populate_current_detect_order_list();
1130 
1131 #ifdef HAVE_MBREGEX
1132 	PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1133 #endif
1134 	zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));
1135 
1136 	return SUCCESS;
1137 }
1138 /* }}} */
1139 
1140 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1141 PHP_RSHUTDOWN_FUNCTION(mbstring)
1142 {
1143 	if (MBSTRG(current_detect_order_list) != NULL) {
1144 		efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1145 		MBSTRG(current_detect_order_list) = NULL;
1146 		MBSTRG(current_detect_order_list_size) = 0;
1147 	}
1148 
1149 	/* clear http input identification. */
1150 	MBSTRG(http_input_identify) = NULL;
1151 	MBSTRG(http_input_identify_post) = NULL;
1152 	MBSTRG(http_input_identify_get) = NULL;
1153 	MBSTRG(http_input_identify_cookie) = NULL;
1154 	MBSTRG(http_input_identify_string) = NULL;
1155 
1156 	if (MBSTRG(last_used_encoding_name)) {
1157 		zend_string_release(MBSTRG(last_used_encoding_name));
1158 		MBSTRG(last_used_encoding_name) = NULL;
1159 	}
1160 
1161 	MBSTRG(internal_encoding_set) = 0;
1162 	MBSTRG(http_output_set) = 0;
1163 	MBSTRG(http_input_set) = 0;
1164 
1165 	MBSTRG(outconv_enabled) = false;
1166 	MBSTRG(outconv_state) = 0;
1167 
1168 	if (MBSTRG(all_encodings_list)) {
1169 		GC_DELREF(MBSTRG(all_encodings_list));
1170 		zend_array_destroy(MBSTRG(all_encodings_list));
1171 		MBSTRG(all_encodings_list) = NULL;
1172 	}
1173 
1174 #ifdef HAVE_MBREGEX
1175 	PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1176 #endif
1177 
1178 	return SUCCESS;
1179 }
1180 /* }}} */
1181 
1182 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1183 PHP_MINFO_FUNCTION(mbstring)
1184 {
1185 	php_info_print_table_start();
1186 	php_info_print_table_row(2, "Multibyte Support", "enabled");
1187 	php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1188 	php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1189 	{
1190 		char tmp[256];
1191 		snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1192 		php_info_print_table_row(2, "libmbfl version", tmp);
1193 	}
1194 	php_info_print_table_end();
1195 
1196 	php_info_print_table_start();
1197 	php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1198 	php_info_print_table_end();
1199 
1200 #ifdef HAVE_MBREGEX
1201 	PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1202 #endif
1203 
1204 	DISPLAY_INI_ENTRIES();
1205 }
1206 /* }}} */
1207 
1208 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1209 PHP_FUNCTION(mb_language)
1210 {
1211 	zend_string *name = NULL;
1212 
1213 	ZEND_PARSE_PARAMETERS_START(0, 1)
1214 		Z_PARAM_OPTIONAL
1215 		Z_PARAM_STR_OR_NULL(name)
1216 	ZEND_PARSE_PARAMETERS_END();
1217 
1218 	if (name == NULL) {
1219 		RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1220 	} else {
1221 		zend_string *ini_name = ZSTR_INIT_LITERAL("mbstring.language", 0);
1222 		if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1223 			zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1224 			zend_string_release_ex(ini_name, 0);
1225 			RETURN_THROWS();
1226 		}
1227 		// TODO Make return void
1228 		RETVAL_TRUE;
1229 		zend_string_release_ex(ini_name, 0);
1230 	}
1231 }
1232 /* }}} */
1233 
1234 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1235 PHP_FUNCTION(mb_internal_encoding)
1236 {
1237 	char *name = NULL;
1238 	size_t name_len;
1239 	const mbfl_encoding *encoding;
1240 
1241 	ZEND_PARSE_PARAMETERS_START(0, 1)
1242 		Z_PARAM_OPTIONAL
1243 		Z_PARAM_STRING_OR_NULL(name, name_len)
1244 	ZEND_PARSE_PARAMETERS_END();
1245 
1246 	if (name == NULL) {
1247 		ZEND_ASSERT(MBSTRG(current_internal_encoding));
1248 		RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1249 	} else {
1250 		encoding = mbfl_name2encoding(name);
1251 		if (!encoding) {
1252 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1253 			RETURN_THROWS();
1254 		} else {
1255 			MBSTRG(current_internal_encoding) = encoding;
1256 			MBSTRG(internal_encoding_set) = 1;
1257 			/* TODO Return old encoding */
1258 			RETURN_TRUE;
1259 		}
1260 	}
1261 }
1262 /* }}} */
1263 
1264 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1265 PHP_FUNCTION(mb_http_input)
1266 {
1267 	char *type = NULL;
1268 	size_t type_len = 0, n;
1269 	const mbfl_encoding **entry;
1270 	const mbfl_encoding *encoding;
1271 
1272 	ZEND_PARSE_PARAMETERS_START(0, 1)
1273 		Z_PARAM_OPTIONAL
1274 		Z_PARAM_STRING_OR_NULL(type, type_len)
1275 	ZEND_PARSE_PARAMETERS_END();
1276 
1277 	if (type == NULL) {
1278 		encoding = MBSTRG(http_input_identify);
1279 	} else if (type_len != 1) {
1280 		zend_argument_value_error(1,
1281 			"must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1282 		RETURN_THROWS();
1283 	} else {
1284 		switch (*type) {
1285 		case 'G':
1286 		case 'g':
1287 			encoding = MBSTRG(http_input_identify_get);
1288 			break;
1289 		case 'P':
1290 		case 'p':
1291 			encoding = MBSTRG(http_input_identify_post);
1292 			break;
1293 		case 'C':
1294 		case 'c':
1295 			encoding = MBSTRG(http_input_identify_cookie);
1296 			break;
1297 		case 'S':
1298 		case 's':
1299 			encoding = MBSTRG(http_input_identify_string);
1300 			break;
1301 		case 'I':
1302 		case 'i':
1303 			entry = MBSTRG(http_input_list);
1304 			n = MBSTRG(http_input_list_size);
1305 			array_init(return_value);
1306 			for (size_t i = 0; i < n; i++, entry++) {
1307 				add_next_index_string(return_value, (*entry)->name);
1308 			}
1309 			return;
1310 		case 'L':
1311 		case 'l':
1312 			entry = MBSTRG(http_input_list);
1313 			n = MBSTRG(http_input_list_size);
1314 			if (n == 0) {
1315 				RETURN_FALSE;
1316 			}
1317 
1318 			smart_str result = {0};
1319 			for (size_t i = 0; i < n; i++, entry++) {
1320 				if (i > 0) {
1321 					smart_str_appendc(&result, ',');
1322 				}
1323 				smart_str_appends(&result, (*entry)->name);
1324 			}
1325 			RETURN_STR(smart_str_extract(&result));
1326 		default:
1327 			zend_argument_value_error(1,
1328 				"must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1329 			RETURN_THROWS();
1330 		}
1331 	}
1332 
1333 	if (encoding) {
1334 		RETURN_STRING(encoding->name);
1335 	} else {
1336 		RETURN_FALSE;
1337 	}
1338 }
1339 /* }}} */
1340 
1341 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1342 PHP_FUNCTION(mb_http_output)
1343 {
1344 	char *name = NULL;
1345 	size_t name_len;
1346 
1347 	ZEND_PARSE_PARAMETERS_START(0, 1)
1348 		Z_PARAM_OPTIONAL
1349 		Z_PARAM_PATH_OR_NULL(name, name_len) /* For null byte check */
1350 	ZEND_PARSE_PARAMETERS_END();
1351 
1352 	if (name == NULL) {
1353 		ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1354 		RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1355 	} else {
1356 		const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name, name_len);
1357 		if (!encoding) {
1358 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1359 			RETURN_THROWS();
1360 		} else {
1361 			MBSTRG(http_output_set) = 1;
1362 			MBSTRG(current_http_output_encoding) = encoding;
1363 			/* TODO Return previous encoding? */
1364 			RETURN_TRUE;
1365 		}
1366 	}
1367 }
1368 /* }}} */
1369 
1370 /* {{{ Sets the current detect_order or Return the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1371 PHP_FUNCTION(mb_detect_order)
1372 {
1373 	zend_string *order_str = NULL;
1374 	HashTable *order_ht = NULL;
1375 
1376 	ZEND_PARSE_PARAMETERS_START(0, 1)
1377 		Z_PARAM_OPTIONAL
1378 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1379 	ZEND_PARSE_PARAMETERS_END();
1380 
1381 	if (!order_str && !order_ht) {
1382 		size_t n = MBSTRG(current_detect_order_list_size);
1383 		const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1384 		array_init(return_value);
1385 		for (size_t i = 0; i < n; i++) {
1386 			add_next_index_string(return_value, (*entry)->name);
1387 			entry++;
1388 		}
1389 	} else {
1390 		const mbfl_encoding **list;
1391 		size_t size;
1392 		if (order_ht) {
1393 			if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1394 				RETURN_THROWS();
1395 			}
1396 		} else {
1397 			if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1)) {
1398 				RETURN_THROWS();
1399 			}
1400 		}
1401 
1402 		if (size == 0) {
1403 			efree(ZEND_VOIDP(list));
1404 			zend_argument_value_error(1, "must specify at least one encoding");
1405 			RETURN_THROWS();
1406 		}
1407 
1408 		if (MBSTRG(current_detect_order_list)) {
1409 			efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1410 		}
1411 		MBSTRG(current_detect_order_list) = list;
1412 		MBSTRG(current_detect_order_list_size) = size;
1413 		RETURN_TRUE;
1414 	}
1415 }
1416 /* }}} */
1417 
/* Returns true if `cp` may be used as a substitute character: it must lie in
 * [0, 0x10FFFF] and must not be a surrogate (0xD800..0xDFFF).
 *
 * No check is made that the code point is representable in any particular
 * target encoding, because the encoding of the conversion that will eventually
 * use the substitute character is not known here. */
static inline bool php_mb_check_code_point(zend_long cp)
{
	const bool within_unicode = (cp >= 0 && cp < 0x110000);
	/* Surrogates are never valid on their own, and only a single substitute
	 * character is allowed. */
	const bool is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);

	return within_unicode && !is_surrogate;
}
1436 
1437 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1438 PHP_FUNCTION(mb_substitute_character)
1439 {
1440 	zend_string *substitute_character = NULL;
1441 	zend_long substitute_codepoint;
1442 	bool substitute_is_null = 1;
1443 
1444 	ZEND_PARSE_PARAMETERS_START(0, 1)
1445 		Z_PARAM_OPTIONAL
1446 		Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1447 	ZEND_PARSE_PARAMETERS_END();
1448 
1449 	if (substitute_is_null) {
1450 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1451 			RETURN_STRING("none");
1452 		}
1453 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1454 			RETURN_STRING("long");
1455 		}
1456 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1457 			RETURN_STRING("entity");
1458 		}
1459 		RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1460 	}
1461 
1462 	if (substitute_character != NULL) {
1463 		if (zend_string_equals_literal_ci(substitute_character, "none")) {
1464 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1465 			RETURN_TRUE;
1466 		}
1467 		if (zend_string_equals_literal_ci(substitute_character, "long")) {
1468 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1469 			RETURN_TRUE;
1470 		}
1471 		if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1472 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1473 			RETURN_TRUE;
1474 		}
1475 		/* Invalid string value */
1476 		zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1477 		RETURN_THROWS();
1478 	}
1479 	/* Integer codepoint passed */
1480 	if (!php_mb_check_code_point(substitute_codepoint)) {
1481 		zend_argument_value_error(1, "is not a valid codepoint");
1482 		RETURN_THROWS();
1483 	}
1484 
1485 	MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1486 	MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1487 	RETURN_TRUE;
1488 }
1489 /* }}} */
1490 
1491 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1492 PHP_FUNCTION(mb_preferred_mime_name)
1493 {
1494 	char *name = NULL;
1495 	size_t name_len;
1496 
1497 	ZEND_PARSE_PARAMETERS_START(1, 1)
1498 		Z_PARAM_STRING(name, name_len)
1499 	ZEND_PARSE_PARAMETERS_END();
1500 
1501 	const mbfl_encoding *enc = mbfl_name2encoding(name);
1502 	if (enc == NULL) {
1503 		zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1504 		RETURN_THROWS();
1505 	}
1506 
1507 	const char *preferred_name = mbfl_encoding_preferred_mime_name(enc);
1508 	if (preferred_name == NULL || *preferred_name == '\0') {
1509 		php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1510 		RETVAL_FALSE;
1511 	} else {
1512 		RETVAL_STRING((char *)preferred_name);
1513 	}
1514 }
1515 /* }}} */
1516 
1517 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1518 PHP_FUNCTION(mb_parse_str)
1519 {
1520 	zval *track_vars_array = NULL;
1521 	char *encstr;
1522 	size_t encstr_len;
1523 	php_mb_encoding_handler_info_t info;
1524 	const mbfl_encoding *detected;
1525 
1526 	ZEND_PARSE_PARAMETERS_START(2, 2)
1527 		Z_PARAM_STRING(encstr, encstr_len)
1528 		Z_PARAM_ZVAL(track_vars_array)
1529 	ZEND_PARSE_PARAMETERS_END();
1530 
1531 	track_vars_array = zend_try_array_init(track_vars_array);
1532 	if (!track_vars_array) {
1533 		RETURN_THROWS();
1534 	}
1535 
1536 	encstr = estrndup(encstr, encstr_len);
1537 
1538 	info.data_type              = PARSE_STRING;
1539 	info.separator              = PG(arg_separator).input;
1540 	info.report_errors          = true;
1541 	info.to_encoding            = MBSTRG(current_internal_encoding);
1542 	info.from_encodings         = MBSTRG(http_input_list);
1543 	info.num_from_encodings     = MBSTRG(http_input_list_size);
1544 
1545 	detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1546 
1547 	MBSTRG(http_input_identify) = detected;
1548 
1549 	RETVAL_BOOL(detected);
1550 
1551 	if (encstr != NULL) efree(encstr);
1552 }
1553 /* }}} */
1554 
PHP_FUNCTION(mb_output_handler)1555 PHP_FUNCTION(mb_output_handler)
1556 {
1557 	zend_string *str;
1558 	zend_long arg_status;
1559 
1560 	ZEND_PARSE_PARAMETERS_START(2, 2)
1561 		Z_PARAM_STR(str)
1562 		Z_PARAM_LONG(arg_status)
1563 	ZEND_PARSE_PARAMETERS_END();
1564 
1565 	const mbfl_encoding *encoding = MBSTRG(current_http_output_encoding);
1566 	if (encoding == &mbfl_encoding_pass) {
1567 		RETURN_STR_COPY(str);
1568 	}
1569 
1570 	if (arg_status & PHP_OUTPUT_HANDLER_START) {
1571 		bool free_mimetype = false;
1572 		char *mimetype = NULL;
1573 
1574 		/* Analyze mime type */
1575 		if (SG(sapi_headers).mimetype && _php_mb_match_regex(MBSTRG(http_output_conv_mimetypes), SG(sapi_headers).mimetype, strlen(SG(sapi_headers).mimetype))) {
1576 			char *s;
1577 			if ((s = strchr(SG(sapi_headers).mimetype, ';')) == NULL) {
1578 				mimetype = estrdup(SG(sapi_headers).mimetype);
1579 			} else {
1580 				mimetype = estrndup(SG(sapi_headers).mimetype, s - SG(sapi_headers).mimetype);
1581 			}
1582 			free_mimetype = true;
1583 		} else if (SG(sapi_headers).send_default_content_type) {
1584 			mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1585 		}
1586 
1587 		/* If content-type is not yet set, set it and enable conversion */
1588 		if (SG(sapi_headers).send_default_content_type || free_mimetype) {
1589 			const char *charset = encoding->mime_name;
1590 			if (charset) {
1591 				char *p;
1592 				size_t len = spprintf(&p, 0, "Content-Type: %s; charset=%s",  mimetype, charset);
1593 				if (sapi_add_header(p, len, 0) != FAILURE) {
1594 					SG(sapi_headers).send_default_content_type = 0;
1595 				}
1596 			}
1597 
1598 			MBSTRG(outconv_enabled) = true;
1599 		}
1600 
1601 		if (free_mimetype) {
1602 			efree(mimetype);
1603 		}
1604 	}
1605 
1606 	if (!MBSTRG(outconv_enabled)) {
1607 		RETURN_STR_COPY(str);
1608 	}
1609 
1610 	mb_convert_buf buf;
1611 	mb_convert_buf_init(&buf, ZSTR_LEN(str), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1612 
1613 	uint32_t wchar_buf[128];
1614 	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
1615 	size_t in_len = ZSTR_LEN(str);
1616 	bool last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1617 
1618 	while (in_len) {
1619 		size_t out_len = MBSTRG(current_internal_encoding)->to_wchar(&in, &in_len, wchar_buf, 128, &MBSTRG(outconv_state));
1620 		ZEND_ASSERT(out_len <= 128);
1621 		encoding->from_wchar(wchar_buf, out_len, &buf, !in_len && last_feed);
1622 	}
1623 
1624 	MBSTRG(illegalchars) += buf.errors;
1625 	RETVAL_STR(mb_convert_buf_result_raw(&buf));
1626 
1627 	if (last_feed) {
1628 		MBSTRG(outconv_enabled) = false;
1629 		MBSTRG(outconv_state) = 0;
1630 	}
1631 }
1632 
PHP_FUNCTION(mb_str_split)1633 PHP_FUNCTION(mb_str_split)
1634 {
1635 	zend_string *str, *encoding = NULL;
1636 	zend_long split_len = 1;
1637 
1638 	ZEND_PARSE_PARAMETERS_START(1, 3)
1639 		Z_PARAM_STR(str)
1640 		Z_PARAM_OPTIONAL
1641 		Z_PARAM_LONG(split_len)
1642 		Z_PARAM_STR_OR_NULL(encoding)
1643 	ZEND_PARSE_PARAMETERS_END();
1644 
1645 	if (split_len <= 0) {
1646 		zend_argument_value_error(2, "must be greater than 0");
1647 		RETURN_THROWS();
1648 	} else if (split_len > UINT_MAX / 4) {
1649 		zend_argument_value_error(2, "is too large");
1650 		RETURN_THROWS();
1651 	}
1652 
1653 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
1654 	if (!enc) {
1655 		RETURN_THROWS();
1656 	}
1657 
1658 	if (ZSTR_LEN(str) == 0) {
1659 		RETURN_EMPTY_ARRAY();
1660 	}
1661 
1662 	unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
1663 
1664 	unsigned int char_len = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
1665 	if (char_len) {
1666 		unsigned int chunk_len = char_len * split_len;
1667 		unsigned int chunks = ((ZSTR_LEN(str) / chunk_len) + split_len - 1) / split_len; /* round up */
1668 		array_init_size(return_value, chunks);
1669 		while (p < e) {
1670 			add_next_index_stringl(return_value, (const char*)p, MIN(chunk_len, e - p));
1671 			p += chunk_len;
1672 		}
1673 	} else if (enc->mblen_table) {
1674 		unsigned char const *mbtab = enc->mblen_table;
1675 
1676 		/* Assume that we have 1-byte characters */
1677 		array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1678 
1679 		while (p < e) {
1680 			unsigned char *chunk = p; /* start of chunk */
1681 
1682 			for (int char_count = 0; char_count < split_len && p < e; char_count++) {
1683 				p += mbtab[*p];
1684 			}
1685 			if (p > e) {
1686 				p = e; /* ensure chunk is in bounds */
1687 			}
1688 			add_next_index_stringl(return_value, (const char*)chunk, p - chunk);
1689 		}
1690 	} else {
1691 		/* Assume that we have 1-byte characters */
1692 		array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1693 
1694 		uint32_t wchar_buf[128];
1695 		size_t in_len = ZSTR_LEN(str);
1696 		unsigned int state = 0, char_count = 0;
1697 
1698 		mb_convert_buf buf;
1699 
1700 		while (in_len) {
1701 			size_t out_len = enc->to_wchar(&p, &in_len, wchar_buf, 128, &state);
1702 			ZEND_ASSERT(out_len <= 128);
1703 			size_t i = 0;
1704 
1705 			/* Is there some output remaining from the previous iteration? */
1706 			if (char_count) {
1707 				if (out_len >= split_len - char_count) {
1708 					/* Finish off an incomplete chunk from previous iteration
1709 					 * ('buf' was already initialized; we don't need to do it again) */
1710 					enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
1711 					i += split_len - char_count;
1712 					char_count = 0;
1713 					add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1714 				} else {
1715 					/* Output from this iteration is not enough to finish the next chunk;
1716 					 * output what we can, and leave 'buf' to be used again on next iteration */
1717 					enc->from_wchar(wchar_buf, out_len, &buf, !in_len);
1718 					char_count += out_len;
1719 					continue;
1720 				}
1721 			}
1722 
1723 			while (i < out_len) {
1724 				/* Prepare for the next chunk */
1725 				mb_convert_buf_init(&buf, split_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1726 
1727 				if (out_len - i >= split_len) {
1728 					enc->from_wchar(wchar_buf + i, split_len, &buf, true);
1729 					i += split_len;
1730 					add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1731 				} else {
1732 					/* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
1733 					 * leave them for the next iteration */
1734 					enc->from_wchar(wchar_buf + i, out_len - i, &buf, !in_len);
1735 					char_count = out_len - i;
1736 					break;
1737 				}
1738 			}
1739 		}
1740 
1741 		if (char_count) {
1742 			/* The main loop above has finished processing the input string, but
1743 			 * has left a partial chunk in 'buf' */
1744 			add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1745 		}
1746 	}
1747 }
1748 
#ifdef __SSE2__
/* Horizontal sum of the sixteen unsigned bytes held in an XMM register.
 *
 * Credit: StackOverflow user 'Paul R' (https://stackoverflow.com/users/253056/paul-r)
 * Source: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector
 *
 * `v` is read as 16 independent 8-bit unsigned integers; their total is returned
 * in an ordinary scalar register */
static inline uint32_t _mm_sum_epu8(const __m128i v)
{
	/* SSE2 has no dedicated "add up all the bytes" instruction. _mm_sad_epu8 computes the
	 * absolute differences between corresponding bytes of its two operands and adds those
	 * differences up, separately for the low and the high 8 bytes, leaving one 16-bit sum
	 * in the bottom of each 64-bit half of the result.
	 * With an all-zero second operand, each "difference" is simply the byte from `v` */
	const __m128i zero = _mm_setzero_si128();
	const __m128i partial_sums = _mm_sad_epu8(v, zero);

	/* The sum for the low 8 bytes sits at the very bottom of the register, so reading the
	 * low 32 bits retrieves it; the sum for the high 8 bytes is 16-bit lane number 4 */
	const int low_half_sum = _mm_cvtsi128_si32(partial_sums);
	const int high_half_sum = _mm_extract_epi16(partial_sums, 4);

	return low_half_sum + high_half_sum;
}
#endif
1770 
/* Count the codepoints in the `len` bytes starting at `p`.
 * The caller must guarantee that the bytes are valid UTF-8.
 *
 * In UTF-8, every byte starts a new codepoint except for continuation bytes (0x80-0xBF;
 * read as signed 8-bit integers, those are exactly the values below -64).
 * So the codepoint count is the byte length minus the number of continuation bytes */
static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
{
	unsigned char *end = p + len;

#ifdef __SSE2__
	if (len >= sizeof(__m128i)) {
		/* Highest address from which a full 16-byte vector can still be loaded */
		unsigned char *last_block = end - sizeof(__m128i);
		const __m128i threshold = _mm_set1_epi8(-64);

		while (p <= last_block) {
			/* Sixteen 8-bit tallies of continuation bytes, one per lane. A lane can
			 * only count up to 255, so at most 255 vectors are processed before the
			 * tallies are folded into `len` and started afresh */
			__m128i tallies = _mm_setzero_si128();

			for (unsigned int batch = 255; batch > 0 && p <= last_block; batch--) {
				__m128i block = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */
				/* Lanes holding a continuation byte become all-ones, i.e. -1 */
				__m128i is_continuation = _mm_cmplt_epi8(block, threshold);
				/* Subtracting -1 bumps the matching tallies by one */
				tallies = _mm_sub_epi8(tallies, is_continuation);
				p += sizeof(__m128i);
			}

			len -= _mm_sum_epu8(tallies);
		}
	}
#endif

	/* Handle whatever is left over (0-15 bytes when the vector loop ran) one byte at a time */
	for (; p < end; p++) {
		if ((*p & 0xC0) == 0x80) {
			len--;
		}
	}

	return len;
}
1820 
/* Return the number of codepoints which `string` holds when decoded as `encoding` */
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
{
	unsigned char *bytes = (unsigned char*)ZSTR_VAL(string);
	const size_t byte_len = ZSTR_LEN(string);

	/* For fixed-width encodings, the masked flag value is used directly as the
	 * number of bytes per codepoint, so a single division gives the answer */
	const unsigned int bytes_per_char = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (bytes_per_char != 0) {
		return byte_len / bytes_per_char;
	}

	/* Strings already flagged as valid UTF-8 can be counted without decoding */
	if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && ZSTR_IS_VALID_UTF8(string)) {
		return mb_fast_strlen_utf8(bytes, byte_len);
	}

	/* General case: decode the input in batches of up to 128 codepoints and add up
	 * how many codepoints each batch produced; the decoded values are discarded */
	uint32_t scratch[128];
	unsigned int state = 0;
	size_t remaining = byte_len;
	size_t total = 0;

	while (remaining > 0) {
		total += encoding->to_wchar(&bytes, &remaining, scratch, 128, &state);
	}

	return total;
}
1842 
1843 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1844 PHP_FUNCTION(mb_strlen)
1845 {
1846 	zend_string *string, *enc_name = NULL;
1847 
1848 	ZEND_PARSE_PARAMETERS_START(1, 2)
1849 		Z_PARAM_STR(string)
1850 		Z_PARAM_OPTIONAL
1851 		Z_PARAM_STR_OR_NULL(enc_name)
1852 	ZEND_PARSE_PARAMETERS_END();
1853 
1854 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1855 	if (!enc) {
1856 		RETURN_THROWS();
1857 	}
1858 
1859 	RETVAL_LONG(mb_get_strlen(string, enc));
1860 }
1861 /* }}} */
1862 
1863 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)1864 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
1865 {
1866 	return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
1867 }
1868 
offset_to_pointer_utf8(unsigned char * str,unsigned char * end,ssize_t offset)1869 static unsigned char* offset_to_pointer_utf8(unsigned char *str, unsigned char *end, ssize_t offset) {
1870 	if (offset < 0) {
1871 		unsigned char *pos = end;
1872 		while (offset < 0) {
1873 			if (pos <= str) {
1874 				return NULL;
1875 			}
1876 
1877 			unsigned char c = *--pos;
1878 			if (c < 0x80 || (c & 0xC0) != 0x80) {
1879 				offset++;
1880 			}
1881 		}
1882 		return pos;
1883 	} else {
1884 		const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
1885 		unsigned char *pos = str;
1886 		while (offset-- > 0) {
1887 			if (pos >= end) {
1888 				return NULL;
1889 			}
1890 			pos += u8_tbl[*pos];
1891 		}
1892 		return pos;
1893 	}
1894 }
1895 
/* Inverse of offset_to_pointer_utf8 for non-negative offsets: returns the number of
 * codepoints in the UTF-8 bytes [start, pos), as counted by mb_fast_strlen_utf8 */
static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) {
	const size_t byte_distance = pos - start;
	return mb_fast_strlen_utf8(start, byte_distance);
}
1899 
/* Find `needle` in `haystack`, both of which are text in encoding `enc`.
 *
 * `offset` is measured in codepoints; a negative value counts from the end of the
 * haystack. If `reverse` is false, the first match at or after the offset is wanted;
 * if true, the last match is wanted (for a negative offset, the last match which
 * starts at or before the offset position).
 *
 * Returns the codepoint index of the match, MBFL_ERROR_OFFSET if `offset` is not
 * inside the haystack, or MBFL_ERROR_NOT_FOUND if there is no match.
 *
 * Unless `enc` is already a UTF-8 variant, both strings are first converted to
 * UTF-8, and all searching and counting is then done on the UTF-8 copies. */
static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse)
{
	size_t result;
	zend_string *haystack_u8 = NULL, *needle_u8 = NULL;
	unsigned char *offset_pointer;
	const char *found_pos;

	if (!php_mb_is_no_encoding_utf8(enc->no_encoding)) {
		unsigned int num_errors = 0;
		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
	} else {
		haystack_u8 = haystack;
		needle_u8 = needle;
	}

	unsigned char *hay_start = (unsigned char*)ZSTR_VAL(haystack_u8);
	unsigned char *hay_end = hay_start + ZSTR_LEN(haystack_u8);

	offset_pointer = offset_to_pointer_utf8(hay_start, hay_end, offset);
	if (!offset_pointer) {
		result = MBFL_ERROR_OFFSET;
		goto out;
	}

	result = MBFL_ERROR_NOT_FOUND;
	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
		goto out;
	}

	if (!reverse) {
		found_pos = zend_memnstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)hay_end);
	} else if (offset >= 0) {
		found_pos = zend_memnrstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)hay_end);
	} else {
		/* A match may start as late as the offset position, so the searched region
		 * has to reach one needle-length beyond it. The needle must be measured on
		 * its UTF-8 form: the distance is applied to the UTF-8 haystack, and the
		 * counting helper only understands UTF-8 (the raw needle is in `enc`, which
		 * may be something else entirely) */
		unsigned char *needle_start = (unsigned char*)ZSTR_VAL(needle_u8);
		size_t needle_len = pointer_to_offset_utf8(needle_start, needle_start + ZSTR_LEN(needle_u8));
		offset_pointer = offset_to_pointer_utf8(offset_pointer, hay_end, needle_len);
		if (!offset_pointer) {
			/* Fewer than `needle_len` codepoints follow; search the whole haystack */
			offset_pointer = hay_end;
		}

		found_pos = zend_memnrstr((const char*)hay_start, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)offset_pointer);
	}

	if (found_pos) {
		/* Convert the byte position of the match back to a codepoint index */
		result = pointer_to_offset_utf8(hay_start, (unsigned char*)found_pos);
	}

out:
	/* Release the UTF-8 copies, if any were made */
	if (haystack_u8 != haystack) {
		zend_string_free(haystack_u8);
	}
	if (needle_u8 != needle) {
		zend_string_free(needle_u8);
	}
	return result;
}
1954 
handle_strpos_error(size_t error)1955 static void handle_strpos_error(size_t error) {
1956 	switch (error) {
1957 	case MBFL_ERROR_NOT_FOUND:
1958 		break;
1959 	case MBFL_ERROR_ENCODING:
1960 		php_error_docref(NULL, E_WARNING, "Conversion error");
1961 		break;
1962 	case MBFL_ERROR_OFFSET:
1963 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1964 		break;
1965 	default:
1966 		zend_value_error("mb_strpos(): Unknown error");
1967 		break;
1968 	}
1969 }
1970 
PHP_FUNCTION(mb_strpos)1971 PHP_FUNCTION(mb_strpos)
1972 {
1973 	zend_long offset = 0;
1974 	zend_string *needle, *haystack;
1975 	zend_string *enc_name = NULL;
1976 
1977 	ZEND_PARSE_PARAMETERS_START(2, 4)
1978 		Z_PARAM_STR(haystack)
1979 		Z_PARAM_STR(needle)
1980 		Z_PARAM_OPTIONAL
1981 		Z_PARAM_LONG(offset)
1982 		Z_PARAM_STR_OR_NULL(enc_name)
1983 	ZEND_PARSE_PARAMETERS_END();
1984 
1985 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
1986 	if (!enc) {
1987 		RETURN_THROWS();
1988 	}
1989 
1990 	size_t n = mb_find_strpos(haystack, needle, enc, offset, false);
1991 	if (!mbfl_is_error(n)) {
1992 		RETVAL_LONG(n);
1993 	} else {
1994 		handle_strpos_error(n);
1995 		RETVAL_FALSE;
1996 	}
1997 }
1998 
1999 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)2000 PHP_FUNCTION(mb_strrpos)
2001 {
2002 	zend_long offset = 0;
2003 	zend_string *needle, *haystack;
2004 	zend_string *enc_name = NULL;
2005 
2006 	ZEND_PARSE_PARAMETERS_START(2, 4)
2007 		Z_PARAM_STR(haystack)
2008 		Z_PARAM_STR(needle)
2009 		Z_PARAM_OPTIONAL
2010 		Z_PARAM_LONG(offset)
2011 		Z_PARAM_STR_OR_NULL(enc_name)
2012 	ZEND_PARSE_PARAMETERS_END();
2013 
2014 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
2015 	if (!enc) {
2016 		RETURN_THROWS();
2017 	}
2018 
2019 	size_t n = mb_find_strpos(haystack, needle, enc, offset, true);
2020 	if (!mbfl_is_error(n)) {
2021 		RETVAL_LONG(n);
2022 	} else {
2023 		handle_strpos_error(n);
2024 		RETVAL_FALSE;
2025 	}
2026 }
2027 /* }}} */
2028 
2029 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)2030 PHP_FUNCTION(mb_stripos)
2031 {
2032 	zend_long offset = 0;
2033 	zend_string *haystack, *needle;
2034 	zend_string *from_encoding = NULL;
2035 
2036 	ZEND_PARSE_PARAMETERS_START(2, 4)
2037 		Z_PARAM_STR(haystack)
2038 		Z_PARAM_STR(needle)
2039 		Z_PARAM_OPTIONAL
2040 		Z_PARAM_LONG(offset)
2041 		Z_PARAM_STR_OR_NULL(from_encoding)
2042 	ZEND_PARSE_PARAMETERS_END();
2043 
2044 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2045 	if (!enc) {
2046 		RETURN_THROWS();
2047 	}
2048 
2049 	size_t n = php_mb_stripos(false, haystack, needle, offset, enc);
2050 
2051 	if (!mbfl_is_error(n)) {
2052 		RETVAL_LONG(n);
2053 	} else {
2054 		handle_strpos_error(n);
2055 		RETVAL_FALSE;
2056 	}
2057 }
2058 /* }}} */
2059 
2060 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)2061 PHP_FUNCTION(mb_strripos)
2062 {
2063 	zend_long offset = 0;
2064 	zend_string *haystack, *needle;
2065 	zend_string *from_encoding = NULL;
2066 
2067 	ZEND_PARSE_PARAMETERS_START(2, 4)
2068 		Z_PARAM_STR(haystack)
2069 		Z_PARAM_STR(needle)
2070 		Z_PARAM_OPTIONAL
2071 		Z_PARAM_LONG(offset)
2072 		Z_PARAM_STR_OR_NULL(from_encoding)
2073 	ZEND_PARSE_PARAMETERS_END();
2074 
2075 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2076 	if (!enc) {
2077 		RETURN_THROWS();
2078 	}
2079 
2080 	size_t n = php_mb_stripos(true, haystack, needle, offset, enc);
2081 
2082 	if (!mbfl_is_error(n)) {
2083 		RETVAL_LONG(n);
2084 	} else {
2085 		handle_strpos_error(n);
2086 		RETVAL_FALSE;
2087 	}
2088 }
2089 /* }}} */
2090 
mb_get_substr_slow(unsigned char * in,size_t in_len,size_t from,size_t len,const mbfl_encoding * enc)2091 static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t from, size_t len, const mbfl_encoding *enc)
2092 {
2093 	uint32_t wchar_buf[128];
2094 	unsigned int state = 0;
2095 
2096 	mb_convert_buf buf;
2097 	mb_convert_buf_init(&buf, MIN(len, in_len - from), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2098 
2099 	while (in_len && len) {
2100 		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2101 		ZEND_ASSERT(out_len <= 128);
2102 
2103 		if (from >= out_len) {
2104 			from -= out_len;
2105 		} else {
2106 			size_t needed_codepoints = MIN(out_len - from, len);
2107 			enc->from_wchar(wchar_buf + from, needed_codepoints, &buf, !in_len || out_len >= len);
2108 			from = 0;
2109 			len -= needed_codepoints;
2110 		}
2111 	}
2112 
2113 	return mb_convert_buf_result(&buf, enc);
2114 }
2115 
/* Return the substring of `input` (text in encoding `enc`) which starts `from`
 * codepoints in and is at most `len` codepoints long.
 * For fixed-width encodings the bytes are sliced directly; everything else is
 * delegated to mb_get_substr_slow. */
static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
		/* Other than MacJapanese, no supported text encoding decodes to
		 * more than one codepoint per byte
		 * So if the number of codepoints to skip >= number of input bytes,
		 * then definitely the output should be empty */
		return zend_empty_string;
	}

	/* Does each codepoint have a fixed byte width? */
	unsigned int flag = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (flag) {
		/* The value of the flag is 2 if each codepoint takes 2 bytes, or 4 if 4 bytes
		 *
		 * All range checks are done in units of codepoints, *before* multiplying by
		 * the width. `len` may be arbitrarily large, and `len * flag` could wrap
		 * around (even to zero, which would wrongly yield an empty string) */

		/* Number of codepoint slots in the input, counting a trailing partial one;
		 * `from * flag >= in_len` is equivalent to `from >= slots` */
		size_t slots = in_len / flag + (in_len % flag != 0);
		if (from >= slots) {
			return zend_empty_string;
		}

		size_t byte_from = from * flag; /* Cannot overflow; it is less than in_len */
		in += byte_from;
		in_len -= byte_from;

		/* Take everything that is left if the request reaches past the end */
		size_t byte_len = (len > in_len / flag) ? in_len : len * flag;
		return zend_string_init_fast((const char*)in, byte_len);
	}

	return mb_get_substr_slow(in, in_len, from, len, enc);
}
2148 
2149 #define MB_STRSTR 1
2150 #define MB_STRRCHR 2
2151 #define MB_STRISTR 3
2152 #define MB_STRRICHR 4
2153 
php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS,unsigned int variant)2154 static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
2155 {
2156 	bool reverse_mode = false, part = false;
2157 	size_t n;
2158 	zend_string *haystack, *needle;
2159 	zend_string *encoding_name = NULL;
2160 
2161 	ZEND_PARSE_PARAMETERS_START(2, 4)
2162 		Z_PARAM_STR(haystack)
2163 		Z_PARAM_STR(needle)
2164 		Z_PARAM_OPTIONAL
2165 		Z_PARAM_BOOL(part)
2166 		Z_PARAM_STR_OR_NULL(encoding_name)
2167 	ZEND_PARSE_PARAMETERS_END();
2168 
2169 	const mbfl_encoding *enc = php_mb_get_encoding(encoding_name, 4);
2170 	if (!enc) {
2171 		RETURN_THROWS();
2172 	}
2173 
2174 	if (variant == MB_STRRCHR || variant == MB_STRRICHR) {
2175 		reverse_mode = true;
2176 	}
2177 
2178 	if (variant == MB_STRISTR || variant == MB_STRRICHR) {
2179 		n = php_mb_stripos(reverse_mode, haystack, needle, 0, enc);
2180 	} else {
2181 		n = mb_find_strpos(haystack, needle, enc, 0, reverse_mode);
2182 	}
2183 
2184 	if (!mbfl_is_error(n)) {
2185 		if (part) {
2186 			RETVAL_STR(mb_get_substr(haystack, 0, n, enc));
2187 		} else {
2188 			RETVAL_STR(mb_get_substr(haystack, n, MBFL_SUBSTR_UNTIL_END, enc));
2189 		}
2190 	} else {
2191 		// FIXME use handle_strpos_error(n)
2192 		RETVAL_FALSE;
2193 	}
2194 }
2195 
2196 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(mb_strstr)2197 PHP_FUNCTION(mb_strstr)
2198 {
2199 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
2200 }
2201 /* }}} */
2202 
2203 /* {{{ Finds the last occurrence of a character in a string within another */
PHP_FUNCTION(mb_strrchr)2204 PHP_FUNCTION(mb_strrchr)
2205 {
2206 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
2207 }
2208 /* }}} */
2209 
2210 /* {{{ Finds first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stristr)2211 PHP_FUNCTION(mb_stristr)
2212 {
2213 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
2214 }
2215 /* }}} */
2216 
2217 /* {{{ Finds the last occurrence of a character in a string within another, case insensitive */
PHP_FUNCTION(mb_strrichr)2218 PHP_FUNCTION(mb_strrichr)
2219 {
2220 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
2221 }
2222 /* }}} */
2223 
/* The variant selectors are not needed past this point */
#undef MB_STRSTR
#undef MB_STRRCHR
#undef MB_STRISTR
#undef MB_STRRICHR
2228 
PHP_FUNCTION(mb_substr_count)2229 PHP_FUNCTION(mb_substr_count)
2230 {
2231 	zend_string *haystack, *needle, *enc_name = NULL, *haystack_u8 = NULL, *needle_u8 = NULL;
2232 
2233 	ZEND_PARSE_PARAMETERS_START(2, 3)
2234 		Z_PARAM_STR(haystack)
2235 		Z_PARAM_STR(needle)
2236 		Z_PARAM_OPTIONAL
2237 		Z_PARAM_STR_OR_NULL(enc_name)
2238 	ZEND_PARSE_PARAMETERS_END();
2239 
2240 	if (ZSTR_LEN(needle) == 0) {
2241 		zend_argument_must_not_be_empty_error(2);
2242 		RETURN_THROWS();
2243 	}
2244 
2245 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
2246 	if (!enc) {
2247 		RETURN_THROWS();
2248 	}
2249 
2250 	if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
2251 		/* No need to do any conversion if haystack/needle are already known-valid UTF-8
2252 		 * (If they are not valid, then not passing them through conversion filters could affect output) */
2253 		if (ZSTR_IS_VALID_UTF8(haystack)) {
2254 			haystack_u8 = haystack;
2255 		} else {
2256 			unsigned int num_errors = 0;
2257 			haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2258 			if (!num_errors && !ZSTR_IS_INTERNED(haystack)) {
2259 				GC_ADD_FLAGS(haystack, IS_STR_VALID_UTF8);
2260 			}
2261 		}
2262 
2263 		if (ZSTR_IS_VALID_UTF8(needle)) {
2264 			needle_u8 = needle;
2265 		} else {
2266 			unsigned int num_errors = 0;
2267 			needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2268 			if (!num_errors && !ZSTR_IS_INTERNED(needle)) {
2269 				GC_ADD_FLAGS(needle, IS_STR_VALID_UTF8);
2270 			}
2271 		}
2272 	} else {
2273 		unsigned int num_errors = 0;
2274 		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2275 		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2276 		/* A string with >0 bytes may convert to 0 codepoints; for example, the contents
2277 		 * may be only escape sequences */
2278 		if (ZSTR_LEN(needle_u8) == 0) {
2279 			zend_string_free(haystack_u8);
2280 			zend_string_free(needle_u8);
2281 			zend_argument_must_not_be_empty_error(2);
2282 			RETURN_THROWS();
2283 		}
2284 	}
2285 
2286 	size_t result = 0;
2287 
2288 	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
2289 		goto out;
2290 	}
2291 
2292 	const char *p = ZSTR_VAL(haystack_u8), *e = p + ZSTR_LEN(haystack_u8);
2293 	while (true) {
2294 		p = zend_memnstr(p, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), e);
2295 		if (!p) {
2296 			break;
2297 		}
2298 		p += ZSTR_LEN(needle_u8);
2299 		result++;
2300 	}
2301 
2302 out:
2303 	if (haystack_u8 != haystack) {
2304 		zend_string_free(haystack_u8);
2305 	}
2306 	if (needle_u8 != needle) {
2307 		zend_string_free(needle_u8);
2308 	}
2309 
2310 	RETVAL_LONG(result);
2311 }
2312 
2313 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2314 PHP_FUNCTION(mb_substr)
2315 {
2316 	zend_string *str, *encoding = NULL;
2317 	zend_long from, len;
2318 	size_t real_from, real_len;
2319 	bool len_is_null = true;
2320 
2321 	ZEND_PARSE_PARAMETERS_START(2, 4)
2322 		Z_PARAM_STR(str)
2323 		Z_PARAM_LONG(from)
2324 		Z_PARAM_OPTIONAL
2325 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2326 		Z_PARAM_STR_OR_NULL(encoding)
2327 	ZEND_PARSE_PARAMETERS_END();
2328 
2329 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2330 	if (!enc) {
2331 		RETURN_THROWS();
2332 	}
2333 
2334 	size_t mblen = 0;
2335 	if (from < 0 || (!len_is_null && len < 0)) {
2336 		mblen = mb_get_strlen(str, enc);
2337 	}
2338 
2339 	/* if "from" position is negative, count start position from the end
2340 	 * of the string */
2341 	if (from >= 0) {
2342 		real_from = (size_t) from;
2343 	} else if (-from < mblen) {
2344 		real_from = mblen + from;
2345 	} else {
2346 		real_from = 0;
2347 	}
2348 
2349 	/* if "length" position is negative, set it to the length
2350 	 * needed to stop that many chars from the end of the string */
2351 	if (len_is_null) {
2352 		real_len = MBFL_SUBSTR_UNTIL_END;
2353 	} else if (len >= 0) {
2354 		real_len = (size_t) len;
2355 	} else if (real_from < mblen && -len < mblen - real_from) {
2356 		real_len = (mblen - real_from) + len;
2357 	} else {
2358 		real_len = 0;
2359 	}
2360 
2361 	RETVAL_STR(mb_get_substr(str, real_from, real_len, enc));
2362 }
2363 /* }}} */
2364 
2365 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2366 PHP_FUNCTION(mb_strcut)
2367 {
2368 	zend_string *encoding = NULL;
2369 	char *string_val;
2370 	zend_long from, len;
2371 	bool len_is_null = true;
2372 	mbfl_string string, result, *ret;
2373 
2374 	ZEND_PARSE_PARAMETERS_START(2, 4)
2375 		Z_PARAM_STRING(string_val, string.len)
2376 		Z_PARAM_LONG(from)
2377 		Z_PARAM_OPTIONAL
2378 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2379 		Z_PARAM_STR_OR_NULL(encoding)
2380 	ZEND_PARSE_PARAMETERS_END();
2381 
2382 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2383 	if (!enc) {
2384 		RETURN_THROWS();
2385 	}
2386 
2387 	string.val = (unsigned char*)string_val;
2388 	string.encoding = enc;
2389 
2390 	if (len_is_null) {
2391 		len = string.len;
2392 	}
2393 
2394 	/* if "from" position is negative, count start position from the end
2395 	 * of the string */
2396 	if (from < 0) {
2397 		from = string.len + from;
2398 		if (from < 0) {
2399 			from = 0;
2400 		}
2401 	}
2402 
2403 	/* if "length" position is negative, set it to the length
2404 	 * needed to stop that many chars from the end of the string */
2405 	if (len < 0) {
2406 		len = (string.len - from) + len;
2407 		if (len < 0) {
2408 			len = 0;
2409 		}
2410 	}
2411 
2412 	if (from > string.len || len == 0) {
2413 		RETURN_EMPTY_STRING();
2414 	}
2415 
2416 	if (enc->cut) {
2417 		RETURN_STR(enc->cut(string.val, from, len, string.val + string.len));
2418 	}
2419 
2420 	unsigned int char_len = string.encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
2421 	if (char_len) {
2422 		/* Round `from` down to a multiple of `char_len`; works because `char_len` is a power of 2 */
2423 		from &= -char_len;
2424 		if (len > string.len - from) {
2425 			len = string.len - from;
2426 		}
2427 		RETURN_STR(zend_string_init_fast((const char*)(string.val + from), len & -char_len));
2428 	}
2429 
2430 	if (enc->mblen_table) {
2431 		const unsigned char *mbtab = enc->mblen_table;
2432 		const unsigned char *p, *q, *end;
2433 		int m = 0;
2434 		/* Search for start position */
2435 		for (p = (const unsigned char*)string.val, q = p + from; p < q; p += (m = mbtab[*p]));
2436 		if (p > q) {
2437 			p -= m;
2438 		}
2439 		const unsigned char *start = p;
2440 		/* Search for end position */
2441 		if (len >= string.len - (start - (const unsigned char*)string.val)) {
2442 			end = (const unsigned char*)(string.val + string.len);
2443 		} else {
2444 			for (q = p + len; p < q; p += (m = mbtab[*p]));
2445 			if (p > q) {
2446 				p -= m;
2447 			}
2448 			end = p;
2449 		}
2450 		RETURN_STR(zend_string_init_fast((const char*)start, end - start));
2451 	}
2452 
2453 	ret = mbfl_strcut(&string, &result, from, len);
2454 	ZEND_ASSERT(ret != NULL);
2455 	RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2456 	efree(ret->val);
2457 }
2458 /* }}} */
2459 
2460 /* Some East Asian characters, when printed at a terminal (or the like), require double
2461  * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2462 static size_t character_width(uint32_t c)
2463 {
2464 	if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2465 		return 1;
2466 	}
2467 
2468 	/* Do a binary search to see if we fall in any of the fullwidth ranges */
2469 	unsigned int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2470 	while (lo < hi) {
2471 		unsigned int probe = (lo + hi) / 2;
2472 		if (c < mbfl_eaw_table[probe].begin) {
2473 			hi = probe;
2474 		} else if (c > mbfl_eaw_table[probe].end) {
2475 			lo = probe + 1;
2476 		} else {
2477 			return 2;
2478 		}
2479 	}
2480 
2481 	return 1;
2482 }
2483 
/* Return the total display width of `string` (text in encoding `enc`): the sum of
 * character_width() over all the codepoints which it decodes to */
static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
{
	uint32_t wchar_buf[128];
	unsigned char *src = (unsigned char*)ZSTR_VAL(string);
	size_t src_len = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total = 0;

	/* Decode in batches of up to 128 codepoints */
	while (src_len > 0) {
		const size_t n_wchars = enc->to_wchar(&src, &src_len, wchar_buf, 128, &state);
		ZEND_ASSERT(n_wchars <= 128);

		for (size_t i = 0; i < n_wchars; i++) {
			/* NOTE: 'bad input' marker will be counted as 1 unit of width
			 * If text conversion is performed with an ordinary ASCII character as
			 * the 'replacement character', this will give us the correct display width. */
			total += character_width(wchar_buf[i]);
		}
	}

	return total;
}
2506 
2507 /* Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2508 PHP_FUNCTION(mb_strwidth)
2509 {
2510 	zend_string *string, *enc_name = NULL;
2511 
2512 	ZEND_PARSE_PARAMETERS_START(1, 2)
2513 		Z_PARAM_STR(string)
2514 		Z_PARAM_OPTIONAL
2515 		Z_PARAM_STR_OR_NULL(enc_name)
2516 	ZEND_PARSE_PARAMETERS_END();
2517 
2518 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2519 	if (!enc) {
2520 		RETURN_THROWS();
2521 	}
2522 
2523 	RETVAL_LONG(mb_get_strwidth(string, enc));
2524 }
2525 
/* Core of mb_strimwidth.
 *
 * Skips the first `from` codepoints of `input` and checks whether the rest fits in
 * `width` units of display width.
 * - If it fits, that rest is returned (without `marker`)
 * - If it does not fit, the result is as much of the rest as fits in `width` minus the
 *   display width of `marker`, followed by the raw bytes of `marker`
 * - If it does not fit and `width` is no larger than the width of `marker`, the
 *   result is just `marker` */
static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, size_t from, size_t width)
{
	uint32_t wchar_buf[128];
	unsigned char *src = (unsigned char*)ZSTR_VAL(input);
	size_t src_len = ZSTR_LEN(input);
	unsigned int state = 0;
	size_t width_left = width;
	size_t skip = from;
	size_t n_wchars = 0;
	bool in_first_batch = true, saw_bad_input = false;
	mb_convert_buf buf;

	/* First pass: decode the input only to find out whether it fits */
	while (src_len) {
		n_wchars = enc->to_wchar(&src, &src_len, wchar_buf, 128, &state);
		ZEND_ASSERT(n_wchars <= 128);

		if (n_wchars <= skip) {
			/* This whole batch belongs to the part which is skipped */
			skip -= n_wchars;
		} else {
			for (size_t i = skip; i < n_wchars; i++) {
				uint32_t w = wchar_buf[i];
				size_t w_width = character_width(w);

				saw_bad_input |= (w == MBFL_BAD_INPUT);

				if (width_left < w_width) {
					/* This codepoint does not fit any more, so trimming is needed */
					size_t marker_width = mb_get_strwidth(marker, enc);

					/* The trim marker is larger than the desired string width */
					if (width <= marker_width) {
						return zend_string_copy(marker);
					}

					/* We need to truncate string and append trim marker */
					width -= marker_width;
					/* 'width' is now the amount we want to take from 'input' */
					mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

					if (in_first_batch) {
						/* We can use the buffer of wchars which we have right now;
						 * no need to convert again */
						goto dont_restart_conversion;
					} else {
						goto restart_conversion;
					}
				}
				width_left -= w_width;
			}
			skip = 0;
		}
		in_first_batch = false;
	}

	/* The input string fits in the requested width; we don't need to append the trim marker
	 * However, if the string contains erroneous byte sequences, those should be converted
	 * to error markers */
	if (!saw_bad_input) {
		if (from == 0) {
			/* This just increments the string's refcount; it doesn't really 'copy' it */
			return zend_string_copy(input);
		} else {
			return mb_get_substr(input, from, MBFL_SUBSTR_UNTIL_END, enc);
		}
	} else {
		/* We can't use `mb_get_substr`, because it uses the fastest method possible of
		 * picking out a substring, which may not include converting erroneous byte
		 * sequences to error markers */
		return mb_get_substr_slow((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), from, MBFL_SUBSTR_UNTIL_END, enc);
	}

	/* The input string is too wide; we need to build a new string which
	 * includes some portion of the input string, with the trim marker
	 * concatenated onto it */
restart_conversion:
	/* Second pass: start decoding again from the very beginning of the input */
	src = (unsigned char*)ZSTR_VAL(input);
	src_len = ZSTR_LEN(input);
	state = 0;

	while (true) {
		n_wchars = enc->to_wchar(&src, &src_len, wchar_buf, 128, &state);
		ZEND_ASSERT(n_wchars <= 128);

dont_restart_conversion:
		if (n_wchars <= from) {
			from -= n_wchars;
		} else {
			for (size_t i = from; i < n_wchars; i++) {
				size_t w_width = character_width(wchar_buf[i]);
				if (width < w_width) {
					/* Codepoint `i` is the first one which does not fit; emit the
					 * ones before it as the final piece of output */
					enc->from_wchar(wchar_buf + from, i - from, &buf, true);
					goto append_trim_marker;
				}
				width -= w_width;
			}
			/* The whole batch fits; emit it and carry on with the next one.
			 * More input must remain, since the first pass found that the text
			 * is too wide */
			ZEND_ASSERT(src_len > 0);
			enc->from_wchar(wchar_buf + from, n_wchars - from, &buf, false);
			from = 0;
		}
	}

append_trim_marker:
	if (ZSTR_LEN(marker) > 0) {
		MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
		buf.out = zend_mempcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
	}

	/* Even if `enc` is UTF-8, don't mark the output string as valid UTF-8, because
	 * we have no guarantee that the trim marker string is valid UTF-8 */
	return mb_convert_buf_result_raw(&buf);
}
2636 
2637 /* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
PHP_FUNCTION(mb_strimwidth)2638 PHP_FUNCTION(mb_strimwidth)
2639 {
2640 	zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2641 	zend_long from, width;
2642 
2643 	ZEND_PARSE_PARAMETERS_START(3, 5)
2644 		Z_PARAM_STR(str)
2645 		Z_PARAM_LONG(from)
2646 		Z_PARAM_LONG(width)
2647 		Z_PARAM_OPTIONAL
2648 		Z_PARAM_STR(trimmarker)
2649 		Z_PARAM_STR_OR_NULL(encoding)
2650 	ZEND_PARSE_PARAMETERS_END();
2651 
2652 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2653 	if (!enc) {
2654 		RETURN_THROWS();
2655 	}
2656 
2657 	if (from != 0) {
2658 		size_t str_len = mb_get_strlen(str, enc);
2659 		if (from < 0) {
2660 			from += str_len;
2661 		}
2662 		if (from < 0 || from > str_len) {
2663 			zend_argument_value_error(2, "is out of range");
2664 			RETURN_THROWS();
2665 		}
2666 	}
2667 
2668 	if (width < 0) {
2669 		php_error_docref(NULL, E_DEPRECATED,
2670 			"passing a negative integer to argument #3 ($width) is deprecated");
2671 		width += mb_get_strwidth(str, enc);
2672 
2673 		if (from > 0) {
2674 			zend_string *trimmed = mb_get_substr(str, 0, from, enc);
2675 			width -= mb_get_strwidth(trimmed, enc);
2676 			zend_string_free(trimmed);
2677 		}
2678 
2679 		if (width < 0) {
2680 			zend_argument_value_error(3, "is out of range");
2681 			RETURN_THROWS();
2682 		}
2683 	}
2684 
2685 	RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2686 }
2687 
2688 
2689 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2690 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2691 {
2692 	return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2693 			|| (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2694 			|| (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2695 			|| (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2696 }
2697 
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2698 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2699 {
2700 	unsigned int num_errors = 0;
2701 	zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2702 	MBSTRG(illegalchars) += num_errors;
2703 	return result;
2704 }
2705 
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2706 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2707 {
2708 	const mbfl_encoding *from_encoding;
2709 
2710 	/* pre-conversion encoding */
2711 	ZEND_ASSERT(num_from_encodings >= 1);
2712 	if (num_from_encodings == 1) {
2713 		from_encoding = *from_encodings;
2714 	} else {
2715 		/* auto detect */
2716 		from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true);
2717 		if (!from_encoding) {
2718 			php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2719 			return NULL;
2720 		}
2721 	}
2722 
2723 	return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2724 }
2725 
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2726 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2727 {
2728 	HashTable *output, *chash;
2729 	zend_long idx;
2730 	zend_string *key;
2731 	zval *entry, entry_tmp;
2732 
2733 	if (!input) {
2734 		return NULL;
2735 	}
2736 
2737 	if (GC_IS_RECURSIVE(input)) {
2738 		GC_UNPROTECT_RECURSION(input);
2739 		php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2740 		return NULL;
2741 	}
2742 	GC_TRY_PROTECT_RECURSION(input);
2743 	output = zend_new_array(zend_hash_num_elements(input));
2744 	ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2745 		/* convert key */
2746 		if (key) {
2747 			zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2748 			if (!converted_key) {
2749 				continue;
2750 			}
2751 			key = converted_key;
2752 		}
2753 		/* convert value */
2754 		ZEND_ASSERT(entry);
2755 try_again:
2756 		switch(Z_TYPE_P(entry)) {
2757 			case IS_STRING: {
2758 				zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2759 				if (!converted_key) {
2760 					if (key) {
2761 						zend_string_release(key);
2762 					}
2763 					continue;
2764 				}
2765 				ZVAL_STR(&entry_tmp, converted_key);
2766 				break;
2767 			}
2768 			case IS_NULL:
2769 			case IS_TRUE:
2770 			case IS_FALSE:
2771 			case IS_LONG:
2772 			case IS_DOUBLE:
2773 				ZVAL_COPY(&entry_tmp, entry);
2774 				break;
2775 			case IS_ARRAY:
2776 				chash = php_mb_convert_encoding_recursive(
2777 					Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2778 				if (chash) {
2779 					ZVAL_ARR(&entry_tmp, chash);
2780 				} else {
2781 					ZVAL_EMPTY_ARRAY(&entry_tmp);
2782 				}
2783 				break;
2784 			case IS_REFERENCE:
2785 				entry = Z_REFVAL_P(entry);
2786 				goto try_again;
2787 			case IS_OBJECT:
2788 			default:
2789 				if (key) {
2790 					zend_string_release(key);
2791 				}
2792 				php_error_docref(NULL, E_WARNING, "Object is not supported");
2793 				continue;
2794 		}
2795 		if (key) {
2796 			zend_hash_add(output, key, &entry_tmp);
2797 			zend_string_release(key);
2798 		} else {
2799 			zend_hash_index_add(output, idx, &entry_tmp);
2800 		}
2801 	} ZEND_HASH_FOREACH_END();
2802 	GC_TRY_UNPROTECT_RECURSION(input);
2803 
2804 	return output;
2805 }
2806 /* }}} */
2807 
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2808 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2809 {
2810 	/* mbstring supports some 'text encodings' which aren't really text encodings
2811 	 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2812 	 * These should never be returned by `mb_detect_encoding`. */
2813 	unsigned int shift = 0;
2814 	for (unsigned int i = 0; i < *size; i++) {
2815 		const mbfl_encoding *encoding = elist[i];
2816 		if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2817 			shift++; /* Remove this encoding from the list */
2818 		} else if (shift) {
2819 			elist[i - shift] = encoding;
2820 		}
2821 	}
2822 	*size -= shift;
2823 }
2824 
2825 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2826 PHP_FUNCTION(mb_convert_encoding)
2827 {
2828 	zend_string *to_encoding_name;
2829 	zend_string *input_str, *from_encodings_str = NULL;
2830 	HashTable *input_ht, *from_encodings_ht = NULL;
2831 	const mbfl_encoding **from_encodings;
2832 	size_t num_from_encodings;
2833 	bool free_from_encodings = false;
2834 
2835 	ZEND_PARSE_PARAMETERS_START(2, 3)
2836 		Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2837 		Z_PARAM_STR(to_encoding_name)
2838 		Z_PARAM_OPTIONAL
2839 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2840 	ZEND_PARSE_PARAMETERS_END();
2841 
2842 	const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2843 	if (!to_encoding) {
2844 		RETURN_THROWS();
2845 	}
2846 
2847 	if (from_encodings_ht) {
2848 		if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2849 			RETURN_THROWS();
2850 		}
2851 		free_from_encodings = true;
2852 	} else if (from_encodings_str) {
2853 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2854 				&from_encodings, &num_from_encodings,
2855 				/* persistent */ 0, /* arg_num */ 3) == FAILURE) {
2856 			RETURN_THROWS();
2857 		}
2858 		free_from_encodings = true;
2859 	} else {
2860 		from_encodings = &MBSTRG(current_internal_encoding);
2861 		num_from_encodings = 1;
2862 	}
2863 
2864 	if (num_from_encodings > 1) {
2865 		remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2866 	}
2867 
2868 	if (!num_from_encodings) {
2869 		efree(ZEND_VOIDP(from_encodings));
2870 		zend_argument_value_error(3, "must specify at least one encoding");
2871 		RETURN_THROWS();
2872 	}
2873 
2874 	if (input_str) {
2875 		zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2876 		if (ret != NULL) {
2877 			RETVAL_STR(ret);
2878 		} else {
2879 			RETVAL_FALSE;
2880 		}
2881 	} else {
2882 		HashTable *tmp;
2883 		tmp = php_mb_convert_encoding_recursive(
2884 			input_ht, to_encoding, from_encodings, num_from_encodings);
2885 		RETVAL_ARR(tmp);
2886 	}
2887 
2888 	if (free_from_encodings) {
2889 		efree(ZEND_VOIDP(from_encodings));
2890 	}
2891 }
2892 /* }}} */
2893 
mbstring_convert_case(php_case_mode case_mode,const char * str,size_t str_len,const mbfl_encoding * enc)2894 static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
2895 {
2896 	return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2897 }
2898 
PHP_FUNCTION(mb_convert_case)2899 PHP_FUNCTION(mb_convert_case)
2900 {
2901 	zend_string *str, *from_encoding = NULL;
2902 	zend_long case_mode = 0;
2903 
2904 	ZEND_PARSE_PARAMETERS_START(2, 3)
2905 		Z_PARAM_STR(str)
2906 		Z_PARAM_LONG(case_mode)
2907 		Z_PARAM_OPTIONAL
2908 		Z_PARAM_STR_OR_NULL(from_encoding)
2909 	ZEND_PARSE_PARAMETERS_END();
2910 
2911 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2912 	if (!enc) {
2913 		RETURN_THROWS();
2914 	}
2915 
2916 	if (case_mode < 0 || case_mode >= PHP_UNICODE_CASE_MODE_MAX) {
2917 		zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2918 		RETURN_THROWS();
2919 	}
2920 
2921 	RETURN_STR(mbstring_convert_case(case_mode, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2922 }
2923 
PHP_FUNCTION(mb_strtoupper)2924 PHP_FUNCTION(mb_strtoupper)
2925 {
2926 	zend_string *str, *from_encoding = NULL;
2927 
2928 	ZEND_PARSE_PARAMETERS_START(1, 2)
2929 		Z_PARAM_STR(str)
2930 		Z_PARAM_OPTIONAL
2931 		Z_PARAM_STR_OR_NULL(from_encoding)
2932 	ZEND_PARSE_PARAMETERS_END();
2933 
2934 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2935 	if (!enc) {
2936 		RETURN_THROWS();
2937 	}
2938 
2939 	RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_UPPER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2940 }
2941 
PHP_FUNCTION(mb_strtolower)2942 PHP_FUNCTION(mb_strtolower)
2943 {
2944 	zend_string *str, *from_encoding = NULL;
2945 
2946 	ZEND_PARSE_PARAMETERS_START(1, 2)
2947 		Z_PARAM_STR(str)
2948 		Z_PARAM_OPTIONAL
2949 		Z_PARAM_STR_OR_NULL(from_encoding)
2950 	ZEND_PARSE_PARAMETERS_END();
2951 
2952 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2953 	if (!enc) {
2954 		RETURN_THROWS();
2955 	}
2956 
2957 	RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2958 }
2959 
/* Shared implementation of mb_ucfirst() and mb_lcfirst(): apply case conversion `mode`
 * to the first character of the string only, and leave the rest untouched.
 * If converting the first character doesn't change it, the input string itself is
 * returned (with its refcount incremented). */
static void php_mb_ulcfirst(INTERNAL_FUNCTION_PARAMETERS, php_case_mode mode)
{
	zend_string *str;
	zend_string *from_encoding = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 2)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR_OR_NULL(from_encoding)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	/* `original_head` is the first character as-is; `converted_head` is the same
	 * character after case conversion */
	zend_string *original_head = mb_get_substr(str, 0, 1, enc);
	zend_string *converted_head = mbstring_convert_case(mode, ZSTR_VAL(original_head), ZSTR_LEN(original_head), enc);
	zend_string *result;

	if (zend_string_equals(original_head, converted_head)) {
		/* Nothing changed, so there is no need to build a new string */
		result = zend_string_copy(str);
	} else {
		zend_string *tail = mb_get_substr(str, 1, MBFL_SUBSTR_UNTIL_END, enc);
		result = zend_string_concat2(ZSTR_VAL(converted_head), ZSTR_LEN(converted_head), ZSTR_VAL(tail), ZSTR_LEN(tail));
		zend_string_release_ex(tail, false);
	}

	zend_string_release_ex(original_head, false);
	zend_string_release_ex(converted_head, false);

	RETURN_STR(result);
}
2993 
/* Convert the first character of a string using the PHP_UNICODE_CASE_TITLE case mode;
 * argument parsing and the actual work are done by php_mb_ulcfirst() */
PHP_FUNCTION(mb_ucfirst)
{
	php_mb_ulcfirst(INTERNAL_FUNCTION_PARAM_PASSTHRU, PHP_UNICODE_CASE_TITLE);
}
2998 
/* Convert the first character of a string using the PHP_UNICODE_CASE_LOWER case mode;
 * argument parsing and the actual work are done by php_mb_ulcfirst() */
PHP_FUNCTION(mb_lcfirst)
{
	php_mb_ulcfirst(INTERNAL_FUNCTION_PARAM_PASSTHRU, PHP_UNICODE_CASE_LOWER);
}
3003 
/* Which end(s) of a string should be trimmed.
 * The values are tested as bit flags (`mode & MB_LTRIM`, `mode & MB_RTRIM`), so
 * MB_BOTH_TRIM is the combination of the other two. */
typedef enum {
	MB_LTRIM = 1, /* Trim at the start of the string */
	MB_RTRIM = 2, /* Trim at the end of the string */
	MB_BOTH_TRIM = 3 /* MB_LTRIM | MB_RTRIM */
} mb_trim_mode;
3009 
/* Should codepoint `w` be trimmed?
 * If `ht` is non-NULL, the answer is whether `w` exists as an integer key in `ht`;
 * otherwise, it is whether `w` occurs among the `default_chars_length` codepoints
 * in `default_chars`. */
static bool is_trim_wchar(uint32_t w, const HashTable *ht, const uint32_t *default_chars, size_t default_chars_length)
{
	if (ht != NULL) {
		return zend_hash_index_exists(ht, w);
	}

	/* Plain linear scan; the scan direction doesn't matter for a membership test */
	size_t remaining = default_chars_length;
	while (remaining > 0) {
		remaining--;
		if (default_chars[remaining] == w) {
			return true;
		}
	}
	return false;
}
3023 
/* Trim codepoints from the start and/or end of `str`, as selected by `mode`.
 *
 * `str` is decoded to codepoints using `enc`, 128 codepoints at a time. Whether a
 * codepoint should be trimmed is decided by `is_trim_wchar` (using `what_ht` if it is
 * non-NULL, or else `default_chars`).
 *
 * Returns `str` itself (with its refcount incremented) if nothing is to be trimmed,
 * the empty string if everything is to be trimmed, and otherwise the substring which
 * remains after trimming. */
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, const uint32_t *default_chars, size_t default_chars_length, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
	uint32_t wchar_buf[128];
	size_t in_len = ZSTR_LEN(str);
	unsigned int state = 0;
	size_t left = 0;      /* Number of codepoints to trim from the start */
	size_t right = 0;     /* Length of the current run of trimmable codepoints at the end */
	size_t total_len = 0; /* Total number of codepoints decoded */

	while (in_len) {
		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);
		total_len += out_len;

		for (size_t i = 0; i < out_len; i++) {
			uint32_t w = wchar_buf[i];
			if (is_trim_wchar(w, what_ht, default_chars, default_chars_length)) {
				if (mode & MB_LTRIM) {
					left += 1;
				}
				if (mode & MB_RTRIM) {
					right += 1;
				}
			} else {
				/* First codepoint which is kept: stop counting at the start... */
				mode &= ~MB_LTRIM;
				/* ...and any trimmable run seen so far was not at the very end */
				if (mode & MB_RTRIM) {
					right = 0;
				}
			}
		}
	}

	if (left == 0 && right == 0) {
		return zend_string_copy(str);
	}

	/* If every codepoint is trimmable, then `left` and/or `right` equals `total_len`
	 * (both of them do, when trimming at both ends). Handle that case explicitly, so
	 * that the subtraction below can never wrap around. */
	if (left + right >= total_len) {
		return zend_empty_string;
	}

	return mb_get_substr(str, left, total_len - (left + right), enc);
}
3063 
/* Trim `str` using the built-in default set of codepoints (listed below).
 * The set is loaded into a temporary hash table keyed by codepoint, which is handed
 * to `trim_each_wchar` and destroyed again before returning. */
static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
{
	const uint32_t defaults[] = {
		0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
		0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
		0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
		0x85, 0x180E
	};
	size_t num_defaults = sizeof(defaults) / sizeof(uint32_t);

	/* Only the keys matter; every entry stores the same dummy value */
	zval dummy;
	ZVAL_TRUE(&dummy);

	HashTable lookup;
	zend_hash_init(&lookup, num_defaults, NULL, NULL, false);
	for (size_t k = 0; k < num_defaults; k++) {
		zend_hash_index_add_new(&lookup, defaults[k], &dummy);
	}

	zend_string *trimmed = trim_each_wchar(str, &lookup, NULL, 0, mode, enc);

	zend_hash_destroy(&lookup);
	return trimmed;
}
3088 
/* Trim `str`, where the set of codepoints to trim is given by the string `what`
 * (decoded using `enc`).
 *
 * If the first batch decoded from `what` holds 4 codepoints or fewer, that batch is
 * passed to `trim_each_wchar` directly as an array. Otherwise, all the codepoints
 * decoded from `what` are collected in a temporary hash table keyed by codepoint.
 * If `what` is empty, `str` itself is returned (with its refcount incremented). */
static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(what);
	size_t bytes_left = ZSTR_LEN(what);
	unsigned int state = 0;
	uint32_t decoded[128];
	HashTable lookup;
	zval dummy;
	bool have_lookup = false;

	while (bytes_left) {
		size_t num_decoded = enc->to_wchar(&cursor, &bytes_left, decoded, 128, &state);
		ZEND_ASSERT(num_decoded <= 128);

		if (!have_lookup) {
			if (num_decoded <= 4) {
				/* Few enough codepoints that no hash table is built */
				return trim_each_wchar(str, NULL, decoded, num_decoded, mode, enc);
			}
			have_lookup = true;
			ZVAL_TRUE(&dummy);
			zend_hash_init(&lookup, bytes_left, NULL, NULL, false);
		}

		for (size_t k = 0; k < num_decoded; k++) {
			zend_hash_index_add(&lookup, decoded[k], &dummy);
		}
	}

	if (UNEXPECTED(!have_lookup)) {
		/* This is only possible if what is empty */
		return zend_string_copy(str);
	}

	zend_string *trimmed = trim_each_wchar(str, &lookup, NULL, 0, mode, enc);
	zend_hash_destroy(&lookup);

	return trimmed;
}
3128 
/* Shared implementation of mb_trim(), mb_ltrim() and mb_rtrim(); `mode` selects which
 * end(s) of the string get trimmed. If the second argument is omitted or null, the
 * default set of codepoints is trimmed; otherwise, the codepoints in that string. */
static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
{
	zend_string *str;
	zend_string *what = NULL, *encoding = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 3)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR_OR_NULL(what)
		Z_PARAM_STR_OR_NULL(encoding)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	zend_string *trimmed = what
		? mb_trim_what_chars(str, what, mode, enc)
		: mb_trim_default_chars(str, mode, enc);
	RETURN_STR(trimmed);
}
3153 
/* Trim at both ends of a string; see php_do_mb_trim() for argument handling */
PHP_FUNCTION(mb_trim)
{
	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_BOTH_TRIM);
}
3158 
/* Trim at the start of a string only; see php_do_mb_trim() for argument handling */
PHP_FUNCTION(mb_ltrim)
{
	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_LTRIM);
}
3163 
/* Trim at the end of a string only; see php_do_mb_trim() for argument handling */
PHP_FUNCTION(mb_rtrim)
{
	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
}
3168 
duplicate_elist(const mbfl_encoding ** elist,size_t size)3169 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
3170 {
3171 	const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
3172 	memcpy(ZEND_VOIDP(new_elist), elist, size * sizeof(mbfl_encoding*));
3173 	return new_elist;
3174 }
3175 
estimate_demerits(uint32_t w)3176 static unsigned int estimate_demerits(uint32_t w)
3177 {
3178 	/* Receive wchars decoded from input string using candidate encoding.
3179 	 * Give the candidate many 'demerits' for each 'rare' codepoint found,
3180 	 * a smaller number for each ASCII punctuation character, and 1 for
3181 	 * all other codepoints.
3182 	 *
3183 	 * The 'common' codepoints should cover the vast majority of
3184 	 * codepoints we are likely to see in practice, while only covering
3185 	 * a small minority of the entire Unicode encoding space. Why?
3186 	 * Well, if the test string happens to be valid in an incorrect
3187 	 * candidate encoding, the bogus codepoints which it decodes to will
3188 	 * be more or less random. By treating the majority of codepoints as
3189 	 * 'rare', we ensure that in almost all such cases, the bogus
3190 	 * codepoints will include plenty of 'rares', thus giving the
3191 	 * incorrect candidate encoding lots of demerits. See
3192 	 * common_codepoints.txt for the actual list used.
3193 	 *
3194 	 * So, why give extra demerits for ASCII punctuation characters? It's
3195 	 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
3196 	 * which deliberately only use bytes in the ASCII range. When
3197 	 * misinterpreted as ASCII/UTF-8, strings in these encodings will
3198 	 * have an unusually high number of ASCII punctuation characters.
3199 	 * So giving extra demerits for such characters will improve
3200 	 * detection accuracy for UTF-7 and similar encodings.
3201 	 *
3202 	 * Finally, why 1 demerit for all other characters? That penalizes
3203 	 * long strings, meaning we will tend to choose a candidate encoding
3204 	 * in which the test string decodes to a smaller number of
3205 	 * codepoints. That prevents single-byte encodings in which almost
3206 	 * every possible input byte decodes to a 'common' codepoint from
3207 	 * being favored too much. */
3208 	if (w > 0xFFFF) {
3209 		return 40;
3210 	} else if (w >= 0x21 && w <= 0x2F) {
3211 		return 6;
3212 	} else if ((rare_codepoint_bitvec[w >> 5] >> (w & 0x1F)) & 1) {
3213 		return 30;
3214 	} else {
3215 		return 1;
3216 	}
3217 	return 0;
3218 }
3219 
/* Bookkeeping for one candidate encoding while guessing the encoding of input text */
struct candidate {
	const mbfl_encoding *enc; /* The candidate encoding itself */
	const unsigned char *in; /* Next input byte to decode; set by `start_string`, advanced by `to_wchar` */
	size_t in_len; /* Number of input bytes not yet decoded; zero once all input is processed */
	uint64_t demerits; /* Wide bit size to prevent overflow */
	unsigned int state; /* Decoder state which is passed to `enc->to_wchar` on each call */
	float multiplier; /* Weight factor by which `demerits` is multiplied in `count_demerits` */
};
3228 
init_candidate_array(struct candidate * array,size_t length,const mbfl_encoding ** encodings,const unsigned char ** in,size_t * in_len,size_t n,bool strict,bool order_significant)3229 static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant)
3230 {
3231 	size_t j = 0;
3232 
3233 	for (size_t i = 0; i < length; i++) {
3234 		const mbfl_encoding *enc = encodings[i];
3235 
3236 		array[j].enc = enc;
3237 		array[j].state = 0;
3238 		array[j].demerits = 0;
3239 
3240 		/* If any candidate encodings have specialized validation functions, use them
3241 		 * to eliminate as many candidates as possible */
3242 		if (enc->check != NULL) {
3243 			for (size_t k = 0; k < n; k++) {
3244 				if (!enc->check((unsigned char*)in[k], in_len[k])) {
3245 					if (strict) {
3246 						goto skip_to_next;
3247 					} else {
3248 						array[j].demerits += 500;
3249 					}
3250 				}
3251 			}
3252 		}
3253 
3254 		/* This multiplier can optionally be used to make candidate encodings listed
3255 		 * first more likely to be chosen. It is a weight factor which multiplies
3256 		 * the number of demerits counted for each candidate. */
3257 		array[j].multiplier = order_significant ? 1.0 + ((0.3 * i) / length) : 1.0;
3258 		j++;
3259 skip_to_next: ;
3260 	}
3261 
3262 	return j;
3263 }
3264 
start_string(struct candidate * array,size_t length,const unsigned char * in,size_t in_len)3265 static void start_string(struct candidate *array, size_t length, const unsigned char *in, size_t in_len)
3266 {
3267 	for (size_t i = 0; i < length; i++) {
3268 		const mbfl_encoding *enc = array[i].enc;
3269 
3270 		array[i].in = in;
3271 		array[i].in_len = in_len;
3272 
3273 		/* Skip byte order mark for UTF-8, UTF-16BE, or UTF-16LE */
3274 		if (enc == &mbfl_encoding_utf8) {
3275 			if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
3276 				array[i].in_len -= 3;
3277 				array[i].in += 3;
3278 			}
3279 		} else if (enc == &mbfl_encoding_utf16be) {
3280 			if (in_len >= 2 && in[0] == 0xFE && in[1] == 0xFF) {
3281 				array[i].in_len -= 2;
3282 				array[i].in += 2;
3283 			}
3284 		} else if (enc == &mbfl_encoding_utf16le) {
3285 			if (in_len >= 2 && in[0] == 0xFF && in[1] == 0xFE) {
3286 				array[i].in_len -= 2;
3287 				array[i].in += 2;
3288 			}
3289 		}
3290 	}
3291 }
3292 
/* Decode the current input string of each of the `length` candidates in `array`,
 * 128 codepoints at a time, and add up demerits for the decoded codepoints.
 *
 * When `strict` is set, a candidate whose decoded output contains MBFL_BAD_INPUT is
 * removed from `array`; otherwise, each MBFL_BAD_INPUT costs it 1000 demerits.
 * Finally, the demerits of each remaining candidate are scaled by its multiplier.
 *
 * Returns the number of candidates which remain in `array`. */
static size_t count_demerits(struct candidate *array, size_t length, bool strict)
{
	uint32_t wchar_buf[128];
	unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */

	for (size_t i = 0; i < length; i++) {
		if (array[i].in_len == 0) {
			finished++;
		}
	}

	/* In non-strict mode, no decoding is done at all if only one candidate is left */
	while ((strict || length > 1) && finished < length) {
		/* Iterate in reverse order to avoid moving candidates that can be eliminated. */
		for (size_t i = length - 1; i != (size_t)-1; i--) {
			/* Do we still have more input to process for this candidate encoding? */
			if (array[i].in_len) {
				const mbfl_encoding *enc = array[i].enc;
				size_t out_len = enc->to_wchar((unsigned char**)&array[i].in, &array[i].in_len, wchar_buf, 128, &array[i].state);
				ZEND_ASSERT(out_len <= 128);
				/* Check this batch of decoded codepoints; are there any error markers?
				 * Also sum up the number of demerits */
				while (out_len) {
					uint32_t w = wchar_buf[--out_len];
					if (w == MBFL_BAD_INPUT) {
						if (strict) {
							/* This candidate encoding is not valid, eliminate it from consideration */
							length--;
							if (i < length) {
								/* The eliminated candidate was not the last one in the list;
								 * shift the candidates which follow it down by one position */
								memmove(&array[i], &array[i+1], (length - i) * sizeof(struct candidate));
							}
							goto try_next_encoding;
						} else {
							array[i].demerits += 1000;
						}
					} else {
						array[i].demerits += estimate_demerits(w);
					}
				}
				if (array[i].in_len == 0) {
					finished++;
				}
			}
try_next_encoding:;
		}
	}

	/* Apply each candidate's weight factor to its demerit count */
	for (size_t i = 0; i < length; i++) {
		array[i].demerits *= array[i].multiplier;
	}

	return length;
}
3346 
mb_guess_encoding_for_strings(const unsigned char ** strings,size_t * str_lengths,size_t n,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3347 MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3348 {
3349 	if (elist_size == 0) {
3350 		return NULL;
3351 	}
3352 	if (elist_size == 1) {
3353 		if (strict) {
3354 			while (n--) {
3355 				if (!php_mb_check_encoding((const char*)strings[n], str_lengths[n], *elist)) {
3356 					return NULL;
3357 				}
3358 			}
3359 		}
3360 		return *elist;
3361 	}
3362 	if (n == 1 && *str_lengths == 0) {
3363 		return *elist;
3364 	}
3365 
3366 	/* Allocate on stack; when we return, this array is automatically freed */
3367 	struct candidate *array = alloca(elist_size * sizeof(struct candidate));
3368 	elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant);
3369 
3370 	while (n--) {
3371 		start_string(array, elist_size, strings[n], str_lengths[n]);
3372 		elist_size = count_demerits(array, elist_size, strict);
3373 		if (elist_size == 0) {
3374 			/* All candidates were eliminated */
3375 			return NULL;
3376 		}
3377 	}
3378 
3379 	/* See which remaining candidate encoding has the least demerits */
3380 	unsigned int best = 0;
3381 	for (unsigned int i = 1; i < elist_size; i++) {
3382 		if (array[i].demerits < array[best].demerits) {
3383 			best = i;
3384 		}
3385 	}
3386 	return array[best].enc;
3387 }
3388 
/* Guess the encoding of a single string; this is a convenience wrapper which calls
 * `mb_guess_encoding_for_strings` with a list of one string.
 *
 * When doing 'strict' detection, any string which is invalid in the candidate encoding
 * is rejected. With non-strict detection, we just continue, but apply demerits for
 * each invalid byte sequence */
static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
{
	return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant);
}
3396 
3397 /* {{{ Encodings of the given string is returned (as a string) */
PHP_FUNCTION(mb_detect_encoding)3398 PHP_FUNCTION(mb_detect_encoding)
3399 {
3400 	zend_string *str, *encoding_str = NULL;
3401 	HashTable *encoding_ht = NULL;
3402 	bool strict = false;
3403 	const mbfl_encoding *ret, **elist;
3404 	size_t size;
3405 
3406 	ZEND_PARSE_PARAMETERS_START(1, 3)
3407 		Z_PARAM_STR(str)
3408 		Z_PARAM_OPTIONAL
3409 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
3410 		Z_PARAM_BOOL(strict)
3411 	ZEND_PARSE_PARAMETERS_END();
3412 
3413 	/* Should we pay attention to the order of the provided candidate encodings and prefer
3414 	 * the earlier ones (if more than one candidate encoding matches)?
3415 	 * If the entire list of supported encodings returned by `mb_list_encodings` is passed
3416 	 * in, then don't treat the order as significant */
3417 	bool order_significant = true;
3418 
3419 	/* make encoding list */
3420 	if (encoding_ht) {
3421 		if (encoding_ht == MBSTRG(all_encodings_list)) {
3422 			order_significant = false;
3423 		}
3424 		if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
3425 			RETURN_THROWS();
3426 		}
3427 	} else if (encoding_str) {
3428 		if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2)) {
3429 			RETURN_THROWS();
3430 		}
3431 	} else {
3432 		elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
3433 		size = MBSTRG(current_detect_order_list_size);
3434 	}
3435 
3436 	if (size == 0) {
3437 		efree(ZEND_VOIDP(elist));
3438 		zend_argument_value_error(2, "must specify at least one encoding");
3439 		RETURN_THROWS();
3440 	}
3441 
3442 	remove_non_encodings_from_elist(elist, &size);
3443 	if (size == 0) {
3444 		efree(ZEND_VOIDP(elist));
3445 		RETURN_FALSE;
3446 	}
3447 
3448 	if (ZEND_NUM_ARGS() < 3) {
3449 		strict = MBSTRG(strict_detection);
3450 	}
3451 
3452 	if (size == 1 && *elist == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
3453 		ret = &mbfl_encoding_utf8;
3454 	} else {
3455 		ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant);
3456 	}
3457 
3458 	efree(ZEND_VOIDP(elist));
3459 
3460 	if (ret == NULL) {
3461 		RETURN_FALSE;
3462 	}
3463 
3464 	RETVAL_STRING((char *)ret->name);
3465 }
3466 /* }}} */
3467 
3468 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)3469 PHP_FUNCTION(mb_list_encodings)
3470 {
3471 	ZEND_PARSE_PARAMETERS_NONE();
3472 
3473 	if (MBSTRG(all_encodings_list) == NULL) {
3474 		/* Initialize shared array of supported encoding names
3475 		 * This is done so that we can check if `mb_list_encodings()` is being
3476 		 * passed to other mbstring functions using a cheap pointer equality check */
3477 		HashTable *array = emalloc(sizeof(HashTable));
3478 		zend_hash_init(array, 80, NULL, zval_ptr_dtor_str, false);
3479 		for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
3480 			zval tmp;
3481 			ZVAL_STRING(&tmp, (*encodings)->name);
3482 			zend_hash_next_index_insert(array, &tmp);
3483 		}
3484 		MBSTRG(all_encodings_list) = array;
3485 	}
3486 
3487 	GC_ADDREF(MBSTRG(all_encodings_list));
3488 	RETURN_ARR(MBSTRG(all_encodings_list));
3489 }
3490 /* }}} */
3491 
3492 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)3493 PHP_FUNCTION(mb_encoding_aliases)
3494 {
3495 	const mbfl_encoding *encoding;
3496 	zend_string *encoding_name = NULL;
3497 
3498 	ZEND_PARSE_PARAMETERS_START(1, 1)
3499 		Z_PARAM_STR(encoding_name)
3500 	ZEND_PARSE_PARAMETERS_END();
3501 
3502 	encoding = php_mb_get_encoding(encoding_name, 1);
3503 	if (!encoding) {
3504 		RETURN_THROWS();
3505 	}
3506 
3507 	array_init(return_value);
3508 	if (encoding->aliases != NULL) {
3509 		for (const char **alias = encoding->aliases; *alias; ++alias) {
3510 			add_next_index_string(return_value, (char *)*alias);
3511 		}
3512 	}
3513 }
3514 /* }}} */
3515 
/* Apply the kana/width conversions selected by the `mode` bit vector to `input`
 * and return the converted string.
 *
 * The input is decoded with encoding->to_wchar in chunks of at most 64 wchars.
 * Each wchar is passed, together with the wchar following it, to
 * mb_convert_kana_codepoint(); the results are collected in `converted_buf` and
 * written back out with encoding->from_wchar (the same encoding is used for
 * decoding and re-encoding).
 *
 * Because every codepoint needs one codepoint of lookahead, the last wchar of a
 * chunk is not converted immediately unless the input is exhausted; it is moved
 * to wchar_buf[0] and handled at the start of the next chunk. */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion...
	 * if we are converting zenkaku kana to hankaku kana
	 * Make the buffer for converted kana big enough that we never need to
	 * perform bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	/* Number of wchars (0 or 1) carried over at the front of wchar_buf */
	unsigned int buf_offset = 0;
	/* Decoder state handed to to_wchar on every call */
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		uint32_t *converted = converted_buf;
		/* If one codepoint has been left in wchar_buf[0] to be reprocessed from the
		 * previous iteration, don't overwrite it */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + buf_offset, 64 - buf_offset, &state);
		out_len += buf_offset;
		ZEND_ASSERT(out_len <= 64);

		if (!out_len) {
			continue;
		}

		/* Convert every wchar except the last one; wchar_buf[i+1] is the lookahead */
		for (size_t i = 0; i < out_len-1; i++) {
			uint32_t second = 0;
			bool consumed = false;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
			/* A non-zero `second` means the conversion produced two output wchars */
			if (second) {
				*converted++ = second;
			}
			/* `consumed` means the lookahead wchar was used up as well, so skip it */
			if (consumed) {
				i++;
				if (i == out_len-1) {
					/* We consumed two codepoints at the very end of the wchar buffer
					 * So there is nothing remaining to reprocess on the next iteration */
					buf_offset = 0;
					goto emit_converted_kana;
				}
			}
		}

		if (!in_len) {
			/* This is the last iteration, so we need to process the final codepoint now */
			uint32_t second = 0;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[out_len-1], 0, NULL, &second, mode);
			if (second) {
				*converted++ = second;
			}
		} else {
			/* Reprocess the last codepoint on the next iteration */
			wchar_buf[0] = wchar_buf[out_len-1];
			buf_offset = 1;
		}

emit_converted_kana:
		/* The final argument is true only once all input has been consumed
		 * (presumably telling from_wchar that this is the last chunk) */
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3580 
/* Option letters understood by mb_convert_kana().
 * The letter stored at index i selects bit i of the option bit vector
 * (mb_convert_kana does `opt |= (1 << i)` for a matching letter).
 * Entries i and i+8 (upper-case / lower-case of the same letter) are treated as
 * a mutually exclusive pair when mb_convert_kana validates the options. */
char mb_convert_kana_flags[17] = {
	'A', 'R', 'N', 'S', 'K', 'H', 'M', 'C',
	'a', 'r', 'n', 's', 'k', 'h', 'm', 'c',
	'V'
};
3586 
3587 /* Conversion between full-width characters and half-width characters (Japanese) */
PHP_FUNCTION(mb_convert_kana)3588 PHP_FUNCTION(mb_convert_kana)
3589 {
3590 	unsigned int opt;
3591 	char *optstr = NULL;
3592 	size_t optstr_len;
3593 	zend_string *encname = NULL, *str;
3594 
3595 	ZEND_PARSE_PARAMETERS_START(1, 3)
3596 		Z_PARAM_STR(str)
3597 		Z_PARAM_OPTIONAL
3598 		Z_PARAM_STRING(optstr, optstr_len)
3599 		Z_PARAM_STR_OR_NULL(encname)
3600 	ZEND_PARSE_PARAMETERS_END();
3601 
3602 	if (optstr != NULL) {
3603 		char *p = optstr, *e = p + optstr_len;
3604 		opt = 0;
3605 next_option:
3606 		while (p < e) {
3607 			/* Walk through option string and convert to bit vector
3608 			 * See translit_kana_jisx0201_jisx0208.h for the values used */
3609 			char c = *p++;
3610 			if (c == 'A') {
3611 				opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
3612 			} else if (c == 'a') {
3613 				opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
3614 			} else {
3615 				for (size_t i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
3616 					if (c == mb_convert_kana_flags[i]) {
3617 						opt |= (1 << i);
3618 						goto next_option;
3619 					}
3620 				}
3621 
3622 				zend_argument_value_error(2, "contains invalid flag: '%c'", c);
3623 				RETURN_THROWS();
3624 			}
3625 		}
3626 
3627 		/* Check for illegal combinations of options */
3628 		if (((opt & 0xFF00) >> 8) & opt) {
3629 			/* It doesn't make sense to convert the same type of characters from halfwidth to
3630 			 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
3631 			 * FW hiragana to FW katakana and then back again. */
3632 			int badflag = ((opt & 0xFF00) >> 8) & opt, i;
3633 			for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
3634 			char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
3635 			if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
3636 				flag1 = 'A';
3637 			if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
3638 				flag2 = 'a';
3639 			zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
3640 			RETURN_THROWS();
3641 		}
3642 
3643 		if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
3644 			/* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
3645 			zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
3646 			RETURN_THROWS();
3647 		}
3648 
3649 		/* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
3650 		 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
3651 		 * more than one of these */
3652 		if (opt & MBFL_ZEN2HAN_HIRAGANA) {
3653 			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3654 				zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
3655 				RETURN_THROWS();
3656 			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3657 				zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
3658 				RETURN_THROWS();
3659 			}
3660 		} else if (opt & MBFL_ZEN2HAN_KATAKANA) {
3661 			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3662 				zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
3663 				RETURN_THROWS();
3664 			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3665 				zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
3666 				RETURN_THROWS();
3667 			}
3668 		}
3669 	} else {
3670 		opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
3671 	}
3672 
3673 	const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
3674 	if (!enc) {
3675 		RETURN_THROWS();
3676 	}
3677 
3678 	RETVAL_STR(jp_kana_convert(str, enc, opt));
3679 }
3680 
/* Count the string values reachable from `var`, descending into arrays and
 * objects (via HASH_OF). References are followed. A container that is already
 * being visited (recursion guard set) contributes nothing. */
static unsigned int mb_recursive_count_strings(zval *var)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		return 1;
	}
	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Scalars other than strings are ignored */
		return 0;
	}

	/* Only refcounted containers carry a recursion guard */
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	unsigned int total = 0;
	HashTable *ht = HASH_OF(var);
	if (ht != NULL) {
		zval *entry;
		ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
			total += mb_recursive_count_strings(entry);
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return total;
}
3711 
mb_recursive_find_strings(zval * var,const unsigned char ** val_list,size_t * len_list,unsigned int * count)3712 static bool mb_recursive_find_strings(zval *var, const unsigned char **val_list, size_t *len_list, unsigned int *count)
3713 {
3714 	ZVAL_DEREF(var);
3715 
3716 	if (Z_TYPE_P(var) == IS_STRING) {
3717 		val_list[*count] = (const unsigned char*)Z_STRVAL_P(var);
3718 		len_list[*count] = Z_STRLEN_P(var);
3719 		(*count)++;
3720 	} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
3721 		if (Z_REFCOUNTED_P(var)) {
3722 			if (Z_IS_RECURSIVE_P(var)) {
3723 				return true;
3724 			}
3725 			Z_PROTECT_RECURSION_P(var);
3726 		}
3727 
3728 		HashTable *ht = HASH_OF(var);
3729 		if (ht != NULL) {
3730 			zval *entry;
3731 			ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
3732 				if (mb_recursive_find_strings(entry, val_list, len_list, count)) {
3733 					if (Z_REFCOUNTED_P(var)) {
3734 						Z_UNPROTECT_RECURSION_P(var);
3735 						return true;
3736 					}
3737 				}
3738 			} ZEND_HASH_FOREACH_END();
3739 		}
3740 
3741 		if (Z_REFCOUNTED_P(var)) {
3742 			Z_UNPROTECT_RECURSION_P(var);
3743 		}
3744 	}
3745 
3746 	return false;
3747 }
3748 
mb_recursive_convert_variable(zval * var,const mbfl_encoding * from_encoding,const mbfl_encoding * to_encoding)3749 static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
3750 {
3751 	zval *entry, *orig_var;
3752 
3753 	orig_var = var;
3754 	ZVAL_DEREF(var);
3755 
3756 	if (Z_TYPE_P(var) == IS_STRING) {
3757 		zend_string *ret = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
3758 		zval_ptr_dtor(orig_var);
3759 		ZVAL_STR(orig_var, ret);
3760 	} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
3761 		if (Z_TYPE_P(var) == IS_ARRAY) {
3762 			SEPARATE_ARRAY(var);
3763 		}
3764 		if (Z_REFCOUNTED_P(var)) {
3765 			if (Z_IS_RECURSIVE_P(var)) {
3766 				return true;
3767 			}
3768 			Z_PROTECT_RECURSION_P(var);
3769 		}
3770 
3771 		HashTable *ht = HASH_OF(var);
3772 		if (ht != NULL) {
3773 			ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
3774 				if (mb_recursive_convert_variable(entry, from_encoding, to_encoding)) {
3775 					if (Z_REFCOUNTED_P(var)) {
3776 						Z_UNPROTECT_RECURSION_P(var);
3777 					}
3778 					return true;
3779 				}
3780 			} ZEND_HASH_FOREACH_END();
3781 		}
3782 
3783 		if (Z_REFCOUNTED_P(var)) {
3784 			Z_UNPROTECT_RECURSION_P(var);
3785 		}
3786 	}
3787 
3788 	return false;
3789 }
3790 
PHP_FUNCTION(mb_convert_variables)3791 PHP_FUNCTION(mb_convert_variables)
3792 {
3793 	zval *args;
3794 	zend_string *to_enc_str;
3795 	zend_string *from_enc_str;
3796 	HashTable *from_enc_ht;
3797 	const mbfl_encoding *from_encoding, *to_encoding;
3798 	uint32_t argc;
3799 	size_t elistsz;
3800 	const mbfl_encoding **elist;
3801 
3802 	ZEND_PARSE_PARAMETERS_START(3, -1)
3803 		Z_PARAM_STR(to_enc_str)
3804 		Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3805 		Z_PARAM_VARIADIC('+', args, argc)
3806 	ZEND_PARSE_PARAMETERS_END();
3807 
3808 	/* new encoding */
3809 	to_encoding = php_mb_get_encoding(to_enc_str, 1);
3810 	if (!to_encoding) {
3811 		RETURN_THROWS();
3812 	}
3813 
3814 	from_encoding = MBSTRG(current_internal_encoding);
3815 
3816 	bool order_significant = true;
3817 
3818 	/* pre-conversion encoding */
3819 	if (from_enc_ht) {
3820 		if (from_enc_ht == MBSTRG(all_encodings_list)) {
3821 			/* If entire list of supported encodings returned by `mb_list_encodings` is passed
3822 			 * in, then don't treat the order of the list as significant */
3823 			order_significant = false;
3824 		}
3825 		if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3826 			RETURN_THROWS();
3827 		}
3828 	} else {
3829 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2) == FAILURE) {
3830 			RETURN_THROWS();
3831 		}
3832 	}
3833 
3834 	if (elistsz == 0) {
3835 		efree(ZEND_VOIDP(elist));
3836 		zend_argument_value_error(2, "must specify at least one encoding");
3837 		RETURN_THROWS();
3838 	}
3839 
3840 	if (elistsz == 1) {
3841 		from_encoding = *elist;
3842 	} else {
3843 		/* auto detect */
3844 		unsigned int num = 0;
3845 		for (size_t n = 0; n < argc; n++) {
3846 			zval *zv = &args[n];
3847 			num += mb_recursive_count_strings(zv);
3848 		}
3849 		const unsigned char **val_list = (const unsigned char**)ecalloc(num, sizeof(char *));
3850 		size_t *len_list = (size_t*)ecalloc(num, sizeof(size_t));
3851 		unsigned int i = 0;
3852 		for (size_t n = 0; n < argc; n++) {
3853 			zval *zv = &args[n];
3854 			if (mb_recursive_find_strings(zv, val_list, len_list, &i)) {
3855 				efree(ZEND_VOIDP(elist));
3856 				efree(ZEND_VOIDP(val_list));
3857 				efree(len_list);
3858 				php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3859 				RETURN_FALSE;
3860 			}
3861 		}
3862 		from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant);
3863 		efree(ZEND_VOIDP(val_list));
3864 		efree(len_list);
3865 		if (!from_encoding) {
3866 			php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3867 			efree(ZEND_VOIDP(elist));
3868 			RETURN_FALSE;
3869 		}
3870 
3871 	}
3872 
3873 	efree(ZEND_VOIDP(elist));
3874 
3875 	/* convert */
3876 	for (size_t n = 0; n < argc; n++) {
3877 		zval *zv = &args[n];
3878 		ZVAL_DEREF(zv);
3879 		if (mb_recursive_convert_variable(zv, from_encoding, to_encoding)) {
3880 			php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3881 			RETURN_FALSE;
3882 		}
3883 	}
3884 
3885 	RETURN_STRING(from_encoding->name);
3886 }
3887 
3888 /* HTML numeric entities */
3889 
3890 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
make_conversion_map(HashTable * target_hash,size_t * conversion_map_size)3891 static uint32_t *make_conversion_map(HashTable *target_hash, size_t *conversion_map_size)
3892 {
3893 	zval *hash_entry;
3894 
3895 	size_t n_elems = *conversion_map_size = zend_hash_num_elements(target_hash);
3896 	if (n_elems % 4 != 0) {
3897 		zend_argument_value_error(2, "must have a multiple of 4 elements");
3898 		return NULL;
3899 	}
3900 
3901 	uint32_t *convmap = (uint32_t*)safe_emalloc(n_elems, sizeof(uint32_t), 0);
3902 	uint32_t *mapelm = convmap;
3903 
3904 	ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
3905 		bool failed = true;
3906 		zend_long tmp = zval_try_get_long(hash_entry, &failed);
3907 		if (failed) {
3908 			efree(convmap);
3909 			zend_argument_value_error(2, "must only be composed of values of type int");
3910 			return NULL;
3911 		}
3912 		*mapelm++ = tmp;
3913 	} ZEND_HASH_FOREACH_END();
3914 
3915 	return convmap;
3916 }
3917 
/* Look up `w` in a conversion map made of 4-element records
 * { low codepoint, high codepoint, offset, mask }.
 *
 * For the first record whose inclusive [low, high] range contains `w`,
 * stores (w + offset) & mask in *retval and returns true.
 * Returns false, leaving *retval untouched, when no record matches.
 * `conversion_map_size` is the number of uint32_t elements, not of records. */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t i = 0; i < conversion_map_size; i += 4) {
		const uint32_t *record = convmap + i;

		if (w < record[0] || w > record[1]) {
			/* Outside this range; try the next record */
			continue;
		}

		/* This wchar falls inside one of the ranges which should be
		 * converted to HTML entities */
		*retval = (w + record[2]) & record[3];
		return true;
	}

	/* None of the ranges matched */
	return false;
}
3939 
/* Replace every character of `input` that matches the conversion map with an
 * HTML numeric entity ("&#NNN;" or, when `hex` is true, "&#xHHH;") and return
 * the result in the same encoding.
 *
 * The string is decoded to wchars 32 at a time, rewritten into
 * `converted_buf`, and re-encoded with encoding->from_wchar. */
static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size, bool hex)
{
	/* Each wchar which we get from decoding the input string may become up to
	 * 13 wchars when we convert it to an HTML entity */
	uint32_t wchar_buf[32], converted_buf[32 * 13];
	unsigned char entity[16]; /* For converting wchars to hex/decimal string */
	unsigned char *const entity_end = entity + sizeof(entity);

	/* Digit alphabet and radix for the requested notation */
	const char *digits = hex ? "0123456789ABCDEF" : "0123456789";
	const uint32_t radix = hex ? 16 : 10;

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		/* Convert input string to wchars, up to 32 at a time */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf, 32, &state);
		ZEND_ASSERT(out_len <= 32);

		uint32_t *out = converted_buf;
		const uint32_t *chunk_end = wchar_buf + out_len;

		/* Run through wchars and see if any of them fall into the ranges
		 * which we want to convert to HTML entities */
		for (const uint32_t *src = wchar_buf; src < chunk_end; src++) {
			uint32_t w = *src;

			if (!html_numeric_entity_convert(w, convmap, conversion_map_size, &w)) {
				/* Not in any range: copy through unchanged */
				*out++ = w;
				continue;
			}

			*out++ = '&';
			*out++ = '#';
			if (hex) {
				*out++ = 'x';
			}

			/* Convert wchar to decimal/hex string, filling `entity` from the
			 * back; a value of zero yields the single digit '0' */
			unsigned char *p = entity_end;
			do {
				*(--p) = digits[w % radix];
				w /= radix;
			} while (w > 0);
			while (p < entity_end) {
				*out++ = *p++;
			}

			*out++ = ';';
		}

		ZEND_ASSERT(out <= converted_buf + sizeof(converted_buf)/sizeof(*converted_buf));
		encoding->from_wchar(converted_buf, out - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
4005 
4006 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)4007 PHP_FUNCTION(mb_encode_numericentity)
4008 {
4009 	zend_string *encoding = NULL, *str;
4010 	size_t conversion_map_size;
4011 	HashTable *target_hash;
4012 	bool is_hex = false;
4013 
4014 	ZEND_PARSE_PARAMETERS_START(2, 4)
4015 		Z_PARAM_STR(str)
4016 		Z_PARAM_ARRAY_HT(target_hash)
4017 		Z_PARAM_OPTIONAL
4018 		Z_PARAM_STR_OR_NULL(encoding)
4019 		Z_PARAM_BOOL(is_hex)
4020 	ZEND_PARSE_PARAMETERS_END();
4021 
4022 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4023 	if (!enc) {
4024 		RETURN_THROWS();
4025 	}
4026 
4027 	uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4028 	if (convmap == NULL) {
4029 		RETURN_THROWS();
4030 	}
4031 
4032 	RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, conversion_map_size, is_hex));
4033 	efree(convmap);
4034 }
4035 /* }}} */
4036 
/* Map the numeric value of an HTML entity back to a codepoint.
 *
 * The conversion map consists of 4-element records
 * { low codepoint, high codepoint, offset, mask }; the mask is not used here.
 * For the first record where (number - offset), computed with unsigned
 * wrap-around, lies in the inclusive range [low, high], that value is stored in
 * *retval and true is returned. Otherwise returns false and leaves *retval
 * untouched. `conversion_map_size` counts uint32_t elements, not records. */
static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t i = 0; i < conversion_map_size; i += 4) {
		uint32_t candidate = number - convmap[i + 2];

		if (candidate < convmap[i] || candidate > convmap[i + 1]) {
			continue;
		}

		*retval = candidate;
		return true;
	}

	return false;
}
4054 
4055 #define DEC_ENTITY_MINLEN 3  /* For "&#" and 1 decimal digit */
4056 #define HEX_ENTITY_MINLEN 4  /* For "&#x" and 1 hexadecimal digit */
4057 #define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
4058 #define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
4059 
html_numeric_entity_decode(zend_string * input,const mbfl_encoding * encoding,uint32_t * convmap,size_t conversion_map_size)4060 static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size)
4061 {
4062 	uint32_t wchar_buf[128], converted_buf[128];
4063 
4064 	unsigned int state = 0;
4065 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
4066 	size_t in_len = ZSTR_LEN(input);
4067 
4068 	mb_convert_buf buf;
4069 	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
4070 
4071 	/* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
4072 	 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
4073 	 * 2nd 'converted' buffer.
4074 	 *
4075 	 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
4076 	 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
4077 	 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
4078 	 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
4079 	 * to store the next batch of wchars after it.
4080 	 *
4081 	 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
4082 	 * wchars from the 1st buffer to the 2nd one.
4083 	 *
4084 	 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
4085 	 * have to do bounds checks when writing wchars into it.
4086 	 */
4087 
4088 	unsigned int wchar_buf_offset = 0;
4089 
4090 	while (in_len) {
4091 		/* Leave space for sentinel at the end of the buffer */
4092 		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
4093 		out_len += wchar_buf_offset;
4094 		ZEND_ASSERT(out_len <= 127);
4095 		wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */
4096 
4097 		uint32_t *p, *converted;
4098 
4099 		/* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
4100 		 * be there (in `wchar_buf[0]`), so don't bother in that case */
4101 		if (wchar_buf_offset == 0) {
4102 			p = wchar_buf;
4103 			while (*p != '&')
4104 				p++;
4105 			if (p == wchar_buf + out_len) {
4106 				/* No HTML entities in this buffer */
4107 				encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
4108 				continue;
4109 			}
4110 
4111 			/* Copy over the prefix with no & which we already scanned */
4112 			memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
4113 			converted = converted_buf + (p - wchar_buf);
4114 		} else {
4115 			p = wchar_buf;
4116 			converted = converted_buf;
4117 		}
4118 
4119 found_ampersand:
4120 		ZEND_ASSERT(*p == '&');
4121 		uint32_t *p2 = p;
4122 
4123 		/* These tests can't overrun end of buffer, because we have a '&' sentinel there */
4124 		if (*++p2 == '#') {
4125 			if (*++p2 == 'x') {
4126 				/* Possible hex entity */
4127 				uint32_t w = *++p2;
4128 				while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
4129 					w = *++p2;
4130 				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
4131 					/* We hit the end of the buffer while reading digits, and
4132 					 * more wchars are still coming in the next buffer
4133 					 * Reprocess this identity on next iteration */
4134 					memmove(wchar_buf, p, (p2 - p) * 4);
4135 					wchar_buf_offset = p2 - p;
4136 					goto process_converted_wchars;
4137 				} else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
4138 					/* Invalid entity (too long or "&#x" only) */
4139 					memcpy(converted, p, (p2 - p) * 4);
4140 					converted += p2 - p;
4141 				} else {
4142 					/* Valid hexadecimal entity */
4143 					uint32_t value = 0, *p3 = p + 3;
4144 					while (p3 < p2) {
4145 						w = *p3++;
4146 						if (w <= '9') {
4147 							value = (value * 16) + (w - '0');
4148 						} else if (w >= 'a') {
4149 							value = (value * 16) + 10 + (w - 'a');
4150 						} else {
4151 							value = (value * 16) + 10 + (w - 'A');
4152 						}
4153 					}
4154 					if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
4155 						converted++;
4156 						if (*p2 == ';')
4157 							p2++;
4158 					} else {
4159 						memcpy(converted, p, (p2 - p) * 4);
4160 						converted += p2 - p;
4161 					}
4162 				}
4163 			} else {
4164 				/* Possible decimal entity */
4165 				uint32_t w = *p2;
4166 				while (w >= '0' && w <= '9')
4167 					w = *++p2;
4168 				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
4169 					/* The number of digits was legal (no more than 10 decimal digits)
4170 					 * Reprocess this identity on next iteration of main loop */
4171 					memmove(wchar_buf, p, (p2 - p) * 4);
4172 					wchar_buf_offset = p2 - p;
4173 					goto process_converted_wchars;
4174 				} else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
4175 					/* Invalid entity (too long or "&#" only) */
4176 					memcpy(converted, p, (p2 - p) * 4);
4177 					converted += p2 - p;
4178 				} else {
4179 					/* Valid decimal entity */
4180 					uint32_t value = 0, *p3 = p + 2;
4181 					while (p3 < p2) {
4182 						/* If unsigned integer overflow would occur in the below
4183 						 * multiplication by 10, this entity is no good
4184 						 * 0x19999999 is 1/10th of 0xFFFFFFFF */
4185 						if (value > 0x19999999) {
4186 							memcpy(converted, p, (p2 - p) * 4);
4187 							converted += p2 - p;
4188 							goto decimal_entity_too_big;
4189 						}
4190 						value = (value * 10) + (*p3++ - '0');
4191 					}
4192 					if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
4193 						converted++;
4194 						if (*p2 == ';')
4195 							p2++;
4196 					} else {
4197 						memcpy(converted, p, (p2 - p) * 4);
4198 						converted += p2 - p;
4199 					}
4200 				}
4201 			}
4202 		} else if ((p2 == wchar_buf + out_len) && in_len) {
4203 			/* Corner case: & at end of buffer */
4204 			wchar_buf[0] = '&';
4205 			wchar_buf_offset = 1;
4206 			goto process_converted_wchars;
4207 		} else {
4208 			*converted++ = '&';
4209 		}
4210 decimal_entity_too_big:
4211 
4212 		/* Starting to scan a new section of the wchar buffer
4213 		 * 'p2' is pointing at the next wchar which needs to be processed */
4214 		p = p2;
4215 		while (*p2 != '&')
4216 			p2++;
4217 
4218 		if (p2 > p) {
4219 			memcpy(converted, p, (p2 - p) * 4);
4220 			converted += p2 - p;
4221 			p = p2;
4222 		}
4223 
4224 		if (p < wchar_buf + out_len)
4225 			goto found_ampersand;
4226 
4227 		/* We do not have any wchars remaining at the end of this buffer which
4228 		 * we need to reprocess on the next call */
4229 		wchar_buf_offset = 0;
4230 process_converted_wchars:
4231 		ZEND_ASSERT(converted <= converted_buf + 128);
4232 		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
4233 	}
4234 
4235 	return mb_convert_buf_result(&buf, encoding);
4236 }
4237 
4238 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)4239 PHP_FUNCTION(mb_decode_numericentity)
4240 {
4241 	zend_string *encoding = NULL, *str;
4242 	size_t conversion_map_size;
4243 	HashTable *target_hash;
4244 
4245 	ZEND_PARSE_PARAMETERS_START(2, 3)
4246 		Z_PARAM_STR(str)
4247 		Z_PARAM_ARRAY_HT(target_hash)
4248 		Z_PARAM_OPTIONAL
4249 		Z_PARAM_STR_OR_NULL(encoding)
4250 	ZEND_PARSE_PARAMETERS_END();
4251 
4252 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4253 	if (!enc) {
4254 		RETURN_THROWS();
4255 	}
4256 
4257 	uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4258 	if (convmap == NULL) {
4259 		RETURN_THROWS();
4260 	}
4261 
4262 	RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, conversion_map_size));
4263 	efree(convmap);
4264 }
4265 /* }}} */
4266 
4267 /* {{{ Sends an email message with MIME scheme */
4268 #define CRLF "\r\n"
4269 
/* Parse a block of mail headers into `ht`.
 *
 * ht       receives one entry per header: key is the lower-cased field name,
 *          value is the field body as a string (zend_hash_update, so a later
 *          header with the same name replaces an earlier one)
 * str      header text; need not be NUL-terminated
 * str_len  number of bytes in `str`
 *
 * Parsing stops at the end of the input or at an empty line. A header is only
 * stored when both a non-empty name and a non-empty value were collected;
 * partial headers are discarded and their strings released.
 *
 * Returns the final value of the parser state (see the diagram below; state 4
 * is "continuation line just started", state 5 is "skipping this line"). */
static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
{
	const char *ps;
	size_t icnt;
	int state = 0;
	int crlf_state = -1;
	char *token = NULL;
	size_t token_pos = 0;
	zend_string *fld_name, *fld_val;

	ps = str;
	icnt = str_len;
	fld_name = fld_val = NULL;

	/*
	 *             C o n t e n t - T y p e :   t e x t / h t m l \r\n
	 *             ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
	 *      state  0            1           2          3
	 *
	 *             C o n t e n t - T y p e :   t e x t / h t m l \r\n
	 *             ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
	 * crlf_state -1                       0                     1 -1
	 *
	 * `token` points at the start of the name/value currently being collected
	 * and `token_pos` is its length so far.
	 */

	while (icnt > 0) {
		switch (*ps) {
			case ':':
				/* A pending lone '\r' turns out to be part of the token */
				if (crlf_state == 1) {
					token_pos++;
				}

				if (state == 0 || state == 1) {
					/* First colon on the line terminates the field name */
					if(token && token_pos > 0) {
						fld_name = zend_string_init(token, token_pos, 0);
					}
					state = 2;
				} else {
					/* Any further colon is ordinary content */
					token_pos++;
				}

				crlf_state = 0;
				break;

			case '\n':
				/* A newline directly after a newline is an empty line: stop */
				if (crlf_state == -1) {
					goto out;
				}
				crlf_state = -1;
				break;

			case '\r':
				if (crlf_state == 1) {
					/* Previous '\r' was not followed by '\n'; count it as content */
					token_pos++;
				} else {
					crlf_state = 1;
				}
				break;

			case ' ': case '\t':
				if (crlf_state == -1) {
					/* Whitespace at the very start of a line */
					if (state == 3) {
						/* continuing from the previous line */
						state = 4;
					} else {
						/* simply skipping this new line */
						state = 5;
					}
				} else {
					if (crlf_state == 1) {
						token_pos++;
					}
					/* Whitespace inside a name or value belongs to the token */
					if (state == 1 || state == 3) {
						token_pos++;
					}
				}
				crlf_state = 0;
				break;

			default:
				switch (state) {
					case 0:
						/* First character of a field name */
						token = (char*)ps;
						token_pos = 0;
						state = 1;
						break;

					case 2:
						if (crlf_state != -1) {
							/* First character of the field value */
							token = (char*)ps;
							token_pos = 0;

							state = 3;
							break;
						}
						/* NOTE(review): when a name is followed directly by a
						 * new line, `token`/`token_pos` still describe the name
						 * here, so that text is what gets stored as the value
						 * below. Pre-existing behaviour, left as is; confirm
						 * whether it is intended. */
						ZEND_FALLTHROUGH;

					case 3:
						if (crlf_state == -1) {
							/* A new header line starts: flush the previous one */
							if(token && token_pos > 0) {
								fld_val = zend_string_init(token, token_pos, 0);
							}

							if (fld_name != NULL && fld_val != NULL) {
								zval val;
								zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
								ZVAL_STR(&val, fld_val);

								/* The hash table takes over the value and its own
								 * reference to the key */
								zend_hash_update(ht, fld_name, &val);

								zend_string_release_ex(fld_name, 0);
							} else {
								/* Incomplete header: free whichever half exists
								 * instead of leaking it */
								if (fld_name != NULL) {
									zend_string_release_ex(fld_name, 0);
								}
								if (fld_val != NULL) {
									zend_string_release_ex(fld_val, 0);
								}
							}

							fld_name = fld_val = NULL;
							token = (char*)ps;
							token_pos = 0;

							state = 1;
						}
						break;

					case 4:
						/* Account for the whitespace which introduced the
						 * continuation line, then resume the value */
						token_pos++;
						state = 3;
						break;
				}

				if (crlf_state == 1) {
					token_pos++;
				}

				token_pos++;

				crlf_state = 0;
				break;
		}
		ps++, icnt--;
	}
out:
	if (state == 2) {
		/* Input ended right after the colon: treat as an empty value */
		token = "";
		token_pos = 0;

		state = 3;
	}
	if (state == 3) {
		if(token && token_pos > 0) {
			fld_val = zend_string_init(token, token_pos, 0);
		}
		if (fld_name != NULL && fld_val != NULL) {
			zval val;
			zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
			ZVAL_STR(&val, fld_val);
			zend_hash_update(ht, fld_name, &val);

			zend_string_release_ex(fld_name, 0);
			fld_name = fld_val = NULL;
		}
	}

	/* Whatever was not handed to the hash table (a name without a value, a
	 * value without a name, or a name pending in state 4/5) must be freed here */
	if (fld_name != NULL) {
		zend_string_release_ex(fld_name, 0);
	}
	if (fld_val != NULL) {
		zend_string_release_ex(fld_val, 0);
	}
	return state;
}
4430 
PHP_FUNCTION(mb_send_mail)4431 PHP_FUNCTION(mb_send_mail)
4432 {
4433 	char *to;
4434 	size_t to_len;
4435 	char *message;
4436 	size_t message_len;
4437 	zend_string *subject;
4438 	zend_string *extra_cmd = NULL;
4439 	HashTable *headers_ht = NULL;
4440 	zend_string *str_headers = NULL;
4441 	size_t i;
4442 	char *to_r = NULL;
4443 	bool suppress_content_type = false;
4444 	bool suppress_content_transfer_encoding = false;
4445 
4446 	char *p;
4447 	enum mbfl_no_encoding;
4448 	const mbfl_encoding *tran_cs,	/* transfer text charset */
4449 						*head_enc,	/* header transfer encoding */
4450 						*body_enc;	/* body transfer encoding */
4451 	const mbfl_language *lang;
4452 	HashTable ht_headers;
4453 	zval *s;
4454 
4455 	/* character-set, transfer-encoding */
4456 	tran_cs = &mbfl_encoding_utf8;
4457 	head_enc = &mbfl_encoding_base64;
4458 	body_enc = &mbfl_encoding_base64;
4459 	lang = mbfl_no2language(MBSTRG(language));
4460 	if (lang != NULL) {
4461 		tran_cs = mbfl_no2encoding(lang->mail_charset);
4462 		head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4463 		body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4464 	}
4465 
4466 	ZEND_PARSE_PARAMETERS_START(3, 5)
4467 		Z_PARAM_PATH(to, to_len)
4468 		Z_PARAM_PATH_STR(subject)
4469 		Z_PARAM_PATH(message, message_len)
4470 		Z_PARAM_OPTIONAL
4471 		Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4472 		Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4473 	ZEND_PARSE_PARAMETERS_END();
4474 
4475 	if (str_headers) {
4476 		if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4477 			zend_argument_value_error(4, "must not contain any null bytes");
4478 			RETURN_THROWS();
4479 		}
4480 		str_headers = php_trim(str_headers, NULL, 0, 2);
4481 	} else if (headers_ht) {
4482 		str_headers = php_mail_build_headers(headers_ht);
4483 		if (EG(exception)) {
4484 			RETURN_THROWS();
4485 		}
4486 	}
4487 
4488 	zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4489 
4490 	if (str_headers != NULL) {
4491 		_php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4492 	}
4493 
4494 	if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4495 		char *tmp;
4496 		char *param_name;
4497 		char *charset = NULL;
4498 
4499 		ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4500 		p = strchr(Z_STRVAL_P(s), ';');
4501 
4502 		if (p != NULL) {
4503 			/* skipping the padded spaces */
4504 			do {
4505 				++p;
4506 			} while (*p == ' ' || *p == '\t');
4507 
4508 			if (*p != '\0') {
4509 				if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4510 					if (strcasecmp(param_name, "charset") == 0) {
4511 						const mbfl_encoding *_tran_cs = tran_cs;
4512 
4513 						charset = php_strtok_r(NULL, "= \"", &tmp);
4514 						if (charset != NULL) {
4515 							_tran_cs = mbfl_name2encoding(charset);
4516 						}
4517 
4518 						if (!_tran_cs) {
4519 							php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4520 							_tran_cs = &mbfl_encoding_ascii;
4521 						}
4522 						tran_cs = _tran_cs;
4523 					}
4524 				}
4525 			}
4526 		}
4527 		suppress_content_type = true;
4528 	}
4529 
4530 	if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4531 		const mbfl_encoding *_body_enc;
4532 
4533 		ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4534 		_body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4535 		switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4536 			case mbfl_no_encoding_base64:
4537 			case mbfl_no_encoding_7bit:
4538 			case mbfl_no_encoding_8bit:
4539 				body_enc = _body_enc;
4540 				break;
4541 
4542 			default:
4543 				php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4544 				body_enc =	&mbfl_encoding_8bit;
4545 				break;
4546 		}
4547 		suppress_content_transfer_encoding = true;
4548 	}
4549 
4550 	/* To: */
4551 	if (to_len > 0) {
4552 		to_r = estrndup(to, to_len);
4553 		for (; to_len; to_len--) {
4554 			if (!isspace((unsigned char) to_r[to_len - 1])) {
4555 				break;
4556 			}
4557 			to_r[to_len - 1] = '\0';
4558 		}
4559 		for (i = 0; to_r[i]; i++) {
4560 			if (iscntrl((unsigned char) to_r[i])) {
4561 				/* According to RFC 822, section 3.1.1 long headers may be separated into
4562 				 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4563 				 * To prevent these separators from being replaced with a space, we skip over them. */
4564 				if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4565 					i += 2;
4566 					while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4567 						i++;
4568 					}
4569 					continue;
4570 				}
4571 
4572 				to_r[i] = ' ';
4573 			}
4574 		}
4575 	} else {
4576 		to_r = to;
4577 	}
4578 
4579 	/* Subject: */
4580 	const mbfl_encoding *enc = MBSTRG(current_internal_encoding);
4581 	if (enc == &mbfl_encoding_pass) {
4582 		enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4583 	}
4584 	const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4585 	size_t line_sep_len = strlen(line_sep);
4586 
4587 	subject = mb_mime_header_encode(subject, enc, tran_cs, head_enc == &mbfl_encoding_base64, (char*)line_sep, line_sep_len, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4588 
4589 	/* message body */
4590 	const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding);
4591 	if (msg_enc == &mbfl_encoding_pass) {
4592 		msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4593 	}
4594 
4595 	unsigned int num_errors = 0;
4596 	zend_string *tmpstr = mb_fast_convert((unsigned char*)message, message_len, msg_enc, tran_cs, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4597 	zend_string *conv = mb_fast_convert((unsigned char*)ZSTR_VAL(tmpstr), ZSTR_LEN(tmpstr), &mbfl_encoding_8bit, body_enc, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4598 	zend_string_free(tmpstr);
4599 	message = ZSTR_VAL(conv);
4600 
4601 	/* other headers */
4602 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4603 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4604 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4605 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4606 
4607 	smart_str str = {0};
4608 	bool empty = true;
4609 
4610 	if (str_headers != NULL) {
4611 		/* Strip trailing CRLF from `str_headers`; we will add CRLF back if necessary */
4612 		size_t len = ZSTR_LEN(str_headers);
4613 		if (ZSTR_VAL(str_headers)[len-1] == '\n') {
4614 			len--;
4615 		}
4616 		if (ZSTR_VAL(str_headers)[len-1] == '\r') {
4617 			len--;
4618 		}
4619 		smart_str_appendl(&str, ZSTR_VAL(str_headers), len);
4620 		empty = false;
4621 		zend_string_release_ex(str_headers, 0);
4622 	}
4623 
4624 	if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4625 		if (!empty) {
4626 			smart_str_appendl(&str, line_sep, line_sep_len);
4627 		}
4628 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4629 		empty = false;
4630 	}
4631 
4632 	if (!suppress_content_type) {
4633 		if (!empty) {
4634 			smart_str_appendl(&str, line_sep, line_sep_len);
4635 		}
4636 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4637 
4638 		p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4639 		if (p != NULL) {
4640 			smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4641 			smart_str_appends(&str, p);
4642 		}
4643 		empty = false;
4644 	}
4645 
4646 	if (!suppress_content_transfer_encoding) {
4647 		if (!empty) {
4648 			smart_str_appendl(&str, line_sep, line_sep_len);
4649 		}
4650 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4651 		p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4652 		if (p == NULL) {
4653 			p = "7bit";
4654 		}
4655 		smart_str_appends(&str, p);
4656 	}
4657 
4658 	str_headers = smart_str_extract(&str);
4659 
4660 	zend_string *force_extra_parameters = zend_ini_str_ex("mail.force_extra_parameters", strlen("mail.force_extra_parameters"), false, NULL);
4661 	if (force_extra_parameters) {
4662 		extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4663 	} else if (extra_cmd) {
4664 		extra_cmd = php_escape_shell_cmd(extra_cmd);
4665 	}
4666 
4667 	RETVAL_BOOL(php_mail(to_r, ZSTR_VAL(subject), message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL));
4668 
4669 	if (extra_cmd) {
4670 		zend_string_release_ex(extra_cmd, 0);
4671 	}
4672 	if (to_r != to) {
4673 		efree(to_r);
4674 	}
4675 	zend_string_release(subject);
4676 	zend_string_free(conv);
4677 	zend_hash_destroy(&ht_headers);
4678 	if (str_headers) {
4679 		zend_string_release_ex(str_headers, 0);
4680 	}
4681 }
4682 
4683 #undef CRLF
4684 #undef MAIL_ASCIIZ_CHECK_MBSTRING
4685 #undef PHP_MBSTR_MAIL_MIME_HEADER1
4686 #undef PHP_MBSTR_MAIL_MIME_HEADER2
4687 #undef PHP_MBSTR_MAIL_MIME_HEADER3
4688 #undef PHP_MBSTR_MAIL_MIME_HEADER4
4689 /* }}} */
4690 
4691 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)4692 PHP_FUNCTION(mb_get_info)
4693 {
4694 	zend_string *type = NULL;
4695 	size_t n;
4696 	char *name;
4697 	zval row;
4698 	const mbfl_encoding **entry;
4699 	const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
4700 
4701 	ZEND_ASSERT(lang);
4702 
4703 	ZEND_PARSE_PARAMETERS_START(0, 1)
4704 		Z_PARAM_OPTIONAL
4705 		Z_PARAM_STR(type)
4706 	ZEND_PARSE_PARAMETERS_END();
4707 
4708 	if (!type || zend_string_equals_literal_ci(type, "all")) {
4709 		array_init(return_value);
4710 		if (MBSTRG(current_internal_encoding)) {
4711 			add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4712 		}
4713 		if (MBSTRG(http_input_identify)) {
4714 			add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4715 		}
4716 		if (MBSTRG(current_http_output_encoding)) {
4717 			add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4718 		}
4719 
4720 		add_assoc_str(return_value, "http_output_conv_mimetypes",
4721 			zend_ini_str("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)
4722 		);
4723 
4724 		name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4725 		add_assoc_string(return_value, "mail_charset", name);
4726 
4727 		name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4728 		add_assoc_string(return_value, "mail_header_encoding", name);
4729 
4730 		name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4731 		add_assoc_string(return_value, "mail_body_encoding", name);
4732 
4733 		add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4734 
4735 		if (MBSTRG(encoding_translation)) {
4736 			add_assoc_string(return_value, "encoding_translation", "On");
4737 		} else {
4738 			add_assoc_string(return_value, "encoding_translation", "Off");
4739 		}
4740 
4741 		name = (char *)mbfl_no_language2name(MBSTRG(language));
4742 		add_assoc_string(return_value, "language", name);
4743 
4744 		// TODO Seems to always have one entry at least?
4745 		n = MBSTRG(current_detect_order_list_size);
4746 		entry = MBSTRG(current_detect_order_list);
4747 		if (n > 0) {
4748 			size_t i;
4749 			array_init(&row);
4750 			for (i = 0; i < n; i++) {
4751 				add_next_index_string(&row, (*entry)->name);
4752 				entry++;
4753 			}
4754 			add_assoc_zval(return_value, "detect_order", &row);
4755 		}
4756 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4757 			add_assoc_string(return_value, "substitute_character", "none");
4758 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4759 			add_assoc_string(return_value, "substitute_character", "long");
4760 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4761 			add_assoc_string(return_value, "substitute_character", "entity");
4762 		} else {
4763 			add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4764 		}
4765 		if (MBSTRG(strict_detection)) {
4766 			add_assoc_string(return_value, "strict_detection", "On");
4767 		} else {
4768 			add_assoc_string(return_value, "strict_detection", "Off");
4769 		}
4770 	} else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4771 		ZEND_ASSERT(MBSTRG(current_internal_encoding));
4772 		RETURN_STRING((char *)MBSTRG(current_internal_encoding)->name);
4773 	} else if (zend_string_equals_literal_ci(type, "http_input")) {
4774 		if (MBSTRG(http_input_identify)) {
4775 			RETURN_STRING((char *)MBSTRG(http_input_identify)->name);
4776 		}
4777 		RETURN_NULL();
4778 	} else if (zend_string_equals_literal_ci(type, "http_output")) {
4779 		ZEND_ASSERT(MBSTRG(current_http_output_encoding));
4780 		RETURN_STRING((char *)MBSTRG(current_http_output_encoding)->name);
4781 	} else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4782 		RETURN_STR(
4783 			zend_ini_str(
4784 				"mbstring.http_output_conv_mimetypes",
4785 				sizeof("mbstring.http_output_conv_mimetypes") - 1,
4786 				false
4787 			)
4788 		);
4789 	} else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4790 		name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4791 		RETURN_STRING(name);
4792 	} else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4793 		name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4794 		RETURN_STRING(name);
4795 	} else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4796 		name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4797 		RETURN_STRING(name);
4798 	} else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4799 		RETURN_LONG(MBSTRG(illegalchars));
4800 	} else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4801 		if (MBSTRG(encoding_translation)) {
4802 			RETURN_STRING("On");
4803 		} else {
4804 			RETURN_STRING("Off");
4805 		}
4806 	} else if (zend_string_equals_literal_ci(type, "language")) {
4807 		name = (char *)mbfl_no_language2name(MBSTRG(language));
4808 		RETURN_STRING(name);
4809 	} else if (zend_string_equals_literal_ci(type, "detect_order")) {
4810 		// TODO Seems to always have one entry at least?
4811 		n = MBSTRG(current_detect_order_list_size);
4812 		entry = MBSTRG(current_detect_order_list);
4813 		if (n > 0) {
4814 			size_t i;
4815 			array_init(return_value);
4816 			for (i = 0; i < n; i++) {
4817 				add_next_index_string(return_value, (*entry)->name);
4818 				entry++;
4819 			}
4820 		}
4821 	} else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4822 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4823 			RETURN_STRING("none");
4824 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4825 			RETURN_STRING("long");
4826 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4827 			RETURN_STRING("entity");
4828 		} else {
4829 			RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
4830 		}
4831 	} else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4832 		if (MBSTRG(strict_detection)) {
4833 			RETURN_STRING("On");
4834 		} else {
4835 			RETURN_STRING("Off");
4836 		}
4837 	} else {
4838 		php_error_docref(NULL, E_WARNING, "argument #1 ($type) must be a valid type");
4839 		RETURN_FALSE;
4840 	}
4841 }
4842 /* }}} */
4843 
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)4844 MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4845 {
4846 	uint32_t wchar_buf[128];
4847 	unsigned char *in = (unsigned char*)input;
4848 	unsigned int state = 0;
4849 
4850 	if (encoding->check != NULL) {
4851 		return encoding->check(in, length);
4852 	}
4853 
4854 	/* If the input string is not encoded in the given encoding, there is a significant chance
4855 	 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4856 	 * buffer of 128 codepoints, convert and check just a few codepoints first */
4857 	size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4858 	ZEND_ASSERT(out_len <= 8);
4859 	for (unsigned int i = 0; i < out_len; i++) {
4860 		if (wchar_buf[i] == MBFL_BAD_INPUT) {
4861 			return false;
4862 		}
4863 	}
4864 
4865 	while (length) {
4866 		out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4867 		ZEND_ASSERT(out_len <= 128);
4868 		for (unsigned int i = 0; i < out_len; i++) {
4869 			if (wchar_buf[i] == MBFL_BAD_INPUT) {
4870 				return false;
4871 			}
4872 		}
4873 	}
4874 
4875 	return true;
4876 }
4877 
4878 /* MSVC 32-bit has issues with 64-bit intrinsics.
4879  * (Bad 7/8-byte UTF-8 strings would be wrongly passed through as 'valid')
4880  * It seems this is caused by a bug in MS Visual C++
4881  * Ref: https://stackoverflow.com/questions/37509129/potential-bug-in-visual-studio-c-compiler-or-in-intel-intrinsics-avx2-mm256-s */
4882 #if defined(PHP_WIN32) && !defined(__clang__) && defined(_MSC_VER) && defined(_M_IX86)
4883 # define MBSTRING_BROKEN_X86_MSVC_INTRINSICS
4884 #endif
4885 
4886 /* If we are building an AVX2-only binary, don't compile the next function */
4887 #ifndef ZEND_INTRIN_AVX2_NATIVE
4888 
4889 /* SSE2-based function for validating UTF-8 strings
4890  * A faster implementation which uses AVX2 instructions follows */
/* Returns true if the content of `str` is valid UTF-8, and false otherwise.
 *
 * When __SSE2__ is defined, the string is examined in 16-byte blocks, and the bytes left
 * over after the last full block are loaded into a zero-padded register and run through
 * the same checks. Without __SSE2__, a byte-at-a-time loop derived from PCRE2 is used.
 *
 * Both variants reject bytes 0xF5-0xFF, overlong encodings, surrogates (U+D800-DFFF),
 * codepoints above U+10FFFF, and misplaced or missing continuation bytes. */
mb_fast_check_utf8_default(zend_string * str)4891 static bool mb_fast_check_utf8_default(zend_string *str)
4892 {
4893 	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
4894 # ifdef __SSE2__
4895 	/* `e` points 1 byte past the last full 16-byte block of string content
4896 	 * Note that we include the terminating null byte which is included in each zend_string
4897 	 * as part of the content to check; this ensures that multi-byte characters which are
4898 	 * truncated abruptly at the end of the string will be detected as invalid */
4899 	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m128i) - 1));
4900 
4901 	/* For checking for illegal bytes 0xF5-FF */
4902 	const __m128i over_f5 = _mm_set1_epi8(-117);
4903 	/* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
4904 	const __m128i over_9f = _mm_set1_epi8(-97);
4905 	/* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
4906 	const __m128i over_8f = _mm_set1_epi8(-113);
4907 	/* For checking for illegal bytes 0xC0-C1 */
4908 	const __m128i find_c0 = _mm_set1_epi8(-64);
4909 	const __m128i c0_to_c1 = _mm_set1_epi8(-126);
4910 	/* For checking structure of continuation bytes */
4911 	const __m128i find_e0 = _mm_set1_epi8(-32);
4912 	const __m128i find_f0 = _mm_set1_epi8(-16);
4913 
	/* `last_block` is the most recent block which went through the full set of checks
	 * below; it starts out as all zeroes, and is NOT updated while skipping over
	 * blocks which contain only single-byte characters */
4914 	__m128i last_block = _mm_setzero_si128();
4915 	__m128i operand;
4916 
4917 	while (p < e) {
4918 		operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */
4919 
	/* The tail-handling code at the bottom of this function jumps back here with
	 * `operand` holding the left-over bytes, zero-padded to 16 bytes */
4920 check_operand:
4921 		/* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
4922 		if (!_mm_movemask_epi8(operand)) {
4923 			/* Even if this block only contains single-byte characters, there may have been a
4924 			 * multi-byte character at the end of the previous block, which was supposed to
4925 			 * have continuation bytes in this block
4926 			 * This bitmask will pick out a 2/3/4-byte character starting from the last byte of
4927 			 * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
4928 			 * from the 3rd last */
4929 			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
4930 			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
4931 			if (_mm_movemask_epi8(bad)) {
4932 				return false;
4933 			}
4934 
4935 			/* Consume as many full blocks of single-byte characters as we can */
4936 			while (true) {
4937 				p += sizeof(__m128i);
4938 				if (p >= e) {
4939 					goto finish_up_remaining_bytes;
4940 				}
4941 				operand = _mm_loadu_si128((__m128i*)p);
4942 				if (_mm_movemask_epi8(operand)) {
4943 					break;
4944 				}
4945 			}
4946 		}
4947 
4948 		/* Check for >= 0xF5, which are illegal byte values in UTF-8
4949 		 * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
4950 		 * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
4951 		 * Then a single signed compare will pick out any bad bytes
4952 		 * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
4953 		__m128i bad = _mm_cmplt_epi8(_mm_add_epi8(operand, over_f5), over_f5);
4954 
4955 		/* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
4956 		 * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
4957 		 * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
4958 		 * We can check for both problems at once by generating a vector where each byte < 0xA0
4959 		 * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
4960 		 * Build `operand2`, in which each position holds the byte which comes immediately
		 * before that position in the input (position 0 takes the last byte of `last_block`),
		 * and compare `operand2` with the bitmask */
4961 		__m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15));
4962 		__m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f)));
4963 		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask1));
4964 
4965 		/* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
4966 		 * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
4967 		 * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
4968 		 * Build the bitmask and compare it with the shifted block */
4969 		__m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f)));
4970 		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask2));
4971 
4972 		/* Check for overlong 2-byte code units
4973 		 * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
4974 		 * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
4975 		 * byte range, do a signed compare to pick out any bad bytes */
4976 		bad = _mm_or_si128(bad, _mm_cmplt_epi8(_mm_add_epi8(operand, find_c0), c0_to_c1));
4977 
4978 		/* Check structure of continuation bytes
4979 		 * A UTF-8 byte should be a continuation byte if, and only if, it is:
4980 		 * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
4981 		 * 2) 2 bytes after the start of a 3-byte or 4-byte character
4982 		 * 3) 3 bytes after the start of a 4-byte character
4983 		 * We build 3 bitmasks with 0xFF in each such position, and OR them together to
4984 		 * get a single bitmask with 0xFF in each position where a continuation byte should be
		 * (`operand3` and `operand4` are built like `operand2`, but hold the byte which
		 * comes 2 and 3 positions earlier in the input, respectively) */
4985 		__m128i cont_mask = _mm_cmpeq_epi8(_mm_and_si128(operand2, find_c0), find_c0);
4986 		__m128i operand3 = _mm_or_si128(_mm_slli_si128(operand, 2), _mm_srli_si128(last_block, 14));
4987 		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand3, find_e0), find_e0));
4988 		__m128i operand4 = _mm_or_si128(_mm_slli_si128(operand, 3), _mm_srli_si128(last_block, 13));
4989 		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand4, find_f0), find_f0));
4990 
4991 		/* Now, use a signed comparison to get another bitmask with 0xFF in each position where
4992 		 * a continuation byte actually is
4993 		 * XOR those two bitmasks together; if everything is good, the result should be zero
4994 		 * However, if a byte which should have been a continuation wasn't, or if a byte which
4995 		 * shouldn't have been a continuation was, we will get 0xFF in that position */
4996 		__m128i continuation = _mm_cmplt_epi8(operand, find_c0);
4997 		bad = _mm_or_si128(bad, _mm_xor_si128(continuation, cont_mask));
4998 
4999 		/* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
5000 		 * If that value is non-zero, then we found a bad byte somewhere! */
5001 		if (_mm_movemask_epi8(bad)) {
5002 			return false;
5003 		}
5004 
5005 		last_block = operand;
5006 		p += sizeof(__m128i);
5007 	}
5008 
5009 finish_up_remaining_bytes:
5010 	/* Finish up the bytes which remain after the last full block (if any)
	 * We only get inside this `if` once: after the left-over bytes have been checked via
	 * `check_operand`, `p` has moved past `e`, and we fall through to `return true` */
5011 	if (p == e) {
5012 		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m128i) - 1); /* Not including terminating null */
5013 
5014 		/* Crazy hack here for the cases which cannot be covered by one 2, 4, or 8-byte read
		 * (5, 6, or 9-14 bytes remaining)...
5015 		 * We want to use the above vectorized code to check a block of less than 16 bytes,
5016 		 * but there is no good way to read a variable number of bytes into an XMM register
5017 		 * However, we know that these bytes are part of a zend_string, and a zend_string has some
5018 		 * 'header' fields which occupy the memory just before its content
5019 		 * And, those header fields occupy more than 16 bytes...
5020 		 * So if we go back 16 bytes from the end of the zend_string content, and load 16 bytes from there,
5021 		 * we may pick up some 'junk' bytes from the zend_string header fields, but we will get the 1-15
5022 		 * bytes we wanted in the tail end of our XMM register, and this will never cause a segfault.
5023 		 * Then, we do a byte shift (`_mm_srli_si128`, which moves the wanted bytes down to the
		 * start of the register) to get rid of the unwanted bytes
5024 		 * Conveniently, the same shift also zero-fills the tail end of the XMM register
5025 		 *
5026 		 * The following `switch` looks useless, but it's not
5027 		 * The PSRLDQ instruction used for the 128-bit shift requires an immediate (literal)
5028 		 * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
5029 		 */
5030 		switch (remaining_bytes) {
5031 		case 0: ;
			/* The terminating null is not part of any block which we have checked, so make
			 * sure that `last_block` did not end with the start of a multi-byte character */
5032 			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
5033 			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
5034 			return _mm_movemask_epi8(bad) == 0;
5035 		case 1:
5036 		case 2:
			/* Read 2 bytes; all other bytes of the register are set to zero */
5037 			operand = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, *((uint16_t*)p));
5038 			goto check_operand;
5039 		case 3:
5040 		case 4:
			/* Read 4 bytes; all other bytes of the register are set to zero */
5041 			operand = _mm_set_epi32(0, 0, 0, *((uint32_t*)p));
5042 			goto check_operand;
5043 		case 5:
5044 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10);
5045 			goto check_operand;
5046 		case 6:
5047 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9);
5048 			goto check_operand;
5049 		case 7:
5050 		case 8:
			/* Read 8 bytes; all other bytes of the register are set to zero */
5051 #ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
5052 			operand = _mm_set_epi32(0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
5053 #else
5054 			operand = _mm_set_epi64x(0, *((uint64_t*)p));
5055 #endif
5056 			goto check_operand;
5057 		case 9:
5058 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6);
5059 			goto check_operand;
5060 		case 10:
5061 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5);
5062 			goto check_operand;
5063 		case 11:
5064 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4);
5065 			goto check_operand;
5066 		case 12:
5067 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3);
5068 			goto check_operand;
5069 		case 13:
5070 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2);
5071 			goto check_operand;
5072 		case 14:
5073 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1);
5074 			goto check_operand;
5075 		case 15:
5076 			/* No trailing bytes are left which need to be checked
5077 			 * We get 15 because we did not include the terminating null when
5078 			 * calculating `remaining_bytes`, so the value wraps around */
5079 			return true;
5080 		}
5081 
5082 		ZEND_UNREACHABLE();
5083 	}
5084 
5085 	return true;
5086 # else
5087 	/* This UTF-8 validation function is derived from PCRE2 */
5088 	size_t length = ZSTR_LEN(str);
5089 	/* Table of the number of extra bytes, indexed by the first byte masked with
5090 	0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
5091 	static const uint8_t utf8_table[] = {
5092 		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
5093 		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
5094 		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
5095 		3,3,3,3,3,3,3,3
5096 	};
5097 
	/* `length` counts the bytes which have not been consumed yet; `p` always points at
	 * the first byte of the character being examined when an iteration starts */
5098 	for (; length > 0; p++) {
5099 		uint32_t d;
5100 		unsigned char c = *p;
5101 		length--;
5102 
5103 		if (c < 128) {
5104 			/* ASCII character */
5105 			continue;
5106 		}
5107 
5108 		if (c < 0xc0) {
5109 			/* Isolated 10xx xxxx byte */
5110 			return false;
5111 		}
5112 
5113 		if (c >= 0xf5) {
			/* 0xF5-0xFF can never appear in UTF-8 */
5114 			return false;
5115 		}
5116 
5117 		uint32_t ab = utf8_table[c & 0x3f]; /* Number of additional bytes (1-3) */
5118 		if (length < ab) {
5119 			/* Missing bytes */
5120 			return false;
5121 		}
5122 		length -= ab;
5123 
5124 		/* Check top bits in the second byte */
5125 		if (((d = *(++p)) & 0xc0) != 0x80) {
5126 			return false;
5127 		}
5128 
5129 		/* For each length, check that the remaining bytes start with the 0x80 bit
5130 		 * set and not the 0x40 bit. Then check for an overlong sequence, and for the
5131 		 * excluded range 0xd800 to 0xdfff. */
5132 		switch (ab) {
5133 		case 1:
5134 			/* 2-byte character. No further bytes to check for 0x80. Check first byte
5135 			 * for xx00 000x (overlong sequence). */
5136 			if ((c & 0x3e) == 0) {
5137 				return false;
5138 			}
5139 			break;
5140 
5141 		case 2:
5142 			/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes for
5143 			 * 1110 0000, xx0x xxxx (overlong sequence) or 1110 1101, 1010 xxxx (0xd800-0xdfff) */
5144 			if ((*(++p) & 0xc0) != 0x80 || (c == 0xe0 && (d & 0x20) == 0) || (c == 0xed && d >= 0xa0)) {
5145 				return false;
5146 			}
5147 			break;
5148 
5149 		case 3:
5150 			/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
5151 			 * bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
5152 			 * character greater than 0x0010ffff (f4 8f bf bf) */
5153 			if ((*(++p) & 0xc0) != 0x80 || (*(++p) & 0xc0) != 0x80 || (c == 0xf0 && (d & 0x30) == 0) || (c > 0xf4 || (c == 0xf4 && d > 0x8f))) {
5154 				return false;
5155 			}
5156 			break;
5157 
5158 			EMPTY_SWITCH_DEFAULT_CASE();
5159 		}
5160 	}
5161 
5162 	return true;
5163 # endif
5164 }
5165 
5166 #endif /* #ifndef ZEND_INTRIN_AVX2_NATIVE */
5167 
5168 #ifdef ZEND_INTRIN_AVX2_NATIVE
5169 
5170 /* We are building AVX2-only binary */
5171 # include <immintrin.h>
5172 # define mb_fast_check_utf8 mb_fast_check_utf8_avx2
5173 
5174 #elif defined(ZEND_INTRIN_AVX2_RESOLVER)
5175 
5176 /* We are building binary which works with or without AVX2; whether or not to use
5177  * AVX2-accelerated functions will be determined at runtime */
5178 # include <immintrin.h>
5179 # include "Zend/zend_cpuinfo.h"
5180 
5181 # ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
5182 /* Dynamic linker will decide whether or not to use AVX2-based functions and
5183  * resolve symbols accordingly */
5184 
5185 ZEND_INTRIN_AVX2_FUNC_DECL(bool mb_fast_check_utf8_avx2(zend_string *str));
5186 
5187 bool mb_fast_check_utf8(zend_string *str) __attribute__((ifunc("resolve_check_utf8")));
5188 
5189 typedef bool (*check_utf8_func_t)(zend_string*);
5190 
5191 ZEND_NO_SANITIZE_ADDRESS
5192 ZEND_ATTRIBUTE_UNUSED
resolve_check_utf8(void)5193 static check_utf8_func_t resolve_check_utf8(void)
5194 {
5195 	if (zend_cpu_supports_avx2()) {
5196 		return mb_fast_check_utf8_avx2;
5197 	}
5198 	return mb_fast_check_utf8_default;
5199 }
5200 
5201 # else /* ZEND_INTRIN_AVX2_FUNC_PTR */
5202 /* We are compiling for a target where the dynamic linker will not be able to
5203  * resolve symbols according to whether the host supports AVX2 or not; so instead,
5204  * we can make calls go through a function pointer and set the function pointer
5205  * on module load */
5206 
5207 #ifdef HAVE_FUNC_ATTRIBUTE_TARGET
5208 static bool mb_fast_check_utf8_avx2(zend_string *str) __attribute__((target("avx2")));
5209 #else
5210 static bool mb_fast_check_utf8_avx2(zend_string *str);
5211 #endif
5212 
5213 static bool (*check_utf8_ptr)(zend_string *str) = NULL;
5214 
/* Forward the call to whichever implementation `check_utf8_ptr` currently
 * points at. The pointer starts out NULL and is assigned by init_check_utf8(). */
static bool mb_fast_check_utf8(zend_string *str)
{
	bool (*impl)(zend_string *str) = check_utf8_ptr;

	return impl(str);
}
5219 
init_check_utf8(void)5220 static void init_check_utf8(void)
5221 {
5222 	if (zend_cpu_supports_avx2()) {
5223 		check_utf8_ptr = mb_fast_check_utf8_avx2;
5224 	} else {
5225 		check_utf8_ptr = mb_fast_check_utf8_default;
5226 	}
5227 }
5228 # endif
5229 
5230 #else
5231 
5232 /* No AVX2 support */
5233 #define mb_fast_check_utf8 mb_fast_check_utf8_default
5234 
5235 #endif
5236 
5237 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
5238 
5239 /* GCC prior to version 8 does not define all intrinsics. See GH-11514.
5240  * Use a workaround from https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values */
5241 #if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 8
5242 # define _mm256_set_m128i(v0, v1)  _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
5243 #endif
5244 
/* Take `hi` (the previous 32-byte block) and `lo` (the current 32-byte block)
 * and return the last `shift` bytes of `hi` followed by the first
 * `32 - shift` bytes of `lo`.
 * This is used to take some number of trailing bytes from the previous 32-byte
 * block followed by some number of leading bytes from the current 32-byte block
 *
 * _mm256_alignr_epi8 (VPALIGNR) is used to shift out bytes from a 256-bit
 * YMM register while shifting in bytes from another YMM register... but
 * it works separately on respective 128-bit halves of the YMM registers,
 * which is not what we want.
 * To make it work as desired, we first do _mm256_permute2x128_si256
 * (VPERM2I128, selector 33 == 0x21) to combine the high 128 bits of the
 * previous block (placed in the low half) and the low 128 bits of the current
 * block (placed in the high half) in one YMM register.
 * Then VPALIGNR will do what is needed.
 *
 * Every argument is parenthesized in the expansion so that passing an
 * expression (rather than a plain identifier or literal) cannot change its
 * meaning; the callers in this file pass the literals 1, 2 and 3 as `shift`. */
#define _mm256_shift_epi8(hi, lo, shift) _mm256_alignr_epi8((lo), _mm256_permute2x128_si256((hi), (lo), 33), 16 - (shift))
5259 
5260 /* AVX2-based UTF-8 validation function; validates text in 32-byte chunks
5261  *
5262  * Some parts of this function are the same as `mb_fast_check_utf8`; code comments
5263  * are not repeated, so consult `mb_fast_check_utf8` for information on uncommented
5264  * sections. */
5265 #ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
mb_fast_check_utf8_avx2(zend_string * str)5266 ZEND_API bool mb_fast_check_utf8_avx2(zend_string *str)
5267 #else
5268 static bool mb_fast_check_utf8_avx2(zend_string *str)
5269 #endif
5270 {
5271 	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
5272 	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m256i) - 1));
5273 
5274 	/* The algorithm used here for UTF-8 validation is partially adapted from the
5275 	 * paper "Validating UTF-8 In Less Than One Instruction Per Byte", by John Keiser
5276 	 * and Daniel Lemire.
5277 	 * Ref: https://arxiv.org/pdf/2010.03090.pdf
5278 	 *
5279 	 * Most types of invalid UTF-8 text can be detected by examining pairs of
5280 	 * successive bytes. Specifically:
5281 	 *
5282 	 * • Overlong 2-byte code units start with 0xC0 or 0xC1.
5283 	 *   No valid UTF-8 string ever uses these byte values.
5284 	 * • Overlong 3-byte code units start with 0xE0, followed by a byte < 0xA0.
5285 	 * • Overlong 4-byte code units start with 0xF0, followed by a byte < 0x90.
5286 	 * • 5-byte or 6-byte code units, which should never be used, start with
5287 	 *   0xF8-FE.
5288 	 * • A codepoint value higher than U+10FFFF, which is the highest value for
5289 	 *   any Unicode codepoint, would either start with 0xF4, followed by a
5290 	 *   byte >= 0x90, or else would start with 0xF5-F7, followed by any value.
5291 	 * • A codepoint value from U+D800-DFFF, which are reserved and should never
5292 	 *   be used, would start with 0xED, followed by a byte >= 0xA0.
5293 	 * • The byte value 0xFF is also illegal and is never used in valid UTF-8.
5294 	 *
5295 	 * To detect all these problems, for each pair of successive bytes, we do
5296 	 * table lookups using the high nibble of the first byte, the low nibble of
5297 	 * the first byte, and the high nibble of the second byte. Each table lookup
5298 	 * retrieves a bitmask, in which each 1 bit indicates a possible invalid
5299 	 * combination; AND those three bitmasks together, and any 1 bit in the result
5300 	 * will indicate an actual invalid byte combination was found.
5301 	 */
5302 
5303 #define BAD_BYTE 0x1
5304 #define OVERLONG_2BYTE 0x2
5305 #define _1BYTE (BAD_BYTE | OVERLONG_2BYTE)
5306 #define OVERLONG_3BYTE 0x4
5307 #define SURROGATE 0x8
5308 #define OVERLONG_4BYTE 0x10
5309 #define INVALID_CP 0x20
5310 
5311 	/* Each of these are 16-entry tables, repeated twice; this is required by the
5312 	 * VPSHUFB instruction which we use to perform 32 table lookups in parallel
5313 	 * The first entry is for 0xF, the second is for 0xE, and so on down to 0x0
5314 	 *
5315 	 * So, for example, notice that the 4th entry in the 1st table is OVERLONG_2BYTE;
5316 	 * that means that high nibble 0xC is consistent with the byte pair being part of
5317 	 * an overlong 2-byte code unit */
5318 	const __m256i bad_hi_nibble2 = _mm256_set_epi8(
5319 		BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5320 		0, 0, 0, 0,
5321 		0, 0, 0, 0,
5322 		0, 0, 0, 0,
5323 		BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5324 		0, 0, 0, 0,
5325 		0, 0, 0, 0,
5326 		0, 0, 0, 0);
5327 	const __m256i bad_lo_nibble2 = _mm256_set_epi8(
5328 		BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5329 		BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5330 		BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5331 		0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5332 		BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5333 		BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5334 		BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5335 		0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5336 	const __m256i bad_hi_nibble = _mm256_set_epi8(
5337 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5338 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5339 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5340 		_1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5341 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5342 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5343 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5344 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5345 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5346 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5347 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5348 		_1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5349 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5350 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5351 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5352 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5353 
5354 	const __m256i find_continuation = _mm256_set1_epi8(-64);
5355 	const __m256i _b = _mm256_set1_epi8(0xB);
5356 	const __m256i _d = _mm256_set1_epi8(0xD);
5357 	const __m256i _f = _mm256_set1_epi8(0xF);
5358 
5359 	__m256i last_hi_nibbles = _mm256_setzero_si256(), last_lo_nibbles = _mm256_setzero_si256();
5360 	__m256i operand;
5361 
5362 	while (p < e) {
5363 		operand = _mm256_loadu_si256((__m256i*)p);
5364 
5365 check_operand:
5366 		if (!_mm256_movemask_epi8(operand)) {
5367 			/* Entire 32-byte block is ASCII characters; the only thing we need to validate is that
5368 			 * the previous block didn't end with an incomplete multi-byte character
5369 			 * (This will also confirm that the previous block didn't end with a bad byte like 0xFF) */
5370 			__m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5371 			__m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5372 			if (_mm256_movemask_epi8(bad)) {
5373 				return false;
5374 			}
5375 
5376 			/* Consume as many full blocks of single-byte characters as we can */
5377 			while (true) {
5378 				p += sizeof(__m256i);
5379 				if (p >= e) {
5380 					goto finish_up_remaining_bytes;
5381 				}
5382 				operand = _mm256_loadu_si256((__m256i*)p);
5383 				if (_mm256_movemask_epi8(operand)) {
5384 					break;
5385 				}
5386 			}
5387 		}
5388 
5389 		__m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(operand, 4), _f);
5390 		__m256i lo_nibbles = _mm256_and_si256(operand, _f);
5391 
5392 		__m256i lo_nibbles2 = _mm256_shift_epi8(last_lo_nibbles, lo_nibbles, 1);
5393 		__m256i hi_nibbles2 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 1);
5394 
5395 		/* Do parallel table lookups in all 3 tables */
5396 		__m256i bad = _mm256_cmpgt_epi8(
5397 			_mm256_and_si256(
5398 				_mm256_and_si256(
5399 					_mm256_shuffle_epi8(bad_lo_nibble2, lo_nibbles2),
5400 					_mm256_shuffle_epi8(bad_hi_nibble2, hi_nibbles2)),
5401 				_mm256_shuffle_epi8(bad_hi_nibble, hi_nibbles)),
5402 			_mm256_setzero_si256());
5403 
5404 		__m256i cont_mask = _mm256_cmpgt_epi8(hi_nibbles2, _b);
5405 		__m256i hi_nibbles3 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 2);
5406 		cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpgt_epi8(hi_nibbles3, _d));
5407 		__m256i hi_nibbles4 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 3);
5408 		cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpeq_epi8(hi_nibbles4, _f));
5409 
5410 		__m256i continuation = _mm256_cmpgt_epi8(find_continuation, operand);
5411 		bad = _mm256_or_si256(bad, _mm256_xor_si256(continuation, cont_mask));
5412 
5413 		if (_mm256_movemask_epi8(bad)) {
5414 			return false;
5415 		}
5416 
5417 		last_hi_nibbles = hi_nibbles;
5418 		last_lo_nibbles = lo_nibbles;
5419 		p += sizeof(__m256i);
5420 	}
5421 
5422 finish_up_remaining_bytes:
5423 	if (p == e) {
5424 		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m256i) - 1); /* Not including terminating null */
5425 
5426 		switch (remaining_bytes) {
5427 		case 0: ;
5428 			/* No actual data bytes are remaining */
5429 			__m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5430 			__m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5431 			return _mm256_movemask_epi8(bad) == 0;
5432 		case 1:
5433 		case 2:
5434 			operand = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, *((int16_t*)p));
5435 			goto check_operand;
5436 		case 3:
5437 		case 4:
5438 			operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, *((int32_t*)p));
5439 			goto check_operand;
5440 		case 5:
5441 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10));
5442 			goto check_operand;
5443 		case 6:
5444 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9));
5445 			goto check_operand;
5446 		case 7:
5447 		case 8:
5448 #ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
5449 			operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
5450 #else
5451 			operand = _mm256_set_epi64x(0, 0, 0, *((int64_t*)p));
5452 #endif
5453 			goto check_operand;
5454 		case 9:
5455 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6));
5456 			goto check_operand;
5457 		case 10:
5458 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5));
5459 			goto check_operand;
5460 		case 11:
5461 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4));
5462 			goto check_operand;
5463 		case 12:
5464 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3));
5465 			goto check_operand;
5466 		case 13:
5467 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2));
5468 			goto check_operand;
5469 		case 14:
5470 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1));
5471 			goto check_operand;
5472 		case 15:
5473 		case 16:
5474 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_loadu_si128((__m128i*)p));
5475 			goto check_operand;
5476 		case 17:
5477 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 2)), 14), _mm_loadu_si128((__m128i*)p));
5478 			goto check_operand;
5479 		case 18:
5480 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 3)), 13), _mm_loadu_si128((__m128i*)p));
5481 			goto check_operand;
5482 		case 19:
5483 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 4)), 12), _mm_loadu_si128((__m128i*)p));
5484 			goto check_operand;
5485 		case 20:
5486 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 5)), 11), _mm_loadu_si128((__m128i*)p));
5487 			goto check_operand;
5488 		case 21:
5489 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 6)), 10), _mm_loadu_si128((__m128i*)p));
5490 			goto check_operand;
5491 		case 22:
5492 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 7)), 9), _mm_loadu_si128((__m128i*)p));
5493 			goto check_operand;
5494 		case 23:
5495 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 8)), 8), _mm_loadu_si128((__m128i*)p));
5496 			goto check_operand;
5497 		case 24:
5498 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 9)), 7), _mm_loadu_si128((__m128i*)p));
5499 			goto check_operand;
5500 		case 25:
5501 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 10)), 6), _mm_loadu_si128((__m128i*)p));
5502 			goto check_operand;
5503 		case 26:
5504 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 11)), 5), _mm_loadu_si128((__m128i*)p));
5505 			goto check_operand;
5506 		case 27:
5507 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 12)), 4), _mm_loadu_si128((__m128i*)p));
5508 			goto check_operand;
5509 		case 28:
5510 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 13)), 3), _mm_loadu_si128((__m128i*)p));
5511 			goto check_operand;
5512 		case 29:
5513 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 14)), 2), _mm_loadu_si128((__m128i*)p));
5514 			goto check_operand;
5515 		case 30:
5516 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 15)), 1), _mm_loadu_si128((__m128i*)p));
5517 			goto check_operand;
5518 		case 31:
5519 			return true;
5520 		}
5521 
5522 		ZEND_UNREACHABLE();
5523 	}
5524 
5525 	return true;
5526 }
5527 
5528 #endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
5529 
/* Check whether `str` is valid in `encoding`.
 *
 * UTF-8 is special-cased: a string already flagged as valid UTF-8 is accepted
 * without scanning, otherwise mb_fast_check_utf8() is run and, on success, the
 * IS_STR_VALID_UTF8 flag is set on the string unless it is interned.
 * Every other encoding is delegated to php_mb_check_encoding(). */
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
{
	if (encoding != &mbfl_encoding_utf8) {
		return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
	}

	if (ZSTR_IS_VALID_UTF8(str)) {
		return true;
	}

	if (!mb_fast_check_utf8(str)) {
		return false;
	}

	/* Remember the positive result so later checks can skip the scan */
	if (!ZSTR_IS_INTERNED(str)) {
		GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
	}
	return true;
}
5545 
/* Check that every string key and every value of `vars` is valid in `encoding`,
 * descending into nested arrays.
 *
 * Returns true only if all keys, all string values and all nested arrays pass.
 * Integer, float, null and boolean values are accepted as-is; a value of any
 * other type makes the result false. If `vars` is already marked as being
 * visited (circular reference), a warning is emitted and false is returned.
 *
 * Scanning stops at the first failure: once the answer is known to be false
 * there is no point in validating (or recursing into) the remaining entries. */
static bool php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	zend_long idx;
	zend_string *key;
	zval *entry;
	bool valid = true;

	(void)(idx); /* Suppress spurious compiler warning that `idx` is not used */

	if (GC_IS_RECURSIVE(vars)) {
		/* NOTE(review): the message contains a doubled "not"; it is user-visible
		 * text, so it is deliberately left untouched here */
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return false;
	}
	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
		ZVAL_DEREF(entry);
		if (key && !mb_check_str_encoding(key, encoding)) {
			valid = false;
			break;
		}
		/* `valid` is always true at this point, so plain assignment is enough */
		switch (Z_TYPE_P(entry)) {
			case IS_STRING:
				valid = mb_check_str_encoding(Z_STR_P(entry), encoding);
				break;
			case IS_ARRAY:
				valid = php_mb_check_encoding_recursive(Z_ARRVAL_P(entry), encoding);
				break;
			case IS_LONG:
			case IS_DOUBLE:
			case IS_NULL:
			case IS_TRUE:
			case IS_FALSE:
				break;
			default:
				/* Other types are error. */
				valid = false;
				break;
		}
		if (!valid) {
			/* A `break` inside the switch only leaves the switch; leave the
			 * loop here, still passing through the unprotect call below */
			break;
		}
	} ZEND_HASH_FOREACH_END();
	GC_TRY_UNPROTECT_RECURSION(vars);
	return valid;
}
5596 
5597 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)5598 PHP_FUNCTION(mb_check_encoding)
5599 {
5600 	zend_string *input_str = NULL, *enc = NULL;
5601 	HashTable *input_ht = NULL;
5602 	const mbfl_encoding *encoding;
5603 
5604 	ZEND_PARSE_PARAMETERS_START(0, 2)
5605 		Z_PARAM_OPTIONAL
5606 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
5607 		Z_PARAM_STR_OR_NULL(enc)
5608 	ZEND_PARSE_PARAMETERS_END();
5609 
5610 	encoding = php_mb_get_encoding(enc, 2);
5611 	if (!encoding) {
5612 		RETURN_THROWS();
5613 	}
5614 
5615 	if (input_ht) {
5616 		RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
5617 	} else if (input_str) {
5618 		RETURN_BOOL(mb_check_str_encoding(input_str, encoding));
5619 	} else {
5620 		php_error_docref(NULL, E_DEPRECATED,
5621 			"Calling mb_check_encoding() without argument is deprecated");
5622 
5623 		/* FIXME: Actually check all inputs, except $_FILES file content. */
5624 		RETURN_BOOL(MBSTRG(illegalchars) == 0);
5625 	}
5626 }
5627 /* }}} */
5628 
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)5629 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
5630 	const uint32_t enc_name_arg_num)
5631 {
5632 	const mbfl_encoding *enc;
5633 	enum mbfl_no_encoding no_enc;
5634 
5635 	ZEND_ASSERT(str_len > 0);
5636 
5637 	enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
5638 	if (!enc) {
5639 		return -2;
5640 	}
5641 
5642 	no_enc = enc->no_encoding;
5643 	if (php_mb_is_unsupported_no_encoding(no_enc)) {
5644 		zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
5645 		return -2;
5646 	}
5647 
5648 	/* Some legacy text encodings have a minimum required wchar buffer size;
5649 	 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
5650 	uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
5651 	unsigned int state = 0;
5652 	size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
5653 	ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
5654 
5655 	if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
5656 		return -1;
5657 	}
5658 	return wchar_buf[0];
5659 }
5660 
5661 /* {{{ */
PHP_FUNCTION(mb_ord)5662 PHP_FUNCTION(mb_ord)
5663 {
5664 	char *str;
5665 	size_t str_len;
5666 	zend_string *enc = NULL;
5667 	zend_long cp;
5668 
5669 	ZEND_PARSE_PARAMETERS_START(1, 2)
5670 		Z_PARAM_STRING(str, str_len)
5671 		Z_PARAM_OPTIONAL
5672 		Z_PARAM_STR_OR_NULL(enc)
5673 	ZEND_PARSE_PARAMETERS_END();
5674 
5675 	if (str_len == 0) {
5676 		zend_argument_must_not_be_empty_error(1);
5677 		RETURN_THROWS();
5678 	}
5679 
5680 	cp = php_mb_ord(str, str_len, enc, 2);
5681 
5682 	if (0 > cp) {
5683 		if (cp == -2) {
5684 			RETURN_THROWS();
5685 		}
5686 		RETURN_FALSE;
5687 	}
5688 
5689 	RETURN_LONG(cp);
5690 }
5691 /* }}} */
5692 
/* Encode codepoint `cp` as a string in the encoding named by `enc_name`.
 *
 * Returns NULL if the encoding lookup fails, if the encoding is rejected by
 * php_mb_is_unsupported_no_encoding() (after calling zend_value_error()), if
 * `cp` lies outside 0..0x10ffff, if the target is UTF-8 and `cp` lies in
 * 0xd800..0xdfff, or if converting from UCS-4BE left MBSTRG(illegalchars)
 * non-zero. `enc_name_arg_num` is passed through to php_mb_get_encoding(). */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (enc == NULL) {
		return NULL;
	}

	enum mbfl_no_encoding no_enc = enc->no_encoding;
	if (php_mb_is_unsupported_no_encoding(no_enc)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", enc->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	zend_string *ret;

	if (php_mb_is_no_encoding_utf8(no_enc)) {
		/* UTF-8 output is produced directly, without a conversion pass */
		if (cp >= 0xd800 && cp <= 0xdfff) {
			return NULL;
		}
		if (cp < 0x80) {
			return ZSTR_CHAR(cp);
		}

		char *out;
		if (cp < 0x800) {
			ret = zend_string_alloc(2, 0);
			out = ZSTR_VAL(ret);
			out[0] = 0xc0 | (cp >> 6);
			out[1] = 0x80 | (cp & 0x3f);
		} else if (cp < 0x10000) {
			ret = zend_string_alloc(3, 0);
			out = ZSTR_VAL(ret);
			out[0] = 0xe0 | (cp >> 12);
			out[1] = 0x80 | ((cp >> 6) & 0x3f);
			out[2] = 0x80 | (cp & 0x3f);
		} else {
			ret = zend_string_alloc(4, 0);
			out = ZSTR_VAL(ret);
			out[0] = 0xf0 | (cp >> 18);
			out[1] = 0x80 | ((cp >> 12) & 0x3f);
			out[2] = 0x80 | ((cp >> 6) & 0x3f);
			out[3] = 0x80 | (cp & 0x3f);
		}
		out[ZSTR_LEN(ret)] = 0;
		return ret;
	}

	/* Any other encoding: serialize the codepoint as big-endian UCS-4 and
	 * convert that to the target encoding */
	char buf[4];
	for (int i = 0; i < 4; i++) {
		buf[i] = (cp >> (24 - 8 * i)) & 0xff;
	}

	/* Zero the illegal-character counter around the conversion so a failure of
	 * this particular conversion can be detected, then put the old value back */
	long orig_illegalchars = MBSTRG(illegalchars);
	MBSTRG(illegalchars) = 0;
	ret = php_mb_convert_encoding_ex(buf, 4, enc, &mbfl_encoding_ucs4be);
	bool conversion_failed = MBSTRG(illegalchars) != 0;
	MBSTRG(illegalchars) = orig_illegalchars;

	if (conversion_failed) {
		zend_string_release(ret);
		return NULL;
	}
	return ret;
}
5762 
5763 /* {{{ */
PHP_FUNCTION(mb_chr)5764 PHP_FUNCTION(mb_chr)
5765 {
5766 	zend_long cp;
5767 	zend_string *enc = NULL;
5768 
5769 	ZEND_PARSE_PARAMETERS_START(1, 2)
5770 		Z_PARAM_LONG(cp)
5771 		Z_PARAM_OPTIONAL
5772 		Z_PARAM_STR_OR_NULL(enc)
5773 	ZEND_PARSE_PARAMETERS_END();
5774 
5775 	zend_string* ret = php_mb_chr(cp, enc, 2);
5776 	if (ret == NULL) {
5777 		RETURN_FALSE;
5778 	}
5779 
5780 	RETURN_STR(ret);
5781 }
5782 /* }}} */
5783 
PHP_FUNCTION(mb_str_pad)5784 PHP_FUNCTION(mb_str_pad)
5785 {
5786 	zend_string *input, *encoding_str = NULL, *pad = ZSTR_CHAR(' ');
5787 	zend_long pad_to_length;
5788 	zend_long pad_type_val = PHP_STR_PAD_RIGHT;
5789 
5790 	ZEND_PARSE_PARAMETERS_START(2, 5)
5791 		Z_PARAM_STR(input)
5792 		Z_PARAM_LONG(pad_to_length)
5793 		Z_PARAM_OPTIONAL
5794 		Z_PARAM_STR(pad)
5795 		Z_PARAM_LONG(pad_type_val)
5796 		Z_PARAM_STR_OR_NULL(encoding_str)
5797 	ZEND_PARSE_PARAMETERS_END();
5798 
5799 	const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
5800 	if (!encoding) {
5801 		RETURN_THROWS();
5802 	}
5803 
5804 	size_t input_length = mb_get_strlen(input, encoding);
5805 
5806 	/* If resulting string turns out to be shorter than input string,
5807 	   we simply copy the input and return. */
5808 	if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
5809 		RETURN_STR_COPY(input);
5810 	}
5811 
5812 	if (ZSTR_LEN(pad) == 0) {
5813 		zend_argument_must_not_be_empty_error(3);
5814 		RETURN_THROWS();
5815 	}
5816 
5817 	if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
5818 		zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
5819 		RETURN_THROWS();
5820 	}
5821 
5822 	size_t pad_length = mb_get_strlen(pad, encoding);
5823 
5824 	size_t num_mb_pad_chars = pad_to_length - input_length;
5825 
5826 	/* We need to figure out the left/right padding lengths. */
5827 	size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
5828 	switch (pad_type_val) {
5829 		case PHP_STR_PAD_RIGHT:
5830 			right_pad = num_mb_pad_chars;
5831 			break;
5832 
5833 		case PHP_STR_PAD_LEFT:
5834 			left_pad = num_mb_pad_chars;
5835 			break;
5836 
5837 		case PHP_STR_PAD_BOTH:
5838 			left_pad = num_mb_pad_chars / 2;
5839 			right_pad = num_mb_pad_chars - left_pad;
5840 			break;
5841 	}
5842 
5843 	/* How many full block copies need to happen, and how many characters are then left over? */
5844 	size_t full_left_pad_copies = left_pad / pad_length;
5845 	size_t full_right_pad_copies = right_pad / pad_length;
5846 	size_t remaining_left_pad_chars = left_pad % pad_length;
5847 	size_t remaining_right_pad_chars = right_pad % pad_length;
5848 
5849 	if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
5850 		goto overflow_no_release;
5851 	}
5852 
5853 	/* Compute the number of bytes required for the padding */
5854 	size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
5855 	size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);
5856 
5857 	/* No special fast-path handling necessary for zero-length pads because these functions will not
5858 	 * allocate memory in case a zero-length pad is required. */
5859 	zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
5860 	zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);
5861 
5862 	if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
5863 		|| full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
5864 		goto overflow;
5865 	}
5866 
5867 	size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
5868 	size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);
5869 
5870 	if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
5871 		|| ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
5872 		goto overflow;
5873 	}
5874 
5875 	zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
5876 	char *buffer = ZSTR_VAL(result);
5877 
5878 	/* First we pad the left. */
5879 	for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5880 		memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5881 	}
5882 	memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
5883 	buffer += ZSTR_LEN(remaining_left_pad_str);
5884 
5885 	/* Then we copy the input string. */
5886 	memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
5887 	buffer += ZSTR_LEN(input);
5888 
5889 	/* Finally, we pad on the right. */
5890 	for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5891 		memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5892 	}
5893 	memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));
5894 
5895 	ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';
5896 
5897 	zend_string_release_ex(remaining_left_pad_str, false);
5898 	zend_string_release_ex(remaining_right_pad_str, false);
5899 
5900 	RETURN_NEW_STR(result);
5901 
5902 overflow:
5903 	zend_string_release_ex(remaining_left_pad_str, false);
5904 	zend_string_release_ex(remaining_right_pad_str, false);
5905 overflow_no_release:
5906 	zend_throw_error(NULL, "String size overflow");
5907 	RETURN_THROWS();
5908 }
5909 
5910 /* {{{ */
PHP_FUNCTION(mb_scrub)5911 PHP_FUNCTION(mb_scrub)
5912 {
5913 	zend_string *str, *enc_name = NULL;
5914 
5915 	ZEND_PARSE_PARAMETERS_START(1, 2)
5916 		Z_PARAM_STR(str)
5917 		Z_PARAM_OPTIONAL
5918 		Z_PARAM_STR_OR_NULL(enc_name)
5919 	ZEND_PARSE_PARAMETERS_END();
5920 
5921 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
5922 	if (!enc) {
5923 		RETURN_THROWS();
5924 	}
5925 
5926 	if (enc == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
5927 		/* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
5928 		RETURN_STR_COPY(str);
5929 	}
5930 
5931 	RETURN_STR(php_mb_convert_encoding_ex(ZSTR_VAL(str), ZSTR_LEN(str), enc, enc));
5932 }
5933 /* }}} */
5934 
5935 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)5936 static void php_mb_populate_current_detect_order_list(void)
5937 {
5938 	const mbfl_encoding **entry = 0;
5939 	size_t nentries;
5940 
5941 	if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
5942 		nentries = MBSTRG(detect_order_list_size);
5943 		entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5944 		memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
5945 	} else {
5946 		const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
5947 		size_t i;
5948 		nentries = MBSTRG(default_detect_order_list_size);
5949 		entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5950 		for (i = 0; i < nentries; i++) {
5951 			entry[i] = mbfl_no2encoding(src[i]);
5952 		}
5953 	}
5954 	MBSTRG(current_detect_order_list) = entry;
5955 	MBSTRG(current_detect_order_list_size) = nentries;
5956 }
5957 /* }}} */
5958 
5959 /* {{{ static int php_mb_encoding_translation() */
php_mb_encoding_translation(void)5960 static int php_mb_encoding_translation(void)
5961 {
5962 	return MBSTRG(encoding_translation);
5963 }
5964 /* }}} */
5965 
php_mb_mbchar_bytes(const char * s,const mbfl_encoding * enc)5966 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
5967 {
5968 	if (enc) {
5969 		if (enc->mblen_table) {
5970 			if (s) {
5971 				return enc->mblen_table[*(unsigned char *)s];
5972 			}
5973 		} else if (enc->flag & MBFL_ENCTYPE_WCS2) {
5974 			return 2;
5975 		} else if (enc->flag & MBFL_ENCTYPE_WCS4) {
5976 			return 4;
5977 		}
5978 	}
5979 	return 1;
5980 }
5981 
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)5982 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
5983 {
5984 	const char *p = s;
5985 	char *last=NULL;
5986 
5987 	if (nbytes == (size_t)-1) {
5988 		size_t nb = 0;
5989 
5990 		while (*p != '\0') {
5991 			if (nb == 0) {
5992 				if ((unsigned char)*p == (unsigned char)c) {
5993 					last = (char *)p;
5994 				}
5995 				nb = php_mb_mbchar_bytes(p, enc);
5996 				if (nb == 0) {
5997 					return NULL; /* something is going wrong! */
5998 				}
5999 			}
6000 			--nb;
6001 			++p;
6002 		}
6003 	} else {
6004 		size_t bcnt = nbytes;
6005 		size_t nbytes_char;
6006 		while (bcnt > 0) {
6007 			if ((unsigned char)*p == (unsigned char)c) {
6008 				last = (char *)p;
6009 			}
6010 			nbytes_char = php_mb_mbchar_bytes(p, enc);
6011 			if (bcnt < nbytes_char) {
6012 				return NULL;
6013 			}
6014 			p += nbytes_char;
6015 			bcnt -= nbytes_char;
6016 		}
6017 	}
6018 	return last;
6019 }
6020 
php_mb_stripos(bool mode,zend_string * haystack,zend_string * needle,zend_long offset,const mbfl_encoding * enc)6021 MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc)
6022 {
6023 	/* We're using simple case-folding here, because we'd have to deal with remapping of
6024 	 * offsets otherwise. */
6025 	zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
6026 	zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
6027 
6028 	size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
6029 
6030 	zend_string_free(haystack_conv);
6031 	zend_string_free(needle_conv);
6032 
6033 	return n;
6034 }
6035 
php_mb_gpc_get_detect_order(const zend_encoding *** list,size_t * list_size)6036 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
6037 {
6038 	*list = (const zend_encoding **)MBSTRG(http_input_list);
6039 	*list_size = MBSTRG(http_input_list_size);
6040 }
6041 /* }}} */
6042 
php_mb_gpc_set_input_encoding(const zend_encoding * encoding)6043 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
6044 {
6045 	MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
6046 }
6047 /* }}} */
6048 
/* Base64 alphabet, indexed by 6-bit value (0..63), followed by one trailing
 * zero byte. The values are spelled as ASCII codes rather than character
 * literals. */
static const unsigned char base64_table[] = {
	0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, /* A B C D E F G H */
	0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, /* I J K L M N O P */
	0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, /* Q R S T U V W X */
	0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, /* Y Z a b c d e f */
	0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, /* g h i j k l m n */
	0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, /* o p q r s t u v */
	0x77, 0x78, 0x79, 0x7A, 0x30, 0x31, 0x32, 0x33, /* w x y z 0 1 2 3 */
	0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2B, 0x2F, /* 4 5 6 7 8 9 + / */
	0x00                                            /* terminator */
};
6061 
/* Compute how many bytes the current contents of `tmpbuf` will occupy once
 * transfer-encoded.
 *
 * base64 == true:  4 output bytes for every (started) group of 3 input bytes.
 * base64 == false: 3 output bytes for each byte which is above 0x7F, is '=',
 *                  or is flagged in mime_char_needs_qencode; 1 byte otherwise. */
static size_t transfer_encoded_size(mb_convert_buf *tmpbuf, bool base64)
{
	if (base64) {
		return ((mb_convert_buf_len(tmpbuf) + 2) / 3) * 4;
	}

	size_t total = 0;
	for (unsigned char *cur = (unsigned char*)ZSTR_VAL(tmpbuf->str); cur < tmpbuf->out; cur++) {
		unsigned char byte = *cur;
		/* The range check must come first; the lookup table is only consulted
		 * for bytes <= 0x7F */
		if (byte > 0x7F || byte == '=' || mime_char_needs_qencode[byte]) {
			total += 3;
		} else {
			total += 1;
		}
	}
	return total;
}
6076 
/* Transfer-encode all bytes currently held in `tmpbuf` and append the result
 * to `outbuf`; afterwards `tmpbuf` is reset to empty.
 *
 * base64 == true:  Base64 with '=' padding.
 * base64 == false: bytes above 0x7F, '=' and bytes flagged in
 *                  mime_char_needs_qencode become "=XX" (uppercase hex);
 *                  all other bytes are copied unchanged. */
static void transfer_encode_mime_bytes(mb_convert_buf *tmpbuf, mb_convert_buf *outbuf, bool base64)
{
	unsigned char *dst, *dst_limit;
	MB_CONVERT_BUF_LOAD(outbuf, dst, dst_limit);
	unsigned char *src = (unsigned char*)ZSTR_VAL(tmpbuf->str);
	unsigned char *src_end = tmpbuf->out;

	if (!base64) {
		static const char hex_digits[] = "0123456789ABCDEF";

		/* Worst case: every byte expands to 3 bytes */
		MB_CONVERT_BUF_ENSURE(outbuf, dst, dst_limit, (src_end - src) * 3);
		for (; src < src_end; src++) {
			unsigned char byte = *src;
			if (byte > 0x7F || byte == '=' || mime_char_needs_qencode[byte]) {
				dst = mb_convert_buf_add3(dst, '=', hex_digits[(byte >> 4) & 0xF], hex_digits[byte & 0xF]);
			} else {
				dst = mb_convert_buf_add(dst, byte);
			}
		}
	} else {
		MB_CONVERT_BUF_ENSURE(outbuf, dst, dst_limit, ((src_end - src) + 2) / 3 * 4);

		/* Full groups of 3 bytes -> 4 Base64 digits */
		for (; (src_end - src) >= 3; src += 3) {
			uint32_t triple = (src[0] << 16) | (src[1] << 8) | src[2];
			dst = mb_convert_buf_add4(dst,
				base64_table[(triple >> 18) & 0x3F],
				base64_table[(triple >> 12) & 0x3F],
				base64_table[(triple >> 6) & 0x3F],
				base64_table[triple & 0x3F]);
		}

		/* 1 or 2 leftover bytes -> padded final group */
		switch (src_end - src) {
			case 1: {
				uint32_t single = src[0];
				dst = mb_convert_buf_add4(dst, base64_table[(single >> 2) & 0x3F], base64_table[(single & 0x3) << 4], '=', '=');
				break;
			}
			case 2: {
				uint32_t pair = (src[0] << 8) | src[1];
				dst = mb_convert_buf_add4(dst, base64_table[(pair >> 10) & 0x3F], base64_table[(pair >> 4) & 0x3F], base64_table[(pair & 0xF) << 2], '=');
				break;
			}
			default:
				break;
		}
	}

	mb_convert_buf_reset(tmpbuf, 0);
	MB_CONVERT_BUF_STORE(outbuf, dst, dst_limit);
}
6122 
/* Number of wchars in the decoding buffer used by mb_mime_header_encode */
#define MBSTRING_HEADER_ENC_WCHAR_BUFSIZE 90

/* Encode `input` for use as a MIME header value.
 *
 * input        - string to encode; it is decoded with incode->to_wchar
 * incode       - text encoding of `input`
 * outcode      - text encoding named in the generated encoded words; its
 *                mime_name must be a non-empty string (asserted)
 * base64       - true: use 'B' (Base64) transfer encoding; false: use 'Q'
 * linefeed     - string inserted (followed by one space) where a line is broken;
 *                only the first 8 bytes are used, and it is cut at the first NUL
 * linefeed_len - length of `linefeed` in bytes
 * indent       - number of columns assumed to be already used on the first line;
 *                values outside 0..73 are treated as 0
 *
 * Returns:
 * - zend_empty_string when `input` is empty;
 * - `input` itself (refcount incremented) when, ignoring leading spaces, every
 *   decoded character is in 0x21..0x7E and is none of '=', '?', '_';
 * - otherwise a new string. Space-delimited words at the start of the input are
 *   copied as plain ASCII for as long as possible; from the first word which
 *   contains a character needing encoding (or which is too long) up to the end
 *   of the input, text is emitted as "=?<mime_name>?B?...?=" or
 *   "=?<mime_name>?Q?...?=" encoded words, one per output line. */
static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	ZEND_ASSERT(outcode->mime_name != NULL);
	ZEND_ASSERT(outcode->mime_name[0] != '\0');

	if (!in_len) {
		return zend_empty_string;
	}

	/* An out-of-range indent is ignored rather than rejected */
	if (indent < 0 || indent >= 74) {
		indent = 0;
	}

	/* Only the first 8 bytes of the linefeed string are used */
	if (linefeed_len > 8) {
		linefeed_len = 8;
	}
	/* Maintain legacy behavior as regards embedded NUL (zero) bytes in linefeed string */
	for (size_t i = 0; i < linefeed_len; i++) {
		if (linefeed[i] == '\0') {
			linefeed_len = i;
			break;
		}
	}

	unsigned int state = 0;
	/* wchar_buf should be big enough that when it is full, we definitely have enough
	 * wchars to fill an entire line of output */
	uint32_t wchar_buf[MBSTRING_HEADER_ENC_WCHAR_BUFSIZE];
	uint32_t *p, *e;
	/* What part of wchar_buf is filled with still-unprocessed data which should not
	 * be overwritten? */
	unsigned int offset = 0;
	/* Length of the output buffer at the point where the current output line began */
	size_t line_start = 0;

	/* If the entire input string is ASCII with no spaces (except possibly leading
	 * spaces), just pass it through unchanged */
	bool checking_leading_spaces = true;
	while (in_len) {
		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE, &state);
		p = wchar_buf;
		e = wchar_buf + out_len;

		while (p < e) {
			uint32_t w = *p++;
			if (checking_leading_spaces) {
				if (w == ' ') {
					continue;
				} else {
					checking_leading_spaces = false;
				}
			}
			if (w < 0x21 || w > 0x7E || w == '=' || w == '?' || w == '_') {
				/* We cannot simply pass input string through unchanged; start again */
				in = (unsigned char*)ZSTR_VAL(input);
				in_len = ZSTR_LEN(input);
				goto no_passthrough;
			}
		}
	}

	return zend_string_copy(input); /* This just increments refcount */

no_passthrough: ;

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	/* Encode some prefix of the input string as plain ASCII if possible
	 * If we find it necessary to switch to Base64/QPrint encoding, we will
	 * do so all the way to the end of the string */
	while (in_len) {
		/* Decode part of the input string, refill wchar_buf */
		ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
		ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
		p = wchar_buf;
		e = wchar_buf + offset + out_len;
		/* ASCII output is broken into space-delimited 'words'
		 * If we find a non-ASCII character in the middle of a word, we will
		 * transfer-encode the entire word */
		uint32_t *word_start = p;

		/* Don't consider adding line feed for spaces at the beginning of a word */
		while (p < e && *p == ' ' && (p - word_start) <= 74) {
			p++;
		}

		while (p < e) {
			uint32_t w = *p++;

			if (w < 0x20 || w > 0x7E || w == '?' || w == '=' || w == '_' || (w == ' ' && (p - word_start) > 74)) {
				/* Non-ASCII character (or line too long); switch to Base64/QPrint encoding
				 * If we are already too far along on a line to include Base64/QPrint encoded data
				 * on the same line (without overrunning max line length), then add a line feed
				 * right now */
feed_and_mime_encode:
				if (mb_convert_buf_len(&buf) - line_start + indent + strlen(outcode->mime_name) > 55) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
					buf.out = mb_convert_buf_add(buf.out, ' ');
					indent = 0;
					line_start = mb_convert_buf_len(&buf);
				} else if (mb_convert_buf_len(&buf) > 0) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 1);
					buf.out = mb_convert_buf_add(buf.out, ' ');
				}
				p = word_start; /* Back up to where MIME encoding of input chars should start */
				goto mime_encoding_needed;
			} else if (w == ' ') {
				/* When we see a space, check whether we should insert a line break */
				if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 75) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
					buf.out = mb_convert_buf_add(buf.out, ' ');
					indent = 0;
					line_start = mb_convert_buf_len(&buf);
				} else if (mb_convert_buf_len(&buf) > 0) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
					buf.out = mb_convert_buf_add(buf.out, ' ');
				}
				/* Output one (space-delimited) word as plain ASCII */
				while (word_start < p-1) {
					buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
				}
				/* Step over the space which ended the word, then over any further spaces */
				word_start++;
				while (p < e && *p == ' ') {
					p++;
				}
			}
		}

		if (in_len) {
			/* Copy chars which are part of an incomplete 'word' to the beginning
			 * of wchar_buf and reprocess them on the next iteration.
			 * But first make sure that the incomplete 'word' isn't so big that
			 * there will be no space to add any more decoded wchars in the buffer
			 * (which could lead to an infinite loop) */
			if ((word_start - wchar_buf) < MBSTRING_MIN_WCHAR_BUFSIZE) {
				goto feed_and_mime_encode;
			}
			offset = e - word_start;
			if (offset) {
				memmove(wchar_buf, word_start, offset * sizeof(uint32_t));
			}
		} else {
			/* We have reached the end of the input string while still in 'ASCII mode';
			 * process any trailing ASCII chars which were not followed by a space */
			if (word_start < e && mb_convert_buf_len(&buf) > 0) {
				/* The whole input string was not just one big ASCII 'word' with no spaces
				 * consider adding a line feed if necessary to prevent output lines from
				 * being too long */
				if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 74) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
					buf.out = mb_convert_buf_add(buf.out, ' ');
				} else {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
					buf.out = mb_convert_buf_add(buf.out, ' ');
				}
			}
			while (word_start < e) {
				buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
			}
		}
	}

	/* Ensure output string is marked as valid UTF-8 (ASCII strings are always 'valid UTF-8') */
	return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);

mime_encoding_needed: ;

	/* We will generate the output line by line, first converting wchars to bytes
	 * in the requested output encoding, then transfer-encoding those bytes as
	 * Base64 or QPrint
	 * 'tmpbuf' will receive the bytes which need to be transfer-encoded before
	 * sending them to 'buf' */
	mb_convert_buf tmpbuf;
	mb_convert_buf_init(&tmpbuf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	/* Do we need to refill wchar_buf to make sure we don't run out of wchars
	 * in the middle of a line? */
	offset = e - p;
	if (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset < MBSTRING_MIN_WCHAR_BUFSIZE) {
		/* Too little free room in wchar_buf to decode more input right now;
		 * work with the wchars which are already there */
		goto start_new_line;
	}
	memmove(wchar_buf, p, offset * sizeof(uint32_t));

	while(true) {
refill_wchar_buf: ;
		ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
		ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
		p = wchar_buf;
		e = wchar_buf + offset + out_len;

start_new_line: ;
		/* Every encoded word starts with "=?<mime_name>?B?" or "=?<mime_name>?Q?" */
		MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, strlen(outcode->mime_name) + 5);
		buf.out = mb_convert_buf_add2(buf.out, '=', '?');
		buf.out = mb_convert_buf_appends(buf.out, outcode->mime_name);
		buf.out = mb_convert_buf_add3(buf.out, '?', base64 ? 'B' : 'Q', '?');

		/* How many wchars should we try converting to Base64/QPrint-encoded bytes?
		 * We do something like a 'binary search' to find the greatest number which
		 * can be included on this line without exceeding max line length */
		unsigned int n = 12;
		size_t space_available = 73 - indent - (mb_convert_buf_len(&buf) - line_start);

		while (true) {
			ZEND_ASSERT(p < e);

			/* Remember where we were in process of generating output, so we can back
			 * up if necessary */
			size_t tmppos = mb_convert_buf_len(&tmpbuf);
			unsigned int tmpstate = tmpbuf.state;

			/* Try encoding 'n' wchars in output text encoding and sending output
			 * bytes to 'tmpbuf'. Hopefully this is not too many to fit on the
			 * current line. */
			n = MIN(n, e - p);
			outcode->from_wchar(p, n, &tmpbuf, false);

			/* For some output text encodings, there may be a few ending bytes
			 * which need to be emitted to output before we break a line.
			 * Again, remember where we were so we can back up */
			size_t tmppos2 = mb_convert_buf_len(&tmpbuf);
			unsigned int tmpstate2 = tmpbuf.state;
			outcode->from_wchar(NULL, 0, &tmpbuf, true);

			/* A single wchar is also accepted when 'tmpbuf' was empty before this
			 * attempt, even if its encoded form is larger than space_available */
			if (transfer_encoded_size(&tmpbuf, base64) <= space_available || (n == 1 && tmppos == 0)) {
				/* If we convert 'n' more wchars on the current line, it will not
				 * overflow the maximum line length */
				p += n;

				if (p == e) {
					/* We are done; we shouldn't reach here if there is more remaining
					 * of the input string which needs to be processed */
					ZEND_ASSERT(!in_len);
					transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 2);
					buf.out = mb_convert_buf_add2(buf.out, '?', '=');
					mb_convert_buf_free(&tmpbuf);
					return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
				} else {
					/* It's possible that more chars might fit on the current line,
					 * so back up to where we were before emitting any ending bytes */
					mb_convert_buf_reset(&tmpbuf, tmppos2);
					tmpbuf.state = tmpstate2;
				}
			} else {
				/* Converting 'n' more wchars on this line would be too much.
				 * Back up to where we were before we tried that. */
				mb_convert_buf_reset(&tmpbuf, tmppos);
				tmpbuf.state = tmpstate;

				if (n == 1) {
					/* We have found the exact number of chars which will fit on the
					 * current line. Finish up and move to a new line. */
					outcode->from_wchar(NULL, 0, &tmpbuf, true);
					transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
					tmpbuf.state = 0;

					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 3 + linefeed_len);
					buf.out = mb_convert_buf_add2(buf.out, '?', '=');

					indent = 0; /* Indent argument must only affect the first line */

					if (in_len || p < e) {
						/* We still have more input to process */
						buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
						buf.out = mb_convert_buf_add(buf.out, ' ');
						line_start = mb_convert_buf_len(&buf);
						offset = e - p;
						if (in_len && (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset >= MBSTRING_MIN_WCHAR_BUFSIZE)) {
							/* Copy any remaining wchars to beginning of buffer and refill
							 * the rest of the buffer */
							memmove(wchar_buf, p, offset * sizeof(uint32_t));
							goto refill_wchar_buf;
						}
						goto start_new_line;
					} else {
						/* We are done! */
						mb_convert_buf_free(&tmpbuf);
						return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
					}
				} else {
					/* Try a smaller number of wchars */
					n = MAX(n >> 1, 1);
				}
			}
		}
	}
}
6420 
PHP_FUNCTION(mb_encode_mimeheader)6421 PHP_FUNCTION(mb_encode_mimeheader)
6422 {
6423 	const mbfl_encoding *charset = &mbfl_encoding_pass;
6424 	zend_string *str, *charset_name = NULL, *transenc_name = NULL;
6425 	char *linefeed = "\r\n";
6426 	size_t linefeed_len = 2;
6427 	zend_long indent = 0;
6428 	bool base64 = true;
6429 
6430 	ZEND_PARSE_PARAMETERS_START(1, 5)
6431 		Z_PARAM_STR(str)
6432 		Z_PARAM_OPTIONAL
6433 		Z_PARAM_STR(charset_name)
6434 		Z_PARAM_STR(transenc_name)
6435 		Z_PARAM_STRING(linefeed, linefeed_len)
6436 		Z_PARAM_LONG(indent)
6437 	ZEND_PARSE_PARAMETERS_END();
6438 
6439 	if (charset_name != NULL) {
6440 		charset = php_mb_get_encoding(charset_name, 2);
6441 		if (!charset) {
6442 			RETURN_THROWS();
6443 		} else if (charset->mime_name == NULL || charset->mime_name[0] == '\0' || charset == &mbfl_encoding_qprint) {
6444 			zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
6445 			RETURN_THROWS();
6446 		}
6447 	} else {
6448 		const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
6449 		if (lang != NULL) {
6450 			charset = mbfl_no2encoding(lang->mail_charset);
6451 			const mbfl_encoding *transenc = mbfl_no2encoding(lang->mail_header_encoding);
6452 			char t = transenc->name[0];
6453 			if (t == 'Q' || t == 'q') {
6454 				base64 = false;
6455 			}
6456 		}
6457 	}
6458 
6459 	if (transenc_name != NULL && ZSTR_LEN(transenc_name) > 0) {
6460 		char t = ZSTR_VAL(transenc_name)[0];
6461 		if (t == 'Q' || t == 'q') {
6462 			base64 = false;
6463 		}
6464 	}
6465 
6466 	RETURN_STR(mb_mime_header_encode(str, MBSTRG(current_internal_encoding), charset, base64, linefeed, linefeed_len, indent));
6467 }
6468 
/* Map one Base64 digit to its 6-bit value:
 * 'A'..'Z' -> 0..25, 'a'..'z' -> 26..51, '0'..'9' -> 52..61, '+' -> 62, '/' -> 63.
 * Returns -1 for any other byte (including the padding character '='). */
static int8_t decode_base64(unsigned char c)
{
	switch (c) {
		case '+':
			return 62;
		case '/':
			return 63;
		default:
			break;
	}

	if ('0' <= c && c <= '9') {
		return 52 + (c - '0');
	}
	if ('a' <= c && c <= 'z') {
		return 26 + (c - 'a');
	}
	if ('A' <= c && c <= 'Z') {
		return c - 'A';
	}
	return -1;
}
6484 
/* Value of each byte when read as a hexadecimal digit, indexed by byte value:
 * '0'..'9' -> 0..9, 'A'..'F' and 'a'..'f' -> 10..15, everything else -> -1.
 * The table has one entry for every possible byte (256 entries) and is never
 * written to, so it is declared const. */
static const int8_t qprint_map[] = {
	/* 0x00 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x30 */  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
	/* 0x40 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x50 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x60 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x70 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x80 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x90 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xA0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xB0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xC0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xD0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xE0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xF0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
6503 
6504 /* Decode MIME encoded word as defined in RFC 2047 */
mime_header_decode_encoded_word(unsigned char * p,unsigned char * e,const mbfl_encoding * outcode,mb_convert_buf * outbuf,unsigned int * state)6505 static unsigned char* mime_header_decode_encoded_word(unsigned char *p, unsigned char *e, const mbfl_encoding *outcode, mb_convert_buf *outbuf, unsigned int *state)
6506 {
6507 	if ((e - p) < 6) {
6508 		return NULL;
6509 	}
6510 
6511 	ZEND_ASSERT(p[0] == '=');
6512 	ZEND_ASSERT(p[1] == '?');
6513 	p += 2;
6514 
6515 	unsigned char *charset = p;
6516 	unsigned char *charset_end = memchr(charset, '?', e - charset);
6517 	if (charset_end == NULL) {
6518 		return NULL;
6519 	}
6520 
6521 	unsigned char *encoding = charset_end + 1;
6522 	p = encoding + 1;
6523 	if (p >= e || *p++ != '?') {
6524 		return NULL;
6525 	}
6526 
6527 	char *charset_name = estrndup((const char*)charset, charset_end - charset);
6528 	const mbfl_encoding *incode = mbfl_name2encoding(charset_name);
6529 	efree(charset_name);
6530 	if (incode == NULL) {
6531 		return NULL;
6532 	}
6533 
6534 	unsigned char *end_marker = (unsigned char*)zend_memnstr((const char*)p, "?=", 2, (const char*)e);
6535 	if (end_marker) {
6536 		e = end_marker;
6537 	} else if (p < e && *(e-1) == '?') {
6538 		/* If encoded word is not properly terminated, but last byte is '?',
6539 		 * take that as a terminator (legacy behavior) */
6540 		e--;
6541 	}
6542 
6543 	unsigned char *buf = emalloc(e - p), *bufp = buf;
6544 	if (*encoding == 'Q' || *encoding == 'q') {
6545 		/* Fill `buf` with bytes from decoding QPrint */
6546 		while (p < e) {
6547 			unsigned char c = *p++;
6548 			if (c == '_') {
6549 				*bufp++ = ' ';
6550 				continue;
6551 			} else if (c == '=' && (e - p) >= 2) {
6552 				unsigned char c2 = *p++;
6553 				unsigned char c3 = *p++;
6554 				if (qprint_map[c2] >= 0 && qprint_map[c3] >= 0) {
6555 					*bufp++ = (qprint_map[c2] << 4) | (qprint_map[c3] & 0xF);
6556 					continue;
6557 				} else if (c2 == '\r') {
6558 					if (c3 != '\n') {
6559 						p--;
6560 					}
6561 					continue;
6562 				} else if (c2 == '\n') {
6563 					p--;
6564 					continue;
6565 				}
6566 			}
6567 			*bufp++ = c;
6568 		}
6569 	} else if (*encoding == 'B' || *encoding == 'b') {
6570 		/* Fill `buf` with bytes from decoding Base64 */
6571 		unsigned int bits = 0, cache = 0;
6572 		while (p < e) {
6573 			unsigned char c = *p++;
6574 			if (c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '=') {
6575 				continue;
6576 			}
6577 			int8_t decoded = decode_base64(c);
6578 			if (decoded == -1) {
6579 				*bufp++ = '?';
6580 				continue;
6581 			}
6582 			bits += 6;
6583 			cache = (cache << 6) | (decoded & 0x3F);
6584 			if (bits == 24) {
6585 				*bufp++ = (cache >> 16) & 0xFF;
6586 				*bufp++ = (cache >> 8) & 0xFF;
6587 				*bufp++ = cache & 0xFF;
6588 				bits = cache = 0;
6589 			}
6590 		}
6591 		if (bits == 18) {
6592 			*bufp++ = (cache >> 10) & 0xFF;
6593 			*bufp++ = (cache >> 2) & 0xFF;
6594 		} else if (bits == 12) {
6595 			*bufp++ = (cache >> 4) & 0xFF;
6596 		}
6597 	} else {
6598 		efree(buf);
6599 		return NULL;
6600 	}
6601 
6602 	size_t in_len = bufp - buf;
6603 	uint32_t wchar_buf[128];
6604 
6605 	bufp = buf;
6606 	while (in_len) {
6607 		size_t out_len = incode->to_wchar(&bufp, &in_len, wchar_buf, 128, state);
6608 		ZEND_ASSERT(out_len <= 128);
6609 		outcode->from_wchar(wchar_buf, out_len, outbuf, false);
6610 	}
6611 
6612 	efree(buf);
6613 	return e + 2;
6614 }
6615 
/* Decode a MIME header value.
 *
 * Every well-formed encoded word ("=?charset?B?...?=" or "=?charset?Q?...?=")
 * in `input` is decoded by mime_header_decode_encoded_word(); all other text is
 * treated as ASCII. Everything is converted to `outcode`.
 *
 * A run of whitespace which starts with CR or LF is collapsed into one space,
 * except that it is dropped completely
 *  - between two adjacent encoded words, and
 *  - at the very end of the input (legacy behavior).
 *
 * Returns a new string holding the decoded text in `outcode`. */
static zend_string* mb_mime_header_decode(zend_string *input, const mbfl_encoding *outcode)
{
	unsigned char *p = (unsigned char*)ZSTR_VAL(input), *e = p + ZSTR_LEN(input);
	unsigned int state = 0;
	/* Set when an encoded word was followed by folding whitespace which has not
	 * been written to the output (yet) */
	bool space_pending = false;

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, ZSTR_LEN(input), '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	while (p < e) {
		unsigned char c = *p;

		/* Check the remaining length first, so that p[1] is only examined when
		 * it is known to lie inside the input */
		if ((e - p) >= 6 && c == '=' && *(p + 1) == '?') {
			/* Does this look like a MIME encoded word? If so, try to decode it as one */
			unsigned char *incode_end = memchr(p + 2, '?', e - p - 2);
			if (incode_end && (e - incode_end) >= 3) {
				unsigned char *temp = mime_header_decode_encoded_word(p, e, outcode, &buf, &state);
				if (temp) {
					p = temp;
					/* Whitespace which was pending before this encoded word
					 * separated two adjacent encoded words, so it must be
					 * dropped rather than carried over to a later position */
					space_pending = false;
					/* Decoding of MIME encoded word was successful;
					 * Try to collapse a run of whitespace */
					if (p < e && (*p == '\n' || *p == '\r')) {
						do {
							p++;
						} while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
						/* We will only actually output a space if this is not immediately followed
						 * by another valid encoded word */
						space_pending = true;
					}
					continue;
				}
			}
		}

		if (space_pending) {
			uint32_t space = ' ';
			outcode->from_wchar(&space, 1, &buf, false);
			space_pending = false;
		}

		/* Consume a run of plain ASCII characters */
		if (c != '\n' && c != '\r') {
			/* The run always includes the current byte (even if it is '='), and
			 * stops before the next '=', CR or LF */
			unsigned char *end = p + 1;
			while (end < e && (*end != '=' && *end != '\n' && *end != '\r')) {
				end++;
			}
			uint32_t wchar_buf[128];
			size_t in_len = end - p;
			while (in_len) {
				size_t out_len = mbfl_encoding_ascii.to_wchar(&p, &in_len, wchar_buf, 128, &state);
				ZEND_ASSERT(out_len <= 128);
				outcode->from_wchar(wchar_buf, out_len, &buf, false);
			}
		}
		/* Collapse a run of whitespace into a single space */
		if (p < e && (*p == '\n' || *p == '\r')) {
			do {
				p++;
			} while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
			if (p < e) {
				/* Emulating legacy behavior of mb_decode_mimeheader here;
				 * a run of whitespace is not converted to a space at the very
				 * end of the input string */
				uint32_t space = ' ';
				outcode->from_wchar(&space, 1, &buf, false);
			}
		}
	}

	/* Let the output encoding emit any closing bytes it needs */
	outcode->from_wchar(NULL, 0, &buf, true);

	return mb_convert_buf_result(&buf, outcode);
}
6689 
PHP_FUNCTION(mb_decode_mimeheader)6690 PHP_FUNCTION(mb_decode_mimeheader)
6691 {
6692 	zend_string *str;
6693 
6694 	ZEND_PARSE_PARAMETERS_START(1, 1)
6695 		Z_PARAM_STR(str)
6696 	ZEND_PARSE_PARAMETERS_END();
6697 
6698 	RETURN_STR(mb_mime_header_decode(str, MBSTRG(current_internal_encoding)));
6699 }
6700