xref: /PHP-8.4/ext/mbstring/mbstring.c (revision c34d4fbb)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp>              |
14    |         Rui Hirokawa <hirokawa@php.net>                              |
15    |         Hironori Sato <satoh@jpnnet.com>                             |
16    |         Shigeru Kanemoto <sgk@happysize.co.jp>                       |
17    +----------------------------------------------------------------------+
18 */
19 
20 /* {{{ includes */
21 #include <limits.h>
22 
23 #include "libmbfl/config.h"
24 #include "php.h"
25 #include "php_ini.h"
26 #include "php_variables.h"
27 #include "mbstring.h"
28 #include "ext/standard/php_string.h"
29 #include "ext/standard/php_mail.h"
30 #include "ext/standard/exec.h"
31 #include "main/php_output.h"
32 #include "ext/standard/info.h"
33 #include "ext/pcre/php_pcre.h"
34 
35 #include "libmbfl/mbfl/mbfilter_8bit.h"
36 #include "libmbfl/mbfl/mbfilter_pass.h"
37 #include "libmbfl/mbfl/eaw_table.h"
38 #include "libmbfl/filters/mbfilter_base64.h"
39 #include "libmbfl/filters/mbfilter_cjk.h"
40 #include "libmbfl/filters/mbfilter_qprint.h"
41 #include "libmbfl/filters/mbfilter_htmlent.h"
42 #include "libmbfl/filters/mbfilter_uuencode.h"
43 #include "libmbfl/filters/mbfilter_ucs4.h"
44 #include "libmbfl/filters/mbfilter_utf16.h"
45 #include "libmbfl/filters/mbfilter_singlebyte.h"
46 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
47 #include "libmbfl/filters/unicode_prop.h"
48 
49 #include "php_globals.h"
50 #include "rfc1867.h"
51 #include "php_content_types.h"
52 #include "SAPI.h"
53 #include "php_unicode.h"
54 #include "TSRM.h"
55 
56 #include "mb_gpc.h"
57 
58 #ifdef HAVE_MBREGEX
59 # include "php_mbregex.h"
60 #endif
61 
62 #include "zend_smart_str.h"
63 #include "zend_multibyte.h"
64 #include "mbstring_arginfo.h"
65 
66 #include "rare_cp_bitvec.h"
67 
68 #ifdef __SSE2__
69 #include <emmintrin.h>
70 #endif
71 
72 #ifdef __SSE3__
73 #include <immintrin.h>
74 #include <pmmintrin.h>
75 #endif
76 
77 /* }}} */
78 
79 /* {{{ prototypes */
80 ZEND_DECLARE_MODULE_GLOBALS(mbstring)
81 
82 static PHP_GINIT_FUNCTION(mbstring);
83 static PHP_GSHUTDOWN_FUNCTION(mbstring);
84 
85 static void php_mb_populate_current_detect_order_list(void);
86 
87 static int php_mb_encoding_translation(void);
88 
89 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);
90 
91 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
92 
93 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
94 
95 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
96 
97 static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);
98 
99 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);
100 
101 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);
102 
103 /* See mbfilter_cp5022x.c */
104 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
105 /* }}} */
106 
107 /* {{{ php_mb_default_identify_list */
/* Pairs a language with the encodings used as its default identification
 * (detection) candidates. Instances live in php_mb_default_identify_list. */
108 typedef struct _php_mb_nls_ident_list {
109 	enum mbfl_no_language lang; /* language this entry applies to */
110 	const enum mbfl_no_encoding *list; /* candidate encodings for that language */
111 	size_t list_size; /* number of elements in list */
112 } php_mb_nls_ident_list;
113 
/* Default identification candidates for Japanese
 * (mapped to mbfl_no_language_japanese in php_mb_default_identify_list). */
114 static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
115 	mbfl_no_encoding_ascii,
116 	mbfl_no_encoding_jis,
117 	mbfl_no_encoding_utf8,
118 	mbfl_no_encoding_euc_jp,
119 	mbfl_no_encoding_sjis
120 };
121 
/* Default identification candidates for Simplified Chinese
 * (mapped to mbfl_no_language_simplified_chinese in php_mb_default_identify_list). */
122 static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
123 	mbfl_no_encoding_ascii,
124 	mbfl_no_encoding_utf8,
125 	mbfl_no_encoding_euc_cn,
126 	mbfl_no_encoding_cp936
127 };
128 
/* Default identification candidates for Traditional Chinese
 * (mapped to mbfl_no_language_traditional_chinese in php_mb_default_identify_list). */
129 static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
130 	mbfl_no_encoding_ascii,
131 	mbfl_no_encoding_utf8,
132 	mbfl_no_encoding_euc_tw,
133 	mbfl_no_encoding_big5
134 };
135 
/* Default identification candidates for Korean
 * (mapped to mbfl_no_language_korean in php_mb_default_identify_list). */
136 static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
137 	mbfl_no_encoding_ascii,
138 	mbfl_no_encoding_utf8,
139 	mbfl_no_encoding_euc_kr,
140 	mbfl_no_encoding_uhc
141 };
142 
/* Default identification candidates for Russian
 * (mapped to mbfl_no_language_russian in php_mb_default_identify_list). */
143 static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
144 	mbfl_no_encoding_ascii,
145 	mbfl_no_encoding_utf8,
146 	mbfl_no_encoding_koi8r,
147 	mbfl_no_encoding_cp1251,
148 	mbfl_no_encoding_cp866
149 };
150 
/* Default identification candidates for Armenian
 * (mapped to mbfl_no_language_armenian in php_mb_default_identify_list). */
151 static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
152 	mbfl_no_encoding_ascii,
153 	mbfl_no_encoding_utf8,
154 	mbfl_no_encoding_armscii8
155 };
156 
/* Default identification candidates for Turkish
 * (mapped to mbfl_no_language_turkish in php_mb_default_identify_list). */
157 static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
158 	mbfl_no_encoding_ascii,
159 	mbfl_no_encoding_utf8,
160 	mbfl_no_encoding_cp1254,
161 	mbfl_no_encoding_8859_9
162 };
163 
/* Default identification candidates for Ukrainian
 * (mapped to mbfl_no_language_ukrainian in php_mb_default_identify_list). */
164 static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
165 	mbfl_no_encoding_ascii,
166 	mbfl_no_encoding_utf8,
167 	mbfl_no_encoding_koi8u
168 };
169 
/* Default identification candidates for the neutral language; also the
 * fallback handed out by php_mb_nls_get_default_detect_order_list() when a
 * language has no entry of its own. */
170 static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
171 	mbfl_no_encoding_ascii,
172 	mbfl_no_encoding_utf8
173 };
174 
175 
176 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
177 	{ mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
178 	{ mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
179 	{ mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
180 	{ mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
181 	{ mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
182 	{ mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
183 	{ mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
184 	{ mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
185 	{ mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
186 };
187 
188 /* }}} */
189 
190 /* {{{ mbstring_deps[] */
/* Module dependency table: declares "pcre" as a required module. */
191 static const zend_module_dep mbstring_deps[] = {
192 	ZEND_MOD_REQUIRED("pcre")
193 	ZEND_MOD_END
194 };
195 /* }}} */
196 
197 /* {{{ zend_module_entry mbstring_module_entry */
/* Module descriptor for the "mbstring" extension: wires up the dependency
 * table, the function table, the module/request lifecycle hooks, the info
 * callback, the version string and the module-globals constructor/destructor.
 * NOTE(review): the two NULL slots are positional fields of zend_module_entry
 * that this extension leaves unset - confirm their meaning against the struct
 * definition before changing them. */
198 zend_module_entry mbstring_module_entry = {
199 	STANDARD_MODULE_HEADER_EX,
200 	NULL,
201 	mbstring_deps,
202 	"mbstring",
203 	ext_functions,
204 	PHP_MINIT(mbstring),
205 	PHP_MSHUTDOWN(mbstring),
206 	PHP_RINIT(mbstring),
207 	PHP_RSHUTDOWN(mbstring),
208 	PHP_MINFO(mbstring),
209 	PHP_MBSTRING_VERSION,
210 	PHP_MODULE_GLOBALS(mbstring),
211 	PHP_GINIT(mbstring),
212 	PHP_GSHUTDOWN(mbstring),
213 	NULL,
214 	STANDARD_MODULE_PROPERTIES_EX
215 };
216 /* }}} */
217 
218 /* {{{ static sapi_post_entry php_post_entries[] */
/* POST content-type table using the stock handlers: php_std_post_handler for
 * the default form content type and rfc1867_post_handler for multipart data.
 * The all-NULL row terminates the table.
 * NOTE(review): presumably installed when mbstring's own POST handling is
 * switched off - the code that registers it is outside this view. */
219 static const sapi_post_entry php_post_entries[] = {
220 	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data,	php_std_post_handler },
221 	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
222 	{ NULL, 0, NULL, NULL }
223 };
224 /* }}} */
225 
/* Only compiled when COMPILE_DL_MBSTRING is defined: emits the module
 * accessor via ZEND_GET_MODULE(mbstring) and, under ZTS, the TSRM cache
 * definition. */
226 #ifdef COMPILE_DL_MBSTRING
227 #ifdef ZTS
228 ZEND_TSRMLS_CACHE_DEFINE()
229 #endif
230 ZEND_GET_MODULE(mbstring)
231 #endif
232 
233 /* {{{ static sapi_post_entry mbstr_post_entries[] */
/* POST content-type table that routes the default form content type through
 * mbstring's php_mb_post_handler; multipart data still uses
 * rfc1867_post_handler. The all-NULL row terminates the table.
 * NOTE(review): the code that registers this table is outside this view. */
234 static const sapi_post_entry mbstr_post_entries[] = {
235 	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
236 	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
237 	{ NULL, 0, NULL, NULL }
238 };
239 /* }}} */
240 
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)241 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
242 	if (encoding_name) {
243 		const mbfl_encoding *encoding;
244 		zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
245 		if (last_encoding_name && (last_encoding_name == encoding_name
246 				|| zend_string_equals_ci(encoding_name, last_encoding_name))) {
247 			return MBSTRG(last_used_encoding);
248 		}
249 
250 		encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
251 		if (!encoding) {
252 			zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
253 			return NULL;
254 		} else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
255 			if (encoding == &mbfl_encoding_base64) {
256 				php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
257 			} else if (encoding == &mbfl_encoding_qprint) {
258 				php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
259 			} else if (encoding == &mbfl_encoding_html_ent) {
260 				php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
261 			} else if (encoding == &mbfl_encoding_uuencode) {
262 				php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
263 			}
264 		}
265 
266 		if (last_encoding_name) {
267 			zend_string_release(last_encoding_name);
268 		}
269 		MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
270 		MBSTRG(last_used_encoding) = encoding;
271 		return encoding;
272 	} else {
273 		return MBSTRG(current_internal_encoding);
274 	}
275 }
276 
php_mb_get_encoding_or_pass(const char * encoding_name,size_t encoding_name_len)277 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name, size_t encoding_name_len) {
278 	if (strncmp(encoding_name, "pass", encoding_name_len) == 0) {
279 		return &mbfl_encoding_pass;
280 	}
281 
282 	return mbfl_name2encoding_ex(encoding_name, encoding_name_len);
283 }
284 
count_commas(const char * p,const char * end)285 static size_t count_commas(const char *p, const char *end) {
286 	size_t count = 0;
287 	while ((p = memchr(p, ',', end - p))) {
288 		count++;
289 		p++;
290 	}
291 	return count;
292 }
293 
294 /* {{{ static zend_result php_mb_parse_encoding_list()
295  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
296  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
297  */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num)298 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
299 	const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num)
300 {
301 	if (value == NULL || value_length == 0) {
302 		*return_list = NULL;
303 		*return_size = 0;
304 		return SUCCESS;
305 	} else {
306 		bool included_auto;
307 		size_t n, size;
308 		const char *p1, *endp, *tmpstr;
309 		const mbfl_encoding **entry, **list;
310 
311 		if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
312 			tmpstr = value + 1;
313 			value_length -= 2;
314 		} else {
315 			tmpstr = value;
316 		}
317 
318 		endp = tmpstr + value_length;
319 		size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
320 		list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
321 		entry = list;
322 		n = 0;
323 		included_auto = 0;
324 		p1 = tmpstr;
325 		while (1) {
326 			const char *comma = memchr(p1, ',', endp - p1);
327 			const char *p = comma ? comma : endp;
328 			/* trim spaces */
329 			while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
330 				p1++;
331 			}
332 			p--;
333 			while (p > p1 && (*p == ' ' || *p == '\t')) {
334 				p--;
335 			}
336 			size_t p1_length = p - p1 + 1;
337 			/* convert to the encoding number and check encoding */
338 			if (strncasecmp(p1, "auto", p1_length) == 0) {
339 				if (!included_auto) {
340 					const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
341 					const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
342 					size_t i;
343 					included_auto = 1;
344 					for (i = 0; i < identify_list_size; i++) {
345 						*entry++ = mbfl_no2encoding(*src++);
346 						n++;
347 					}
348 				}
349 			} else {
350 				const mbfl_encoding *encoding = mbfl_name2encoding_ex(p1, p1_length);
351 				if (!encoding) {
352 					/* Called from an INI setting modification */
353 					if (arg_num == 0) {
354 						php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%.*s\"", (int) p1_length, p1);
355 					} else {
356 						zend_argument_value_error(arg_num, "contains invalid encoding \"%.*s\"", (int) p1_length, p1);
357 					}
358 					pefree(ZEND_VOIDP(list), persistent);
359 					return FAILURE;
360 				}
361 
362 				*entry++ = encoding;
363 				n++;
364 			}
365 			if (n >= size || comma == NULL) {
366 				break;
367 			}
368 			p1 = comma + 1;
369 		}
370 		*return_list = list;
371 		*return_size = n;
372 	}
373 
374 	return SUCCESS;
375 }
376 /* }}} */
377 
378 /* {{{
379  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
380  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
381  */
php_mb_parse_encoding_array(HashTable * target_hash,const mbfl_encoding *** return_list,size_t * return_size,uint32_t arg_num)382 static zend_result php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
383 	size_t *return_size, uint32_t arg_num)
384 {
385 	/* Allocate enough space to include the default detect order if "auto" is used. */
386 	size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
387 	const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
388 	const mbfl_encoding **entry = list;
389 	bool included_auto = 0;
390 	size_t n = 0;
391 	zval *hash_entry;
392 	ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
393 		zend_string *encoding_str = zval_try_get_string(hash_entry);
394 		if (UNEXPECTED(!encoding_str)) {
395 			efree(ZEND_VOIDP(list));
396 			return FAILURE;
397 		}
398 
399 		if (zend_string_equals_literal_ci(encoding_str, "auto")) {
400 			if (!included_auto) {
401 				const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
402 				const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
403 				size_t j;
404 
405 				included_auto = 1;
406 				for (j = 0; j < identify_list_size; j++) {
407 					*entry++ = mbfl_no2encoding(*src++);
408 					n++;
409 				}
410 			}
411 		} else {
412 			const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
413 			if (encoding) {
414 				*entry++ = encoding;
415 				n++;
416 			} else {
417 				zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
418 				zend_string_release(encoding_str);
419 				efree(ZEND_VOIDP(list));
420 				return FAILURE;
421 			}
422 		}
423 		zend_string_release(encoding_str);
424 	} ZEND_HASH_FOREACH_END();
425 	*return_list = list;
426 	*return_size = n;
427 	return SUCCESS;
428 }
429 /* }}} */
430 
431 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)432 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
433 {
434 	return (const zend_encoding*)mbfl_name2encoding(encoding_name);
435 }
436 
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)437 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
438 {
439 	return ((const mbfl_encoding *)encoding)->name;
440 }
441 
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)442 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
443 {
444 	const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
445 	return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
446 }
447 
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)448 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
449 {
450 	if (!list) {
451 		list = (const zend_encoding**)MBSTRG(current_detect_order_list);
452 		list_size = MBSTRG(current_detect_order_list_size);
453 	}
454 	if (list_size == 1 && ((mbfl_encoding*)*list) == &mbfl_encoding_pass) {
455 		/* Emulate behavior of previous implementation; it would never return "pass"
456 		 * from an encoding auto-detection operation */
457 		return NULL;
458 	}
459 	return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding**)list, list_size, false, false);
460 }
461 
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)462 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
463 {
464 	unsigned int num_errors = 0;
465 	zend_string *result = mb_fast_convert((unsigned char*)from, from_length, (const mbfl_encoding*)encoding_from, (const mbfl_encoding*)encoding_to, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
466 
467 	*to_length = ZSTR_LEN(result);
468 	*to = emalloc(ZSTR_LEN(result) + 1); /* Include terminating null byte */
469 	memcpy(*to, ZSTR_VAL(result), ZSTR_LEN(result) + 1);
470 	zend_string_free(result);
471 
472 	return from_length;
473 }
474 
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)475 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
476 {
477 	return php_mb_parse_encoding_list(
478 		encoding_list, encoding_list_len,
479 		(const mbfl_encoding ***)return_list, return_size,
480 		persistent, /* arg_num */ 0);
481 }
482 
php_mb_zend_internal_encoding_getter(void)483 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
484 {
485 	return (const zend_encoding *)MBSTRG(internal_encoding);
486 }
487 
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)488 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
489 {
490 	MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
491 	return SUCCESS;
492 }
493 
/* Callback table bundling the php_mb_zend_* hooks defined above under the
 * provider name "mbstring".
 * NOTE(review): presumably handed to the Zend multibyte layer during module
 * startup - the registration call is outside this view. */
494 static zend_multibyte_functions php_mb_zend_multibyte_functions = {
495 	"mbstring",
496 	php_mb_zend_encoding_fetcher,
497 	php_mb_zend_encoding_name_getter,
498 	php_mb_zend_encoding_lexer_compatibility_checker,
499 	php_mb_zend_encoding_detector,
500 	php_mb_zend_encoding_converter,
501 	php_mb_zend_encoding_list_parser,
502 	php_mb_zend_internal_encoding_getter,
503 	php_mb_zend_internal_encoding_setter
504 };
505 /* }}} */
506 
507 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)508 static void *_php_mb_compile_regex(const char *pattern)
509 {
510 	pcre2_code *retval;
511 	PCRE2_SIZE err_offset;
512 	int errnum;
513 
514 	if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
515 			PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
516 		PCRE2_UCHAR err_str[128];
517 		pcre2_get_error_message(errnum, err_str, sizeof(err_str));
518 		php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
519 	}
520 	return retval;
521 }
522 /* }}} */
523 
524 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)525 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
526 {
527 	int res;
528 
529 	pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
530 	if (NULL == match_data) {
531 		pcre2_code_free(opaque);
532 		php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
533 		return FAILURE;
534 	}
535 	res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
536 	php_pcre_free_match_data(match_data);
537 
538 	return res;
539 }
540 /* }}} */
541 
542 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)543 static void _php_mb_free_regex(void *opaque)
544 {
545 	pcre2_code_free(opaque);
546 }
547 /* }}} */
548 
549 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)550 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
551 {
552 	size_t i;
553 
554 	*plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
555 	*plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
556 
557 	for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
558 		if (php_mb_default_identify_list[i].lang == lang) {
559 			*plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
560 			*plist_size = php_mb_default_identify_list[i].list_size;
561 			return 1;
562 		}
563 	}
564 	return 0;
565 }
566 /* }}} */
567 
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)568 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
569 {
570 	char *result = emalloc(len + 2);
571 	char *resp = result;
572 	size_t i;
573 
574 	for (i = 0; i < len && start[i] != quote; ++i) {
575 		if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
576 			*resp++ = start[++i];
577 		} else {
578 			size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
579 
580 			while (j-- > 0 && i < len) {
581 				*resp++ = start[i++];
582 			}
583 			--i;
584 		}
585 	}
586 
587 	*resp = '\0';
588 	return result;
589 }
590 
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)591 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
592 {
593 	char *pos = *line, quote;
594 	char *res;
595 
596 	while (*pos && *pos != stop) {
597 		if ((quote = *pos) == '"' || quote == '\'') {
598 			++pos;
599 			while (*pos && *pos != quote) {
600 				if (*pos == '\\' && pos[1] && pos[1] == quote) {
601 					pos += 2;
602 				} else {
603 					++pos;
604 				}
605 			}
606 			if (*pos) {
607 				++pos;
608 			}
609 		} else {
610 			pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
611 
612 		}
613 	}
614 	if (*pos == '\0') {
615 		res = estrdup(*line);
616 		*line += strlen(*line);
617 		return res;
618 	}
619 
620 	res = estrndup(*line, pos - *line);
621 
622 	while (*pos == stop) {
623 		pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
624 	}
625 
626 	*line = pos;
627 	return res;
628 }
629 /* }}} */
630 
/* Extract one value from str after skipping leading whitespace.
 *
 * A value starting with ' or " runs to the matching unescaped quote; any
 * other value runs to the next whitespace byte. Returns a newly allocated
 * string (empty when nothing but whitespace is left). */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	for (; *str && isspace(*(unsigned char *)str); ++str) {
		/* skip leading whitespace */
	}

	if (*str == '\0') {
		return estrdup("");
	}

	if (*str == '"' || *str == '\'') {
		const char quote = *str;
		char *body = str + 1;

		return php_mb_rfc1867_substring_conf(encoding, body, strlen(body), quote);
	}

	size_t word_len = 0;
	while (str[word_len] && !isspace((unsigned char)str[word_len])) {
		++word_len;
	}
	return php_mb_rfc1867_substring_conf(encoding, str, word_len, 0);
}
655 /* }}} */
656 
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)657 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
658 {
659 	char *s, *s2;
660 	const size_t filename_len = strlen(filename);
661 
662 	/* The \ check should technically be needed for win32 systems only where
663 	 * it is a valid path separator. However, IE in all it's wisdom always sends
664 	 * the full path of the file on the user's filesystem, which means that unless
665 	 * the user does basename() they get a bogus file name. Until IE's user base drops
666 	 * to nill or problem is fixed this code must remain enabled for all systems. */
667 	s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
668 	s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
669 
670 	if (s && s2) {
671 		if (s > s2) {
672 			return ++s;
673 		} else {
674 			return ++s2;
675 		}
676 	} else if (s) {
677 		return ++s;
678 	} else if (s2) {
679 		return ++s2;
680 	} else {
681 		return filename;
682 	}
683 }
684 /* }}} */
685 
686 /* {{{ php.ini directive handler */
687 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)688 static PHP_INI_MH(OnUpdate_mbstring_language)
689 {
690 	enum mbfl_no_language no_language;
691 
692 	no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
693 	if (no_language == mbfl_no_language_invalid) {
694 		MBSTRG(language) = mbfl_no_language_neutral;
695 		return FAILURE;
696 	}
697 	MBSTRG(language) = no_language;
698 	php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
699 	return SUCCESS;
700 }
701 /* }}} */
702 
703 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)704 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
705 {
706 	const mbfl_encoding **list;
707 	size_t size;
708 
709 	if (!new_value) {
710 		if (MBSTRG(detect_order_list)) {
711 			pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
712 		}
713 		MBSTRG(detect_order_list) = NULL;
714 		MBSTRG(detect_order_list_size) = 0;
715 		return SUCCESS;
716 	}
717 
718 	if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
719 		return FAILURE;
720 	}
721 
722 	if (MBSTRG(detect_order_list)) {
723 		pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
724 	}
725 	MBSTRG(detect_order_list) = list;
726 	MBSTRG(detect_order_list_size) = size;
727 	return SUCCESS;
728 }
729 /* }}} */
730 
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)731 static zend_result _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
732 	const mbfl_encoding **list;
733 	size_t size;
734 	if (new_value_length == 4 && strncmp(new_value, "pass", 4) == 0) {
735 		list = (const mbfl_encoding**)pecalloc(1, sizeof(mbfl_encoding*), 1);
736 		*list = &mbfl_encoding_pass;
737 		size = 1;
738 	} else if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
739 		return FAILURE;
740 	}
741 	if (MBSTRG(http_input_list)) {
742 		pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
743 	}
744 	MBSTRG(http_input_list) = list;
745 	MBSTRG(http_input_list_size) = size;
746 	return SUCCESS;
747 }
748 
749 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)750 static PHP_INI_MH(OnUpdate_mbstring_http_input)
751 {
752 	if (new_value) {
753 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
754 	}
755 
756 	if (!new_value || !ZSTR_LEN(new_value)) {
757 		const char *encoding = php_get_input_encoding();
758 		MBSTRG(http_input_set) = 0;
759 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
760 		return SUCCESS;
761 	}
762 
763 	MBSTRG(http_input_set) = 1;
764 	return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
765 }
766 /* }}} */
767 
_php_mb_ini_mbstring_http_output_set(const char * new_value,size_t length)768 static zend_result _php_mb_ini_mbstring_http_output_set(const char *new_value, size_t length) {
769 	const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value, length);
770 	if (!encoding) {
771 		return FAILURE;
772 	}
773 
774 	MBSTRG(http_output_encoding) = encoding;
775 	MBSTRG(current_http_output_encoding) = encoding;
776 	return SUCCESS;
777 }
778 
779 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)780 static PHP_INI_MH(OnUpdate_mbstring_http_output)
781 {
782 	if (new_value) {
783 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
784 	}
785 
786 	if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
787 		const char *encoding = php_get_output_encoding();
788 		MBSTRG(http_output_set) = 0;
789 		_php_mb_ini_mbstring_http_output_set(encoding, strlen(encoding));
790 		return SUCCESS;
791 	}
792 
793 	MBSTRG(http_output_set) = 1;
794 	return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
795 }
796 /* }}} */
797 
798 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)799 static zend_result _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
800 {
801 	const mbfl_encoding *encoding;
802 
803 	if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
804 		/* falls back to UTF-8 if an unknown encoding name is given */
805 		if (new_value) {
806 			php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
807 		}
808 		encoding = &mbfl_encoding_utf8;
809 	}
810 	MBSTRG(internal_encoding) = encoding;
811 	MBSTRG(current_internal_encoding) = encoding;
812 #ifdef HAVE_MBREGEX
813 	{
814 		const char *enc_name = new_value;
815 		if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
816 			/* falls back to UTF-8 if an unknown encoding name is given */
817 			enc_name = "UTF-8";
818 			php_mb_regex_set_default_mbctype(enc_name);
819 		}
820 		php_mb_regex_set_mbctype(new_value);
821 	}
822 #endif
823 	return SUCCESS;
824 }
825 /* }}} */
826 
827 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)828 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
829 {
830 	if (new_value) {
831 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
832 	}
833 
834 	if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
835 		return FAILURE;
836 	}
837 
838 	if (new_value && ZSTR_LEN(new_value)) {
839 		MBSTRG(internal_encoding_set) = 1;
840 		return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
841 	} else {
842 		const char *encoding = php_get_internal_encoding();
843 		MBSTRG(internal_encoding_set) = 0;
844 		return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
845 	}
846 }
847 /* }}} */
848 
849 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)850 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
851 {
852 	if (new_value != NULL) {
853 		if (zend_string_equals_literal_ci(new_value, "none")) {
854 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
855 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
856 		} else if (zend_string_equals_literal_ci(new_value, "long")) {
857 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
858 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
859 		} else if (zend_string_equals_literal_ci(new_value, "entity")) {
860 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
861 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
862 		} else {
863 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
864 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
865 			if (ZSTR_LEN(new_value) > 0) {
866 				char *endptr = NULL;
867 				int c = strtol(ZSTR_VAL(new_value), &endptr, 0);
868 
869 				if (*endptr == '\0') {
870 					MBSTRG(filter_illegal_substchar) = c;
871 					MBSTRG(current_filter_illegal_substchar) = c;
872 				}
873 			}
874 		}
875 	} else {
876 		MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
877 		MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
878 		MBSTRG(filter_illegal_substchar) = '?';
879 		MBSTRG(current_filter_illegal_substchar) = '?';
880 	}
881 
882 	return SUCCESS;
883 }
884 /* }}} */
885 
886 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)887 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
888 {
889 	if (new_value == NULL) {
890 		return FAILURE;
891 	}
892 
893 	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
894 
895 	if (MBSTRG(encoding_translation)) {
896 		sapi_unregister_post_entry(php_post_entries);
897 		sapi_register_post_entries(mbstr_post_entries);
898 	} else {
899 		sapi_unregister_post_entry(mbstr_post_entries);
900 		sapi_register_post_entries(php_post_entries);
901 	}
902 
903 	return SUCCESS;
904 }
905 /* }}} */
906 
907 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)908 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
909 {
910 	zend_string *tmp;
911 	void *re = NULL;
912 
913 	if (!new_value) {
914 		new_value = entry->orig_value;
915 	}
916 	tmp = php_trim(new_value, NULL, 0, 3);
917 
918 	if (ZSTR_LEN(tmp) > 0) {
919 		if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
920 			zend_string_release_ex(tmp, 0);
921 			return FAILURE;
922 		}
923 	}
924 
925 	if (MBSTRG(http_output_conv_mimetypes)) {
926 		_php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
927 	}
928 
929 	MBSTRG(http_output_conv_mimetypes) = re;
930 
931 	zend_string_release_ex(tmp, 0);
932 	return SUCCESS;
933 }
934 /* }}} */
935 /* }}} */
936 
937 /* {{{ php.ini directive registration */
938 PHP_INI_BEGIN()
939 	PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
940 	PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
941 	PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
942 	PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
943 	STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
944 	PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
945 
946 	STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
947 		PHP_INI_SYSTEM | PHP_INI_PERDIR,
948 		OnUpdate_mbstring_encoding_translation,
949 		encoding_translation, zend_mbstring_globals, mbstring_globals)
950 	PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
951 		"^(text/|application/xhtml\\+xml)",
952 		PHP_INI_ALL,
953 		OnUpdate_mbstring_http_output_conv_mimetypes)
954 
955 	STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
956 		PHP_INI_ALL,
957 		OnUpdateBool,
958 		strict_detection, zend_mbstring_globals, mbstring_globals)
959 #ifdef HAVE_MBREGEX
960 	STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
961 	STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
962 #endif
PHP_INI_END()963 PHP_INI_END()
964 /* }}} */
965 
966 static void mbstring_internal_encoding_changed_hook(void) {
967 	/* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
968 	if (!MBSTRG(internal_encoding_set)) {
969 		const char *encoding = php_get_internal_encoding();
970 		_php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
971 	}
972 
973 	if (!MBSTRG(http_output_set)) {
974 		const char *encoding = php_get_output_encoding();
975 		_php_mb_ini_mbstring_http_output_set(encoding, strlen(encoding));
976 	}
977 
978 	if (!MBSTRG(http_input_set)) {
979 		const char *encoding = php_get_input_encoding();
980 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
981 	}
982 }
983 
984 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)985 static PHP_GINIT_FUNCTION(mbstring)
986 {
987 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
988 ZEND_TSRMLS_CACHE_UPDATE();
989 #endif
990 
991 	mbstring_globals->language = mbfl_no_language_uni;
992 	mbstring_globals->internal_encoding = NULL;
993 	mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
994 	mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
995 	mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
996 	mbstring_globals->http_input_identify = NULL;
997 	mbstring_globals->http_input_identify_get = NULL;
998 	mbstring_globals->http_input_identify_post = NULL;
999 	mbstring_globals->http_input_identify_cookie = NULL;
1000 	mbstring_globals->http_input_identify_string = NULL;
1001 	mbstring_globals->http_input_list = NULL;
1002 	mbstring_globals->http_input_list_size = 0;
1003 	mbstring_globals->detect_order_list = NULL;
1004 	mbstring_globals->detect_order_list_size = 0;
1005 	mbstring_globals->current_detect_order_list = NULL;
1006 	mbstring_globals->current_detect_order_list_size = 0;
1007 	mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1008 	mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1009 	mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1010 	mbstring_globals->filter_illegal_substchar = '?';
1011 	mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1012 	mbstring_globals->current_filter_illegal_substchar = '?';
1013 	mbstring_globals->illegalchars = 0;
1014 	mbstring_globals->encoding_translation = 0;
1015 	mbstring_globals->strict_detection = 0;
1016 	mbstring_globals->outconv_enabled = false;
1017 	mbstring_globals->outconv_state = 0;
1018 	mbstring_globals->http_output_conv_mimetypes = NULL;
1019 #ifdef HAVE_MBREGEX
1020 	mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1021 #endif
1022 	mbstring_globals->last_used_encoding_name = NULL;
1023 	mbstring_globals->last_used_encoding = NULL;
1024 	mbstring_globals->internal_encoding_set = 0;
1025 	mbstring_globals->http_output_set = 0;
1026 	mbstring_globals->http_input_set = 0;
1027 	mbstring_globals->all_encodings_list = NULL;
1028 }
1029 /* }}} */
1030 
1031 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1032 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1033 {
1034 	if (mbstring_globals->http_input_list) {
1035 		free(ZEND_VOIDP(mbstring_globals->http_input_list));
1036 	}
1037 	if (mbstring_globals->detect_order_list) {
1038 		free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1039 	}
1040 	if (mbstring_globals->http_output_conv_mimetypes) {
1041 		_php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1042 	}
1043 #ifdef HAVE_MBREGEX
1044 	php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1045 #endif
1046 }
1047 /* }}} */
1048 
1049 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1050 static void init_check_utf8(void);
1051 #endif
1052 
1053 /* {{{ PHP_MINIT_FUNCTION(mbstring) */
PHP_MINIT_FUNCTION(mbstring)1054 PHP_MINIT_FUNCTION(mbstring)
1055 {
1056 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
1057 ZEND_TSRMLS_CACHE_UPDATE();
1058 #endif
1059 
1060 	REGISTER_INI_ENTRIES();
1061 
1062 	/* We assume that we're the only user of the hook. */
1063 	ZEND_ASSERT(php_internal_encoding_changed == NULL);
1064 	php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
1065 	mbstring_internal_encoding_changed_hook();
1066 
1067 	/* This is a global handler. Should not be set in a per-request handler. */
1068 	sapi_register_treat_data(mbstr_treat_data);
1069 
1070 	/* Post handlers are stored in the thread-local context. */
1071 	if (MBSTRG(encoding_translation)) {
1072 		sapi_register_post_entries(mbstr_post_entries);
1073 	}
1074 
1075 #ifdef HAVE_MBREGEX
1076 	PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1077 #endif
1078 
1079 	register_mbstring_symbols(module_number);
1080 
1081 	if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
1082 		return FAILURE;
1083 	}
1084 
1085 	php_rfc1867_set_multibyte_callbacks(
1086 		php_mb_encoding_translation,
1087 		php_mb_gpc_get_detect_order,
1088 		php_mb_gpc_set_input_encoding,
1089 		php_mb_rfc1867_getword,
1090 		php_mb_rfc1867_getword_conf,
1091 		php_mb_rfc1867_basename);
1092 
1093 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1094 	init_check_utf8();
1095 	init_convert_utf16();
1096 #endif
1097 
1098 	return SUCCESS;
1099 }
1100 /* }}} */
1101 
1102 /* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
PHP_MSHUTDOWN_FUNCTION(mbstring)1103 PHP_MSHUTDOWN_FUNCTION(mbstring)
1104 {
1105 	UNREGISTER_INI_ENTRIES();
1106 
1107 	zend_multibyte_restore_functions();
1108 
1109 #ifdef HAVE_MBREGEX
1110 	PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1111 #endif
1112 
1113 	php_internal_encoding_changed = NULL;
1114 
1115 	return SUCCESS;
1116 }
1117 /* }}} */
1118 
1119 /* {{{ PHP_RINIT_FUNCTION(mbstring) */
PHP_RINIT_FUNCTION(mbstring)1120 PHP_RINIT_FUNCTION(mbstring)
1121 {
1122 	MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
1123 	MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
1124 	MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
1125 	MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);
1126 
1127 	MBSTRG(illegalchars) = 0;
1128 
1129 	php_mb_populate_current_detect_order_list();
1130 
1131 #ifdef HAVE_MBREGEX
1132 	PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1133 #endif
1134 	zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));
1135 
1136 	return SUCCESS;
1137 }
1138 /* }}} */
1139 
1140 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1141 PHP_RSHUTDOWN_FUNCTION(mbstring)
1142 {
1143 	if (MBSTRG(current_detect_order_list) != NULL) {
1144 		efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1145 		MBSTRG(current_detect_order_list) = NULL;
1146 		MBSTRG(current_detect_order_list_size) = 0;
1147 	}
1148 
1149 	/* clear http input identification. */
1150 	MBSTRG(http_input_identify) = NULL;
1151 	MBSTRG(http_input_identify_post) = NULL;
1152 	MBSTRG(http_input_identify_get) = NULL;
1153 	MBSTRG(http_input_identify_cookie) = NULL;
1154 	MBSTRG(http_input_identify_string) = NULL;
1155 
1156 	if (MBSTRG(last_used_encoding_name)) {
1157 		zend_string_release(MBSTRG(last_used_encoding_name));
1158 		MBSTRG(last_used_encoding_name) = NULL;
1159 	}
1160 
1161 	MBSTRG(internal_encoding_set) = 0;
1162 	MBSTRG(http_output_set) = 0;
1163 	MBSTRG(http_input_set) = 0;
1164 
1165 	MBSTRG(outconv_enabled) = false;
1166 	MBSTRG(outconv_state) = 0;
1167 
1168 	if (MBSTRG(all_encodings_list)) {
1169 		GC_DELREF(MBSTRG(all_encodings_list));
1170 		zend_array_destroy(MBSTRG(all_encodings_list));
1171 		MBSTRG(all_encodings_list) = NULL;
1172 	}
1173 
1174 #ifdef HAVE_MBREGEX
1175 	PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1176 #endif
1177 
1178 	return SUCCESS;
1179 }
1180 /* }}} */
1181 
1182 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1183 PHP_MINFO_FUNCTION(mbstring)
1184 {
1185 	php_info_print_table_start();
1186 	php_info_print_table_row(2, "Multibyte Support", "enabled");
1187 	php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1188 	php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1189 	{
1190 		char tmp[256];
1191 		snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1192 		php_info_print_table_row(2, "libmbfl version", tmp);
1193 	}
1194 	php_info_print_table_end();
1195 
1196 	php_info_print_table_start();
1197 	php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1198 	php_info_print_table_end();
1199 
1200 #ifdef HAVE_MBREGEX
1201 	PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1202 #endif
1203 
1204 	DISPLAY_INI_ENTRIES();
1205 }
1206 /* }}} */
1207 
1208 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1209 PHP_FUNCTION(mb_language)
1210 {
1211 	zend_string *name = NULL;
1212 
1213 	ZEND_PARSE_PARAMETERS_START(0, 1)
1214 		Z_PARAM_OPTIONAL
1215 		Z_PARAM_STR_OR_NULL(name)
1216 	ZEND_PARSE_PARAMETERS_END();
1217 
1218 	if (name == NULL) {
1219 		RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1220 	} else {
1221 		zend_string *ini_name = ZSTR_INIT_LITERAL("mbstring.language", 0);
1222 		if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1223 			zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1224 			zend_string_release_ex(ini_name, 0);
1225 			RETURN_THROWS();
1226 		}
1227 		// TODO Make return void
1228 		RETVAL_TRUE;
1229 		zend_string_release_ex(ini_name, 0);
1230 	}
1231 }
1232 /* }}} */
1233 
1234 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1235 PHP_FUNCTION(mb_internal_encoding)
1236 {
1237 	char *name = NULL;
1238 	size_t name_len;
1239 	const mbfl_encoding *encoding;
1240 
1241 	ZEND_PARSE_PARAMETERS_START(0, 1)
1242 		Z_PARAM_OPTIONAL
1243 		Z_PARAM_STRING_OR_NULL(name, name_len)
1244 	ZEND_PARSE_PARAMETERS_END();
1245 
1246 	if (name == NULL) {
1247 		ZEND_ASSERT(MBSTRG(current_internal_encoding));
1248 		RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1249 	} else {
1250 		encoding = mbfl_name2encoding(name);
1251 		if (!encoding) {
1252 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1253 			RETURN_THROWS();
1254 		} else {
1255 			MBSTRG(current_internal_encoding) = encoding;
1256 			MBSTRG(internal_encoding_set) = 1;
1257 			/* TODO Return old encoding */
1258 			RETURN_TRUE;
1259 		}
1260 	}
1261 }
1262 /* }}} */
1263 
1264 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1265 PHP_FUNCTION(mb_http_input)
1266 {
1267 	char *type = NULL;
1268 	size_t type_len = 0, n;
1269 	const mbfl_encoding **entry;
1270 	const mbfl_encoding *encoding;
1271 
1272 	ZEND_PARSE_PARAMETERS_START(0, 1)
1273 		Z_PARAM_OPTIONAL
1274 		Z_PARAM_STRING_OR_NULL(type, type_len)
1275 	ZEND_PARSE_PARAMETERS_END();
1276 
1277 	if (type == NULL) {
1278 		encoding = MBSTRG(http_input_identify);
1279 	} else if (type_len != 1) {
1280 		zend_argument_value_error(1,
1281 			"must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1282 		RETURN_THROWS();
1283 	} else {
1284 		switch (*type) {
1285 		case 'G':
1286 		case 'g':
1287 			encoding = MBSTRG(http_input_identify_get);
1288 			break;
1289 		case 'P':
1290 		case 'p':
1291 			encoding = MBSTRG(http_input_identify_post);
1292 			break;
1293 		case 'C':
1294 		case 'c':
1295 			encoding = MBSTRG(http_input_identify_cookie);
1296 			break;
1297 		case 'S':
1298 		case 's':
1299 			encoding = MBSTRG(http_input_identify_string);
1300 			break;
1301 		case 'I':
1302 		case 'i':
1303 			entry = MBSTRG(http_input_list);
1304 			n = MBSTRG(http_input_list_size);
1305 			array_init(return_value);
1306 			for (size_t i = 0; i < n; i++, entry++) {
1307 				add_next_index_string(return_value, (*entry)->name);
1308 			}
1309 			return;
1310 		case 'L':
1311 		case 'l':
1312 			entry = MBSTRG(http_input_list);
1313 			n = MBSTRG(http_input_list_size);
1314 			if (n == 0) {
1315 				RETURN_FALSE;
1316 			}
1317 
1318 			smart_str result = {0};
1319 			for (size_t i = 0; i < n; i++, entry++) {
1320 				if (i > 0) {
1321 					smart_str_appendc(&result, ',');
1322 				}
1323 				smart_str_appends(&result, (*entry)->name);
1324 			}
1325 			RETURN_STR(smart_str_extract(&result));
1326 		default:
1327 			zend_argument_value_error(1,
1328 				"must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1329 			RETURN_THROWS();
1330 		}
1331 	}
1332 
1333 	if (encoding) {
1334 		RETURN_STRING(encoding->name);
1335 	} else {
1336 		RETURN_FALSE;
1337 	}
1338 }
1339 /* }}} */
1340 
1341 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1342 PHP_FUNCTION(mb_http_output)
1343 {
1344 	char *name = NULL;
1345 	size_t name_len;
1346 
1347 	ZEND_PARSE_PARAMETERS_START(0, 1)
1348 		Z_PARAM_OPTIONAL
1349 		Z_PARAM_PATH_OR_NULL(name, name_len) /* For null byte check */
1350 	ZEND_PARSE_PARAMETERS_END();
1351 
1352 	if (name == NULL) {
1353 		ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1354 		RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1355 	} else {
1356 		const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name, name_len);
1357 		if (!encoding) {
1358 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1359 			RETURN_THROWS();
1360 		} else {
1361 			MBSTRG(http_output_set) = 1;
1362 			MBSTRG(current_http_output_encoding) = encoding;
1363 			/* TODO Return previous encoding? */
1364 			RETURN_TRUE;
1365 		}
1366 	}
1367 }
1368 /* }}} */
1369 
1370 /* {{{ Sets the current detect_order or Return the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1371 PHP_FUNCTION(mb_detect_order)
1372 {
1373 	zend_string *order_str = NULL;
1374 	HashTable *order_ht = NULL;
1375 
1376 	ZEND_PARSE_PARAMETERS_START(0, 1)
1377 		Z_PARAM_OPTIONAL
1378 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1379 	ZEND_PARSE_PARAMETERS_END();
1380 
1381 	if (!order_str && !order_ht) {
1382 		size_t n = MBSTRG(current_detect_order_list_size);
1383 		const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1384 		array_init(return_value);
1385 		for (size_t i = 0; i < n; i++) {
1386 			add_next_index_string(return_value, (*entry)->name);
1387 			entry++;
1388 		}
1389 	} else {
1390 		const mbfl_encoding **list;
1391 		size_t size;
1392 		if (order_ht) {
1393 			if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1394 				RETURN_THROWS();
1395 			}
1396 		} else {
1397 			if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1)) {
1398 				RETURN_THROWS();
1399 			}
1400 		}
1401 
1402 		if (size == 0) {
1403 			efree(ZEND_VOIDP(list));
1404 			zend_argument_value_error(1, "must specify at least one encoding");
1405 			RETURN_THROWS();
1406 		}
1407 
1408 		if (MBSTRG(current_detect_order_list)) {
1409 			efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1410 		}
1411 		MBSTRG(current_detect_order_list) = list;
1412 		MBSTRG(current_detect_order_list_size) = size;
1413 		RETURN_TRUE;
1414 	}
1415 }
1416 /* }}} */
1417 
/* Returns true if cp may be used as a substitute character: it must lie in
 * 0..0x10FFFF and must not be a surrogate (0xD800..0xDFFF). */
static inline bool php_mb_check_code_point(zend_long cp)
{
	/* Anything negative or at/above 0x110000 is outside the Unicode range. */
	bool in_unicode_range = (cp >= 0 && cp < 0x110000);

	/* Surrogate code points are never valid on their own, and only a single
	 * substitute character is allowed. */
	bool is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);

	/* The target encoding of the conversion that will use the substitution
	 * character is not known here, so whether the code point is actually
	 * mapped in that encoding cannot be checked; everything else is accepted. */
	return in_unicode_range && !is_surrogate;
}
1436 
1437 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1438 PHP_FUNCTION(mb_substitute_character)
1439 {
1440 	zend_string *substitute_character = NULL;
1441 	zend_long substitute_codepoint;
1442 	bool substitute_is_null = 1;
1443 
1444 	ZEND_PARSE_PARAMETERS_START(0, 1)
1445 		Z_PARAM_OPTIONAL
1446 		Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1447 	ZEND_PARSE_PARAMETERS_END();
1448 
1449 	if (substitute_is_null) {
1450 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1451 			RETURN_STRING("none");
1452 		}
1453 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1454 			RETURN_STRING("long");
1455 		}
1456 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1457 			RETURN_STRING("entity");
1458 		}
1459 		RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1460 	}
1461 
1462 	if (substitute_character != NULL) {
1463 		if (zend_string_equals_literal_ci(substitute_character, "none")) {
1464 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1465 			RETURN_TRUE;
1466 		}
1467 		if (zend_string_equals_literal_ci(substitute_character, "long")) {
1468 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1469 			RETURN_TRUE;
1470 		}
1471 		if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1472 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1473 			RETURN_TRUE;
1474 		}
1475 		/* Invalid string value */
1476 		zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1477 		RETURN_THROWS();
1478 	}
1479 	/* Integer codepoint passed */
1480 	if (!php_mb_check_code_point(substitute_codepoint)) {
1481 		zend_argument_value_error(1, "is not a valid codepoint");
1482 		RETURN_THROWS();
1483 	}
1484 
1485 	MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1486 	MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1487 	RETURN_TRUE;
1488 }
1489 /* }}} */
1490 
1491 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1492 PHP_FUNCTION(mb_preferred_mime_name)
1493 {
1494 	char *name = NULL;
1495 	size_t name_len;
1496 
1497 	ZEND_PARSE_PARAMETERS_START(1, 1)
1498 		Z_PARAM_STRING(name, name_len)
1499 	ZEND_PARSE_PARAMETERS_END();
1500 
1501 	const mbfl_encoding *enc = mbfl_name2encoding(name);
1502 	if (enc == NULL) {
1503 		zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1504 		RETURN_THROWS();
1505 	}
1506 
1507 	const char *preferred_name = mbfl_encoding_preferred_mime_name(enc);
1508 	if (preferred_name == NULL || *preferred_name == '\0') {
1509 		php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1510 		RETVAL_FALSE;
1511 	} else {
1512 		RETVAL_STRING((char *)preferred_name);
1513 	}
1514 }
1515 /* }}} */
1516 
1517 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1518 PHP_FUNCTION(mb_parse_str)
1519 {
1520 	zval *track_vars_array = NULL;
1521 	char *encstr;
1522 	size_t encstr_len;
1523 	php_mb_encoding_handler_info_t info;
1524 	const mbfl_encoding *detected;
1525 
1526 	ZEND_PARSE_PARAMETERS_START(2, 2)
1527 		Z_PARAM_STRING(encstr, encstr_len)
1528 		Z_PARAM_ZVAL(track_vars_array)
1529 	ZEND_PARSE_PARAMETERS_END();
1530 
1531 	track_vars_array = zend_try_array_init(track_vars_array);
1532 	if (!track_vars_array) {
1533 		RETURN_THROWS();
1534 	}
1535 
1536 	encstr = estrndup(encstr, encstr_len);
1537 
1538 	info.data_type              = PARSE_STRING;
1539 	info.separator              = PG(arg_separator).input;
1540 	info.report_errors          = true;
1541 	info.to_encoding            = MBSTRG(current_internal_encoding);
1542 	info.from_encodings         = MBSTRG(http_input_list);
1543 	info.num_from_encodings     = MBSTRG(http_input_list_size);
1544 
1545 	detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1546 
1547 	MBSTRG(http_input_identify) = detected;
1548 
1549 	RETVAL_BOOL(detected);
1550 
1551 	if (encstr != NULL) efree(encstr);
1552 }
1553 /* }}} */
1554 
PHP_FUNCTION(mb_output_handler)1555 PHP_FUNCTION(mb_output_handler)
1556 {
1557 	zend_string *str;
1558 	zend_long arg_status;
1559 
1560 	ZEND_PARSE_PARAMETERS_START(2, 2)
1561 		Z_PARAM_STR(str)
1562 		Z_PARAM_LONG(arg_status)
1563 	ZEND_PARSE_PARAMETERS_END();
1564 
1565 	const mbfl_encoding *encoding = MBSTRG(current_http_output_encoding);
1566 	if (encoding == &mbfl_encoding_pass) {
1567 		RETURN_STR_COPY(str);
1568 	}
1569 
1570 	if (arg_status & PHP_OUTPUT_HANDLER_START) {
1571 		bool free_mimetype = false;
1572 		char *mimetype = NULL;
1573 
1574 		/* Analyze mime type */
1575 		if (SG(sapi_headers).mimetype && _php_mb_match_regex(MBSTRG(http_output_conv_mimetypes), SG(sapi_headers).mimetype, strlen(SG(sapi_headers).mimetype))) {
1576 			char *s;
1577 			if ((s = strchr(SG(sapi_headers).mimetype, ';')) == NULL) {
1578 				mimetype = estrdup(SG(sapi_headers).mimetype);
1579 			} else {
1580 				mimetype = estrndup(SG(sapi_headers).mimetype, s - SG(sapi_headers).mimetype);
1581 			}
1582 			free_mimetype = true;
1583 		} else if (SG(sapi_headers).send_default_content_type) {
1584 			mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1585 		}
1586 
1587 		/* If content-type is not yet set, set it and enable conversion */
1588 		if (SG(sapi_headers).send_default_content_type || free_mimetype) {
1589 			const char *charset = encoding->mime_name;
1590 			if (charset) {
1591 				char *p;
1592 				size_t len = spprintf(&p, 0, "Content-Type: %s; charset=%s",  mimetype, charset);
1593 				if (sapi_add_header(p, len, 0) != FAILURE) {
1594 					SG(sapi_headers).send_default_content_type = 0;
1595 				}
1596 			}
1597 
1598 			MBSTRG(outconv_enabled) = true;
1599 		}
1600 
1601 		if (free_mimetype) {
1602 			efree(mimetype);
1603 		}
1604 	}
1605 
1606 	if (!MBSTRG(outconv_enabled)) {
1607 		RETURN_STR_COPY(str);
1608 	}
1609 
1610 	mb_convert_buf buf;
1611 	mb_convert_buf_init(&buf, ZSTR_LEN(str), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1612 
1613 	uint32_t wchar_buf[128];
1614 	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
1615 	size_t in_len = ZSTR_LEN(str);
1616 	bool last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1617 
1618 	while (in_len) {
1619 		size_t out_len = MBSTRG(current_internal_encoding)->to_wchar(&in, &in_len, wchar_buf, 128, &MBSTRG(outconv_state));
1620 		ZEND_ASSERT(out_len <= 128);
1621 		encoding->from_wchar(wchar_buf, out_len, &buf, !in_len && last_feed);
1622 	}
1623 
1624 	MBSTRG(illegalchars) += buf.errors;
1625 	RETVAL_STR(mb_convert_buf_result_raw(&buf));
1626 
1627 	if (last_feed) {
1628 		MBSTRG(outconv_enabled) = false;
1629 		MBSTRG(outconv_state) = 0;
1630 	}
1631 }
1632 
PHP_FUNCTION(mb_str_split)1633 PHP_FUNCTION(mb_str_split)
1634 {
1635 	zend_string *str, *encoding = NULL;
1636 	zend_long split_len = 1;
1637 
1638 	ZEND_PARSE_PARAMETERS_START(1, 3)
1639 		Z_PARAM_STR(str)
1640 		Z_PARAM_OPTIONAL
1641 		Z_PARAM_LONG(split_len)
1642 		Z_PARAM_STR_OR_NULL(encoding)
1643 	ZEND_PARSE_PARAMETERS_END();
1644 
1645 	if (split_len <= 0) {
1646 		zend_argument_value_error(2, "must be greater than 0");
1647 		RETURN_THROWS();
1648 	} else if (split_len > UINT_MAX / 4) {
1649 		zend_argument_value_error(2, "is too large");
1650 		RETURN_THROWS();
1651 	}
1652 
1653 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
1654 	if (!enc) {
1655 		RETURN_THROWS();
1656 	}
1657 
1658 	if (ZSTR_LEN(str) == 0) {
1659 		RETURN_EMPTY_ARRAY();
1660 	}
1661 
1662 	unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
1663 
1664 	unsigned int char_len = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
1665 	if (char_len) {
1666 		unsigned int chunk_len = char_len * split_len;
1667 		unsigned int chunks = ((ZSTR_LEN(str) / chunk_len) + split_len - 1) / split_len; /* round up */
1668 		array_init_size(return_value, chunks);
1669 		while (p < e) {
1670 			add_next_index_stringl(return_value, (const char*)p, MIN(chunk_len, e - p));
1671 			p += chunk_len;
1672 		}
1673 	} else if (enc->mblen_table) {
1674 		unsigned char const *mbtab = enc->mblen_table;
1675 
1676 		/* Assume that we have 1-byte characters */
1677 		array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1678 
1679 		while (p < e) {
1680 			unsigned char *chunk = p; /* start of chunk */
1681 
1682 			for (int char_count = 0; char_count < split_len && p < e; char_count++) {
1683 				p += mbtab[*p];
1684 			}
1685 			if (p > e) {
1686 				p = e; /* ensure chunk is in bounds */
1687 			}
1688 			add_next_index_stringl(return_value, (const char*)chunk, p - chunk);
1689 		}
1690 	} else {
1691 		/* Assume that we have 1-byte characters */
1692 		array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1693 
1694 		uint32_t wchar_buf[128];
1695 		size_t in_len = ZSTR_LEN(str);
1696 		unsigned int state = 0, char_count = 0;
1697 
1698 		mb_convert_buf buf;
1699 
1700 		while (in_len) {
1701 			size_t out_len = enc->to_wchar(&p, &in_len, wchar_buf, 128, &state);
1702 			ZEND_ASSERT(out_len <= 128);
1703 			size_t i = 0;
1704 
1705 			/* Is there some output remaining from the previous iteration? */
1706 			if (char_count) {
1707 				if (out_len >= split_len - char_count) {
1708 					/* Finish off an incomplete chunk from previous iteration
1709 					 * ('buf' was already initialized; we don't need to do it again) */
1710 					enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
1711 					i += split_len - char_count;
1712 					char_count = 0;
1713 					add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1714 				} else {
1715 					/* Output from this iteration is not enough to finish the next chunk;
1716 					 * output what we can, and leave 'buf' to be used again on next iteration */
1717 					enc->from_wchar(wchar_buf, out_len, &buf, !in_len);
1718 					char_count += out_len;
1719 					continue;
1720 				}
1721 			}
1722 
1723 			while (i < out_len) {
1724 				/* Prepare for the next chunk */
1725 				mb_convert_buf_init(&buf, split_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1726 
1727 				if (out_len - i >= split_len) {
1728 					enc->from_wchar(wchar_buf + i, split_len, &buf, true);
1729 					i += split_len;
1730 					add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1731 				} else {
1732 					/* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
1733 					 * leave them for the next iteration */
1734 					enc->from_wchar(wchar_buf + i, out_len - i, &buf, !in_len);
1735 					char_count = out_len - i;
1736 					break;
1737 				}
1738 			}
1739 		}
1740 
1741 		if (char_count) {
1742 			/* The main loop above has finished processing the input string, but
1743 			 * has left a partial chunk in 'buf' */
1744 			add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1745 		}
1746 	}
1747 }
1748 
#ifdef __SSE2__
/* Horizontal sum: treat the 16 bytes of a 128-bit XMM register as unsigned 8-bit
 * integers and return their total in an ordinary scalar register
 * Thanks to StackOverflow user 'Paul R' (https://stackoverflow.com/users/253056/paul-r)
 * From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector */
static inline uint32_t _mm_sum_epu8(const __m128i v)
{
	/* SSE2 offers no single instruction which adds up every byte of a register
	 * _mm_sad_epu8 computes the absolute differences between corresponding bytes of two
	 * XMM registers and adds them up separately for the low and the high 8 bytes, leaving
	 * two 16-bit totals, one in each 64-bit half of the result
	 * Because the second operand is all zeroes, each "difference" is just the byte from `v` */
	const __m128i half_sums = _mm_sad_epu8(v, _mm_setzero_si128());

	/* The total is split across two lanes: pull out the one from the bottom half and
	 * the one from the top half (16-bit lane number 4), then combine them */
	const int low_total = _mm_cvtsi128_si32(half_sums);
	const int high_total = _mm_extract_epi16(half_sums, 4);
	return low_total + high_total;
}
#endif
1770 
/* Count the codepoints in the `len` bytes starting at `p`; the input is assumed to be valid UTF-8
 * Every byte of a UTF-8 string starts a new codepoint unless it is a continuation
 * byte (0x80-0xBF); read as signed 8-bit integers, those are exactly the values below -64
 * So the codepoint count is the byte length minus the number of continuation bytes */
static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
{
	unsigned char *tail_end = p + len;

#ifdef __SSE2__
	if (len >= sizeof(__m128i)) {
		/* Highest address from which a full 16-byte load still stays inside the string */
		unsigned char *last_block = tail_end - sizeof(__m128i);

		const __m128i threshold = _mm_set1_epi8(-64);
		const __m128i one = _mm_set1_epi8(1);
		__m128i tallies = _mm_setzero_si128(); /* 16 independent 8-bit continuation-byte counters */
		unsigned char rounds_left = 255;

		do {
			__m128i block = _mm_loadu_si128((__m128i*)p); /* unaligned load of 16 bytes */
			__m128i is_continuation = _mm_cmplt_epi8(block, threshold); /* all-ones in each matching lane */
			tallies = _mm_add_epi8(tallies, _mm_and_si128(is_continuation, one));

			/* Each 8-bit counter grows by at most 1 per round, so after 255 rounds it may
			 * be full; subtract the totals from `len` and start counting from zero again */
			rounds_left--;
			if (rounds_left == 0) {
				len -= _mm_sum_epu8(tallies);
				tallies = _mm_setzero_si128();
				rounds_left = 255;
			}

			p += sizeof(__m128i);
		} while (p <= last_block);

		/* Account for whatever is still held in the 16 counters */
		len -= _mm_sum_epu8(tallies);
	}
#endif

	/* Scalar pass over the bytes not handled above (at most 15 when the SSE2 path ran) */
	for (; p < tail_end; p++) {
		if ((signed char)*p < -64) {
			len--;
		}
	}

	return len;
}
1820 
/* Return the number of codepoints in `string` when it is interpreted as `encoding` */
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
{
	/* Non-zero for encodings flagged as single-byte or fixed 2/4-byte; the masked
	 * flag value is used directly as the number of bytes per character */
	unsigned int bytes_per_char = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (bytes_per_char) {
		return ZSTR_LEN(string) / bytes_per_char;
	}

	/* Strings already marked as valid UTF-8 can be measured without decoding them */
	if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && ZSTR_IS_VALID_UTF8(string)) {
		return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
	}

	/* General case: decode the string in batches of up to 128 codepoints and add up
	 * how many codepoints each batch produced; the decoded values themselves are discarded */
	uint32_t wchar_buf[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total = 0;

	while (bytes_left > 0) {
		total += encoding->to_wchar(&cursor, &bytes_left, wchar_buf, 128, &state);
	}

	return total;
}
1842 
1843 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1844 PHP_FUNCTION(mb_strlen)
1845 {
1846 	zend_string *string, *enc_name = NULL;
1847 
1848 	ZEND_PARSE_PARAMETERS_START(1, 2)
1849 		Z_PARAM_STR(string)
1850 		Z_PARAM_OPTIONAL
1851 		Z_PARAM_STR_OR_NULL(enc_name)
1852 	ZEND_PARSE_PARAMETERS_END();
1853 
1854 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1855 	if (!enc) {
1856 		RETURN_THROWS();
1857 	}
1858 
1859 	RETVAL_LONG(mb_get_strlen(string, enc));
1860 }
1861 /* }}} */
1862 
1863 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)1864 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
1865 {
1866 	return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
1867 }
1868 
offset_to_pointer_utf8(unsigned char * str,unsigned char * end,ssize_t offset)1869 static unsigned char* offset_to_pointer_utf8(unsigned char *str, unsigned char *end, ssize_t offset) {
1870 	if (offset < 0) {
1871 		unsigned char *pos = end;
1872 		while (offset < 0) {
1873 			if (pos <= str) {
1874 				return NULL;
1875 			}
1876 
1877 			unsigned char c = *--pos;
1878 			if (c < 0x80 || (c & 0xC0) != 0x80) {
1879 				offset++;
1880 			}
1881 		}
1882 		return pos;
1883 	} else {
1884 		const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
1885 		unsigned char *pos = str;
1886 		while (offset-- > 0) {
1887 			if (pos >= end) {
1888 				return NULL;
1889 			}
1890 			pos += u8_tbl[*pos];
1891 		}
1892 		return pos;
1893 	}
1894 }
1895 
/* Inverse of offset_to_pointer_utf8 for forward offsets: the character offset of `pos`
 * is the number of codepoints in the bytes between `start` and `pos` */
static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) {
	size_t byte_span = pos - start;
	return mb_fast_strlen_utf8(start, byte_span);
}
1899 
/* Find `needle` in `haystack`, both of which are encoded as `enc`
 *
 * Unless `enc` is already one of the UTF-8 encodings, both strings are first converted
 * to UTF-8 and the search is done on the converted copies (which are freed before returning)
 *
 * `offset` is a character offset; a negative value is counted from the end of the haystack
 * `reverse` selects a search for the last occurrence rather than the first
 *
 * Returns the character index of the match, MBFL_ERROR_OFFSET if `offset` does not fall
 * inside the haystack, or MBFL_ERROR_NOT_FOUND if there is no match */
static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse)
{
	size_t result;
	zend_string *haystack_u8 = NULL, *needle_u8 = NULL;
	unsigned char *offset_pointer;

	if (!php_mb_is_no_encoding_utf8(enc->no_encoding)) {
		unsigned int num_errors = 0;
		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
	} else {
		haystack_u8 = haystack;
		needle_u8 = needle;
	}

	/* Turn the character offset into a byte position within the UTF-8 haystack */
	offset_pointer = offset_to_pointer_utf8((unsigned char*)ZSTR_VAL(haystack_u8), (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8), offset);
	if (!offset_pointer) {
		result = MBFL_ERROR_OFFSET;
		goto out;
	}

	result = MBFL_ERROR_NOT_FOUND;
	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
		/* A needle longer than the haystack can never match */
		goto out;
	}

	const char *found_pos;
	if (!reverse) {
		/* First occurrence at or after the offset */
		found_pos = zend_memnstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8));
	} else if (offset >= 0) {
		/* Last occurrence at or after the offset */
		found_pos = zend_memnrstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8));
	} else {
		/* Negative offset, reverse search: a match may start as late as `offset_pointer`,
		 * so the searched region has to extend one needle length (in characters) past it
		 * The length must be measured on the UTF-8 form of the needle, because it is used
		 * to step through the UTF-8 haystack; the needle in its original encoding is not
		 * necessarily UTF-8 at all */
		size_t needle_len = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(needle_u8), (unsigned char*)ZSTR_VAL(needle_u8) + ZSTR_LEN(needle_u8));
		offset_pointer = offset_to_pointer_utf8(offset_pointer, (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8), needle_len);
		if (!offset_pointer) {
			/* Fewer than `needle_len` characters remain; search right up to the end */
			offset_pointer = (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8);
		}

		found_pos = zend_memnrstr(ZSTR_VAL(haystack_u8), ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)offset_pointer);
	}

	if (found_pos) {
		/* Convert the byte position of the match back into a character index */
		result = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(haystack_u8), (unsigned char*)found_pos);
	}

out:
	/* Only free the strings if they are converted copies rather than the caller's originals */
	if (haystack_u8 != haystack) {
		zend_string_free(haystack_u8);
	}
	if (needle_u8 != needle) {
		zend_string_free(needle_u8);
	}
	return result;
}
1954 
handle_strpos_error(size_t error)1955 static void handle_strpos_error(size_t error) {
1956 	switch (error) {
1957 	case MBFL_ERROR_NOT_FOUND:
1958 		break;
1959 	case MBFL_ERROR_ENCODING:
1960 		php_error_docref(NULL, E_WARNING, "Conversion error");
1961 		break;
1962 	case MBFL_ERROR_OFFSET:
1963 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1964 		break;
1965 	default:
1966 		zend_value_error("mb_strpos(): Unknown error");
1967 		break;
1968 	}
1969 }
1970 
PHP_FUNCTION(mb_strpos)1971 PHP_FUNCTION(mb_strpos)
1972 {
1973 	zend_long offset = 0;
1974 	zend_string *needle, *haystack;
1975 	zend_string *enc_name = NULL;
1976 
1977 	ZEND_PARSE_PARAMETERS_START(2, 4)
1978 		Z_PARAM_STR(haystack)
1979 		Z_PARAM_STR(needle)
1980 		Z_PARAM_OPTIONAL
1981 		Z_PARAM_LONG(offset)
1982 		Z_PARAM_STR_OR_NULL(enc_name)
1983 	ZEND_PARSE_PARAMETERS_END();
1984 
1985 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
1986 	if (!enc) {
1987 		RETURN_THROWS();
1988 	}
1989 
1990 	size_t n = mb_find_strpos(haystack, needle, enc, offset, false);
1991 	if (!mbfl_is_error(n)) {
1992 		RETVAL_LONG(n);
1993 	} else {
1994 		handle_strpos_error(n);
1995 		RETVAL_FALSE;
1996 	}
1997 }
1998 
1999 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)2000 PHP_FUNCTION(mb_strrpos)
2001 {
2002 	zend_long offset = 0;
2003 	zend_string *needle, *haystack;
2004 	zend_string *enc_name = NULL;
2005 
2006 	ZEND_PARSE_PARAMETERS_START(2, 4)
2007 		Z_PARAM_STR(haystack)
2008 		Z_PARAM_STR(needle)
2009 		Z_PARAM_OPTIONAL
2010 		Z_PARAM_LONG(offset)
2011 		Z_PARAM_STR_OR_NULL(enc_name)
2012 	ZEND_PARSE_PARAMETERS_END();
2013 
2014 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
2015 	if (!enc) {
2016 		RETURN_THROWS();
2017 	}
2018 
2019 	size_t n = mb_find_strpos(haystack, needle, enc, offset, true);
2020 	if (!mbfl_is_error(n)) {
2021 		RETVAL_LONG(n);
2022 	} else {
2023 		handle_strpos_error(n);
2024 		RETVAL_FALSE;
2025 	}
2026 }
2027 /* }}} */
2028 
2029 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)2030 PHP_FUNCTION(mb_stripos)
2031 {
2032 	zend_long offset = 0;
2033 	zend_string *haystack, *needle;
2034 	zend_string *from_encoding = NULL;
2035 
2036 	ZEND_PARSE_PARAMETERS_START(2, 4)
2037 		Z_PARAM_STR(haystack)
2038 		Z_PARAM_STR(needle)
2039 		Z_PARAM_OPTIONAL
2040 		Z_PARAM_LONG(offset)
2041 		Z_PARAM_STR_OR_NULL(from_encoding)
2042 	ZEND_PARSE_PARAMETERS_END();
2043 
2044 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2045 	if (!enc) {
2046 		RETURN_THROWS();
2047 	}
2048 
2049 	size_t n = php_mb_stripos(false, haystack, needle, offset, enc);
2050 
2051 	if (!mbfl_is_error(n)) {
2052 		RETVAL_LONG(n);
2053 	} else {
2054 		handle_strpos_error(n);
2055 		RETVAL_FALSE;
2056 	}
2057 }
2058 /* }}} */
2059 
2060 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)2061 PHP_FUNCTION(mb_strripos)
2062 {
2063 	zend_long offset = 0;
2064 	zend_string *haystack, *needle;
2065 	zend_string *from_encoding = NULL;
2066 
2067 	ZEND_PARSE_PARAMETERS_START(2, 4)
2068 		Z_PARAM_STR(haystack)
2069 		Z_PARAM_STR(needle)
2070 		Z_PARAM_OPTIONAL
2071 		Z_PARAM_LONG(offset)
2072 		Z_PARAM_STR_OR_NULL(from_encoding)
2073 	ZEND_PARSE_PARAMETERS_END();
2074 
2075 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2076 	if (!enc) {
2077 		RETURN_THROWS();
2078 	}
2079 
2080 	size_t n = php_mb_stripos(true, haystack, needle, offset, enc);
2081 
2082 	if (!mbfl_is_error(n)) {
2083 		RETVAL_LONG(n);
2084 	} else {
2085 		handle_strpos_error(n);
2086 		RETVAL_FALSE;
2087 	}
2088 }
2089 /* }}} */
2090 
mb_get_substr_slow(unsigned char * in,size_t in_len,size_t from,size_t len,const mbfl_encoding * enc)2091 static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t from, size_t len, const mbfl_encoding *enc)
2092 {
2093 	uint32_t wchar_buf[128];
2094 	unsigned int state = 0;
2095 
2096 	mb_convert_buf buf;
2097 	mb_convert_buf_init(&buf, MIN(len, in_len - from), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2098 
2099 	while (in_len && len) {
2100 		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2101 		ZEND_ASSERT(out_len <= 128);
2102 
2103 		if (from >= out_len) {
2104 			from -= out_len;
2105 		} else {
2106 			size_t needed_codepoints = MIN(out_len - from, len);
2107 			enc->from_wchar(wchar_buf + from, needed_codepoints, &buf, !in_len || out_len >= len);
2108 			from = 0;
2109 			len -= needed_codepoints;
2110 		}
2111 	}
2112 
2113 	return mb_convert_buf_result(&buf, enc);
2114 }
2115 
/* Return up to `len` characters of `input`, starting at character offset `from`
 * Encodings with a fixed number of bytes per character are handled by plain byte
 * arithmetic; everything else goes through mb_get_substr_slow */
static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
		/* Other than MacJapanese, no supported text encoding decodes to
		 * more than one codepoint per byte
		 * So if the number of codepoints to skip >= number of input bytes,
		 * then definitely the output should be empty */
		return zend_empty_string;
	}

	/* Does each codepoint have a fixed byte width? */
	unsigned int flag = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (flag) {
		/* The value of the flag is 2 if each codepoint takes 2 bytes, or 4 if 4 bytes,
		 * so it converts character counts into byte counts
		 * Both conversions are checked against the input size in units of characters
		 * first, so that neither multiplication can wrap around */
		if (from > in_len / flag) {
			return zend_empty_string;
		}
		from *= flag;
		if (from >= in_len) {
			return zend_empty_string;
		}
		in += from;
		in_len -= from;

		/* Clamp to the bytes which remain; a huge `len` (such as MBFL_SUBSTR_UNTIL_END)
		 * simply means "everything up to the end of the string" */
		if (len > in_len / flag) {
			len = in_len;
		} else {
			len *= flag;
		}
		return zend_string_init_fast((const char*)in, len);
	}

	return mb_get_substr_slow(in, in_len, from, len, enc);
}
2148 
2149 #define MB_STRSTR 1
2150 #define MB_STRRCHR 2
2151 #define MB_STRISTR 3
2152 #define MB_STRRICHR 4
2153 
/* Shared implementation of mb_strstr, mb_strrchr, mb_stristr and mb_strrichr
 * `variant` is one of the MB_STR* constants and decides the search direction and
 * whether the comparison ignores case
 * On a match, returns the part of the haystack before the match (when the `part`
 * argument is true) or from the match to the end; otherwise returns false */
static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
{
	zend_string *haystack, *needle;
	zend_string *encoding_name = NULL;
	bool part = false;

	ZEND_PARSE_PARAMETERS_START(2, 4)
		Z_PARAM_STR(haystack)
		Z_PARAM_STR(needle)
		Z_PARAM_OPTIONAL
		Z_PARAM_BOOL(part)
		Z_PARAM_STR_OR_NULL(encoding_name)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding_name, 4);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	const bool search_from_end = (variant == MB_STRRCHR || variant == MB_STRRICHR);
	const bool ignore_case = (variant == MB_STRISTR || variant == MB_STRRICHR);

	size_t pos;
	if (ignore_case) {
		pos = php_mb_stripos(search_from_end, haystack, needle, 0, enc);
	} else {
		pos = mb_find_strpos(haystack, needle, enc, 0, search_from_end);
	}

	if (mbfl_is_error(pos)) {
		// FIXME use handle_strpos_error(n)
		RETURN_FALSE;
	}

	if (part) {
		/* Everything before the match */
		RETURN_STR(mb_get_substr(haystack, 0, pos, enc));
	}
	/* From the match up to the end of the haystack */
	RETURN_STR(mb_get_substr(haystack, pos, MBFL_SUBSTR_UNTIL_END, enc));
}
2195 
2196 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(mb_strstr)2197 PHP_FUNCTION(mb_strstr)
2198 {
2199 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
2200 }
2201 /* }}} */
2202 
2203 /* {{{ Finds the last occurrence of a character in a string within another */
PHP_FUNCTION(mb_strrchr)2204 PHP_FUNCTION(mb_strrchr)
2205 {
2206 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
2207 }
2208 /* }}} */
2209 
2210 /* {{{ Finds first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stristr)2211 PHP_FUNCTION(mb_stristr)
2212 {
2213 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
2214 }
2215 /* }}} */
2216 
2217 /* {{{ Finds the last occurrence of a character in a string within another, case insensitive */
PHP_FUNCTION(mb_strrichr)2218 PHP_FUNCTION(mb_strrichr)
2219 {
2220 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
2221 }
2222 /* }}} */
2223 
2224 #undef MB_STRSTR
2225 #undef MB_STRRCHR
2226 #undef MB_STRISTR
2227 #undef MB_STRRICHR
2228 
PHP_FUNCTION(mb_substr_count)2229 PHP_FUNCTION(mb_substr_count)
2230 {
2231 	zend_string *haystack, *needle, *enc_name = NULL, *haystack_u8 = NULL, *needle_u8 = NULL;
2232 
2233 	ZEND_PARSE_PARAMETERS_START(2, 3)
2234 		Z_PARAM_STR(haystack)
2235 		Z_PARAM_STR(needle)
2236 		Z_PARAM_OPTIONAL
2237 		Z_PARAM_STR_OR_NULL(enc_name)
2238 	ZEND_PARSE_PARAMETERS_END();
2239 
2240 	if (ZSTR_LEN(needle) == 0) {
2241 		zend_argument_must_not_be_empty_error(2);
2242 		RETURN_THROWS();
2243 	}
2244 
2245 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
2246 	if (!enc) {
2247 		RETURN_THROWS();
2248 	}
2249 
2250 	if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
2251 		/* No need to do any conversion if haystack/needle are already known-valid UTF-8
2252 		 * (If they are not valid, then not passing them through conversion filters could affect output) */
2253 		if (ZSTR_IS_VALID_UTF8(haystack)) {
2254 			haystack_u8 = haystack;
2255 		} else {
2256 			unsigned int num_errors = 0;
2257 			haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2258 			if (!num_errors && !ZSTR_IS_INTERNED(haystack)) {
2259 				GC_ADD_FLAGS(haystack, IS_STR_VALID_UTF8);
2260 			}
2261 		}
2262 
2263 		if (ZSTR_IS_VALID_UTF8(needle)) {
2264 			needle_u8 = needle;
2265 		} else {
2266 			unsigned int num_errors = 0;
2267 			needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2268 			if (!num_errors && !ZSTR_IS_INTERNED(needle)) {
2269 				GC_ADD_FLAGS(needle, IS_STR_VALID_UTF8);
2270 			}
2271 		}
2272 	} else {
2273 		unsigned int num_errors = 0;
2274 		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2275 		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2276 		/* A string with >0 bytes may convert to 0 codepoints; for example, the contents
2277 		 * may be only escape sequences */
2278 		if (ZSTR_LEN(needle_u8) == 0) {
2279 			zend_string_free(haystack_u8);
2280 			zend_string_free(needle_u8);
2281 			zend_argument_must_not_be_empty_error(2);
2282 			RETURN_THROWS();
2283 		}
2284 	}
2285 
2286 	size_t result = 0;
2287 
2288 	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
2289 		goto out;
2290 	}
2291 
2292 	const char *p = ZSTR_VAL(haystack_u8), *e = p + ZSTR_LEN(haystack_u8);
2293 	while (true) {
2294 		p = zend_memnstr(p, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), e);
2295 		if (!p) {
2296 			break;
2297 		}
2298 		p += ZSTR_LEN(needle_u8);
2299 		result++;
2300 	}
2301 
2302 out:
2303 	if (haystack_u8 != haystack) {
2304 		zend_string_free(haystack_u8);
2305 	}
2306 	if (needle_u8 != needle) {
2307 		zend_string_free(needle_u8);
2308 	}
2309 
2310 	RETVAL_LONG(result);
2311 }
2312 
2313 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2314 PHP_FUNCTION(mb_substr)
2315 {
2316 	zend_string *str, *encoding = NULL;
2317 	zend_long from, len;
2318 	size_t real_from, real_len;
2319 	bool len_is_null = true;
2320 
2321 	ZEND_PARSE_PARAMETERS_START(2, 4)
2322 		Z_PARAM_STR(str)
2323 		Z_PARAM_LONG(from)
2324 		Z_PARAM_OPTIONAL
2325 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2326 		Z_PARAM_STR_OR_NULL(encoding)
2327 	ZEND_PARSE_PARAMETERS_END();
2328 
2329 	if (from == ZEND_LONG_MIN) {
2330 		zend_argument_value_error(2, "must be between " ZEND_LONG_FMT " and " ZEND_LONG_FMT, (ZEND_LONG_MIN + 1), ZEND_LONG_MAX);
2331 		RETURN_THROWS();
2332 	}
2333 
2334 	if (!len_is_null && len == ZEND_LONG_MIN) {
2335 		zend_argument_value_error(3, "must be between " ZEND_LONG_FMT " and " ZEND_LONG_FMT, (ZEND_LONG_MIN + 1), ZEND_LONG_MAX);
2336 		RETURN_THROWS();
2337 	}
2338 
2339 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2340 	if (!enc) {
2341 		RETURN_THROWS();
2342 	}
2343 
2344 	size_t mblen = 0;
2345 	if (from < 0 || (!len_is_null && len < 0)) {
2346 		mblen = mb_get_strlen(str, enc);
2347 	}
2348 
2349 	/* if "from" position is negative, count start position from the end
2350 	 * of the string */
2351 	if (from >= 0) {
2352 		real_from = (size_t) from;
2353 	} else if (-from < mblen) {
2354 		real_from = mblen + from;
2355 	} else {
2356 		real_from = 0;
2357 	}
2358 
2359 	/* if "length" position is negative, set it to the length
2360 	 * needed to stop that many chars from the end of the string */
2361 	if (len_is_null) {
2362 		real_len = MBFL_SUBSTR_UNTIL_END;
2363 	} else if (len >= 0) {
2364 		real_len = (size_t) len;
2365 	} else if (real_from < mblen && -len < mblen - real_from) {
2366 		real_len = (mblen - real_from) + len;
2367 	} else {
2368 		real_len = 0;
2369 	}
2370 
2371 	RETVAL_STR(mb_get_substr(str, real_from, real_len, enc));
2372 }
2373 /* }}} */
2374 
2375 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2376 PHP_FUNCTION(mb_strcut)
2377 {
2378 	zend_string *encoding = NULL;
2379 	char *string_val;
2380 	zend_long from, len;
2381 	bool len_is_null = true;
2382 	mbfl_string string, result, *ret;
2383 
2384 	ZEND_PARSE_PARAMETERS_START(2, 4)
2385 		Z_PARAM_STRING(string_val, string.len)
2386 		Z_PARAM_LONG(from)
2387 		Z_PARAM_OPTIONAL
2388 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2389 		Z_PARAM_STR_OR_NULL(encoding)
2390 	ZEND_PARSE_PARAMETERS_END();
2391 
2392 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2393 	if (!enc) {
2394 		RETURN_THROWS();
2395 	}
2396 
2397 	string.val = (unsigned char*)string_val;
2398 	string.encoding = enc;
2399 
2400 	if (len_is_null) {
2401 		len = string.len;
2402 	}
2403 
2404 	/* if "from" position is negative, count start position from the end
2405 	 * of the string */
2406 	if (from < 0) {
2407 		from = string.len + from;
2408 		if (from < 0) {
2409 			from = 0;
2410 		}
2411 	}
2412 
2413 	/* if "length" position is negative, set it to the length
2414 	 * needed to stop that many chars from the end of the string */
2415 	if (len < 0) {
2416 		len = (string.len - from) + len;
2417 		if (len < 0) {
2418 			len = 0;
2419 		}
2420 	}
2421 
2422 	if (from > string.len || len == 0) {
2423 		RETURN_EMPTY_STRING();
2424 	}
2425 
2426 	if (enc->cut) {
2427 		RETURN_STR(enc->cut(string.val, from, len, string.val + string.len));
2428 	}
2429 
2430 	unsigned int char_len = string.encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
2431 	if (char_len) {
2432 		/* Round `from` down to a multiple of `char_len`; works because `char_len` is a power of 2 */
2433 		from &= -char_len;
2434 		if (len > string.len - from) {
2435 			len = string.len - from;
2436 		}
2437 		RETURN_STR(zend_string_init_fast((const char*)(string.val + from), len & -char_len));
2438 	}
2439 
2440 	if (enc->mblen_table) {
2441 		const unsigned char *mbtab = enc->mblen_table;
2442 		const unsigned char *p, *q, *end;
2443 		int m = 0;
2444 		/* Search for start position */
2445 		for (p = (const unsigned char*)string.val, q = p + from; p < q; p += (m = mbtab[*p]));
2446 		if (p > q) {
2447 			p -= m;
2448 		}
2449 		const unsigned char *start = p;
2450 		/* Search for end position */
2451 		if (len >= string.len - (start - (const unsigned char*)string.val)) {
2452 			end = (const unsigned char*)(string.val + string.len);
2453 		} else {
2454 			for (q = p + len; p < q; p += (m = mbtab[*p]));
2455 			if (p > q) {
2456 				p -= m;
2457 			}
2458 			end = p;
2459 		}
2460 		RETURN_STR(zend_string_init_fast((const char*)start, end - start));
2461 	}
2462 
2463 	ret = mbfl_strcut(&string, &result, from, len);
2464 	ZEND_ASSERT(ret != NULL);
2465 	RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2466 	efree(ret->val);
2467 }
2468 /* }}} */
2469 
2470 /* Some East Asian characters, when printed at a terminal (or the like), require double
2471  * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2472 static size_t character_width(uint32_t c)
2473 {
2474 	if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2475 		return 1;
2476 	}
2477 
2478 	/* Do a binary search to see if we fall in any of the fullwidth ranges */
2479 	unsigned int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2480 	while (lo < hi) {
2481 		unsigned int probe = (lo + hi) / 2;
2482 		if (c < mbfl_eaw_table[probe].begin) {
2483 			hi = probe;
2484 		} else if (c > mbfl_eaw_table[probe].end) {
2485 			lo = probe + 1;
2486 		} else {
2487 			return 2;
2488 		}
2489 	}
2490 
2491 	return 1;
2492 }
2493 
/* Total display width of `string` (encoded as `enc`): the sum of the widths of all its codepoints */
static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
{
	uint32_t wchar_buf[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total_width = 0;

	/* Decode in batches of up to 128 codepoints */
	while (bytes_left) {
		size_t decoded = enc->to_wchar(&cursor, &bytes_left, wchar_buf, 128, &state);
		ZEND_ASSERT(decoded <= 128);

		/* NOTE: 'bad input' marker will be counted as 1 unit of width
		 * If text conversion is performed with an ordinary ASCII character as
		 * the 'replacement character', this will give us the correct display width. */
		for (size_t i = 0; i < decoded; i++) {
			total_width += character_width(wchar_buf[i]);
		}
	}

	return total_width;
}
2516 
2517 /* Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2518 PHP_FUNCTION(mb_strwidth)
2519 {
2520 	zend_string *string, *enc_name = NULL;
2521 
2522 	ZEND_PARSE_PARAMETERS_START(1, 2)
2523 		Z_PARAM_STR(string)
2524 		Z_PARAM_OPTIONAL
2525 		Z_PARAM_STR_OR_NULL(enc_name)
2526 	ZEND_PARSE_PARAMETERS_END();
2527 
2528 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2529 	if (!enc) {
2530 		RETURN_THROWS();
2531 	}
2532 
2533 	RETVAL_LONG(mb_get_strwidth(string, enc));
2534 }
2535 
/* Core of mb_strimwidth: skip the first `from` codepoints of `input`, then keep as much
 * of the rest as fits into `width` columns
 * If the rest is too wide, it is cut short and `marker` is appended, such that the
 * result (marker included) still fits into `width` columns */
static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, size_t from, size_t width)
{
	uint32_t codepoints[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(input);
	size_t bytes_left = ZSTR_LEN(input);
	unsigned int decoder_state = 0;
	size_t width_budget = width;
	size_t skip_count = from;
	size_t decoded = 0;
	bool in_first_batch = true, saw_bad_input = false;
	mb_convert_buf buf;

	/* First pass: measure only, to find out whether the text after `from` fits into `width` */
	while (bytes_left) {
		decoded = enc->to_wchar(&cursor, &bytes_left, codepoints, 128, &decoder_state);
		ZEND_ASSERT(decoded <= 128);

		if (decoded <= skip_count) {
			/* Every codepoint of this batch is among those to be skipped */
			skip_count -= decoded;
		} else {
			for (size_t i = skip_count; i < decoded; i++) {
				uint32_t w = codepoints[i];
				size_t w_width = character_width(w);

				if (w == MBFL_BAD_INPUT) {
					saw_bad_input = true;
				}

				if (width_budget < w_width) {
					/* This codepoint does not fit any more, so the string must be trimmed */
					size_t marker_width = mb_get_strwidth(marker, enc);

					/* The trim marker alone uses up the whole requested width (or more);
					 * the result is just the marker */
					if (width <= marker_width) {
						return zend_string_copy(marker);
					}

					/* From here on, 'width' is the number of columns which may be
					 * filled with text taken from 'input' */
					width -= marker_width;
					mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

					if (!in_first_batch) {
						goto restart_conversion;
					}
					/* Still in the very first batch: the codepoints needed for the
					 * output are the ones in the buffer right now, so they can be
					 * reused without decoding the input again */
					goto dont_restart_conversion;
				}
				width_budget -= w_width;
			}
			skip_count = 0;
		}
		in_first_batch = false;
	}

	/* Everything fits into the requested width, so no trim marker is needed */
	if (saw_bad_input) {
		/* Erroneous byte sequences should come out as error markers
		 * `mb_get_substr` is unsuitable for that: it picks out the substring in the fastest
		 * way possible, which may not involve converting erroneous sequences at all */
		return mb_get_substr_slow((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), from, MBFL_SUBSTR_UNTIL_END, enc);
	}
	if (from == 0) {
		/* Only the refcount of the string is incremented; nothing is really copied */
		return zend_string_copy(input);
	}
	return mb_get_substr(input, from, MBFL_SUBSTR_UNTIL_END, enc);

	/* Second pass (entered only via the gotos above): the input is too wide, so build a
	 * new string out of the leading portion which fits, followed by the trim marker */
restart_conversion:
	cursor = (unsigned char*)ZSTR_VAL(input);
	bytes_left = ZSTR_LEN(input);
	decoder_state = 0;

	for (;;) {
		decoded = enc->to_wchar(&cursor, &bytes_left, codepoints, 128, &decoder_state);
		ZEND_ASSERT(decoded <= 128);

dont_restart_conversion:
		if (decoded <= from) {
			from -= decoded;
		} else {
			for (size_t i = from; i < decoded; i++) {
				size_t w_width = character_width(codepoints[i]);
				if (width < w_width) {
					/* Codepoint `i` is the first one which does not fit; emit everything
					 * before it as the final piece of converted output */
					enc->from_wchar(codepoints + from, i - from, &buf, true);
					goto append_trim_marker;
				}
				width -= w_width;
			}
			/* The whole batch fits; since the first pass found a codepoint which
			 * does not fit, there has to be more input after this batch */
			ZEND_ASSERT(bytes_left > 0);
			enc->from_wchar(codepoints + from, decoded - from, &buf, false);
			from = 0;
		}
	}

append_trim_marker:
	/* The marker is appended byte for byte, without any conversion */
	if (ZSTR_LEN(marker) > 0) {
		MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
		buf.out = zend_mempcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
	}

	/* Even if `enc` is UTF-8, don't mark the output string as valid UTF-8, because
	 * we have no guarantee that the trim marker string is valid UTF-8 */
	return mb_convert_buf_result_raw(&buf);
}
2646 
2647 /* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
PHP_FUNCTION(mb_strimwidth)2648 PHP_FUNCTION(mb_strimwidth)
2649 {
2650 	zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2651 	zend_long from, width;
2652 
2653 	ZEND_PARSE_PARAMETERS_START(3, 5)
2654 		Z_PARAM_STR(str)
2655 		Z_PARAM_LONG(from)
2656 		Z_PARAM_LONG(width)
2657 		Z_PARAM_OPTIONAL
2658 		Z_PARAM_STR(trimmarker)
2659 		Z_PARAM_STR_OR_NULL(encoding)
2660 	ZEND_PARSE_PARAMETERS_END();
2661 
2662 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2663 	if (!enc) {
2664 		RETURN_THROWS();
2665 	}
2666 
2667 	if (from != 0) {
2668 		size_t str_len = mb_get_strlen(str, enc);
2669 		if (from < 0) {
2670 			from += str_len;
2671 		}
2672 		if (from < 0 || from > str_len) {
2673 			zend_argument_value_error(2, "is out of range");
2674 			RETURN_THROWS();
2675 		}
2676 	}
2677 
2678 	if (width < 0) {
2679 		php_error_docref(NULL, E_DEPRECATED,
2680 			"passing a negative integer to argument #3 ($width) is deprecated");
2681 		width += mb_get_strwidth(str, enc);
2682 
2683 		if (from > 0) {
2684 			zend_string *trimmed = mb_get_substr(str, 0, from, enc);
2685 			width -= mb_get_strwidth(trimmed, enc);
2686 			zend_string_free(trimmed);
2687 		}
2688 
2689 		if (width < 0) {
2690 			zend_argument_value_error(3, "is out of range");
2691 			RETURN_THROWS();
2692 		}
2693 	}
2694 
2695 	RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2696 }
2697 
2698 
2699 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2700 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2701 {
2702 	return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2703 			|| (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2704 			|| (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2705 			|| (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2706 }
2707 
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2708 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2709 {
2710 	unsigned int num_errors = 0;
2711 	zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2712 	MBSTRG(illegalchars) += num_errors;
2713 	return result;
2714 }
2715 
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2716 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2717 {
2718 	const mbfl_encoding *from_encoding;
2719 
2720 	/* pre-conversion encoding */
2721 	ZEND_ASSERT(num_from_encodings >= 1);
2722 	if (num_from_encodings == 1) {
2723 		from_encoding = *from_encodings;
2724 	} else {
2725 		/* auto detect */
2726 		from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true);
2727 		if (!from_encoding) {
2728 			php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2729 			return NULL;
2730 		}
2731 	}
2732 
2733 	return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2734 }
2735 
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2736 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2737 {
2738 	HashTable *output, *chash;
2739 	zend_long idx;
2740 	zend_string *key;
2741 	zval *entry, entry_tmp;
2742 
2743 	if (!input) {
2744 		return NULL;
2745 	}
2746 
2747 	if (GC_IS_RECURSIVE(input)) {
2748 		GC_UNPROTECT_RECURSION(input);
2749 		php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2750 		return NULL;
2751 	}
2752 	GC_TRY_PROTECT_RECURSION(input);
2753 	output = zend_new_array(zend_hash_num_elements(input));
2754 	ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2755 		/* convert key */
2756 		if (key) {
2757 			zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2758 			if (!converted_key) {
2759 				continue;
2760 			}
2761 			key = converted_key;
2762 		}
2763 		/* convert value */
2764 		ZEND_ASSERT(entry);
2765 try_again:
2766 		switch(Z_TYPE_P(entry)) {
2767 			case IS_STRING: {
2768 				zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2769 				if (!converted_key) {
2770 					if (key) {
2771 						zend_string_release(key);
2772 					}
2773 					continue;
2774 				}
2775 				ZVAL_STR(&entry_tmp, converted_key);
2776 				break;
2777 			}
2778 			case IS_NULL:
2779 			case IS_TRUE:
2780 			case IS_FALSE:
2781 			case IS_LONG:
2782 			case IS_DOUBLE:
2783 				ZVAL_COPY(&entry_tmp, entry);
2784 				break;
2785 			case IS_ARRAY:
2786 				chash = php_mb_convert_encoding_recursive(
2787 					Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2788 				if (chash) {
2789 					ZVAL_ARR(&entry_tmp, chash);
2790 				} else {
2791 					ZVAL_EMPTY_ARRAY(&entry_tmp);
2792 				}
2793 				break;
2794 			case IS_REFERENCE:
2795 				entry = Z_REFVAL_P(entry);
2796 				goto try_again;
2797 			case IS_OBJECT:
2798 			default:
2799 				if (key) {
2800 					zend_string_release(key);
2801 				}
2802 				php_error_docref(NULL, E_WARNING, "Object is not supported");
2803 				continue;
2804 		}
2805 		if (key) {
2806 			zend_hash_add(output, key, &entry_tmp);
2807 			zend_string_release(key);
2808 		} else {
2809 			zend_hash_index_add(output, idx, &entry_tmp);
2810 		}
2811 	} ZEND_HASH_FOREACH_END();
2812 	GC_TRY_UNPROTECT_RECURSION(input);
2813 
2814 	return output;
2815 }
2816 /* }}} */
2817 
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2818 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2819 {
2820 	/* mbstring supports some 'text encodings' which aren't really text encodings
2821 	 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2822 	 * These should never be returned by `mb_detect_encoding`. */
2823 	unsigned int shift = 0;
2824 	for (unsigned int i = 0; i < *size; i++) {
2825 		const mbfl_encoding *encoding = elist[i];
2826 		if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2827 			shift++; /* Remove this encoding from the list */
2828 		} else if (shift) {
2829 			elist[i - shift] = encoding;
2830 		}
2831 	}
2832 	*size -= shift;
2833 }
2834 
2835 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2836 PHP_FUNCTION(mb_convert_encoding)
2837 {
2838 	zend_string *to_encoding_name;
2839 	zend_string *input_str, *from_encodings_str = NULL;
2840 	HashTable *input_ht, *from_encodings_ht = NULL;
2841 	const mbfl_encoding **from_encodings;
2842 	size_t num_from_encodings;
2843 	bool free_from_encodings = false;
2844 
2845 	ZEND_PARSE_PARAMETERS_START(2, 3)
2846 		Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2847 		Z_PARAM_STR(to_encoding_name)
2848 		Z_PARAM_OPTIONAL
2849 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2850 	ZEND_PARSE_PARAMETERS_END();
2851 
2852 	const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2853 	if (!to_encoding) {
2854 		RETURN_THROWS();
2855 	}
2856 
2857 	if (from_encodings_ht) {
2858 		if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2859 			RETURN_THROWS();
2860 		}
2861 		free_from_encodings = true;
2862 	} else if (from_encodings_str) {
2863 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2864 				&from_encodings, &num_from_encodings,
2865 				/* persistent */ 0, /* arg_num */ 3) == FAILURE) {
2866 			RETURN_THROWS();
2867 		}
2868 		free_from_encodings = true;
2869 	} else {
2870 		from_encodings = &MBSTRG(current_internal_encoding);
2871 		num_from_encodings = 1;
2872 	}
2873 
2874 	if (num_from_encodings > 1) {
2875 		remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2876 	}
2877 
2878 	if (!num_from_encodings) {
2879 		efree(ZEND_VOIDP(from_encodings));
2880 		zend_argument_value_error(3, "must specify at least one encoding");
2881 		RETURN_THROWS();
2882 	}
2883 
2884 	if (input_str) {
2885 		zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2886 		if (ret != NULL) {
2887 			RETVAL_STR(ret);
2888 		} else {
2889 			RETVAL_FALSE;
2890 		}
2891 	} else {
2892 		HashTable *tmp;
2893 		tmp = php_mb_convert_encoding_recursive(
2894 			input_ht, to_encoding, from_encodings, num_from_encodings);
2895 		RETVAL_ARR(tmp);
2896 	}
2897 
2898 	if (free_from_encodings) {
2899 		efree(ZEND_VOIDP(from_encodings));
2900 	}
2901 }
2902 /* }}} */
2903 
mbstring_convert_case(php_case_mode case_mode,const char * str,size_t str_len,const mbfl_encoding * enc)2904 static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
2905 {
2906 	return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2907 }
2908 
PHP_FUNCTION(mb_convert_case)2909 PHP_FUNCTION(mb_convert_case)
2910 {
2911 	zend_string *str, *from_encoding = NULL;
2912 	zend_long case_mode = 0;
2913 
2914 	ZEND_PARSE_PARAMETERS_START(2, 3)
2915 		Z_PARAM_STR(str)
2916 		Z_PARAM_LONG(case_mode)
2917 		Z_PARAM_OPTIONAL
2918 		Z_PARAM_STR_OR_NULL(from_encoding)
2919 	ZEND_PARSE_PARAMETERS_END();
2920 
2921 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2922 	if (!enc) {
2923 		RETURN_THROWS();
2924 	}
2925 
2926 	if (case_mode < 0 || case_mode >= PHP_UNICODE_CASE_MODE_MAX) {
2927 		zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2928 		RETURN_THROWS();
2929 	}
2930 
2931 	RETURN_STR(mbstring_convert_case(case_mode, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2932 }
2933 
PHP_FUNCTION(mb_strtoupper)2934 PHP_FUNCTION(mb_strtoupper)
2935 {
2936 	zend_string *str, *from_encoding = NULL;
2937 
2938 	ZEND_PARSE_PARAMETERS_START(1, 2)
2939 		Z_PARAM_STR(str)
2940 		Z_PARAM_OPTIONAL
2941 		Z_PARAM_STR_OR_NULL(from_encoding)
2942 	ZEND_PARSE_PARAMETERS_END();
2943 
2944 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2945 	if (!enc) {
2946 		RETURN_THROWS();
2947 	}
2948 
2949 	RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_UPPER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2950 }
2951 
PHP_FUNCTION(mb_strtolower)2952 PHP_FUNCTION(mb_strtolower)
2953 {
2954 	zend_string *str, *from_encoding = NULL;
2955 
2956 	ZEND_PARSE_PARAMETERS_START(1, 2)
2957 		Z_PARAM_STR(str)
2958 		Z_PARAM_OPTIONAL
2959 		Z_PARAM_STR_OR_NULL(from_encoding)
2960 	ZEND_PARSE_PARAMETERS_END();
2961 
2962 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2963 	if (!enc) {
2964 		RETURN_THROWS();
2965 	}
2966 
2967 	RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2968 }
2969 
/* Shared implementation of mb_ucfirst() and mb_lcfirst(): apply case conversion `mode`
 * to the first character of the string only. If converting the first character does
 * not change it, the input string itself is returned (with its refcount incremented). */
static void php_mb_ulcfirst(INTERNAL_FUNCTION_PARAMETERS, php_case_mode mode)
{
	zend_string *input, *enc_name = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 2)
		Z_PARAM_STR(input)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR_OR_NULL(enc_name)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	zend_string *first_char = mb_get_substr(input, 0, 1, enc);
	zend_string *converted = mbstring_convert_case(mode, ZSTR_VAL(first_char), ZSTR_LEN(first_char), enc);
	zend_string *result;

	if (zend_string_equals(first_char, converted)) {
		/* Nothing changed; no need to build a new string */
		result = zend_string_copy(input);
	} else {
		/* Glue the converted first character onto the untouched remainder */
		zend_string *rest = mb_get_substr(input, 1, MBFL_SUBSTR_UNTIL_END, enc);
		result = zend_string_concat2(ZSTR_VAL(converted), ZSTR_LEN(converted), ZSTR_VAL(rest), ZSTR_LEN(rest));
		zend_string_release_ex(rest, false);
	}

	zend_string_release_ex(first_char, false);
	zend_string_release_ex(converted, false);

	RETVAL_STR(result);
}
3003 
PHP_FUNCTION(mb_ucfirst)3004 PHP_FUNCTION(mb_ucfirst)
3005 {
3006 	php_mb_ulcfirst(INTERNAL_FUNCTION_PARAM_PASSTHRU, PHP_UNICODE_CASE_TITLE);
3007 }
3008 
PHP_FUNCTION(mb_lcfirst)3009 PHP_FUNCTION(mb_lcfirst)
3010 {
3011 	php_mb_ulcfirst(INTERNAL_FUNCTION_PARAM_PASSTHRU, PHP_UNICODE_CASE_LOWER);
3012 }
3013 
/* Which end(s) of the string to trim. The values are bit flags, so that code can test
 * `mode & MB_LTRIM` / `mode & MB_RTRIM` and clear a flag once it no longer applies. */
typedef enum {
	MB_LTRIM     = 1 << 0,
	MB_RTRIM     = 1 << 1,
	MB_BOTH_TRIM = MB_LTRIM | MB_RTRIM
} mb_trim_mode;
3019 
/* Is codepoint `w` one of the characters to be trimmed?
 * If `ht` is not NULL, the answer is whether `w` exists as an integer key in `ht`;
 * otherwise the `default_chars` array (of `default_chars_length` entries) is searched
 * linearly. */
static bool is_trim_wchar(uint32_t w, const HashTable *ht, const uint32_t *default_chars, size_t default_chars_length)
{
	if (ht != NULL) {
		return zend_hash_index_exists(ht, w);
	}

	size_t idx = 0;
	while (idx < default_chars_length) {
		if (default_chars[idx] == w) {
			return true;
		}
		idx++;
	}
	return false;
}
3033 
/* Strip trim characters from the start and/or end of `str` (as selected by `mode`) and
 * return the result.
 *
 * The set of trim characters is given either by `what_ht` (codepoints as integer keys)
 * or, if `what_ht` is NULL, by the `default_chars` array. The string is decoded to
 * codepoints in chunks; `left` counts the leading run of trim characters, `right` the
 * trailing run, and `total_len` the number of codepoints seen.
 *
 * If nothing needs to be trimmed, `str` itself is returned with its refcount
 * incremented. */
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, const uint32_t *default_chars, size_t default_chars_length, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
	uint32_t wchar_buf[128];
	size_t in_len = ZSTR_LEN(str);
	size_t out_len = 0;
	unsigned int state = 0;
	size_t left = 0;
	size_t right = 0;
	size_t total_len = 0;

	while (in_len) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);
		total_len += out_len;

		for (size_t i = 0; i < out_len; i++) {
			uint32_t w = wchar_buf[i];
			if (is_trim_wchar(w, what_ht, default_chars, default_chars_length)) {
				if (mode & MB_LTRIM) {
					left += 1;
				}
				if (mode & MB_RTRIM) {
					right += 1;
				}
			} else {
				/* First character which is kept: the leading run is over, and
				 * any trailing run counted so far was not at the end after all */
				mode &= ~MB_LTRIM;
				if (mode & MB_RTRIM) {
					right = 0;
				}
			}
		}
	}

	if (left == 0 && right == 0) {
		return zend_string_copy(str);
	}

	/* When the entire string consists of trim characters and we trim on both sides,
	 * every character is counted in both `left` and `right`, so their sum exceeds
	 * `total_len`. Clamp instead of letting the unsigned subtraction wrap around
	 * to a huge bogus length */
	size_t trimmed = left + right;
	size_t keep = (trimmed < total_len) ? total_len - trimmed : 0;
	return mb_get_substr(str, left, keep, enc);
}
3073 
/* Trim `str` using the built-in list of trim characters (used when the caller does
 * not supply a character list). The list is loaded into a temporary hash table which
 * is destroyed again before returning. */
static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
{
	/* Codepoints which are trimmed by default */
	static const uint32_t defaults[] = {
		0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
		0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
		0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
		0x85, 0x180E
	};
	const size_t num_defaults = sizeof(defaults) / sizeof(defaults[0]);

	HashTable lookup;
	zval present; /* Dummy value; only the keys of `lookup` matter */
	ZVAL_TRUE(&present);

	zend_hash_init(&lookup, num_defaults, NULL, NULL, false);

	size_t idx = 0;
	while (idx < num_defaults) {
		zend_hash_index_add_new(&lookup, defaults[idx], &present);
		idx++;
	}

	zend_string *trimmed = trim_each_wchar(str, &lookup, NULL, 0, mode, enc);
	zend_hash_destroy(&lookup);

	return trimmed;
}
3098 
/* Trim `str` using the characters of `what` (decoded with `enc`) as the set of trim
 * characters.
 *
 * If the first decoded chunk of `what` holds at most 4 codepoints, those codepoints are
 * passed directly as an array and searched linearly. Otherwise, all codepoints of
 * `what` are collected as keys of a temporary hash table.
 * An empty `what` trims nothing; `str` is returned with its refcount incremented. */
static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *src = (unsigned char*)ZSTR_VAL(what);
	size_t src_len = ZSTR_LEN(what);
	uint32_t codepoints[128];
	unsigned int state = 0;
	HashTable lookup;
	zval present; /* Dummy value; only the keys of `lookup` matter */
	bool have_lookup = false;

	while (src_len) {
		size_t num_codepoints = enc->to_wchar(&src, &src_len, codepoints, 128, &state);
		ZEND_ASSERT(num_codepoints <= 128);

		if (!have_lookup) {
			if (num_codepoints <= 4) {
				/* Few enough characters that a hash table isn't needed */
				return trim_each_wchar(str, NULL, codepoints, num_codepoints, mode, enc);
			}
			have_lookup = true;
			ZVAL_TRUE(&present);
			/* The number of input bytes still to be decoded is used as size hint */
			zend_hash_init(&lookup, src_len, NULL, NULL, false);
		}

		for (size_t i = 0; i < num_codepoints; i++) {
			zend_hash_index_add(&lookup, codepoints[i], &present);
		}
	}

	if (UNEXPECTED(!have_lookup)) {
		/* This is only possible if what is empty */
		return zend_string_copy(str);
	}

	zend_string *trimmed = trim_each_wchar(str, &lookup, NULL, 0, mode, enc);
	zend_hash_destroy(&lookup);

	return trimmed;
}
3138 
/* Shared implementation of mb_trim(), mb_ltrim() and mb_rtrim(); `mode` selects which
 * end(s) of the string get trimmed. Arguments: string, optional list of characters to
 * trim (NULL or omitted: use the default list), optional encoding name. */
static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
{
	zend_string *input;
	zend_string *chars = NULL;
	zend_string *enc_name = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 3)
		Z_PARAM_STR(input)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR_OR_NULL(chars)
		Z_PARAM_STR_OR_NULL(enc_name)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	zend_string *trimmed;
	if (chars == NULL) {
		trimmed = mb_trim_default_chars(input, mode, enc);
	} else {
		trimmed = mb_trim_what_chars(input, chars, mode, enc);
	}
	RETURN_STR(trimmed);
}
3163 
PHP_FUNCTION(mb_trim)3164 PHP_FUNCTION(mb_trim)
3165 {
3166 	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_BOTH_TRIM);
3167 }
3168 
PHP_FUNCTION(mb_ltrim)3169 PHP_FUNCTION(mb_ltrim)
3170 {
3171 	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_LTRIM);
3172 }
3173 
PHP_FUNCTION(mb_rtrim)3174 PHP_FUNCTION(mb_rtrim)
3175 {
3176 	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
3177 }
3178 
duplicate_elist(const mbfl_encoding ** elist,size_t size)3179 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
3180 {
3181 	const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
3182 	memcpy(ZEND_VOIDP(new_elist), elist, size * sizeof(mbfl_encoding*));
3183 	return new_elist;
3184 }
3185 
estimate_demerits(uint32_t w)3186 static unsigned int estimate_demerits(uint32_t w)
3187 {
3188 	/* Receive wchars decoded from input string using candidate encoding.
3189 	 * Give the candidate many 'demerits' for each 'rare' codepoint found,
3190 	 * a smaller number for each ASCII punctuation character, and 1 for
3191 	 * all other codepoints.
3192 	 *
3193 	 * The 'common' codepoints should cover the vast majority of
3194 	 * codepoints we are likely to see in practice, while only covering
3195 	 * a small minority of the entire Unicode encoding space. Why?
3196 	 * Well, if the test string happens to be valid in an incorrect
3197 	 * candidate encoding, the bogus codepoints which it decodes to will
3198 	 * be more or less random. By treating the majority of codepoints as
3199 	 * 'rare', we ensure that in almost all such cases, the bogus
3200 	 * codepoints will include plenty of 'rares', thus giving the
3201 	 * incorrect candidate encoding lots of demerits. See
3202 	 * common_codepoints.txt for the actual list used.
3203 	 *
3204 	 * So, why give extra demerits for ASCII punctuation characters? It's
3205 	 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
3206 	 * which deliberately only use bytes in the ASCII range. When
3207 	 * misinterpreted as ASCII/UTF-8, strings in these encodings will
3208 	 * have an unusually high number of ASCII punctuation characters.
3209 	 * So giving extra demerits for such characters will improve
3210 	 * detection accuracy for UTF-7 and similar encodings.
3211 	 *
3212 	 * Finally, why 1 demerit for all other characters? That penalizes
3213 	 * long strings, meaning we will tend to choose a candidate encoding
3214 	 * in which the test string decodes to a smaller number of
3215 	 * codepoints. That prevents single-byte encodings in which almost
3216 	 * every possible input byte decodes to a 'common' codepoint from
3217 	 * being favored too much. */
3218 	if (w > 0xFFFF) {
3219 		return 40;
3220 	} else if (w >= 0x21 && w <= 0x2F) {
3221 		return 6;
3222 	} else if ((rare_codepoint_bitvec[w >> 5] >> (w & 0x1F)) & 1) {
3223 		return 30;
3224 	} else {
3225 		return 1;
3226 	}
3227 	return 0;
3228 }
3229 
3230 struct candidate {
3231 	const mbfl_encoding *enc;
3232 	const unsigned char *in;
3233 	size_t in_len;
3234 	uint64_t demerits; /* Wide bit size to prevent overflow */
3235 	unsigned int state;
3236 	float multiplier;
3237 };
3238 
init_candidate_array(struct candidate * array,size_t length,const mbfl_encoding ** encodings,const unsigned char ** in,size_t * in_len,size_t n,bool strict,bool order_significant)3239 static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant)
3240 {
3241 	size_t j = 0;
3242 
3243 	for (size_t i = 0; i < length; i++) {
3244 		const mbfl_encoding *enc = encodings[i];
3245 
3246 		array[j].enc = enc;
3247 		array[j].state = 0;
3248 		array[j].demerits = 0;
3249 
3250 		/* If any candidate encodings have specialized validation functions, use them
3251 		 * to eliminate as many candidates as possible */
3252 		if (enc->check != NULL) {
3253 			for (size_t k = 0; k < n; k++) {
3254 				if (!enc->check((unsigned char*)in[k], in_len[k])) {
3255 					if (strict) {
3256 						goto skip_to_next;
3257 					} else {
3258 						array[j].demerits += 500;
3259 					}
3260 				}
3261 			}
3262 		}
3263 
3264 		/* This multiplier can optionally be used to make candidate encodings listed
3265 		 * first more likely to be chosen. It is a weight factor which multiplies
3266 		 * the number of demerits counted for each candidate. */
3267 		array[j].multiplier = order_significant ? 1.0 + ((0.3 * i) / length) : 1.0;
3268 		j++;
3269 skip_to_next: ;
3270 	}
3271 
3272 	return j;
3273 }
3274 
start_string(struct candidate * array,size_t length,const unsigned char * in,size_t in_len)3275 static void start_string(struct candidate *array, size_t length, const unsigned char *in, size_t in_len)
3276 {
3277 	for (size_t i = 0; i < length; i++) {
3278 		const mbfl_encoding *enc = array[i].enc;
3279 
3280 		array[i].in = in;
3281 		array[i].in_len = in_len;
3282 
3283 		/* Skip byte order mark for UTF-8, UTF-16BE, or UTF-16LE */
3284 		if (enc == &mbfl_encoding_utf8) {
3285 			if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
3286 				array[i].in_len -= 3;
3287 				array[i].in += 3;
3288 			}
3289 		} else if (enc == &mbfl_encoding_utf16be) {
3290 			if (in_len >= 2 && in[0] == 0xFE && in[1] == 0xFF) {
3291 				array[i].in_len -= 2;
3292 				array[i].in += 2;
3293 			}
3294 		} else if (enc == &mbfl_encoding_utf16le) {
3295 			if (in_len >= 2 && in[0] == 0xFF && in[1] == 0xFE) {
3296 				array[i].in_len -= 2;
3297 				array[i].in += 2;
3298 			}
3299 		}
3300 	}
3301 }
3302 
count_demerits(struct candidate * array,size_t length,bool strict)3303 static size_t count_demerits(struct candidate *array, size_t length, bool strict)
3304 {
3305 	uint32_t wchar_buf[128];
3306 	unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */
3307 
3308 	for (size_t i = 0; i < length; i++) {
3309 		if (array[i].in_len == 0) {
3310 			finished++;
3311 		}
3312 	}
3313 
3314 	while ((strict || length > 1) && finished < length) {
3315 		/* Iterate in reverse order to avoid moving candidates that can be eliminated. */
3316 		for (size_t i = length - 1; i != (size_t)-1; i--) {
3317 			/* Do we still have more input to process for this candidate encoding? */
3318 			if (array[i].in_len) {
3319 				const mbfl_encoding *enc = array[i].enc;
3320 				size_t out_len = enc->to_wchar((unsigned char**)&array[i].in, &array[i].in_len, wchar_buf, 128, &array[i].state);
3321 				ZEND_ASSERT(out_len <= 128);
3322 				/* Check this batch of decoded codepoints; are there any error markers?
3323 				 * Also sum up the number of demerits */
3324 				while (out_len) {
3325 					uint32_t w = wchar_buf[--out_len];
3326 					if (w == MBFL_BAD_INPUT) {
3327 						if (strict) {
3328 							/* This candidate encoding is not valid, eliminate it from consideration */
3329 							length--;
3330 							if (i < length) {
3331 								/* The eliminated candidate was the last valid one in the list */
3332 								memmove(&array[i], &array[i+1], (length - i) * sizeof(struct candidate));
3333 							}
3334 							goto try_next_encoding;
3335 						} else {
3336 							array[i].demerits += 1000;
3337 						}
3338 					} else {
3339 						array[i].demerits += estimate_demerits(w);
3340 					}
3341 				}
3342 				if (array[i].in_len == 0) {
3343 					finished++;
3344 				}
3345 			}
3346 try_next_encoding:;
3347 		}
3348 	}
3349 
3350 	for (size_t i = 0; i < length; i++) {
3351 		array[i].demerits *= array[i].multiplier;
3352 	}
3353 
3354 	return length;
3355 }
3356 
mb_guess_encoding_for_strings(const unsigned char ** strings,size_t * str_lengths,size_t n,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3357 MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3358 {
3359 	if (elist_size == 0) {
3360 		return NULL;
3361 	}
3362 	if (elist_size == 1) {
3363 		if (strict) {
3364 			while (n--) {
3365 				if (!php_mb_check_encoding((const char*)strings[n], str_lengths[n], *elist)) {
3366 					return NULL;
3367 				}
3368 			}
3369 		}
3370 		return *elist;
3371 	}
3372 	if (n == 1 && *str_lengths == 0) {
3373 		return *elist;
3374 	}
3375 
3376 	/* Allocate on stack; when we return, this array is automatically freed */
3377 	struct candidate *array = alloca(elist_size * sizeof(struct candidate));
3378 	elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant);
3379 
3380 	while (n--) {
3381 		start_string(array, elist_size, strings[n], str_lengths[n]);
3382 		elist_size = count_demerits(array, elist_size, strict);
3383 		if (elist_size == 0) {
3384 			/* All candidates were eliminated */
3385 			return NULL;
3386 		}
3387 	}
3388 
3389 	/* See which remaining candidate encoding has the least demerits */
3390 	unsigned int best = 0;
3391 	for (unsigned int i = 1; i < elist_size; i++) {
3392 		if (array[i].demerits < array[best].demerits) {
3393 			best = i;
3394 		}
3395 	}
3396 	return array[best].enc;
3397 }
3398 
3399 /* When doing 'strict' detection, any string which is invalid in the candidate encoding
3400  * is rejected. With non-strict detection, we just continue, but apply demerits for
3401  * each invalid byte sequence */
mb_guess_encoding(unsigned char * in,size_t in_len,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3402 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3403 {
3404 	return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant);
3405 }
3406 
3407 /* {{{ Encodings of the given string is returned (as a string) */
PHP_FUNCTION(mb_detect_encoding)3408 PHP_FUNCTION(mb_detect_encoding)
3409 {
3410 	zend_string *str, *encoding_str = NULL;
3411 	HashTable *encoding_ht = NULL;
3412 	bool strict = false;
3413 	const mbfl_encoding *ret, **elist;
3414 	size_t size;
3415 
3416 	ZEND_PARSE_PARAMETERS_START(1, 3)
3417 		Z_PARAM_STR(str)
3418 		Z_PARAM_OPTIONAL
3419 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
3420 		Z_PARAM_BOOL(strict)
3421 	ZEND_PARSE_PARAMETERS_END();
3422 
3423 	/* Should we pay attention to the order of the provided candidate encodings and prefer
3424 	 * the earlier ones (if more than one candidate encoding matches)?
3425 	 * If the entire list of supported encodings returned by `mb_list_encodings` is passed
3426 	 * in, then don't treat the order as significant */
3427 	bool order_significant = true;
3428 
3429 	/* make encoding list */
3430 	if (encoding_ht) {
3431 		if (encoding_ht == MBSTRG(all_encodings_list)) {
3432 			order_significant = false;
3433 		}
3434 		if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
3435 			RETURN_THROWS();
3436 		}
3437 	} else if (encoding_str) {
3438 		if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2)) {
3439 			RETURN_THROWS();
3440 		}
3441 	} else {
3442 		elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
3443 		size = MBSTRG(current_detect_order_list_size);
3444 	}
3445 
3446 	if (size == 0) {
3447 		efree(ZEND_VOIDP(elist));
3448 		zend_argument_value_error(2, "must specify at least one encoding");
3449 		RETURN_THROWS();
3450 	}
3451 
3452 	remove_non_encodings_from_elist(elist, &size);
3453 	if (size == 0) {
3454 		efree(ZEND_VOIDP(elist));
3455 		RETURN_FALSE;
3456 	}
3457 
3458 	if (ZEND_NUM_ARGS() < 3) {
3459 		strict = MBSTRG(strict_detection);
3460 	}
3461 
3462 	if (size == 1 && *elist == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
3463 		ret = &mbfl_encoding_utf8;
3464 	} else {
3465 		ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant);
3466 	}
3467 
3468 	efree(ZEND_VOIDP(elist));
3469 
3470 	if (ret == NULL) {
3471 		RETURN_FALSE;
3472 	}
3473 
3474 	RETVAL_STRING((char *)ret->name);
3475 }
3476 /* }}} */
3477 
3478 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)3479 PHP_FUNCTION(mb_list_encodings)
3480 {
3481 	ZEND_PARSE_PARAMETERS_NONE();
3482 
3483 	if (MBSTRG(all_encodings_list) == NULL) {
3484 		/* Initialize shared array of supported encoding names
3485 		 * This is done so that we can check if `mb_list_encodings()` is being
3486 		 * passed to other mbstring functions using a cheap pointer equality check */
3487 		HashTable *array = emalloc(sizeof(HashTable));
3488 		zend_hash_init(array, 80, NULL, zval_ptr_dtor_str, false);
3489 		for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
3490 			zval tmp;
3491 			ZVAL_STRING(&tmp, (*encodings)->name);
3492 			zend_hash_next_index_insert(array, &tmp);
3493 		}
3494 		MBSTRG(all_encodings_list) = array;
3495 	}
3496 
3497 	GC_ADDREF(MBSTRG(all_encodings_list));
3498 	RETURN_ARR(MBSTRG(all_encodings_list));
3499 }
3500 /* }}} */
3501 
3502 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)3503 PHP_FUNCTION(mb_encoding_aliases)
3504 {
3505 	const mbfl_encoding *encoding;
3506 	zend_string *encoding_name = NULL;
3507 
3508 	ZEND_PARSE_PARAMETERS_START(1, 1)
3509 		Z_PARAM_STR(encoding_name)
3510 	ZEND_PARSE_PARAMETERS_END();
3511 
3512 	encoding = php_mb_get_encoding(encoding_name, 1);
3513 	if (!encoding) {
3514 		RETURN_THROWS();
3515 	}
3516 
3517 	array_init(return_value);
3518 	if (encoding->aliases != NULL) {
3519 		for (const char **alias = encoding->aliases; *alias; ++alias) {
3520 			add_next_index_string(return_value, (char *)*alias);
3521 		}
3522 	}
3523 }
3524 /* }}} */
3525 
/* Apply kana/width conversion (selected by the `mode` bit vector) to `input`,
 * which is encoded in `encoding`, and return a new string in the same encoding.
 *
 * The input is decoded to wchars in batches of up to 64, each wchar is passed to
 * mb_convert_kana_codepoint() together with the wchar following it, and the
 * converted wchars are encoded back into the output buffer. */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion...
	 * if we are converting zenkaku kana to hankaku kana
	 * Make the buffer for converted kana big enough that we never need to
	 * perform bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	unsigned int buf_offset = 0;
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		uint32_t *converted = converted_buf;
		/* If one codepoint has been left in wchar_buf[0] to be reprocessed from the
		 * previous iteration, don't overwrite it */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + buf_offset, 64 - buf_offset, &state);
		out_len += buf_offset;
		ZEND_ASSERT(out_len <= 64);

		if (!out_len) {
			if (!in_len) {
				/* The rest of the input decoded to no codepoints at all, so the
				 * code below will not run again. Still tell the output converter
				 * that the input has ended, so that it can emit whatever it needs
				 * to finish the string; otherwise the `end` flag would never be
				 * passed to from_wchar */
				encoding->from_wchar(converted_buf, 0, &buf, true);
			}
			continue;
		}

		/* Convert every codepoint except the last one; the last one is needed as
		 * the 'next' codepoint for its predecessor */
		for (size_t i = 0; i < out_len-1; i++) {
			uint32_t second = 0;
			bool consumed = false;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
			if (second) {
				*converted++ = second;
			}
			if (consumed) {
				/* The following codepoint was consumed as well; skip over it */
				i++;
				if (i == out_len-1) {
					/* We consumed two codepoints at the very end of the wchar buffer
					 * So there is nothing remaining to reprocess on the next iteration */
					buf_offset = 0;
					goto emit_converted_kana;
				}
			}
		}

		if (!in_len) {
			/* This is the last iteration, so we need to process the final codepoint now */
			uint32_t second = 0;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[out_len-1], 0, NULL, &second, mode);
			if (second) {
				*converted++ = second;
			}
		} else {
			/* Reprocess the last codepoint on the next iteration */
			wchar_buf[0] = wchar_buf[out_len-1];
			buf_offset = 1;
		}

emit_converted_kana:
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3590 
/* Flag characters accepted in the option string of mb_convert_kana().
 * The flag stored at index `i` selects bit `1 << i` of the option bit vector. */
char mb_convert_kana_flags[17] = {
	[0]  = 'A',
	[1]  = 'R',
	[2]  = 'N',
	[3]  = 'S',
	[4]  = 'K',
	[5]  = 'H',
	[6]  = 'M',
	[7]  = 'C',
	[8]  = 'a',
	[9]  = 'r',
	[10] = 'n',
	[11] = 's',
	[12] = 'k',
	[13] = 'h',
	[14] = 'm',
	[15] = 'c',
	[16] = 'V'
};
3596 
3597 /* Conversion between full-width characters and half-width characters (Japanese) */
PHP_FUNCTION(mb_convert_kana)3598 PHP_FUNCTION(mb_convert_kana)
3599 {
3600 	unsigned int opt;
3601 	char *optstr = NULL;
3602 	size_t optstr_len;
3603 	zend_string *encname = NULL, *str;
3604 
3605 	ZEND_PARSE_PARAMETERS_START(1, 3)
3606 		Z_PARAM_STR(str)
3607 		Z_PARAM_OPTIONAL
3608 		Z_PARAM_STRING(optstr, optstr_len)
3609 		Z_PARAM_STR_OR_NULL(encname)
3610 	ZEND_PARSE_PARAMETERS_END();
3611 
3612 	if (optstr != NULL) {
3613 		char *p = optstr, *e = p + optstr_len;
3614 		opt = 0;
3615 next_option:
3616 		while (p < e) {
3617 			/* Walk through option string and convert to bit vector
3618 			 * See translit_kana_jisx0201_jisx0208.h for the values used */
3619 			char c = *p++;
3620 			if (c == 'A') {
3621 				opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
3622 			} else if (c == 'a') {
3623 				opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
3624 			} else {
3625 				for (size_t i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
3626 					if (c == mb_convert_kana_flags[i]) {
3627 						opt |= (1 << i);
3628 						goto next_option;
3629 					}
3630 				}
3631 
3632 				zend_argument_value_error(2, "contains invalid flag: '%c'", c);
3633 				RETURN_THROWS();
3634 			}
3635 		}
3636 
3637 		/* Check for illegal combinations of options */
3638 		if (((opt & 0xFF00) >> 8) & opt) {
3639 			/* It doesn't make sense to convert the same type of characters from halfwidth to
3640 			 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
3641 			 * FW hiragana to FW katakana and then back again. */
3642 			int badflag = ((opt & 0xFF00) >> 8) & opt, i;
3643 			for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
3644 			char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
3645 			if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
3646 				flag1 = 'A';
3647 			if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
3648 				flag2 = 'a';
3649 			zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
3650 			RETURN_THROWS();
3651 		}
3652 
3653 		if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
3654 			/* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
3655 			zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
3656 			RETURN_THROWS();
3657 		}
3658 
3659 		/* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
3660 		 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
3661 		 * more than one of these */
3662 		if (opt & MBFL_ZEN2HAN_HIRAGANA) {
3663 			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3664 				zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
3665 				RETURN_THROWS();
3666 			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3667 				zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
3668 				RETURN_THROWS();
3669 			}
3670 		} else if (opt & MBFL_ZEN2HAN_KATAKANA) {
3671 			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3672 				zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
3673 				RETURN_THROWS();
3674 			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3675 				zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
3676 				RETURN_THROWS();
3677 			}
3678 		}
3679 	} else {
3680 		opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
3681 	}
3682 
3683 	const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
3684 	if (!enc) {
3685 		RETURN_THROWS();
3686 	}
3687 
3688 	RETVAL_STR(jp_kana_convert(str, enc, opt));
3689 }
3690 
/* Count the string values reachable from `var`, descending into arrays and
 * objects. A container which is already marked as being visited contributes
 * nothing, so that recursive structures terminate. */
static unsigned int mb_recursive_count_strings(zval *var)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		return 1;
	}
	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Other types are not counted */
		return 0;
	}

	/* Mark the container as being visited while we walk its elements */
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	unsigned int total = 0;
	HashTable *elements = HASH_OF(var);
	if (elements != NULL) {
		zval *item;
		ZEND_HASH_FOREACH_VAL_IND(elements, item) {
			total += mb_recursive_count_strings(item);
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return total;
}
3721 
/* Collect pointer and length of every string value reachable from `var`,
 * descending into arrays and objects.
 *
 * Each string found is stored at index `*count` of `val_list` / `len_list`, and
 * `*count` is then incremented; the caller must provide arrays which are large
 * enough (see mb_recursive_count_strings).
 *
 * Returns true if a container was reached which is already being visited (i.e. a
 * recursive structure), false otherwise. */
static bool mb_recursive_find_strings(zval *var, const unsigned char **val_list, size_t *len_list, unsigned int *count)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		val_list[*count] = (const unsigned char*)Z_STRVAL_P(var);
		len_list[*count] = Z_STRLEN_P(var);
		(*count)++;
	} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
		if (Z_REFCOUNTED_P(var)) {
			if (Z_IS_RECURSIVE_P(var)) {
				return true;
			}
			Z_PROTECT_RECURSION_P(var);
		}

		HashTable *ht = HASH_OF(var);
		if (ht != NULL) {
			zval *entry;
			ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
				if (mb_recursive_find_strings(entry, val_list, len_list, count)) {
					/* Only the recursion guard depends on `var` being refcounted;
					 * the failure itself must always be reported to the caller,
					 * just as mb_recursive_convert_variable does */
					if (Z_REFCOUNTED_P(var)) {
						Z_UNPROTECT_RECURSION_P(var);
					}
					return true;
				}
			} ZEND_HASH_FOREACH_END();
		}

		if (Z_REFCOUNTED_P(var)) {
			Z_UNPROTECT_RECURSION_P(var);
		}
	}

	return false;
}
3758 
/* Convert, in place, every string reachable from `var` from `from_encoding` to
 * `to_encoding`, descending into arrays and objects.
 *
 * Returns true if a container was reached which is already being visited (i.e. a
 * recursive structure), false otherwise. */
static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
{
	/* Remember the slot we were given; `var` itself is dereferenced below */
	zval *slot = var;
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		zend_string *converted = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
		/* Replace the contents of the original slot with the converted string */
		zval_ptr_dtor(slot);
		ZVAL_STR(slot, converted);
		return false;
	}

	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Nothing to convert for other types */
		return false;
	}

	if (Z_TYPE_P(var) == IS_ARRAY) {
		SEPARATE_ARRAY(var);
	}
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return true;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	bool recursion_found = false;
	HashTable *elements = HASH_OF(var);
	if (elements != NULL) {
		zval *item;
		ZEND_HASH_FOREACH_VAL(elements, item) {
			/* Can be a typed property declaration, in which case we need to remove the reference from the source list.
			 * Just using ZEND_TRY_ASSIGN_STRINGL is not sufficient because that would not unwrap the reference
			 * and change values through references (see bug #26639). */
			if (Z_TYPE_P(item) == IS_INDIRECT) {
				ZEND_ASSERT(Z_TYPE_P(var) == IS_OBJECT);

				item = Z_INDIRECT_P(item);
				if (Z_ISREF_P(item) && Z_TYPE_P(Z_REFVAL_P(item)) == IS_STRING) {
					zend_property_info *prop_info = zend_get_typed_property_info_for_slot(Z_OBJ_P(var), item);
					if (prop_info) {
						ZEND_REF_DEL_TYPE_SOURCE(Z_REF_P(item), prop_info);
					}
				}
			}

			if (mb_recursive_convert_variable(item, from_encoding, to_encoding)) {
				/* Stop at the first failure; the guard is released below */
				recursion_found = true;
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return recursion_found;
}
3815 
PHP_FUNCTION(mb_convert_variables)3816 PHP_FUNCTION(mb_convert_variables)
3817 {
3818 	zval *args;
3819 	zend_string *to_enc_str;
3820 	zend_string *from_enc_str;
3821 	HashTable *from_enc_ht;
3822 	const mbfl_encoding *from_encoding, *to_encoding;
3823 	uint32_t argc;
3824 	size_t elistsz;
3825 	const mbfl_encoding **elist;
3826 
3827 	ZEND_PARSE_PARAMETERS_START(3, -1)
3828 		Z_PARAM_STR(to_enc_str)
3829 		Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3830 		Z_PARAM_VARIADIC('+', args, argc)
3831 	ZEND_PARSE_PARAMETERS_END();
3832 
3833 	/* new encoding */
3834 	to_encoding = php_mb_get_encoding(to_enc_str, 1);
3835 	if (!to_encoding) {
3836 		RETURN_THROWS();
3837 	}
3838 
3839 	from_encoding = MBSTRG(current_internal_encoding);
3840 
3841 	bool order_significant = true;
3842 
3843 	/* pre-conversion encoding */
3844 	if (from_enc_ht) {
3845 		if (from_enc_ht == MBSTRG(all_encodings_list)) {
3846 			/* If entire list of supported encodings returned by `mb_list_encodings` is passed
3847 			 * in, then don't treat the order of the list as significant */
3848 			order_significant = false;
3849 		}
3850 		if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3851 			RETURN_THROWS();
3852 		}
3853 	} else {
3854 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2) == FAILURE) {
3855 			RETURN_THROWS();
3856 		}
3857 	}
3858 
3859 	if (elistsz == 0) {
3860 		efree(ZEND_VOIDP(elist));
3861 		zend_argument_value_error(2, "must specify at least one encoding");
3862 		RETURN_THROWS();
3863 	}
3864 
3865 	if (elistsz == 1) {
3866 		from_encoding = *elist;
3867 	} else {
3868 		/* auto detect */
3869 		unsigned int num = 0;
3870 		for (size_t n = 0; n < argc; n++) {
3871 			zval *zv = &args[n];
3872 			num += mb_recursive_count_strings(zv);
3873 		}
3874 		const unsigned char **val_list = (const unsigned char**)ecalloc(num, sizeof(char *));
3875 		size_t *len_list = (size_t*)ecalloc(num, sizeof(size_t));
3876 		unsigned int i = 0;
3877 		for (size_t n = 0; n < argc; n++) {
3878 			zval *zv = &args[n];
3879 			if (mb_recursive_find_strings(zv, val_list, len_list, &i)) {
3880 				efree(ZEND_VOIDP(elist));
3881 				efree(ZEND_VOIDP(val_list));
3882 				efree(len_list);
3883 				php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3884 				RETURN_FALSE;
3885 			}
3886 		}
3887 		from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant);
3888 		efree(ZEND_VOIDP(val_list));
3889 		efree(len_list);
3890 		if (!from_encoding) {
3891 			php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3892 			efree(ZEND_VOIDP(elist));
3893 			RETURN_FALSE;
3894 		}
3895 
3896 	}
3897 
3898 	efree(ZEND_VOIDP(elist));
3899 
3900 	/* convert */
3901 	for (size_t n = 0; n < argc; n++) {
3902 		zval *zv = &args[n];
3903 		ZVAL_DEREF(zv);
3904 		if (mb_recursive_convert_variable(zv, from_encoding, to_encoding)) {
3905 			php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3906 			RETURN_FALSE;
3907 		}
3908 	}
3909 
3910 	RETURN_STRING(from_encoding->name);
3911 }
3912 
3913 /* HTML numeric entities */
3914 
3915 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
make_conversion_map(HashTable * target_hash,size_t * conversion_map_size)3916 static uint32_t *make_conversion_map(HashTable *target_hash, size_t *conversion_map_size)
3917 {
3918 	zval *hash_entry;
3919 
3920 	size_t n_elems = *conversion_map_size = zend_hash_num_elements(target_hash);
3921 	if (n_elems % 4 != 0) {
3922 		zend_argument_value_error(2, "must have a multiple of 4 elements");
3923 		return NULL;
3924 	}
3925 
3926 	uint32_t *convmap = (uint32_t*)safe_emalloc(n_elems, sizeof(uint32_t), 0);
3927 	uint32_t *mapelm = convmap;
3928 
3929 	ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
3930 		bool failed = true;
3931 		zend_long tmp = zval_try_get_long(hash_entry, &failed);
3932 		if (failed) {
3933 			efree(convmap);
3934 			zend_argument_value_error(2, "must only be composed of values of type int");
3935 			return NULL;
3936 		}
3937 		*mapelm++ = tmp;
3938 	} ZEND_HASH_FOREACH_END();
3939 
3940 	return convmap;
3941 }
3942 
/* Look up codepoint `w` in a conversion map.
 *
 * `convmap` holds `conversion_map_size` values, read in groups of four:
 * { low codepoint, high codepoint, offset, mask }. For the first group whose
 * inclusive range contains `w`, `(w + offset) & mask` is stored in `*retval` and
 * true is returned. If no group matches, `*retval` is left untouched and false
 * is returned. */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t i = 0; i < conversion_map_size; i += 4) {
		const uint32_t *range = &convmap[i];

		if (w < range[0] || w > range[1]) {
			/* Outside of this range; try the next one */
			continue;
		}

		/* This wchar falls inside one of the ranges which should be
		 * converted to HTML entities */
		*retval = (w + range[2]) & range[3];
		return true;
	}

	/* None of the ranges matched */
	return false;
}
3964 
/* Return a copy of `input` (encoded in `encoding`) in which every codepoint
 * matched by the conversion map is replaced by an HTML numeric entity: "&#N;"
 * in decimal, or "&#xN;" in uppercase hexadecimal if `hex` is true. */
static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size, bool hex)
{
	static const char digits[] = "0123456789ABCDEF";

	/* Each wchar which we get from decoding the input string may become up to
	 * 13 wchars when we convert it to an HTML entity */
	uint32_t wchar_buf[32], converted_buf[32 * 13];
	unsigned char entity[16]; /* For converting wchars to hex/decimal string */
	unsigned char *const entity_end = entity + sizeof(entity);
	const uint32_t radix = hex ? 16 : 10;

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		/* Convert input string to wchars, up to 32 at a time */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf, 32, &state);
		ZEND_ASSERT(out_len <= 32);
		uint32_t *out = converted_buf;

		/* Run through wchars and see if any of them fall into the ranges
		 * which we want to convert to HTML entities */
		for (const uint32_t *src = wchar_buf, *src_end = wchar_buf + out_len; src < src_end; src++) {
			uint32_t w = *src;

			if (!html_numeric_entity_convert(w, convmap, conversion_map_size, &w)) {
				/* Not in any range: copy through unchanged */
				*out++ = w;
				continue;
			}

			*out++ = '&';
			*out++ = '#';
			if (hex) {
				*out++ = 'x';
			}

			/* Write the digits backwards from the end of `entity`; a value of
			 * zero still produces a single '0' */
			unsigned char *digit = entity_end;
			do {
				*(--digit) = digits[w % radix];
				w /= radix;
			} while (w > 0);

			while (digit < entity_end) {
				*out++ = *digit++;
			}

			*out++ = ';';
		}

		ZEND_ASSERT(out <= converted_buf + sizeof(converted_buf)/sizeof(*converted_buf));
		encoding->from_wchar(converted_buf, out - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
4030 
4031 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)4032 PHP_FUNCTION(mb_encode_numericentity)
4033 {
4034 	zend_string *encoding = NULL, *str;
4035 	size_t conversion_map_size;
4036 	HashTable *target_hash;
4037 	bool is_hex = false;
4038 
4039 	ZEND_PARSE_PARAMETERS_START(2, 4)
4040 		Z_PARAM_STR(str)
4041 		Z_PARAM_ARRAY_HT(target_hash)
4042 		Z_PARAM_OPTIONAL
4043 		Z_PARAM_STR_OR_NULL(encoding)
4044 		Z_PARAM_BOOL(is_hex)
4045 	ZEND_PARSE_PARAMETERS_END();
4046 
4047 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4048 	if (!enc) {
4049 		RETURN_THROWS();
4050 	}
4051 
4052 	uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4053 	if (convmap == NULL) {
4054 		RETURN_THROWS();
4055 	}
4056 
4057 	RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, conversion_map_size, is_hex));
4058 	efree(convmap);
4059 }
4060 /* }}} */
4061 
/* Map the numeric value of an HTML entity back to a codepoint.
 *
 * `convmap` holds `conversion_map_size` values, read in groups of four, of which
 * the first three are used here: { low codepoint, high codepoint, offset }. For
 * the first group where `number - offset` (computed in unsigned 32-bit
 * arithmetic) lies inside the inclusive range, that value is stored in `*retval`
 * and true is returned. Otherwise `*retval` is left untouched and false is
 * returned. */
static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t i = 0; i < conversion_map_size; i += 4) {
		const uint32_t *range = &convmap[i];
		const uint32_t candidate = number - range[2];

		if (candidate < range[0] || candidate > range[1]) {
			continue;
		}

		*retval = candidate;
		return true;
	}

	return false;
}
4079 
4080 #define DEC_ENTITY_MINLEN 3  /* For "&#" and 1 decimal digit */
4081 #define HEX_ENTITY_MINLEN 4  /* For "&#x" and 1 hexadecimal digit */
4082 #define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
4083 #define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
4084 
html_numeric_entity_decode(zend_string * input,const mbfl_encoding * encoding,uint32_t * convmap,size_t conversion_map_size)4085 static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size)
4086 {
4087 	uint32_t wchar_buf[128], converted_buf[128];
4088 
4089 	unsigned int state = 0;
4090 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
4091 	size_t in_len = ZSTR_LEN(input);
4092 
4093 	mb_convert_buf buf;
4094 	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
4095 
4096 	/* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
4097 	 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
4098 	 * 2nd 'converted' buffer.
4099 	 *
4100 	 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
4101 	 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
4102 	 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
4103 	 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
4104 	 * to store the next batch of wchars after it.
4105 	 *
4106 	 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
4107 	 * wchars from the 1st buffer to the 2nd one.
4108 	 *
4109 	 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
4110 	 * have to do bounds checks when writing wchars into it.
4111 	 */
4112 
4113 	unsigned int wchar_buf_offset = 0;
4114 
4115 	while (in_len) {
4116 		/* Leave space for sentinel at the end of the buffer */
4117 		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
4118 		out_len += wchar_buf_offset;
4119 		ZEND_ASSERT(out_len <= 127);
4120 		wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */
4121 
4122 		uint32_t *p, *converted;
4123 
4124 		/* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
4125 		 * be there (in `wchar_buf[0]`), so don't bother in that case */
4126 		if (wchar_buf_offset == 0) {
4127 			p = wchar_buf;
4128 			while (*p != '&')
4129 				p++;
4130 			if (p == wchar_buf + out_len) {
4131 				/* No HTML entities in this buffer */
4132 				encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
4133 				continue;
4134 			}
4135 
4136 			/* Copy over the prefix with no & which we already scanned */
4137 			memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
4138 			converted = converted_buf + (p - wchar_buf);
4139 		} else {
4140 			p = wchar_buf;
4141 			converted = converted_buf;
4142 		}
4143 
4144 found_ampersand:
4145 		ZEND_ASSERT(*p == '&');
4146 		uint32_t *p2 = p;
4147 
4148 		/* These tests can't overrun end of buffer, because we have a '&' sentinel there */
4149 		if (*++p2 == '#') {
4150 			if (*++p2 == 'x') {
4151 				/* Possible hex entity */
4152 				uint32_t w = *++p2;
4153 				while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
4154 					w = *++p2;
4155 				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
4156 					/* We hit the end of the buffer while reading digits, and
4157 					 * more wchars are still coming in the next buffer
4158 					 * Reprocess this identity on next iteration */
4159 					memmove(wchar_buf, p, (p2 - p) * 4);
4160 					wchar_buf_offset = p2 - p;
4161 					goto process_converted_wchars;
4162 				} else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
4163 					/* Invalid entity (too long or "&#x" only) */
4164 					memcpy(converted, p, (p2 - p) * 4);
4165 					converted += p2 - p;
4166 				} else {
4167 					/* Valid hexadecimal entity */
4168 					uint32_t value = 0, *p3 = p + 3;
4169 					while (p3 < p2) {
4170 						w = *p3++;
4171 						if (w <= '9') {
4172 							value = (value * 16) + (w - '0');
4173 						} else if (w >= 'a') {
4174 							value = (value * 16) + 10 + (w - 'a');
4175 						} else {
4176 							value = (value * 16) + 10 + (w - 'A');
4177 						}
4178 					}
4179 					if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
4180 						converted++;
4181 						if (*p2 == ';')
4182 							p2++;
4183 					} else {
4184 						memcpy(converted, p, (p2 - p) * 4);
4185 						converted += p2 - p;
4186 					}
4187 				}
4188 			} else {
4189 				/* Possible decimal entity */
4190 				uint32_t w = *p2;
4191 				while (w >= '0' && w <= '9')
4192 					w = *++p2;
4193 				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
4194 					/* The number of digits was legal (no more than 10 decimal digits)
4195 					 * Reprocess this identity on next iteration of main loop */
4196 					memmove(wchar_buf, p, (p2 - p) * 4);
4197 					wchar_buf_offset = p2 - p;
4198 					goto process_converted_wchars;
4199 				} else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
4200 					/* Invalid entity (too long or "&#" only) */
4201 					memcpy(converted, p, (p2 - p) * 4);
4202 					converted += p2 - p;
4203 				} else {
4204 					/* Valid decimal entity */
4205 					uint32_t value = 0, *p3 = p + 2;
4206 					while (p3 < p2) {
4207 						/* If unsigned integer overflow would occur in the below
4208 						 * multiplication by 10, this entity is no good
4209 						 * 0x19999999 is 1/10th of 0xFFFFFFFF */
4210 						if (value > 0x19999999) {
4211 							memcpy(converted, p, (p2 - p) * 4);
4212 							converted += p2 - p;
4213 							goto decimal_entity_too_big;
4214 						}
4215 						value = (value * 10) + (*p3++ - '0');
4216 					}
4217 					if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
4218 						converted++;
4219 						if (*p2 == ';')
4220 							p2++;
4221 					} else {
4222 						memcpy(converted, p, (p2 - p) * 4);
4223 						converted += p2 - p;
4224 					}
4225 				}
4226 			}
4227 		} else if ((p2 == wchar_buf + out_len) && in_len) {
4228 			/* Corner case: & at end of buffer */
4229 			wchar_buf[0] = '&';
4230 			wchar_buf_offset = 1;
4231 			goto process_converted_wchars;
4232 		} else {
4233 			*converted++ = '&';
4234 		}
4235 decimal_entity_too_big:
4236 
4237 		/* Starting to scan a new section of the wchar buffer
4238 		 * 'p2' is pointing at the next wchar which needs to be processed */
4239 		p = p2;
4240 		while (*p2 != '&')
4241 			p2++;
4242 
4243 		if (p2 > p) {
4244 			memcpy(converted, p, (p2 - p) * 4);
4245 			converted += p2 - p;
4246 			p = p2;
4247 		}
4248 
4249 		if (p < wchar_buf + out_len)
4250 			goto found_ampersand;
4251 
4252 		/* We do not have any wchars remaining at the end of this buffer which
4253 		 * we need to reprocess on the next call */
4254 		wchar_buf_offset = 0;
4255 process_converted_wchars:
4256 		ZEND_ASSERT(converted <= converted_buf + 128);
4257 		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
4258 	}
4259 
4260 	return mb_convert_buf_result(&buf, encoding);
4261 }
4262 
4263 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)4264 PHP_FUNCTION(mb_decode_numericentity)
4265 {
4266 	zend_string *encoding = NULL, *str;
4267 	size_t conversion_map_size;
4268 	HashTable *target_hash;
4269 
4270 	ZEND_PARSE_PARAMETERS_START(2, 3)
4271 		Z_PARAM_STR(str)
4272 		Z_PARAM_ARRAY_HT(target_hash)
4273 		Z_PARAM_OPTIONAL
4274 		Z_PARAM_STR_OR_NULL(encoding)
4275 	ZEND_PARSE_PARAMETERS_END();
4276 
4277 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4278 	if (!enc) {
4279 		RETURN_THROWS();
4280 	}
4281 
4282 	uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4283 	if (convmap == NULL) {
4284 		RETURN_THROWS();
4285 	}
4286 
4287 	RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, conversion_map_size));
4288 	efree(convmap);
4289 }
4290 /* }}} */
4291 
4292 /* {{{ Sends an email message with MIME scheme */
4293 #define CRLF "\r\n"
4294 
_php_mbstr_parse_mail_headers(HashTable * ht,const char * str,size_t str_len)4295 static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
4296 {
4297 	const char *ps;
4298 	size_t icnt;
4299 	int state = 0;
4300 	int crlf_state = -1;
4301 	char *token = NULL;
4302 	size_t token_pos = 0;
4303 	zend_string *fld_name, *fld_val;
4304 
4305 	ps = str;
4306 	icnt = str_len;
4307 	fld_name = fld_val = NULL;
4308 
4309 	/*
4310 	 *             C o n t e n t - T y p e :   t e x t / h t m l \r\n
4311 	 *             ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
4312 	 *      state  0            1           2          3
4313 	 *
4314 	 *             C o n t e n t - T y p e :   t e x t / h t m l \r\n
4315 	 *             ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
4316 	 * crlf_state -1                       0                     1 -1
4317 	 *
4318 	 */
4319 
4320 	while (icnt > 0) {
4321 		switch (*ps) {
4322 			case ':':
4323 				if (crlf_state == 1) {
4324 					token_pos++;
4325 				}
4326 
4327 				if (state == 0 || state == 1) {
4328 					if(token && token_pos > 0) {
4329 						fld_name = zend_string_init(token, token_pos, 0);
4330 					}
4331 					state = 2;
4332 				} else {
4333 					token_pos++;
4334 				}
4335 
4336 				crlf_state = 0;
4337 				break;
4338 
4339 			case '\n':
4340 				if (crlf_state == -1) {
4341 					goto out;
4342 				}
4343 				crlf_state = -1;
4344 				break;
4345 
4346 			case '\r':
4347 				if (crlf_state == 1) {
4348 					token_pos++;
4349 				} else {
4350 					crlf_state = 1;
4351 				}
4352 				break;
4353 
4354 			case ' ': case '\t':
4355 				if (crlf_state == -1) {
4356 					if (state == 3) {
4357 						/* continuing from the previous line */
4358 						state = 4;
4359 					} else {
4360 						/* simply skipping this new line */
4361 						state = 5;
4362 					}
4363 				} else {
4364 					if (crlf_state == 1) {
4365 						token_pos++;
4366 					}
4367 					if (state == 1 || state == 3) {
4368 						token_pos++;
4369 					}
4370 				}
4371 				crlf_state = 0;
4372 				break;
4373 
4374 			default:
4375 				switch (state) {
4376 					case 0:
4377 						token = (char*)ps;
4378 						token_pos = 0;
4379 						state = 1;
4380 						break;
4381 
4382 					case 2:
4383 						if (crlf_state != -1) {
4384 							token = (char*)ps;
4385 							token_pos = 0;
4386 
4387 							state = 3;
4388 							break;
4389 						}
4390 						ZEND_FALLTHROUGH;
4391 
4392 					case 3:
4393 						if (crlf_state == -1) {
4394 							if(token && token_pos > 0) {
4395 								fld_val = zend_string_init(token, token_pos, 0);
4396 							}
4397 
4398 							if (fld_name != NULL && fld_val != NULL) {
4399 								zval val;
4400 								zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4401 								ZVAL_STR(&val, fld_val);
4402 
4403 								zend_hash_update(ht, fld_name, &val);
4404 
4405 								zend_string_release_ex(fld_name, 0);
4406 							}
4407 
4408 							fld_name = fld_val = NULL;
4409 							token = (char*)ps;
4410 							token_pos = 0;
4411 
4412 							state = 1;
4413 						}
4414 						break;
4415 
4416 					case 4:
4417 						token_pos++;
4418 						state = 3;
4419 						break;
4420 				}
4421 
4422 				if (crlf_state == 1) {
4423 					token_pos++;
4424 				}
4425 
4426 				token_pos++;
4427 
4428 				crlf_state = 0;
4429 				break;
4430 		}
4431 		ps++, icnt--;
4432 	}
4433 out:
4434 	if (state == 2) {
4435 		token = "";
4436 		token_pos = 0;
4437 
4438 		state = 3;
4439 	}
4440 	if (state == 3) {
4441 		if(token && token_pos > 0) {
4442 			fld_val = zend_string_init(token, token_pos, 0);
4443 		}
4444 		if (fld_name != NULL && fld_val != NULL) {
4445 			zval val;
4446 			zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4447 			ZVAL_STR(&val, fld_val);
4448 			zend_hash_update(ht, fld_name, &val);
4449 
4450 			zend_string_release_ex(fld_name, 0);
4451 		}
4452 	}
4453 	return state;
4454 }
4455 
PHP_FUNCTION(mb_send_mail)4456 PHP_FUNCTION(mb_send_mail)
4457 {
4458 	char *to;
4459 	size_t to_len;
4460 	char *message;
4461 	size_t message_len;
4462 	zend_string *subject;
4463 	zend_string *extra_cmd = NULL;
4464 	HashTable *headers_ht = NULL;
4465 	zend_string *str_headers = NULL;
4466 	size_t i;
4467 	char *to_r = NULL;
4468 	bool suppress_content_type = false;
4469 	bool suppress_content_transfer_encoding = false;
4470 
4471 	char *p;
4472 	enum mbfl_no_encoding;
4473 	const mbfl_encoding *tran_cs,	/* transfer text charset */
4474 						*head_enc,	/* header transfer encoding */
4475 						*body_enc;	/* body transfer encoding */
4476 	const mbfl_language *lang;
4477 	HashTable ht_headers;
4478 	zval *s;
4479 
4480 	/* character-set, transfer-encoding */
4481 	tran_cs = &mbfl_encoding_utf8;
4482 	head_enc = &mbfl_encoding_base64;
4483 	body_enc = &mbfl_encoding_base64;
4484 	lang = mbfl_no2language(MBSTRG(language));
4485 	if (lang != NULL) {
4486 		tran_cs = mbfl_no2encoding(lang->mail_charset);
4487 		head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4488 		body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4489 	}
4490 
4491 	ZEND_PARSE_PARAMETERS_START(3, 5)
4492 		Z_PARAM_PATH(to, to_len)
4493 		Z_PARAM_PATH_STR(subject)
4494 		Z_PARAM_PATH(message, message_len)
4495 		Z_PARAM_OPTIONAL
4496 		Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4497 		Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4498 	ZEND_PARSE_PARAMETERS_END();
4499 
4500 	if (str_headers) {
4501 		if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4502 			zend_argument_value_error(4, "must not contain any null bytes");
4503 			RETURN_THROWS();
4504 		}
4505 		str_headers = php_trim(str_headers, NULL, 0, 2);
4506 	} else if (headers_ht) {
4507 		str_headers = php_mail_build_headers(headers_ht);
4508 		if (EG(exception)) {
4509 			RETURN_THROWS();
4510 		}
4511 	}
4512 
4513 	zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4514 
4515 	if (str_headers != NULL) {
4516 		_php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4517 	}
4518 
4519 	if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4520 		char *tmp;
4521 		char *param_name;
4522 		char *charset = NULL;
4523 
4524 		ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4525 		p = strchr(Z_STRVAL_P(s), ';');
4526 
4527 		if (p != NULL) {
4528 			/* skipping the padded spaces */
4529 			do {
4530 				++p;
4531 			} while (*p == ' ' || *p == '\t');
4532 
4533 			if (*p != '\0') {
4534 				if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4535 					if (strcasecmp(param_name, "charset") == 0) {
4536 						const mbfl_encoding *_tran_cs = tran_cs;
4537 
4538 						charset = php_strtok_r(NULL, "= \"", &tmp);
4539 						if (charset != NULL) {
4540 							_tran_cs = mbfl_name2encoding(charset);
4541 						}
4542 
4543 						if (!_tran_cs) {
4544 							php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4545 							_tran_cs = &mbfl_encoding_ascii;
4546 						}
4547 						tran_cs = _tran_cs;
4548 					}
4549 				}
4550 			}
4551 		}
4552 		suppress_content_type = true;
4553 	}
4554 
4555 	if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4556 		const mbfl_encoding *_body_enc;
4557 
4558 		ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4559 		_body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4560 		switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4561 			case mbfl_no_encoding_base64:
4562 			case mbfl_no_encoding_7bit:
4563 			case mbfl_no_encoding_8bit:
4564 				body_enc = _body_enc;
4565 				break;
4566 
4567 			default:
4568 				php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4569 				body_enc =	&mbfl_encoding_8bit;
4570 				break;
4571 		}
4572 		suppress_content_transfer_encoding = true;
4573 	}
4574 
4575 	/* To: */
4576 	if (to_len > 0) {
4577 		to_r = estrndup(to, to_len);
4578 		for (; to_len; to_len--) {
4579 			if (!isspace((unsigned char) to_r[to_len - 1])) {
4580 				break;
4581 			}
4582 			to_r[to_len - 1] = '\0';
4583 		}
4584 		for (i = 0; to_r[i]; i++) {
4585 			if (iscntrl((unsigned char) to_r[i])) {
4586 				/* According to RFC 822, section 3.1.1 long headers may be separated into
4587 				 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4588 				 * To prevent these separators from being replaced with a space, we skip over them. */
4589 				if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4590 					i += 2;
4591 					while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4592 						i++;
4593 					}
4594 					continue;
4595 				}
4596 
4597 				to_r[i] = ' ';
4598 			}
4599 		}
4600 	} else {
4601 		to_r = to;
4602 	}
4603 
4604 	/* Subject: */
4605 	const mbfl_encoding *enc = MBSTRG(current_internal_encoding);
4606 	if (enc == &mbfl_encoding_pass) {
4607 		enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4608 	}
4609 	const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4610 	size_t line_sep_len = strlen(line_sep);
4611 
4612 	subject = mb_mime_header_encode(subject, enc, tran_cs, head_enc == &mbfl_encoding_base64, (char*)line_sep, line_sep_len, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4613 
4614 	/* message body */
4615 	const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding);
4616 	if (msg_enc == &mbfl_encoding_pass) {
4617 		msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4618 	}
4619 
4620 	unsigned int num_errors = 0;
4621 	zend_string *tmpstr = mb_fast_convert((unsigned char*)message, message_len, msg_enc, tran_cs, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4622 	zend_string *conv = mb_fast_convert((unsigned char*)ZSTR_VAL(tmpstr), ZSTR_LEN(tmpstr), &mbfl_encoding_8bit, body_enc, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4623 	zend_string_free(tmpstr);
4624 	message = ZSTR_VAL(conv);
4625 
4626 	/* other headers */
4627 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4628 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4629 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4630 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4631 
4632 	smart_str str = {0};
4633 	bool empty = true;
4634 
4635 	if (str_headers != NULL && ZSTR_LEN(str_headers) > 0) {
4636 		/* Strip trailing CRLF from `str_headers`; we will add CRLF back if necessary */
4637 		size_t len = ZSTR_LEN(str_headers);
4638 		if (ZSTR_VAL(str_headers)[len-1] == '\n') {
4639 			len--;
4640 		}
4641 		if (ZSTR_VAL(str_headers)[len-1] == '\r') {
4642 			len--;
4643 		}
4644 		smart_str_appendl(&str, ZSTR_VAL(str_headers), len);
4645 		empty = false;
4646 		zend_string_release_ex(str_headers, 0);
4647 	}
4648 
4649 	if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4650 		if (!empty) {
4651 			smart_str_appendl(&str, line_sep, line_sep_len);
4652 		}
4653 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4654 		empty = false;
4655 	}
4656 
4657 	if (!suppress_content_type) {
4658 		if (!empty) {
4659 			smart_str_appendl(&str, line_sep, line_sep_len);
4660 		}
4661 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4662 
4663 		p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4664 		if (p != NULL) {
4665 			smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4666 			smart_str_appends(&str, p);
4667 		}
4668 		empty = false;
4669 	}
4670 
4671 	if (!suppress_content_transfer_encoding) {
4672 		if (!empty) {
4673 			smart_str_appendl(&str, line_sep, line_sep_len);
4674 		}
4675 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4676 		p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4677 		if (p == NULL) {
4678 			p = "7bit";
4679 		}
4680 		smart_str_appends(&str, p);
4681 	}
4682 
4683 	str_headers = smart_str_extract(&str);
4684 
4685 	zend_string *force_extra_parameters = zend_ini_str_ex("mail.force_extra_parameters", strlen("mail.force_extra_parameters"), false, NULL);
4686 	if (force_extra_parameters) {
4687 		extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4688 	} else if (extra_cmd) {
4689 		extra_cmd = php_escape_shell_cmd(extra_cmd);
4690 	}
4691 
4692 	RETVAL_BOOL(php_mail(to_r, ZSTR_VAL(subject), message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL));
4693 
4694 	if (extra_cmd) {
4695 		zend_string_release_ex(extra_cmd, 0);
4696 	}
4697 	if (to_r != to) {
4698 		efree(to_r);
4699 	}
4700 	zend_string_release(subject);
4701 	zend_string_free(conv);
4702 	zend_hash_destroy(&ht_headers);
4703 	if (str_headers) {
4704 		zend_string_release_ex(str_headers, 0);
4705 	}
4706 }
4707 
4708 #undef CRLF
4709 #undef MAIL_ASCIIZ_CHECK_MBSTRING
4710 #undef PHP_MBSTR_MAIL_MIME_HEADER1
4711 #undef PHP_MBSTR_MAIL_MIME_HEADER2
4712 #undef PHP_MBSTR_MAIL_MIME_HEADER3
4713 #undef PHP_MBSTR_MAIL_MIME_HEADER4
4714 /* }}} */
4715 
4716 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)4717 PHP_FUNCTION(mb_get_info)
4718 {
4719 	zend_string *type = NULL;
4720 	size_t n;
4721 	char *name;
4722 	zval row;
4723 	const mbfl_encoding **entry;
4724 	const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
4725 
4726 	ZEND_ASSERT(lang);
4727 
4728 	ZEND_PARSE_PARAMETERS_START(0, 1)
4729 		Z_PARAM_OPTIONAL
4730 		Z_PARAM_STR(type)
4731 	ZEND_PARSE_PARAMETERS_END();
4732 
4733 	if (!type || zend_string_equals_literal_ci(type, "all")) {
4734 		array_init(return_value);
4735 		if (MBSTRG(current_internal_encoding)) {
4736 			add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4737 		}
4738 		if (MBSTRG(http_input_identify)) {
4739 			add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4740 		}
4741 		if (MBSTRG(current_http_output_encoding)) {
4742 			add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4743 		}
4744 
4745 		add_assoc_str(return_value, "http_output_conv_mimetypes",
4746 			zend_ini_str("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)
4747 		);
4748 
4749 		name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4750 		add_assoc_string(return_value, "mail_charset", name);
4751 
4752 		name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4753 		add_assoc_string(return_value, "mail_header_encoding", name);
4754 
4755 		name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4756 		add_assoc_string(return_value, "mail_body_encoding", name);
4757 
4758 		add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4759 
4760 		if (MBSTRG(encoding_translation)) {
4761 			add_assoc_string(return_value, "encoding_translation", "On");
4762 		} else {
4763 			add_assoc_string(return_value, "encoding_translation", "Off");
4764 		}
4765 
4766 		name = (char *)mbfl_no_language2name(MBSTRG(language));
4767 		add_assoc_string(return_value, "language", name);
4768 
4769 		// TODO Seems to always have one entry at least?
4770 		n = MBSTRG(current_detect_order_list_size);
4771 		entry = MBSTRG(current_detect_order_list);
4772 		if (n > 0) {
4773 			size_t i;
4774 			array_init(&row);
4775 			for (i = 0; i < n; i++) {
4776 				add_next_index_string(&row, (*entry)->name);
4777 				entry++;
4778 			}
4779 			add_assoc_zval(return_value, "detect_order", &row);
4780 		}
4781 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4782 			add_assoc_string(return_value, "substitute_character", "none");
4783 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4784 			add_assoc_string(return_value, "substitute_character", "long");
4785 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4786 			add_assoc_string(return_value, "substitute_character", "entity");
4787 		} else {
4788 			add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4789 		}
4790 		if (MBSTRG(strict_detection)) {
4791 			add_assoc_string(return_value, "strict_detection", "On");
4792 		} else {
4793 			add_assoc_string(return_value, "strict_detection", "Off");
4794 		}
4795 	} else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4796 		ZEND_ASSERT(MBSTRG(current_internal_encoding));
4797 		RETURN_STRING((char *)MBSTRG(current_internal_encoding)->name);
4798 	} else if (zend_string_equals_literal_ci(type, "http_input")) {
4799 		if (MBSTRG(http_input_identify)) {
4800 			RETURN_STRING((char *)MBSTRG(http_input_identify)->name);
4801 		}
4802 		RETURN_NULL();
4803 	} else if (zend_string_equals_literal_ci(type, "http_output")) {
4804 		ZEND_ASSERT(MBSTRG(current_http_output_encoding));
4805 		RETURN_STRING((char *)MBSTRG(current_http_output_encoding)->name);
4806 	} else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4807 		RETURN_STR(
4808 			zend_ini_str(
4809 				"mbstring.http_output_conv_mimetypes",
4810 				sizeof("mbstring.http_output_conv_mimetypes") - 1,
4811 				false
4812 			)
4813 		);
4814 	} else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4815 		name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4816 		RETURN_STRING(name);
4817 	} else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4818 		name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4819 		RETURN_STRING(name);
4820 	} else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4821 		name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4822 		RETURN_STRING(name);
4823 	} else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4824 		RETURN_LONG(MBSTRG(illegalchars));
4825 	} else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4826 		if (MBSTRG(encoding_translation)) {
4827 			RETURN_STRING("On");
4828 		} else {
4829 			RETURN_STRING("Off");
4830 		}
4831 	} else if (zend_string_equals_literal_ci(type, "language")) {
4832 		name = (char *)mbfl_no_language2name(MBSTRG(language));
4833 		RETURN_STRING(name);
4834 	} else if (zend_string_equals_literal_ci(type, "detect_order")) {
4835 		// TODO Seems to always have one entry at least?
4836 		n = MBSTRG(current_detect_order_list_size);
4837 		entry = MBSTRG(current_detect_order_list);
4838 		if (n > 0) {
4839 			size_t i;
4840 			array_init(return_value);
4841 			for (i = 0; i < n; i++) {
4842 				add_next_index_string(return_value, (*entry)->name);
4843 				entry++;
4844 			}
4845 		}
4846 	} else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4847 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4848 			RETURN_STRING("none");
4849 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4850 			RETURN_STRING("long");
4851 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4852 			RETURN_STRING("entity");
4853 		} else {
4854 			RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
4855 		}
4856 	} else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4857 		if (MBSTRG(strict_detection)) {
4858 			RETURN_STRING("On");
4859 		} else {
4860 			RETURN_STRING("Off");
4861 		}
4862 	} else {
4863 		php_error_docref(NULL, E_WARNING, "argument #1 ($type) must be a valid type");
4864 		RETURN_FALSE;
4865 	}
4866 }
4867 /* }}} */
4868 
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)4869 MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4870 {
4871 	uint32_t wchar_buf[128];
4872 	unsigned char *in = (unsigned char*)input;
4873 	unsigned int state = 0;
4874 
4875 	if (encoding->check != NULL) {
4876 		return encoding->check(in, length);
4877 	}
4878 
4879 	/* If the input string is not encoded in the given encoding, there is a significant chance
4880 	 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4881 	 * buffer of 128 codepoints, convert and check just a few codepoints first */
4882 	size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4883 	ZEND_ASSERT(out_len <= 8);
4884 	for (unsigned int i = 0; i < out_len; i++) {
4885 		if (wchar_buf[i] == MBFL_BAD_INPUT) {
4886 			return false;
4887 		}
4888 	}
4889 
4890 	while (length) {
4891 		out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4892 		ZEND_ASSERT(out_len <= 128);
4893 		for (unsigned int i = 0; i < out_len; i++) {
4894 			if (wchar_buf[i] == MBFL_BAD_INPUT) {
4895 				return false;
4896 			}
4897 		}
4898 	}
4899 
4900 	return true;
4901 }
4902 
4903 /* MSVC 32-bit has issues with 64-bit intrinsics.
4904  * (Bad 7/8-byte UTF-8 strings would be wrongly passed through as 'valid')
4905  * It seems this is caused by a bug in MS Visual C++
4906  * Ref: https://stackoverflow.com/questions/37509129/potential-bug-in-visual-studio-c-compiler-or-in-intel-intrinsics-avx2-mm256-s */
4907 #if defined(PHP_WIN32) && !defined(__clang__) && defined(_MSC_VER) && defined(_M_IX86)
4908 # define MBSTRING_BROKEN_X86_MSVC_INTRINSICS
4909 #endif
4910 
4911 /* If we are building an AVX2-only binary, don't compile the next function */
4912 #ifndef ZEND_INTRIN_AVX2_NATIVE
4913 
4914 /* SSE2-based function for validating UTF-8 strings
4915  * A faster implementation which uses AVX2 instructions follows */
/* Returns true if the content of `str` is well-formed UTF-8, false otherwise.
 * Built with SSE2 this validates 16 bytes per iteration; without SSE2 it falls back to a
 * byte-at-a-time validator derived from PCRE2. */
static bool mb_fast_check_utf8_default(zend_string *str)
{
	unsigned char *cur = (unsigned char*)ZSTR_VAL(str);
# ifdef __SSE2__
	/* `blocks_end` points 1 byte past the last full 16-byte block of string content
	 * The terminating null byte which every zend_string carries is counted as content
	 * here, so that a multi-byte character which is cut off at the very end of the
	 * string is seen as being followed by a non-continuation byte */
	unsigned char *blocks_end = cur + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m128i) - 1));

	/* Offset for detecting the illegal bytes 0xF5-FF */
	const __m128i over_f5 = _mm_set1_epi8(-117);
	/* Threshold for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
	const __m128i over_9f = _mm_set1_epi8(-97);
	/* Threshold for overlong 4-byte code units and codepoints > U+10FFFF */
	const __m128i over_8f = _mm_set1_epi8(-113);
	/* For detecting the illegal bytes 0xC0-C1 */
	const __m128i find_c0 = _mm_set1_epi8(-64);
	const __m128i c0_to_c1 = _mm_set1_epi8(-126);
	/* For working out where continuation bytes belong */
	const __m128i find_e0 = _mm_set1_epi8(-32);
	const __m128i find_f0 = _mm_set1_epi8(-16);
	/* Picks out a character which starts near the end of one block and needs continuation
	 * bytes in the following block: a 2/3/4-byte character starting at the last byte, a
	 * 3/4-byte character starting at the 2nd last, or a 4-byte character starting at the
	 * 3rd last */
	const __m128i straddle_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);

	__m128i prev_block = _mm_setzero_si128();
	__m128i block;

	while (cur < blocks_end) {
		block = _mm_loadu_si128((__m128i*)cur); /* Load 16 bytes */

check_block:
		/* A block consisting only of single-byte characters lets us skip most checks */
		if (!_mm_movemask_epi8(block)) {
			/* ...but the previous block must not have ended in the middle of a
			 * multi-byte character which expected its continuation bytes here */
			__m128i straddling = _mm_cmpeq_epi8(_mm_and_si128(prev_block, straddle_mask), straddle_mask);
			if (_mm_movemask_epi8(straddling)) {
				return false;
			}

			/* Consume as many full blocks of single-byte characters as we can;
			 * `prev_block` is deliberately left as it is while doing so */
			while (true) {
				cur += sizeof(__m128i);
				if (cur >= blocks_end) {
					goto tail;
				}
				block = _mm_loadu_si128((__m128i*)cur);
				if (_mm_movemask_epi8(block)) {
					break;
				}
			}
		}

		/* Bytes >= 0xF5 never occur in UTF-8
		 * SSE2 only offers signed byte comparison, so an offset is added which moves
		 * 0xF5-FF to the bottom of the signed range; one signed compare then finds them
		 * Each byte of `errors` is 0x00 for good and 0xFF for bad */
		__m128i errors = _mm_cmplt_epi8(_mm_add_epi8(block, over_f5), over_f5);

		/* Overlong 3-byte code units AND reserved codepoints U+D800-DFFF
		 * 0xE0 followed by a byte < 0xA0 is an overlong 3-byte code unit, and
		 * 0xED followed by a byte >= 0xA0 is a reserved codepoint
		 * Both are found at once: build a vector holding 0xE0 or 0xED depending on the
		 * comparison of each byte against 0x9F, and compare it with the block shifted
		 * along by one byte (so each position sees the byte which precedes it) */
		__m128i back1 = _mm_or_si128(_mm_slli_si128(block, 1), _mm_srli_si128(prev_block, 15));
		__m128i e0_or_ed = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(block, over_9f)));
		errors = _mm_or_si128(errors, _mm_cmpeq_epi8(back1, e0_or_ed));

		/* Overlong 4-byte code units AND codepoints > U+10FFFF
		 * Same idea: 0xF0 followed by < 0x90 is overlong, 0xF4 followed by >= 0x90 is
		 * beyond U+10FFFF */
		__m128i f0_or_f4 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(block, over_8f)));
		errors = _mm_or_si128(errors, _mm_cmpeq_epi8(back1, f0_or_f4));

		/* Overlong 2-byte code units
		 * 0xC0 and 0xC1 can only start an overlong 2-byte code unit; again shift them to
		 * the bottom of the signed range and use a signed compare */
		errors = _mm_or_si128(errors, _mm_cmplt_epi8(_mm_add_epi8(block, find_c0), c0_to_c1));

		/* Structure of continuation bytes
		 * A byte must be a continuation byte if, and only if, it is
		 * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
		 * 2) 2 bytes after the start of a 3-byte or 4-byte character
		 * 3) 3 bytes after the start of a 4-byte character
		 * `expected` gets 0xFF in every such position */
		__m128i expected = _mm_cmpeq_epi8(_mm_and_si128(back1, find_c0), find_c0);
		__m128i back2 = _mm_or_si128(_mm_slli_si128(block, 2), _mm_srli_si128(prev_block, 14));
		expected = _mm_or_si128(expected, _mm_cmpeq_epi8(_mm_and_si128(back2, find_e0), find_e0));
		__m128i back3 = _mm_or_si128(_mm_slli_si128(block, 3), _mm_srli_si128(prev_block, 13));
		expected = _mm_or_si128(expected, _mm_cmpeq_epi8(_mm_and_si128(back3, find_f0), find_f0));

		/* `actual` gets 0xFF wherever a continuation byte really is; any position where
		 * `actual` and `expected` disagree is an error */
		__m128i actual = _mm_cmplt_epi8(block, find_c0);
		errors = _mm_or_si128(errors, _mm_xor_si128(actual, expected));

		/* Gather the high bit of every byte of `errors`; non-zero means a bad byte */
		if (_mm_movemask_epi8(errors)) {
			return false;
		}

		prev_block = block;
		cur += sizeof(__m128i);
	}

tail:
	/* Whatever did not fill a whole 16-byte block is handled here. After a tail block
	 * has been pushed through the checks above, `cur` has moved past `blocks_end` and
	 * this section is skipped. */
	if (cur == blocks_end) {
		uint8_t tail_len = ZSTR_LEN(str) & (sizeof(__m128i) - 1); /* Not including terminating null */

		/* The tail is placed in the low bytes of an XMM register, zero-filled above, and
		 * sent through the vectorized checks. Each load below covers the tail bytes plus
		 * at least the terminating null.
		 *
		 * For 1-4 and 7-8 tail bytes a plain 2, 4 or 8 byte load is enough. For the other
		 * lengths, 16 bytes are loaded from an address *before* `cur`, so that the load
		 * ends just after the terminating null, and the unwanted leading bytes are then
		 * shifted out (which also zero-fills the other end of the register). Reading
		 * before `cur` relies on the zend_string header fields, which sit in memory just
		 * before the content, so the load stays inside the zend_string allocation.
		 *
		 * The `switch` is needed because the byte shift takes its distance as an
		 * immediate; it cannot be given a run-time value. */
		switch (tail_len) {
		case 0: {
			/* No tail at all; just make sure the final block did not end in the middle
			 * of a multi-byte character */
			__m128i straddling = _mm_cmpeq_epi8(_mm_and_si128(prev_block, straddle_mask), straddle_mask);
			return _mm_movemask_epi8(straddling) == 0;
		}
		case 1:
		case 2:
			block = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, *((uint16_t*)cur));
			goto check_block;
		case 3:
		case 4:
			block = _mm_set_epi32(0, 0, 0, *((uint32_t*)cur));
			goto check_block;
		case 5:
			block = _mm_srli_si128(_mm_loadu_si128((__m128i*)(cur - 10)), 10);
			goto check_block;
		case 6:
			block = _mm_srli_si128(_mm_loadu_si128((__m128i*)(cur - 9)), 9);
			goto check_block;
		case 7:
		case 8:
#ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
			block = _mm_set_epi32(0, 0, ((int32_t*)cur)[1], ((int32_t*)cur)[0]);
#else
			block = _mm_set_epi64x(0, *((uint64_t*)cur));
#endif
			goto check_block;
		case 9:
			block = _mm_srli_si128(_mm_loadu_si128((__m128i*)(cur - 6)), 6);
			goto check_block;
		case 10:
			block = _mm_srli_si128(_mm_loadu_si128((__m128i*)(cur - 5)), 5);
			goto check_block;
		case 11:
			block = _mm_srli_si128(_mm_loadu_si128((__m128i*)(cur - 4)), 4);
			goto check_block;
		case 12:
			block = _mm_srli_si128(_mm_loadu_si128((__m128i*)(cur - 3)), 3);
			goto check_block;
		case 13:
			block = _mm_srli_si128(_mm_loadu_si128((__m128i*)(cur - 2)), 2);
			goto check_block;
		case 14:
			block = _mm_srli_si128(_mm_loadu_si128((__m128i*)(cur - 1)), 1);
			goto check_block;
		case 15:
			/* Nothing is left to check: with the terminating null counted in, the
			 * content filled whole blocks exactly. `tail_len` is 15 only because the
			 * null was left out when computing it. */
			return true;
		}

		ZEND_UNREACHABLE();
	}

	return true;
# else
	/* This UTF-8 validation function is derived from PCRE2 */
	size_t remaining = ZSTR_LEN(str);
	/* Number of extra bytes which follow a lead byte, indexed by the lead byte masked
	 * with 0x3f. The highest index for a valid UTF-8 lead byte is in fact 0x3d. */
	static const uint8_t utf8_table[] = {
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
		3,3,3,3,3,3,3,3
	};

	while (remaining > 0) {
		unsigned char lead = *cur++;
		remaining--;

		if (lead < 128) {
			/* ASCII character */
			continue;
		}

		/* An isolated 10xx xxxx byte, or a byte which never occurs in UTF-8 */
		if (lead < 0xc0 || lead >= 0xf5) {
			return false;
		}

		uint32_t extra = utf8_table[lead & 0x3f]; /* Number of additional bytes (1-3) */
		if (remaining < extra) {
			/* Missing bytes */
			return false;
		}
		remaining -= extra;

		/* `cur` now points at the second byte; its top bits must be 10 */
		uint32_t second = cur[0];
		if ((second & 0xc0) != 0x80) {
			return false;
		}

		/* For each length, check that the remaining bytes are continuation bytes, then
		 * check for an overlong sequence and for the excluded ranges */
		switch (extra) {
		case 1:
			/* 2-byte character: a lead byte of the form xx00 000x is overlong */
			if ((lead & 0x3e) == 0) {
				return false;
			}
			break;

		case 2:
			/* 3-byte character: third byte must be a continuation byte; then reject
			 * 1110 0000, xx0x xxxx (overlong) and 1110 1101, 101x xxxx (0xd800-0xdfff) */
			if ((cur[1] & 0xc0) != 0x80) {
				return false;
			}
			if ((lead == 0xe0 && (second & 0x20) == 0) || (lead == 0xed && second >= 0xa0)) {
				return false;
			}
			break;

		case 3:
			/* 4-byte character: third and fourth bytes must be continuation bytes; then
			 * reject 1111 0000, xx00 xxxx (overlong) and anything greater than 0x0010ffff
			 * (f4 8f bf bf). Lead bytes above 0xf4 were already rejected above. */
			if ((cur[1] & 0xc0) != 0x80 || (cur[2] & 0xc0) != 0x80) {
				return false;
			}
			if ((lead == 0xf0 && (second & 0x30) == 0) || (lead == 0xf4 && second > 0x8f)) {
				return false;
			}
			break;

			EMPTY_SWITCH_DEFAULT_CASE();
		}

		cur += extra;
	}

	return true;
# endif
}
5190 
5191 #endif /* #ifndef ZEND_INTRIN_AVX2_NATIVE */
5192 
5193 #ifdef ZEND_INTRIN_AVX2_NATIVE
5194 
5195 /* We are building AVX2-only binary */
5196 # include <immintrin.h>
5197 # define mb_fast_check_utf8 mb_fast_check_utf8_avx2
5198 
5199 #elif defined(ZEND_INTRIN_AVX2_RESOLVER)
5200 
5201 /* We are building binary which works with or without AVX2; whether or not to use
5202  * AVX2-accelerated functions will be determined at runtime */
5203 # include <immintrin.h>
5204 # include "Zend/zend_cpuinfo.h"
5205 
5206 # ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
5207 /* Dynamic linker will decide whether or not to use AVX2-based functions and
5208  * resolve symbols accordingly */
5209 
5210 ZEND_INTRIN_AVX2_FUNC_DECL(bool mb_fast_check_utf8_avx2(zend_string *str));
5211 
5212 bool mb_fast_check_utf8(zend_string *str) __attribute__((ifunc("resolve_check_utf8")));
5213 
5214 typedef bool (*check_utf8_func_t)(zend_string*);
5215 
5216 ZEND_NO_SANITIZE_ADDRESS
5217 ZEND_ATTRIBUTE_UNUSED
resolve_check_utf8(void)5218 static check_utf8_func_t resolve_check_utf8(void)
5219 {
5220 	if (zend_cpu_supports_avx2()) {
5221 		return mb_fast_check_utf8_avx2;
5222 	}
5223 	return mb_fast_check_utf8_default;
5224 }
5225 
5226 # else /* ZEND_INTRIN_AVX2_FUNC_PTR */
5227 /* We are compiling for a target where the dynamic linker will not be able to
5228  * resolve symbols according to whether the host supports AVX2 or not; so instead,
5229  * we can make calls go through a function pointer and set the function pointer
5230  * on module load */
5231 
5232 #ifdef HAVE_FUNC_ATTRIBUTE_TARGET
5233 static bool mb_fast_check_utf8_avx2(zend_string *str) __attribute__((target("avx2")));
5234 #else
5235 static bool mb_fast_check_utf8_avx2(zend_string *str);
5236 #endif
5237 
5238 static bool (*check_utf8_ptr)(zend_string *str) = NULL;
5239 
/* Trampoline: forwards to whichever validator `check_utf8_ptr` currently
 * points at. The pointer starts out NULL and is assigned by init_check_utf8(). */
static bool mb_fast_check_utf8(zend_string *str)
{
	return (*check_utf8_ptr)(str);
}
5244 
init_check_utf8(void)5245 static void init_check_utf8(void)
5246 {
5247 	if (zend_cpu_supports_avx2()) {
5248 		check_utf8_ptr = mb_fast_check_utf8_avx2;
5249 	} else {
5250 		check_utf8_ptr = mb_fast_check_utf8_default;
5251 	}
5252 }
5253 # endif
5254 
5255 #else
5256 
5257 /* No AVX2 support */
5258 #define mb_fast_check_utf8 mb_fast_check_utf8_default
5259 
5260 #endif
5261 
5262 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
5263 
5264 /* GCC prior to version 8 does not define all intrinsics. See GH-11514.
5265  * Use a workaround from https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values */
5266 #if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 8
5267 # define _mm256_set_m128i(v0, v1)  _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
5268 #endif
5269 
5270 /* Take (256-bit) `hi` and `lo` as a 512-bit value, shift down by some
5271  * number of bytes, then take the low 256 bits
5272  * This is used to take some number of trailing bytes from the previous 32-byte
5273  * block followed by some number of leading bytes from the current 32-byte block
5274  *
5275  * _mm256_alignr_epi8 (VPALIGNR) is used to shift out bytes from a 256-bit
5276  * YMM register while shifting in bytes from another YMM register... but
5277  * it works separately on respective 128-bit halves of the YMM registers,
5278  * which is not what we want.
5279  * To make it work as desired, we first do _mm256_permute2x128_si256
5280  * (VPERM2I128) to combine the low 128 bits from the previous block and
5281  * the high 128 bits of the current block in one YMM register.
5282  * Then VPALIGNR will do what is needed. */
5283 #define _mm256_shift_epi8(hi, lo, shift) _mm256_alignr_epi8(lo, _mm256_permute2x128_si256(hi, lo, 33), 16 - shift)
5284 
/* AVX2-based UTF-8 validation function; validates text in 32-byte chunks
 *
 * Returns true if the bytes of `str` form valid UTF-8, false otherwise.
 *
 * Some parts of this function are the same as the non-AVX2 implementation
 * (`mb_fast_check_utf8_default`); code comments are not repeated, so consult
 * that function for information on uncommented sections. */
#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
ZEND_API bool mb_fast_check_utf8_avx2(zend_string *str)
#else
static bool mb_fast_check_utf8_avx2(zend_string *str)
#endif
{
	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
	/* `e` is the end of the last full 32-byte block; the `+ 1` counts one byte
	 * past the string data (the terminating null) as part of the input */
	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m256i) - 1));

	/* The algorithm used here for UTF-8 validation is partially adapted from the
	 * paper "Validating UTF-8 In Less Than One Instruction Per Byte", by John Keiser
	 * and Daniel Lemire.
	 * Ref: https://arxiv.org/pdf/2010.03090.pdf
	 *
	 * Most types of invalid UTF-8 text can be detected by examining pairs of
	 * successive bytes. Specifically:
	 *
	 * • Overlong 2-byte code units start with 0xC0 or 0xC1.
	 *   No valid UTF-8 string ever uses these byte values.
	 * • Overlong 3-byte code units start with 0xE0, followed by a byte < 0xA0.
	 * • Overlong 4-byte code units start with 0xF0, followed by a byte < 0x90.
	 * • 5-byte or 6-byte code units, which should never be used, start with
	 *   0xF8-FE.
	 * • A codepoint value higher than U+10FFFF, which is the highest value for
	 *   any Unicode codepoint, would either start with 0xF4, followed by a
	 *   byte >= 0x90, or else would start with 0xF5-F7, followed by any value.
	 * • A codepoint value from U+D800-DFFF, which are reserved and should never
	 *   be used, would start with 0xED, followed by a byte >= 0xA0.
	 * • The byte value 0xFF is also illegal and is never used in valid UTF-8.
	 *
	 * To detect all these problems, for each pair of successive bytes, we do
	 * table lookups using the high nibble of the first byte, the low nibble of
	 * the first byte, and the high nibble of the second byte. Each table lookup
	 * retrieves a bitmask, in which each 1 bit indicates a possible invalid
	 * combination; AND those three bitmasks together, and any 1 bit in the result
	 * will indicate an actual invalid byte combination was found.
	 */

#define BAD_BYTE 0x1
#define OVERLONG_2BYTE 0x2
#define _1BYTE (BAD_BYTE | OVERLONG_2BYTE)
#define OVERLONG_3BYTE 0x4
#define SURROGATE 0x8
#define OVERLONG_4BYTE 0x10
#define INVALID_CP 0x20

	/* Each of these are 16-entry tables, repeated twice; this is required by the
	 * VPSHUFB instruction which we use to perform 32 table lookups in parallel
	 * The first entry is for 0xF, the second is for 0xE, and so on down to 0x0
	 *
	 * So, for example, notice that the 4th entry in the 1st table is OVERLONG_2BYTE;
	 * that means that high nibble 0xC is consistent with the byte pair being part of
	 * an overlong 2-byte code unit */
	const __m256i bad_hi_nibble2 = _mm256_set_epi8(
		BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
		0, 0, 0, 0,
		0, 0, 0, 0,
		0, 0, 0, 0,
		BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
		0, 0, 0, 0,
		0, 0, 0, 0,
		0, 0, 0, 0);
	const __m256i bad_lo_nibble2 = _mm256_set_epi8(
		BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
		BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
		BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
		0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
		BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
		BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
		0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
	const __m256i bad_hi_nibble = _mm256_set_epi8(
		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
		_1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
		_1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);

	/* -64 is 0xC0 as a signed byte; a signed "greater than" against it is true
	 * exactly for bytes 0x80-0xBF, i.e. UTF-8 continuation bytes */
	const __m256i find_continuation = _mm256_set1_epi8(-64);
	const __m256i _b = _mm256_set1_epi8(0xB);
	const __m256i _d = _mm256_set1_epi8(0xD);
	const __m256i _f = _mm256_set1_epi8(0xF);

	/* Nibbles of the previously processed block; all-zero before the first block */
	__m256i last_hi_nibbles = _mm256_setzero_si256(), last_lo_nibbles = _mm256_setzero_si256();
	__m256i operand;

	while (p < e) {
		operand = _mm256_loadu_si256((__m256i*)p);

		/* Also entered via `goto` from the tail switch below, with `operand`
		 * holding the zero-padded trailing bytes */
check_operand:
		if (!_mm256_movemask_epi8(operand)) {
			/* Entire 32-byte block is ASCII characters; the only thing we need to validate is that
			 * the previous block didn't end with an incomplete multi-byte character
			 * (This will also confirm that the previous block didn't end with a bad byte like 0xFF) */
			/* Thresholds apply to the last three bytes of the previous block: its final
			 * byte must not have a high nibble > 0xB, the one before must not be > 0xD,
			 * and the one before that must not be > 0xE. The 127 entries can never be
			 * exceeded by a nibble value, so all other positions are ignored. */
			__m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
			__m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
			if (_mm256_movemask_epi8(bad)) {
				return false;
			}

			/* Consume as many full blocks of single-byte characters as we can */
			while (true) {
				p += sizeof(__m256i);
				if (p >= e) {
					goto finish_up_remaining_bytes;
				}
				operand = _mm256_loadu_si256((__m256i*)p);
				if (_mm256_movemask_epi8(operand)) {
					break;
				}
			}
		}

		__m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(operand, 4), _f);
		__m256i lo_nibbles = _mm256_and_si256(operand, _f);

		/* Nibbles of the byte immediately preceding each byte of this block */
		__m256i lo_nibbles2 = _mm256_shift_epi8(last_lo_nibbles, lo_nibbles, 1);
		__m256i hi_nibbles2 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 1);

		/* Do parallel table lookups in all 3 tables */
		__m256i bad = _mm256_cmpgt_epi8(
			_mm256_and_si256(
				_mm256_and_si256(
					_mm256_shuffle_epi8(bad_lo_nibble2, lo_nibbles2),
					_mm256_shuffle_epi8(bad_hi_nibble2, hi_nibbles2)),
				_mm256_shuffle_epi8(bad_hi_nibble, hi_nibbles)),
			_mm256_setzero_si256());

		/* `cont_mask` marks positions where a continuation byte is expected: the byte
		 * 1 back has high nibble > 0xB, or the byte 2 back has high nibble > 0xD, or
		 * the byte 3 back has high nibble 0xF */
		__m256i cont_mask = _mm256_cmpgt_epi8(hi_nibbles2, _b);
		__m256i hi_nibbles3 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 2);
		cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpgt_epi8(hi_nibbles3, _d));
		__m256i hi_nibbles4 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 3);
		cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpeq_epi8(hi_nibbles4, _f));

		/* Any position where "is a continuation byte" and "continuation byte expected"
		 * disagree is an error */
		__m256i continuation = _mm256_cmpgt_epi8(find_continuation, operand);
		bad = _mm256_or_si256(bad, _mm256_xor_si256(continuation, cont_mask));

		if (_mm256_movemask_epi8(bad)) {
			return false;
		}

		last_hi_nibbles = hi_nibbles;
		last_lo_nibbles = lo_nibbles;
		p += sizeof(__m256i);
	}

finish_up_remaining_bytes:
	/* `p == e` means the trailing partial block has not been looked at yet; after
	 * it has been processed via `check_operand`, `p` has moved past `e` */
	if (p == e) {
		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m256i) - 1); /* Not including terminating null */

		/* Build a zero-padded 32-byte operand from the trailing bytes. Several cases
		 * load a 16-byte window that begins before `p` (or overlaps bytes already
		 * loaded) and shift the unwanted bytes out, so that no load extends more than
		 * one byte past the string data.
		 * NOTE(review): the `p - N` loads assume that many readable bytes precede the
		 * string data (presumably the zend_string header) — confirm. */
		switch (remaining_bytes) {
		case 0: ;
			/* No actual data bytes are remaining */
			__m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
			__m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
			return _mm256_movemask_epi8(bad) == 0;
		case 1:
		case 2:
			operand = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, *((int16_t*)p));
			goto check_operand;
		case 3:
		case 4:
			operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, *((int32_t*)p));
			goto check_operand;
		case 5:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10));
			goto check_operand;
		case 6:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9));
			goto check_operand;
		case 7:
		case 8:
#ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
			operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
#else
			operand = _mm256_set_epi64x(0, 0, 0, *((int64_t*)p));
#endif
			goto check_operand;
		case 9:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6));
			goto check_operand;
		case 10:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5));
			goto check_operand;
		case 11:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4));
			goto check_operand;
		case 12:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3));
			goto check_operand;
		case 13:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2));
			goto check_operand;
		case 14:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1));
			goto check_operand;
		case 15:
		case 16:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 17:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 2)), 14), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 18:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 3)), 13), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 19:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 4)), 12), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 20:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 5)), 11), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 21:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 6)), 10), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 22:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 7)), 9), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 23:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 8)), 8), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 24:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 9)), 7), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 25:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 10)), 6), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 26:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 11)), 5), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 27:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 12)), 4), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 28:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 13)), 3), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 29:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 14)), 2), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 30:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 15)), 1), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 31:
			/* With 31 data bytes left over, length + 1 is a multiple of 32, so `e` already
			 * covered those bytes plus the byte after them in a full block above */
			return true;
		}

		ZEND_UNREACHABLE();
	}

	return true;
}
5552 
5553 #endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
5554 
/* Check whether `str` is valid in `encoding`.
 *
 * UTF-8 gets a fast path: a string already flagged IS_STR_VALID_UTF8 is accepted
 * immediately; otherwise it is validated with mb_fast_check_utf8(), and on
 * success the flag is set on non-interned strings. Every other encoding is
 * delegated to php_mb_check_encoding(). */
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
{
	if (encoding != &mbfl_encoding_utf8) {
		return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
	}

	if (ZSTR_IS_VALID_UTF8(str)) {
		return true;
	}

	if (!mb_fast_check_utf8(str)) {
		return false;
	}

	/* Remember the positive result; interned strings are left untouched */
	if (!ZSTR_IS_INTERNED(str)) {
		GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
	}
	return true;
}
5570 
/* Recursively check that every string key and every value in `vars` is valid in
 * `encoding`.
 *
 * String values and string keys are validated with mb_check_str_encoding();
 * nested arrays are checked recursively; int, float, null and bool values are
 * accepted as-is; any other value type makes the whole array invalid.
 * If `vars` is already being visited (circular reference), a warning is emitted
 * and false is returned.
 *
 * Scanning stops at the first invalid key or value. */
static bool php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	zend_long idx;
	zend_string *key;
	zval *entry;
	bool valid = true;

	(void)(idx); /* Suppress spurious compiler warning that `idx` is not used */

	if (GC_IS_RECURSIVE(vars)) {
		/* Message text is kept verbatim, since it is user-visible output */
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return false;
	}
	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
		ZVAL_DEREF(entry);
		if (key && !mb_check_str_encoding(key, encoding)) {
			valid = false;
			break;
		}
		switch (Z_TYPE_P(entry)) {
			case IS_STRING:
				valid = mb_check_str_encoding(Z_STR_P(entry), encoding);
				break;
			case IS_ARRAY:
				valid = php_mb_check_encoding_recursive(Z_ARRVAL_P(entry), encoding);
				break;
			case IS_LONG:
			case IS_DOUBLE:
			case IS_NULL:
			case IS_TRUE:
			case IS_FALSE:
				break;
			default:
				/* Other types are error. */
				valid = false;
				break;
		}
		/* A `break` inside the switch only leaves the switch, so the loop itself
		 * has to be left explicitly once the result is known */
		if (!valid) {
			break;
		}
	} ZEND_HASH_FOREACH_END();
	GC_TRY_UNPROTECT_RECURSION(vars);
	return valid;
}
5621 
5622 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)5623 PHP_FUNCTION(mb_check_encoding)
5624 {
5625 	zend_string *input_str = NULL, *enc = NULL;
5626 	HashTable *input_ht = NULL;
5627 	const mbfl_encoding *encoding;
5628 
5629 	ZEND_PARSE_PARAMETERS_START(0, 2)
5630 		Z_PARAM_OPTIONAL
5631 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
5632 		Z_PARAM_STR_OR_NULL(enc)
5633 	ZEND_PARSE_PARAMETERS_END();
5634 
5635 	encoding = php_mb_get_encoding(enc, 2);
5636 	if (!encoding) {
5637 		RETURN_THROWS();
5638 	}
5639 
5640 	if (input_ht) {
5641 		RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
5642 	} else if (input_str) {
5643 		RETURN_BOOL(mb_check_str_encoding(input_str, encoding));
5644 	} else {
5645 		php_error_docref(NULL, E_DEPRECATED,
5646 			"Calling mb_check_encoding() without argument is deprecated");
5647 
5648 		/* FIXME: Actually check all inputs, except $_FILES file content. */
5649 		RETURN_BOOL(MBSTRG(illegalchars) == 0);
5650 	}
5651 }
5652 /* }}} */
5653 
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)5654 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
5655 	const uint32_t enc_name_arg_num)
5656 {
5657 	const mbfl_encoding *enc;
5658 	enum mbfl_no_encoding no_enc;
5659 
5660 	ZEND_ASSERT(str_len > 0);
5661 
5662 	enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
5663 	if (!enc) {
5664 		return -2;
5665 	}
5666 
5667 	no_enc = enc->no_encoding;
5668 	if (php_mb_is_unsupported_no_encoding(no_enc)) {
5669 		zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
5670 		return -2;
5671 	}
5672 
5673 	/* Some legacy text encodings have a minimum required wchar buffer size;
5674 	 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
5675 	uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
5676 	unsigned int state = 0;
5677 	size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
5678 	ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
5679 
5680 	if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
5681 		return -1;
5682 	}
5683 	return wchar_buf[0];
5684 }
5685 
5686 /* {{{ */
PHP_FUNCTION(mb_ord)5687 PHP_FUNCTION(mb_ord)
5688 {
5689 	char *str;
5690 	size_t str_len;
5691 	zend_string *enc = NULL;
5692 	zend_long cp;
5693 
5694 	ZEND_PARSE_PARAMETERS_START(1, 2)
5695 		Z_PARAM_STRING(str, str_len)
5696 		Z_PARAM_OPTIONAL
5697 		Z_PARAM_STR_OR_NULL(enc)
5698 	ZEND_PARSE_PARAMETERS_END();
5699 
5700 	if (str_len == 0) {
5701 		zend_argument_must_not_be_empty_error(1);
5702 		RETURN_THROWS();
5703 	}
5704 
5705 	cp = php_mb_ord(str, str_len, enc, 2);
5706 
5707 	if (0 > cp) {
5708 		if (cp == -2) {
5709 			RETURN_THROWS();
5710 		}
5711 		RETURN_FALSE;
5712 	}
5713 
5714 	RETURN_LONG(cp);
5715 }
5716 /* }}} */
5717 
/* Encode codepoint `cp` as a string in the encoding named by `enc_name`.
 *
 * Returns NULL if the encoding cannot be resolved, is not supported by mb_chr()
 * (a ValueError is thrown here in that case), if `cp` is outside 0..0x10FFFF,
 * if `cp` is in 0xD800..0xDFFF and the target is UTF-8, or if converting to the
 * target encoding reported illegal characters.
 * UTF-8 output is built by hand; all other encodings go through a UCS-4BE
 * to target conversion. */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *target = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (target == NULL) {
		return NULL;
	}

	const enum mbfl_no_encoding target_id = target->no_encoding;
	if (php_mb_is_unsupported_no_encoding(target_id)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", target->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	if (php_mb_is_no_encoding_utf8(target_id)) {
		if (cp >= 0xd800 && cp <= 0xdfff) {
			return NULL;
		}
		if (cp < 0x80) {
			return ZSTR_CHAR(cp);
		}

		const size_t nbytes = (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4;
		zend_string *encoded = zend_string_alloc(nbytes, 0);
		char *out = ZSTR_VAL(encoded);

		/* Lead byte plus any middle continuation bytes */
		switch (nbytes) {
		case 2:
			out[0] = 0xc0 | (cp >> 6);
			break;
		case 3:
			out[0] = 0xe0 | (cp >> 12);
			out[1] = 0x80 | ((cp >> 6) & 0x3f);
			break;
		default:
			out[0] = 0xf0 | (cp >> 18);
			out[1] = 0x80 | ((cp >> 12) & 0x3f);
			out[2] = 0x80 | ((cp >> 6) & 0x3f);
			break;
		}
		/* The final continuation byte and the terminator are the same for every length */
		out[nbytes - 1] = 0x80 | (cp & 0x3f);
		out[nbytes] = 0;

		return encoded;
	}

	/* Big-endian UCS-4 representation of the codepoint */
	char ucs4be[4];
	for (int i = 0; i < 4; i++) {
		ucs4be[i] = (cp >> (8 * (3 - i))) & 0xff;
	}

	/* Temporarily zero the illegal-character counter so that a failure of this
	 * particular conversion can be detected, then put the old value back */
	long saved_illegalchars = MBSTRG(illegalchars);
	MBSTRG(illegalchars) = 0;
	zend_string *converted = php_mb_convert_encoding_ex(ucs4be, 4, target, &mbfl_encoding_ucs4be);
	const bool conversion_failed = MBSTRG(illegalchars) != 0;
	MBSTRG(illegalchars) = saved_illegalchars;

	if (conversion_failed) {
		zend_string_release(converted);
		return NULL;
	}
	return converted;
}
5787 
5788 /* {{{ */
PHP_FUNCTION(mb_chr)5789 PHP_FUNCTION(mb_chr)
5790 {
5791 	zend_long cp;
5792 	zend_string *enc = NULL;
5793 
5794 	ZEND_PARSE_PARAMETERS_START(1, 2)
5795 		Z_PARAM_LONG(cp)
5796 		Z_PARAM_OPTIONAL
5797 		Z_PARAM_STR_OR_NULL(enc)
5798 	ZEND_PARSE_PARAMETERS_END();
5799 
5800 	zend_string* ret = php_mb_chr(cp, enc, 2);
5801 	if (ret == NULL) {
5802 		RETURN_FALSE;
5803 	}
5804 
5805 	RETURN_STR(ret);
5806 }
5807 /* }}} */
5808 
PHP_FUNCTION(mb_str_pad)5809 PHP_FUNCTION(mb_str_pad)
5810 {
5811 	zend_string *input, *encoding_str = NULL, *pad = ZSTR_CHAR(' ');
5812 	zend_long pad_to_length;
5813 	zend_long pad_type_val = PHP_STR_PAD_RIGHT;
5814 
5815 	ZEND_PARSE_PARAMETERS_START(2, 5)
5816 		Z_PARAM_STR(input)
5817 		Z_PARAM_LONG(pad_to_length)
5818 		Z_PARAM_OPTIONAL
5819 		Z_PARAM_STR(pad)
5820 		Z_PARAM_LONG(pad_type_val)
5821 		Z_PARAM_STR_OR_NULL(encoding_str)
5822 	ZEND_PARSE_PARAMETERS_END();
5823 
5824 	const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
5825 	if (!encoding) {
5826 		RETURN_THROWS();
5827 	}
5828 
5829 	size_t input_length = mb_get_strlen(input, encoding);
5830 
5831 	/* If resulting string turns out to be shorter than input string,
5832 	   we simply copy the input and return. */
5833 	if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
5834 		RETURN_STR_COPY(input);
5835 	}
5836 
5837 	if (ZSTR_LEN(pad) == 0) {
5838 		zend_argument_must_not_be_empty_error(3);
5839 		RETURN_THROWS();
5840 	}
5841 
5842 	if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
5843 		zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
5844 		RETURN_THROWS();
5845 	}
5846 
5847 	size_t pad_length = mb_get_strlen(pad, encoding);
5848 
5849 	size_t num_mb_pad_chars = pad_to_length - input_length;
5850 
5851 	/* We need to figure out the left/right padding lengths. */
5852 	size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
5853 	switch (pad_type_val) {
5854 		case PHP_STR_PAD_RIGHT:
5855 			right_pad = num_mb_pad_chars;
5856 			break;
5857 
5858 		case PHP_STR_PAD_LEFT:
5859 			left_pad = num_mb_pad_chars;
5860 			break;
5861 
5862 		case PHP_STR_PAD_BOTH:
5863 			left_pad = num_mb_pad_chars / 2;
5864 			right_pad = num_mb_pad_chars - left_pad;
5865 			break;
5866 	}
5867 
5868 	/* How many full block copies need to happen, and how many characters are then left over? */
5869 	size_t full_left_pad_copies = left_pad / pad_length;
5870 	size_t full_right_pad_copies = right_pad / pad_length;
5871 	size_t remaining_left_pad_chars = left_pad % pad_length;
5872 	size_t remaining_right_pad_chars = right_pad % pad_length;
5873 
5874 	if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
5875 		goto overflow_no_release;
5876 	}
5877 
5878 	/* Compute the number of bytes required for the padding */
5879 	size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
5880 	size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);
5881 
5882 	/* No special fast-path handling necessary for zero-length pads because these functions will not
5883 	 * allocate memory in case a zero-length pad is required. */
5884 	zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
5885 	zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);
5886 
5887 	if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
5888 		|| full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
5889 		goto overflow;
5890 	}
5891 
5892 	size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
5893 	size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);
5894 
5895 	if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
5896 		|| ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
5897 		goto overflow;
5898 	}
5899 
5900 	zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
5901 	char *buffer = ZSTR_VAL(result);
5902 
5903 	/* First we pad the left. */
5904 	for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5905 		memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5906 	}
5907 	memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
5908 	buffer += ZSTR_LEN(remaining_left_pad_str);
5909 
5910 	/* Then we copy the input string. */
5911 	memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
5912 	buffer += ZSTR_LEN(input);
5913 
5914 	/* Finally, we pad on the right. */
5915 	for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5916 		memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5917 	}
5918 	memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));
5919 
5920 	ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';
5921 
5922 	zend_string_release_ex(remaining_left_pad_str, false);
5923 	zend_string_release_ex(remaining_right_pad_str, false);
5924 
5925 	RETURN_NEW_STR(result);
5926 
5927 overflow:
5928 	zend_string_release_ex(remaining_left_pad_str, false);
5929 	zend_string_release_ex(remaining_right_pad_str, false);
5930 overflow_no_release:
5931 	zend_throw_error(NULL, "String size overflow");
5932 	RETURN_THROWS();
5933 }
5934 
5935 /* {{{ */
PHP_FUNCTION(mb_scrub)5936 PHP_FUNCTION(mb_scrub)
5937 {
5938 	zend_string *str, *enc_name = NULL;
5939 
5940 	ZEND_PARSE_PARAMETERS_START(1, 2)
5941 		Z_PARAM_STR(str)
5942 		Z_PARAM_OPTIONAL
5943 		Z_PARAM_STR_OR_NULL(enc_name)
5944 	ZEND_PARSE_PARAMETERS_END();
5945 
5946 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
5947 	if (!enc) {
5948 		RETURN_THROWS();
5949 	}
5950 
5951 	if (enc == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
5952 		/* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
5953 		RETURN_STR_COPY(str);
5954 	}
5955 
5956 	RETURN_STR(php_mb_convert_encoding_ex(ZSTR_VAL(str), ZSTR_LEN(str), enc, enc));
5957 }
5958 /* }}} */
5959 
5960 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)5961 static void php_mb_populate_current_detect_order_list(void)
5962 {
5963 	const mbfl_encoding **entry = 0;
5964 	size_t nentries;
5965 
5966 	if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
5967 		nentries = MBSTRG(detect_order_list_size);
5968 		entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5969 		memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
5970 	} else {
5971 		const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
5972 		size_t i;
5973 		nentries = MBSTRG(default_detect_order_list_size);
5974 		entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5975 		for (i = 0; i < nentries; i++) {
5976 			entry[i] = mbfl_no2encoding(src[i]);
5977 		}
5978 	}
5979 	MBSTRG(current_detect_order_list) = entry;
5980 	MBSTRG(current_detect_order_list_size) = nentries;
5981 }
5982 /* }}} */
5983 
5984 /* {{{ static int php_mb_encoding_translation() */
php_mb_encoding_translation(void)5985 static int php_mb_encoding_translation(void)
5986 {
5987 	return MBSTRG(encoding_translation);
5988 }
5989 /* }}} */
5990 
php_mb_mbchar_bytes(const char * s,const mbfl_encoding * enc)5991 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
5992 {
5993 	if (enc) {
5994 		if (enc->mblen_table) {
5995 			if (s) {
5996 				return enc->mblen_table[*(unsigned char *)s];
5997 			}
5998 		} else if (enc->flag & MBFL_ENCTYPE_WCS2) {
5999 			return 2;
6000 		} else if (enc->flag & MBFL_ENCTYPE_WCS4) {
6001 			return 4;
6002 		}
6003 	}
6004 	return 1;
6005 }
6006 
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)6007 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
6008 {
6009 	const char *p = s;
6010 	char *last=NULL;
6011 
6012 	if (nbytes == (size_t)-1) {
6013 		size_t nb = 0;
6014 
6015 		while (*p != '\0') {
6016 			if (nb == 0) {
6017 				if ((unsigned char)*p == (unsigned char)c) {
6018 					last = (char *)p;
6019 				}
6020 				nb = php_mb_mbchar_bytes(p, enc);
6021 				if (nb == 0) {
6022 					return NULL; /* something is going wrong! */
6023 				}
6024 			}
6025 			--nb;
6026 			++p;
6027 		}
6028 	} else {
6029 		size_t bcnt = nbytes;
6030 		size_t nbytes_char;
6031 		while (bcnt > 0) {
6032 			if ((unsigned char)*p == (unsigned char)c) {
6033 				last = (char *)p;
6034 			}
6035 			nbytes_char = php_mb_mbchar_bytes(p, enc);
6036 			if (bcnt < nbytes_char) {
6037 				return NULL;
6038 			}
6039 			p += nbytes_char;
6040 			bcnt -= nbytes_char;
6041 		}
6042 	}
6043 	return last;
6044 }
6045 
php_mb_stripos(bool mode,zend_string * haystack,zend_string * needle,zend_long offset,const mbfl_encoding * enc)6046 MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc)
6047 {
6048 	/* We're using simple case-folding here, because we'd have to deal with remapping of
6049 	 * offsets otherwise. */
6050 	zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
6051 	zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
6052 
6053 	size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
6054 
6055 	zend_string_free(haystack_conv);
6056 	zend_string_free(needle_conv);
6057 
6058 	return n;
6059 }
6060 
php_mb_gpc_get_detect_order(const zend_encoding *** list,size_t * list_size)6061 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
6062 {
6063 	*list = (const zend_encoding **)MBSTRG(http_input_list);
6064 	*list_size = MBSTRG(http_input_list_size);
6065 }
6066 /* }}} */
6067 
php_mb_gpc_set_input_encoding(const zend_encoding * encoding)6068 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
6069 {
6070 	MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
6071 }
6072 /* }}} */
6073 
/* Base64 alphabet, indexed by 6-bit value (0..63). Written as a string
 * literal, so the array also carries the terminating NUL as a 65th element.
 * (Relies on an ASCII execution character set.) */
static const unsigned char base64_table[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+/";
6086 
/* Computes how many bytes the current contents of `tmpbuf` will occupy once
 * transfer-encoded.
 *
 * base64 == true:  4 output bytes for every (started) group of 3 input bytes
 * base64 == false: 3 output bytes for each byte which is > 0x7F, is '=', or is
 *                  flagged in mime_char_needs_qencode[]; 1 byte otherwise */
static size_t transfer_encoded_size(mb_convert_buf *tmpbuf, bool base64)
{
	if (base64) {
		return ((mb_convert_buf_len(tmpbuf) + 2) / 3) * 4;
	}

	size_t total = 0;
	for (const unsigned char *cur = (unsigned char*)ZSTR_VAL(tmpbuf->str); cur < tmpbuf->out; cur++) {
		unsigned char byte = *cur;
		/* The > 0x7F test comes first so the lookup table is only consulted
		 * for 7-bit values */
		bool needs_escape = byte > 0x7F || byte == '=' || mime_char_needs_qencode[byte];
		total += needs_escape ? 3 : 1;
	}
	return total;
}
6101 
/* Transfer-encodes every byte currently held in `tmpbuf` (as Base64 when
 * `base64` is true, otherwise as '='-escaped hex), appends the result to
 * `outbuf`, and then resets `tmpbuf` to empty. */
static void transfer_encode_mime_bytes(mb_convert_buf *tmpbuf, mb_convert_buf *outbuf, bool base64)
{
	static const char hex_digits[] = "0123456789ABCDEF";

	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(outbuf, out, limit);
	unsigned char *src = (unsigned char*)ZSTR_VAL(tmpbuf->str);
	unsigned char *src_end = tmpbuf->out;

	if (!base64) {
		/* Worst case: every input byte becomes a 3-byte "=XX" escape */
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, (src_end - src) * 3);
		for (; src < src_end; src++) {
			unsigned char byte = *src;
			if (byte > 0x7F || byte == '=' || mime_char_needs_qencode[byte]) {
				out = mb_convert_buf_add3(out, '=', hex_digits[(byte >> 4) & 0xF], hex_digits[byte & 0xF]);
			} else {
				out = mb_convert_buf_add(out, byte);
			}
		}
	} else {
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, ((src_end - src) + 2) / 3 * 4);

		/* Full 3-byte groups -> 4 Base64 characters each */
		for (; (src_end - src) >= 3; src += 3) {
			uint32_t bits = (src[0] << 16) | (src[1] << 8) | src[2];
			out = mb_convert_buf_add4(out,
				base64_table[(bits >> 18) & 0x3F],
				base64_table[(bits >> 12) & 0x3F],
				base64_table[(bits >> 6) & 0x3F],
				base64_table[bits & 0x3F]);
		}

		/* 1 or 2 leftover bytes -> a final group padded with '=' */
		switch (src_end - src) {
		case 1: {
			uint32_t bits = src[0];
			out = mb_convert_buf_add4(out, base64_table[(bits >> 2) & 0x3F], base64_table[(bits & 0x3) << 4], '=', '=');
			break;
		}
		case 2: {
			uint32_t bits = (src[0] << 8) | src[1];
			out = mb_convert_buf_add4(out, base64_table[(bits >> 10) & 0x3F], base64_table[(bits >> 4) & 0x3F], base64_table[(bits & 0xF) << 2], '=');
			break;
		}
		default:
			break;
		}
	}

	mb_convert_buf_reset(tmpbuf, 0);
	MB_CONVERT_BUF_STORE(outbuf, out, limit);
}
6147 
6148 #define MBSTRING_HEADER_ENC_WCHAR_BUFSIZE 90
6149 
mb_mime_header_encode(zend_string * input,const mbfl_encoding * incode,const mbfl_encoding * outcode,bool base64,char * linefeed,size_t linefeed_len,zend_long indent)6150 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent)
6151 {
6152 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
6153 	size_t in_len = ZSTR_LEN(input);
6154 
6155 	ZEND_ASSERT(outcode->mime_name != NULL);
6156 	ZEND_ASSERT(outcode->mime_name[0] != '\0');
6157 
6158 	if (!in_len) {
6159 		return zend_empty_string;
6160 	}
6161 
6162 	if (indent < 0 || indent >= 74) {
6163 		indent = 0;
6164 	}
6165 
6166 	if (linefeed_len > 8) {
6167 		linefeed_len = 8;
6168 	}
6169 	/* Maintain legacy behavior as regards embedded NUL (zero) bytes in linefeed string */
6170 	for (size_t i = 0; i < linefeed_len; i++) {
6171 		if (linefeed[i] == '\0') {
6172 			linefeed_len = i;
6173 			break;
6174 		}
6175 	}
6176 
6177 	unsigned int state = 0;
6178 	/* wchar_buf should be big enough that when it is full, we definitely have enough
6179 	 * wchars to fill an entire line of output */
6180 	uint32_t wchar_buf[MBSTRING_HEADER_ENC_WCHAR_BUFSIZE];
6181 	uint32_t *p, *e;
6182 	/* What part of wchar_buf is filled with still-unprocessed data which should not
6183 	 * be overwritten? */
6184 	unsigned int offset = 0;
6185 	size_t line_start = 0;
6186 
6187 	/* If the entire input string is ASCII with no spaces (except possibly leading
6188 	 * spaces), just pass it through unchanged */
6189 	bool checking_leading_spaces = true;
6190 	while (in_len) {
6191 		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE, &state);
6192 		p = wchar_buf;
6193 		e = wchar_buf + out_len;
6194 
6195 		while (p < e) {
6196 			uint32_t w = *p++;
6197 			if (checking_leading_spaces) {
6198 				if (w == ' ') {
6199 					continue;
6200 				} else {
6201 					checking_leading_spaces = false;
6202 				}
6203 			}
6204 			if (w < 0x21 || w > 0x7E || w == '=' || w == '?' || w == '_') {
6205 				/* We cannot simply pass input string through unchanged; start again */
6206 				in = (unsigned char*)ZSTR_VAL(input);
6207 				in_len = ZSTR_LEN(input);
6208 				goto no_passthrough;
6209 			}
6210 		}
6211 	}
6212 
6213 	return zend_string_copy(input); /* This just increments refcount */
6214 
6215 no_passthrough: ;
6216 
6217 	mb_convert_buf buf;
6218 	mb_convert_buf_init(&buf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6219 
6220 	/* Encode some prefix of the input string as plain ASCII if possible
6221 	 * If we find it necessary to switch to Base64/QPrint encoding, we will
6222 	 * do so all the way to the end of the string */
6223 	while (in_len) {
6224 		/* Decode part of the input string, refill wchar_buf */
6225 		ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
6226 		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
6227 		ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
6228 		p = wchar_buf;
6229 		e = wchar_buf + offset + out_len;
6230 		/* ASCII output is broken into space-delimited 'words'
6231 		 * If we find a non-ASCII character in the middle of a word, we will
6232 		 * transfer-encode the entire word */
6233 		uint32_t *word_start = p;
6234 
6235 		/* Don't consider adding line feed for spaces at the beginning of a word */
6236 		while (p < e && *p == ' ' && (p - word_start) <= 74) {
6237 			p++;
6238 		}
6239 
6240 		while (p < e) {
6241 			uint32_t w = *p++;
6242 
6243 			if (w < 0x20 || w > 0x7E || w == '?' || w == '=' || w == '_' || (w == ' ' && (p - word_start) > 74)) {
6244 				/* Non-ASCII character (or line too long); switch to Base64/QPrint encoding
6245 				 * If we are already too far along on a line to include Base64/QPrint encoded data
6246 				 * on the same line (without overrunning max line length), then add a line feed
6247 				 * right now */
6248 feed_and_mime_encode:
6249 				if (mb_convert_buf_len(&buf) - line_start + indent + strlen(outcode->mime_name) > 55) {
6250 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6251 					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6252 					buf.out = mb_convert_buf_add(buf.out, ' ');
6253 					indent = 0;
6254 					line_start = mb_convert_buf_len(&buf);
6255 				} else if (mb_convert_buf_len(&buf) > 0) {
6256 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 1);
6257 					buf.out = mb_convert_buf_add(buf.out, ' ');
6258 				}
6259 				p = word_start; /* Back up to where MIME encoding of input chars should start */
6260 				goto mime_encoding_needed;
6261 			} else if (w == ' ') {
6262 				/* When we see a space, check whether we should insert a line break */
6263 				if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 75) {
6264 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6265 					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6266 					buf.out = mb_convert_buf_add(buf.out, ' ');
6267 					indent = 0;
6268 					line_start = mb_convert_buf_len(&buf);
6269 				} else if (mb_convert_buf_len(&buf) > 0) {
6270 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6271 					buf.out = mb_convert_buf_add(buf.out, ' ');
6272 				}
6273 				/* Output one (space-delimited) word as plain ASCII */
6274 				while (word_start < p-1) {
6275 					buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6276 				}
6277 				word_start++;
6278 				while (p < e && *p == ' ') {
6279 					p++;
6280 				}
6281 			}
6282 		}
6283 
6284 		if (in_len) {
6285 			/* Copy chars which are part of an incomplete 'word' to the beginning
6286 			 * of wchar_buf and reprocess them on the next iteration.
6287 			 * But first make sure that the incomplete 'word' isn't so big that
6288 			 * there will be no space to add any more decoded wchars in the buffer
6289 			 * (which could lead to an infinite loop) */
6290 			if ((word_start - wchar_buf) < MBSTRING_MIN_WCHAR_BUFSIZE) {
6291 				goto feed_and_mime_encode;
6292 			}
6293 			offset = e - word_start;
6294 			if (offset) {
6295 				memmove(wchar_buf, word_start, offset * sizeof(uint32_t));
6296 			}
6297 		} else {
6298 			/* We have reached the end of the input string while still in 'ASCII mode';
6299 			 * process any trailing ASCII chars which were not followed by a space */
6300 			if (word_start < e && mb_convert_buf_len(&buf) > 0) {
6301 				/* The whole input string was not just one big ASCII 'word' with no spaces
6302 				 * consider adding a line feed if necessary to prevent output lines from
6303 				 * being too long */
6304 				if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 74) {
6305 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6306 					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6307 					buf.out = mb_convert_buf_add(buf.out, ' ');
6308 				} else {
6309 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6310 					buf.out = mb_convert_buf_add(buf.out, ' ');
6311 				}
6312 			}
6313 			while (word_start < e) {
6314 				buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6315 			}
6316 		}
6317 	}
6318 
6319 	/* Ensure output string is marked as valid UTF-8 (ASCII strings are always 'valid UTF-8') */
6320 	return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6321 
6322 mime_encoding_needed: ;
6323 
6324 	/* We will generate the output line by line, first converting wchars to bytes
6325 	 * in the requested output encoding, then transfer-encoding those bytes as
6326 	 * Base64 or QPrint
6327 	 * 'tmpbuf' will receive the bytes which need to be transfer-encoded before
6328 	 * sending them to 'buf' */
6329 	mb_convert_buf tmpbuf;
6330 	mb_convert_buf_init(&tmpbuf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6331 
6332 	/* Do we need to refill wchar_buf to make sure we don't run out of wchars
6333 	 * in the middle of a line? */
6334 	offset = e - p;
6335 	if (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset < MBSTRING_MIN_WCHAR_BUFSIZE) {
6336 		goto start_new_line;
6337 	}
6338 	memmove(wchar_buf, p, offset * sizeof(uint32_t));
6339 
6340 	while(true) {
6341 refill_wchar_buf: ;
6342 		ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
6343 		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
6344 		ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
6345 		p = wchar_buf;
6346 		e = wchar_buf + offset + out_len;
6347 
6348 start_new_line: ;
6349 		MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, strlen(outcode->mime_name) + 5);
6350 		buf.out = mb_convert_buf_add2(buf.out, '=', '?');
6351 		buf.out = mb_convert_buf_appends(buf.out, outcode->mime_name);
6352 		buf.out = mb_convert_buf_add3(buf.out, '?', base64 ? 'B' : 'Q', '?');
6353 
6354 		/* How many wchars should we try converting to Base64/QPrint-encoded bytes?
6355 		 * We do something like a 'binary search' to find the greatest number which
6356 		 * can be included on this line without exceeding max line length */
6357 		unsigned int n = 12;
6358 		size_t space_available = 73 - indent - (mb_convert_buf_len(&buf) - line_start);
6359 
6360 		while (true) {
6361 			ZEND_ASSERT(p < e);
6362 
6363 			/* Remember where we were in process of generating output, so we can back
6364 			 * up if necessary */
6365 			size_t tmppos = mb_convert_buf_len(&tmpbuf);
6366 			unsigned int tmpstate = tmpbuf.state;
6367 
6368 			/* Try encoding 'n' wchars in output text encoding and sending output
6369 			 * bytes to 'tmpbuf'. Hopefully this is not too many to fit on the
6370 			 * current line. */
6371 			n = MIN(n, e - p);
6372 			outcode->from_wchar(p, n, &tmpbuf, false);
6373 
6374 			/* For some output text encodings, there may be a few ending bytes
6375 			 * which need to be emitted to output before we break a line.
6376 			 * Again, remember where we were so we can back up */
6377 			size_t tmppos2 = mb_convert_buf_len(&tmpbuf);
6378 			unsigned int tmpstate2 = tmpbuf.state;
6379 			outcode->from_wchar(NULL, 0, &tmpbuf, true);
6380 
6381 			if (transfer_encoded_size(&tmpbuf, base64) <= space_available || (n == 1 && tmppos == 0)) {
6382 				/* If we convert 'n' more wchars on the current line, it will not
6383 				 * overflow the maximum line length */
6384 				p += n;
6385 
6386 				if (p == e) {
6387 					/* We are done; we shouldn't reach here if there is more remaining
6388 					 * of the input string which needs to be processed */
6389 					ZEND_ASSERT(!in_len);
6390 					transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6391 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 2);
6392 					buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6393 					mb_convert_buf_free(&tmpbuf);
6394 					return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6395 				} else {
6396 					/* It's possible that more chars might fit on the current line,
6397 					 * so back up to where we were before emitting any ending bytes */
6398 					mb_convert_buf_reset(&tmpbuf, tmppos2);
6399 					tmpbuf.state = tmpstate2;
6400 				}
6401 			} else {
6402 				/* Converting 'n' more wchars on this line would be too much.
6403 				 * Back up to where we were before we tried that. */
6404 				mb_convert_buf_reset(&tmpbuf, tmppos);
6405 				tmpbuf.state = tmpstate;
6406 
6407 				if (n == 1) {
6408 					/* We have found the exact number of chars which will fit on the
6409 					 * current line. Finish up and move to a new line. */
6410 					outcode->from_wchar(NULL, 0, &tmpbuf, true);
6411 					transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6412 					tmpbuf.state = 0;
6413 
6414 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 3 + linefeed_len);
6415 					buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6416 
6417 					indent = 0; /* Indent argument must only affect the first line */
6418 
6419 					if (in_len || p < e) {
6420 						/* We still have more input to process */
6421 						buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6422 						buf.out = mb_convert_buf_add(buf.out, ' ');
6423 						line_start = mb_convert_buf_len(&buf);
6424 						offset = e - p;
6425 						if (in_len && (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset >= MBSTRING_MIN_WCHAR_BUFSIZE)) {
6426 							/* Copy any remaining wchars to beginning of buffer and refill
6427 							 * the rest of the buffer */
6428 							memmove(wchar_buf, p, offset * sizeof(uint32_t));
6429 							goto refill_wchar_buf;
6430 						}
6431 						goto start_new_line;
6432 					} else {
6433 						/* We are done! */
6434 						mb_convert_buf_free(&tmpbuf);
6435 						return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6436 					}
6437 				} else {
6438 					/* Try a smaller number of wchars */
6439 					n = MAX(n >> 1, 1);
6440 				}
6441 			}
6442 		}
6443 	}
6444 }
6445 
PHP_FUNCTION(mb_encode_mimeheader)6446 PHP_FUNCTION(mb_encode_mimeheader)
6447 {
6448 	const mbfl_encoding *charset = &mbfl_encoding_pass;
6449 	zend_string *str, *charset_name = NULL, *transenc_name = NULL;
6450 	char *linefeed = "\r\n";
6451 	size_t linefeed_len = 2;
6452 	zend_long indent = 0;
6453 	bool base64 = true;
6454 
6455 	ZEND_PARSE_PARAMETERS_START(1, 5)
6456 		Z_PARAM_STR(str)
6457 		Z_PARAM_OPTIONAL
6458 		Z_PARAM_STR(charset_name)
6459 		Z_PARAM_STR(transenc_name)
6460 		Z_PARAM_STRING(linefeed, linefeed_len)
6461 		Z_PARAM_LONG(indent)
6462 	ZEND_PARSE_PARAMETERS_END();
6463 
6464 	if (charset_name != NULL) {
6465 		charset = php_mb_get_encoding(charset_name, 2);
6466 		if (!charset) {
6467 			RETURN_THROWS();
6468 		} else if (charset->mime_name == NULL || charset->mime_name[0] == '\0' || charset == &mbfl_encoding_qprint) {
6469 			zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
6470 			RETURN_THROWS();
6471 		}
6472 	} else {
6473 		const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
6474 		if (lang != NULL) {
6475 			charset = mbfl_no2encoding(lang->mail_charset);
6476 			const mbfl_encoding *transenc = mbfl_no2encoding(lang->mail_header_encoding);
6477 			char t = transenc->name[0];
6478 			if (t == 'Q' || t == 'q') {
6479 				base64 = false;
6480 			}
6481 		}
6482 	}
6483 
6484 	if (transenc_name != NULL && ZSTR_LEN(transenc_name) > 0) {
6485 		char t = ZSTR_VAL(transenc_name)[0];
6486 		if (t == 'Q' || t == 'q') {
6487 			base64 = false;
6488 		}
6489 	}
6490 
6491 	RETURN_STR(mb_mime_header_encode(str, MBSTRG(current_internal_encoding), charset, base64, linefeed, linefeed_len, indent));
6492 }
6493 
/* Maps one character of the Base64 alphabet to its 6-bit value:
 * 'A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62, '/' -> 63.
 * Returns -1 for any other byte. */
static int8_t decode_base64(unsigned char c)
{
	switch (c) {
	case '+':
		return 62;
	case '/':
		return 63;
	default:
		break;
	}

	if ('A' <= c && c <= 'Z') {
		return c - 'A';
	}
	if ('a' <= c && c <= 'z') {
		return c - 'a' + 26;
	}
	if ('0' <= c && c <= '9') {
		return c - '0' + 52;
	}
	return -1;
}
6509 
/* Value of each byte when read as a hexadecimal digit ('0'-'9', 'A'-'F',
 * 'a'-'f'), or -1 if the byte is not a hex digit. One entry per possible byte
 * value, so it can be indexed directly with an unsigned char.
 * Read-only lookup data, hence const. */
static const int8_t qprint_map[] = {
	/* 0x00 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x30 */  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
	/* 0x40 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x50 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x60 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x70 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x80 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x90 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xA0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xB0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xC0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xD0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xE0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xF0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
6528 
6529 /* Decode MIME encoded word as defined in RFC 2047 */
mime_header_decode_encoded_word(unsigned char * p,unsigned char * e,const mbfl_encoding * outcode,mb_convert_buf * outbuf,unsigned int * state)6530 static unsigned char* mime_header_decode_encoded_word(unsigned char *p, unsigned char *e, const mbfl_encoding *outcode, mb_convert_buf *outbuf, unsigned int *state)
6531 {
6532 	if ((e - p) < 6) {
6533 		return NULL;
6534 	}
6535 
6536 	ZEND_ASSERT(p[0] == '=');
6537 	ZEND_ASSERT(p[1] == '?');
6538 	p += 2;
6539 
6540 	unsigned char *charset = p;
6541 	unsigned char *charset_end = memchr(charset, '?', e - charset);
6542 	if (charset_end == NULL) {
6543 		return NULL;
6544 	}
6545 
6546 	unsigned char *encoding = charset_end + 1;
6547 	p = encoding + 1;
6548 	if (p >= e || *p++ != '?') {
6549 		return NULL;
6550 	}
6551 
6552 	char *charset_name = estrndup((const char*)charset, charset_end - charset);
6553 	const mbfl_encoding *incode = mbfl_name2encoding(charset_name);
6554 	efree(charset_name);
6555 	if (incode == NULL) {
6556 		return NULL;
6557 	}
6558 
6559 	unsigned char *end_marker = (unsigned char*)zend_memnstr((const char*)p, "?=", 2, (const char*)e);
6560 	if (end_marker) {
6561 		e = end_marker;
6562 	} else if (p < e && *(e-1) == '?') {
6563 		/* If encoded word is not properly terminated, but last byte is '?',
6564 		 * take that as a terminator (legacy behavior) */
6565 		e--;
6566 	}
6567 
6568 	unsigned char *buf = emalloc(e - p), *bufp = buf;
6569 	if (*encoding == 'Q' || *encoding == 'q') {
6570 		/* Fill `buf` with bytes from decoding QPrint */
6571 		while (p < e) {
6572 			unsigned char c = *p++;
6573 			if (c == '_') {
6574 				*bufp++ = ' ';
6575 				continue;
6576 			} else if (c == '=' && (e - p) >= 2) {
6577 				unsigned char c2 = *p++;
6578 				unsigned char c3 = *p++;
6579 				if (qprint_map[c2] >= 0 && qprint_map[c3] >= 0) {
6580 					*bufp++ = (qprint_map[c2] << 4) | (qprint_map[c3] & 0xF);
6581 					continue;
6582 				} else if (c2 == '\r') {
6583 					if (c3 != '\n') {
6584 						p--;
6585 					}
6586 					continue;
6587 				} else if (c2 == '\n') {
6588 					p--;
6589 					continue;
6590 				}
6591 			}
6592 			*bufp++ = c;
6593 		}
6594 	} else if (*encoding == 'B' || *encoding == 'b') {
6595 		/* Fill `buf` with bytes from decoding Base64 */
6596 		unsigned int bits = 0, cache = 0;
6597 		while (p < e) {
6598 			unsigned char c = *p++;
6599 			if (c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '=') {
6600 				continue;
6601 			}
6602 			int8_t decoded = decode_base64(c);
6603 			if (decoded == -1) {
6604 				*bufp++ = '?';
6605 				continue;
6606 			}
6607 			bits += 6;
6608 			cache = (cache << 6) | (decoded & 0x3F);
6609 			if (bits == 24) {
6610 				*bufp++ = (cache >> 16) & 0xFF;
6611 				*bufp++ = (cache >> 8) & 0xFF;
6612 				*bufp++ = cache & 0xFF;
6613 				bits = cache = 0;
6614 			}
6615 		}
6616 		if (bits == 18) {
6617 			*bufp++ = (cache >> 10) & 0xFF;
6618 			*bufp++ = (cache >> 2) & 0xFF;
6619 		} else if (bits == 12) {
6620 			*bufp++ = (cache >> 4) & 0xFF;
6621 		}
6622 	} else {
6623 		efree(buf);
6624 		return NULL;
6625 	}
6626 
6627 	size_t in_len = bufp - buf;
6628 	uint32_t wchar_buf[128];
6629 
6630 	bufp = buf;
6631 	while (in_len) {
6632 		size_t out_len = incode->to_wchar(&bufp, &in_len, wchar_buf, 128, state);
6633 		ZEND_ASSERT(out_len <= 128);
6634 		outcode->from_wchar(wchar_buf, out_len, outbuf, false);
6635 	}
6636 
6637 	efree(buf);
6638 	return e + 2;
6639 }
6640 
/* Decodes a MIME header value into encoding `outcode`.
 *
 * Encoded words ("=?charset?B?...?=" / "=?charset?Q?...?=") are decoded via
 * mime_header_decode_encoded_word(); everything else is treated as ASCII and
 * converted to `outcode`. A run of whitespace starting with CR or LF is
 * collapsed into a single space, except that:
 * - such a run at the very end of the input produces nothing, and
 * - such a run between two encoded words which both decode successfully is
 *   dropped
 *
 * Returns a new string in encoding `outcode`. */
static zend_string* mb_mime_header_decode(zend_string *input, const mbfl_encoding *outcode)
{
	unsigned char *p = (unsigned char*)ZSTR_VAL(input), *e = p + ZSTR_LEN(input);
	unsigned int state = 0;
	/* True while a collapsed whitespace run following an encoded word is waiting
	 * to find out what comes after it */
	bool space_pending = false;

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, ZSTR_LEN(input), '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	while (p < e) {
		unsigned char c = *p;

		/* Check the remaining length before looking at the byte after '=' */
		if (c == '=' && (e - p) >= 6 && *(p + 1) == '?') {
			/* Does this look like a MIME encoded word? If so, try to decode it as one */
			unsigned char *incode_end = memchr(p + 2, '?', e - p - 2);
			if (incode_end && (e - incode_end) >= 3) {
				unsigned char *temp = mime_header_decode_encoded_word(p, e, outcode, &buf, &state);
				if (temp) {
					p = temp;
					/* Whitespace which separated this encoded word from the previous
					 * one is dropped for good; without this, the pending space would
					 * be emitted later, in front of whatever follows this word */
					space_pending = false;
					/* Decoding of MIME encoded word was successful;
					 * Try to collapse a run of whitespace */
					if (p < e && (*p == '\n' || *p == '\r')) {
						do {
							p++;
						} while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
						/* We will only actually output a space if this is not immediately followed
						 * by another valid encoded word */
						space_pending = true;
					}
					continue;
				}
			}
		}

		/* What follows is not a (valid) encoded word, so whitespace seen after
		 * the preceding encoded word does become a space */
		if (space_pending) {
			uint32_t space = ' ';
			outcode->from_wchar(&space, 1, &buf, false);
			space_pending = false;
		}

		/* Consume a run of plain ASCII characters */
		if (c != '\n' && c != '\r') {
			/* The run always includes the current byte and stops before the next
			 * '=', CR or LF */
			unsigned char *end = p + 1;
			while (end < e && (*end != '=' && *end != '\n' && *end != '\r')) {
				end++;
			}
			uint32_t wchar_buf[128];
			size_t in_len = end - p;
			while (in_len) {
				size_t out_len = mbfl_encoding_ascii.to_wchar(&p, &in_len, wchar_buf, 128, &state);
				ZEND_ASSERT(out_len <= 128);
				outcode->from_wchar(wchar_buf, out_len, &buf, false);
			}
		}
		/* Collapse a run of whitespace into a single space */
		if (p < e && (*p == '\n' || *p == '\r')) {
			do {
				p++;
			} while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
			if (p < e) {
				/* Emulating legacy behavior of mb_decode_mimeheader here;
				 * a run of whitespace is not converted to a space at the very
				 * end of the input string */
				uint32_t space = ' ';
				outcode->from_wchar(&space, 1, &buf, false);
			}
		}
	}

	/* Let the output encoding emit any closing bytes it needs */
	outcode->from_wchar(NULL, 0, &buf, true);

	return mb_convert_buf_result(&buf, outcode);
}
6714 
PHP_FUNCTION(mb_decode_mimeheader)6715 PHP_FUNCTION(mb_decode_mimeheader)
6716 {
6717 	zend_string *str;
6718 
6719 	ZEND_PARSE_PARAMETERS_START(1, 1)
6720 		Z_PARAM_STR(str)
6721 	ZEND_PARSE_PARAMETERS_END();
6722 
6723 	RETURN_STR(mb_mime_header_decode(str, MBSTRG(current_internal_encoding)));
6724 }
6725