xref: /PHP-8.3/ext/mbstring/mbstring.c (revision 2cfd9df1)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp>              |
14    |         Rui Hirokawa <hirokawa@php.net>                              |
15    |         Hironori Sato <satoh@jpnnet.com>                             |
16    |         Shigeru Kanemoto <sgk@happysize.co.jp>                       |
17    +----------------------------------------------------------------------+
18 */
19 
20 /* {{{ includes */
21 #include <limits.h>
22 
23 #include "libmbfl/config.h"
24 #include "php.h"
25 #include "php_ini.h"
26 #include "php_variables.h"
27 #include "mbstring.h"
28 #include "ext/standard/php_string.h"
29 #include "ext/standard/php_mail.h"
30 #include "ext/standard/exec.h"
31 #include "ext/standard/url.h"
32 #include "main/php_output.h"
33 #include "ext/standard/info.h"
34 #include "ext/pcre/php_pcre.h"
35 
36 #include "libmbfl/mbfl/mbfilter_8bit.h"
37 #include "libmbfl/mbfl/mbfilter_pass.h"
38 #include "libmbfl/mbfl/mbfilter_wchar.h"
39 #include "libmbfl/mbfl/eaw_table.h"
40 #include "libmbfl/filters/mbfilter_base64.h"
41 #include "libmbfl/filters/mbfilter_cjk.h"
42 #include "libmbfl/filters/mbfilter_qprint.h"
43 #include "libmbfl/filters/mbfilter_htmlent.h"
44 #include "libmbfl/filters/mbfilter_uuencode.h"
45 #include "libmbfl/filters/mbfilter_ucs4.h"
46 #include "libmbfl/filters/mbfilter_utf8.h"
47 #include "libmbfl/filters/mbfilter_utf16.h"
48 #include "libmbfl/filters/mbfilter_singlebyte.h"
49 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
50 #include "libmbfl/filters/unicode_prop.h"
51 
52 #include "php_variables.h"
53 #include "php_globals.h"
54 #include "rfc1867.h"
55 #include "php_content_types.h"
56 #include "SAPI.h"
57 #include "php_unicode.h"
58 #include "TSRM.h"
59 
60 #include "mb_gpc.h"
61 
62 #ifdef HAVE_MBREGEX
63 # include "php_mbregex.h"
64 #endif
65 
66 #include "zend_smart_str.h"
67 #include "zend_multibyte.h"
68 #include "mbstring_arginfo.h"
69 
70 #include "rare_cp_bitvec.h"
71 
72 /* }}} */
73 
74 /* {{{ prototypes */
75 ZEND_DECLARE_MODULE_GLOBALS(mbstring)
76 
77 static PHP_GINIT_FUNCTION(mbstring);
78 static PHP_GSHUTDOWN_FUNCTION(mbstring);
79 
80 static void php_mb_populate_current_detect_order_list(void);
81 
82 static int php_mb_encoding_translation(void);
83 
84 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);
85 
86 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
87 
88 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
89 
90 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
91 
92 static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);
93 
94 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);
95 
96 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);
97 
98 /* See mbfilter_cp5022x.c */
99 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
100 /* }}} */
101 
102 /* {{{ php_mb_default_identify_list */
103 typedef struct _php_mb_nls_ident_list {
104 	enum mbfl_no_language lang;
105 	const enum mbfl_no_encoding *list;
106 	size_t list_size;
107 } php_mb_nls_ident_list;
108 
109 static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
110 	mbfl_no_encoding_ascii,
111 	mbfl_no_encoding_jis,
112 	mbfl_no_encoding_utf8,
113 	mbfl_no_encoding_euc_jp,
114 	mbfl_no_encoding_sjis
115 };
116 
117 static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
118 	mbfl_no_encoding_ascii,
119 	mbfl_no_encoding_utf8,
120 	mbfl_no_encoding_euc_cn,
121 	mbfl_no_encoding_cp936
122 };
123 
124 static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
125 	mbfl_no_encoding_ascii,
126 	mbfl_no_encoding_utf8,
127 	mbfl_no_encoding_euc_tw,
128 	mbfl_no_encoding_big5
129 };
130 
131 static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
132 	mbfl_no_encoding_ascii,
133 	mbfl_no_encoding_utf8,
134 	mbfl_no_encoding_euc_kr,
135 	mbfl_no_encoding_uhc
136 };
137 
138 static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
139 	mbfl_no_encoding_ascii,
140 	mbfl_no_encoding_utf8,
141 	mbfl_no_encoding_koi8r,
142 	mbfl_no_encoding_cp1251,
143 	mbfl_no_encoding_cp866
144 };
145 
146 static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
147 	mbfl_no_encoding_ascii,
148 	mbfl_no_encoding_utf8,
149 	mbfl_no_encoding_armscii8
150 };
151 
152 static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
153 	mbfl_no_encoding_ascii,
154 	mbfl_no_encoding_utf8,
155 	mbfl_no_encoding_cp1254,
156 	mbfl_no_encoding_8859_9
157 };
158 
159 static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
160 	mbfl_no_encoding_ascii,
161 	mbfl_no_encoding_utf8,
162 	mbfl_no_encoding_koi8u
163 };
164 
165 static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
166 	mbfl_no_encoding_ascii,
167 	mbfl_no_encoding_utf8
168 };
169 
170 
171 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
172 	{ mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
173 	{ mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
174 	{ mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
175 	{ mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
176 	{ mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
177 	{ mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
178 	{ mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
179 	{ mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
180 	{ mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
181 };
182 
183 /* }}} */
184 
185 /* {{{ mbstring_deps[] */
186 static const zend_module_dep mbstring_deps[] = {
187 	ZEND_MOD_REQUIRED("pcre")
188 	ZEND_MOD_END
189 };
190 /* }}} */
191 
192 /* {{{ zend_module_entry mbstring_module_entry */
193 zend_module_entry mbstring_module_entry = {
194 	STANDARD_MODULE_HEADER_EX,
195 	NULL,
196 	mbstring_deps,
197 	"mbstring",
198 	ext_functions,
199 	PHP_MINIT(mbstring),
200 	PHP_MSHUTDOWN(mbstring),
201 	PHP_RINIT(mbstring),
202 	PHP_RSHUTDOWN(mbstring),
203 	PHP_MINFO(mbstring),
204 	PHP_MBSTRING_VERSION,
205 	PHP_MODULE_GLOBALS(mbstring),
206 	PHP_GINIT(mbstring),
207 	PHP_GSHUTDOWN(mbstring),
208 	NULL,
209 	STANDARD_MODULE_PROPERTIES_EX
210 };
211 /* }}} */
212 
213 /* {{{ static sapi_post_entry php_post_entries[] */
214 static const sapi_post_entry php_post_entries[] = {
215 	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data,	php_std_post_handler },
216 	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
217 	{ NULL, 0, NULL, NULL }
218 };
219 /* }}} */
220 
/* Only emitted when COMPILE_DL_MBSTRING is defined; the TSRM cache
 * definition is additionally limited to ZTS builds. */
#ifdef COMPILE_DL_MBSTRING
# ifdef ZTS
ZEND_TSRMLS_CACHE_DEFINE()
# endif
ZEND_GET_MODULE(mbstring)
#endif
227 
228 /* {{{ static sapi_post_entry mbstr_post_entries[] */
229 static const sapi_post_entry mbstr_post_entries[] = {
230 	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
231 	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
232 	{ NULL, 0, NULL, NULL }
233 };
234 /* }}} */
235 
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)236 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
237 	if (encoding_name) {
238 		const mbfl_encoding *encoding;
239 		zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
240 		if (last_encoding_name && (last_encoding_name == encoding_name
241 				|| zend_string_equals_ci(encoding_name, last_encoding_name))) {
242 			return MBSTRG(last_used_encoding);
243 		}
244 
245 		encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
246 		if (!encoding) {
247 			zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
248 			return NULL;
249 		} else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
250 			if (encoding == &mbfl_encoding_base64) {
251 				php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
252 			} else if (encoding == &mbfl_encoding_qprint) {
253 				php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
254 			} else if (encoding == &mbfl_encoding_html_ent) {
255 				php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
256 			} else if (encoding == &mbfl_encoding_uuencode) {
257 				php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
258 			}
259 		}
260 
261 		if (last_encoding_name) {
262 			zend_string_release(last_encoding_name);
263 		}
264 		MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
265 		MBSTRG(last_used_encoding) = encoding;
266 		return encoding;
267 	} else {
268 		return MBSTRG(current_internal_encoding);
269 	}
270 }
271 
php_mb_get_encoding_or_pass(const char * encoding_name)272 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name) {
273 	if (strcmp(encoding_name, "pass") == 0) {
274 		return &mbfl_encoding_pass;
275 	}
276 
277 	return mbfl_name2encoding(encoding_name);
278 }
279 
/* Count the ',' bytes in the half-open byte range [p, end). */
static size_t count_commas(const char *p, const char *end) {
	size_t total = 0;

	for (const char *hit = memchr(p, ',', end - p);
			hit != NULL;
			hit = memchr(hit + 1, ',', end - (hit + 1))) {
		total++;
	}
	return total;
}
288 
289 /* {{{ static zend_result php_mb_parse_encoding_list()
290  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
291  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
292  */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num)293 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
294 	const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num)
295 {
296 	if (value == NULL || value_length == 0) {
297 		*return_list = NULL;
298 		*return_size = 0;
299 		return SUCCESS;
300 	} else {
301 		bool included_auto;
302 		size_t n, size;
303 		char *p1, *endp, *tmpstr;
304 		const mbfl_encoding **entry, **list;
305 
306 		/* copy the value string for work */
307 		if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
308 			tmpstr = (char *)estrndup(value+1, value_length-2);
309 			value_length -= 2;
310 		} else {
311 			tmpstr = (char *)estrndup(value, value_length);
312 		}
313 
314 		endp = tmpstr + value_length;
315 		size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
316 		list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
317 		entry = list;
318 		n = 0;
319 		included_auto = 0;
320 		p1 = tmpstr;
321 		while (1) {
322 			char *comma = memchr(p1, ',', endp - p1);
323 			char *p = comma ? comma : endp;
324 			*p = '\0';
325 			/* trim spaces */
326 			while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
327 				p1++;
328 			}
329 			p--;
330 			while (p > p1 && (*p == ' ' || *p == '\t')) {
331 				*p = '\0';
332 				p--;
333 			}
334 			/* convert to the encoding number and check encoding */
335 			if (strcasecmp(p1, "auto") == 0) {
336 				if (!included_auto) {
337 					const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
338 					const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
339 					size_t i;
340 					included_auto = 1;
341 					for (i = 0; i < identify_list_size; i++) {
342 						*entry++ = mbfl_no2encoding(*src++);
343 						n++;
344 					}
345 				}
346 			} else {
347 				const mbfl_encoding *encoding = mbfl_name2encoding(p1);
348 				if (!encoding) {
349 					/* Called from an INI setting modification */
350 					if (arg_num == 0) {
351 						php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%s\"", p1);
352 					} else {
353 						zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", p1);
354 					}
355 					efree(tmpstr);
356 					pefree(ZEND_VOIDP(list), persistent);
357 					return FAILURE;
358 				}
359 
360 				*entry++ = encoding;
361 				n++;
362 			}
363 			if (n >= size || comma == NULL) {
364 				break;
365 			}
366 			p1 = comma + 1;
367 		}
368 		*return_list = list;
369 		*return_size = n;
370 		efree(tmpstr);
371 	}
372 
373 	return SUCCESS;
374 }
375 /* }}} */
376 
377 /* {{{ static int php_mb_parse_encoding_array()
378  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
379  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
380  */
php_mb_parse_encoding_array(HashTable * target_hash,const mbfl_encoding *** return_list,size_t * return_size,uint32_t arg_num)381 static int php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
382 	size_t *return_size, uint32_t arg_num)
383 {
384 	/* Allocate enough space to include the default detect order if "auto" is used. */
385 	size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
386 	const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
387 	const mbfl_encoding **entry = list;
388 	bool included_auto = 0;
389 	size_t n = 0;
390 	zval *hash_entry;
391 	ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
392 		zend_string *encoding_str = zval_try_get_string(hash_entry);
393 		if (UNEXPECTED(!encoding_str)) {
394 			efree(ZEND_VOIDP(list));
395 			return FAILURE;
396 		}
397 
398 		if (zend_string_equals_literal_ci(encoding_str, "auto")) {
399 			if (!included_auto) {
400 				const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
401 				const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
402 				size_t j;
403 
404 				included_auto = 1;
405 				for (j = 0; j < identify_list_size; j++) {
406 					*entry++ = mbfl_no2encoding(*src++);
407 					n++;
408 				}
409 			}
410 		} else {
411 			const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
412 			if (encoding) {
413 				*entry++ = encoding;
414 				n++;
415 			} else {
416 				zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
417 				zend_string_release(encoding_str);
418 				efree(ZEND_VOIDP(list));
419 				return FAILURE;
420 			}
421 		}
422 		zend_string_release(encoding_str);
423 	} ZEND_HASH_FOREACH_END();
424 	*return_list = list;
425 	*return_size = n;
426 	return SUCCESS;
427 }
428 /* }}} */
429 
430 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)431 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
432 {
433 	return (const zend_encoding*)mbfl_name2encoding(encoding_name);
434 }
435 
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)436 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
437 {
438 	return ((const mbfl_encoding *)encoding)->name;
439 }
440 
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)441 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
442 {
443 	const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
444 	return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
445 }
446 
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)447 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
448 {
449 	if (!list) {
450 		list = (const zend_encoding**)MBSTRG(current_detect_order_list);
451 		list_size = MBSTRG(current_detect_order_list_size);
452 	}
453 	if (list_size == 1 && ((mbfl_encoding*)*list) == &mbfl_encoding_pass) {
454 		/* Emulate behavior of previous implementation; it would never return "pass"
455 		 * from an encoding auto-detection operation */
456 		return NULL;
457 	}
458 	return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding**)list, list_size, false, false);
459 }
460 
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)461 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
462 {
463 	unsigned int num_errors = 0;
464 	zend_string *result = mb_fast_convert((unsigned char*)from, from_length, (const mbfl_encoding*)encoding_from, (const mbfl_encoding*)encoding_to, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
465 
466 	*to_length = ZSTR_LEN(result);
467 	*to = emalloc(ZSTR_LEN(result) + 1); /* Include terminating null byte */
468 	memcpy(*to, ZSTR_VAL(result), ZSTR_LEN(result) + 1);
469 	zend_string_free(result);
470 
471 	return from_length;
472 }
473 
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)474 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
475 {
476 	return php_mb_parse_encoding_list(
477 		encoding_list, encoding_list_len,
478 		(const mbfl_encoding ***)return_list, return_size,
479 		persistent, /* arg_num */ 0);
480 }
481 
php_mb_zend_internal_encoding_getter(void)482 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
483 {
484 	return (const zend_encoding *)MBSTRG(internal_encoding);
485 }
486 
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)487 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
488 {
489 	MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
490 	return SUCCESS;
491 }
492 
493 static zend_multibyte_functions php_mb_zend_multibyte_functions = {
494 	"mbstring",
495 	php_mb_zend_encoding_fetcher,
496 	php_mb_zend_encoding_name_getter,
497 	php_mb_zend_encoding_lexer_compatibility_checker,
498 	php_mb_zend_encoding_detector,
499 	php_mb_zend_encoding_converter,
500 	php_mb_zend_encoding_list_parser,
501 	php_mb_zend_internal_encoding_getter,
502 	php_mb_zend_internal_encoding_setter
503 };
504 /* }}} */
505 
506 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)507 static void *_php_mb_compile_regex(const char *pattern)
508 {
509 	pcre2_code *retval;
510 	PCRE2_SIZE err_offset;
511 	int errnum;
512 
513 	if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
514 			PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
515 		PCRE2_UCHAR err_str[128];
516 		pcre2_get_error_message(errnum, err_str, sizeof(err_str));
517 		php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
518 	}
519 	return retval;
520 }
521 /* }}} */
522 
523 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)524 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
525 {
526 	int res;
527 
528 	pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
529 	if (NULL == match_data) {
530 		pcre2_code_free(opaque);
531 		php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
532 		return FAILURE;
533 	}
534 	res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
535 	php_pcre_free_match_data(match_data);
536 
537 	return res;
538 }
539 /* }}} */
540 
541 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)542 static void _php_mb_free_regex(void *opaque)
543 {
544 	pcre2_code_free(opaque);
545 }
546 /* }}} */
547 
548 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)549 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
550 {
551 	size_t i;
552 
553 	*plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
554 	*plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
555 
556 	for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
557 		if (php_mb_default_identify_list[i].lang == lang) {
558 			*plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
559 			*plist_size = php_mb_default_identify_list[i].list_size;
560 			return 1;
561 		}
562 	}
563 	return 0;
564 }
565 /* }}} */
566 
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)567 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
568 {
569 	char *result = emalloc(len + 2);
570 	char *resp = result;
571 	size_t i;
572 
573 	for (i = 0; i < len && start[i] != quote; ++i) {
574 		if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
575 			*resp++ = start[++i];
576 		} else {
577 			size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
578 
579 			while (j-- > 0 && i < len) {
580 				*resp++ = start[i++];
581 			}
582 			--i;
583 		}
584 	}
585 
586 	*resp = '\0';
587 	return result;
588 }
589 
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)590 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
591 {
592 	char *pos = *line, quote;
593 	char *res;
594 
595 	while (*pos && *pos != stop) {
596 		if ((quote = *pos) == '"' || quote == '\'') {
597 			++pos;
598 			while (*pos && *pos != quote) {
599 				if (*pos == '\\' && pos[1] && pos[1] == quote) {
600 					pos += 2;
601 				} else {
602 					++pos;
603 				}
604 			}
605 			if (*pos) {
606 				++pos;
607 			}
608 		} else {
609 			pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
610 
611 		}
612 	}
613 	if (*pos == '\0') {
614 		res = estrdup(*line);
615 		*line += strlen(*line);
616 		return res;
617 	}
618 
619 	res = estrndup(*line, pos - *line);
620 
621 	while (*pos == stop) {
622 		pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
623 	}
624 
625 	*line = pos;
626 	return res;
627 }
628 /* }}} */
629 
/* Extract one configuration-style value from `str`.
 *
 * Leading whitespace is skipped. If the value starts with a single or double
 * quote, everything up to the matching unescaped quote is taken; otherwise the
 * value runs up to the next whitespace byte. Escape handling and copying are
 * delegated to php_mb_rfc1867_substring_conf().
 *
 * Returns a newly allocated string owned by the caller ("" if `str` holds
 * nothing but whitespace). */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	for (; *str && isspace(*(unsigned char *)str); ++str) {
		/* skip leading whitespace */
	}

	if (*str == '\0') {
		return estrdup("");
	}

	const char first = *str;
	if (first == '"' || first == '\'') {
		char *quoted_body = str + 1;

		return php_mb_rfc1867_substring_conf(encoding, quoted_body, strlen(quoted_body), first);
	}

	/* Unquoted: measure up to the next whitespace byte (or end of string). */
	char *word_end = str;
	while (*word_end && !isspace(*(unsigned char *)word_end)) {
		++word_end;
	}
	return php_mb_rfc1867_substring_conf(encoding, str, word_end - str, 0);
}
654 /* }}} */
655 
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)656 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
657 {
658 	char *s, *s2;
659 	const size_t filename_len = strlen(filename);
660 
661 	/* The \ check should technically be needed for win32 systems only where
662 	 * it is a valid path separator. However, IE in all it's wisdom always sends
663 	 * the full path of the file on the user's filesystem, which means that unless
664 	 * the user does basename() they get a bogus file name. Until IE's user base drops
665 	 * to nill or problem is fixed this code must remain enabled for all systems. */
666 	s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
667 	s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
668 
669 	if (s && s2) {
670 		if (s > s2) {
671 			return ++s;
672 		} else {
673 			return ++s2;
674 		}
675 	} else if (s) {
676 		return ++s;
677 	} else if (s2) {
678 		return ++s2;
679 	} else {
680 		return filename;
681 	}
682 }
683 /* }}} */
684 
685 /* {{{ php.ini directive handler */
686 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)687 static PHP_INI_MH(OnUpdate_mbstring_language)
688 {
689 	enum mbfl_no_language no_language;
690 
691 	no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
692 	if (no_language == mbfl_no_language_invalid) {
693 		MBSTRG(language) = mbfl_no_language_neutral;
694 		return FAILURE;
695 	}
696 	MBSTRG(language) = no_language;
697 	php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
698 	return SUCCESS;
699 }
700 /* }}} */
701 
702 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)703 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
704 {
705 	const mbfl_encoding **list;
706 	size_t size;
707 
708 	if (!new_value) {
709 		if (MBSTRG(detect_order_list)) {
710 			pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
711 		}
712 		MBSTRG(detect_order_list) = NULL;
713 		MBSTRG(detect_order_list_size) = 0;
714 		return SUCCESS;
715 	}
716 
717 	if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
718 		return FAILURE;
719 	}
720 
721 	if (MBSTRG(detect_order_list)) {
722 		pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
723 	}
724 	MBSTRG(detect_order_list) = list;
725 	MBSTRG(detect_order_list_size) = size;
726 	return SUCCESS;
727 }
728 /* }}} */
729 
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)730 static int _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
731 	const mbfl_encoding **list;
732 	size_t size;
733 	if (new_value_length == 4 && strncmp(new_value, "pass", 4) == 0) {
734 		list = (const mbfl_encoding**)pecalloc(1, sizeof(mbfl_encoding*), 1);
735 		*list = &mbfl_encoding_pass;
736 		size = 1;
737 	} else if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
738 		return FAILURE;
739 	}
740 	if (MBSTRG(http_input_list)) {
741 		pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
742 	}
743 	MBSTRG(http_input_list) = list;
744 	MBSTRG(http_input_list_size) = size;
745 	return SUCCESS;
746 }
747 
748 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)749 static PHP_INI_MH(OnUpdate_mbstring_http_input)
750 {
751 	if (new_value) {
752 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
753 	}
754 
755 	if (!new_value || !ZSTR_LEN(new_value)) {
756 		const char *encoding = php_get_input_encoding();
757 		MBSTRG(http_input_set) = 0;
758 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
759 		return SUCCESS;
760 	}
761 
762 	MBSTRG(http_input_set) = 1;
763 	return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
764 }
765 /* }}} */
766 
_php_mb_ini_mbstring_http_output_set(const char * new_value)767 static int _php_mb_ini_mbstring_http_output_set(const char *new_value) {
768 	const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value);
769 	if (!encoding) {
770 		return FAILURE;
771 	}
772 
773 	MBSTRG(http_output_encoding) = encoding;
774 	MBSTRG(current_http_output_encoding) = encoding;
775 	return SUCCESS;
776 }
777 
778 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)779 static PHP_INI_MH(OnUpdate_mbstring_http_output)
780 {
781 	if (new_value) {
782 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
783 	}
784 
785 	if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
786 		MBSTRG(http_output_set) = 0;
787 		_php_mb_ini_mbstring_http_output_set(php_get_output_encoding());
788 		return SUCCESS;
789 	}
790 
791 	MBSTRG(http_output_set) = 1;
792 	return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value));
793 }
794 /* }}} */
795 
796 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)797 static int _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
798 {
799 	const mbfl_encoding *encoding;
800 
801 	if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
802 		/* falls back to UTF-8 if an unknown encoding name is given */
803 		if (new_value) {
804 			php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
805 		}
806 		encoding = &mbfl_encoding_utf8;
807 	}
808 	MBSTRG(internal_encoding) = encoding;
809 	MBSTRG(current_internal_encoding) = encoding;
810 #ifdef HAVE_MBREGEX
811 	{
812 		const char *enc_name = new_value;
813 		if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
814 			/* falls back to UTF-8 if an unknown encoding name is given */
815 			enc_name = "UTF-8";
816 			php_mb_regex_set_default_mbctype(enc_name);
817 		}
818 		php_mb_regex_set_mbctype(new_value);
819 	}
820 #endif
821 	return SUCCESS;
822 }
823 /* }}} */
824 
825 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)826 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
827 {
828 	if (new_value) {
829 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
830 	}
831 
832 	if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
833 		return FAILURE;
834 	}
835 
836 	if (new_value && ZSTR_LEN(new_value)) {
837 		MBSTRG(internal_encoding_set) = 1;
838 		return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
839 	} else {
840 		const char *encoding = php_get_internal_encoding();
841 		MBSTRG(internal_encoding_set) = 0;
842 		return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
843 	}
844 }
845 /* }}} */
846 
847 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)848 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
849 {
850 	int c;
851 	char *endptr = NULL;
852 
853 	if (new_value != NULL) {
854 		if (zend_string_equals_literal_ci(new_value, "none")) {
855 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
856 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
857 		} else if (zend_string_equals_literal_ci(new_value, "long")) {
858 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
859 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
860 		} else if (zend_string_equals_literal_ci(new_value, "entity")) {
861 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
862 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
863 		} else {
864 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
865 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
866 			if (ZSTR_LEN(new_value) > 0) {
867 				c = strtol(ZSTR_VAL(new_value), &endptr, 0);
868 				if (*endptr == '\0') {
869 					MBSTRG(filter_illegal_substchar) = c;
870 					MBSTRG(current_filter_illegal_substchar) = c;
871 				}
872 			}
873 		}
874 	} else {
875 		MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
876 		MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
877 		MBSTRG(filter_illegal_substchar) = '?';
878 		MBSTRG(current_filter_illegal_substchar) = '?';
879 	}
880 
881 	return SUCCESS;
882 }
883 /* }}} */
884 
885 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)886 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
887 {
888 	if (new_value == NULL) {
889 		return FAILURE;
890 	}
891 
892 	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
893 
894 	if (MBSTRG(encoding_translation)) {
895 		sapi_unregister_post_entry(php_post_entries);
896 		sapi_register_post_entries(mbstr_post_entries);
897 	} else {
898 		sapi_unregister_post_entry(mbstr_post_entries);
899 		sapi_register_post_entries(php_post_entries);
900 	}
901 
902 	return SUCCESS;
903 }
904 /* }}} */
905 
906 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)907 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
908 {
909 	zend_string *tmp;
910 	void *re = NULL;
911 
912 	if (!new_value) {
913 		new_value = entry->orig_value;
914 	}
915 	tmp = php_trim(new_value, NULL, 0, 3);
916 
917 	if (ZSTR_LEN(tmp) > 0) {
918 		if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
919 			zend_string_release_ex(tmp, 0);
920 			return FAILURE;
921 		}
922 	}
923 
924 	if (MBSTRG(http_output_conv_mimetypes)) {
925 		_php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
926 	}
927 
928 	MBSTRG(http_output_conv_mimetypes) = re;
929 
930 	zend_string_release_ex(tmp, 0);
931 	return SUCCESS;
932 }
933 /* }}} */
934 /* }}} */
935 
936 /* {{{ php.ini directive registration */
937 PHP_INI_BEGIN()
938 	PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
939 	PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
940 	PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
941 	PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
942 	STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
943 	PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
944 
945 	STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
946 		PHP_INI_SYSTEM | PHP_INI_PERDIR,
947 		OnUpdate_mbstring_encoding_translation,
948 		encoding_translation, zend_mbstring_globals, mbstring_globals)
949 	PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
950 		"^(text/|application/xhtml\\+xml)",
951 		PHP_INI_ALL,
952 		OnUpdate_mbstring_http_output_conv_mimetypes)
953 
954 	STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
955 		PHP_INI_ALL,
956 		OnUpdateBool,
957 		strict_detection, zend_mbstring_globals, mbstring_globals)
958 #ifdef HAVE_MBREGEX
959 	STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
960 	STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
961 #endif
PHP_INI_END()962 PHP_INI_END()
963 /* }}} */
964 
965 static void mbstring_internal_encoding_changed_hook(void) {
966 	/* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
967 	if (!MBSTRG(internal_encoding_set)) {
968 		const char *encoding = php_get_internal_encoding();
969 		_php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
970 	}
971 
972 	if (!MBSTRG(http_output_set)) {
973 		const char *encoding = php_get_output_encoding();
974 		_php_mb_ini_mbstring_http_output_set(encoding);
975 	}
976 
977 	if (!MBSTRG(http_input_set)) {
978 		const char *encoding = php_get_input_encoding();
979 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
980 	}
981 }
982 
983 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)984 static PHP_GINIT_FUNCTION(mbstring)
985 {
986 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
987 ZEND_TSRMLS_CACHE_UPDATE();
988 #endif
989 
990 	mbstring_globals->language = mbfl_no_language_uni;
991 	mbstring_globals->internal_encoding = NULL;
992 	mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
993 	mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
994 	mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
995 	mbstring_globals->http_input_identify = NULL;
996 	mbstring_globals->http_input_identify_get = NULL;
997 	mbstring_globals->http_input_identify_post = NULL;
998 	mbstring_globals->http_input_identify_cookie = NULL;
999 	mbstring_globals->http_input_identify_string = NULL;
1000 	mbstring_globals->http_input_list = NULL;
1001 	mbstring_globals->http_input_list_size = 0;
1002 	mbstring_globals->detect_order_list = NULL;
1003 	mbstring_globals->detect_order_list_size = 0;
1004 	mbstring_globals->current_detect_order_list = NULL;
1005 	mbstring_globals->current_detect_order_list_size = 0;
1006 	mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1007 	mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1008 	mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1009 	mbstring_globals->filter_illegal_substchar = '?';
1010 	mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1011 	mbstring_globals->current_filter_illegal_substchar = '?';
1012 	mbstring_globals->illegalchars = 0;
1013 	mbstring_globals->encoding_translation = 0;
1014 	mbstring_globals->strict_detection = 0;
1015 	mbstring_globals->outconv_enabled = false;
1016 	mbstring_globals->outconv_state = 0;
1017 	mbstring_globals->http_output_conv_mimetypes = NULL;
1018 #ifdef HAVE_MBREGEX
1019 	mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1020 #endif
1021 	mbstring_globals->last_used_encoding_name = NULL;
1022 	mbstring_globals->last_used_encoding = NULL;
1023 	mbstring_globals->internal_encoding_set = 0;
1024 	mbstring_globals->http_output_set = 0;
1025 	mbstring_globals->http_input_set = 0;
1026 	mbstring_globals->all_encodings_list = NULL;
1027 }
1028 /* }}} */
1029 
1030 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1031 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1032 {
1033 	if (mbstring_globals->http_input_list) {
1034 		free(ZEND_VOIDP(mbstring_globals->http_input_list));
1035 	}
1036 	if (mbstring_globals->detect_order_list) {
1037 		free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1038 	}
1039 	if (mbstring_globals->http_output_conv_mimetypes) {
1040 		_php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1041 	}
1042 #ifdef HAVE_MBREGEX
1043 	php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1044 #endif
1045 }
1046 /* }}} */
1047 
1048 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1049 static void init_check_utf8(void);
1050 #endif
1051 
1052 /* {{{ PHP_MINIT_FUNCTION(mbstring) */
PHP_MINIT_FUNCTION(mbstring)1053 PHP_MINIT_FUNCTION(mbstring)
1054 {
1055 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
1056 ZEND_TSRMLS_CACHE_UPDATE();
1057 #endif
1058 
1059 	REGISTER_INI_ENTRIES();
1060 
1061 	/* We assume that we're the only user of the hook. */
1062 	ZEND_ASSERT(php_internal_encoding_changed == NULL);
1063 	php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
1064 	mbstring_internal_encoding_changed_hook();
1065 
1066 	/* This is a global handler. Should not be set in a per-request handler. */
1067 	sapi_register_treat_data(mbstr_treat_data);
1068 
1069 	/* Post handlers are stored in the thread-local context. */
1070 	if (MBSTRG(encoding_translation)) {
1071 		sapi_register_post_entries(mbstr_post_entries);
1072 	}
1073 
1074 #ifdef HAVE_MBREGEX
1075 	PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1076 #endif
1077 
1078 	register_mbstring_symbols(module_number);
1079 
1080 	if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
1081 		return FAILURE;
1082 	}
1083 
1084 	php_rfc1867_set_multibyte_callbacks(
1085 		php_mb_encoding_translation,
1086 		php_mb_gpc_get_detect_order,
1087 		php_mb_gpc_set_input_encoding,
1088 		php_mb_rfc1867_getword,
1089 		php_mb_rfc1867_getword_conf,
1090 		php_mb_rfc1867_basename);
1091 
1092 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1093 	init_check_utf8();
1094 	init_convert_utf16();
1095 #endif
1096 
1097 	return SUCCESS;
1098 }
1099 /* }}} */
1100 
1101 /* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
PHP_MSHUTDOWN_FUNCTION(mbstring)1102 PHP_MSHUTDOWN_FUNCTION(mbstring)
1103 {
1104 	UNREGISTER_INI_ENTRIES();
1105 
1106 	zend_multibyte_restore_functions();
1107 
1108 #ifdef HAVE_MBREGEX
1109 	PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1110 #endif
1111 
1112 	php_internal_encoding_changed = NULL;
1113 
1114 	return SUCCESS;
1115 }
1116 /* }}} */
1117 
1118 /* {{{ PHP_RINIT_FUNCTION(mbstring) */
PHP_RINIT_FUNCTION(mbstring)1119 PHP_RINIT_FUNCTION(mbstring)
1120 {
1121 	MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
1122 	MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
1123 	MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
1124 	MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);
1125 
1126 	MBSTRG(illegalchars) = 0;
1127 
1128 	php_mb_populate_current_detect_order_list();
1129 
1130 #ifdef HAVE_MBREGEX
1131 	PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1132 #endif
1133 	zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));
1134 
1135 	return SUCCESS;
1136 }
1137 /* }}} */
1138 
1139 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1140 PHP_RSHUTDOWN_FUNCTION(mbstring)
1141 {
1142 	if (MBSTRG(current_detect_order_list) != NULL) {
1143 		efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1144 		MBSTRG(current_detect_order_list) = NULL;
1145 		MBSTRG(current_detect_order_list_size) = 0;
1146 	}
1147 
1148 	/* clear http input identification. */
1149 	MBSTRG(http_input_identify) = NULL;
1150 	MBSTRG(http_input_identify_post) = NULL;
1151 	MBSTRG(http_input_identify_get) = NULL;
1152 	MBSTRG(http_input_identify_cookie) = NULL;
1153 	MBSTRG(http_input_identify_string) = NULL;
1154 
1155 	if (MBSTRG(last_used_encoding_name)) {
1156 		zend_string_release(MBSTRG(last_used_encoding_name));
1157 		MBSTRG(last_used_encoding_name) = NULL;
1158 	}
1159 
1160 	MBSTRG(internal_encoding_set) = 0;
1161 	MBSTRG(http_output_set) = 0;
1162 	MBSTRG(http_input_set) = 0;
1163 
1164 	MBSTRG(outconv_enabled) = false;
1165 	MBSTRG(outconv_state) = 0;
1166 
1167 	if (MBSTRG(all_encodings_list)) {
1168 		GC_DELREF(MBSTRG(all_encodings_list));
1169 		zend_array_destroy(MBSTRG(all_encodings_list));
1170 		MBSTRG(all_encodings_list) = NULL;
1171 	}
1172 
1173 #ifdef HAVE_MBREGEX
1174 	PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1175 #endif
1176 
1177 	return SUCCESS;
1178 }
1179 /* }}} */
1180 
1181 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1182 PHP_MINFO_FUNCTION(mbstring)
1183 {
1184 	php_info_print_table_start();
1185 	php_info_print_table_row(2, "Multibyte Support", "enabled");
1186 	php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1187 	php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1188 	{
1189 		char tmp[256];
1190 		snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1191 		php_info_print_table_row(2, "libmbfl version", tmp);
1192 	}
1193 	php_info_print_table_end();
1194 
1195 	php_info_print_table_start();
1196 	php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1197 	php_info_print_table_end();
1198 
1199 #ifdef HAVE_MBREGEX
1200 	PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1201 #endif
1202 
1203 	DISPLAY_INI_ENTRIES();
1204 }
1205 /* }}} */
1206 
1207 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1208 PHP_FUNCTION(mb_language)
1209 {
1210 	zend_string *name = NULL;
1211 
1212 	ZEND_PARSE_PARAMETERS_START(0, 1)
1213 		Z_PARAM_OPTIONAL
1214 		Z_PARAM_STR_OR_NULL(name)
1215 	ZEND_PARSE_PARAMETERS_END();
1216 
1217 	if (name == NULL) {
1218 		RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1219 	} else {
1220 		zend_string *ini_name = ZSTR_INIT_LITERAL("mbstring.language", 0);
1221 		if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1222 			zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1223 			zend_string_release_ex(ini_name, 0);
1224 			RETURN_THROWS();
1225 		}
1226 		// TODO Make return void
1227 		RETVAL_TRUE;
1228 		zend_string_release_ex(ini_name, 0);
1229 	}
1230 }
1231 /* }}} */
1232 
1233 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1234 PHP_FUNCTION(mb_internal_encoding)
1235 {
1236 	char *name = NULL;
1237 	size_t name_len;
1238 	const mbfl_encoding *encoding;
1239 
1240 	ZEND_PARSE_PARAMETERS_START(0, 1)
1241 		Z_PARAM_OPTIONAL
1242 		Z_PARAM_STRING_OR_NULL(name, name_len)
1243 	ZEND_PARSE_PARAMETERS_END();
1244 
1245 	if (name == NULL) {
1246 		ZEND_ASSERT(MBSTRG(current_internal_encoding));
1247 		RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1248 	} else {
1249 		encoding = mbfl_name2encoding(name);
1250 		if (!encoding) {
1251 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1252 			RETURN_THROWS();
1253 		} else {
1254 			MBSTRG(current_internal_encoding) = encoding;
1255 			MBSTRG(internal_encoding_set) = 1;
1256 			/* TODO Return old encoding */
1257 			RETURN_TRUE;
1258 		}
1259 	}
1260 }
1261 /* }}} */
1262 
1263 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1264 PHP_FUNCTION(mb_http_input)
1265 {
1266 	char *type = NULL;
1267 	size_t type_len = 0, n;
1268 	const mbfl_encoding **entry;
1269 	const mbfl_encoding *encoding;
1270 
1271 	ZEND_PARSE_PARAMETERS_START(0, 1)
1272 		Z_PARAM_OPTIONAL
1273 		Z_PARAM_STRING_OR_NULL(type, type_len)
1274 	ZEND_PARSE_PARAMETERS_END();
1275 
1276 	if (type == NULL) {
1277 		encoding = MBSTRG(http_input_identify);
1278 	} else {
1279 		switch (*type) {
1280 		case 'G':
1281 		case 'g':
1282 			encoding = MBSTRG(http_input_identify_get);
1283 			break;
1284 		case 'P':
1285 		case 'p':
1286 			encoding = MBSTRG(http_input_identify_post);
1287 			break;
1288 		case 'C':
1289 		case 'c':
1290 			encoding = MBSTRG(http_input_identify_cookie);
1291 			break;
1292 		case 'S':
1293 		case 's':
1294 			encoding = MBSTRG(http_input_identify_string);
1295 			break;
1296 		case 'I':
1297 		case 'i':
1298 			entry = MBSTRG(http_input_list);
1299 			n = MBSTRG(http_input_list_size);
1300 			array_init(return_value);
1301 			for (size_t i = 0; i < n; i++, entry++) {
1302 				add_next_index_string(return_value, (*entry)->name);
1303 			}
1304 			return;
1305 		case 'L':
1306 		case 'l':
1307 			entry = MBSTRG(http_input_list);
1308 			n = MBSTRG(http_input_list_size);
1309 			if (n == 0) {
1310 				RETURN_FALSE;
1311 			}
1312 
1313 			smart_str result = {0};
1314 			for (size_t i = 0; i < n; i++, entry++) {
1315 				if (i > 0) {
1316 					smart_str_appendc(&result, ',');
1317 				}
1318 				smart_str_appends(&result, (*entry)->name);
1319 			}
1320 			RETURN_STR(smart_str_extract(&result));
1321 		default:
1322 			zend_argument_value_error(1,
1323 				"must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1324 			RETURN_THROWS();
1325 		}
1326 	}
1327 
1328 	if (encoding) {
1329 		RETURN_STRING(encoding->name);
1330 	} else {
1331 		RETURN_FALSE;
1332 	}
1333 }
1334 /* }}} */
1335 
1336 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1337 PHP_FUNCTION(mb_http_output)
1338 {
1339 	char *name = NULL;
1340 	size_t name_len;
1341 
1342 	ZEND_PARSE_PARAMETERS_START(0, 1)
1343 		Z_PARAM_OPTIONAL
1344 		Z_PARAM_STRING_OR_NULL(name, name_len)
1345 	ZEND_PARSE_PARAMETERS_END();
1346 
1347 	if (name == NULL) {
1348 		ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1349 		RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1350 	} else {
1351 		const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name);
1352 		if (!encoding) {
1353 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1354 			RETURN_THROWS();
1355 		} else {
1356 			MBSTRG(http_output_set) = 1;
1357 			MBSTRG(current_http_output_encoding) = encoding;
1358 			/* TODO Return previous encoding? */
1359 			RETURN_TRUE;
1360 		}
1361 	}
1362 }
1363 /* }}} */
1364 
1365 /* {{{ Sets the current detect_order or Return the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1366 PHP_FUNCTION(mb_detect_order)
1367 {
1368 	zend_string *order_str = NULL;
1369 	HashTable *order_ht = NULL;
1370 
1371 	ZEND_PARSE_PARAMETERS_START(0, 1)
1372 		Z_PARAM_OPTIONAL
1373 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1374 	ZEND_PARSE_PARAMETERS_END();
1375 
1376 	if (!order_str && !order_ht) {
1377 		size_t n = MBSTRG(current_detect_order_list_size);
1378 		const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1379 		array_init(return_value);
1380 		for (size_t i = 0; i < n; i++) {
1381 			add_next_index_string(return_value, (*entry)->name);
1382 			entry++;
1383 		}
1384 	} else {
1385 		const mbfl_encoding **list;
1386 		size_t size;
1387 		if (order_ht) {
1388 			if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1389 				RETURN_THROWS();
1390 			}
1391 		} else {
1392 			if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1)) {
1393 				RETURN_THROWS();
1394 			}
1395 		}
1396 
1397 		if (size == 0) {
1398 			efree(ZEND_VOIDP(list));
1399 			zend_argument_value_error(1, "must specify at least one encoding");
1400 			RETURN_THROWS();
1401 		}
1402 
1403 		if (MBSTRG(current_detect_order_list)) {
1404 			efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1405 		}
1406 		MBSTRG(current_detect_order_list) = list;
1407 		MBSTRG(current_detect_order_list_size) = size;
1408 		RETURN_TRUE;
1409 	}
1410 }
1411 /* }}} */
1412 
/* Returns 1 when `cp` may be used as a substitute character, 0 otherwise.
 *
 * Rejected are values outside 0..0x10FFFF (out of Unicode range) and the
 * surrogate range 0xD800..0xDFFF (surrogates are never valid on their own and
 * only a single substitute character is allowed).
 *
 * Everything else is accepted: the target encoding of the conversion that will
 * use the substitute is not known here, so it cannot be checked whether the
 * code point is actually mapped in that encoding. */
static inline int php_mb_check_code_point(zend_long cp)
{
	int in_unicode_range = (cp >= 0 && cp < 0x110000);
	int is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);

	return (in_unicode_range && !is_surrogate) ? 1 : 0;
}
1431 
1432 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1433 PHP_FUNCTION(mb_substitute_character)
1434 {
1435 	zend_string *substitute_character = NULL;
1436 	zend_long substitute_codepoint;
1437 	bool substitute_is_null = 1;
1438 
1439 	ZEND_PARSE_PARAMETERS_START(0, 1)
1440 		Z_PARAM_OPTIONAL
1441 		Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1442 	ZEND_PARSE_PARAMETERS_END();
1443 
1444 	if (substitute_is_null) {
1445 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1446 			RETURN_STRING("none");
1447 		}
1448 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1449 			RETURN_STRING("long");
1450 		}
1451 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1452 			RETURN_STRING("entity");
1453 		}
1454 		RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1455 	}
1456 
1457 	if (substitute_character != NULL) {
1458 		if (zend_string_equals_literal_ci(substitute_character, "none")) {
1459 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1460 			RETURN_TRUE;
1461 		}
1462 		if (zend_string_equals_literal_ci(substitute_character, "long")) {
1463 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1464 			RETURN_TRUE;
1465 		}
1466 		if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1467 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1468 			RETURN_TRUE;
1469 		}
1470 		/* Invalid string value */
1471 		zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1472 		RETURN_THROWS();
1473 	}
1474 	/* Integer codepoint passed */
1475 	if (!php_mb_check_code_point(substitute_codepoint)) {
1476 		zend_argument_value_error(1, "is not a valid codepoint");
1477 		RETURN_THROWS();
1478 	}
1479 
1480 	MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1481 	MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1482 	RETURN_TRUE;
1483 }
1484 /* }}} */
1485 
1486 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1487 PHP_FUNCTION(mb_preferred_mime_name)
1488 {
1489 	char *name = NULL;
1490 	size_t name_len;
1491 
1492 	ZEND_PARSE_PARAMETERS_START(1, 1)
1493 		Z_PARAM_STRING(name, name_len)
1494 	ZEND_PARSE_PARAMETERS_END();
1495 
1496 	const mbfl_encoding *enc = mbfl_name2encoding(name);
1497 	if (enc == NULL) {
1498 		zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1499 		RETURN_THROWS();
1500 	}
1501 
1502 	const char *preferred_name = mbfl_encoding_preferred_mime_name(enc);
1503 	if (preferred_name == NULL || *preferred_name == '\0') {
1504 		php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1505 		RETVAL_FALSE;
1506 	} else {
1507 		RETVAL_STRING((char *)preferred_name);
1508 	}
1509 }
1510 /* }}} */
1511 
1512 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1513 PHP_FUNCTION(mb_parse_str)
1514 {
1515 	zval *track_vars_array = NULL;
1516 	char *encstr;
1517 	size_t encstr_len;
1518 	php_mb_encoding_handler_info_t info;
1519 	const mbfl_encoding *detected;
1520 
1521 	ZEND_PARSE_PARAMETERS_START(2, 2)
1522 		Z_PARAM_STRING(encstr, encstr_len)
1523 		Z_PARAM_ZVAL(track_vars_array)
1524 	ZEND_PARSE_PARAMETERS_END();
1525 
1526 	track_vars_array = zend_try_array_init(track_vars_array);
1527 	if (!track_vars_array) {
1528 		RETURN_THROWS();
1529 	}
1530 
1531 	encstr = estrndup(encstr, encstr_len);
1532 
1533 	info.data_type              = PARSE_STRING;
1534 	info.separator              = PG(arg_separator).input;
1535 	info.report_errors          = true;
1536 	info.to_encoding            = MBSTRG(current_internal_encoding);
1537 	info.from_encodings         = MBSTRG(http_input_list);
1538 	info.num_from_encodings     = MBSTRG(http_input_list_size);
1539 
1540 	detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1541 
1542 	MBSTRG(http_input_identify) = detected;
1543 
1544 	RETVAL_BOOL(detected);
1545 
1546 	if (encstr != NULL) efree(encstr);
1547 }
1548 /* }}} */
1549 
PHP_FUNCTION(mb_output_handler)1550 PHP_FUNCTION(mb_output_handler)
1551 {
1552 	zend_string *str;
1553 	zend_long arg_status;
1554 
1555 	ZEND_PARSE_PARAMETERS_START(2, 2)
1556 		Z_PARAM_STR(str)
1557 		Z_PARAM_LONG(arg_status)
1558 	ZEND_PARSE_PARAMETERS_END();
1559 
1560 	const mbfl_encoding *encoding = MBSTRG(current_http_output_encoding);
1561 	if (encoding == &mbfl_encoding_pass) {
1562 		RETURN_STR_COPY(str);
1563 	}
1564 
1565 	if (arg_status & PHP_OUTPUT_HANDLER_START) {
1566 		bool free_mimetype = false;
1567 		char *mimetype = NULL;
1568 
1569 		/* Analyze mime type */
1570 		if (SG(sapi_headers).mimetype && _php_mb_match_regex(MBSTRG(http_output_conv_mimetypes), SG(sapi_headers).mimetype, strlen(SG(sapi_headers).mimetype))) {
1571 			char *s;
1572 			if ((s = strchr(SG(sapi_headers).mimetype, ';')) == NULL) {
1573 				mimetype = estrdup(SG(sapi_headers).mimetype);
1574 			} else {
1575 				mimetype = estrndup(SG(sapi_headers).mimetype, s - SG(sapi_headers).mimetype);
1576 			}
1577 			free_mimetype = true;
1578 		} else if (SG(sapi_headers).send_default_content_type) {
1579 			mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1580 		}
1581 
1582 		/* If content-type is not yet set, set it and enable conversion */
1583 		if (SG(sapi_headers).send_default_content_type || free_mimetype) {
1584 			const char *charset = encoding->mime_name;
1585 			if (charset) {
1586 				char *p;
1587 				size_t len = spprintf(&p, 0, "Content-Type: %s; charset=%s",  mimetype, charset);
1588 				if (sapi_add_header(p, len, 0) != FAILURE) {
1589 					SG(sapi_headers).send_default_content_type = 0;
1590 				}
1591 			}
1592 
1593 			MBSTRG(outconv_enabled) = true;
1594 		}
1595 
1596 		if (free_mimetype) {
1597 			efree(mimetype);
1598 		}
1599 	}
1600 
1601 	if (!MBSTRG(outconv_enabled)) {
1602 		RETURN_STR_COPY(str);
1603 	}
1604 
1605 	mb_convert_buf buf;
1606 	mb_convert_buf_init(&buf, ZSTR_LEN(str), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1607 
1608 	uint32_t wchar_buf[128];
1609 	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
1610 	size_t in_len = ZSTR_LEN(str);
1611 	bool last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1612 
1613 	while (in_len) {
1614 		size_t out_len = MBSTRG(current_internal_encoding)->to_wchar(&in, &in_len, wchar_buf, 128, &MBSTRG(outconv_state));
1615 		ZEND_ASSERT(out_len <= 128);
1616 		encoding->from_wchar(wchar_buf, out_len, &buf, !in_len && last_feed);
1617 	}
1618 
1619 	MBSTRG(illegalchars) += buf.errors;
1620 	RETVAL_STR(mb_convert_buf_result_raw(&buf));
1621 
1622 	if (last_feed) {
1623 		MBSTRG(outconv_enabled) = false;
1624 		MBSTRG(outconv_state) = 0;
1625 	}
1626 }
1627 
PHP_FUNCTION(mb_str_split)1628 PHP_FUNCTION(mb_str_split)
1629 {
1630 	zend_string *str, *encoding = NULL;
1631 	zend_long split_len = 1;
1632 
1633 	ZEND_PARSE_PARAMETERS_START(1, 3)
1634 		Z_PARAM_STR(str)
1635 		Z_PARAM_OPTIONAL
1636 		Z_PARAM_LONG(split_len)
1637 		Z_PARAM_STR_OR_NULL(encoding)
1638 	ZEND_PARSE_PARAMETERS_END();
1639 
1640 	if (split_len <= 0) {
1641 		zend_argument_value_error(2, "must be greater than 0");
1642 		RETURN_THROWS();
1643 	} else if (split_len > UINT_MAX / 4) {
1644 		zend_argument_value_error(2, "is too large");
1645 		RETURN_THROWS();
1646 	}
1647 
1648 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
1649 	if (!enc) {
1650 		RETURN_THROWS();
1651 	}
1652 
1653 	if (ZSTR_LEN(str) == 0) {
1654 		RETURN_EMPTY_ARRAY();
1655 	}
1656 
1657 	unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
1658 
1659 	unsigned int char_len = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
1660 	if (char_len) {
1661 		unsigned int chunk_len = char_len * split_len;
1662 		unsigned int chunks = ((ZSTR_LEN(str) / chunk_len) + split_len - 1) / split_len; /* round up */
1663 		array_init_size(return_value, chunks);
1664 		while (p < e) {
1665 			add_next_index_stringl(return_value, (const char*)p, MIN(chunk_len, e - p));
1666 			p += chunk_len;
1667 		}
1668 	} else if (enc->mblen_table) {
1669 		unsigned char const *mbtab = enc->mblen_table;
1670 
1671 		/* Assume that we have 1-byte characters */
1672 		array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1673 
1674 		while (p < e) {
1675 			unsigned char *chunk = p; /* start of chunk */
1676 
1677 			for (int char_count = 0; char_count < split_len && p < e; char_count++) {
1678 				p += mbtab[*p];
1679 			}
1680 			if (p > e) {
1681 				p = e; /* ensure chunk is in bounds */
1682 			}
1683 			add_next_index_stringl(return_value, (const char*)chunk, p - chunk);
1684 		}
1685 	} else {
1686 		/* Assume that we have 1-byte characters */
1687 		array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1688 
1689 		uint32_t wchar_buf[128];
1690 		size_t in_len = ZSTR_LEN(str);
1691 		unsigned int state = 0, char_count = 0;
1692 
1693 		mb_convert_buf buf;
1694 
1695 		while (in_len) {
1696 			size_t out_len = enc->to_wchar(&p, &in_len, wchar_buf, 128, &state);
1697 			ZEND_ASSERT(out_len <= 128);
1698 			size_t i = 0;
1699 
1700 			/* Is there some output remaining from the previous iteration? */
1701 			if (char_count) {
1702 				if (out_len >= split_len - char_count) {
1703 					/* Finish off an incomplete chunk from previous iteration
1704 					 * ('buf' was already initialized; we don't need to do it again) */
1705 					enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
1706 					i += split_len - char_count;
1707 					char_count = 0;
1708 					add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1709 				} else {
1710 					/* Output from this iteration is not enough to finish the next chunk;
1711 					 * output what we can, and leave 'buf' to be used again on next iteration */
1712 					enc->from_wchar(wchar_buf, out_len, &buf, !in_len);
1713 					char_count += out_len;
1714 					continue;
1715 				}
1716 			}
1717 
1718 			while (i < out_len) {
1719 				/* Prepare for the next chunk */
1720 				mb_convert_buf_init(&buf, split_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1721 
1722 				if (out_len - i >= split_len) {
1723 					enc->from_wchar(wchar_buf + i, split_len, &buf, true);
1724 					i += split_len;
1725 					add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1726 				} else {
1727 					/* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
1728 					 * leave them for the next iteration */
1729 					enc->from_wchar(wchar_buf + i, out_len - i, &buf, !in_len);
1730 					char_count = out_len - i;
1731 					break;
1732 				}
1733 			}
1734 		}
1735 
1736 		if (char_count) {
1737 			/* The main loop above has finished processing the input string, but
1738 			 * has left a partial chunk in 'buf' */
1739 			add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1740 		}
1741 	}
1742 }
1743 
#ifdef __SSE2__
/* Horizontal sum of the sixteen unsigned bytes held in a 128-bit XMM register; the total
 * is returned in an ordinary scalar.
 * Technique due to StackOverflow user 'Paul R' (https://stackoverflow.com/users/253056/paul-r)
 * From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector */
static inline uint32_t _mm_sum_epu8(const __m128i v)
{
	/* There is no dedicated instruction which adds up the bytes of a single register.
	 * _mm_sad_epu8 takes the absolute differences between corresponding bytes of two
	 * registers and adds them up separately for the lower and the upper 8 bytes, leaving
	 * each partial sum as a 16-bit integer at the bottom of its 64-bit half of the result.
	 * With an all-zero register as the second operand, each "difference" is just the
	 * byte value taken from `v`. */
	const __m128i zero = _mm_setzero_si128();
	const __m128i partial_sums = _mm_sad_epu8(v, zero);

	/* The two partial sums live in different halves of the register, so they have to be
	 * pulled out separately: _mm_cvtsi128_si32 reads the lower one and
	 * _mm_extract_epi16 (16-bit lane 4) reads the upper one */
	const uint32_t lower_sum = _mm_cvtsi128_si32(partial_sums);
	const uint32_t upper_sum = _mm_extract_epi16(partial_sums, 4);

	return lower_sum + upper_sum;
}
#endif
1765 
/* Count the codepoints in the `len` bytes starting at `p`.
 * The input is assumed to be valid UTF-8.
 * In UTF-8, every byte starts a new codepoint except for continuation bytes (0x80-0xBF);
 * read as signed 8-bit integers, those are exactly the values below -64.
 * So the codepoint count is the byte length minus the number of continuation bytes. */
static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
{
	unsigned char *const end = p + len;
	size_t continuation_bytes = 0;

#ifdef __SSE2__
	if (len >= sizeof(__m128i)) {
		const __m128i limit = _mm_set1_epi8(-64);
		const __m128i one = _mm_set1_epi8(1);
		/* Sixteen independent 8-bit tallies of continuation bytes, one per byte lane */
		__m128i tallies = _mm_setzero_si128();
		unsigned int rounds_since_fold = 0;

		/* Handle every complete 16-byte block; the tail is left for the scalar loop */
		for (size_t blocks = len / sizeof(__m128i); blocks > 0; blocks--) {
			__m128i block = _mm_loadu_si128((__m128i*)p);
			__m128i is_continuation = _mm_cmplt_epi8(block, limit);
			/* The comparison yields 0xFF per matching lane; mask that down to 1 */
			tallies = _mm_add_epi8(tallies, _mm_and_si128(is_continuation, one));
			p += sizeof(__m128i);

			/* Each 8-bit tally grows by at most 1 per round, so it would overflow after
			 * 255 rounds; fold the tallies into the scalar total before that can happen */
			if (++rounds_since_fold == 255) {
				continuation_bytes += _mm_sum_epu8(tallies);
				tallies = _mm_setzero_si128();
				rounds_since_fold = 0;
			}
		}

		/* Fold in whatever is still sitting in the tallies */
		continuation_bytes += _mm_sum_epu8(tallies);
	}
#endif

	/* Scalar pass over the remaining bytes (all of them if SSE2 is not available) */
	for (; p < end; p++) {
		if ((signed char)*p < -64) {
			continuation_bytes++;
		}
	}

	return len - continuation_bytes;
}
1815 
/* Number of codepoints in `string` when interpreted in `encoding`.
 * Three strategies, cheapest first:
 *  - fixed-width encodings: byte length divided by the width taken from the encoding flags
 *  - UTF-8 strings already flagged as valid: count via mb_fast_strlen_utf8
 *  - everything else: decode the whole string and count the codepoints produced */
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
{
	const size_t byte_len = ZSTR_LEN(string);

	const unsigned int bytes_per_char = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (bytes_per_char != 0) {
		return byte_len / bytes_per_char;
	}

	if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && (GC_FLAGS(string) & IS_STR_VALID_UTF8)) {
		return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), byte_len);
	}

	/* The decoded codepoints themselves are not needed, only how many there are;
	 * the same scratch buffer is overwritten on every round */
	uint32_t scratch[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = byte_len;
	unsigned int state = 0;
	size_t total = 0;

	while (bytes_left > 0) {
		total += encoding->to_wchar(&cursor, &bytes_left, scratch, 128, &state);
	}

	return total;
}
1837 
1838 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1839 PHP_FUNCTION(mb_strlen)
1840 {
1841 	zend_string *string, *enc_name = NULL;
1842 
1843 	ZEND_PARSE_PARAMETERS_START(1, 2)
1844 		Z_PARAM_STR(string)
1845 		Z_PARAM_OPTIONAL
1846 		Z_PARAM_STR_OR_NULL(enc_name)
1847 	ZEND_PARSE_PARAMETERS_END();
1848 
1849 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1850 	if (!enc) {
1851 		RETURN_THROWS();
1852 	}
1853 
1854 	RETVAL_LONG(mb_get_strlen(string, enc));
1855 }
1856 /* }}} */
1857 
1858 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)1859 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
1860 {
1861 	return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
1862 }
1863 
offset_to_pointer_utf8(unsigned char * str,unsigned char * end,ssize_t offset)1864 static unsigned char* offset_to_pointer_utf8(unsigned char *str, unsigned char *end, ssize_t offset) {
1865 	if (offset < 0) {
1866 		unsigned char *pos = end;
1867 		while (offset < 0) {
1868 			if (pos <= str) {
1869 				return NULL;
1870 			}
1871 
1872 			unsigned char c = *--pos;
1873 			if (c < 0x80 || (c & 0xC0) != 0x80) {
1874 				offset++;
1875 			}
1876 		}
1877 		return pos;
1878 	} else {
1879 		const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
1880 		unsigned char *pos = str;
1881 		while (offset-- > 0) {
1882 			if (pos >= end) {
1883 				return NULL;
1884 			}
1885 			pos += u8_tbl[*pos];
1886 		}
1887 		return pos;
1888 	}
1889 }
1890 
/* Codepoint index of `pos` relative to `start`: the number of codepoints in the UTF-8
 * bytes from `start` up to (not including) `pos`, as counted by mb_fast_strlen_utf8 */
static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) {
	size_t byte_distance = pos - start;
	return mb_fast_strlen_utf8(start, byte_distance);
}
1894 
/* Locate `needle` inside `haystack` and report the match position as a codepoint index.
 *
 * Unless `enc` is one of the UTF-8 encodings, both strings are first converted to UTF-8
 * (into temporary strings which are released before returning). The search itself is a
 * byte-wise search on the UTF-8 text; offsets are translated between codepoints and
 * byte pointers before and after.
 *
 * `offset` is counted in codepoints; a negative value counts back from the end of the
 * haystack. `reverse` selects a search for the last occurrence instead of the first.
 *
 * Returns the codepoint index of the match, MBFL_ERROR_OFFSET if `offset` does not fall
 * within the haystack, or MBFL_ERROR_NOT_FOUND if there is no match. */
static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse)
{
	size_t result;
	zend_string *haystack_u8 = NULL, *needle_u8 = NULL;
	unsigned char *offset_pointer;

	if (!php_mb_is_no_encoding_utf8(enc->no_encoding)) {
		unsigned int num_errors = 0;
		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
	} else {
		haystack_u8 = haystack;
		needle_u8 = needle;
	}

	unsigned char *haystack_start = (unsigned char*)ZSTR_VAL(haystack_u8);
	unsigned char *haystack_end = haystack_start + ZSTR_LEN(haystack_u8);

	offset_pointer = offset_to_pointer_utf8(haystack_start, haystack_end, offset);
	if (!offset_pointer) {
		result = MBFL_ERROR_OFFSET;
		goto out;
	}

	result = MBFL_ERROR_NOT_FOUND;
	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
		goto out;
	}

	const char *found_pos;
	if (!reverse) {
		/* First occurrence at or after the offset */
		found_pos = zend_memnstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)haystack_end);
	} else if (offset >= 0) {
		/* Last occurrence which starts at or after the offset */
		found_pos = zend_memnrstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)haystack_end);
	} else {
		/* Last occurrence which starts at or before the (negative) offset: the search
		 * window therefore ends one needle-length (in characters) after the offset.
		 * The needle's character count must come from its UTF-8 form, since the count is
		 * done by a UTF-8 routine and is applied to the UTF-8 haystack; counting on the
		 * unconverted needle gives a wrong window for any non-UTF-8 encoding. */
		size_t needle_len = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(needle_u8), (unsigned char*)ZSTR_VAL(needle_u8) + ZSTR_LEN(needle_u8));
		offset_pointer = offset_to_pointer_utf8(offset_pointer, haystack_end, needle_len);
		if (!offset_pointer) {
			/* Fewer than needle_len characters follow the offset; search to the end */
			offset_pointer = haystack_end;
		}

		found_pos = zend_memnrstr((const char*)haystack_start, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)offset_pointer);
	}

	if (found_pos) {
		result = pointer_to_offset_utf8(haystack_start, (unsigned char*)found_pos);
	}

out:
	/* Release the temporary UTF-8 copies, if any were made */
	if (haystack_u8 != haystack) {
		zend_string_free(haystack_u8);
	}
	if (needle_u8 != needle) {
		zend_string_free(needle_u8);
	}
	return result;
}
1949 
handle_strpos_error(size_t error)1950 static void handle_strpos_error(size_t error) {
1951 	switch (error) {
1952 	case MBFL_ERROR_NOT_FOUND:
1953 		break;
1954 	case MBFL_ERROR_ENCODING:
1955 		php_error_docref(NULL, E_WARNING, "Conversion error");
1956 		break;
1957 	case MBFL_ERROR_OFFSET:
1958 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1959 		break;
1960 	default:
1961 		zend_value_error("mb_strpos(): Unknown error");
1962 		break;
1963 	}
1964 }
1965 
PHP_FUNCTION(mb_strpos)1966 PHP_FUNCTION(mb_strpos)
1967 {
1968 	zend_long offset = 0;
1969 	zend_string *needle, *haystack;
1970 	zend_string *enc_name = NULL;
1971 
1972 	ZEND_PARSE_PARAMETERS_START(2, 4)
1973 		Z_PARAM_STR(haystack)
1974 		Z_PARAM_STR(needle)
1975 		Z_PARAM_OPTIONAL
1976 		Z_PARAM_LONG(offset)
1977 		Z_PARAM_STR_OR_NULL(enc_name)
1978 	ZEND_PARSE_PARAMETERS_END();
1979 
1980 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
1981 	if (!enc) {
1982 		RETURN_THROWS();
1983 	}
1984 
1985 	size_t n = mb_find_strpos(haystack, needle, enc, offset, false);
1986 	if (!mbfl_is_error(n)) {
1987 		RETVAL_LONG(n);
1988 	} else {
1989 		handle_strpos_error(n);
1990 		RETVAL_FALSE;
1991 	}
1992 }
1993 
1994 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)1995 PHP_FUNCTION(mb_strrpos)
1996 {
1997 	zend_long offset = 0;
1998 	zend_string *needle, *haystack;
1999 	zend_string *enc_name = NULL;
2000 
2001 	ZEND_PARSE_PARAMETERS_START(2, 4)
2002 		Z_PARAM_STR(haystack)
2003 		Z_PARAM_STR(needle)
2004 		Z_PARAM_OPTIONAL
2005 		Z_PARAM_LONG(offset)
2006 		Z_PARAM_STR_OR_NULL(enc_name)
2007 	ZEND_PARSE_PARAMETERS_END();
2008 
2009 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
2010 	if (!enc) {
2011 		RETURN_THROWS();
2012 	}
2013 
2014 	size_t n = mb_find_strpos(haystack, needle, enc, offset, true);
2015 	if (!mbfl_is_error(n)) {
2016 		RETVAL_LONG(n);
2017 	} else {
2018 		handle_strpos_error(n);
2019 		RETVAL_FALSE;
2020 	}
2021 }
2022 /* }}} */
2023 
2024 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)2025 PHP_FUNCTION(mb_stripos)
2026 {
2027 	zend_long offset = 0;
2028 	zend_string *haystack, *needle;
2029 	zend_string *from_encoding = NULL;
2030 
2031 	ZEND_PARSE_PARAMETERS_START(2, 4)
2032 		Z_PARAM_STR(haystack)
2033 		Z_PARAM_STR(needle)
2034 		Z_PARAM_OPTIONAL
2035 		Z_PARAM_LONG(offset)
2036 		Z_PARAM_STR_OR_NULL(from_encoding)
2037 	ZEND_PARSE_PARAMETERS_END();
2038 
2039 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2040 	if (!enc) {
2041 		RETURN_THROWS();
2042 	}
2043 
2044 	size_t n = php_mb_stripos(false, haystack, needle, offset, enc);
2045 
2046 	if (!mbfl_is_error(n)) {
2047 		RETVAL_LONG(n);
2048 	} else {
2049 		handle_strpos_error(n);
2050 		RETVAL_FALSE;
2051 	}
2052 }
2053 /* }}} */
2054 
2055 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)2056 PHP_FUNCTION(mb_strripos)
2057 {
2058 	zend_long offset = 0;
2059 	zend_string *haystack, *needle;
2060 	zend_string *from_encoding = NULL;
2061 
2062 	ZEND_PARSE_PARAMETERS_START(2, 4)
2063 		Z_PARAM_STR(haystack)
2064 		Z_PARAM_STR(needle)
2065 		Z_PARAM_OPTIONAL
2066 		Z_PARAM_LONG(offset)
2067 		Z_PARAM_STR_OR_NULL(from_encoding)
2068 	ZEND_PARSE_PARAMETERS_END();
2069 
2070 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2071 	if (!enc) {
2072 		RETURN_THROWS();
2073 	}
2074 
2075 	size_t n = php_mb_stripos(true, haystack, needle, offset, enc);
2076 
2077 	if (!mbfl_is_error(n)) {
2078 		RETVAL_LONG(n);
2079 	} else {
2080 		handle_strpos_error(n);
2081 		RETVAL_FALSE;
2082 	}
2083 }
2084 /* }}} */
2085 
mb_get_substr_slow(unsigned char * in,size_t in_len,size_t from,size_t len,const mbfl_encoding * enc)2086 static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t from, size_t len, const mbfl_encoding *enc)
2087 {
2088 	uint32_t wchar_buf[128];
2089 	unsigned int state = 0;
2090 
2091 	mb_convert_buf buf;
2092 	mb_convert_buf_init(&buf, MIN(len, in_len - from), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2093 
2094 	while (in_len && len) {
2095 		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2096 		ZEND_ASSERT(out_len <= 128);
2097 
2098 		if (from >= out_len) {
2099 			from -= out_len;
2100 		} else {
2101 			size_t needed_codepoints = MIN(out_len - from, len);
2102 			enc->from_wchar(wchar_buf + from, needed_codepoints, &buf, !in_len || out_len >= len);
2103 			from = 0;
2104 			len -= needed_codepoints;
2105 		}
2106 	}
2107 
2108 	return mb_convert_buf_result(&buf, enc);
2109 }
2110 
/* Return up to `len` codepoints of `input`, starting `from` codepoints into the string.
 * Fixed-width encodings are handled by slicing bytes directly; all other encodings go
 * through mb_get_substr_slow. */
static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
		/* Other than MacJapanese, no supported text encoding decodes to
		 * more than one codepoint per byte
		 * So if the number of codepoints to skip >= number of input bytes,
		 * then definitely the output should be empty */
		return zend_empty_string;
	}

	/* Does each codepoint have a fixed byte width? */
	unsigned int flag = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (flag) {
		/* The value of the flag is 2 if each codepoint takes 2 bytes, or 4 if 4 bytes.
		 * Converting codepoint counts to byte counts means multiplying by it; both
		 * multiplications are guarded, because a wrapped-around product would select
		 * the wrong bytes (e.g. a huge `len` wrapping to 0 gave an empty result). */
		if (from > in_len / flag) {
			/* from * flag would certainly be beyond the end of the input */
			return zend_empty_string;
		}
		from *= flag;
		if (from >= in_len) {
			return zend_empty_string;
		}
		in += from;
		in_len -= from;

		if (len > in_len / flag) {
			/* More was requested than is left: take everything which remains */
			len = in_len;
		} else {
			len *= flag;
		}
		return zend_string_init_fast((const char*)in, len);
	}

	return mb_get_substr_slow(in, in_len, from, len, enc);
}
2143 
2144 #define MB_STRSTR 1
2145 #define MB_STRRCHR 2
2146 #define MB_STRISTR 3
2147 #define MB_STRRICHR 4
2148 
/* Shared implementation of mb_strstr, mb_strrchr, mb_stristr and mb_strrichr.
 * `variant` is one of the MB_STR* constants; it decides whether the search runs in reverse
 * and whether it ignores case.
 * Depending on the boolean third argument, the return value is the part of the haystack
 * before the match, or the part from the match onward; false if the search fails. */
static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
{
	zend_string *haystack, *needle;
	zend_string *encoding_name = NULL;
	bool part = false;

	ZEND_PARSE_PARAMETERS_START(2, 4)
		Z_PARAM_STR(haystack)
		Z_PARAM_STR(needle)
		Z_PARAM_OPTIONAL
		Z_PARAM_BOOL(part)
		Z_PARAM_STR_OR_NULL(encoding_name)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding_name, 4);
	if (!enc) {
		RETURN_THROWS();
	}

	/* The four variants are the combinations of two independent switches */
	bool reverse_mode = (variant == MB_STRRCHR || variant == MB_STRRICHR);
	bool ignore_case = (variant == MB_STRISTR || variant == MB_STRRICHR);

	size_t n;
	if (ignore_case) {
		n = php_mb_stripos(reverse_mode, haystack, needle, 0, enc);
	} else {
		n = mb_find_strpos(haystack, needle, enc, 0, reverse_mode);
	}

	if (mbfl_is_error(n)) {
		// FIXME use handle_strpos_error(n)
		RETVAL_FALSE;
		return;
	}

	if (part) {
		/* Everything before the match */
		RETVAL_STR(mb_get_substr(haystack, 0, n, enc));
	} else {
		/* The match and everything after it */
		RETVAL_STR(mb_get_substr(haystack, n, MBFL_SUBSTR_UNTIL_END, enc));
	}
}
2190 
2191 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(mb_strstr)2192 PHP_FUNCTION(mb_strstr)
2193 {
2194 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
2195 }
2196 /* }}} */
2197 
2198 /* {{{ Finds the last occurrence of a character in a string within another */
PHP_FUNCTION(mb_strrchr)2199 PHP_FUNCTION(mb_strrchr)
2200 {
2201 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
2202 }
2203 /* }}} */
2204 
2205 /* {{{ Finds first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stristr)2206 PHP_FUNCTION(mb_stristr)
2207 {
2208 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
2209 }
2210 /* }}} */
2211 
2212 /* {{{ Finds the last occurrence of a character in a string within another, case insensitive */
PHP_FUNCTION(mb_strrichr)2213 PHP_FUNCTION(mb_strrichr)
2214 {
2215 	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
2216 }
2217 /* }}} */
2218 
2219 #undef MB_STRSTR
2220 #undef MB_STRRCHR
2221 #undef MB_STRISTR
2222 #undef MB_STRRICHR
2223 
PHP_FUNCTION(mb_substr_count)2224 PHP_FUNCTION(mb_substr_count)
2225 {
2226 	zend_string *haystack, *needle, *enc_name = NULL, *haystack_u8 = NULL, *needle_u8 = NULL;
2227 
2228 	ZEND_PARSE_PARAMETERS_START(2, 3)
2229 		Z_PARAM_STR(haystack)
2230 		Z_PARAM_STR(needle)
2231 		Z_PARAM_OPTIONAL
2232 		Z_PARAM_STR_OR_NULL(enc_name)
2233 	ZEND_PARSE_PARAMETERS_END();
2234 
2235 	if (ZSTR_LEN(needle) == 0) {
2236 		zend_argument_value_error(2, "must not be empty");
2237 		RETURN_THROWS();
2238 	}
2239 
2240 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
2241 	if (!enc) {
2242 		RETURN_THROWS();
2243 	}
2244 
2245 	if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
2246 		/* No need to do any conversion if haystack/needle are already known-valid UTF-8
2247 		 * (If they are not valid, then not passing them through conversion filters could affect output) */
2248 		if (GC_FLAGS(haystack) & IS_STR_VALID_UTF8) {
2249 			haystack_u8 = haystack;
2250 		} else {
2251 			unsigned int num_errors = 0;
2252 			haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2253 			if (!num_errors && !ZSTR_IS_INTERNED(haystack)) {
2254 				GC_ADD_FLAGS(haystack, IS_STR_VALID_UTF8);
2255 			}
2256 		}
2257 
2258 		if (GC_FLAGS(needle) & IS_STR_VALID_UTF8) {
2259 			needle_u8 = needle;
2260 		} else {
2261 			unsigned int num_errors = 0;
2262 			needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2263 			if (!num_errors && !ZSTR_IS_INTERNED(needle)) {
2264 				GC_ADD_FLAGS(needle, IS_STR_VALID_UTF8);
2265 			}
2266 		}
2267 	} else {
2268 		unsigned int num_errors = 0;
2269 		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2270 		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2271 		/* A string with >0 bytes may convert to 0 codepoints; for example, the contents
2272 		 * may be only escape sequences */
2273 		if (ZSTR_LEN(needle_u8) == 0) {
2274 			zend_string_free(haystack_u8);
2275 			zend_string_free(needle_u8);
2276 			zend_argument_value_error(2, "must not be empty");
2277 			RETURN_THROWS();
2278 		}
2279 	}
2280 
2281 	size_t result = 0;
2282 
2283 	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
2284 		goto out;
2285 	}
2286 
2287 	const char *p = ZSTR_VAL(haystack_u8), *e = p + ZSTR_LEN(haystack_u8);
2288 	while (true) {
2289 		p = zend_memnstr(p, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), e);
2290 		if (!p) {
2291 			break;
2292 		}
2293 		p += ZSTR_LEN(needle_u8);
2294 		result++;
2295 	}
2296 
2297 out:
2298 	if (haystack_u8 != haystack) {
2299 		zend_string_free(haystack_u8);
2300 	}
2301 	if (needle_u8 != needle) {
2302 		zend_string_free(needle_u8);
2303 	}
2304 
2305 	RETVAL_LONG(result);
2306 }
2307 
2308 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2309 PHP_FUNCTION(mb_substr)
2310 {
2311 	zend_string *str, *encoding = NULL;
2312 	zend_long from, len;
2313 	size_t real_from, real_len;
2314 	bool len_is_null = true;
2315 
2316 	ZEND_PARSE_PARAMETERS_START(2, 4)
2317 		Z_PARAM_STR(str)
2318 		Z_PARAM_LONG(from)
2319 		Z_PARAM_OPTIONAL
2320 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2321 		Z_PARAM_STR_OR_NULL(encoding)
2322 	ZEND_PARSE_PARAMETERS_END();
2323 
2324 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2325 	if (!enc) {
2326 		RETURN_THROWS();
2327 	}
2328 
2329 	size_t mblen = 0;
2330 	if (from < 0 || (!len_is_null && len < 0)) {
2331 		mblen = mb_get_strlen(str, enc);
2332 	}
2333 
2334 	/* if "from" position is negative, count start position from the end
2335 	 * of the string */
2336 	if (from >= 0) {
2337 		real_from = (size_t) from;
2338 	} else if (-from < mblen) {
2339 		real_from = mblen + from;
2340 	} else {
2341 		real_from = 0;
2342 	}
2343 
2344 	/* if "length" position is negative, set it to the length
2345 	 * needed to stop that many chars from the end of the string */
2346 	if (len_is_null) {
2347 		real_len = MBFL_SUBSTR_UNTIL_END;
2348 	} else if (len >= 0) {
2349 		real_len = (size_t) len;
2350 	} else if (real_from < mblen && -len < mblen - real_from) {
2351 		real_len = (mblen - real_from) + len;
2352 	} else {
2353 		real_len = 0;
2354 	}
2355 
2356 	RETVAL_STR(mb_get_substr(str, real_from, real_len, enc));
2357 }
2358 /* }}} */
2359 
2360 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2361 PHP_FUNCTION(mb_strcut)
2362 {
2363 	zend_string *encoding = NULL;
2364 	char *string_val;
2365 	zend_long from, len;
2366 	bool len_is_null = 1;
2367 	mbfl_string string, result, *ret;
2368 
2369 	ZEND_PARSE_PARAMETERS_START(2, 4)
2370 		Z_PARAM_STRING(string_val, string.len)
2371 		Z_PARAM_LONG(from)
2372 		Z_PARAM_OPTIONAL
2373 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2374 		Z_PARAM_STR_OR_NULL(encoding)
2375 	ZEND_PARSE_PARAMETERS_END();
2376 
2377 	string.val = (unsigned char*)string_val;
2378 	string.encoding = php_mb_get_encoding(encoding, 4);
2379 	if (!string.encoding) {
2380 		RETURN_THROWS();
2381 	}
2382 
2383 	if (len_is_null) {
2384 		len = string.len;
2385 	}
2386 
2387 	/* if "from" position is negative, count start position from the end
2388 	 * of the string
2389 	 */
2390 	if (from < 0) {
2391 		from = string.len + from;
2392 		if (from < 0) {
2393 			from = 0;
2394 		}
2395 	}
2396 
2397 	/* if "length" position is negative, set it to the length
2398 	 * needed to stop that many chars from the end of the string
2399 	 */
2400 	if (len < 0) {
2401 		len = (string.len - from) + len;
2402 		if (len < 0) {
2403 			len = 0;
2404 		}
2405 	}
2406 
2407 	if (from > string.len) {
2408 		RETURN_EMPTY_STRING();
2409 	}
2410 
2411 	ret = mbfl_strcut(&string, &result, from, len);
2412 	ZEND_ASSERT(ret != NULL);
2413 
2414 	// TODO: avoid reallocation ???
2415 	RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2416 	efree(ret->val);
2417 }
2418 /* }}} */
2419 
2420 /* Some East Asian characters, when printed at a terminal (or the like), require double
2421  * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2422 static size_t character_width(uint32_t c)
2423 {
2424 	if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2425 		return 1;
2426 	}
2427 
2428 	/* Do a binary search to see if we fall in any of the fullwidth ranges */
2429 	int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2430 	while (lo < hi) {
2431 		int probe = (lo + hi) / 2;
2432 		if (c < mbfl_eaw_table[probe].begin) {
2433 			hi = probe;
2434 		} else if (c > mbfl_eaw_table[probe].end) {
2435 			lo = probe + 1;
2436 		} else {
2437 			return 2;
2438 		}
2439 	}
2440 
2441 	return 1;
2442 }
2443 
/* Total display width of `string` (in encoding `enc`): the sum of character_width over
 * all codepoints which the string decodes to */
static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
{
	uint32_t codepoints[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total_width = 0;

	while (bytes_left) {
		size_t n_codepoints = enc->to_wchar(&cursor, &bytes_left, codepoints, 128, &state);
		ZEND_ASSERT(n_codepoints <= 128);

		/* NOTE: 'bad input' marker will be counted as 1 unit of width
		 * If text conversion is performed with an ordinary ASCII character as
		 * the 'replacement character', this will give us the correct display width. */
		for (size_t i = 0; i < n_codepoints; i++) {
			total_width += character_width(codepoints[i]);
		}
	}

	return total_width;
}
2466 
2467 /* Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2468 PHP_FUNCTION(mb_strwidth)
2469 {
2470 	zend_string *string, *enc_name = NULL;
2471 
2472 	ZEND_PARSE_PARAMETERS_START(1, 2)
2473 		Z_PARAM_STR(string)
2474 		Z_PARAM_OPTIONAL
2475 		Z_PARAM_STR_OR_NULL(enc_name)
2476 	ZEND_PARSE_PARAMETERS_END();
2477 
2478 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2479 	if (!enc) {
2480 		RETURN_THROWS();
2481 	}
2482 
2483 	RETVAL_LONG(mb_get_strwidth(string, enc));
2484 }
2485 
/* Trim `input` so that the text starting `from` codepoints in takes up no more than
 * `width` terminal columns; if something had to be cut off, `marker` is appended and its
 * own width is counted against `width`.
 *
 * Works in up to two passes over the input:
 *  1. Decode and add up character widths, to find out whether trimming is needed at all.
 *     If not, the result is the input (or its tail from `from` onward).
 *  2. If trimming is needed, decode again from the start and re-encode just as many
 *     codepoints as fit into `width` minus the marker's width. When the overflow is
 *     noticed within the very first batch of decoded codepoints, that batch is reused
 *     and the input is not decoded a second time. */
static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, unsigned int from, int width)
{
	uint32_t codepoints[128];
	unsigned char *src = (unsigned char*)ZSTR_VAL(input);
	size_t src_len = ZSTR_LEN(input);
	unsigned int state = 0;
	size_t n_codepoints = 0;
	mb_convert_buf buf;

	/* Bookkeeping for the first (measuring) pass */
	int budget = width;
	unsigned int skip = from;
	bool first_batch = true;
	bool saw_bad_input = false;

	while (src_len) {
		n_codepoints = enc->to_wchar(&src, &src_len, codepoints, 128, &state);
		ZEND_ASSERT(n_codepoints <= 128);

		if (n_codepoints > skip) {
			for (int i = skip; i < n_codepoints; i++) {
				uint32_t cp = codepoints[i];
				if (cp == MBFL_BAD_INPUT) {
					saw_bad_input = true;
				}
				budget -= character_width(cp);
				if (budget >= 0) {
					continue;
				}

				/* We need to truncate string and append trim marker */
				width -= mb_get_strwidth(marker, enc);
				/* 'width' is now the amount we want to take from 'input' */
				if (width <= 0) {
					return zend_string_copy(marker);
				}
				mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

				if (first_batch) {
					/* The codepoints which we have right now are the beginning of the
					 * input; they can be used directly, no need to convert again */
					goto dont_restart_conversion;
				}
				goto restart_conversion;
			}
			skip = 0;
		} else {
			/* This entire batch lies before the starting position */
			skip -= n_codepoints;
		}
		first_batch = false;
	}

	/* The input string fits in the requested width; we don't need to append the trim marker
	 * However, if the string contains erroneous byte sequences, those should be converted
	 * to error markers */
	if (saw_bad_input) {
		/* We can't use `mb_get_substr`, because it uses the fastest method possible of
		 * picking out a substring, which may not include converting erroneous byte
		 * sequences to error markers */
		return mb_get_substr_slow((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), from, MBFL_SUBSTR_UNTIL_END, enc);
	}
	if (from == 0) {
		/* This just increments the string's refcount; it doesn't really 'copy' it */
		return zend_string_copy(input);
	}
	return mb_get_substr(input, from, MBFL_SUBSTR_UNTIL_END, enc);

	/* The input string is too wide; we need to build a new string which
	 * includes some portion of the input string, with the trim marker
	 * concatenated onto it */
restart_conversion:
	src = (unsigned char*)ZSTR_VAL(input);
	src_len = ZSTR_LEN(input);
	state = 0;

	while (true) {
		n_codepoints = enc->to_wchar(&src, &src_len, codepoints, 128, &state);
		ZEND_ASSERT(n_codepoints <= 128);

dont_restart_conversion:
		if (n_codepoints > from) {
			for (int i = from; i < n_codepoints; i++) {
				width -= character_width(codepoints[i]);
				if (width < 0) {
					/* Codepoint `i` no longer fits; output everything before it */
					enc->from_wchar(codepoints + from, i - from, &buf, true);
					goto append_trim_marker;
				}
			}
			/* The first pass found that the text is too wide, so the cut-off point
			 * must still lie ahead of us */
			ZEND_ASSERT(src_len > 0);
			enc->from_wchar(codepoints + from, n_codepoints - from, &buf, false);
			from = 0;
		} else {
			from -= n_codepoints;
		}
	}

append_trim_marker:
	if (ZSTR_LEN(marker) > 0) {
		MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
		memcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
		buf.out += ZSTR_LEN(marker);
	}

	/* Even if `enc` is UTF-8, don't mark the output string as valid UTF-8, because
	 * we have no guarantee that the trim marker string is valid UTF-8 */
	return mb_convert_buf_result_raw(&buf);
}
2589 
2590 /* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
PHP_FUNCTION(mb_strimwidth)2591 PHP_FUNCTION(mb_strimwidth)
2592 {
2593 	zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2594 	zend_long from, width;
2595 
2596 	ZEND_PARSE_PARAMETERS_START(3, 5)
2597 		Z_PARAM_STR(str)
2598 		Z_PARAM_LONG(from)
2599 		Z_PARAM_LONG(width)
2600 		Z_PARAM_OPTIONAL
2601 		Z_PARAM_STR(trimmarker)
2602 		Z_PARAM_STR_OR_NULL(encoding)
2603 	ZEND_PARSE_PARAMETERS_END();
2604 
2605 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2606 	if (!enc) {
2607 		RETURN_THROWS();
2608 	}
2609 
2610 	if (from != 0) {
2611 		size_t str_len = mb_get_strlen(str, enc);
2612 		if (from < 0) {
2613 			from += str_len;
2614 		}
2615 		if (from < 0 || from > str_len) {
2616 			zend_argument_value_error(2, "is out of range");
2617 			RETURN_THROWS();
2618 		}
2619 	}
2620 
2621 	if (width < 0) {
2622 		php_error_docref(NULL, E_DEPRECATED,
2623 			"passing a negative integer to argument #3 ($width) is deprecated");
2624 		width += mb_get_strwidth(str, enc);
2625 
2626 		if (from > 0) {
2627 			zend_string *trimmed = mb_get_substr(str, 0, from, enc);
2628 			width -= mb_get_strwidth(trimmed, enc);
2629 			zend_string_free(trimmed);
2630 		}
2631 
2632 		if (width < 0) {
2633 			zend_argument_value_error(3, "is out of range");
2634 			RETURN_THROWS();
2635 		}
2636 	}
2637 
2638 	RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2639 }
2640 
2641 
2642 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2643 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2644 {
2645 	return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2646 			|| (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2647 			|| (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2648 			|| (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2649 }
2650 
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2651 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2652 {
2653 	unsigned int num_errors = 0;
2654 	zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2655 	MBSTRG(illegalchars) += num_errors;
2656 	return result;
2657 }
2658 
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2659 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2660 {
2661 	const mbfl_encoding *from_encoding;
2662 
2663 	/* pre-conversion encoding */
2664 	ZEND_ASSERT(num_from_encodings >= 1);
2665 	if (num_from_encodings == 1) {
2666 		from_encoding = *from_encodings;
2667 	} else {
2668 		/* auto detect */
2669 		from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true);
2670 		if (!from_encoding) {
2671 			php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2672 			return NULL;
2673 		}
2674 	}
2675 
2676 	return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2677 }
2678 
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2679 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2680 {
2681 	HashTable *output, *chash;
2682 	zend_long idx;
2683 	zend_string *key;
2684 	zval *entry, entry_tmp;
2685 
2686 	if (!input) {
2687 		return NULL;
2688 	}
2689 
2690 	if (GC_IS_RECURSIVE(input)) {
2691 		GC_UNPROTECT_RECURSION(input);
2692 		php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2693 		return NULL;
2694 	}
2695 	GC_TRY_PROTECT_RECURSION(input);
2696 	output = zend_new_array(zend_hash_num_elements(input));
2697 	ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2698 		/* convert key */
2699 		if (key) {
2700 			zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2701 			if (!converted_key) {
2702 				continue;
2703 			}
2704 			key = converted_key;
2705 		}
2706 		/* convert value */
2707 		ZEND_ASSERT(entry);
2708 try_again:
2709 		switch(Z_TYPE_P(entry)) {
2710 			case IS_STRING: {
2711 				zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2712 				if (!converted_key) {
2713 					if (key) {
2714 						zend_string_release(key);
2715 					}
2716 					continue;
2717 				}
2718 				ZVAL_STR(&entry_tmp, converted_key);
2719 				break;
2720 			}
2721 			case IS_NULL:
2722 			case IS_TRUE:
2723 			case IS_FALSE:
2724 			case IS_LONG:
2725 			case IS_DOUBLE:
2726 				ZVAL_COPY(&entry_tmp, entry);
2727 				break;
2728 			case IS_ARRAY:
2729 				chash = php_mb_convert_encoding_recursive(
2730 					Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2731 				if (chash) {
2732 					ZVAL_ARR(&entry_tmp, chash);
2733 				} else {
2734 					ZVAL_EMPTY_ARRAY(&entry_tmp);
2735 				}
2736 				break;
2737 			case IS_REFERENCE:
2738 				entry = Z_REFVAL_P(entry);
2739 				goto try_again;
2740 			case IS_OBJECT:
2741 			default:
2742 				if (key) {
2743 					zend_string_release(key);
2744 				}
2745 				php_error_docref(NULL, E_WARNING, "Object is not supported");
2746 				continue;
2747 		}
2748 		if (key) {
2749 			zend_hash_add(output, key, &entry_tmp);
2750 			zend_string_release(key);
2751 		} else {
2752 			zend_hash_index_add(output, idx, &entry_tmp);
2753 		}
2754 	} ZEND_HASH_FOREACH_END();
2755 	GC_TRY_UNPROTECT_RECURSION(input);
2756 
2757 	return output;
2758 }
2759 /* }}} */
2760 
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2761 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2762 {
2763 	/* mbstring supports some 'text encodings' which aren't really text encodings
2764 	 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2765 	 * These should never be returned by `mb_detect_encoding`. */
2766 	int shift = 0;
2767 	for (int i = 0; i < *size; i++) {
2768 		const mbfl_encoding *encoding = elist[i];
2769 		if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2770 			shift++; /* Remove this encoding from the list */
2771 		} else if (shift) {
2772 			elist[i - shift] = encoding;
2773 		}
2774 	}
2775 	*size -= shift;
2776 }
2777 
2778 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2779 PHP_FUNCTION(mb_convert_encoding)
2780 {
2781 	zend_string *to_encoding_name;
2782 	zend_string *input_str, *from_encodings_str = NULL;
2783 	HashTable *input_ht, *from_encodings_ht = NULL;
2784 	const mbfl_encoding **from_encodings;
2785 	size_t num_from_encodings;
2786 	bool free_from_encodings = false;
2787 
2788 	ZEND_PARSE_PARAMETERS_START(2, 3)
2789 		Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2790 		Z_PARAM_STR(to_encoding_name)
2791 		Z_PARAM_OPTIONAL
2792 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2793 	ZEND_PARSE_PARAMETERS_END();
2794 
2795 	const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2796 	if (!to_encoding) {
2797 		RETURN_THROWS();
2798 	}
2799 
2800 	if (from_encodings_ht) {
2801 		if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2802 			RETURN_THROWS();
2803 		}
2804 		free_from_encodings = true;
2805 	} else if (from_encodings_str) {
2806 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2807 				&from_encodings, &num_from_encodings,
2808 				/* persistent */ 0, /* arg_num */ 3) == FAILURE) {
2809 			RETURN_THROWS();
2810 		}
2811 		free_from_encodings = true;
2812 	} else {
2813 		from_encodings = &MBSTRG(current_internal_encoding);
2814 		num_from_encodings = 1;
2815 	}
2816 
2817 	if (num_from_encodings > 1) {
2818 		remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2819 	}
2820 
2821 	if (!num_from_encodings) {
2822 		efree(ZEND_VOIDP(from_encodings));
2823 		zend_argument_value_error(3, "must specify at least one encoding");
2824 		RETURN_THROWS();
2825 	}
2826 
2827 	if (input_str) {
2828 		zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2829 		if (ret != NULL) {
2830 			RETVAL_STR(ret);
2831 		} else {
2832 			RETVAL_FALSE;
2833 		}
2834 	} else {
2835 		HashTable *tmp;
2836 		tmp = php_mb_convert_encoding_recursive(
2837 			input_ht, to_encoding, from_encodings, num_from_encodings);
2838 		RETVAL_ARR(tmp);
2839 	}
2840 
2841 	if (free_from_encodings) {
2842 		efree(ZEND_VOIDP(from_encodings));
2843 	}
2844 }
2845 /* }}} */
2846 
mbstring_convert_case(php_case_mode case_mode,const char * str,size_t str_len,const mbfl_encoding * enc)2847 static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
2848 {
2849 	return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2850 }
2851 
PHP_FUNCTION(mb_convert_case)2852 PHP_FUNCTION(mb_convert_case)
2853 {
2854 	zend_string *str, *from_encoding = NULL;
2855 	zend_long case_mode = 0;
2856 
2857 	ZEND_PARSE_PARAMETERS_START(2, 3)
2858 		Z_PARAM_STR(str)
2859 		Z_PARAM_LONG(case_mode)
2860 		Z_PARAM_OPTIONAL
2861 		Z_PARAM_STR_OR_NULL(from_encoding)
2862 	ZEND_PARSE_PARAMETERS_END();
2863 
2864 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2865 	if (!enc) {
2866 		RETURN_THROWS();
2867 	}
2868 
2869 	if (case_mode < 0 || case_mode >= PHP_UNICODE_CASE_MODE_MAX) {
2870 		zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2871 		RETURN_THROWS();
2872 	}
2873 
2874 	RETURN_STR(mbstring_convert_case(case_mode, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2875 }
2876 
PHP_FUNCTION(mb_strtoupper)2877 PHP_FUNCTION(mb_strtoupper)
2878 {
2879 	zend_string *str, *from_encoding = NULL;
2880 
2881 	ZEND_PARSE_PARAMETERS_START(1, 2)
2882 		Z_PARAM_STR(str)
2883 		Z_PARAM_OPTIONAL
2884 		Z_PARAM_STR_OR_NULL(from_encoding)
2885 	ZEND_PARSE_PARAMETERS_END();
2886 
2887 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2888 	if (!enc) {
2889 		RETURN_THROWS();
2890 	}
2891 
2892 	RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_UPPER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2893 }
2894 
PHP_FUNCTION(mb_strtolower)2895 PHP_FUNCTION(mb_strtolower)
2896 {
2897 	zend_string *str, *from_encoding = NULL;
2898 
2899 	ZEND_PARSE_PARAMETERS_START(1, 2)
2900 		Z_PARAM_STR(str)
2901 		Z_PARAM_OPTIONAL
2902 		Z_PARAM_STR_OR_NULL(from_encoding)
2903 	ZEND_PARSE_PARAMETERS_END();
2904 
2905 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2906 	if (!enc) {
2907 		RETURN_THROWS();
2908 	}
2909 
2910 	RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2911 }
2912 
duplicate_elist(const mbfl_encoding ** elist,size_t size)2913 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
2914 {
2915 	const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
2916 	memcpy(ZEND_VOIDP(new_elist), elist, size * sizeof(mbfl_encoding*));
2917 	return new_elist;
2918 }
2919 
estimate_demerits(uint32_t w)2920 static unsigned int estimate_demerits(uint32_t w)
2921 {
2922 	/* Receive wchars decoded from input string using candidate encoding.
2923 	 * Give the candidate many 'demerits' for each 'rare' codepoint found,
2924 	 * a smaller number for each ASCII punctuation character, and 1 for
2925 	 * all other codepoints.
2926 	 *
2927 	 * The 'common' codepoints should cover the vast majority of
2928 	 * codepoints we are likely to see in practice, while only covering
2929 	 * a small minority of the entire Unicode encoding space. Why?
2930 	 * Well, if the test string happens to be valid in an incorrect
2931 	 * candidate encoding, the bogus codepoints which it decodes to will
2932 	 * be more or less random. By treating the majority of codepoints as
2933 	 * 'rare', we ensure that in almost all such cases, the bogus
2934 	 * codepoints will include plenty of 'rares', thus giving the
2935 	 * incorrect candidate encoding lots of demerits. See
2936 	 * common_codepoints.txt for the actual list used.
2937 	 *
2938 	 * So, why give extra demerits for ASCII punctuation characters? It's
2939 	 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
2940 	 * which deliberately only use bytes in the ASCII range. When
2941 	 * misinterpreted as ASCII/UTF-8, strings in these encodings will
2942 	 * have an unusually high number of ASCII punctuation characters.
2943 	 * So giving extra demerits for such characters will improve
2944 	 * detection accuracy for UTF-7 and similar encodings.
2945 	 *
2946 	 * Finally, why 1 demerit for all other characters? That penalizes
2947 	 * long strings, meaning we will tend to choose a candidate encoding
2948 	 * in which the test string decodes to a smaller number of
2949 	 * codepoints. That prevents single-byte encodings in which almost
2950 	 * every possible input byte decodes to a 'common' codepoint from
2951 	 * being favored too much. */
2952 	if (w > 0xFFFF) {
2953 		return 40;
2954 	} else if (w >= 0x21 && w <= 0x2F) {
2955 		return 6;
2956 	} else if ((rare_codepoint_bitvec[w >> 5] >> (w & 0x1F)) & 1) {
2957 		return 30;
2958 	} else {
2959 		return 1;
2960 	}
2961 	return 0;
2962 }
2963 
/* Per-encoding bookkeeping used while guessing the encoding of one or more
 * strings; one of these exists for each candidate encoding still under
 * consideration */
struct candidate {
	const mbfl_encoding *enc;  /* The candidate encoding itself */
	const unsigned char *in;   /* Input which has not been decoded yet; advanced by enc->to_wchar */
	size_t in_len;             /* Number of bytes remaining at `in`; zero when all input is processed */
	uint64_t demerits; /* Wide bit size to prevent overflow */
	unsigned int state;        /* Decoder state which is passed to enc->to_wchar on each call */
	float multiplier;          /* Weight factor by which `demerits` is multiplied in count_demerits */
};
2972 
init_candidate_array(struct candidate * array,size_t length,const mbfl_encoding ** encodings,const unsigned char ** in,size_t * in_len,size_t n,bool strict,bool order_significant)2973 static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant)
2974 {
2975 	size_t j = 0;
2976 
2977 	for (size_t i = 0; i < length; i++) {
2978 		const mbfl_encoding *enc = encodings[i];
2979 
2980 		array[j].enc = enc;
2981 		array[j].state = 0;
2982 		array[j].demerits = 0;
2983 
2984 		/* If any candidate encodings have specialized validation functions, use them
2985 		 * to eliminate as many candidates as possible */
2986 		if (enc->check != NULL) {
2987 			for (size_t k = 0; k < n; k++) {
2988 				if (!enc->check((unsigned char*)in[k], in_len[k])) {
2989 					if (strict) {
2990 						goto skip_to_next;
2991 					} else {
2992 						array[j].demerits += 500;
2993 					}
2994 				}
2995 			}
2996 		}
2997 
2998 		/* This multiplier can optionally be used to make candidate encodings listed
2999 		 * first more likely to be chosen. It is a weight factor which multiplies
3000 		 * the number of demerits counted for each candidate. */
3001 		array[j].multiplier = order_significant ? 1.0 + ((0.3 * i) / length) : 1.0;
3002 		j++;
3003 skip_to_next: ;
3004 	}
3005 
3006 	return j;
3007 }
3008 
start_string(struct candidate * array,size_t length,const unsigned char * in,size_t in_len)3009 static void start_string(struct candidate *array, size_t length, const unsigned char *in, size_t in_len)
3010 {
3011 	for (size_t i = 0; i < length; i++) {
3012 		const mbfl_encoding *enc = array[i].enc;
3013 
3014 		array[i].in = in;
3015 		array[i].in_len = in_len;
3016 
3017 		/* Skip byte order mark for UTF-8, UTF-16BE, or UTF-16LE */
3018 		if (enc == &mbfl_encoding_utf8) {
3019 			if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
3020 				array[i].in_len -= 3;
3021 				array[i].in += 3;
3022 			}
3023 		} else if (enc == &mbfl_encoding_utf16be) {
3024 			if (in_len >= 2 && in[0] == 0xFE && in[1] == 0xFF) {
3025 				array[i].in_len -= 2;
3026 				array[i].in += 2;
3027 			}
3028 		} else if (enc == &mbfl_encoding_utf16le) {
3029 			if (in_len >= 2 && in[0] == 0xFF && in[1] == 0xFE) {
3030 				array[i].in_len -= 2;
3031 				array[i].in += 2;
3032 			}
3033 		}
3034 	}
3035 }
3036 
/* Decode the input which each candidate in `array` currently points to, and add
 * up demerits for the decoded codepoints.
 *
 * In strict mode, a candidate whose decoder emits MBFL_BAD_INPUT is removed from
 * `array` (later candidates are shifted down to fill the gap). In non-strict
 * mode, each MBFL_BAD_INPUT costs 1000 demerits instead.
 *
 * Before returning, each remaining candidate's demerits are scaled by its
 * multiplier. Returns the number of candidates which remain in `array`. */
static size_t count_demerits(struct candidate *array, size_t length, bool strict)
{
	uint32_t wchar_buf[128];
	unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */

	/* Candidates with no input at all are finished before we even start */
	for (size_t i = 0; i < length; i++) {
		if (array[i].in_len == 0) {
			finished++;
		}
	}

	/* In non-strict mode, stop as soon as only one candidate is left, since it
	 * will be chosen regardless of how many demerits it accumulates */
	while ((strict || length > 1) && finished < length) {
		/* Iterate in reverse order to avoid moving candidates that can be eliminated. */
		for (size_t i = length - 1; i != (size_t)-1; i--) {
			/* Do we still have more input to process for this candidate encoding? */
			if (array[i].in_len) {
				const mbfl_encoding *enc = array[i].enc;
				/* Decode the next batch of (at most 128) codepoints; this advances
				 * array[i].in and decreases array[i].in_len */
				size_t out_len = enc->to_wchar((unsigned char**)&array[i].in, &array[i].in_len, wchar_buf, 128, &array[i].state);
				ZEND_ASSERT(out_len <= 128);
				/* Check this batch of decoded codepoints; are there any error markers?
				 * Also sum up the number of demerits */
				while (out_len) {
					uint32_t w = wchar_buf[--out_len];
					if (w == MBFL_BAD_INPUT) {
						if (strict) {
							/* This candidate encoding is not valid, eliminate it from consideration */
							length--;
							if (i < length) {
								/* The eliminated candidate was not the last one in the list,
								 * so shift the candidates after it down to close the gap */
								memmove(&array[i], &array[i+1], (length - i) * sizeof(struct candidate));
							}
							/* Skip the `finished` update below; this candidate no longer counts */
							goto try_next_encoding;
						} else {
							array[i].demerits += 1000;
						}
					} else {
						array[i].demerits += estimate_demerits(w);
					}
				}
				if (array[i].in_len == 0) {
					finished++;
				}
			}
try_next_encoding:;
		}
	}

	/* Apply the weight factor of each surviving candidate */
	for (size_t i = 0; i < length; i++) {
		array[i].demerits *= array[i].multiplier;
	}

	return length;
}
3090 
mb_guess_encoding_for_strings(const unsigned char ** strings,size_t * str_lengths,size_t n,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3091 MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3092 {
3093 	if (elist_size == 0) {
3094 		return NULL;
3095 	}
3096 	if (elist_size == 1) {
3097 		if (strict) {
3098 			while (n--) {
3099 				if (!php_mb_check_encoding((const char*)strings[n], str_lengths[n], *elist)) {
3100 					return NULL;
3101 				}
3102 			}
3103 		}
3104 		return *elist;
3105 	}
3106 	if (n == 1 && *str_lengths == 0) {
3107 		return *elist;
3108 	}
3109 
3110 	/* Allocate on stack; when we return, this array is automatically freed */
3111 	struct candidate *array = alloca(elist_size * sizeof(struct candidate));
3112 	elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant);
3113 
3114 	while (n--) {
3115 		start_string(array, elist_size, strings[n], str_lengths[n]);
3116 		elist_size = count_demerits(array, elist_size, strict);
3117 		if (elist_size == 0) {
3118 			/* All candidates were eliminated */
3119 			return NULL;
3120 		}
3121 	}
3122 
3123 	/* See which remaining candidate encoding has the least demerits */
3124 	unsigned int best = 0;
3125 	for (unsigned int i = 1; i < elist_size; i++) {
3126 		if (array[i].demerits < array[best].demerits) {
3127 			best = i;
3128 		}
3129 	}
3130 	return array[best].enc;
3131 }
3132 
3133 /* When doing 'strict' detection, any string which is invalid in the candidate encoding
3134  * is rejected. With non-strict detection, we just continue, but apply demerits for
3135  * each invalid byte sequence */
mb_guess_encoding(unsigned char * in,size_t in_len,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3136 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3137 {
3138 	return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant);
3139 }
3140 
3141 /* {{{ Encodings of the given string is returned (as a string) */
PHP_FUNCTION(mb_detect_encoding)3142 PHP_FUNCTION(mb_detect_encoding)
3143 {
3144 	zend_string *str, *encoding_str = NULL;
3145 	HashTable *encoding_ht = NULL;
3146 	bool strict = false;
3147 	const mbfl_encoding *ret, **elist;
3148 	size_t size;
3149 
3150 	ZEND_PARSE_PARAMETERS_START(1, 3)
3151 		Z_PARAM_STR(str)
3152 		Z_PARAM_OPTIONAL
3153 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
3154 		Z_PARAM_BOOL(strict)
3155 	ZEND_PARSE_PARAMETERS_END();
3156 
3157 	/* Should we pay attention to the order of the provided candidate encodings and prefer
3158 	 * the earlier ones (if more than one candidate encoding matches)?
3159 	 * If the entire list of supported encodings returned by `mb_list_encodings` is passed
3160 	 * in, then don't treat the order as significant */
3161 	bool order_significant = true;
3162 
3163 	/* make encoding list */
3164 	if (encoding_ht) {
3165 		if (encoding_ht == MBSTRG(all_encodings_list)) {
3166 			order_significant = false;
3167 		}
3168 		if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
3169 			RETURN_THROWS();
3170 		}
3171 	} else if (encoding_str) {
3172 		if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2)) {
3173 			RETURN_THROWS();
3174 		}
3175 	} else {
3176 		elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
3177 		size = MBSTRG(current_detect_order_list_size);
3178 	}
3179 
3180 	if (size == 0) {
3181 		efree(ZEND_VOIDP(elist));
3182 		zend_argument_value_error(2, "must specify at least one encoding");
3183 		RETURN_THROWS();
3184 	}
3185 
3186 	remove_non_encodings_from_elist(elist, &size);
3187 	if (size == 0) {
3188 		efree(ZEND_VOIDP(elist));
3189 		RETURN_FALSE;
3190 	}
3191 
3192 	if (ZEND_NUM_ARGS() < 3) {
3193 		strict = MBSTRG(strict_detection);
3194 	}
3195 
3196 	if (size == 1 && *elist == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) {
3197 		ret = &mbfl_encoding_utf8;
3198 	} else {
3199 		ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant);
3200 	}
3201 
3202 	efree(ZEND_VOIDP(elist));
3203 
3204 	if (ret == NULL) {
3205 		RETURN_FALSE;
3206 	}
3207 
3208 	RETVAL_STRING((char *)ret->name);
3209 }
3210 /* }}} */
3211 
3212 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)3213 PHP_FUNCTION(mb_list_encodings)
3214 {
3215 	ZEND_PARSE_PARAMETERS_NONE();
3216 
3217 	if (MBSTRG(all_encodings_list) == NULL) {
3218 		/* Initialize shared array of supported encoding names
3219 		 * This is done so that we can check if `mb_list_encodings()` is being
3220 		 * passed to other mbstring functions using a cheap pointer equality check */
3221 		HashTable *array = emalloc(sizeof(HashTable));
3222 		zend_hash_init(array, 80, NULL, zval_ptr_dtor_str, false);
3223 		for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
3224 			zval tmp;
3225 			ZVAL_STRING(&tmp, (*encodings)->name);
3226 			zend_hash_next_index_insert(array, &tmp);
3227 		}
3228 		MBSTRG(all_encodings_list) = array;
3229 	}
3230 
3231 	GC_ADDREF(MBSTRG(all_encodings_list));
3232 	RETURN_ARR(MBSTRG(all_encodings_list));
3233 }
3234 /* }}} */
3235 
3236 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)3237 PHP_FUNCTION(mb_encoding_aliases)
3238 {
3239 	const mbfl_encoding *encoding;
3240 	zend_string *encoding_name = NULL;
3241 
3242 	ZEND_PARSE_PARAMETERS_START(1, 1)
3243 		Z_PARAM_STR(encoding_name)
3244 	ZEND_PARSE_PARAMETERS_END();
3245 
3246 	encoding = php_mb_get_encoding(encoding_name, 1);
3247 	if (!encoding) {
3248 		RETURN_THROWS();
3249 	}
3250 
3251 	array_init(return_value);
3252 	if (encoding->aliases != NULL) {
3253 		for (const char **alias = encoding->aliases; *alias; ++alias) {
3254 			add_next_index_string(return_value, (char *)*alias);
3255 		}
3256 	}
3257 }
3258 /* }}} */
3259 
/* Perform kana conversion on `input` (which is encoded in `encoding`) and return
 * the result in the same encoding. `mode` is the bit vector of conversion
 * options which is passed on to mb_convert_kana_codepoint.
 *
 * The input is decoded in batches of up to 64 codepoints. Each codepoint is
 * converted together with the codepoint following it as lookahead, so the last
 * codepoint of a batch is carried over to the next batch, unless no input
 * remains. */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion...
	 * if we are converting zenkaku kana to hankaku kana
	 * Make the buffer for converted kana big enough that we never need to
	 * perform bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	unsigned int buf_offset = 0; /* 1 if wchar_buf[0] holds a codepoint carried over from the previous batch */
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		uint32_t *converted = converted_buf;
		/* If one codepoint has been left in wchar_buf[0] to be reprocessed from the
		 * previous iteration, don't overwrite it */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + buf_offset, 64 - buf_offset, &state);
		out_len += buf_offset;
		ZEND_ASSERT(out_len <= 64);

		if (!out_len) {
			continue;
		}

		/* Convert all codepoints but the last, each with its successor as lookahead */
		for (int i = 0; i < out_len-1; i++) {
			uint32_t second = 0;
			bool consumed = false;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
			if (second) {
				/* The conversion produced a second output codepoint */
				*converted++ = second;
			}
			if (consumed) {
				/* The lookahead codepoint was consumed too, so skip over it */
				i++;
				if (i == out_len-1) {
					/* We consumed two codepoints at the very end of the wchar buffer
					 * So there is nothing remaining to reprocess on the next iteration */
					buf_offset = 0;
					goto emit_converted_kana;
				}
			}
		}

		if (!in_len) {
			/* This is the last iteration, so we need to process the final codepoint now */
			uint32_t second = 0;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[out_len-1], 0, NULL, &second, mode);
			if (second) {
				*converted++ = second;
			}
		} else {
			/* Reprocess the last codepoint on the next iteration */
			wchar_buf[0] = wchar_buf[out_len-1];
			buf_offset = 1;
		}

emit_converted_kana:
		/* Re-encode the converted codepoints; the last argument is true only
		 * for the final batch */
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3324 
/* Option characters accepted by mb_convert_kana.
 * The flag at index `i` corresponds to bit `i` of the option bit vector
 * (mb_convert_kana sets `1 << i` for it). When checking for illegal flag
 * combinations, the flags at indexes `i` and `i+8` (for i < 8) are treated as
 * an opposing pair, which is why each uppercase flag in the first row lines up
 * with its lowercase counterpart in the second row. */
char mb_convert_kana_flags[17] = {
	'A', 'R', 'N', 'S', 'K', 'H', 'M', 'C',
	'a', 'r', 'n', 's', 'k', 'h', 'm', 'c',
	'V'
};
3330 
/* Conversion between full-width characters and half-width characters (Japanese)
 *
 * PHP signature: mb_convert_kana(string [, mode [, encoding]])
 * `mode` is a string of option characters (see mb_convert_kana_flags); if it is
 * omitted, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE is used.
 * Throws a ValueError for an unknown option character or for a combination of
 * options which is rejected by the checks below. */
PHP_FUNCTION(mb_convert_kana)
{
	unsigned int opt;
	char *optstr = NULL;
	size_t optstr_len;
	zend_string *encname = NULL, *str;

	ZEND_PARSE_PARAMETERS_START(1, 3)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STRING(optstr, optstr_len)
		Z_PARAM_STR_OR_NULL(encname)
	ZEND_PARSE_PARAMETERS_END();

	if (optstr != NULL) {
		char *p = optstr, *e = p + optstr_len;
		opt = 0;
		/* Jumped to from inside the flag lookup loop below, to carry on with
		 * the next option character */
next_option:
		while (p < e) {
			/* Walk through option string and convert to bit vector
			 * See translit_kana_jisx0201_jisx0208.h for the values used */
			char c = *p++;
			if (c == 'A') {
				opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
			} else if (c == 'a') {
				opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
			} else {
				/* Every other flag maps to the single bit given by its table index */
				for (int i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
					if (c == mb_convert_kana_flags[i]) {
						opt |= (1 << i);
						goto next_option;
					}
				}

				/* Only reached if `c` did not match any known flag */
				zend_argument_value_error(2, "contains invalid flag: '%c'", c);
				RETURN_THROWS();
			}
		}

		/* Check for illegal combinations of options */
		if (((opt & 0xFF00) >> 8) & opt) {
			/* It doesn't make sense to convert the same type of characters from halfwidth to
			 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
			 * FW hiragana to FW katakana and then back again. */
			int badflag = ((opt & 0xFF00) >> 8) & opt, i;
			/* Find the lowest bit for which both flags of an opposing pair are set */
			for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
			char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
			/* If the bit was set by way of 'A' or 'a', name that flag in the error message */
			if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
				flag1 = 'A';
			if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
				flag2 = 'a';
			zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
			RETURN_THROWS();
		}

		if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
			/* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
			zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
			RETURN_THROWS();
		}

		/* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
		 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
		 * more than one of these */
		if (opt & MBFL_ZEN2HAN_HIRAGANA) {
			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
				zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
				RETURN_THROWS();
			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
				zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
				RETURN_THROWS();
			}
		} else if (opt & MBFL_ZEN2HAN_KATAKANA) {
			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
				zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
				RETURN_THROWS();
			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
				zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
				RETURN_THROWS();
			}
		}
	} else {
		/* Default options which are used when no option string is passed */
		opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
	}

	/* Note that the encoding is only validated after the option string */
	const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
	if (!enc) {
		RETURN_THROWS();
	}

	RETVAL_STR(jp_kana_convert(str, enc, opt));
}
3424 
/* Count the strings contained in `var`: 1 if `var` is itself a string, or the
 * number of strings found by recursively walking the entries of an array or of
 * the hash table obtained for an object. References are followed. A container
 * which is already being visited higher up the call stack contributes 0. */
static unsigned int mb_recursive_count_strings(zval *var)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		return 1;
	}
	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Other scalar types never contain strings */
		return 0;
	}

	/* Recursion protection is only applied to refcounted containers */
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	unsigned int total = 0;
	HashTable *ht = HASH_OF(var);
	if (ht != NULL) {
		zval *entry;
		ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
			total += mb_recursive_count_strings(entry);
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return total;
}
3455 
/* Collect pointers to (and lengths of) all the strings contained in `var`.
 *
 * For each string found, its bytes are stored in `val_list[*count]` and its
 * length in `len_list[*count]`, and then `*count` is incremented. Arrays, and
 * the hash tables obtained for objects, are walked recursively; references are
 * followed. The strings themselves are not copied.
 *
 * Returns true if a container was encountered which is already being visited
 * higher up the call stack (i.e. a recursive structure), in which case the walk
 * is abandoned; returns false otherwise.
 *
 * NOTE(review): nothing here bounds `*count`; presumably the caller sizes both
 * lists using mb_recursive_count_strings -- confirm at the call site. */
static bool mb_recursive_find_strings(zval *var, const unsigned char **val_list, size_t *len_list, unsigned int *count)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		val_list[*count] = (const unsigned char*)Z_STRVAL_P(var);
		len_list[*count] = Z_STRLEN_P(var);
		(*count)++;
	} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
		/* Recursion protection is only applied to refcounted containers */
		if (Z_REFCOUNTED_P(var)) {
			if (Z_IS_RECURSIVE_P(var)) {
				return true;
			}
			Z_PROTECT_RECURSION_P(var);
		}

		HashTable *ht = HASH_OF(var);
		if (ht != NULL) {
			zval *entry;
			ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
				if (mb_recursive_find_strings(entry, val_list, len_list, count)) {
					if (Z_REFCOUNTED_P(var)) {
						Z_UNPROTECT_RECURSION_P(var);
					}
					/* Always pass the recursion signal up, whether or not this
					 * container is itself refcounted (this matches what
					 * mb_recursive_convert_variable does) */
					return true;
				}
			} ZEND_HASH_FOREACH_END();
		}

		if (Z_REFCOUNTED_P(var)) {
			Z_UNPROTECT_RECURSION_P(var);
		}
	}

	return false;
}
3492 
mb_recursive_convert_variable(zval * var,const mbfl_encoding * from_encoding,const mbfl_encoding * to_encoding)3493 static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
3494 {
3495 	zval *entry, *orig_var;
3496 
3497 	orig_var = var;
3498 	ZVAL_DEREF(var);
3499 
3500 	if (Z_TYPE_P(var) == IS_STRING) {
3501 		zend_string *ret = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
3502 		zval_ptr_dtor(orig_var);
3503 		ZVAL_STR(orig_var, ret);
3504 	} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
3505 		if (Z_TYPE_P(var) == IS_ARRAY) {
3506 			SEPARATE_ARRAY(var);
3507 		}
3508 		if (Z_REFCOUNTED_P(var)) {
3509 			if (Z_IS_RECURSIVE_P(var)) {
3510 				return true;
3511 			}
3512 			Z_PROTECT_RECURSION_P(var);
3513 		}
3514 
3515 		HashTable *ht = HASH_OF(var);
3516 		if (ht != NULL) {
3517 			ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
3518 				if (mb_recursive_convert_variable(entry, from_encoding, to_encoding)) {
3519 					if (Z_REFCOUNTED_P(var)) {
3520 						Z_UNPROTECT_RECURSION_P(var);
3521 					}
3522 					return true;
3523 				}
3524 			} ZEND_HASH_FOREACH_END();
3525 		}
3526 
3527 		if (Z_REFCOUNTED_P(var)) {
3528 			Z_UNPROTECT_RECURSION_P(var);
3529 		}
3530 	}
3531 
3532 	return false;
3533 }
3534 
PHP_FUNCTION(mb_convert_variables)3535 PHP_FUNCTION(mb_convert_variables)
3536 {
3537 	zval *args;
3538 	zend_string *to_enc_str;
3539 	zend_string *from_enc_str;
3540 	HashTable *from_enc_ht;
3541 	const mbfl_encoding *from_encoding, *to_encoding;
3542 	uint32_t argc;
3543 	size_t elistsz;
3544 	const mbfl_encoding **elist;
3545 
3546 	ZEND_PARSE_PARAMETERS_START(3, -1)
3547 		Z_PARAM_STR(to_enc_str)
3548 		Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3549 		Z_PARAM_VARIADIC('+', args, argc)
3550 	ZEND_PARSE_PARAMETERS_END();
3551 
3552 	/* new encoding */
3553 	to_encoding = php_mb_get_encoding(to_enc_str, 1);
3554 	if (!to_encoding) {
3555 		RETURN_THROWS();
3556 	}
3557 
3558 	from_encoding = MBSTRG(current_internal_encoding);
3559 
3560 	bool order_significant = true;
3561 
3562 	/* pre-conversion encoding */
3563 	if (from_enc_ht) {
3564 		if (from_enc_ht == MBSTRG(all_encodings_list)) {
3565 			/* If entire list of supported encodings returned by `mb_list_encodings` is passed
3566 			 * in, then don't treat the order of the list as significant */
3567 			order_significant = false;
3568 		}
3569 		if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3570 			RETURN_THROWS();
3571 		}
3572 	} else {
3573 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2) == FAILURE) {
3574 			RETURN_THROWS();
3575 		}
3576 	}
3577 
3578 	if (elistsz == 0) {
3579 		efree(ZEND_VOIDP(elist));
3580 		zend_argument_value_error(2, "must specify at least one encoding");
3581 		RETURN_THROWS();
3582 	}
3583 
3584 	if (elistsz == 1) {
3585 		from_encoding = *elist;
3586 	} else {
3587 		/* auto detect */
3588 		unsigned int num = 0;
3589 		for (size_t n = 0; n < argc; n++) {
3590 			zval *zv = &args[n];
3591 			num += mb_recursive_count_strings(zv);
3592 		}
3593 		const unsigned char **val_list = (const unsigned char**)ecalloc(num, sizeof(char *));
3594 		size_t *len_list = (size_t*)ecalloc(num, sizeof(size_t));
3595 		unsigned int i = 0;
3596 		for (size_t n = 0; n < argc; n++) {
3597 			zval *zv = &args[n];
3598 			if (mb_recursive_find_strings(zv, val_list, len_list, &i)) {
3599 				efree(ZEND_VOIDP(elist));
3600 				efree(ZEND_VOIDP(val_list));
3601 				efree(len_list);
3602 				php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3603 				RETURN_FALSE;
3604 			}
3605 		}
3606 		from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant);
3607 		efree(ZEND_VOIDP(val_list));
3608 		efree(len_list);
3609 		if (!from_encoding) {
3610 			php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3611 			efree(ZEND_VOIDP(elist));
3612 			RETURN_FALSE;
3613 		}
3614 
3615 	}
3616 
3617 	efree(ZEND_VOIDP(elist));
3618 
3619 	/* convert */
3620 	for (size_t n = 0; n < argc; n++) {
3621 		zval *zv = &args[n];
3622 		ZVAL_DEREF(zv);
3623 		if (mb_recursive_convert_variable(zv, from_encoding, to_encoding)) {
3624 			php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3625 			RETURN_FALSE;
3626 		}
3627 	}
3628 
3629 	RETURN_STRING(from_encoding->name);
3630 }
3631 
3632 /* HTML numeric entities */
3633 
3634 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
make_conversion_map(HashTable * target_hash,int * convmap_size)3635 static uint32_t *make_conversion_map(HashTable *target_hash, int *convmap_size)
3636 {
3637 	zval *hash_entry;
3638 
3639 	int n_elems = zend_hash_num_elements(target_hash);
3640 	if (n_elems % 4 != 0) {
3641 		zend_argument_value_error(2, "must have a multiple of 4 elements");
3642 		return NULL;
3643 	}
3644 
3645 	uint32_t *convmap = (uint32_t*)safe_emalloc(n_elems, sizeof(uint32_t), 0);
3646 	uint32_t *mapelm = convmap;
3647 
3648 	ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
3649 		*mapelm++ = zval_get_long(hash_entry);
3650 	} ZEND_HASH_FOREACH_END();
3651 
3652 	*convmap_size = n_elems / 4;
3653 	return convmap;
3654 }
3655 
/* Look up `w` in a conversion map made of `mapsize` groups of four values:
 * low code, high code, offset, mask.
 *
 * For the first group whose [low, high] range contains `w`, stores
 * `(w + offset) & mask` in `*retval` and returns true. If no group matches,
 * `*retval` is left untouched and false is returned. */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, int mapsize, uint32_t *retval)
{
	for (int group = 0; group < mapsize; group++) {
		const uint32_t *range = convmap + (group * 4);

		if (w < range[0] || w > range[1]) {
			continue;
		}

		/* This wchar falls inside a range which should be converted */
		*retval = (w + range[2]) & range[3];
		return true;
	}

	/* None of the ranges matched */
	return false;
}
3677 
/* Re-encode `input`, replacing every codepoint matched by the conversion map
 * with a numeric entity: "&#NNN;", or "&#xHHH;" when `hex` is true.
 *
 * The input is decoded to wchars in batches of up to 32 using `encoding`, and
 * the (possibly expanded) wchars are encoded back with the same encoding. */
static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, int mapsize, bool hex)
{
	/* One decoded wchar may expand to as many as 13 wchars as an entity */
	uint32_t decoded[32], expanded[32 * 13];
	unsigned char digits[16]; /* Scratch space; digits are written back to front */

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		size_t n_decoded = encoding->to_wchar(&in, &in_len, decoded, 32, &state);
		ZEND_ASSERT(n_decoded <= 32);
		uint32_t *out = expanded;

		for (size_t i = 0; i < n_decoded; i++) {
			uint32_t w = decoded[i];

			if (!html_numeric_entity_convert(w, convmap, mapsize, &w)) {
				/* Not covered by the map; pass through unchanged */
				*out++ = w;
				continue;
			}

			*out++ = '&';
			*out++ = '#';
			if (hex) {
				*out++ = 'x';
			}

			/* do/while so that a value of zero still yields a single '0' */
			unsigned char *d = digits + sizeof(digits);
			if (hex) {
				do {
					*(--d) = "0123456789ABCDEF"[w & 0xF];
					w >>= 4;
				} while (w > 0);
			} else {
				do {
					*(--d) = "0123456789"[w % 10];
					w /= 10;
				} while (w > 0);
			}
			while (d < digits + sizeof(digits)) {
				*out++ = *d++;
			}

			*out++ = ';';
		}

		ZEND_ASSERT(out <= expanded + sizeof(expanded)/sizeof(*expanded));
		/* Last argument tells the encoder whether this is the final batch */
		encoding->from_wchar(expanded, out - expanded, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3743 
3744 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)3745 PHP_FUNCTION(mb_encode_numericentity)
3746 {
3747 	zend_string *encoding = NULL, *str;
3748 	int mapsize;
3749 	HashTable *target_hash;
3750 	bool is_hex = false;
3751 
3752 	ZEND_PARSE_PARAMETERS_START(2, 4)
3753 		Z_PARAM_STR(str)
3754 		Z_PARAM_ARRAY_HT(target_hash)
3755 		Z_PARAM_OPTIONAL
3756 		Z_PARAM_STR_OR_NULL(encoding)
3757 		Z_PARAM_BOOL(is_hex)
3758 	ZEND_PARSE_PARAMETERS_END();
3759 
3760 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3761 	if (!enc) {
3762 		RETURN_THROWS();
3763 	}
3764 
3765 	uint32_t *convmap = make_conversion_map(target_hash, &mapsize);
3766 	if (convmap == NULL) {
3767 		RETURN_THROWS();
3768 	}
3769 
3770 	RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, mapsize, is_hex));
3771 	efree(convmap);
3772 }
3773 /* }}} */
3774 
/* Reverse lookup for a numeric entity value.
 *
 * The map is `mapsize` groups of four values: low code, high code, offset,
 * mask. For the first group where `number - offset` (unsigned arithmetic)
 * falls within [low, high], that difference is stored in `*retval` and true is
 * returned. The mask element is not consulted. Returns false, leaving
 * `*retval` untouched, if no group matches. */
static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, int mapsize, uint32_t *retval)
{
	for (int group = 0; group < mapsize; group++) {
		const uint32_t *range = convmap + (group * 4);
		uint32_t candidate = number - range[2];

		if (candidate < range[0] || candidate > range[1]) {
			continue;
		}

		*retval = candidate;
		return true;
	}

	return false;
}
3792 
3793 #define DEC_ENTITY_MINLEN 3  /* For "&#" and 1 decimal digit */
3794 #define HEX_ENTITY_MINLEN 4  /* For "&#x" and 1 hexadecimal digit */
3795 #define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
3796 #define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
3797 
html_numeric_entity_decode(zend_string * input,const mbfl_encoding * encoding,uint32_t * convmap,int mapsize)3798 static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, int mapsize)
3799 {
3800 	uint32_t wchar_buf[128], converted_buf[128];
3801 
3802 	unsigned int state = 0;
3803 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
3804 	size_t in_len = ZSTR_LEN(input);
3805 
3806 	mb_convert_buf buf;
3807 	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
3808 
3809 	/* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
3810 	 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
3811 	 * 2nd 'converted' buffer.
3812 	 *
3813 	 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
3814 	 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
3815 	 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
3816 	 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
3817 	 * to store the next batch of wchars after it.
3818 	 *
3819 	 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
3820 	 * wchars from the 1st buffer to the 2nd one.
3821 	 *
3822 	 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
3823 	 * have to do bounds checks when writing wchars into it.
3824 	 */
3825 
3826 	unsigned int wchar_buf_offset = 0;
3827 
3828 	while (in_len) {
3829 		/* Leave space for sentinel at the end of the buffer */
3830 		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
3831 		out_len += wchar_buf_offset;
3832 		ZEND_ASSERT(out_len <= 127);
3833 		wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */
3834 
3835 		uint32_t *p, *converted;
3836 
3837 		/* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
3838 		 * be there (in `wchar_buf[0]`), so don't bother in that case */
3839 		if (wchar_buf_offset == 0) {
3840 			p = wchar_buf;
3841 			while (*p != '&')
3842 				p++;
3843 			if (p == wchar_buf + out_len) {
3844 				/* No HTML entities in this buffer */
3845 				encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
3846 				continue;
3847 			}
3848 
3849 			/* Copy over the prefix with no & which we already scanned */
3850 			memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
3851 			converted = converted_buf + (p - wchar_buf);
3852 		} else {
3853 			p = wchar_buf;
3854 			converted = converted_buf;
3855 		}
3856 
3857 found_ampersand:
3858 		ZEND_ASSERT(*p == '&');
3859 		uint32_t *p2 = p;
3860 
3861 		/* These tests can't overrun end of buffer, because we have a '&' sentinel there */
3862 		if (*++p2 == '#') {
3863 			if (*++p2 == 'x') {
3864 				/* Possible hex entity */
3865 				uint32_t w = *++p2;
3866 				while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
3867 					w = *++p2;
3868 				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
3869 					/* We hit the end of the buffer while reading digits, and
3870 					 * more wchars are still coming in the next buffer
3871 					 * Reprocess this identity on next iteration */
3872 					memmove(wchar_buf, p, (p2 - p) * 4);
3873 					wchar_buf_offset = p2 - p;
3874 					goto process_converted_wchars;
3875 				} else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
3876 					/* Invalid entity (too long or "&#x" only) */
3877 					memcpy(converted, p, (p2 - p) * 4);
3878 					converted += p2 - p;
3879 				} else {
3880 					/* Valid hexadecimal entity */
3881 					uint32_t value = 0, *p3 = p + 3;
3882 					while (p3 < p2) {
3883 						w = *p3++;
3884 						if (w <= '9') {
3885 							value = (value * 16) + (w - '0');
3886 						} else if (w >= 'a') {
3887 							value = (value * 16) + 10 + (w - 'a');
3888 						} else {
3889 							value = (value * 16) + 10 + (w - 'A');
3890 						}
3891 					}
3892 					if (html_numeric_entity_deconvert(value, convmap, mapsize, converted)) {
3893 						converted++;
3894 						if (*p2 == ';')
3895 							p2++;
3896 					} else {
3897 						memcpy(converted, p, (p2 - p) * 4);
3898 						converted += p2 - p;
3899 					}
3900 				}
3901 			} else {
3902 				/* Possible decimal entity */
3903 				uint32_t w = *p2;
3904 				while (w >= '0' && w <= '9')
3905 					w = *++p2;
3906 				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
3907 					/* The number of digits was legal (no more than 10 decimal digits)
3908 					 * Reprocess this identity on next iteration of main loop */
3909 					memmove(wchar_buf, p, (p2 - p) * 4);
3910 					wchar_buf_offset = p2 - p;
3911 					goto process_converted_wchars;
3912 				} else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
3913 					/* Invalid entity (too long or "&#" only) */
3914 					memcpy(converted, p, (p2 - p) * 4);
3915 					converted += p2 - p;
3916 				} else {
3917 					/* Valid decimal entity */
3918 					uint32_t value = 0, *p3 = p + 2;
3919 					while (p3 < p2) {
3920 						/* If unsigned integer overflow would occur in the below
3921 						 * multiplication by 10, this entity is no good
3922 						 * 0x19999999 is 1/10th of 0xFFFFFFFF */
3923 						if (value > 0x19999999) {
3924 							memcpy(converted, p, (p2 - p) * 4);
3925 							converted += p2 - p;
3926 							goto decimal_entity_too_big;
3927 						}
3928 						value = (value * 10) + (*p3++ - '0');
3929 					}
3930 					if (html_numeric_entity_deconvert(value, convmap, mapsize, converted)) {
3931 						converted++;
3932 						if (*p2 == ';')
3933 							p2++;
3934 					} else {
3935 						memcpy(converted, p, (p2 - p) * 4);
3936 						converted += p2 - p;
3937 					}
3938 				}
3939 			}
3940 		} else if ((p2 == wchar_buf + out_len) && in_len) {
3941 			/* Corner case: & at end of buffer */
3942 			wchar_buf[0] = '&';
3943 			wchar_buf_offset = 1;
3944 			goto process_converted_wchars;
3945 		} else {
3946 			*converted++ = '&';
3947 		}
3948 decimal_entity_too_big:
3949 
3950 		/* Starting to scan a new section of the wchar buffer
3951 		 * 'p2' is pointing at the next wchar which needs to be processed */
3952 		p = p2;
3953 		while (*p2 != '&')
3954 			p2++;
3955 
3956 		if (p2 > p) {
3957 			memcpy(converted, p, (p2 - p) * 4);
3958 			converted += p2 - p;
3959 			p = p2;
3960 		}
3961 
3962 		if (p < wchar_buf + out_len)
3963 			goto found_ampersand;
3964 
3965 		/* We do not have any wchars remaining at the end of this buffer which
3966 		 * we need to reprocess on the next call */
3967 		wchar_buf_offset = 0;
3968 process_converted_wchars:
3969 		ZEND_ASSERT(converted <= converted_buf + 128);
3970 		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
3971 	}
3972 
3973 	return mb_convert_buf_result(&buf, encoding);
3974 }
3975 
3976 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)3977 PHP_FUNCTION(mb_decode_numericentity)
3978 {
3979 	zend_string *encoding = NULL, *str;
3980 	int mapsize;
3981 	HashTable *target_hash;
3982 
3983 	ZEND_PARSE_PARAMETERS_START(2, 3)
3984 		Z_PARAM_STR(str)
3985 		Z_PARAM_ARRAY_HT(target_hash)
3986 		Z_PARAM_OPTIONAL
3987 		Z_PARAM_STR_OR_NULL(encoding)
3988 	ZEND_PARSE_PARAMETERS_END();
3989 
3990 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3991 	if (!enc) {
3992 		RETURN_THROWS();
3993 	}
3994 
3995 	uint32_t *convmap = make_conversion_map(target_hash, &mapsize);
3996 	if (convmap == NULL) {
3997 		RETURN_THROWS();
3998 	}
3999 
4000 	RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, mapsize));
4001 	efree(convmap);
4002 }
4003 /* }}} */
4004 
4005 /* {{{ Sends an email message with MIME scheme */
4006 #define CRLF "\r\n"
4007 
_php_mbstr_parse_mail_headers(HashTable * ht,const char * str,size_t str_len)4008 static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
4009 {
4010 	const char *ps;
4011 	size_t icnt;
4012 	int state = 0;
4013 	int crlf_state = -1;
4014 	char *token = NULL;
4015 	size_t token_pos = 0;
4016 	zend_string *fld_name, *fld_val;
4017 
4018 	ps = str;
4019 	icnt = str_len;
4020 	fld_name = fld_val = NULL;
4021 
4022 	/*
4023 	 *             C o n t e n t - T y p e :   t e x t / h t m l \r\n
4024 	 *             ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
4025 	 *      state  0            1           2          3
4026 	 *
4027 	 *             C o n t e n t - T y p e :   t e x t / h t m l \r\n
4028 	 *             ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
4029 	 * crlf_state -1                       0                     1 -1
4030 	 *
4031 	 */
4032 
4033 	while (icnt > 0) {
4034 		switch (*ps) {
4035 			case ':':
4036 				if (crlf_state == 1) {
4037 					token_pos++;
4038 				}
4039 
4040 				if (state == 0 || state == 1) {
4041 					if(token && token_pos > 0) {
4042 						fld_name = zend_string_init(token, token_pos, 0);
4043 					}
4044 					state = 2;
4045 				} else {
4046 					token_pos++;
4047 				}
4048 
4049 				crlf_state = 0;
4050 				break;
4051 
4052 			case '\n':
4053 				if (crlf_state == -1) {
4054 					goto out;
4055 				}
4056 				crlf_state = -1;
4057 				break;
4058 
4059 			case '\r':
4060 				if (crlf_state == 1) {
4061 					token_pos++;
4062 				} else {
4063 					crlf_state = 1;
4064 				}
4065 				break;
4066 
4067 			case ' ': case '\t':
4068 				if (crlf_state == -1) {
4069 					if (state == 3) {
4070 						/* continuing from the previous line */
4071 						state = 4;
4072 					} else {
4073 						/* simply skipping this new line */
4074 						state = 5;
4075 					}
4076 				} else {
4077 					if (crlf_state == 1) {
4078 						token_pos++;
4079 					}
4080 					if (state == 1 || state == 3) {
4081 						token_pos++;
4082 					}
4083 				}
4084 				crlf_state = 0;
4085 				break;
4086 
4087 			default:
4088 				switch (state) {
4089 					case 0:
4090 						token = (char*)ps;
4091 						token_pos = 0;
4092 						state = 1;
4093 						break;
4094 
4095 					case 2:
4096 						if (crlf_state != -1) {
4097 							token = (char*)ps;
4098 							token_pos = 0;
4099 
4100 							state = 3;
4101 							break;
4102 						}
4103 						ZEND_FALLTHROUGH;
4104 
4105 					case 3:
4106 						if (crlf_state == -1) {
4107 							if(token && token_pos > 0) {
4108 								fld_val = zend_string_init(token, token_pos, 0);
4109 							}
4110 
4111 							if (fld_name != NULL && fld_val != NULL) {
4112 								zval val;
4113 								zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4114 								ZVAL_STR(&val, fld_val);
4115 
4116 								zend_hash_update(ht, fld_name, &val);
4117 
4118 								zend_string_release_ex(fld_name, 0);
4119 							}
4120 
4121 							fld_name = fld_val = NULL;
4122 							token = (char*)ps;
4123 							token_pos = 0;
4124 
4125 							state = 1;
4126 						}
4127 						break;
4128 
4129 					case 4:
4130 						token_pos++;
4131 						state = 3;
4132 						break;
4133 				}
4134 
4135 				if (crlf_state == 1) {
4136 					token_pos++;
4137 				}
4138 
4139 				token_pos++;
4140 
4141 				crlf_state = 0;
4142 				break;
4143 		}
4144 		ps++, icnt--;
4145 	}
4146 out:
4147 	if (state == 2) {
4148 		token = "";
4149 		token_pos = 0;
4150 
4151 		state = 3;
4152 	}
4153 	if (state == 3) {
4154 		if(token && token_pos > 0) {
4155 			fld_val = zend_string_init(token, token_pos, 0);
4156 		}
4157 		if (fld_name != NULL && fld_val != NULL) {
4158 			zval val;
4159 			zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4160 			ZVAL_STR(&val, fld_val);
4161 			zend_hash_update(ht, fld_name, &val);
4162 
4163 			zend_string_release_ex(fld_name, 0);
4164 		}
4165 	}
4166 	return state;
4167 }
4168 
PHP_FUNCTION(mb_send_mail)4169 PHP_FUNCTION(mb_send_mail)
4170 {
4171 	char *to;
4172 	size_t to_len;
4173 	char *message;
4174 	size_t message_len;
4175 	zend_string *subject;
4176 	zend_string *extra_cmd = NULL;
4177 	HashTable *headers_ht = NULL;
4178 	zend_string *str_headers = NULL;
4179 	size_t i;
4180 	char *to_r = NULL;
4181 	char *force_extra_parameters = INI_STR("mail.force_extra_parameters");
4182 	bool suppress_content_type = false;
4183 	bool suppress_content_transfer_encoding = false;
4184 
4185 	char *p;
4186 	enum mbfl_no_encoding;
4187 	const mbfl_encoding *tran_cs,	/* transfer text charset */
4188 						*head_enc,	/* header transfer encoding */
4189 						*body_enc;	/* body transfer encoding */
4190 	const mbfl_language *lang;
4191 	HashTable ht_headers;
4192 	zval *s;
4193 
4194 	/* character-set, transfer-encoding */
4195 	tran_cs = &mbfl_encoding_utf8;
4196 	head_enc = &mbfl_encoding_base64;
4197 	body_enc = &mbfl_encoding_base64;
4198 	lang = mbfl_no2language(MBSTRG(language));
4199 	if (lang != NULL) {
4200 		tran_cs = mbfl_no2encoding(lang->mail_charset);
4201 		head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4202 		body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4203 	}
4204 
4205 	ZEND_PARSE_PARAMETERS_START(3, 5)
4206 		Z_PARAM_PATH(to, to_len)
4207 		Z_PARAM_PATH_STR(subject)
4208 		Z_PARAM_PATH(message, message_len)
4209 		Z_PARAM_OPTIONAL
4210 		Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4211 		Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4212 	ZEND_PARSE_PARAMETERS_END();
4213 
4214 	if (str_headers) {
4215 		if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4216 			zend_argument_value_error(4, "must not contain any null bytes");
4217 			RETURN_THROWS();
4218 		}
4219 		str_headers = php_trim(str_headers, NULL, 0, 2);
4220 	} else if (headers_ht) {
4221 		str_headers = php_mail_build_headers(headers_ht);
4222 		if (EG(exception)) {
4223 			RETURN_THROWS();
4224 		}
4225 	}
4226 
4227 	zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4228 
4229 	if (str_headers != NULL) {
4230 		_php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4231 	}
4232 
4233 	if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4234 		char *tmp;
4235 		char *param_name;
4236 		char *charset = NULL;
4237 
4238 		ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4239 		p = strchr(Z_STRVAL_P(s), ';');
4240 
4241 		if (p != NULL) {
4242 			/* skipping the padded spaces */
4243 			do {
4244 				++p;
4245 			} while (*p == ' ' || *p == '\t');
4246 
4247 			if (*p != '\0') {
4248 				if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4249 					if (strcasecmp(param_name, "charset") == 0) {
4250 						const mbfl_encoding *_tran_cs = tran_cs;
4251 
4252 						charset = php_strtok_r(NULL, "= \"", &tmp);
4253 						if (charset != NULL) {
4254 							_tran_cs = mbfl_name2encoding(charset);
4255 						}
4256 
4257 						if (!_tran_cs) {
4258 							php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4259 							_tran_cs = &mbfl_encoding_ascii;
4260 						}
4261 						tran_cs = _tran_cs;
4262 					}
4263 				}
4264 			}
4265 		}
4266 		suppress_content_type = true;
4267 	}
4268 
4269 	if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4270 		const mbfl_encoding *_body_enc;
4271 
4272 		ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4273 		_body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4274 		switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4275 			case mbfl_no_encoding_base64:
4276 			case mbfl_no_encoding_7bit:
4277 			case mbfl_no_encoding_8bit:
4278 				body_enc = _body_enc;
4279 				break;
4280 
4281 			default:
4282 				php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4283 				body_enc =	&mbfl_encoding_8bit;
4284 				break;
4285 		}
4286 		suppress_content_transfer_encoding = true;
4287 	}
4288 
4289 	/* To: */
4290 	if (to_len > 0) {
4291 		to_r = estrndup(to, to_len);
4292 		for (; to_len; to_len--) {
4293 			if (!isspace((unsigned char) to_r[to_len - 1])) {
4294 				break;
4295 			}
4296 			to_r[to_len - 1] = '\0';
4297 		}
4298 		for (i = 0; to_r[i]; i++) {
4299 			if (iscntrl((unsigned char) to_r[i])) {
4300 				/* According to RFC 822, section 3.1.1 long headers may be separated into
4301 				 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4302 				 * To prevent these separators from being replaced with a space, we skip over them. */
4303 				if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4304 					i += 2;
4305 					while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4306 						i++;
4307 					}
4308 					continue;
4309 				}
4310 
4311 				to_r[i] = ' ';
4312 			}
4313 		}
4314 	} else {
4315 		to_r = to;
4316 	}
4317 
4318 	/* Subject: */
4319 	const mbfl_encoding *enc = MBSTRG(current_internal_encoding);
4320 	if (enc == &mbfl_encoding_pass) {
4321 		enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4322 	}
4323 	const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4324 	size_t line_sep_len = strlen(line_sep);
4325 
4326 	subject = mb_mime_header_encode(subject, enc, tran_cs, head_enc == &mbfl_encoding_base64, (char*)line_sep, line_sep_len, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4327 
4328 	/* message body */
4329 	const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding);
4330 	if (msg_enc == &mbfl_encoding_pass) {
4331 		msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4332 	}
4333 
4334 	unsigned int num_errors = 0;
4335 	zend_string *tmpstr = mb_fast_convert((unsigned char*)message, message_len, msg_enc, tran_cs, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4336 	zend_string *conv = mb_fast_convert((unsigned char*)ZSTR_VAL(tmpstr), ZSTR_LEN(tmpstr), &mbfl_encoding_8bit, body_enc, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4337 	zend_string_free(tmpstr);
4338 	message = ZSTR_VAL(conv);
4339 
4340 	/* other headers */
4341 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4342 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4343 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4344 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4345 
4346 	smart_str str = {0};
4347 	bool empty = true;
4348 
4349 	if (str_headers != NULL) {
4350 		/* Strip trailing CRLF from `str_headers`; we will add CRLF back if necessary */
4351 		size_t len = ZSTR_LEN(str_headers);
4352 		if (ZSTR_VAL(str_headers)[len-1] == '\n') {
4353 			len--;
4354 		}
4355 		if (ZSTR_VAL(str_headers)[len-1] == '\r') {
4356 			len--;
4357 		}
4358 		smart_str_appendl(&str, ZSTR_VAL(str_headers), len);
4359 		empty = false;
4360 		zend_string_release_ex(str_headers, 0);
4361 	}
4362 
4363 	if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4364 		if (!empty) {
4365 			smart_str_appendl(&str, line_sep, line_sep_len);
4366 		}
4367 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4368 		empty = false;
4369 	}
4370 
4371 	if (!suppress_content_type) {
4372 		if (!empty) {
4373 			smart_str_appendl(&str, line_sep, line_sep_len);
4374 		}
4375 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4376 
4377 		p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4378 		if (p != NULL) {
4379 			smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4380 			smart_str_appends(&str, p);
4381 		}
4382 		empty = false;
4383 	}
4384 
4385 	if (!suppress_content_transfer_encoding) {
4386 		if (!empty) {
4387 			smart_str_appendl(&str, line_sep, line_sep_len);
4388 		}
4389 		smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4390 		p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4391 		if (p == NULL) {
4392 			p = "7bit";
4393 		}
4394 		smart_str_appends(&str, p);
4395 	}
4396 
4397 	str_headers = smart_str_extract(&str);
4398 
4399 	if (force_extra_parameters) {
4400 		extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4401 	} else if (extra_cmd) {
4402 		extra_cmd = php_escape_shell_cmd(ZSTR_VAL(extra_cmd));
4403 	}
4404 
4405 	RETVAL_BOOL(php_mail(to_r, ZSTR_VAL(subject), message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL));
4406 
4407 	if (extra_cmd) {
4408 		zend_string_release_ex(extra_cmd, 0);
4409 	}
4410 	if (to_r != to) {
4411 		efree(to_r);
4412 	}
4413 	zend_string_release(subject);
4414 	zend_string_free(conv);
4415 	zend_hash_destroy(&ht_headers);
4416 	if (str_headers) {
4417 		zend_string_release_ex(str_headers, 0);
4418 	}
4419 }
4420 
4421 #undef CRLF
4422 #undef MAIL_ASCIIZ_CHECK_MBSTRING
4423 #undef PHP_MBSTR_MAIL_MIME_HEADER1
4424 #undef PHP_MBSTR_MAIL_MIME_HEADER2
4425 #undef PHP_MBSTR_MAIL_MIME_HEADER3
4426 #undef PHP_MBSTR_MAIL_MIME_HEADER4
4427 /* }}} */
4428 
4429 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)4430 PHP_FUNCTION(mb_get_info)
4431 {
4432 	zend_string *type = NULL;
4433 	size_t n;
4434 	char *name;
4435 	zval row;
4436 	const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
4437 	const mbfl_encoding **entry;
4438 
4439 	ZEND_PARSE_PARAMETERS_START(0, 1)
4440 		Z_PARAM_OPTIONAL
4441 		Z_PARAM_STR(type)
4442 	ZEND_PARSE_PARAMETERS_END();
4443 
4444 	if (!type || zend_string_equals_literal_ci(type, "all")) {
4445 		array_init(return_value);
4446 		if (MBSTRG(current_internal_encoding)) {
4447 			add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4448 		}
4449 		if (MBSTRG(http_input_identify)) {
4450 			add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4451 		}
4452 		if (MBSTRG(current_http_output_encoding)) {
4453 			add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4454 		}
4455 		if ((name = (char *)zend_ini_string("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)) != NULL) {
4456 			add_assoc_string(return_value, "http_output_conv_mimetypes", name);
4457 		}
4458 		if (lang != NULL) {
4459 			if ((name = (char *)mbfl_no_encoding2name(lang->mail_charset)) != NULL) {
4460 				add_assoc_string(return_value, "mail_charset", name);
4461 			}
4462 			if ((name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding)) != NULL) {
4463 				add_assoc_string(return_value, "mail_header_encoding", name);
4464 			}
4465 			if ((name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding)) != NULL) {
4466 				add_assoc_string(return_value, "mail_body_encoding", name);
4467 			}
4468 		}
4469 		add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4470 		if (MBSTRG(encoding_translation)) {
4471 			add_assoc_string(return_value, "encoding_translation", "On");
4472 		} else {
4473 			add_assoc_string(return_value, "encoding_translation", "Off");
4474 		}
4475 		if ((name = (char *)mbfl_no_language2name(MBSTRG(language))) != NULL) {
4476 			add_assoc_string(return_value, "language", name);
4477 		}
4478 		n = MBSTRG(current_detect_order_list_size);
4479 		entry = MBSTRG(current_detect_order_list);
4480 		if (n > 0) {
4481 			size_t i;
4482 			array_init(&row);
4483 			for (i = 0; i < n; i++) {
4484 				add_next_index_string(&row, (*entry)->name);
4485 				entry++;
4486 			}
4487 			add_assoc_zval(return_value, "detect_order", &row);
4488 		}
4489 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4490 			add_assoc_string(return_value, "substitute_character", "none");
4491 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4492 			add_assoc_string(return_value, "substitute_character", "long");
4493 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4494 			add_assoc_string(return_value, "substitute_character", "entity");
4495 		} else {
4496 			add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4497 		}
4498 		if (MBSTRG(strict_detection)) {
4499 			add_assoc_string(return_value, "strict_detection", "On");
4500 		} else {
4501 			add_assoc_string(return_value, "strict_detection", "Off");
4502 		}
4503 	} else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4504 		if (MBSTRG(current_internal_encoding)) {
4505 			RETVAL_STRING((char *)MBSTRG(current_internal_encoding)->name);
4506 		}
4507 	} else if (zend_string_equals_literal_ci(type, "http_input")) {
4508 		if (MBSTRG(http_input_identify)) {
4509 			RETVAL_STRING((char *)MBSTRG(http_input_identify)->name);
4510 		}
4511 	} else if (zend_string_equals_literal_ci(type, "http_output")) {
4512 		if (MBSTRG(current_http_output_encoding)) {
4513 			RETVAL_STRING((char *)MBSTRG(current_http_output_encoding)->name);
4514 		}
4515 	} else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4516 		if ((name = (char *)zend_ini_string("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)) != NULL) {
4517 			RETVAL_STRING(name);
4518 		}
4519 	} else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4520 		if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_charset)) != NULL) {
4521 			RETVAL_STRING(name);
4522 		}
4523 	} else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4524 		if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding)) != NULL) {
4525 			RETVAL_STRING(name);
4526 		}
4527 	} else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4528 		if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding)) != NULL) {
4529 			RETVAL_STRING(name);
4530 		}
4531 	} else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4532 		RETVAL_LONG(MBSTRG(illegalchars));
4533 	} else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4534 		if (MBSTRG(encoding_translation)) {
4535 			RETVAL_STRING("On");
4536 		} else {
4537 			RETVAL_STRING("Off");
4538 		}
4539 	} else if (zend_string_equals_literal_ci(type, "language")) {
4540 		if ((name = (char *)mbfl_no_language2name(MBSTRG(language))) != NULL) {
4541 			RETVAL_STRING(name);
4542 		}
4543 	} else if (zend_string_equals_literal_ci(type, "detect_order")) {
4544 		n = MBSTRG(current_detect_order_list_size);
4545 		entry = MBSTRG(current_detect_order_list);
4546 		if (n > 0) {
4547 			size_t i;
4548 			array_init(return_value);
4549 			for (i = 0; i < n; i++) {
4550 				add_next_index_string(return_value, (*entry)->name);
4551 				entry++;
4552 			}
4553 		}
4554 	} else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4555 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4556 			RETVAL_STRING("none");
4557 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4558 			RETVAL_STRING("long");
4559 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4560 			RETVAL_STRING("entity");
4561 		} else {
4562 			RETVAL_LONG(MBSTRG(current_filter_illegal_substchar));
4563 		}
4564 	} else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4565 		if (MBSTRG(strict_detection)) {
4566 			RETVAL_STRING("On");
4567 		} else {
4568 			RETVAL_STRING("Off");
4569 		}
4570 	} else {
4571 		// TODO Convert to ValueError
4572 		RETURN_FALSE;
4573 	}
4574 }
4575 /* }}} */
4576 
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)4577 MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4578 {
4579 	uint32_t wchar_buf[128];
4580 	unsigned char *in = (unsigned char*)input;
4581 	unsigned int state = 0;
4582 
4583 	if (encoding->check != NULL) {
4584 		return encoding->check(in, length);
4585 	}
4586 
4587 	/* If the input string is not encoded in the given encoding, there is a significant chance
4588 	 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4589 	 * buffer of 128 codepoints, convert and check just a few codepoints first */
4590 	size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4591 	ZEND_ASSERT(out_len <= 8);
4592 	for (int i = 0; i < out_len; i++) {
4593 		if (wchar_buf[i] == MBFL_BAD_INPUT) {
4594 			return false;
4595 		}
4596 	}
4597 
4598 	while (length) {
4599 		out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4600 		ZEND_ASSERT(out_len <= 128);
4601 		for (int i = 0; i < out_len; i++) {
4602 			if (wchar_buf[i] == MBFL_BAD_INPUT) {
4603 				return false;
4604 			}
4605 		}
4606 	}
4607 
4608 	return true;
4609 }
4610 
4611 /* MSVC 32-bit has issues with 64-bit intrinsics.
4612  * (Bad 7/8-byte UTF-8 strings would be wrongly passed through as 'valid')
4613  * It seems this is caused by a bug in MS Visual C++
4614  * Ref: https://stackoverflow.com/questions/37509129/potential-bug-in-visual-studio-c-compiler-or-in-intel-intrinsics-avx2-mm256-s */
4615 #if defined(PHP_WIN32) && !defined(__clang__) && defined(_MSC_VER) && defined(_M_IX86)
4616 # define MBSTRING_BROKEN_X86_MSVC_INTRINSICS
4617 #endif
4618 
4619 /* If we are building an AVX2-only binary, don't compile the next function */
4620 #ifndef ZEND_INTRIN_AVX2_NATIVE
4621 
4622 /* SSE2-based function for validating UTF-8 strings
4623  * A faster implementation which uses AVX2 instructions follows */
mb_fast_check_utf8_default(zend_string * str)4624 static bool mb_fast_check_utf8_default(zend_string *str)
4625 {
4626 	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
4627 # ifdef __SSE2__
4628 	/* `e` points 1 byte past the last full 16-byte block of string content
4629 	 * Note that we include the terminating null byte which is included in each zend_string
4630 	 * as part of the content to check; this ensures that multi-byte characters which are
4631 	 * truncated abruptly at the end of the string will be detected as invalid */
4632 	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m128i) - 1));
4633 
4634 	/* For checking for illegal bytes 0xF5-FF */
4635 	const __m128i over_f5 = _mm_set1_epi8(-117);
4636 	/* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
4637 	const __m128i over_9f = _mm_set1_epi8(-97);
4638 	/* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
4639 	const __m128i over_8f = _mm_set1_epi8(-113);
4640 	/* For checking for illegal bytes 0xC0-C1 */
4641 	const __m128i find_c0 = _mm_set1_epi8(-64);
4642 	const __m128i c0_to_c1 = _mm_set1_epi8(-126);
4643 	/* For checking structure of continuation bytes */
4644 	const __m128i find_e0 = _mm_set1_epi8(-32);
4645 	const __m128i find_f0 = _mm_set1_epi8(-16);
4646 
4647 	__m128i last_block = _mm_setzero_si128();
4648 	__m128i operand;
4649 
4650 	while (p < e) {
4651 		operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */
4652 
4653 check_operand:
4654 		/* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
4655 		if (!_mm_movemask_epi8(operand)) {
4656 			/* Even if this block only contains single-byte characters, there may have been a
4657 			 * multi-byte character at the end of the previous block, which was supposed to
4658 			 * have continuation bytes in this block
4659 			 * This bitmask will pick out a 2/3/4-byte character starting from the last byte of
4660 			 * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
4661 			 * from the 3rd last */
4662 			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
4663 			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
4664 			if (_mm_movemask_epi8(bad)) {
4665 				return false;
4666 			}
4667 
4668 			/* Consume as many full blocks of single-byte characters as we can */
4669 			while (true) {
4670 				p += sizeof(__m128i);
4671 				if (p >= e) {
4672 					goto finish_up_remaining_bytes;
4673 				}
4674 				operand = _mm_loadu_si128((__m128i*)p);
4675 				if (_mm_movemask_epi8(operand)) {
4676 					break;
4677 				}
4678 			}
4679 		}
4680 
4681 		/* Check for >= 0xF5, which are illegal byte values in UTF-8
4682 		 * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
4683 		 * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
4684 		 * Then a single signed compare will pick out any bad bytes
4685 		 * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
4686 		__m128i bad = _mm_cmplt_epi8(_mm_add_epi8(operand, over_f5), over_f5);
4687 
4688 		/* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
4689 		 * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
4690 		 * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
4691 		 * We can check for both problems at once by generating a vector where each byte < 0xA0
4692 		 * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
4693 		 * Shift the original block right by one byte, and compare the shifted block with the bitmask */
4694 		__m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15));
4695 		__m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f)));
4696 		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask1));
4697 
4698 		/* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
4699 		 * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
4700 		 * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
4701 		 * Build the bitmask and compare it with the shifted block */
4702 		__m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f)));
4703 		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask2));
4704 
4705 		/* Check for overlong 2-byte code units
4706 		 * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
4707 		 * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
4708 		 * byte range, do a signed compare to pick out any bad bytes */
4709 		bad = _mm_or_si128(bad, _mm_cmplt_epi8(_mm_add_epi8(operand, find_c0), c0_to_c1));
4710 
4711 		/* Check structure of continuation bytes
4712 		 * A UTF-8 byte should be a continuation byte if, and only if, it is:
4713 		 * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
4714 		 * 2) 2 bytes after the start of a 3-byte or 4-byte character
4715 		 * 3) 3 bytes after the start of a 4-byte character
4716 		 * We build 3 bitmasks with 0xFF in each such position, and OR them together to
4717 		 * get a single bitmask with 0xFF in each position where a continuation byte should be */
4718 		__m128i cont_mask = _mm_cmpeq_epi8(_mm_and_si128(operand2, find_c0), find_c0);
4719 		__m128i operand3 = _mm_or_si128(_mm_slli_si128(operand, 2), _mm_srli_si128(last_block, 14));
4720 		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand3, find_e0), find_e0));
4721 		__m128i operand4 = _mm_or_si128(_mm_slli_si128(operand, 3), _mm_srli_si128(last_block, 13));
4722 		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand4, find_f0), find_f0));
4723 
4724 		/* Now, use a signed comparison to get another bitmask with 0xFF in each position where
4725 		 * a continuation byte actually is
4726 		 * XOR those two bitmasks together; if everything is good, the result should be zero
4727 		 * However, if a byte which should have been a continuation wasn't, or if a byte which
4728 		 * shouldn't have been a continuation was, we will get 0xFF in that position */
4729 		__m128i continuation = _mm_cmplt_epi8(operand, find_c0);
4730 		bad = _mm_or_si128(bad, _mm_xor_si128(continuation, cont_mask));
4731 
4732 		/* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
4733 		 * If that value is non-zero, then we found a bad byte somewhere! */
4734 		if (_mm_movemask_epi8(bad)) {
4735 			return false;
4736 		}
4737 
4738 		last_block = operand;
4739 		p += sizeof(__m128i);
4740 	}
4741 
4742 finish_up_remaining_bytes:
4743 	/* Finish up 1-15 remaining bytes */
4744 	if (p == e) {
4745 		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m128i) - 1); /* Not including terminating null */
4746 
4747 		/* Crazy hack here for cases where 9 or more bytes are remaining...
4748 		 * We want to use the above vectorized code to check a block of less than 16 bytes,
4749 		 * but there is no good way to read a variable number of bytes into an XMM register
4750 		 * However, we know that these bytes are part of a zend_string, and a zend_string has some
4751 		 * 'header' fields which occupy the memory just before its content
4752 		 * And, those header fields occupy more than 16 bytes...
4753 		 * So if we go back 16 bytes from the end of the zend_string content, and load 16 bytes from there,
4754 		 * we may pick up some 'junk' bytes from the zend_string header fields, but we will get the 1-15
4755 		 * bytes we wanted in the tail end of our XMM register, and this will never cause a segfault.
4756 		 * Then, we do a left shift to get rid of the unwanted bytes
4757 		 * Conveniently, the same left shift also zero-fills the tail end of the XMM register
4758 		 *
4759 		 * The following `switch` looks useless, but it's not
4760 		 * The PSRLDQ instruction used for the 128-bit left shift requires an immediate (literal)
4761 		 * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
4762 		 */
4763 		switch (remaining_bytes) {
4764 		case 0: ;
4765 			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
4766 			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
4767 			return _mm_movemask_epi8(bad) == 0;
4768 		case 1:
4769 		case 2:
4770 			operand = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, *((uint16_t*)p));
4771 			goto check_operand;
4772 		case 3:
4773 		case 4:
4774 			operand = _mm_set_epi32(0, 0, 0, *((uint32_t*)p));
4775 			goto check_operand;
4776 		case 5:
4777 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10);
4778 			goto check_operand;
4779 		case 6:
4780 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9);
4781 			goto check_operand;
4782 		case 7:
4783 		case 8:
4784 #ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
4785 			operand = _mm_set_epi32(0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
4786 #else
4787 			operand = _mm_set_epi64x(0, *((uint64_t*)p));
4788 #endif
4789 			goto check_operand;
4790 		case 9:
4791 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6);
4792 			goto check_operand;
4793 		case 10:
4794 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5);
4795 			goto check_operand;
4796 		case 11:
4797 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4);
4798 			goto check_operand;
4799 		case 12:
4800 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3);
4801 			goto check_operand;
4802 		case 13:
4803 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2);
4804 			goto check_operand;
4805 		case 14:
4806 			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1);
4807 			goto check_operand;
4808 		case 15:
4809 			/* No trailing bytes are left which need to be checked
4810 			 * We get 15 because we did not include the terminating null when
4811 			 * calculating `remaining_bytes`, so the value wraps around */
4812 			return true;
4813 		}
4814 
4815 		ZEND_UNREACHABLE();
4816 	}
4817 
4818 	return true;
4819 # else
4820 	/* This UTF-8 validation function is derived from PCRE2 */
4821 	size_t length = ZSTR_LEN(str);
4822 	/* Table of the number of extra bytes, indexed by the first byte masked with
4823 	0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
4824 	static const uint8_t utf8_table[] = {
4825 		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
4826 		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
4827 		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
4828 		3,3,3,3,3,3,3,3
4829 	};
4830 
4831 	for (; length > 0; p++) {
4832 		uint32_t d;
4833 		unsigned char c = *p;
4834 		length--;
4835 
4836 		if (c < 128) {
4837 			/* ASCII character */
4838 			continue;
4839 		}
4840 
4841 		if (c < 0xc0) {
4842 			/* Isolated 10xx xxxx byte */
4843 			return false;
4844 		}
4845 
4846 		if (c >= 0xf5) {
4847 			return false;
4848 		}
4849 
4850 		uint32_t ab = utf8_table[c & 0x3f]; /* Number of additional bytes (1-3) */
4851 		if (length < ab) {
4852 			/* Missing bytes */
4853 			return false;
4854 		}
4855 		length -= ab;
4856 
4857 		/* Check top bits in the second byte */
4858 		if (((d = *(++p)) & 0xc0) != 0x80) {
4859 			return false;
4860 		}
4861 
4862 		/* For each length, check that the remaining bytes start with the 0x80 bit
4863 		 * set and not the 0x40 bit. Then check for an overlong sequence, and for the
4864 		 * excluded range 0xd800 to 0xdfff. */
4865 		switch (ab) {
4866 		case 1:
4867 			/* 2-byte character. No further bytes to check for 0x80. Check first byte
4868 			 * for xx00 000x (overlong sequence). */
4869 			if ((c & 0x3e) == 0) {
4870 				return false;
4871 			}
4872 			break;
4873 
4874 		case 2:
4875 			/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes for
4876 			 * 1110 0000, xx0x xxxx (overlong sequence) or 1110 1101, 1010 xxxx (0xd800-0xdfff) */
4877 			if ((*(++p) & 0xc0) != 0x80 || (c == 0xe0 && (d & 0x20) == 0) || (c == 0xed && d >= 0xa0)) {
4878 				return false;
4879 			}
4880 			break;
4881 
4882 		case 3:
4883 			/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
4884 			 * bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
4885 			 * character greater than 0x0010ffff (f4 8f bf bf) */
4886 			if ((*(++p) & 0xc0) != 0x80 || (*(++p) & 0xc0) != 0x80 || (c == 0xf0 && (d & 0x30) == 0) || (c > 0xf4 || (c == 0xf4 && d > 0x8f))) {
4887 				return false;
4888 			}
4889 			break;
4890 
4891 			EMPTY_SWITCH_DEFAULT_CASE();
4892 		}
4893 	}
4894 
4895 	return true;
4896 # endif
4897 }
4898 
4899 #endif /* #ifndef ZEND_INTRIN_AVX2_NATIVE */
4900 
4901 #ifdef ZEND_INTRIN_AVX2_NATIVE
4902 
4903 /* We are building AVX2-only binary */
4904 # include <immintrin.h>
4905 # define mb_fast_check_utf8 mb_fast_check_utf8_avx2
4906 
4907 #elif defined(ZEND_INTRIN_AVX2_RESOLVER)
4908 
4909 /* We are building binary which works with or without AVX2; whether or not to use
4910  * AVX2-accelerated functions will be determined at runtime */
4911 # include <immintrin.h>
4912 # include "Zend/zend_cpuinfo.h"
4913 
4914 # ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
4915 /* Dynamic linker will decide whether or not to use AVX2-based functions and
4916  * resolve symbols accordingly */
4917 
4918 ZEND_INTRIN_AVX2_FUNC_DECL(bool mb_fast_check_utf8_avx2(zend_string *str));
4919 
4920 bool mb_fast_check_utf8(zend_string *str) __attribute__((ifunc("resolve_check_utf8")));
4921 
4922 typedef bool (*check_utf8_func_t)(zend_string*);
4923 
4924 ZEND_NO_SANITIZE_ADDRESS
4925 ZEND_ATTRIBUTE_UNUSED
resolve_check_utf8(void)4926 static check_utf8_func_t resolve_check_utf8(void)
4927 {
4928 	if (zend_cpu_supports_avx2()) {
4929 		return mb_fast_check_utf8_avx2;
4930 	}
4931 	return mb_fast_check_utf8_default;
4932 }
4933 
4934 # else /* ZEND_INTRIN_AVX2_FUNC_PTR */
4935 /* We are compiling for a target where the dynamic linker will not be able to
4936  * resolve symbols according to whether the host supports AVX2 or not; so instead,
4937  * we can make calls go through a function pointer and set the function pointer
4938  * on module load */
4939 
4940 #ifdef HAVE_FUNC_ATTRIBUTE_TARGET
4941 static bool mb_fast_check_utf8_avx2(zend_string *str) __attribute__((target("avx2")));
4942 #else
4943 static bool mb_fast_check_utf8_avx2(zend_string *str);
4944 #endif
4945 
4946 static bool (*check_utf8_ptr)(zend_string *str) = NULL;
4947 
/* Validate `str` as UTF-8 using whichever implementation `check_utf8_ptr`
 * currently points to; the pointer is assigned by init_check_utf8() */
static bool mb_fast_check_utf8(zend_string *str)
{
	return (*check_utf8_ptr)(str);
}
4952 
init_check_utf8(void)4953 static void init_check_utf8(void)
4954 {
4955 	if (zend_cpu_supports_avx2()) {
4956 		check_utf8_ptr = mb_fast_check_utf8_avx2;
4957 	} else {
4958 		check_utf8_ptr = mb_fast_check_utf8_default;
4959 	}
4960 }
4961 # endif
4962 
4963 #else
4964 
4965 /* No AVX2 support */
4966 #define mb_fast_check_utf8 mb_fast_check_utf8_default
4967 
4968 #endif
4969 
4970 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
4971 
4972 /* GCC prior to version 8 does not define all intrinsics. See GH-11514.
4973  * Use a workaround from https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values */
4974 #if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 8
4975 # define _mm256_set_m128i(v0, v1)  _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
4976 #endif
4977 
4978 /* Take (256-bit) `hi` and `lo` as a 512-bit value, shift down by some
4979  * number of bytes, then take the low 256 bits
4980  * This is used to take some number of trailing bytes from the previous 32-byte
4981  * block followed by some number of leading bytes from the current 32-byte block
4982  *
4983  * _mm256_alignr_epi8 (VPALIGNR) is used to shift out bytes from a 256-bit
4984  * YMM register while shifting in bytes from another YMM register... but
4985  * it works separately on respective 128-bit halves of the YMM registers,
4986  * which is not what we want.
4987  * To make it work as desired, we first do _mm256_permute2x128_si256
 * (VPERM2I128) to combine the high 128 bits from the previous block and
 * the low 128 bits of the current block in one YMM register.
4990  * Then VPALIGNR will do what is needed. */
4991 #define _mm256_shift_epi8(hi, lo, shift) _mm256_alignr_epi8(lo, _mm256_permute2x128_si256(hi, lo, 33), 16 - shift)
4992 
4993 /* AVX2-based UTF-8 validation function; validates text in 32-byte chunks
4994  *
4995  * Some parts of this function are the same as `mb_fast_check_utf8`; code comments
4996  * are not repeated, so consult `mb_fast_check_utf8` for information on uncommented
4997  * sections. */
4998 #ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
mb_fast_check_utf8_avx2(zend_string * str)4999 ZEND_API bool mb_fast_check_utf8_avx2(zend_string *str)
5000 #else
5001 static bool mb_fast_check_utf8_avx2(zend_string *str)
5002 #endif
5003 {
5004 	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
5005 	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m256i) - 1));
5006 
5007 	/* The algorithm used here for UTF-8 validation is partially adapted from the
5008 	 * paper "Validating UTF-8 In Less Than One Instruction Per Byte", by John Keiser
5009 	 * and Daniel Lemire.
5010 	 * Ref: https://arxiv.org/pdf/2010.03090.pdf
5011 	 *
5012 	 * Most types of invalid UTF-8 text can be detected by examining pairs of
5013 	 * successive bytes. Specifically:
5014 	 *
5015 	 * • Overlong 2-byte code units start with 0xC0 or 0xC1.
5016 	 *   No valid UTF-8 string ever uses these byte values.
5017 	 * • Overlong 3-byte code units start with 0xE0, followed by a byte < 0xA0.
5018 	 * • Overlong 4-byte code units start with 0xF0, followed by a byte < 0x90.
5019 	 * • 5-byte or 6-byte code units, which should never be used, start with
5020 	 *   0xF8-FE.
5021 	 * • A codepoint value higher than U+10FFFF, which is the highest value for
5022 	 *   any Unicode codepoint, would either start with 0xF4, followed by a
5023 	 *   byte >= 0x90, or else would start with 0xF5-F7, followed by any value.
5024 	 * • A codepoint value from U+D800-DFFF, which are reserved and should never
5025 	 *   be used, would start with 0xED, followed by a byte >= 0xA0.
5026 	 * • The byte value 0xFF is also illegal and is never used in valid UTF-8.
5027 	 *
5028 	 * To detect all these problems, for each pair of successive bytes, we do
5029 	 * table lookups using the high nibble of the first byte, the low nibble of
5030 	 * the first byte, and the high nibble of the second byte. Each table lookup
5031 	 * retrieves a bitmask, in which each 1 bit indicates a possible invalid
5032 	 * combination; AND those three bitmasks together, and any 1 bit in the result
5033 	 * will indicate an actual invalid byte combination was found.
5034 	 */
5035 
5036 #define BAD_BYTE 0x1
5037 #define OVERLONG_2BYTE 0x2
5038 #define _1BYTE (BAD_BYTE | OVERLONG_2BYTE)
5039 #define OVERLONG_3BYTE 0x4
5040 #define SURROGATE 0x8
5041 #define OVERLONG_4BYTE 0x10
5042 #define INVALID_CP 0x20
5043 
5044 	/* Each of these are 16-entry tables, repeated twice; this is required by the
5045 	 * VPSHUFB instruction which we use to perform 32 table lookups in parallel
5046 	 * The first entry is for 0xF, the second is for 0xE, and so on down to 0x0
5047 	 *
5048 	 * So, for example, notice that the 4th entry in the 1st table is OVERLONG_2BYTE;
5049 	 * that means that high nibble 0xC is consistent with the byte pair being part of
5050 	 * an overlong 2-byte code unit */
5051 	const __m256i bad_hi_nibble2 = _mm256_set_epi8(
5052 		BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5053 		0, 0, 0, 0,
5054 		0, 0, 0, 0,
5055 		0, 0, 0, 0,
5056 		BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5057 		0, 0, 0, 0,
5058 		0, 0, 0, 0,
5059 		0, 0, 0, 0);
5060 	const __m256i bad_lo_nibble2 = _mm256_set_epi8(
5061 		BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5062 		BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5063 		BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5064 		0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5065 		BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5066 		BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5067 		BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5068 		0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5069 	const __m256i bad_hi_nibble = _mm256_set_epi8(
5070 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5071 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5072 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5073 		_1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5074 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5075 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5076 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5077 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5078 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5079 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5080 		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5081 		_1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5082 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5083 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5084 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5085 		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5086 
5087 	const __m256i find_continuation = _mm256_set1_epi8(-64);
5088 	const __m256i _b = _mm256_set1_epi8(0xB);
5089 	const __m256i _d = _mm256_set1_epi8(0xD);
5090 	const __m256i _f = _mm256_set1_epi8(0xF);
5091 
5092 	__m256i last_hi_nibbles = _mm256_setzero_si256(), last_lo_nibbles = _mm256_setzero_si256();
5093 	__m256i operand;
5094 
5095 	while (p < e) {
5096 		operand = _mm256_loadu_si256((__m256i*)p);
5097 
5098 check_operand:
5099 		if (!_mm256_movemask_epi8(operand)) {
5100 			/* Entire 32-byte block is ASCII characters; the only thing we need to validate is that
5101 			 * the previous block didn't end with an incomplete multi-byte character
5102 			 * (This will also confirm that the previous block didn't end with a bad byte like 0xFF) */
5103 			__m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5104 			__m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5105 			if (_mm256_movemask_epi8(bad)) {
5106 				return false;
5107 			}
5108 
5109 			/* Consume as many full blocks of single-byte characters as we can */
5110 			while (true) {
5111 				p += sizeof(__m256i);
5112 				if (p >= e) {
5113 					goto finish_up_remaining_bytes;
5114 				}
5115 				operand = _mm256_loadu_si256((__m256i*)p);
5116 				if (_mm256_movemask_epi8(operand)) {
5117 					break;
5118 				}
5119 			}
5120 		}
5121 
5122 		__m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(operand, 4), _f);
5123 		__m256i lo_nibbles = _mm256_and_si256(operand, _f);
5124 
5125 		__m256i lo_nibbles2 = _mm256_shift_epi8(last_lo_nibbles, lo_nibbles, 1);
5126 		__m256i hi_nibbles2 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 1);
5127 
5128 		/* Do parallel table lookups in all 3 tables */
5129 		__m256i bad = _mm256_cmpgt_epi8(
5130 			_mm256_and_si256(
5131 				_mm256_and_si256(
5132 					_mm256_shuffle_epi8(bad_lo_nibble2, lo_nibbles2),
5133 					_mm256_shuffle_epi8(bad_hi_nibble2, hi_nibbles2)),
5134 				_mm256_shuffle_epi8(bad_hi_nibble, hi_nibbles)),
5135 			_mm256_setzero_si256());
5136 
5137 		__m256i cont_mask = _mm256_cmpgt_epi8(hi_nibbles2, _b);
5138 		__m256i hi_nibbles3 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 2);
5139 		cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpgt_epi8(hi_nibbles3, _d));
5140 		__m256i hi_nibbles4 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 3);
5141 		cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpeq_epi8(hi_nibbles4, _f));
5142 
5143 		__m256i continuation = _mm256_cmpgt_epi8(find_continuation, operand);
5144 		bad = _mm256_or_si256(bad, _mm256_xor_si256(continuation, cont_mask));
5145 
5146 		if (_mm256_movemask_epi8(bad)) {
5147 			return false;
5148 		}
5149 
5150 		last_hi_nibbles = hi_nibbles;
5151 		last_lo_nibbles = lo_nibbles;
5152 		p += sizeof(__m256i);
5153 	}
5154 
5155 finish_up_remaining_bytes:
5156 	if (p == e) {
5157 		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m256i) - 1); /* Not including terminating null */
5158 
5159 		switch (remaining_bytes) {
5160 		case 0: ;
5161 			/* No actual data bytes are remaining */
5162 			__m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5163 			__m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5164 			return _mm256_movemask_epi8(bad) == 0;
5165 		case 1:
5166 		case 2:
5167 			operand = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, *((int16_t*)p));
5168 			goto check_operand;
5169 		case 3:
5170 		case 4:
5171 			operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, *((int32_t*)p));
5172 			goto check_operand;
5173 		case 5:
5174 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10));
5175 			goto check_operand;
5176 		case 6:
5177 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9));
5178 			goto check_operand;
5179 		case 7:
5180 		case 8:
5181 #ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
5182 			operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
5183 #else
5184 			operand = _mm256_set_epi64x(0, 0, 0, *((int64_t*)p));
5185 #endif
5186 			goto check_operand;
5187 		case 9:
5188 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6));
5189 			goto check_operand;
5190 		case 10:
5191 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5));
5192 			goto check_operand;
5193 		case 11:
5194 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4));
5195 			goto check_operand;
5196 		case 12:
5197 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3));
5198 			goto check_operand;
5199 		case 13:
5200 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2));
5201 			goto check_operand;
5202 		case 14:
5203 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1));
5204 			goto check_operand;
5205 		case 15:
5206 		case 16:
5207 			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_loadu_si128((__m128i*)p));
5208 			goto check_operand;
5209 		case 17:
5210 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 2)), 14), _mm_loadu_si128((__m128i*)p));
5211 			goto check_operand;
5212 		case 18:
5213 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 3)), 13), _mm_loadu_si128((__m128i*)p));
5214 			goto check_operand;
5215 		case 19:
5216 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 4)), 12), _mm_loadu_si128((__m128i*)p));
5217 			goto check_operand;
5218 		case 20:
5219 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 5)), 11), _mm_loadu_si128((__m128i*)p));
5220 			goto check_operand;
5221 		case 21:
5222 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 6)), 10), _mm_loadu_si128((__m128i*)p));
5223 			goto check_operand;
5224 		case 22:
5225 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 7)), 9), _mm_loadu_si128((__m128i*)p));
5226 			goto check_operand;
5227 		case 23:
5228 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 8)), 8), _mm_loadu_si128((__m128i*)p));
5229 			goto check_operand;
5230 		case 24:
5231 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 9)), 7), _mm_loadu_si128((__m128i*)p));
5232 			goto check_operand;
5233 		case 25:
5234 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 10)), 6), _mm_loadu_si128((__m128i*)p));
5235 			goto check_operand;
5236 		case 26:
5237 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 11)), 5), _mm_loadu_si128((__m128i*)p));
5238 			goto check_operand;
5239 		case 27:
5240 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 12)), 4), _mm_loadu_si128((__m128i*)p));
5241 			goto check_operand;
5242 		case 28:
5243 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 13)), 3), _mm_loadu_si128((__m128i*)p));
5244 			goto check_operand;
5245 		case 29:
5246 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 14)), 2), _mm_loadu_si128((__m128i*)p));
5247 			goto check_operand;
5248 		case 30:
5249 			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 15)), 1), _mm_loadu_si128((__m128i*)p));
5250 			goto check_operand;
5251 		case 31:
5252 			return true;
5253 		}
5254 
5255 		ZEND_UNREACHABLE();
5256 	}
5257 
5258 	return true;
5259 }
5260 
5261 #endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
5262 
/* Check whether `str` is valid in `encoding`.
 *
 * For UTF-8 the result is cached on the string itself: a string already flagged
 * IS_STR_VALID_UTF8 is accepted immediately, and a string which passes
 * mb_fast_check_utf8() gets that flag added (interned strings are left untouched).
 * Every other encoding is delegated to php_mb_check_encoding(). */
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
{
	if (encoding != &mbfl_encoding_utf8) {
		return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
	}

	if (GC_FLAGS(str) & IS_STR_VALID_UTF8) {
		/* Already known to be valid; no need to scan the bytes again */
		return true;
	}

	if (!mb_fast_check_utf8(str)) {
		return false;
	}

	if (!ZSTR_IS_INTERNED(str)) {
		/* Remember the positive result so that later checks can be skipped */
		GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
	}
	return true;
}
5278 
/* Recursively check that all string keys and string values of `vars` are valid in `encoding`.
 *
 * Nested arrays are descended into. Integer, float, null and boolean values are accepted
 * without inspection; a value of any other type makes the whole array invalid.
 * If `vars` is already marked as being visited (GC_IS_RECURSIVE), a warning is emitted
 * and the array is reported as invalid.
 *
 * Returns 1 if everything checked out, 0 otherwise. Scanning stops at the first invalid
 * key or value. */
static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	zend_long idx;
	zend_string *key;
	zval *entry;
	int valid = 1;

	(void)(idx); /* Suppress spurious compiler warning that `idx` is not used */

	if (GC_IS_RECURSIVE(vars)) {
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return 0;
	}
	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
		ZVAL_DEREF(entry);
		if (key && !mb_check_str_encoding(key, encoding)) {
			valid = 0;
			break;
		}
		switch (Z_TYPE_P(entry)) {
			case IS_STRING:
				if (!mb_check_str_encoding(Z_STR_P(entry), encoding)) {
					valid = 0;
				}
				break;
			case IS_ARRAY:
				if (!php_mb_check_encoding_recursive(Z_ARRVAL_P(entry), encoding)) {
					valid = 0;
				}
				break;
			case IS_LONG:
			case IS_DOUBLE:
			case IS_NULL:
			case IS_TRUE:
			case IS_FALSE:
				break;
			default:
				/* Other types are error. */
				valid = 0;
				break;
		}
		if (!valid) {
			/* A `break` inside the switch only leaves the switch; leave the loop
			 * explicitly so the remaining entries are not scanned for nothing */
			break;
		}
	} ZEND_HASH_FOREACH_END();
	GC_TRY_UNPROTECT_RECURSION(vars);
	return valid;
}
5329 
5330 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)5331 PHP_FUNCTION(mb_check_encoding)
5332 {
5333 	zend_string *input_str = NULL, *enc = NULL;
5334 	HashTable *input_ht = NULL;
5335 	const mbfl_encoding *encoding;
5336 
5337 	ZEND_PARSE_PARAMETERS_START(0, 2)
5338 		Z_PARAM_OPTIONAL
5339 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
5340 		Z_PARAM_STR_OR_NULL(enc)
5341 	ZEND_PARSE_PARAMETERS_END();
5342 
5343 	encoding = php_mb_get_encoding(enc, 2);
5344 	if (!encoding) {
5345 		RETURN_THROWS();
5346 	}
5347 
5348 	if (input_ht) {
5349 		RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
5350 	} else if (input_str) {
5351 		RETURN_BOOL(mb_check_str_encoding(input_str, encoding));
5352 	} else {
5353 		php_error_docref(NULL, E_DEPRECATED,
5354 			"Calling mb_check_encoding() without argument is deprecated");
5355 
5356 		/* FIXME: Actually check all inputs, except $_FILES file content. */
5357 		RETURN_BOOL(MBSTRG(illegalchars) == 0);
5358 	}
5359 }
5360 /* }}} */
5361 
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)5362 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
5363 	const uint32_t enc_name_arg_num)
5364 {
5365 	const mbfl_encoding *enc;
5366 	enum mbfl_no_encoding no_enc;
5367 
5368 	ZEND_ASSERT(str_len > 0);
5369 
5370 	enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
5371 	if (!enc) {
5372 		return -2;
5373 	}
5374 
5375 	no_enc = enc->no_encoding;
5376 	if (php_mb_is_unsupported_no_encoding(no_enc)) {
5377 		zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
5378 		return -2;
5379 	}
5380 
5381 	/* Some legacy text encodings have a minimum required wchar buffer size;
5382 	 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
5383 	uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
5384 	unsigned int state = 0;
5385 	size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
5386 	ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
5387 
5388 	if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
5389 		return -1;
5390 	}
5391 	return wchar_buf[0];
5392 }
5393 
5394 /* {{{ */
PHP_FUNCTION(mb_ord)5395 PHP_FUNCTION(mb_ord)
5396 {
5397 	char *str;
5398 	size_t str_len;
5399 	zend_string *enc = NULL;
5400 	zend_long cp;
5401 
5402 	ZEND_PARSE_PARAMETERS_START(1, 2)
5403 		Z_PARAM_STRING(str, str_len)
5404 		Z_PARAM_OPTIONAL
5405 		Z_PARAM_STR_OR_NULL(enc)
5406 	ZEND_PARSE_PARAMETERS_END();
5407 
5408 	if (str_len == 0) {
5409 		zend_argument_value_error(1, "must not be empty");
5410 		RETURN_THROWS();
5411 	}
5412 
5413 	cp = php_mb_ord(str, str_len, enc, 2);
5414 
5415 	if (0 > cp) {
5416 		if (cp == -2) {
5417 			RETURN_THROWS();
5418 		}
5419 		RETURN_FALSE;
5420 	}
5421 
5422 	RETURN_LONG(cp);
5423 }
5424 /* }}} */
5425 
/* Build a string holding the single codepoint `cp` in the encoding named by `enc_name`.
 *
 * Returns NULL if the encoding cannot be resolved, is not supported (a ValueError is
 * raised here in that case), if `cp` lies outside 0..0x10FFFF, if `cp` is a surrogate
 * and the target is UTF-8, or if the generic conversion path reports illegal characters.
 * `enc_name_arg_num` is forwarded to php_mb_get_encoding(). */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *target = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (target == NULL) {
		return NULL;
	}

	enum mbfl_no_encoding target_no = target->no_encoding;
	if (php_mb_is_unsupported_no_encoding(target_no)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", target->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	if (php_mb_is_no_encoding_utf8(target_no)) {
		/* UTF-8 is encoded by hand */
		if (cp >= 0xd800 && cp <= 0xdfff) {
			return NULL;
		}

		if (cp < 0x80) {
			return ZSTR_CHAR(cp);
		}

		/* Pick the sequence length and the lead byte... */
		size_t n_bytes;
		zend_long lead;
		if (cp < 0x800) {
			n_bytes = 2;
			lead = 0xc0 | (cp >> 6);
		} else if (cp < 0x10000) {
			n_bytes = 3;
			lead = 0xe0 | (cp >> 12);
		} else {
			n_bytes = 4;
			lead = 0xf0 | (cp >> 18);
		}

		zend_string *encoded = zend_string_alloc(n_bytes, 0);
		char *out = ZSTR_VAL(encoded);
		out[0] = lead;
		/* ...then emit the continuation bytes, 6 payload bits each, most significant first */
		for (size_t i = 1; i < n_bytes; i++) {
			out[i] = 0x80 | ((cp >> (6 * (n_bytes - 1 - i))) & 0x3f);
		}
		out[n_bytes] = 0;

		return encoded;
	}

	/* Any other encoding: serialize the codepoint as 4 big-endian bytes and convert
	 * that from UCS-4BE to the target encoding */
	char ucs4be[4];
	for (int i = 0; i < 4; i++) {
		ucs4be[i] = (cp >> (8 * (3 - i))) & 0xff;
	}

	/* Temporarily zero the illegal character counter so that a failure of this
	 * particular conversion can be detected; the previous value is restored afterwards */
	long saved_illegalchars = MBSTRG(illegalchars);
	MBSTRG(illegalchars) = 0;
	zend_string *converted = php_mb_convert_encoding_ex(ucs4be, 4, target, &mbfl_encoding_ucs4be);

	if (MBSTRG(illegalchars) != 0) {
		zend_string_release(converted);
		converted = NULL;
	}

	MBSTRG(illegalchars) = saved_illegalchars;
	return converted;
}
5495 
5496 /* {{{ */
PHP_FUNCTION(mb_chr)5497 PHP_FUNCTION(mb_chr)
5498 {
5499 	zend_long cp;
5500 	zend_string *enc = NULL;
5501 
5502 	ZEND_PARSE_PARAMETERS_START(1, 2)
5503 		Z_PARAM_LONG(cp)
5504 		Z_PARAM_OPTIONAL
5505 		Z_PARAM_STR_OR_NULL(enc)
5506 	ZEND_PARSE_PARAMETERS_END();
5507 
5508 	zend_string* ret = php_mb_chr(cp, enc, 2);
5509 	if (ret == NULL) {
5510 		RETURN_FALSE;
5511 	}
5512 
5513 	RETURN_STR(ret);
5514 }
5515 /* }}} */
5516 
PHP_FUNCTION(mb_str_pad)5517 PHP_FUNCTION(mb_str_pad)
5518 {
5519 	zend_string *input, *encoding_str = NULL, *pad = ZSTR_CHAR(' ');
5520 	zend_long pad_to_length;
5521 	zend_long pad_type_val = PHP_STR_PAD_RIGHT;
5522 
5523 	ZEND_PARSE_PARAMETERS_START(2, 5)
5524 		Z_PARAM_STR(input)
5525 		Z_PARAM_LONG(pad_to_length)
5526 		Z_PARAM_OPTIONAL
5527 		Z_PARAM_STR(pad)
5528 		Z_PARAM_LONG(pad_type_val)
5529 		Z_PARAM_STR_OR_NULL(encoding_str)
5530 	ZEND_PARSE_PARAMETERS_END();
5531 
5532 	const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
5533 	if (!encoding) {
5534 		RETURN_THROWS();
5535 	}
5536 
5537 	size_t input_length = mb_get_strlen(input, encoding);
5538 
5539 	/* If resulting string turns out to be shorter than input string,
5540 	   we simply copy the input and return. */
5541 	if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
5542 		RETURN_STR_COPY(input);
5543 	}
5544 
5545 	if (ZSTR_LEN(pad) == 0) {
5546 		zend_argument_value_error(3, "must be a non-empty string");
5547 		RETURN_THROWS();
5548 	}
5549 
5550 	if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
5551 		zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
5552 		RETURN_THROWS();
5553 	}
5554 
5555 	size_t pad_length = mb_get_strlen(pad, encoding);
5556 
5557 	size_t num_mb_pad_chars = pad_to_length - input_length;
5558 
5559 	/* We need to figure out the left/right padding lengths. */
5560 	size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
5561 	switch (pad_type_val) {
5562 		case PHP_STR_PAD_RIGHT:
5563 			right_pad = num_mb_pad_chars;
5564 			break;
5565 
5566 		case PHP_STR_PAD_LEFT:
5567 			left_pad = num_mb_pad_chars;
5568 			break;
5569 
5570 		case PHP_STR_PAD_BOTH:
5571 			left_pad = num_mb_pad_chars / 2;
5572 			right_pad = num_mb_pad_chars - left_pad;
5573 			break;
5574 	}
5575 
5576 	/* How many full block copies need to happen, and how many characters are then left over? */
5577 	size_t full_left_pad_copies = left_pad / pad_length;
5578 	size_t full_right_pad_copies = right_pad / pad_length;
5579 	size_t remaining_left_pad_chars = left_pad % pad_length;
5580 	size_t remaining_right_pad_chars = right_pad % pad_length;
5581 
5582 	if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
5583 		goto overflow_no_release;
5584 	}
5585 
5586 	/* Compute the number of bytes required for the padding */
5587 	size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
5588 	size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);
5589 
5590 	/* No special fast-path handling necessary for zero-length pads because these functions will not
5591 	 * allocate memory in case a zero-length pad is required. */
5592 	zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
5593 	zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);
5594 
5595 	if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
5596 		|| full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
5597 		goto overflow;
5598 	}
5599 
5600 	size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
5601 	size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);
5602 
5603 	if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
5604 		|| ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
5605 		goto overflow;
5606 	}
5607 
5608 	zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
5609 	char *buffer = ZSTR_VAL(result);
5610 
5611 	/* First we pad the left. */
5612 	for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5613 		memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5614 	}
5615 	memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
5616 	buffer += ZSTR_LEN(remaining_left_pad_str);
5617 
5618 	/* Then we copy the input string. */
5619 	memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
5620 	buffer += ZSTR_LEN(input);
5621 
5622 	/* Finally, we pad on the right. */
5623 	for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5624 		memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5625 	}
5626 	memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));
5627 
5628 	ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';
5629 
5630 	zend_string_release_ex(remaining_left_pad_str, false);
5631 	zend_string_release_ex(remaining_right_pad_str, false);
5632 
5633 	RETURN_NEW_STR(result);
5634 
5635 overflow:
5636 	zend_string_release_ex(remaining_left_pad_str, false);
5637 	zend_string_release_ex(remaining_right_pad_str, false);
5638 overflow_no_release:
5639 	zend_throw_error(NULL, "String size overflow");
5640 	RETURN_THROWS();
5641 }
5642 
5643 /* {{{ */
PHP_FUNCTION(mb_scrub)5644 PHP_FUNCTION(mb_scrub)
5645 {
5646 	zend_string *str, *enc_name = NULL;
5647 
5648 	ZEND_PARSE_PARAMETERS_START(1, 2)
5649 		Z_PARAM_STR(str)
5650 		Z_PARAM_OPTIONAL
5651 		Z_PARAM_STR_OR_NULL(enc_name)
5652 	ZEND_PARSE_PARAMETERS_END();
5653 
5654 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
5655 	if (!enc) {
5656 		RETURN_THROWS();
5657 	}
5658 
5659 	if (enc == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) {
5660 		/* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
5661 		RETURN_STR_COPY(str);
5662 	}
5663 
5664 	RETURN_STR(php_mb_convert_encoding_ex(ZSTR_VAL(str), ZSTR_LEN(str), enc, enc));
5665 }
5666 /* }}} */
5667 
5668 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)5669 static void php_mb_populate_current_detect_order_list(void)
5670 {
5671 	const mbfl_encoding **entry = 0;
5672 	size_t nentries;
5673 
5674 	if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
5675 		nentries = MBSTRG(detect_order_list_size);
5676 		entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5677 		memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
5678 	} else {
5679 		const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
5680 		size_t i;
5681 		nentries = MBSTRG(default_detect_order_list_size);
5682 		entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5683 		for (i = 0; i < nentries; i++) {
5684 			entry[i] = mbfl_no2encoding(src[i]);
5685 		}
5686 	}
5687 	MBSTRG(current_detect_order_list) = entry;
5688 	MBSTRG(current_detect_order_list_size) = nentries;
5689 }
5690 /* }}} */
5691 
5692 /* {{{ static int php_mb_encoding_translation() */
php_mb_encoding_translation(void)5693 static int php_mb_encoding_translation(void)
5694 {
5695 	return MBSTRG(encoding_translation);
5696 }
5697 /* }}} */
5698 
php_mb_mbchar_bytes(const char * s,const mbfl_encoding * enc)5699 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
5700 {
5701 	if (enc) {
5702 		if (enc->mblen_table) {
5703 			if (s) {
5704 				return enc->mblen_table[*(unsigned char *)s];
5705 			}
5706 		} else if (enc->flag & MBFL_ENCTYPE_WCS2) {
5707 			return 2;
5708 		} else if (enc->flag & MBFL_ENCTYPE_WCS4) {
5709 			return 4;
5710 		}
5711 	}
5712 	return 1;
5713 }
5714 
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)5715 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
5716 {
5717 	const char *p = s;
5718 	char *last=NULL;
5719 
5720 	if (nbytes == (size_t)-1) {
5721 		size_t nb = 0;
5722 
5723 		while (*p != '\0') {
5724 			if (nb == 0) {
5725 				if ((unsigned char)*p == (unsigned char)c) {
5726 					last = (char *)p;
5727 				}
5728 				nb = php_mb_mbchar_bytes(p, enc);
5729 				if (nb == 0) {
5730 					return NULL; /* something is going wrong! */
5731 				}
5732 			}
5733 			--nb;
5734 			++p;
5735 		}
5736 	} else {
5737 		size_t bcnt = nbytes;
5738 		size_t nbytes_char;
5739 		while (bcnt > 0) {
5740 			if ((unsigned char)*p == (unsigned char)c) {
5741 				last = (char *)p;
5742 			}
5743 			nbytes_char = php_mb_mbchar_bytes(p, enc);
5744 			if (bcnt < nbytes_char) {
5745 				return NULL;
5746 			}
5747 			p += nbytes_char;
5748 			bcnt -= nbytes_char;
5749 		}
5750 	}
5751 	return last;
5752 }
5753 
php_mb_stripos(bool mode,zend_string * haystack,zend_string * needle,zend_long offset,const mbfl_encoding * enc)5754 MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc)
5755 {
5756 	/* We're using simple case-folding here, because we'd have to deal with remapping of
5757 	 * offsets otherwise. */
5758 	zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
5759 	zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
5760 
5761 	size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
5762 
5763 	zend_string_free(haystack_conv);
5764 	zend_string_free(needle_conv);
5765 
5766 	return n;
5767 }
5768 
php_mb_gpc_get_detect_order(const zend_encoding *** list,size_t * list_size)5769 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
5770 {
5771 	*list = (const zend_encoding **)MBSTRG(http_input_list);
5772 	*list_size = MBSTRG(http_input_list_size);
5773 }
5774 /* }}} */
5775 
php_mb_gpc_set_input_encoding(const zend_encoding * encoding)5776 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
5777 {
5778 	MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
5779 }
5780 /* }}} */
5781 
/* Base64 alphabet, indexed by 6-bit value (0..63).
 * The string literal's implicit terminating NUL supplies the 65th entry (0x00). */
static const unsigned char base64_table[] =
	"ABCDEFGHIJKLM"
	"NOPQRSTUVWXYZ"
	"abcdefghijklm"
	"nopqrstuvwxyz"
	"0123456789+/";
5794 
/* Compute how many bytes the content of `tmpbuf` occupies once transfer-encoded.
 * Base64 turns every (started) group of 3 bytes into 4 characters. Otherwise each byte
 * which is above 0x7F, is '=', or is marked in `mime_char_needs_qencode` counts as 3
 * bytes, and every other byte counts as 1. */
static size_t transfer_encoded_size(mb_convert_buf *tmpbuf, bool base64)
{
	if (base64) {
		return ((mb_convert_buf_len(tmpbuf) + 2) / 3) * 4;
	}

	size_t total = 0;
	for (unsigned char *cur = (unsigned char*)ZSTR_VAL(tmpbuf->str); cur < tmpbuf->out; cur++) {
		unsigned char byte = *cur;
		if (byte > 0x7F || byte == '=' || mime_char_needs_qencode[byte]) {
			total += 3;
		} else {
			total += 1;
		}
	}
	return total;
}
5809 
/* Transfer-encode all bytes currently held in `tmpbuf`, append the result to `outbuf`,
 * and reset `tmpbuf`.
 * With `base64` set, the bytes are Base64-encoded, padded with '=' to a multiple of 4
 * output characters. Otherwise bytes above 0x7F, '=' and bytes marked in
 * `mime_char_needs_qencode` are written as '=' plus two uppercase hex digits, and all
 * other bytes are copied through. */
static void transfer_encode_mime_bytes(mb_convert_buf *tmpbuf, mb_convert_buf *outbuf, bool base64)
{
	static const char hex_digits[] = "0123456789ABCDEF";

	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(outbuf, out, limit);
	unsigned char *src = (unsigned char*)ZSTR_VAL(tmpbuf->str);
	unsigned char *src_end = tmpbuf->out;

	if (!base64) {
		/* Worst case: every input byte becomes a 3-byte escape */
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, (src_end - src) * 3);
		for (; src < src_end; src++) {
			unsigned char byte = *src;
			if (byte > 0x7F || byte == '=' || mime_char_needs_qencode[byte]) {
				out = mb_convert_buf_add3(out, '=', hex_digits[(byte >> 4) & 0xF], hex_digits[byte & 0xF]);
			} else {
				out = mb_convert_buf_add(out, byte);
			}
		}
	} else {
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, ((src_end - src) + 2) / 3 * 4);

		/* Full groups: 3 input bytes -> 4 output characters */
		for (; (src_end - src) >= 3; src += 3) {
			uint32_t bits = (src[0] << 16) | (src[1] << 8) | src[2];
			out = mb_convert_buf_add4(out,
				base64_table[(bits >> 18) & 0x3F],
				base64_table[(bits >> 12) & 0x3F],
				base64_table[(bits >> 6) & 0x3F],
				base64_table[bits & 0x3F]);
		}

		/* A trailing group of 1 or 2 bytes is padded with '=' */
		switch (src_end - src) {
			case 1: {
				uint32_t bits = src[0];
				out = mb_convert_buf_add4(out, base64_table[(bits >> 2) & 0x3F], base64_table[(bits & 0x3) << 4], '=', '=');
				break;
			}
			case 2: {
				uint32_t bits = (src[0] << 8) | src[1];
				out = mb_convert_buf_add4(out, base64_table[(bits >> 10) & 0x3F], base64_table[(bits >> 4) & 0x3F], base64_table[(bits & 0xF) << 2], '=');
				break;
			}
			default:
				/* Nothing left over */
				break;
		}
	}

	mb_convert_buf_reset(tmpbuf, 0);
	MB_CONVERT_BUF_STORE(outbuf, out, limit);
}
5855 
5856 #define MBSTRING_HEADER_ENC_WCHAR_BUFSIZE 90
5857 
mb_mime_header_encode(zend_string * input,const mbfl_encoding * incode,const mbfl_encoding * outcode,bool base64,char * linefeed,size_t linefeed_len,zend_long indent)5858 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent)
5859 {
5860 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
5861 	size_t in_len = ZSTR_LEN(input);
5862 
5863 	ZEND_ASSERT(outcode->mime_name != NULL);
5864 	ZEND_ASSERT(outcode->mime_name[0] != '\0');
5865 
5866 	if (!in_len) {
5867 		return zend_empty_string;
5868 	}
5869 
5870 	if (indent < 0 || indent >= 74) {
5871 		indent = 0;
5872 	}
5873 
5874 	if (linefeed_len > 8) {
5875 		linefeed_len = 8;
5876 	}
5877 	/* Maintain legacy behavior as regards embedded NUL (zero) bytes in linefeed string */
5878 	for (size_t i = 0; i < linefeed_len; i++) {
5879 		if (linefeed[i] == '\0') {
5880 			linefeed_len = i;
5881 			break;
5882 		}
5883 	}
5884 
5885 	unsigned int state = 0;
5886 	/* wchar_buf should be big enough that when it is full, we definitely have enough
5887 	 * wchars to fill an entire line of output */
5888 	uint32_t wchar_buf[MBSTRING_HEADER_ENC_WCHAR_BUFSIZE];
5889 	uint32_t *p, *e;
5890 	/* What part of wchar_buf is filled with still-unprocessed data which should not
5891 	 * be overwritten? */
5892 	unsigned int offset = 0;
5893 	size_t line_start = 0;
5894 
5895 	/* If the entire input string is ASCII with no spaces (except possibly leading
5896 	 * spaces), just pass it through unchanged */
5897 	bool checking_leading_spaces = true;
5898 	while (in_len) {
5899 		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE, &state);
5900 		p = wchar_buf;
5901 		e = wchar_buf + out_len;
5902 
5903 		while (p < e) {
5904 			uint32_t w = *p++;
5905 			if (checking_leading_spaces) {
5906 				if (w == ' ') {
5907 					continue;
5908 				} else {
5909 					checking_leading_spaces = false;
5910 				}
5911 			}
5912 			if (w < 0x21 || w > 0x7E || w == '=' || w == '?' || w == '_') {
5913 				/* We cannot simply pass input string through unchanged; start again */
5914 				in = (unsigned char*)ZSTR_VAL(input);
5915 				in_len = ZSTR_LEN(input);
5916 				goto no_passthrough;
5917 			}
5918 		}
5919 	}
5920 
5921 	return zend_string_copy(input); /* This just increments refcount */
5922 
5923 no_passthrough: ;
5924 
5925 	mb_convert_buf buf;
5926 	mb_convert_buf_init(&buf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
5927 
5928 	/* Encode some prefix of the input string as plain ASCII if possible
5929 	 * If we find it necessary to switch to Base64/QPrint encoding, we will
5930 	 * do so all the way to the end of the string */
5931 	while (in_len) {
5932 		/* Decode part of the input string, refill wchar_buf */
5933 		ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
5934 		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
5935 		ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
5936 		p = wchar_buf;
5937 		e = wchar_buf + offset + out_len;
5938 		/* ASCII output is broken into space-delimited 'words'
5939 		 * If we find a non-ASCII character in the middle of a word, we will
5940 		 * transfer-encode the entire word */
5941 		uint32_t *word_start = p;
5942 
5943 		/* Don't consider adding line feed for spaces at the beginning of a word */
5944 		while (p < e && *p == ' ' && (p - word_start) <= 74) {
5945 			p++;
5946 		}
5947 
5948 		while (p < e) {
5949 			uint32_t w = *p++;
5950 
5951 			if (w < 0x20 || w > 0x7E || w == '?' || w == '=' || w == '_' || (w == ' ' && (p - word_start) > 74)) {
5952 				/* Non-ASCII character (or line too long); switch to Base64/QPrint encoding
5953 				 * If we are already too far along on a line to include Base64/QPrint encoded data
5954 				 * on the same line (without overrunning max line length), then add a line feed
5955 				 * right now */
5956 feed_and_mime_encode:
5957 				if (mb_convert_buf_len(&buf) - line_start + indent + strlen(outcode->mime_name) > 55) {
5958 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
5959 					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
5960 					buf.out = mb_convert_buf_add(buf.out, ' ');
5961 					indent = 0;
5962 					line_start = mb_convert_buf_len(&buf);
5963 				} else if (mb_convert_buf_len(&buf) > 0) {
5964 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 1);
5965 					buf.out = mb_convert_buf_add(buf.out, ' ');
5966 				}
5967 				p = word_start; /* Back up to where MIME encoding of input chars should start */
5968 				goto mime_encoding_needed;
5969 			} else if (w == ' ') {
5970 				/* When we see a space, check whether we should insert a line break */
5971 				if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 75) {
5972 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
5973 					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
5974 					buf.out = mb_convert_buf_add(buf.out, ' ');
5975 					indent = 0;
5976 					line_start = mb_convert_buf_len(&buf);
5977 				} else if (mb_convert_buf_len(&buf) > 0) {
5978 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
5979 					buf.out = mb_convert_buf_add(buf.out, ' ');
5980 				}
5981 				/* Output one (space-delimited) word as plain ASCII */
5982 				while (word_start < p-1) {
5983 					buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
5984 				}
5985 				word_start++;
5986 				while (p < e && *p == ' ') {
5987 					p++;
5988 				}
5989 			}
5990 		}
5991 
5992 		if (in_len) {
5993 			/* Copy chars which are part of an incomplete 'word' to the beginning
5994 			 * of wchar_buf and reprocess them on the next iteration.
5995 			 * But first make sure that the incomplete 'word' isn't so big that
5996 			 * there will be no space to add any more decoded wchars in the buffer
5997 			 * (which could lead to an infinite loop) */
5998 			if ((word_start - wchar_buf) < MBSTRING_MIN_WCHAR_BUFSIZE) {
5999 				goto feed_and_mime_encode;
6000 			}
6001 			offset = e - word_start;
6002 			if (offset) {
6003 				memmove(wchar_buf, word_start, offset * sizeof(uint32_t));
6004 			}
6005 		} else {
6006 			/* We have reached the end of the input string while still in 'ASCII mode';
6007 			 * process any trailing ASCII chars which were not followed by a space */
6008 			if (word_start < e && mb_convert_buf_len(&buf) > 0) {
6009 				/* The whole input string was not just one big ASCII 'word' with no spaces
6010 				 * consider adding a line feed if necessary to prevent output lines from
6011 				 * being too long */
6012 				if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 74) {
6013 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6014 					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6015 					buf.out = mb_convert_buf_add(buf.out, ' ');
6016 				} else {
6017 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6018 					buf.out = mb_convert_buf_add(buf.out, ' ');
6019 				}
6020 			}
6021 			while (word_start < e) {
6022 				buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6023 			}
6024 		}
6025 	}
6026 
6027 	/* Ensure output string is marked as valid UTF-8 (ASCII strings are always 'valid UTF-8') */
6028 	return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6029 
6030 mime_encoding_needed: ;
6031 
6032 	/* We will generate the output line by line, first converting wchars to bytes
6033 	 * in the requested output encoding, then transfer-encoding those bytes as
6034 	 * Base64 or QPrint
6035 	 * 'tmpbuf' will receive the bytes which need to be transfer-encoded before
6036 	 * sending them to 'buf' */
6037 	mb_convert_buf tmpbuf;
6038 	mb_convert_buf_init(&tmpbuf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6039 
6040 	/* Do we need to refill wchar_buf to make sure we don't run out of wchars
6041 	 * in the middle of a line? */
6042 	offset = e - p;
6043 	if (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset < MBSTRING_MIN_WCHAR_BUFSIZE) {
6044 		goto start_new_line;
6045 	}
6046 	memmove(wchar_buf, p, offset * sizeof(uint32_t));
6047 
6048 	while(true) {
6049 refill_wchar_buf: ;
6050 		ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
6051 		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
6052 		ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
6053 		p = wchar_buf;
6054 		e = wchar_buf + offset + out_len;
6055 
6056 start_new_line: ;
6057 		MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, strlen(outcode->mime_name) + 5);
6058 		buf.out = mb_convert_buf_add2(buf.out, '=', '?');
6059 		buf.out = mb_convert_buf_appends(buf.out, outcode->mime_name);
6060 		buf.out = mb_convert_buf_add3(buf.out, '?', base64 ? 'B' : 'Q', '?');
6061 
6062 		/* How many wchars should we try converting to Base64/QPrint-encoded bytes?
6063 		 * We do something like a 'binary search' to find the greatest number which
6064 		 * can be included on this line without exceeding max line length */
6065 		unsigned int n = 12;
6066 		size_t space_available = 73 - indent - (mb_convert_buf_len(&buf) - line_start);
6067 
6068 		while (true) {
6069 			ZEND_ASSERT(p < e);
6070 
6071 			/* Remember where we were in process of generating output, so we can back
6072 			 * up if necessary */
6073 			size_t tmppos = mb_convert_buf_len(&tmpbuf);
6074 			unsigned int tmpstate = tmpbuf.state;
6075 
6076 			/* Try encoding 'n' wchars in output text encoding and sending output
6077 			 * bytes to 'tmpbuf'. Hopefully this is not too many to fit on the
6078 			 * current line. */
6079 			n = MIN(n, e - p);
6080 			outcode->from_wchar(p, n, &tmpbuf, false);
6081 
6082 			/* For some output text encodings, there may be a few ending bytes
6083 			 * which need to be emitted to output before we break a line.
6084 			 * Again, remember where we were so we can back up */
6085 			size_t tmppos2 = mb_convert_buf_len(&tmpbuf);
6086 			unsigned int tmpstate2 = tmpbuf.state;
6087 			outcode->from_wchar(NULL, 0, &tmpbuf, true);
6088 
6089 			if (transfer_encoded_size(&tmpbuf, base64) <= space_available || (n == 1 && tmppos == 0)) {
6090 				/* If we convert 'n' more wchars on the current line, it will not
6091 				 * overflow the maximum line length */
6092 				p += n;
6093 
6094 				if (p == e) {
6095 					/* We are done; we shouldn't reach here if there is more remaining
6096 					 * of the input string which needs to be processed */
6097 					ZEND_ASSERT(!in_len);
6098 					transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6099 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 2);
6100 					buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6101 					mb_convert_buf_free(&tmpbuf);
6102 					return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6103 				} else {
6104 					/* It's possible that more chars might fit on the current line,
6105 					 * so back up to where we were before emitting any ending bytes */
6106 					mb_convert_buf_reset(&tmpbuf, tmppos2);
6107 					tmpbuf.state = tmpstate2;
6108 				}
6109 			} else {
6110 				/* Converting 'n' more wchars on this line would be too much.
6111 				 * Back up to where we were before we tried that. */
6112 				mb_convert_buf_reset(&tmpbuf, tmppos);
6113 				tmpbuf.state = tmpstate;
6114 
6115 				if (n == 1) {
6116 					/* We have found the exact number of chars which will fit on the
6117 					 * current line. Finish up and move to a new line. */
6118 					outcode->from_wchar(NULL, 0, &tmpbuf, true);
6119 					transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6120 					tmpbuf.state = 0;
6121 
6122 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 3 + linefeed_len);
6123 					buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6124 
6125 					indent = 0; /* Indent argument must only affect the first line */
6126 
6127 					if (in_len || p < e) {
6128 						/* We still have more input to process */
6129 						buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6130 						buf.out = mb_convert_buf_add(buf.out, ' ');
6131 						line_start = mb_convert_buf_len(&buf);
6132 						offset = e - p;
6133 						if (in_len && (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset >= MBSTRING_MIN_WCHAR_BUFSIZE)) {
6134 							/* Copy any remaining wchars to beginning of buffer and refill
6135 							 * the rest of the buffer */
6136 							memmove(wchar_buf, p, offset * sizeof(uint32_t));
6137 							goto refill_wchar_buf;
6138 						}
6139 						goto start_new_line;
6140 					} else {
6141 						/* We are done! */
6142 						mb_convert_buf_free(&tmpbuf);
6143 						return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6144 					}
6145 				} else {
6146 					/* Try a smaller number of wchars */
6147 					n = MAX(n >> 1, 1);
6148 				}
6149 			}
6150 		}
6151 	}
6152 }
6153 
PHP_FUNCTION(mb_encode_mimeheader)6154 PHP_FUNCTION(mb_encode_mimeheader)
6155 {
6156 	const mbfl_encoding *charset = &mbfl_encoding_pass;
6157 	zend_string *str, *charset_name = NULL, *transenc_name = NULL;
6158 	char *linefeed = "\r\n";
6159 	size_t linefeed_len = 2;
6160 	zend_long indent = 0;
6161 	bool base64 = true;
6162 
6163 	ZEND_PARSE_PARAMETERS_START(1, 5)
6164 		Z_PARAM_STR(str)
6165 		Z_PARAM_OPTIONAL
6166 		Z_PARAM_STR(charset_name)
6167 		Z_PARAM_STR(transenc_name)
6168 		Z_PARAM_STRING(linefeed, linefeed_len)
6169 		Z_PARAM_LONG(indent)
6170 	ZEND_PARSE_PARAMETERS_END();
6171 
6172 	if (charset_name != NULL) {
6173 		charset = php_mb_get_encoding(charset_name, 2);
6174 		if (!charset) {
6175 			RETURN_THROWS();
6176 		} else if (charset->mime_name == NULL || charset->mime_name[0] == '\0' || charset == &mbfl_encoding_qprint) {
6177 			zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
6178 			RETURN_THROWS();
6179 		}
6180 	} else {
6181 		const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
6182 		if (lang != NULL) {
6183 			charset = mbfl_no2encoding(lang->mail_charset);
6184 			const mbfl_encoding *transenc = mbfl_no2encoding(lang->mail_header_encoding);
6185 			char t = transenc->name[0];
6186 			if (t == 'Q' || t == 'q') {
6187 				base64 = false;
6188 			}
6189 		}
6190 	}
6191 
6192 	if (transenc_name != NULL && ZSTR_LEN(transenc_name) > 0) {
6193 		char t = ZSTR_VAL(transenc_name)[0];
6194 		if (t == 'Q' || t == 'q') {
6195 			base64 = false;
6196 		}
6197 	}
6198 
6199 	RETURN_STR(mb_mime_header_encode(str, MBSTRG(current_internal_encoding), charset, base64, linefeed, linefeed_len, indent));
6200 }
6201 
/* Translate one character of the standard Base64 alphabet into its 6-bit
 * value (0..63). Any other byte, including the '=' padding character,
 * yields -1. */
static int8_t decode_base64(unsigned char c)
{
	switch (c) {
	case '+':
		return 62;
	case '/':
		return 63;
	default:
		break;
	}

	/* Digits occupy values 52..61 */
	if (c >= '0' && c <= '9') {
		return (int8_t)(52 + (c - '0'));
	}
	/* Lowercase letters occupy values 26..51 */
	if (c >= 'a' && c <= 'z') {
		return (int8_t)(26 + (c - 'a'));
	}
	/* Uppercase letters occupy values 0..25 */
	if (c >= 'A' && c <= 'Z') {
		return (int8_t)(c - 'A');
	}

	return -1;
}
6217 
/* Hex digit lookup table used when decoding Quoted-Printable "=XX" escapes.
 * Indexed by byte value: '0'-'9' give 0-9, 'A'-'F' and 'a'-'f' give 10-15,
 * and every other byte gives -1 (not a hex digit).
 * The table is only ever read, so it is const and has an explicit size of one
 * entry per possible byte value. */
static const int8_t qprint_map[256] = {
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0x00 */
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10 */
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0x20 */
	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1, /* 0x30: '0'-'9' */
	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0x40: 'A'-'F' */
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0x50 */
	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0x60: 'a'-'f' */
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0x70 */
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0x80 */
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0x90 */
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0xA0 */
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0xB0 */
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0xC0 */
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0xD0 */
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0xE0 */
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1  /* 0xF0 */
};
6236 
6237 /* Decode MIME encoded word as defined in RFC 2047 */
mime_header_decode_encoded_word(unsigned char * p,unsigned char * e,const mbfl_encoding * outcode,mb_convert_buf * outbuf,unsigned int * state)6238 static unsigned char* mime_header_decode_encoded_word(unsigned char *p, unsigned char *e, const mbfl_encoding *outcode, mb_convert_buf *outbuf, unsigned int *state)
6239 {
6240 	if ((e - p) < 6) {
6241 		return NULL;
6242 	}
6243 
6244 	ZEND_ASSERT(p[0] == '=');
6245 	ZEND_ASSERT(p[1] == '?');
6246 	p += 2;
6247 
6248 	unsigned char *charset = p;
6249 	unsigned char *charset_end = memchr(charset, '?', e - charset);
6250 	if (charset_end == NULL) {
6251 		return NULL;
6252 	}
6253 
6254 	unsigned char *encoding = charset_end + 1;
6255 	p = encoding + 1;
6256 	if (p >= e || *p++ != '?') {
6257 		return NULL;
6258 	}
6259 
6260 	char *charset_name = estrndup((const char*)charset, charset_end - charset);
6261 	const mbfl_encoding *incode = mbfl_name2encoding(charset_name);
6262 	efree(charset_name);
6263 	if (incode == NULL) {
6264 		return NULL;
6265 	}
6266 
6267 	unsigned char *end_marker = (unsigned char*)zend_memnstr((const char*)p, "?=", 2, (const char*)e);
6268 	if (end_marker) {
6269 		e = end_marker;
6270 	} else if (p < e && *(e-1) == '?') {
6271 		/* If encoded word is not properly terminated, but last byte is '?',
6272 		 * take that as a terminator (legacy behavior) */
6273 		e--;
6274 	}
6275 
6276 	unsigned char *buf = emalloc(e - p), *bufp = buf;
6277 	if (*encoding == 'Q' || *encoding == 'q') {
6278 		/* Fill `buf` with bytes from decoding QPrint */
6279 		while (p < e) {
6280 			unsigned char c = *p++;
6281 			if (c == '_') {
6282 				*bufp++ = ' ';
6283 				continue;
6284 			} else if (c == '=' && (e - p) >= 2) {
6285 				unsigned char c2 = *p++;
6286 				unsigned char c3 = *p++;
6287 				if (qprint_map[c2] >= 0 && qprint_map[c3] >= 0) {
6288 					*bufp++ = (qprint_map[c2] << 4) | (qprint_map[c3] & 0xF);
6289 					continue;
6290 				} else if (c2 == '\r') {
6291 					if (c3 != '\n') {
6292 						p--;
6293 					}
6294 					continue;
6295 				} else if (c2 == '\n') {
6296 					p--;
6297 					continue;
6298 				}
6299 			}
6300 			*bufp++ = c;
6301 		}
6302 	} else if (*encoding == 'B' || *encoding == 'b') {
6303 		/* Fill `buf` with bytes from decoding Base64 */
6304 		unsigned int bits = 0, cache = 0;
6305 		while (p < e) {
6306 			unsigned char c = *p++;
6307 			if (c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '=') {
6308 				continue;
6309 			}
6310 			int8_t decoded = decode_base64(c);
6311 			if (decoded == -1) {
6312 				*bufp++ = '?';
6313 				continue;
6314 			}
6315 			bits += 6;
6316 			cache = (cache << 6) | (decoded & 0x3F);
6317 			if (bits == 24) {
6318 				*bufp++ = (cache >> 16) & 0xFF;
6319 				*bufp++ = (cache >> 8) & 0xFF;
6320 				*bufp++ = cache & 0xFF;
6321 				bits = cache = 0;
6322 			}
6323 		}
6324 		if (bits == 18) {
6325 			*bufp++ = (cache >> 10) & 0xFF;
6326 			*bufp++ = (cache >> 2) & 0xFF;
6327 		} else if (bits == 12) {
6328 			*bufp++ = (cache >> 4) & 0xFF;
6329 		}
6330 	} else {
6331 		efree(buf);
6332 		return NULL;
6333 	}
6334 
6335 	size_t in_len = bufp - buf;
6336 	uint32_t wchar_buf[128];
6337 
6338 	bufp = buf;
6339 	while (in_len) {
6340 		size_t out_len = incode->to_wchar(&bufp, &in_len, wchar_buf, 128, state);
6341 		ZEND_ASSERT(out_len <= 128);
6342 		outcode->from_wchar(wchar_buf, out_len, outbuf, false);
6343 	}
6344 
6345 	efree(buf);
6346 	return e + 2;
6347 }
6348 
/* Decode a MIME header value
 *
 * Anything which can be decoded as an RFC 2047 encoded word is decoded and
 * converted to 'outcode'. All other bytes are run through the ASCII decoder
 * and then converted to 'outcode'. A run of whitespace which starts with CR or
 * LF is collapsed to a single space, except that:
 *   - such a run at the very end of the input produces no output, and
 *   - such a run between two successfully decoded encoded words produces no
 *     output.
 *
 * input    header value to decode
 * outcode  text encoding of the returned string
 *
 * Returns the string produced by mb_convert_buf_result for 'outcode'. */
static zend_string* mb_mime_header_decode(zend_string *input, const mbfl_encoding *outcode)
{
	unsigned char *p = (unsigned char*)ZSTR_VAL(input), *e = p + ZSTR_LEN(input);
	unsigned int state = 0;
	bool space_pending = false;

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, ZSTR_LEN(input), '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	while (p < e) {
		unsigned char c = *p;

		/* Make sure enough bytes remain BEFORE looking at p[1], so that we never
		 * depend on what follows the last byte of the input */
		if (c == '=' && (e - p) >= 6 && p[1] == '?') {
			/* Does this look like a MIME encoded word? If so, try to decode it as one */
			unsigned char *incode_end = memchr(p + 2, '?', e - p - 2);
			if (incode_end && (e - incode_end) >= 3) {
				unsigned char *temp = mime_header_decode_encoded_word(p, e, outcode, &buf, &state);
				if (temp) {
					p = temp;
					/* Decoding of MIME encoded word was successful;
					 * Try to collapse a run of whitespace */
					if (p < e && (*p == '\n' || *p == '\r')) {
						do {
							p++;
						} while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
						/* We will only actually output a space if this is not immediately followed
						 * by another valid encoded word */
						space_pending = true;
					}
					continue;
				}
			}
		}

		/* We get here only if the input at 'p' was not decoded as an encoded
		 * word; emit any space which was held back after the previous one */
		if (space_pending) {
			uint32_t space = ' ';
			outcode->from_wchar(&space, 1, &buf, false);
			space_pending = false;
		}

		/* Consume a run of plain ASCII characters */
		if (c != '\n' && c != '\r') {
			/* The run always includes the byte at 'p' (even if it is '=') and
			 * stops before the next '=', CR or LF */
			unsigned char *end = p + 1;
			while (end < e && (*end != '=' && *end != '\n' && *end != '\r')) {
				end++;
			}
			uint32_t wchar_buf[128];
			size_t in_len = end - p;
			while (in_len) {
				/* to_wchar advances 'p' as it consumes input */
				size_t out_len = mbfl_encoding_ascii.to_wchar(&p, &in_len, wchar_buf, 128, &state);
				ZEND_ASSERT(out_len <= 128);
				outcode->from_wchar(wchar_buf, out_len, &buf, false);
			}
		}
		/* Collapse a run of whitespace into a single space */
		if (p < e && (*p == '\n' || *p == '\r')) {
			do {
				p++;
			} while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
			if (p < e) {
				/* Emulating legacy behavior of mb_decode_mimeheader here;
				 * a run of whitespace is not converted to a space at the very
				 * end of the input string */
				uint32_t space = ' ';
				outcode->from_wchar(&space, 1, &buf, false);
			}
		}
	}

	/* Signal end of input so the output encoder can emit any final bytes */
	outcode->from_wchar(NULL, 0, &buf, true);

	return mb_convert_buf_result(&buf, outcode);
}
6422 
PHP_FUNCTION(mb_decode_mimeheader)6423 PHP_FUNCTION(mb_decode_mimeheader)
6424 {
6425 	zend_string *str;
6426 
6427 	ZEND_PARSE_PARAMETERS_START(1, 1)
6428 		Z_PARAM_STR(str)
6429 	ZEND_PARSE_PARAMETERS_END();
6430 
6431 	RETURN_STR(mb_mime_header_decode(str, MBSTRG(current_internal_encoding)));
6432 }
6433