xref: /PHP-8.2/ext/mbstring/mbstring.c (revision 6fc8d014)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp>              |
14    |         Rui Hirokawa <hirokawa@php.net>                              |
15    |         Hironori Sato <satoh@jpnnet.com>                             |
16    |         Shigeru Kanemoto <sgk@happysize.co.jp>                       |
17    +----------------------------------------------------------------------+
18 */
19 
20 /* {{{ includes */
21 #include "libmbfl/config.h"
22 #include "php.h"
23 #include "php_ini.h"
24 #include "php_variables.h"
25 #include "mbstring.h"
26 #include "ext/standard/php_string.h"
27 #include "ext/standard/php_mail.h"
28 #include "ext/standard/exec.h"
29 #include "ext/standard/url.h"
30 #include "main/php_output.h"
31 #include "ext/standard/info.h"
32 #include "ext/pcre/php_pcre.h"
33 
34 #include "libmbfl/mbfl/mbfilter_8bit.h"
35 #include "libmbfl/mbfl/mbfilter_pass.h"
36 #include "libmbfl/mbfl/mbfilter_wchar.h"
37 #include "libmbfl/mbfl/eaw_table.h"
38 #include "libmbfl/filters/mbfilter_base64.h"
39 #include "libmbfl/filters/mbfilter_qprint.h"
40 #include "libmbfl/filters/mbfilter_htmlent.h"
41 #include "libmbfl/filters/mbfilter_uuencode.h"
42 #include "libmbfl/filters/mbfilter_ucs4.h"
43 #include "libmbfl/filters/mbfilter_utf8.h"
44 #include "libmbfl/filters/mbfilter_singlebyte.h"
45 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
46 
47 #include "php_variables.h"
48 #include "php_globals.h"
49 #include "rfc1867.h"
50 #include "php_content_types.h"
51 #include "SAPI.h"
52 #include "php_unicode.h"
53 #include "TSRM.h"
54 
55 #include "mb_gpc.h"
56 
57 #ifdef HAVE_MBREGEX
58 # include "php_mbregex.h"
59 #endif
60 
61 #include "zend_multibyte.h"
62 #include "mbstring_arginfo.h"
63 /* }}} */
64 
65 /* {{{ prototypes */
66 ZEND_DECLARE_MODULE_GLOBALS(mbstring)
67 
68 static PHP_GINIT_FUNCTION(mbstring);
69 static PHP_GSHUTDOWN_FUNCTION(mbstring);
70 
71 static void php_mb_populate_current_detect_order_list(void);
72 
73 static int php_mb_encoding_translation(void);
74 
75 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);
76 
77 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
78 
79 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
80 
81 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
82 
83 /* See mbfilter_cp5022x.c */
84 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
85 /* }}} */
86 
87 /* {{{ php_mb_default_identify_list */
88 typedef struct _php_mb_nls_ident_list {
89 	enum mbfl_no_language lang;
90 	const enum mbfl_no_encoding *list;
91 	size_t list_size;
92 } php_mb_nls_ident_list;
93 
94 static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
95 	mbfl_no_encoding_ascii,
96 	mbfl_no_encoding_jis,
97 	mbfl_no_encoding_utf8,
98 	mbfl_no_encoding_euc_jp,
99 	mbfl_no_encoding_sjis
100 };
101 
102 static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
103 	mbfl_no_encoding_ascii,
104 	mbfl_no_encoding_utf8,
105 	mbfl_no_encoding_euc_cn,
106 	mbfl_no_encoding_cp936
107 };
108 
109 static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
110 	mbfl_no_encoding_ascii,
111 	mbfl_no_encoding_utf8,
112 	mbfl_no_encoding_euc_tw,
113 	mbfl_no_encoding_big5
114 };
115 
116 static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
117 	mbfl_no_encoding_ascii,
118 	mbfl_no_encoding_utf8,
119 	mbfl_no_encoding_euc_kr,
120 	mbfl_no_encoding_uhc
121 };
122 
123 static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
124 	mbfl_no_encoding_ascii,
125 	mbfl_no_encoding_utf8,
126 	mbfl_no_encoding_koi8r,
127 	mbfl_no_encoding_cp1251,
128 	mbfl_no_encoding_cp866
129 };
130 
131 static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
132 	mbfl_no_encoding_ascii,
133 	mbfl_no_encoding_utf8,
134 	mbfl_no_encoding_armscii8
135 };
136 
137 static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
138 	mbfl_no_encoding_ascii,
139 	mbfl_no_encoding_utf8,
140 	mbfl_no_encoding_cp1254,
141 	mbfl_no_encoding_8859_9
142 };
143 
144 static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
145 	mbfl_no_encoding_ascii,
146 	mbfl_no_encoding_utf8,
147 	mbfl_no_encoding_koi8u
148 };
149 
150 static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
151 	mbfl_no_encoding_ascii,
152 	mbfl_no_encoding_utf8
153 };
154 
155 
156 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
157 	{ mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
158 	{ mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
159 	{ mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
160 	{ mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
161 	{ mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
162 	{ mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
163 	{ mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
164 	{ mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
165 	{ mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
166 };
167 
168 /* }}} */
169 
170 /* {{{ mbstring_deps[] */
171 static const zend_module_dep mbstring_deps[] = {
172 	ZEND_MOD_REQUIRED("pcre")
173 	ZEND_MOD_END
174 };
175 /* }}} */
176 
177 /* {{{ zend_module_entry mbstring_module_entry */
178 zend_module_entry mbstring_module_entry = {
179 	STANDARD_MODULE_HEADER_EX,
180 	NULL,
181 	mbstring_deps,
182 	"mbstring",
183 	ext_functions,
184 	PHP_MINIT(mbstring),
185 	PHP_MSHUTDOWN(mbstring),
186 	PHP_RINIT(mbstring),
187 	PHP_RSHUTDOWN(mbstring),
188 	PHP_MINFO(mbstring),
189 	PHP_MBSTRING_VERSION,
190 	PHP_MODULE_GLOBALS(mbstring),
191 	PHP_GINIT(mbstring),
192 	PHP_GSHUTDOWN(mbstring),
193 	NULL,
194 	STANDARD_MODULE_PROPERTIES_EX
195 };
196 /* }}} */
197 
198 /* {{{ static sapi_post_entry php_post_entries[] */
199 static const sapi_post_entry php_post_entries[] = {
200 	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data,	php_std_post_handler },
201 	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
202 	{ NULL, 0, NULL, NULL }
203 };
204 /* }}} */
205 
/* Only emitted when COMPILE_DL_MBSTRING is defined. */
#ifdef COMPILE_DL_MBSTRING
# ifdef ZTS
ZEND_TSRMLS_CACHE_DEFINE()
# endif
ZEND_GET_MODULE(mbstring)
#endif
212 
213 /* {{{ static sapi_post_entry mbstr_post_entries[] */
214 static const sapi_post_entry mbstr_post_entries[] = {
215 	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
216 	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
217 	{ NULL, 0, NULL, NULL }
218 };
219 /* }}} */
220 
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)221 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
222 	if (encoding_name) {
223 		const mbfl_encoding *encoding;
224 		zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
225 		if (last_encoding_name && (last_encoding_name == encoding_name
226 				|| zend_string_equals_ci(encoding_name, last_encoding_name))) {
227 			return MBSTRG(last_used_encoding);
228 		}
229 
230 		encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
231 		if (!encoding) {
232 			zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
233 			return NULL;
234 		} else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
235 			if (encoding == &mbfl_encoding_base64) {
236 				php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
237 			} else if (encoding == &mbfl_encoding_qprint) {
238 				php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
239 			} else if (encoding == &mbfl_encoding_html_ent) {
240 				php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
241 			} else if (encoding == &mbfl_encoding_uuencode) {
242 				php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
243 			}
244 		}
245 
246 		if (last_encoding_name) {
247 			zend_string_release(last_encoding_name);
248 		}
249 		MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
250 		MBSTRG(last_used_encoding) = encoding;
251 		return encoding;
252 	} else {
253 		return MBSTRG(current_internal_encoding);
254 	}
255 }
256 
php_mb_get_encoding_or_pass(const char * encoding_name)257 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name) {
258 	if (strcmp(encoding_name, "pass") == 0) {
259 		return &mbfl_encoding_pass;
260 	}
261 
262 	return mbfl_name2encoding(encoding_name);
263 }
264 
/* Count the ',' bytes in the half-open range [p, end). */
static size_t count_commas(const char *p, const char *end) {
	size_t total = 0;
	const char *hit = memchr(p, ',', end - p);

	while (hit != NULL) {
		total++;
		/* Resume the search just past the comma that was found. */
		hit = memchr(hit + 1, ',', end - (hit + 1));
	}
	return total;
}
273 
274 /* {{{ static zend_result php_mb_parse_encoding_list()
275  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
276  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
277  */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num,bool allow_pass_encoding)278 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
279 	const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num,
280 	bool allow_pass_encoding)
281 {
282 	if (value == NULL || value_length == 0) {
283 		*return_list = NULL;
284 		*return_size = 0;
285 		return SUCCESS;
286 	} else {
287 		bool included_auto;
288 		size_t n, size;
289 		char *p1, *endp, *tmpstr;
290 		const mbfl_encoding **entry, **list;
291 
292 		/* copy the value string for work */
293 		if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
294 			tmpstr = (char *)estrndup(value+1, value_length-2);
295 			value_length -= 2;
296 		} else {
297 			tmpstr = (char *)estrndup(value, value_length);
298 		}
299 
300 		endp = tmpstr + value_length;
301 		size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
302 		list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
303 		entry = list;
304 		n = 0;
305 		included_auto = 0;
306 		p1 = tmpstr;
307 		while (1) {
308 			char *comma = memchr(p1, ',', endp - p1);
309 			char *p = comma ? comma : endp;
310 			*p = '\0';
311 			/* trim spaces */
312 			while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
313 				p1++;
314 			}
315 			p--;
316 			while (p > p1 && (*p == ' ' || *p == '\t')) {
317 				*p = '\0';
318 				p--;
319 			}
320 			/* convert to the encoding number and check encoding */
321 			if (strcasecmp(p1, "auto") == 0) {
322 				if (!included_auto) {
323 					const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
324 					const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
325 					size_t i;
326 					included_auto = 1;
327 					for (i = 0; i < identify_list_size; i++) {
328 						*entry++ = mbfl_no2encoding(*src++);
329 						n++;
330 					}
331 				}
332 			} else {
333 				const mbfl_encoding *encoding =
334 					allow_pass_encoding ? php_mb_get_encoding_or_pass(p1) : mbfl_name2encoding(p1);
335 				if (!encoding) {
336 					/* Called from an INI setting modification */
337 					if (arg_num == 0) {
338 						php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%s\"", p1);
339 					} else {
340 						zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", p1);
341 					}
342 					efree(tmpstr);
343 					pefree(ZEND_VOIDP(list), persistent);
344 					return FAILURE;
345 				}
346 
347 				*entry++ = encoding;
348 				n++;
349 			}
350 			if (n >= size || comma == NULL) {
351 				break;
352 			}
353 			p1 = comma + 1;
354 		}
355 		*return_list = list;
356 		*return_size = n;
357 		efree(tmpstr);
358 	}
359 
360 	return SUCCESS;
361 }
362 /* }}} */
363 
364 /* {{{ static int php_mb_parse_encoding_array()
365  *  Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
366  * 	Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
367  */
/* Build an encoding list from the values of a PHP array.
 *
 * Each value is converted to a string. "auto" (case-insensitive) expands,
 * once, to MBSTRG(default_detect_order_list); any other value must name a
 * known encoding. On success *return_list receives an ecalloc()'ed array and
 * *return_size its entry count. On failure the array is freed and FAILURE is
 * returned; an unknown name additionally raises a value error for arg_num. */
static int php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
	size_t *return_size, uint32_t arg_num)
{
	const size_t default_count = MBSTRG(default_detect_order_list_size);
	/* Room for every element plus one expansion of "auto". */
	const size_t capacity = zend_hash_num_elements(target_hash) + default_count;
	const mbfl_encoding **list = ecalloc(capacity, sizeof(mbfl_encoding*));
	size_t n = 0;
	bool auto_expanded = false;
	zval *item;

	ZEND_HASH_FOREACH_VAL(target_hash, item) {
		zend_string *name = zval_try_get_string(item);
		if (UNEXPECTED(!name)) {
			efree(ZEND_VOIDP(list));
			return FAILURE;
		}

		if (!zend_string_equals_literal_ci(name, "auto")) {
			const mbfl_encoding *found = mbfl_name2encoding(ZSTR_VAL(name));
			if (!found) {
				zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(name));
				zend_string_release(name);
				efree(ZEND_VOIDP(list));
				return FAILURE;
			}
			list[n++] = found;
		} else if (!auto_expanded) {
			const enum mbfl_no_encoding *defaults = MBSTRG(default_detect_order_list);

			auto_expanded = true;
			for (size_t j = 0; j < default_count; j++) {
				list[n++] = mbfl_no2encoding(defaults[j]);
			}
		}
		zend_string_release(name);
	} ZEND_HASH_FOREACH_END();

	*return_list = list;
	*return_size = n;
	return SUCCESS;
}
415 /* }}} */
416 
417 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)418 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
419 {
420 	return (const zend_encoding*)mbfl_name2encoding(encoding_name);
421 }
422 
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)423 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
424 {
425 	return ((const mbfl_encoding *)encoding)->name;
426 }
427 
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)428 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
429 {
430 	const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
431 	return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
432 }
433 
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)434 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
435 {
436 	mbfl_string string;
437 
438 	if (!list) {
439 		list = (const zend_encoding **)MBSTRG(current_detect_order_list);
440 		list_size = MBSTRG(current_detect_order_list_size);
441 	}
442 
443 	mbfl_string_init(&string);
444 	string.val = (unsigned char *)arg_string;
445 	string.len = arg_length;
446 	return (const zend_encoding *) mbfl_identify_encoding(&string, (const mbfl_encoding **)list, list_size, 0);
447 }
448 
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)449 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
450 {
451 	mbfl_string string, result;
452 	mbfl_buffer_converter *convd;
453 
454 	/* new encoding */
455 	/* initialize string */
456 	string.encoding = (const mbfl_encoding*)encoding_from;
457 	string.val = (unsigned char*)from;
458 	string.len = from_length;
459 
460 	/* initialize converter */
461 	convd = mbfl_buffer_converter_new((const mbfl_encoding *)encoding_from, (const mbfl_encoding *)encoding_to, string.len);
462 	if (convd == NULL) {
463 		return (size_t) -1;
464 	}
465 
466 	mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
467 	mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
468 
469 	/* do it */
470 	size_t loc = mbfl_buffer_converter_feed(convd, &string);
471 
472 	mbfl_buffer_converter_flush(convd);
473 	mbfl_string_init(&result);
474 	if (!mbfl_buffer_converter_result(convd, &result)) {
475 		mbfl_buffer_converter_delete(convd);
476 		return (size_t)-1;
477 	}
478 
479 	*to = result.val;
480 	*to_length = result.len;
481 
482 	mbfl_buffer_converter_delete(convd);
483 
484 	return loc;
485 }
486 
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)487 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
488 {
489 	return php_mb_parse_encoding_list(
490 		encoding_list, encoding_list_len,
491 		(const mbfl_encoding ***)return_list, return_size,
492 		persistent, /* arg_num */ 0, /* allow_pass_encoding */ 1);
493 }
494 
php_mb_zend_internal_encoding_getter(void)495 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
496 {
497 	return (const zend_encoding *)MBSTRG(internal_encoding);
498 }
499 
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)500 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
501 {
502 	MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
503 	return SUCCESS;
504 }
505 
506 static zend_multibyte_functions php_mb_zend_multibyte_functions = {
507 	"mbstring",
508 	php_mb_zend_encoding_fetcher,
509 	php_mb_zend_encoding_name_getter,
510 	php_mb_zend_encoding_lexer_compatibility_checker,
511 	php_mb_zend_encoding_detector,
512 	php_mb_zend_encoding_converter,
513 	php_mb_zend_encoding_list_parser,
514 	php_mb_zend_internal_encoding_getter,
515 	php_mb_zend_internal_encoding_setter
516 };
517 /* }}} */
518 
519 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)520 static void *_php_mb_compile_regex(const char *pattern)
521 {
522 	pcre2_code *retval;
523 	PCRE2_SIZE err_offset;
524 	int errnum;
525 
526 	if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
527 			PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
528 		PCRE2_UCHAR err_str[128];
529 		pcre2_get_error_message(errnum, err_str, sizeof(err_str));
530 		php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
531 	}
532 	return retval;
533 }
534 /* }}} */
535 
536 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)537 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
538 {
539 	int res;
540 
541 	pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
542 	if (NULL == match_data) {
543 		pcre2_code_free(opaque);
544 		php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
545 		return FAILURE;
546 	}
547 	res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
548 	php_pcre_free_match_data(match_data);
549 
550 	return res;
551 }
552 /* }}} */
553 
554 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)555 static void _php_mb_free_regex(void *opaque)
556 {
557 	pcre2_code_free(opaque);
558 }
559 /* }}} */
560 
561 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)562 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
563 {
564 	size_t i;
565 
566 	*plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
567 	*plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
568 
569 	for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
570 		if (php_mb_default_identify_list[i].lang == lang) {
571 			*plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
572 			*plist_size = php_mb_default_identify_list[i].list_size;
573 			return 1;
574 		}
575 	}
576 	return 0;
577 }
578 /* }}} */
579 
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)580 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
581 {
582 	char *result = emalloc(len + 2);
583 	char *resp = result;
584 	size_t i;
585 
586 	for (i = 0; i < len && start[i] != quote; ++i) {
587 		if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
588 			*resp++ = start[++i];
589 		} else {
590 			size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
591 
592 			while (j-- > 0 && i < len) {
593 				*resp++ = start[i++];
594 			}
595 			--i;
596 		}
597 	}
598 
599 	*resp = '\0';
600 	return result;
601 }
602 
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)603 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
604 {
605 	char *pos = *line, quote;
606 	char *res;
607 
608 	while (*pos && *pos != stop) {
609 		if ((quote = *pos) == '"' || quote == '\'') {
610 			++pos;
611 			while (*pos && *pos != quote) {
612 				if (*pos == '\\' && pos[1] && pos[1] == quote) {
613 					pos += 2;
614 				} else {
615 					++pos;
616 				}
617 			}
618 			if (*pos) {
619 				++pos;
620 			}
621 		} else {
622 			pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
623 
624 		}
625 	}
626 	if (*pos == '\0') {
627 		res = estrdup(*line);
628 		*line += strlen(*line);
629 		return res;
630 	}
631 
632 	res = estrndup(*line, pos - *line);
633 
634 	while (*pos == stop) {
635 		pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
636 	}
637 
638 	*line = pos;
639 	return res;
640 }
641 /* }}} */
642 
/* Extract one configuration word from str after skipping leading whitespace.
 * A word starting with ' or " runs to the matching unescaped quote; otherwise
 * it runs to the next whitespace byte. Returns a newly allocated string
 * (empty when str holds only whitespace). */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	for (; *str && isspace(*(unsigned char *)str); ++str) {
		/* skip leading whitespace */
	}

	if (*str == '\0') {
		return estrdup("");
	}

	if (*str != '"' && *str != '\'') {
		/* Unquoted word: ends at the next whitespace byte or at the NUL. */
		char *word_end = str;

		while (*word_end && !isspace(*(unsigned char *)word_end)) {
			++word_end;
		}
		return php_mb_rfc1867_substring_conf(encoding, str, word_end - str, 0);
	}

	/* Quoted word: hand over everything after the opening quote. */
	const char quote = *str;
	++str;
	return php_mb_rfc1867_substring_conf(encoding, str, strlen(str), quote);
}
667 /* }}} */
668 
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)669 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
670 {
671 	char *s, *s2;
672 	const size_t filename_len = strlen(filename);
673 
674 	/* The \ check should technically be needed for win32 systems only where
675 	 * it is a valid path separator. However, IE in all it's wisdom always sends
676 	 * the full path of the file on the user's filesystem, which means that unless
677 	 * the user does basename() they get a bogus file name. Until IE's user base drops
678 	 * to nill or problem is fixed this code must remain enabled for all systems. */
679 	s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
680 	s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
681 
682 	if (s && s2) {
683 		if (s > s2) {
684 			return ++s;
685 		} else {
686 			return ++s2;
687 		}
688 	} else if (s) {
689 		return ++s;
690 	} else if (s2) {
691 		return ++s2;
692 	} else {
693 		return filename;
694 	}
695 }
696 /* }}} */
697 
698 /* {{{ php.ini directive handler */
699 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)700 static PHP_INI_MH(OnUpdate_mbstring_language)
701 {
702 	enum mbfl_no_language no_language;
703 
704 	no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
705 	if (no_language == mbfl_no_language_invalid) {
706 		MBSTRG(language) = mbfl_no_language_neutral;
707 		return FAILURE;
708 	}
709 	MBSTRG(language) = no_language;
710 	php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
711 	return SUCCESS;
712 }
713 /* }}} */
714 
715 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)716 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
717 {
718 	const mbfl_encoding **list;
719 	size_t size;
720 
721 	if (!new_value) {
722 		if (MBSTRG(detect_order_list)) {
723 			pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
724 		}
725 		MBSTRG(detect_order_list) = NULL;
726 		MBSTRG(detect_order_list_size) = 0;
727 		return SUCCESS;
728 	}
729 
730 	if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0, /* allow_pass_encoding */ 0) || size == 0) {
731 		return FAILURE;
732 	}
733 
734 	if (MBSTRG(detect_order_list)) {
735 		pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
736 	}
737 	MBSTRG(detect_order_list) = list;
738 	MBSTRG(detect_order_list_size) = size;
739 	return SUCCESS;
740 }
741 /* }}} */
742 
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)743 static int _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
744 	const mbfl_encoding **list;
745 	size_t size;
746 	if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0, /* allow_pass_encoding */ 1) || size == 0) {
747 		return FAILURE;
748 	}
749 	if (MBSTRG(http_input_list)) {
750 		pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
751 	}
752 	MBSTRG(http_input_list) = list;
753 	MBSTRG(http_input_list_size) = size;
754 	return SUCCESS;
755 }
756 
757 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)758 static PHP_INI_MH(OnUpdate_mbstring_http_input)
759 {
760 	if (new_value) {
761 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
762 	}
763 
764 	if (!new_value || !ZSTR_LEN(new_value)) {
765 		const char *encoding = php_get_input_encoding();
766 		MBSTRG(http_input_set) = 0;
767 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
768 		return SUCCESS;
769 	}
770 
771 	MBSTRG(http_input_set) = 1;
772 	return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
773 }
774 /* }}} */
775 
_php_mb_ini_mbstring_http_output_set(const char * new_value)776 static int _php_mb_ini_mbstring_http_output_set(const char *new_value) {
777 	const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value);
778 	if (!encoding) {
779 		return FAILURE;
780 	}
781 
782 	MBSTRG(http_output_encoding) = encoding;
783 	MBSTRG(current_http_output_encoding) = encoding;
784 	return SUCCESS;
785 }
786 
787 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)788 static PHP_INI_MH(OnUpdate_mbstring_http_output)
789 {
790 	if (new_value) {
791 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
792 	}
793 
794 	if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
795 		MBSTRG(http_output_set) = 0;
796 		_php_mb_ini_mbstring_http_output_set(php_get_output_encoding());
797 		return SUCCESS;
798 	}
799 
800 	MBSTRG(http_output_set) = 1;
801 	return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value));
802 }
803 /* }}} */
804 
805 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)806 static int _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
807 {
808 	const mbfl_encoding *encoding;
809 
810 	if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
811 		/* falls back to UTF-8 if an unknown encoding name is given */
812 		if (new_value) {
813 			php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
814 		}
815 		encoding = &mbfl_encoding_utf8;
816 	}
817 	MBSTRG(internal_encoding) = encoding;
818 	MBSTRG(current_internal_encoding) = encoding;
819 #ifdef HAVE_MBREGEX
820 	{
821 		const char *enc_name = new_value;
822 		if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
823 			/* falls back to UTF-8 if an unknown encoding name is given */
824 			enc_name = "UTF-8";
825 			php_mb_regex_set_default_mbctype(enc_name);
826 		}
827 		php_mb_regex_set_mbctype(new_value);
828 	}
829 #endif
830 	return SUCCESS;
831 }
832 /* }}} */
833 
834 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)835 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
836 {
837 	if (new_value) {
838 		php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
839 	}
840 
841 	if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
842 		return FAILURE;
843 	}
844 
845 	if (new_value && ZSTR_LEN(new_value)) {
846 		MBSTRG(internal_encoding_set) = 1;
847 		return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
848 	} else {
849 		const char *encoding = php_get_internal_encoding();
850 		MBSTRG(internal_encoding_set) = 0;
851 		return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
852 	}
853 }
854 /* }}} */
855 
856 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)857 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
858 {
859 	int c;
860 	char *endptr = NULL;
861 
862 	if (new_value != NULL) {
863 		if (zend_string_equals_literal_ci(new_value, "none")) {
864 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
865 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
866 		} else if (zend_string_equals_literal_ci(new_value, "long")) {
867 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
868 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
869 		} else if (zend_string_equals_literal_ci(new_value, "entity")) {
870 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
871 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
872 		} else {
873 			MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
874 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
875 			if (ZSTR_LEN(new_value) > 0) {
876 				c = strtol(ZSTR_VAL(new_value), &endptr, 0);
877 				if (*endptr == '\0') {
878 					MBSTRG(filter_illegal_substchar) = c;
879 					MBSTRG(current_filter_illegal_substchar) = c;
880 				}
881 			}
882 		}
883 	} else {
884 		MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
885 		MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
886 		MBSTRG(filter_illegal_substchar) = 0x3f;	/* '?' */
887 		MBSTRG(current_filter_illegal_substchar) = 0x3f;	/* '?' */
888 	}
889 
890 	return SUCCESS;
891 }
892 /* }}} */
893 
894 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)895 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
896 {
897 	if (new_value == NULL) {
898 		return FAILURE;
899 	}
900 
901 	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
902 
903 	if (MBSTRG(encoding_translation)) {
904 		sapi_unregister_post_entry(php_post_entries);
905 		sapi_register_post_entries(mbstr_post_entries);
906 	} else {
907 		sapi_unregister_post_entry(mbstr_post_entries);
908 		sapi_register_post_entries(php_post_entries);
909 	}
910 
911 	return SUCCESS;
912 }
913 /* }}} */
914 
915 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)916 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
917 {
918 	zend_string *tmp;
919 	void *re = NULL;
920 
921 	if (!new_value) {
922 		new_value = entry->orig_value;
923 	}
924 	tmp = php_trim(new_value, NULL, 0, 3);
925 
926 	if (ZSTR_LEN(tmp) > 0) {
927 		if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
928 			zend_string_release_ex(tmp, 0);
929 			return FAILURE;
930 		}
931 	}
932 
933 	if (MBSTRG(http_output_conv_mimetypes)) {
934 		_php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
935 	}
936 
937 	MBSTRG(http_output_conv_mimetypes) = re;
938 
939 	zend_string_release_ex(tmp, 0);
940 	return SUCCESS;
941 }
942 /* }}} */
943 /* }}} */
944 
945 /* {{{ php.ini directive registration */
946 PHP_INI_BEGIN()
947 	PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
948 	PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
949 	PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
950 	PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
951 	STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
952 	PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
953 
954 	STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
955 		PHP_INI_SYSTEM | PHP_INI_PERDIR,
956 		OnUpdate_mbstring_encoding_translation,
957 		encoding_translation, zend_mbstring_globals, mbstring_globals)
958 	PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
959 		"^(text/|application/xhtml\\+xml)",
960 		PHP_INI_ALL,
961 		OnUpdate_mbstring_http_output_conv_mimetypes)
962 
963 	STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
964 		PHP_INI_ALL,
965 		OnUpdateBool,
966 		strict_detection, zend_mbstring_globals, mbstring_globals)
967 #ifdef HAVE_MBREGEX
968 	STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
969 	STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
970 #endif
PHP_INI_END()971 PHP_INI_END()
972 /* }}} */
973 
974 static void mbstring_internal_encoding_changed_hook(void) {
975 	/* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
976 	if (!MBSTRG(internal_encoding_set)) {
977 		const char *encoding = php_get_internal_encoding();
978 		_php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
979 	}
980 
981 	if (!MBSTRG(http_output_set)) {
982 		const char *encoding = php_get_output_encoding();
983 		_php_mb_ini_mbstring_http_output_set(encoding);
984 	}
985 
986 	if (!MBSTRG(http_input_set)) {
987 		const char *encoding = php_get_input_encoding();
988 		_php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
989 	}
990 }
991 
992 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)993 static PHP_GINIT_FUNCTION(mbstring)
994 {
995 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
996 ZEND_TSRMLS_CACHE_UPDATE();
997 #endif
998 
999 	mbstring_globals->language = mbfl_no_language_uni;
1000 	mbstring_globals->internal_encoding = NULL;
1001 	mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
1002 	mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
1003 	mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
1004 	mbstring_globals->http_input_identify = NULL;
1005 	mbstring_globals->http_input_identify_get = NULL;
1006 	mbstring_globals->http_input_identify_post = NULL;
1007 	mbstring_globals->http_input_identify_cookie = NULL;
1008 	mbstring_globals->http_input_identify_string = NULL;
1009 	mbstring_globals->http_input_list = NULL;
1010 	mbstring_globals->http_input_list_size = 0;
1011 	mbstring_globals->detect_order_list = NULL;
1012 	mbstring_globals->detect_order_list_size = 0;
1013 	mbstring_globals->current_detect_order_list = NULL;
1014 	mbstring_globals->current_detect_order_list_size = 0;
1015 	mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1016 	mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1017 	mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1018 	mbstring_globals->filter_illegal_substchar = 0x3f;	/* '?' */
1019 	mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1020 	mbstring_globals->current_filter_illegal_substchar = 0x3f;	/* '?' */
1021 	mbstring_globals->illegalchars = 0;
1022 	mbstring_globals->encoding_translation = 0;
1023 	mbstring_globals->strict_detection = 0;
1024 	mbstring_globals->outconv = NULL;
1025 	mbstring_globals->http_output_conv_mimetypes = NULL;
1026 #ifdef HAVE_MBREGEX
1027 	mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1028 #endif
1029 	mbstring_globals->last_used_encoding_name = NULL;
1030 	mbstring_globals->last_used_encoding = NULL;
1031 	mbstring_globals->internal_encoding_set = 0;
1032 	mbstring_globals->http_output_set = 0;
1033 	mbstring_globals->http_input_set = 0;
1034 }
1035 /* }}} */
1036 
1037 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1038 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1039 {
1040 	if (mbstring_globals->http_input_list) {
1041 		free(ZEND_VOIDP(mbstring_globals->http_input_list));
1042 	}
1043 	if (mbstring_globals->detect_order_list) {
1044 		free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1045 	}
1046 	if (mbstring_globals->http_output_conv_mimetypes) {
1047 		_php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1048 	}
1049 #ifdef HAVE_MBREGEX
1050 	php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1051 #endif
1052 }
1053 /* }}} */
1054 
1055 /* {{{ PHP_MINIT_FUNCTION(mbstring) */
PHP_MINIT_FUNCTION(mbstring)1056 PHP_MINIT_FUNCTION(mbstring)
1057 {
1058 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
1059 ZEND_TSRMLS_CACHE_UPDATE();
1060 #endif
1061 
1062 	REGISTER_INI_ENTRIES();
1063 
1064 	/* We assume that we're the only user of the hook. */
1065 	ZEND_ASSERT(php_internal_encoding_changed == NULL);
1066 	php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
1067 	mbstring_internal_encoding_changed_hook();
1068 
1069 	/* This is a global handler. Should not be set in a per-request handler. */
1070 	sapi_register_treat_data(mbstr_treat_data);
1071 
1072 	/* Post handlers are stored in the thread-local context. */
1073 	if (MBSTRG(encoding_translation)) {
1074 		sapi_register_post_entries(mbstr_post_entries);
1075 	}
1076 
1077 #ifdef HAVE_MBREGEX
1078 	PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1079 #endif
1080 
1081 	register_mbstring_symbols(module_number);
1082 
1083 	if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
1084 		return FAILURE;
1085 	}
1086 
1087 	php_rfc1867_set_multibyte_callbacks(
1088 		php_mb_encoding_translation,
1089 		php_mb_gpc_get_detect_order,
1090 		php_mb_gpc_set_input_encoding,
1091 		php_mb_rfc1867_getword,
1092 		php_mb_rfc1867_getword_conf,
1093 		php_mb_rfc1867_basename);
1094 
1095 	return SUCCESS;
1096 }
1097 /* }}} */
1098 
1099 /* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
PHP_MSHUTDOWN_FUNCTION(mbstring)1100 PHP_MSHUTDOWN_FUNCTION(mbstring)
1101 {
1102 	UNREGISTER_INI_ENTRIES();
1103 
1104 	zend_multibyte_restore_functions();
1105 
1106 #ifdef HAVE_MBREGEX
1107 	PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1108 #endif
1109 
1110 	php_internal_encoding_changed = NULL;
1111 
1112 	return SUCCESS;
1113 }
1114 /* }}} */
1115 
1116 /* {{{ PHP_RINIT_FUNCTION(mbstring) */
PHP_RINIT_FUNCTION(mbstring)1117 PHP_RINIT_FUNCTION(mbstring)
1118 {
1119 	MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
1120 	MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
1121 	MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
1122 	MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);
1123 
1124 	MBSTRG(illegalchars) = 0;
1125 
1126 	php_mb_populate_current_detect_order_list();
1127 
1128 #ifdef HAVE_MBREGEX
1129 	PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1130 #endif
1131 	zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));
1132 
1133 	return SUCCESS;
1134 }
1135 /* }}} */
1136 
1137 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1138 PHP_RSHUTDOWN_FUNCTION(mbstring)
1139 {
1140 	if (MBSTRG(current_detect_order_list) != NULL) {
1141 		efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1142 		MBSTRG(current_detect_order_list) = NULL;
1143 		MBSTRG(current_detect_order_list_size) = 0;
1144 	}
1145 	if (MBSTRG(outconv) != NULL) {
1146 		MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1147 		mbfl_buffer_converter_delete(MBSTRG(outconv));
1148 		MBSTRG(outconv) = NULL;
1149 	}
1150 
1151 	/* clear http input identification. */
1152 	MBSTRG(http_input_identify) = NULL;
1153 	MBSTRG(http_input_identify_post) = NULL;
1154 	MBSTRG(http_input_identify_get) = NULL;
1155 	MBSTRG(http_input_identify_cookie) = NULL;
1156 	MBSTRG(http_input_identify_string) = NULL;
1157 
1158 	if (MBSTRG(last_used_encoding_name)) {
1159 		zend_string_release(MBSTRG(last_used_encoding_name));
1160 		MBSTRG(last_used_encoding_name) = NULL;
1161 	}
1162 
1163 	MBSTRG(internal_encoding_set) = 0;
1164 	MBSTRG(http_output_set) = 0;
1165 	MBSTRG(http_input_set) = 0;
1166 
1167 #ifdef HAVE_MBREGEX
1168 	PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1169 #endif
1170 
1171 	return SUCCESS;
1172 }
1173 /* }}} */
1174 
1175 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1176 PHP_MINFO_FUNCTION(mbstring)
1177 {
1178 	php_info_print_table_start();
1179 	php_info_print_table_row(2, "Multibyte Support", "enabled");
1180 	php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1181 	php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1182 	{
1183 		char tmp[256];
1184 		snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1185 		php_info_print_table_row(2, "libmbfl version", tmp);
1186 	}
1187 	php_info_print_table_end();
1188 
1189 	php_info_print_table_start();
1190 	php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1191 	php_info_print_table_end();
1192 
1193 #ifdef HAVE_MBREGEX
1194 	PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1195 #endif
1196 
1197 	DISPLAY_INI_ENTRIES();
1198 }
1199 /* }}} */
1200 
1201 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1202 PHP_FUNCTION(mb_language)
1203 {
1204 	zend_string *name = NULL;
1205 
1206 	ZEND_PARSE_PARAMETERS_START(0, 1)
1207 		Z_PARAM_OPTIONAL
1208 		Z_PARAM_STR_OR_NULL(name)
1209 	ZEND_PARSE_PARAMETERS_END();
1210 
1211 	if (name == NULL) {
1212 		RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1213 	} else {
1214 		zend_string *ini_name = zend_string_init("mbstring.language", sizeof("mbstring.language") - 1, 0);
1215 		if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1216 			zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1217 			zend_string_release_ex(ini_name, 0);
1218 			RETURN_THROWS();
1219 		}
1220 		// TODO Make return void
1221 		RETVAL_TRUE;
1222 		zend_string_release_ex(ini_name, 0);
1223 	}
1224 }
1225 /* }}} */
1226 
1227 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1228 PHP_FUNCTION(mb_internal_encoding)
1229 {
1230 	char *name = NULL;
1231 	size_t name_len;
1232 	const mbfl_encoding *encoding;
1233 
1234 	ZEND_PARSE_PARAMETERS_START(0, 1)
1235 		Z_PARAM_OPTIONAL
1236 		Z_PARAM_STRING_OR_NULL(name, name_len)
1237 	ZEND_PARSE_PARAMETERS_END();
1238 
1239 	if (name == NULL) {
1240 		ZEND_ASSERT(MBSTRG(current_internal_encoding));
1241 		RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1242 	} else {
1243 		encoding = mbfl_name2encoding(name);
1244 		if (!encoding) {
1245 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1246 			RETURN_THROWS();
1247 		} else {
1248 			MBSTRG(current_internal_encoding) = encoding;
1249 			MBSTRG(internal_encoding_set) = 1;
1250 			/* TODO Return old encoding */
1251 			RETURN_TRUE;
1252 		}
1253 	}
1254 }
1255 /* }}} */
1256 
1257 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1258 PHP_FUNCTION(mb_http_input)
1259 {
1260 	char *type = NULL;
1261 	size_t type_len = 0, n;
1262 	const mbfl_encoding **entry;
1263 	const mbfl_encoding *encoding;
1264 
1265 	ZEND_PARSE_PARAMETERS_START(0, 1)
1266 		Z_PARAM_OPTIONAL
1267 		Z_PARAM_STRING_OR_NULL(type, type_len)
1268 	ZEND_PARSE_PARAMETERS_END();
1269 
1270 	if (type == NULL) {
1271 		encoding = MBSTRG(http_input_identify);
1272 	} else {
1273 		switch (*type) {
1274 		case 'G':
1275 		case 'g':
1276 			encoding = MBSTRG(http_input_identify_get);
1277 			break;
1278 		case 'P':
1279 		case 'p':
1280 			encoding = MBSTRG(http_input_identify_post);
1281 			break;
1282 		case 'C':
1283 		case 'c':
1284 			encoding = MBSTRG(http_input_identify_cookie);
1285 			break;
1286 		case 'S':
1287 		case 's':
1288 			encoding = MBSTRG(http_input_identify_string);
1289 			break;
1290 		case 'I':
1291 		case 'i':
1292 			entry = MBSTRG(http_input_list);
1293 			n = MBSTRG(http_input_list_size);
1294 			array_init(return_value);
1295 			for (size_t i = 0; i < n; i++, entry++) {
1296 				add_next_index_string(return_value, (*entry)->name);
1297 			}
1298 			return;
1299 		case 'L':
1300 		case 'l':
1301 			entry = MBSTRG(http_input_list);
1302 			n = MBSTRG(http_input_list_size);
1303 			if (n == 0) {
1304 				RETURN_FALSE;
1305 			}
1306 			// TODO Use smart_str instead.
1307 			mbfl_string result;
1308 			mbfl_memory_device device;
1309 			mbfl_memory_device_init(&device, n * 12, 0);
1310 			for (size_t i = 0; i < n; i++, entry++) {
1311 				mbfl_memory_device_strcat(&device, (*entry)->name);
1312 				mbfl_memory_device_output(',', &device);
1313 			}
1314 			mbfl_memory_device_unput(&device); /* Remove trailing comma */
1315 			mbfl_memory_device_result(&device, &result);
1316 			RETVAL_STRINGL((const char*)result.val, result.len);
1317 			mbfl_string_clear(&result);
1318 			return;
1319 		default:
1320 			zend_argument_value_error(1,
1321 				"must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1322 			RETURN_THROWS();
1323 		}
1324 	}
1325 
1326 	if (encoding) {
1327 		RETURN_STRING(encoding->name);
1328 	} else {
1329 		RETURN_FALSE;
1330 	}
1331 }
1332 /* }}} */
1333 
1334 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1335 PHP_FUNCTION(mb_http_output)
1336 {
1337 	char *name = NULL;
1338 	size_t name_len;
1339 
1340 	ZEND_PARSE_PARAMETERS_START(0, 1)
1341 		Z_PARAM_OPTIONAL
1342 		Z_PARAM_STRING_OR_NULL(name, name_len)
1343 	ZEND_PARSE_PARAMETERS_END();
1344 
1345 	if (name == NULL) {
1346 		ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1347 		RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1348 	} else {
1349 		const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name);
1350 		if (!encoding) {
1351 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1352 			RETURN_THROWS();
1353 		} else {
1354 			MBSTRG(http_output_set) = 1;
1355 			MBSTRG(current_http_output_encoding) = encoding;
1356 			/* TODO Return previous encoding? */
1357 			RETURN_TRUE;
1358 		}
1359 	}
1360 }
1361 /* }}} */
1362 
1363 /* {{{ Sets the current detect_order or Return the current detect_order as a array */
PHP_FUNCTION(mb_detect_order)1364 PHP_FUNCTION(mb_detect_order)
1365 {
1366 	zend_string *order_str = NULL;
1367 	HashTable *order_ht = NULL;
1368 
1369 	ZEND_PARSE_PARAMETERS_START(0, 1)
1370 		Z_PARAM_OPTIONAL
1371 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1372 	ZEND_PARSE_PARAMETERS_END();
1373 
1374 	if (!order_str && !order_ht) {
1375 		size_t n = MBSTRG(current_detect_order_list_size);
1376 		const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1377 		array_init(return_value);
1378 		for (size_t i = 0; i < n; i++) {
1379 			add_next_index_string(return_value, (*entry)->name);
1380 			entry++;
1381 		}
1382 	} else {
1383 		const mbfl_encoding **list;
1384 		size_t size;
1385 		if (order_ht) {
1386 			if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1387 				RETURN_THROWS();
1388 			}
1389 		} else {
1390 			if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1, /* allow_pass_encoding */ 0)) {
1391 				RETURN_THROWS();
1392 			}
1393 		}
1394 
1395 		if (size == 0) {
1396 			efree(ZEND_VOIDP(list));
1397 			zend_argument_value_error(1, "must specify at least one encoding");
1398 			RETURN_THROWS();
1399 		}
1400 
1401 		if (MBSTRG(current_detect_order_list)) {
1402 			efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1403 		}
1404 		MBSTRG(current_detect_order_list) = list;
1405 		MBSTRG(current_detect_order_list_size) = size;
1406 		RETURN_TRUE;
1407 	}
1408 }
1409 /* }}} */
1410 
/* Returns 1 if `cp` is acceptable as a substitute character, 0 otherwise.
 *
 * Rejected are values outside the Unicode range (negative or >= 0x110000) and
 * surrogate code points (0xd800..0xdfff), which are never valid on their own
 * and only a single substitute character is allowed.
 *
 * Everything else is accepted: the target encoding of the conversion that will
 * use the substitute is not known here, so whether the code point is actually
 * mapped in that encoding cannot be checked at this point. */
static inline int php_mb_check_code_point(zend_long cp)
{
	const bool in_unicode_range = (cp >= 0 && cp < 0x110000);
	const bool is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);

	return in_unicode_range && !is_surrogate;
}
1429 
1430 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1431 PHP_FUNCTION(mb_substitute_character)
1432 {
1433 	zend_string *substitute_character = NULL;
1434 	zend_long substitute_codepoint;
1435 	bool substitute_is_null = 1;
1436 
1437 	ZEND_PARSE_PARAMETERS_START(0, 1)
1438 		Z_PARAM_OPTIONAL
1439 		Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1440 	ZEND_PARSE_PARAMETERS_END();
1441 
1442 	if (substitute_is_null) {
1443 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1444 			RETURN_STRING("none");
1445 		}
1446 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1447 			RETURN_STRING("long");
1448 		}
1449 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1450 			RETURN_STRING("entity");
1451 		}
1452 		RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1453 	}
1454 
1455 	if (substitute_character != NULL) {
1456 		if (zend_string_equals_literal_ci(substitute_character, "none")) {
1457 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1458 			RETURN_TRUE;
1459 		}
1460 		if (zend_string_equals_literal_ci(substitute_character, "long")) {
1461 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1462 			RETURN_TRUE;
1463 		}
1464 		if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1465 			MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1466 			RETURN_TRUE;
1467 		}
1468 		/* Invalid string value */
1469 		zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1470 		RETURN_THROWS();
1471 	}
1472 	/* Integer codepoint passed */
1473 	if (!php_mb_check_code_point(substitute_codepoint)) {
1474 		zend_argument_value_error(1, "is not a valid codepoint");
1475 		RETURN_THROWS();
1476 	}
1477 
1478 	MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1479 	MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1480 	RETURN_TRUE;
1481 }
1482 /* }}} */
1483 
1484 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1485 PHP_FUNCTION(mb_preferred_mime_name)
1486 {
1487 	enum mbfl_no_encoding no_encoding;
1488 	char *name = NULL;
1489 	size_t name_len;
1490 
1491 	ZEND_PARSE_PARAMETERS_START(1, 1)
1492 		Z_PARAM_STRING(name, name_len)
1493 	ZEND_PARSE_PARAMETERS_END();
1494 
1495 	no_encoding = mbfl_name2no_encoding(name);
1496 	if (no_encoding == mbfl_no_encoding_invalid) {
1497 		zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1498 		RETURN_THROWS();
1499 	}
1500 
1501 	const char *preferred_name = mbfl_no2preferred_mime_name(no_encoding);
1502 	if (preferred_name == NULL || *preferred_name == '\0') {
1503 		php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1504 		RETVAL_FALSE;
1505 	} else {
1506 		RETVAL_STRING((char *)preferred_name);
1507 	}
1508 }
1509 /* }}} */
1510 
1511 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1512 PHP_FUNCTION(mb_parse_str)
1513 {
1514 	zval *track_vars_array = NULL;
1515 	char *encstr;
1516 	size_t encstr_len;
1517 	php_mb_encoding_handler_info_t info;
1518 	const mbfl_encoding *detected;
1519 
1520 	ZEND_PARSE_PARAMETERS_START(2, 2)
1521 		Z_PARAM_STRING(encstr, encstr_len)
1522 		Z_PARAM_ZVAL(track_vars_array)
1523 	ZEND_PARSE_PARAMETERS_END();
1524 
1525 	track_vars_array = zend_try_array_init(track_vars_array);
1526 	if (!track_vars_array) {
1527 		RETURN_THROWS();
1528 	}
1529 
1530 	encstr = estrndup(encstr, encstr_len);
1531 
1532 	info.data_type              = PARSE_STRING;
1533 	info.separator              = PG(arg_separator).input;
1534 	info.report_errors          = true;
1535 	info.to_encoding            = MBSTRG(current_internal_encoding);
1536 	info.from_encodings         = MBSTRG(http_input_list);
1537 	info.num_from_encodings     = MBSTRG(http_input_list_size);
1538 
1539 	detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1540 
1541 	MBSTRG(http_input_identify) = detected;
1542 
1543 	RETVAL_BOOL(detected);
1544 
1545 	if (encstr != NULL) efree(encstr);
1546 }
1547 /* }}} */
1548 
1549 /* {{{ Returns string in output buffer converted to the http_output encoding */
PHP_FUNCTION(mb_output_handler)1550 PHP_FUNCTION(mb_output_handler)
1551 {
1552 	char *arg_string;
1553 	size_t arg_string_len;
1554 	zend_long arg_status;
1555 	mbfl_string string, result;
1556 	const char *charset;
1557 	char *p;
1558 	const mbfl_encoding *encoding;
1559 	int last_feed;
1560 	size_t len;
1561 	unsigned char send_text_mimetype = 0;
1562 	char *s, *mimetype = NULL;
1563 
1564 	ZEND_PARSE_PARAMETERS_START(2, 2)
1565 		Z_PARAM_STRING(arg_string, arg_string_len)
1566 		Z_PARAM_LONG(arg_status)
1567 	ZEND_PARSE_PARAMETERS_END();
1568 
1569 	encoding = MBSTRG(current_http_output_encoding);
1570 
1571 	/* start phase only */
1572 	if ((arg_status & PHP_OUTPUT_HANDLER_START) != 0) {
1573 		/* delete the converter just in case. */
1574 		if (MBSTRG(outconv)) {
1575 			MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1576 			mbfl_buffer_converter_delete(MBSTRG(outconv));
1577 			MBSTRG(outconv) = NULL;
1578 		}
1579 
1580 		if (encoding == &mbfl_encoding_pass) {
1581 			RETURN_STRINGL(arg_string, arg_string_len);
1582 		}
1583 
1584 		/* analyze mime type */
1585 		if (SG(sapi_headers).mimetype &&
1586 			_php_mb_match_regex(
1587 				MBSTRG(http_output_conv_mimetypes),
1588 				SG(sapi_headers).mimetype,
1589 				strlen(SG(sapi_headers).mimetype))) {
1590 			if ((s = strchr(SG(sapi_headers).mimetype,';')) == NULL) {
1591 				mimetype = estrdup(SG(sapi_headers).mimetype);
1592 			} else {
1593 				mimetype = estrndup(SG(sapi_headers).mimetype,s-SG(sapi_headers).mimetype);
1594 			}
1595 			send_text_mimetype = 1;
1596 		} else if (SG(sapi_headers).send_default_content_type) {
1597 			mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1598 		}
1599 
1600 		/* if content-type is not yet set, set it and activate the converter */
1601 		if (SG(sapi_headers).send_default_content_type || send_text_mimetype) {
1602 			charset = encoding->mime_name;
1603 			if (charset) {
1604 				len = spprintf( &p, 0, "Content-Type: %s; charset=%s",  mimetype, charset );
1605 				if (sapi_add_header(p, len, 0) != FAILURE) {
1606 					SG(sapi_headers).send_default_content_type = 0;
1607 				}
1608 			}
1609 			/* activate the converter */
1610 			MBSTRG(outconv) = mbfl_buffer_converter_new(MBSTRG(current_internal_encoding), encoding, 0);
1611 			if (send_text_mimetype){
1612 				efree(mimetype);
1613 			}
1614 		}
1615 	}
1616 
1617 	/* just return if the converter is not activated. */
1618 	if (MBSTRG(outconv) == NULL) {
1619 		RETURN_STRINGL(arg_string, arg_string_len);
1620 	}
1621 
1622 	/* flag */
1623 	last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1624 	/* mode */
1625 	mbfl_buffer_converter_illegal_mode(MBSTRG(outconv), MBSTRG(current_filter_illegal_mode));
1626 	mbfl_buffer_converter_illegal_substchar(MBSTRG(outconv), MBSTRG(current_filter_illegal_substchar));
1627 
1628 	/* feed the string */
1629 	mbfl_string_init(&string);
1630 	/* these are not needed. convd has encoding info.
1631 	string.encoding = MBSTRG(current_internal_encoding);
1632 	*/
1633 	string.val = (unsigned char *)arg_string;
1634 	string.len = arg_string_len;
1635 
1636 	mbfl_buffer_converter_feed(MBSTRG(outconv), &string);
1637 	if (last_feed) {
1638 		mbfl_buffer_converter_flush(MBSTRG(outconv));
1639 	}
1640 	/* get the converter output, and return it */
1641 	mbfl_buffer_converter_result(MBSTRG(outconv), &result);
1642 
1643 	// TODO: avoid reallocation ???
1644 	RETVAL_STRINGL((char *)result.val, result.len);		/* the string is already strdup()'ed */
1645 	efree(result.val);
1646 
1647 	/* delete the converter if it is the last feed. */
1648 	if (last_feed) {
1649 		MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1650 		mbfl_buffer_converter_delete(MBSTRG(outconv));
1651 		MBSTRG(outconv) = NULL;
1652 	}
1653 }
1654 /* }}} */
1655 
1656 /* {{{ Convert a multibyte string to an array. If split_length is specified,
1657  break the string down into chunks each split_length characters long. */
1658 
1659 /* structure to pass split params to the callback */
1660 struct mbfl_split_params {
1661 	zval *return_value; /* php function return value structure pointer */
1662 	mbfl_string *result_string; /* string to store result chunk */
1663 	size_t mb_chunk_length; /* actual chunk length in chars */
1664 	size_t split_length; /* split length in chars */
1665 	mbfl_convert_filter *next_filter; /* widechar to encoding converter */
1666 };
1667 
1668 /* callback function to fill split array */
mbfl_split_output(int c,void * data)1669 static int mbfl_split_output(int c, void *data)
1670 {
1671 	struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */
1672 
1673 	(*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */
1674 
1675 	if (params->split_length == ++params->mb_chunk_length) { /* if current chunk size reached defined chunk size or last char reached */
1676 		mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */
1677 		mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */
1678 		mbfl_string *chunk = params->result_string;
1679 		mbfl_memory_device_result(device, chunk); /* make chunk */
1680 		add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */
1681 		efree(chunk->val);
1682 		params->mb_chunk_length = 0; /* reset mb_chunk size */
1683 	}
1684 
1685 	return 0;
1686 }
1687 
PHP_FUNCTION(mb_str_split)1688 PHP_FUNCTION(mb_str_split)
1689 {
1690 	zend_string *str, *encoding = NULL;
1691 	size_t mb_len, chunks, chunk_len;
1692 	const char *p, *last; /* pointer for the string cursor and last string char */
1693 	mbfl_string string, result_string;
1694 	const mbfl_encoding *mbfl_encoding;
1695 	zend_long split_length = 1;
1696 
1697 	ZEND_PARSE_PARAMETERS_START(1, 3)
1698 		Z_PARAM_STR(str)
1699 		Z_PARAM_OPTIONAL
1700 		Z_PARAM_LONG(split_length)
1701 		Z_PARAM_STR_OR_NULL(encoding)
1702 	ZEND_PARSE_PARAMETERS_END();
1703 
1704 	if (split_length <= 0) {
1705 		zend_argument_value_error(2, "must be greater than 0");
1706 		RETURN_THROWS();
1707 	}
1708 
1709 	/* fill mbfl_string structure */
1710 	string.val = (unsigned char *) ZSTR_VAL(str);
1711 	string.len = ZSTR_LEN(str);
1712 	string.encoding = php_mb_get_encoding(encoding, 3);
1713 	if (!string.encoding) {
1714 		RETURN_THROWS();
1715 	}
1716 
1717 	if (ZSTR_LEN(str) == 0) {
1718 		RETURN_EMPTY_ARRAY();
1719 	}
1720 
1721 	p = ZSTR_VAL(str); /* string cursor pointer */
1722 	last = ZSTR_VAL(str) + ZSTR_LEN(str); /* last string char pointer */
1723 
1724 	mbfl_encoding = string.encoding;
1725 
1726 	/* first scenario: 1,2,4-bytes fixed width encodings (head part) */
1727 	if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
1728 		mb_len = string.len;
1729 		chunk_len = (size_t)split_length; /* chunk length in bytes */
1730 	} else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS2) { /* 2 bytes */
1731 		mb_len = string.len / 2;
1732 		chunk_len = split_length * 2;
1733 	} else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS4) { /* 4 bytes */
1734 		mb_len = string.len / 4;
1735 		chunk_len = split_length * 4;
1736 	} else if (mbfl_encoding->mblen_table != NULL) {
1737 		/* second scenario: variable width encodings with length table */
1738 		char unsigned const *mbtab = mbfl_encoding->mblen_table;
1739 
1740 		/* assume that we have 1-bytes characters */
1741 		array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1742 
1743 		while (p < last) { /* split cycle work until the cursor has reached the last byte */
1744 			char const *chunk_p = p; /* chunk first byte pointer */
1745 			chunk_len = 0; /* chunk length in bytes */
1746 			zend_long char_count;
1747 
1748 			for (char_count = 0; char_count < split_length && p < last; ++char_count) {
1749 				char unsigned const m = mbtab[*(const unsigned char *)p]; /* single character length table */
1750 				chunk_len += m;
1751 				p += m;
1752 			}
1753 			if (p >= last) chunk_len -= p - last; /* check if chunk is in bounds */
1754 			add_next_index_stringl(return_value, chunk_p, chunk_len);
1755 		}
1756 		return;
1757 	} else {
1758 		/* third scenario: other multibyte encodings */
1759 		mbfl_convert_filter *filter, *decoder;
1760 
1761 		/* assume that we have 1-bytes characters */
1762 		array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1763 
1764 		/* decoder filter to decode wchar to encoding */
1765 		mbfl_memory_device device;
1766 		mbfl_memory_device_init(&device, split_length + 1, 0);
1767 
1768 		decoder = mbfl_convert_filter_new(
1769 				&mbfl_encoding_wchar,
1770 				string.encoding,
1771 				mbfl_memory_device_output,
1772 				NULL,
1773 				&device);
1774 		/* assert that nothing is wrong with the decoder */
1775 		ZEND_ASSERT(decoder != NULL);
1776 
1777 		/* wchar filter */
1778 		mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */
1779 		struct mbfl_split_params params = { /* init callback function params structure */
1780 			.return_value = return_value,
1781 			.result_string = &result_string,
1782 			.mb_chunk_length = 0,
1783 			.split_length = (size_t)split_length,
1784 			.next_filter = decoder,
1785 		};
1786 
1787 		filter = mbfl_convert_filter_new(
1788 				string.encoding,
1789 				&mbfl_encoding_wchar,
1790 				mbfl_split_output,
1791 				NULL,
1792 				&params);
1793 		/* assert that nothing is wrong with the filter */
1794 		ZEND_ASSERT(filter != NULL);
1795 
1796 		while (p < last - 1) { /* cycle each byte except last with callback function */
1797 			(*filter->filter_function)(*p++, filter);
1798 		}
1799 		params.mb_chunk_length = split_length - 1; /* force to finish current chunk */
1800 		(*filter->filter_function)(*p++, filter); /* process last char */
1801 
1802 		mbfl_convert_filter_delete(decoder);
1803 		mbfl_convert_filter_delete(filter);
1804 		mbfl_memory_device_clear(&device);
1805 		return;
1806 	}
1807 
1808 	/* first scenario: 1,2,4-bytes fixed width encodings (tail part) */
1809 	chunks = (mb_len + split_length - 1) / split_length; /* (round up idiom) */
1810 	array_init_size(return_value, chunks);
1811 	if (chunks != 0) {
1812 		zend_long i;
1813 
1814 		for (i = 0; i < chunks - 1; p += chunk_len, ++i) {
1815 			add_next_index_stringl(return_value, p, chunk_len);
1816 		}
1817 		add_next_index_stringl(return_value, p, last - p);
1818 	}
1819 }
1820 /* }}} */
1821 
mb_get_strlen(zend_string * string,const mbfl_encoding * encoding)1822 static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
1823 {
1824 	size_t len = 0;
1825 
1826 	if (encoding->flag & MBFL_ENCTYPE_SBCS) {
1827 		return ZSTR_LEN(string);
1828 	} else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
1829 		return ZSTR_LEN(string) / 2;
1830 	} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
1831 		return ZSTR_LEN(string) / 4;
1832 	} else if (encoding->mblen_table) {
1833 		const unsigned char *mbtab = encoding->mblen_table;
1834 		unsigned char *p = (unsigned char*)ZSTR_VAL(string), *e = p + ZSTR_LEN(string);
1835 		while (p < e) {
1836 			p += mbtab[*p];
1837 			len++;
1838 		}
1839 	} else {
1840 		uint32_t wchar_buf[128];
1841 		unsigned char *in = (unsigned char*)ZSTR_VAL(string);
1842 		size_t in_len = ZSTR_LEN(string);
1843 		unsigned int state = 0;
1844 
1845 		while (in_len) {
1846 			len += encoding->to_wchar(&in, &in_len, wchar_buf, 128, &state);
1847 		}
1848 	}
1849 
1850 	return len;
1851 }
1852 
1853 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1854 PHP_FUNCTION(mb_strlen)
1855 {
1856 	zend_string *string, *enc_name = NULL;
1857 
1858 	ZEND_PARSE_PARAMETERS_START(1, 2)
1859 		Z_PARAM_STR(string)
1860 		Z_PARAM_OPTIONAL
1861 		Z_PARAM_STR_OR_NULL(enc_name)
1862 	ZEND_PARSE_PARAMETERS_END();
1863 
1864 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1865 	if (!enc) {
1866 		RETURN_THROWS();
1867 	}
1868 
1869 	RETVAL_LONG(mb_get_strlen(string, enc));
1870 }
1871 /* }}} */
1872 
handle_strpos_error(size_t error)1873 static void handle_strpos_error(size_t error) {
1874 	switch (error) {
1875 	case MBFL_ERROR_NOT_FOUND:
1876 		break;
1877 	case MBFL_ERROR_ENCODING:
1878 		php_error_docref(NULL, E_WARNING, "Conversion error");
1879 		break;
1880 	case MBFL_ERROR_OFFSET:
1881 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1882 		break;
1883 	default:
1884 		zend_value_error("mb_strpos(): Unknown error");
1885 		break;
1886 	}
1887 }
1888 
1889 /* {{{ Find position of first occurrence of a string within another */
PHP_FUNCTION(mb_strpos)1890 PHP_FUNCTION(mb_strpos)
1891 {
1892 	int reverse = 0;
1893 	zend_long offset = 0;
1894 	char *haystack_val, *needle_val;
1895 	mbfl_string haystack, needle;
1896 	zend_string *enc_name = NULL;
1897 
1898 	ZEND_PARSE_PARAMETERS_START(2, 4)
1899 		Z_PARAM_STRING(haystack_val, haystack.len)
1900 		Z_PARAM_STRING(needle_val, needle.len)
1901 		Z_PARAM_OPTIONAL
1902 		Z_PARAM_LONG(offset)
1903 		Z_PARAM_STR_OR_NULL(enc_name)
1904 	ZEND_PARSE_PARAMETERS_END();
1905 
1906 	haystack.val = (unsigned char*)haystack_val;
1907 	needle.val = (unsigned char*)needle_val;
1908 
1909 	haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 4);
1910 	if (!haystack.encoding) {
1911 		RETURN_THROWS();
1912 	}
1913 
1914 	size_t n = mbfl_strpos(&haystack, &needle, offset, reverse);
1915 	if (!mbfl_is_error(n)) {
1916 		RETVAL_LONG(n);
1917 	} else {
1918 		handle_strpos_error(n);
1919 		RETVAL_FALSE;
1920 	}
1921 }
1922 /* }}} */
1923 
1924 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)1925 PHP_FUNCTION(mb_strrpos)
1926 {
1927 	mbfl_string haystack, needle;
1928 	char *haystack_val, *needle_val;
1929 	zend_string *enc_name = NULL;
1930 	zend_long offset = 0;
1931 
1932 	ZEND_PARSE_PARAMETERS_START(2, 4)
1933 		Z_PARAM_STRING(haystack_val, haystack.len)
1934 		Z_PARAM_STRING(needle_val, needle.len)
1935 		Z_PARAM_OPTIONAL
1936 		Z_PARAM_LONG(offset)
1937 		Z_PARAM_STR_OR_NULL(enc_name)
1938 	ZEND_PARSE_PARAMETERS_END();
1939 
1940 	haystack.val = (unsigned char*)haystack_val;
1941 	needle.val = (unsigned char*)needle_val;
1942 
1943 	haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 4);
1944 	if (!haystack.encoding) {
1945 		RETURN_THROWS();
1946 	}
1947 
1948 	size_t n = mbfl_strpos(&haystack, &needle, offset, 1);
1949 	if (!mbfl_is_error(n)) {
1950 		RETVAL_LONG(n);
1951 	} else {
1952 		handle_strpos_error(n);
1953 		RETVAL_FALSE;
1954 	}
1955 }
1956 /* }}} */
1957 
1958 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)1959 PHP_FUNCTION(mb_stripos)
1960 {
1961 	zend_long offset = 0;
1962 	mbfl_string haystack, needle;
1963 	char *haystack_val, *needle_val;
1964 	zend_string *from_encoding = NULL;
1965 
1966 	ZEND_PARSE_PARAMETERS_START(2, 4)
1967 		Z_PARAM_STRING(haystack_val, haystack.len)
1968 		Z_PARAM_STRING(needle_val, needle.len)
1969 		Z_PARAM_OPTIONAL
1970 		Z_PARAM_LONG(offset)
1971 		Z_PARAM_STR_OR_NULL(from_encoding)
1972 	ZEND_PARSE_PARAMETERS_END();
1973 
1974 	haystack.val = (unsigned char*)haystack_val;
1975 	needle.val = (unsigned char*)needle_val;
1976 
1977 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
1978 	if (!enc) {
1979 		RETURN_THROWS();
1980 	}
1981 
1982 	size_t n = php_mb_stripos(0, (char *)haystack.val, haystack.len, (char *)needle.val, needle.len, offset, enc);
1983 
1984 	if (!mbfl_is_error(n)) {
1985 		RETVAL_LONG(n);
1986 	} else {
1987 		handle_strpos_error(n);
1988 		RETVAL_FALSE;
1989 	}
1990 }
1991 /* }}} */
1992 
1993 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)1994 PHP_FUNCTION(mb_strripos)
1995 {
1996 	zend_long offset = 0;
1997 	mbfl_string haystack, needle;
1998 	char *haystack_val, *needle_val;
1999 	zend_string *from_encoding = NULL;
2000 
2001 	ZEND_PARSE_PARAMETERS_START(2, 4)
2002 		Z_PARAM_STRING(haystack_val, haystack.len)
2003 		Z_PARAM_STRING(needle_val, needle.len)
2004 		Z_PARAM_OPTIONAL
2005 		Z_PARAM_LONG(offset)
2006 		Z_PARAM_STR_OR_NULL(from_encoding)
2007 	ZEND_PARSE_PARAMETERS_END();
2008 
2009 	haystack.val = (unsigned char*)haystack_val;
2010 	needle.val = (unsigned char*)needle_val;
2011 
2012 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2013 	if (!enc) {
2014 		RETURN_THROWS();
2015 	}
2016 
2017 	size_t n = php_mb_stripos(1, (char *)haystack.val, haystack.len, (char *)needle.val, needle.len, offset, enc);
2018 
2019 	if (!mbfl_is_error(n)) {
2020 		RETVAL_LONG(n);
2021 	} else {
2022 		handle_strpos_error(n);
2023 		RETVAL_FALSE;
2024 	}
2025 }
2026 /* }}} */
2027 
2028 #define MB_STRSTR 1
2029 #define MB_STRRCHR 2
2030 #define MB_STRISTR 3
2031 #define MB_STRRICHR 4
2032 /* {{{ php_mb_strstr_variants */
php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS,unsigned int variant)2033 static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
2034 {
2035 	int reverse_mode = 0;
2036 	size_t n;
2037 	char *haystack_val, *needle_val;
2038 	mbfl_string haystack, needle, result, *ret = NULL;
2039 	zend_string *encoding_name = NULL;
2040 	bool part = 0;
2041 
2042 	ZEND_PARSE_PARAMETERS_START(2, 4)
2043 		Z_PARAM_STRING(haystack_val, haystack.len)
2044 		Z_PARAM_STRING(needle_val, needle.len)
2045 		Z_PARAM_OPTIONAL
2046 		Z_PARAM_BOOL(part)
2047 		Z_PARAM_STR_OR_NULL(encoding_name)
2048 	ZEND_PARSE_PARAMETERS_END();
2049 
2050 	haystack.val = (unsigned char*)haystack_val;
2051 	needle.val = (unsigned char*)needle_val;
2052 	haystack.encoding = needle.encoding = php_mb_get_encoding(encoding_name, 4);
2053 	if (!haystack.encoding) {
2054 		RETURN_THROWS();
2055 	}
2056 
2057 	if (variant == MB_STRRCHR || variant == MB_STRRICHR) { reverse_mode = 1; }
2058 
2059 	if (variant == MB_STRISTR || variant == MB_STRRICHR) {
2060 		n = php_mb_stripos(reverse_mode, (char *)haystack.val, haystack.len, (char *)needle.val,
2061 			needle.len, 0, needle.encoding);
2062 	} else {
2063 		n = mbfl_strpos(&haystack, &needle, 0, reverse_mode);
2064 	}
2065 
2066 	if (!mbfl_is_error(n)) {
2067 		if (part) {
2068 			ret = mbfl_substr(&haystack, &result, 0, n);
2069 			ZEND_ASSERT(ret != NULL);
2070 			// TODO: avoid reallocation ???
2071 			RETVAL_STRINGL((char *)ret->val, ret->len);
2072 			efree(ret->val);
2073 		} else {
2074 			ret = mbfl_substr(&haystack, &result, n, MBFL_SUBSTR_UNTIL_END);
2075 			ZEND_ASSERT(ret != NULL);
2076 			// TODO: avoid reallocation ???
2077 			RETVAL_STRINGL((char *)ret->val, ret->len);
2078 			efree(ret->val);
2079 		}
2080 	} else {
2081 		// FIXME use handle_strpos_error(n)
2082 		RETVAL_FALSE;
2083 	}
2084 }
2085 
/* {{{ Finds first occurrence of a string within another
 * Thin entry point: forwards the PHP call unchanged to
 * php_mb_strstr_variants() with the MB_STRSTR variant. */
PHP_FUNCTION(mb_strstr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
}
/* }}} */
2092 
/* {{{ Finds the last occurrence of a character in a string within another
 * Thin entry point: forwards the PHP call unchanged to
 * php_mb_strstr_variants() with the MB_STRRCHR variant (reverse search). */
PHP_FUNCTION(mb_strrchr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
}
/* }}} */
2099 
/* {{{ Finds first occurrence of a string within another, case insensitive
 * Thin entry point: forwards the PHP call unchanged to
 * php_mb_strstr_variants() with the MB_STRISTR variant (case-insensitive search). */
PHP_FUNCTION(mb_stristr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
}
/* }}} */
2106 
/* {{{ Finds the last occurrence of a character in a string within another, case insensitive
 * Thin entry point: forwards the PHP call unchanged to
 * php_mb_strstr_variants() with the MB_STRRICHR variant (reverse,
 * case-insensitive search). */
PHP_FUNCTION(mb_strrichr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
}
/* }}} */
2113 
2114 #undef MB_STRSTR
2115 #undef MB_STRRCHR
2116 #undef MB_STRISTR
2117 #undef MB_STRRICHR
2118 
2119 /* {{{ Count the number of substring occurrences */
PHP_FUNCTION(mb_substr_count)2120 PHP_FUNCTION(mb_substr_count)
2121 {
2122 	mbfl_string haystack, needle;
2123 	char *haystack_val, *needle_val;
2124 	zend_string *enc_name = NULL;
2125 
2126 	ZEND_PARSE_PARAMETERS_START(2, 3)
2127 		Z_PARAM_STRING(haystack_val, haystack.len)
2128 		Z_PARAM_STRING(needle_val, needle.len)
2129 		Z_PARAM_OPTIONAL
2130 		Z_PARAM_STR_OR_NULL(enc_name)
2131 	ZEND_PARSE_PARAMETERS_END();
2132 
2133 	haystack.val = (unsigned char*)haystack_val;
2134 	needle.val = (unsigned char*)needle_val;
2135 
2136 	if (needle.len == 0) {
2137 		zend_argument_value_error(2, "must not be empty");
2138 		RETURN_THROWS();
2139 	}
2140 
2141 	haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 3);
2142 	if (!haystack.encoding) {
2143 		RETURN_THROWS();
2144 	}
2145 
2146 	size_t n = mbfl_substr_count(&haystack, &needle);
2147 	/* An error can only occur if needle is empty,
2148 	 * an encoding error happens (which should not happen at this stage and is a bug)
2149 	 * or the haystack is more than sizeof(size_t) bytes
2150 	 * If one of these things occur this is a bug and should be flagged as such */
2151 	ZEND_ASSERT(!mbfl_is_error(n));
2152 	RETVAL_LONG(n);
2153 }
2154 /* }}} */
2155 
2156 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2157 PHP_FUNCTION(mb_substr)
2158 {
2159 	char *str;
2160 	zend_string *encoding = NULL;
2161 	zend_long from, len;
2162 	size_t real_from, real_len;
2163 	size_t str_len;
2164 	bool len_is_null = 1;
2165 	mbfl_string string, result, *ret;
2166 
2167 	ZEND_PARSE_PARAMETERS_START(2, 4)
2168 		Z_PARAM_STRING(str, str_len)
2169 		Z_PARAM_LONG(from)
2170 		Z_PARAM_OPTIONAL
2171 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2172 		Z_PARAM_STR_OR_NULL(encoding)
2173 	ZEND_PARSE_PARAMETERS_END();
2174 
2175 	string.encoding = php_mb_get_encoding(encoding, 4);
2176 	if (!string.encoding) {
2177 		RETURN_THROWS();
2178 	}
2179 
2180 	string.val = (unsigned char *)str;
2181 	string.len = str_len;
2182 
2183 	/* measures length */
2184 	size_t mblen = 0;
2185 	if (from < 0 || (!len_is_null && len < 0)) {
2186 		mblen = mbfl_strlen(&string);
2187 	}
2188 
2189 	/* if "from" position is negative, count start position from the end
2190 	 * of the string
2191 	 */
2192 	if (from >= 0) {
2193 		real_from = (size_t) from;
2194 	} else if (-from < mblen) {
2195 		real_from = mblen + from;
2196 	} else {
2197 		real_from = 0;
2198 	}
2199 
2200 	/* if "length" position is negative, set it to the length
2201 	 * needed to stop that many chars from the end of the string
2202 	 */
2203 	if (len_is_null) {
2204 		real_len = MBFL_SUBSTR_UNTIL_END;
2205 	} else if (len >= 0) {
2206 		real_len = (size_t) len;
2207 	} else if (real_from < mblen && -len < mblen - real_from) {
2208 		real_len = (mblen - real_from) + len;
2209 	} else {
2210 		real_len = 0;
2211 	}
2212 
2213 	ret = mbfl_substr(&string, &result, real_from, real_len);
2214 	ZEND_ASSERT(ret != NULL);
2215 
2216 	// TODO: avoid reallocation ???
2217 	RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2218 	efree(ret->val);
2219 }
2220 /* }}} */
2221 
2222 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2223 PHP_FUNCTION(mb_strcut)
2224 {
2225 	zend_string *encoding = NULL;
2226 	char *string_val;
2227 	zend_long from, len;
2228 	bool len_is_null = 1;
2229 	mbfl_string string, result, *ret;
2230 
2231 	ZEND_PARSE_PARAMETERS_START(2, 4)
2232 		Z_PARAM_STRING(string_val, string.len)
2233 		Z_PARAM_LONG(from)
2234 		Z_PARAM_OPTIONAL
2235 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
2236 		Z_PARAM_STR_OR_NULL(encoding)
2237 	ZEND_PARSE_PARAMETERS_END();
2238 
2239 	string.val = (unsigned char*)string_val;
2240 	string.encoding = php_mb_get_encoding(encoding, 4);
2241 	if (!string.encoding) {
2242 		RETURN_THROWS();
2243 	}
2244 
2245 	if (len_is_null) {
2246 		len = string.len;
2247 	}
2248 
2249 	/* if "from" position is negative, count start position from the end
2250 	 * of the string
2251 	 */
2252 	if (from < 0) {
2253 		from = string.len + from;
2254 		if (from < 0) {
2255 			from = 0;
2256 		}
2257 	}
2258 
2259 	/* if "length" position is negative, set it to the length
2260 	 * needed to stop that many chars from the end of the string
2261 	 */
2262 	if (len < 0) {
2263 		len = (string.len - from) + len;
2264 		if (len < 0) {
2265 			len = 0;
2266 		}
2267 	}
2268 
2269 	if (from > string.len) {
2270 		RETURN_EMPTY_STRING();
2271 	}
2272 
2273 	ret = mbfl_strcut(&string, &result, from, len);
2274 	ZEND_ASSERT(ret != NULL);
2275 
2276 	// TODO: avoid reallocation ???
2277 	RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2278 	efree(ret->val);
2279 }
2280 /* }}} */
2281 
2282 /* Some East Asian characters, when printed at a terminal (or the like), require double
2283  * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2284 static size_t character_width(uint32_t c)
2285 {
2286 	if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2287 		return 1;
2288 	}
2289 
2290 	/* Do a binary search to see if we fall in any of the fullwidth ranges */
2291 	int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2292 	while (lo < hi) {
2293 		int probe = (lo + hi) / 2;
2294 		if (c < mbfl_eaw_table[probe].begin) {
2295 			hi = probe;
2296 		} else if (c > mbfl_eaw_table[probe].end) {
2297 			lo = probe + 1;
2298 		} else {
2299 			return 2;
2300 		}
2301 	}
2302 
2303 	return 1;
2304 }
2305 
mb_get_strwidth(zend_string * string,const mbfl_encoding * enc)2306 static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
2307 {
2308 	size_t width = 0;
2309 	uint32_t wchar_buf[128];
2310 	unsigned char *in = (unsigned char*)ZSTR_VAL(string);
2311 	size_t in_len = ZSTR_LEN(string);
2312 	unsigned int state = 0;
2313 
2314 	while (in_len) {
2315 		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2316 		ZEND_ASSERT(out_len <= 128);
2317 
2318 		while (out_len) {
2319 			/* NOTE: 'bad input' marker will be counted as 1 unit of width
2320 			 * If text conversion is performed with an ordinary ASCII character as
2321 			 * the 'replacement character', this will give us the correct display width. */
2322 			width += character_width(wchar_buf[--out_len]);
2323 		}
2324 	}
2325 
2326 	return width;
2327 }
2328 
2329 /* Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2330 PHP_FUNCTION(mb_strwidth)
2331 {
2332 	zend_string *string, *enc_name = NULL;
2333 
2334 	ZEND_PARSE_PARAMETERS_START(1, 2)
2335 		Z_PARAM_STR(string)
2336 		Z_PARAM_OPTIONAL
2337 		Z_PARAM_STR_OR_NULL(enc_name)
2338 	ZEND_PARSE_PARAMETERS_END();
2339 
2340 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2341 	if (!enc) {
2342 		RETURN_THROWS();
2343 	}
2344 
2345 	RETVAL_LONG(mb_get_strwidth(string, enc));
2346 }
2347 
2348 /* Cut 'n' codepoints from beginning of string
2349  * Remove this once mb_substr is implemented using the new conversion filters */
mb_drop_chars(zend_string * input,const mbfl_encoding * enc,size_t n)2350 static zend_string* mb_drop_chars(zend_string *input, const mbfl_encoding *enc, size_t n)
2351 {
2352 	if (n >= ZSTR_LEN(input)) {
2353 		/* No supported text encoding decodes to more than one codepoint per byte
2354 		 * So if the number of codepoints to drop >= number of input bytes,
2355 		 * then definitely the output should be empty
2356 		 * This also guards `ZSTR_LEN(input) - n` (below) from underflow */
2357 		return zend_empty_string;
2358 	}
2359 
2360 	uint32_t wchar_buf[128];
2361 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
2362 	size_t in_len = ZSTR_LEN(input);
2363 	unsigned int state = 0;
2364 
2365 	mb_convert_buf buf;
2366 	mb_convert_buf_init(&buf, ZSTR_LEN(input) - n, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2367 
2368 	while (in_len) {
2369 		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2370 		ZEND_ASSERT(out_len <= 128);
2371 
2372 		if (n >= out_len) {
2373 			n -= out_len;
2374 		} else {
2375 			enc->from_wchar(wchar_buf + n, out_len - n, &buf, !in_len);
2376 			n = 0;
2377 		}
2378 	}
2379 
2380 	return mb_convert_buf_result(&buf);
2381 }
2382 
2383 /* Pick 'n' codepoints from beginning of string
2384  * Remove this once mb_substr is implemented using the new conversion filters */
mb_pick_chars(zend_string * input,const mbfl_encoding * enc,size_t n)2385 static zend_string* mb_pick_chars(zend_string *input, const mbfl_encoding *enc, size_t n)
2386 {
2387 	uint32_t wchar_buf[128];
2388 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
2389 	size_t in_len = ZSTR_LEN(input);
2390 	unsigned int state = 0;
2391 
2392 	mb_convert_buf buf;
2393 	mb_convert_buf_init(&buf, n, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2394 
2395 	while (in_len && n) {
2396 		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2397 		ZEND_ASSERT(out_len <= 128);
2398 
2399 		enc->from_wchar(wchar_buf, MIN(out_len, n), &buf, !in_len || out_len >= n);
2400 		n -= MIN(out_len, n);
2401 	}
2402 
2403 	return mb_convert_buf_result(&buf);
2404 }
2405 
mb_trim_string(zend_string * input,zend_string * marker,const mbfl_encoding * enc,unsigned int from,int width)2406 static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, unsigned int from, int width)
2407 {
2408 	uint32_t wchar_buf[128];
2409 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
2410 	size_t in_len = ZSTR_LEN(input);
2411 	unsigned int state = 0;
2412 	int remaining_width = width;
2413 	unsigned int to_skip = from;
2414 	size_t out_len = 0;
2415 	bool first_call = true, input_err = false;
2416 	mb_convert_buf buf;
2417 
2418 	while (in_len) {
2419 		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2420 		ZEND_ASSERT(out_len <= 128);
2421 
2422 		if (out_len <= to_skip) {
2423 			to_skip -= out_len;
2424 		} else {
2425 			for (int i = to_skip; i < out_len; i++) {
2426 				uint32_t w = wchar_buf[i];
2427 				input_err |= (w == MBFL_BAD_INPUT);
2428 				remaining_width -= character_width(w);
2429 				if (remaining_width < 0) {
2430 					/* We need to truncate string and append trim marker */
2431 					width -= mb_get_strwidth(marker, enc);
2432 					/* 'width' is now the amount we want to take from 'input' */
2433 					if (width <= 0) {
2434 						return zend_string_copy(marker);
2435 					}
2436 					mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2437 
2438 					if (first_call) {
2439 						/* We can use the buffer of wchars which we have right now;
2440 						 * no need to convert again */
2441 						goto dont_restart_conversion;
2442 					} else {
2443 						goto restart_conversion;
2444 					}
2445 				}
2446 			}
2447 			to_skip = 0;
2448 		}
2449 		first_call = false;
2450 	}
2451 
2452 	/* The input string fits in the requested width; we don't need to append the trim marker
2453 	 * However, if the string contains erroneous byte sequences, those should be converted
2454 	 * to error markers */
2455 	if (from == 0 && !input_err) {
2456 		/* This just increments the string's refcount; it doesn't really 'copy' it */
2457 		return zend_string_copy(input);
2458 	}
2459 	return mb_drop_chars(input, enc, from);
2460 
2461 	/* The input string is too wide; we need to build a new string which
2462 	 * includes some portion of the input string, with the trim marker
2463 	 * concatenated onto it */
2464 restart_conversion:
2465 	in = (unsigned char*)ZSTR_VAL(input);
2466 	in_len = ZSTR_LEN(input);
2467 	state = 0;
2468 
2469 	while (true) {
2470 		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2471 		ZEND_ASSERT(out_len <= 128);
2472 
2473 dont_restart_conversion:
2474 		if (out_len <= from) {
2475 			from -= out_len;
2476 		} else {
2477 			for (int i = from; i < out_len; i++) {
2478 				width -= character_width(wchar_buf[i]);
2479 				if (width < 0) {
2480 					enc->from_wchar(wchar_buf + from, i - from, &buf, true);
2481 					goto append_trim_marker;
2482 				}
2483 			}
2484 			ZEND_ASSERT(in_len > 0);
2485 			enc->from_wchar(wchar_buf + from, out_len - from, &buf, false);
2486 			from = 0;
2487 		}
2488 	}
2489 
2490 append_trim_marker:
2491 	if (ZSTR_LEN(marker) > 0) {
2492 		MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
2493 		memcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
2494 		buf.out += ZSTR_LEN(marker);
2495 	}
2496 
2497 	return mb_convert_buf_result(&buf);
2498 }
2499 
2500 /* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
PHP_FUNCTION(mb_strimwidth)2501 PHP_FUNCTION(mb_strimwidth)
2502 {
2503 	zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2504 	zend_long from, width;
2505 
2506 	ZEND_PARSE_PARAMETERS_START(3, 5)
2507 		Z_PARAM_STR(str)
2508 		Z_PARAM_LONG(from)
2509 		Z_PARAM_LONG(width)
2510 		Z_PARAM_OPTIONAL
2511 		Z_PARAM_STR(trimmarker)
2512 		Z_PARAM_STR_OR_NULL(encoding)
2513 	ZEND_PARSE_PARAMETERS_END();
2514 
2515 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2516 	if (!enc) {
2517 		RETURN_THROWS();
2518 	}
2519 
2520 	if (from != 0) {
2521 		size_t str_len = mb_get_strlen(str, enc);
2522 		if (from < 0) {
2523 			from += str_len;
2524 		}
2525 		if (from < 0 || from > str_len) {
2526 			zend_argument_value_error(2, "is out of range");
2527 			RETURN_THROWS();
2528 		}
2529 	}
2530 
2531 	if (width < 0) {
2532 		width += mb_get_strwidth(str, enc);
2533 
2534 		if (from > 0) {
2535 			zend_string *trimmed = mb_pick_chars(str, enc, from);
2536 			width -= mb_get_strwidth(trimmed, enc);
2537 			zend_string_free(trimmed);
2538 		}
2539 
2540 		if (width < 0) {
2541 			zend_argument_value_error(3, "is out of range");
2542 			RETURN_THROWS();
2543 		}
2544 	}
2545 
2546 	RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2547 }
2548 
2549 
2550 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2551 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2552 {
2553 	return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2554 			|| (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2555 			|| (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2556 			|| (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2557 }
2558 
2559 
2560 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)2561 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
2562 {
2563 	return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
2564 }
2565 
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2566 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2567 {
2568 	unsigned int num_errors = 0;
2569 	zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2570 	MBSTRG(illegalchars) += num_errors;
2571 	return result;
2572 }
2573 
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2574 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2575 {
2576 	const mbfl_encoding *from_encoding;
2577 
2578 	/* pre-conversion encoding */
2579 	ZEND_ASSERT(num_from_encodings >= 1);
2580 	if (num_from_encodings == 1) {
2581 		from_encoding = *from_encodings;
2582 	} else {
2583 		/* auto detect */
2584 		mbfl_string string;
2585 		mbfl_string_init(&string);
2586 		string.val = (unsigned char *)input;
2587 		string.len = length;
2588 		from_encoding = mbfl_identify_encoding(
2589 			&string, from_encodings, num_from_encodings, MBSTRG(strict_detection));
2590 		if (!from_encoding) {
2591 			php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2592 			return NULL;
2593 		}
2594 	}
2595 
2596 	return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2597 }
2598 
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2599 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2600 {
2601 	HashTable *output, *chash;
2602 	zend_long idx;
2603 	zend_string *key;
2604 	zval *entry, entry_tmp;
2605 
2606 	if (!input) {
2607 		return NULL;
2608 	}
2609 
2610 	if (GC_IS_RECURSIVE(input)) {
2611 		GC_UNPROTECT_RECURSION(input);
2612 		php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2613 		return NULL;
2614 	}
2615 	GC_TRY_PROTECT_RECURSION(input);
2616 	output = zend_new_array(zend_hash_num_elements(input));
2617 	ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2618 		/* convert key */
2619 		if (key) {
2620 			zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2621 			if (!converted_key) {
2622 				continue;
2623 			}
2624 			key = converted_key;
2625 		}
2626 		/* convert value */
2627 		ZEND_ASSERT(entry);
2628 try_again:
2629 		switch(Z_TYPE_P(entry)) {
2630 			case IS_STRING: {
2631 				zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2632 				if (!converted_key) {
2633 					if (key) {
2634 						zend_string_release(key);
2635 					}
2636 					continue;
2637 				}
2638 				ZVAL_STR(&entry_tmp, converted_key);
2639 				break;
2640 			}
2641 			case IS_NULL:
2642 			case IS_TRUE:
2643 			case IS_FALSE:
2644 			case IS_LONG:
2645 			case IS_DOUBLE:
2646 				ZVAL_COPY(&entry_tmp, entry);
2647 				break;
2648 			case IS_ARRAY:
2649 				chash = php_mb_convert_encoding_recursive(
2650 					Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2651 				if (chash) {
2652 					ZVAL_ARR(&entry_tmp, chash);
2653 				} else {
2654 					ZVAL_EMPTY_ARRAY(&entry_tmp);
2655 				}
2656 				break;
2657 			case IS_REFERENCE:
2658 				entry = Z_REFVAL_P(entry);
2659 				goto try_again;
2660 			case IS_OBJECT:
2661 			default:
2662 				if (key) {
2663 					zend_string_release(key);
2664 				}
2665 				php_error_docref(NULL, E_WARNING, "Object is not supported");
2666 				continue;
2667 		}
2668 		if (key) {
2669 			zend_hash_add(output, key, &entry_tmp);
2670 			zend_string_release(key);
2671 		} else {
2672 			zend_hash_index_add(output, idx, &entry_tmp);
2673 		}
2674 	} ZEND_HASH_FOREACH_END();
2675 	GC_TRY_UNPROTECT_RECURSION(input);
2676 
2677 	return output;
2678 }
2679 /* }}} */
2680 
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2681 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2682 {
2683 	/* mbstring supports some 'text encodings' which aren't really text encodings
2684 	 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2685 	 * These should never be returned by `mb_detect_encoding`. */
2686 	int shift = 0;
2687 	for (int i = 0; i < *size; i++) {
2688 		const mbfl_encoding *encoding = elist[i];
2689 		if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2690 			shift++; /* Remove this encoding from the list */
2691 		} else if (shift) {
2692 			elist[i - shift] = encoding;
2693 		}
2694 	}
2695 	*size -= shift;
2696 }
2697 
2698 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2699 PHP_FUNCTION(mb_convert_encoding)
2700 {
2701 	zend_string *to_encoding_name;
2702 	zend_string *input_str, *from_encodings_str = NULL;
2703 	HashTable *input_ht, *from_encodings_ht = NULL;
2704 	const mbfl_encoding **from_encodings;
2705 	size_t num_from_encodings;
2706 	bool free_from_encodings;
2707 
2708 	ZEND_PARSE_PARAMETERS_START(2, 3)
2709 		Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2710 		Z_PARAM_STR(to_encoding_name)
2711 		Z_PARAM_OPTIONAL
2712 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2713 	ZEND_PARSE_PARAMETERS_END();
2714 
2715 	const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2716 	if (!to_encoding) {
2717 		RETURN_THROWS();
2718 	}
2719 
2720 	if (from_encodings_ht) {
2721 		if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2722 			RETURN_THROWS();
2723 		}
2724 		free_from_encodings = 1;
2725 	} else if (from_encodings_str) {
2726 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2727 				&from_encodings, &num_from_encodings,
2728 				/* persistent */ 0, /* arg_num */ 3, /* allow_pass_encoding */ 0) == FAILURE) {
2729 			RETURN_THROWS();
2730 		}
2731 		free_from_encodings = 1;
2732 	} else {
2733 		from_encodings = &MBSTRG(current_internal_encoding);
2734 		num_from_encodings = 1;
2735 		free_from_encodings = 0;
2736 	}
2737 
2738 	if (num_from_encodings > 1) {
2739 		remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2740 	}
2741 
2742 	if (!num_from_encodings) {
2743 		efree(ZEND_VOIDP(from_encodings));
2744 		zend_argument_value_error(3, "must specify at least one encoding");
2745 		RETURN_THROWS();
2746 	}
2747 
2748 	if (input_str) {
2749 		zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2750 		if (ret != NULL) {
2751 			RETVAL_STR(ret);
2752 		} else {
2753 			RETVAL_FALSE;
2754 		}
2755 	} else {
2756 		HashTable *tmp;
2757 		tmp = php_mb_convert_encoding_recursive(
2758 			input_ht, to_encoding, from_encodings, num_from_encodings);
2759 		RETVAL_ARR(tmp);
2760 	}
2761 
2762 	if (free_from_encodings) {
2763 		efree(ZEND_VOIDP(from_encodings));
2764 	}
2765 }
2766 /* }}} */
2767 
mbstring_convert_case(int case_mode,const char * str,size_t str_len,size_t * ret_len,const mbfl_encoding * enc)2768 static char *mbstring_convert_case(
2769 		int case_mode, const char *str, size_t str_len, size_t *ret_len,
2770 		const mbfl_encoding *enc) {
2771 	return php_unicode_convert_case(
2772 		case_mode, str, str_len, ret_len, enc,
2773 		MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2774 }
2775 
2776 /* {{{ Returns a case-folded version of source_string */
PHP_FUNCTION(mb_convert_case)2777 PHP_FUNCTION(mb_convert_case)
2778 {
2779 	zend_string *from_encoding = NULL;
2780 	char *str;
2781 	size_t str_len, ret_len;
2782 	zend_long case_mode = 0;
2783 
2784 	ZEND_PARSE_PARAMETERS_START(2, 3)
2785 		Z_PARAM_STRING(str, str_len)
2786 		Z_PARAM_LONG(case_mode)
2787 		Z_PARAM_OPTIONAL
2788 		Z_PARAM_STR_OR_NULL(from_encoding)
2789 	ZEND_PARSE_PARAMETERS_END();
2790 
2791 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2792 	if (!enc) {
2793 		RETURN_THROWS();
2794 	}
2795 
2796 	if (case_mode < 0 || case_mode > PHP_UNICODE_CASE_MODE_MAX) {
2797 		zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2798 		RETURN_THROWS();
2799 	}
2800 
2801 	char *newstr = mbstring_convert_case(case_mode, str, str_len, &ret_len, enc);
2802 	/* If newstr is NULL something went wrong in mbfl and this is a bug */
2803 	ZEND_ASSERT(newstr != NULL);
2804 
2805 	// TODO: avoid reallocation ???
2806 	RETVAL_STRINGL(newstr, ret_len);
2807 	efree(newstr);
2808 }
2809 /* }}} */
2810 
2811 /* {{{ Returns a upper cased version of source_string */
PHP_FUNCTION(mb_strtoupper)2812 PHP_FUNCTION(mb_strtoupper)
2813 {
2814 	zend_string *from_encoding = NULL;
2815 	char *str;
2816 	size_t str_len, ret_len;
2817 
2818 	ZEND_PARSE_PARAMETERS_START(1, 2)
2819 		Z_PARAM_STRING(str, str_len)
2820 		Z_PARAM_OPTIONAL
2821 		Z_PARAM_STR_OR_NULL(from_encoding)
2822 	ZEND_PARSE_PARAMETERS_END();
2823 
2824 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2825 	if (!enc) {
2826 		RETURN_THROWS();
2827 	}
2828 
2829 	char *newstr = mbstring_convert_case(PHP_UNICODE_CASE_UPPER, str, str_len, &ret_len, enc);
2830 	/* If newstr is NULL something went wrong in mbfl and this is a bug */
2831 	ZEND_ASSERT(newstr != NULL);
2832 
2833 	// TODO: avoid reallocation ???
2834 	RETVAL_STRINGL(newstr, ret_len);
2835 	efree(newstr);
2836 }
2837 /* }}} */
2838 
2839 /* {{{ Returns a lower cased version of source_string */
PHP_FUNCTION(mb_strtolower)2840 PHP_FUNCTION(mb_strtolower)
2841 {
2842 	zend_string *from_encoding = NULL;
2843 	char *str;
2844 	size_t str_len;
2845 	char *newstr;
2846 	size_t ret_len;
2847 	const mbfl_encoding *enc;
2848 
2849 	ZEND_PARSE_PARAMETERS_START(1, 2)
2850 		Z_PARAM_STRING(str, str_len)
2851 		Z_PARAM_OPTIONAL
2852 		Z_PARAM_STR_OR_NULL(from_encoding)
2853 	ZEND_PARSE_PARAMETERS_END();
2854 
2855 	enc = php_mb_get_encoding(from_encoding, 2);
2856 	if (!enc) {
2857 		RETURN_THROWS();
2858 	}
2859 
2860 	newstr = mbstring_convert_case(PHP_UNICODE_CASE_LOWER, str, str_len, &ret_len, enc);
2861 	/* If newstr is NULL something went wrong in mbfl and this is a bug */
2862 	ZEND_ASSERT(newstr != NULL);
2863 
2864 	// TODO: avoid reallocation ???
2865 	RETVAL_STRINGL(newstr, ret_len);
2866 	efree(newstr);
2867 }
2868 /* }}} */
2869 
/* Returns a freshly allocated array holding the first `size` encoding pointers
 * of `elist`. Only the pointer array is copied, not the encodings themselves.
 * The copy comes from safe_emalloc(), so the caller releases it with efree(). */
static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
{
	const mbfl_encoding **copy = safe_emalloc(size, sizeof(*copy), 0);
	for (size_t i = 0; i < size; i++) {
		copy[i] = elist[i];
	}
	return copy;
}
2876 
2877 /* {{{ Encodings of the given string is returned (as a string) */
PHP_FUNCTION(mb_detect_encoding)2878 PHP_FUNCTION(mb_detect_encoding)
2879 {
2880 	char *str;
2881 	size_t str_len;
2882 	zend_string *encoding_str = NULL;
2883 	HashTable *encoding_ht = NULL;
2884 	bool strict = 0;
2885 
2886 	mbfl_string string;
2887 	const mbfl_encoding *ret;
2888 	const mbfl_encoding **elist;
2889 	size_t size;
2890 
2891 	ZEND_PARSE_PARAMETERS_START(1, 3)
2892 		Z_PARAM_STRING(str, str_len)
2893 		Z_PARAM_OPTIONAL
2894 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
2895 		Z_PARAM_BOOL(strict)
2896 	ZEND_PARSE_PARAMETERS_END();
2897 
2898 	/* make encoding list */
2899 	if (encoding_ht) {
2900 		if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
2901 			RETURN_THROWS();
2902 		}
2903 	} else if (encoding_str) {
2904 		if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2, /* allow_pass_encoding */ 0)) {
2905 			RETURN_THROWS();
2906 		}
2907 	} else {
2908 		elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
2909 		size = MBSTRG(current_detect_order_list_size);
2910 	}
2911 
2912 	if (size == 0) {
2913 		efree(ZEND_VOIDP(elist));
2914 		zend_argument_value_error(2, "must specify at least one encoding");
2915 		RETURN_THROWS();
2916 	}
2917 
2918 	remove_non_encodings_from_elist(elist, &size);
2919 	if (size == 0) {
2920 		efree(ZEND_VOIDP(elist));
2921 		RETURN_FALSE;
2922 	}
2923 
2924 	if (ZEND_NUM_ARGS() < 3) {
2925 		strict = MBSTRG(strict_detection);
2926 	}
2927 
2928 	if (strict && size == 1) {
2929 		/* If there is only a single candidate encoding, mb_check_encoding is faster */
2930 		ret = (php_mb_check_encoding(str, str_len, *elist)) ? *elist : NULL;
2931 	} else {
2932 		mbfl_string_init(&string);
2933 		string.val = (unsigned char *)str;
2934 		string.len = str_len;
2935 		ret = mbfl_identify_encoding(&string, elist, size, strict);
2936 	}
2937 
2938 	efree(ZEND_VOIDP(elist));
2939 
2940 	if (ret == NULL) {
2941 		RETURN_FALSE;
2942 	}
2943 
2944 	RETVAL_STRING((char *)ret->name);
2945 }
2946 /* }}} */
2947 
2948 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)2949 PHP_FUNCTION(mb_list_encodings)
2950 {
2951 	ZEND_PARSE_PARAMETERS_NONE();
2952 
2953 	array_init(return_value);
2954 	for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
2955 		add_next_index_string(return_value, (*encodings)->name);
2956 	}
2957 }
2958 /* }}} */
2959 
2960 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)2961 PHP_FUNCTION(mb_encoding_aliases)
2962 {
2963 	const mbfl_encoding *encoding;
2964 	zend_string *encoding_name = NULL;
2965 
2966 	ZEND_PARSE_PARAMETERS_START(1, 1)
2967 		Z_PARAM_STR(encoding_name)
2968 	ZEND_PARSE_PARAMETERS_END();
2969 
2970 	encoding = php_mb_get_encoding(encoding_name, 1);
2971 	if (!encoding) {
2972 		RETURN_THROWS();
2973 	}
2974 
2975 	array_init(return_value);
2976 	if (encoding->aliases != NULL) {
2977 		for (const char **alias = encoding->aliases; *alias; ++alias) {
2978 			add_next_index_string(return_value, (char *)*alias);
2979 		}
2980 	}
2981 }
2982 /* }}} */
2983 
2984 /* {{{ Converts the string to MIME "encoded-word" in the format of =?charset?(B|Q)?encoded_string?= */
PHP_FUNCTION(mb_encode_mimeheader)2985 PHP_FUNCTION(mb_encode_mimeheader)
2986 {
2987 	const mbfl_encoding *charset, *transenc;
2988 	mbfl_string  string, result, *ret;
2989 	zend_string *charset_name = NULL;
2990 	char *trans_enc_name = NULL, *string_val;
2991 	size_t trans_enc_name_len;
2992 	char *linefeed = "\r\n";
2993 	size_t linefeed_len;
2994 	zend_long indent = 0;
2995 
2996 	string.encoding = MBSTRG(current_internal_encoding);
2997 
2998 	ZEND_PARSE_PARAMETERS_START(1, 5)
2999 		Z_PARAM_STRING(string_val, string.len)
3000 		Z_PARAM_OPTIONAL
3001 		Z_PARAM_STR(charset_name)
3002 		Z_PARAM_STRING(trans_enc_name, trans_enc_name_len)
3003 		Z_PARAM_STRING(linefeed, linefeed_len)
3004 		Z_PARAM_LONG(indent)
3005 	ZEND_PARSE_PARAMETERS_END();
3006 
3007 	string.val = (unsigned char*)string_val;
3008 	charset = &mbfl_encoding_pass;
3009 	transenc = &mbfl_encoding_base64;
3010 
3011 	if (charset_name != NULL) {
3012 		charset = php_mb_get_encoding(charset_name, 2);
3013 		if (!charset) {
3014 			RETURN_THROWS();
3015 		} else if (charset->mime_name == NULL || charset->mime_name[0] == '\0') {
3016 			zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
3017 			RETURN_THROWS();
3018 		}
3019 	} else {
3020 		const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
3021 		if (lang != NULL) {
3022 			charset = mbfl_no2encoding(lang->mail_charset);
3023 			transenc = mbfl_no2encoding(lang->mail_header_encoding);
3024 		}
3025 	}
3026 
3027 	if (trans_enc_name != NULL) {
3028 		if (*trans_enc_name == 'B' || *trans_enc_name == 'b') {
3029 			transenc = &mbfl_encoding_base64;
3030 		} else if (*trans_enc_name == 'Q' || *trans_enc_name == 'q') {
3031 			transenc = &mbfl_encoding_qprint;
3032 		}
3033 	}
3034 
3035 	mbfl_string_init(&result);
3036 	ret = mbfl_mime_header_encode(&string, &result, charset, transenc, linefeed, indent);
3037 	ZEND_ASSERT(ret != NULL);
3038 	// TODO: avoid reallocation ???
3039 	RETVAL_STRINGL((char *)ret->val, ret->len);	/* the string is already strdup()'ed */
3040 	efree(ret->val);
3041 }
3042 /* }}} */
3043 
3044 /* {{{ Decodes the MIME "encoded-word" in the string */
PHP_FUNCTION(mb_decode_mimeheader)3045 PHP_FUNCTION(mb_decode_mimeheader)
3046 {
3047 	char *string_val;
3048 	mbfl_string string, result, *ret;
3049 
3050 	string.encoding = MBSTRG(current_internal_encoding);
3051 
3052 	ZEND_PARSE_PARAMETERS_START(1, 1)
3053 		Z_PARAM_STRING(string_val, string.len)
3054 	ZEND_PARSE_PARAMETERS_END();
3055 
3056 	string.val = (unsigned char*)string_val;
3057 	mbfl_string_init(&result);
3058 	ret = mbfl_mime_header_decode(&string, &result, MBSTRG(current_internal_encoding));
3059 	ZEND_ASSERT(ret != NULL);
3060 	// TODO: avoid reallocation ???
3061 	RETVAL_STRINGL((char *)ret->val, ret->len);	/* the string is already strdup()'ed */
3062 	efree(ret->val);
3063 }
3064 /* }}} */
3065 
/* Decodes `input` from `encoding` in chunks of up to 64 codepoints, passes the
 * codepoints (each together with its successor) through
 * mb_convert_kana_codepoint() using the option bits in `mode`, re-encodes the
 * output in the same encoding and returns the resulting string. */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion...
	 * if we are converting zenkaku kana to hankaku kana
	 * Make the buffer for converted kana big enough that we never need to
	 * perform bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	unsigned int carried = 0; /* 1 when wchar_buf[0] holds a codepoint left over from the previous chunk */
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		uint32_t *out = converted_buf;
		/* Decode after any carried-over codepoint so that it is not overwritten */
		size_t n_wchars = encoding->to_wchar(&in, &in_len, wchar_buf + carried, 64 - carried, &state);
		n_wchars += carried;
		ZEND_ASSERT(n_wchars <= 64);

		if (!n_wchars) {
			continue;
		}

		/* Stays true unless the last codepoint gets consumed as the second
		 * half of a pair inside the loop below */
		bool last_pending = true;

		for (size_t i = 0; i < n_wchars - 1; i++) {
			uint32_t second = 0;
			bool consumed = false;
			*out++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
			if (second) {
				*out++ = second;
			}
			if (consumed) {
				i++;
				if (i == n_wchars - 1) {
					/* We consumed two codepoints at the very end of the wchar buffer
					 * So there is nothing remaining to reprocess on the next iteration */
					carried = 0;
					last_pending = false;
					break;
				}
			}
		}

		if (last_pending) {
			if (in_len) {
				/* Reprocess the last codepoint on the next iteration, when its
				 * successor will be available */
				wchar_buf[0] = wchar_buf[n_wchars - 1];
				carried = 1;
			} else {
				/* This is the last iteration, so we need to process the final codepoint now */
				uint32_t second = 0;
				*out++ = mb_convert_kana_codepoint(wchar_buf[n_wchars - 1], 0, NULL, &second, mode);
				if (second) {
					*out++ = second;
				}
			}
		}

		encoding->from_wchar(converted_buf, out - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf);
}
3130 
/* Option letters accepted by mb_convert_kana(). The array index of a letter is
 * the bit number it sets in the option mask; indices 0-7 and 8-15 form pairs
 * (entry i and entry i + 8) that must not be combined. */
char mb_convert_kana_flags[17] = {
	/* bits 0-7 */
	'A', 'R', 'N', 'S',
	'K', 'H', 'M', 'C',
	/* bits 8-15 */
	'a', 'r', 'n', 's',
	'k', 'h', 'm', 'c',
	/* bit 16 */
	'V'
};
3136 
3137 /* Conversion between full-width characters and half-width characters (Japanese) */
PHP_FUNCTION(mb_convert_kana)3138 PHP_FUNCTION(mb_convert_kana)
3139 {
3140 	unsigned int opt;
3141 	char *optstr = NULL;
3142 	size_t optstr_len;
3143 	zend_string *encname = NULL, *str;
3144 
3145 	ZEND_PARSE_PARAMETERS_START(1, 3)
3146 		Z_PARAM_STR(str)
3147 		Z_PARAM_OPTIONAL
3148 		Z_PARAM_STRING(optstr, optstr_len)
3149 		Z_PARAM_STR_OR_NULL(encname)
3150 	ZEND_PARSE_PARAMETERS_END();
3151 
3152 	if (optstr != NULL) {
3153 		char *p = optstr, *e = p + optstr_len;
3154 		opt = 0;
3155 next_option:
3156 		while (p < e) {
3157 			/* Walk through option string and convert to bit vector
3158 			 * See translit_kana_jisx0201_jisx0208.h for the values used */
3159 			char c = *p++;
3160 			if (c == 'A') {
3161 				opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
3162 			} else if (c == 'a') {
3163 				opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
3164 			} else {
3165 				for (int i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
3166 					if (c == mb_convert_kana_flags[i]) {
3167 						opt |= (1 << i);
3168 						goto next_option;
3169 					}
3170 				}
3171 
3172 				zend_argument_value_error(2, "contains invalid flag: '%c'", c);
3173 				RETURN_THROWS();
3174 			}
3175 		}
3176 
3177 		/* Check for illegal combinations of options */
3178 		if (((opt & 0xFF00) >> 8) & opt) {
3179 			/* It doesn't make sense to convert the same type of characters from halfwidth to
3180 			 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
3181 			 * FW hiragana to FW katakana and then back again. */
3182 			int badflag = ((opt & 0xFF00) >> 8) & opt, i;
3183 			for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
3184 			char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
3185 			if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
3186 				flag1 = 'A';
3187 			if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
3188 				flag2 = 'a';
3189 			zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
3190 			RETURN_THROWS();
3191 		}
3192 
3193 		if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
3194 			/* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
3195 			zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
3196 			RETURN_THROWS();
3197 		}
3198 
3199 		/* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
3200 		 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
3201 		 * more than one of these */
3202 		if (opt & MBFL_ZEN2HAN_HIRAGANA) {
3203 			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3204 				zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
3205 				RETURN_THROWS();
3206 			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3207 				zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
3208 				RETURN_THROWS();
3209 			}
3210 		} else if (opt & MBFL_ZEN2HAN_KATAKANA) {
3211 			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3212 				zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
3213 				RETURN_THROWS();
3214 			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3215 				zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
3216 				RETURN_THROWS();
3217 			}
3218 		}
3219 	} else {
3220 		opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
3221 	}
3222 
3223 	const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
3224 	if (!enc) {
3225 		RETURN_THROWS();
3226 	}
3227 
3228 	RETVAL_STR(jp_kana_convert(str, enc, opt));
3229 }
3230 
/* Feeds every string reachable from `var` (directly, or through nested arrays
 * and objects) to the encoding detector `identd`.
 *
 * Returns 1 as soon as mbfl_encoding_detector_feed() returns non-zero (no
 * further strings are fed), otherwise 0. If a container that is already being
 * traversed is met again, `*recursion_error` is set to 1 and the walk stops. */
static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, int *recursion_error) /* {{{ */
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		mbfl_string string;
		string.val = (unsigned char *)Z_STRVAL_P(var);
		string.len = Z_STRLEN_P(var);
		if (mbfl_encoding_detector_feed(identd, &string)) {
			return 1; /* complete detecting */
		}
		return 0;
	}

	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Other value types carry no text to examine */
		return 0;
	}

	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			*recursion_error = 1;
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	int finished = 0;
	HashTable *ht = HASH_OF(var);
	if (ht != NULL) {
		zval *entry;
		ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
			if (mb_recursive_encoder_detector_feed(identd, entry, recursion_error)) {
				finished = 1;
				break;
			}
			if (*recursion_error) {
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	/* Single exit point for the container case: always drop the protection */
	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}
	return finished;
} /* }}} */
3276 
/* Converts, in place, every string reachable from `var` using the converter
 * `convd`, descending into arrays (which are separated first) and objects.
 *
 * Returns 1 if a container that is already being traversed is met again,
 * otherwise 0. A string for which the converter yields no result is left
 * untouched. */
static int mb_recursive_convert_variable(mbfl_buffer_converter *convd, zval *var) /* {{{ */
{
	/* Remember the slot as passed in: the converted string is stored there,
	 * not in the dereferenced value */
	zval *orig_var = var;
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		mbfl_string string, result;
		string.val = (unsigned char *)Z_STRVAL_P(var);
		string.len = Z_STRLEN_P(var);
		mbfl_string *ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
		if (ret != NULL) {
			zval_ptr_dtor(orig_var);
			// TODO: avoid reallocation ???
			ZVAL_STRINGL(orig_var, (char *)ret->val, ret->len);
			efree(ret->val);
		}
		return 0;
	}

	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		return 0;
	}

	if (Z_TYPE_P(var) == IS_ARRAY) {
		SEPARATE_ARRAY(var);
	}
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return 1;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	int recursion_found = 0;
	HashTable *ht = HASH_OF(var);
	if (ht != NULL) {
		zval *entry;
		ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
			if (mb_recursive_convert_variable(convd, entry)) {
				recursion_found = 1;
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}
	return recursion_found;
} /* }}} */
3324 
3325 /* {{{ Converts the string resource in variables to desired encoding */
PHP_FUNCTION(mb_convert_variables)3326 PHP_FUNCTION(mb_convert_variables)
3327 {
3328 	zval *args;
3329 	zend_string *to_enc_str;
3330 	zend_string *from_enc_str;
3331 	HashTable *from_enc_ht;
3332 	mbfl_string string, result;
3333 	const mbfl_encoding *from_encoding, *to_encoding;
3334 	mbfl_encoding_detector *identd;
3335 	mbfl_buffer_converter *convd;
3336 	int n, argc;
3337 	size_t elistsz;
3338 	const mbfl_encoding **elist;
3339 	int recursion_error = 0;
3340 
3341 	ZEND_PARSE_PARAMETERS_START(3, -1)
3342 		Z_PARAM_STR(to_enc_str)
3343 		Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3344 		Z_PARAM_VARIADIC('+', args, argc)
3345 	ZEND_PARSE_PARAMETERS_END();
3346 
3347 	/* new encoding */
3348 	to_encoding = php_mb_get_encoding(to_enc_str, 1);
3349 	if (!to_encoding) {
3350 		RETURN_THROWS();
3351 	}
3352 
3353 	/* initialize string */
3354 	from_encoding = MBSTRG(current_internal_encoding);
3355 	mbfl_string_init_set(&string, from_encoding);
3356 	mbfl_string_init(&result);
3357 
3358 	/* pre-conversion encoding */
3359 	if (from_enc_ht) {
3360 		if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3361 			RETURN_THROWS();
3362 		}
3363 	} else {
3364 		if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2, /* allow_pass_encoding */ 0) == FAILURE) {
3365 			RETURN_THROWS();
3366 		}
3367 	}
3368 
3369 	if (elistsz == 0) {
3370 		efree(ZEND_VOIDP(elist));
3371 		zend_argument_value_error(2, "must specify at least one encoding");
3372 		RETURN_THROWS();
3373 	}
3374 
3375 	if (elistsz == 1) {
3376 		from_encoding = *elist;
3377 	} else {
3378 		/* auto detect */
3379 		from_encoding = NULL;
3380 		identd = mbfl_encoding_detector_new(elist, elistsz, MBSTRG(strict_detection));
3381 		if (identd != NULL) {
3382 			n = 0;
3383 			while (n < argc) {
3384 				if (mb_recursive_encoder_detector_feed(identd, &args[n], &recursion_error)) {
3385 					break;
3386 				}
3387 				n++;
3388 			}
3389 			from_encoding = mbfl_encoding_detector_judge(identd);
3390 			mbfl_encoding_detector_delete(identd);
3391 			if (recursion_error) {
3392 				efree(ZEND_VOIDP(elist));
3393 				php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3394 				RETURN_FALSE;
3395 			}
3396 		}
3397 
3398 		if (!from_encoding) {
3399 			php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3400 			efree(ZEND_VOIDP(elist));
3401 			RETURN_FALSE;
3402 		}
3403 	}
3404 
3405 	efree(ZEND_VOIDP(elist));
3406 
3407 	convd = mbfl_buffer_converter_new(from_encoding, to_encoding, 0);
3408 	/* If this assertion fails this means some memory allocation failure which is a bug */
3409 	ZEND_ASSERT(convd != NULL);
3410 
3411 	mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
3412 	mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
3413 
3414 	/* convert */
3415 	n = 0;
3416 	while (n < argc) {
3417 		zval *zv = &args[n];
3418 
3419 		ZVAL_DEREF(zv);
3420 		recursion_error = mb_recursive_convert_variable(convd, zv);
3421 		if (recursion_error) {
3422 			break;
3423 		}
3424 		n++;
3425 	}
3426 
3427 	MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd);
3428 	mbfl_buffer_converter_delete(convd);
3429 
3430 	if (recursion_error) {
3431 		php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3432 		RETURN_FALSE;
3433 	}
3434 
3435 	RETURN_STRING(from_encoding->name);
3436 }
3437 /* }}} */
3438 
3439 /* HTML numeric entities */
3440 
3441 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
make_conversion_map(HashTable * target_hash,int * convmap_size)3442 static uint32_t *make_conversion_map(HashTable *target_hash, int *convmap_size)
3443 {
3444 	zval *hash_entry;
3445 
3446 	int n_elems = zend_hash_num_elements(target_hash);
3447 	if (n_elems % 4 != 0) {
3448 		zend_argument_value_error(2, "must have a multiple of 4 elements");
3449 		return NULL;
3450 	}
3451 
3452 	uint32_t *convmap = (uint32_t*)safe_emalloc(n_elems, sizeof(uint32_t), 0);
3453 	uint32_t *mapelm = convmap;
3454 
3455 	ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
3456 		*mapelm++ = zval_get_long(hash_entry);
3457 	} ZEND_HASH_FOREACH_END();
3458 
3459 	*convmap_size = n_elems / 4;
3460 	return convmap;
3461 }
3462 
/* Looks `w` up in a conversion map made of `mapsize` groups of four values:
 * { low codepoint, high codepoint, offset, mask }.
 *
 * For the first group whose inclusive [low, high] range contains `w`, stores
 * (w + offset) & mask in `*retval` and returns true. Returns false, leaving
 * `*retval` untouched, when no group matches. */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, int mapsize, uint32_t *retval)
{
	for (int group = 0; group < mapsize; group++) {
		const uint32_t *quad = convmap + group * 4;

		if (w < quad[0] || w > quad[1]) {
			continue; /* outside this range, try the next one */
		}

		/* This wchar falls inside one of the ranges which should be
		 * converted to HTML entities */
		*retval = (w + quad[2]) & quad[3];
		return true;
	}

	/* None of the ranges matched */
	return false;
}
3484 
/* Decodes `input` from `encoding`, replaces every codepoint matched by the
 * conversion map with a numeric character reference ("&#...;", or "&#x...;"
 * with upper-case hex digits when `hex` is true), re-encodes the output in the
 * same encoding and returns the resulting string. */
static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, int mapsize, bool hex)
{
	/* Each wchar which we get from decoding the input string may become up to
	 * 13 wchars when we convert it to an HTML entity */
	uint32_t wchar_buf[32], converted_buf[32 * 13];
	unsigned char entity[16]; /* For converting wchars to hex/decimal string */
	static const char digits[] = "0123456789ABCDEF";
	const uint32_t base = hex ? 16 : 10;

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		/* Convert input string to wchars, up to 32 at a time */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf, 32, &state);
		ZEND_ASSERT(out_len <= 32);
		uint32_t *converted = converted_buf;

		/* Run through wchars and see if any of them fall into the ranges
		 * which we want to convert to HTML entities */
		for (size_t i = 0; i < out_len; i++) {
			uint32_t w = wchar_buf[i];

			if (!html_numeric_entity_convert(w, convmap, mapsize, &w)) {
				/* Not covered by the map: pass through unchanged */
				*converted++ = w;
				continue;
			}

			*converted++ = '&';
			*converted++ = '#';
			if (hex) {
				*converted++ = 'x';
			}

			/* Convert wchar to decimal/hex string: digits are produced from
			 * the least significant end, so fill `entity` backwards. The
			 * do/while emits a single '0' for a zero value. */
			unsigned char *p = entity + sizeof(entity);
			do {
				*(--p) = digits[w % base];
				w /= base;
			} while (w > 0);
			while (p < entity + sizeof(entity)) {
				*converted++ = *p++;
			}

			*converted++ = ';';
		}

		ZEND_ASSERT(converted <= converted_buf + sizeof(converted_buf)/sizeof(*converted_buf));
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf);
}
3550 
3551 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)3552 PHP_FUNCTION(mb_encode_numericentity)
3553 {
3554 	zend_string *encoding = NULL, *str;
3555 	int mapsize;
3556 	HashTable *target_hash;
3557 	bool is_hex = false;
3558 
3559 	ZEND_PARSE_PARAMETERS_START(2, 4)
3560 		Z_PARAM_STR(str)
3561 		Z_PARAM_ARRAY_HT(target_hash)
3562 		Z_PARAM_OPTIONAL
3563 		Z_PARAM_STR_OR_NULL(encoding)
3564 		Z_PARAM_BOOL(is_hex)
3565 	ZEND_PARSE_PARAMETERS_END();
3566 
3567 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3568 	if (!enc) {
3569 		RETURN_THROWS();
3570 	}
3571 
3572 	uint32_t *convmap = make_conversion_map(target_hash, &mapsize);
3573 	if (convmap == NULL) {
3574 		RETURN_THROWS();
3575 	}
3576 
3577 	RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, mapsize, is_hex));
3578 	efree(convmap);
3579 }
3580 /* }}} */
3581 
/* Map the numeric value of an HTML entity back to a codepoint.
 *
 * `convmap` holds `mapsize` entries of four uint32_t each; this function reads
 * the first three of every entry: lowest codepoint, highest codepoint, offset.
 * For each entry in order, the offset is subtracted from `number` (unsigned,
 * wrapping arithmetic) and the first entry whose [low, high] range contains
 * the difference wins.
 *
 * Returns true and stores the codepoint in `*retval` on a match; returns false
 * and leaves `*retval` untouched otherwise (including when `mapsize` <= 0). */
static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, int mapsize, uint32_t *retval)
{
	if (mapsize <= 0) {
		return false;
	}

	/* Widen before multiplying so a very large map cannot overflow `int` */
	const uint32_t *map_end = convmap + (size_t)mapsize * 4;

	for (const uint32_t *entry = convmap; entry < map_end; entry += 4) {
		uint32_t candidate = number - entry[2];
		if (candidate >= entry[0] && candidate <= entry[1]) {
			*retval = candidate;
			return true;
		}
	}

	return false;
}
3599 
3600 #define DEC_ENTITY_MINLEN 3  /* For "&#" and 1 decimal digit */
3601 #define HEX_ENTITY_MINLEN 4  /* For "&#x" and 1 hexadecimal digit */
3602 #define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
3603 #define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
3604 
/* Replace HTML numeric entities ("&#NNN;" decimal, "&#xHHH;" hexadecimal) in
 * `input` with the codepoints they denote, and return the result as a new
 * string in the same `encoding`.
 *
 * An entity is only replaced when html_numeric_entity_deconvert() finds its
 * value in `convmap` (which has `mapsize` entries); a ';' directly following a
 * replaced entity is consumed. Everything else -- including entities which are
 * too short, too long, out of range for uint32_t, or not covered by the map --
 * is copied through unchanged. */
static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, int mapsize)
{
	uint32_t wchar_buf[128], converted_buf[128];

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	/* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
	 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
	 * 2nd 'converted' buffer.
	 *
	 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
	 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
	 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
	 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
	 * to store the next batch of wchars after it.
	 *
	 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
	 * wchars from the 1st buffer to the 2nd one.
	 *
	 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
	 * have to do bounds checks when writing wchars into it.
	 */

	/* Number of wchars carried over from the previous buffer (a possibly truncated entity) */
	unsigned int wchar_buf_offset = 0;

	while (in_len) {
		/* Leave space for sentinel at the end of the buffer */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
		out_len += wchar_buf_offset;
		ZEND_ASSERT(out_len <= 127);
		wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */

		uint32_t *p, *converted;

		/* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
		 * be there (in `wchar_buf[0]`), so don't bother in that case */
		if (wchar_buf_offset == 0) {
			p = wchar_buf;
			while (*p != '&')
				p++;
			if (p == wchar_buf + out_len) {
				/* No HTML entities in this buffer */
				encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
				continue;
			}

			/* Copy over the prefix with no & which we already scanned */
			memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
			converted = converted_buf + (p - wchar_buf);
		} else {
			p = wchar_buf;
			converted = converted_buf;
		}

found_ampersand:
		ZEND_ASSERT(*p == '&');
		uint32_t *p2 = p;

		/* These tests can't overrun end of buffer, because we have a '&' sentinel there */
		if (*++p2 == '#') {
			if (*++p2 == 'x') {
				/* Possible hex entity */
				uint32_t w = *++p2;
				while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
					w = *++p2;
				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
					/* We hit the end of the buffer while reading digits, and
					 * more wchars are still coming in the next buffer
					 * Reprocess this entity on next iteration */
					memmove(wchar_buf, p, (p2 - p) * 4);
					wchar_buf_offset = p2 - p;
					goto process_converted_wchars;
				} else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
					/* Invalid entity (too long or "&#x" only) */
					memcpy(converted, p, (p2 - p) * 4);
					converted += p2 - p;
				} else {
					/* Valid hexadecimal entity; at most 8 digits, so `value` cannot overflow */
					uint32_t value = 0, *p3 = p + 3;
					while (p3 < p2) {
						w = *p3++;
						if (w <= '9') {
							value = (value * 16) + (w - '0');
						} else if (w >= 'a') {
							value = (value * 16) + 10 + (w - 'a');
						} else {
							value = (value * 16) + 10 + (w - 'A');
						}
					}
					if (html_numeric_entity_deconvert(value, convmap, mapsize, converted)) {
						converted++;
						if (*p2 == ';')
							p2++;
					} else {
						memcpy(converted, p, (p2 - p) * 4);
						converted += p2 - p;
					}
				}
			} else {
				/* Possible decimal entity */
				uint32_t w = *p2;
				while (w >= '0' && w <= '9')
					w = *++p2;
				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
					/* The number of digits was legal (no more than 10 decimal digits)
					 * Reprocess this entity on next iteration of main loop */
					memmove(wchar_buf, p, (p2 - p) * 4);
					wchar_buf_offset = p2 - p;
					goto process_converted_wchars;
				} else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
					/* Invalid entity (too long or "&#" only) */
					memcpy(converted, p, (p2 - p) * 4);
					converted += p2 - p;
				} else {
					/* Valid decimal entity */
					uint32_t value = 0, *p3 = p + 2;
					while (p3 < p2) {
						uint32_t digit = *p3++ - '0';
						/* If unsigned integer overflow would occur in the below
						 * multiply-and-add, this entity is no good.
						 * 0x19999999 is 0xFFFFFFFF / 10 and 0xFFFFFFFF % 10 is 5, so at
						 * exactly 0x19999999 only the digits 0..5 still fit */
						if (value > 0x19999999 || (value == 0x19999999 && digit > 5)) {
							memcpy(converted, p, (p2 - p) * 4);
							converted += p2 - p;
							goto decimal_entity_too_big;
						}
						value = (value * 10) + digit;
					}
					if (html_numeric_entity_deconvert(value, convmap, mapsize, converted)) {
						converted++;
						if (*p2 == ';')
							p2++;
					} else {
						memcpy(converted, p, (p2 - p) * 4);
						converted += p2 - p;
					}
				}
			}
		} else if ((p2 == wchar_buf + out_len) && in_len) {
			/* Corner case: & at end of buffer */
			wchar_buf[0] = '&';
			wchar_buf_offset = 1;
			goto process_converted_wchars;
		} else {
			*converted++ = '&';
		}
decimal_entity_too_big:

		/* Starting to scan a new section of the wchar buffer
		 * 'p2' is pointing at the next wchar which needs to be processed */
		p = p2;
		while (*p2 != '&')
			p2++;

		if (p2 > p) {
			memcpy(converted, p, (p2 - p) * 4);
			converted += p2 - p;
			p = p2;
		}

		/* Anything short of the sentinel position is a real '&' from the input */
		if (p < wchar_buf + out_len)
			goto found_ampersand;

		/* We do not have any wchars remaining at the end of this buffer which
		 * we need to reprocess on the next call */
		wchar_buf_offset = 0;
process_converted_wchars:
		ZEND_ASSERT(converted <= converted_buf + 128);
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf);
}
3782 
3783 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)3784 PHP_FUNCTION(mb_decode_numericentity)
3785 {
3786 	zend_string *encoding = NULL, *str;
3787 	int mapsize;
3788 	HashTable *target_hash;
3789 
3790 	ZEND_PARSE_PARAMETERS_START(2, 3)
3791 		Z_PARAM_STR(str)
3792 		Z_PARAM_ARRAY_HT(target_hash)
3793 		Z_PARAM_OPTIONAL
3794 		Z_PARAM_STR_OR_NULL(encoding)
3795 	ZEND_PARSE_PARAMETERS_END();
3796 
3797 	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3798 	if (!enc) {
3799 		RETURN_THROWS();
3800 	}
3801 
3802 	uint32_t *convmap = make_conversion_map(target_hash, &mapsize);
3803 	if (convmap == NULL) {
3804 		RETURN_THROWS();
3805 	}
3806 
3807 	RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, mapsize));
3808 	efree(convmap);
3809 }
3810 /* }}} */
3811 
3812 /* {{{ Sends an email message with MIME scheme */
3813 #define CRLF "\r\n"
3814 
_php_mbstr_parse_mail_headers(HashTable * ht,const char * str,size_t str_len)3815 static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
3816 {
3817 	const char *ps;
3818 	size_t icnt;
3819 	int state = 0;
3820 	int crlf_state = -1;
3821 	char *token = NULL;
3822 	size_t token_pos = 0;
3823 	zend_string *fld_name, *fld_val;
3824 
3825 	ps = str;
3826 	icnt = str_len;
3827 	fld_name = fld_val = NULL;
3828 
3829 	/*
3830 	 *             C o n t e n t - T y p e :   t e x t / h t m l \r\n
3831 	 *             ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
3832 	 *      state  0            1           2          3
3833 	 *
3834 	 *             C o n t e n t - T y p e :   t e x t / h t m l \r\n
3835 	 *             ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
3836 	 * crlf_state -1                       0                     1 -1
3837 	 *
3838 	 */
3839 
3840 	while (icnt > 0) {
3841 		switch (*ps) {
3842 			case ':':
3843 				if (crlf_state == 1) {
3844 					token_pos++;
3845 				}
3846 
3847 				if (state == 0 || state == 1) {
3848 					if(token && token_pos > 0) {
3849 						fld_name = zend_string_init(token, token_pos, 0);
3850 					}
3851 					state = 2;
3852 				} else {
3853 					token_pos++;
3854 				}
3855 
3856 				crlf_state = 0;
3857 				break;
3858 
3859 			case '\n':
3860 				if (crlf_state == -1) {
3861 					goto out;
3862 				}
3863 				crlf_state = -1;
3864 				break;
3865 
3866 			case '\r':
3867 				if (crlf_state == 1) {
3868 					token_pos++;
3869 				} else {
3870 					crlf_state = 1;
3871 				}
3872 				break;
3873 
3874 			case ' ': case '\t':
3875 				if (crlf_state == -1) {
3876 					if (state == 3) {
3877 						/* continuing from the previous line */
3878 						state = 4;
3879 					} else {
3880 						/* simply skipping this new line */
3881 						state = 5;
3882 					}
3883 				} else {
3884 					if (crlf_state == 1) {
3885 						token_pos++;
3886 					}
3887 					if (state == 1 || state == 3) {
3888 						token_pos++;
3889 					}
3890 				}
3891 				crlf_state = 0;
3892 				break;
3893 
3894 			default:
3895 				switch (state) {
3896 					case 0:
3897 						token = (char*)ps;
3898 						token_pos = 0;
3899 						state = 1;
3900 						break;
3901 
3902 					case 2:
3903 						if (crlf_state != -1) {
3904 							token = (char*)ps;
3905 							token_pos = 0;
3906 
3907 							state = 3;
3908 							break;
3909 						}
3910 						ZEND_FALLTHROUGH;
3911 
3912 					case 3:
3913 						if (crlf_state == -1) {
3914 							if(token && token_pos > 0) {
3915 								fld_val = zend_string_init(token, token_pos, 0);
3916 							}
3917 
3918 							if (fld_name != NULL && fld_val != NULL) {
3919 								zval val;
3920 								zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
3921 								ZVAL_STR(&val, fld_val);
3922 
3923 								zend_hash_update(ht, fld_name, &val);
3924 
3925 								zend_string_release_ex(fld_name, 0);
3926 							}
3927 
3928 							fld_name = fld_val = NULL;
3929 							token = (char*)ps;
3930 							token_pos = 0;
3931 
3932 							state = 1;
3933 						}
3934 						break;
3935 
3936 					case 4:
3937 						token_pos++;
3938 						state = 3;
3939 						break;
3940 				}
3941 
3942 				if (crlf_state == 1) {
3943 					token_pos++;
3944 				}
3945 
3946 				token_pos++;
3947 
3948 				crlf_state = 0;
3949 				break;
3950 		}
3951 		ps++, icnt--;
3952 	}
3953 out:
3954 	if (state == 2) {
3955 		token = "";
3956 		token_pos = 0;
3957 
3958 		state = 3;
3959 	}
3960 	if (state == 3) {
3961 		if(token && token_pos > 0) {
3962 			fld_val = zend_string_init(token, token_pos, 0);
3963 		}
3964 		if (fld_name != NULL && fld_val != NULL) {
3965 			zval val;
3966 			zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
3967 			ZVAL_STR(&val, fld_val);
3968 			zend_hash_update(ht, fld_name, &val);
3969 
3970 			zend_string_release_ex(fld_name, 0);
3971 		}
3972 	}
3973 	return state;
3974 }
3975 
PHP_FUNCTION(mb_send_mail)3976 PHP_FUNCTION(mb_send_mail)
3977 {
3978 	char *to;
3979 	size_t to_len;
3980 	char *message;
3981 	size_t message_len;
3982 	char *subject;
3983 	size_t subject_len;
3984 	zend_string *extra_cmd = NULL;
3985 	HashTable *headers_ht = NULL;
3986 	zend_string *str_headers = NULL;
3987 	size_t n, i;
3988 	char *to_r = NULL;
3989 	char *force_extra_parameters = INI_STR("mail.force_extra_parameters");
3990 	struct {
3991 		int cnt_type:1;
3992 		int cnt_trans_enc:1;
3993 	} suppressed_hdrs = { 0, 0 };
3994 
3995 	char *message_buf = NULL, *subject_buf = NULL, *p;
3996 	mbfl_string orig_str, conv_str;
3997 	mbfl_string *pstr;	/* pointer to mbfl string for return value */
3998 	enum mbfl_no_encoding;
3999 	const mbfl_encoding *tran_cs,	/* transfer text charset */
4000 						*head_enc,	/* header transfer encoding */
4001 						*body_enc;	/* body transfer encoding */
4002 	mbfl_memory_device device;	/* automatic allocateable buffer for additional header */
4003 	const mbfl_language *lang;
4004 	int err = 0;
4005 	HashTable ht_headers;
4006 	zval *s;
4007 	extern void mbfl_memory_device_unput(mbfl_memory_device *device);
4008 
4009 	/* initialize */
4010 	mbfl_memory_device_init(&device, 0, 0);
4011 	mbfl_string_init(&orig_str);
4012 	mbfl_string_init(&conv_str);
4013 
4014 	/* character-set, transfer-encoding */
4015 	tran_cs = &mbfl_encoding_utf8;
4016 	head_enc = &mbfl_encoding_base64;
4017 	body_enc = &mbfl_encoding_base64;
4018 	lang = mbfl_no2language(MBSTRG(language));
4019 	if (lang != NULL) {
4020 		tran_cs = mbfl_no2encoding(lang->mail_charset);
4021 		head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4022 		body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4023 	}
4024 
4025 	ZEND_PARSE_PARAMETERS_START(3, 5)
4026 		Z_PARAM_PATH(to, to_len)
4027 		Z_PARAM_PATH(subject, subject_len)
4028 		Z_PARAM_PATH(message, message_len)
4029 		Z_PARAM_OPTIONAL
4030 		Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4031 		Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4032 	ZEND_PARSE_PARAMETERS_END();
4033 
4034 	if (str_headers) {
4035 		if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4036 			zend_argument_value_error(4, "must not contain any null bytes");
4037 			RETURN_THROWS();
4038 		}
4039 		str_headers = php_trim(str_headers, NULL, 0, 2);
4040 	} else if (headers_ht) {
4041 		str_headers = php_mail_build_headers(headers_ht);
4042 		if (EG(exception)) {
4043 			RETURN_THROWS();
4044 		}
4045 	}
4046 
4047 	zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4048 
4049 	if (str_headers != NULL) {
4050 		_php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4051 	}
4052 
4053 	if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4054 		char *tmp;
4055 		char *param_name;
4056 		char *charset = NULL;
4057 
4058 		ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4059 		p = strchr(Z_STRVAL_P(s), ';');
4060 
4061 		if (p != NULL) {
4062 			/* skipping the padded spaces */
4063 			do {
4064 				++p;
4065 			} while (*p == ' ' || *p == '\t');
4066 
4067 			if (*p != '\0') {
4068 				if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4069 					if (strcasecmp(param_name, "charset") == 0) {
4070 						const mbfl_encoding *_tran_cs = tran_cs;
4071 
4072 						charset = php_strtok_r(NULL, "= \"", &tmp);
4073 						if (charset != NULL) {
4074 							_tran_cs = mbfl_name2encoding(charset);
4075 						}
4076 
4077 						if (!_tran_cs) {
4078 							php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4079 							_tran_cs = &mbfl_encoding_ascii;
4080 						}
4081 						tran_cs = _tran_cs;
4082 					}
4083 				}
4084 			}
4085 		}
4086 		suppressed_hdrs.cnt_type = 1;
4087 	}
4088 
4089 	if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4090 		const mbfl_encoding *_body_enc;
4091 
4092 		ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4093 		_body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4094 		switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4095 			case mbfl_no_encoding_base64:
4096 			case mbfl_no_encoding_7bit:
4097 			case mbfl_no_encoding_8bit:
4098 				body_enc = _body_enc;
4099 				break;
4100 
4101 			default:
4102 				php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4103 				body_enc =	&mbfl_encoding_8bit;
4104 				break;
4105 		}
4106 		suppressed_hdrs.cnt_trans_enc = 1;
4107 	}
4108 
4109 	/* To: */
4110 	if (to_len > 0) {
4111 		to_r = estrndup(to, to_len);
4112 		for (; to_len; to_len--) {
4113 			if (!isspace((unsigned char) to_r[to_len - 1])) {
4114 				break;
4115 			}
4116 			to_r[to_len - 1] = '\0';
4117 		}
4118 		for (i = 0; to_r[i]; i++) {
4119 			if (iscntrl((unsigned char) to_r[i])) {
4120 				/* According to RFC 822, section 3.1.1 long headers may be separated into
4121 				 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4122 				 * To prevent these separators from being replaced with a space, we skip over them. */
4123 				if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4124 					i += 2;
4125 					while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4126 						i++;
4127 					}
4128 					continue;
4129 				}
4130 
4131 				to_r[i] = ' ';
4132 			}
4133 		}
4134 	} else {
4135 		to_r = to;
4136 	}
4137 
4138 	/* Subject: */
4139 	orig_str.val = (unsigned char *)subject;
4140 	orig_str.len = subject_len;
4141 	orig_str.encoding = MBSTRG(current_internal_encoding);
4142 	if (orig_str.encoding->no_encoding == mbfl_no_encoding_invalid
4143 			|| orig_str.encoding->no_encoding == mbfl_no_encoding_pass) {
4144 		orig_str.encoding = mbfl_identify_encoding(&orig_str, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
4145 	}
4146 	const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4147 	size_t line_sep_len = strlen(line_sep);
4148 	pstr = mbfl_mime_header_encode(&orig_str, &conv_str, tran_cs, head_enc, line_sep, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4149 	if (pstr != NULL) {
4150 		subject_buf = subject = (char *)pstr->val;
4151 	}
4152 
4153 	/* message body */
4154 	orig_str.val = (unsigned char *)message;
4155 	orig_str.len = message_len;
4156 	orig_str.encoding = MBSTRG(current_internal_encoding);
4157 
4158 	if (orig_str.encoding->no_encoding == mbfl_no_encoding_invalid
4159 			|| orig_str.encoding->no_encoding == mbfl_no_encoding_pass) {
4160 		orig_str.encoding = mbfl_identify_encoding(&orig_str, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
4161 	}
4162 
4163 	pstr = NULL;
4164 	{
4165 		mbfl_string tmpstr;
4166 
4167 		if (mbfl_convert_encoding(&orig_str, &tmpstr, tran_cs) != NULL) {
4168 			tmpstr.encoding = &mbfl_encoding_8bit;
4169 			pstr = mbfl_convert_encoding(&tmpstr, &conv_str, body_enc);
4170 			efree(tmpstr.val);
4171 		}
4172 	}
4173 	if (pstr != NULL) {
4174 		message_buf = message = (char *)pstr->val;
4175 	}
4176 
4177 	/* other headers */
4178 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4179 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4180 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4181 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4182 	if (str_headers != NULL) {
4183 		p = ZSTR_VAL(str_headers);
4184 		n = ZSTR_LEN(str_headers);
4185 		mbfl_memory_device_strncat(&device, p, n);
4186 		if (n > 0 && p[n - 1] != '\n') {
4187 			mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4188 		}
4189 		zend_string_release_ex(str_headers, 0);
4190 	}
4191 
4192 	if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4193 		mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4194 		mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4195 	}
4196 
4197 	if (!suppressed_hdrs.cnt_type) {
4198 		mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4199 
4200 		p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4201 		if (p != NULL) {
4202 			mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4203 			mbfl_memory_device_strcat(&device, p);
4204 		}
4205 		mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4206 	}
4207 	if (!suppressed_hdrs.cnt_trans_enc) {
4208 		mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4209 		p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4210 		if (p == NULL) {
4211 			p = "7bit";
4212 		}
4213 		mbfl_memory_device_strcat(&device, p);
4214 		mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4215 	}
4216 
4217 	if (!PG(mail_mixed_lf_and_crlf)) {
4218 		mbfl_memory_device_unput(&device);
4219 	}
4220 	mbfl_memory_device_unput(&device);
4221 	mbfl_memory_device_output('\0', &device);
4222 	str_headers = zend_string_init((char *)device.buffer, strlen((char *)device.buffer), 0);
4223 
4224 	if (force_extra_parameters) {
4225 		extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4226 	} else if (extra_cmd) {
4227 		extra_cmd = php_escape_shell_cmd(ZSTR_VAL(extra_cmd));
4228 	}
4229 
4230 	if (!err && php_mail(to_r, subject, message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL)) {
4231 		RETVAL_TRUE;
4232 	} else {
4233 		RETVAL_FALSE;
4234 	}
4235 
4236 	if (extra_cmd) {
4237 		zend_string_release_ex(extra_cmd, 0);
4238 	}
4239 
4240 	if (to_r != to) {
4241 		efree(to_r);
4242 	}
4243 	if (subject_buf) {
4244 		efree((void *)subject_buf);
4245 	}
4246 	if (message_buf) {
4247 		efree((void *)message_buf);
4248 	}
4249 	mbfl_memory_device_clear(&device);
4250 	zend_hash_destroy(&ht_headers);
4251 	if (str_headers) {
4252 		zend_string_release_ex(str_headers, 0);
4253 	}
4254 }
4255 
4256 #undef CRLF
4257 #undef MAIL_ASCIIZ_CHECK_MBSTRING
4258 #undef PHP_MBSTR_MAIL_MIME_HEADER1
4259 #undef PHP_MBSTR_MAIL_MIME_HEADER2
4260 #undef PHP_MBSTR_MAIL_MIME_HEADER3
4261 #undef PHP_MBSTR_MAIL_MIME_HEADER4
4262 /* }}} */
4263 
4264 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)4265 PHP_FUNCTION(mb_get_info)
4266 {
4267 	zend_string *type = NULL;
4268 	size_t n;
4269 	char *name;
4270 	zval row;
4271 	const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
4272 	const mbfl_encoding **entry;
4273 
4274 	ZEND_PARSE_PARAMETERS_START(0, 1)
4275 		Z_PARAM_OPTIONAL
4276 		Z_PARAM_STR(type)
4277 	ZEND_PARSE_PARAMETERS_END();
4278 
4279 	if (!type || zend_string_equals_literal_ci(type, "all")) {
4280 		array_init(return_value);
4281 		if (MBSTRG(current_internal_encoding)) {
4282 			add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4283 		}
4284 		if (MBSTRG(http_input_identify)) {
4285 			add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4286 		}
4287 		if (MBSTRG(current_http_output_encoding)) {
4288 			add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4289 		}
4290 		if ((name = (char *)zend_ini_string("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)) != NULL) {
4291 			add_assoc_string(return_value, "http_output_conv_mimetypes", name);
4292 		}
4293 		if (lang != NULL) {
4294 			if ((name = (char *)mbfl_no_encoding2name(lang->mail_charset)) != NULL) {
4295 				add_assoc_string(return_value, "mail_charset", name);
4296 			}
4297 			if ((name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding)) != NULL) {
4298 				add_assoc_string(return_value, "mail_header_encoding", name);
4299 			}
4300 			if ((name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding)) != NULL) {
4301 				add_assoc_string(return_value, "mail_body_encoding", name);
4302 			}
4303 		}
4304 		add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4305 		if (MBSTRG(encoding_translation)) {
4306 			add_assoc_string(return_value, "encoding_translation", "On");
4307 		} else {
4308 			add_assoc_string(return_value, "encoding_translation", "Off");
4309 		}
4310 		if ((name = (char *)mbfl_no_language2name(MBSTRG(language))) != NULL) {
4311 			add_assoc_string(return_value, "language", name);
4312 		}
4313 		n = MBSTRG(current_detect_order_list_size);
4314 		entry = MBSTRG(current_detect_order_list);
4315 		if (n > 0) {
4316 			size_t i;
4317 			array_init(&row);
4318 			for (i = 0; i < n; i++) {
4319 				add_next_index_string(&row, (*entry)->name);
4320 				entry++;
4321 			}
4322 			add_assoc_zval(return_value, "detect_order", &row);
4323 		}
4324 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4325 			add_assoc_string(return_value, "substitute_character", "none");
4326 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4327 			add_assoc_string(return_value, "substitute_character", "long");
4328 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4329 			add_assoc_string(return_value, "substitute_character", "entity");
4330 		} else {
4331 			add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4332 		}
4333 		if (MBSTRG(strict_detection)) {
4334 			add_assoc_string(return_value, "strict_detection", "On");
4335 		} else {
4336 			add_assoc_string(return_value, "strict_detection", "Off");
4337 		}
4338 	} else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4339 		if (MBSTRG(current_internal_encoding)) {
4340 			RETVAL_STRING((char *)MBSTRG(current_internal_encoding)->name);
4341 		}
4342 	} else if (zend_string_equals_literal_ci(type, "http_input")) {
4343 		if (MBSTRG(http_input_identify)) {
4344 			RETVAL_STRING((char *)MBSTRG(http_input_identify)->name);
4345 		}
4346 	} else if (zend_string_equals_literal_ci(type, "http_output")) {
4347 		if (MBSTRG(current_http_output_encoding)) {
4348 			RETVAL_STRING((char *)MBSTRG(current_http_output_encoding)->name);
4349 		}
4350 	} else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4351 		if ((name = (char *)zend_ini_string("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)) != NULL) {
4352 			RETVAL_STRING(name);
4353 		}
4354 	} else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4355 		if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_charset)) != NULL) {
4356 			RETVAL_STRING(name);
4357 		}
4358 	} else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4359 		if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding)) != NULL) {
4360 			RETVAL_STRING(name);
4361 		}
4362 	} else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4363 		if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding)) != NULL) {
4364 			RETVAL_STRING(name);
4365 		}
4366 	} else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4367 		RETVAL_LONG(MBSTRG(illegalchars));
4368 	} else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4369 		if (MBSTRG(encoding_translation)) {
4370 			RETVAL_STRING("On");
4371 		} else {
4372 			RETVAL_STRING("Off");
4373 		}
4374 	} else if (zend_string_equals_literal_ci(type, "language")) {
4375 		if ((name = (char *)mbfl_no_language2name(MBSTRG(language))) != NULL) {
4376 			RETVAL_STRING(name);
4377 		}
4378 	} else if (zend_string_equals_literal_ci(type, "detect_order")) {
4379 		n = MBSTRG(current_detect_order_list_size);
4380 		entry = MBSTRG(current_detect_order_list);
4381 		if (n > 0) {
4382 			size_t i;
4383 			array_init(return_value);
4384 			for (i = 0; i < n; i++) {
4385 				add_next_index_string(return_value, (*entry)->name);
4386 				entry++;
4387 			}
4388 		}
4389 	} else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4390 		if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4391 			RETVAL_STRING("none");
4392 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4393 			RETVAL_STRING("long");
4394 		} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4395 			RETVAL_STRING("entity");
4396 		} else {
4397 			RETVAL_LONG(MBSTRG(current_filter_illegal_substchar));
4398 		}
4399 	} else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4400 		if (MBSTRG(strict_detection)) {
4401 			RETVAL_STRING("On");
4402 		} else {
4403 			RETVAL_STRING("Off");
4404 		}
4405 	} else {
4406 		// TODO Convert to ValueError
4407 		RETURN_FALSE;
4408 	}
4409 }
4410 /* }}} */
4411 
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)4412 MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4413 {
4414 	uint32_t wchar_buf[128];
4415 	unsigned char *in = (unsigned char*)input;
4416 	unsigned int state = 0;
4417 
4418 	if (encoding->check != NULL) {
4419 		return encoding->check(in, length);
4420 	}
4421 
4422 	/* If the input string is not encoded in the given encoding, there is a significant chance
4423 	 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4424 	 * buffer of 128 codepoints, convert and check just a few codepoints first */
4425 	size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4426 	ZEND_ASSERT(out_len <= 8);
4427 	for (int i = 0; i < out_len; i++) {
4428 		if (wchar_buf[i] == MBFL_BAD_INPUT) {
4429 			return 0;
4430 		}
4431 	}
4432 
4433 	while (length) {
4434 		out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4435 		ZEND_ASSERT(out_len <= 128);
4436 		for (int i = 0; i < out_len; i++) {
4437 			if (wchar_buf[i] == MBFL_BAD_INPUT) {
4438 				return 0;
4439 			}
4440 		}
4441 	}
4442 
4443 	return 1;
4444 }
4445 
/* Check every string key and every value of `vars` against `encoding`,
 * descending into nested arrays.
 *
 * Strings (keys and values) are checked with php_mb_check_encoding(); integer,
 * float, null and boolean values are accepted as they are; values of any other
 * type make the array invalid. An array which is already being visited raises
 * a warning and counts as invalid.
 *
 * Returns 1 if everything is valid, 0 otherwise. */
static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	zend_long num_key;
	zend_string *str_key;
	zval *elem;
	int ok = 1;

	(void)(num_key); /* Suppress spurious compiler warning that `num_key` is not used */

	if (GC_IS_RECURSIVE(vars)) {
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return 0;
	}
	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_KEY_VAL(vars, num_key, str_key, elem) {
		ZVAL_DEREF(elem);

		/* An invalid key ends the scan immediately */
		if (str_key && !php_mb_check_encoding(ZSTR_VAL(str_key), ZSTR_LEN(str_key), encoding)) {
			ok = 0;
			break;
		}

		/* Note: the `break`s below only leave the switch, so after an invalid
		 * value the remaining elements are still visited (`ok` stays 0) */
		switch (Z_TYPE_P(elem)) {
			case IS_STRING:
				if (!php_mb_check_encoding(Z_STRVAL_P(elem), Z_STRLEN_P(elem), encoding)) {
					ok = 0;
				}
				break;
			case IS_ARRAY:
				if (!php_mb_check_encoding_recursive(Z_ARRVAL_P(elem), encoding)) {
					ok = 0;
				}
				break;
			case IS_LONG:
			case IS_DOUBLE:
			case IS_NULL:
			case IS_TRUE:
			case IS_FALSE:
				/* Nothing to check */
				break;
			default:
				/* Other types are error. */
				ok = 0;
				break;
		}
	} ZEND_HASH_FOREACH_END();
	GC_TRY_UNPROTECT_RECURSION(vars);
	return ok;
}
4496 
4497 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)4498 PHP_FUNCTION(mb_check_encoding)
4499 {
4500 	zend_string *input_str = NULL, *enc = NULL;
4501 	HashTable *input_ht = NULL;
4502 	const mbfl_encoding *encoding;
4503 
4504 	ZEND_PARSE_PARAMETERS_START(0, 2)
4505 		Z_PARAM_OPTIONAL
4506 		Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
4507 		Z_PARAM_STR_OR_NULL(enc)
4508 	ZEND_PARSE_PARAMETERS_END();
4509 
4510 	encoding = php_mb_get_encoding(enc, 2);
4511 	if (!encoding) {
4512 		RETURN_THROWS();
4513 	}
4514 
4515 	if (input_ht) {
4516 		RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
4517 	} else if (input_str) {
4518 		RETURN_BOOL(php_mb_check_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), encoding));
4519 	} else {
4520 		php_error_docref(NULL, E_DEPRECATED,
4521 			"Calling mb_check_encoding() without argument is deprecated");
4522 
4523 		/* FIXME: Actually check all inputs, except $_FILES file content. */
4524 		RETURN_BOOL(MBSTRG(illegalchars) == 0);
4525 	}
4526 }
4527 /* }}} */
4528 
4529 
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)4530 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
4531 	const uint32_t enc_name_arg_num)
4532 {
4533 	const mbfl_encoding *enc;
4534 	enum mbfl_no_encoding no_enc;
4535 
4536 	ZEND_ASSERT(str_len > 0);
4537 
4538 	enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
4539 	if (!enc) {
4540 		return -2;
4541 	}
4542 
4543 	no_enc = enc->no_encoding;
4544 	if (php_mb_is_unsupported_no_encoding(no_enc)) {
4545 		zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
4546 		return -2;
4547 	}
4548 
4549 	/* Some legacy text encodings have a minimum required wchar buffer size;
4550 	 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
4551 	uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
4552 	unsigned int state = 0;
4553 	size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
4554 	ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
4555 
4556 	if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
4557 		return -1;
4558 	}
4559 	return wchar_buf[0];
4560 }
4561 
4562 
4563 /* {{{ */
PHP_FUNCTION(mb_ord)4564 PHP_FUNCTION(mb_ord)
4565 {
4566 	char *str;
4567 	size_t str_len;
4568 	zend_string *enc = NULL;
4569 	zend_long cp;
4570 
4571 	ZEND_PARSE_PARAMETERS_START(1, 2)
4572 		Z_PARAM_STRING(str, str_len)
4573 		Z_PARAM_OPTIONAL
4574 		Z_PARAM_STR_OR_NULL(enc)
4575 	ZEND_PARSE_PARAMETERS_END();
4576 
4577 	if (str_len == 0) {
4578 		zend_argument_value_error(1, "must not be empty");
4579 		RETURN_THROWS();
4580 	}
4581 
4582 	cp = php_mb_ord(str, str_len, enc, 2);
4583 
4584 	if (0 > cp) {
4585 		if (cp == -2) {
4586 			RETURN_THROWS();
4587 		}
4588 		RETURN_FALSE;
4589 	}
4590 
4591 	RETURN_LONG(cp);
4592 }
4593 /* }}} */
4594 
4595 
/* Build a string holding the single codepoint `cp` in the encoding named by `enc_name`.
 *
 * `enc_name_arg_num` is forwarded to php_mb_get_encoding() together with the name.
 *
 * Returns NULL if the encoding lookup fails, if the encoding is not supported by mb_chr()
 * (a ValueError is raised here in that case), if `cp` is outside 0..0x10ffff, if the
 * target is UTF-8 and `cp` is in the surrogate range 0xd800..0xdfff, or if the conversion
 * from UCS-4BE reported illegal characters. Otherwise returns the new string. */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (enc == NULL) {
		return NULL;
	}

	const enum mbfl_no_encoding no_enc = enc->no_encoding;
	if (php_mb_is_unsupported_no_encoding(no_enc)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", enc->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	if (php_mb_is_no_encoding_utf8(no_enc)) {
		/* UTF-8 is produced by hand instead of going through the converter */
		if (cp >= 0xd800 && cp <= 0xdfff) {
			return NULL;
		}

		if (cp < 0x80) {
			return ZSTR_CHAR(cp);
		}

		size_t nbytes;
		int lead_marker;
		if (cp < 0x800) {
			nbytes = 2;
			lead_marker = 0xc0;
		} else if (cp < 0x10000) {
			nbytes = 3;
			lead_marker = 0xe0;
		} else {
			nbytes = 4;
			lead_marker = 0xf0;
		}

		zend_string *utf8 = zend_string_alloc(nbytes, 0);
		char *out = ZSTR_VAL(utf8);
		zend_long rest = cp;

		/* Fill the continuation bytes from the end, six payload bits each; whatever
		 * remains afterwards belongs in the lead byte */
		for (size_t i = nbytes - 1; i > 0; i--) {
			out[i] = 0x80 | (rest & 0x3f);
			rest >>= 6;
		}
		out[0] = lead_marker | rest;
		out[nbytes] = 0;

		return utf8;
	}

	/* Any other encoding: serialize the codepoint as big-endian UCS-4 and convert it */
	char ucs4be[4];
	for (int i = 0; i < 4; i++) {
		ucs4be[i] = (cp >> (24 - 8 * i)) & 0xff;
	}

	/* The illegal character counter is borrowed to detect a failed conversion and is
	 * put back to its previous value before returning */
	long saved_illegalchars = MBSTRG(illegalchars);
	MBSTRG(illegalchars) = 0;
	zend_string *converted = php_mb_convert_encoding_ex(ucs4be, 4, enc, &mbfl_encoding_ucs4be);

	if (MBSTRG(illegalchars) != 0) {
		zend_string_release(converted);
		converted = NULL;
	}

	MBSTRG(illegalchars) = saved_illegalchars;
	return converted;
}
4665 
4666 
4667 /* {{{ */
PHP_FUNCTION(mb_chr)4668 PHP_FUNCTION(mb_chr)
4669 {
4670 	zend_long cp;
4671 	zend_string *enc = NULL;
4672 
4673 	ZEND_PARSE_PARAMETERS_START(1, 2)
4674 		Z_PARAM_LONG(cp)
4675 		Z_PARAM_OPTIONAL
4676 		Z_PARAM_STR_OR_NULL(enc)
4677 	ZEND_PARSE_PARAMETERS_END();
4678 
4679 	zend_string* ret = php_mb_chr(cp, enc, 2);
4680 	if (ret == NULL) {
4681 		RETURN_FALSE;
4682 	}
4683 
4684 	RETURN_STR(ret);
4685 }
4686 /* }}} */
4687 
4688 /* {{{ */
PHP_FUNCTION(mb_scrub)4689 PHP_FUNCTION(mb_scrub)
4690 {
4691 	char* str;
4692 	size_t str_len;
4693 	zend_string *enc_name = NULL;
4694 
4695 	ZEND_PARSE_PARAMETERS_START(1, 2)
4696 		Z_PARAM_STRING(str, str_len)
4697 		Z_PARAM_OPTIONAL
4698 		Z_PARAM_STR_OR_NULL(enc_name)
4699 	ZEND_PARSE_PARAMETERS_END();
4700 
4701 	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
4702 	if (!enc) {
4703 		RETURN_THROWS();
4704 	}
4705 
4706 	RETURN_STR(php_mb_convert_encoding_ex(str, str_len, enc, enc));
4707 }
4708 /* }}} */
4709 
4710 
4711 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)4712 static void php_mb_populate_current_detect_order_list(void)
4713 {
4714 	const mbfl_encoding **entry = 0;
4715 	size_t nentries;
4716 
4717 	if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
4718 		nentries = MBSTRG(detect_order_list_size);
4719 		entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
4720 		memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
4721 	} else {
4722 		const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
4723 		size_t i;
4724 		nentries = MBSTRG(default_detect_order_list_size);
4725 		entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
4726 		for (i = 0; i < nentries; i++) {
4727 			entry[i] = mbfl_no2encoding(src[i]);
4728 		}
4729 	}
4730 	MBSTRG(current_detect_order_list) = entry;
4731 	MBSTRG(current_detect_order_list_size) = nentries;
4732 }
4733 /* }}} */
4734 
4735 /* {{{ static int php_mb_encoding_translation() */
php_mb_encoding_translation(void)4736 static int php_mb_encoding_translation(void)
4737 {
4738 	return MBSTRG(encoding_translation);
4739 }
4740 /* }}} */
4741 
php_mb_mbchar_bytes(const char * s,const mbfl_encoding * enc)4742 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
4743 {
4744 	if (enc) {
4745 		if (enc->mblen_table) {
4746 			if (s) {
4747 				return enc->mblen_table[*(unsigned char *)s];
4748 			}
4749 		} else if (enc->flag & MBFL_ENCTYPE_WCS2) {
4750 			return 2;
4751 		} else if (enc->flag & MBFL_ENCTYPE_WCS4) {
4752 			return 4;
4753 		}
4754 	}
4755 	return 1;
4756 }
4757 
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)4758 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
4759 {
4760 	const char *p = s;
4761 	char *last=NULL;
4762 
4763 	if (nbytes == (size_t)-1) {
4764 		size_t nb = 0;
4765 
4766 		while (*p != '\0') {
4767 			if (nb == 0) {
4768 				if ((unsigned char)*p == (unsigned char)c) {
4769 					last = (char *)p;
4770 				}
4771 				nb = php_mb_mbchar_bytes(p, enc);
4772 				if (nb == 0) {
4773 					return NULL; /* something is going wrong! */
4774 				}
4775 			}
4776 			--nb;
4777 			++p;
4778 		}
4779 	} else {
4780 		size_t bcnt = nbytes;
4781 		size_t nbytes_char;
4782 		while (bcnt > 0) {
4783 			if ((unsigned char)*p == (unsigned char)c) {
4784 				last = (char *)p;
4785 			}
4786 			nbytes_char = php_mb_mbchar_bytes(p, enc);
4787 			if (bcnt < nbytes_char) {
4788 				return NULL;
4789 			}
4790 			p += nbytes_char;
4791 			bcnt -= nbytes_char;
4792 		}
4793 	}
4794 	return last;
4795 }
4796 
4797 /* {{{ MBSTRING_API int php_mb_stripos() */
php_mb_stripos(int mode,const char * old_haystack,size_t old_haystack_len,const char * old_needle,size_t old_needle_len,zend_long offset,const mbfl_encoding * enc)4798 MBSTRING_API size_t php_mb_stripos(int mode, const char *old_haystack, size_t old_haystack_len, const char *old_needle, size_t old_needle_len, zend_long offset, const mbfl_encoding *enc)
4799 {
4800 	size_t n = (size_t) -1;
4801 	mbfl_string haystack, needle;
4802 
4803 	mbfl_string_init_set(&haystack, enc);
4804 	mbfl_string_init_set(&needle, enc);
4805 
4806 	do {
4807 		/* We're using simple case-folding here, because we'd have to deal with remapping of
4808 		 * offsets otherwise. */
4809 
4810 		size_t len = 0;
4811 		haystack.val = (unsigned char *)mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char *)old_haystack, old_haystack_len, &len, enc);
4812 		haystack.len = len;
4813 
4814 		if (!haystack.val) {
4815 			break;
4816 		}
4817 
4818 		if (haystack.len == 0) {
4819 			break;
4820 		}
4821 
4822 		needle.val = (unsigned char *)mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char *)old_needle, old_needle_len, &len, enc);
4823 		needle.len = len;
4824 
4825 		if (!needle.val) {
4826 			break;
4827 		}
4828 
4829 		n = mbfl_strpos(&haystack, &needle, offset, mode);
4830 	} while(0);
4831 
4832 	if (haystack.val) {
4833 		efree(haystack.val);
4834 	}
4835 
4836 	if (needle.val) {
4837 		efree(needle.val);
4838 	}
4839 
4840 	return n;
4841 }
4842 /* }}} */
4843 
php_mb_gpc_get_detect_order(const zend_encoding *** list,size_t * list_size)4844 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
4845 {
4846 	*list = (const zend_encoding **)MBSTRG(http_input_list);
4847 	*list_size = MBSTRG(http_input_list_size);
4848 }
4849 /* }}} */
4850 
php_mb_gpc_set_input_encoding(const zend_encoding * encoding)4851 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
4852 {
4853 	MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
4854 }
4855 /* }}} */
4856