1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
14 | Rui Hirokawa <hirokawa@php.net> |
15 | Hironori Sato <satoh@jpnnet.com> |
16 | Shigeru Kanemoto <sgk@happysize.co.jp> |
17 +----------------------------------------------------------------------+
18 */
19
20 /* {{{ includes */
21 #include <limits.h>
22
23 #include "libmbfl/config.h"
24 #include "php.h"
25 #include "php_ini.h"
26 #include "php_variables.h"
27 #include "mbstring.h"
28 #include "ext/standard/php_string.h"
29 #include "ext/standard/php_mail.h"
30 #include "ext/standard/exec.h"
31 #include "ext/standard/url.h"
32 #include "main/php_output.h"
33 #include "ext/standard/info.h"
34 #include "ext/pcre/php_pcre.h"
35
36 #include "libmbfl/mbfl/mbfilter_8bit.h"
37 #include "libmbfl/mbfl/mbfilter_pass.h"
38 #include "libmbfl/mbfl/mbfilter_wchar.h"
39 #include "libmbfl/mbfl/eaw_table.h"
40 #include "libmbfl/filters/mbfilter_base64.h"
41 #include "libmbfl/filters/mbfilter_cjk.h"
42 #include "libmbfl/filters/mbfilter_qprint.h"
43 #include "libmbfl/filters/mbfilter_htmlent.h"
44 #include "libmbfl/filters/mbfilter_uuencode.h"
45 #include "libmbfl/filters/mbfilter_ucs4.h"
46 #include "libmbfl/filters/mbfilter_utf8.h"
47 #include "libmbfl/filters/mbfilter_utf16.h"
48 #include "libmbfl/filters/mbfilter_singlebyte.h"
49 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
50 #include "libmbfl/filters/unicode_prop.h"
51
52 #include "php_variables.h"
53 #include "php_globals.h"
54 #include "rfc1867.h"
55 #include "php_content_types.h"
56 #include "SAPI.h"
57 #include "php_unicode.h"
58 #include "TSRM.h"
59
60 #include "mb_gpc.h"
61
62 #ifdef HAVE_MBREGEX
63 # include "php_mbregex.h"
64 #endif
65
66 #include "zend_smart_str.h"
67 #include "zend_multibyte.h"
68 #include "mbstring_arginfo.h"
69
70 #include "rare_cp_bitvec.h"
71
72 /* }}} */
73
74 /* {{{ prototypes */
/* Storage for this extension's module globals; the rest of the file reads and
 * writes them through MBSTRG(). */
ZEND_DECLARE_MODULE_GLOBALS(mbstring)

/* Constructor and destructor for the module globals; both are referenced from
 * mbstring_module_entry. */
static PHP_GINIT_FUNCTION(mbstring);
static PHP_GSHUTDOWN_FUNCTION(mbstring);

/* Forward declarations of file-local (static) helpers. */
static void php_mb_populate_current_detect_order_list(void);

static int php_mb_encoding_translation(void);

static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);

static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);

static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);

static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);

static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);

static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);

static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);

/* Not static: declared here, implemented in another translation unit.
 * See mbfilter_cp5022x.c */
uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
100 /* }}} */
101
102 /* {{{ php_mb_default_identify_list */
/* One row of the language -> default encoding-detection-order table. */
typedef struct _php_mb_nls_ident_list {
	enum mbfl_no_language lang;        /* language this row applies to */
	const enum mbfl_no_encoding *list; /* candidate encodings, in order */
	size_t list_size;                  /* number of elements in `list` */
} php_mb_nls_ident_list;

/* Candidate encodings for Japanese. */
static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_jis,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_euc_jp,
	mbfl_no_encoding_sjis
};

/* Candidate encodings for Simplified Chinese. */
static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_euc_cn,
	mbfl_no_encoding_cp936
};

/* Candidate encodings for Traditional Chinese. */
static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_euc_tw,
	mbfl_no_encoding_big5
};

/* Candidate encodings for Korean. */
static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_euc_kr,
	mbfl_no_encoding_uhc
};

/* Candidate encodings for Russian. */
static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_koi8r,
	mbfl_no_encoding_cp1251,
	mbfl_no_encoding_cp866
};

/* Candidate encodings for Armenian. */
static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_armscii8
};

/* Candidate encodings for Turkish. */
static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_cp1254,
	mbfl_no_encoding_8859_9
};

/* Candidate encodings for Ukrainian. */
static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_koi8u
};

/* Language-neutral candidates; also the fallback that
 * php_mb_nls_get_default_detect_order_list() reports for unlisted languages. */
static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8
};


/* Lookup table searched linearly by php_mb_nls_get_default_detect_order_list().
 * Each element count is derived with sizeof so it tracks the array above. */
static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
	{ mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
	{ mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
	{ mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
	{ mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
	{ mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
	{ mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
	{ mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
	{ mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
	{ mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
};
182
183 /* }}} */
184
185 /* {{{ mbstring_deps[] */
/* Module dependency table: mbstring declares "pcre" as a required module
 * (this file calls pcre2_* and php_pcre_* functions). */
static const zend_module_dep mbstring_deps[] = {
	ZEND_MOD_REQUIRED("pcre")
	ZEND_MOD_END
};
190 /* }}} */
191
192 /* {{{ zend_module_entry mbstring_module_entry */
/* Module descriptor for the "mbstring" extension: ties together the dependency
 * table, the function table (ext_functions), the module/request lifecycle
 * hooks, the info hook, the version string, and the module-globals
 * constructor/destructor. */
zend_module_entry mbstring_module_entry = {
	STANDARD_MODULE_HEADER_EX,
	NULL,
	mbstring_deps,
	"mbstring",
	ext_functions,
	PHP_MINIT(mbstring),
	PHP_MSHUTDOWN(mbstring),
	PHP_RINIT(mbstring),
	PHP_RSHUTDOWN(mbstring),
	PHP_MINFO(mbstring),
	PHP_MBSTRING_VERSION,
	PHP_MODULE_GLOBALS(mbstring),
	PHP_GINIT(mbstring),
	PHP_GSHUTDOWN(mbstring),
	NULL,
	STANDARD_MODULE_PROPERTIES_EX
};
211 /* }}} */
212
213 /* {{{ static sapi_post_entry php_post_entries[] */
/* POST content-type handlers using the standard form handler
 * (php_std_post_handler). OnUpdate_mbstring_encoding_translation registers
 * this table when encoding translation is off and unregisters it when on.
 * The all-NULL row terminates the table. */
static const sapi_post_entry php_post_entries[] = {
	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_std_post_handler },
	{ MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
	{ NULL, 0, NULL, NULL }
};
219 /* }}} */
220
/* Only compiled when COMPILE_DL_MBSTRING is defined.
 * NOTE(review): presumably the dynamically loadable build of the extension;
 * confirm against the build configuration. */
#ifdef COMPILE_DL_MBSTRING
#ifdef ZTS
/* Thread-safe builds additionally define the TSRM cache here. */
ZEND_TSRMLS_CACHE_DEFINE()
#endif
ZEND_GET_MODULE(mbstring)
#endif
227
228 /* {{{ static sapi_post_entry mbstr_post_entries[] */
/* POST content-type handlers with mbstring's own form handler
 * (php_mb_post_handler) for the default content type.
 * OnUpdate_mbstring_encoding_translation registers this table when encoding
 * translation is on. The all-NULL row terminates the table. */
static const sapi_post_entry mbstr_post_entries[] = {
	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
	{ MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
	{ NULL, 0, NULL, NULL }
};
234 /* }}} */
235
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)236 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
237 if (encoding_name) {
238 const mbfl_encoding *encoding;
239 zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
240 if (last_encoding_name && (last_encoding_name == encoding_name
241 || zend_string_equals_ci(encoding_name, last_encoding_name))) {
242 return MBSTRG(last_used_encoding);
243 }
244
245 encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
246 if (!encoding) {
247 zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
248 return NULL;
249 } else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
250 if (encoding == &mbfl_encoding_base64) {
251 php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
252 } else if (encoding == &mbfl_encoding_qprint) {
253 php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
254 } else if (encoding == &mbfl_encoding_html_ent) {
255 php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
256 } else if (encoding == &mbfl_encoding_uuencode) {
257 php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
258 }
259 }
260
261 if (last_encoding_name) {
262 zend_string_release(last_encoding_name);
263 }
264 MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
265 MBSTRG(last_used_encoding) = encoding;
266 return encoding;
267 } else {
268 return MBSTRG(current_internal_encoding);
269 }
270 }
271
php_mb_get_encoding_or_pass(const char * encoding_name,size_t encoding_name_len)272 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name, size_t encoding_name_len) {
273 if (strncmp(encoding_name, "pass", encoding_name_len) == 0) {
274 return &mbfl_encoding_pass;
275 }
276
277 return mbfl_name2encoding_ex(encoding_name, encoding_name_len);
278 }
279
/* Count the ',' bytes in the half-open range [p, end). */
static size_t count_commas(const char *p, const char *end) {
	size_t commas = 0;

	for (const char *cursor = p; cursor < end; cursor++) {
		if (*cursor == ',') {
			commas++;
		}
	}
	return commas;
}
288
289 /* {{{ static zend_result php_mb_parse_encoding_list()
290 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
291 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
292 */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num)293 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
294 const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num)
295 {
296 if (value == NULL || value_length == 0) {
297 *return_list = NULL;
298 *return_size = 0;
299 return SUCCESS;
300 } else {
301 bool included_auto;
302 size_t n, size;
303 const char *p1, *endp, *tmpstr;
304 const mbfl_encoding **entry, **list;
305
306 if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
307 tmpstr = value + 1;
308 value_length -= 2;
309 } else {
310 tmpstr = value;
311 }
312
313 endp = tmpstr + value_length;
314 size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
315 list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
316 entry = list;
317 n = 0;
318 included_auto = 0;
319 p1 = tmpstr;
320 while (1) {
321 const char *comma = memchr(p1, ',', endp - p1);
322 const char *p = comma ? comma : endp;
323 /* trim spaces */
324 while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
325 p1++;
326 }
327 p--;
328 while (p > p1 && (*p == ' ' || *p == '\t')) {
329 p--;
330 }
331 size_t p1_length = p - p1 + 1;
332 /* convert to the encoding number and check encoding */
333 if (strncasecmp(p1, "auto", p1_length) == 0) {
334 if (!included_auto) {
335 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
336 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
337 size_t i;
338 included_auto = 1;
339 for (i = 0; i < identify_list_size; i++) {
340 *entry++ = mbfl_no2encoding(*src++);
341 n++;
342 }
343 }
344 } else {
345 const mbfl_encoding *encoding = mbfl_name2encoding_ex(p1, p1_length);
346 if (!encoding) {
347 /* Called from an INI setting modification */
348 if (arg_num == 0) {
349 php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%.*s\"", (int) p1_length, p1);
350 } else {
351 zend_argument_value_error(arg_num, "contains invalid encoding \"%.*s\"", (int) p1_length, p1);
352 }
353 pefree(ZEND_VOIDP(list), persistent);
354 return FAILURE;
355 }
356
357 *entry++ = encoding;
358 n++;
359 }
360 if (n >= size || comma == NULL) {
361 break;
362 }
363 p1 = comma + 1;
364 }
365 *return_list = list;
366 *return_size = n;
367 }
368
369 return SUCCESS;
370 }
371 /* }}} */
372
373 /* {{{
374 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
375 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
376 */
php_mb_parse_encoding_array(HashTable * target_hash,const mbfl_encoding *** return_list,size_t * return_size,uint32_t arg_num)377 static zend_result php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
378 size_t *return_size, uint32_t arg_num)
379 {
380 /* Allocate enough space to include the default detect order if "auto" is used. */
381 size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
382 const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
383 const mbfl_encoding **entry = list;
384 bool included_auto = 0;
385 size_t n = 0;
386 zval *hash_entry;
387 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
388 zend_string *encoding_str = zval_try_get_string(hash_entry);
389 if (UNEXPECTED(!encoding_str)) {
390 efree(ZEND_VOIDP(list));
391 return FAILURE;
392 }
393
394 if (zend_string_equals_literal_ci(encoding_str, "auto")) {
395 if (!included_auto) {
396 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
397 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
398 size_t j;
399
400 included_auto = 1;
401 for (j = 0; j < identify_list_size; j++) {
402 *entry++ = mbfl_no2encoding(*src++);
403 n++;
404 }
405 }
406 } else {
407 const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
408 if (encoding) {
409 *entry++ = encoding;
410 n++;
411 } else {
412 zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
413 zend_string_release(encoding_str);
414 efree(ZEND_VOIDP(list));
415 return FAILURE;
416 }
417 }
418 zend_string_release(encoding_str);
419 } ZEND_HASH_FOREACH_END();
420 *return_list = list;
421 *return_size = n;
422 return SUCCESS;
423 }
424 /* }}} */
425
426 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)427 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
428 {
429 return (const zend_encoding*)mbfl_name2encoding(encoding_name);
430 }
431
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)432 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
433 {
434 return ((const mbfl_encoding *)encoding)->name;
435 }
436
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)437 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
438 {
439 const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
440 return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
441 }
442
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)443 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
444 {
445 if (!list) {
446 list = (const zend_encoding**)MBSTRG(current_detect_order_list);
447 list_size = MBSTRG(current_detect_order_list_size);
448 }
449 if (list_size == 1 && ((mbfl_encoding*)*list) == &mbfl_encoding_pass) {
450 /* Emulate behavior of previous implementation; it would never return "pass"
451 * from an encoding auto-detection operation */
452 return NULL;
453 }
454 return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding**)list, list_size, false, false);
455 }
456
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)457 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
458 {
459 unsigned int num_errors = 0;
460 zend_string *result = mb_fast_convert((unsigned char*)from, from_length, (const mbfl_encoding*)encoding_from, (const mbfl_encoding*)encoding_to, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
461
462 *to_length = ZSTR_LEN(result);
463 *to = emalloc(ZSTR_LEN(result) + 1); /* Include terminating null byte */
464 memcpy(*to, ZSTR_VAL(result), ZSTR_LEN(result) + 1);
465 zend_string_free(result);
466
467 return from_length;
468 }
469
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)470 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
471 {
472 return php_mb_parse_encoding_list(
473 encoding_list, encoding_list_len,
474 (const mbfl_encoding ***)return_list, return_size,
475 persistent, /* arg_num */ 0);
476 }
477
php_mb_zend_internal_encoding_getter(void)478 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
479 {
480 return (const zend_encoding *)MBSTRG(internal_encoding);
481 }
482
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)483 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
484 {
485 MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
486 return SUCCESS;
487 }
488
/* Callback table handing mbstring's implementations of the zend_multibyte
 * hooks to the engine. The initializers are positional, so their order must
 * match the member order of zend_multibyte_functions. */
static zend_multibyte_functions php_mb_zend_multibyte_functions = {
	"mbstring",
	php_mb_zend_encoding_fetcher,
	php_mb_zend_encoding_name_getter,
	php_mb_zend_encoding_lexer_compatibility_checker,
	php_mb_zend_encoding_detector,
	php_mb_zend_encoding_converter,
	php_mb_zend_encoding_list_parser,
	php_mb_zend_internal_encoding_getter,
	php_mb_zend_internal_encoding_setter
};
500 /* }}} */
501
502 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)503 static void *_php_mb_compile_regex(const char *pattern)
504 {
505 pcre2_code *retval;
506 PCRE2_SIZE err_offset;
507 int errnum;
508
509 if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
510 PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
511 PCRE2_UCHAR err_str[128];
512 pcre2_get_error_message(errnum, err_str, sizeof(err_str));
513 php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
514 }
515 return retval;
516 }
517 /* }}} */
518
519 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)520 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
521 {
522 int res;
523
524 pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
525 if (NULL == match_data) {
526 pcre2_code_free(opaque);
527 php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
528 return FAILURE;
529 }
530 res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
531 php_pcre_free_match_data(match_data);
532
533 return res;
534 }
535 /* }}} */
536
537 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)538 static void _php_mb_free_regex(void *opaque)
539 {
540 pcre2_code_free(opaque);
541 }
542 /* }}} */
543
544 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)545 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
546 {
547 size_t i;
548
549 *plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
550 *plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
551
552 for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
553 if (php_mb_default_identify_list[i].lang == lang) {
554 *plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
555 *plist_size = php_mb_default_identify_list[i].list_size;
556 return 1;
557 }
558 }
559 return 0;
560 }
561 /* }}} */
562
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)563 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
564 {
565 char *result = emalloc(len + 2);
566 char *resp = result;
567 size_t i;
568
569 for (i = 0; i < len && start[i] != quote; ++i) {
570 if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
571 *resp++ = start[++i];
572 } else {
573 size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
574
575 while (j-- > 0 && i < len) {
576 *resp++ = start[i++];
577 }
578 --i;
579 }
580 }
581
582 *resp = '\0';
583 return result;
584 }
585
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)586 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
587 {
588 char *pos = *line, quote;
589 char *res;
590
591 while (*pos && *pos != stop) {
592 if ((quote = *pos) == '"' || quote == '\'') {
593 ++pos;
594 while (*pos && *pos != quote) {
595 if (*pos == '\\' && pos[1] && pos[1] == quote) {
596 pos += 2;
597 } else {
598 ++pos;
599 }
600 }
601 if (*pos) {
602 ++pos;
603 }
604 } else {
605 pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
606
607 }
608 }
609 if (*pos == '\0') {
610 res = estrdup(*line);
611 *line += strlen(*line);
612 return res;
613 }
614
615 res = estrndup(*line, pos - *line);
616
617 while (*pos == stop) {
618 pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
619 }
620
621 *line = pos;
622 return res;
623 }
624 /* }}} */
625
/* Extract one configuration-style word from `str`, skipping leading
 * whitespace.
 *
 * A word starting with ' or " extends up to the matching unescaped quote;
 * otherwise it extends up to the next whitespace byte. Returns a newly
 * allocated string, which is empty when nothing but whitespace is left. */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	char *word = str;

	while (*word && isspace(*(unsigned char *)word)) {
		++word;
	}

	if (*word == '\0') {
		return estrdup("");
	}

	if (*word != '"' && *word != '\'') {
		/* Bare word: ends at the next whitespace byte. */
		char *word_end = word;

		while (*word_end && !isspace(*(unsigned char *)word_end)) {
			++word_end;
		}
		return php_mb_rfc1867_substring_conf(encoding, word, word_end - word, 0);
	}

	/* Quoted word: hand over everything after the opening quote. */
	char quote = *word;
	char *quoted = word + 1;

	return php_mb_rfc1867_substring_conf(encoding, quoted, strlen(quoted), quote);
}
650 /* }}} */
651
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)652 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
653 {
654 char *s, *s2;
655 const size_t filename_len = strlen(filename);
656
657 /* The \ check should technically be needed for win32 systems only where
658 * it is a valid path separator. However, IE in all it's wisdom always sends
659 * the full path of the file on the user's filesystem, which means that unless
660 * the user does basename() they get a bogus file name. Until IE's user base drops
661 * to nill or problem is fixed this code must remain enabled for all systems. */
662 s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
663 s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
664
665 if (s && s2) {
666 if (s > s2) {
667 return ++s;
668 } else {
669 return ++s2;
670 }
671 } else if (s) {
672 return ++s;
673 } else if (s2) {
674 return ++s2;
675 } else {
676 return filename;
677 }
678 }
679 /* }}} */
680
681 /* {{{ php.ini directive handler */
682 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)683 static PHP_INI_MH(OnUpdate_mbstring_language)
684 {
685 enum mbfl_no_language no_language;
686
687 no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
688 if (no_language == mbfl_no_language_invalid) {
689 MBSTRG(language) = mbfl_no_language_neutral;
690 return FAILURE;
691 }
692 MBSTRG(language) = no_language;
693 php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
694 return SUCCESS;
695 }
696 /* }}} */
697
698 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)699 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
700 {
701 const mbfl_encoding **list;
702 size_t size;
703
704 if (!new_value) {
705 if (MBSTRG(detect_order_list)) {
706 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
707 }
708 MBSTRG(detect_order_list) = NULL;
709 MBSTRG(detect_order_list_size) = 0;
710 return SUCCESS;
711 }
712
713 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
714 return FAILURE;
715 }
716
717 if (MBSTRG(detect_order_list)) {
718 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
719 }
720 MBSTRG(detect_order_list) = list;
721 MBSTRG(detect_order_list_size) = size;
722 return SUCCESS;
723 }
724 /* }}} */
725
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)726 static zend_result _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
727 const mbfl_encoding **list;
728 size_t size;
729 if (new_value_length == 4 && strncmp(new_value, "pass", 4) == 0) {
730 list = (const mbfl_encoding**)pecalloc(1, sizeof(mbfl_encoding*), 1);
731 *list = &mbfl_encoding_pass;
732 size = 1;
733 } else if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
734 return FAILURE;
735 }
736 if (MBSTRG(http_input_list)) {
737 pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
738 }
739 MBSTRG(http_input_list) = list;
740 MBSTRG(http_input_list_size) = size;
741 return SUCCESS;
742 }
743
744 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)745 static PHP_INI_MH(OnUpdate_mbstring_http_input)
746 {
747 if (new_value) {
748 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
749 }
750
751 if (!new_value || !ZSTR_LEN(new_value)) {
752 const char *encoding = php_get_input_encoding();
753 MBSTRG(http_input_set) = 0;
754 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
755 return SUCCESS;
756 }
757
758 MBSTRG(http_input_set) = 1;
759 return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
760 }
761 /* }}} */
762
_php_mb_ini_mbstring_http_output_set(const char * new_value,size_t length)763 static zend_result _php_mb_ini_mbstring_http_output_set(const char *new_value, size_t length) {
764 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value, length);
765 if (!encoding) {
766 return FAILURE;
767 }
768
769 MBSTRG(http_output_encoding) = encoding;
770 MBSTRG(current_http_output_encoding) = encoding;
771 return SUCCESS;
772 }
773
774 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)775 static PHP_INI_MH(OnUpdate_mbstring_http_output)
776 {
777 if (new_value) {
778 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
779 }
780
781 if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
782 const char *encoding = php_get_output_encoding();
783 MBSTRG(http_output_set) = 0;
784 _php_mb_ini_mbstring_http_output_set(encoding, strlen(encoding));
785 return SUCCESS;
786 }
787
788 MBSTRG(http_output_set) = 1;
789 return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
790 }
791 /* }}} */
792
793 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)794 static zend_result _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
795 {
796 const mbfl_encoding *encoding;
797
798 if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
799 /* falls back to UTF-8 if an unknown encoding name is given */
800 if (new_value) {
801 php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
802 }
803 encoding = &mbfl_encoding_utf8;
804 }
805 MBSTRG(internal_encoding) = encoding;
806 MBSTRG(current_internal_encoding) = encoding;
807 #ifdef HAVE_MBREGEX
808 {
809 const char *enc_name = new_value;
810 if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
811 /* falls back to UTF-8 if an unknown encoding name is given */
812 enc_name = "UTF-8";
813 php_mb_regex_set_default_mbctype(enc_name);
814 }
815 php_mb_regex_set_mbctype(new_value);
816 }
817 #endif
818 return SUCCESS;
819 }
820 /* }}} */
821
822 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)823 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
824 {
825 if (new_value) {
826 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
827 }
828
829 if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
830 return FAILURE;
831 }
832
833 if (new_value && ZSTR_LEN(new_value)) {
834 MBSTRG(internal_encoding_set) = 1;
835 return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
836 } else {
837 const char *encoding = php_get_internal_encoding();
838 MBSTRG(internal_encoding_set) = 0;
839 return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
840 }
841 }
842 /* }}} */
843
844 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)845 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
846 {
847 if (new_value != NULL) {
848 if (zend_string_equals_literal_ci(new_value, "none")) {
849 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
850 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
851 } else if (zend_string_equals_literal_ci(new_value, "long")) {
852 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
853 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
854 } else if (zend_string_equals_literal_ci(new_value, "entity")) {
855 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
856 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
857 } else {
858 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
859 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
860 if (ZSTR_LEN(new_value) > 0) {
861 char *endptr = NULL;
862 int c = strtol(ZSTR_VAL(new_value), &endptr, 0);
863
864 if (*endptr == '\0') {
865 MBSTRG(filter_illegal_substchar) = c;
866 MBSTRG(current_filter_illegal_substchar) = c;
867 }
868 }
869 }
870 } else {
871 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
872 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
873 MBSTRG(filter_illegal_substchar) = '?';
874 MBSTRG(current_filter_illegal_substchar) = '?';
875 }
876
877 return SUCCESS;
878 }
879 /* }}} */
880
881 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)882 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
883 {
884 if (new_value == NULL) {
885 return FAILURE;
886 }
887
888 OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
889
890 if (MBSTRG(encoding_translation)) {
891 sapi_unregister_post_entry(php_post_entries);
892 sapi_register_post_entries(mbstr_post_entries);
893 } else {
894 sapi_unregister_post_entry(mbstr_post_entries);
895 sapi_register_post_entries(php_post_entries);
896 }
897
898 return SUCCESS;
899 }
900 /* }}} */
901
/* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes) */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)903 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
904 {
905 zend_string *tmp;
906 void *re = NULL;
907
908 if (!new_value) {
909 new_value = entry->orig_value;
910 }
911 tmp = php_trim(new_value, NULL, 0, 3);
912
913 if (ZSTR_LEN(tmp) > 0) {
914 if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
915 zend_string_release_ex(tmp, 0);
916 return FAILURE;
917 }
918 }
919
920 if (MBSTRG(http_output_conv_mimetypes)) {
921 _php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
922 }
923
924 MBSTRG(http_output_conv_mimetypes) = re;
925
926 zend_string_release_ex(tmp, 0);
927 return SUCCESS;
928 }
929 /* }}} */
930 /* }}} */
931
932 /* {{{ php.ini directive registration */
/* Table of the mbstring.* INI directives: name, default value, the levels at which the
 * directive may be changed, and the handler invoked when it changes. STD_* entries
 * additionally name the zend_mbstring_globals field associated with the directive. */
PHP_INI_BEGIN()
	/* Directives with dedicated mbstring handlers; all changeable at any level */
	PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
	PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
	PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
	PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
	/* Deprecated: the handler emits E_DEPRECATED whenever a value is supplied */
	STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
	PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)

	/* Restricted to PHP_INI_SYSTEM | PHP_INI_PERDIR; its handler swaps the registered
	 * POST entries */
	STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
		PHP_INI_SYSTEM | PHP_INI_PERDIR,
		OnUpdate_mbstring_encoding_translation,
		encoding_translation, zend_mbstring_globals, mbstring_globals)
	/* Regex matched against the response MIME type by mb_output_handler() */
	PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
		"^(text/|application/xhtml\\+xml)",
		PHP_INI_ALL,
		OnUpdate_mbstring_http_output_conv_mimetypes)

	STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
		PHP_INI_ALL,
		OnUpdateBool,
		strict_detection, zend_mbstring_globals, mbstring_globals)
#ifdef HAVE_MBREGEX
	/* Limits stored for the mbregex code; only registered when mbregex support is built */
	STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
	STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
#endif
PHP_INI_END()
959 /* }}} */
960
961 static void mbstring_internal_encoding_changed_hook(void) {
962 /* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
963 if (!MBSTRG(internal_encoding_set)) {
964 const char *encoding = php_get_internal_encoding();
965 _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
966 }
967
968 if (!MBSTRG(http_output_set)) {
969 const char *encoding = php_get_output_encoding();
970 _php_mb_ini_mbstring_http_output_set(encoding, strlen(encoding));
971 }
972
973 if (!MBSTRG(http_input_set)) {
974 const char *encoding = php_get_input_encoding();
975 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
976 }
977 }
978
979 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)980 static PHP_GINIT_FUNCTION(mbstring)
981 {
982 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
983 ZEND_TSRMLS_CACHE_UPDATE();
984 #endif
985
986 mbstring_globals->language = mbfl_no_language_uni;
987 mbstring_globals->internal_encoding = NULL;
988 mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
989 mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
990 mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
991 mbstring_globals->http_input_identify = NULL;
992 mbstring_globals->http_input_identify_get = NULL;
993 mbstring_globals->http_input_identify_post = NULL;
994 mbstring_globals->http_input_identify_cookie = NULL;
995 mbstring_globals->http_input_identify_string = NULL;
996 mbstring_globals->http_input_list = NULL;
997 mbstring_globals->http_input_list_size = 0;
998 mbstring_globals->detect_order_list = NULL;
999 mbstring_globals->detect_order_list_size = 0;
1000 mbstring_globals->current_detect_order_list = NULL;
1001 mbstring_globals->current_detect_order_list_size = 0;
1002 mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1003 mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1004 mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1005 mbstring_globals->filter_illegal_substchar = '?';
1006 mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1007 mbstring_globals->current_filter_illegal_substchar = '?';
1008 mbstring_globals->illegalchars = 0;
1009 mbstring_globals->encoding_translation = 0;
1010 mbstring_globals->strict_detection = 0;
1011 mbstring_globals->outconv_enabled = false;
1012 mbstring_globals->outconv_state = 0;
1013 mbstring_globals->http_output_conv_mimetypes = NULL;
1014 #ifdef HAVE_MBREGEX
1015 mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1016 #endif
1017 mbstring_globals->last_used_encoding_name = NULL;
1018 mbstring_globals->last_used_encoding = NULL;
1019 mbstring_globals->internal_encoding_set = 0;
1020 mbstring_globals->http_output_set = 0;
1021 mbstring_globals->http_input_set = 0;
1022 mbstring_globals->all_encodings_list = NULL;
1023 }
1024 /* }}} */
1025
1026 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1027 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1028 {
1029 if (mbstring_globals->http_input_list) {
1030 free(ZEND_VOIDP(mbstring_globals->http_input_list));
1031 }
1032 if (mbstring_globals->detect_order_list) {
1033 free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1034 }
1035 if (mbstring_globals->http_output_conv_mimetypes) {
1036 _php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1037 }
1038 #ifdef HAVE_MBREGEX
1039 php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1040 #endif
1041 }
1042 /* }}} */
1043
1044 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1045 static void init_check_utf8(void);
1046 #endif
1047
1048 /* {{{ PHP_MINIT_FUNCTION(mbstring) */
/* Module startup: registers the INI directives, installs the encoding-changed hook and
 * the SAPI/engine callbacks, and registers the extension's symbols.
 * Returns FAILURE only if zend_multibyte_set_functions() fails; SUCCESS otherwise. */
PHP_MINIT_FUNCTION(mbstring)
{
#if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
ZEND_TSRMLS_CACHE_UPDATE();
#endif

	REGISTER_INI_ENTRIES();

	/* We assume that we're the only user of the hook. */
	ZEND_ASSERT(php_internal_encoding_changed == NULL);
	php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
	/* Run the hook once right away so the settings that were not set explicitly pick up
	 * the current php_get_*_encoding() values */
	mbstring_internal_encoding_changed_hook();

	/* This is a global handler. Should not be set in a per-request handler. */
	sapi_register_treat_data(mbstr_treat_data);

	/* Post handlers are stored in the thread-local context. */
	if (MBSTRG(encoding_translation)) {
		sapi_register_post_entries(mbstr_post_entries);
	}

#ifdef HAVE_MBREGEX
	PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
#endif

	register_mbstring_symbols(module_number);

	if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
		return FAILURE;
	}

	/* Pass mbstring's callbacks to the RFC 1867 code */
	php_rfc1867_set_multibyte_callbacks(
		php_mb_encoding_translation,
		php_mb_gpc_get_detect_order,
		php_mb_gpc_set_input_encoding,
		php_mb_rfc1867_getword,
		php_mb_rfc1867_getword_conf,
		php_mb_rfc1867_basename);

#ifdef ZEND_INTRIN_AVX2_FUNC_PTR
	/* Only built when ZEND_INTRIN_AVX2_FUNC_PTR is defined */
	init_check_utf8();
	init_convert_utf16();
#endif

	return SUCCESS;
}
1095 /* }}} */
1096
1097 /* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
/* Module shutdown: undoes what PHP_MINIT_FUNCTION(mbstring) installed. Always returns
 * SUCCESS. */
PHP_MSHUTDOWN_FUNCTION(mbstring)
{
	UNREGISTER_INI_ENTRIES();

	zend_multibyte_restore_functions();

#ifdef HAVE_MBREGEX
	PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
#endif

	/* Uninstall the hook; module startup asserts that it is NULL before installing it */
	php_internal_encoding_changed = NULL;

	return SUCCESS;
}
1112 /* }}} */
1113
1114 /* {{{ PHP_RINIT_FUNCTION(mbstring) */
/* Request startup: resets the per-request ("current_*") settings to the configured
 * values and clears the illegal character counter. Always returns SUCCESS. */
PHP_RINIT_FUNCTION(mbstring)
{
	/* Start every request from the configured values */
	MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
	MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
	MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
	MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);

	MBSTRG(illegalchars) = 0;

	php_mb_populate_current_detect_order_list();

#ifdef HAVE_MBREGEX
	PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
#endif
	/* Tell the engine which internal encoding is configured */
	zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));

	return SUCCESS;
}
1133 /* }}} */
1134
1135 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1136 PHP_RSHUTDOWN_FUNCTION(mbstring)
1137 {
1138 if (MBSTRG(current_detect_order_list) != NULL) {
1139 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1140 MBSTRG(current_detect_order_list) = NULL;
1141 MBSTRG(current_detect_order_list_size) = 0;
1142 }
1143
1144 /* clear http input identification. */
1145 MBSTRG(http_input_identify) = NULL;
1146 MBSTRG(http_input_identify_post) = NULL;
1147 MBSTRG(http_input_identify_get) = NULL;
1148 MBSTRG(http_input_identify_cookie) = NULL;
1149 MBSTRG(http_input_identify_string) = NULL;
1150
1151 if (MBSTRG(last_used_encoding_name)) {
1152 zend_string_release(MBSTRG(last_used_encoding_name));
1153 MBSTRG(last_used_encoding_name) = NULL;
1154 }
1155
1156 MBSTRG(internal_encoding_set) = 0;
1157 MBSTRG(http_output_set) = 0;
1158 MBSTRG(http_input_set) = 0;
1159
1160 MBSTRG(outconv_enabled) = false;
1161 MBSTRG(outconv_state) = 0;
1162
1163 if (MBSTRG(all_encodings_list)) {
1164 GC_DELREF(MBSTRG(all_encodings_list));
1165 zend_array_destroy(MBSTRG(all_encodings_list));
1166 MBSTRG(all_encodings_list) = NULL;
1167 }
1168
1169 #ifdef HAVE_MBREGEX
1170 PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1171 #endif
1172
1173 return SUCCESS;
1174 }
1175 /* }}} */
1176
1177 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1178 PHP_MINFO_FUNCTION(mbstring)
1179 {
1180 php_info_print_table_start();
1181 php_info_print_table_row(2, "Multibyte Support", "enabled");
1182 php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1183 php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1184 {
1185 char tmp[256];
1186 snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1187 php_info_print_table_row(2, "libmbfl version", tmp);
1188 }
1189 php_info_print_table_end();
1190
1191 php_info_print_table_start();
1192 php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1193 php_info_print_table_end();
1194
1195 #ifdef HAVE_MBREGEX
1196 PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1197 #endif
1198
1199 DISPLAY_INI_ENTRIES();
1200 }
1201 /* }}} */
1202
1203 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1204 PHP_FUNCTION(mb_language)
1205 {
1206 zend_string *name = NULL;
1207
1208 ZEND_PARSE_PARAMETERS_START(0, 1)
1209 Z_PARAM_OPTIONAL
1210 Z_PARAM_STR_OR_NULL(name)
1211 ZEND_PARSE_PARAMETERS_END();
1212
1213 if (name == NULL) {
1214 RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1215 } else {
1216 zend_string *ini_name = ZSTR_INIT_LITERAL("mbstring.language", 0);
1217 if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1218 zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1219 zend_string_release_ex(ini_name, 0);
1220 RETURN_THROWS();
1221 }
1222 // TODO Make return void
1223 RETVAL_TRUE;
1224 zend_string_release_ex(ini_name, 0);
1225 }
1226 }
1227 /* }}} */
1228
1229 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1230 PHP_FUNCTION(mb_internal_encoding)
1231 {
1232 char *name = NULL;
1233 size_t name_len;
1234 const mbfl_encoding *encoding;
1235
1236 ZEND_PARSE_PARAMETERS_START(0, 1)
1237 Z_PARAM_OPTIONAL
1238 Z_PARAM_STRING_OR_NULL(name, name_len)
1239 ZEND_PARSE_PARAMETERS_END();
1240
1241 if (name == NULL) {
1242 ZEND_ASSERT(MBSTRG(current_internal_encoding));
1243 RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1244 } else {
1245 encoding = mbfl_name2encoding(name);
1246 if (!encoding) {
1247 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1248 RETURN_THROWS();
1249 } else {
1250 MBSTRG(current_internal_encoding) = encoding;
1251 MBSTRG(internal_encoding_set) = 1;
1252 /* TODO Return old encoding */
1253 RETURN_TRUE;
1254 }
1255 }
1256 }
1257 /* }}} */
1258
1259 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1260 PHP_FUNCTION(mb_http_input)
1261 {
1262 char *type = NULL;
1263 size_t type_len = 0, n;
1264 const mbfl_encoding **entry;
1265 const mbfl_encoding *encoding;
1266
1267 ZEND_PARSE_PARAMETERS_START(0, 1)
1268 Z_PARAM_OPTIONAL
1269 Z_PARAM_STRING_OR_NULL(type, type_len)
1270 ZEND_PARSE_PARAMETERS_END();
1271
1272 if (type == NULL) {
1273 encoding = MBSTRG(http_input_identify);
1274 } else if (type_len != 1) {
1275 zend_argument_value_error(1,
1276 "must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1277 RETURN_THROWS();
1278 } else {
1279 switch (*type) {
1280 case 'G':
1281 case 'g':
1282 encoding = MBSTRG(http_input_identify_get);
1283 break;
1284 case 'P':
1285 case 'p':
1286 encoding = MBSTRG(http_input_identify_post);
1287 break;
1288 case 'C':
1289 case 'c':
1290 encoding = MBSTRG(http_input_identify_cookie);
1291 break;
1292 case 'S':
1293 case 's':
1294 encoding = MBSTRG(http_input_identify_string);
1295 break;
1296 case 'I':
1297 case 'i':
1298 entry = MBSTRG(http_input_list);
1299 n = MBSTRG(http_input_list_size);
1300 array_init(return_value);
1301 for (size_t i = 0; i < n; i++, entry++) {
1302 add_next_index_string(return_value, (*entry)->name);
1303 }
1304 return;
1305 case 'L':
1306 case 'l':
1307 entry = MBSTRG(http_input_list);
1308 n = MBSTRG(http_input_list_size);
1309 if (n == 0) {
1310 RETURN_FALSE;
1311 }
1312
1313 smart_str result = {0};
1314 for (size_t i = 0; i < n; i++, entry++) {
1315 if (i > 0) {
1316 smart_str_appendc(&result, ',');
1317 }
1318 smart_str_appends(&result, (*entry)->name);
1319 }
1320 RETURN_STR(smart_str_extract(&result));
1321 default:
1322 zend_argument_value_error(1,
1323 "must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1324 RETURN_THROWS();
1325 }
1326 }
1327
1328 if (encoding) {
1329 RETURN_STRING(encoding->name);
1330 } else {
1331 RETURN_FALSE;
1332 }
1333 }
1334 /* }}} */
1335
1336 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1337 PHP_FUNCTION(mb_http_output)
1338 {
1339 char *name = NULL;
1340 size_t name_len;
1341
1342 ZEND_PARSE_PARAMETERS_START(0, 1)
1343 Z_PARAM_OPTIONAL
1344 Z_PARAM_PATH_OR_NULL(name, name_len) /* For null byte check */
1345 ZEND_PARSE_PARAMETERS_END();
1346
1347 if (name == NULL) {
1348 ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1349 RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1350 } else {
1351 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name, name_len);
1352 if (!encoding) {
1353 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1354 RETURN_THROWS();
1355 } else {
1356 MBSTRG(http_output_set) = 1;
1357 MBSTRG(current_http_output_encoding) = encoding;
1358 /* TODO Return previous encoding? */
1359 RETURN_TRUE;
1360 }
1361 }
1362 }
1363 /* }}} */
1364
1365 /* {{{ Sets the current detect_order or Return the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1366 PHP_FUNCTION(mb_detect_order)
1367 {
1368 zend_string *order_str = NULL;
1369 HashTable *order_ht = NULL;
1370
1371 ZEND_PARSE_PARAMETERS_START(0, 1)
1372 Z_PARAM_OPTIONAL
1373 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1374 ZEND_PARSE_PARAMETERS_END();
1375
1376 if (!order_str && !order_ht) {
1377 size_t n = MBSTRG(current_detect_order_list_size);
1378 const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1379 array_init(return_value);
1380 for (size_t i = 0; i < n; i++) {
1381 add_next_index_string(return_value, (*entry)->name);
1382 entry++;
1383 }
1384 } else {
1385 const mbfl_encoding **list;
1386 size_t size;
1387 if (order_ht) {
1388 if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1389 RETURN_THROWS();
1390 }
1391 } else {
1392 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1)) {
1393 RETURN_THROWS();
1394 }
1395 }
1396
1397 if (size == 0) {
1398 efree(ZEND_VOIDP(list));
1399 zend_argument_value_error(1, "must specify at least one encoding");
1400 RETURN_THROWS();
1401 }
1402
1403 if (MBSTRG(current_detect_order_list)) {
1404 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1405 }
1406 MBSTRG(current_detect_order_list) = list;
1407 MBSTRG(current_detect_order_list_size) = size;
1408 RETURN_TRUE;
1409 }
1410 }
1411 /* }}} */
1412
/* Decides whether `cp` may be used as a substitute character.
 *
 * Returns false for values outside 0..0x10FFFF (not Unicode codepoints) and for the
 * surrogates 0xD800..0xDFFF (never valid on their own, and only a single substitute
 * character is allowed). Everything else is accepted: the target encoding of the
 * conversion that will eventually use the character is not known here, so it cannot be
 * checked whether the codepoint is actually mapped in that encoding. */
static inline bool php_mb_check_code_point(zend_long cp)
{
	const bool in_unicode_range = (cp >= 0 && cp < 0x110000);
	const bool is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);

	return in_unicode_range && !is_surrogate;
}
1431
1432 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1433 PHP_FUNCTION(mb_substitute_character)
1434 {
1435 zend_string *substitute_character = NULL;
1436 zend_long substitute_codepoint;
1437 bool substitute_is_null = 1;
1438
1439 ZEND_PARSE_PARAMETERS_START(0, 1)
1440 Z_PARAM_OPTIONAL
1441 Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1442 ZEND_PARSE_PARAMETERS_END();
1443
1444 if (substitute_is_null) {
1445 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1446 RETURN_STRING("none");
1447 }
1448 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1449 RETURN_STRING("long");
1450 }
1451 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1452 RETURN_STRING("entity");
1453 }
1454 RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1455 }
1456
1457 if (substitute_character != NULL) {
1458 if (zend_string_equals_literal_ci(substitute_character, "none")) {
1459 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1460 RETURN_TRUE;
1461 }
1462 if (zend_string_equals_literal_ci(substitute_character, "long")) {
1463 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1464 RETURN_TRUE;
1465 }
1466 if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1467 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1468 RETURN_TRUE;
1469 }
1470 /* Invalid string value */
1471 zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1472 RETURN_THROWS();
1473 }
1474 /* Integer codepoint passed */
1475 if (!php_mb_check_code_point(substitute_codepoint)) {
1476 zend_argument_value_error(1, "is not a valid codepoint");
1477 RETURN_THROWS();
1478 }
1479
1480 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1481 MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1482 RETURN_TRUE;
1483 }
1484 /* }}} */
1485
1486 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1487 PHP_FUNCTION(mb_preferred_mime_name)
1488 {
1489 char *name = NULL;
1490 size_t name_len;
1491
1492 ZEND_PARSE_PARAMETERS_START(1, 1)
1493 Z_PARAM_STRING(name, name_len)
1494 ZEND_PARSE_PARAMETERS_END();
1495
1496 const mbfl_encoding *enc = mbfl_name2encoding(name);
1497 if (enc == NULL) {
1498 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1499 RETURN_THROWS();
1500 }
1501
1502 const char *preferred_name = mbfl_encoding_preferred_mime_name(enc);
1503 if (preferred_name == NULL || *preferred_name == '\0') {
1504 php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1505 RETVAL_FALSE;
1506 } else {
1507 RETVAL_STRING((char *)preferred_name);
1508 }
1509 }
1510 /* }}} */
1511
1512 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1513 PHP_FUNCTION(mb_parse_str)
1514 {
1515 zval *track_vars_array = NULL;
1516 char *encstr;
1517 size_t encstr_len;
1518 php_mb_encoding_handler_info_t info;
1519 const mbfl_encoding *detected;
1520
1521 ZEND_PARSE_PARAMETERS_START(2, 2)
1522 Z_PARAM_STRING(encstr, encstr_len)
1523 Z_PARAM_ZVAL(track_vars_array)
1524 ZEND_PARSE_PARAMETERS_END();
1525
1526 track_vars_array = zend_try_array_init(track_vars_array);
1527 if (!track_vars_array) {
1528 RETURN_THROWS();
1529 }
1530
1531 encstr = estrndup(encstr, encstr_len);
1532
1533 info.data_type = PARSE_STRING;
1534 info.separator = PG(arg_separator).input;
1535 info.report_errors = true;
1536 info.to_encoding = MBSTRG(current_internal_encoding);
1537 info.from_encodings = MBSTRG(http_input_list);
1538 info.num_from_encodings = MBSTRG(http_input_list_size);
1539
1540 detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1541
1542 MBSTRG(http_input_identify) = detected;
1543
1544 RETVAL_BOOL(detected);
1545
1546 if (encstr != NULL) efree(encstr);
1547 }
1548 /* }}} */
1549
PHP_FUNCTION(mb_output_handler)1550 PHP_FUNCTION(mb_output_handler)
1551 {
1552 zend_string *str;
1553 zend_long arg_status;
1554
1555 ZEND_PARSE_PARAMETERS_START(2, 2)
1556 Z_PARAM_STR(str)
1557 Z_PARAM_LONG(arg_status)
1558 ZEND_PARSE_PARAMETERS_END();
1559
1560 const mbfl_encoding *encoding = MBSTRG(current_http_output_encoding);
1561 if (encoding == &mbfl_encoding_pass) {
1562 RETURN_STR_COPY(str);
1563 }
1564
1565 if (arg_status & PHP_OUTPUT_HANDLER_START) {
1566 bool free_mimetype = false;
1567 char *mimetype = NULL;
1568
1569 /* Analyze mime type */
1570 if (SG(sapi_headers).mimetype && _php_mb_match_regex(MBSTRG(http_output_conv_mimetypes), SG(sapi_headers).mimetype, strlen(SG(sapi_headers).mimetype))) {
1571 char *s;
1572 if ((s = strchr(SG(sapi_headers).mimetype, ';')) == NULL) {
1573 mimetype = estrdup(SG(sapi_headers).mimetype);
1574 } else {
1575 mimetype = estrndup(SG(sapi_headers).mimetype, s - SG(sapi_headers).mimetype);
1576 }
1577 free_mimetype = true;
1578 } else if (SG(sapi_headers).send_default_content_type) {
1579 mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1580 }
1581
1582 /* If content-type is not yet set, set it and enable conversion */
1583 if (SG(sapi_headers).send_default_content_type || free_mimetype) {
1584 const char *charset = encoding->mime_name;
1585 if (charset) {
1586 char *p;
1587 size_t len = spprintf(&p, 0, "Content-Type: %s; charset=%s", mimetype, charset);
1588 if (sapi_add_header(p, len, 0) != FAILURE) {
1589 SG(sapi_headers).send_default_content_type = 0;
1590 }
1591 }
1592
1593 MBSTRG(outconv_enabled) = true;
1594 }
1595
1596 if (free_mimetype) {
1597 efree(mimetype);
1598 }
1599 }
1600
1601 if (!MBSTRG(outconv_enabled)) {
1602 RETURN_STR_COPY(str);
1603 }
1604
1605 mb_convert_buf buf;
1606 mb_convert_buf_init(&buf, ZSTR_LEN(str), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1607
1608 uint32_t wchar_buf[128];
1609 unsigned char *in = (unsigned char*)ZSTR_VAL(str);
1610 size_t in_len = ZSTR_LEN(str);
1611 bool last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1612
1613 while (in_len) {
1614 size_t out_len = MBSTRG(current_internal_encoding)->to_wchar(&in, &in_len, wchar_buf, 128, &MBSTRG(outconv_state));
1615 ZEND_ASSERT(out_len <= 128);
1616 encoding->from_wchar(wchar_buf, out_len, &buf, !in_len && last_feed);
1617 }
1618
1619 MBSTRG(illegalchars) += buf.errors;
1620 RETVAL_STR(mb_convert_buf_result_raw(&buf));
1621
1622 if (last_feed) {
1623 MBSTRG(outconv_enabled) = false;
1624 MBSTRG(outconv_state) = 0;
1625 }
1626 }
1627
PHP_FUNCTION(mb_str_split)1628 PHP_FUNCTION(mb_str_split)
1629 {
1630 zend_string *str, *encoding = NULL;
1631 zend_long split_len = 1;
1632
1633 ZEND_PARSE_PARAMETERS_START(1, 3)
1634 Z_PARAM_STR(str)
1635 Z_PARAM_OPTIONAL
1636 Z_PARAM_LONG(split_len)
1637 Z_PARAM_STR_OR_NULL(encoding)
1638 ZEND_PARSE_PARAMETERS_END();
1639
1640 if (split_len <= 0) {
1641 zend_argument_value_error(2, "must be greater than 0");
1642 RETURN_THROWS();
1643 } else if (split_len > UINT_MAX / 4) {
1644 zend_argument_value_error(2, "is too large");
1645 RETURN_THROWS();
1646 }
1647
1648 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
1649 if (!enc) {
1650 RETURN_THROWS();
1651 }
1652
1653 if (ZSTR_LEN(str) == 0) {
1654 RETURN_EMPTY_ARRAY();
1655 }
1656
1657 unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
1658
1659 unsigned int char_len = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
1660 if (char_len) {
1661 unsigned int chunk_len = char_len * split_len;
1662 unsigned int chunks = ((ZSTR_LEN(str) / chunk_len) + split_len - 1) / split_len; /* round up */
1663 array_init_size(return_value, chunks);
1664 while (p < e) {
1665 add_next_index_stringl(return_value, (const char*)p, MIN(chunk_len, e - p));
1666 p += chunk_len;
1667 }
1668 } else if (enc->mblen_table) {
1669 unsigned char const *mbtab = enc->mblen_table;
1670
1671 /* Assume that we have 1-byte characters */
1672 array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1673
1674 while (p < e) {
1675 unsigned char *chunk = p; /* start of chunk */
1676
1677 for (int char_count = 0; char_count < split_len && p < e; char_count++) {
1678 p += mbtab[*p];
1679 }
1680 if (p > e) {
1681 p = e; /* ensure chunk is in bounds */
1682 }
1683 add_next_index_stringl(return_value, (const char*)chunk, p - chunk);
1684 }
1685 } else {
1686 /* Assume that we have 1-byte characters */
1687 array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1688
1689 uint32_t wchar_buf[128];
1690 size_t in_len = ZSTR_LEN(str);
1691 unsigned int state = 0, char_count = 0;
1692
1693 mb_convert_buf buf;
1694
1695 while (in_len) {
1696 size_t out_len = enc->to_wchar(&p, &in_len, wchar_buf, 128, &state);
1697 ZEND_ASSERT(out_len <= 128);
1698 size_t i = 0;
1699
1700 /* Is there some output remaining from the previous iteration? */
1701 if (char_count) {
1702 if (out_len >= split_len - char_count) {
1703 /* Finish off an incomplete chunk from previous iteration
1704 * ('buf' was already initialized; we don't need to do it again) */
1705 enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
1706 i += split_len - char_count;
1707 char_count = 0;
1708 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1709 } else {
1710 /* Output from this iteration is not enough to finish the next chunk;
1711 * output what we can, and leave 'buf' to be used again on next iteration */
1712 enc->from_wchar(wchar_buf, out_len, &buf, !in_len);
1713 char_count += out_len;
1714 continue;
1715 }
1716 }
1717
1718 while (i < out_len) {
1719 /* Prepare for the next chunk */
1720 mb_convert_buf_init(&buf, split_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1721
1722 if (out_len - i >= split_len) {
1723 enc->from_wchar(wchar_buf + i, split_len, &buf, true);
1724 i += split_len;
1725 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1726 } else {
1727 /* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
1728 * leave them for the next iteration */
1729 enc->from_wchar(wchar_buf + i, out_len - i, &buf, !in_len);
1730 char_count = out_len - i;
1731 break;
1732 }
1733 }
1734 }
1735
1736 if (char_count) {
1737 /* The main loop above has finished processing the input string, but
1738 * has left a partial chunk in 'buf' */
1739 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1740 }
1741 }
1742 }
1743
#ifdef __SSE2__
/* Thanks to StackOverflow user 'Paul R' (https://stackoverflow.com/users/253056/paul-r)
 * From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector
 *
 * Horizontal sum: treats the 16 bytes of `v` as unsigned 8-bit integers and returns
 * their total in an ordinary scalar. */
static inline uint32_t _mm_sum_epu8(const __m128i v)
{
	/* There is no dedicated "sum all bytes" instruction. _mm_sad_epu8 sums the absolute
	 * differences between corresponding bytes of two registers, separately for the lower
	 * and the upper 8 bytes, and stores each sum as a 16-bit integer at the bottom of
	 * the respective 64-bit half. Against an all-zero operand, the "differences" are
	 * simply the byte values of `v` themselves. */
	const __m128i half_sums = _mm_sad_epu8(v, _mm_setzero_si128());

	/* Pull both partial sums out into scalars and add them up: the lower one sits in
	 * the lowest 32 bits, the upper one in 16-bit lane 4 (the start of the upper half) */
	const uint32_t lower_sum = _mm_cvtsi128_si32(half_sums);
	const uint32_t upper_sum = _mm_extract_epi16(half_sums, 4);

	return lower_sum + upper_sum;
}
#endif
1765
1766 /* This assumes that `string` is valid UTF-8
1767 * In UTF-8, the only bytes which do not start a new codepoint are 0x80-0xBF (continuation bytes)
1768 * Interpreted as signed integers, those are all byte values less than -64
1769 * A fast way to get the length of a UTF-8 string is to start with its byte length,
1770 * then subtract off the number of continuation bytes */
mb_fast_strlen_utf8(unsigned char * p,size_t len)1771 static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
1772 {
1773 unsigned char *e = p + len;
1774
1775 #ifdef __SSE2__
1776 if (len >= sizeof(__m128i)) {
1777 e -= sizeof(__m128i);
1778
1779 const __m128i threshold = _mm_set1_epi8(-64);
1780 const __m128i delta = _mm_set1_epi8(1);
1781 __m128i counter = _mm_setzero_si128(); /* Vector of 16 continuation-byte counters */
1782
1783 unsigned char reset_counter = 255;
1784 do {
1785 __m128i operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */
1786 __m128i lt = _mm_cmplt_epi8(operand, threshold); /* Find all which are continuation bytes */
1787 counter = _mm_add_epi8(counter, _mm_and_si128(lt, delta)); /* Update the 16 counters */
1788
1789 /* The counters can only go up to 255, so every 255 iterations, fold them into `len`
1790 * and reset them to zero */
1791 if (--reset_counter == 0) {
1792 len -= _mm_sum_epu8(counter);
1793 counter = _mm_setzero_si128();
1794 reset_counter = 255;
1795 }
1796
1797 p += sizeof(__m128i);
1798 } while (p <= e);
1799
1800 e += sizeof(__m128i);
1801 len -= _mm_sum_epu8(counter); /* Fold in any remaining non-zero values in the 16 counters */
1802 }
1803 #endif
1804
1805 /* Check for continuation bytes in the 0-15 remaining bytes at the end of the string */
1806 while (p < e) {
1807 signed char c = *p++;
1808 if (c < -64) {
1809 len--;
1810 }
1811 }
1812
1813 return len;
1814 }
1815
/* Return the number of characters (codepoints) in `string` when interpreted as `encoding`
 *
 * Three strategies, from cheapest to most expensive:
 * 1. Encodings flagged as fixed-width: byte length divided by the masked flag value
 * 2. UTF-8 text already marked as valid: count non-continuation bytes
 * 3. Anything else: decode the whole string and count the codepoints produced */
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
{
	size_t byte_len = ZSTR_LEN(string);

	/* The masked flag value is used directly as the number of bytes per character */
	unsigned int bytes_per_char = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (bytes_per_char != 0) {
		return byte_len / bytes_per_char;
	}

	if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && ZSTR_IS_VALID_UTF8(string)) {
		return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), byte_len);
	}

	/* Generic path: the decoded codepoints themselves are discarded; only the
	 * count returned by each to_wchar call is accumulated */
	uint32_t scratch[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t remaining = byte_len;
	unsigned int state = 0;
	size_t total = 0;

	while (remaining > 0) {
		total += encoding->to_wchar(&cursor, &remaining, scratch, 128, &state);
	}

	return total;
}
1837
1838 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1839 PHP_FUNCTION(mb_strlen)
1840 {
1841 zend_string *string, *enc_name = NULL;
1842
1843 ZEND_PARSE_PARAMETERS_START(1, 2)
1844 Z_PARAM_STR(string)
1845 Z_PARAM_OPTIONAL
1846 Z_PARAM_STR_OR_NULL(enc_name)
1847 ZEND_PARSE_PARAMETERS_END();
1848
1849 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1850 if (!enc) {
1851 RETURN_THROWS();
1852 }
1853
1854 RETVAL_LONG(mb_get_strlen(string, enc));
1855 }
1856 /* }}} */
1857
1858 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)1859 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
1860 {
1861 return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
1862 }
1863
offset_to_pointer_utf8(unsigned char * str,unsigned char * end,ssize_t offset)1864 static unsigned char* offset_to_pointer_utf8(unsigned char *str, unsigned char *end, ssize_t offset) {
1865 if (offset < 0) {
1866 unsigned char *pos = end;
1867 while (offset < 0) {
1868 if (pos <= str) {
1869 return NULL;
1870 }
1871
1872 unsigned char c = *--pos;
1873 if (c < 0x80 || (c & 0xC0) != 0x80) {
1874 offset++;
1875 }
1876 }
1877 return pos;
1878 } else {
1879 const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
1880 unsigned char *pos = str;
1881 while (offset-- > 0) {
1882 if (pos >= end) {
1883 return NULL;
1884 }
1885 pos += u8_tbl[*pos];
1886 }
1887 return pos;
1888 }
1889 }
1890
/* Inverse of offset_to_pointer_utf8 for non-negative offsets: the character offset
 * of `pos` is the number of characters in the bytes from `start` up to `pos` */
static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) {
	size_t byte_span = pos - start;
	return mb_fast_strlen_utf8(start, byte_span);
}
1894
/* Find `needle` in `haystack` and return its position as a character offset
 *
 * haystack, needle: strings in encoding `enc`
 * offset:  character offset at which the search starts; negative values count
 *          from the end of the haystack
 * reverse: false to find the first occurrence, true to find the last one
 *
 * Returns a character offset, or one of the MBFL_ERROR_* codes:
 * MBFL_ERROR_OFFSET if `offset` lies outside the haystack,
 * MBFL_ERROR_NOT_FOUND if there is no match
 *
 * Strings which are not in a UTF-8 encoding are converted to UTF-8 first; the
 * search is then done on raw bytes and the result translated back to a
 * character offset */
static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse)
{
	size_t result;
	zend_string *haystack_u8 = NULL, *needle_u8 = NULL;
	unsigned char *offset_pointer;

	if (!php_mb_is_no_encoding_utf8(enc->no_encoding)) {
		unsigned int num_errors = 0;
		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
	} else {
		haystack_u8 = haystack;
		needle_u8 = needle;
	}

	unsigned char *haystack_start = (unsigned char*)ZSTR_VAL(haystack_u8);
	unsigned char *haystack_end = haystack_start + ZSTR_LEN(haystack_u8);

	offset_pointer = offset_to_pointer_utf8(haystack_start, haystack_end, offset);
	if (!offset_pointer) {
		result = MBFL_ERROR_OFFSET;
		goto out;
	}

	result = MBFL_ERROR_NOT_FOUND;
	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
		goto out;
	}

	const char *found_pos;
	if (!reverse) {
		found_pos = zend_memnstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)haystack_end);
	} else if (offset >= 0) {
		found_pos = zend_memnrstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)haystack_end);
	} else {
		/* Reverse search with a negative offset: a match may start at or before
		 * `offset_pointer`, so the searched region must extend one needle length
		 * (in characters) beyond it
		 * The needle must be measured in its UTF-8 form; the original bytes may be
		 * in some other encoding, and counting them as UTF-8 gives a wrong length */
		size_t needle_len = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(needle_u8), (unsigned char*)ZSTR_VAL(needle_u8) + ZSTR_LEN(needle_u8));
		offset_pointer = offset_to_pointer_utf8(offset_pointer, haystack_end, needle_len);
		if (!offset_pointer) {
			/* Fewer than `needle_len` characters follow; search up to the end */
			offset_pointer = haystack_end;
		}

		found_pos = zend_memnrstr(ZSTR_VAL(haystack_u8), ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)offset_pointer);
	}

	if (found_pos) {
		result = pointer_to_offset_utf8(haystack_start, (unsigned char*)found_pos);
	}

out:
	/* Only release the UTF-8 copies which were created by this function */
	if (haystack_u8 != haystack) {
		zend_string_free(haystack_u8);
	}
	if (needle_u8 != needle) {
		zend_string_free(needle_u8);
	}
	return result;
}
1949
handle_strpos_error(size_t error)1950 static void handle_strpos_error(size_t error) {
1951 switch (error) {
1952 case MBFL_ERROR_NOT_FOUND:
1953 break;
1954 case MBFL_ERROR_ENCODING:
1955 php_error_docref(NULL, E_WARNING, "Conversion error");
1956 break;
1957 case MBFL_ERROR_OFFSET:
1958 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1959 break;
1960 default:
1961 zend_value_error("mb_strpos(): Unknown error");
1962 break;
1963 }
1964 }
1965
PHP_FUNCTION(mb_strpos)1966 PHP_FUNCTION(mb_strpos)
1967 {
1968 zend_long offset = 0;
1969 zend_string *needle, *haystack;
1970 zend_string *enc_name = NULL;
1971
1972 ZEND_PARSE_PARAMETERS_START(2, 4)
1973 Z_PARAM_STR(haystack)
1974 Z_PARAM_STR(needle)
1975 Z_PARAM_OPTIONAL
1976 Z_PARAM_LONG(offset)
1977 Z_PARAM_STR_OR_NULL(enc_name)
1978 ZEND_PARSE_PARAMETERS_END();
1979
1980 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
1981 if (!enc) {
1982 RETURN_THROWS();
1983 }
1984
1985 size_t n = mb_find_strpos(haystack, needle, enc, offset, false);
1986 if (!mbfl_is_error(n)) {
1987 RETVAL_LONG(n);
1988 } else {
1989 handle_strpos_error(n);
1990 RETVAL_FALSE;
1991 }
1992 }
1993
1994 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)1995 PHP_FUNCTION(mb_strrpos)
1996 {
1997 zend_long offset = 0;
1998 zend_string *needle, *haystack;
1999 zend_string *enc_name = NULL;
2000
2001 ZEND_PARSE_PARAMETERS_START(2, 4)
2002 Z_PARAM_STR(haystack)
2003 Z_PARAM_STR(needle)
2004 Z_PARAM_OPTIONAL
2005 Z_PARAM_LONG(offset)
2006 Z_PARAM_STR_OR_NULL(enc_name)
2007 ZEND_PARSE_PARAMETERS_END();
2008
2009 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
2010 if (!enc) {
2011 RETURN_THROWS();
2012 }
2013
2014 size_t n = mb_find_strpos(haystack, needle, enc, offset, true);
2015 if (!mbfl_is_error(n)) {
2016 RETVAL_LONG(n);
2017 } else {
2018 handle_strpos_error(n);
2019 RETVAL_FALSE;
2020 }
2021 }
2022 /* }}} */
2023
2024 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)2025 PHP_FUNCTION(mb_stripos)
2026 {
2027 zend_long offset = 0;
2028 zend_string *haystack, *needle;
2029 zend_string *from_encoding = NULL;
2030
2031 ZEND_PARSE_PARAMETERS_START(2, 4)
2032 Z_PARAM_STR(haystack)
2033 Z_PARAM_STR(needle)
2034 Z_PARAM_OPTIONAL
2035 Z_PARAM_LONG(offset)
2036 Z_PARAM_STR_OR_NULL(from_encoding)
2037 ZEND_PARSE_PARAMETERS_END();
2038
2039 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2040 if (!enc) {
2041 RETURN_THROWS();
2042 }
2043
2044 size_t n = php_mb_stripos(false, haystack, needle, offset, enc);
2045
2046 if (!mbfl_is_error(n)) {
2047 RETVAL_LONG(n);
2048 } else {
2049 handle_strpos_error(n);
2050 RETVAL_FALSE;
2051 }
2052 }
2053 /* }}} */
2054
2055 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)2056 PHP_FUNCTION(mb_strripos)
2057 {
2058 zend_long offset = 0;
2059 zend_string *haystack, *needle;
2060 zend_string *from_encoding = NULL;
2061
2062 ZEND_PARSE_PARAMETERS_START(2, 4)
2063 Z_PARAM_STR(haystack)
2064 Z_PARAM_STR(needle)
2065 Z_PARAM_OPTIONAL
2066 Z_PARAM_LONG(offset)
2067 Z_PARAM_STR_OR_NULL(from_encoding)
2068 ZEND_PARSE_PARAMETERS_END();
2069
2070 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2071 if (!enc) {
2072 RETURN_THROWS();
2073 }
2074
2075 size_t n = php_mb_stripos(true, haystack, needle, offset, enc);
2076
2077 if (!mbfl_is_error(n)) {
2078 RETVAL_LONG(n);
2079 } else {
2080 handle_strpos_error(n);
2081 RETVAL_FALSE;
2082 }
2083 }
2084 /* }}} */
2085
mb_get_substr_slow(unsigned char * in,size_t in_len,size_t from,size_t len,const mbfl_encoding * enc)2086 static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t from, size_t len, const mbfl_encoding *enc)
2087 {
2088 uint32_t wchar_buf[128];
2089 unsigned int state = 0;
2090
2091 mb_convert_buf buf;
2092 mb_convert_buf_init(&buf, MIN(len, in_len - from), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2093
2094 while (in_len && len) {
2095 size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2096 ZEND_ASSERT(out_len <= 128);
2097
2098 if (from >= out_len) {
2099 from -= out_len;
2100 } else {
2101 size_t needed_codepoints = MIN(out_len - from, len);
2102 enc->from_wchar(wchar_buf + from, needed_codepoints, &buf, !in_len || out_len >= len);
2103 from = 0;
2104 len -= needed_codepoints;
2105 }
2106 }
2107
2108 return mb_convert_buf_result(&buf, enc);
2109 }
2110
/* Return the substring of `input` which starts `from` characters in and is at
 * most `len` characters long, interpreting `input` as encoding `enc`
 *
 * Fixed-width encodings are sliced directly by byte offset; everything else is
 * handed to mb_get_substr_slow */
static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
		/* Other than MacJapanese, no supported text encoding decodes to
		 * more than one codepoint per byte
		 * So if the number of codepoints to skip >= number of input bytes,
		 * then definitely the output should be empty */
		return zend_empty_string;
	}

	/* Does each codepoint have a fixed byte width? */
	unsigned int flag = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (flag) {
		/* The value of the flag is 2 if each codepoint takes 2 bytes, or 4 if 4 bytes
		 * `from` is known to be less than `in_len` here, so scaling it cannot overflow */
		from *= flag;
		if (from >= in_len) {
			return zend_empty_string;
		}
		in += from;
		in_len -= from;

		/* `len` can be arbitrarily large (e.g. MBFL_SUBSTR_UNTIL_END), so compare
		 * before multiplying; `len * flag` could wrap around to a small number and
		 * cut the result short
		 * len > in_len / flag  is equivalent to  len * flag > in_len  */
		if (len > in_len / flag) {
			len = in_len;
		} else {
			len *= flag;
		}
		return zend_string_init_fast((const char*)in, len);
	}

	return mb_get_substr_slow(in, in_len, from, len, enc);
}
2143
2144 #define MB_STRSTR 1
2145 #define MB_STRRCHR 2
2146 #define MB_STRISTR 3
2147 #define MB_STRRICHR 4
2148
/* Shared implementation of mb_strstr, mb_strrchr, mb_stristr and mb_strrichr
 *
 * `variant` is one of the MB_STR* codes and selects the search direction and
 * case sensitivity. PHP-level parameters:
 * string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null
 *
 * Returns the part of the haystack before the match if the bool is true,
 * otherwise the part from the match to the end; false if there is no match */
static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
{
	zend_string *haystack, *needle;
	zend_string *encoding_name = NULL;
	bool part = false;

	ZEND_PARSE_PARAMETERS_START(2, 4)
		Z_PARAM_STR(haystack)
		Z_PARAM_STR(needle)
		Z_PARAM_OPTIONAL
		Z_PARAM_BOOL(part)
		Z_PARAM_STR_OR_NULL(encoding_name)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding_name, 4);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	const bool search_backwards = (variant == MB_STRRCHR || variant == MB_STRRICHR);
	const bool ignore_case = (variant == MB_STRISTR || variant == MB_STRRICHR);

	size_t pos;
	if (ignore_case) {
		pos = php_mb_stripos(search_backwards, haystack, needle, 0, enc);
	} else {
		pos = mb_find_strpos(haystack, needle, enc, 0, search_backwards);
	}

	if (mbfl_is_error(pos)) {
		// FIXME use handle_strpos_error(pos)
		RETURN_FALSE;
	}

	if (part) {
		/* Everything before the match */
		RETURN_STR(mb_get_substr(haystack, 0, pos, enc));
	} else {
		/* From the start of the match to the end of the haystack */
		RETURN_STR(mb_get_substr(haystack, pos, MBFL_SUBSTR_UNTIL_END, enc));
	}
}
2190
/* {{{ Finds first occurrence of a string within another
 * Thin entry point: parameter parsing and the search itself are done by
 * php_mb_strstr_variants, selected here with the MB_STRSTR variant code */
PHP_FUNCTION(mb_strstr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
}
/* }}} */
2197
/* {{{ Finds the last occurrence of a character in a string within another
 * Thin entry point: parameter parsing and the search itself are done by
 * php_mb_strstr_variants, selected here with the MB_STRRCHR variant code */
PHP_FUNCTION(mb_strrchr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
}
/* }}} */
2204
/* {{{ Finds first occurrence of a string within another, case insensitive
 * Thin entry point: parameter parsing and the search itself are done by
 * php_mb_strstr_variants, selected here with the MB_STRISTR variant code */
PHP_FUNCTION(mb_stristr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
}
/* }}} */
2211
/* {{{ Finds the last occurrence of a character in a string within another, case insensitive
 * Thin entry point: parameter parsing and the search itself are done by
 * php_mb_strstr_variants, selected here with the MB_STRRICHR variant code */
PHP_FUNCTION(mb_strrichr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
}
/* }}} */
2218
2219 #undef MB_STRSTR
2220 #undef MB_STRRCHR
2221 #undef MB_STRISTR
2222 #undef MB_STRRICHR
2223
PHP_FUNCTION(mb_substr_count)2224 PHP_FUNCTION(mb_substr_count)
2225 {
2226 zend_string *haystack, *needle, *enc_name = NULL, *haystack_u8 = NULL, *needle_u8 = NULL;
2227
2228 ZEND_PARSE_PARAMETERS_START(2, 3)
2229 Z_PARAM_STR(haystack)
2230 Z_PARAM_STR(needle)
2231 Z_PARAM_OPTIONAL
2232 Z_PARAM_STR_OR_NULL(enc_name)
2233 ZEND_PARSE_PARAMETERS_END();
2234
2235 if (ZSTR_LEN(needle) == 0) {
2236 zend_argument_value_error(2, "must not be empty");
2237 RETURN_THROWS();
2238 }
2239
2240 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
2241 if (!enc) {
2242 RETURN_THROWS();
2243 }
2244
2245 if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
2246 /* No need to do any conversion if haystack/needle are already known-valid UTF-8
2247 * (If they are not valid, then not passing them through conversion filters could affect output) */
2248 if (ZSTR_IS_VALID_UTF8(haystack)) {
2249 haystack_u8 = haystack;
2250 } else {
2251 unsigned int num_errors = 0;
2252 haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2253 if (!num_errors && !ZSTR_IS_INTERNED(haystack)) {
2254 GC_ADD_FLAGS(haystack, IS_STR_VALID_UTF8);
2255 }
2256 }
2257
2258 if (ZSTR_IS_VALID_UTF8(needle)) {
2259 needle_u8 = needle;
2260 } else {
2261 unsigned int num_errors = 0;
2262 needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2263 if (!num_errors && !ZSTR_IS_INTERNED(needle)) {
2264 GC_ADD_FLAGS(needle, IS_STR_VALID_UTF8);
2265 }
2266 }
2267 } else {
2268 unsigned int num_errors = 0;
2269 haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2270 needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2271 /* A string with >0 bytes may convert to 0 codepoints; for example, the contents
2272 * may be only escape sequences */
2273 if (ZSTR_LEN(needle_u8) == 0) {
2274 zend_string_free(haystack_u8);
2275 zend_string_free(needle_u8);
2276 zend_argument_value_error(2, "must not be empty");
2277 RETURN_THROWS();
2278 }
2279 }
2280
2281 size_t result = 0;
2282
2283 if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
2284 goto out;
2285 }
2286
2287 const char *p = ZSTR_VAL(haystack_u8), *e = p + ZSTR_LEN(haystack_u8);
2288 while (true) {
2289 p = zend_memnstr(p, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), e);
2290 if (!p) {
2291 break;
2292 }
2293 p += ZSTR_LEN(needle_u8);
2294 result++;
2295 }
2296
2297 out:
2298 if (haystack_u8 != haystack) {
2299 zend_string_free(haystack_u8);
2300 }
2301 if (needle_u8 != needle) {
2302 zend_string_free(needle_u8);
2303 }
2304
2305 RETVAL_LONG(result);
2306 }
2307
2308 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2309 PHP_FUNCTION(mb_substr)
2310 {
2311 zend_string *str, *encoding = NULL;
2312 zend_long from, len;
2313 size_t real_from, real_len;
2314 bool len_is_null = true;
2315
2316 ZEND_PARSE_PARAMETERS_START(2, 4)
2317 Z_PARAM_STR(str)
2318 Z_PARAM_LONG(from)
2319 Z_PARAM_OPTIONAL
2320 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2321 Z_PARAM_STR_OR_NULL(encoding)
2322 ZEND_PARSE_PARAMETERS_END();
2323
2324 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2325 if (!enc) {
2326 RETURN_THROWS();
2327 }
2328
2329 size_t mblen = 0;
2330 if (from < 0 || (!len_is_null && len < 0)) {
2331 mblen = mb_get_strlen(str, enc);
2332 }
2333
2334 /* if "from" position is negative, count start position from the end
2335 * of the string */
2336 if (from >= 0) {
2337 real_from = (size_t) from;
2338 } else if (-from < mblen) {
2339 real_from = mblen + from;
2340 } else {
2341 real_from = 0;
2342 }
2343
2344 /* if "length" position is negative, set it to the length
2345 * needed to stop that many chars from the end of the string */
2346 if (len_is_null) {
2347 real_len = MBFL_SUBSTR_UNTIL_END;
2348 } else if (len >= 0) {
2349 real_len = (size_t) len;
2350 } else if (real_from < mblen && -len < mblen - real_from) {
2351 real_len = (mblen - real_from) + len;
2352 } else {
2353 real_len = 0;
2354 }
2355
2356 RETVAL_STR(mb_get_substr(str, real_from, real_len, enc));
2357 }
2358 /* }}} */
2359
2360 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2361 PHP_FUNCTION(mb_strcut)
2362 {
2363 zend_string *encoding = NULL;
2364 char *string_val;
2365 zend_long from, len;
2366 bool len_is_null = true;
2367 mbfl_string string, result, *ret;
2368
2369 ZEND_PARSE_PARAMETERS_START(2, 4)
2370 Z_PARAM_STRING(string_val, string.len)
2371 Z_PARAM_LONG(from)
2372 Z_PARAM_OPTIONAL
2373 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2374 Z_PARAM_STR_OR_NULL(encoding)
2375 ZEND_PARSE_PARAMETERS_END();
2376
2377 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2378 if (!enc) {
2379 RETURN_THROWS();
2380 }
2381
2382 string.val = (unsigned char*)string_val;
2383 string.encoding = enc;
2384
2385 if (len_is_null) {
2386 len = string.len;
2387 }
2388
2389 /* if "from" position is negative, count start position from the end
2390 * of the string */
2391 if (from < 0) {
2392 from = string.len + from;
2393 if (from < 0) {
2394 from = 0;
2395 }
2396 }
2397
2398 /* if "length" position is negative, set it to the length
2399 * needed to stop that many chars from the end of the string */
2400 if (len < 0) {
2401 len = (string.len - from) + len;
2402 if (len < 0) {
2403 len = 0;
2404 }
2405 }
2406
2407 if (from > string.len || len == 0) {
2408 RETURN_EMPTY_STRING();
2409 }
2410
2411 if (enc->cut) {
2412 RETURN_STR(enc->cut(string.val, from, len, string.val + string.len));
2413 }
2414
2415 unsigned int char_len = string.encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
2416 if (char_len) {
2417 /* Round `from` down to a multiple of `char_len`; works because `char_len` is a power of 2 */
2418 from &= -char_len;
2419 if (len > string.len - from) {
2420 len = string.len - from;
2421 }
2422 RETURN_STR(zend_string_init_fast((const char*)(string.val + from), len & -char_len));
2423 }
2424
2425 if (enc->mblen_table) {
2426 const unsigned char *mbtab = enc->mblen_table;
2427 const unsigned char *p, *q, *end;
2428 int m = 0;
2429 /* Search for start position */
2430 for (p = (const unsigned char*)string.val, q = p + from; p < q; p += (m = mbtab[*p]));
2431 if (p > q) {
2432 p -= m;
2433 }
2434 const unsigned char *start = p;
2435 /* Search for end position */
2436 if (len >= string.len - (start - (const unsigned char*)string.val)) {
2437 end = (const unsigned char*)(string.val + string.len);
2438 } else {
2439 for (q = p + len; p < q; p += (m = mbtab[*p]));
2440 if (p > q) {
2441 p -= m;
2442 }
2443 end = p;
2444 }
2445 RETURN_STR(zend_string_init_fast((const char*)start, end - start));
2446 }
2447
2448 ret = mbfl_strcut(&string, &result, from, len);
2449 ZEND_ASSERT(ret != NULL);
2450 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2451 efree(ret->val);
2452 }
2453 /* }}} */
2454
2455 /* Some East Asian characters, when printed at a terminal (or the like), require double
2456 * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2457 static size_t character_width(uint32_t c)
2458 {
2459 if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2460 return 1;
2461 }
2462
2463 /* Do a binary search to see if we fall in any of the fullwidth ranges */
2464 unsigned int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2465 while (lo < hi) {
2466 unsigned int probe = (lo + hi) / 2;
2467 if (c < mbfl_eaw_table[probe].begin) {
2468 hi = probe;
2469 } else if (c > mbfl_eaw_table[probe].end) {
2470 lo = probe + 1;
2471 } else {
2472 return 2;
2473 }
2474 }
2475
2476 return 1;
2477 }
2478
/* Total display width of `string` (interpreted as encoding `enc`): the sum of
 * character_width over all codepoints which the string decodes to */
static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
{
	uint32_t codepoints[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total = 0;

	while (bytes_left > 0) {
		size_t count = enc->to_wchar(&cursor, &bytes_left, codepoints, 128, &state);
		ZEND_ASSERT(count <= 128);

		/* NOTE: a 'bad input' marker contributes 1 unit of width. If the text is
		 * later converted using an ordinary ASCII character as the replacement
		 * character, that matches the width which will actually be displayed */
		for (size_t i = 0; i < count; i++) {
			total += character_width(codepoints[i]);
		}
	}

	return total;
}
2501
2502 /* Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2503 PHP_FUNCTION(mb_strwidth)
2504 {
2505 zend_string *string, *enc_name = NULL;
2506
2507 ZEND_PARSE_PARAMETERS_START(1, 2)
2508 Z_PARAM_STR(string)
2509 Z_PARAM_OPTIONAL
2510 Z_PARAM_STR_OR_NULL(enc_name)
2511 ZEND_PARSE_PARAMETERS_END();
2512
2513 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2514 if (!enc) {
2515 RETURN_THROWS();
2516 }
2517
2518 RETVAL_LONG(mb_get_strwidth(string, enc));
2519 }
2520
/* Trim `input` to fit within `width` units of display width, appending `marker`
 * if anything had to be cut off
 *
 * input:  string in encoding `enc`
 * marker: trim marker; its bytes are appended as they are, without conversion
 * from:   number of codepoints to skip at the start of `input`
 * width:  maximum display width of the result, including the marker
 *
 * Works in up to two passes over the input:
 * 1. Measure: walk the codepoints after `from`, deducting each one's width from
 *    the width budget. If the end of input is reached, everything fits and the
 *    result is the input from `from` onward.
 * 2. Build: once a codepoint does not fit, reduce `width` by the marker's width,
 *    then re-encode as many codepoints as fit into the reduced width and append
 *    the marker. If the marker alone is at least as wide as `width`, the marker
 *    itself is returned instead. */
static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, size_t from, size_t width)
{
	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);
	unsigned int state = 0;
	size_t remaining_width = width;
	size_t to_skip = from;
	size_t out_len = 0;
	bool first_call = true, input_err = false;
	mb_convert_buf buf;

	/* Pass 1: measure, without producing any output */
	while (in_len) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

		if (out_len <= to_skip) {
			/* All of these codepoints are before the starting position */
			to_skip -= out_len;
		} else {
			for (size_t i = to_skip; i < out_len; i++) {
				uint32_t w = wchar_buf[i];
				size_t current_w_width = character_width(w);

				/* Remember whether any erroneous input was seen (used below if
				 * the whole string turns out to fit) */
				input_err |= (w == MBFL_BAD_INPUT);

				if (remaining_width < current_w_width) {
					size_t marker_width = mb_get_strwidth(marker, enc);

					/* The trim marker is larger than the desired string width */
					if (width <= marker_width) {
						return zend_string_copy(marker);
					}

					/* We need to truncate string and append trim marker */
					width -= marker_width;
					/* 'width' is now the amount we want to take from 'input' */
					mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

					if (first_call) {
						/* We can use the buffer of wchars which we have right now;
						 * no need to convert again */
						goto dont_restart_conversion;
					} else {
						goto restart_conversion;
					}
				}
				remaining_width -= current_w_width;
			}
			to_skip = 0;
		}
		first_call = false;
	}

	/* The input string fits in the requested width; we don't need to append the trim marker
	 * However, if the string contains erroneous byte sequences, those should be converted
	 * to error markers */
	if (!input_err) {
		if (from == 0) {
			/* This just increments the string's refcount; it doesn't really 'copy' it */
			return zend_string_copy(input);
		} else {
			return mb_get_substr(input, from, MBFL_SUBSTR_UNTIL_END, enc);
		}
	} else {
		/* We can't use `mb_get_substr`, because it uses the fastest method possible of
		 * picking out a substring, which may not include converting erroneous byte
		 * sequences to error markers */
		return mb_get_substr_slow((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), from, MBFL_SUBSTR_UNTIL_END, enc);
	}

	/* The input string is too wide; we need to build a new string which
	 * includes some portion of the input string, with the trim marker
	 * concatenated onto it */
restart_conversion:
	/* Pass 2 starts decoding again from the very beginning of the input */
	in = (unsigned char*)ZSTR_VAL(input);
	in_len = ZSTR_LEN(input);
	state = 0;

	/* This loop is only left through the jump to `append_trim_marker` */
	while (true) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

		/* Entry point when pass 1 stopped within the first batch of codepoints:
		 * `wchar_buf` and `out_len` still describe the start of the input */
dont_restart_conversion:
		if (out_len <= from) {
			from -= out_len;
		} else {
			for (size_t i = from; i < out_len; i++) {
				size_t current_wchar_char_width = character_width(wchar_buf[i]);
				if (width < current_wchar_char_width) {
					/* Codepoint `i` does not fit; output those before it and finish */
					enc->from_wchar(wchar_buf + from, i - from, &buf, true);
					goto append_trim_marker;
				}
				width -= current_wchar_char_width;
			}
			/* The whole batch fits; since pass 1 found that the input is too
			 * wide, more input is expected to follow */
			ZEND_ASSERT(in_len > 0);
			enc->from_wchar(wchar_buf + from, out_len - from, &buf, false);
			from = 0;
		}
	}

append_trim_marker:
	if (ZSTR_LEN(marker) > 0) {
		/* Make room in the output buffer, then copy the marker's bytes in */
		MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
		buf.out = zend_mempcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
	}

	/* Even if `enc` is UTF-8, don't mark the output string as valid UTF-8, because
	 * we have no guarantee that the trim marker string is valid UTF-8 */
	return mb_convert_buf_result_raw(&buf);
}
2631
2632 /* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
PHP_FUNCTION(mb_strimwidth)2633 PHP_FUNCTION(mb_strimwidth)
2634 {
2635 zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2636 zend_long from, width;
2637
2638 ZEND_PARSE_PARAMETERS_START(3, 5)
2639 Z_PARAM_STR(str)
2640 Z_PARAM_LONG(from)
2641 Z_PARAM_LONG(width)
2642 Z_PARAM_OPTIONAL
2643 Z_PARAM_STR(trimmarker)
2644 Z_PARAM_STR_OR_NULL(encoding)
2645 ZEND_PARSE_PARAMETERS_END();
2646
2647 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2648 if (!enc) {
2649 RETURN_THROWS();
2650 }
2651
2652 if (from != 0) {
2653 size_t str_len = mb_get_strlen(str, enc);
2654 if (from < 0) {
2655 from += str_len;
2656 }
2657 if (from < 0 || from > str_len) {
2658 zend_argument_value_error(2, "is out of range");
2659 RETURN_THROWS();
2660 }
2661 }
2662
2663 if (width < 0) {
2664 php_error_docref(NULL, E_DEPRECATED,
2665 "passing a negative integer to argument #3 ($width) is deprecated");
2666 width += mb_get_strwidth(str, enc);
2667
2668 if (from > 0) {
2669 zend_string *trimmed = mb_get_substr(str, 0, from, enc);
2670 width -= mb_get_strwidth(trimmed, enc);
2671 zend_string_free(trimmed);
2672 }
2673
2674 if (width < 0) {
2675 zend_argument_value_error(3, "is out of range");
2676 RETURN_THROWS();
2677 }
2678 }
2679
2680 RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2681 }
2682
2683
2684 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2685 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2686 {
2687 return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2688 || (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2689 || (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2690 || (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2691 }
2692
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2693 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2694 {
2695 unsigned int num_errors = 0;
2696 zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2697 MBSTRG(illegalchars) += num_errors;
2698 return result;
2699 }
2700
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2701 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2702 {
2703 const mbfl_encoding *from_encoding;
2704
2705 /* pre-conversion encoding */
2706 ZEND_ASSERT(num_from_encodings >= 1);
2707 if (num_from_encodings == 1) {
2708 from_encoding = *from_encodings;
2709 } else {
2710 /* auto detect */
2711 from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true);
2712 if (!from_encoding) {
2713 php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2714 return NULL;
2715 }
2716 }
2717
2718 return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2719 }
2720
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2721 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2722 {
2723 HashTable *output, *chash;
2724 zend_long idx;
2725 zend_string *key;
2726 zval *entry, entry_tmp;
2727
2728 if (!input) {
2729 return NULL;
2730 }
2731
2732 if (GC_IS_RECURSIVE(input)) {
2733 GC_UNPROTECT_RECURSION(input);
2734 php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2735 return NULL;
2736 }
2737 GC_TRY_PROTECT_RECURSION(input);
2738 output = zend_new_array(zend_hash_num_elements(input));
2739 ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2740 /* convert key */
2741 if (key) {
2742 zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2743 if (!converted_key) {
2744 continue;
2745 }
2746 key = converted_key;
2747 }
2748 /* convert value */
2749 ZEND_ASSERT(entry);
2750 try_again:
2751 switch(Z_TYPE_P(entry)) {
2752 case IS_STRING: {
2753 zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2754 if (!converted_key) {
2755 if (key) {
2756 zend_string_release(key);
2757 }
2758 continue;
2759 }
2760 ZVAL_STR(&entry_tmp, converted_key);
2761 break;
2762 }
2763 case IS_NULL:
2764 case IS_TRUE:
2765 case IS_FALSE:
2766 case IS_LONG:
2767 case IS_DOUBLE:
2768 ZVAL_COPY(&entry_tmp, entry);
2769 break;
2770 case IS_ARRAY:
2771 chash = php_mb_convert_encoding_recursive(
2772 Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2773 if (chash) {
2774 ZVAL_ARR(&entry_tmp, chash);
2775 } else {
2776 ZVAL_EMPTY_ARRAY(&entry_tmp);
2777 }
2778 break;
2779 case IS_REFERENCE:
2780 entry = Z_REFVAL_P(entry);
2781 goto try_again;
2782 case IS_OBJECT:
2783 default:
2784 if (key) {
2785 zend_string_release(key);
2786 }
2787 php_error_docref(NULL, E_WARNING, "Object is not supported");
2788 continue;
2789 }
2790 if (key) {
2791 zend_hash_add(output, key, &entry_tmp);
2792 zend_string_release(key);
2793 } else {
2794 zend_hash_index_add(output, idx, &entry_tmp);
2795 }
2796 } ZEND_HASH_FOREACH_END();
2797 GC_TRY_UNPROTECT_RECURSION(input);
2798
2799 return output;
2800 }
2801 /* }}} */
2802
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2803 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2804 {
2805 /* mbstring supports some 'text encodings' which aren't really text encodings
2806 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2807 * These should never be returned by `mb_detect_encoding`. */
2808 unsigned int shift = 0;
2809 for (unsigned int i = 0; i < *size; i++) {
2810 const mbfl_encoding *encoding = elist[i];
2811 if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2812 shift++; /* Remove this encoding from the list */
2813 } else if (shift) {
2814 elist[i - shift] = encoding;
2815 }
2816 }
2817 *size -= shift;
2818 }
2819
2820 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2821 PHP_FUNCTION(mb_convert_encoding)
2822 {
2823 zend_string *to_encoding_name;
2824 zend_string *input_str, *from_encodings_str = NULL;
2825 HashTable *input_ht, *from_encodings_ht = NULL;
2826 const mbfl_encoding **from_encodings;
2827 size_t num_from_encodings;
2828 bool free_from_encodings = false;
2829
2830 ZEND_PARSE_PARAMETERS_START(2, 3)
2831 Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2832 Z_PARAM_STR(to_encoding_name)
2833 Z_PARAM_OPTIONAL
2834 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2835 ZEND_PARSE_PARAMETERS_END();
2836
2837 const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2838 if (!to_encoding) {
2839 RETURN_THROWS();
2840 }
2841
2842 if (from_encodings_ht) {
2843 if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2844 RETURN_THROWS();
2845 }
2846 free_from_encodings = true;
2847 } else if (from_encodings_str) {
2848 if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2849 &from_encodings, &num_from_encodings,
2850 /* persistent */ 0, /* arg_num */ 3) == FAILURE) {
2851 RETURN_THROWS();
2852 }
2853 free_from_encodings = true;
2854 } else {
2855 from_encodings = &MBSTRG(current_internal_encoding);
2856 num_from_encodings = 1;
2857 }
2858
2859 if (num_from_encodings > 1) {
2860 remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2861 }
2862
2863 if (!num_from_encodings) {
2864 efree(ZEND_VOIDP(from_encodings));
2865 zend_argument_value_error(3, "must specify at least one encoding");
2866 RETURN_THROWS();
2867 }
2868
2869 if (input_str) {
2870 zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2871 if (ret != NULL) {
2872 RETVAL_STR(ret);
2873 } else {
2874 RETVAL_FALSE;
2875 }
2876 } else {
2877 HashTable *tmp;
2878 tmp = php_mb_convert_encoding_recursive(
2879 input_ht, to_encoding, from_encodings, num_from_encodings);
2880 RETVAL_ARR(tmp);
2881 }
2882
2883 if (free_from_encodings) {
2884 efree(ZEND_VOIDP(from_encodings));
2885 }
2886 }
2887 /* }}} */
2888
mbstring_convert_case(php_case_mode case_mode,const char * str,size_t str_len,const mbfl_encoding * enc)2889 static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
2890 {
2891 return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2892 }
2893
PHP_FUNCTION(mb_convert_case)2894 PHP_FUNCTION(mb_convert_case)
2895 {
2896 zend_string *str, *from_encoding = NULL;
2897 zend_long case_mode = 0;
2898
2899 ZEND_PARSE_PARAMETERS_START(2, 3)
2900 Z_PARAM_STR(str)
2901 Z_PARAM_LONG(case_mode)
2902 Z_PARAM_OPTIONAL
2903 Z_PARAM_STR_OR_NULL(from_encoding)
2904 ZEND_PARSE_PARAMETERS_END();
2905
2906 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2907 if (!enc) {
2908 RETURN_THROWS();
2909 }
2910
2911 if (case_mode < 0 || case_mode >= PHP_UNICODE_CASE_MODE_MAX) {
2912 zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2913 RETURN_THROWS();
2914 }
2915
2916 RETURN_STR(mbstring_convert_case(case_mode, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2917 }
2918
PHP_FUNCTION(mb_strtoupper)2919 PHP_FUNCTION(mb_strtoupper)
2920 {
2921 zend_string *str, *from_encoding = NULL;
2922
2923 ZEND_PARSE_PARAMETERS_START(1, 2)
2924 Z_PARAM_STR(str)
2925 Z_PARAM_OPTIONAL
2926 Z_PARAM_STR_OR_NULL(from_encoding)
2927 ZEND_PARSE_PARAMETERS_END();
2928
2929 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2930 if (!enc) {
2931 RETURN_THROWS();
2932 }
2933
2934 RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_UPPER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2935 }
2936
PHP_FUNCTION(mb_strtolower)2937 PHP_FUNCTION(mb_strtolower)
2938 {
2939 zend_string *str, *from_encoding = NULL;
2940
2941 ZEND_PARSE_PARAMETERS_START(1, 2)
2942 Z_PARAM_STR(str)
2943 Z_PARAM_OPTIONAL
2944 Z_PARAM_STR_OR_NULL(from_encoding)
2945 ZEND_PARSE_PARAMETERS_END();
2946
2947 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2948 if (!enc) {
2949 RETURN_THROWS();
2950 }
2951
2952 RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2953 }
2954
/* Shared implementation of mb_ucfirst() and mb_lcfirst(): case-convert only
 * the first character of the string using `mode`. When the conversion leaves
 * that character unchanged, the original string is returned (with an added
 * reference) instead of building a new one. */
static void php_mb_ulcfirst(INTERNAL_FUNCTION_PARAMETERS, php_case_mode mode)
{
	zend_string *str;
	zend_string *from_encoding = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 2)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR_OR_NULL(from_encoding)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	zend_string *first_char = mb_get_substr(str, 0, 1, enc);
	zend_string *converted_first = mbstring_convert_case(mode, ZSTR_VAL(first_char), ZSTR_LEN(first_char), enc);
	zend_string *result;

	if (!zend_string_equals(first_char, converted_first)) {
		/* Glue the converted first character onto the untouched remainder */
		zend_string *rest = mb_get_substr(str, 1, MBFL_SUBSTR_UNTIL_END, enc);
		result = zend_string_concat2(
			ZSTR_VAL(converted_first), ZSTR_LEN(converted_first),
			ZSTR_VAL(rest), ZSTR_LEN(rest));
		zend_string_release_ex(rest, false);
	} else {
		result = zend_string_copy(str);
	}

	zend_string_release_ex(first_char, false);
	zend_string_release_ex(converted_first, false);

	RETVAL_STR(result);
}
2988
/* Delegates to php_mb_ulcfirst, converting the first character with
 * PHP_UNICODE_CASE_TITLE */
PHP_FUNCTION(mb_ucfirst)
{
	php_mb_ulcfirst(INTERNAL_FUNCTION_PARAM_PASSTHRU, PHP_UNICODE_CASE_TITLE);
}
2993
/* Delegates to php_mb_ulcfirst, converting the first character with
 * PHP_UNICODE_CASE_LOWER */
PHP_FUNCTION(mb_lcfirst)
{
	php_mb_ulcfirst(INTERNAL_FUNCTION_PARAM_PASSTHRU, PHP_UNICODE_CASE_LOWER);
}
2998
/* Bit flags selecting which end(s) of a string are trimmed. The values are
 * tested with `&`, so MB_BOTH_TRIM is simply both single-end flags combined. */
typedef enum {
	MB_LTRIM     = 1 << 0,
	MB_RTRIM     = 1 << 1,
	MB_BOTH_TRIM = MB_LTRIM | MB_RTRIM
} mb_trim_mode;
3004
/* Is codepoint `w` one of the characters to trim?
 * When `ht` is given, membership is an integer-key lookup in that table and
 * `default_chars` is ignored; otherwise `default_chars` (holding
 * `default_chars_length` codepoints) is searched linearly. */
static bool is_trim_wchar(uint32_t w, const HashTable *ht, const uint32_t *default_chars, size_t default_chars_length)
{
	if (ht != NULL) {
		return zend_hash_index_exists(ht, w);
	}

	size_t remaining = default_chars_length;
	while (remaining > 0) {
		remaining--;
		if (default_chars[remaining] == w) {
			return true;
		}
	}
	return false;
}
3018
/* Decode `str` in chunks of up to 128 codepoints and count how many trimmable
 * codepoints sit at its start and/or end (depending on `mode`), then return
 * the substring between them. Returns `str` itself (with an added reference)
 * when there is nothing to trim.
 *
 * The set of trimmable codepoints is given either by `what_ht` or by the
 * `default_chars` array; see is_trim_wchar. */
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, const uint32_t *default_chars, size_t default_chars_length, mb_trim_mode mode, const mbfl_encoding *enc)
{
	uint32_t codepoints[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(str);
	size_t bytes_left = ZSTR_LEN(str);
	unsigned int decoder_state = 0;
	size_t leading = 0, trailing = 0, num_codepoints = 0;

	while (bytes_left) {
		size_t decoded = enc->to_wchar(&cursor, &bytes_left, codepoints, 128, &decoder_state);
		ZEND_ASSERT(decoded <= 128);
		num_codepoints += decoded;

		for (size_t i = 0; i < decoded; i++) {
			if (!is_trim_wchar(codepoints[i], what_ht, default_chars, default_chars_length)) {
				/* First non-trimmable codepoint ends the leading run for good,
				 * and any trailing run seen so far was not really trailing */
				mode &= ~MB_LTRIM;
				if (mode & MB_RTRIM) {
					trailing = 0;
				}
				continue;
			}
			if (mode & MB_LTRIM) {
				leading++;
			}
			if (mode & MB_RTRIM) {
				trailing++;
			}
		}
	}

	if (leading == 0 && trailing == 0) {
		return zend_string_copy(str);
	}
	/* NOTE(review): if every codepoint is trimmable and both ends are being
	 * trimmed, leading + trailing exceeds num_codepoints and the unsigned
	 * length below wraps around; presumably mb_get_substr still yields an
	 * empty string because the start offset is already at the end - confirm. */
	return mb_get_substr(str, leading, num_codepoints - (trailing + leading), enc);
}
3058
/* Trim `str` using the built-in list of codepoints below. The list is loaded
 * into a temporary hash table keyed by codepoint so that trim_each_wchar can
 * test membership with a lookup. */
static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
{
	static const uint32_t default_trim_codepoints[] = {
		0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
		0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
		0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
		0x85, 0x180E
	};
	const size_t count = sizeof(default_trim_codepoints) / sizeof(default_trim_codepoints[0]);

	HashTable trim_set;
	zval marker;
	ZVAL_TRUE(&marker);

	zend_hash_init(&trim_set, count, NULL, NULL, false);
	for (size_t i = 0; i < count; i++) {
		/* Entries are unique, so the 'add_new' variant is used */
		zend_hash_index_add_new(&trim_set, default_trim_codepoints[i], &marker);
	}

	zend_string *trimmed = trim_each_wchar(str, &trim_set, NULL, 0, mode, enc);
	zend_hash_destroy(&trim_set);

	return trimmed;
}
3083
/* Trim `str` using the codepoints found in `what` (decoded with `enc`).
 *
 * If the first decoded chunk of `what` holds at most 4 codepoints, those are
 * handed to trim_each_wchar as a plain array. Otherwise all codepoints of
 * `what` are collected in a temporary hash table, which is used for lookups.
 * An empty `what` returns `str` unchanged (with an added reference). */
static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
{
	uint32_t codepoints[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(what);
	size_t what_len = ZSTR_LEN(what);
	unsigned int decoder_state = 0;
	HashTable trim_set;
	zval marker;
	bool have_set = false;

	while (what_len) {
		size_t decoded = enc->to_wchar(&cursor, &what_len, codepoints, 128, &decoder_state);
		ZEND_ASSERT(decoded <= 128);

		if (!have_set) {
			if (decoded <= 4) {
				/* Few enough codepoints that a linear scan is used instead of a table */
				return trim_each_wchar(str, NULL, codepoints, decoded, mode, enc);
			}
			have_set = true;
			ZVAL_TRUE(&marker);
			/* what_len has already been updated by to_wchar above, so the size
			 * hint given here is the number of bytes not yet decoded */
			zend_hash_init(&trim_set, what_len, NULL, NULL, false);
		}

		for (size_t i = 0; i < decoded; i++) {
			zend_hash_index_add(&trim_set, codepoints[i], &marker);
		}
	}

	if (UNEXPECTED(!have_set)) {
		/* This is only possible if what is empty */
		return zend_string_copy(str);
	}

	zend_string *trimmed = trim_each_wchar(str, &trim_set, NULL, 0, mode, enc);
	zend_hash_destroy(&trim_set);

	return trimmed;
}
3123
/* Shared implementation of mb_trim(), mb_ltrim() and mb_rtrim(). When the
 * optional second argument is omitted or null, the built-in character list
 * is used; otherwise the characters in that argument are trimmed. */
static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
{
	zend_string *str;
	zend_string *what = NULL, *encoding = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 3)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR_OR_NULL(what)
		Z_PARAM_STR_OR_NULL(encoding)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	zend_string *trimmed;
	if (what == NULL) {
		trimmed = mb_trim_default_chars(str, mode, enc);
	} else {
		trimmed = mb_trim_what_chars(str, what, mode, enc);
	}
	RETURN_STR(trimmed);
}
3148
/* Delegates to php_do_mb_trim with MB_BOTH_TRIM (strip both ends) */
PHP_FUNCTION(mb_trim)
{
	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_BOTH_TRIM);
}
3153
/* Delegates to php_do_mb_trim with MB_LTRIM (strip the start only) */
PHP_FUNCTION(mb_ltrim)
{
	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_LTRIM);
}
3158
/* Delegates to php_do_mb_trim with MB_RTRIM (strip the end only) */
PHP_FUNCTION(mb_rtrim)
{
	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
}
3163
duplicate_elist(const mbfl_encoding ** elist,size_t size)3164 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
3165 {
3166 const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
3167 memcpy(ZEND_VOIDP(new_elist), elist, size * sizeof(mbfl_encoding*));
3168 return new_elist;
3169 }
3170
estimate_demerits(uint32_t w)3171 static unsigned int estimate_demerits(uint32_t w)
3172 {
3173 /* Receive wchars decoded from input string using candidate encoding.
3174 * Give the candidate many 'demerits' for each 'rare' codepoint found,
3175 * a smaller number for each ASCII punctuation character, and 1 for
3176 * all other codepoints.
3177 *
3178 * The 'common' codepoints should cover the vast majority of
3179 * codepoints we are likely to see in practice, while only covering
3180 * a small minority of the entire Unicode encoding space. Why?
3181 * Well, if the test string happens to be valid in an incorrect
3182 * candidate encoding, the bogus codepoints which it decodes to will
3183 * be more or less random. By treating the majority of codepoints as
3184 * 'rare', we ensure that in almost all such cases, the bogus
3185 * codepoints will include plenty of 'rares', thus giving the
3186 * incorrect candidate encoding lots of demerits. See
3187 * common_codepoints.txt for the actual list used.
3188 *
3189 * So, why give extra demerits for ASCII punctuation characters? It's
3190 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
3191 * which deliberately only use bytes in the ASCII range. When
3192 * misinterpreted as ASCII/UTF-8, strings in these encodings will
3193 * have an unusually high number of ASCII punctuation characters.
3194 * So giving extra demerits for such characters will improve
3195 * detection accuracy for UTF-7 and similar encodings.
3196 *
3197 * Finally, why 1 demerit for all other characters? That penalizes
3198 * long strings, meaning we will tend to choose a candidate encoding
3199 * in which the test string decodes to a smaller number of
3200 * codepoints. That prevents single-byte encodings in which almost
3201 * every possible input byte decodes to a 'common' codepoint from
3202 * being favored too much. */
3203 if (w > 0xFFFF) {
3204 return 40;
3205 } else if (w >= 0x21 && w <= 0x2F) {
3206 return 6;
3207 } else if ((rare_codepoint_bitvec[w >> 5] >> (w & 0x1F)) & 1) {
3208 return 30;
3209 } else {
3210 return 1;
3211 }
3212 return 0;
3213 }
3214
/* Bookkeeping for one candidate encoding while guessing the encoding of one
 * or more strings */
struct candidate {
	const mbfl_encoding *enc; /* The candidate encoding being scored */
	const unsigned char *in; /* Next input byte still to be decoded for this candidate */
	size_t in_len; /* Number of input bytes still to be decoded */
	uint64_t demerits; /* Wide bit size to prevent overflow */
	unsigned int state; /* Decoder state passed to enc->to_wchar on each call */
	float multiplier; /* Weight factor applied to demerits after a string has been counted */
};
3223
init_candidate_array(struct candidate * array,size_t length,const mbfl_encoding ** encodings,const unsigned char ** in,size_t * in_len,size_t n,bool strict,bool order_significant)3224 static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant)
3225 {
3226 size_t j = 0;
3227
3228 for (size_t i = 0; i < length; i++) {
3229 const mbfl_encoding *enc = encodings[i];
3230
3231 array[j].enc = enc;
3232 array[j].state = 0;
3233 array[j].demerits = 0;
3234
3235 /* If any candidate encodings have specialized validation functions, use them
3236 * to eliminate as many candidates as possible */
3237 if (enc->check != NULL) {
3238 for (size_t k = 0; k < n; k++) {
3239 if (!enc->check((unsigned char*)in[k], in_len[k])) {
3240 if (strict) {
3241 goto skip_to_next;
3242 } else {
3243 array[j].demerits += 500;
3244 }
3245 }
3246 }
3247 }
3248
3249 /* This multiplier can optionally be used to make candidate encodings listed
3250 * first more likely to be chosen. It is a weight factor which multiplies
3251 * the number of demerits counted for each candidate. */
3252 array[j].multiplier = order_significant ? 1.0 + ((0.3 * i) / length) : 1.0;
3253 j++;
3254 skip_to_next: ;
3255 }
3256
3257 return j;
3258 }
3259
start_string(struct candidate * array,size_t length,const unsigned char * in,size_t in_len)3260 static void start_string(struct candidate *array, size_t length, const unsigned char *in, size_t in_len)
3261 {
3262 for (size_t i = 0; i < length; i++) {
3263 const mbfl_encoding *enc = array[i].enc;
3264
3265 array[i].in = in;
3266 array[i].in_len = in_len;
3267
3268 /* Skip byte order mark for UTF-8, UTF-16BE, or UTF-16LE */
3269 if (enc == &mbfl_encoding_utf8) {
3270 if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
3271 array[i].in_len -= 3;
3272 array[i].in += 3;
3273 }
3274 } else if (enc == &mbfl_encoding_utf16be) {
3275 if (in_len >= 2 && in[0] == 0xFE && in[1] == 0xFF) {
3276 array[i].in_len -= 2;
3277 array[i].in += 2;
3278 }
3279 } else if (enc == &mbfl_encoding_utf16le) {
3280 if (in_len >= 2 && in[0] == 0xFF && in[1] == 0xFE) {
3281 array[i].in_len -= 2;
3282 array[i].in += 2;
3283 }
3284 }
3285 }
3286 }
3287
/* Decode the pending input of each candidate in `array`, in batches of up to
 * 128 codepoints, adding demerits for every decoded codepoint.
 *
 * In strict mode, a candidate which produces MBFL_BAD_INPUT is removed from
 * the array; otherwise each MBFL_BAD_INPUT costs it 1000 demerits. Decoding
 * stops early in non-strict mode once only one candidate is left. Before
 * returning, each remaining candidate's demerits are scaled by its
 * multiplier.
 *
 * Returns the number of candidates still in the array. */
static size_t count_demerits(struct candidate *array, size_t length, bool strict)
{
	uint32_t wchar_buf[128];
	unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */

	for (size_t i = 0; i < length; i++) {
		if (array[i].in_len == 0) {
			finished++;
		}
	}

	while ((strict || length > 1) && finished < length) {
		/* Iterate in reverse order to avoid moving candidates that can be eliminated.
		 * `i` is unsigned, so the loop ends when decrementing past 0 wraps around. */
		for (size_t i = length - 1; i != (size_t)-1; i--) {
			/* Do we still have more input to process for this candidate encoding? */
			if (array[i].in_len) {
				const mbfl_encoding *enc = array[i].enc;
				size_t out_len = enc->to_wchar((unsigned char**)&array[i].in, &array[i].in_len, wchar_buf, 128, &array[i].state);
				ZEND_ASSERT(out_len <= 128);
				/* Check this batch of decoded codepoints; are there any error markers?
				 * Also sum up the number of demerits */
				while (out_len) {
					uint32_t w = wchar_buf[--out_len];
					if (w == MBFL_BAD_INPUT) {
						if (strict) {
							/* This candidate encoding is not valid, eliminate it from consideration */
							length--;
							if (i < length) {
								/* The eliminated candidate was not the last one in the list;
								 * shift the candidates after it down to close the gap */
								memmove(&array[i], &array[i+1], (length - i) * sizeof(struct candidate));
							}
							/* Skip the 'finished' bookkeeping below; this candidate is gone */
							goto try_next_encoding;
						} else {
							array[i].demerits += 1000;
						}
					} else {
						array[i].demerits += estimate_demerits(w);
					}
				}
				if (array[i].in_len == 0) {
					finished++;
				}
			}
			try_next_encoding:;
		}
	}

	/* Apply each candidate's weight factor to the demerits counted so far */
	for (size_t i = 0; i < length; i++) {
		array[i].demerits *= array[i].multiplier;
	}

	return length;
}
3341
mb_guess_encoding_for_strings(const unsigned char ** strings,size_t * str_lengths,size_t n,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3342 MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3343 {
3344 if (elist_size == 0) {
3345 return NULL;
3346 }
3347 if (elist_size == 1) {
3348 if (strict) {
3349 while (n--) {
3350 if (!php_mb_check_encoding((const char*)strings[n], str_lengths[n], *elist)) {
3351 return NULL;
3352 }
3353 }
3354 }
3355 return *elist;
3356 }
3357 if (n == 1 && *str_lengths == 0) {
3358 return *elist;
3359 }
3360
3361 /* Allocate on stack; when we return, this array is automatically freed */
3362 struct candidate *array = alloca(elist_size * sizeof(struct candidate));
3363 elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant);
3364
3365 while (n--) {
3366 start_string(array, elist_size, strings[n], str_lengths[n]);
3367 elist_size = count_demerits(array, elist_size, strict);
3368 if (elist_size == 0) {
3369 /* All candidates were eliminated */
3370 return NULL;
3371 }
3372 }
3373
3374 /* See which remaining candidate encoding has the least demerits */
3375 unsigned int best = 0;
3376 for (unsigned int i = 1; i < elist_size; i++) {
3377 if (array[i].demerits < array[best].demerits) {
3378 best = i;
3379 }
3380 }
3381 return array[best].enc;
3382 }
3383
3384 /* When doing 'strict' detection, any string which is invalid in the candidate encoding
3385 * is rejected. With non-strict detection, we just continue, but apply demerits for
3386 * each invalid byte sequence */
mb_guess_encoding(unsigned char * in,size_t in_len,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3387 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3388 {
3389 return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant);
3390 }
3391
3392 /* {{{ Encodings of the given string is returned (as a string) */
PHP_FUNCTION(mb_detect_encoding)3393 PHP_FUNCTION(mb_detect_encoding)
3394 {
3395 zend_string *str, *encoding_str = NULL;
3396 HashTable *encoding_ht = NULL;
3397 bool strict = false;
3398 const mbfl_encoding *ret, **elist;
3399 size_t size;
3400
3401 ZEND_PARSE_PARAMETERS_START(1, 3)
3402 Z_PARAM_STR(str)
3403 Z_PARAM_OPTIONAL
3404 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
3405 Z_PARAM_BOOL(strict)
3406 ZEND_PARSE_PARAMETERS_END();
3407
3408 /* Should we pay attention to the order of the provided candidate encodings and prefer
3409 * the earlier ones (if more than one candidate encoding matches)?
3410 * If the entire list of supported encodings returned by `mb_list_encodings` is passed
3411 * in, then don't treat the order as significant */
3412 bool order_significant = true;
3413
3414 /* make encoding list */
3415 if (encoding_ht) {
3416 if (encoding_ht == MBSTRG(all_encodings_list)) {
3417 order_significant = false;
3418 }
3419 if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
3420 RETURN_THROWS();
3421 }
3422 } else if (encoding_str) {
3423 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2)) {
3424 RETURN_THROWS();
3425 }
3426 } else {
3427 elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
3428 size = MBSTRG(current_detect_order_list_size);
3429 }
3430
3431 if (size == 0) {
3432 efree(ZEND_VOIDP(elist));
3433 zend_argument_value_error(2, "must specify at least one encoding");
3434 RETURN_THROWS();
3435 }
3436
3437 remove_non_encodings_from_elist(elist, &size);
3438 if (size == 0) {
3439 efree(ZEND_VOIDP(elist));
3440 RETURN_FALSE;
3441 }
3442
3443 if (ZEND_NUM_ARGS() < 3) {
3444 strict = MBSTRG(strict_detection);
3445 }
3446
3447 if (size == 1 && *elist == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
3448 ret = &mbfl_encoding_utf8;
3449 } else {
3450 ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant);
3451 }
3452
3453 efree(ZEND_VOIDP(elist));
3454
3455 if (ret == NULL) {
3456 RETURN_FALSE;
3457 }
3458
3459 RETVAL_STRING((char *)ret->name);
3460 }
3461 /* }}} */
3462
3463 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)3464 PHP_FUNCTION(mb_list_encodings)
3465 {
3466 ZEND_PARSE_PARAMETERS_NONE();
3467
3468 if (MBSTRG(all_encodings_list) == NULL) {
3469 /* Initialize shared array of supported encoding names
3470 * This is done so that we can check if `mb_list_encodings()` is being
3471 * passed to other mbstring functions using a cheap pointer equality check */
3472 HashTable *array = emalloc(sizeof(HashTable));
3473 zend_hash_init(array, 80, NULL, zval_ptr_dtor_str, false);
3474 for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
3475 zval tmp;
3476 ZVAL_STRING(&tmp, (*encodings)->name);
3477 zend_hash_next_index_insert(array, &tmp);
3478 }
3479 MBSTRG(all_encodings_list) = array;
3480 }
3481
3482 GC_ADDREF(MBSTRG(all_encodings_list));
3483 RETURN_ARR(MBSTRG(all_encodings_list));
3484 }
3485 /* }}} */
3486
3487 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)3488 PHP_FUNCTION(mb_encoding_aliases)
3489 {
3490 const mbfl_encoding *encoding;
3491 zend_string *encoding_name = NULL;
3492
3493 ZEND_PARSE_PARAMETERS_START(1, 1)
3494 Z_PARAM_STR(encoding_name)
3495 ZEND_PARSE_PARAMETERS_END();
3496
3497 encoding = php_mb_get_encoding(encoding_name, 1);
3498 if (!encoding) {
3499 RETURN_THROWS();
3500 }
3501
3502 array_init(return_value);
3503 if (encoding->aliases != NULL) {
3504 for (const char **alias = encoding->aliases; *alias; ++alias) {
3505 add_next_index_string(return_value, (char *)*alias);
3506 }
3507 }
3508 }
3509 /* }}} */
3510
/* Apply the kana conversion selected by `mode` to `input`.
 *
 * The input is decoded with `encoding` in batches of up to 64 codepoints,
 * each codepoint is passed (together with the one following it) through
 * mb_convert_kana_codepoint, and the converted codepoints are re-encoded
 * with the same `encoding`. Returns the result of mb_convert_buf_result. */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion...
	 * if we are converting zenkaku kana to hankaku kana
	 * Make the buffer for converted kana big enough that we never need to
	 * perform bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	/* Number of codepoints (0 or 1) carried over at the start of wchar_buf from
	 * the previous batch */
	unsigned int buf_offset = 0;
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		uint32_t *converted = converted_buf;
		/* If one codepoint has been left in wchar_buf[0] to be reprocessed from the
		 * previous iteration, don't overwrite it */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + buf_offset, 64 - buf_offset, &state);
		out_len += buf_offset;
		ZEND_ASSERT(out_len <= 64);

		if (!out_len) {
			continue;
		}

		/* Convert every codepoint except the last one in this batch; each call
		 * also sees the following codepoint and may consume it */
		for (size_t i = 0; i < out_len-1; i++) {
			uint32_t second = 0;
			bool consumed = false;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
			if (second) {
				*converted++ = second;
			}
			if (consumed) {
				i++;
				if (i == out_len-1) {
					/* We consumed two codepoints at the very end of the wchar buffer
					 * So there is nothing remaining to reprocess on the next iteration */
					buf_offset = 0;
					goto emit_converted_kana;
				}
			}
		}

		if (!in_len) {
			/* This is the last iteration, so we need to process the final codepoint now */
			uint32_t second = 0;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[out_len-1], 0, NULL, &second, mode);
			if (second) {
				*converted++ = second;
			}
		} else {
			/* Reprocess the last codepoint on the next iteration */
			wchar_buf[0] = wchar_buf[out_len-1];
			buf_offset = 1;
		}

emit_converted_kana:
		/* The final argument tells the encoder whether this is the last batch */
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3575
/* Flag characters accepted in the option string of mb_convert_kana().
 * The array index of a flag is the number of the option bit which it sets
 * (flag at index i sets bit `1 << i`). Entries i and i+8 (for i in 0..7) are
 * the upper- and lower-case form of the same letter and are rejected when
 * used together. */
char mb_convert_kana_flags[17] = {
	[0]  = 'A', [1]  = 'R', [2]  = 'N', [3]  = 'S',
	[4]  = 'K', [5]  = 'H', [6]  = 'M', [7]  = 'C',
	[8]  = 'a', [9]  = 'r', [10] = 'n', [11] = 's',
	[12] = 'k', [13] = 'h', [14] = 'm', [15] = 'c',
	[16] = 'V'
};
3581
/* Conversion between full-width characters and half-width characters (Japanese)
 *
 * The optional second argument is a string of single-character flags which
 * is turned into a bit vector of conversion options. Unknown flags and
 * contradictory flag combinations raise a ValueError for argument 2. When the
 * option string is omitted, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE is used.
 * The encoding (argument 3) is only resolved after the options have been
 * validated. */
PHP_FUNCTION(mb_convert_kana)
{
	unsigned int opt;
	char *optstr = NULL;
	size_t optstr_len;
	zend_string *encname = NULL, *str;

	ZEND_PARSE_PARAMETERS_START(1, 3)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STRING(optstr, optstr_len)
		Z_PARAM_STR_OR_NULL(encname)
	ZEND_PARSE_PARAMETERS_END();

	if (optstr != NULL) {
		char *p = optstr, *e = p + optstr_len;
		opt = 0;
		/* Jumping here from inside the loop re-tests `p < e`, i.e. it moves on
		 * to the next flag character */
next_option:
		while (p < e) {
			/* Walk through option string and convert to bit vector
			 * See translit_kana_jisx0201_jisx0208.h for the values used */
			char c = *p++;
			if (c == 'A') {
				opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
			} else if (c == 'a') {
				opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
			} else {
				/* Any other flag sets the single bit given by its index in
				 * mb_convert_kana_flags */
				for (size_t i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
					if (c == mb_convert_kana_flags[i]) {
						opt |= (1 << i);
						goto next_option;
					}
				}

				zend_argument_value_error(2, "contains invalid flag: '%c'", c);
				RETURN_THROWS();
			}
		}

		/* Check for illegal combinations of options */
		if (((opt & 0xFF00) >> 8) & opt) {
			/* It doesn't make sense to convert the same type of characters from halfwidth to
			 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
			 * FW hiragana to FW katakana and then back again. */
			int badflag = ((opt & 0xFF00) >> 8) & opt, i;
			/* Find the index of the lowest conflicting bit so both flag
			 * characters can be named in the error message */
			for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
			char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
			/* If the conflicting bit was set via 'A' / 'a', report that flag instead */
			if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
				flag1 = 'A';
			if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
				flag2 = 'a';
			zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
			RETURN_THROWS();
		}

		if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
			/* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
			zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
			RETURN_THROWS();
		}

		/* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
		 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
		 * more than one of these */
		if (opt & MBFL_ZEN2HAN_HIRAGANA) {
			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
				zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
				RETURN_THROWS();
			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
				zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
				RETURN_THROWS();
			}
		} else if (opt & MBFL_ZEN2HAN_KATAKANA) {
			if (opt & MBFL_ZENKAKU_HIRA2KATA) {
				zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
				RETURN_THROWS();
			} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
				zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
				RETURN_THROWS();
			}
		}
	} else {
		/* No option string given: use the default conversion */
		opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
	}

	const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
	if (!enc) {
		RETURN_THROWS();
	}

	RETVAL_STR(jp_kana_convert(str, enc, opt));
}
3675
/* Count the strings reachable from `var` (after dereferencing), descending into
 * arrays and objects. A container which is already being visited contributes 0. */
static unsigned int mb_recursive_count_strings(zval *var)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		return 1;
	}
	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Only strings and containers are of interest */
		return 0;
	}

	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	unsigned int total = 0;
	HashTable *members = HASH_OF(var);
	if (members != NULL) {
		zval *child;
		ZEND_HASH_FOREACH_VAL_IND(members, child) {
			total += mb_recursive_count_strings(child);
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return total;
}
3706
/* Record the data pointer and length of every string reachable from `var`
 * (after dereferencing), descending into arrays and objects.
 *
 * Each string found is written to `val_list[*count]` / `len_list[*count]` and
 * `*count` is incremented. No bounds checking is done here; the caller is
 * expected to have sized both arrays beforehand (mb_convert_variables does so
 * using mb_recursive_count_strings).
 *
 * Returns true if a container which is already being visited was encountered
 * (i.e. a recursive reference), in which case the traversal is abandoned.
 * Returns false otherwise. */
static bool mb_recursive_find_strings(zval *var, const unsigned char **val_list, size_t *len_list, unsigned int *count)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		val_list[*count] = (const unsigned char*)Z_STRVAL_P(var);
		len_list[*count] = Z_STRLEN_P(var);
		(*count)++;
	} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
		if (Z_REFCOUNTED_P(var)) {
			if (Z_IS_RECURSIVE_P(var)) {
				return true;
			}
			Z_PROTECT_RECURSION_P(var);
		}

		HashTable *ht = HASH_OF(var);
		if (ht != NULL) {
			zval *entry;
			ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
				if (mb_recursive_find_strings(entry, val_list, len_list, count)) {
					/* Undo our own recursion guard (if we set one), then always
					 * propagate the failure, just as mb_recursive_convert_variable
					 * does; the result must not depend on whether this container
					 * happens to be refcounted */
					if (Z_REFCOUNTED_P(var)) {
						Z_UNPROTECT_RECURSION_P(var);
					}
					return true;
				}
			} ZEND_HASH_FOREACH_END();
		}

		if (Z_REFCOUNTED_P(var)) {
			Z_UNPROTECT_RECURSION_P(var);
		}
	}

	return false;
}
3743
/* Convert, in place, every string reachable from `var` from `from_encoding` to
 * `to_encoding`, descending into arrays and objects. Arrays are separated before
 * being modified. Returns true if a recursive reference was encountered (the
 * traversal stops there), false otherwise. */
static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
{
	/* Remember the slot we were handed; the converted string is stored there */
	zval *slot = var;
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		zend_string *converted = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
		zval_ptr_dtor(slot);
		ZVAL_STR(slot, converted);
		return false;
	}

	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Nothing to convert in other types */
		return false;
	}

	if (Z_TYPE_P(var) == IS_ARRAY) {
		SEPARATE_ARRAY(var);
	}
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return true;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	bool hit_recursion = false;
	HashTable *members = HASH_OF(var);
	if (members != NULL) {
		zval *child;
		ZEND_HASH_FOREACH_VAL_IND(members, child) {
			if (mb_recursive_convert_variable(child, from_encoding, to_encoding)) {
				hit_recursion = true;
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return hit_recursion;
}
3785
PHP_FUNCTION(mb_convert_variables)3786 PHP_FUNCTION(mb_convert_variables)
3787 {
3788 zval *args;
3789 zend_string *to_enc_str;
3790 zend_string *from_enc_str;
3791 HashTable *from_enc_ht;
3792 const mbfl_encoding *from_encoding, *to_encoding;
3793 uint32_t argc;
3794 size_t elistsz;
3795 const mbfl_encoding **elist;
3796
3797 ZEND_PARSE_PARAMETERS_START(3, -1)
3798 Z_PARAM_STR(to_enc_str)
3799 Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3800 Z_PARAM_VARIADIC('+', args, argc)
3801 ZEND_PARSE_PARAMETERS_END();
3802
3803 /* new encoding */
3804 to_encoding = php_mb_get_encoding(to_enc_str, 1);
3805 if (!to_encoding) {
3806 RETURN_THROWS();
3807 }
3808
3809 from_encoding = MBSTRG(current_internal_encoding);
3810
3811 bool order_significant = true;
3812
3813 /* pre-conversion encoding */
3814 if (from_enc_ht) {
3815 if (from_enc_ht == MBSTRG(all_encodings_list)) {
3816 /* If entire list of supported encodings returned by `mb_list_encodings` is passed
3817 * in, then don't treat the order of the list as significant */
3818 order_significant = false;
3819 }
3820 if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3821 RETURN_THROWS();
3822 }
3823 } else {
3824 if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2) == FAILURE) {
3825 RETURN_THROWS();
3826 }
3827 }
3828
3829 if (elistsz == 0) {
3830 efree(ZEND_VOIDP(elist));
3831 zend_argument_value_error(2, "must specify at least one encoding");
3832 RETURN_THROWS();
3833 }
3834
3835 if (elistsz == 1) {
3836 from_encoding = *elist;
3837 } else {
3838 /* auto detect */
3839 unsigned int num = 0;
3840 for (size_t n = 0; n < argc; n++) {
3841 zval *zv = &args[n];
3842 num += mb_recursive_count_strings(zv);
3843 }
3844 const unsigned char **val_list = (const unsigned char**)ecalloc(num, sizeof(char *));
3845 size_t *len_list = (size_t*)ecalloc(num, sizeof(size_t));
3846 unsigned int i = 0;
3847 for (size_t n = 0; n < argc; n++) {
3848 zval *zv = &args[n];
3849 if (mb_recursive_find_strings(zv, val_list, len_list, &i)) {
3850 efree(ZEND_VOIDP(elist));
3851 efree(ZEND_VOIDP(val_list));
3852 efree(len_list);
3853 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3854 RETURN_FALSE;
3855 }
3856 }
3857 from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant);
3858 efree(ZEND_VOIDP(val_list));
3859 efree(len_list);
3860 if (!from_encoding) {
3861 php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3862 efree(ZEND_VOIDP(elist));
3863 RETURN_FALSE;
3864 }
3865
3866 }
3867
3868 efree(ZEND_VOIDP(elist));
3869
3870 /* convert */
3871 for (size_t n = 0; n < argc; n++) {
3872 zval *zv = &args[n];
3873 ZVAL_DEREF(zv);
3874 if (mb_recursive_convert_variable(zv, from_encoding, to_encoding)) {
3875 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3876 RETURN_FALSE;
3877 }
3878 }
3879
3880 RETURN_STRING(from_encoding->name);
3881 }
3882
3883 /* HTML numeric entities */
3884
3885 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
/* Build the flat uint32_t array used by the HTML numeric entity functions from a
 * PHP array. The element count is stored in `*conversion_map_size` (even on
 * failure). On failure an error is raised for argument 2 and NULL is returned;
 * on success the caller must efree() the returned array. */
static uint32_t *make_conversion_map(HashTable *target_hash, size_t *conversion_map_size)
{
	size_t count = zend_hash_num_elements(target_hash);
	*conversion_map_size = count;

	/* The map is consumed 4 elements at a time */
	if (count % 4 != 0) {
		zend_argument_value_error(2, "must have a multiple of 4 elements");
		return NULL;
	}

	uint32_t *map = (uint32_t*)safe_emalloc(count, sizeof(uint32_t), 0);
	size_t filled = 0;
	zval *item;

	ZEND_HASH_FOREACH_VAL(target_hash, item) {
		bool failed = true;
		zend_long number = zval_try_get_long(item, &failed);
		if (failed) {
			efree(map);
			zend_argument_value_error(2, "must only be composed of values of type int");
			return NULL;
		}
		map[filled++] = number;
	} ZEND_HASH_FOREACH_END();

	return map;
}
3912
/* Look up `w` in the conversion map, which is a sequence of 4-element records
 * {low, high, offset, mask}. For the first record with low <= w <= high,
 * store `(w + offset) & mask` in `*retval` and return true. If no record
 * matches, return false and leave `*retval` untouched. */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t i = 0; i < conversion_map_size; i += 4) {
		const uint32_t *record = convmap + i;
		uint32_t low = record[0];
		uint32_t high = record[1];
		uint32_t offset = record[2];
		uint32_t mask = record[3];

		if (w < low || w > high) {
			continue;
		}

		/* This wchar is within a range which should become an HTML entity */
		*retval = (w + offset) & mask;
		return true;
	}

	/* No range contains this wchar */
	return false;
}
3934
/* Re-encode `input` (in `encoding`), replacing every character which the conversion
 * map selects with a numeric HTML entity: "&#NNN;" or, if `hex` is true, "&#xHHH;"
 * with upper-case hex digits. All other characters are passed through.
 * Returns the resulting string, in the same encoding. */
static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size, bool hex)
{
	/* One decoded wchar may expand to as many as 13 wchars once it is written
	 * out as an HTML entity */
	uint32_t decoded[32], expanded[32 * 13];
	unsigned char digits[16]; /* Scratch space for the decimal/hex digits */
	unsigned char *const digits_end = digits + sizeof(digits);

	unsigned int state = 0;
	unsigned char *src = (unsigned char*)ZSTR_VAL(input);
	size_t remaining = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, remaining, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (remaining) {
		/* Decode the next batch of (at most 32) wchars */
		size_t n_wchars = encoding->to_wchar(&src, &remaining, decoded, 32, &state);
		ZEND_ASSERT(n_wchars <= 32);
		uint32_t *out = expanded;

		for (size_t i = 0; i < n_wchars; i++) {
			uint32_t code;

			if (!html_numeric_entity_convert(decoded[i], convmap, conversion_map_size, &code)) {
				/* Not in any range of the conversion map; copy as is */
				*out++ = decoded[i];
				continue;
			}

			*out++ = '&';
			*out++ = '#';
			if (hex) {
				*out++ = 'x';
			}

			/* Generate digits from least to most significant, filling the scratch
			 * buffer backwards; a value of zero yields the single digit '0' */
			unsigned char *digit = digits_end;
			if (hex) {
				do {
					*(--digit) = "0123456789ABCDEF"[code & 0xF];
					code >>= 4;
				} while (code > 0);
			} else {
				do {
					*(--digit) = "0123456789"[code % 10];
					code /= 10;
				} while (code > 0);
			}
			while (digit < digits_end) {
				*out++ = *digit++;
			}

			*out++ = ';';
		}

		ZEND_ASSERT(out <= expanded + sizeof(expanded)/sizeof(*expanded));
		encoding->from_wchar(expanded, out - expanded, &buf, !remaining);
	}

	return mb_convert_buf_result(&buf, encoding);
}
4000
4001 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)4002 PHP_FUNCTION(mb_encode_numericentity)
4003 {
4004 zend_string *encoding = NULL, *str;
4005 size_t conversion_map_size;
4006 HashTable *target_hash;
4007 bool is_hex = false;
4008
4009 ZEND_PARSE_PARAMETERS_START(2, 4)
4010 Z_PARAM_STR(str)
4011 Z_PARAM_ARRAY_HT(target_hash)
4012 Z_PARAM_OPTIONAL
4013 Z_PARAM_STR_OR_NULL(encoding)
4014 Z_PARAM_BOOL(is_hex)
4015 ZEND_PARSE_PARAMETERS_END();
4016
4017 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4018 if (!enc) {
4019 RETURN_THROWS();
4020 }
4021
4022 uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4023 if (convmap == NULL) {
4024 RETURN_THROWS();
4025 }
4026
4027 RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, conversion_map_size, is_hex));
4028 efree(convmap);
4029 }
4030 /* }}} */
4031
/* Reverse lookup for decoding: the conversion map is a sequence of 4-element
 * records, of which the first three {low, high, offset} are used here.
 * For the first record where `number - offset` (computed with unsigned
 * wraparound) lies within [low, high], store that value in `*retval` and return
 * true. If no record matches, return false and leave `*retval` untouched. */
static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t i = 0; i < conversion_map_size; i += 4) {
		const uint32_t *record = convmap + i;
		uint32_t low = record[0];
		uint32_t high = record[1];
		uint32_t candidate = number - record[2];

		if (candidate < low || candidate > high) {
			continue;
		}

		*retval = candidate;
		return true;
	}

	return false;
}
4049
4050 #define DEC_ENTITY_MINLEN 3 /* For "&#" and 1 decimal digit */
4051 #define HEX_ENTITY_MINLEN 4 /* For "&#x" and 1 hexadecimal digit */
4052 #define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
4053 #define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
4054
/* Re-encode `input` (in `encoding`), replacing numeric HTML entities with the
 * characters they stand for.
 *
 * Both decimal ("&#" followed by digits) and hexadecimal ("&#x" followed by hex
 * digits of either case) entities are recognized; a ';' after the digits is
 * consumed if present, but is not required. An entity is only replaced if
 * html_numeric_entity_deconvert() accepts its value for the given conversion map;
 * otherwise (or if the entity has too few or too many digits) its text is copied
 * to the output unchanged.
 *
 * Returns the resulting string, in the same encoding. */
static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size)
{
	uint32_t wchar_buf[128], converted_buf[128];

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	/* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
	 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
	 * 2nd 'converted' buffer.
	 *
	 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
	 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
	 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
	 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
	 * to store the next batch of wchars after it.
	 *
	 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
	 * wchars from the 1st buffer to the 2nd one.
	 *
	 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
	 * have to do bounds checks when writing wchars into it.
	 */

	/* Number of wchars at the start of `wchar_buf` which were carried over from the
	 * previous iteration (the beginning of a possibly truncated entity) */
	unsigned int wchar_buf_offset = 0;

	while (in_len) {
		/* Leave space for sentinel at the end of the buffer */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
		out_len += wchar_buf_offset;
		ZEND_ASSERT(out_len <= 127);
		wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */

		/* `p` is the read position in `wchar_buf`, `converted` the write position
		 * in `converted_buf` */
		uint32_t *p, *converted;

		/* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
		 * be there (in `wchar_buf[0]`), so don't bother in that case */
		if (wchar_buf_offset == 0) {
			p = wchar_buf;
			while (*p != '&')
				p++;
			if (p == wchar_buf + out_len) {
				/* No HTML entities in this buffer */
				encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
				continue;
			}

			/* Copy over the prefix with no & which we already scanned */
			memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
			converted = converted_buf + (p - wchar_buf);
		} else {
			p = wchar_buf;
			converted = converted_buf;
		}

found_ampersand:
		ZEND_ASSERT(*p == '&');
		/* `p2` scans ahead of `p` to find where the candidate entity ends */
		uint32_t *p2 = p;

		/* These tests can't overrun end of buffer, because we have a '&' sentinel there */
		if (*++p2 == '#') {
			if (*++p2 == 'x') {
				/* Possible hex entity */
				uint32_t w = *++p2;
				while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
					w = *++p2;
				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
					/* We hit the end of the buffer while reading digits, and
					 * more wchars are still coming in the next buffer
					 * Reprocess this entity on next iteration */
					memmove(wchar_buf, p, (p2 - p) * 4);
					wchar_buf_offset = p2 - p;
					goto process_converted_wchars;
				} else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
					/* Invalid entity (too long or "&#x" only) */
					memcpy(converted, p, (p2 - p) * 4);
					converted += p2 - p;
				} else {
					/* Valid hexadecimal entity */
					/* Digits start after the 3 wchars of "&#x" */
					uint32_t value = 0, *p3 = p + 3;
					while (p3 < p2) {
						w = *p3++;
						if (w <= '9') {
							value = (value * 16) + (w - '0');
						} else if (w >= 'a') {
							value = (value * 16) + 10 + (w - 'a');
						} else {
							value = (value * 16) + 10 + (w - 'A');
						}
					}
					if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
						/* The decoded wchar was written to `*converted`; also swallow
						 * the terminating semicolon, if there is one */
						converted++;
						if (*p2 == ';')
							p2++;
					} else {
						/* Value is not covered by the conversion map; keep the entity text */
						memcpy(converted, p, (p2 - p) * 4);
						converted += p2 - p;
					}
				}
			} else {
				/* Possible decimal entity */
				uint32_t w = *p2;
				while (w >= '0' && w <= '9')
					w = *++p2;
				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
					/* The number of digits was legal (no more than 10 decimal digits)
					 * Reprocess this entity on next iteration of main loop */
					memmove(wchar_buf, p, (p2 - p) * 4);
					wchar_buf_offset = p2 - p;
					goto process_converted_wchars;
				} else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
					/* Invalid entity (too long or "&#" only) */
					memcpy(converted, p, (p2 - p) * 4);
					converted += p2 - p;
				} else {
					/* Valid decimal entity */
					/* Digits start after the 2 wchars of "&#" */
					uint32_t value = 0, *p3 = p + 2;
					while (p3 < p2) {
						/* If unsigned integer overflow would occur in the below
						 * multiplication by 10, this entity is no good
						 * 0x19999999 is 1/10th of 0xFFFFFFFF */
						if (value > 0x19999999) {
							memcpy(converted, p, (p2 - p) * 4);
							converted += p2 - p;
							goto decimal_entity_too_big;
						}
						value = (value * 10) + (*p3++ - '0');
					}
					if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
						/* The decoded wchar was written to `*converted`; also swallow
						 * the terminating semicolon, if there is one */
						converted++;
						if (*p2 == ';')
							p2++;
					} else {
						/* Value is not covered by the conversion map; keep the entity text */
						memcpy(converted, p, (p2 - p) * 4);
						converted += p2 - p;
					}
				}
			}
		} else if ((p2 == wchar_buf + out_len) && in_len) {
			/* Corner case: & at end of buffer */
			wchar_buf[0] = '&';
			wchar_buf_offset = 1;
			goto process_converted_wchars;
		} else {
			/* A lone '&' which does not start an entity; `p2` already points past it */
			*converted++ = '&';
		}
decimal_entity_too_big:

		/* Starting to scan a new section of the wchar buffer
		 * 'p2' is pointing at the next wchar which needs to be processed */
		p = p2;
		while (*p2 != '&')
			p2++;

		/* Copy the run of ordinary wchars up to the next '&' (or the sentinel) */
		if (p2 > p) {
			memcpy(converted, p, (p2 - p) * 4);
			converted += p2 - p;
			p = p2;
		}

		/* If we stopped before the sentinel, this is a real '&' from the input */
		if (p < wchar_buf + out_len)
			goto found_ampersand;

		/* We do not have any wchars remaining at the end of this buffer which
		 * we need to reprocess on the next call */
		wchar_buf_offset = 0;
process_converted_wchars:
		ZEND_ASSERT(converted <= converted_buf + 128);
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
4232
4233 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)4234 PHP_FUNCTION(mb_decode_numericentity)
4235 {
4236 zend_string *encoding = NULL, *str;
4237 size_t conversion_map_size;
4238 HashTable *target_hash;
4239
4240 ZEND_PARSE_PARAMETERS_START(2, 3)
4241 Z_PARAM_STR(str)
4242 Z_PARAM_ARRAY_HT(target_hash)
4243 Z_PARAM_OPTIONAL
4244 Z_PARAM_STR_OR_NULL(encoding)
4245 ZEND_PARSE_PARAMETERS_END();
4246
4247 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4248 if (!enc) {
4249 RETURN_THROWS();
4250 }
4251
4252 uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4253 if (convmap == NULL) {
4254 RETURN_THROWS();
4255 }
4256
4257 RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, conversion_map_size));
4258 efree(convmap);
4259 }
4260 /* }}} */
4261
4262 /* {{{ Sends an email message with MIME scheme */
4263 #define CRLF "\r\n"
4264
_php_mbstr_parse_mail_headers(HashTable * ht,const char * str,size_t str_len)4265 static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
4266 {
4267 const char *ps;
4268 size_t icnt;
4269 int state = 0;
4270 int crlf_state = -1;
4271 char *token = NULL;
4272 size_t token_pos = 0;
4273 zend_string *fld_name, *fld_val;
4274
4275 ps = str;
4276 icnt = str_len;
4277 fld_name = fld_val = NULL;
4278
4279 /*
4280 * C o n t e n t - T y p e : t e x t / h t m l \r\n
4281 * ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
4282 * state 0 1 2 3
4283 *
4284 * C o n t e n t - T y p e : t e x t / h t m l \r\n
4285 * ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
4286 * crlf_state -1 0 1 -1
4287 *
4288 */
4289
4290 while (icnt > 0) {
4291 switch (*ps) {
4292 case ':':
4293 if (crlf_state == 1) {
4294 token_pos++;
4295 }
4296
4297 if (state == 0 || state == 1) {
4298 if(token && token_pos > 0) {
4299 fld_name = zend_string_init(token, token_pos, 0);
4300 }
4301 state = 2;
4302 } else {
4303 token_pos++;
4304 }
4305
4306 crlf_state = 0;
4307 break;
4308
4309 case '\n':
4310 if (crlf_state == -1) {
4311 goto out;
4312 }
4313 crlf_state = -1;
4314 break;
4315
4316 case '\r':
4317 if (crlf_state == 1) {
4318 token_pos++;
4319 } else {
4320 crlf_state = 1;
4321 }
4322 break;
4323
4324 case ' ': case '\t':
4325 if (crlf_state == -1) {
4326 if (state == 3) {
4327 /* continuing from the previous line */
4328 state = 4;
4329 } else {
4330 /* simply skipping this new line */
4331 state = 5;
4332 }
4333 } else {
4334 if (crlf_state == 1) {
4335 token_pos++;
4336 }
4337 if (state == 1 || state == 3) {
4338 token_pos++;
4339 }
4340 }
4341 crlf_state = 0;
4342 break;
4343
4344 default:
4345 switch (state) {
4346 case 0:
4347 token = (char*)ps;
4348 token_pos = 0;
4349 state = 1;
4350 break;
4351
4352 case 2:
4353 if (crlf_state != -1) {
4354 token = (char*)ps;
4355 token_pos = 0;
4356
4357 state = 3;
4358 break;
4359 }
4360 ZEND_FALLTHROUGH;
4361
4362 case 3:
4363 if (crlf_state == -1) {
4364 if(token && token_pos > 0) {
4365 fld_val = zend_string_init(token, token_pos, 0);
4366 }
4367
4368 if (fld_name != NULL && fld_val != NULL) {
4369 zval val;
4370 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4371 ZVAL_STR(&val, fld_val);
4372
4373 zend_hash_update(ht, fld_name, &val);
4374
4375 zend_string_release_ex(fld_name, 0);
4376 }
4377
4378 fld_name = fld_val = NULL;
4379 token = (char*)ps;
4380 token_pos = 0;
4381
4382 state = 1;
4383 }
4384 break;
4385
4386 case 4:
4387 token_pos++;
4388 state = 3;
4389 break;
4390 }
4391
4392 if (crlf_state == 1) {
4393 token_pos++;
4394 }
4395
4396 token_pos++;
4397
4398 crlf_state = 0;
4399 break;
4400 }
4401 ps++, icnt--;
4402 }
4403 out:
4404 if (state == 2) {
4405 token = "";
4406 token_pos = 0;
4407
4408 state = 3;
4409 }
4410 if (state == 3) {
4411 if(token && token_pos > 0) {
4412 fld_val = zend_string_init(token, token_pos, 0);
4413 }
4414 if (fld_name != NULL && fld_val != NULL) {
4415 zval val;
4416 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4417 ZVAL_STR(&val, fld_val);
4418 zend_hash_update(ht, fld_name, &val);
4419
4420 zend_string_release_ex(fld_name, 0);
4421 }
4422 }
4423 return state;
4424 }
4425
PHP_FUNCTION(mb_send_mail)4426 PHP_FUNCTION(mb_send_mail)
4427 {
4428 char *to;
4429 size_t to_len;
4430 char *message;
4431 size_t message_len;
4432 zend_string *subject;
4433 zend_string *extra_cmd = NULL;
4434 HashTable *headers_ht = NULL;
4435 zend_string *str_headers = NULL;
4436 size_t i;
4437 char *to_r = NULL;
4438 char *force_extra_parameters = INI_STR("mail.force_extra_parameters");
4439 bool suppress_content_type = false;
4440 bool suppress_content_transfer_encoding = false;
4441
4442 char *p;
4443 enum mbfl_no_encoding;
4444 const mbfl_encoding *tran_cs, /* transfer text charset */
4445 *head_enc, /* header transfer encoding */
4446 *body_enc; /* body transfer encoding */
4447 const mbfl_language *lang;
4448 HashTable ht_headers;
4449 zval *s;
4450
4451 /* character-set, transfer-encoding */
4452 tran_cs = &mbfl_encoding_utf8;
4453 head_enc = &mbfl_encoding_base64;
4454 body_enc = &mbfl_encoding_base64;
4455 lang = mbfl_no2language(MBSTRG(language));
4456 if (lang != NULL) {
4457 tran_cs = mbfl_no2encoding(lang->mail_charset);
4458 head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4459 body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4460 }
4461
4462 ZEND_PARSE_PARAMETERS_START(3, 5)
4463 Z_PARAM_PATH(to, to_len)
4464 Z_PARAM_PATH_STR(subject)
4465 Z_PARAM_PATH(message, message_len)
4466 Z_PARAM_OPTIONAL
4467 Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4468 Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4469 ZEND_PARSE_PARAMETERS_END();
4470
4471 if (str_headers) {
4472 if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4473 zend_argument_value_error(4, "must not contain any null bytes");
4474 RETURN_THROWS();
4475 }
4476 str_headers = php_trim(str_headers, NULL, 0, 2);
4477 } else if (headers_ht) {
4478 str_headers = php_mail_build_headers(headers_ht);
4479 if (EG(exception)) {
4480 RETURN_THROWS();
4481 }
4482 }
4483
4484 zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4485
4486 if (str_headers != NULL) {
4487 _php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4488 }
4489
4490 if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4491 char *tmp;
4492 char *param_name;
4493 char *charset = NULL;
4494
4495 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4496 p = strchr(Z_STRVAL_P(s), ';');
4497
4498 if (p != NULL) {
4499 /* skipping the padded spaces */
4500 do {
4501 ++p;
4502 } while (*p == ' ' || *p == '\t');
4503
4504 if (*p != '\0') {
4505 if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4506 if (strcasecmp(param_name, "charset") == 0) {
4507 const mbfl_encoding *_tran_cs = tran_cs;
4508
4509 charset = php_strtok_r(NULL, "= \"", &tmp);
4510 if (charset != NULL) {
4511 _tran_cs = mbfl_name2encoding(charset);
4512 }
4513
4514 if (!_tran_cs) {
4515 php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4516 _tran_cs = &mbfl_encoding_ascii;
4517 }
4518 tran_cs = _tran_cs;
4519 }
4520 }
4521 }
4522 }
4523 suppress_content_type = true;
4524 }
4525
4526 if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4527 const mbfl_encoding *_body_enc;
4528
4529 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4530 _body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4531 switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4532 case mbfl_no_encoding_base64:
4533 case mbfl_no_encoding_7bit:
4534 case mbfl_no_encoding_8bit:
4535 body_enc = _body_enc;
4536 break;
4537
4538 default:
4539 php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4540 body_enc = &mbfl_encoding_8bit;
4541 break;
4542 }
4543 suppress_content_transfer_encoding = true;
4544 }
4545
4546 /* To: */
4547 if (to_len > 0) {
4548 to_r = estrndup(to, to_len);
4549 for (; to_len; to_len--) {
4550 if (!isspace((unsigned char) to_r[to_len - 1])) {
4551 break;
4552 }
4553 to_r[to_len - 1] = '\0';
4554 }
4555 for (i = 0; to_r[i]; i++) {
4556 if (iscntrl((unsigned char) to_r[i])) {
4557 /* According to RFC 822, section 3.1.1 long headers may be separated into
4558 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4559 * To prevent these separators from being replaced with a space, we skip over them. */
4560 if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4561 i += 2;
4562 while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4563 i++;
4564 }
4565 continue;
4566 }
4567
4568 to_r[i] = ' ';
4569 }
4570 }
4571 } else {
4572 to_r = to;
4573 }
4574
4575 /* Subject: */
4576 const mbfl_encoding *enc = MBSTRG(current_internal_encoding);
4577 if (enc == &mbfl_encoding_pass) {
4578 enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4579 }
4580 const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4581 size_t line_sep_len = strlen(line_sep);
4582
4583 subject = mb_mime_header_encode(subject, enc, tran_cs, head_enc == &mbfl_encoding_base64, (char*)line_sep, line_sep_len, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4584
4585 /* message body */
4586 const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding);
4587 if (msg_enc == &mbfl_encoding_pass) {
4588 msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4589 }
4590
4591 unsigned int num_errors = 0;
4592 zend_string *tmpstr = mb_fast_convert((unsigned char*)message, message_len, msg_enc, tran_cs, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4593 zend_string *conv = mb_fast_convert((unsigned char*)ZSTR_VAL(tmpstr), ZSTR_LEN(tmpstr), &mbfl_encoding_8bit, body_enc, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4594 zend_string_free(tmpstr);
4595 message = ZSTR_VAL(conv);
4596
4597 /* other headers */
4598 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4599 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4600 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4601 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4602
4603 smart_str str = {0};
4604 bool empty = true;
4605
4606 if (str_headers != NULL) {
4607 /* Strip trailing CRLF from `str_headers`; we will add CRLF back if necessary */
4608 size_t len = ZSTR_LEN(str_headers);
4609 if (ZSTR_VAL(str_headers)[len-1] == '\n') {
4610 len--;
4611 }
4612 if (ZSTR_VAL(str_headers)[len-1] == '\r') {
4613 len--;
4614 }
4615 smart_str_appendl(&str, ZSTR_VAL(str_headers), len);
4616 empty = false;
4617 zend_string_release_ex(str_headers, 0);
4618 }
4619
4620 if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4621 if (!empty) {
4622 smart_str_appendl(&str, line_sep, line_sep_len);
4623 }
4624 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4625 empty = false;
4626 }
4627
4628 if (!suppress_content_type) {
4629 if (!empty) {
4630 smart_str_appendl(&str, line_sep, line_sep_len);
4631 }
4632 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4633
4634 p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4635 if (p != NULL) {
4636 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4637 smart_str_appends(&str, p);
4638 }
4639 empty = false;
4640 }
4641
4642 if (!suppress_content_transfer_encoding) {
4643 if (!empty) {
4644 smart_str_appendl(&str, line_sep, line_sep_len);
4645 }
4646 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4647 p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4648 if (p == NULL) {
4649 p = "7bit";
4650 }
4651 smart_str_appends(&str, p);
4652 }
4653
4654 str_headers = smart_str_extract(&str);
4655
4656 if (force_extra_parameters) {
4657 extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4658 } else if (extra_cmd) {
4659 extra_cmd = php_escape_shell_cmd(ZSTR_VAL(extra_cmd));
4660 }
4661
4662 RETVAL_BOOL(php_mail(to_r, ZSTR_VAL(subject), message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL));
4663
4664 if (extra_cmd) {
4665 zend_string_release_ex(extra_cmd, 0);
4666 }
4667 if (to_r != to) {
4668 efree(to_r);
4669 }
4670 zend_string_release(subject);
4671 zend_string_free(conv);
4672 zend_hash_destroy(&ht_headers);
4673 if (str_headers) {
4674 zend_string_release_ex(str_headers, 0);
4675 }
4676 }
4677
4678 #undef CRLF
4679 #undef MAIL_ASCIIZ_CHECK_MBSTRING
4680 #undef PHP_MBSTR_MAIL_MIME_HEADER1
4681 #undef PHP_MBSTR_MAIL_MIME_HEADER2
4682 #undef PHP_MBSTR_MAIL_MIME_HEADER3
4683 #undef PHP_MBSTR_MAIL_MIME_HEADER4
4684 /* }}} */
4685
4686 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)4687 PHP_FUNCTION(mb_get_info)
4688 {
4689 zend_string *type = NULL;
4690 size_t n;
4691 char *name;
4692 zval row;
4693 const mbfl_encoding **entry;
4694 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
4695
4696 ZEND_ASSERT(lang);
4697
4698 ZEND_PARSE_PARAMETERS_START(0, 1)
4699 Z_PARAM_OPTIONAL
4700 Z_PARAM_STR(type)
4701 ZEND_PARSE_PARAMETERS_END();
4702
4703 if (!type || zend_string_equals_literal_ci(type, "all")) {
4704 array_init(return_value);
4705 if (MBSTRG(current_internal_encoding)) {
4706 add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4707 }
4708 if (MBSTRG(http_input_identify)) {
4709 add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4710 }
4711 if (MBSTRG(current_http_output_encoding)) {
4712 add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4713 }
4714
4715 add_assoc_str(return_value, "http_output_conv_mimetypes",
4716 zend_ini_str("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)
4717 );
4718
4719 name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4720 add_assoc_string(return_value, "mail_charset", name);
4721
4722 name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4723 add_assoc_string(return_value, "mail_header_encoding", name);
4724
4725 name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4726 add_assoc_string(return_value, "mail_body_encoding", name);
4727
4728 add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4729
4730 if (MBSTRG(encoding_translation)) {
4731 add_assoc_string(return_value, "encoding_translation", "On");
4732 } else {
4733 add_assoc_string(return_value, "encoding_translation", "Off");
4734 }
4735
4736 name = (char *)mbfl_no_language2name(MBSTRG(language));
4737 add_assoc_string(return_value, "language", name);
4738
4739 // TODO Seems to always have one entry at least?
4740 n = MBSTRG(current_detect_order_list_size);
4741 entry = MBSTRG(current_detect_order_list);
4742 if (n > 0) {
4743 size_t i;
4744 array_init(&row);
4745 for (i = 0; i < n; i++) {
4746 add_next_index_string(&row, (*entry)->name);
4747 entry++;
4748 }
4749 add_assoc_zval(return_value, "detect_order", &row);
4750 }
4751 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4752 add_assoc_string(return_value, "substitute_character", "none");
4753 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4754 add_assoc_string(return_value, "substitute_character", "long");
4755 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4756 add_assoc_string(return_value, "substitute_character", "entity");
4757 } else {
4758 add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4759 }
4760 if (MBSTRG(strict_detection)) {
4761 add_assoc_string(return_value, "strict_detection", "On");
4762 } else {
4763 add_assoc_string(return_value, "strict_detection", "Off");
4764 }
4765 } else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4766 ZEND_ASSERT(MBSTRG(current_internal_encoding));
4767 RETURN_STRING((char *)MBSTRG(current_internal_encoding)->name);
4768 } else if (zend_string_equals_literal_ci(type, "http_input")) {
4769 if (MBSTRG(http_input_identify)) {
4770 RETURN_STRING((char *)MBSTRG(http_input_identify)->name);
4771 }
4772 RETURN_NULL();
4773 } else if (zend_string_equals_literal_ci(type, "http_output")) {
4774 ZEND_ASSERT(MBSTRG(current_http_output_encoding));
4775 RETURN_STRING((char *)MBSTRG(current_http_output_encoding)->name);
4776 } else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4777 RETURN_STR(
4778 zend_ini_str(
4779 "mbstring.http_output_conv_mimetypes",
4780 sizeof("mbstring.http_output_conv_mimetypes") - 1,
4781 false
4782 )
4783 );
4784 } else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4785 name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4786 RETURN_STRING(name);
4787 } else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4788 name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4789 RETURN_STRING(name);
4790 } else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4791 name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4792 RETURN_STRING(name);
4793 } else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4794 RETURN_LONG(MBSTRG(illegalchars));
4795 } else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4796 if (MBSTRG(encoding_translation)) {
4797 RETURN_STRING("On");
4798 } else {
4799 RETURN_STRING("Off");
4800 }
4801 } else if (zend_string_equals_literal_ci(type, "language")) {
4802 name = (char *)mbfl_no_language2name(MBSTRG(language));
4803 RETURN_STRING(name);
4804 } else if (zend_string_equals_literal_ci(type, "detect_order")) {
4805 // TODO Seems to always have one entry at least?
4806 n = MBSTRG(current_detect_order_list_size);
4807 entry = MBSTRG(current_detect_order_list);
4808 if (n > 0) {
4809 size_t i;
4810 array_init(return_value);
4811 for (i = 0; i < n; i++) {
4812 add_next_index_string(return_value, (*entry)->name);
4813 entry++;
4814 }
4815 }
4816 } else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4817 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4818 RETURN_STRING("none");
4819 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4820 RETURN_STRING("long");
4821 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4822 RETURN_STRING("entity");
4823 } else {
4824 RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
4825 }
4826 } else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4827 if (MBSTRG(strict_detection)) {
4828 RETURN_STRING("On");
4829 } else {
4830 RETURN_STRING("Off");
4831 }
4832 } else {
4833 php_error_docref(NULL, E_WARNING, "argument #1 ($type) must be a valid type");
4834 RETURN_FALSE;
4835 }
4836 }
4837 /* }}} */
4838
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)4839 MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4840 {
4841 uint32_t wchar_buf[128];
4842 unsigned char *in = (unsigned char*)input;
4843 unsigned int state = 0;
4844
4845 if (encoding->check != NULL) {
4846 return encoding->check(in, length);
4847 }
4848
4849 /* If the input string is not encoded in the given encoding, there is a significant chance
4850 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4851 * buffer of 128 codepoints, convert and check just a few codepoints first */
4852 size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4853 ZEND_ASSERT(out_len <= 8);
4854 for (unsigned int i = 0; i < out_len; i++) {
4855 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4856 return false;
4857 }
4858 }
4859
4860 while (length) {
4861 out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4862 ZEND_ASSERT(out_len <= 128);
4863 for (unsigned int i = 0; i < out_len; i++) {
4864 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4865 return false;
4866 }
4867 }
4868 }
4869
4870 return true;
4871 }
4872
4873 /* MSVC 32-bit has issues with 64-bit intrinsics.
4874 * (Bad 7/8-byte UTF-8 strings would be wrongly passed through as 'valid')
4875 * It seems this is caused by a bug in MS Visual C++
4876 * Ref: https://stackoverflow.com/questions/37509129/potential-bug-in-visual-studio-c-compiler-or-in-intel-intrinsics-avx2-mm256-s */
4877 #if defined(PHP_WIN32) && !defined(__clang__) && defined(_MSC_VER) && defined(_M_IX86)
4878 # define MBSTRING_BROKEN_X86_MSVC_INTRINSICS
4879 #endif
4880
4881 /* If we are building an AVX2-only binary, don't compile the next function */
4882 #ifndef ZEND_INTRIN_AVX2_NATIVE
4883
4884 /* SSE2-based function for validating UTF-8 strings
4885 * A faster implementation which uses AVX2 instructions follows */
static bool mb_fast_check_utf8_default(zend_string *str)
{
	/* Check whether the content of `str` is valid UTF-8
	 * Returns false as soon as an invalid byte sequence is found, and true otherwise
	 * If __SSE2__ is defined, the string is examined in 16-byte blocks using SSE2 intrinsics;
	 * otherwise, a byte-by-byte scalar validator is compiled instead */
	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
# ifdef __SSE2__
	/* `e` points 1 byte past the last full 16-byte block of string content
	 * Note that we include the terminating null byte which is included in each zend_string
	 * as part of the content to check; this ensures that multi-byte characters which are
	 * truncated abruptly at the end of the string will be detected as invalid */
	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m128i) - 1));

	/* For checking for illegal bytes 0xF5-FF */
	const __m128i over_f5 = _mm_set1_epi8(-117);
	/* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
	const __m128i over_9f = _mm_set1_epi8(-97);
	/* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
	const __m128i over_8f = _mm_set1_epi8(-113);
	/* For checking for illegal bytes 0xC0-C1 */
	const __m128i find_c0 = _mm_set1_epi8(-64);
	const __m128i c0_to_c1 = _mm_set1_epi8(-126);
	/* For checking structure of continuation bytes */
	const __m128i find_e0 = _mm_set1_epi8(-32);
	const __m128i find_f0 = _mm_set1_epi8(-16);

	/* `last_block` holds the previously checked block, so that characters which straddle
	 * two blocks can be validated; it starts out as all zeroes (i.e. all ASCII) */
	__m128i last_block = _mm_setzero_si128();
	__m128i operand;

	while (p < e) {
		operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */

		/* The code for handling the trailing bytes (at the bottom of this function) also jumps
		 * here, after building an `operand` which holds the trailing bytes, zero-padded */
check_operand:
		/* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
		if (!_mm_movemask_epi8(operand)) {
			/* Even if this block only contains single-byte characters, there may have been a
			 * multi-byte character at the end of the previous block, which was supposed to
			 * have continuation bytes in this block
			 * This bitmask will pick out a 2/3/4-byte character starting from the last byte of
			 * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
			 * from the 3rd last */
			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
			if (_mm_movemask_epi8(bad)) {
				return false;
			}

			/* Consume as many full blocks of single-byte characters as we can */
			while (true) {
				p += sizeof(__m128i);
				if (p >= e) {
					goto finish_up_remaining_bytes;
				}
				operand = _mm_loadu_si128((__m128i*)p);
				if (_mm_movemask_epi8(operand)) {
					break;
				}
			}
		}

		/* Check for >= 0xF5, which are illegal byte values in UTF-8
		 * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
		 * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
		 * Then a single signed compare will pick out any bad bytes
		 * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
		__m128i bad = _mm_cmplt_epi8(_mm_add_epi8(operand, over_f5), over_f5);

		/* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
		 * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
		 * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
		 * We can check for both problems at once by generating a vector where each byte < 0xA0
		 * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
		 * Shift the original block right by one byte, and compare the shifted block with the bitmask
		 * (`operand2` is the shifted block: each position holds the byte which comes just before
		 * the corresponding byte of `operand`, with the first position filled in from the
		 * last byte of `last_block`) */
		__m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15));
		__m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f)));
		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask1));

		/* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
		 * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
		 * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
		 * Build the bitmask and compare it with the shifted block */
		__m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f)));
		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask2));

		/* Check for overlong 2-byte code units
		 * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
		 * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
		 * byte range, do a signed compare to pick out any bad bytes */
		bad = _mm_or_si128(bad, _mm_cmplt_epi8(_mm_add_epi8(operand, find_c0), c0_to_c1));

		/* Check structure of continuation bytes
		 * A UTF-8 byte should be a continuation byte if, and only if, it is:
		 * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
		 * 2) 2 bytes after the start of a 3-byte or 4-byte character
		 * 3) 3 bytes after the start of a 4-byte character
		 * We build 3 bitmasks with 0xFF in each such position, and OR them together to
		 * get a single bitmask with 0xFF in each position where a continuation byte should be */
		__m128i cont_mask = _mm_cmpeq_epi8(_mm_and_si128(operand2, find_c0), find_c0);
		__m128i operand3 = _mm_or_si128(_mm_slli_si128(operand, 2), _mm_srli_si128(last_block, 14));
		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand3, find_e0), find_e0));
		__m128i operand4 = _mm_or_si128(_mm_slli_si128(operand, 3), _mm_srli_si128(last_block, 13));
		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand4, find_f0), find_f0));

		/* Now, use a signed comparison to get another bitmask with 0xFF in each position where
		 * a continuation byte actually is
		 * XOR those two bitmasks together; if everything is good, the result should be zero
		 * However, if a byte which should have been a continuation wasn't, or if a byte which
		 * shouldn't have been a continuation was, we will get 0xFF in that position */
		__m128i continuation = _mm_cmplt_epi8(operand, find_c0);
		bad = _mm_or_si128(bad, _mm_xor_si128(continuation, cont_mask));

		/* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
		 * If that value is non-zero, then we found a bad byte somewhere! */
		if (_mm_movemask_epi8(bad)) {
			return false;
		}

		last_block = operand;
		p += sizeof(__m128i);
	}

finish_up_remaining_bytes:
	/* Finish up 1-15 remaining bytes
	 * `p == e` means that the trailing bytes have not been checked yet
	 * After the trailing bytes have been sent through `check_operand` above, `p` has been
	 * advanced past `e`, so we skip this section and return true */
	if (p == e) {
		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m128i) - 1); /* Not including terminating null */

		/* Crazy hack here for cases where 5, 6, or 9-14 bytes are remaining...
		 * We want to use the above vectorized code to check a block of less than 16 bytes,
		 * but there is no good way to read a variable number of bytes into an XMM register
		 * However, we know that these bytes are part of a zend_string, and a zend_string has some
		 * 'header' fields which occupy the memory just before its content
		 * And, those header fields occupy more than 16 bytes...
		 * So if we go back 16 bytes from the end of the zend_string content, and load 16 bytes from there,
		 * we may pick up some 'junk' bytes from the zend_string header fields, but we will get the
		 * bytes we wanted (including the terminating null) in the tail end of our XMM register, and
		 * this will never cause a segfault.
		 * Then, we do a byte shift to get rid of the unwanted bytes; this moves the wanted bytes
		 * to the low end of the XMM register
		 * Conveniently, the same shift also zero-fills the other end of the XMM register
		 *
		 * The following `switch` looks useless, but it's not
		 * The PSRLDQ instruction used for the 128-bit byte shift requires an immediate (literal)
		 * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
		 *
		 * For the other cases, a 2-byte, 4-byte or 8-byte scalar load starting from `p` is enough
		 * to pick up all the remaining bytes, along with the terminating null
		 */
		switch (remaining_bytes) {
		case 0: ;
			/* Only the terminating null is left; just confirm that the previous block
			 * did not end with an incomplete multi-byte character */
			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
			return _mm_movemask_epi8(bad) == 0;
		case 1:
		case 2:
			operand = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, *((uint16_t*)p));
			goto check_operand;
		case 3:
		case 4:
			operand = _mm_set_epi32(0, 0, 0, *((uint32_t*)p));
			goto check_operand;
		case 5:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10);
			goto check_operand;
		case 6:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9);
			goto check_operand;
		case 7:
		case 8:
#ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
			/* Build the low 64 bits out of two 32-bit halves instead of using _mm_set_epi64x */
			operand = _mm_set_epi32(0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
#else
			operand = _mm_set_epi64x(0, *((uint64_t*)p));
#endif
			goto check_operand;
		case 9:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6);
			goto check_operand;
		case 10:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5);
			goto check_operand;
		case 11:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4);
			goto check_operand;
		case 12:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3);
			goto check_operand;
		case 13:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2);
			goto check_operand;
		case 14:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1);
			goto check_operand;
		case 15:
			/* No trailing bytes are left which need to be checked
			 * We get 15 because we did not include the terminating null when
			 * calculating `remaining_bytes`, so the value wraps around */
			return true;
		}

		ZEND_UNREACHABLE();
	}

	return true;
# else
	/* This UTF-8 validation function is derived from PCRE2 */
	size_t length = ZSTR_LEN(str);
	/* Table of the number of extra bytes, indexed by the first byte masked with
	   0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
	static const uint8_t utf8_table[] = {
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
		3,3,3,3,3,3,3,3
	};

	/* Each iteration validates one complete character; `length` counts the bytes
	 * which have not yet been consumed */
	for (; length > 0; p++) {
		uint32_t d;
		unsigned char c = *p;
		length--;

		if (c < 128) {
			/* ASCII character */
			continue;
		}

		if (c < 0xc0) {
			/* Isolated 10xx xxxx byte */
			return false;
		}

		if (c >= 0xf5) {
			/* Byte values 0xF5-FF are never legal in UTF-8 */
			return false;
		}

		uint32_t ab = utf8_table[c & 0x3f]; /* Number of additional bytes (1-3) */
		if (length < ab) {
			/* Missing bytes */
			return false;
		}
		length -= ab;

		/* Check top bits in the second byte */
		if (((d = *(++p)) & 0xc0) != 0x80) {
			return false;
		}

		/* For each length, check that the remaining bytes start with the 0x80 bit
		 * set and not the 0x40 bit. Then check for an overlong sequence, and for the
		 * excluded range 0xd800 to 0xdfff. */
		switch (ab) {
		case 1:
			/* 2-byte character. No further bytes to check for 0x80. Check first byte
			 * for xx00 000x (overlong sequence). */
			if ((c & 0x3e) == 0) {
				return false;
			}
			break;

		case 2:
			/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes for
			 * 1110 0000, xx0x xxxx (overlong sequence) or 1110 1101, 1010 xxxx (0xd800-0xdfff) */
			if ((*(++p) & 0xc0) != 0x80 || (c == 0xe0 && (d & 0x20) == 0) || (c == 0xed && d >= 0xa0)) {
				return false;
			}
			break;

		case 3:
			/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
			 * bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
			 * character greater than 0x0010ffff (f4 8f bf bf)
			 * (`c > 0xf4` cannot be true here, since bytes >= 0xf5 were already rejected above) */
			if ((*(++p) & 0xc0) != 0x80 || (*(++p) & 0xc0) != 0x80 || (c == 0xf0 && (d & 0x30) == 0) || (c > 0xf4 || (c == 0xf4 && d > 0x8f))) {
				return false;
			}
			break;

		EMPTY_SWITCH_DEFAULT_CASE();
		}
	}

	return true;
# endif
}
5160
5161 #endif /* #ifndef ZEND_INTRIN_AVX2_NATIVE */
5162
5163 #ifdef ZEND_INTRIN_AVX2_NATIVE
5164
5165 /* We are building AVX2-only binary */
5166 # include <immintrin.h>
5167 # define mb_fast_check_utf8 mb_fast_check_utf8_avx2
5168
5169 #elif defined(ZEND_INTRIN_AVX2_RESOLVER)
5170
5171 /* We are building binary which works with or without AVX2; whether or not to use
5172 * AVX2-accelerated functions will be determined at runtime */
5173 # include <immintrin.h>
5174 # include "Zend/zend_cpuinfo.h"
5175
5176 # ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
5177 /* Dynamic linker will decide whether or not to use AVX2-based functions and
5178 * resolve symbols accordingly */
5179
5180 ZEND_INTRIN_AVX2_FUNC_DECL(bool mb_fast_check_utf8_avx2(zend_string *str));
5181
5182 bool mb_fast_check_utf8(zend_string *str) __attribute__((ifunc("resolve_check_utf8")));
5183
5184 typedef bool (*check_utf8_func_t)(zend_string*);
5185
5186 ZEND_NO_SANITIZE_ADDRESS
5187 ZEND_ATTRIBUTE_UNUSED
resolve_check_utf8(void)5188 static check_utf8_func_t resolve_check_utf8(void)
5189 {
5190 if (zend_cpu_supports_avx2()) {
5191 return mb_fast_check_utf8_avx2;
5192 }
5193 return mb_fast_check_utf8_default;
5194 }
5195
5196 # else /* ZEND_INTRIN_AVX2_FUNC_PTR */
5197 /* We are compiling for a target where the dynamic linker will not be able to
5198 * resolve symbols according to whether the host supports AVX2 or not; so instead,
5199 * we can make calls go through a function pointer and set the function pointer
5200 * on module load */
5201
5202 #ifdef HAVE_FUNC_ATTRIBUTE_TARGET
5203 static bool mb_fast_check_utf8_avx2(zend_string *str) __attribute__((target("avx2")));
5204 #else
5205 static bool mb_fast_check_utf8_avx2(zend_string *str);
5206 #endif
5207
5208 static bool (*check_utf8_ptr)(zend_string *str) = NULL;
5209
static bool mb_fast_check_utf8(zend_string *str)
{
	/* Forward to whichever implementation `check_utf8_ptr` currently points at
	 * The pointer starts out as NULL and is assigned by init_check_utf8() */
	return (*check_utf8_ptr)(str);
}
5214
init_check_utf8(void)5215 static void init_check_utf8(void)
5216 {
5217 if (zend_cpu_supports_avx2()) {
5218 check_utf8_ptr = mb_fast_check_utf8_avx2;
5219 } else {
5220 check_utf8_ptr = mb_fast_check_utf8_default;
5221 }
5222 }
5223 # endif
5224
5225 #else
5226
5227 /* No AVX2 support */
5228 #define mb_fast_check_utf8 mb_fast_check_utf8_default
5229
5230 #endif
5231
5232 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
5233
5234 /* GCC prior to version 8 does not define all intrinsics. See GH-11514.
5235 * Use a workaround from https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values */
5236 #if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 8
5237 # define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
5238 #endif
5239
/* Build a 256-bit value which consists of the last `shift` bytes of the previous
 * 32-byte block (passed as `hi`), followed by the first `32 - shift` bytes of the
 * current 32-byte block (passed as `lo`)
 * In other words, each byte position of the result holds the byte which comes
 * `shift` positions earlier in the input
 * `shift` must be a compile-time constant from 0 to 16
 *
 * _mm256_alignr_epi8 (VPALIGNR) is used to shift out bytes from a 256-bit
 * YMM register while shifting in bytes from another YMM register... but
 * it works separately on respective 128-bit halves of the YMM registers,
 * which is not what we want.
 * To make it work as desired, we first do _mm256_permute2x128_si256
 * (VPERM2I128) with control value 33 (0x21) to combine the high 128 bits
 * of the previous block and the low 128 bits of the current block in one
 * YMM register.
 * Then VPALIGNR will do what is needed.
 *
 * Macro arguments are parenthesized so that an expression can safely be
 * passed for any of them. */
#define _mm256_shift_epi8(hi, lo, shift) _mm256_alignr_epi8((lo), _mm256_permute2x128_si256((hi), (lo), 33), 16 - (shift))
5254
5255 /* AVX2-based UTF-8 validation function; validates text in 32-byte chunks
5256 *
5257 * Some parts of this function are the same as `mb_fast_check_utf8`; code comments
5258 * are not repeated, so consult `mb_fast_check_utf8` for information on uncommented
5259 * sections. */
5260 #ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
mb_fast_check_utf8_avx2(zend_string * str)5261 ZEND_API bool mb_fast_check_utf8_avx2(zend_string *str)
5262 #else
5263 static bool mb_fast_check_utf8_avx2(zend_string *str)
5264 #endif
5265 {
5266 unsigned char *p = (unsigned char*)ZSTR_VAL(str);
5267 unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m256i) - 1));
5268
5269 /* The algorithm used here for UTF-8 validation is partially adapted from the
5270 * paper "Validating UTF-8 In Less Than One Instruction Per Byte", by John Keiser
5271 * and Daniel Lemire.
5272 * Ref: https://arxiv.org/pdf/2010.03090.pdf
5273 *
5274 * Most types of invalid UTF-8 text can be detected by examining pairs of
5275 * successive bytes. Specifically:
5276 *
5277 * • Overlong 2-byte code units start with 0xC0 or 0xC1.
5278 * No valid UTF-8 string ever uses these byte values.
5279 * • Overlong 3-byte code units start with 0xE0, followed by a byte < 0xA0.
5280 * • Overlong 4-byte code units start with 0xF0, followed by a byte < 0x90.
5281 * • 5-byte or 6-byte code units, which should never be used, start with
5282 * 0xF8-FE.
5283 * • A codepoint value higher than U+10FFFF, which is the highest value for
5284 * any Unicode codepoint, would either start with 0xF4, followed by a
5285 * byte >= 0x90, or else would start with 0xF5-F7, followed by any value.
5286 * • A codepoint value from U+D800-DFFF, which are reserved and should never
5287 * be used, would start with 0xED, followed by a byte >= 0xA0.
5288 * • The byte value 0xFF is also illegal and is never used in valid UTF-8.
5289 *
5290 * To detect all these problems, for each pair of successive bytes, we do
5291 * table lookups using the high nibble of the first byte, the low nibble of
5292 * the first byte, and the high nibble of the second byte. Each table lookup
5293 * retrieves a bitmask, in which each 1 bit indicates a possible invalid
5294 * combination; AND those three bitmasks together, and any 1 bit in the result
5295 * will indicate an actual invalid byte combination was found.
5296 */
5297
5298 #define BAD_BYTE 0x1
5299 #define OVERLONG_2BYTE 0x2
5300 #define _1BYTE (BAD_BYTE | OVERLONG_2BYTE)
5301 #define OVERLONG_3BYTE 0x4
5302 #define SURROGATE 0x8
5303 #define OVERLONG_4BYTE 0x10
5304 #define INVALID_CP 0x20
5305
5306 /* Each of these are 16-entry tables, repeated twice; this is required by the
5307 * VPSHUFB instruction which we use to perform 32 table lookups in parallel
5308 * The first entry is for 0xF, the second is for 0xE, and so on down to 0x0
5309 *
5310 * So, for example, notice that the 4th entry in the 1st table is OVERLONG_2BYTE;
5311 * that means that high nibble 0xC is consistent with the byte pair being part of
5312 * an overlong 2-byte code unit */
5313 const __m256i bad_hi_nibble2 = _mm256_set_epi8(
5314 BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5315 0, 0, 0, 0,
5316 0, 0, 0, 0,
5317 0, 0, 0, 0,
5318 BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5319 0, 0, 0, 0,
5320 0, 0, 0, 0,
5321 0, 0, 0, 0);
5322 const __m256i bad_lo_nibble2 = _mm256_set_epi8(
5323 BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5324 BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5325 BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5326 0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5327 BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5328 BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5329 BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5330 0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5331 const __m256i bad_hi_nibble = _mm256_set_epi8(
5332 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5333 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5334 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5335 _1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5336 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5337 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5338 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5339 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5340 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5341 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5342 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5343 _1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5344 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5345 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5346 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5347 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5348
5349 const __m256i find_continuation = _mm256_set1_epi8(-64);
5350 const __m256i _b = _mm256_set1_epi8(0xB);
5351 const __m256i _d = _mm256_set1_epi8(0xD);
5352 const __m256i _f = _mm256_set1_epi8(0xF);
5353
5354 __m256i last_hi_nibbles = _mm256_setzero_si256(), last_lo_nibbles = _mm256_setzero_si256();
5355 __m256i operand;
5356
5357 while (p < e) {
5358 operand = _mm256_loadu_si256((__m256i*)p);
5359
5360 check_operand:
5361 if (!_mm256_movemask_epi8(operand)) {
5362 /* Entire 32-byte block is ASCII characters; the only thing we need to validate is that
5363 * the previous block didn't end with an incomplete multi-byte character
5364 * (This will also confirm that the previous block didn't end with a bad byte like 0xFF) */
5365 __m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5366 __m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5367 if (_mm256_movemask_epi8(bad)) {
5368 return false;
5369 }
5370
5371 /* Consume as many full blocks of single-byte characters as we can */
5372 while (true) {
5373 p += sizeof(__m256i);
5374 if (p >= e) {
5375 goto finish_up_remaining_bytes;
5376 }
5377 operand = _mm256_loadu_si256((__m256i*)p);
5378 if (_mm256_movemask_epi8(operand)) {
5379 break;
5380 }
5381 }
5382 }
5383
5384 __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(operand, 4), _f);
5385 __m256i lo_nibbles = _mm256_and_si256(operand, _f);
5386
5387 __m256i lo_nibbles2 = _mm256_shift_epi8(last_lo_nibbles, lo_nibbles, 1);
5388 __m256i hi_nibbles2 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 1);
5389
5390 /* Do parallel table lookups in all 3 tables */
5391 __m256i bad = _mm256_cmpgt_epi8(
5392 _mm256_and_si256(
5393 _mm256_and_si256(
5394 _mm256_shuffle_epi8(bad_lo_nibble2, lo_nibbles2),
5395 _mm256_shuffle_epi8(bad_hi_nibble2, hi_nibbles2)),
5396 _mm256_shuffle_epi8(bad_hi_nibble, hi_nibbles)),
5397 _mm256_setzero_si256());
5398
5399 __m256i cont_mask = _mm256_cmpgt_epi8(hi_nibbles2, _b);
5400 __m256i hi_nibbles3 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 2);
5401 cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpgt_epi8(hi_nibbles3, _d));
5402 __m256i hi_nibbles4 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 3);
5403 cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpeq_epi8(hi_nibbles4, _f));
5404
5405 __m256i continuation = _mm256_cmpgt_epi8(find_continuation, operand);
5406 bad = _mm256_or_si256(bad, _mm256_xor_si256(continuation, cont_mask));
5407
5408 if (_mm256_movemask_epi8(bad)) {
5409 return false;
5410 }
5411
5412 last_hi_nibbles = hi_nibbles;
5413 last_lo_nibbles = lo_nibbles;
5414 p += sizeof(__m256i);
5415 }
5416
5417 finish_up_remaining_bytes:
5418 if (p == e) {
5419 uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m256i) - 1); /* Not including terminating null */
5420
5421 switch (remaining_bytes) {
5422 case 0: ;
5423 /* No actual data bytes are remaining */
5424 __m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5425 __m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5426 return _mm256_movemask_epi8(bad) == 0;
5427 case 1:
5428 case 2:
5429 operand = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, *((int16_t*)p));
5430 goto check_operand;
5431 case 3:
5432 case 4:
5433 operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, *((int32_t*)p));
5434 goto check_operand;
5435 case 5:
5436 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10));
5437 goto check_operand;
5438 case 6:
5439 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9));
5440 goto check_operand;
5441 case 7:
5442 case 8:
5443 #ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
5444 operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
5445 #else
5446 operand = _mm256_set_epi64x(0, 0, 0, *((int64_t*)p));
5447 #endif
5448 goto check_operand;
5449 case 9:
5450 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6));
5451 goto check_operand;
5452 case 10:
5453 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5));
5454 goto check_operand;
5455 case 11:
5456 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4));
5457 goto check_operand;
5458 case 12:
5459 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3));
5460 goto check_operand;
5461 case 13:
5462 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2));
5463 goto check_operand;
5464 case 14:
5465 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1));
5466 goto check_operand;
5467 case 15:
5468 case 16:
5469 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_loadu_si128((__m128i*)p));
5470 goto check_operand;
5471 case 17:
5472 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 2)), 14), _mm_loadu_si128((__m128i*)p));
5473 goto check_operand;
5474 case 18:
5475 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 3)), 13), _mm_loadu_si128((__m128i*)p));
5476 goto check_operand;
5477 case 19:
5478 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 4)), 12), _mm_loadu_si128((__m128i*)p));
5479 goto check_operand;
5480 case 20:
5481 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 5)), 11), _mm_loadu_si128((__m128i*)p));
5482 goto check_operand;
5483 case 21:
5484 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 6)), 10), _mm_loadu_si128((__m128i*)p));
5485 goto check_operand;
5486 case 22:
5487 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 7)), 9), _mm_loadu_si128((__m128i*)p));
5488 goto check_operand;
5489 case 23:
5490 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 8)), 8), _mm_loadu_si128((__m128i*)p));
5491 goto check_operand;
5492 case 24:
5493 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 9)), 7), _mm_loadu_si128((__m128i*)p));
5494 goto check_operand;
5495 case 25:
5496 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 10)), 6), _mm_loadu_si128((__m128i*)p));
5497 goto check_operand;
5498 case 26:
5499 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 11)), 5), _mm_loadu_si128((__m128i*)p));
5500 goto check_operand;
5501 case 27:
5502 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 12)), 4), _mm_loadu_si128((__m128i*)p));
5503 goto check_operand;
5504 case 28:
5505 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 13)), 3), _mm_loadu_si128((__m128i*)p));
5506 goto check_operand;
5507 case 29:
5508 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 14)), 2), _mm_loadu_si128((__m128i*)p));
5509 goto check_operand;
5510 case 30:
5511 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 15)), 1), _mm_loadu_si128((__m128i*)p));
5512 goto check_operand;
5513 case 31:
5514 return true;
5515 }
5516
5517 ZEND_UNREACHABLE();
5518 }
5519
5520 return true;
5521 }
5522
5523 #endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
5524
/* Check whether `str` is valid in `encoding`.
 *
 * UTF-8 gets a dedicated path: a string already flagged as valid UTF-8 is
 * accepted immediately; otherwise it is checked with mb_fast_check_utf8() and,
 * on success, the IS_STR_VALID_UTF8 flag is added (except on interned strings).
 * Every other encoding is delegated to php_mb_check_encoding(). */
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
{
	if (encoding != &mbfl_encoding_utf8) {
		return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
	}

	if (ZSTR_IS_VALID_UTF8(str)) {
		/* Validity flag is already set; no need to scan the bytes */
		return true;
	}

	if (!mb_fast_check_utf8(str)) {
		return false;
	}

	/* Record the positive result on the string itself, but never on interned strings */
	if (!ZSTR_IS_INTERNED(str)) {
		GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
	}
	return true;
}
5540
/* Recursively check that every string key and every string value stored in
 * `vars` (and in any nested arrays) is valid in `encoding`.
 *
 * Integer, float, null and boolean values are accepted without inspection;
 * any other non-string, non-array value makes the check fail.
 * If `vars` is already marked as being visited (circular reference), an
 * E_WARNING is raised and false is returned.
 *
 * Scanning stops at the first invalid key or value.
 *
 * Returns true when everything checked is valid, false otherwise. */
static bool php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	zend_long idx;
	zend_string *key;
	zval *entry;
	bool valid = true;

	(void)(idx); /* Suppress spurious compiler warning that `idx` is not used */

	if (GC_IS_RECURSIVE(vars)) {
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return false;
	}
	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
		ZVAL_DEREF(entry);
		if (key && !mb_check_str_encoding(key, encoding)) {
			valid = false;
			break;
		}
		switch (Z_TYPE_P(entry)) {
			case IS_STRING:
				valid = mb_check_str_encoding(Z_STR_P(entry), encoding);
				break;
			case IS_ARRAY:
				valid = php_mb_check_encoding_recursive(Z_ARRVAL_P(entry), encoding);
				break;
			case IS_LONG:
			case IS_DOUBLE:
			case IS_NULL:
			case IS_TRUE:
			case IS_FALSE:
				break;
			default:
				/* Other types are error. */
				valid = false;
				break;
		}
		/* A `break` inside the switch above only leaves the switch, so the
		 * early exit from the loop has to happen here */
		if (!valid) {
			break;
		}
	} ZEND_HASH_FOREACH_END();
	/* Always reached after the loop, so the protection flag is cleared on every path */
	GC_TRY_UNPROTECT_RECURSION(vars);
	return valid;
}
5591
5592 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)5593 PHP_FUNCTION(mb_check_encoding)
5594 {
5595 zend_string *input_str = NULL, *enc = NULL;
5596 HashTable *input_ht = NULL;
5597 const mbfl_encoding *encoding;
5598
5599 ZEND_PARSE_PARAMETERS_START(0, 2)
5600 Z_PARAM_OPTIONAL
5601 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
5602 Z_PARAM_STR_OR_NULL(enc)
5603 ZEND_PARSE_PARAMETERS_END();
5604
5605 encoding = php_mb_get_encoding(enc, 2);
5606 if (!encoding) {
5607 RETURN_THROWS();
5608 }
5609
5610 if (input_ht) {
5611 RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
5612 } else if (input_str) {
5613 RETURN_BOOL(mb_check_str_encoding(input_str, encoding));
5614 } else {
5615 php_error_docref(NULL, E_DEPRECATED,
5616 "Calling mb_check_encoding() without argument is deprecated");
5617
5618 /* FIXME: Actually check all inputs, except $_FILES file content. */
5619 RETURN_BOOL(MBSTRG(illegalchars) == 0);
5620 }
5621 }
5622 /* }}} */
5623
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)5624 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
5625 const uint32_t enc_name_arg_num)
5626 {
5627 const mbfl_encoding *enc;
5628 enum mbfl_no_encoding no_enc;
5629
5630 ZEND_ASSERT(str_len > 0);
5631
5632 enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
5633 if (!enc) {
5634 return -2;
5635 }
5636
5637 no_enc = enc->no_encoding;
5638 if (php_mb_is_unsupported_no_encoding(no_enc)) {
5639 zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
5640 return -2;
5641 }
5642
5643 /* Some legacy text encodings have a minimum required wchar buffer size;
5644 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
5645 uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
5646 unsigned int state = 0;
5647 size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
5648 ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
5649
5650 if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
5651 return -1;
5652 }
5653 return wchar_buf[0];
5654 }
5655
5656 /* {{{ */
PHP_FUNCTION(mb_ord)5657 PHP_FUNCTION(mb_ord)
5658 {
5659 char *str;
5660 size_t str_len;
5661 zend_string *enc = NULL;
5662 zend_long cp;
5663
5664 ZEND_PARSE_PARAMETERS_START(1, 2)
5665 Z_PARAM_STRING(str, str_len)
5666 Z_PARAM_OPTIONAL
5667 Z_PARAM_STR_OR_NULL(enc)
5668 ZEND_PARSE_PARAMETERS_END();
5669
5670 if (str_len == 0) {
5671 zend_argument_value_error(1, "must not be empty");
5672 RETURN_THROWS();
5673 }
5674
5675 cp = php_mb_ord(str, str_len, enc, 2);
5676
5677 if (0 > cp) {
5678 if (cp == -2) {
5679 RETURN_THROWS();
5680 }
5681 RETURN_FALSE;
5682 }
5683
5684 RETURN_LONG(cp);
5685 }
5686 /* }}} */
5687
/* Build a string holding the single codepoint `cp` in the encoding named by
 * `enc_name` (`enc_name_arg_num` is passed through to php_mb_get_encoding()).
 *
 * Returns NULL when:
 *   - the encoding lookup fails,
 *   - the encoding is not supported by mb_chr() (a ValueError is raised),
 *   - `cp` is outside 0..0x10ffff,
 *   - the encoding is UTF-8 and `cp` lies in 0xd800..0xdfff,
 *   - the conversion from UCS-4BE bumps the `illegalchars` counter.
 * Otherwise returns the resulting string. */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *encoding = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (encoding == NULL) {
		return NULL;
	}

	enum mbfl_no_encoding no_enc = encoding->no_encoding;
	if (php_mb_is_unsupported_no_encoding(no_enc)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", encoding->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	if (!php_mb_is_no_encoding_utf8(no_enc)) {
		/* Generic path: serialize the codepoint as big-endian UCS-4 and let the
		 * converter produce the target encoding */
		char ucs4be[4];
		ucs4be[0] = (cp >> 24) & 0xff;
		ucs4be[1] = (cp >> 16) & 0xff;
		ucs4be[2] = (cp >> 8) & 0xff;
		ucs4be[3] = cp & 0xff;

		/* Zero the counter so that a non-zero value afterwards can be
		 * attributed to this conversion; it is restored before returning */
		long saved_illegalchars = MBSTRG(illegalchars);
		MBSTRG(illegalchars) = 0;
		zend_string *converted = php_mb_convert_encoding_ex(ucs4be, 4, encoding, &mbfl_encoding_ucs4be);

		if (MBSTRG(illegalchars) != 0) {
			zend_string_release(converted);
			converted = NULL;
		}

		MBSTRG(illegalchars) = saved_illegalchars;
		return converted;
	}

	/* UTF-8 path: encode by hand */
	if (cp >= 0xd800 && cp <= 0xdfff) {
		return NULL;
	}

	if (cp < 0x80) {
		return ZSTR_CHAR(cp);
	}

	zend_string *ret;
	char *bytes;
	if (cp < 0x800) {
		ret = zend_string_alloc(2, 0);
		bytes = ZSTR_VAL(ret);
		bytes[0] = 0xc0 | (cp >> 6);
		bytes[1] = 0x80 | (cp & 0x3f);
		bytes[2] = 0;
	} else if (cp < 0x10000) {
		ret = zend_string_alloc(3, 0);
		bytes = ZSTR_VAL(ret);
		bytes[0] = 0xe0 | (cp >> 12);
		bytes[1] = 0x80 | ((cp >> 6) & 0x3f);
		bytes[2] = 0x80 | (cp & 0x3f);
		bytes[3] = 0;
	} else {
		ret = zend_string_alloc(4, 0);
		bytes = ZSTR_VAL(ret);
		bytes[0] = 0xf0 | (cp >> 18);
		bytes[1] = 0x80 | ((cp >> 12) & 0x3f);
		bytes[2] = 0x80 | ((cp >> 6) & 0x3f);
		bytes[3] = 0x80 | (cp & 0x3f);
		bytes[4] = 0;
	}

	return ret;
}
5757
5758 /* {{{ */
PHP_FUNCTION(mb_chr)5759 PHP_FUNCTION(mb_chr)
5760 {
5761 zend_long cp;
5762 zend_string *enc = NULL;
5763
5764 ZEND_PARSE_PARAMETERS_START(1, 2)
5765 Z_PARAM_LONG(cp)
5766 Z_PARAM_OPTIONAL
5767 Z_PARAM_STR_OR_NULL(enc)
5768 ZEND_PARSE_PARAMETERS_END();
5769
5770 zend_string* ret = php_mb_chr(cp, enc, 2);
5771 if (ret == NULL) {
5772 RETURN_FALSE;
5773 }
5774
5775 RETURN_STR(ret);
5776 }
5777 /* }}} */
5778
PHP_FUNCTION(mb_str_pad)5779 PHP_FUNCTION(mb_str_pad)
5780 {
5781 zend_string *input, *encoding_str = NULL, *pad = ZSTR_CHAR(' ');
5782 zend_long pad_to_length;
5783 zend_long pad_type_val = PHP_STR_PAD_RIGHT;
5784
5785 ZEND_PARSE_PARAMETERS_START(2, 5)
5786 Z_PARAM_STR(input)
5787 Z_PARAM_LONG(pad_to_length)
5788 Z_PARAM_OPTIONAL
5789 Z_PARAM_STR(pad)
5790 Z_PARAM_LONG(pad_type_val)
5791 Z_PARAM_STR_OR_NULL(encoding_str)
5792 ZEND_PARSE_PARAMETERS_END();
5793
5794 const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
5795 if (!encoding) {
5796 RETURN_THROWS();
5797 }
5798
5799 size_t input_length = mb_get_strlen(input, encoding);
5800
5801 /* If resulting string turns out to be shorter than input string,
5802 we simply copy the input and return. */
5803 if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
5804 RETURN_STR_COPY(input);
5805 }
5806
5807 if (ZSTR_LEN(pad) == 0) {
5808 zend_argument_value_error(3, "must be a non-empty string");
5809 RETURN_THROWS();
5810 }
5811
5812 if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
5813 zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
5814 RETURN_THROWS();
5815 }
5816
5817 size_t pad_length = mb_get_strlen(pad, encoding);
5818
5819 size_t num_mb_pad_chars = pad_to_length - input_length;
5820
5821 /* We need to figure out the left/right padding lengths. */
5822 size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
5823 switch (pad_type_val) {
5824 case PHP_STR_PAD_RIGHT:
5825 right_pad = num_mb_pad_chars;
5826 break;
5827
5828 case PHP_STR_PAD_LEFT:
5829 left_pad = num_mb_pad_chars;
5830 break;
5831
5832 case PHP_STR_PAD_BOTH:
5833 left_pad = num_mb_pad_chars / 2;
5834 right_pad = num_mb_pad_chars - left_pad;
5835 break;
5836 }
5837
5838 /* How many full block copies need to happen, and how many characters are then left over? */
5839 size_t full_left_pad_copies = left_pad / pad_length;
5840 size_t full_right_pad_copies = right_pad / pad_length;
5841 size_t remaining_left_pad_chars = left_pad % pad_length;
5842 size_t remaining_right_pad_chars = right_pad % pad_length;
5843
5844 if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
5845 goto overflow_no_release;
5846 }
5847
5848 /* Compute the number of bytes required for the padding */
5849 size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
5850 size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);
5851
5852 /* No special fast-path handling necessary for zero-length pads because these functions will not
5853 * allocate memory in case a zero-length pad is required. */
5854 zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
5855 zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);
5856
5857 if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
5858 || full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
5859 goto overflow;
5860 }
5861
5862 size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
5863 size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);
5864
5865 if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
5866 || ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
5867 goto overflow;
5868 }
5869
5870 zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
5871 char *buffer = ZSTR_VAL(result);
5872
5873 /* First we pad the left. */
5874 for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5875 memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5876 }
5877 memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
5878 buffer += ZSTR_LEN(remaining_left_pad_str);
5879
5880 /* Then we copy the input string. */
5881 memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
5882 buffer += ZSTR_LEN(input);
5883
5884 /* Finally, we pad on the right. */
5885 for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5886 memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5887 }
5888 memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));
5889
5890 ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';
5891
5892 zend_string_release_ex(remaining_left_pad_str, false);
5893 zend_string_release_ex(remaining_right_pad_str, false);
5894
5895 RETURN_NEW_STR(result);
5896
5897 overflow:
5898 zend_string_release_ex(remaining_left_pad_str, false);
5899 zend_string_release_ex(remaining_right_pad_str, false);
5900 overflow_no_release:
5901 zend_throw_error(NULL, "String size overflow");
5902 RETURN_THROWS();
5903 }
5904
5905 /* {{{ */
PHP_FUNCTION(mb_scrub)5906 PHP_FUNCTION(mb_scrub)
5907 {
5908 zend_string *str, *enc_name = NULL;
5909
5910 ZEND_PARSE_PARAMETERS_START(1, 2)
5911 Z_PARAM_STR(str)
5912 Z_PARAM_OPTIONAL
5913 Z_PARAM_STR_OR_NULL(enc_name)
5914 ZEND_PARSE_PARAMETERS_END();
5915
5916 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
5917 if (!enc) {
5918 RETURN_THROWS();
5919 }
5920
5921 if (enc == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
5922 /* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
5923 RETURN_STR_COPY(str);
5924 }
5925
5926 RETURN_STR(php_mb_convert_encoding_ex(ZSTR_VAL(str), ZSTR_LEN(str), enc, enc));
5927 }
5928 /* }}} */
5929
5930 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)5931 static void php_mb_populate_current_detect_order_list(void)
5932 {
5933 const mbfl_encoding **entry = 0;
5934 size_t nentries;
5935
5936 if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
5937 nentries = MBSTRG(detect_order_list_size);
5938 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5939 memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
5940 } else {
5941 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
5942 size_t i;
5943 nentries = MBSTRG(default_detect_order_list_size);
5944 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5945 for (i = 0; i < nentries; i++) {
5946 entry[i] = mbfl_no2encoding(src[i]);
5947 }
5948 }
5949 MBSTRG(current_detect_order_list) = entry;
5950 MBSTRG(current_detect_order_list_size) = nentries;
5951 }
5952 /* }}} */
5953
5954 /* {{{ static int php_mb_encoding_translation() */
php_mb_encoding_translation(void)5955 static int php_mb_encoding_translation(void)
5956 {
5957 return MBSTRG(encoding_translation);
5958 }
5959 /* }}} */
5960
php_mb_mbchar_bytes(const char * s,const mbfl_encoding * enc)5961 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
5962 {
5963 if (enc) {
5964 if (enc->mblen_table) {
5965 if (s) {
5966 return enc->mblen_table[*(unsigned char *)s];
5967 }
5968 } else if (enc->flag & MBFL_ENCTYPE_WCS2) {
5969 return 2;
5970 } else if (enc->flag & MBFL_ENCTYPE_WCS4) {
5971 return 4;
5972 }
5973 }
5974 return 1;
5975 }
5976
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)5977 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
5978 {
5979 const char *p = s;
5980 char *last=NULL;
5981
5982 if (nbytes == (size_t)-1) {
5983 size_t nb = 0;
5984
5985 while (*p != '\0') {
5986 if (nb == 0) {
5987 if ((unsigned char)*p == (unsigned char)c) {
5988 last = (char *)p;
5989 }
5990 nb = php_mb_mbchar_bytes(p, enc);
5991 if (nb == 0) {
5992 return NULL; /* something is going wrong! */
5993 }
5994 }
5995 --nb;
5996 ++p;
5997 }
5998 } else {
5999 size_t bcnt = nbytes;
6000 size_t nbytes_char;
6001 while (bcnt > 0) {
6002 if ((unsigned char)*p == (unsigned char)c) {
6003 last = (char *)p;
6004 }
6005 nbytes_char = php_mb_mbchar_bytes(p, enc);
6006 if (bcnt < nbytes_char) {
6007 return NULL;
6008 }
6009 p += nbytes_char;
6010 bcnt -= nbytes_char;
6011 }
6012 }
6013 return last;
6014 }
6015
php_mb_stripos(bool mode,zend_string * haystack,zend_string * needle,zend_long offset,const mbfl_encoding * enc)6016 MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc)
6017 {
6018 /* We're using simple case-folding here, because we'd have to deal with remapping of
6019 * offsets otherwise. */
6020 zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
6021 zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
6022
6023 size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
6024
6025 zend_string_free(haystack_conv);
6026 zend_string_free(needle_conv);
6027
6028 return n;
6029 }
6030
php_mb_gpc_get_detect_order(const zend_encoding *** list,size_t * list_size)6031 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
6032 {
6033 *list = (const zend_encoding **)MBSTRG(http_input_list);
6034 *list_size = MBSTRG(http_input_list_size);
6035 }
6036 /* }}} */
6037
php_mb_gpc_set_input_encoding(const zend_encoding * encoding)6038 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
6039 {
6040 MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
6041 }
6042 /* }}} */
6043
/* Base64 alphabet, indexed by 6-bit value (0..63).
 * The string literal's implicit terminating NUL supplies the final, 65th
 * element (0x00). */
static const unsigned char base64_table[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+/";
6056
/* Compute how many bytes the current contents of `tmpbuf` will occupy once
 * transfer-encoded.
 *
 * Base64: 4 output bytes for every (started) group of 3 input bytes.
 * Otherwise: 3 bytes for each input byte which is above 0x7F, is '=', or is
 * marked in mime_char_needs_qencode; 1 byte for every other input byte. */
static size_t transfer_encoded_size(mb_convert_buf *tmpbuf, bool base64)
{
	if (base64) {
		return ((mb_convert_buf_len(tmpbuf) + 2) / 3) * 4;
	}

	size_t total = 0;
	for (unsigned char *cur = (unsigned char*)ZSTR_VAL(tmpbuf->str); cur < tmpbuf->out; cur++) {
		unsigned char byte = *cur;
		/* The table is only consulted for bytes <= 0x7F (short-circuit) */
		bool needs_escape = byte > 0x7F || byte == '=' || mime_char_needs_qencode[byte];
		total += needs_escape ? 3 : 1;
	}
	return total;
}
6071
/* Transfer-encode all bytes currently held in `tmpbuf` and append the result
 * to `outbuf`; `tmpbuf` is reset afterwards.
 *
 * Base64 mode emits 4 characters per 3 input bytes, padding a trailing group
 * of 1 or 2 bytes with '='. Otherwise each byte which is above 0x7F, is '=',
 * or is marked in mime_char_needs_qencode is written as '=' followed by two
 * uppercase hex digits, and every other byte is copied through unchanged. */
static void transfer_encode_mime_bytes(mb_convert_buf *tmpbuf, mb_convert_buf *outbuf, bool base64)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(outbuf, out, limit);
	unsigned char *src = (unsigned char*)ZSTR_VAL(tmpbuf->str), *src_end = tmpbuf->out;

	if (!base64) {
		const char *hex_digits = "0123456789ABCDEF";

		/* Worst case: every input byte becomes a 3-byte escape */
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, (src_end - src) * 3);
		for (; src < src_end; src++) {
			unsigned char byte = *src;
			if (byte > 0x7F || byte == '=' || mime_char_needs_qencode[byte]) {
				out = mb_convert_buf_add3(out, '=', hex_digits[(byte >> 4) & 0xF], hex_digits[byte & 0xF]);
			} else {
				out = mb_convert_buf_add(out, byte);
			}
		}
	} else {
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, ((src_end - src) + 2) / 3 * 4);

		/* Full 3-byte groups -> 4 output characters each */
		for (; (src_end - src) >= 3; src += 3) {
			uint32_t bits = (src[0] << 16) | (src[1] << 8) | src[2];
			out = mb_convert_buf_add4(out,
				base64_table[(bits >> 18) & 0x3F],
				base64_table[(bits >> 12) & 0x3F],
				base64_table[(bits >> 6) & 0x3F],
				base64_table[bits & 0x3F]);
		}

		/* Trailing group of 1 or 2 bytes, padded with '=' */
		size_t leftover = src_end - src;
		if (leftover == 1) {
			uint32_t bits = src[0];
			out = mb_convert_buf_add4(out, base64_table[(bits >> 2) & 0x3F], base64_table[(bits & 0x3) << 4], '=', '=');
		} else if (leftover != 0) {
			uint32_t bits = (src[0] << 8) | src[1];
			out = mb_convert_buf_add4(out, base64_table[(bits >> 10) & 0x3F], base64_table[(bits >> 4) & 0x3F], base64_table[(bits & 0xF) << 2], '=');
		}
	}

	mb_convert_buf_reset(tmpbuf, 0);
	MB_CONVERT_BUF_STORE(outbuf, out, limit);
}
6117
6118 #define MBSTRING_HEADER_ENC_WCHAR_BUFSIZE 90
6119
mb_mime_header_encode(zend_string * input,const mbfl_encoding * incode,const mbfl_encoding * outcode,bool base64,char * linefeed,size_t linefeed_len,zend_long indent)6120 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent)
6121 {
6122 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
6123 size_t in_len = ZSTR_LEN(input);
6124
6125 ZEND_ASSERT(outcode->mime_name != NULL);
6126 ZEND_ASSERT(outcode->mime_name[0] != '\0');
6127
6128 if (!in_len) {
6129 return zend_empty_string;
6130 }
6131
6132 if (indent < 0 || indent >= 74) {
6133 indent = 0;
6134 }
6135
6136 if (linefeed_len > 8) {
6137 linefeed_len = 8;
6138 }
6139 /* Maintain legacy behavior as regards embedded NUL (zero) bytes in linefeed string */
6140 for (size_t i = 0; i < linefeed_len; i++) {
6141 if (linefeed[i] == '\0') {
6142 linefeed_len = i;
6143 break;
6144 }
6145 }
6146
6147 unsigned int state = 0;
6148 /* wchar_buf should be big enough that when it is full, we definitely have enough
6149 * wchars to fill an entire line of output */
6150 uint32_t wchar_buf[MBSTRING_HEADER_ENC_WCHAR_BUFSIZE];
6151 uint32_t *p, *e;
6152 /* What part of wchar_buf is filled with still-unprocessed data which should not
6153 * be overwritten? */
6154 unsigned int offset = 0;
6155 size_t line_start = 0;
6156
6157 /* If the entire input string is ASCII with no spaces (except possibly leading
6158 * spaces), just pass it through unchanged */
6159 bool checking_leading_spaces = true;
6160 while (in_len) {
6161 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE, &state);
6162 p = wchar_buf;
6163 e = wchar_buf + out_len;
6164
6165 while (p < e) {
6166 uint32_t w = *p++;
6167 if (checking_leading_spaces) {
6168 if (w == ' ') {
6169 continue;
6170 } else {
6171 checking_leading_spaces = false;
6172 }
6173 }
6174 if (w < 0x21 || w > 0x7E || w == '=' || w == '?' || w == '_') {
6175 /* We cannot simply pass input string through unchanged; start again */
6176 in = (unsigned char*)ZSTR_VAL(input);
6177 in_len = ZSTR_LEN(input);
6178 goto no_passthrough;
6179 }
6180 }
6181 }
6182
6183 return zend_string_copy(input); /* This just increments refcount */
6184
6185 no_passthrough: ;
6186
6187 mb_convert_buf buf;
6188 mb_convert_buf_init(&buf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6189
6190 /* Encode some prefix of the input string as plain ASCII if possible
6191 * If we find it necessary to switch to Base64/QPrint encoding, we will
6192 * do so all the way to the end of the string */
6193 while (in_len) {
6194 /* Decode part of the input string, refill wchar_buf */
6195 ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
6196 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
6197 ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
6198 p = wchar_buf;
6199 e = wchar_buf + offset + out_len;
6200 /* ASCII output is broken into space-delimited 'words'
6201 * If we find a non-ASCII character in the middle of a word, we will
6202 * transfer-encode the entire word */
6203 uint32_t *word_start = p;
6204
6205 /* Don't consider adding line feed for spaces at the beginning of a word */
6206 while (p < e && *p == ' ' && (p - word_start) <= 74) {
6207 p++;
6208 }
6209
6210 while (p < e) {
6211 uint32_t w = *p++;
6212
6213 if (w < 0x20 || w > 0x7E || w == '?' || w == '=' || w == '_' || (w == ' ' && (p - word_start) > 74)) {
6214 /* Non-ASCII character (or line too long); switch to Base64/QPrint encoding
6215 * If we are already too far along on a line to include Base64/QPrint encoded data
6216 * on the same line (without overrunning max line length), then add a line feed
6217 * right now */
6218 feed_and_mime_encode:
6219 if (mb_convert_buf_len(&buf) - line_start + indent + strlen(outcode->mime_name) > 55) {
6220 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6221 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6222 buf.out = mb_convert_buf_add(buf.out, ' ');
6223 indent = 0;
6224 line_start = mb_convert_buf_len(&buf);
6225 } else if (mb_convert_buf_len(&buf) > 0) {
6226 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 1);
6227 buf.out = mb_convert_buf_add(buf.out, ' ');
6228 }
6229 p = word_start; /* Back up to where MIME encoding of input chars should start */
6230 goto mime_encoding_needed;
6231 } else if (w == ' ') {
6232 /* When we see a space, check whether we should insert a line break */
6233 if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 75) {
6234 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6235 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6236 buf.out = mb_convert_buf_add(buf.out, ' ');
6237 indent = 0;
6238 line_start = mb_convert_buf_len(&buf);
6239 } else if (mb_convert_buf_len(&buf) > 0) {
6240 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6241 buf.out = mb_convert_buf_add(buf.out, ' ');
6242 }
6243 /* Output one (space-delimited) word as plain ASCII */
6244 while (word_start < p-1) {
6245 buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6246 }
6247 word_start++;
6248 while (p < e && *p == ' ') {
6249 p++;
6250 }
6251 }
6252 }
6253
6254 if (in_len) {
6255 /* Copy chars which are part of an incomplete 'word' to the beginning
6256 * of wchar_buf and reprocess them on the next iteration.
6257 * But first make sure that the incomplete 'word' isn't so big that
6258 * there will be no space to add any more decoded wchars in the buffer
6259 * (which could lead to an infinite loop) */
6260 if ((word_start - wchar_buf) < MBSTRING_MIN_WCHAR_BUFSIZE) {
6261 goto feed_and_mime_encode;
6262 }
6263 offset = e - word_start;
6264 if (offset) {
6265 memmove(wchar_buf, word_start, offset * sizeof(uint32_t));
6266 }
6267 } else {
6268 /* We have reached the end of the input string while still in 'ASCII mode';
6269 * process any trailing ASCII chars which were not followed by a space */
6270 if (word_start < e && mb_convert_buf_len(&buf) > 0) {
6271 /* The whole input string was not just one big ASCII 'word' with no spaces
6272 * consider adding a line feed if necessary to prevent output lines from
6273 * being too long */
6274 if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 74) {
6275 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6276 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6277 buf.out = mb_convert_buf_add(buf.out, ' ');
6278 } else {
6279 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6280 buf.out = mb_convert_buf_add(buf.out, ' ');
6281 }
6282 }
6283 while (word_start < e) {
6284 buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6285 }
6286 }
6287 }
6288
6289 /* Ensure output string is marked as valid UTF-8 (ASCII strings are always 'valid UTF-8') */
6290 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6291
6292 mime_encoding_needed: ;
6293
6294 /* We will generate the output line by line, first converting wchars to bytes
6295 * in the requested output encoding, then transfer-encoding those bytes as
6296 * Base64 or QPrint
6297 * 'tmpbuf' will receive the bytes which need to be transfer-encoded before
6298 * sending them to 'buf' */
6299 mb_convert_buf tmpbuf;
6300 mb_convert_buf_init(&tmpbuf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6301
6302 /* Do we need to refill wchar_buf to make sure we don't run out of wchars
6303 * in the middle of a line? */
6304 offset = e - p;
6305 if (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset < MBSTRING_MIN_WCHAR_BUFSIZE) {
6306 goto start_new_line;
6307 }
6308 memmove(wchar_buf, p, offset * sizeof(uint32_t));
6309
6310 while(true) {
6311 refill_wchar_buf: ;
6312 ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
6313 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
6314 ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
6315 p = wchar_buf;
6316 e = wchar_buf + offset + out_len;
6317
6318 start_new_line: ;
6319 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, strlen(outcode->mime_name) + 5);
6320 buf.out = mb_convert_buf_add2(buf.out, '=', '?');
6321 buf.out = mb_convert_buf_appends(buf.out, outcode->mime_name);
6322 buf.out = mb_convert_buf_add3(buf.out, '?', base64 ? 'B' : 'Q', '?');
6323
6324 /* How many wchars should we try converting to Base64/QPrint-encoded bytes?
6325 * We do something like a 'binary search' to find the greatest number which
6326 * can be included on this line without exceeding max line length */
6327 unsigned int n = 12;
6328 size_t space_available = 73 - indent - (mb_convert_buf_len(&buf) - line_start);
6329
6330 while (true) {
6331 ZEND_ASSERT(p < e);
6332
6333 /* Remember where we were in process of generating output, so we can back
6334 * up if necessary */
6335 size_t tmppos = mb_convert_buf_len(&tmpbuf);
6336 unsigned int tmpstate = tmpbuf.state;
6337
6338 /* Try encoding 'n' wchars in output text encoding and sending output
6339 * bytes to 'tmpbuf'. Hopefully this is not too many to fit on the
6340 * current line. */
6341 n = MIN(n, e - p);
6342 outcode->from_wchar(p, n, &tmpbuf, false);
6343
6344 /* For some output text encodings, there may be a few ending bytes
6345 * which need to be emitted to output before we break a line.
6346 * Again, remember where we were so we can back up */
6347 size_t tmppos2 = mb_convert_buf_len(&tmpbuf);
6348 unsigned int tmpstate2 = tmpbuf.state;
6349 outcode->from_wchar(NULL, 0, &tmpbuf, true);
6350
6351 if (transfer_encoded_size(&tmpbuf, base64) <= space_available || (n == 1 && tmppos == 0)) {
6352 /* If we convert 'n' more wchars on the current line, it will not
6353 * overflow the maximum line length */
6354 p += n;
6355
6356 if (p == e) {
6357 /* We are done; we shouldn't reach here if there is more remaining
6358 * of the input string which needs to be processed */
6359 ZEND_ASSERT(!in_len);
6360 transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6361 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 2);
6362 buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6363 mb_convert_buf_free(&tmpbuf);
6364 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6365 } else {
6366 /* It's possible that more chars might fit on the current line,
6367 * so back up to where we were before emitting any ending bytes */
6368 mb_convert_buf_reset(&tmpbuf, tmppos2);
6369 tmpbuf.state = tmpstate2;
6370 }
6371 } else {
6372 /* Converting 'n' more wchars on this line would be too much.
6373 * Back up to where we were before we tried that. */
6374 mb_convert_buf_reset(&tmpbuf, tmppos);
6375 tmpbuf.state = tmpstate;
6376
6377 if (n == 1) {
6378 /* We have found the exact number of chars which will fit on the
6379 * current line. Finish up and move to a new line. */
6380 outcode->from_wchar(NULL, 0, &tmpbuf, true);
6381 transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6382 tmpbuf.state = 0;
6383
6384 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 3 + linefeed_len);
6385 buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6386
6387 indent = 0; /* Indent argument must only affect the first line */
6388
6389 if (in_len || p < e) {
6390 /* We still have more input to process */
6391 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6392 buf.out = mb_convert_buf_add(buf.out, ' ');
6393 line_start = mb_convert_buf_len(&buf);
6394 offset = e - p;
6395 if (in_len && (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset >= MBSTRING_MIN_WCHAR_BUFSIZE)) {
6396 /* Copy any remaining wchars to beginning of buffer and refill
6397 * the rest of the buffer */
6398 memmove(wchar_buf, p, offset * sizeof(uint32_t));
6399 goto refill_wchar_buf;
6400 }
6401 goto start_new_line;
6402 } else {
6403 /* We are done! */
6404 mb_convert_buf_free(&tmpbuf);
6405 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6406 }
6407 } else {
6408 /* Try a smaller number of wchars */
6409 n = MAX(n >> 1, 1);
6410 }
6411 }
6412 }
6413 }
6414 }
6415
PHP_FUNCTION(mb_encode_mimeheader)6416 PHP_FUNCTION(mb_encode_mimeheader)
6417 {
6418 const mbfl_encoding *charset = &mbfl_encoding_pass;
6419 zend_string *str, *charset_name = NULL, *transenc_name = NULL;
6420 char *linefeed = "\r\n";
6421 size_t linefeed_len = 2;
6422 zend_long indent = 0;
6423 bool base64 = true;
6424
6425 ZEND_PARSE_PARAMETERS_START(1, 5)
6426 Z_PARAM_STR(str)
6427 Z_PARAM_OPTIONAL
6428 Z_PARAM_STR(charset_name)
6429 Z_PARAM_STR(transenc_name)
6430 Z_PARAM_STRING(linefeed, linefeed_len)
6431 Z_PARAM_LONG(indent)
6432 ZEND_PARSE_PARAMETERS_END();
6433
6434 if (charset_name != NULL) {
6435 charset = php_mb_get_encoding(charset_name, 2);
6436 if (!charset) {
6437 RETURN_THROWS();
6438 } else if (charset->mime_name == NULL || charset->mime_name[0] == '\0' || charset == &mbfl_encoding_qprint) {
6439 zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
6440 RETURN_THROWS();
6441 }
6442 } else {
6443 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
6444 if (lang != NULL) {
6445 charset = mbfl_no2encoding(lang->mail_charset);
6446 const mbfl_encoding *transenc = mbfl_no2encoding(lang->mail_header_encoding);
6447 char t = transenc->name[0];
6448 if (t == 'Q' || t == 'q') {
6449 base64 = false;
6450 }
6451 }
6452 }
6453
6454 if (transenc_name != NULL && ZSTR_LEN(transenc_name) > 0) {
6455 char t = ZSTR_VAL(transenc_name)[0];
6456 if (t == 'Q' || t == 'q') {
6457 base64 = false;
6458 }
6459 }
6460
6461 RETURN_STR(mb_mime_header_encode(str, MBSTRG(current_internal_encoding), charset, base64, linefeed, linefeed_len, indent));
6462 }
6463
/* Translate one character of the standard Base64 alphabet into its 6-bit
 * value: 'A'-'Z' => 0-25, 'a'-'z' => 26-51, '0'-'9' => 52-61, '+' => 62,
 * '/' => 63. Any other byte (including the '=' padding character) yields -1. */
static int8_t decode_base64(unsigned char c)
{
	switch (c) {
	case '+':
		return 62;
	case '/':
		return 63;
	default:
		break;
	}

	if (c >= '0' && c <= '9') {
		return (int8_t)(52 + (c - '0'));
	}
	if (c >= 'a' && c <= 'z') {
		return (int8_t)(26 + (c - 'a'));
	}
	if (c >= 'A' && c <= 'Z') {
		return (int8_t)(c - 'A');
	}

	/* Not part of the Base64 alphabet */
	return -1;
}
6479
/* Lookup table indexed by byte value, giving the numeric value of a
 * hexadecimal digit: '0'-'9' => 0-9, 'A'-'F' and 'a'-'f' => 10-15.
 * Every other byte maps to -1. Used when decoding "=XX" escapes.
 * The table is never written to, so it is declared const. */
static const int8_t qprint_map[256] = {
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1, /* '0'-'9' */
	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 'A'-'F' */
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 'a'-'f' */
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
6498
6499 /* Decode MIME encoded word as defined in RFC 2047 */
mime_header_decode_encoded_word(unsigned char * p,unsigned char * e,const mbfl_encoding * outcode,mb_convert_buf * outbuf,unsigned int * state)6500 static unsigned char* mime_header_decode_encoded_word(unsigned char *p, unsigned char *e, const mbfl_encoding *outcode, mb_convert_buf *outbuf, unsigned int *state)
6501 {
6502 if ((e - p) < 6) {
6503 return NULL;
6504 }
6505
6506 ZEND_ASSERT(p[0] == '=');
6507 ZEND_ASSERT(p[1] == '?');
6508 p += 2;
6509
6510 unsigned char *charset = p;
6511 unsigned char *charset_end = memchr(charset, '?', e - charset);
6512 if (charset_end == NULL) {
6513 return NULL;
6514 }
6515
6516 unsigned char *encoding = charset_end + 1;
6517 p = encoding + 1;
6518 if (p >= e || *p++ != '?') {
6519 return NULL;
6520 }
6521
6522 char *charset_name = estrndup((const char*)charset, charset_end - charset);
6523 const mbfl_encoding *incode = mbfl_name2encoding(charset_name);
6524 efree(charset_name);
6525 if (incode == NULL) {
6526 return NULL;
6527 }
6528
6529 unsigned char *end_marker = (unsigned char*)zend_memnstr((const char*)p, "?=", 2, (const char*)e);
6530 if (end_marker) {
6531 e = end_marker;
6532 } else if (p < e && *(e-1) == '?') {
6533 /* If encoded word is not properly terminated, but last byte is '?',
6534 * take that as a terminator (legacy behavior) */
6535 e--;
6536 }
6537
6538 unsigned char *buf = emalloc(e - p), *bufp = buf;
6539 if (*encoding == 'Q' || *encoding == 'q') {
6540 /* Fill `buf` with bytes from decoding QPrint */
6541 while (p < e) {
6542 unsigned char c = *p++;
6543 if (c == '_') {
6544 *bufp++ = ' ';
6545 continue;
6546 } else if (c == '=' && (e - p) >= 2) {
6547 unsigned char c2 = *p++;
6548 unsigned char c3 = *p++;
6549 if (qprint_map[c2] >= 0 && qprint_map[c3] >= 0) {
6550 *bufp++ = (qprint_map[c2] << 4) | (qprint_map[c3] & 0xF);
6551 continue;
6552 } else if (c2 == '\r') {
6553 if (c3 != '\n') {
6554 p--;
6555 }
6556 continue;
6557 } else if (c2 == '\n') {
6558 p--;
6559 continue;
6560 }
6561 }
6562 *bufp++ = c;
6563 }
6564 } else if (*encoding == 'B' || *encoding == 'b') {
6565 /* Fill `buf` with bytes from decoding Base64 */
6566 unsigned int bits = 0, cache = 0;
6567 while (p < e) {
6568 unsigned char c = *p++;
6569 if (c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '=') {
6570 continue;
6571 }
6572 int8_t decoded = decode_base64(c);
6573 if (decoded == -1) {
6574 *bufp++ = '?';
6575 continue;
6576 }
6577 bits += 6;
6578 cache = (cache << 6) | (decoded & 0x3F);
6579 if (bits == 24) {
6580 *bufp++ = (cache >> 16) & 0xFF;
6581 *bufp++ = (cache >> 8) & 0xFF;
6582 *bufp++ = cache & 0xFF;
6583 bits = cache = 0;
6584 }
6585 }
6586 if (bits == 18) {
6587 *bufp++ = (cache >> 10) & 0xFF;
6588 *bufp++ = (cache >> 2) & 0xFF;
6589 } else if (bits == 12) {
6590 *bufp++ = (cache >> 4) & 0xFF;
6591 }
6592 } else {
6593 efree(buf);
6594 return NULL;
6595 }
6596
6597 size_t in_len = bufp - buf;
6598 uint32_t wchar_buf[128];
6599
6600 bufp = buf;
6601 while (in_len) {
6602 size_t out_len = incode->to_wchar(&bufp, &in_len, wchar_buf, 128, state);
6603 ZEND_ASSERT(out_len <= 128);
6604 outcode->from_wchar(wchar_buf, out_len, outbuf, false);
6605 }
6606
6607 efree(buf);
6608 return e + 2;
6609 }
6610
/* Decode a MIME header value: every well-formed encoded word is decoded via
 * mime_header_decode_encoded_word(), everything else is passed through as
 * ASCII text, and each run of whitespace which starts with CR or LF is
 * collapsed. The result is returned as a new string in `outcode`. */
static zend_string* mb_mime_header_decode(zend_string *input, const mbfl_encoding *outcode)
{
	unsigned char *cur = (unsigned char*)ZSTR_VAL(input);
	unsigned char *limit = cur + ZSTR_LEN(input);
	unsigned int conv_state = 0;
	/* Set when whitespace following an encoded word has been skipped but the
	 * replacement space has not been written yet */
	bool deferred_space = false;

	mb_convert_buf out;
	mb_convert_buf_init(&out, ZSTR_LEN(input), '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	while (cur < limit) {
		unsigned char lead = *cur;

		/* Candidate encoded word: starts with "=?" and is at least 6 bytes long */
		if ((limit - cur) >= 6 && lead == '=' && cur[1] == '?') {
			unsigned char *after_word = NULL;
			unsigned char *charset_sep = memchr(cur + 2, '?', limit - cur - 2);
			if (charset_sep != NULL && (limit - charset_sep) >= 3) {
				after_word = mime_header_decode_encoded_word(cur, limit, outcode, &out, &conv_state);
			}

			if (after_word != NULL) {
				/* The encoded word was decoded; resume right after it */
				cur = after_word;
				if (cur < limit && (*cur == '\n' || *cur == '\r')) {
					/* Skip the whole run of whitespace beginning with this line break */
					cur++;
					while (cur < limit && (*cur == '\n' || *cur == '\r' || *cur == '\t' || *cur == ' ')) {
						cur++;
					}
					/* The space is written only once something other than a
					 * valid encoded word is about to be output.
					 * NOTE(review): the flag is not cleared when another encoded
					 * word decodes successfully, so the space is then emitted
					 * before the next plain text instead; confirm this is intended. */
					deferred_space = true;
				}
				continue;
			}
			/* Not a decodable encoded word; handle it as plain text below */
		}

		if (deferred_space) {
			uint32_t sp = ' ';
			outcode->from_wchar(&sp, 1, &out, false);
			deferred_space = false;
		}

		if (lead != '\n' && lead != '\r') {
			/* Pass through a run of plain text; the run always contains the
			 * current byte and stops before the next '=', CR or LF */
			unsigned char *run_end;
			for (run_end = cur + 1; run_end < limit; run_end++) {
				if (*run_end == '=' || *run_end == '\n' || *run_end == '\r') {
					break;
				}
			}

			uint32_t wchars[128];
			size_t remaining = run_end - cur;
			while (remaining) {
				size_t n_wchars = mbfl_encoding_ascii.to_wchar(&cur, &remaining, wchars, 128, &conv_state);
				ZEND_ASSERT(n_wchars <= 128);
				outcode->from_wchar(wchars, n_wchars, &out, false);
			}
		}

		if (cur < limit && (*cur == '\n' || *cur == '\r')) {
			/* Replace a run of whitespace beginning with a line break by one space */
			cur++;
			while (cur < limit && (*cur == '\n' || *cur == '\r' || *cur == '\t' || *cur == ' ')) {
				cur++;
			}
			if (cur < limit) {
				/* Legacy behavior of mb_decode_mimeheader: no space is written
				 * when the whitespace run reaches the very end of the input */
				uint32_t sp = ' ';
				outcode->from_wchar(&sp, 1, &out, false);
			}
		}
	}

	/* Let the output encoding emit any ending bytes */
	outcode->from_wchar(NULL, 0, &out, true);

	return mb_convert_buf_result(&out, outcode);
}
6684
PHP_FUNCTION(mb_decode_mimeheader)6685 PHP_FUNCTION(mb_decode_mimeheader)
6686 {
6687 zend_string *str;
6688
6689 ZEND_PARSE_PARAMETERS_START(1, 1)
6690 Z_PARAM_STR(str)
6691 ZEND_PARSE_PARAMETERS_END();
6692
6693 RETURN_STR(mb_mime_header_decode(str, MBSTRG(current_internal_encoding)));
6694 }
6695