1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
14 | Rui Hirokawa <hirokawa@php.net> |
15 | Hironori Sato <satoh@jpnnet.com> |
16 | Shigeru Kanemoto <sgk@happysize.co.jp> |
17 +----------------------------------------------------------------------+
18 */
19
20 /* {{{ includes */
21 #include <limits.h>
22
23 #include "libmbfl/config.h"
24 #include "php.h"
25 #include "php_ini.h"
26 #include "php_variables.h"
27 #include "mbstring.h"
28 #include "ext/standard/php_string.h"
29 #include "ext/standard/php_mail.h"
30 #include "ext/standard/exec.h"
31 #include "ext/standard/url.h"
32 #include "main/php_output.h"
33 #include "ext/standard/info.h"
34 #include "ext/pcre/php_pcre.h"
35
36 #include "libmbfl/mbfl/mbfilter_8bit.h"
37 #include "libmbfl/mbfl/mbfilter_pass.h"
38 #include "libmbfl/mbfl/mbfilter_wchar.h"
39 #include "libmbfl/mbfl/eaw_table.h"
40 #include "libmbfl/filters/mbfilter_base64.h"
41 #include "libmbfl/filters/mbfilter_cjk.h"
42 #include "libmbfl/filters/mbfilter_qprint.h"
43 #include "libmbfl/filters/mbfilter_htmlent.h"
44 #include "libmbfl/filters/mbfilter_uuencode.h"
45 #include "libmbfl/filters/mbfilter_ucs4.h"
46 #include "libmbfl/filters/mbfilter_utf8.h"
47 #include "libmbfl/filters/mbfilter_utf16.h"
48 #include "libmbfl/filters/mbfilter_singlebyte.h"
49 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
50 #include "libmbfl/filters/unicode_prop.h"
51
52 #include "php_variables.h"
53 #include "php_globals.h"
54 #include "rfc1867.h"
55 #include "php_content_types.h"
56 #include "SAPI.h"
57 #include "php_unicode.h"
58 #include "TSRM.h"
59
60 #include "mb_gpc.h"
61
62 #ifdef HAVE_MBREGEX
63 # include "php_mbregex.h"
64 #endif
65
66 #include "zend_smart_str.h"
67 #include "zend_multibyte.h"
68 #include "mbstring_arginfo.h"
69
70 #include "rare_cp_bitvec.h"
71
72 /* }}} */
73
74 /* {{{ prototypes */
75 ZEND_DECLARE_MODULE_GLOBALS(mbstring)
76
77 static PHP_GINIT_FUNCTION(mbstring);
78 static PHP_GSHUTDOWN_FUNCTION(mbstring);
79
80 static void php_mb_populate_current_detect_order_list(void);
81
82 static int php_mb_encoding_translation(void);
83
84 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);
85
86 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
87
88 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
89
90 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
91
92 static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);
93
94 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);
95
96 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);
97
98 /* See mbfilter_cp5022x.c */
99 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
100 /* }}} */
101
102 /* {{{ php_mb_default_identify_list */
103 typedef struct _php_mb_nls_ident_list {
104 enum mbfl_no_language lang;
105 const enum mbfl_no_encoding *list;
106 size_t list_size;
107 } php_mb_nls_ident_list;
108
109 static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
110 mbfl_no_encoding_ascii,
111 mbfl_no_encoding_jis,
112 mbfl_no_encoding_utf8,
113 mbfl_no_encoding_euc_jp,
114 mbfl_no_encoding_sjis
115 };
116
117 static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
118 mbfl_no_encoding_ascii,
119 mbfl_no_encoding_utf8,
120 mbfl_no_encoding_euc_cn,
121 mbfl_no_encoding_cp936
122 };
123
124 static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
125 mbfl_no_encoding_ascii,
126 mbfl_no_encoding_utf8,
127 mbfl_no_encoding_euc_tw,
128 mbfl_no_encoding_big5
129 };
130
131 static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
132 mbfl_no_encoding_ascii,
133 mbfl_no_encoding_utf8,
134 mbfl_no_encoding_euc_kr,
135 mbfl_no_encoding_uhc
136 };
137
138 static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
139 mbfl_no_encoding_ascii,
140 mbfl_no_encoding_utf8,
141 mbfl_no_encoding_koi8r,
142 mbfl_no_encoding_cp1251,
143 mbfl_no_encoding_cp866
144 };
145
146 static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
147 mbfl_no_encoding_ascii,
148 mbfl_no_encoding_utf8,
149 mbfl_no_encoding_armscii8
150 };
151
152 static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
153 mbfl_no_encoding_ascii,
154 mbfl_no_encoding_utf8,
155 mbfl_no_encoding_cp1254,
156 mbfl_no_encoding_8859_9
157 };
158
159 static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
160 mbfl_no_encoding_ascii,
161 mbfl_no_encoding_utf8,
162 mbfl_no_encoding_koi8u
163 };
164
165 static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
166 mbfl_no_encoding_ascii,
167 mbfl_no_encoding_utf8
168 };
169
170
171 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
172 { mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
173 { mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
174 { mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
175 { mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
176 { mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
177 { mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
178 { mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
179 { mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
180 { mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
181 };
182
183 /* }}} */
184
185 /* {{{ mbstring_deps[] */
186 static const zend_module_dep mbstring_deps[] = {
187 ZEND_MOD_REQUIRED("pcre")
188 ZEND_MOD_END
189 };
190 /* }}} */
191
192 /* {{{ zend_module_entry mbstring_module_entry */
193 zend_module_entry mbstring_module_entry = {
194 STANDARD_MODULE_HEADER_EX,
195 NULL,
196 mbstring_deps,
197 "mbstring",
198 ext_functions,
199 PHP_MINIT(mbstring),
200 PHP_MSHUTDOWN(mbstring),
201 PHP_RINIT(mbstring),
202 PHP_RSHUTDOWN(mbstring),
203 PHP_MINFO(mbstring),
204 PHP_MBSTRING_VERSION,
205 PHP_MODULE_GLOBALS(mbstring),
206 PHP_GINIT(mbstring),
207 PHP_GSHUTDOWN(mbstring),
208 NULL,
209 STANDARD_MODULE_PROPERTIES_EX
210 };
211 /* }}} */
212
213 /* {{{ static sapi_post_entry php_post_entries[] */
214 static const sapi_post_entry php_post_entries[] = {
215 { DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_std_post_handler },
216 { MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
217 { NULL, 0, NULL, NULL }
218 };
219 /* }}} */
220
#ifdef COMPILE_DL_MBSTRING
# ifdef ZTS
ZEND_TSRMLS_CACHE_DEFINE()
# endif /* ZTS */
ZEND_GET_MODULE(mbstring)
#endif /* COMPILE_DL_MBSTRING */
227
228 /* {{{ static sapi_post_entry mbstr_post_entries[] */
229 static const sapi_post_entry mbstr_post_entries[] = {
230 { DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
231 { MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
232 { NULL, 0, NULL, NULL }
233 };
234 /* }}} */
235
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)236 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
237 if (encoding_name) {
238 const mbfl_encoding *encoding;
239 zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
240 if (last_encoding_name && (last_encoding_name == encoding_name
241 || zend_string_equals_ci(encoding_name, last_encoding_name))) {
242 return MBSTRG(last_used_encoding);
243 }
244
245 encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
246 if (!encoding) {
247 zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
248 return NULL;
249 } else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
250 if (encoding == &mbfl_encoding_base64) {
251 php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
252 } else if (encoding == &mbfl_encoding_qprint) {
253 php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
254 } else if (encoding == &mbfl_encoding_html_ent) {
255 php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
256 } else if (encoding == &mbfl_encoding_uuencode) {
257 php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
258 }
259 }
260
261 if (last_encoding_name) {
262 zend_string_release(last_encoding_name);
263 }
264 MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
265 MBSTRG(last_used_encoding) = encoding;
266 return encoding;
267 } else {
268 return MBSTRG(current_internal_encoding);
269 }
270 }
271
php_mb_get_encoding_or_pass(const char * encoding_name)272 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name) {
273 if (strcmp(encoding_name, "pass") == 0) {
274 return &mbfl_encoding_pass;
275 }
276
277 return mbfl_name2encoding(encoding_name);
278 }
279
/* Count the ',' bytes in the half-open range [p, end). */
static size_t count_commas(const char *p, const char *end) {
	size_t total = 0;

	for (;;) {
		const char *hit = memchr(p, ',', end - p);
		if (hit == NULL) {
			break;
		}
		total++;
		p = hit + 1; /* resume just past the comma that was found */
	}
	return total;
}
288
289 /* {{{ static zend_result php_mb_parse_encoding_list()
290 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
291 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
292 */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num)293 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
294 const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num)
295 {
296 if (value == NULL || value_length == 0) {
297 *return_list = NULL;
298 *return_size = 0;
299 return SUCCESS;
300 } else {
301 bool included_auto;
302 size_t n, size;
303 const char *p1, *endp, *tmpstr;
304 const mbfl_encoding **entry, **list;
305
306 if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
307 tmpstr = value + 1;
308 value_length -= 2;
309 } else {
310 tmpstr = value;
311 }
312
313 endp = tmpstr + value_length;
314 size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
315 list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
316 entry = list;
317 n = 0;
318 included_auto = 0;
319 p1 = tmpstr;
320 while (1) {
321 const char *comma = memchr(p1, ',', endp - p1);
322 const char *p = comma ? comma : endp;
323 /* trim spaces */
324 while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
325 p1++;
326 }
327 p--;
328 while (p > p1 && (*p == ' ' || *p == '\t')) {
329 p--;
330 }
331 size_t p1_length = p - p1 + 1;
332 /* convert to the encoding number and check encoding */
333 if (strncasecmp(p1, "auto", p1_length) == 0) {
334 if (!included_auto) {
335 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
336 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
337 size_t i;
338 included_auto = 1;
339 for (i = 0; i < identify_list_size; i++) {
340 *entry++ = mbfl_no2encoding(*src++);
341 n++;
342 }
343 }
344 } else {
345 const mbfl_encoding *encoding = mbfl_name2encoding_ex(p1, p1_length);
346 if (!encoding) {
347 /* Called from an INI setting modification */
348 if (arg_num == 0) {
349 php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%.*s\"", (int) p1_length, p1);
350 } else {
351 zend_argument_value_error(arg_num, "contains invalid encoding \"%.*s\"", (int) p1_length, p1);
352 }
353 pefree(ZEND_VOIDP(list), persistent);
354 return FAILURE;
355 }
356
357 *entry++ = encoding;
358 n++;
359 }
360 if (n >= size || comma == NULL) {
361 break;
362 }
363 p1 = comma + 1;
364 }
365 *return_list = list;
366 *return_size = n;
367 }
368
369 return SUCCESS;
370 }
371 /* }}} */
372
373 /* {{{
374 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
375 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
376 */
php_mb_parse_encoding_array(HashTable * target_hash,const mbfl_encoding *** return_list,size_t * return_size,uint32_t arg_num)377 static zend_result php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
378 size_t *return_size, uint32_t arg_num)
379 {
380 /* Allocate enough space to include the default detect order if "auto" is used. */
381 size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
382 const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
383 const mbfl_encoding **entry = list;
384 bool included_auto = 0;
385 size_t n = 0;
386 zval *hash_entry;
387 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
388 zend_string *encoding_str = zval_try_get_string(hash_entry);
389 if (UNEXPECTED(!encoding_str)) {
390 efree(ZEND_VOIDP(list));
391 return FAILURE;
392 }
393
394 if (zend_string_equals_literal_ci(encoding_str, "auto")) {
395 if (!included_auto) {
396 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
397 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
398 size_t j;
399
400 included_auto = 1;
401 for (j = 0; j < identify_list_size; j++) {
402 *entry++ = mbfl_no2encoding(*src++);
403 n++;
404 }
405 }
406 } else {
407 const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
408 if (encoding) {
409 *entry++ = encoding;
410 n++;
411 } else {
412 zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
413 zend_string_release(encoding_str);
414 efree(ZEND_VOIDP(list));
415 return FAILURE;
416 }
417 }
418 zend_string_release(encoding_str);
419 } ZEND_HASH_FOREACH_END();
420 *return_list = list;
421 *return_size = n;
422 return SUCCESS;
423 }
424 /* }}} */
425
426 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)427 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
428 {
429 return (const zend_encoding*)mbfl_name2encoding(encoding_name);
430 }
431
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)432 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
433 {
434 return ((const mbfl_encoding *)encoding)->name;
435 }
436
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)437 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
438 {
439 const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
440 return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
441 }
442
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)443 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
444 {
445 if (!list) {
446 list = (const zend_encoding**)MBSTRG(current_detect_order_list);
447 list_size = MBSTRG(current_detect_order_list_size);
448 }
449 if (list_size == 1 && ((mbfl_encoding*)*list) == &mbfl_encoding_pass) {
450 /* Emulate behavior of previous implementation; it would never return "pass"
451 * from an encoding auto-detection operation */
452 return NULL;
453 }
454 return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding**)list, list_size, false, false);
455 }
456
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)457 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
458 {
459 unsigned int num_errors = 0;
460 zend_string *result = mb_fast_convert((unsigned char*)from, from_length, (const mbfl_encoding*)encoding_from, (const mbfl_encoding*)encoding_to, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
461
462 *to_length = ZSTR_LEN(result);
463 *to = emalloc(ZSTR_LEN(result) + 1); /* Include terminating null byte */
464 memcpy(*to, ZSTR_VAL(result), ZSTR_LEN(result) + 1);
465 zend_string_free(result);
466
467 return from_length;
468 }
469
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)470 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
471 {
472 return php_mb_parse_encoding_list(
473 encoding_list, encoding_list_len,
474 (const mbfl_encoding ***)return_list, return_size,
475 persistent, /* arg_num */ 0);
476 }
477
php_mb_zend_internal_encoding_getter(void)478 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
479 {
480 return (const zend_encoding *)MBSTRG(internal_encoding);
481 }
482
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)483 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
484 {
485 MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
486 return SUCCESS;
487 }
488
489 static zend_multibyte_functions php_mb_zend_multibyte_functions = {
490 "mbstring",
491 php_mb_zend_encoding_fetcher,
492 php_mb_zend_encoding_name_getter,
493 php_mb_zend_encoding_lexer_compatibility_checker,
494 php_mb_zend_encoding_detector,
495 php_mb_zend_encoding_converter,
496 php_mb_zend_encoding_list_parser,
497 php_mb_zend_internal_encoding_getter,
498 php_mb_zend_internal_encoding_setter
499 };
500 /* }}} */
501
502 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)503 static void *_php_mb_compile_regex(const char *pattern)
504 {
505 pcre2_code *retval;
506 PCRE2_SIZE err_offset;
507 int errnum;
508
509 if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
510 PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
511 PCRE2_UCHAR err_str[128];
512 pcre2_get_error_message(errnum, err_str, sizeof(err_str));
513 php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
514 }
515 return retval;
516 }
517 /* }}} */
518
519 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)520 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
521 {
522 int res;
523
524 pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
525 if (NULL == match_data) {
526 pcre2_code_free(opaque);
527 php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
528 return FAILURE;
529 }
530 res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
531 php_pcre_free_match_data(match_data);
532
533 return res;
534 }
535 /* }}} */
536
537 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)538 static void _php_mb_free_regex(void *opaque)
539 {
540 pcre2_code_free(opaque);
541 }
542 /* }}} */
543
544 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)545 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
546 {
547 size_t i;
548
549 *plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
550 *plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
551
552 for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
553 if (php_mb_default_identify_list[i].lang == lang) {
554 *plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
555 *plist_size = php_mb_default_identify_list[i].list_size;
556 return 1;
557 }
558 }
559 return 0;
560 }
561 /* }}} */
562
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)563 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
564 {
565 char *result = emalloc(len + 2);
566 char *resp = result;
567 size_t i;
568
569 for (i = 0; i < len && start[i] != quote; ++i) {
570 if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
571 *resp++ = start[++i];
572 } else {
573 size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
574
575 while (j-- > 0 && i < len) {
576 *resp++ = start[i++];
577 }
578 --i;
579 }
580 }
581
582 *resp = '\0';
583 return result;
584 }
585
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)586 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
587 {
588 char *pos = *line, quote;
589 char *res;
590
591 while (*pos && *pos != stop) {
592 if ((quote = *pos) == '"' || quote == '\'') {
593 ++pos;
594 while (*pos && *pos != quote) {
595 if (*pos == '\\' && pos[1] && pos[1] == quote) {
596 pos += 2;
597 } else {
598 ++pos;
599 }
600 }
601 if (*pos) {
602 ++pos;
603 }
604 } else {
605 pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
606
607 }
608 }
609 if (*pos == '\0') {
610 res = estrdup(*line);
611 *line += strlen(*line);
612 return res;
613 }
614
615 res = estrndup(*line, pos - *line);
616
617 while (*pos == stop) {
618 pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
619 }
620
621 *line = pos;
622 return res;
623 }
624 /* }}} */
625
/* Extract one configuration word from `str` as a newly allocated string.
 *
 * Leading whitespace is skipped. If the word opens with ' or ", everything up
 * to the matching quote is taken; otherwise the word runs to the next
 * whitespace byte. Escapes are resolved by php_mb_rfc1867_substring_conf().
 * Empty or all-whitespace input yields an empty string. */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	for (; *str && isspace(*(unsigned char *)str); ++str) {
		/* skip leading whitespace */
	}

	if (*str == '\0') {
		return estrdup("");
	}

	if (*str != '"' && *str != '\'') {
		/* bare word: ends at the first whitespace byte */
		char *word_end = str;

		while (*word_end && !isspace(*(unsigned char *)word_end)) {
			++word_end;
		}
		return php_mb_rfc1867_substring_conf(encoding, str, word_end - str, 0);
	}

	/* quoted word: hand over the text after the opening quote */
	const char quote = *str;
	char *body = str + 1;

	return php_mb_rfc1867_substring_conf(encoding, body, strlen(body), quote);
}
650 /* }}} */
651
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)652 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
653 {
654 char *s, *s2;
655 const size_t filename_len = strlen(filename);
656
657 /* The \ check should technically be needed for win32 systems only where
658 * it is a valid path separator. However, IE in all it's wisdom always sends
659 * the full path of the file on the user's filesystem, which means that unless
660 * the user does basename() they get a bogus file name. Until IE's user base drops
661 * to nill or problem is fixed this code must remain enabled for all systems. */
662 s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
663 s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
664
665 if (s && s2) {
666 if (s > s2) {
667 return ++s;
668 } else {
669 return ++s2;
670 }
671 } else if (s) {
672 return ++s;
673 } else if (s2) {
674 return ++s2;
675 } else {
676 return filename;
677 }
678 }
679 /* }}} */
680
681 /* {{{ php.ini directive handler */
682 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)683 static PHP_INI_MH(OnUpdate_mbstring_language)
684 {
685 enum mbfl_no_language no_language;
686
687 no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
688 if (no_language == mbfl_no_language_invalid) {
689 MBSTRG(language) = mbfl_no_language_neutral;
690 return FAILURE;
691 }
692 MBSTRG(language) = no_language;
693 php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
694 return SUCCESS;
695 }
696 /* }}} */
697
698 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)699 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
700 {
701 const mbfl_encoding **list;
702 size_t size;
703
704 if (!new_value) {
705 if (MBSTRG(detect_order_list)) {
706 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
707 }
708 MBSTRG(detect_order_list) = NULL;
709 MBSTRG(detect_order_list_size) = 0;
710 return SUCCESS;
711 }
712
713 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
714 return FAILURE;
715 }
716
717 if (MBSTRG(detect_order_list)) {
718 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
719 }
720 MBSTRG(detect_order_list) = list;
721 MBSTRG(detect_order_list_size) = size;
722 return SUCCESS;
723 }
724 /* }}} */
725
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)726 static zend_result _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
727 const mbfl_encoding **list;
728 size_t size;
729 if (new_value_length == 4 && strncmp(new_value, "pass", 4) == 0) {
730 list = (const mbfl_encoding**)pecalloc(1, sizeof(mbfl_encoding*), 1);
731 *list = &mbfl_encoding_pass;
732 size = 1;
733 } else if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
734 return FAILURE;
735 }
736 if (MBSTRG(http_input_list)) {
737 pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
738 }
739 MBSTRG(http_input_list) = list;
740 MBSTRG(http_input_list_size) = size;
741 return SUCCESS;
742 }
743
744 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)745 static PHP_INI_MH(OnUpdate_mbstring_http_input)
746 {
747 if (new_value) {
748 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
749 }
750
751 if (!new_value || !ZSTR_LEN(new_value)) {
752 const char *encoding = php_get_input_encoding();
753 MBSTRG(http_input_set) = 0;
754 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
755 return SUCCESS;
756 }
757
758 MBSTRG(http_input_set) = 1;
759 return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
760 }
761 /* }}} */
762
_php_mb_ini_mbstring_http_output_set(const char * new_value)763 static zend_result _php_mb_ini_mbstring_http_output_set(const char *new_value) {
764 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value);
765 if (!encoding) {
766 return FAILURE;
767 }
768
769 MBSTRG(http_output_encoding) = encoding;
770 MBSTRG(current_http_output_encoding) = encoding;
771 return SUCCESS;
772 }
773
774 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)775 static PHP_INI_MH(OnUpdate_mbstring_http_output)
776 {
777 if (new_value) {
778 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
779 }
780
781 if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
782 MBSTRG(http_output_set) = 0;
783 _php_mb_ini_mbstring_http_output_set(php_get_output_encoding());
784 return SUCCESS;
785 }
786
787 MBSTRG(http_output_set) = 1;
788 return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value));
789 }
790 /* }}} */
791
792 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)793 static zend_result _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
794 {
795 const mbfl_encoding *encoding;
796
797 if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
798 /* falls back to UTF-8 if an unknown encoding name is given */
799 if (new_value) {
800 php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
801 }
802 encoding = &mbfl_encoding_utf8;
803 }
804 MBSTRG(internal_encoding) = encoding;
805 MBSTRG(current_internal_encoding) = encoding;
806 #ifdef HAVE_MBREGEX
807 {
808 const char *enc_name = new_value;
809 if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
810 /* falls back to UTF-8 if an unknown encoding name is given */
811 enc_name = "UTF-8";
812 php_mb_regex_set_default_mbctype(enc_name);
813 }
814 php_mb_regex_set_mbctype(new_value);
815 }
816 #endif
817 return SUCCESS;
818 }
819 /* }}} */
820
821 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)822 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
823 {
824 if (new_value) {
825 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
826 }
827
828 if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
829 return FAILURE;
830 }
831
832 if (new_value && ZSTR_LEN(new_value)) {
833 MBSTRG(internal_encoding_set) = 1;
834 return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
835 } else {
836 const char *encoding = php_get_internal_encoding();
837 MBSTRG(internal_encoding_set) = 0;
838 return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
839 }
840 }
841 /* }}} */
842
843 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)844 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
845 {
846 if (new_value != NULL) {
847 if (zend_string_equals_literal_ci(new_value, "none")) {
848 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
849 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
850 } else if (zend_string_equals_literal_ci(new_value, "long")) {
851 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
852 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
853 } else if (zend_string_equals_literal_ci(new_value, "entity")) {
854 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
855 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
856 } else {
857 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
858 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
859 if (ZSTR_LEN(new_value) > 0) {
860 char *endptr = NULL;
861 int c = strtol(ZSTR_VAL(new_value), &endptr, 0);
862
863 if (*endptr == '\0') {
864 MBSTRG(filter_illegal_substchar) = c;
865 MBSTRG(current_filter_illegal_substchar) = c;
866 }
867 }
868 }
869 } else {
870 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
871 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
872 MBSTRG(filter_illegal_substchar) = '?';
873 MBSTRG(current_filter_illegal_substchar) = '?';
874 }
875
876 return SUCCESS;
877 }
878 /* }}} */
879
880 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)881 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
882 {
883 if (new_value == NULL) {
884 return FAILURE;
885 }
886
887 OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
888
889 if (MBSTRG(encoding_translation)) {
890 sapi_unregister_post_entry(php_post_entries);
891 sapi_register_post_entries(mbstr_post_entries);
892 } else {
893 sapi_unregister_post_entry(mbstr_post_entries);
894 sapi_register_post_entries(php_post_entries);
895 }
896
897 return SUCCESS;
898 }
899 /* }}} */
900
/* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes) */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)902 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
903 {
904 zend_string *tmp;
905 void *re = NULL;
906
907 if (!new_value) {
908 new_value = entry->orig_value;
909 }
910 tmp = php_trim(new_value, NULL, 0, 3);
911
912 if (ZSTR_LEN(tmp) > 0) {
913 if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
914 zend_string_release_ex(tmp, 0);
915 return FAILURE;
916 }
917 }
918
919 if (MBSTRG(http_output_conv_mimetypes)) {
920 _php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
921 }
922
923 MBSTRG(http_output_conv_mimetypes) = re;
924
925 zend_string_release_ex(tmp, 0);
926 return SUCCESS;
927 }
928 /* }}} */
929 /* }}} */
930
931 /* {{{ php.ini directive registration */
/* Table of mbstring.* directives. Each entry gives the directive name, its default
 * value, the contexts in which it may be changed, and the handler run on change.
 * The STD_* variants additionally name the zend_mbstring_globals member that the
 * handler stores into. */
PHP_INI_BEGIN()
	PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
	PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
	PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
	PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
	STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
	PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)

	/* The only directive in this table that is not PHP_INI_ALL */
	STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
		PHP_INI_SYSTEM | PHP_INI_PERDIR,
		OnUpdate_mbstring_encoding_translation,
		encoding_translation, zend_mbstring_globals, mbstring_globals)
	/* The default is a regex; its handler compiles the value with _php_mb_compile_regex() */
	PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
		"^(text/|application/xhtml\\+xml)",
		PHP_INI_ALL,
		OnUpdate_mbstring_http_output_conv_mimetypes)

	STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
		PHP_INI_ALL,
		OnUpdateBool,
		strict_detection, zend_mbstring_globals, mbstring_globals)
#ifdef HAVE_MBREGEX
	/* Limits for the regex engine; only present when mbregex support is compiled in */
	STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
	STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
#endif
PHP_INI_END()
958 /* }}} */
959
960 static void mbstring_internal_encoding_changed_hook(void) {
961 /* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
962 if (!MBSTRG(internal_encoding_set)) {
963 const char *encoding = php_get_internal_encoding();
964 _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
965 }
966
967 if (!MBSTRG(http_output_set)) {
968 const char *encoding = php_get_output_encoding();
969 _php_mb_ini_mbstring_http_output_set(encoding);
970 }
971
972 if (!MBSTRG(http_input_set)) {
973 const char *encoding = php_get_input_encoding();
974 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
975 }
976 }
977
978 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)979 static PHP_GINIT_FUNCTION(mbstring)
980 {
981 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
982 ZEND_TSRMLS_CACHE_UPDATE();
983 #endif
984
985 mbstring_globals->language = mbfl_no_language_uni;
986 mbstring_globals->internal_encoding = NULL;
987 mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
988 mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
989 mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
990 mbstring_globals->http_input_identify = NULL;
991 mbstring_globals->http_input_identify_get = NULL;
992 mbstring_globals->http_input_identify_post = NULL;
993 mbstring_globals->http_input_identify_cookie = NULL;
994 mbstring_globals->http_input_identify_string = NULL;
995 mbstring_globals->http_input_list = NULL;
996 mbstring_globals->http_input_list_size = 0;
997 mbstring_globals->detect_order_list = NULL;
998 mbstring_globals->detect_order_list_size = 0;
999 mbstring_globals->current_detect_order_list = NULL;
1000 mbstring_globals->current_detect_order_list_size = 0;
1001 mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1002 mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1003 mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1004 mbstring_globals->filter_illegal_substchar = '?';
1005 mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1006 mbstring_globals->current_filter_illegal_substchar = '?';
1007 mbstring_globals->illegalchars = 0;
1008 mbstring_globals->encoding_translation = 0;
1009 mbstring_globals->strict_detection = 0;
1010 mbstring_globals->outconv_enabled = false;
1011 mbstring_globals->outconv_state = 0;
1012 mbstring_globals->http_output_conv_mimetypes = NULL;
1013 #ifdef HAVE_MBREGEX
1014 mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1015 #endif
1016 mbstring_globals->last_used_encoding_name = NULL;
1017 mbstring_globals->last_used_encoding = NULL;
1018 mbstring_globals->internal_encoding_set = 0;
1019 mbstring_globals->http_output_set = 0;
1020 mbstring_globals->http_input_set = 0;
1021 mbstring_globals->all_encodings_list = NULL;
1022 }
1023 /* }}} */
1024
1025 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1026 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1027 {
1028 if (mbstring_globals->http_input_list) {
1029 free(ZEND_VOIDP(mbstring_globals->http_input_list));
1030 }
1031 if (mbstring_globals->detect_order_list) {
1032 free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1033 }
1034 if (mbstring_globals->http_output_conv_mimetypes) {
1035 _php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1036 }
1037 #ifdef HAVE_MBREGEX
1038 php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1039 #endif
1040 }
1041 /* }}} */
1042
1043 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1044 static void init_check_utf8(void);
1045 #endif
1046
1047 /* {{{ PHP_MINIT_FUNCTION(mbstring) */
/* Module startup. In order: registers the INI entries, installs and immediately
 * runs the encoding-changed hook, registers the treat-data and (optionally) POST
 * handlers with SAPI, starts mbregex, registers the extension's symbols, and plugs
 * mbstring into the engine's multibyte and RFC 1867 (file upload) callbacks.
 * Returns FAILURE only if zend_multibyte_set_functions() fails. */
PHP_MINIT_FUNCTION(mbstring)
{
#if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
	ZEND_TSRMLS_CACHE_UPDATE();
#endif

	REGISTER_INI_ENTRIES();

	/* We assume that we're the only user of the hook. */
	ZEND_ASSERT(php_internal_encoding_changed == NULL);
	php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
	/* Run the hook once by hand so that settings without an explicit mbstring value
	 * are derived from the php_get_*_encoding() results right away */
	mbstring_internal_encoding_changed_hook();

	/* This is a global handler. Should not be set in a per-request handler. */
	sapi_register_treat_data(mbstr_treat_data);

	/* Post handlers are stored in the thread-local context. */
	if (MBSTRG(encoding_translation)) {
		sapi_register_post_entries(mbstr_post_entries);
	}

#ifdef HAVE_MBREGEX
	/* NOTE(review): the result of the mb_regex startup routine is not checked */
	PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
#endif

	register_mbstring_symbols(module_number);

	if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
		return FAILURE;
	}

	php_rfc1867_set_multibyte_callbacks(
		php_mb_encoding_translation,
		php_mb_gpc_get_detect_order,
		php_mb_gpc_set_input_encoding,
		php_mb_rfc1867_getword,
		php_mb_rfc1867_getword_conf,
		php_mb_rfc1867_basename);

#ifdef ZEND_INTRIN_AVX2_FUNC_PTR
	/* Only compiled when AVX2 implementations are selected through function pointers;
	 * presumably these calls pick the implementation at startup - confirm in the
	 * definitions of init_check_utf8() / init_convert_utf16() */
	init_check_utf8();
	init_convert_utf16();
#endif

	return SUCCESS;
}
1094 /* }}} */
1095
1096 /* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
/* Module shutdown: unregisters the INI entries, restores the engine's multibyte
 * functions, shuts down mbregex when compiled in, and clears the
 * php_internal_encoding_changed hook installed during module startup.
 * Always returns SUCCESS. */
PHP_MSHUTDOWN_FUNCTION(mbstring)
{
	UNREGISTER_INI_ENTRIES();

	zend_multibyte_restore_functions();

#ifdef HAVE_MBREGEX
	PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
#endif

	/* Clear the hook so it no longer points at this module's function */
	php_internal_encoding_changed = NULL;

	return SUCCESS;
}
1111 /* }}} */
1112
1113 /* {{{ PHP_RINIT_FUNCTION(mbstring) */
/* Request startup: resets every per-request "current_*" setting to its configured
 * value, zeroes the illegal-character counter, rebuilds the current detect order
 * list, starts the mbregex request state when compiled in, and passes the
 * configured internal encoding to the engine. Always returns SUCCESS. */
PHP_RINIT_FUNCTION(mbstring)
{
	/* Runtime overrides made during a request must not leak into the next one */
	MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
	MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
	MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
	MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);

	MBSTRG(illegalchars) = 0;

	php_mb_populate_current_detect_order_list();

#ifdef HAVE_MBREGEX
	PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
#endif
	/* The cast reinterprets the mbfl_encoding pointer as the engine's zend_encoding handle */
	zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));

	return SUCCESS;
}
1132 /* }}} */
1133
1134 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1135 PHP_RSHUTDOWN_FUNCTION(mbstring)
1136 {
1137 if (MBSTRG(current_detect_order_list) != NULL) {
1138 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1139 MBSTRG(current_detect_order_list) = NULL;
1140 MBSTRG(current_detect_order_list_size) = 0;
1141 }
1142
1143 /* clear http input identification. */
1144 MBSTRG(http_input_identify) = NULL;
1145 MBSTRG(http_input_identify_post) = NULL;
1146 MBSTRG(http_input_identify_get) = NULL;
1147 MBSTRG(http_input_identify_cookie) = NULL;
1148 MBSTRG(http_input_identify_string) = NULL;
1149
1150 if (MBSTRG(last_used_encoding_name)) {
1151 zend_string_release(MBSTRG(last_used_encoding_name));
1152 MBSTRG(last_used_encoding_name) = NULL;
1153 }
1154
1155 MBSTRG(internal_encoding_set) = 0;
1156 MBSTRG(http_output_set) = 0;
1157 MBSTRG(http_input_set) = 0;
1158
1159 MBSTRG(outconv_enabled) = false;
1160 MBSTRG(outconv_state) = 0;
1161
1162 if (MBSTRG(all_encodings_list)) {
1163 GC_DELREF(MBSTRG(all_encodings_list));
1164 zend_array_destroy(MBSTRG(all_encodings_list));
1165 MBSTRG(all_encodings_list) = NULL;
1166 }
1167
1168 #ifdef HAVE_MBREGEX
1169 PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1170 #endif
1171
1172 return SUCCESS;
1173 }
1174 /* }}} */
1175
1176 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1177 PHP_MINFO_FUNCTION(mbstring)
1178 {
1179 php_info_print_table_start();
1180 php_info_print_table_row(2, "Multibyte Support", "enabled");
1181 php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1182 php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1183 {
1184 char tmp[256];
1185 snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1186 php_info_print_table_row(2, "libmbfl version", tmp);
1187 }
1188 php_info_print_table_end();
1189
1190 php_info_print_table_start();
1191 php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1192 php_info_print_table_end();
1193
1194 #ifdef HAVE_MBREGEX
1195 PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1196 #endif
1197
1198 DISPLAY_INI_ENTRIES();
1199 }
1200 /* }}} */
1201
1202 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1203 PHP_FUNCTION(mb_language)
1204 {
1205 zend_string *name = NULL;
1206
1207 ZEND_PARSE_PARAMETERS_START(0, 1)
1208 Z_PARAM_OPTIONAL
1209 Z_PARAM_STR_OR_NULL(name)
1210 ZEND_PARSE_PARAMETERS_END();
1211
1212 if (name == NULL) {
1213 RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1214 } else {
1215 zend_string *ini_name = ZSTR_INIT_LITERAL("mbstring.language", 0);
1216 if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1217 zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1218 zend_string_release_ex(ini_name, 0);
1219 RETURN_THROWS();
1220 }
1221 // TODO Make return void
1222 RETVAL_TRUE;
1223 zend_string_release_ex(ini_name, 0);
1224 }
1225 }
1226 /* }}} */
1227
1228 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1229 PHP_FUNCTION(mb_internal_encoding)
1230 {
1231 char *name = NULL;
1232 size_t name_len;
1233 const mbfl_encoding *encoding;
1234
1235 ZEND_PARSE_PARAMETERS_START(0, 1)
1236 Z_PARAM_OPTIONAL
1237 Z_PARAM_STRING_OR_NULL(name, name_len)
1238 ZEND_PARSE_PARAMETERS_END();
1239
1240 if (name == NULL) {
1241 ZEND_ASSERT(MBSTRG(current_internal_encoding));
1242 RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1243 } else {
1244 encoding = mbfl_name2encoding(name);
1245 if (!encoding) {
1246 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1247 RETURN_THROWS();
1248 } else {
1249 MBSTRG(current_internal_encoding) = encoding;
1250 MBSTRG(internal_encoding_set) = 1;
1251 /* TODO Return old encoding */
1252 RETURN_TRUE;
1253 }
1254 }
1255 }
1256 /* }}} */
1257
1258 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1259 PHP_FUNCTION(mb_http_input)
1260 {
1261 char *type = NULL;
1262 size_t type_len = 0, n;
1263 const mbfl_encoding **entry;
1264 const mbfl_encoding *encoding;
1265
1266 ZEND_PARSE_PARAMETERS_START(0, 1)
1267 Z_PARAM_OPTIONAL
1268 Z_PARAM_STRING_OR_NULL(type, type_len)
1269 ZEND_PARSE_PARAMETERS_END();
1270
1271 if (type == NULL) {
1272 encoding = MBSTRG(http_input_identify);
1273 } else if (type_len != 1) {
1274 zend_argument_value_error(1,
1275 "must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1276 RETURN_THROWS();
1277 } else {
1278 switch (*type) {
1279 case 'G':
1280 case 'g':
1281 encoding = MBSTRG(http_input_identify_get);
1282 break;
1283 case 'P':
1284 case 'p':
1285 encoding = MBSTRG(http_input_identify_post);
1286 break;
1287 case 'C':
1288 case 'c':
1289 encoding = MBSTRG(http_input_identify_cookie);
1290 break;
1291 case 'S':
1292 case 's':
1293 encoding = MBSTRG(http_input_identify_string);
1294 break;
1295 case 'I':
1296 case 'i':
1297 entry = MBSTRG(http_input_list);
1298 n = MBSTRG(http_input_list_size);
1299 array_init(return_value);
1300 for (size_t i = 0; i < n; i++, entry++) {
1301 add_next_index_string(return_value, (*entry)->name);
1302 }
1303 return;
1304 case 'L':
1305 case 'l':
1306 entry = MBSTRG(http_input_list);
1307 n = MBSTRG(http_input_list_size);
1308 if (n == 0) {
1309 RETURN_FALSE;
1310 }
1311
1312 smart_str result = {0};
1313 for (size_t i = 0; i < n; i++, entry++) {
1314 if (i > 0) {
1315 smart_str_appendc(&result, ',');
1316 }
1317 smart_str_appends(&result, (*entry)->name);
1318 }
1319 RETURN_STR(smart_str_extract(&result));
1320 default:
1321 zend_argument_value_error(1,
1322 "must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1323 RETURN_THROWS();
1324 }
1325 }
1326
1327 if (encoding) {
1328 RETURN_STRING(encoding->name);
1329 } else {
1330 RETURN_FALSE;
1331 }
1332 }
1333 /* }}} */
1334
1335 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1336 PHP_FUNCTION(mb_http_output)
1337 {
1338 char *name = NULL;
1339 size_t name_len;
1340
1341 ZEND_PARSE_PARAMETERS_START(0, 1)
1342 Z_PARAM_OPTIONAL
1343 Z_PARAM_STRING_OR_NULL(name, name_len)
1344 ZEND_PARSE_PARAMETERS_END();
1345
1346 if (name == NULL) {
1347 ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1348 RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1349 } else {
1350 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name);
1351 if (!encoding) {
1352 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1353 RETURN_THROWS();
1354 } else {
1355 MBSTRG(http_output_set) = 1;
1356 MBSTRG(current_http_output_encoding) = encoding;
1357 /* TODO Return previous encoding? */
1358 RETURN_TRUE;
1359 }
1360 }
1361 }
1362 /* }}} */
1363
/* {{{ Sets the current detect_order or returns the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1365 PHP_FUNCTION(mb_detect_order)
1366 {
1367 zend_string *order_str = NULL;
1368 HashTable *order_ht = NULL;
1369
1370 ZEND_PARSE_PARAMETERS_START(0, 1)
1371 Z_PARAM_OPTIONAL
1372 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1373 ZEND_PARSE_PARAMETERS_END();
1374
1375 if (!order_str && !order_ht) {
1376 size_t n = MBSTRG(current_detect_order_list_size);
1377 const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1378 array_init(return_value);
1379 for (size_t i = 0; i < n; i++) {
1380 add_next_index_string(return_value, (*entry)->name);
1381 entry++;
1382 }
1383 } else {
1384 const mbfl_encoding **list;
1385 size_t size;
1386 if (order_ht) {
1387 if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1388 RETURN_THROWS();
1389 }
1390 } else {
1391 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1)) {
1392 RETURN_THROWS();
1393 }
1394 }
1395
1396 if (size == 0) {
1397 efree(ZEND_VOIDP(list));
1398 zend_argument_value_error(1, "must specify at least one encoding");
1399 RETURN_THROWS();
1400 }
1401
1402 if (MBSTRG(current_detect_order_list)) {
1403 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1404 }
1405 MBSTRG(current_detect_order_list) = list;
1406 MBSTRG(current_detect_order_list_size) = size;
1407 RETURN_TRUE;
1408 }
1409 }
1410 /* }}} */
1411
/* Returns true if cp may be used as a substitute character: it must lie in the
 * Unicode range 0..0x10FFFF and must not be a surrogate (0xD800..0xDFFF). */
static inline bool php_mb_check_code_point(zend_long cp)
{
	bool in_unicode_range = (cp >= 0 && cp < 0x110000);
	/* Surrogate code-points are never valid on their own and we only allow a single
	 * substitute character. */
	bool is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);

	/* As we do not know the target encoding of the conversion operation that is going to
	 * use the substitution character, we cannot check whether the codepoint is actually mapped
	 * in the given encoding at this point. Thus everything else has to be accepted. */
	return in_unicode_range && !is_surrogate;
}
1430
1431 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1432 PHP_FUNCTION(mb_substitute_character)
1433 {
1434 zend_string *substitute_character = NULL;
1435 zend_long substitute_codepoint;
1436 bool substitute_is_null = 1;
1437
1438 ZEND_PARSE_PARAMETERS_START(0, 1)
1439 Z_PARAM_OPTIONAL
1440 Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1441 ZEND_PARSE_PARAMETERS_END();
1442
1443 if (substitute_is_null) {
1444 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1445 RETURN_STRING("none");
1446 }
1447 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1448 RETURN_STRING("long");
1449 }
1450 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1451 RETURN_STRING("entity");
1452 }
1453 RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1454 }
1455
1456 if (substitute_character != NULL) {
1457 if (zend_string_equals_literal_ci(substitute_character, "none")) {
1458 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1459 RETURN_TRUE;
1460 }
1461 if (zend_string_equals_literal_ci(substitute_character, "long")) {
1462 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1463 RETURN_TRUE;
1464 }
1465 if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1466 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1467 RETURN_TRUE;
1468 }
1469 /* Invalid string value */
1470 zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1471 RETURN_THROWS();
1472 }
1473 /* Integer codepoint passed */
1474 if (!php_mb_check_code_point(substitute_codepoint)) {
1475 zend_argument_value_error(1, "is not a valid codepoint");
1476 RETURN_THROWS();
1477 }
1478
1479 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1480 MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1481 RETURN_TRUE;
1482 }
1483 /* }}} */
1484
1485 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1486 PHP_FUNCTION(mb_preferred_mime_name)
1487 {
1488 char *name = NULL;
1489 size_t name_len;
1490
1491 ZEND_PARSE_PARAMETERS_START(1, 1)
1492 Z_PARAM_STRING(name, name_len)
1493 ZEND_PARSE_PARAMETERS_END();
1494
1495 const mbfl_encoding *enc = mbfl_name2encoding(name);
1496 if (enc == NULL) {
1497 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1498 RETURN_THROWS();
1499 }
1500
1501 const char *preferred_name = mbfl_encoding_preferred_mime_name(enc);
1502 if (preferred_name == NULL || *preferred_name == '\0') {
1503 php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1504 RETVAL_FALSE;
1505 } else {
1506 RETVAL_STRING((char *)preferred_name);
1507 }
1508 }
1509 /* }}} */
1510
1511 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1512 PHP_FUNCTION(mb_parse_str)
1513 {
1514 zval *track_vars_array = NULL;
1515 char *encstr;
1516 size_t encstr_len;
1517 php_mb_encoding_handler_info_t info;
1518 const mbfl_encoding *detected;
1519
1520 ZEND_PARSE_PARAMETERS_START(2, 2)
1521 Z_PARAM_STRING(encstr, encstr_len)
1522 Z_PARAM_ZVAL(track_vars_array)
1523 ZEND_PARSE_PARAMETERS_END();
1524
1525 track_vars_array = zend_try_array_init(track_vars_array);
1526 if (!track_vars_array) {
1527 RETURN_THROWS();
1528 }
1529
1530 encstr = estrndup(encstr, encstr_len);
1531
1532 info.data_type = PARSE_STRING;
1533 info.separator = PG(arg_separator).input;
1534 info.report_errors = true;
1535 info.to_encoding = MBSTRG(current_internal_encoding);
1536 info.from_encodings = MBSTRG(http_input_list);
1537 info.num_from_encodings = MBSTRG(http_input_list_size);
1538
1539 detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1540
1541 MBSTRG(http_input_identify) = detected;
1542
1543 RETVAL_BOOL(detected);
1544
1545 if (encstr != NULL) efree(encstr);
1546 }
1547 /* }}} */
1548
/* mb_output_handler(string $string, int $status):
 * Output-buffer callback that converts a chunk of output from the current internal
 * encoding to the current HTTP output encoding.
 *
 * On the first chunk (PHP_OUTPUT_HANDLER_START) it decides whether conversion applies
 * to this response and, if so, tries to send a Content-Type header carrying the
 * charset. The decision (outconv_enabled) and the decoder state (outconv_state) are
 * kept in the module globals across chunks and reset after the final chunk
 * (PHP_OUTPUT_HANDLER_END). When conversion does not apply, the chunk is returned
 * unchanged. */
PHP_FUNCTION(mb_output_handler)
{
	zend_string *str;
	zend_long arg_status;

	ZEND_PARSE_PARAMETERS_START(2, 2)
		Z_PARAM_STR(str)
		Z_PARAM_LONG(arg_status)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *encoding = MBSTRG(current_http_output_encoding);
	/* Output encoding "pass": nothing to convert */
	if (encoding == &mbfl_encoding_pass) {
		RETURN_STR_COPY(str);
	}

	if (arg_status & PHP_OUTPUT_HANDLER_START) {
		/* free_mimetype is true only when 'mimetype' is an emalloc'ed copy made below */
		bool free_mimetype = false;
		char *mimetype = NULL;

		/* Analyze mime type */
		if (SG(sapi_headers).mimetype && _php_mb_match_regex(MBSTRG(http_output_conv_mimetypes), SG(sapi_headers).mimetype, strlen(SG(sapi_headers).mimetype))) {
			/* A mimetype is already set and matches http_output_conv_mimetypes:
			 * copy it, cutting at the first ';' so that parameters are dropped */
			char *s;
			if ((s = strchr(SG(sapi_headers).mimetype, ';')) == NULL) {
				mimetype = estrdup(SG(sapi_headers).mimetype);
			} else {
				mimetype = estrndup(SG(sapi_headers).mimetype, s - SG(sapi_headers).mimetype);
			}
			free_mimetype = true;
		} else if (SG(sapi_headers).send_default_content_type) {
			/* Otherwise fall back to the default mimetype (not copied, so not freed) */
			mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
		}

		/* If content-type is not yet set, set it and enable conversion */
		if (SG(sapi_headers).send_default_content_type || free_mimetype) {
			const char *charset = encoding->mime_name;
			if (charset) {
				char *p;
				size_t len = spprintf(&p, 0, "Content-Type: %s; charset=%s", mimetype, charset);
				/* NOTE(review): 'p' is not freed here; presumably sapi_add_header()
				 * takes ownership when its third argument is 0 - confirm */
				if (sapi_add_header(p, len, 0) != FAILURE) {
					SG(sapi_headers).send_default_content_type = 0;
				}
			}

			/* Conversion is enabled even when the encoding has no MIME name to advertise */
			MBSTRG(outconv_enabled) = true;
		}

		if (free_mimetype) {
			efree(mimetype);
		}
	}

	/* Conversion was not enabled when the first chunk was seen: pass the data through */
	if (!MBSTRG(outconv_enabled)) {
		RETURN_STR_COPY(str);
	}

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, ZSTR_LEN(str), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
	size_t in_len = ZSTR_LEN(str);
	bool last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);

	/* Decode up to 128 codepoints at a time and re-encode them. The decoder state is
	 * kept in outconv_state between calls; the encoder is told it has reached the end
	 * only on the last batch of the last chunk. */
	while (in_len) {
		size_t out_len = MBSTRG(current_internal_encoding)->to_wchar(&in, &in_len, wchar_buf, 128, &MBSTRG(outconv_state));
		ZEND_ASSERT(out_len <= 128);
		encoding->from_wchar(wchar_buf, out_len, &buf, !in_len && last_feed);
	}

	MBSTRG(illegalchars) += buf.errors;
	RETVAL_STR(mb_convert_buf_result_raw(&buf));

	if (last_feed) {
		/* Final chunk: reset the streaming state for the next use of the handler */
		MBSTRG(outconv_enabled) = false;
		MBSTRG(outconv_state) = 0;
	}
}
1626
PHP_FUNCTION(mb_str_split)1627 PHP_FUNCTION(mb_str_split)
1628 {
1629 zend_string *str, *encoding = NULL;
1630 zend_long split_len = 1;
1631
1632 ZEND_PARSE_PARAMETERS_START(1, 3)
1633 Z_PARAM_STR(str)
1634 Z_PARAM_OPTIONAL
1635 Z_PARAM_LONG(split_len)
1636 Z_PARAM_STR_OR_NULL(encoding)
1637 ZEND_PARSE_PARAMETERS_END();
1638
1639 if (split_len <= 0) {
1640 zend_argument_value_error(2, "must be greater than 0");
1641 RETURN_THROWS();
1642 } else if (split_len > UINT_MAX / 4) {
1643 zend_argument_value_error(2, "is too large");
1644 RETURN_THROWS();
1645 }
1646
1647 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
1648 if (!enc) {
1649 RETURN_THROWS();
1650 }
1651
1652 if (ZSTR_LEN(str) == 0) {
1653 RETURN_EMPTY_ARRAY();
1654 }
1655
1656 unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
1657
1658 unsigned int char_len = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
1659 if (char_len) {
1660 unsigned int chunk_len = char_len * split_len;
1661 unsigned int chunks = ((ZSTR_LEN(str) / chunk_len) + split_len - 1) / split_len; /* round up */
1662 array_init_size(return_value, chunks);
1663 while (p < e) {
1664 add_next_index_stringl(return_value, (const char*)p, MIN(chunk_len, e - p));
1665 p += chunk_len;
1666 }
1667 } else if (enc->mblen_table) {
1668 unsigned char const *mbtab = enc->mblen_table;
1669
1670 /* Assume that we have 1-byte characters */
1671 array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1672
1673 while (p < e) {
1674 unsigned char *chunk = p; /* start of chunk */
1675
1676 for (int char_count = 0; char_count < split_len && p < e; char_count++) {
1677 p += mbtab[*p];
1678 }
1679 if (p > e) {
1680 p = e; /* ensure chunk is in bounds */
1681 }
1682 add_next_index_stringl(return_value, (const char*)chunk, p - chunk);
1683 }
1684 } else {
1685 /* Assume that we have 1-byte characters */
1686 array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1687
1688 uint32_t wchar_buf[128];
1689 size_t in_len = ZSTR_LEN(str);
1690 unsigned int state = 0, char_count = 0;
1691
1692 mb_convert_buf buf;
1693
1694 while (in_len) {
1695 size_t out_len = enc->to_wchar(&p, &in_len, wchar_buf, 128, &state);
1696 ZEND_ASSERT(out_len <= 128);
1697 size_t i = 0;
1698
1699 /* Is there some output remaining from the previous iteration? */
1700 if (char_count) {
1701 if (out_len >= split_len - char_count) {
1702 /* Finish off an incomplete chunk from previous iteration
1703 * ('buf' was already initialized; we don't need to do it again) */
1704 enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
1705 i += split_len - char_count;
1706 char_count = 0;
1707 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1708 } else {
1709 /* Output from this iteration is not enough to finish the next chunk;
1710 * output what we can, and leave 'buf' to be used again on next iteration */
1711 enc->from_wchar(wchar_buf, out_len, &buf, !in_len);
1712 char_count += out_len;
1713 continue;
1714 }
1715 }
1716
1717 while (i < out_len) {
1718 /* Prepare for the next chunk */
1719 mb_convert_buf_init(&buf, split_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1720
1721 if (out_len - i >= split_len) {
1722 enc->from_wchar(wchar_buf + i, split_len, &buf, true);
1723 i += split_len;
1724 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1725 } else {
1726 /* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
1727 * leave them for the next iteration */
1728 enc->from_wchar(wchar_buf + i, out_len - i, &buf, !in_len);
1729 char_count = out_len - i;
1730 break;
1731 }
1732 }
1733 }
1734
1735 if (char_count) {
1736 /* The main loop above has finished processing the input string, but
1737 * has left a partial chunk in 'buf' */
1738 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1739 }
1740 }
1741 }
1742
#ifdef __SSE2__
/* Thanks to StackOverflow user 'Paul R' (https://stackoverflow.com/users/253056/paul-r)
 * From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector
 * Takes a 128-bit XMM register, treats each byte as an unsigned 8-bit integer, and
 * returns the sum of all 16 of them in an ordinary scalar register */
static inline uint32_t _mm_sum_epu8(const __m128i v)
{
	/* There is no dedicated instruction to sum up the 8-bit values of a 128-bit register.
	 * _mm_sad_epu8 takes the absolute differences between corresponding bytes of two
	 * XMM registers and adds them up separately for the low and the high 8 bytes,
	 * leaving two 16-bit sums, one in each 64-bit half of the result.
	 * With an all-zero register as the second operand, those "differences" are just
	 * the byte values of `v` */
	__m128i half_sums = _mm_sad_epu8(v, _mm_setzero_si128());

	/* The sum for the low half sits in the lowest 32 bits, the sum for the high half
	 * in 16-bit lane 4; extract both and add them */
	uint32_t low_sum = _mm_cvtsi128_si32(half_sums);
	uint32_t high_sum = _mm_extract_epi16(half_sums, 4);
	return low_sum + high_sum;
}
#endif

/* Counts the codepoints in `len` bytes of UTF-8 starting at `p`.
 * This assumes that the input is valid UTF-8.
 * In UTF-8, the only bytes which do not start a new codepoint are 0x80-0xBF (continuation bytes)
 * Interpreted as signed integers, those are all byte values less than -64
 * So the codepoint count is the byte length minus the number of continuation bytes */
static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
{
	unsigned char *e = p + len;

#ifdef __SSE2__
	if (len >= sizeof(__m128i)) {
		/* Last position from which a full 16-byte load still ends inside the string */
		unsigned char *last_full_load = e - sizeof(__m128i);

		const __m128i threshold = _mm_set1_epi8(-64);
		const __m128i one_per_lane = _mm_set1_epi8(1);
		__m128i lane_counts = _mm_setzero_si128(); /* 16 continuation-byte counters, one per lane */
		unsigned int unfolded_rounds = 0;

		do {
			__m128i bytes = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */
			__m128i is_continuation = _mm_cmplt_epi8(bytes, threshold); /* All-ones in continuation-byte lanes */
			lane_counts = _mm_add_epi8(lane_counts, _mm_and_si128(is_continuation, one_per_lane));

			/* An 8-bit lane can count up to 255, so after 255 rounds fold the counters
			 * into `len` and start again from zero */
			if (++unfolded_rounds == 255) {
				len -= _mm_sum_epu8(lane_counts);
				lane_counts = _mm_setzero_si128();
				unfolded_rounds = 0;
			}

			p += sizeof(__m128i);
		} while (p <= last_full_load);

		/* Fold in whatever the counters hold since the last reset */
		len -= _mm_sum_epu8(lane_counts);
	}
#endif

	/* Handle the bytes not covered above: the 0-15 trailing bytes, or the whole
	 * string when the vectorized path was not taken */
	for (; p < e; p++) {
		if ((signed char)*p < -64) {
			len--;
		}
	}

	return len;
}
1814
/* Return the number of codepoints which `string` contains when decoded as `encoding` */
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
{
	/* For encodings flagged as single-byte, 2-byte or 4-byte fixed width, the masked flag
	 * value is used directly as the divisor, so the answer is plain arithmetic */
	unsigned int bytes_per_char = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (bytes_per_char) {
		return ZSTR_LEN(string) / bytes_per_char;
	}

	/* Strings already marked as valid UTF-8 can take the fast continuation-byte count */
	if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && ZSTR_IS_VALID_UTF8(string)) {
		return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
	}

	/* General case: decode the string a buffer-load at a time and add up how many
	 * codepoints each call produced. The decoded values themselves are discarded */
	uint32_t scratch[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t count = 0;

	while (bytes_left) {
		count += encoding->to_wchar(&cursor, &bytes_left, scratch, 128, &state);
	}

	return count;
}
1836
1837 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1838 PHP_FUNCTION(mb_strlen)
1839 {
1840 zend_string *string, *enc_name = NULL;
1841
1842 ZEND_PARSE_PARAMETERS_START(1, 2)
1843 Z_PARAM_STR(string)
1844 Z_PARAM_OPTIONAL
1845 Z_PARAM_STR_OR_NULL(enc_name)
1846 ZEND_PARSE_PARAMETERS_END();
1847
1848 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1849 if (!enc) {
1850 RETURN_THROWS();
1851 }
1852
1853 RETVAL_LONG(mb_get_strlen(string, enc));
1854 }
1855 /* }}} */
1856
1857 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)1858 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
1859 {
1860 return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
1861 }
1862
offset_to_pointer_utf8(unsigned char * str,unsigned char * end,ssize_t offset)1863 static unsigned char* offset_to_pointer_utf8(unsigned char *str, unsigned char *end, ssize_t offset) {
1864 if (offset < 0) {
1865 unsigned char *pos = end;
1866 while (offset < 0) {
1867 if (pos <= str) {
1868 return NULL;
1869 }
1870
1871 unsigned char c = *--pos;
1872 if (c < 0x80 || (c & 0xC0) != 0x80) {
1873 offset++;
1874 }
1875 }
1876 return pos;
1877 } else {
1878 const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
1879 unsigned char *pos = str;
1880 while (offset-- > 0) {
1881 if (pos >= end) {
1882 return NULL;
1883 }
1884 pos += u8_tbl[*pos];
1885 }
1886 return pos;
1887 }
1888 }
1889
/* Inverse of offset_to_pointer_utf8 for forward offsets: how many codepoints lie between
 * `start` and `pos`? This is the UTF-8 length of the bytes in [start, pos) */
static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) {
	size_t byte_distance = pos - start;
	return mb_fast_strlen_utf8(start, byte_distance);
}
1893
/* Find the codepoint index at which `needle` occurs in `haystack`
 *
 * Unless `enc` is one of the UTF-8 variants, both strings are first converted to UTF-8;
 * the temporary copies are freed before returning. Searching is then done bytewise on
 * the UTF-8 text, and the byte position of the match is converted back to a codepoint index
 *
 * `offset` is a codepoint offset into `haystack`; a negative value counts from the end
 * `reverse` selects the last match rather than the first:
 *  - forward search: look from the offset position to the end of the haystack
 *  - reverse search, offset >= 0: look for the last match in the same region
 *  - reverse search, offset < 0: look for the last match between the start of the haystack
 *    and the point one needle-length (in codepoints) past the offset position
 *
 * Returns the index of the match, or MBFL_ERROR_OFFSET if `offset` does not fall inside
 * `haystack`, or MBFL_ERROR_NOT_FOUND if there is no match */
static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse)
{
	size_t result;
	zend_string *haystack_u8 = NULL, *needle_u8 = NULL;
	unsigned char *offset_pointer;

	if (!php_mb_is_no_encoding_utf8(enc->no_encoding)) {
		unsigned int num_errors = 0;
		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
	} else {
		haystack_u8 = haystack;
		needle_u8 = needle;
	}

	unsigned char *haystack_start = (unsigned char*)ZSTR_VAL(haystack_u8);
	unsigned char *haystack_end = haystack_start + ZSTR_LEN(haystack_u8);

	offset_pointer = offset_to_pointer_utf8(haystack_start, haystack_end, offset);
	if (!offset_pointer) {
		result = MBFL_ERROR_OFFSET;
		goto out;
	}

	result = MBFL_ERROR_NOT_FOUND;
	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
		/* A needle longer than the haystack can never match */
		goto out;
	}

	const char *found_pos;
	if (!reverse) {
		found_pos = zend_memnstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)haystack_end);
	} else if (offset >= 0) {
		found_pos = zend_memnrstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)haystack_end);
	} else {
		/* The needle's length in codepoints has to be measured on the UTF-8 copy of the
		 * needle; counting UTF-8 continuation bytes in text which is still in some other
		 * encoding would give a meaningless number */
		size_t needle_len = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(needle_u8), (unsigned char*)ZSTR_VAL(needle_u8) + ZSTR_LEN(needle_u8));
		offset_pointer = offset_to_pointer_utf8(offset_pointer, haystack_end, needle_len);
		if (!offset_pointer) {
			/* Fewer than `needle_len` characters remain; search right up to the end */
			offset_pointer = haystack_end;
		}

		found_pos = zend_memnrstr(ZSTR_VAL(haystack_u8), ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)offset_pointer);
	}

	if (found_pos) {
		result = pointer_to_offset_utf8(haystack_start, (unsigned char*)found_pos);
	}

out:
	/* Release the UTF-8 copies, if any were made */
	if (haystack_u8 != haystack) {
		zend_string_free(haystack_u8);
	}
	if (needle_u8 != needle) {
		zend_string_free(needle_u8);
	}
	return result;
}
1948
handle_strpos_error(size_t error)1949 static void handle_strpos_error(size_t error) {
1950 switch (error) {
1951 case MBFL_ERROR_NOT_FOUND:
1952 break;
1953 case MBFL_ERROR_ENCODING:
1954 php_error_docref(NULL, E_WARNING, "Conversion error");
1955 break;
1956 case MBFL_ERROR_OFFSET:
1957 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1958 break;
1959 default:
1960 zend_value_error("mb_strpos(): Unknown error");
1961 break;
1962 }
1963 }
1964
PHP_FUNCTION(mb_strpos)1965 PHP_FUNCTION(mb_strpos)
1966 {
1967 zend_long offset = 0;
1968 zend_string *needle, *haystack;
1969 zend_string *enc_name = NULL;
1970
1971 ZEND_PARSE_PARAMETERS_START(2, 4)
1972 Z_PARAM_STR(haystack)
1973 Z_PARAM_STR(needle)
1974 Z_PARAM_OPTIONAL
1975 Z_PARAM_LONG(offset)
1976 Z_PARAM_STR_OR_NULL(enc_name)
1977 ZEND_PARSE_PARAMETERS_END();
1978
1979 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
1980 if (!enc) {
1981 RETURN_THROWS();
1982 }
1983
1984 size_t n = mb_find_strpos(haystack, needle, enc, offset, false);
1985 if (!mbfl_is_error(n)) {
1986 RETVAL_LONG(n);
1987 } else {
1988 handle_strpos_error(n);
1989 RETVAL_FALSE;
1990 }
1991 }
1992
1993 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)1994 PHP_FUNCTION(mb_strrpos)
1995 {
1996 zend_long offset = 0;
1997 zend_string *needle, *haystack;
1998 zend_string *enc_name = NULL;
1999
2000 ZEND_PARSE_PARAMETERS_START(2, 4)
2001 Z_PARAM_STR(haystack)
2002 Z_PARAM_STR(needle)
2003 Z_PARAM_OPTIONAL
2004 Z_PARAM_LONG(offset)
2005 Z_PARAM_STR_OR_NULL(enc_name)
2006 ZEND_PARSE_PARAMETERS_END();
2007
2008 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
2009 if (!enc) {
2010 RETURN_THROWS();
2011 }
2012
2013 size_t n = mb_find_strpos(haystack, needle, enc, offset, true);
2014 if (!mbfl_is_error(n)) {
2015 RETVAL_LONG(n);
2016 } else {
2017 handle_strpos_error(n);
2018 RETVAL_FALSE;
2019 }
2020 }
2021 /* }}} */
2022
2023 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)2024 PHP_FUNCTION(mb_stripos)
2025 {
2026 zend_long offset = 0;
2027 zend_string *haystack, *needle;
2028 zend_string *from_encoding = NULL;
2029
2030 ZEND_PARSE_PARAMETERS_START(2, 4)
2031 Z_PARAM_STR(haystack)
2032 Z_PARAM_STR(needle)
2033 Z_PARAM_OPTIONAL
2034 Z_PARAM_LONG(offset)
2035 Z_PARAM_STR_OR_NULL(from_encoding)
2036 ZEND_PARSE_PARAMETERS_END();
2037
2038 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2039 if (!enc) {
2040 RETURN_THROWS();
2041 }
2042
2043 size_t n = php_mb_stripos(false, haystack, needle, offset, enc);
2044
2045 if (!mbfl_is_error(n)) {
2046 RETVAL_LONG(n);
2047 } else {
2048 handle_strpos_error(n);
2049 RETVAL_FALSE;
2050 }
2051 }
2052 /* }}} */
2053
2054 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)2055 PHP_FUNCTION(mb_strripos)
2056 {
2057 zend_long offset = 0;
2058 zend_string *haystack, *needle;
2059 zend_string *from_encoding = NULL;
2060
2061 ZEND_PARSE_PARAMETERS_START(2, 4)
2062 Z_PARAM_STR(haystack)
2063 Z_PARAM_STR(needle)
2064 Z_PARAM_OPTIONAL
2065 Z_PARAM_LONG(offset)
2066 Z_PARAM_STR_OR_NULL(from_encoding)
2067 ZEND_PARSE_PARAMETERS_END();
2068
2069 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2070 if (!enc) {
2071 RETURN_THROWS();
2072 }
2073
2074 size_t n = php_mb_stripos(true, haystack, needle, offset, enc);
2075
2076 if (!mbfl_is_error(n)) {
2077 RETVAL_LONG(n);
2078 } else {
2079 handle_strpos_error(n);
2080 RETVAL_FALSE;
2081 }
2082 }
2083 /* }}} */
2084
mb_get_substr_slow(unsigned char * in,size_t in_len,size_t from,size_t len,const mbfl_encoding * enc)2085 static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t from, size_t len, const mbfl_encoding *enc)
2086 {
2087 uint32_t wchar_buf[128];
2088 unsigned int state = 0;
2089
2090 mb_convert_buf buf;
2091 mb_convert_buf_init(&buf, MIN(len, in_len - from), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2092
2093 while (in_len && len) {
2094 size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2095 ZEND_ASSERT(out_len <= 128);
2096
2097 if (from >= out_len) {
2098 from -= out_len;
2099 } else {
2100 size_t needed_codepoints = MIN(out_len - from, len);
2101 enc->from_wchar(wchar_buf + from, needed_codepoints, &buf, !in_len || out_len >= len);
2102 from = 0;
2103 len -= needed_codepoints;
2104 }
2105 }
2106
2107 return mb_convert_buf_result(&buf, enc);
2108 }
2109
/* Extract up to `len` codepoints from `input`, after skipping `from` codepoints
 * Fixed-width encodings are handled by plain byte arithmetic; everything else goes
 * through mb_get_substr_slow */
static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
		/* Other than MacJapanese, no supported text encoding decodes to
		 * more than one codepoint per byte
		 * So if the number of codepoints to skip >= number of input bytes,
		 * then definitely the output should be empty */
		return zend_empty_string;
	}

	/* Does each codepoint have a fixed byte width? */
	unsigned int flag = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (flag) {
		/* The value of the flag is 2 if each codepoint takes 2 bytes, or 4 if 4 bytes
		 * Convert the codepoint counts to byte counts, saturating rather than wrapping
		 * around: a huge `len` must still mean "everything up to the end", not zero */
		const size_t size_max = ~(size_t)0;
		from = (from > size_max / flag) ? size_max : from * flag;
		len = (len > size_max / flag) ? size_max : len * flag;

		if (from >= in_len) {
			return zend_empty_string;
		}
		in += from;
		in_len -= from;
		if (len > in_len) {
			len = in_len;
		}
		return zend_string_init_fast((const char*)in, len);
	}

	return mb_get_substr_slow(in, in_len, from, len, enc);
}
2142
2143 #define MB_STRSTR 1
2144 #define MB_STRRCHR 2
2145 #define MB_STRISTR 3
2146 #define MB_STRRICHR 4
2147
php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS,unsigned int variant)2148 static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
2149 {
2150 bool reverse_mode = false, part = false;
2151 size_t n;
2152 zend_string *haystack, *needle;
2153 zend_string *encoding_name = NULL;
2154
2155 ZEND_PARSE_PARAMETERS_START(2, 4)
2156 Z_PARAM_STR(haystack)
2157 Z_PARAM_STR(needle)
2158 Z_PARAM_OPTIONAL
2159 Z_PARAM_BOOL(part)
2160 Z_PARAM_STR_OR_NULL(encoding_name)
2161 ZEND_PARSE_PARAMETERS_END();
2162
2163 const mbfl_encoding *enc = php_mb_get_encoding(encoding_name, 4);
2164 if (!enc) {
2165 RETURN_THROWS();
2166 }
2167
2168 if (variant == MB_STRRCHR || variant == MB_STRRICHR) {
2169 reverse_mode = true;
2170 }
2171
2172 if (variant == MB_STRISTR || variant == MB_STRRICHR) {
2173 n = php_mb_stripos(reverse_mode, haystack, needle, 0, enc);
2174 } else {
2175 n = mb_find_strpos(haystack, needle, enc, 0, reverse_mode);
2176 }
2177
2178 if (!mbfl_is_error(n)) {
2179 if (part) {
2180 RETVAL_STR(mb_get_substr(haystack, 0, n, enc));
2181 } else {
2182 RETVAL_STR(mb_get_substr(haystack, n, MBFL_SUBSTR_UNTIL_END, enc));
2183 }
2184 } else {
2185 // FIXME use handle_strpos_error(n)
2186 RETVAL_FALSE;
2187 }
2188 }
2189
2190 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(mb_strstr)2191 PHP_FUNCTION(mb_strstr)
2192 {
2193 php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
2194 }
2195 /* }}} */
2196
2197 /* {{{ Finds the last occurrence of a character in a string within another */
PHP_FUNCTION(mb_strrchr)2198 PHP_FUNCTION(mb_strrchr)
2199 {
2200 php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
2201 }
2202 /* }}} */
2203
2204 /* {{{ Finds first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stristr)2205 PHP_FUNCTION(mb_stristr)
2206 {
2207 php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
2208 }
2209 /* }}} */
2210
2211 /* {{{ Finds the last occurrence of a character in a string within another, case insensitive */
PHP_FUNCTION(mb_strrichr)2212 PHP_FUNCTION(mb_strrichr)
2213 {
2214 php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
2215 }
2216 /* }}} */
2217
2218 #undef MB_STRSTR
2219 #undef MB_STRRCHR
2220 #undef MB_STRISTR
2221 #undef MB_STRRICHR
2222
PHP_FUNCTION(mb_substr_count)2223 PHP_FUNCTION(mb_substr_count)
2224 {
2225 zend_string *haystack, *needle, *enc_name = NULL, *haystack_u8 = NULL, *needle_u8 = NULL;
2226
2227 ZEND_PARSE_PARAMETERS_START(2, 3)
2228 Z_PARAM_STR(haystack)
2229 Z_PARAM_STR(needle)
2230 Z_PARAM_OPTIONAL
2231 Z_PARAM_STR_OR_NULL(enc_name)
2232 ZEND_PARSE_PARAMETERS_END();
2233
2234 if (ZSTR_LEN(needle) == 0) {
2235 zend_argument_value_error(2, "must not be empty");
2236 RETURN_THROWS();
2237 }
2238
2239 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
2240 if (!enc) {
2241 RETURN_THROWS();
2242 }
2243
2244 if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
2245 /* No need to do any conversion if haystack/needle are already known-valid UTF-8
2246 * (If they are not valid, then not passing them through conversion filters could affect output) */
2247 if (ZSTR_IS_VALID_UTF8(haystack)) {
2248 haystack_u8 = haystack;
2249 } else {
2250 unsigned int num_errors = 0;
2251 haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2252 if (!num_errors && !ZSTR_IS_INTERNED(haystack)) {
2253 GC_ADD_FLAGS(haystack, IS_STR_VALID_UTF8);
2254 }
2255 }
2256
2257 if (ZSTR_IS_VALID_UTF8(needle)) {
2258 needle_u8 = needle;
2259 } else {
2260 unsigned int num_errors = 0;
2261 needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2262 if (!num_errors && !ZSTR_IS_INTERNED(needle)) {
2263 GC_ADD_FLAGS(needle, IS_STR_VALID_UTF8);
2264 }
2265 }
2266 } else {
2267 unsigned int num_errors = 0;
2268 haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2269 needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2270 /* A string with >0 bytes may convert to 0 codepoints; for example, the contents
2271 * may be only escape sequences */
2272 if (ZSTR_LEN(needle_u8) == 0) {
2273 zend_string_free(haystack_u8);
2274 zend_string_free(needle_u8);
2275 zend_argument_value_error(2, "must not be empty");
2276 RETURN_THROWS();
2277 }
2278 }
2279
2280 size_t result = 0;
2281
2282 if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
2283 goto out;
2284 }
2285
2286 const char *p = ZSTR_VAL(haystack_u8), *e = p + ZSTR_LEN(haystack_u8);
2287 while (true) {
2288 p = zend_memnstr(p, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), e);
2289 if (!p) {
2290 break;
2291 }
2292 p += ZSTR_LEN(needle_u8);
2293 result++;
2294 }
2295
2296 out:
2297 if (haystack_u8 != haystack) {
2298 zend_string_free(haystack_u8);
2299 }
2300 if (needle_u8 != needle) {
2301 zend_string_free(needle_u8);
2302 }
2303
2304 RETVAL_LONG(result);
2305 }
2306
2307 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2308 PHP_FUNCTION(mb_substr)
2309 {
2310 zend_string *str, *encoding = NULL;
2311 zend_long from, len;
2312 size_t real_from, real_len;
2313 bool len_is_null = true;
2314
2315 ZEND_PARSE_PARAMETERS_START(2, 4)
2316 Z_PARAM_STR(str)
2317 Z_PARAM_LONG(from)
2318 Z_PARAM_OPTIONAL
2319 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2320 Z_PARAM_STR_OR_NULL(encoding)
2321 ZEND_PARSE_PARAMETERS_END();
2322
2323 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2324 if (!enc) {
2325 RETURN_THROWS();
2326 }
2327
2328 size_t mblen = 0;
2329 if (from < 0 || (!len_is_null && len < 0)) {
2330 mblen = mb_get_strlen(str, enc);
2331 }
2332
2333 /* if "from" position is negative, count start position from the end
2334 * of the string */
2335 if (from >= 0) {
2336 real_from = (size_t) from;
2337 } else if (-from < mblen) {
2338 real_from = mblen + from;
2339 } else {
2340 real_from = 0;
2341 }
2342
2343 /* if "length" position is negative, set it to the length
2344 * needed to stop that many chars from the end of the string */
2345 if (len_is_null) {
2346 real_len = MBFL_SUBSTR_UNTIL_END;
2347 } else if (len >= 0) {
2348 real_len = (size_t) len;
2349 } else if (real_from < mblen && -len < mblen - real_from) {
2350 real_len = (mblen - real_from) + len;
2351 } else {
2352 real_len = 0;
2353 }
2354
2355 RETVAL_STR(mb_get_substr(str, real_from, real_len, enc));
2356 }
2357 /* }}} */
2358
2359 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2360 PHP_FUNCTION(mb_strcut)
2361 {
2362 zend_string *encoding = NULL;
2363 char *string_val;
2364 zend_long from, len;
2365 bool len_is_null = true;
2366 mbfl_string string, result, *ret;
2367
2368 ZEND_PARSE_PARAMETERS_START(2, 4)
2369 Z_PARAM_STRING(string_val, string.len)
2370 Z_PARAM_LONG(from)
2371 Z_PARAM_OPTIONAL
2372 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2373 Z_PARAM_STR_OR_NULL(encoding)
2374 ZEND_PARSE_PARAMETERS_END();
2375
2376 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2377 if (!enc) {
2378 RETURN_THROWS();
2379 }
2380
2381 string.val = (unsigned char*)string_val;
2382 string.encoding = enc;
2383
2384 if (len_is_null) {
2385 len = string.len;
2386 }
2387
2388 /* if "from" position is negative, count start position from the end
2389 * of the string */
2390 if (from < 0) {
2391 from = string.len + from;
2392 if (from < 0) {
2393 from = 0;
2394 }
2395 }
2396
2397 /* if "length" position is negative, set it to the length
2398 * needed to stop that many chars from the end of the string */
2399 if (len < 0) {
2400 len = (string.len - from) + len;
2401 if (len < 0) {
2402 len = 0;
2403 }
2404 }
2405
2406 if (from > string.len || len == 0) {
2407 RETURN_EMPTY_STRING();
2408 }
2409
2410 if (enc->cut) {
2411 RETURN_STR(enc->cut(string.val, from, len, string.val + string.len));
2412 }
2413
2414 unsigned int char_len = string.encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
2415 if (char_len) {
2416 /* Round `from` down to a multiple of `char_len`; works because `char_len` is a power of 2 */
2417 from &= -char_len;
2418 if (len > string.len - from) {
2419 len = string.len - from;
2420 }
2421 RETURN_STR(zend_string_init_fast((const char*)(string.val + from), len & -char_len));
2422 }
2423
2424 if (enc->mblen_table) {
2425 const unsigned char *mbtab = enc->mblen_table;
2426 const unsigned char *p, *q, *end;
2427 int m = 0;
2428 /* Search for start position */
2429 for (p = (const unsigned char*)string.val, q = p + from; p < q; p += (m = mbtab[*p]));
2430 if (p > q) {
2431 p -= m;
2432 }
2433 const unsigned char *start = p;
2434 /* Search for end position */
2435 if (len >= string.len - (start - (const unsigned char*)string.val)) {
2436 end = (const unsigned char*)(string.val + string.len);
2437 } else {
2438 for (q = p + len; p < q; p += (m = mbtab[*p]));
2439 if (p > q) {
2440 p -= m;
2441 }
2442 end = p;
2443 }
2444 RETURN_STR(zend_string_init_fast((const char*)start, end - start));
2445 }
2446
2447 ret = mbfl_strcut(&string, &result, from, len);
2448 ZEND_ASSERT(ret != NULL);
2449 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2450 efree(ret->val);
2451 }
2452 /* }}} */
2453
2454 /* Some East Asian characters, when printed at a terminal (or the like), require double
2455 * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2456 static size_t character_width(uint32_t c)
2457 {
2458 if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2459 return 1;
2460 }
2461
2462 /* Do a binary search to see if we fall in any of the fullwidth ranges */
2463 unsigned int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2464 while (lo < hi) {
2465 unsigned int probe = (lo + hi) / 2;
2466 if (c < mbfl_eaw_table[probe].begin) {
2467 hi = probe;
2468 } else if (c > mbfl_eaw_table[probe].end) {
2469 lo = probe + 1;
2470 } else {
2471 return 2;
2472 }
2473 }
2474
2475 return 1;
2476 }
2477
/* Total display width of `string`: the sum of character_width() over all the codepoints
 * which it decodes to under encoding `enc` */
static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
{
	uint32_t wchar_buf[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total = 0;

	while (bytes_left) {
		size_t decoded = enc->to_wchar(&cursor, &bytes_left, wchar_buf, 128, &state);
		ZEND_ASSERT(decoded <= 128);

		/* NOTE: a 'bad input' marker is counted as 1 unit of width
		 * If text conversion is performed with an ordinary ASCII character as the
		 * 'replacement character', this gives the correct display width */
		for (size_t i = 0; i < decoded; i++) {
			total += character_width(wchar_buf[i]);
		}
	}

	return total;
}
2500
2501 /* Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2502 PHP_FUNCTION(mb_strwidth)
2503 {
2504 zend_string *string, *enc_name = NULL;
2505
2506 ZEND_PARSE_PARAMETERS_START(1, 2)
2507 Z_PARAM_STR(string)
2508 Z_PARAM_OPTIONAL
2509 Z_PARAM_STR_OR_NULL(enc_name)
2510 ZEND_PARSE_PARAMETERS_END();
2511
2512 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2513 if (!enc) {
2514 RETURN_THROWS();
2515 }
2516
2517 RETVAL_LONG(mb_get_strwidth(string, enc));
2518 }
2519
/* Implementation of mb_strimwidth: starting `from` codepoints into `input`, take as much
 * text as fits into `width` display columns. If something had to be cut off, `marker`
 * is appended, and its width is counted against `width` as well
 *
 * Phase 1 decodes the input only to find out whether it fits. If it does, the result
 * is the input itself (or a substring of it). If not, phase 2 converts the part which
 * fits into a new buffer and appends the marker */
static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, size_t from, size_t width)
{
	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);
	unsigned int state = 0;
	size_t width_left = width;
	size_t skip = from;
	size_t out_len = 0;
	bool first_chunk = true, saw_bad_input = false;
	mb_convert_buf buf;

	/* Phase 1: does the text after the first `from` codepoints fit into `width`? */
	while (in_len) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

		if (out_len <= skip) {
			/* All of this buffer-load lies before the starting position */
			skip -= out_len;
			first_chunk = false;
			continue;
		}

		for (size_t i = skip; i < out_len; i++) {
			uint32_t w = wchar_buf[i];
			size_t w_width = character_width(w);

			if (w == MBFL_BAD_INPUT) {
				saw_bad_input = true;
			}

			if (width_left >= w_width) {
				width_left -= w_width;
				continue;
			}

			/* This character does not fit any more, so the string has to be cut */
			size_t marker_width = mb_get_strwidth(marker, enc);

			/* The trim marker alone uses up the whole requested width (or more) */
			if (width <= marker_width) {
				return zend_string_copy(marker);
			}

			/* From here on, 'width' is the amount we want to take from 'input';
			 * the rest is set aside for the trim marker */
			width -= marker_width;
			mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

			if (first_chunk) {
				/* The wchars sitting in the buffer right now are the start of the
				 * input, so there is no need to decode them a second time */
				goto dont_restart_conversion;
			}
			goto restart_conversion;
		}

		skip = 0;
		first_chunk = false;
	}

	/* Everything fits, so no trim marker is needed
	 * But if the string contains erroneous byte sequences, those should still be
	 * converted to error markers */
	if (saw_bad_input) {
		/* `mb_get_substr` will not do here: it picks the substring out in the fastest
		 * way possible, which may not involve converting erroneous byte sequences */
		return mb_get_substr_slow((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), from, MBFL_SUBSTR_UNTIL_END, enc);
	}
	if (from == 0) {
		/* Only increments the refcount of the string; nothing is really copied */
		return zend_string_copy(input);
	}
	return mb_get_substr(input, from, MBFL_SUBSTR_UNTIL_END, enc);

	/* Phase 2: the input is too wide. Build a new string out of the part which fits,
	 * with the trim marker concatenated onto it */
restart_conversion:
	in = (unsigned char*)ZSTR_VAL(input);
	in_len = ZSTR_LEN(input);
	state = 0;

	while (true) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

dont_restart_conversion:
		if (out_len <= from) {
			/* Still entirely before the starting position */
			from -= out_len;
			continue;
		}

		for (size_t i = from; i < out_len; i++) {
			size_t w_width = character_width(wchar_buf[i]);
			if (width < w_width) {
				/* wchar_buf[i] is the first character which does not fit; write out
				 * the ones before it as the final batch */
				enc->from_wchar(wchar_buf + from, i - from, &buf, true);
				goto append_trim_marker;
			}
			width -= w_width;
		}

		/* The whole buffer-load fits; write it out and decode some more */
		ZEND_ASSERT(in_len > 0);
		enc->from_wchar(wchar_buf + from, out_len - from, &buf, false);
		from = 0;
	}

append_trim_marker:
	if (ZSTR_LEN(marker) > 0) {
		MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
		buf.out = zend_mempcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
	}

	/* Even if `enc` is UTF-8, the output must not be marked as valid UTF-8, because
	 * nothing guarantees that the trim marker string is valid UTF-8 */
	return mb_convert_buf_result_raw(&buf);
}
2630
2631 /* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
PHP_FUNCTION(mb_strimwidth)2632 PHP_FUNCTION(mb_strimwidth)
2633 {
2634 zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2635 zend_long from, width;
2636
2637 ZEND_PARSE_PARAMETERS_START(3, 5)
2638 Z_PARAM_STR(str)
2639 Z_PARAM_LONG(from)
2640 Z_PARAM_LONG(width)
2641 Z_PARAM_OPTIONAL
2642 Z_PARAM_STR(trimmarker)
2643 Z_PARAM_STR_OR_NULL(encoding)
2644 ZEND_PARSE_PARAMETERS_END();
2645
2646 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2647 if (!enc) {
2648 RETURN_THROWS();
2649 }
2650
2651 if (from != 0) {
2652 size_t str_len = mb_get_strlen(str, enc);
2653 if (from < 0) {
2654 from += str_len;
2655 }
2656 if (from < 0 || from > str_len) {
2657 zend_argument_value_error(2, "is out of range");
2658 RETURN_THROWS();
2659 }
2660 }
2661
2662 if (width < 0) {
2663 php_error_docref(NULL, E_DEPRECATED,
2664 "passing a negative integer to argument #3 ($width) is deprecated");
2665 width += mb_get_strwidth(str, enc);
2666
2667 if (from > 0) {
2668 zend_string *trimmed = mb_get_substr(str, 0, from, enc);
2669 width -= mb_get_strwidth(trimmed, enc);
2670 zend_string_free(trimmed);
2671 }
2672
2673 if (width < 0) {
2674 zend_argument_value_error(3, "is out of range");
2675 RETURN_THROWS();
2676 }
2677 }
2678
2679 RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2680 }
2681
2682
2683 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2684 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2685 {
2686 return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2687 || (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2688 || (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2689 || (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2690 }
2691
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2692 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2693 {
2694 unsigned int num_errors = 0;
2695 zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2696 MBSTRG(illegalchars) += num_errors;
2697 return result;
2698 }
2699
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2700 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2701 {
2702 const mbfl_encoding *from_encoding;
2703
2704 /* pre-conversion encoding */
2705 ZEND_ASSERT(num_from_encodings >= 1);
2706 if (num_from_encodings == 1) {
2707 from_encoding = *from_encodings;
2708 } else {
2709 /* auto detect */
2710 from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true);
2711 if (!from_encoding) {
2712 php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2713 return NULL;
2714 }
2715 }
2716
2717 return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2718 }
2719
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2720 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2721 {
2722 HashTable *output, *chash;
2723 zend_long idx;
2724 zend_string *key;
2725 zval *entry, entry_tmp;
2726
2727 if (!input) {
2728 return NULL;
2729 }
2730
2731 if (GC_IS_RECURSIVE(input)) {
2732 GC_UNPROTECT_RECURSION(input);
2733 php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2734 return NULL;
2735 }
2736 GC_TRY_PROTECT_RECURSION(input);
2737 output = zend_new_array(zend_hash_num_elements(input));
2738 ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2739 /* convert key */
2740 if (key) {
2741 zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2742 if (!converted_key) {
2743 continue;
2744 }
2745 key = converted_key;
2746 }
2747 /* convert value */
2748 ZEND_ASSERT(entry);
2749 try_again:
2750 switch(Z_TYPE_P(entry)) {
2751 case IS_STRING: {
2752 zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2753 if (!converted_key) {
2754 if (key) {
2755 zend_string_release(key);
2756 }
2757 continue;
2758 }
2759 ZVAL_STR(&entry_tmp, converted_key);
2760 break;
2761 }
2762 case IS_NULL:
2763 case IS_TRUE:
2764 case IS_FALSE:
2765 case IS_LONG:
2766 case IS_DOUBLE:
2767 ZVAL_COPY(&entry_tmp, entry);
2768 break;
2769 case IS_ARRAY:
2770 chash = php_mb_convert_encoding_recursive(
2771 Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2772 if (chash) {
2773 ZVAL_ARR(&entry_tmp, chash);
2774 } else {
2775 ZVAL_EMPTY_ARRAY(&entry_tmp);
2776 }
2777 break;
2778 case IS_REFERENCE:
2779 entry = Z_REFVAL_P(entry);
2780 goto try_again;
2781 case IS_OBJECT:
2782 default:
2783 if (key) {
2784 zend_string_release(key);
2785 }
2786 php_error_docref(NULL, E_WARNING, "Object is not supported");
2787 continue;
2788 }
2789 if (key) {
2790 zend_hash_add(output, key, &entry_tmp);
2791 zend_string_release(key);
2792 } else {
2793 zend_hash_index_add(output, idx, &entry_tmp);
2794 }
2795 } ZEND_HASH_FOREACH_END();
2796 GC_TRY_UNPROTECT_RECURSION(input);
2797
2798 return output;
2799 }
2800 /* }}} */
2801
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2802 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2803 {
2804 /* mbstring supports some 'text encodings' which aren't really text encodings
2805 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2806 * These should never be returned by `mb_detect_encoding`. */
2807 unsigned int shift = 0;
2808 for (unsigned int i = 0; i < *size; i++) {
2809 const mbfl_encoding *encoding = elist[i];
2810 if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2811 shift++; /* Remove this encoding from the list */
2812 } else if (shift) {
2813 elist[i - shift] = encoding;
2814 }
2815 }
2816 *size -= shift;
2817 }
2818
2819 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2820 PHP_FUNCTION(mb_convert_encoding)
2821 {
2822 zend_string *to_encoding_name;
2823 zend_string *input_str, *from_encodings_str = NULL;
2824 HashTable *input_ht, *from_encodings_ht = NULL;
2825 const mbfl_encoding **from_encodings;
2826 size_t num_from_encodings;
2827 bool free_from_encodings = false;
2828
2829 ZEND_PARSE_PARAMETERS_START(2, 3)
2830 Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2831 Z_PARAM_STR(to_encoding_name)
2832 Z_PARAM_OPTIONAL
2833 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2834 ZEND_PARSE_PARAMETERS_END();
2835
2836 const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2837 if (!to_encoding) {
2838 RETURN_THROWS();
2839 }
2840
2841 if (from_encodings_ht) {
2842 if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2843 RETURN_THROWS();
2844 }
2845 free_from_encodings = true;
2846 } else if (from_encodings_str) {
2847 if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2848 &from_encodings, &num_from_encodings,
2849 /* persistent */ 0, /* arg_num */ 3) == FAILURE) {
2850 RETURN_THROWS();
2851 }
2852 free_from_encodings = true;
2853 } else {
2854 from_encodings = &MBSTRG(current_internal_encoding);
2855 num_from_encodings = 1;
2856 }
2857
2858 if (num_from_encodings > 1) {
2859 remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2860 }
2861
2862 if (!num_from_encodings) {
2863 efree(ZEND_VOIDP(from_encodings));
2864 zend_argument_value_error(3, "must specify at least one encoding");
2865 RETURN_THROWS();
2866 }
2867
2868 if (input_str) {
2869 zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2870 if (ret != NULL) {
2871 RETVAL_STR(ret);
2872 } else {
2873 RETVAL_FALSE;
2874 }
2875 } else {
2876 HashTable *tmp;
2877 tmp = php_mb_convert_encoding_recursive(
2878 input_ht, to_encoding, from_encodings, num_from_encodings);
2879 RETVAL_ARR(tmp);
2880 }
2881
2882 if (free_from_encodings) {
2883 efree(ZEND_VOIDP(from_encodings));
2884 }
2885 }
2886 /* }}} */
2887
mbstring_convert_case(php_case_mode case_mode,const char * str,size_t str_len,const mbfl_encoding * enc)2888 static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
2889 {
2890 return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2891 }
2892
PHP_FUNCTION(mb_convert_case)2893 PHP_FUNCTION(mb_convert_case)
2894 {
2895 zend_string *str, *from_encoding = NULL;
2896 zend_long case_mode = 0;
2897
2898 ZEND_PARSE_PARAMETERS_START(2, 3)
2899 Z_PARAM_STR(str)
2900 Z_PARAM_LONG(case_mode)
2901 Z_PARAM_OPTIONAL
2902 Z_PARAM_STR_OR_NULL(from_encoding)
2903 ZEND_PARSE_PARAMETERS_END();
2904
2905 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2906 if (!enc) {
2907 RETURN_THROWS();
2908 }
2909
2910 if (case_mode < 0 || case_mode >= PHP_UNICODE_CASE_MODE_MAX) {
2911 zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2912 RETURN_THROWS();
2913 }
2914
2915 RETURN_STR(mbstring_convert_case(case_mode, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2916 }
2917
PHP_FUNCTION(mb_strtoupper)2918 PHP_FUNCTION(mb_strtoupper)
2919 {
2920 zend_string *str, *from_encoding = NULL;
2921
2922 ZEND_PARSE_PARAMETERS_START(1, 2)
2923 Z_PARAM_STR(str)
2924 Z_PARAM_OPTIONAL
2925 Z_PARAM_STR_OR_NULL(from_encoding)
2926 ZEND_PARSE_PARAMETERS_END();
2927
2928 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2929 if (!enc) {
2930 RETURN_THROWS();
2931 }
2932
2933 RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_UPPER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2934 }
2935
PHP_FUNCTION(mb_strtolower)2936 PHP_FUNCTION(mb_strtolower)
2937 {
2938 zend_string *str, *from_encoding = NULL;
2939
2940 ZEND_PARSE_PARAMETERS_START(1, 2)
2941 Z_PARAM_STR(str)
2942 Z_PARAM_OPTIONAL
2943 Z_PARAM_STR_OR_NULL(from_encoding)
2944 ZEND_PARSE_PARAMETERS_END();
2945
2946 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2947 if (!enc) {
2948 RETURN_THROWS();
2949 }
2950
2951 RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2952 }
2953
/* Which end(s) of a string to trim. The values are bit flags: the trimming
 * code tests them with `mode & MB_LTRIM` and `mode & MB_RTRIM`. */
typedef enum {
	MB_LTRIM     = 1 << 0,
	MB_RTRIM     = 1 << 1,
	MB_BOTH_TRIM = MB_LTRIM | MB_RTRIM
} mb_trim_mode;
2959
/* Is codepoint `w` one of the characters to trim?
 * If `ht` is non-NULL, membership is decided by whether `w` exists as an index
 * in `ht`; otherwise the `default_chars_length` entries of `default_chars` are
 * searched linearly. */
static bool is_trim_wchar(uint32_t w, const HashTable *ht, const uint32_t *default_chars, size_t default_chars_length)
{
	if (ht != NULL) {
		return zend_hash_index_exists(ht, w);
	}

	size_t remaining = default_chars_length;
	while (remaining--) {
		if (default_chars[remaining] == w) {
			return true;
		}
	}
	return false;
}
2973
/* Decode `str` (in encoding `enc`) and strip leading and/or trailing codepoints
 * which is_trim_wchar() accepts, according to `mode`.
 *
 * `what_ht` / `default_chars` / `default_chars_length` describe the set of
 * codepoints to trim and are passed straight through to is_trim_wchar().
 *
 * Returns a new reference to `str` itself when nothing needs trimming,
 * otherwise the result of mb_get_substr() for the retained range. */
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, const uint32_t *default_chars, size_t default_chars_length, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
	size_t in_len = ZSTR_LEN(str);
	uint32_t wchar_buf[128];
	unsigned int state = 0;
	size_t left = 0;      /* Codepoints to drop from the start */
	size_t right = 0;     /* Length of the current run of trailing trim codepoints */
	size_t total_len = 0; /* Total number of decoded codepoints */

	while (in_len) {
		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);
		total_len += out_len;

		for (size_t i = 0; i < out_len; i++) {
			uint32_t w = wchar_buf[i];
			if (is_trim_wchar(w, what_ht, default_chars, default_chars_length)) {
				if (mode & MB_LTRIM) {
					left += 1;
				}
				if (mode & MB_RTRIM) {
					right += 1;
				}
			} else {
				/* First non-trim codepoint ends the leading run for good... */
				mode &= ~MB_LTRIM;
				/* ...and any trailing run seen so far was not at the end */
				if (mode & MB_RTRIM) {
					right = 0;
				}
			}
		}
	}

	if (left == 0 && right == 0) {
		return zend_string_copy(str);
	}

	/* If every codepoint is a trim character and both ends are being trimmed,
	 * the same codepoints are counted in both `left` and `right`, so their sum
	 * exceeds `total_len`. Clamp to zero instead of letting the unsigned
	 * subtraction wrap around to a huge length. */
	size_t trimmed = left + right;
	size_t keep = (trimmed < total_len) ? total_len - trimmed : 0;
	return mb_get_substr(str, left, keep, enc);
}
3013
/* Trim `str` using the built-in default set of codepoints listed below.
 * A temporary hash table keyed by codepoint is built for the lookup and
 * destroyed again before returning. */
static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
{
	const uint32_t default_set[] = {
		0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
		0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
		0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
		0x85, 0x180E
	};
	size_t default_set_len = sizeof(default_set) / sizeof(uint32_t);

	HashTable lookup;
	zval marker;
	ZVAL_TRUE(&marker);

	zend_hash_init(&lookup, default_set_len, NULL, NULL, false);

	size_t i = 0;
	while (i < default_set_len) {
		zend_hash_index_add_new(&lookup, default_set[i], &marker);
		i++;
	}

	zend_string *trimmed = trim_each_wchar(str, &lookup, NULL, 0, mode, enc);
	zend_hash_destroy(&lookup);

	return trimmed;
}
3038
/* Trim `str` using the codepoints contained in `what` (decoded with `enc`).
 *
 * If all of `what` decodes to at most 4 codepoints in a single pass, they are
 * handed to trim_each_wchar() as a plain array to be searched linearly.
 * Otherwise a temporary hash table keyed by codepoint is built.
 * If `what` is empty, a new reference to `str` is returned unchanged. */
static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *what_in = (unsigned char*)ZSTR_VAL(what);
	size_t what_len = ZSTR_LEN(what);
	uint32_t what_wchar_buf[128];
	unsigned int state = 0;
	HashTable what_ht;
	zval val;
	bool hash_initialized = false;

	while (what_len) {
		size_t what_out_len = enc->to_wchar(&what_in, &what_len, what_wchar_buf, 128, &state);
		ZEND_ASSERT(what_out_len <= 128);

		if (!hash_initialized) {
			/* Only take the linear-scan shortcut when *all* of `what` has been
			 * decoded; if input remains, later codepoints must not be dropped */
			if (what_out_len <= 4 && what_len == 0) {
				return trim_each_wchar(str, NULL, what_wchar_buf, what_out_len, mode, enc);
			}
			hash_initialized = true;
			ZVAL_TRUE(&val);
			/* Size hint: the number of codepoints in hand (at most 128). The
			 * count of remaining input bytes is not a useful hint: it is zero
			 * for short lists and unbounded for very long ones */
			zend_hash_init(&what_ht, what_out_len, NULL, NULL, false);
		}

		/* `what` may repeat a codepoint, so plain add (not add_new) is used */
		for (size_t i = 0; i < what_out_len; i++) {
			zend_hash_index_add(&what_ht, what_wchar_buf[i], &val);
		}
	}

	if (UNEXPECTED(!hash_initialized)) {
		/* This is only possible if what is empty */
		return zend_string_copy(str);
	}

	zend_string *retval = trim_each_wchar(str, &what_ht, NULL, 0, mode, enc);
	zend_hash_destroy(&what_ht);

	return retval;
}
3078
/* Shared implementation of mb_trim(), mb_ltrim() and mb_rtrim(); `mode`
 * selects which end(s) are trimmed. Userland arguments: the string, an
 * optional string of characters to trim, and an optional encoding name. */
static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
{
	zend_string *str;
	zend_string *what = NULL;
	zend_string *encoding = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 3)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR(what)
		Z_PARAM_STR_OR_NULL(encoding)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	/* Without an explicit character list, fall back to the default set */
	zend_string *trimmed = (what != NULL)
		? mb_trim_what_chars(str, what, mode, enc)
		: mb_trim_default_chars(str, mode, enc);
	RETURN_STR(trimmed);
}
3103
PHP_FUNCTION(mb_trim)3104 PHP_FUNCTION(mb_trim)
3105 {
3106 php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_BOTH_TRIM);
3107 }
3108
PHP_FUNCTION(mb_ltrim)3109 PHP_FUNCTION(mb_ltrim)
3110 {
3111 php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_LTRIM);
3112 }
3113
PHP_FUNCTION(mb_rtrim)3114 PHP_FUNCTION(mb_rtrim)
3115 {
3116 php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
3117 }
3118
duplicate_elist(const mbfl_encoding ** elist,size_t size)3119 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
3120 {
3121 const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
3122 memcpy(ZEND_VOIDP(new_elist), elist, size * sizeof(mbfl_encoding*));
3123 return new_elist;
3124 }
3125
estimate_demerits(uint32_t w)3126 static unsigned int estimate_demerits(uint32_t w)
3127 {
3128 /* Receive wchars decoded from input string using candidate encoding.
3129 * Give the candidate many 'demerits' for each 'rare' codepoint found,
3130 * a smaller number for each ASCII punctuation character, and 1 for
3131 * all other codepoints.
3132 *
3133 * The 'common' codepoints should cover the vast majority of
3134 * codepoints we are likely to see in practice, while only covering
3135 * a small minority of the entire Unicode encoding space. Why?
3136 * Well, if the test string happens to be valid in an incorrect
3137 * candidate encoding, the bogus codepoints which it decodes to will
3138 * be more or less random. By treating the majority of codepoints as
3139 * 'rare', we ensure that in almost all such cases, the bogus
3140 * codepoints will include plenty of 'rares', thus giving the
3141 * incorrect candidate encoding lots of demerits. See
3142 * common_codepoints.txt for the actual list used.
3143 *
3144 * So, why give extra demerits for ASCII punctuation characters? It's
3145 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
3146 * which deliberately only use bytes in the ASCII range. When
3147 * misinterpreted as ASCII/UTF-8, strings in these encodings will
3148 * have an unusually high number of ASCII punctuation characters.
3149 * So giving extra demerits for such characters will improve
3150 * detection accuracy for UTF-7 and similar encodings.
3151 *
3152 * Finally, why 1 demerit for all other characters? That penalizes
3153 * long strings, meaning we will tend to choose a candidate encoding
3154 * in which the test string decodes to a smaller number of
3155 * codepoints. That prevents single-byte encodings in which almost
3156 * every possible input byte decodes to a 'common' codepoint from
3157 * being favored too much. */
3158 if (w > 0xFFFF) {
3159 return 40;
3160 } else if (w >= 0x21 && w <= 0x2F) {
3161 return 6;
3162 } else if ((rare_codepoint_bitvec[w >> 5] >> (w & 0x1F)) & 1) {
3163 return 30;
3164 } else {
3165 return 1;
3166 }
3167 return 0;
3168 }
3169
3170 struct candidate {
3171 const mbfl_encoding *enc;
3172 const unsigned char *in;
3173 size_t in_len;
3174 uint64_t demerits; /* Wide bit size to prevent overflow */
3175 unsigned int state;
3176 float multiplier;
3177 };
3178
init_candidate_array(struct candidate * array,size_t length,const mbfl_encoding ** encodings,const unsigned char ** in,size_t * in_len,size_t n,bool strict,bool order_significant)3179 static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant)
3180 {
3181 size_t j = 0;
3182
3183 for (size_t i = 0; i < length; i++) {
3184 const mbfl_encoding *enc = encodings[i];
3185
3186 array[j].enc = enc;
3187 array[j].state = 0;
3188 array[j].demerits = 0;
3189
3190 /* If any candidate encodings have specialized validation functions, use them
3191 * to eliminate as many candidates as possible */
3192 if (enc->check != NULL) {
3193 for (size_t k = 0; k < n; k++) {
3194 if (!enc->check((unsigned char*)in[k], in_len[k])) {
3195 if (strict) {
3196 goto skip_to_next;
3197 } else {
3198 array[j].demerits += 500;
3199 }
3200 }
3201 }
3202 }
3203
3204 /* This multiplier can optionally be used to make candidate encodings listed
3205 * first more likely to be chosen. It is a weight factor which multiplies
3206 * the number of demerits counted for each candidate. */
3207 array[j].multiplier = order_significant ? 1.0 + ((0.3 * i) / length) : 1.0;
3208 j++;
3209 skip_to_next: ;
3210 }
3211
3212 return j;
3213 }
3214
start_string(struct candidate * array,size_t length,const unsigned char * in,size_t in_len)3215 static void start_string(struct candidate *array, size_t length, const unsigned char *in, size_t in_len)
3216 {
3217 for (size_t i = 0; i < length; i++) {
3218 const mbfl_encoding *enc = array[i].enc;
3219
3220 array[i].in = in;
3221 array[i].in_len = in_len;
3222
3223 /* Skip byte order mark for UTF-8, UTF-16BE, or UTF-16LE */
3224 if (enc == &mbfl_encoding_utf8) {
3225 if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
3226 array[i].in_len -= 3;
3227 array[i].in += 3;
3228 }
3229 } else if (enc == &mbfl_encoding_utf16be) {
3230 if (in_len >= 2 && in[0] == 0xFE && in[1] == 0xFF) {
3231 array[i].in_len -= 2;
3232 array[i].in += 2;
3233 }
3234 } else if (enc == &mbfl_encoding_utf16le) {
3235 if (in_len >= 2 && in[0] == 0xFF && in[1] == 0xFE) {
3236 array[i].in_len -= 2;
3237 array[i].in += 2;
3238 }
3239 }
3240 }
3241 }
3242
count_demerits(struct candidate * array,size_t length,bool strict)3243 static size_t count_demerits(struct candidate *array, size_t length, bool strict)
3244 {
3245 uint32_t wchar_buf[128];
3246 unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */
3247
3248 for (size_t i = 0; i < length; i++) {
3249 if (array[i].in_len == 0) {
3250 finished++;
3251 }
3252 }
3253
3254 while ((strict || length > 1) && finished < length) {
3255 /* Iterate in reverse order to avoid moving candidates that can be eliminated. */
3256 for (size_t i = length - 1; i != (size_t)-1; i--) {
3257 /* Do we still have more input to process for this candidate encoding? */
3258 if (array[i].in_len) {
3259 const mbfl_encoding *enc = array[i].enc;
3260 size_t out_len = enc->to_wchar((unsigned char**)&array[i].in, &array[i].in_len, wchar_buf, 128, &array[i].state);
3261 ZEND_ASSERT(out_len <= 128);
3262 /* Check this batch of decoded codepoints; are there any error markers?
3263 * Also sum up the number of demerits */
3264 while (out_len) {
3265 uint32_t w = wchar_buf[--out_len];
3266 if (w == MBFL_BAD_INPUT) {
3267 if (strict) {
3268 /* This candidate encoding is not valid, eliminate it from consideration */
3269 length--;
3270 if (i < length) {
3271 /* The eliminated candidate was the last valid one in the list */
3272 memmove(&array[i], &array[i+1], (length - i) * sizeof(struct candidate));
3273 }
3274 goto try_next_encoding;
3275 } else {
3276 array[i].demerits += 1000;
3277 }
3278 } else {
3279 array[i].demerits += estimate_demerits(w);
3280 }
3281 }
3282 if (array[i].in_len == 0) {
3283 finished++;
3284 }
3285 }
3286 try_next_encoding:;
3287 }
3288 }
3289
3290 for (size_t i = 0; i < length; i++) {
3291 array[i].demerits *= array[i].multiplier;
3292 }
3293
3294 return length;
3295 }
3296
mb_guess_encoding_for_strings(const unsigned char ** strings,size_t * str_lengths,size_t n,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3297 MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3298 {
3299 if (elist_size == 0) {
3300 return NULL;
3301 }
3302 if (elist_size == 1) {
3303 if (strict) {
3304 while (n--) {
3305 if (!php_mb_check_encoding((const char*)strings[n], str_lengths[n], *elist)) {
3306 return NULL;
3307 }
3308 }
3309 }
3310 return *elist;
3311 }
3312 if (n == 1 && *str_lengths == 0) {
3313 return *elist;
3314 }
3315
3316 /* Allocate on stack; when we return, this array is automatically freed */
3317 struct candidate *array = alloca(elist_size * sizeof(struct candidate));
3318 elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant);
3319
3320 while (n--) {
3321 start_string(array, elist_size, strings[n], str_lengths[n]);
3322 elist_size = count_demerits(array, elist_size, strict);
3323 if (elist_size == 0) {
3324 /* All candidates were eliminated */
3325 return NULL;
3326 }
3327 }
3328
3329 /* See which remaining candidate encoding has the least demerits */
3330 unsigned int best = 0;
3331 for (unsigned int i = 1; i < elist_size; i++) {
3332 if (array[i].demerits < array[best].demerits) {
3333 best = i;
3334 }
3335 }
3336 return array[best].enc;
3337 }
3338
3339 /* When doing 'strict' detection, any string which is invalid in the candidate encoding
3340 * is rejected. With non-strict detection, we just continue, but apply demerits for
3341 * each invalid byte sequence */
mb_guess_encoding(unsigned char * in,size_t in_len,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3342 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3343 {
3344 return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant);
3345 }
3346
3347 /* {{{ Encodings of the given string is returned (as a string) */
PHP_FUNCTION(mb_detect_encoding)3348 PHP_FUNCTION(mb_detect_encoding)
3349 {
3350 zend_string *str, *encoding_str = NULL;
3351 HashTable *encoding_ht = NULL;
3352 bool strict = false;
3353 const mbfl_encoding *ret, **elist;
3354 size_t size;
3355
3356 ZEND_PARSE_PARAMETERS_START(1, 3)
3357 Z_PARAM_STR(str)
3358 Z_PARAM_OPTIONAL
3359 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
3360 Z_PARAM_BOOL(strict)
3361 ZEND_PARSE_PARAMETERS_END();
3362
3363 /* Should we pay attention to the order of the provided candidate encodings and prefer
3364 * the earlier ones (if more than one candidate encoding matches)?
3365 * If the entire list of supported encodings returned by `mb_list_encodings` is passed
3366 * in, then don't treat the order as significant */
3367 bool order_significant = true;
3368
3369 /* make encoding list */
3370 if (encoding_ht) {
3371 if (encoding_ht == MBSTRG(all_encodings_list)) {
3372 order_significant = false;
3373 }
3374 if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
3375 RETURN_THROWS();
3376 }
3377 } else if (encoding_str) {
3378 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2)) {
3379 RETURN_THROWS();
3380 }
3381 } else {
3382 elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
3383 size = MBSTRG(current_detect_order_list_size);
3384 }
3385
3386 if (size == 0) {
3387 efree(ZEND_VOIDP(elist));
3388 zend_argument_value_error(2, "must specify at least one encoding");
3389 RETURN_THROWS();
3390 }
3391
3392 remove_non_encodings_from_elist(elist, &size);
3393 if (size == 0) {
3394 efree(ZEND_VOIDP(elist));
3395 RETURN_FALSE;
3396 }
3397
3398 if (ZEND_NUM_ARGS() < 3) {
3399 strict = MBSTRG(strict_detection);
3400 }
3401
3402 if (size == 1 && *elist == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
3403 ret = &mbfl_encoding_utf8;
3404 } else {
3405 ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant);
3406 }
3407
3408 efree(ZEND_VOIDP(elist));
3409
3410 if (ret == NULL) {
3411 RETURN_FALSE;
3412 }
3413
3414 RETVAL_STRING((char *)ret->name);
3415 }
3416 /* }}} */
3417
3418 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)3419 PHP_FUNCTION(mb_list_encodings)
3420 {
3421 ZEND_PARSE_PARAMETERS_NONE();
3422
3423 if (MBSTRG(all_encodings_list) == NULL) {
3424 /* Initialize shared array of supported encoding names
3425 * This is done so that we can check if `mb_list_encodings()` is being
3426 * passed to other mbstring functions using a cheap pointer equality check */
3427 HashTable *array = emalloc(sizeof(HashTable));
3428 zend_hash_init(array, 80, NULL, zval_ptr_dtor_str, false);
3429 for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
3430 zval tmp;
3431 ZVAL_STRING(&tmp, (*encodings)->name);
3432 zend_hash_next_index_insert(array, &tmp);
3433 }
3434 MBSTRG(all_encodings_list) = array;
3435 }
3436
3437 GC_ADDREF(MBSTRG(all_encodings_list));
3438 RETURN_ARR(MBSTRG(all_encodings_list));
3439 }
3440 /* }}} */
3441
3442 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)3443 PHP_FUNCTION(mb_encoding_aliases)
3444 {
3445 const mbfl_encoding *encoding;
3446 zend_string *encoding_name = NULL;
3447
3448 ZEND_PARSE_PARAMETERS_START(1, 1)
3449 Z_PARAM_STR(encoding_name)
3450 ZEND_PARSE_PARAMETERS_END();
3451
3452 encoding = php_mb_get_encoding(encoding_name, 1);
3453 if (!encoding) {
3454 RETURN_THROWS();
3455 }
3456
3457 array_init(return_value);
3458 if (encoding->aliases != NULL) {
3459 for (const char **alias = encoding->aliases; *alias; ++alias) {
3460 add_next_index_string(return_value, (char *)*alias);
3461 }
3462 }
3463 }
3464 /* }}} */
3465
/* Apply the kana conversion selected by `mode` to `input` (in `encoding`) and
 * return the result in the same encoding.
 *
 * The input is decoded in batches of up to 64 codepoints. Each codepoint is
 * converted together with a look-ahead at the following one, because
 * mb_convert_kana_codepoint() may consume both. The last codepoint of a batch
 * therefore cannot be converted until the next batch is available, unless the
 * input is exhausted. */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion...
	 * if we are converting zenkaku kana to hankaku kana
	 * Make the buffer for converted kana big enough that we never need to
	 * perform bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	unsigned int carried = 0; /* 1 if wchar_buf[0] holds a codepoint held over from the last batch */
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		uint32_t *out = converted_buf;
		/* If one codepoint has been left in wchar_buf[0] to be reprocessed from the
		 * previous iteration, don't overwrite it */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + carried, 64 - carried, &state);
		out_len += carried;
		ZEND_ASSERT(out_len <= 64);

		if (out_len == 0) {
			continue;
		}

		/* Convert every codepoint except the last, looking one ahead */
		bool tail_consumed = false;
		for (size_t i = 0; i < out_len-1; i++) {
			uint32_t second = 0;
			bool consumed = false;
			*out++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
			if (second) {
				*out++ = second;
			}
			if (consumed) {
				/* Skip the look-ahead codepoint, it has been used up */
				i++;
				if (i == out_len-1) {
					/* We consumed two codepoints at the very end of the wchar buffer
					 * So there is nothing remaining to reprocess on the next iteration */
					tail_consumed = true;
					break;
				}
			}
		}

		if (tail_consumed) {
			carried = 0;
		} else if (!in_len) {
			/* This is the last iteration, so we need to process the final codepoint now */
			uint32_t second = 0;
			*out++ = mb_convert_kana_codepoint(wchar_buf[out_len-1], 0, NULL, &second, mode);
			if (second) {
				*out++ = second;
			}
		} else {
			/* Reprocess the last codepoint on the next iteration */
			wchar_buf[0] = wchar_buf[out_len-1];
			carried = 1;
		}

		/* Encode this batch; the final argument flags the end of the input */
		encoding->from_wchar(converted_buf, out - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3530
/* Option letters accepted by mb_convert_kana(). The position of a letter in
 * this table is the number of the bit which it sets in the option bit vector.
 * Entries i and i+8 (for i in 0..7) are the upper/lower-case forms of the same
 * letter; mb_convert_kana() refuses to combine such a pair. */
char mb_convert_kana_flags[17] = {
	/* bits 0..7 */
	'A', 'R', 'N', 'S',
	'K', 'H', 'M', 'C',
	/* bits 8..15 */
	'a', 'r', 'n', 's',
	'k', 'h', 'm', 'c',
	/* bit 16 */
	'V'
};
3536
3537 /* Conversion between full-width characters and half-width characters (Japanese) */
PHP_FUNCTION(mb_convert_kana)3538 PHP_FUNCTION(mb_convert_kana)
3539 {
3540 unsigned int opt;
3541 char *optstr = NULL;
3542 size_t optstr_len;
3543 zend_string *encname = NULL, *str;
3544
3545 ZEND_PARSE_PARAMETERS_START(1, 3)
3546 Z_PARAM_STR(str)
3547 Z_PARAM_OPTIONAL
3548 Z_PARAM_STRING(optstr, optstr_len)
3549 Z_PARAM_STR_OR_NULL(encname)
3550 ZEND_PARSE_PARAMETERS_END();
3551
3552 if (optstr != NULL) {
3553 char *p = optstr, *e = p + optstr_len;
3554 opt = 0;
3555 next_option:
3556 while (p < e) {
3557 /* Walk through option string and convert to bit vector
3558 * See translit_kana_jisx0201_jisx0208.h for the values used */
3559 char c = *p++;
3560 if (c == 'A') {
3561 opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
3562 } else if (c == 'a') {
3563 opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
3564 } else {
3565 for (size_t i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
3566 if (c == mb_convert_kana_flags[i]) {
3567 opt |= (1 << i);
3568 goto next_option;
3569 }
3570 }
3571
3572 zend_argument_value_error(2, "contains invalid flag: '%c'", c);
3573 RETURN_THROWS();
3574 }
3575 }
3576
3577 /* Check for illegal combinations of options */
3578 if (((opt & 0xFF00) >> 8) & opt) {
3579 /* It doesn't make sense to convert the same type of characters from halfwidth to
3580 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
3581 * FW hiragana to FW katakana and then back again. */
3582 int badflag = ((opt & 0xFF00) >> 8) & opt, i;
3583 for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
3584 char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
3585 if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
3586 flag1 = 'A';
3587 if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
3588 flag2 = 'a';
3589 zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
3590 RETURN_THROWS();
3591 }
3592
3593 if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
3594 /* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
3595 zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
3596 RETURN_THROWS();
3597 }
3598
3599 /* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
3600 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
3601 * more than one of these */
3602 if (opt & MBFL_ZEN2HAN_HIRAGANA) {
3603 if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3604 zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
3605 RETURN_THROWS();
3606 } else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3607 zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
3608 RETURN_THROWS();
3609 }
3610 } else if (opt & MBFL_ZEN2HAN_KATAKANA) {
3611 if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3612 zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
3613 RETURN_THROWS();
3614 } else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3615 zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
3616 RETURN_THROWS();
3617 }
3618 }
3619 } else {
3620 opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
3621 }
3622
3623 const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
3624 if (!enc) {
3625 RETURN_THROWS();
3626 }
3627
3628 RETVAL_STR(jp_kana_convert(str, enc, opt));
3629 }
3630
/* Count the strings reachable from `var`: 1 for a string, the sum over all
 * elements for an array or object (via HASH_OF), 0 for anything else.
 * A refcounted container which is already being traversed contributes 0. */
static unsigned int mb_recursive_count_strings(zval *var)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		return 1;
	}
	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		return 0;
	}

	/* Guard against cycles while we descend into this container */
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	unsigned int total = 0;
	HashTable *ht = HASH_OF(var);
	if (ht != NULL) {
		zval *entry;
		ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
			total += mb_recursive_count_strings(entry);
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return total;
}
3661
/* Record a pointer to, and the length of, every string reachable from `var`.
 *
 * Each string found is stored at index `*count` of `val_list` / `len_list`,
 * and `*count` is then incremented. No bounds checking is done here; the
 * caller must size both arrays beforehand (mb_convert_variables does so with
 * mb_recursive_count_strings).
 *
 * Returns true if a recursive structure was encountered, false otherwise.
 * Once recursion has been reported by a nested call, it is propagated to the
 * caller immediately, regardless of whether the current container carries a
 * recursion guard (this matches mb_recursive_convert_variable). */
static bool mb_recursive_find_strings(zval *var, const unsigned char **val_list, size_t *len_list, unsigned int *count)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		val_list[*count] = (const unsigned char*)Z_STRVAL_P(var);
		len_list[*count] = Z_STRLEN_P(var);
		(*count)++;
	} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
		if (Z_REFCOUNTED_P(var)) {
			if (Z_IS_RECURSIVE_P(var)) {
				/* We are already in the middle of visiting this container */
				return true;
			}
			Z_PROTECT_RECURSION_P(var);
		}

		HashTable *ht = HASH_OF(var);
		if (ht != NULL) {
			zval *entry;
			ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
				if (mb_recursive_find_strings(entry, val_list, len_list, count)) {
					/* Drop our own guard (if we set one) before bailing out;
					 * the failure itself must always be reported */
					if (Z_REFCOUNTED_P(var)) {
						Z_UNPROTECT_RECURSION_P(var);
					}
					return true;
				}
			} ZEND_HASH_FOREACH_END();
		}

		if (Z_REFCOUNTED_P(var)) {
			Z_UNPROTECT_RECURSION_P(var);
		}
	}

	return false;
}
3698
/* Convert, in place, every string reachable from `var` from `from_encoding`
 * to `to_encoding`.
 *
 * A string is replaced in the slot which was passed in (before following any
 * reference). Arrays are separated before being walked; arrays and objects
 * are walked through their HASH_OF() table. Other types are left untouched.
 *
 * Returns true if a recursive structure was encountered, false otherwise. */
static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
{
	zval *slot = var; /* The zval as handed to us, possibly a reference */
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		zend_string *converted = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
		zval_ptr_dtor(slot);
		ZVAL_STR(slot, converted);
		return false;
	}

	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Nothing to convert */
		return false;
	}

	if (Z_TYPE_P(var) == IS_ARRAY) {
		SEPARATE_ARRAY(var);
	}

	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return true;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	bool recursion_found = false;
	HashTable *members = HASH_OF(var);
	if (members) {
		zval *member;
		ZEND_HASH_FOREACH_VAL_IND(members, member) {
			if (mb_recursive_convert_variable(member, from_encoding, to_encoding)) {
				recursion_found = true;
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	/* The guard is dropped on both the success and the failure path */
	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return recursion_found;
}
3740
PHP_FUNCTION(mb_convert_variables)3741 PHP_FUNCTION(mb_convert_variables)
3742 {
3743 zval *args;
3744 zend_string *to_enc_str;
3745 zend_string *from_enc_str;
3746 HashTable *from_enc_ht;
3747 const mbfl_encoding *from_encoding, *to_encoding;
3748 uint32_t argc;
3749 size_t elistsz;
3750 const mbfl_encoding **elist;
3751
3752 ZEND_PARSE_PARAMETERS_START(3, -1)
3753 Z_PARAM_STR(to_enc_str)
3754 Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3755 Z_PARAM_VARIADIC('+', args, argc)
3756 ZEND_PARSE_PARAMETERS_END();
3757
3758 /* new encoding */
3759 to_encoding = php_mb_get_encoding(to_enc_str, 1);
3760 if (!to_encoding) {
3761 RETURN_THROWS();
3762 }
3763
3764 from_encoding = MBSTRG(current_internal_encoding);
3765
3766 bool order_significant = true;
3767
3768 /* pre-conversion encoding */
3769 if (from_enc_ht) {
3770 if (from_enc_ht == MBSTRG(all_encodings_list)) {
3771 /* If entire list of supported encodings returned by `mb_list_encodings` is passed
3772 * in, then don't treat the order of the list as significant */
3773 order_significant = false;
3774 }
3775 if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3776 RETURN_THROWS();
3777 }
3778 } else {
3779 if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2) == FAILURE) {
3780 RETURN_THROWS();
3781 }
3782 }
3783
3784 if (elistsz == 0) {
3785 efree(ZEND_VOIDP(elist));
3786 zend_argument_value_error(2, "must specify at least one encoding");
3787 RETURN_THROWS();
3788 }
3789
3790 if (elistsz == 1) {
3791 from_encoding = *elist;
3792 } else {
3793 /* auto detect */
3794 unsigned int num = 0;
3795 for (size_t n = 0; n < argc; n++) {
3796 zval *zv = &args[n];
3797 num += mb_recursive_count_strings(zv);
3798 }
3799 const unsigned char **val_list = (const unsigned char**)ecalloc(num, sizeof(char *));
3800 size_t *len_list = (size_t*)ecalloc(num, sizeof(size_t));
3801 unsigned int i = 0;
3802 for (size_t n = 0; n < argc; n++) {
3803 zval *zv = &args[n];
3804 if (mb_recursive_find_strings(zv, val_list, len_list, &i)) {
3805 efree(ZEND_VOIDP(elist));
3806 efree(ZEND_VOIDP(val_list));
3807 efree(len_list);
3808 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3809 RETURN_FALSE;
3810 }
3811 }
3812 from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant);
3813 efree(ZEND_VOIDP(val_list));
3814 efree(len_list);
3815 if (!from_encoding) {
3816 php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3817 efree(ZEND_VOIDP(elist));
3818 RETURN_FALSE;
3819 }
3820
3821 }
3822
3823 efree(ZEND_VOIDP(elist));
3824
3825 /* convert */
3826 for (size_t n = 0; n < argc; n++) {
3827 zval *zv = &args[n];
3828 ZVAL_DEREF(zv);
3829 if (mb_recursive_convert_variable(zv, from_encoding, to_encoding)) {
3830 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3831 RETURN_FALSE;
3832 }
3833 }
3834
3835 RETURN_STRING(from_encoding->name);
3836 }
3837
3838 /* HTML numeric entities */
3839
3840 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
make_conversion_map(HashTable * target_hash,size_t * conversion_map_size)3841 static uint32_t *make_conversion_map(HashTable *target_hash, size_t *conversion_map_size)
3842 {
3843 zval *hash_entry;
3844
3845 size_t n_elems = *conversion_map_size = zend_hash_num_elements(target_hash);
3846 if (n_elems % 4 != 0) {
3847 zend_argument_value_error(2, "must have a multiple of 4 elements");
3848 return NULL;
3849 }
3850
3851 uint32_t *convmap = (uint32_t*)safe_emalloc(n_elems, sizeof(uint32_t), 0);
3852 uint32_t *mapelm = convmap;
3853
3854 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
3855 bool failed = true;
3856 zend_long tmp = zval_try_get_long(hash_entry, &failed);
3857 if (failed) {
3858 efree(convmap);
3859 zend_argument_value_error(2, "must only be composed of values of type int");
3860 return NULL;
3861 }
3862 *mapelm++ = tmp;
3863 } ZEND_HASH_FOREACH_END();
3864
3865 return convmap;
3866 }
3867
/* Look up `w` in a conversion map made of consecutive 4-element groups
 * { low, high, offset, mask }.
 *
 * For the first group with low <= w <= high, `(w + offset) & mask` is stored
 * in `*retval` and true is returned. If no group matches, `*retval` is left
 * untouched and false is returned. `conversion_map_size` is the number of
 * uint32_t elements in `convmap` (expected to be a multiple of 4). */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t i = 0; i < conversion_map_size; i += 4) {
		const uint32_t *group = &convmap[i];

		if (w < group[0] || w > group[1]) {
			/* Outside of this range; try the next one */
			continue;
		}

		/* Unsigned arithmetic: the addition is allowed to wrap around */
		*retval = (w + group[2]) & group[3];
		return true;
	}

	return false;
}
3889
/* Re-encode `input` (a string in `encoding`), replacing every codepoint which
 * matches the conversion map by a numeric entity: "&#NNN;" or, if `hex` is
 * set, "&#xHHH;" with upper-case hexadecimal digits. Codepoints which do not
 * match the map are passed through unchanged. The result is produced in the
 * same encoding as the input. */
static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size, bool hex)
{
	/* The longest entity is "&#" + 10 decimal digits + ";" = 13 wchars, so a
	 * batch of 32 input wchars expands to at most 32 * 13 output wchars */
	uint32_t wchar_buf[32], converted_buf[32 * 13];
	unsigned char entity[16]; /* Scratch space; digits are generated back to front */
	unsigned char *const entity_end = entity + sizeof(entity);

	const char *const digit_chars = "0123456789ABCDEF";
	const uint32_t radix = hex ? 16 : 10;

	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);
	unsigned int state = 0;

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		/* Decode the next batch of (at most 32) wchars */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf, 32, &state);
		ZEND_ASSERT(out_len <= 32);

		uint32_t *out = converted_buf;

		for (size_t i = 0; i < out_len; i++) {
			uint32_t code;

			if (!html_numeric_entity_convert(wchar_buf[i], convmap, conversion_map_size, &code)) {
				/* Not covered by the map: copy verbatim */
				*out++ = wchar_buf[i];
				continue;
			}

			*out++ = '&';
			*out++ = '#';
			if (hex) {
				*out++ = 'x';
			}

			/* Least significant digit first, filling `entity` from its end;
			 * the do-while yields a single "0" for a value of zero */
			unsigned char *digit = entity_end;
			do {
				*(--digit) = digit_chars[code % radix];
				code /= radix;
			} while (code > 0);

			while (digit < entity_end) {
				*out++ = *digit++;
			}

			*out++ = ';';
		}

		ZEND_ASSERT(out <= converted_buf + sizeof(converted_buf)/sizeof(*converted_buf));
		/* The last argument flags the final batch */
		encoding->from_wchar(converted_buf, out - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3955
3956 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)3957 PHP_FUNCTION(mb_encode_numericentity)
3958 {
3959 zend_string *encoding = NULL, *str;
3960 size_t conversion_map_size;
3961 HashTable *target_hash;
3962 bool is_hex = false;
3963
3964 ZEND_PARSE_PARAMETERS_START(2, 4)
3965 Z_PARAM_STR(str)
3966 Z_PARAM_ARRAY_HT(target_hash)
3967 Z_PARAM_OPTIONAL
3968 Z_PARAM_STR_OR_NULL(encoding)
3969 Z_PARAM_BOOL(is_hex)
3970 ZEND_PARSE_PARAMETERS_END();
3971
3972 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3973 if (!enc) {
3974 RETURN_THROWS();
3975 }
3976
3977 uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
3978 if (convmap == NULL) {
3979 RETURN_THROWS();
3980 }
3981
3982 RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, conversion_map_size, is_hex));
3983 efree(convmap);
3984 }
3985 /* }}} */
3986
/* Inverse lookup for a decoded entity value. The conversion map consists of
 * consecutive 4-element groups { low, high, offset, mask }; the mask is not
 * consulted here.
 *
 * For the first group where `number - offset` (computed with unsigned
 * wraparound) lies within [low, high], that difference is stored in `*retval`
 * and true is returned. Otherwise `*retval` is left untouched and false is
 * returned. */
static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t i = 0; i < conversion_map_size; i += 4) {
		const uint32_t low = convmap[i];
		const uint32_t high = convmap[i + 1];
		const uint32_t candidate = number - convmap[i + 2];

		if (candidate < low || candidate > high) {
			continue;
		}

		*retval = candidate;
		return true;
	}

	return false;
}
4004
/* Length limits (in wchars) for a candidate numeric entity, measured from the
 * leading '&' up to, but not including, the first wchar after the digits (so
 * a terminating ';' is not counted). html_numeric_entity_decode uses them to
 * reject candidates with no digits at all or with too many digits. */
#define DEC_ENTITY_MINLEN 3  /* For "&#" and 1 decimal digit */
#define HEX_ENTITY_MINLEN 4  /* For "&#x" and 1 hexadecimal digit */
#define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
#define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
4009
/* Re-encode `input` (a string in `encoding`), replacing numeric entities
 * ("&#NNN" / "&#xHHH", optionally followed by ';') by the codepoint which the
 * conversion map assigns to them. Entities which are malformed, too long, too
 * big to fit in 32 bits, or not covered by the map are copied through
 * verbatim. The result is produced in the same encoding as the input. */
static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size)
{
	uint32_t wchar_buf[128], converted_buf[128];

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	/* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
	 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
	 * 2nd 'converted' buffer.
	 *
	 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
	 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
	 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
	 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
	 * to store the next batch of wchars after it.
	 *
	 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
	 * wchars from the 1st buffer to the 2nd one.
	 *
	 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
	 * have to do bounds checks when writing wchars into it.
	 */

	/* Number of wchars carried over from the previous batch (a possibly truncated entity) */
	unsigned int wchar_buf_offset = 0;

	while (in_len) {
		/* Leave space for sentinel at the end of the buffer */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
		out_len += wchar_buf_offset;
		ZEND_ASSERT(out_len <= 127);
		wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */

		uint32_t *p, *converted;

		/* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
		 * be there (in `wchar_buf[0]`), so don't bother in that case */
		if (wchar_buf_offset == 0) {
			p = wchar_buf;
			while (*p != '&')
				p++;
			if (p == wchar_buf + out_len) {
				/* No HTML entities in this buffer */
				encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
				continue;
			}

			/* Copy over the prefix with no & which we already scanned */
			memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
			converted = converted_buf + (p - wchar_buf);
		} else {
			p = wchar_buf;
			converted = converted_buf;
		}

found_ampersand:
		ZEND_ASSERT(*p == '&');
		uint32_t *p2 = p;

		/* These tests can't overrun end of buffer, because we have a '&' sentinel there */
		if (*++p2 == '#') {
			if (*++p2 == 'x') {
				/* Possible hex entity */
				uint32_t w = *++p2;
				while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
					w = *++p2;
				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
					/* We hit the end of the buffer while reading digits, and
					 * more wchars are still coming in the next buffer
					 * Reprocess this entity on next iteration */
					memmove(wchar_buf, p, (p2 - p) * 4);
					wchar_buf_offset = p2 - p;
					goto process_converted_wchars;
				} else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
					/* Invalid entity (too long or "&#x" only) */
					memcpy(converted, p, (p2 - p) * 4);
					converted += p2 - p;
				} else {
					/* Valid hexadecimal entity; at most 8 digits, so `value` cannot overflow */
					uint32_t value = 0, *p3 = p + 3;
					while (p3 < p2) {
						w = *p3++;
						if (w <= '9') {
							value = (value * 16) + (w - '0');
						} else if (w >= 'a') {
							value = (value * 16) + 10 + (w - 'a');
						} else {
							value = (value * 16) + 10 + (w - 'A');
						}
					}
					if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
						converted++;
						/* A terminating ';' is optional, but is consumed if present */
						if (*p2 == ';')
							p2++;
					} else {
						memcpy(converted, p, (p2 - p) * 4);
						converted += p2 - p;
					}
				}
			} else {
				/* Possible decimal entity */
				uint32_t w = *p2;
				while (w >= '0' && w <= '9')
					w = *++p2;
				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
					/* The number of digits was legal (no more than 10 decimal digits)
					 * Reprocess this entity on next iteration of main loop */
					memmove(wchar_buf, p, (p2 - p) * 4);
					wchar_buf_offset = p2 - p;
					goto process_converted_wchars;
				} else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
					/* Invalid entity (too long or "&#" only) */
					memcpy(converted, p, (p2 - p) * 4);
					converted += p2 - p;
				} else {
					/* Valid decimal entity */
					uint32_t value = 0, *p3 = p + 2;
					while (p3 < p2) {
						uint32_t digit = *p3++ - '0';
						/* If unsigned integer overflow would occur in the below
						 * multiplication by 10 or in the addition of the digit,
						 * this entity is no good
						 * 0x19999999 is 1/10th of 0xFFFFFFFF (rounded down), and
						 * 0x19999999 * 10 + 5 is exactly 0xFFFFFFFF */
						if (value > 0x19999999 || (value == 0x19999999 && digit > 5)) {
							memcpy(converted, p, (p2 - p) * 4);
							converted += p2 - p;
							goto decimal_entity_too_big;
						}
						value = (value * 10) + digit;
					}
					if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
						converted++;
						/* A terminating ';' is optional, but is consumed if present */
						if (*p2 == ';')
							p2++;
					} else {
						memcpy(converted, p, (p2 - p) * 4);
						converted += p2 - p;
					}
				}
			}
		} else if ((p2 == wchar_buf + out_len) && in_len) {
			/* Corner case: & at end of buffer */
			wchar_buf[0] = '&';
			wchar_buf_offset = 1;
			goto process_converted_wchars;
		} else {
			*converted++ = '&';
		}
decimal_entity_too_big:

		/* Starting to scan a new section of the wchar buffer
		 * 'p2' is pointing at the next wchar which needs to be processed */
		p = p2;
		while (*p2 != '&')
			p2++;

		if (p2 > p) {
			memcpy(converted, p, (p2 - p) * 4);
			converted += p2 - p;
			p = p2;
		}

		/* If we stopped on a real '&' rather than on the sentinel, handle it */
		if (p < wchar_buf + out_len)
			goto found_ampersand;

		/* We do not have any wchars remaining at the end of this buffer which
		 * we need to reprocess on the next call */
		wchar_buf_offset = 0;
process_converted_wchars:
		ZEND_ASSERT(converted <= converted_buf + 128);
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
4187
4188 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)4189 PHP_FUNCTION(mb_decode_numericentity)
4190 {
4191 zend_string *encoding = NULL, *str;
4192 size_t conversion_map_size;
4193 HashTable *target_hash;
4194
4195 ZEND_PARSE_PARAMETERS_START(2, 3)
4196 Z_PARAM_STR(str)
4197 Z_PARAM_ARRAY_HT(target_hash)
4198 Z_PARAM_OPTIONAL
4199 Z_PARAM_STR_OR_NULL(encoding)
4200 ZEND_PARSE_PARAMETERS_END();
4201
4202 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4203 if (!enc) {
4204 RETURN_THROWS();
4205 }
4206
4207 uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4208 if (convmap == NULL) {
4209 RETURN_THROWS();
4210 }
4211
4212 RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, conversion_map_size));
4213 efree(convmap);
4214 }
4215 /* }}} */
4216
4217 /* {{{ Sends an email message with MIME scheme */
/* Line separator used by mb_send_mail, unless PG(mail_mixed_lf_and_crlf)
 * selects a bare "\n" instead */
#define CRLF "\r\n"
4219
_php_mbstr_parse_mail_headers(HashTable * ht,const char * str,size_t str_len)4220 static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
4221 {
4222 const char *ps;
4223 size_t icnt;
4224 int state = 0;
4225 int crlf_state = -1;
4226 char *token = NULL;
4227 size_t token_pos = 0;
4228 zend_string *fld_name, *fld_val;
4229
4230 ps = str;
4231 icnt = str_len;
4232 fld_name = fld_val = NULL;
4233
4234 /*
4235 * C o n t e n t - T y p e : t e x t / h t m l \r\n
4236 * ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
4237 * state 0 1 2 3
4238 *
4239 * C o n t e n t - T y p e : t e x t / h t m l \r\n
4240 * ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
4241 * crlf_state -1 0 1 -1
4242 *
4243 */
4244
4245 while (icnt > 0) {
4246 switch (*ps) {
4247 case ':':
4248 if (crlf_state == 1) {
4249 token_pos++;
4250 }
4251
4252 if (state == 0 || state == 1) {
4253 if(token && token_pos > 0) {
4254 fld_name = zend_string_init(token, token_pos, 0);
4255 }
4256 state = 2;
4257 } else {
4258 token_pos++;
4259 }
4260
4261 crlf_state = 0;
4262 break;
4263
4264 case '\n':
4265 if (crlf_state == -1) {
4266 goto out;
4267 }
4268 crlf_state = -1;
4269 break;
4270
4271 case '\r':
4272 if (crlf_state == 1) {
4273 token_pos++;
4274 } else {
4275 crlf_state = 1;
4276 }
4277 break;
4278
4279 case ' ': case '\t':
4280 if (crlf_state == -1) {
4281 if (state == 3) {
4282 /* continuing from the previous line */
4283 state = 4;
4284 } else {
4285 /* simply skipping this new line */
4286 state = 5;
4287 }
4288 } else {
4289 if (crlf_state == 1) {
4290 token_pos++;
4291 }
4292 if (state == 1 || state == 3) {
4293 token_pos++;
4294 }
4295 }
4296 crlf_state = 0;
4297 break;
4298
4299 default:
4300 switch (state) {
4301 case 0:
4302 token = (char*)ps;
4303 token_pos = 0;
4304 state = 1;
4305 break;
4306
4307 case 2:
4308 if (crlf_state != -1) {
4309 token = (char*)ps;
4310 token_pos = 0;
4311
4312 state = 3;
4313 break;
4314 }
4315 ZEND_FALLTHROUGH;
4316
4317 case 3:
4318 if (crlf_state == -1) {
4319 if(token && token_pos > 0) {
4320 fld_val = zend_string_init(token, token_pos, 0);
4321 }
4322
4323 if (fld_name != NULL && fld_val != NULL) {
4324 zval val;
4325 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4326 ZVAL_STR(&val, fld_val);
4327
4328 zend_hash_update(ht, fld_name, &val);
4329
4330 zend_string_release_ex(fld_name, 0);
4331 }
4332
4333 fld_name = fld_val = NULL;
4334 token = (char*)ps;
4335 token_pos = 0;
4336
4337 state = 1;
4338 }
4339 break;
4340
4341 case 4:
4342 token_pos++;
4343 state = 3;
4344 break;
4345 }
4346
4347 if (crlf_state == 1) {
4348 token_pos++;
4349 }
4350
4351 token_pos++;
4352
4353 crlf_state = 0;
4354 break;
4355 }
4356 ps++, icnt--;
4357 }
4358 out:
4359 if (state == 2) {
4360 token = "";
4361 token_pos = 0;
4362
4363 state = 3;
4364 }
4365 if (state == 3) {
4366 if(token && token_pos > 0) {
4367 fld_val = zend_string_init(token, token_pos, 0);
4368 }
4369 if (fld_name != NULL && fld_val != NULL) {
4370 zval val;
4371 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4372 ZVAL_STR(&val, fld_val);
4373 zend_hash_update(ht, fld_name, &val);
4374
4375 zend_string_release_ex(fld_name, 0);
4376 }
4377 }
4378 return state;
4379 }
4380
PHP_FUNCTION(mb_send_mail)4381 PHP_FUNCTION(mb_send_mail)
4382 {
4383 char *to;
4384 size_t to_len;
4385 char *message;
4386 size_t message_len;
4387 zend_string *subject;
4388 zend_string *extra_cmd = NULL;
4389 HashTable *headers_ht = NULL;
4390 zend_string *str_headers = NULL;
4391 size_t i;
4392 char *to_r = NULL;
4393 char *force_extra_parameters = INI_STR("mail.force_extra_parameters");
4394 bool suppress_content_type = false;
4395 bool suppress_content_transfer_encoding = false;
4396
4397 char *p;
4398 enum mbfl_no_encoding;
4399 const mbfl_encoding *tran_cs, /* transfer text charset */
4400 *head_enc, /* header transfer encoding */
4401 *body_enc; /* body transfer encoding */
4402 const mbfl_language *lang;
4403 HashTable ht_headers;
4404 zval *s;
4405
4406 /* character-set, transfer-encoding */
4407 tran_cs = &mbfl_encoding_utf8;
4408 head_enc = &mbfl_encoding_base64;
4409 body_enc = &mbfl_encoding_base64;
4410 lang = mbfl_no2language(MBSTRG(language));
4411 if (lang != NULL) {
4412 tran_cs = mbfl_no2encoding(lang->mail_charset);
4413 head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4414 body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4415 }
4416
4417 ZEND_PARSE_PARAMETERS_START(3, 5)
4418 Z_PARAM_PATH(to, to_len)
4419 Z_PARAM_PATH_STR(subject)
4420 Z_PARAM_PATH(message, message_len)
4421 Z_PARAM_OPTIONAL
4422 Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4423 Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4424 ZEND_PARSE_PARAMETERS_END();
4425
4426 if (str_headers) {
4427 if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4428 zend_argument_value_error(4, "must not contain any null bytes");
4429 RETURN_THROWS();
4430 }
4431 str_headers = php_trim(str_headers, NULL, 0, 2);
4432 } else if (headers_ht) {
4433 str_headers = php_mail_build_headers(headers_ht);
4434 if (EG(exception)) {
4435 RETURN_THROWS();
4436 }
4437 }
4438
4439 zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4440
4441 if (str_headers != NULL) {
4442 _php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4443 }
4444
4445 if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4446 char *tmp;
4447 char *param_name;
4448 char *charset = NULL;
4449
4450 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4451 p = strchr(Z_STRVAL_P(s), ';');
4452
4453 if (p != NULL) {
4454 /* skipping the padded spaces */
4455 do {
4456 ++p;
4457 } while (*p == ' ' || *p == '\t');
4458
4459 if (*p != '\0') {
4460 if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4461 if (strcasecmp(param_name, "charset") == 0) {
4462 const mbfl_encoding *_tran_cs = tran_cs;
4463
4464 charset = php_strtok_r(NULL, "= \"", &tmp);
4465 if (charset != NULL) {
4466 _tran_cs = mbfl_name2encoding(charset);
4467 }
4468
4469 if (!_tran_cs) {
4470 php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4471 _tran_cs = &mbfl_encoding_ascii;
4472 }
4473 tran_cs = _tran_cs;
4474 }
4475 }
4476 }
4477 }
4478 suppress_content_type = true;
4479 }
4480
4481 if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4482 const mbfl_encoding *_body_enc;
4483
4484 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4485 _body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4486 switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4487 case mbfl_no_encoding_base64:
4488 case mbfl_no_encoding_7bit:
4489 case mbfl_no_encoding_8bit:
4490 body_enc = _body_enc;
4491 break;
4492
4493 default:
4494 php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4495 body_enc = &mbfl_encoding_8bit;
4496 break;
4497 }
4498 suppress_content_transfer_encoding = true;
4499 }
4500
4501 /* To: */
4502 if (to_len > 0) {
4503 to_r = estrndup(to, to_len);
4504 for (; to_len; to_len--) {
4505 if (!isspace((unsigned char) to_r[to_len - 1])) {
4506 break;
4507 }
4508 to_r[to_len - 1] = '\0';
4509 }
4510 for (i = 0; to_r[i]; i++) {
4511 if (iscntrl((unsigned char) to_r[i])) {
4512 /* According to RFC 822, section 3.1.1 long headers may be separated into
4513 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4514 * To prevent these separators from being replaced with a space, we skip over them. */
4515 if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4516 i += 2;
4517 while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4518 i++;
4519 }
4520 continue;
4521 }
4522
4523 to_r[i] = ' ';
4524 }
4525 }
4526 } else {
4527 to_r = to;
4528 }
4529
4530 /* Subject: */
4531 const mbfl_encoding *enc = MBSTRG(current_internal_encoding);
4532 if (enc == &mbfl_encoding_pass) {
4533 enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4534 }
4535 const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4536 size_t line_sep_len = strlen(line_sep);
4537
4538 subject = mb_mime_header_encode(subject, enc, tran_cs, head_enc == &mbfl_encoding_base64, (char*)line_sep, line_sep_len, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4539
4540 /* message body */
4541 const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding);
4542 if (msg_enc == &mbfl_encoding_pass) {
4543 msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4544 }
4545
4546 unsigned int num_errors = 0;
4547 zend_string *tmpstr = mb_fast_convert((unsigned char*)message, message_len, msg_enc, tran_cs, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4548 zend_string *conv = mb_fast_convert((unsigned char*)ZSTR_VAL(tmpstr), ZSTR_LEN(tmpstr), &mbfl_encoding_8bit, body_enc, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4549 zend_string_free(tmpstr);
4550 message = ZSTR_VAL(conv);
4551
4552 /* other headers */
4553 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4554 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4555 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4556 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4557
4558 smart_str str = {0};
4559 bool empty = true;
4560
4561 if (str_headers != NULL) {
4562 /* Strip trailing CRLF from `str_headers`; we will add CRLF back if necessary */
4563 size_t len = ZSTR_LEN(str_headers);
4564 if (ZSTR_VAL(str_headers)[len-1] == '\n') {
4565 len--;
4566 }
4567 if (ZSTR_VAL(str_headers)[len-1] == '\r') {
4568 len--;
4569 }
4570 smart_str_appendl(&str, ZSTR_VAL(str_headers), len);
4571 empty = false;
4572 zend_string_release_ex(str_headers, 0);
4573 }
4574
4575 if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4576 if (!empty) {
4577 smart_str_appendl(&str, line_sep, line_sep_len);
4578 }
4579 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4580 empty = false;
4581 }
4582
4583 if (!suppress_content_type) {
4584 if (!empty) {
4585 smart_str_appendl(&str, line_sep, line_sep_len);
4586 }
4587 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4588
4589 p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4590 if (p != NULL) {
4591 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4592 smart_str_appends(&str, p);
4593 }
4594 empty = false;
4595 }
4596
4597 if (!suppress_content_transfer_encoding) {
4598 if (!empty) {
4599 smart_str_appendl(&str, line_sep, line_sep_len);
4600 }
4601 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4602 p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4603 if (p == NULL) {
4604 p = "7bit";
4605 }
4606 smart_str_appends(&str, p);
4607 }
4608
4609 str_headers = smart_str_extract(&str);
4610
4611 if (force_extra_parameters) {
4612 extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4613 } else if (extra_cmd) {
4614 extra_cmd = php_escape_shell_cmd(ZSTR_VAL(extra_cmd));
4615 }
4616
4617 RETVAL_BOOL(php_mail(to_r, ZSTR_VAL(subject), message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL));
4618
4619 if (extra_cmd) {
4620 zend_string_release_ex(extra_cmd, 0);
4621 }
4622 if (to_r != to) {
4623 efree(to_r);
4624 }
4625 zend_string_release(subject);
4626 zend_string_free(conv);
4627 zend_hash_destroy(&ht_headers);
4628 if (str_headers) {
4629 zend_string_release_ex(str_headers, 0);
4630 }
4631 }
4632
/* Drop the helper macros which were defined for mb_send_mail above */
#undef CRLF
/* NOTE(review): no #define of MAIL_ASCIIZ_CHECK_MBSTRING is visible in this
 * part of the file; this #undef looks like a leftover - confirm before removing */
#undef MAIL_ASCIIZ_CHECK_MBSTRING
#undef PHP_MBSTR_MAIL_MIME_HEADER1
#undef PHP_MBSTR_MAIL_MIME_HEADER2
#undef PHP_MBSTR_MAIL_MIME_HEADER3
#undef PHP_MBSTR_MAIL_MIME_HEADER4
4639 /* }}} */
4640
4641 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)4642 PHP_FUNCTION(mb_get_info)
4643 {
4644 zend_string *type = NULL;
4645 size_t n;
4646 char *name;
4647 zval row;
4648 const mbfl_encoding **entry;
4649 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
4650
4651 ZEND_ASSERT(lang);
4652
4653 ZEND_PARSE_PARAMETERS_START(0, 1)
4654 Z_PARAM_OPTIONAL
4655 Z_PARAM_STR(type)
4656 ZEND_PARSE_PARAMETERS_END();
4657
4658 if (!type || zend_string_equals_literal_ci(type, "all")) {
4659 array_init(return_value);
4660 if (MBSTRG(current_internal_encoding)) {
4661 add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4662 }
4663 if (MBSTRG(http_input_identify)) {
4664 add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4665 }
4666 if (MBSTRG(current_http_output_encoding)) {
4667 add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4668 }
4669
4670 add_assoc_str(return_value, "http_output_conv_mimetypes",
4671 zend_ini_str("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)
4672 );
4673
4674 name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4675 add_assoc_string(return_value, "mail_charset", name);
4676
4677 name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4678 add_assoc_string(return_value, "mail_header_encoding", name);
4679
4680 name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4681 add_assoc_string(return_value, "mail_body_encoding", name);
4682
4683 add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4684
4685 if (MBSTRG(encoding_translation)) {
4686 add_assoc_string(return_value, "encoding_translation", "On");
4687 } else {
4688 add_assoc_string(return_value, "encoding_translation", "Off");
4689 }
4690
4691 name = (char *)mbfl_no_language2name(MBSTRG(language));
4692 add_assoc_string(return_value, "language", name);
4693
4694 // TODO Seems to always have one entry at least?
4695 n = MBSTRG(current_detect_order_list_size);
4696 entry = MBSTRG(current_detect_order_list);
4697 if (n > 0) {
4698 size_t i;
4699 array_init(&row);
4700 for (i = 0; i < n; i++) {
4701 add_next_index_string(&row, (*entry)->name);
4702 entry++;
4703 }
4704 add_assoc_zval(return_value, "detect_order", &row);
4705 }
4706 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4707 add_assoc_string(return_value, "substitute_character", "none");
4708 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4709 add_assoc_string(return_value, "substitute_character", "long");
4710 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4711 add_assoc_string(return_value, "substitute_character", "entity");
4712 } else {
4713 add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4714 }
4715 if (MBSTRG(strict_detection)) {
4716 add_assoc_string(return_value, "strict_detection", "On");
4717 } else {
4718 add_assoc_string(return_value, "strict_detection", "Off");
4719 }
4720 } else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4721 ZEND_ASSERT(MBSTRG(current_internal_encoding));
4722 RETURN_STRING((char *)MBSTRG(current_internal_encoding)->name);
4723 } else if (zend_string_equals_literal_ci(type, "http_input")) {
4724 if (MBSTRG(http_input_identify)) {
4725 RETURN_STRING((char *)MBSTRG(http_input_identify)->name);
4726 }
4727 RETURN_NULL();
4728 } else if (zend_string_equals_literal_ci(type, "http_output")) {
4729 ZEND_ASSERT(MBSTRG(current_http_output_encoding));
4730 RETURN_STRING((char *)MBSTRG(current_http_output_encoding)->name);
4731 } else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4732 RETURN_STR(
4733 zend_ini_str(
4734 "mbstring.http_output_conv_mimetypes",
4735 sizeof("mbstring.http_output_conv_mimetypes") - 1,
4736 false
4737 )
4738 );
4739 } else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4740 name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4741 RETURN_STRING(name);
4742 } else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4743 name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4744 RETURN_STRING(name);
4745 } else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4746 name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4747 RETURN_STRING(name);
4748 } else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4749 RETURN_LONG(MBSTRG(illegalchars));
4750 } else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4751 if (MBSTRG(encoding_translation)) {
4752 RETURN_STRING("On");
4753 } else {
4754 RETURN_STRING("Off");
4755 }
4756 } else if (zend_string_equals_literal_ci(type, "language")) {
4757 name = (char *)mbfl_no_language2name(MBSTRG(language));
4758 RETURN_STRING(name);
4759 } else if (zend_string_equals_literal_ci(type, "detect_order")) {
4760 // TODO Seems to always have one entry at least?
4761 n = MBSTRG(current_detect_order_list_size);
4762 entry = MBSTRG(current_detect_order_list);
4763 if (n > 0) {
4764 size_t i;
4765 array_init(return_value);
4766 for (i = 0; i < n; i++) {
4767 add_next_index_string(return_value, (*entry)->name);
4768 entry++;
4769 }
4770 }
4771 } else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4772 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4773 RETURN_STRING("none");
4774 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4775 RETURN_STRING("long");
4776 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4777 RETURN_STRING("entity");
4778 } else {
4779 RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
4780 }
4781 } else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4782 if (MBSTRG(strict_detection)) {
4783 RETURN_STRING("On");
4784 } else {
4785 RETURN_STRING("Off");
4786 }
4787 } else {
4788 php_error_docref(NULL, E_WARNING, "argument #1 ($type) must be a valid type");
4789 RETURN_FALSE;
4790 }
4791 }
4792 /* }}} */
4793
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)4794 MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4795 {
4796 uint32_t wchar_buf[128];
4797 unsigned char *in = (unsigned char*)input;
4798 unsigned int state = 0;
4799
4800 if (encoding->check != NULL) {
4801 return encoding->check(in, length);
4802 }
4803
4804 /* If the input string is not encoded in the given encoding, there is a significant chance
4805 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4806 * buffer of 128 codepoints, convert and check just a few codepoints first */
4807 size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4808 ZEND_ASSERT(out_len <= 8);
4809 for (unsigned int i = 0; i < out_len; i++) {
4810 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4811 return false;
4812 }
4813 }
4814
4815 while (length) {
4816 out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4817 ZEND_ASSERT(out_len <= 128);
4818 for (unsigned int i = 0; i < out_len; i++) {
4819 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4820 return false;
4821 }
4822 }
4823 }
4824
4825 return true;
4826 }
4827
4828 /* MSVC 32-bit has issues with 64-bit intrinsics.
4829 * (Bad 7/8-byte UTF-8 strings would be wrongly passed through as 'valid')
4830 * It seems this is caused by a bug in MS Visual C++
4831 * Ref: https://stackoverflow.com/questions/37509129/potential-bug-in-visual-studio-c-compiler-or-in-intel-intrinsics-avx2-mm256-s */
4832 #if defined(PHP_WIN32) && !defined(__clang__) && defined(_MSC_VER) && defined(_M_IX86)
4833 # define MBSTRING_BROKEN_X86_MSVC_INTRINSICS
4834 #endif
4835
4836 /* If we are building an AVX2-only binary, don't compile the next function */
4837 #ifndef ZEND_INTRIN_AVX2_NATIVE
4838
/* SSE2-based function for validating UTF-8 strings
 * A faster implementation which uses AVX2 instructions follows
 *
 * Returns true if the content of `str` passes every check below, and false as soon
 * as one of them fails. The implementation is chosen at compile time:
 *  - With __SSE2__, the string is examined in 16-byte blocks, and a final partial
 *    block is loaded into a zero-padded register and run through the same checks
 *  - Otherwise, a byte-at-a-time validator derived from PCRE2 is used */
static bool mb_fast_check_utf8_default(zend_string *str)
{
	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
# ifdef __SSE2__
	/* `e` points 1 byte past the last full 16-byte block of string content
	 * Note that we include the terminating null byte which is included in each zend_string
	 * as part of the content to check; this ensures that multi-byte characters which are
	 * truncated abruptly at the end of the string will be detected as invalid */
	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m128i) - 1));

	/* For checking for illegal bytes 0xF5-FF */
	const __m128i over_f5 = _mm_set1_epi8(-117);
	/* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
	const __m128i over_9f = _mm_set1_epi8(-97);
	/* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
	const __m128i over_8f = _mm_set1_epi8(-113);
	/* For checking for illegal bytes 0xC0-C1 */
	const __m128i find_c0 = _mm_set1_epi8(-64);
	const __m128i c0_to_c1 = _mm_set1_epi8(-126);
	/* For checking structure of continuation bytes */
	const __m128i find_e0 = _mm_set1_epi8(-32);
	const __m128i find_f0 = _mm_set1_epi8(-16);

	/* The most recently validated non-ASCII block; its trailing bytes are needed to
	 * validate the leading bytes of the next block. It starts out as all zeroes. */
	__m128i last_block = _mm_setzero_si128();
	__m128i operand;

	while (p < e) {
		operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */

		/* The tail-handling `switch` at the bottom of this function also jumps here, after
		 * building `operand` from the final partial block */
check_operand:
		/* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
		if (!_mm_movemask_epi8(operand)) {
			/* Even if this block only contains single-byte characters, there may have been a
			 * multi-byte character at the end of the previous block, which was supposed to
			 * have continuation bytes in this block
			 * This bitmask will pick out a 2/3/4-byte character starting from the last byte of
			 * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
			 * from the 3rd last */
			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
			if (_mm_movemask_epi8(bad)) {
				return false;
			}

			/* Consume as many full blocks of single-byte characters as we can
			 * `last_block` is not updated for the blocks skipped here */
			while (true) {
				p += sizeof(__m128i);
				if (p >= e) {
					goto finish_up_remaining_bytes;
				}
				operand = _mm_loadu_si128((__m128i*)p);
				if (_mm_movemask_epi8(operand)) {
					break;
				}
			}
		}

		/* Check for >= 0xF5, which are illegal byte values in UTF-8
		 * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
		 * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
		 * Then a single signed compare will pick out any bad bytes
		 * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
		__m128i bad = _mm_cmplt_epi8(_mm_add_epi8(operand, over_f5), over_f5);

		/* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
		 * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
		 * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
		 * We can check for both problems at once by generating a vector where each byte < 0xA0
		 * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
		 * `operand2` holds, at each position, the byte which precedes it in the text (the
		 * first position gets the last byte of `last_block`); compare it with the bitmask */
		__m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15));
		__m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f)));
		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask1));

		/* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
		 * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
		 * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
		 * Build the bitmask and compare it with the shifted block */
		__m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f)));
		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask2));

		/* Check for overlong 2-byte code units
		 * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
		 * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
		 * byte range, do a signed compare to pick out any bad bytes */
		bad = _mm_or_si128(bad, _mm_cmplt_epi8(_mm_add_epi8(operand, find_c0), c0_to_c1));

		/* Check structure of continuation bytes
		 * A UTF-8 byte should be a continuation byte if, and only if, it is:
		 * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
		 * 2) 2 bytes after the start of a 3-byte or 4-byte character
		 * 3) 3 bytes after the start of a 4-byte character
		 * We build 3 bitmasks with 0xFF in each such position, and OR them together to
		 * get a single bitmask with 0xFF in each position where a continuation byte should be
		 * `operand3` and `operand4` are built like `operand2`, but look 2 and 3 bytes back */
		__m128i cont_mask = _mm_cmpeq_epi8(_mm_and_si128(operand2, find_c0), find_c0);
		__m128i operand3 = _mm_or_si128(_mm_slli_si128(operand, 2), _mm_srli_si128(last_block, 14));
		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand3, find_e0), find_e0));
		__m128i operand4 = _mm_or_si128(_mm_slli_si128(operand, 3), _mm_srli_si128(last_block, 13));
		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand4, find_f0), find_f0));

		/* Now, use a signed comparison to get another bitmask with 0xFF in each position where
		 * a continuation byte actually is
		 * XOR those two bitmasks together; if everything is good, the result should be zero
		 * However, if a byte which should have been a continuation wasn't, or if a byte which
		 * shouldn't have been a continuation was, we will get 0xFF in that position */
		__m128i continuation = _mm_cmplt_epi8(operand, find_c0);
		bad = _mm_or_si128(bad, _mm_xor_si128(continuation, cont_mask));

		/* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
		 * If that value is non-zero, then we found a bad byte somewhere! */
		if (_mm_movemask_epi8(bad)) {
			return false;
		}

		last_block = operand;
		p += sizeof(__m128i);
	}

finish_up_remaining_bytes:
	/* Finish up 1-15 remaining bytes
	 * `p == e` holds only when the partial block has not been examined yet; after the jump
	 * back to `check_operand` below, `p` has been advanced past `e` and we fall through to
	 * the final `return true` */
	if (p == e) {
		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m128i) - 1); /* Not including terminating null */

		/* We want to use the above vectorized code to check a block of less than 16 bytes,
		 * but there is no good way to read a variable number of bytes into an XMM register
		 *
		 * For 1-4 or 7-8 remaining bytes, a single 2-, 4- or 8-byte scalar read is placed in
		 * the low end of a zeroed register; where that read is one byte longer than
		 * `remaining_bytes`, the extra byte is the terminating null
		 *
		 * Crazy hack here for the other cases (5, 6, and 9 to 14 bytes remaining)...
		 * We know that these bytes are part of a zend_string, and a zend_string has some
		 * 'header' fields which occupy the memory just before its content
		 * And, those header fields occupy more than 16 bytes...
		 * So if we start a 16-byte load a few bytes *before* `p`, such that the load ends
		 * just after the terminating null, we may pick up some 'junk' bytes from the previous
		 * block or from the zend_string header fields, but we will get the bytes we wanted
		 * in the high end of our XMM register, and this will never cause a segfault.
		 * Then, we do a byte-wise right shift to get rid of the unwanted bytes
		 * Conveniently, the same shift also zero-fills the high end of the XMM register
		 *
		 * The following `switch` looks useless, but it's not
		 * The PSRLDQ instruction used for the 128-bit byte shift requires an immediate (literal)
		 * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
		 */
		switch (remaining_bytes) {
		case 0: ;
			/* Only the terminating null is left; just confirm that the previous block did
			 * not end with the start of a multi-byte character */
			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
			return _mm_movemask_epi8(bad) == 0;
		case 1:
		case 2:
			operand = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, *((uint16_t*)p));
			goto check_operand;
		case 3:
		case 4:
			operand = _mm_set_epi32(0, 0, 0, *((uint32_t*)p));
			goto check_operand;
		case 5:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10);
			goto check_operand;
		case 6:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9);
			goto check_operand;
		case 7:
		case 8:
			/* See the definition of MBSTRING_BROKEN_X86_MSVC_INTRINSICS for why the 64-bit
			 * intrinsic is avoided on that compiler */
#ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
			operand = _mm_set_epi32(0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
#else
			operand = _mm_set_epi64x(0, *((uint64_t*)p));
#endif
			goto check_operand;
		case 9:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6);
			goto check_operand;
		case 10:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5);
			goto check_operand;
		case 11:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4);
			goto check_operand;
		case 12:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3);
			goto check_operand;
		case 13:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2);
			goto check_operand;
		case 14:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1);
			goto check_operand;
		case 15:
			/* No trailing bytes are left which need to be checked
			 * We get 15 because we did not include the terminating null when
			 * calculating `remaining_bytes`, so the value wraps around */
			return true;
		}

		ZEND_UNREACHABLE();
	}

	return true;
# else
	/* This UTF-8 validation function is derived from PCRE2 */
	size_t length = ZSTR_LEN(str);
	/* Table of the number of extra bytes, indexed by the first byte masked with
	   0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
	static const uint8_t utf8_table[] = {
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
		3,3,3,3,3,3,3,3
	};

	/* `length` counts the bytes not yet consumed; `p` walks over the string content */
	for (; length > 0; p++) {
		uint32_t d; /* Second byte of the current multi-byte character */
		unsigned char c = *p; /* First byte of the current character */
		length--;

		if (c < 128) {
			/* ASCII character */
			continue;
		}

		if (c < 0xc0) {
			/* Isolated 10xx xxxx byte */
			return false;
		}

		if (c >= 0xf5) {
			/* 0xF5-FF never appear in valid UTF-8; rejecting them here also keeps the
			 * table index below within bounds */
			return false;
		}

		uint32_t ab = utf8_table[c & 0x3f]; /* Number of additional bytes (1-3) */
		if (length < ab) {
			/* Missing bytes */
			return false;
		}
		length -= ab;

		/* Check top bits in the second byte */
		if (((d = *(++p)) & 0xc0) != 0x80) {
			return false;
		}

		/* For each length, check that the remaining bytes start with the 0x80 bit
		 * set and not the 0x40 bit. Then check for an overlong sequence, and for the
		 * excluded range 0xd800 to 0xdfff. */
		switch (ab) {
		case 1:
			/* 2-byte character. No further bytes to check for 0x80. Check first byte
			 * for xx00 000x (overlong sequence). */
			if ((c & 0x3e) == 0) {
				return false;
			}
			break;

		case 2:
			/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes for
			 * 1110 0000, xx0x xxxx (overlong sequence) or 1110 1101, 1010 xxxx (0xd800-0xdfff) */
			if ((*(++p) & 0xc0) != 0x80 || (c == 0xe0 && (d & 0x20) == 0) || (c == 0xed && d >= 0xa0)) {
				return false;
			}
			break;

		case 3:
			/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
			 * bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
			 * character greater than 0x0010ffff (f4 8f bf bf) */
			if ((*(++p) & 0xc0) != 0x80 || (*(++p) & 0xc0) != 0x80 || (c == 0xf0 && (d & 0x30) == 0) || (c > 0xf4 || (c == 0xf4 && d > 0x8f))) {
				return false;
			}
			break;

		EMPTY_SWITCH_DEFAULT_CASE();
		}
	}

	return true;
# endif
}
5115
5116 #endif /* #ifndef ZEND_INTRIN_AVX2_NATIVE */
5117
5118 #ifdef ZEND_INTRIN_AVX2_NATIVE
5119
5120 /* We are building AVX2-only binary */
5121 # include <immintrin.h>
5122 # define mb_fast_check_utf8 mb_fast_check_utf8_avx2
5123
5124 #elif defined(ZEND_INTRIN_AVX2_RESOLVER)
5125
5126 /* We are building binary which works with or without AVX2; whether or not to use
5127 * AVX2-accelerated functions will be determined at runtime */
5128 # include <immintrin.h>
5129 # include "Zend/zend_cpuinfo.h"
5130
5131 # ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
5132 /* Dynamic linker will decide whether or not to use AVX2-based functions and
5133 * resolve symbols accordingly */
5134
5135 ZEND_INTRIN_AVX2_FUNC_DECL(bool mb_fast_check_utf8_avx2(zend_string *str));
5136
5137 bool mb_fast_check_utf8(zend_string *str) __attribute__((ifunc("resolve_check_utf8")));
5138
5139 typedef bool (*check_utf8_func_t)(zend_string*);
5140
5141 ZEND_NO_SANITIZE_ADDRESS
5142 ZEND_ATTRIBUTE_UNUSED
resolve_check_utf8(void)5143 static check_utf8_func_t resolve_check_utf8(void)
5144 {
5145 if (zend_cpu_supports_avx2()) {
5146 return mb_fast_check_utf8_avx2;
5147 }
5148 return mb_fast_check_utf8_default;
5149 }
5150
5151 # else /* ZEND_INTRIN_AVX2_FUNC_PTR */
5152 /* We are compiling for a target where the dynamic linker will not be able to
5153 * resolve symbols according to whether the host supports AVX2 or not; so instead,
5154 * we can make calls go through a function pointer and set the function pointer
5155 * on module load */
5156
5157 #ifdef HAVE_FUNC_ATTRIBUTE_TARGET
5158 static bool mb_fast_check_utf8_avx2(zend_string *str) __attribute__((target("avx2")));
5159 #else
5160 static bool mb_fast_check_utf8_avx2(zend_string *str);
5161 #endif
5162
5163 static bool (*check_utf8_ptr)(zend_string *str) = NULL;
5164
/* Forward the call through `check_utf8_ptr`
 * That pointer starts out NULL and is assigned by init_check_utf8(), so
 * init_check_utf8() has to run before the first call to this function */
static bool mb_fast_check_utf8(zend_string *str)
{
	bool is_valid = (*check_utf8_ptr)(str);
	return is_valid;
}
5169
init_check_utf8(void)5170 static void init_check_utf8(void)
5171 {
5172 if (zend_cpu_supports_avx2()) {
5173 check_utf8_ptr = mb_fast_check_utf8_avx2;
5174 } else {
5175 check_utf8_ptr = mb_fast_check_utf8_default;
5176 }
5177 }
5178 # endif
5179
5180 #else
5181
5182 /* No AVX2 support */
5183 #define mb_fast_check_utf8 mb_fast_check_utf8_default
5184
5185 #endif
5186
5187 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
5188
5189 /* GCC prior to version 8 does not define all intrinsics. See GH-11514.
5190 * Use a workaround from https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values */
5191 #if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 8
5192 # define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
5193 #endif
5194
5195 /* Take (256-bit) `hi` and `lo` as a 512-bit value, shift down by some
5196 * number of bytes, then take the low 256 bits
5197 * This is used to take some number of trailing bytes from the previous 32-byte
5198 * block followed by some number of leading bytes from the current 32-byte block
5199 *
5200 * _mm256_alignr_epi8 (VPALIGNR) is used to shift out bytes from a 256-bit
5201 * YMM register while shifting in bytes from another YMM register... but
5202 * it works separately on respective 128-bit halves of the YMM registers,
5203 * which is not what we want.
 * To make it work as desired, we first do _mm256_permute2x128_si256
 * (VPERM2I128) to combine the high 128 bits from the previous block and
 * the low 128 bits of the current block in one YMM register.
5207 * Then VPALIGNR will do what is needed. */
5208 #define _mm256_shift_epi8(hi, lo, shift) _mm256_alignr_epi8(lo, _mm256_permute2x128_si256(hi, lo, 33), 16 - shift)
5209
5210 /* AVX2-based UTF-8 validation function; validates text in 32-byte chunks
5211 *
 * Some parts of this function are the same as `mb_fast_check_utf8_default`; code
 * comments are not repeated, so consult `mb_fast_check_utf8_default` for information
 * on uncommented sections. */
5215 #ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
mb_fast_check_utf8_avx2(zend_string * str)5216 ZEND_API bool mb_fast_check_utf8_avx2(zend_string *str)
5217 #else
5218 static bool mb_fast_check_utf8_avx2(zend_string *str)
5219 #endif
5220 {
5221 unsigned char *p = (unsigned char*)ZSTR_VAL(str);
5222 unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m256i) - 1));
5223
5224 /* The algorithm used here for UTF-8 validation is partially adapted from the
5225 * paper "Validating UTF-8 In Less Than One Instruction Per Byte", by John Keiser
5226 * and Daniel Lemire.
5227 * Ref: https://arxiv.org/pdf/2010.03090.pdf
5228 *
5229 * Most types of invalid UTF-8 text can be detected by examining pairs of
5230 * successive bytes. Specifically:
5231 *
5232 * • Overlong 2-byte code units start with 0xC0 or 0xC1.
5233 * No valid UTF-8 string ever uses these byte values.
5234 * • Overlong 3-byte code units start with 0xE0, followed by a byte < 0xA0.
5235 * • Overlong 4-byte code units start with 0xF0, followed by a byte < 0x90.
5236 * • 5-byte or 6-byte code units, which should never be used, start with
5237 * 0xF8-FE.
5238 * • A codepoint value higher than U+10FFFF, which is the highest value for
5239 * any Unicode codepoint, would either start with 0xF4, followed by a
5240 * byte >= 0x90, or else would start with 0xF5-F7, followed by any value.
5241 * • A codepoint value from U+D800-DFFF, which are reserved and should never
5242 * be used, would start with 0xED, followed by a byte >= 0xA0.
5243 * • The byte value 0xFF is also illegal and is never used in valid UTF-8.
5244 *
5245 * To detect all these problems, for each pair of successive bytes, we do
5246 * table lookups using the high nibble of the first byte, the low nibble of
5247 * the first byte, and the high nibble of the second byte. Each table lookup
5248 * retrieves a bitmask, in which each 1 bit indicates a possible invalid
5249 * combination; AND those three bitmasks together, and any 1 bit in the result
5250 * will indicate an actual invalid byte combination was found.
5251 */
5252
5253 #define BAD_BYTE 0x1
5254 #define OVERLONG_2BYTE 0x2
5255 #define _1BYTE (BAD_BYTE | OVERLONG_2BYTE)
5256 #define OVERLONG_3BYTE 0x4
5257 #define SURROGATE 0x8
5258 #define OVERLONG_4BYTE 0x10
5259 #define INVALID_CP 0x20
5260
5261 /* Each of these are 16-entry tables, repeated twice; this is required by the
5262 * VPSHUFB instruction which we use to perform 32 table lookups in parallel
5263 * The first entry is for 0xF, the second is for 0xE, and so on down to 0x0
5264 *
5265 * So, for example, notice that the 4th entry in the 1st table is OVERLONG_2BYTE;
5266 * that means that high nibble 0xC is consistent with the byte pair being part of
5267 * an overlong 2-byte code unit */
5268 const __m256i bad_hi_nibble2 = _mm256_set_epi8(
5269 BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5270 0, 0, 0, 0,
5271 0, 0, 0, 0,
5272 0, 0, 0, 0,
5273 BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5274 0, 0, 0, 0,
5275 0, 0, 0, 0,
5276 0, 0, 0, 0);
5277 const __m256i bad_lo_nibble2 = _mm256_set_epi8(
5278 BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5279 BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5280 BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5281 0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5282 BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5283 BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5284 BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5285 0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5286 const __m256i bad_hi_nibble = _mm256_set_epi8(
5287 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5288 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5289 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5290 _1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5291 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5292 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5293 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5294 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5295 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5296 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5297 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5298 _1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5299 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5300 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5301 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5302 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5303
5304 const __m256i find_continuation = _mm256_set1_epi8(-64);
5305 const __m256i _b = _mm256_set1_epi8(0xB);
5306 const __m256i _d = _mm256_set1_epi8(0xD);
5307 const __m256i _f = _mm256_set1_epi8(0xF);
5308
5309 __m256i last_hi_nibbles = _mm256_setzero_si256(), last_lo_nibbles = _mm256_setzero_si256();
5310 __m256i operand;
5311
5312 while (p < e) {
5313 operand = _mm256_loadu_si256((__m256i*)p);
5314
5315 check_operand:
5316 if (!_mm256_movemask_epi8(operand)) {
5317 /* Entire 32-byte block is ASCII characters; the only thing we need to validate is that
5318 * the previous block didn't end with an incomplete multi-byte character
5319 * (This will also confirm that the previous block didn't end with a bad byte like 0xFF) */
5320 __m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5321 __m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5322 if (_mm256_movemask_epi8(bad)) {
5323 return false;
5324 }
5325
5326 /* Consume as many full blocks of single-byte characters as we can */
5327 while (true) {
5328 p += sizeof(__m256i);
5329 if (p >= e) {
5330 goto finish_up_remaining_bytes;
5331 }
5332 operand = _mm256_loadu_si256((__m256i*)p);
5333 if (_mm256_movemask_epi8(operand)) {
5334 break;
5335 }
5336 }
5337 }
5338
5339 __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(operand, 4), _f);
5340 __m256i lo_nibbles = _mm256_and_si256(operand, _f);
5341
5342 __m256i lo_nibbles2 = _mm256_shift_epi8(last_lo_nibbles, lo_nibbles, 1);
5343 __m256i hi_nibbles2 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 1);
5344
5345 /* Do parallel table lookups in all 3 tables */
5346 __m256i bad = _mm256_cmpgt_epi8(
5347 _mm256_and_si256(
5348 _mm256_and_si256(
5349 _mm256_shuffle_epi8(bad_lo_nibble2, lo_nibbles2),
5350 _mm256_shuffle_epi8(bad_hi_nibble2, hi_nibbles2)),
5351 _mm256_shuffle_epi8(bad_hi_nibble, hi_nibbles)),
5352 _mm256_setzero_si256());
5353
5354 __m256i cont_mask = _mm256_cmpgt_epi8(hi_nibbles2, _b);
5355 __m256i hi_nibbles3 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 2);
5356 cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpgt_epi8(hi_nibbles3, _d));
5357 __m256i hi_nibbles4 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 3);
5358 cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpeq_epi8(hi_nibbles4, _f));
5359
5360 __m256i continuation = _mm256_cmpgt_epi8(find_continuation, operand);
5361 bad = _mm256_or_si256(bad, _mm256_xor_si256(continuation, cont_mask));
5362
5363 if (_mm256_movemask_epi8(bad)) {
5364 return false;
5365 }
5366
5367 last_hi_nibbles = hi_nibbles;
5368 last_lo_nibbles = lo_nibbles;
5369 p += sizeof(__m256i);
5370 }
5371
5372 finish_up_remaining_bytes:
5373 if (p == e) {
5374 uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m256i) - 1); /* Not including terminating null */
5375
5376 switch (remaining_bytes) {
5377 case 0: ;
5378 /* No actual data bytes are remaining */
5379 __m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5380 __m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5381 return _mm256_movemask_epi8(bad) == 0;
5382 case 1:
5383 case 2:
5384 operand = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, *((int16_t*)p));
5385 goto check_operand;
5386 case 3:
5387 case 4:
5388 operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, *((int32_t*)p));
5389 goto check_operand;
5390 case 5:
5391 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10));
5392 goto check_operand;
5393 case 6:
5394 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9));
5395 goto check_operand;
5396 case 7:
5397 case 8:
5398 #ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
5399 operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
5400 #else
5401 operand = _mm256_set_epi64x(0, 0, 0, *((int64_t*)p));
5402 #endif
5403 goto check_operand;
5404 case 9:
5405 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6));
5406 goto check_operand;
5407 case 10:
5408 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5));
5409 goto check_operand;
5410 case 11:
5411 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4));
5412 goto check_operand;
5413 case 12:
5414 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3));
5415 goto check_operand;
5416 case 13:
5417 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2));
5418 goto check_operand;
5419 case 14:
5420 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1));
5421 goto check_operand;
5422 case 15:
5423 case 16:
5424 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_loadu_si128((__m128i*)p));
5425 goto check_operand;
5426 case 17:
5427 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 2)), 14), _mm_loadu_si128((__m128i*)p));
5428 goto check_operand;
5429 case 18:
5430 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 3)), 13), _mm_loadu_si128((__m128i*)p));
5431 goto check_operand;
5432 case 19:
5433 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 4)), 12), _mm_loadu_si128((__m128i*)p));
5434 goto check_operand;
5435 case 20:
5436 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 5)), 11), _mm_loadu_si128((__m128i*)p));
5437 goto check_operand;
5438 case 21:
5439 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 6)), 10), _mm_loadu_si128((__m128i*)p));
5440 goto check_operand;
5441 case 22:
5442 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 7)), 9), _mm_loadu_si128((__m128i*)p));
5443 goto check_operand;
5444 case 23:
5445 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 8)), 8), _mm_loadu_si128((__m128i*)p));
5446 goto check_operand;
5447 case 24:
5448 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 9)), 7), _mm_loadu_si128((__m128i*)p));
5449 goto check_operand;
5450 case 25:
5451 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 10)), 6), _mm_loadu_si128((__m128i*)p));
5452 goto check_operand;
5453 case 26:
5454 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 11)), 5), _mm_loadu_si128((__m128i*)p));
5455 goto check_operand;
5456 case 27:
5457 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 12)), 4), _mm_loadu_si128((__m128i*)p));
5458 goto check_operand;
5459 case 28:
5460 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 13)), 3), _mm_loadu_si128((__m128i*)p));
5461 goto check_operand;
5462 case 29:
5463 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 14)), 2), _mm_loadu_si128((__m128i*)p));
5464 goto check_operand;
5465 case 30:
5466 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 15)), 1), _mm_loadu_si128((__m128i*)p));
5467 goto check_operand;
5468 case 31:
5469 return true;
5470 }
5471
5472 ZEND_UNREACHABLE();
5473 }
5474
5475 return true;
5476 }
5477
5478 #endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
5479
/* Report whether `str` is a valid byte sequence in `encoding`.
 *
 * UTF-8 is special-cased: a string already flagged as valid UTF-8 is accepted
 * without rescanning, and a string which passes mb_fast_check_utf8() gets the
 * IS_STR_VALID_UTF8 flag set (unless it is interned) so later checks can take
 * the shortcut. Every other encoding is delegated to php_mb_check_encoding(). */
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
{
	if (encoding != &mbfl_encoding_utf8) {
		return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
	}

	/* Validity was already established earlier; nothing to scan */
	if (ZSTR_IS_VALID_UTF8(str)) {
		return true;
	}

	if (!mb_fast_check_utf8(str)) {
		return false;
	}

	/* Remember the positive result on the string itself; interned strings are left untouched */
	if (!ZSTR_IS_INTERNED(str)) {
		GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
	}
	return true;
}
5495
/* Recursively check that every string key and every string value stored in
 * `vars` (including nested arrays) is valid in `encoding`.
 *
 * Values of type int, float, null and bool are accepted as-is; any other
 * non-string, non-array value makes the whole check fail. If `vars` is already
 * marked as being visited, a warning is raised and false is returned.
 *
 * Returns true only if nothing invalid was found. Iteration stops at the
 * first invalid key or value. */
static bool php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	zend_long idx;
	zend_string *key;
	zval *entry;
	bool valid = true;

	(void)(idx); /* Suppress spurious compiler warning that `idx` is not used */

	if (GC_IS_RECURSIVE(vars)) {
		/* Message text is kept verbatim (including its wording), since it is user-visible output */
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return false;
	}
	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
		ZVAL_DEREF(entry);
		if (key && !mb_check_str_encoding(key, encoding)) {
			valid = false;
			break;
		}
		/* `valid` is still true here, so it can simply be overwritten with the verdict for this value */
		switch (Z_TYPE_P(entry)) {
			case IS_STRING:
				valid = mb_check_str_encoding(Z_STR_P(entry), encoding);
				break;
			case IS_ARRAY:
				valid = php_mb_check_encoding_recursive(Z_ARRVAL_P(entry), encoding);
				break;
			case IS_LONG:
			case IS_DOUBLE:
			case IS_NULL:
			case IS_TRUE:
			case IS_FALSE:
				break;
			default:
				/* Other types are error. */
				valid = false;
				break;
		}
		/* A `break` inside the switch only leaves the switch; leave the loop
		 * explicitly so that no further entries are scanned after a failure */
		if (!valid) {
			break;
		}
	} ZEND_HASH_FOREACH_END();
	/* Always undo the recursion mark, whether or not the loop ended early */
	GC_TRY_UNPROTECT_RECURSION(vars);
	return valid;
}
5546
5547 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)5548 PHP_FUNCTION(mb_check_encoding)
5549 {
5550 zend_string *input_str = NULL, *enc = NULL;
5551 HashTable *input_ht = NULL;
5552 const mbfl_encoding *encoding;
5553
5554 ZEND_PARSE_PARAMETERS_START(0, 2)
5555 Z_PARAM_OPTIONAL
5556 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
5557 Z_PARAM_STR_OR_NULL(enc)
5558 ZEND_PARSE_PARAMETERS_END();
5559
5560 encoding = php_mb_get_encoding(enc, 2);
5561 if (!encoding) {
5562 RETURN_THROWS();
5563 }
5564
5565 if (input_ht) {
5566 RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
5567 } else if (input_str) {
5568 RETURN_BOOL(mb_check_str_encoding(input_str, encoding));
5569 } else {
5570 php_error_docref(NULL, E_DEPRECATED,
5571 "Calling mb_check_encoding() without argument is deprecated");
5572
5573 /* FIXME: Actually check all inputs, except $_FILES file content. */
5574 RETURN_BOOL(MBSTRG(illegalchars) == 0);
5575 }
5576 }
5577 /* }}} */
5578
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)5579 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
5580 const uint32_t enc_name_arg_num)
5581 {
5582 const mbfl_encoding *enc;
5583 enum mbfl_no_encoding no_enc;
5584
5585 ZEND_ASSERT(str_len > 0);
5586
5587 enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
5588 if (!enc) {
5589 return -2;
5590 }
5591
5592 no_enc = enc->no_encoding;
5593 if (php_mb_is_unsupported_no_encoding(no_enc)) {
5594 zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
5595 return -2;
5596 }
5597
5598 /* Some legacy text encodings have a minimum required wchar buffer size;
5599 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
5600 uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
5601 unsigned int state = 0;
5602 size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
5603 ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
5604
5605 if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
5606 return -1;
5607 }
5608 return wchar_buf[0];
5609 }
5610
5611 /* {{{ */
PHP_FUNCTION(mb_ord)5612 PHP_FUNCTION(mb_ord)
5613 {
5614 char *str;
5615 size_t str_len;
5616 zend_string *enc = NULL;
5617 zend_long cp;
5618
5619 ZEND_PARSE_PARAMETERS_START(1, 2)
5620 Z_PARAM_STRING(str, str_len)
5621 Z_PARAM_OPTIONAL
5622 Z_PARAM_STR_OR_NULL(enc)
5623 ZEND_PARSE_PARAMETERS_END();
5624
5625 if (str_len == 0) {
5626 zend_argument_value_error(1, "must not be empty");
5627 RETURN_THROWS();
5628 }
5629
5630 cp = php_mb_ord(str, str_len, enc, 2);
5631
5632 if (0 > cp) {
5633 if (cp == -2) {
5634 RETURN_THROWS();
5635 }
5636 RETURN_FALSE;
5637 }
5638
5639 RETURN_LONG(cp);
5640 }
5641 /* }}} */
5642
/* Build a string holding the single code point `cp` in the encoding named by
 * `enc_name` (`enc_name_arg_num` is forwarded to php_mb_get_encoding()).
 *
 * Returns NULL if the encoding cannot be resolved or is unsupported (a
 * ValueError is raised for the latter), if `cp` lies outside 0..0x10FFFF, if
 * the target is UTF-8 and `cp` is a surrogate, or if the conversion from
 * UCS-4 reported illegal characters. Otherwise returns the new string. */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (enc == NULL) {
		return NULL;
	}

	enum mbfl_no_encoding no_enc = enc->no_encoding;
	if (php_mb_is_unsupported_no_encoding(no_enc)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", enc->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	if (php_mb_is_no_encoding_utf8(no_enc)) {
		/* UTF-8 output is assembled by hand; surrogates D800..DFFF are rejected */
		if (cp >= 0xd800 && cp <= 0xdfff) {
			return NULL;
		}

		if (cp < 0x80) {
			return ZSTR_CHAR(cp);
		}

		zend_string *utf8;
		char *out;
		if (cp < 0x800) {
			/* Two-byte sequence */
			utf8 = zend_string_alloc(2, 0);
			out = ZSTR_VAL(utf8);
			out[0] = 0xc0 | (cp >> 6);
			out[1] = 0x80 | (cp & 0x3f);
			out[2] = 0;
		} else if (cp < 0x10000) {
			/* Three-byte sequence */
			utf8 = zend_string_alloc(3, 0);
			out = ZSTR_VAL(utf8);
			out[0] = 0xe0 | (cp >> 12);
			out[1] = 0x80 | ((cp >> 6) & 0x3f);
			out[2] = 0x80 | (cp & 0x3f);
			out[3] = 0;
		} else {
			/* Four-byte sequence */
			utf8 = zend_string_alloc(4, 0);
			out = ZSTR_VAL(utf8);
			out[0] = 0xf0 | (cp >> 18);
			out[1] = 0x80 | ((cp >> 12) & 0x3f);
			out[2] = 0x80 | ((cp >> 6) & 0x3f);
			out[3] = 0x80 | (cp & 0x3f);
			out[4] = 0;
		}
		return utf8;
	}

	/* Any other encoding: serialize the code point as big-endian UCS-4 and convert */
	char ucs4be[4];
	ucs4be[3] = cp & 0xff;
	ucs4be[2] = (cp >> 8) & 0xff;
	ucs4be[1] = (cp >> 16) & 0xff;
	ucs4be[0] = (cp >> 24) & 0xff;

	/* Zero the global illegal-character counter around the conversion so that
	 * it reflects this conversion only; the previous value is restored below */
	long orig_illegalchars = MBSTRG(illegalchars);
	MBSTRG(illegalchars) = 0;
	zend_string *converted = php_mb_convert_encoding_ex(ucs4be, 4, enc, &mbfl_encoding_ucs4be);

	if (MBSTRG(illegalchars) != 0) {
		/* The conversion reported an illegal character: discard the result */
		zend_string_release(converted);
		converted = NULL;
	}

	MBSTRG(illegalchars) = orig_illegalchars;
	return converted;
}
5712
5713 /* {{{ */
PHP_FUNCTION(mb_chr)5714 PHP_FUNCTION(mb_chr)
5715 {
5716 zend_long cp;
5717 zend_string *enc = NULL;
5718
5719 ZEND_PARSE_PARAMETERS_START(1, 2)
5720 Z_PARAM_LONG(cp)
5721 Z_PARAM_OPTIONAL
5722 Z_PARAM_STR_OR_NULL(enc)
5723 ZEND_PARSE_PARAMETERS_END();
5724
5725 zend_string* ret = php_mb_chr(cp, enc, 2);
5726 if (ret == NULL) {
5727 RETURN_FALSE;
5728 }
5729
5730 RETURN_STR(ret);
5731 }
5732 /* }}} */
5733
PHP_FUNCTION(mb_str_pad)5734 PHP_FUNCTION(mb_str_pad)
5735 {
5736 zend_string *input, *encoding_str = NULL, *pad = ZSTR_CHAR(' ');
5737 zend_long pad_to_length;
5738 zend_long pad_type_val = PHP_STR_PAD_RIGHT;
5739
5740 ZEND_PARSE_PARAMETERS_START(2, 5)
5741 Z_PARAM_STR(input)
5742 Z_PARAM_LONG(pad_to_length)
5743 Z_PARAM_OPTIONAL
5744 Z_PARAM_STR(pad)
5745 Z_PARAM_LONG(pad_type_val)
5746 Z_PARAM_STR_OR_NULL(encoding_str)
5747 ZEND_PARSE_PARAMETERS_END();
5748
5749 const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
5750 if (!encoding) {
5751 RETURN_THROWS();
5752 }
5753
5754 size_t input_length = mb_get_strlen(input, encoding);
5755
5756 /* If resulting string turns out to be shorter than input string,
5757 we simply copy the input and return. */
5758 if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
5759 RETURN_STR_COPY(input);
5760 }
5761
5762 if (ZSTR_LEN(pad) == 0) {
5763 zend_argument_value_error(3, "must be a non-empty string");
5764 RETURN_THROWS();
5765 }
5766
5767 if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
5768 zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
5769 RETURN_THROWS();
5770 }
5771
5772 size_t pad_length = mb_get_strlen(pad, encoding);
5773
5774 size_t num_mb_pad_chars = pad_to_length - input_length;
5775
5776 /* We need to figure out the left/right padding lengths. */
5777 size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
5778 switch (pad_type_val) {
5779 case PHP_STR_PAD_RIGHT:
5780 right_pad = num_mb_pad_chars;
5781 break;
5782
5783 case PHP_STR_PAD_LEFT:
5784 left_pad = num_mb_pad_chars;
5785 break;
5786
5787 case PHP_STR_PAD_BOTH:
5788 left_pad = num_mb_pad_chars / 2;
5789 right_pad = num_mb_pad_chars - left_pad;
5790 break;
5791 }
5792
5793 /* How many full block copies need to happen, and how many characters are then left over? */
5794 size_t full_left_pad_copies = left_pad / pad_length;
5795 size_t full_right_pad_copies = right_pad / pad_length;
5796 size_t remaining_left_pad_chars = left_pad % pad_length;
5797 size_t remaining_right_pad_chars = right_pad % pad_length;
5798
5799 if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
5800 goto overflow_no_release;
5801 }
5802
5803 /* Compute the number of bytes required for the padding */
5804 size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
5805 size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);
5806
5807 /* No special fast-path handling necessary for zero-length pads because these functions will not
5808 * allocate memory in case a zero-length pad is required. */
5809 zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
5810 zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);
5811
5812 if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
5813 || full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
5814 goto overflow;
5815 }
5816
5817 size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
5818 size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);
5819
5820 if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
5821 || ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
5822 goto overflow;
5823 }
5824
5825 zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
5826 char *buffer = ZSTR_VAL(result);
5827
5828 /* First we pad the left. */
5829 for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5830 memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5831 }
5832 memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
5833 buffer += ZSTR_LEN(remaining_left_pad_str);
5834
5835 /* Then we copy the input string. */
5836 memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
5837 buffer += ZSTR_LEN(input);
5838
5839 /* Finally, we pad on the right. */
5840 for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5841 memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5842 }
5843 memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));
5844
5845 ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';
5846
5847 zend_string_release_ex(remaining_left_pad_str, false);
5848 zend_string_release_ex(remaining_right_pad_str, false);
5849
5850 RETURN_NEW_STR(result);
5851
5852 overflow:
5853 zend_string_release_ex(remaining_left_pad_str, false);
5854 zend_string_release_ex(remaining_right_pad_str, false);
5855 overflow_no_release:
5856 zend_throw_error(NULL, "String size overflow");
5857 RETURN_THROWS();
5858 }
5859
5860 /* {{{ */
PHP_FUNCTION(mb_scrub)5861 PHP_FUNCTION(mb_scrub)
5862 {
5863 zend_string *str, *enc_name = NULL;
5864
5865 ZEND_PARSE_PARAMETERS_START(1, 2)
5866 Z_PARAM_STR(str)
5867 Z_PARAM_OPTIONAL
5868 Z_PARAM_STR_OR_NULL(enc_name)
5869 ZEND_PARSE_PARAMETERS_END();
5870
5871 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
5872 if (!enc) {
5873 RETURN_THROWS();
5874 }
5875
5876 if (enc == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
5877 /* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
5878 RETURN_STR_COPY(str);
5879 }
5880
5881 RETURN_STR(php_mb_convert_encoding_ex(ZSTR_VAL(str), ZSTR_LEN(str), enc, enc));
5882 }
5883 /* }}} */
5884
5885 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)5886 static void php_mb_populate_current_detect_order_list(void)
5887 {
5888 const mbfl_encoding **entry = 0;
5889 size_t nentries;
5890
5891 if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
5892 nentries = MBSTRG(detect_order_list_size);
5893 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5894 memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
5895 } else {
5896 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
5897 size_t i;
5898 nentries = MBSTRG(default_detect_order_list_size);
5899 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5900 for (i = 0; i < nentries; i++) {
5901 entry[i] = mbfl_no2encoding(src[i]);
5902 }
5903 }
5904 MBSTRG(current_detect_order_list) = entry;
5905 MBSTRG(current_detect_order_list_size) = nentries;
5906 }
5907 /* }}} */
5908
5909 /* {{{ static int php_mb_encoding_translation() */
php_mb_encoding_translation(void)5910 static int php_mb_encoding_translation(void)
5911 {
5912 return MBSTRG(encoding_translation);
5913 }
5914 /* }}} */
5915
php_mb_mbchar_bytes(const char * s,const mbfl_encoding * enc)5916 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
5917 {
5918 if (enc) {
5919 if (enc->mblen_table) {
5920 if (s) {
5921 return enc->mblen_table[*(unsigned char *)s];
5922 }
5923 } else if (enc->flag & MBFL_ENCTYPE_WCS2) {
5924 return 2;
5925 } else if (enc->flag & MBFL_ENCTYPE_WCS4) {
5926 return 4;
5927 }
5928 }
5929 return 1;
5930 }
5931
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)5932 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
5933 {
5934 const char *p = s;
5935 char *last=NULL;
5936
5937 if (nbytes == (size_t)-1) {
5938 size_t nb = 0;
5939
5940 while (*p != '\0') {
5941 if (nb == 0) {
5942 if ((unsigned char)*p == (unsigned char)c) {
5943 last = (char *)p;
5944 }
5945 nb = php_mb_mbchar_bytes(p, enc);
5946 if (nb == 0) {
5947 return NULL; /* something is going wrong! */
5948 }
5949 }
5950 --nb;
5951 ++p;
5952 }
5953 } else {
5954 size_t bcnt = nbytes;
5955 size_t nbytes_char;
5956 while (bcnt > 0) {
5957 if ((unsigned char)*p == (unsigned char)c) {
5958 last = (char *)p;
5959 }
5960 nbytes_char = php_mb_mbchar_bytes(p, enc);
5961 if (bcnt < nbytes_char) {
5962 return NULL;
5963 }
5964 p += nbytes_char;
5965 bcnt -= nbytes_char;
5966 }
5967 }
5968 return last;
5969 }
5970
php_mb_stripos(bool mode,zend_string * haystack,zend_string * needle,zend_long offset,const mbfl_encoding * enc)5971 MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc)
5972 {
5973 /* We're using simple case-folding here, because we'd have to deal with remapping of
5974 * offsets otherwise. */
5975 zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
5976 zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
5977
5978 size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
5979
5980 zend_string_free(haystack_conv);
5981 zend_string_free(needle_conv);
5982
5983 return n;
5984 }
5985
php_mb_gpc_get_detect_order(const zend_encoding *** list,size_t * list_size)5986 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
5987 {
5988 *list = (const zend_encoding **)MBSTRG(http_input_list);
5989 *list_size = MBSTRG(http_input_list_size);
5990 }
5991 /* }}} */
5992
php_mb_gpc_set_input_encoding(const zend_encoding * encoding)5993 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
5994 {
5995 MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
5996 }
5997 /* }}} */
5998
/* Base64 alphabet, indexed by 6-bit value: 'A'-'Z', 'a'-'z', '0'-'9', '+', '/'.
 * Entries are written as explicit byte values, eight per row, so row N starts
 * at index N*8. A trailing zero byte follows the 64 alphabet entries. */
static const unsigned char base64_table[] = {
	/*  0- 7: 'A'..'H' */ 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
	/*  8-15: 'I'..'P' */ 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
	/* 16-23: 'Q'..'X' */ 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
	/* 24-31: 'Y','Z','a'..'f' */ 0x59, 0x5a, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66,
	/* 32-39: 'g'..'n' */ 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e,
	/* 40-47: 'o'..'v' */ 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
	/* 48-55: 'w'..'z','0'..'3' */ 0x77, 0x78, 0x79, 0x7a, 0x30, 0x31, 0x32, 0x33,
	/* 56-63: '4'..'9','+','/' */ 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2b, 0x2f,
	/* 64: terminator */ 0x00
};
6011
/* Compute how many bytes the current contents of `tmpbuf` will occupy once
 * transfer-encoded: as Base64 when `base64` is true, otherwise as
 * quoted-printable ('Q' encoding). */
static size_t transfer_encoded_size(mb_convert_buf *tmpbuf, bool base64)
{
	if (base64) {
		/* Every group of up to 3 input bytes becomes 4 output characters */
		return ((mb_convert_buf_len(tmpbuf) + 2) / 3) * 4;
	}

	size_t total = 0;
	for (unsigned char *cur = (unsigned char*)ZSTR_VAL(tmpbuf->str); cur < tmpbuf->out; cur++) {
		unsigned char c = *cur;
		/* Escaped bytes take 3 characters ("=XX"); all others are emitted as-is */
		if (c > 0x7F || c == '=' || mime_char_needs_qencode[c]) {
			total += 3;
		} else {
			total += 1;
		}
	}
	return total;
}
6026
/* Transfer-encode all bytes currently held in `tmpbuf` and append the result to
 * `outbuf`: Base64 when `base64` is true, quoted-printable ('Q' encoding)
 * otherwise. `tmpbuf` is reset afterwards so it can be reused. */
static void transfer_encode_mime_bytes(mb_convert_buf *tmpbuf, mb_convert_buf *outbuf, bool base64)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(outbuf, out, limit);
	unsigned char *src = (unsigned char*)ZSTR_VAL(tmpbuf->str);
	unsigned char *src_end = tmpbuf->out;

	if (base64) {
		/* Reserve room for the whole encoded output up front */
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, ((src_end - src) + 2) / 3 * 4);

		/* Full 3-byte groups map onto four 6-bit indexes */
		for (; (src_end - src) >= 3; src += 3) {
			uint32_t bits = (src[0] << 16) | (src[1] << 8) | src[2];
			out = mb_convert_buf_add4(out,
				base64_table[(bits >> 18) & 0x3F],
				base64_table[(bits >> 12) & 0x3F],
				base64_table[(bits >> 6) & 0x3F],
				base64_table[bits & 0x3F]);
		}

		/* A trailing partial group of 1 or 2 bytes is padded with '=' */
		if ((src_end - src) == 1) {
			uint32_t bits = src[0];
			out = mb_convert_buf_add4(out, base64_table[(bits >> 2) & 0x3F], base64_table[(bits & 0x3) << 4], '=', '=');
		} else if ((src_end - src) == 2) {
			uint32_t bits = (src[0] << 8) | src[1];
			out = mb_convert_buf_add4(out, base64_table[(bits >> 10) & 0x3F], base64_table[(bits >> 4) & 0x3F], base64_table[(bits & 0xF) << 2], '=');
		}
	} else {
		static const char hex_digits[] = "0123456789ABCDEF";

		/* Worst case: every byte is escaped as three characters */
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, (src_end - src) * 3);

		for (; src < src_end; src++) {
			unsigned char c = *src;
			if (c > 0x7F || c == '=' || mime_char_needs_qencode[c]) {
				/* Escape as '=' followed by two uppercase hex digits */
				out = mb_convert_buf_add3(out, '=', hex_digits[(c >> 4) & 0xF], hex_digits[c & 0xF]);
			} else {
				out = mb_convert_buf_add(out, c);
			}
		}
	}

	mb_convert_buf_reset(tmpbuf, 0);
	MB_CONVERT_BUF_STORE(outbuf, out, limit);
}
6072
mb_mime_header_encode(zend_string * input,const mbfl_encoding * incode,const mbfl_encoding * outcode,bool base64,char * linefeed,size_t linefeed_len,zend_long indent)6073 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent)
6074 {
6075 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
6076 size_t in_len = ZSTR_LEN(input);
6077
6078 if (!in_len) {
6079 return zend_empty_string;
6080 }
6081
6082 if (indent < 0 || indent >= 74) {
6083 indent = 0;
6084 }
6085
6086 if (linefeed_len > 8) {
6087 linefeed_len = 8;
6088 }
6089 /* Maintain legacy behavior as regards embedded NUL (zero) bytes in linefeed string */
6090 for (size_t i = 0; i < linefeed_len; i++) {
6091 if (linefeed[i] == '\0') {
6092 linefeed_len = i;
6093 break;
6094 }
6095 }
6096
6097 unsigned int state = 0;
6098 /* wchar_buf should be big enough that when it is full, we definitely have enough
6099 * wchars to fill an entire line of output */
6100 uint32_t wchar_buf[80];
6101 uint32_t *p, *e;
6102 /* What part of wchar_buf is filled with still-unprocessed data which should not
6103 * be overwritten? */
6104 unsigned int offset = 0;
6105 size_t line_start = 0;
6106
6107 /* If the entire input string is ASCII with no spaces (except possibly leading
6108 * spaces), just pass it through unchanged */
6109 bool checking_leading_spaces = true;
6110 while (in_len) {
6111 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf, 80, &state);
6112 p = wchar_buf;
6113 e = wchar_buf + out_len;
6114
6115 while (p < e) {
6116 uint32_t w = *p++;
6117 if (checking_leading_spaces) {
6118 if (w == ' ') {
6119 continue;
6120 } else {
6121 checking_leading_spaces = false;
6122 }
6123 }
6124 if (w < 0x21 || w > 0x7E || w == '=' || w == '?' || w == '_') {
6125 /* We cannot simply pass input string through unchanged; start again */
6126 in = (unsigned char*)ZSTR_VAL(input);
6127 in_len = ZSTR_LEN(input);
6128 goto no_passthrough;
6129 }
6130 }
6131 }
6132
6133 return zend_string_copy(input); /* This just increments refcount */
6134
6135 no_passthrough: ;
6136
6137 mb_convert_buf buf;
6138 mb_convert_buf_init(&buf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6139
6140 /* Encode some prefix of the input string as plain ASCII if possible
6141 * If we find it necessary to switch to Base64/QPrint encoding, we will
6142 * do so all the way to the end of the string */
6143 while (in_len) {
6144 /* Decode part of the input string, refill wchar_buf */
6145 ZEND_ASSERT(offset < 80);
6146 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, 80 - offset, &state);
6147 ZEND_ASSERT(out_len <= 80 - offset);
6148 p = wchar_buf;
6149 e = wchar_buf + offset + out_len;
6150 /* ASCII output is broken into space-delimited 'words'
6151 * If we find a non-ASCII character in the middle of a word, we will
6152 * transfer-encode the entire word */
6153 uint32_t *word_start = p;
6154
6155 /* Don't consider adding line feed for spaces at the beginning of a word */
6156 while (p < e && *p == ' ' && (p - word_start) <= 74) {
6157 p++;
6158 }
6159
6160 while (p < e) {
6161 uint32_t w = *p++;
6162
6163 if (w < 0x20 || w > 0x7E || w == '?' || w == '=' || w == '_' || (w == ' ' && (p - word_start) > 74)) {
6164 /* Non-ASCII character (or line too long); switch to Base64/QPrint encoding
6165 * If we are already too far along on a line to include Base64/QPrint encoded data
6166 * on the same line (without overrunning max line length), then add a line feed
6167 * right now */
6168 if (mb_convert_buf_len(&buf) - line_start + indent + strlen(outcode->mime_name) > 55) {
6169 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6170 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6171 buf.out = mb_convert_buf_add(buf.out, ' ');
6172 indent = 0;
6173 line_start = mb_convert_buf_len(&buf);
6174 } else if (mb_convert_buf_len(&buf) > 0) {
6175 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 1);
6176 buf.out = mb_convert_buf_add(buf.out, ' ');
6177 }
6178 p = word_start; /* Back up to where MIME encoding of input chars should start */
6179 goto mime_encoding_needed;
6180 } else if (w == ' ') {
6181 /* When we see a space, check whether we should insert a line break */
6182 if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 75) {
6183 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6184 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6185 buf.out = mb_convert_buf_add(buf.out, ' ');
6186 indent = 0;
6187 line_start = mb_convert_buf_len(&buf);
6188 } else if (mb_convert_buf_len(&buf) > 0) {
6189 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6190 buf.out = mb_convert_buf_add(buf.out, ' ');
6191 }
6192 /* Output one (space-delimited) word as plain ASCII */
6193 while (word_start < p-1) {
6194 buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6195 }
6196 word_start++;
6197 while (p < e && *p == ' ') {
6198 p++;
6199 }
6200 }
6201 }
6202
6203 if (in_len) {
6204 /* Copy chars which are part of an incomplete 'word' to the beginning
6205 * of wchar_buf and reprocess them on the next iteration */
6206 offset = e - word_start;
6207 if (offset) {
6208 memmove(wchar_buf, word_start, offset * sizeof(uint32_t));
6209 }
6210 } else {
6211 /* We have reached the end of the input string while still in 'ASCII mode';
6212 * process any trailing ASCII chars which were not followed by a space */
6213 if (word_start < e && mb_convert_buf_len(&buf) > 0) {
6214 /* The whole input string was not just one big ASCII 'word' with no spaces
6215 * consider adding a line feed if necessary to prevent output lines from
6216 * being too long */
6217 if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 74) {
6218 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6219 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6220 buf.out = mb_convert_buf_add(buf.out, ' ');
6221 } else {
6222 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6223 buf.out = mb_convert_buf_add(buf.out, ' ');
6224 }
6225 }
6226 while (word_start < e) {
6227 buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6228 }
6229 }
6230 }
6231
6232 /* Ensure output string is marked as valid UTF-8 (ASCII strings are always 'valid UTF-8') */
6233 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6234
6235 mime_encoding_needed: ;
6236
6237 /* We will generate the output line by line, first converting wchars to bytes
6238 * in the requested output encoding, then transfer-encoding those bytes as
6239 * Base64 or QPrint
6240 * 'tmpbuf' will receive the bytes which need to be transfer-encoded before
6241 * sending them to 'buf' */
6242 mb_convert_buf tmpbuf;
6243 mb_convert_buf_init(&tmpbuf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6244
6245 /* Do we need to refill wchar_buf to make sure we don't run out of wchars
6246 * in the middle of a line? */
6247 if (p == wchar_buf) {
6248 goto start_new_line;
6249 }
6250 offset = e - p;
6251 memmove(wchar_buf, p, offset * sizeof(uint32_t));
6252
6253 while(true) {
6254 refill_wchar_buf: ;
6255 ZEND_ASSERT(offset < 80);
6256 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, 80 - offset, &state);
6257 ZEND_ASSERT(out_len <= 80 - offset);
6258 p = wchar_buf;
6259 e = wchar_buf + offset + out_len;
6260
6261 start_new_line: ;
6262 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, strlen(outcode->mime_name) + 5);
6263 buf.out = mb_convert_buf_add2(buf.out, '=', '?');
6264 buf.out = mb_convert_buf_appends(buf.out, outcode->mime_name);
6265 buf.out = mb_convert_buf_add3(buf.out, '?', base64 ? 'B' : 'Q', '?');
6266
6267 /* How many wchars should we try converting to Base64/QPrint-encoded bytes?
6268 * We do something like a 'binary search' to find the greatest number which
6269 * can be included on this line without exceeding max line length */
6270 unsigned int n = 12;
6271 size_t space_available = 73 - indent - (mb_convert_buf_len(&buf) - line_start);
6272
6273 while (true) {
6274 ZEND_ASSERT(p < e);
6275
6276 /* Remember where we were in process of generating output, so we can back
6277 * up if necessary */
6278 size_t tmppos = mb_convert_buf_len(&tmpbuf);
6279 unsigned int tmpstate = tmpbuf.state;
6280
6281 /* Try encoding 'n' wchars in output text encoding and sending output
6282 * bytes to 'tmpbuf'. Hopefully this is not too many to fit on the
6283 * current line. */
6284 n = MIN(n, e - p);
6285 outcode->from_wchar(p, n, &tmpbuf, false);
6286
6287 /* For some output text encodings, there may be a few ending bytes
6288 * which need to be emitted to output before we break a line.
6289 * Again, remember where we were so we can back up */
6290 size_t tmppos2 = mb_convert_buf_len(&tmpbuf);
6291 unsigned int tmpstate2 = tmpbuf.state;
6292 outcode->from_wchar(NULL, 0, &tmpbuf, true);
6293
6294 if (transfer_encoded_size(&tmpbuf, base64) <= space_available || (n == 1 && tmppos == 0)) {
6295 /* If we convert 'n' more wchars on the current line, it will not
6296 * overflow the maximum line length */
6297 p += n;
6298
6299 if (p == e) {
6300 /* We are done; we shouldn't reach here if there is more remaining
6301 * of the input string which needs to be processed */
6302 ZEND_ASSERT(!in_len);
6303 transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6304 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 2);
6305 buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6306 mb_convert_buf_free(&tmpbuf);
6307 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6308 } else {
6309 /* It's possible that more chars might fit on the current line,
6310 * so back up to where we were before emitting any ending bytes */
6311 mb_convert_buf_reset(&tmpbuf, tmppos2);
6312 tmpbuf.state = tmpstate2;
6313 }
6314 } else {
6315 /* Converting 'n' more wchars on this line would be too much.
6316 * Back up to where we were before we tried that. */
6317 mb_convert_buf_reset(&tmpbuf, tmppos);
6318 tmpbuf.state = tmpstate;
6319
6320 if (n == 1) {
6321 /* We have found the exact number of chars which will fit on the
6322 * current line. Finish up and move to a new line. */
6323 outcode->from_wchar(NULL, 0, &tmpbuf, true);
6324 transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6325 tmpbuf.state = 0;
6326
6327 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 3 + linefeed_len);
6328 buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6329
6330 indent = 0; /* Indent argument must only affect the first line */
6331
6332 if (in_len) {
6333 /* We still have more of input string remaining to decode */
6334 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6335 buf.out = mb_convert_buf_add(buf.out, ' ');
6336 line_start = mb_convert_buf_len(&buf);
6337 /* Copy remaining wchars to beginning of buffer so they will be
6338 * processed on the next iteration of outer 'do' loop */
6339 offset = e - p;
6340 memmove(wchar_buf, p, offset * sizeof(uint32_t));
6341 goto refill_wchar_buf;
6342 } else if (p < e) {
6343 /* Input string is finished, but we still have trailing wchars
6344 * remaining to be processed in wchar_buf */
6345 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6346 buf.out = mb_convert_buf_add(buf.out, ' ');
6347 line_start = mb_convert_buf_len(&buf);
6348 goto start_new_line;
6349 } else {
6350 /* We are done! */
6351 mb_convert_buf_free(&tmpbuf);
6352 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6353 }
6354 } else {
6355 /* Try a smaller number of wchars */
6356 n = MAX(n >> 1, 1);
6357 }
6358 }
6359 }
6360 }
6361 }
6362
PHP_FUNCTION(mb_encode_mimeheader)6363 PHP_FUNCTION(mb_encode_mimeheader)
6364 {
6365 const mbfl_encoding *charset = &mbfl_encoding_pass;
6366 zend_string *str, *charset_name = NULL, *transenc_name = NULL;
6367 char *linefeed = "\r\n";
6368 size_t linefeed_len = 2;
6369 zend_long indent = 0;
6370 bool base64 = true;
6371
6372 ZEND_PARSE_PARAMETERS_START(1, 5)
6373 Z_PARAM_STR(str)
6374 Z_PARAM_OPTIONAL
6375 Z_PARAM_STR(charset_name)
6376 Z_PARAM_STR(transenc_name)
6377 Z_PARAM_STRING(linefeed, linefeed_len)
6378 Z_PARAM_LONG(indent)
6379 ZEND_PARSE_PARAMETERS_END();
6380
6381 if (charset_name != NULL) {
6382 charset = php_mb_get_encoding(charset_name, 2);
6383 if (!charset) {
6384 RETURN_THROWS();
6385 } else if (charset->mime_name == NULL || charset->mime_name[0] == '\0') {
6386 zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
6387 RETURN_THROWS();
6388 }
6389 } else {
6390 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
6391 if (lang != NULL) {
6392 charset = mbfl_no2encoding(lang->mail_charset);
6393 const mbfl_encoding *transenc = mbfl_no2encoding(lang->mail_header_encoding);
6394 char t = transenc->name[0];
6395 if (t == 'Q' || t == 'q') {
6396 base64 = false;
6397 }
6398 }
6399 }
6400
6401 if (transenc_name != NULL && ZSTR_LEN(transenc_name) > 0) {
6402 char t = ZSTR_VAL(transenc_name)[0];
6403 if (t == 'Q' || t == 'q') {
6404 base64 = false;
6405 }
6406 }
6407
6408 RETURN_STR(mb_mime_header_encode(str, MBSTRG(current_internal_encoding), charset, base64, linefeed, linefeed_len, indent));
6409 }
6410
/* Map one character of the Base64 alphabet (A-Z, a-z, 0-9, '+', '/') to its
 * 6-bit value in the range 0..63. Any other byte yields -1. */
static int8_t decode_base64(unsigned char c)
{
	switch (c) {
	case '+':
		return 62;
	case '/':
		return 63;
	default:
		break;
	}

	if (c >= '0' && c <= '9') {
		return 52 + (c - '0');
	}
	if (c >= 'a' && c <= 'z') {
		return 26 + (c - 'a');
	}
	if (c >= 'A' && c <= 'Z') {
		return c - 'A';
	}

	/* Not part of the Base64 alphabet */
	return -1;
}
6426
/* Lookup table for Quoted-Printable decoding, indexed by byte value.
 * ASCII hex digits ('0'-'9', 'A'-'F', 'a'-'f') map to their numeric value
 * 0..15; every other byte maps to -1.
 * The table has one entry for each possible `unsigned char` value, so it can
 * be indexed by an input byte without a range check. It is only ever read,
 * hence `const`. */
static const int8_t qprint_map[] = {
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
6445
6446 /* Decode MIME encoded word as defined in RFC 2047 */
mime_header_decode_encoded_word(unsigned char * p,unsigned char * e,const mbfl_encoding * outcode,mb_convert_buf * outbuf,unsigned int * state)6447 static unsigned char* mime_header_decode_encoded_word(unsigned char *p, unsigned char *e, const mbfl_encoding *outcode, mb_convert_buf *outbuf, unsigned int *state)
6448 {
6449 if ((e - p) < 6) {
6450 return NULL;
6451 }
6452
6453 ZEND_ASSERT(p[0] == '=');
6454 ZEND_ASSERT(p[1] == '?');
6455 p += 2;
6456
6457 unsigned char *charset = p;
6458 unsigned char *charset_end = memchr(charset, '?', e - charset);
6459 if (charset_end == NULL) {
6460 return NULL;
6461 }
6462
6463 unsigned char *encoding = charset_end + 1;
6464 p = encoding + 1;
6465 if (p >= e || *p++ != '?') {
6466 return NULL;
6467 }
6468
6469 char *charset_name = estrndup((const char*)charset, charset_end - charset);
6470 const mbfl_encoding *incode = mbfl_name2encoding(charset_name);
6471 efree(charset_name);
6472 if (incode == NULL) {
6473 return NULL;
6474 }
6475
6476 unsigned char *end_marker = (unsigned char*)zend_memnstr((const char*)p, "?=", 2, (const char*)e);
6477 if (end_marker) {
6478 e = end_marker;
6479 } else if (p < e && *(e-1) == '?') {
6480 /* If encoded word is not properly terminated, but last byte is '?',
6481 * take that as a terminator (legacy behavior) */
6482 e--;
6483 }
6484
6485 unsigned char *buf = emalloc(e - p), *bufp = buf;
6486 if (*encoding == 'Q' || *encoding == 'q') {
6487 /* Fill `buf` with bytes from decoding QPrint */
6488 while (p < e) {
6489 unsigned char c = *p++;
6490 if (c == '_') {
6491 *bufp++ = ' ';
6492 continue;
6493 } else if (c == '=' && (e - p) >= 2) {
6494 unsigned char c2 = *p++;
6495 unsigned char c3 = *p++;
6496 if (qprint_map[c2] >= 0 && qprint_map[c3] >= 0) {
6497 *bufp++ = (qprint_map[c2] << 4) | (qprint_map[c3] & 0xF);
6498 continue;
6499 } else if (c2 == '\r') {
6500 if (c3 != '\n') {
6501 p--;
6502 }
6503 continue;
6504 } else if (c2 == '\n') {
6505 p--;
6506 continue;
6507 }
6508 }
6509 *bufp++ = c;
6510 }
6511 } else if (*encoding == 'B' || *encoding == 'b') {
6512 /* Fill `buf` with bytes from decoding Base64 */
6513 unsigned int bits = 0, cache = 0;
6514 while (p < e) {
6515 unsigned char c = *p++;
6516 if (c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '=') {
6517 continue;
6518 }
6519 int8_t decoded = decode_base64(c);
6520 if (decoded == -1) {
6521 *bufp++ = '?';
6522 continue;
6523 }
6524 bits += 6;
6525 cache = (cache << 6) | (decoded & 0x3F);
6526 if (bits == 24) {
6527 *bufp++ = (cache >> 16) & 0xFF;
6528 *bufp++ = (cache >> 8) & 0xFF;
6529 *bufp++ = cache & 0xFF;
6530 bits = cache = 0;
6531 }
6532 }
6533 if (bits == 18) {
6534 *bufp++ = (cache >> 10) & 0xFF;
6535 *bufp++ = (cache >> 2) & 0xFF;
6536 } else if (bits == 12) {
6537 *bufp++ = (cache >> 4) & 0xFF;
6538 }
6539 } else {
6540 efree(buf);
6541 return NULL;
6542 }
6543
6544 size_t in_len = bufp - buf;
6545 uint32_t wchar_buf[128];
6546
6547 bufp = buf;
6548 while (in_len) {
6549 size_t out_len = incode->to_wchar(&bufp, &in_len, wchar_buf, 128, state);
6550 ZEND_ASSERT(out_len <= 128);
6551 outcode->from_wchar(wchar_buf, out_len, outbuf, false);
6552 }
6553
6554 efree(buf);
6555 return e + 2;
6556 }
6557
/* Decode a MIME header value which may contain RFC 2047 encoded words
 *
 * `input` is scanned from start to end:
 * - Anything which looks like an encoded word is passed to
 *   mime_header_decode_encoded_word; if that succeeds, the decoded text has
 *   been appended to the output in `outcode`.
 * - Everything else is treated as ASCII and converted to `outcode`.
 * - A run of whitespace which starts with CR or LF is collapsed into a
 *   single space, except at the very end of the input, and except between
 *   two adjacent encoded words, where it is dropped entirely.
 *
 * Returns a new string in encoding `outcode`. */
static zend_string* mb_mime_header_decode(zend_string *input, const mbfl_encoding *outcode)
{
	unsigned char *p = (unsigned char*)ZSTR_VAL(input), *e = p + ZSTR_LEN(input);
	unsigned int state = 0;
	/* True if we skipped folding whitespace after an encoded word, and don't
	 * yet know whether it needs to be output as a space */
	bool space_pending = false;

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, ZSTR_LEN(input), '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	while (p < e) {
		unsigned char c = *p;

		/* Check remaining length first, so we never look at `p[1]` unless it
		 * is part of the input */
		if (c == '=' && (e - p) >= 6 && *(p + 1) == '?') {
			/* Does this look like a MIME encoded word? If so, try to decode it as one */
			unsigned char *incode_end = memchr(p + 2, '?', e - p - 2);
			if (incode_end && (e - incode_end) >= 3) {
				unsigned char *temp = mime_header_decode_encoded_word(p, e, outcode, &buf, &state);
				if (temp) {
					p = temp;
					/* Any whitespace pending from before this encoded word
					 * separated two adjacent encoded words; per RFC 2047 it
					 * must be ignored, so don't carry it over to whatever
					 * follows this word */
					space_pending = false;
					/* Decoding of MIME encoded word was successful;
					 * Try to collapse a run of whitespace */
					if (p < e && (*p == '\n' || *p == '\r')) {
						do {
							p++;
						} while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
						/* We will only actually output a space if this is not immediately followed
						 * by another valid encoded word */
						space_pending = true;
					}
					continue;
				}
			}
		}

		/* What follows is not a valid encoded word, so whitespace which was
		 * skipped after the previous encoded word does become a space */
		if (space_pending) {
			uint32_t space = ' ';
			outcode->from_wchar(&space, 1, &buf, false);
			space_pending = false;
		}

		/* Consume a run of plain ASCII characters */
		if (c != '\n' && c != '\r') {
			/* The run always includes the byte at `p` (even if it is an '='
			 * which did not start a valid encoded word), so each iteration
			 * of the outer loop makes progress */
			unsigned char *end = p + 1;
			while (end < e && (*end != '=' && *end != '\n' && *end != '\r')) {
				end++;
			}
			uint32_t wchar_buf[128];
			size_t in_len = end - p;
			while (in_len) {
				size_t out_len = mbfl_encoding_ascii.to_wchar(&p, &in_len, wchar_buf, 128, &state);
				ZEND_ASSERT(out_len <= 128);
				outcode->from_wchar(wchar_buf, out_len, &buf, false);
			}
		}
		/* Collapse a run of whitespace into a single space */
		if (p < e && (*p == '\n' || *p == '\r')) {
			do {
				p++;
			} while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
			if (p < e) {
				/* Emulating legacy behavior of mb_decode_mimeheader here;
				 * a run of whitespace is not converted to a space at the very
				 * end of the input string */
				uint32_t space = ' ';
				outcode->from_wchar(&space, 1, &buf, false);
			}
		}
	}

	/* Let the output encoding emit any ending bytes which it requires */
	outcode->from_wchar(NULL, 0, &buf, true);

	return mb_convert_buf_result(&buf, outcode);
}
6631
PHP_FUNCTION(mb_decode_mimeheader)6632 PHP_FUNCTION(mb_decode_mimeheader)
6633 {
6634 zend_string *str;
6635
6636 ZEND_PARSE_PARAMETERS_START(1, 1)
6637 Z_PARAM_STR(str)
6638 ZEND_PARSE_PARAMETERS_END();
6639
6640 RETURN_STR(mb_mime_header_decode(str, MBSTRG(current_internal_encoding)));
6641 }
6642