1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
14 | Rui Hirokawa <hirokawa@php.net> |
15 | Hironori Sato <satoh@jpnnet.com> |
16 | Shigeru Kanemoto <sgk@happysize.co.jp> |
17 +----------------------------------------------------------------------+
18 */
19
20 /* {{{ includes */
21 #include <limits.h>
22
23 #include "libmbfl/config.h"
24 #include "php.h"
25 #include "php_ini.h"
26 #include "php_variables.h"
27 #include "mbstring.h"
28 #include "ext/standard/php_string.h"
29 #include "ext/standard/php_mail.h"
30 #include "ext/standard/exec.h"
31 #include "main/php_output.h"
32 #include "ext/standard/info.h"
33 #include "ext/pcre/php_pcre.h"
34
35 #include "libmbfl/mbfl/mbfilter_8bit.h"
36 #include "libmbfl/mbfl/mbfilter_pass.h"
37 #include "libmbfl/mbfl/eaw_table.h"
38 #include "libmbfl/filters/mbfilter_base64.h"
39 #include "libmbfl/filters/mbfilter_cjk.h"
40 #include "libmbfl/filters/mbfilter_qprint.h"
41 #include "libmbfl/filters/mbfilter_htmlent.h"
42 #include "libmbfl/filters/mbfilter_uuencode.h"
43 #include "libmbfl/filters/mbfilter_ucs4.h"
44 #include "libmbfl/filters/mbfilter_utf16.h"
45 #include "libmbfl/filters/mbfilter_singlebyte.h"
46 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
47 #include "libmbfl/filters/unicode_prop.h"
48
49 #include "php_globals.h"
50 #include "rfc1867.h"
51 #include "php_content_types.h"
52 #include "SAPI.h"
53 #include "php_unicode.h"
54 #include "TSRM.h"
55
56 #include "mb_gpc.h"
57
58 #ifdef HAVE_MBREGEX
59 # include "php_mbregex.h"
60 #endif
61
62 #include "zend_smart_str.h"
63 #include "zend_multibyte.h"
64 #include "mbstring_arginfo.h"
65
66 #include "rare_cp_bitvec.h"
67
68 #ifdef __SSE2__
69 #include <emmintrin.h>
70 #endif
71
72 #ifdef __SSE3__
73 #include <immintrin.h>
74 #include <pmmintrin.h>
75 #endif
76
77 /* }}} */
78
/* {{{ prototypes */
/* Storage for the extension's module globals; the rest of this file reaches them through MBSTRG(). */
ZEND_DECLARE_MODULE_GLOBALS(mbstring)

/* Module-globals init/shutdown hooks referenced from mbstring_module_entry. */
static PHP_GINIT_FUNCTION(mbstring);
static PHP_GSHUTDOWN_FUNCTION(mbstring);

static void php_mb_populate_current_detect_order_list(void);

static int php_mb_encoding_translation(void);

static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);

static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);

static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);

static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);

static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);

/* Used by php_mb_zend_encoding_detector() to pick an encoding from a candidate list. */
static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);

static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);

/* See mbfilter_cp5022x.c */
uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
/* }}} */
106
107 /* {{{ php_mb_default_identify_list */
108 typedef struct _php_mb_nls_ident_list {
109 enum mbfl_no_language lang;
110 const enum mbfl_no_encoding *list;
111 size_t list_size;
112 } php_mb_nls_ident_list;
113
114 static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
115 mbfl_no_encoding_ascii,
116 mbfl_no_encoding_jis,
117 mbfl_no_encoding_utf8,
118 mbfl_no_encoding_euc_jp,
119 mbfl_no_encoding_sjis
120 };
121
122 static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
123 mbfl_no_encoding_ascii,
124 mbfl_no_encoding_utf8,
125 mbfl_no_encoding_euc_cn,
126 mbfl_no_encoding_cp936
127 };
128
129 static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
130 mbfl_no_encoding_ascii,
131 mbfl_no_encoding_utf8,
132 mbfl_no_encoding_euc_tw,
133 mbfl_no_encoding_big5
134 };
135
136 static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
137 mbfl_no_encoding_ascii,
138 mbfl_no_encoding_utf8,
139 mbfl_no_encoding_euc_kr,
140 mbfl_no_encoding_uhc
141 };
142
143 static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
144 mbfl_no_encoding_ascii,
145 mbfl_no_encoding_utf8,
146 mbfl_no_encoding_koi8r,
147 mbfl_no_encoding_cp1251,
148 mbfl_no_encoding_cp866
149 };
150
151 static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
152 mbfl_no_encoding_ascii,
153 mbfl_no_encoding_utf8,
154 mbfl_no_encoding_armscii8
155 };
156
157 static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
158 mbfl_no_encoding_ascii,
159 mbfl_no_encoding_utf8,
160 mbfl_no_encoding_cp1254,
161 mbfl_no_encoding_8859_9
162 };
163
164 static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
165 mbfl_no_encoding_ascii,
166 mbfl_no_encoding_utf8,
167 mbfl_no_encoding_koi8u
168 };
169
170 static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
171 mbfl_no_encoding_ascii,
172 mbfl_no_encoding_utf8
173 };
174
175
176 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
177 { mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
178 { mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
179 { mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
180 { mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
181 { mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
182 { mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
183 { mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
184 { mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
185 { mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
186 };
187
188 /* }}} */
189
/* {{{ mbstring_deps[] */
/* Module dependency list referenced from mbstring_module_entry: "pcre" is
 * declared as required (this file uses the PCRE2 API for its regex helpers). */
static const zend_module_dep mbstring_deps[] = {
	ZEND_MOD_REQUIRED("pcre")
	ZEND_MOD_END
};
/* }}} */
196
/* {{{ zend_module_entry mbstring_module_entry */
/* Module descriptor for the mbstring extension. It wires together the
 * dependency list, the lifecycle callbacks and the module-globals hooks. */
zend_module_entry mbstring_module_entry = {
	STANDARD_MODULE_HEADER_EX,
	NULL,
	mbstring_deps,                 /* see mbstring_deps[] above */
	"mbstring",                    /* extension name */
	ext_functions,
	PHP_MINIT(mbstring),           /* module startup */
	PHP_MSHUTDOWN(mbstring),       /* module shutdown */
	PHP_RINIT(mbstring),           /* request startup */
	PHP_RSHUTDOWN(mbstring),       /* request shutdown */
	PHP_MINFO(mbstring),           /* phpinfo() section */
	PHP_MBSTRING_VERSION,
	PHP_MODULE_GLOBALS(mbstring),
	PHP_GINIT(mbstring),           /* globals constructor (prototype above) */
	PHP_GSHUTDOWN(mbstring),       /* globals destructor (prototype above) */
	NULL,
	STANDARD_MODULE_PROPERTIES_EX
};
/* }}} */
217
/* {{{ static sapi_post_entry php_post_entries[] */
/* POST content-type handlers without mbstring involvement (form data goes to
 * php_std_post_handler). OnUpdate_mbstring_encoding_translation registers this
 * table when encoding translation is off and unregisters it when it is on. */
static const sapi_post_entry php_post_entries[] = {
	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_std_post_handler },
	{ MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
	{ NULL, 0, NULL, NULL } /* terminator */
};
/* }}} */
225
/* Only compiled when COMPILE_DL_MBSTRING is defined (presumably the
 * dynamically loadable build of the extension - confirm against the build
 * configuration). Under ZTS the TSRM cache is defined as well. */
#ifdef COMPILE_DL_MBSTRING
#ifdef ZTS
ZEND_TSRMLS_CACHE_DEFINE()
#endif
ZEND_GET_MODULE(mbstring)
#endif
232
/* {{{ static sapi_post_entry mbstr_post_entries[] */
/* Same table as php_post_entries, except that form data is routed to
 * php_mb_post_handler. OnUpdate_mbstring_encoding_translation registers this
 * table when encoding translation is on and unregisters it when it is off. */
static const sapi_post_entry mbstr_post_entries[] = {
	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
	{ MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
	{ NULL, 0, NULL, NULL } /* terminator */
};
/* }}} */
240
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)241 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
242 if (encoding_name) {
243 const mbfl_encoding *encoding;
244 zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
245 if (last_encoding_name && (last_encoding_name == encoding_name
246 || zend_string_equals_ci(encoding_name, last_encoding_name))) {
247 return MBSTRG(last_used_encoding);
248 }
249
250 encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
251 if (!encoding) {
252 zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
253 return NULL;
254 } else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
255 if (encoding == &mbfl_encoding_base64) {
256 php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
257 } else if (encoding == &mbfl_encoding_qprint) {
258 php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
259 } else if (encoding == &mbfl_encoding_html_ent) {
260 php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
261 } else if (encoding == &mbfl_encoding_uuencode) {
262 php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
263 }
264 }
265
266 if (last_encoding_name) {
267 zend_string_release(last_encoding_name);
268 }
269 MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
270 MBSTRG(last_used_encoding) = encoding;
271 return encoding;
272 } else {
273 return MBSTRG(current_internal_encoding);
274 }
275 }
276
php_mb_get_encoding_or_pass(const char * encoding_name,size_t encoding_name_len)277 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name, size_t encoding_name_len) {
278 if (strncmp(encoding_name, "pass", encoding_name_len) == 0) {
279 return &mbfl_encoding_pass;
280 }
281
282 return mbfl_name2encoding_ex(encoding_name, encoding_name_len);
283 }
284
/* Count the ',' bytes in the half-open range [p, end). */
static size_t count_commas(const char *p, const char *end) {
	size_t total = 0;
	const char *hit = memchr(p, ',', end - p);

	while (hit != NULL) {
		total++;
		/* Resume the search just past the comma that was found. */
		hit = memchr(hit + 1, ',', end - (hit + 1));
	}
	return total;
}
293
294 /* {{{ static zend_result php_mb_parse_encoding_list()
295 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
296 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
297 */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num)298 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
299 const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num)
300 {
301 if (value == NULL || value_length == 0) {
302 *return_list = NULL;
303 *return_size = 0;
304 return SUCCESS;
305 } else {
306 bool included_auto;
307 size_t n, size;
308 const char *p1, *endp, *tmpstr;
309 const mbfl_encoding **entry, **list;
310
311 if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
312 tmpstr = value + 1;
313 value_length -= 2;
314 } else {
315 tmpstr = value;
316 }
317
318 endp = tmpstr + value_length;
319 size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
320 list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
321 entry = list;
322 n = 0;
323 included_auto = 0;
324 p1 = tmpstr;
325 while (1) {
326 const char *comma = memchr(p1, ',', endp - p1);
327 const char *p = comma ? comma : endp;
328 /* trim spaces */
329 while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
330 p1++;
331 }
332 p--;
333 while (p > p1 && (*p == ' ' || *p == '\t')) {
334 p--;
335 }
336 size_t p1_length = p - p1 + 1;
337 /* convert to the encoding number and check encoding */
338 if (strncasecmp(p1, "auto", p1_length) == 0) {
339 if (!included_auto) {
340 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
341 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
342 size_t i;
343 included_auto = 1;
344 for (i = 0; i < identify_list_size; i++) {
345 *entry++ = mbfl_no2encoding(*src++);
346 n++;
347 }
348 }
349 } else {
350 const mbfl_encoding *encoding = mbfl_name2encoding_ex(p1, p1_length);
351 if (!encoding) {
352 /* Called from an INI setting modification */
353 if (arg_num == 0) {
354 php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%.*s\"", (int) p1_length, p1);
355 } else {
356 zend_argument_value_error(arg_num, "contains invalid encoding \"%.*s\"", (int) p1_length, p1);
357 }
358 pefree(ZEND_VOIDP(list), persistent);
359 return FAILURE;
360 }
361
362 *entry++ = encoding;
363 n++;
364 }
365 if (n >= size || comma == NULL) {
366 break;
367 }
368 p1 = comma + 1;
369 }
370 *return_list = list;
371 *return_size = n;
372 }
373
374 return SUCCESS;
375 }
376 /* }}} */
377
378 /* {{{
379 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
380 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
381 */
php_mb_parse_encoding_array(HashTable * target_hash,const mbfl_encoding *** return_list,size_t * return_size,uint32_t arg_num)382 static zend_result php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
383 size_t *return_size, uint32_t arg_num)
384 {
385 /* Allocate enough space to include the default detect order if "auto" is used. */
386 size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
387 const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
388 const mbfl_encoding **entry = list;
389 bool included_auto = 0;
390 size_t n = 0;
391 zval *hash_entry;
392 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
393 zend_string *encoding_str = zval_try_get_string(hash_entry);
394 if (UNEXPECTED(!encoding_str)) {
395 efree(ZEND_VOIDP(list));
396 return FAILURE;
397 }
398
399 if (zend_string_equals_literal_ci(encoding_str, "auto")) {
400 if (!included_auto) {
401 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
402 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
403 size_t j;
404
405 included_auto = 1;
406 for (j = 0; j < identify_list_size; j++) {
407 *entry++ = mbfl_no2encoding(*src++);
408 n++;
409 }
410 }
411 } else {
412 const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
413 if (encoding) {
414 *entry++ = encoding;
415 n++;
416 } else {
417 zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
418 zend_string_release(encoding_str);
419 efree(ZEND_VOIDP(list));
420 return FAILURE;
421 }
422 }
423 zend_string_release(encoding_str);
424 } ZEND_HASH_FOREACH_END();
425 *return_list = list;
426 *return_size = n;
427 return SUCCESS;
428 }
429 /* }}} */
430
431 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)432 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
433 {
434 return (const zend_encoding*)mbfl_name2encoding(encoding_name);
435 }
436
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)437 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
438 {
439 return ((const mbfl_encoding *)encoding)->name;
440 }
441
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)442 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
443 {
444 const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
445 return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
446 }
447
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)448 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
449 {
450 if (!list) {
451 list = (const zend_encoding**)MBSTRG(current_detect_order_list);
452 list_size = MBSTRG(current_detect_order_list_size);
453 }
454 if (list_size == 1 && ((mbfl_encoding*)*list) == &mbfl_encoding_pass) {
455 /* Emulate behavior of previous implementation; it would never return "pass"
456 * from an encoding auto-detection operation */
457 return NULL;
458 }
459 return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding**)list, list_size, false, false);
460 }
461
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)462 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
463 {
464 unsigned int num_errors = 0;
465 zend_string *result = mb_fast_convert((unsigned char*)from, from_length, (const mbfl_encoding*)encoding_from, (const mbfl_encoding*)encoding_to, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
466
467 *to_length = ZSTR_LEN(result);
468 *to = emalloc(ZSTR_LEN(result) + 1); /* Include terminating null byte */
469 memcpy(*to, ZSTR_VAL(result), ZSTR_LEN(result) + 1);
470 zend_string_free(result);
471
472 return from_length;
473 }
474
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)475 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
476 {
477 return php_mb_parse_encoding_list(
478 encoding_list, encoding_list_len,
479 (const mbfl_encoding ***)return_list, return_size,
480 persistent, /* arg_num */ 0);
481 }
482
php_mb_zend_internal_encoding_getter(void)483 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
484 {
485 return (const zend_encoding *)MBSTRG(internal_encoding);
486 }
487
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)488 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
489 {
490 MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
491 return SUCCESS;
492 }
493
/* Table of the zend_multibyte hooks implemented above, labelled with the
 * provider name "mbstring". The entries are positional. */
static zend_multibyte_functions php_mb_zend_multibyte_functions = {
	"mbstring",
	php_mb_zend_encoding_fetcher,
	php_mb_zend_encoding_name_getter,
	php_mb_zend_encoding_lexer_compatibility_checker,
	php_mb_zend_encoding_detector,
	php_mb_zend_encoding_converter,
	php_mb_zend_encoding_list_parser,
	php_mb_zend_internal_encoding_getter,
	php_mb_zend_internal_encoding_setter
};
505 /* }}} */
506
507 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)508 static void *_php_mb_compile_regex(const char *pattern)
509 {
510 pcre2_code *retval;
511 PCRE2_SIZE err_offset;
512 int errnum;
513
514 if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
515 PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
516 PCRE2_UCHAR err_str[128];
517 pcre2_get_error_message(errnum, err_str, sizeof(err_str));
518 php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
519 }
520 return retval;
521 }
522 /* }}} */
523
524 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)525 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
526 {
527 int res;
528
529 pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
530 if (NULL == match_data) {
531 pcre2_code_free(opaque);
532 php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
533 return FAILURE;
534 }
535 res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
536 php_pcre_free_match_data(match_data);
537
538 return res;
539 }
540 /* }}} */
541
542 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)543 static void _php_mb_free_regex(void *opaque)
544 {
545 pcre2_code_free(opaque);
546 }
547 /* }}} */
548
549 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)550 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
551 {
552 size_t i;
553
554 *plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
555 *plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
556
557 for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
558 if (php_mb_default_identify_list[i].lang == lang) {
559 *plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
560 *plist_size = php_mb_default_identify_list[i].list_size;
561 return 1;
562 }
563 }
564 return 0;
565 }
566 /* }}} */
567
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)568 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
569 {
570 char *result = emalloc(len + 2);
571 char *resp = result;
572 size_t i;
573
574 for (i = 0; i < len && start[i] != quote; ++i) {
575 if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
576 *resp++ = start[++i];
577 } else {
578 size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
579
580 while (j-- > 0 && i < len) {
581 *resp++ = start[i++];
582 }
583 --i;
584 }
585 }
586
587 *resp = '\0';
588 return result;
589 }
590
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)591 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
592 {
593 char *pos = *line, quote;
594 char *res;
595
596 while (*pos && *pos != stop) {
597 if ((quote = *pos) == '"' || quote == '\'') {
598 ++pos;
599 while (*pos && *pos != quote) {
600 if (*pos == '\\' && pos[1] && pos[1] == quote) {
601 pos += 2;
602 } else {
603 ++pos;
604 }
605 }
606 if (*pos) {
607 ++pos;
608 }
609 } else {
610 pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
611
612 }
613 }
614 if (*pos == '\0') {
615 res = estrdup(*line);
616 *line += strlen(*line);
617 return res;
618 }
619
620 res = estrndup(*line, pos - *line);
621
622 while (*pos == stop) {
623 pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
624 }
625
626 *line = pos;
627 return res;
628 }
629 /* }}} */
630
/* Extract one configuration value from `str` as a newly allocated string.
 *
 * Leading whitespace is skipped. An empty string is returned if nothing is
 * left. A value starting with ' or " is taken up to the matching quote;
 * any other value is taken up to the next whitespace byte. Copying and
 * unescaping are done by php_mb_rfc1867_substring_conf(). */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	for (; *str != '\0' && isspace((unsigned char)*str); str++) {
		/* skip leading whitespace */
	}

	if (*str == '\0') {
		return estrdup("");
	}

	if (*str != '"' && *str != '\'') {
		/* Bare word: measure up to the next whitespace byte. */
		size_t word_len = 0;

		while (str[word_len] != '\0' && !isspace((unsigned char)str[word_len])) {
			word_len++;
		}
		return php_mb_rfc1867_substring_conf(encoding, str, word_len, 0);
	}

	const char quote = *str;
	char *body = str + 1;

	return php_mb_rfc1867_substring_conf(encoding, body, strlen(body), quote);
}
/* }}} */
656
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)657 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
658 {
659 char *s, *s2;
660 const size_t filename_len = strlen(filename);
661
662 /* The \ check should technically be needed for win32 systems only where
663 * it is a valid path separator. However, IE in all it's wisdom always sends
664 * the full path of the file on the user's filesystem, which means that unless
665 * the user does basename() they get a bogus file name. Until IE's user base drops
666 * to nill or problem is fixed this code must remain enabled for all systems. */
667 s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
668 s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
669
670 if (s && s2) {
671 if (s > s2) {
672 return ++s;
673 } else {
674 return ++s2;
675 }
676 } else if (s) {
677 return ++s;
678 } else if (s2) {
679 return ++s2;
680 } else {
681 return filename;
682 }
683 }
684 /* }}} */
685
686 /* {{{ php.ini directive handler */
687 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)688 static PHP_INI_MH(OnUpdate_mbstring_language)
689 {
690 enum mbfl_no_language no_language;
691
692 no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
693 if (no_language == mbfl_no_language_invalid) {
694 MBSTRG(language) = mbfl_no_language_neutral;
695 return FAILURE;
696 }
697 MBSTRG(language) = no_language;
698 php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
699 return SUCCESS;
700 }
701 /* }}} */
702
703 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)704 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
705 {
706 const mbfl_encoding **list;
707 size_t size;
708
709 if (!new_value) {
710 if (MBSTRG(detect_order_list)) {
711 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
712 }
713 MBSTRG(detect_order_list) = NULL;
714 MBSTRG(detect_order_list_size) = 0;
715 return SUCCESS;
716 }
717
718 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
719 return FAILURE;
720 }
721
722 if (MBSTRG(detect_order_list)) {
723 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
724 }
725 MBSTRG(detect_order_list) = list;
726 MBSTRG(detect_order_list_size) = size;
727 return SUCCESS;
728 }
729 /* }}} */
730
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)731 static zend_result _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
732 const mbfl_encoding **list;
733 size_t size;
734 if (new_value_length == 4 && strncmp(new_value, "pass", 4) == 0) {
735 list = (const mbfl_encoding**)pecalloc(1, sizeof(mbfl_encoding*), 1);
736 *list = &mbfl_encoding_pass;
737 size = 1;
738 } else if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
739 return FAILURE;
740 }
741 if (MBSTRG(http_input_list)) {
742 pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
743 }
744 MBSTRG(http_input_list) = list;
745 MBSTRG(http_input_list_size) = size;
746 return SUCCESS;
747 }
748
749 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)750 static PHP_INI_MH(OnUpdate_mbstring_http_input)
751 {
752 if (new_value) {
753 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
754 }
755
756 if (!new_value || !ZSTR_LEN(new_value)) {
757 const char *encoding = php_get_input_encoding();
758 MBSTRG(http_input_set) = 0;
759 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
760 return SUCCESS;
761 }
762
763 MBSTRG(http_input_set) = 1;
764 return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
765 }
766 /* }}} */
767
_php_mb_ini_mbstring_http_output_set(const char * new_value,size_t length)768 static zend_result _php_mb_ini_mbstring_http_output_set(const char *new_value, size_t length) {
769 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value, length);
770 if (!encoding) {
771 return FAILURE;
772 }
773
774 MBSTRG(http_output_encoding) = encoding;
775 MBSTRG(current_http_output_encoding) = encoding;
776 return SUCCESS;
777 }
778
779 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)780 static PHP_INI_MH(OnUpdate_mbstring_http_output)
781 {
782 if (new_value) {
783 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
784 }
785
786 if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
787 const char *encoding = php_get_output_encoding();
788 MBSTRG(http_output_set) = 0;
789 _php_mb_ini_mbstring_http_output_set(encoding, strlen(encoding));
790 return SUCCESS;
791 }
792
793 MBSTRG(http_output_set) = 1;
794 return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
795 }
796 /* }}} */
797
798 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)799 static zend_result _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
800 {
801 const mbfl_encoding *encoding;
802
803 if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
804 /* falls back to UTF-8 if an unknown encoding name is given */
805 if (new_value) {
806 php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
807 }
808 encoding = &mbfl_encoding_utf8;
809 }
810 MBSTRG(internal_encoding) = encoding;
811 MBSTRG(current_internal_encoding) = encoding;
812 #ifdef HAVE_MBREGEX
813 {
814 const char *enc_name = new_value;
815 if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
816 /* falls back to UTF-8 if an unknown encoding name is given */
817 enc_name = "UTF-8";
818 php_mb_regex_set_default_mbctype(enc_name);
819 }
820 php_mb_regex_set_mbctype(new_value);
821 }
822 #endif
823 return SUCCESS;
824 }
825 /* }}} */
826
827 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)828 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
829 {
830 if (new_value) {
831 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
832 }
833
834 if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
835 return FAILURE;
836 }
837
838 if (new_value && ZSTR_LEN(new_value)) {
839 MBSTRG(internal_encoding_set) = 1;
840 return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
841 } else {
842 const char *encoding = php_get_internal_encoding();
843 MBSTRG(internal_encoding_set) = 0;
844 return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
845 }
846 }
847 /* }}} */
848
849 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)850 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
851 {
852 if (new_value != NULL) {
853 if (zend_string_equals_literal_ci(new_value, "none")) {
854 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
855 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
856 } else if (zend_string_equals_literal_ci(new_value, "long")) {
857 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
858 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
859 } else if (zend_string_equals_literal_ci(new_value, "entity")) {
860 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
861 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
862 } else {
863 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
864 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
865 if (ZSTR_LEN(new_value) > 0) {
866 char *endptr = NULL;
867 int c = strtol(ZSTR_VAL(new_value), &endptr, 0);
868
869 if (*endptr == '\0') {
870 MBSTRG(filter_illegal_substchar) = c;
871 MBSTRG(current_filter_illegal_substchar) = c;
872 }
873 }
874 }
875 } else {
876 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
877 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
878 MBSTRG(filter_illegal_substchar) = '?';
879 MBSTRG(current_filter_illegal_substchar) = '?';
880 }
881
882 return SUCCESS;
883 }
884 /* }}} */
885
886 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)887 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
888 {
889 if (new_value == NULL) {
890 return FAILURE;
891 }
892
893 OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
894
895 if (MBSTRG(encoding_translation)) {
896 sapi_unregister_post_entry(php_post_entries);
897 sapi_register_post_entries(mbstr_post_entries);
898 } else {
899 sapi_unregister_post_entry(mbstr_post_entries);
900 sapi_register_post_entries(php_post_entries);
901 }
902
903 return SUCCESS;
904 }
905 /* }}} */
906
/* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes) */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)908 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
909 {
910 zend_string *tmp;
911 void *re = NULL;
912
913 if (!new_value) {
914 new_value = entry->orig_value;
915 }
916 tmp = php_trim(new_value, NULL, 0, 3);
917
918 if (ZSTR_LEN(tmp) > 0) {
919 if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
920 zend_string_release_ex(tmp, 0);
921 return FAILURE;
922 }
923 }
924
925 if (MBSTRG(http_output_conv_mimetypes)) {
926 _php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
927 }
928
929 MBSTRG(http_output_conv_mimetypes) = re;
930
931 zend_string_release_ex(tmp, 0);
932 return SUCCESS;
933 }
934 /* }}} */
935 /* }}} */
936
937 /* {{{ php.ini directive registration */
938 PHP_INI_BEGIN()
939 PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
940 PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
941 PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
942 PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
943 STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
944 PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
945
946 STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
947 PHP_INI_SYSTEM | PHP_INI_PERDIR,
948 OnUpdate_mbstring_encoding_translation,
949 encoding_translation, zend_mbstring_globals, mbstring_globals)
950 PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
951 "^(text/|application/xhtml\\+xml)",
952 PHP_INI_ALL,
953 OnUpdate_mbstring_http_output_conv_mimetypes)
954
955 STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
956 PHP_INI_ALL,
957 OnUpdateBool,
958 strict_detection, zend_mbstring_globals, mbstring_globals)
959 #ifdef HAVE_MBREGEX
960 STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
961 STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
962 #endif
PHP_INI_END()963 PHP_INI_END()
964 /* }}} */
965
966 static void mbstring_internal_encoding_changed_hook(void) {
967 /* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
968 if (!MBSTRG(internal_encoding_set)) {
969 const char *encoding = php_get_internal_encoding();
970 _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
971 }
972
973 if (!MBSTRG(http_output_set)) {
974 const char *encoding = php_get_output_encoding();
975 _php_mb_ini_mbstring_http_output_set(encoding, strlen(encoding));
976 }
977
978 if (!MBSTRG(http_input_set)) {
979 const char *encoding = php_get_input_encoding();
980 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
981 }
982 }
983
984 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)985 static PHP_GINIT_FUNCTION(mbstring)
986 {
987 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
988 ZEND_TSRMLS_CACHE_UPDATE();
989 #endif
990
991 mbstring_globals->language = mbfl_no_language_uni;
992 mbstring_globals->internal_encoding = NULL;
993 mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
994 mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
995 mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
996 mbstring_globals->http_input_identify = NULL;
997 mbstring_globals->http_input_identify_get = NULL;
998 mbstring_globals->http_input_identify_post = NULL;
999 mbstring_globals->http_input_identify_cookie = NULL;
1000 mbstring_globals->http_input_identify_string = NULL;
1001 mbstring_globals->http_input_list = NULL;
1002 mbstring_globals->http_input_list_size = 0;
1003 mbstring_globals->detect_order_list = NULL;
1004 mbstring_globals->detect_order_list_size = 0;
1005 mbstring_globals->current_detect_order_list = NULL;
1006 mbstring_globals->current_detect_order_list_size = 0;
1007 mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1008 mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1009 mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1010 mbstring_globals->filter_illegal_substchar = '?';
1011 mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1012 mbstring_globals->current_filter_illegal_substchar = '?';
1013 mbstring_globals->illegalchars = 0;
1014 mbstring_globals->encoding_translation = 0;
1015 mbstring_globals->strict_detection = 0;
1016 mbstring_globals->outconv_enabled = false;
1017 mbstring_globals->outconv_state = 0;
1018 mbstring_globals->http_output_conv_mimetypes = NULL;
1019 #ifdef HAVE_MBREGEX
1020 mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1021 #endif
1022 mbstring_globals->last_used_encoding_name = NULL;
1023 mbstring_globals->last_used_encoding = NULL;
1024 mbstring_globals->internal_encoding_set = 0;
1025 mbstring_globals->http_output_set = 0;
1026 mbstring_globals->http_input_set = 0;
1027 mbstring_globals->all_encodings_list = NULL;
1028 }
1029 /* }}} */
1030
1031 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1032 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1033 {
1034 if (mbstring_globals->http_input_list) {
1035 free(ZEND_VOIDP(mbstring_globals->http_input_list));
1036 }
1037 if (mbstring_globals->detect_order_list) {
1038 free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1039 }
1040 if (mbstring_globals->http_output_conv_mimetypes) {
1041 _php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1042 }
1043 #ifdef HAVE_MBREGEX
1044 php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1045 #endif
1046 }
1047 /* }}} */
1048
1049 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1050 static void init_check_utf8(void);
1051 #endif
1052
1053 /* {{{ PHP_MINIT_FUNCTION(mbstring) */
PHP_MINIT_FUNCTION(mbstring)1054 PHP_MINIT_FUNCTION(mbstring)
1055 {
1056 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
1057 ZEND_TSRMLS_CACHE_UPDATE();
1058 #endif
1059
1060 REGISTER_INI_ENTRIES();
1061
1062 /* We assume that we're the only user of the hook. */
1063 ZEND_ASSERT(php_internal_encoding_changed == NULL);
1064 php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
1065 mbstring_internal_encoding_changed_hook();
1066
1067 /* This is a global handler. Should not be set in a per-request handler. */
1068 sapi_register_treat_data(mbstr_treat_data);
1069
1070 /* Post handlers are stored in the thread-local context. */
1071 if (MBSTRG(encoding_translation)) {
1072 sapi_register_post_entries(mbstr_post_entries);
1073 }
1074
1075 #ifdef HAVE_MBREGEX
1076 PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1077 #endif
1078
1079 register_mbstring_symbols(module_number);
1080
1081 if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
1082 return FAILURE;
1083 }
1084
1085 php_rfc1867_set_multibyte_callbacks(
1086 php_mb_encoding_translation,
1087 php_mb_gpc_get_detect_order,
1088 php_mb_gpc_set_input_encoding,
1089 php_mb_rfc1867_getword,
1090 php_mb_rfc1867_getword_conf,
1091 php_mb_rfc1867_basename);
1092
1093 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1094 init_check_utf8();
1095 init_convert_utf16();
1096 #endif
1097
1098 return SUCCESS;
1099 }
1100 /* }}} */
1101
1102 /* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
PHP_MSHUTDOWN_FUNCTION(mbstring)1103 PHP_MSHUTDOWN_FUNCTION(mbstring)
1104 {
1105 UNREGISTER_INI_ENTRIES();
1106
1107 zend_multibyte_restore_functions();
1108
1109 #ifdef HAVE_MBREGEX
1110 PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1111 #endif
1112
1113 php_internal_encoding_changed = NULL;
1114
1115 return SUCCESS;
1116 }
1117 /* }}} */
1118
1119 /* {{{ PHP_RINIT_FUNCTION(mbstring) */
PHP_RINIT_FUNCTION(mbstring)1120 PHP_RINIT_FUNCTION(mbstring)
1121 {
1122 MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
1123 MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
1124 MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
1125 MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);
1126
1127 MBSTRG(illegalchars) = 0;
1128
1129 php_mb_populate_current_detect_order_list();
1130
1131 #ifdef HAVE_MBREGEX
1132 PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1133 #endif
1134 zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));
1135
1136 return SUCCESS;
1137 }
1138 /* }}} */
1139
1140 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1141 PHP_RSHUTDOWN_FUNCTION(mbstring)
1142 {
1143 if (MBSTRG(current_detect_order_list) != NULL) {
1144 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1145 MBSTRG(current_detect_order_list) = NULL;
1146 MBSTRG(current_detect_order_list_size) = 0;
1147 }
1148
1149 /* clear http input identification. */
1150 MBSTRG(http_input_identify) = NULL;
1151 MBSTRG(http_input_identify_post) = NULL;
1152 MBSTRG(http_input_identify_get) = NULL;
1153 MBSTRG(http_input_identify_cookie) = NULL;
1154 MBSTRG(http_input_identify_string) = NULL;
1155
1156 if (MBSTRG(last_used_encoding_name)) {
1157 zend_string_release(MBSTRG(last_used_encoding_name));
1158 MBSTRG(last_used_encoding_name) = NULL;
1159 }
1160
1161 MBSTRG(internal_encoding_set) = 0;
1162 MBSTRG(http_output_set) = 0;
1163 MBSTRG(http_input_set) = 0;
1164
1165 MBSTRG(outconv_enabled) = false;
1166 MBSTRG(outconv_state) = 0;
1167
1168 if (MBSTRG(all_encodings_list)) {
1169 GC_DELREF(MBSTRG(all_encodings_list));
1170 zend_array_destroy(MBSTRG(all_encodings_list));
1171 MBSTRG(all_encodings_list) = NULL;
1172 }
1173
1174 #ifdef HAVE_MBREGEX
1175 PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1176 #endif
1177
1178 return SUCCESS;
1179 }
1180 /* }}} */
1181
1182 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1183 PHP_MINFO_FUNCTION(mbstring)
1184 {
1185 php_info_print_table_start();
1186 php_info_print_table_row(2, "Multibyte Support", "enabled");
1187 php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1188 php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1189 {
1190 char tmp[256];
1191 snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1192 php_info_print_table_row(2, "libmbfl version", tmp);
1193 }
1194 php_info_print_table_end();
1195
1196 php_info_print_table_start();
1197 php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1198 php_info_print_table_end();
1199
1200 #ifdef HAVE_MBREGEX
1201 PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1202 #endif
1203
1204 DISPLAY_INI_ENTRIES();
1205 }
1206 /* }}} */
1207
1208 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1209 PHP_FUNCTION(mb_language)
1210 {
1211 zend_string *name = NULL;
1212
1213 ZEND_PARSE_PARAMETERS_START(0, 1)
1214 Z_PARAM_OPTIONAL
1215 Z_PARAM_STR_OR_NULL(name)
1216 ZEND_PARSE_PARAMETERS_END();
1217
1218 if (name == NULL) {
1219 RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1220 } else {
1221 zend_string *ini_name = ZSTR_INIT_LITERAL("mbstring.language", 0);
1222 if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1223 zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1224 zend_string_release_ex(ini_name, 0);
1225 RETURN_THROWS();
1226 }
1227 // TODO Make return void
1228 RETVAL_TRUE;
1229 zend_string_release_ex(ini_name, 0);
1230 }
1231 }
1232 /* }}} */
1233
1234 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1235 PHP_FUNCTION(mb_internal_encoding)
1236 {
1237 char *name = NULL;
1238 size_t name_len;
1239 const mbfl_encoding *encoding;
1240
1241 ZEND_PARSE_PARAMETERS_START(0, 1)
1242 Z_PARAM_OPTIONAL
1243 Z_PARAM_STRING_OR_NULL(name, name_len)
1244 ZEND_PARSE_PARAMETERS_END();
1245
1246 if (name == NULL) {
1247 ZEND_ASSERT(MBSTRG(current_internal_encoding));
1248 RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1249 } else {
1250 encoding = mbfl_name2encoding(name);
1251 if (!encoding) {
1252 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1253 RETURN_THROWS();
1254 } else {
1255 MBSTRG(current_internal_encoding) = encoding;
1256 MBSTRG(internal_encoding_set) = 1;
1257 /* TODO Return old encoding */
1258 RETURN_TRUE;
1259 }
1260 }
1261 }
1262 /* }}} */
1263
1264 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1265 PHP_FUNCTION(mb_http_input)
1266 {
1267 char *type = NULL;
1268 size_t type_len = 0, n;
1269 const mbfl_encoding **entry;
1270 const mbfl_encoding *encoding;
1271
1272 ZEND_PARSE_PARAMETERS_START(0, 1)
1273 Z_PARAM_OPTIONAL
1274 Z_PARAM_STRING_OR_NULL(type, type_len)
1275 ZEND_PARSE_PARAMETERS_END();
1276
1277 if (type == NULL) {
1278 encoding = MBSTRG(http_input_identify);
1279 } else if (type_len != 1) {
1280 zend_argument_value_error(1,
1281 "must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1282 RETURN_THROWS();
1283 } else {
1284 switch (*type) {
1285 case 'G':
1286 case 'g':
1287 encoding = MBSTRG(http_input_identify_get);
1288 break;
1289 case 'P':
1290 case 'p':
1291 encoding = MBSTRG(http_input_identify_post);
1292 break;
1293 case 'C':
1294 case 'c':
1295 encoding = MBSTRG(http_input_identify_cookie);
1296 break;
1297 case 'S':
1298 case 's':
1299 encoding = MBSTRG(http_input_identify_string);
1300 break;
1301 case 'I':
1302 case 'i':
1303 entry = MBSTRG(http_input_list);
1304 n = MBSTRG(http_input_list_size);
1305 array_init(return_value);
1306 for (size_t i = 0; i < n; i++, entry++) {
1307 add_next_index_string(return_value, (*entry)->name);
1308 }
1309 return;
1310 case 'L':
1311 case 'l':
1312 entry = MBSTRG(http_input_list);
1313 n = MBSTRG(http_input_list_size);
1314 if (n == 0) {
1315 RETURN_FALSE;
1316 }
1317
1318 smart_str result = {0};
1319 for (size_t i = 0; i < n; i++, entry++) {
1320 if (i > 0) {
1321 smart_str_appendc(&result, ',');
1322 }
1323 smart_str_appends(&result, (*entry)->name);
1324 }
1325 RETURN_STR(smart_str_extract(&result));
1326 default:
1327 zend_argument_value_error(1,
1328 "must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1329 RETURN_THROWS();
1330 }
1331 }
1332
1333 if (encoding) {
1334 RETURN_STRING(encoding->name);
1335 } else {
1336 RETURN_FALSE;
1337 }
1338 }
1339 /* }}} */
1340
1341 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1342 PHP_FUNCTION(mb_http_output)
1343 {
1344 char *name = NULL;
1345 size_t name_len;
1346
1347 ZEND_PARSE_PARAMETERS_START(0, 1)
1348 Z_PARAM_OPTIONAL
1349 Z_PARAM_PATH_OR_NULL(name, name_len) /* For null byte check */
1350 ZEND_PARSE_PARAMETERS_END();
1351
1352 if (name == NULL) {
1353 ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1354 RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1355 } else {
1356 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name, name_len);
1357 if (!encoding) {
1358 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1359 RETURN_THROWS();
1360 } else {
1361 MBSTRG(http_output_set) = 1;
1362 MBSTRG(current_http_output_encoding) = encoding;
1363 /* TODO Return previous encoding? */
1364 RETURN_TRUE;
1365 }
1366 }
1367 }
1368 /* }}} */
1369
1370 /* {{{ Sets the current detect_order or Return the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1371 PHP_FUNCTION(mb_detect_order)
1372 {
1373 zend_string *order_str = NULL;
1374 HashTable *order_ht = NULL;
1375
1376 ZEND_PARSE_PARAMETERS_START(0, 1)
1377 Z_PARAM_OPTIONAL
1378 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1379 ZEND_PARSE_PARAMETERS_END();
1380
1381 if (!order_str && !order_ht) {
1382 size_t n = MBSTRG(current_detect_order_list_size);
1383 const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1384 array_init(return_value);
1385 for (size_t i = 0; i < n; i++) {
1386 add_next_index_string(return_value, (*entry)->name);
1387 entry++;
1388 }
1389 } else {
1390 const mbfl_encoding **list;
1391 size_t size;
1392 if (order_ht) {
1393 if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1394 RETURN_THROWS();
1395 }
1396 } else {
1397 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1)) {
1398 RETURN_THROWS();
1399 }
1400 }
1401
1402 if (size == 0) {
1403 efree(ZEND_VOIDP(list));
1404 zend_argument_value_error(1, "must specify at least one encoding");
1405 RETURN_THROWS();
1406 }
1407
1408 if (MBSTRG(current_detect_order_list)) {
1409 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1410 }
1411 MBSTRG(current_detect_order_list) = list;
1412 MBSTRG(current_detect_order_list_size) = size;
1413 RETURN_TRUE;
1414 }
1415 }
1416 /* }}} */
1417
/* Returns true if `cp` may be used as a substitute character: it must lie in
 * the Unicode range (0 .. 0x10FFFF) and must not be a surrogate
 * (0xD800 .. 0xDFFF).
 *
 * Surrogates are never valid on their own and only a single substitute
 * character is allowed. As the target encoding of the conversion that will
 * eventually use the substitute character is not known here, it cannot be
 * checked whether the codepoint is actually mapped in that encoding; everything
 * else therefore has to be accepted. */
static inline bool php_mb_check_code_point(zend_long cp)
{
	const bool in_unicode_range = cp >= 0 && cp < 0x110000;
	const bool is_surrogate = cp >= 0xd800 && cp <= 0xdfff;

	return in_unicode_range && !is_surrogate;
}
1436
1437 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1438 PHP_FUNCTION(mb_substitute_character)
1439 {
1440 zend_string *substitute_character = NULL;
1441 zend_long substitute_codepoint;
1442 bool substitute_is_null = 1;
1443
1444 ZEND_PARSE_PARAMETERS_START(0, 1)
1445 Z_PARAM_OPTIONAL
1446 Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1447 ZEND_PARSE_PARAMETERS_END();
1448
1449 if (substitute_is_null) {
1450 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1451 RETURN_STRING("none");
1452 }
1453 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1454 RETURN_STRING("long");
1455 }
1456 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1457 RETURN_STRING("entity");
1458 }
1459 RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1460 }
1461
1462 if (substitute_character != NULL) {
1463 if (zend_string_equals_literal_ci(substitute_character, "none")) {
1464 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1465 RETURN_TRUE;
1466 }
1467 if (zend_string_equals_literal_ci(substitute_character, "long")) {
1468 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1469 RETURN_TRUE;
1470 }
1471 if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1472 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1473 RETURN_TRUE;
1474 }
1475 /* Invalid string value */
1476 zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1477 RETURN_THROWS();
1478 }
1479 /* Integer codepoint passed */
1480 if (!php_mb_check_code_point(substitute_codepoint)) {
1481 zend_argument_value_error(1, "is not a valid codepoint");
1482 RETURN_THROWS();
1483 }
1484
1485 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1486 MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1487 RETURN_TRUE;
1488 }
1489 /* }}} */
1490
1491 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1492 PHP_FUNCTION(mb_preferred_mime_name)
1493 {
1494 char *name = NULL;
1495 size_t name_len;
1496
1497 ZEND_PARSE_PARAMETERS_START(1, 1)
1498 Z_PARAM_STRING(name, name_len)
1499 ZEND_PARSE_PARAMETERS_END();
1500
1501 const mbfl_encoding *enc = mbfl_name2encoding(name);
1502 if (enc == NULL) {
1503 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1504 RETURN_THROWS();
1505 }
1506
1507 const char *preferred_name = mbfl_encoding_preferred_mime_name(enc);
1508 if (preferred_name == NULL || *preferred_name == '\0') {
1509 php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1510 RETVAL_FALSE;
1511 } else {
1512 RETVAL_STRING((char *)preferred_name);
1513 }
1514 }
1515 /* }}} */
1516
1517 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1518 PHP_FUNCTION(mb_parse_str)
1519 {
1520 zval *track_vars_array = NULL;
1521 char *encstr;
1522 size_t encstr_len;
1523 php_mb_encoding_handler_info_t info;
1524 const mbfl_encoding *detected;
1525
1526 ZEND_PARSE_PARAMETERS_START(2, 2)
1527 Z_PARAM_STRING(encstr, encstr_len)
1528 Z_PARAM_ZVAL(track_vars_array)
1529 ZEND_PARSE_PARAMETERS_END();
1530
1531 track_vars_array = zend_try_array_init(track_vars_array);
1532 if (!track_vars_array) {
1533 RETURN_THROWS();
1534 }
1535
1536 encstr = estrndup(encstr, encstr_len);
1537
1538 info.data_type = PARSE_STRING;
1539 info.separator = PG(arg_separator).input;
1540 info.report_errors = true;
1541 info.to_encoding = MBSTRG(current_internal_encoding);
1542 info.from_encodings = MBSTRG(http_input_list);
1543 info.num_from_encodings = MBSTRG(http_input_list_size);
1544
1545 detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1546
1547 MBSTRG(http_input_identify) = detected;
1548
1549 RETVAL_BOOL(detected);
1550
1551 if (encstr != NULL) efree(encstr);
1552 }
1553 /* }}} */
1554
PHP_FUNCTION(mb_output_handler)1555 PHP_FUNCTION(mb_output_handler)
1556 {
1557 zend_string *str;
1558 zend_long arg_status;
1559
1560 ZEND_PARSE_PARAMETERS_START(2, 2)
1561 Z_PARAM_STR(str)
1562 Z_PARAM_LONG(arg_status)
1563 ZEND_PARSE_PARAMETERS_END();
1564
1565 const mbfl_encoding *encoding = MBSTRG(current_http_output_encoding);
1566 if (encoding == &mbfl_encoding_pass) {
1567 RETURN_STR_COPY(str);
1568 }
1569
1570 if (arg_status & PHP_OUTPUT_HANDLER_START) {
1571 bool free_mimetype = false;
1572 char *mimetype = NULL;
1573
1574 /* Analyze mime type */
1575 if (SG(sapi_headers).mimetype && _php_mb_match_regex(MBSTRG(http_output_conv_mimetypes), SG(sapi_headers).mimetype, strlen(SG(sapi_headers).mimetype))) {
1576 char *s;
1577 if ((s = strchr(SG(sapi_headers).mimetype, ';')) == NULL) {
1578 mimetype = estrdup(SG(sapi_headers).mimetype);
1579 } else {
1580 mimetype = estrndup(SG(sapi_headers).mimetype, s - SG(sapi_headers).mimetype);
1581 }
1582 free_mimetype = true;
1583 } else if (SG(sapi_headers).send_default_content_type) {
1584 mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1585 }
1586
1587 /* If content-type is not yet set, set it and enable conversion */
1588 if (SG(sapi_headers).send_default_content_type || free_mimetype) {
1589 const char *charset = encoding->mime_name;
1590 if (charset) {
1591 char *p;
1592 size_t len = spprintf(&p, 0, "Content-Type: %s; charset=%s", mimetype, charset);
1593 if (sapi_add_header(p, len, 0) != FAILURE) {
1594 SG(sapi_headers).send_default_content_type = 0;
1595 }
1596 }
1597
1598 MBSTRG(outconv_enabled) = true;
1599 }
1600
1601 if (free_mimetype) {
1602 efree(mimetype);
1603 }
1604 }
1605
1606 if (!MBSTRG(outconv_enabled)) {
1607 RETURN_STR_COPY(str);
1608 }
1609
1610 mb_convert_buf buf;
1611 mb_convert_buf_init(&buf, ZSTR_LEN(str), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1612
1613 uint32_t wchar_buf[128];
1614 unsigned char *in = (unsigned char*)ZSTR_VAL(str);
1615 size_t in_len = ZSTR_LEN(str);
1616 bool last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1617
1618 while (in_len) {
1619 size_t out_len = MBSTRG(current_internal_encoding)->to_wchar(&in, &in_len, wchar_buf, 128, &MBSTRG(outconv_state));
1620 ZEND_ASSERT(out_len <= 128);
1621 encoding->from_wchar(wchar_buf, out_len, &buf, !in_len && last_feed);
1622 }
1623
1624 MBSTRG(illegalchars) += buf.errors;
1625 RETVAL_STR(mb_convert_buf_result_raw(&buf));
1626
1627 if (last_feed) {
1628 MBSTRG(outconv_enabled) = false;
1629 MBSTRG(outconv_state) = 0;
1630 }
1631 }
1632
PHP_FUNCTION(mb_str_split)1633 PHP_FUNCTION(mb_str_split)
1634 {
1635 zend_string *str, *encoding = NULL;
1636 zend_long split_len = 1;
1637
1638 ZEND_PARSE_PARAMETERS_START(1, 3)
1639 Z_PARAM_STR(str)
1640 Z_PARAM_OPTIONAL
1641 Z_PARAM_LONG(split_len)
1642 Z_PARAM_STR_OR_NULL(encoding)
1643 ZEND_PARSE_PARAMETERS_END();
1644
1645 if (split_len <= 0) {
1646 zend_argument_value_error(2, "must be greater than 0");
1647 RETURN_THROWS();
1648 } else if (split_len > UINT_MAX / 4) {
1649 zend_argument_value_error(2, "is too large");
1650 RETURN_THROWS();
1651 }
1652
1653 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
1654 if (!enc) {
1655 RETURN_THROWS();
1656 }
1657
1658 if (ZSTR_LEN(str) == 0) {
1659 RETURN_EMPTY_ARRAY();
1660 }
1661
1662 unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
1663
1664 unsigned int char_len = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
1665 if (char_len) {
1666 unsigned int chunk_len = char_len * split_len;
1667 unsigned int chunks = ((ZSTR_LEN(str) / chunk_len) + split_len - 1) / split_len; /* round up */
1668 array_init_size(return_value, chunks);
1669 while (p < e) {
1670 add_next_index_stringl(return_value, (const char*)p, MIN(chunk_len, e - p));
1671 p += chunk_len;
1672 }
1673 } else if (enc->mblen_table) {
1674 unsigned char const *mbtab = enc->mblen_table;
1675
1676 /* Assume that we have 1-byte characters */
1677 array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1678
1679 while (p < e) {
1680 unsigned char *chunk = p; /* start of chunk */
1681
1682 for (int char_count = 0; char_count < split_len && p < e; char_count++) {
1683 p += mbtab[*p];
1684 }
1685 if (p > e) {
1686 p = e; /* ensure chunk is in bounds */
1687 }
1688 add_next_index_stringl(return_value, (const char*)chunk, p - chunk);
1689 }
1690 } else {
1691 /* Assume that we have 1-byte characters */
1692 array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1693
1694 uint32_t wchar_buf[128];
1695 size_t in_len = ZSTR_LEN(str);
1696 unsigned int state = 0, char_count = 0;
1697
1698 mb_convert_buf buf;
1699
1700 while (in_len) {
1701 size_t out_len = enc->to_wchar(&p, &in_len, wchar_buf, 128, &state);
1702 ZEND_ASSERT(out_len <= 128);
1703 size_t i = 0;
1704
1705 /* Is there some output remaining from the previous iteration? */
1706 if (char_count) {
1707 if (out_len >= split_len - char_count) {
1708 /* Finish off an incomplete chunk from previous iteration
1709 * ('buf' was already initialized; we don't need to do it again) */
1710 enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
1711 i += split_len - char_count;
1712 char_count = 0;
1713 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1714 } else {
1715 /* Output from this iteration is not enough to finish the next chunk;
1716 * output what we can, and leave 'buf' to be used again on next iteration */
1717 enc->from_wchar(wchar_buf, out_len, &buf, !in_len);
1718 char_count += out_len;
1719 continue;
1720 }
1721 }
1722
1723 while (i < out_len) {
1724 /* Prepare for the next chunk */
1725 mb_convert_buf_init(&buf, split_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1726
1727 if (out_len - i >= split_len) {
1728 enc->from_wchar(wchar_buf + i, split_len, &buf, true);
1729 i += split_len;
1730 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1731 } else {
1732 /* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
1733 * leave them for the next iteration */
1734 enc->from_wchar(wchar_buf + i, out_len - i, &buf, !in_len);
1735 char_count = out_len - i;
1736 break;
1737 }
1738 }
1739 }
1740
1741 if (char_count) {
1742 /* The main loop above has finished processing the input string, but
1743 * has left a partial chunk in 'buf' */
1744 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1745 }
1746 }
1747 }
1748
1749 #ifdef __SSE2__
1750 /* Thanks to StackOverflow user 'Paul R' (https://stackoverflow.com/users/253056/paul-r)
1751 * From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector
1752 * Takes a 128-bit XMM register, treats each byte as an 8-bit integer, and sums up all
1753 * 16 of them, returning the sum in an ordinary scalar register */
static inline uint32_t _mm_sum_epu8(const __m128i v)
{
	/* There is no dedicated instruction for summing the bytes of an XMM register.
	 * _mm_sad_epu8 computes the absolute differences between corresponding bytes of
	 * its two operands and adds them up separately for the lower and the upper
	 * 8 bytes, leaving one 16-bit sum at the bottom of each 64-bit half of the result.
	 * Against an all-zero operand, those "differences" are simply the bytes of `v`. */
	const __m128i zero = _mm_setzero_si128();
	const __m128i half_sums = _mm_sad_epu8(v, zero);

	/* Sum of bytes 0-7: sits in the lowest 32 bits */
	const int low_sum = _mm_cvtsi128_si32(half_sums);
	/* Sum of bytes 8-15: sits in 16-bit lane 4, the bottom of the upper half */
	const int high_sum = _mm_extract_epi16(half_sums, 4);

	return low_sum + high_sum;
}
1769 #endif
1770
/* Count the codepoints in a UTF-8 string. The caller must guarantee that the `len`
 * bytes starting at `p` are valid UTF-8.
 *
 * In UTF-8, every byte either starts a new codepoint or is a continuation byte
 * (0x80-0xBF, bit pattern 10xxxxxx). The number of codepoints is therefore the
 * byte length minus the number of continuation bytes, so we start from `len`
 * and subtract one for each continuation byte found. */
static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
{
	unsigned char *end = p + len;

#ifdef __SSE2__
	if (len >= sizeof(__m128i)) {
		/* Highest address from which a whole 16-byte block can still be loaded */
		unsigned char *last_block = end - sizeof(__m128i);

		/* Read as signed 8-bit integers, the continuation bytes 0x80-0xBF are
		 * exactly the values below -64 */
		const __m128i threshold = _mm_set1_epi8(-64);
		/* Sixteen separate 8-bit tallies of continuation bytes, one per lane */
		__m128i tallies = _mm_setzero_si128();
		unsigned int blocks_since_fold = 0;

		do {
			__m128i block = _mm_loadu_si128((__m128i*)p); /* Unaligned load of 16 bytes */
			/* Lanes containing a continuation byte become 0xFF (-1), others 0 */
			__m128i is_continuation = _mm_cmplt_epi8(block, threshold);
			/* Subtracting -1 adds one to the tally of each matching lane */
			tallies = _mm_sub_epi8(tallies, is_continuation);

			/* An 8-bit tally can hold at most 255, and each block adds at most one
			 * to each tally; so after 255 blocks, move the tallies into `len` and
			 * start them again from zero */
			if (++blocks_since_fold == 255) {
				len -= _mm_sum_epu8(tallies);
				tallies = _mm_setzero_si128();
				blocks_since_fold = 0;
			}

			p += sizeof(__m128i);
		} while (p <= last_block);

		/* Account for whatever is still sitting in the tallies */
		len -= _mm_sum_epu8(tallies);
	}
#endif

	/* Handle the bytes which remain after the 16-byte blocks (or the whole
	 * string, if the vectorized loop above did not run) one at a time */
	for (; p < end; p++) {
		if ((*p & 0xC0) == 0x80) {
			len--;
		}
	}

	return len;
}
1820
/* Return the number of characters in `string` when interpreted as `encoding`.
 *
 * Three strategies are used, cheapest first:
 *  - encodings flagged SBCS/WCS2/WCS4: byte length divided by the masked flag value
 *  - UTF-8 variants, if the string is already marked as valid UTF-8: fast byte scan
 *  - anything else: decode the string in batches and add up the codepoints produced */
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
{
	const unsigned int bytes_per_char = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (bytes_per_char != 0) {
		return ZSTR_LEN(string) / bytes_per_char;
	}

	if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && ZSTR_IS_VALID_UTF8(string)) {
		return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
	}

	/* The decoded codepoints themselves are never looked at; this is only scratch
	 * space for the decoder to write into */
	uint32_t scratch[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int decoder_state = 0;
	size_t total = 0;

	while (bytes_left > 0) {
		total += encoding->to_wchar(&cursor, &bytes_left, scratch, sizeof(scratch) / sizeof(scratch[0]), &decoder_state);
	}

	return total;
}
1842
1843 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1844 PHP_FUNCTION(mb_strlen)
1845 {
1846 zend_string *string, *enc_name = NULL;
1847
1848 ZEND_PARSE_PARAMETERS_START(1, 2)
1849 Z_PARAM_STR(string)
1850 Z_PARAM_OPTIONAL
1851 Z_PARAM_STR_OR_NULL(enc_name)
1852 ZEND_PARSE_PARAMETERS_END();
1853
1854 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1855 if (!enc) {
1856 RETURN_THROWS();
1857 }
1858
1859 RETVAL_LONG(mb_get_strlen(string, enc));
1860 }
1861 /* }}} */
1862
1863 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)1864 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
1865 {
1866 return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
1867 }
1868
offset_to_pointer_utf8(unsigned char * str,unsigned char * end,ssize_t offset)1869 static unsigned char* offset_to_pointer_utf8(unsigned char *str, unsigned char *end, ssize_t offset) {
1870 if (offset < 0) {
1871 unsigned char *pos = end;
1872 while (offset < 0) {
1873 if (pos <= str) {
1874 return NULL;
1875 }
1876
1877 unsigned char c = *--pos;
1878 if (c < 0x80 || (c & 0xC0) != 0x80) {
1879 offset++;
1880 }
1881 }
1882 return pos;
1883 } else {
1884 const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
1885 unsigned char *pos = str;
1886 while (offset-- > 0) {
1887 if (pos >= end) {
1888 return NULL;
1889 }
1890 pos += u8_tbl[*pos];
1891 }
1892 return pos;
1893 }
1894 }
1895
/* Inverse of offset_to_pointer_utf8 for non-negative offsets: how many characters
 * lie between `start` and `pos`? Counts with mb_fast_strlen_utf8. */
static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) {
	const size_t byte_span = pos - start;
	return mb_fast_strlen_utf8(start, byte_span);
}
1899
/* Search for `needle` inside `haystack` and report where it starts, counted in characters.
 *
 * The search itself is a byte search over UTF-8 text. If `enc` is not one of the UTF-8
 * variants, both strings are first converted to UTF-8; the temporary copies are freed
 * before returning.
 *
 * offset  - character offset into the haystack; negative values count from the end
 * reverse - false: find the first occurrence starting at or after `offset`
 *           true, offset >= 0: find the last occurrence starting at or after `offset`
 *           true, offset < 0: find the last occurrence which starts no later than the
 *           position designated by `offset`
 *
 * Returns the character index of the match; MBFL_ERROR_OFFSET if `offset` does not
 * designate a position inside the haystack; MBFL_ERROR_NOT_FOUND if there is no match. */
static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse)
{
	size_t result;
	zend_string *haystack_u8 = NULL, *needle_u8 = NULL;
	unsigned char *offset_pointer;

	if (!php_mb_is_no_encoding_utf8(enc->no_encoding)) {
		unsigned int num_errors = 0;
		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
	} else {
		haystack_u8 = haystack;
		needle_u8 = needle;
	}

	offset_pointer = offset_to_pointer_utf8((unsigned char*)ZSTR_VAL(haystack_u8), (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8), offset);
	if (!offset_pointer) {
		result = MBFL_ERROR_OFFSET;
		goto out;
	}

	result = MBFL_ERROR_NOT_FOUND;
	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
		/* A needle longer than the haystack can never match */
		goto out;
	}

	const char *found_pos;
	if (!reverse) {
		found_pos = zend_memnstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8));
	} else if (offset >= 0) {
		found_pos = zend_memnrstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8));
	} else {
		/* A match may *start* at the position designated by `offset`, so the region
		 * to search must extend one needle-length (in characters) beyond it.
		 * The length has to be measured on the UTF-8 form of the needle: that is the
		 * text actually being searched for, and the counting function only understands
		 * UTF-8. Measuring the needle in its original encoding gives a wrong character
		 * count for non-UTF-8 encodings and so a wrongly sized search region. */
		size_t needle_len = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(needle_u8), (unsigned char*)ZSTR_VAL(needle_u8) + ZSTR_LEN(needle_u8));
		offset_pointer = offset_to_pointer_utf8(offset_pointer, (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8), needle_len);
		if (!offset_pointer) {
			/* Fewer than `needle_len` characters follow; search up to the very end */
			offset_pointer = (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8);
		}

		found_pos = zend_memnrstr(ZSTR_VAL(haystack_u8), ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)offset_pointer);
	}

	if (found_pos) {
		/* Convert the byte position of the match back into a character index */
		result = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(haystack_u8), (unsigned char*)found_pos);
	}

out:
	/* Only free the strings if they are temporary UTF-8 copies made above */
	if (haystack_u8 != haystack) {
		zend_string_free(haystack_u8);
	}
	if (needle_u8 != needle) {
		zend_string_free(needle_u8);
	}
	return result;
}
1954
handle_strpos_error(size_t error)1955 static void handle_strpos_error(size_t error) {
1956 switch (error) {
1957 case MBFL_ERROR_NOT_FOUND:
1958 break;
1959 case MBFL_ERROR_ENCODING:
1960 php_error_docref(NULL, E_WARNING, "Conversion error");
1961 break;
1962 case MBFL_ERROR_OFFSET:
1963 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1964 break;
1965 default:
1966 zend_value_error("mb_strpos(): Unknown error");
1967 break;
1968 }
1969 }
1970
PHP_FUNCTION(mb_strpos)1971 PHP_FUNCTION(mb_strpos)
1972 {
1973 zend_long offset = 0;
1974 zend_string *needle, *haystack;
1975 zend_string *enc_name = NULL;
1976
1977 ZEND_PARSE_PARAMETERS_START(2, 4)
1978 Z_PARAM_STR(haystack)
1979 Z_PARAM_STR(needle)
1980 Z_PARAM_OPTIONAL
1981 Z_PARAM_LONG(offset)
1982 Z_PARAM_STR_OR_NULL(enc_name)
1983 ZEND_PARSE_PARAMETERS_END();
1984
1985 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
1986 if (!enc) {
1987 RETURN_THROWS();
1988 }
1989
1990 size_t n = mb_find_strpos(haystack, needle, enc, offset, false);
1991 if (!mbfl_is_error(n)) {
1992 RETVAL_LONG(n);
1993 } else {
1994 handle_strpos_error(n);
1995 RETVAL_FALSE;
1996 }
1997 }
1998
1999 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)2000 PHP_FUNCTION(mb_strrpos)
2001 {
2002 zend_long offset = 0;
2003 zend_string *needle, *haystack;
2004 zend_string *enc_name = NULL;
2005
2006 ZEND_PARSE_PARAMETERS_START(2, 4)
2007 Z_PARAM_STR(haystack)
2008 Z_PARAM_STR(needle)
2009 Z_PARAM_OPTIONAL
2010 Z_PARAM_LONG(offset)
2011 Z_PARAM_STR_OR_NULL(enc_name)
2012 ZEND_PARSE_PARAMETERS_END();
2013
2014 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
2015 if (!enc) {
2016 RETURN_THROWS();
2017 }
2018
2019 size_t n = mb_find_strpos(haystack, needle, enc, offset, true);
2020 if (!mbfl_is_error(n)) {
2021 RETVAL_LONG(n);
2022 } else {
2023 handle_strpos_error(n);
2024 RETVAL_FALSE;
2025 }
2026 }
2027 /* }}} */
2028
2029 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)2030 PHP_FUNCTION(mb_stripos)
2031 {
2032 zend_long offset = 0;
2033 zend_string *haystack, *needle;
2034 zend_string *from_encoding = NULL;
2035
2036 ZEND_PARSE_PARAMETERS_START(2, 4)
2037 Z_PARAM_STR(haystack)
2038 Z_PARAM_STR(needle)
2039 Z_PARAM_OPTIONAL
2040 Z_PARAM_LONG(offset)
2041 Z_PARAM_STR_OR_NULL(from_encoding)
2042 ZEND_PARSE_PARAMETERS_END();
2043
2044 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2045 if (!enc) {
2046 RETURN_THROWS();
2047 }
2048
2049 size_t n = php_mb_stripos(false, haystack, needle, offset, enc);
2050
2051 if (!mbfl_is_error(n)) {
2052 RETVAL_LONG(n);
2053 } else {
2054 handle_strpos_error(n);
2055 RETVAL_FALSE;
2056 }
2057 }
2058 /* }}} */
2059
2060 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)2061 PHP_FUNCTION(mb_strripos)
2062 {
2063 zend_long offset = 0;
2064 zend_string *haystack, *needle;
2065 zend_string *from_encoding = NULL;
2066
2067 ZEND_PARSE_PARAMETERS_START(2, 4)
2068 Z_PARAM_STR(haystack)
2069 Z_PARAM_STR(needle)
2070 Z_PARAM_OPTIONAL
2071 Z_PARAM_LONG(offset)
2072 Z_PARAM_STR_OR_NULL(from_encoding)
2073 ZEND_PARSE_PARAMETERS_END();
2074
2075 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2076 if (!enc) {
2077 RETURN_THROWS();
2078 }
2079
2080 size_t n = php_mb_stripos(true, haystack, needle, offset, enc);
2081
2082 if (!mbfl_is_error(n)) {
2083 RETVAL_LONG(n);
2084 } else {
2085 handle_strpos_error(n);
2086 RETVAL_FALSE;
2087 }
2088 }
2089 /* }}} */
2090
mb_get_substr_slow(unsigned char * in,size_t in_len,size_t from,size_t len,const mbfl_encoding * enc)2091 static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t from, size_t len, const mbfl_encoding *enc)
2092 {
2093 uint32_t wchar_buf[128];
2094 unsigned int state = 0;
2095
2096 mb_convert_buf buf;
2097 mb_convert_buf_init(&buf, MIN(len, in_len - from), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2098
2099 while (in_len && len) {
2100 size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2101 ZEND_ASSERT(out_len <= 128);
2102
2103 if (from >= out_len) {
2104 from -= out_len;
2105 } else {
2106 size_t needed_codepoints = MIN(out_len - from, len);
2107 enc->from_wchar(wchar_buf + from, needed_codepoints, &buf, !in_len || out_len >= len);
2108 from = 0;
2109 len -= needed_codepoints;
2110 }
2111 }
2112
2113 return mb_convert_buf_result(&buf, enc);
2114 }
2115
/* Return the substring of `input` which starts `from` characters in and is at most
 * `len` characters long (pass MBFL_SUBSTR_UNTIL_END for "everything up to the end").
 *
 * For encodings flagged as fixed-width the bytes are sliced out directly, without
 * decoding; everything else goes through mb_get_substr_slow. */
static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
		/* Other than MacJapanese, no supported text encoding decodes to
		 * more than one codepoint per byte
		 * So if the number of codepoints to skip >= number of input bytes,
		 * then definitely the output should be empty */
		return zend_empty_string;
	}

	/* Does each codepoint have a fixed byte width? */
	unsigned int flag = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (flag) {
		/* The value of the flag is 2 if each codepoint takes 2 bytes, or 4 if 4 bytes.
		 * Character counts must be scaled by it to get byte counts, but `from` and `len`
		 * can be arbitrarily large, so the products could wrap around. All comparisons
		 * are therefore done by dividing the byte length instead. */

		/* Number of characters in the input, counting a trailing partial one;
		 * `from >= total_chars` is the same condition as `from * flag >= in_len` */
		size_t total_chars = in_len / flag + (in_len % flag != 0);
		if (from >= total_chars) {
			return zend_empty_string;
		}

		size_t from_bytes = from * flag; /* Now known to be less than in_len */
		in += from_bytes;
		in_len -= from_bytes;

		/* `len > in_len / flag` is the same condition as `len * flag > in_len`;
		 * in that case take everything which remains, including a trailing partial
		 * character if there is one */
		size_t out_bytes = (len > in_len / flag) ? in_len : len * flag;
		return zend_string_init_fast((const char*)in, out_bytes);
	}

	return mb_get_substr_slow(in, in_len, from, len, enc);
}
2148
2149 #define MB_STRSTR 1
2150 #define MB_STRRCHR 2
2151 #define MB_STRISTR 3
2152 #define MB_STRRICHR 4
2153
php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS,unsigned int variant)2154 static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
2155 {
2156 bool reverse_mode = false, part = false;
2157 size_t n;
2158 zend_string *haystack, *needle;
2159 zend_string *encoding_name = NULL;
2160
2161 ZEND_PARSE_PARAMETERS_START(2, 4)
2162 Z_PARAM_STR(haystack)
2163 Z_PARAM_STR(needle)
2164 Z_PARAM_OPTIONAL
2165 Z_PARAM_BOOL(part)
2166 Z_PARAM_STR_OR_NULL(encoding_name)
2167 ZEND_PARSE_PARAMETERS_END();
2168
2169 const mbfl_encoding *enc = php_mb_get_encoding(encoding_name, 4);
2170 if (!enc) {
2171 RETURN_THROWS();
2172 }
2173
2174 if (variant == MB_STRRCHR || variant == MB_STRRICHR) {
2175 reverse_mode = true;
2176 }
2177
2178 if (variant == MB_STRISTR || variant == MB_STRRICHR) {
2179 n = php_mb_stripos(reverse_mode, haystack, needle, 0, enc);
2180 } else {
2181 n = mb_find_strpos(haystack, needle, enc, 0, reverse_mode);
2182 }
2183
2184 if (!mbfl_is_error(n)) {
2185 if (part) {
2186 RETVAL_STR(mb_get_substr(haystack, 0, n, enc));
2187 } else {
2188 RETVAL_STR(mb_get_substr(haystack, n, MBFL_SUBSTR_UNTIL_END, enc));
2189 }
2190 } else {
2191 // FIXME use handle_strpos_error(n)
2192 RETVAL_FALSE;
2193 }
2194 }
2195
2196 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(mb_strstr)2197 PHP_FUNCTION(mb_strstr)
2198 {
2199 php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
2200 }
2201 /* }}} */
2202
2203 /* {{{ Finds the last occurrence of a character in a string within another */
PHP_FUNCTION(mb_strrchr)2204 PHP_FUNCTION(mb_strrchr)
2205 {
2206 php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
2207 }
2208 /* }}} */
2209
2210 /* {{{ Finds first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stristr)2211 PHP_FUNCTION(mb_stristr)
2212 {
2213 php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
2214 }
2215 /* }}} */
2216
2217 /* {{{ Finds the last occurrence of a character in a string within another, case insensitive */
PHP_FUNCTION(mb_strrichr)2218 PHP_FUNCTION(mb_strrichr)
2219 {
2220 php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
2221 }
2222 /* }}} */
2223
2224 #undef MB_STRSTR
2225 #undef MB_STRRCHR
2226 #undef MB_STRISTR
2227 #undef MB_STRRICHR
2228
PHP_FUNCTION(mb_substr_count)2229 PHP_FUNCTION(mb_substr_count)
2230 {
2231 zend_string *haystack, *needle, *enc_name = NULL, *haystack_u8 = NULL, *needle_u8 = NULL;
2232
2233 ZEND_PARSE_PARAMETERS_START(2, 3)
2234 Z_PARAM_STR(haystack)
2235 Z_PARAM_STR(needle)
2236 Z_PARAM_OPTIONAL
2237 Z_PARAM_STR_OR_NULL(enc_name)
2238 ZEND_PARSE_PARAMETERS_END();
2239
2240 if (ZSTR_LEN(needle) == 0) {
2241 zend_argument_must_not_be_empty_error(2);
2242 RETURN_THROWS();
2243 }
2244
2245 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
2246 if (!enc) {
2247 RETURN_THROWS();
2248 }
2249
2250 if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
2251 /* No need to do any conversion if haystack/needle are already known-valid UTF-8
2252 * (If they are not valid, then not passing them through conversion filters could affect output) */
2253 if (ZSTR_IS_VALID_UTF8(haystack)) {
2254 haystack_u8 = haystack;
2255 } else {
2256 unsigned int num_errors = 0;
2257 haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2258 if (!num_errors && !ZSTR_IS_INTERNED(haystack)) {
2259 GC_ADD_FLAGS(haystack, IS_STR_VALID_UTF8);
2260 }
2261 }
2262
2263 if (ZSTR_IS_VALID_UTF8(needle)) {
2264 needle_u8 = needle;
2265 } else {
2266 unsigned int num_errors = 0;
2267 needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2268 if (!num_errors && !ZSTR_IS_INTERNED(needle)) {
2269 GC_ADD_FLAGS(needle, IS_STR_VALID_UTF8);
2270 }
2271 }
2272 } else {
2273 unsigned int num_errors = 0;
2274 haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2275 needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2276 /* A string with >0 bytes may convert to 0 codepoints; for example, the contents
2277 * may be only escape sequences */
2278 if (ZSTR_LEN(needle_u8) == 0) {
2279 zend_string_free(haystack_u8);
2280 zend_string_free(needle_u8);
2281 zend_argument_must_not_be_empty_error(2);
2282 RETURN_THROWS();
2283 }
2284 }
2285
2286 size_t result = 0;
2287
2288 if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
2289 goto out;
2290 }
2291
2292 const char *p = ZSTR_VAL(haystack_u8), *e = p + ZSTR_LEN(haystack_u8);
2293 while (true) {
2294 p = zend_memnstr(p, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), e);
2295 if (!p) {
2296 break;
2297 }
2298 p += ZSTR_LEN(needle_u8);
2299 result++;
2300 }
2301
2302 out:
2303 if (haystack_u8 != haystack) {
2304 zend_string_free(haystack_u8);
2305 }
2306 if (needle_u8 != needle) {
2307 zend_string_free(needle_u8);
2308 }
2309
2310 RETVAL_LONG(result);
2311 }
2312
2313 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2314 PHP_FUNCTION(mb_substr)
2315 {
2316 zend_string *str, *encoding = NULL;
2317 zend_long from, len;
2318 size_t real_from, real_len;
2319 bool len_is_null = true;
2320
2321 ZEND_PARSE_PARAMETERS_START(2, 4)
2322 Z_PARAM_STR(str)
2323 Z_PARAM_LONG(from)
2324 Z_PARAM_OPTIONAL
2325 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2326 Z_PARAM_STR_OR_NULL(encoding)
2327 ZEND_PARSE_PARAMETERS_END();
2328
2329 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2330 if (!enc) {
2331 RETURN_THROWS();
2332 }
2333
2334 size_t mblen = 0;
2335 if (from < 0 || (!len_is_null && len < 0)) {
2336 mblen = mb_get_strlen(str, enc);
2337 }
2338
2339 /* if "from" position is negative, count start position from the end
2340 * of the string */
2341 if (from >= 0) {
2342 real_from = (size_t) from;
2343 } else if (-from < mblen) {
2344 real_from = mblen + from;
2345 } else {
2346 real_from = 0;
2347 }
2348
2349 /* if "length" position is negative, set it to the length
2350 * needed to stop that many chars from the end of the string */
2351 if (len_is_null) {
2352 real_len = MBFL_SUBSTR_UNTIL_END;
2353 } else if (len >= 0) {
2354 real_len = (size_t) len;
2355 } else if (real_from < mblen && -len < mblen - real_from) {
2356 real_len = (mblen - real_from) + len;
2357 } else {
2358 real_len = 0;
2359 }
2360
2361 RETVAL_STR(mb_get_substr(str, real_from, real_len, enc));
2362 }
2363 /* }}} */
2364
2365 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2366 PHP_FUNCTION(mb_strcut)
2367 {
2368 zend_string *encoding = NULL;
2369 char *string_val;
2370 zend_long from, len;
2371 bool len_is_null = true;
2372 mbfl_string string, result, *ret;
2373
2374 ZEND_PARSE_PARAMETERS_START(2, 4)
2375 Z_PARAM_STRING(string_val, string.len)
2376 Z_PARAM_LONG(from)
2377 Z_PARAM_OPTIONAL
2378 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2379 Z_PARAM_STR_OR_NULL(encoding)
2380 ZEND_PARSE_PARAMETERS_END();
2381
2382 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2383 if (!enc) {
2384 RETURN_THROWS();
2385 }
2386
2387 string.val = (unsigned char*)string_val;
2388 string.encoding = enc;
2389
2390 if (len_is_null) {
2391 len = string.len;
2392 }
2393
2394 /* if "from" position is negative, count start position from the end
2395 * of the string */
2396 if (from < 0) {
2397 from = string.len + from;
2398 if (from < 0) {
2399 from = 0;
2400 }
2401 }
2402
2403 /* if "length" position is negative, set it to the length
2404 * needed to stop that many chars from the end of the string */
2405 if (len < 0) {
2406 len = (string.len - from) + len;
2407 if (len < 0) {
2408 len = 0;
2409 }
2410 }
2411
2412 if (from > string.len || len == 0) {
2413 RETURN_EMPTY_STRING();
2414 }
2415
2416 if (enc->cut) {
2417 RETURN_STR(enc->cut(string.val, from, len, string.val + string.len));
2418 }
2419
2420 unsigned int char_len = string.encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
2421 if (char_len) {
2422 /* Round `from` down to a multiple of `char_len`; works because `char_len` is a power of 2 */
2423 from &= -char_len;
2424 if (len > string.len - from) {
2425 len = string.len - from;
2426 }
2427 RETURN_STR(zend_string_init_fast((const char*)(string.val + from), len & -char_len));
2428 }
2429
2430 if (enc->mblen_table) {
2431 const unsigned char *mbtab = enc->mblen_table;
2432 const unsigned char *p, *q, *end;
2433 int m = 0;
2434 /* Search for start position */
2435 for (p = (const unsigned char*)string.val, q = p + from; p < q; p += (m = mbtab[*p]));
2436 if (p > q) {
2437 p -= m;
2438 }
2439 const unsigned char *start = p;
2440 /* Search for end position */
2441 if (len >= string.len - (start - (const unsigned char*)string.val)) {
2442 end = (const unsigned char*)(string.val + string.len);
2443 } else {
2444 for (q = p + len; p < q; p += (m = mbtab[*p]));
2445 if (p > q) {
2446 p -= m;
2447 }
2448 end = p;
2449 }
2450 RETURN_STR(zend_string_init_fast((const char*)start, end - start));
2451 }
2452
2453 ret = mbfl_strcut(&string, &result, from, len);
2454 ZEND_ASSERT(ret != NULL);
2455 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2456 efree(ret->val);
2457 }
2458 /* }}} */
2459
2460 /* Some East Asian characters, when printed at a terminal (or the like), require double
2461 * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2462 static size_t character_width(uint32_t c)
2463 {
2464 if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2465 return 1;
2466 }
2467
2468 /* Do a binary search to see if we fall in any of the fullwidth ranges */
2469 unsigned int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2470 while (lo < hi) {
2471 unsigned int probe = (lo + hi) / 2;
2472 if (c < mbfl_eaw_table[probe].begin) {
2473 hi = probe;
2474 } else if (c > mbfl_eaw_table[probe].end) {
2475 lo = probe + 1;
2476 } else {
2477 return 2;
2478 }
2479 }
2480
2481 return 1;
2482 }
2483
/* Total display width of `string`, decoded as `enc`: the sum of character_width()
 * over every codepoint which the decoder produces. */
static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
{
	uint32_t wchar_buf[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int decoder_state = 0;
	size_t total_width = 0;

	while (bytes_left > 0) {
		size_t decoded = enc->to_wchar(&cursor, &bytes_left, wchar_buf, 128, &decoder_state);
		ZEND_ASSERT(decoded <= 128);

		for (size_t i = 0; i < decoded; i++) {
			/* NOTE: a 'bad input' marker is counted as 1 unit of width. If the text is
			 * later converted using an ordinary ASCII character as the replacement
			 * character, that gives the correct display width. */
			total_width += character_width(wchar_buf[i]);
		}
	}

	return total_width;
}
2506
2507 /* Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2508 PHP_FUNCTION(mb_strwidth)
2509 {
2510 zend_string *string, *enc_name = NULL;
2511
2512 ZEND_PARSE_PARAMETERS_START(1, 2)
2513 Z_PARAM_STR(string)
2514 Z_PARAM_OPTIONAL
2515 Z_PARAM_STR_OR_NULL(enc_name)
2516 ZEND_PARSE_PARAMETERS_END();
2517
2518 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2519 if (!enc) {
2520 RETURN_THROWS();
2521 }
2522
2523 RETVAL_LONG(mb_get_strwidth(string, enc));
2524 }
2525
/* Cut `input` down so that it fits in `width` columns of display width.
 *
 * input  - the string to trim, in encoding `enc`
 * marker - appended when something had to be cut off; its own width counts toward `width`
 * from   - number of leading codepoints of `input` to skip
 * width  - available display width
 *
 * If the text after `from` fits, it is returned without the marker. If it does not fit
 * and the marker alone is at least as wide as `width`, the marker is returned by itself.
 * Otherwise the result is as much text as fits in (width - marker width), followed by
 * the marker. */
static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, size_t from, size_t width)
{
	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);
	unsigned int state = 0;
	size_t width_budget = width;
	size_t skip_count = from;
	size_t out_len = 0;
	bool first_batch = true, saw_bad_input = false;
	mb_convert_buf buf;

	/* First pass: walk over the codepoints after `from`, only to find out whether
	 * they fit into `width`. No output is produced yet. */
	while (in_len) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

		if (out_len <= skip_count) {
			skip_count -= out_len;
		} else {
			for (size_t i = skip_count; i < out_len; i++) {
				uint32_t w = wchar_buf[i];
				size_t codepoint_width = character_width(w);

				if (w == MBFL_BAD_INPUT) {
					saw_bad_input = true;
				}

				if (width_budget < codepoint_width) {
					/* This codepoint doesn't fit, so the string has to be truncated */
					size_t marker_width = mb_get_strwidth(marker, enc);

					/* There is no room for anything besides the trim marker */
					if (width <= marker_width) {
						return zend_string_copy(marker);
					}

					/* Reserve room for the marker; from here on, 'width' is the
					 * amount which we want to take from 'input' */
					width -= marker_width;
					mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

					if (!first_batch) {
						/* Earlier batches of codepoints are gone; decode again from the start */
						goto restart_conversion;
					}
					/* wchar_buf still holds the very first batch, which is exactly
					 * where the second pass would begin; reuse it */
					goto dont_restart_conversion;
				}
				width_budget -= codepoint_width;
			}
			skip_count = 0;
		}
		first_batch = false;
	}

	/* Everything fits in the requested width, so no trim marker is needed */
	if (saw_bad_input) {
		/* Erroneous byte sequences should come out as error markers. `mb_get_substr`
		 * can't be relied on for that, because it picks out the substring by the
		 * fastest method available, which may not involve converting the text. */
		return mb_get_substr_slow((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), from, MBFL_SUBSTR_UNTIL_END, enc);
	}
	if (from == 0) {
		/* Hand back the input itself; this only increments its refcount */
		return zend_string_copy(input);
	}
	return mb_get_substr(input, from, MBFL_SUBSTR_UNTIL_END, enc);

	/* Second pass (only reachable by the jumps above): the input is too wide, so build
	 * a new string from the portion of the input which fits, then add the trim marker */
restart_conversion:
	in = (unsigned char*)ZSTR_VAL(input);
	in_len = ZSTR_LEN(input);
	state = 0;

	while (true) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

dont_restart_conversion:
		if (out_len <= from) {
			from -= out_len;
		} else {
			for (size_t i = from; i < out_len; i++) {
				size_t codepoint_width = character_width(wchar_buf[i]);
				if (width < codepoint_width) {
					/* Codepoint `i` is the first which doesn't fit; emit those
					 * before it and finish the conversion */
					enc->from_wchar(wchar_buf + from, i - from, &buf, true);
					goto append_trim_marker;
				}
				width -= codepoint_width;
			}
			/* The first pass established that the text doesn't fit, so there
			 * must be more input still to come */
			ZEND_ASSERT(in_len > 0);
			enc->from_wchar(wchar_buf + from, out_len - from, &buf, false);
			from = 0;
		}
	}

append_trim_marker:
	if (ZSTR_LEN(marker) > 0) {
		MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
		buf.out = zend_mempcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
	}

	/* The output is deliberately not marked as valid UTF-8, even if `enc` is UTF-8:
	 * the trim marker was copied in as raw bytes, and nothing guarantees that it
	 * is valid UTF-8 */
	return mb_convert_buf_result_raw(&buf);
}
2636
2637 /* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
PHP_FUNCTION(mb_strimwidth)2638 PHP_FUNCTION(mb_strimwidth)
2639 {
2640 zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2641 zend_long from, width;
2642
2643 ZEND_PARSE_PARAMETERS_START(3, 5)
2644 Z_PARAM_STR(str)
2645 Z_PARAM_LONG(from)
2646 Z_PARAM_LONG(width)
2647 Z_PARAM_OPTIONAL
2648 Z_PARAM_STR(trimmarker)
2649 Z_PARAM_STR_OR_NULL(encoding)
2650 ZEND_PARSE_PARAMETERS_END();
2651
2652 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2653 if (!enc) {
2654 RETURN_THROWS();
2655 }
2656
2657 if (from != 0) {
2658 size_t str_len = mb_get_strlen(str, enc);
2659 if (from < 0) {
2660 from += str_len;
2661 }
2662 if (from < 0 || from > str_len) {
2663 zend_argument_value_error(2, "is out of range");
2664 RETURN_THROWS();
2665 }
2666 }
2667
2668 if (width < 0) {
2669 php_error_docref(NULL, E_DEPRECATED,
2670 "passing a negative integer to argument #3 ($width) is deprecated");
2671 width += mb_get_strwidth(str, enc);
2672
2673 if (from > 0) {
2674 zend_string *trimmed = mb_get_substr(str, 0, from, enc);
2675 width -= mb_get_strwidth(trimmed, enc);
2676 zend_string_free(trimmed);
2677 }
2678
2679 if (width < 0) {
2680 zend_argument_value_error(3, "is out of range");
2681 RETURN_THROWS();
2682 }
2683 }
2684
2685 RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2686 }
2687
2688
2689 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2690 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2691 {
2692 return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2693 || (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2694 || (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2695 || (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2696 }
2697
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2698 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2699 {
2700 unsigned int num_errors = 0;
2701 zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2702 MBSTRG(illegalchars) += num_errors;
2703 return result;
2704 }
2705
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2706 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2707 {
2708 const mbfl_encoding *from_encoding;
2709
2710 /* pre-conversion encoding */
2711 ZEND_ASSERT(num_from_encodings >= 1);
2712 if (num_from_encodings == 1) {
2713 from_encoding = *from_encodings;
2714 } else {
2715 /* auto detect */
2716 from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true);
2717 if (!from_encoding) {
2718 php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2719 return NULL;
2720 }
2721 }
2722
2723 return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2724 }
2725
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2726 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2727 {
2728 HashTable *output, *chash;
2729 zend_long idx;
2730 zend_string *key;
2731 zval *entry, entry_tmp;
2732
2733 if (!input) {
2734 return NULL;
2735 }
2736
2737 if (GC_IS_RECURSIVE(input)) {
2738 GC_UNPROTECT_RECURSION(input);
2739 php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2740 return NULL;
2741 }
2742 GC_TRY_PROTECT_RECURSION(input);
2743 output = zend_new_array(zend_hash_num_elements(input));
2744 ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2745 /* convert key */
2746 if (key) {
2747 zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2748 if (!converted_key) {
2749 continue;
2750 }
2751 key = converted_key;
2752 }
2753 /* convert value */
2754 ZEND_ASSERT(entry);
2755 try_again:
2756 switch(Z_TYPE_P(entry)) {
2757 case IS_STRING: {
2758 zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2759 if (!converted_key) {
2760 if (key) {
2761 zend_string_release(key);
2762 }
2763 continue;
2764 }
2765 ZVAL_STR(&entry_tmp, converted_key);
2766 break;
2767 }
2768 case IS_NULL:
2769 case IS_TRUE:
2770 case IS_FALSE:
2771 case IS_LONG:
2772 case IS_DOUBLE:
2773 ZVAL_COPY(&entry_tmp, entry);
2774 break;
2775 case IS_ARRAY:
2776 chash = php_mb_convert_encoding_recursive(
2777 Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2778 if (chash) {
2779 ZVAL_ARR(&entry_tmp, chash);
2780 } else {
2781 ZVAL_EMPTY_ARRAY(&entry_tmp);
2782 }
2783 break;
2784 case IS_REFERENCE:
2785 entry = Z_REFVAL_P(entry);
2786 goto try_again;
2787 case IS_OBJECT:
2788 default:
2789 if (key) {
2790 zend_string_release(key);
2791 }
2792 php_error_docref(NULL, E_WARNING, "Object is not supported");
2793 continue;
2794 }
2795 if (key) {
2796 zend_hash_add(output, key, &entry_tmp);
2797 zend_string_release(key);
2798 } else {
2799 zend_hash_index_add(output, idx, &entry_tmp);
2800 }
2801 } ZEND_HASH_FOREACH_END();
2802 GC_TRY_UNPROTECT_RECURSION(input);
2803
2804 return output;
2805 }
2806 /* }}} */
2807
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2808 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2809 {
2810 /* mbstring supports some 'text encodings' which aren't really text encodings
2811 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2812 * These should never be returned by `mb_detect_encoding`. */
2813 unsigned int shift = 0;
2814 for (unsigned int i = 0; i < *size; i++) {
2815 const mbfl_encoding *encoding = elist[i];
2816 if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2817 shift++; /* Remove this encoding from the list */
2818 } else if (shift) {
2819 elist[i - shift] = encoding;
2820 }
2821 }
2822 *size -= shift;
2823 }
2824
2825 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2826 PHP_FUNCTION(mb_convert_encoding)
2827 {
2828 zend_string *to_encoding_name;
2829 zend_string *input_str, *from_encodings_str = NULL;
2830 HashTable *input_ht, *from_encodings_ht = NULL;
2831 const mbfl_encoding **from_encodings;
2832 size_t num_from_encodings;
2833 bool free_from_encodings = false;
2834
2835 ZEND_PARSE_PARAMETERS_START(2, 3)
2836 Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2837 Z_PARAM_STR(to_encoding_name)
2838 Z_PARAM_OPTIONAL
2839 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2840 ZEND_PARSE_PARAMETERS_END();
2841
2842 const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2843 if (!to_encoding) {
2844 RETURN_THROWS();
2845 }
2846
2847 if (from_encodings_ht) {
2848 if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2849 RETURN_THROWS();
2850 }
2851 free_from_encodings = true;
2852 } else if (from_encodings_str) {
2853 if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2854 &from_encodings, &num_from_encodings,
2855 /* persistent */ 0, /* arg_num */ 3) == FAILURE) {
2856 RETURN_THROWS();
2857 }
2858 free_from_encodings = true;
2859 } else {
2860 from_encodings = &MBSTRG(current_internal_encoding);
2861 num_from_encodings = 1;
2862 }
2863
2864 if (num_from_encodings > 1) {
2865 remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2866 }
2867
2868 if (!num_from_encodings) {
2869 efree(ZEND_VOIDP(from_encodings));
2870 zend_argument_value_error(3, "must specify at least one encoding");
2871 RETURN_THROWS();
2872 }
2873
2874 if (input_str) {
2875 zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2876 if (ret != NULL) {
2877 RETVAL_STR(ret);
2878 } else {
2879 RETVAL_FALSE;
2880 }
2881 } else {
2882 HashTable *tmp;
2883 tmp = php_mb_convert_encoding_recursive(
2884 input_ht, to_encoding, from_encodings, num_from_encodings);
2885 RETVAL_ARR(tmp);
2886 }
2887
2888 if (free_from_encodings) {
2889 efree(ZEND_VOIDP(from_encodings));
2890 }
2891 }
2892 /* }}} */
2893
mbstring_convert_case(php_case_mode case_mode,const char * str,size_t str_len,const mbfl_encoding * enc)2894 static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
2895 {
2896 return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2897 }
2898
PHP_FUNCTION(mb_convert_case)2899 PHP_FUNCTION(mb_convert_case)
2900 {
2901 zend_string *str, *from_encoding = NULL;
2902 zend_long case_mode = 0;
2903
2904 ZEND_PARSE_PARAMETERS_START(2, 3)
2905 Z_PARAM_STR(str)
2906 Z_PARAM_LONG(case_mode)
2907 Z_PARAM_OPTIONAL
2908 Z_PARAM_STR_OR_NULL(from_encoding)
2909 ZEND_PARSE_PARAMETERS_END();
2910
2911 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2912 if (!enc) {
2913 RETURN_THROWS();
2914 }
2915
2916 if (case_mode < 0 || case_mode >= PHP_UNICODE_CASE_MODE_MAX) {
2917 zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2918 RETURN_THROWS();
2919 }
2920
2921 RETURN_STR(mbstring_convert_case(case_mode, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2922 }
2923
PHP_FUNCTION(mb_strtoupper)2924 PHP_FUNCTION(mb_strtoupper)
2925 {
2926 zend_string *str, *from_encoding = NULL;
2927
2928 ZEND_PARSE_PARAMETERS_START(1, 2)
2929 Z_PARAM_STR(str)
2930 Z_PARAM_OPTIONAL
2931 Z_PARAM_STR_OR_NULL(from_encoding)
2932 ZEND_PARSE_PARAMETERS_END();
2933
2934 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2935 if (!enc) {
2936 RETURN_THROWS();
2937 }
2938
2939 RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_UPPER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2940 }
2941
PHP_FUNCTION(mb_strtolower)2942 PHP_FUNCTION(mb_strtolower)
2943 {
2944 zend_string *str, *from_encoding = NULL;
2945
2946 ZEND_PARSE_PARAMETERS_START(1, 2)
2947 Z_PARAM_STR(str)
2948 Z_PARAM_OPTIONAL
2949 Z_PARAM_STR_OR_NULL(from_encoding)
2950 ZEND_PARSE_PARAMETERS_END();
2951
2952 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2953 if (!enc) {
2954 RETURN_THROWS();
2955 }
2956
2957 RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2958 }
2959
/* Shared implementation of mb_ucfirst() and mb_lcfirst().
 *
 * Takes the first character of the string, case-converts it with `mode`, and
 * returns it joined to the rest of the string. When the converted first
 * character is identical to the original one, the input string itself is
 * returned (with an added reference) instead of building a new string. */
static void php_mb_ulcfirst(INTERNAL_FUNCTION_PARAMETERS, php_case_mode mode)
{
	zend_string *str, *from_encoding = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 2)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR_OR_NULL(from_encoding)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	zend_string *first = mb_get_substr(str, 0, 1, enc);
	zend_string *head = mbstring_convert_case(mode, ZSTR_VAL(first), ZSTR_LEN(first), enc);
	zend_string *result;

	if (zend_string_equals(first, head)) {
		/* Nothing changed: hand back the input unchanged */
		result = zend_string_copy(str);
	} else {
		zend_string *rest = mb_get_substr(str, 1, MBFL_SUBSTR_UNTIL_END, enc);
		result = zend_string_concat2(ZSTR_VAL(head), ZSTR_LEN(head), ZSTR_VAL(rest), ZSTR_LEN(rest));
		zend_string_release_ex(rest, false);
	}

	zend_string_release_ex(first, false);
	zend_string_release_ex(head, false);

	RETVAL_STR(result);
}
2993
PHP_FUNCTION(mb_ucfirst)2994 PHP_FUNCTION(mb_ucfirst)
2995 {
2996 php_mb_ulcfirst(INTERNAL_FUNCTION_PARAM_PASSTHRU, PHP_UNICODE_CASE_TITLE);
2997 }
2998
PHP_FUNCTION(mb_lcfirst)2999 PHP_FUNCTION(mb_lcfirst)
3000 {
3001 php_mb_ulcfirst(INTERNAL_FUNCTION_PARAM_PASSTHRU, PHP_UNICODE_CASE_LOWER);
3002 }
3003
/* Which side(s) of a string to trim. The values are bit flags so that callers
 * can test each side with `mode & MB_LTRIM` / `mode & MB_RTRIM`. */
typedef enum {
	MB_LTRIM     = 1 << 0,              /* trim leading characters */
	MB_RTRIM     = 1 << 1,              /* trim trailing characters */
	MB_BOTH_TRIM = MB_LTRIM | MB_RTRIM  /* trim both ends */
} mb_trim_mode;
3009
/* Decide whether codepoint `w` belongs to the set of characters to trim.
 * If `ht` is given, membership is an index lookup in that hash table;
 * otherwise the `default_chars` array (of `default_chars_length` entries)
 * is scanned linearly. */
static bool is_trim_wchar(uint32_t w, const HashTable *ht, const uint32_t *default_chars, size_t default_chars_length)
{
	if (ht != NULL) {
		return zend_hash_index_exists(ht, w);
	}

	size_t pos = 0;
	while (pos < default_chars_length) {
		if (default_chars[pos] == w) {
			return true;
		}
		pos++;
	}
	return false;
}
3023
/* Trim `str` on the side(s) selected by `mode`.
 *
 * The string is decoded to codepoints in batches of up to 128. `left` counts
 * the run of trimmable codepoints at the start; `right` counts the run of
 * trimmable codepoints seen since the last non-trimmable one, i.e. the run at
 * the end once decoding finishes. Trimmable codepoints are those accepted by
 * is_trim_wchar() for `what_ht` / `default_chars`.
 *
 * Returns `str` itself (with an added reference) when nothing is trimmed,
 * otherwise the substring produced by mb_get_substr(). */
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, const uint32_t *default_chars, size_t default_chars_length, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
	size_t in_len = ZSTR_LEN(str);
	uint32_t wchar_buf[128];
	unsigned int state = 0;
	size_t left = 0;
	size_t right = 0;
	size_t total_len = 0;

	while (in_len) {
		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);
		total_len += out_len;

		for (size_t i = 0; i < out_len; i++) {
			uint32_t w = wchar_buf[i];
			if (is_trim_wchar(w, what_ht, default_chars, default_chars_length)) {
				if (mode & MB_LTRIM) {
					left += 1;
				}
				if (mode & MB_RTRIM) {
					right += 1;
				}
			} else {
				/* First non-trimmable codepoint ends the leading run for good */
				mode &= ~MB_LTRIM;
				if (mode & MB_RTRIM) {
					right = 0;
				}
			}
		}
	}

	if (left == 0 && right == 0) {
		return zend_string_copy(str);
	}

	/* When trimming both sides of a string made up entirely of trimmable
	 * codepoints, left == right == total_len, so subtracting their sum from
	 * total_len would wrap around. Clamp the remaining length to zero. */
	size_t trimmed = left + right;
	size_t remaining = (trimmed < total_len) ? total_len - trimmed : 0;
	return mb_get_substr(str, left, remaining, enc);
}
3063
/* Trim `str` using the built-in set of codepoints listed below. The set is
 * loaded into a temporary hash table (keyed by codepoint) which is handed to
 * trim_each_wchar() and destroyed afterwards. */
static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
{
	const uint32_t default_set[] = {
		0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
		0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
		0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
		0x85, 0x180E
	};
	const size_t default_set_len = sizeof(default_set) / sizeof(default_set[0]);

	/* Every entry stores the same dummy value; only the keys matter */
	zval marker;
	ZVAL_TRUE(&marker);

	HashTable lookup;
	zend_hash_init(&lookup, default_set_len, NULL, NULL, false);
	for (const uint32_t *cp = default_set; cp < default_set + default_set_len; cp++) {
		zend_hash_index_add_new(&lookup, *cp, &marker);
	}

	zend_string *trimmed = trim_each_wchar(str, &lookup, NULL, 0, mode, enc);
	zend_hash_destroy(&lookup);

	return trimmed;
}
3088
/* Trim `str` using the characters contained in `what` (decoded with `enc`).
 *
 * `what` is decoded in batches of up to 128 codepoints. If the first batch
 * holds at most 4 codepoints, they are passed to trim_each_wchar() as a plain
 * array and the function returns right away. Otherwise all decoded codepoints
 * are collected in a temporary hash table used for lookups. An empty `what`
 * returns `str` unchanged (with an added reference). */
static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *what_in = (unsigned char*)ZSTR_VAL(what);
	size_t what_len = ZSTR_LEN(what);
	uint32_t what_wchar_buf[128];
	unsigned int state = 0;
	HashTable what_ht;
	zval val;
	bool hash_initialized = false;

	while (what_len) {
		size_t batch_len = enc->to_wchar(&what_in, &what_len, what_wchar_buf, 128, &state);
		ZEND_ASSERT(batch_len <= 128);

		if (!hash_initialized) {
			if (batch_len <= 4) {
				/* Small first batch: scan the array directly, no table */
				return trim_each_wchar(str, NULL, what_wchar_buf, batch_len, mode, enc);
			}
			hash_initialized = true;
			ZVAL_TRUE(&val);
			zend_hash_init(&what_ht, what_len, NULL, NULL, false);
		}

		for (size_t i = 0; i < batch_len; i++) {
			zend_hash_index_add(&what_ht, what_wchar_buf[i], &val);
		}
	}

	if (UNEXPECTED(!hash_initialized)) {
		/* This is only possible if what is empty */
		return zend_string_copy(str);
	}

	zend_string *trimmed = trim_each_wchar(str, &what_ht, NULL, 0, mode, enc);
	zend_hash_destroy(&what_ht);

	return trimmed;
}
3128
/* Shared implementation of mb_trim(), mb_ltrim() and mb_rtrim().
 * Arguments: the string, an optional string of characters to trim (null
 * selects the built-in default set), and an optional encoding name. */
static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
{
	zend_string *str;
	zend_string *what = NULL;
	zend_string *encoding = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 3)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR_OR_NULL(what)
		Z_PARAM_STR_OR_NULL(encoding)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	zend_string *trimmed = (what != NULL)
		? mb_trim_what_chars(str, what, mode, enc)
		: mb_trim_default_chars(str, mode, enc);
	RETVAL_STR(trimmed);
}
3153
PHP_FUNCTION(mb_trim)3154 PHP_FUNCTION(mb_trim)
3155 {
3156 php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_BOTH_TRIM);
3157 }
3158
PHP_FUNCTION(mb_ltrim)3159 PHP_FUNCTION(mb_ltrim)
3160 {
3161 php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_LTRIM);
3162 }
3163
PHP_FUNCTION(mb_rtrim)3164 PHP_FUNCTION(mb_rtrim)
3165 {
3166 php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
3167 }
3168
/* Return a freshly allocated copy of the first `size` entries of `elist`.
 * The copy is allocated with safe_emalloc(); callers in this file release it
 * with efree(). */
static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
{
	const mbfl_encoding **copy = safe_emalloc(size, sizeof(*copy), 0);

	memcpy(ZEND_VOIDP(copy), elist, size * sizeof(*copy));
	return copy;
}
3175
estimate_demerits(uint32_t w)3176 static unsigned int estimate_demerits(uint32_t w)
3177 {
3178 /* Receive wchars decoded from input string using candidate encoding.
3179 * Give the candidate many 'demerits' for each 'rare' codepoint found,
3180 * a smaller number for each ASCII punctuation character, and 1 for
3181 * all other codepoints.
3182 *
3183 * The 'common' codepoints should cover the vast majority of
3184 * codepoints we are likely to see in practice, while only covering
3185 * a small minority of the entire Unicode encoding space. Why?
3186 * Well, if the test string happens to be valid in an incorrect
3187 * candidate encoding, the bogus codepoints which it decodes to will
3188 * be more or less random. By treating the majority of codepoints as
3189 * 'rare', we ensure that in almost all such cases, the bogus
3190 * codepoints will include plenty of 'rares', thus giving the
3191 * incorrect candidate encoding lots of demerits. See
3192 * common_codepoints.txt for the actual list used.
3193 *
3194 * So, why give extra demerits for ASCII punctuation characters? It's
3195 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
3196 * which deliberately only use bytes in the ASCII range. When
3197 * misinterpreted as ASCII/UTF-8, strings in these encodings will
3198 * have an unusually high number of ASCII punctuation characters.
3199 * So giving extra demerits for such characters will improve
3200 * detection accuracy for UTF-7 and similar encodings.
3201 *
3202 * Finally, why 1 demerit for all other characters? That penalizes
3203 * long strings, meaning we will tend to choose a candidate encoding
3204 * in which the test string decodes to a smaller number of
3205 * codepoints. That prevents single-byte encodings in which almost
3206 * every possible input byte decodes to a 'common' codepoint from
3207 * being favored too much. */
3208 if (w > 0xFFFF) {
3209 return 40;
3210 } else if (w >= 0x21 && w <= 0x2F) {
3211 return 6;
3212 } else if ((rare_codepoint_bitvec[w >> 5] >> (w & 0x1F)) & 1) {
3213 return 30;
3214 } else {
3215 return 1;
3216 }
3217 return 0;
3218 }
3219
3220 struct candidate {
3221 const mbfl_encoding *enc;
3222 const unsigned char *in;
3223 size_t in_len;
3224 uint64_t demerits; /* Wide bit size to prevent overflow */
3225 unsigned int state;
3226 float multiplier;
3227 };
3228
init_candidate_array(struct candidate * array,size_t length,const mbfl_encoding ** encodings,const unsigned char ** in,size_t * in_len,size_t n,bool strict,bool order_significant)3229 static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant)
3230 {
3231 size_t j = 0;
3232
3233 for (size_t i = 0; i < length; i++) {
3234 const mbfl_encoding *enc = encodings[i];
3235
3236 array[j].enc = enc;
3237 array[j].state = 0;
3238 array[j].demerits = 0;
3239
3240 /* If any candidate encodings have specialized validation functions, use them
3241 * to eliminate as many candidates as possible */
3242 if (enc->check != NULL) {
3243 for (size_t k = 0; k < n; k++) {
3244 if (!enc->check((unsigned char*)in[k], in_len[k])) {
3245 if (strict) {
3246 goto skip_to_next;
3247 } else {
3248 array[j].demerits += 500;
3249 }
3250 }
3251 }
3252 }
3253
3254 /* This multiplier can optionally be used to make candidate encodings listed
3255 * first more likely to be chosen. It is a weight factor which multiplies
3256 * the number of demerits counted for each candidate. */
3257 array[j].multiplier = order_significant ? 1.0 + ((0.3 * i) / length) : 1.0;
3258 j++;
3259 skip_to_next: ;
3260 }
3261
3262 return j;
3263 }
3264
start_string(struct candidate * array,size_t length,const unsigned char * in,size_t in_len)3265 static void start_string(struct candidate *array, size_t length, const unsigned char *in, size_t in_len)
3266 {
3267 for (size_t i = 0; i < length; i++) {
3268 const mbfl_encoding *enc = array[i].enc;
3269
3270 array[i].in = in;
3271 array[i].in_len = in_len;
3272
3273 /* Skip byte order mark for UTF-8, UTF-16BE, or UTF-16LE */
3274 if (enc == &mbfl_encoding_utf8) {
3275 if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
3276 array[i].in_len -= 3;
3277 array[i].in += 3;
3278 }
3279 } else if (enc == &mbfl_encoding_utf16be) {
3280 if (in_len >= 2 && in[0] == 0xFE && in[1] == 0xFF) {
3281 array[i].in_len -= 2;
3282 array[i].in += 2;
3283 }
3284 } else if (enc == &mbfl_encoding_utf16le) {
3285 if (in_len >= 2 && in[0] == 0xFF && in[1] == 0xFE) {
3286 array[i].in_len -= 2;
3287 array[i].in += 2;
3288 }
3289 }
3290 }
3291 }
3292
count_demerits(struct candidate * array,size_t length,bool strict)3293 static size_t count_demerits(struct candidate *array, size_t length, bool strict)
3294 {
3295 uint32_t wchar_buf[128];
3296 unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */
3297
3298 for (size_t i = 0; i < length; i++) {
3299 if (array[i].in_len == 0) {
3300 finished++;
3301 }
3302 }
3303
3304 while ((strict || length > 1) && finished < length) {
3305 /* Iterate in reverse order to avoid moving candidates that can be eliminated. */
3306 for (size_t i = length - 1; i != (size_t)-1; i--) {
3307 /* Do we still have more input to process for this candidate encoding? */
3308 if (array[i].in_len) {
3309 const mbfl_encoding *enc = array[i].enc;
3310 size_t out_len = enc->to_wchar((unsigned char**)&array[i].in, &array[i].in_len, wchar_buf, 128, &array[i].state);
3311 ZEND_ASSERT(out_len <= 128);
3312 /* Check this batch of decoded codepoints; are there any error markers?
3313 * Also sum up the number of demerits */
3314 while (out_len) {
3315 uint32_t w = wchar_buf[--out_len];
3316 if (w == MBFL_BAD_INPUT) {
3317 if (strict) {
3318 /* This candidate encoding is not valid, eliminate it from consideration */
3319 length--;
3320 if (i < length) {
3321 /* The eliminated candidate was the last valid one in the list */
3322 memmove(&array[i], &array[i+1], (length - i) * sizeof(struct candidate));
3323 }
3324 goto try_next_encoding;
3325 } else {
3326 array[i].demerits += 1000;
3327 }
3328 } else {
3329 array[i].demerits += estimate_demerits(w);
3330 }
3331 }
3332 if (array[i].in_len == 0) {
3333 finished++;
3334 }
3335 }
3336 try_next_encoding:;
3337 }
3338 }
3339
3340 for (size_t i = 0; i < length; i++) {
3341 array[i].demerits *= array[i].multiplier;
3342 }
3343
3344 return length;
3345 }
3346
mb_guess_encoding_for_strings(const unsigned char ** strings,size_t * str_lengths,size_t n,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3347 MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3348 {
3349 if (elist_size == 0) {
3350 return NULL;
3351 }
3352 if (elist_size == 1) {
3353 if (strict) {
3354 while (n--) {
3355 if (!php_mb_check_encoding((const char*)strings[n], str_lengths[n], *elist)) {
3356 return NULL;
3357 }
3358 }
3359 }
3360 return *elist;
3361 }
3362 if (n == 1 && *str_lengths == 0) {
3363 return *elist;
3364 }
3365
3366 /* Allocate on stack; when we return, this array is automatically freed */
3367 struct candidate *array = alloca(elist_size * sizeof(struct candidate));
3368 elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant);
3369
3370 while (n--) {
3371 start_string(array, elist_size, strings[n], str_lengths[n]);
3372 elist_size = count_demerits(array, elist_size, strict);
3373 if (elist_size == 0) {
3374 /* All candidates were eliminated */
3375 return NULL;
3376 }
3377 }
3378
3379 /* See which remaining candidate encoding has the least demerits */
3380 unsigned int best = 0;
3381 for (unsigned int i = 1; i < elist_size; i++) {
3382 if (array[i].demerits < array[best].demerits) {
3383 best = i;
3384 }
3385 }
3386 return array[best].enc;
3387 }
3388
3389 /* When doing 'strict' detection, any string which is invalid in the candidate encoding
3390 * is rejected. With non-strict detection, we just continue, but apply demerits for
3391 * each invalid byte sequence */
mb_guess_encoding(unsigned char * in,size_t in_len,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3392 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3393 {
3394 return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant);
3395 }
3396
3397 /* {{{ Encodings of the given string is returned (as a string) */
PHP_FUNCTION(mb_detect_encoding)3398 PHP_FUNCTION(mb_detect_encoding)
3399 {
3400 zend_string *str, *encoding_str = NULL;
3401 HashTable *encoding_ht = NULL;
3402 bool strict = false;
3403 const mbfl_encoding *ret, **elist;
3404 size_t size;
3405
3406 ZEND_PARSE_PARAMETERS_START(1, 3)
3407 Z_PARAM_STR(str)
3408 Z_PARAM_OPTIONAL
3409 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
3410 Z_PARAM_BOOL(strict)
3411 ZEND_PARSE_PARAMETERS_END();
3412
3413 /* Should we pay attention to the order of the provided candidate encodings and prefer
3414 * the earlier ones (if more than one candidate encoding matches)?
3415 * If the entire list of supported encodings returned by `mb_list_encodings` is passed
3416 * in, then don't treat the order as significant */
3417 bool order_significant = true;
3418
3419 /* make encoding list */
3420 if (encoding_ht) {
3421 if (encoding_ht == MBSTRG(all_encodings_list)) {
3422 order_significant = false;
3423 }
3424 if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
3425 RETURN_THROWS();
3426 }
3427 } else if (encoding_str) {
3428 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2)) {
3429 RETURN_THROWS();
3430 }
3431 } else {
3432 elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
3433 size = MBSTRG(current_detect_order_list_size);
3434 }
3435
3436 if (size == 0) {
3437 efree(ZEND_VOIDP(elist));
3438 zend_argument_value_error(2, "must specify at least one encoding");
3439 RETURN_THROWS();
3440 }
3441
3442 remove_non_encodings_from_elist(elist, &size);
3443 if (size == 0) {
3444 efree(ZEND_VOIDP(elist));
3445 RETURN_FALSE;
3446 }
3447
3448 if (ZEND_NUM_ARGS() < 3) {
3449 strict = MBSTRG(strict_detection);
3450 }
3451
3452 if (size == 1 && *elist == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
3453 ret = &mbfl_encoding_utf8;
3454 } else {
3455 ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant);
3456 }
3457
3458 efree(ZEND_VOIDP(elist));
3459
3460 if (ret == NULL) {
3461 RETURN_FALSE;
3462 }
3463
3464 RETVAL_STRING((char *)ret->name);
3465 }
3466 /* }}} */
3467
3468 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)3469 PHP_FUNCTION(mb_list_encodings)
3470 {
3471 ZEND_PARSE_PARAMETERS_NONE();
3472
3473 if (MBSTRG(all_encodings_list) == NULL) {
3474 /* Initialize shared array of supported encoding names
3475 * This is done so that we can check if `mb_list_encodings()` is being
3476 * passed to other mbstring functions using a cheap pointer equality check */
3477 HashTable *array = emalloc(sizeof(HashTable));
3478 zend_hash_init(array, 80, NULL, zval_ptr_dtor_str, false);
3479 for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
3480 zval tmp;
3481 ZVAL_STRING(&tmp, (*encodings)->name);
3482 zend_hash_next_index_insert(array, &tmp);
3483 }
3484 MBSTRG(all_encodings_list) = array;
3485 }
3486
3487 GC_ADDREF(MBSTRG(all_encodings_list));
3488 RETURN_ARR(MBSTRG(all_encodings_list));
3489 }
3490 /* }}} */
3491
3492 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)3493 PHP_FUNCTION(mb_encoding_aliases)
3494 {
3495 const mbfl_encoding *encoding;
3496 zend_string *encoding_name = NULL;
3497
3498 ZEND_PARSE_PARAMETERS_START(1, 1)
3499 Z_PARAM_STR(encoding_name)
3500 ZEND_PARSE_PARAMETERS_END();
3501
3502 encoding = php_mb_get_encoding(encoding_name, 1);
3503 if (!encoding) {
3504 RETURN_THROWS();
3505 }
3506
3507 array_init(return_value);
3508 if (encoding->aliases != NULL) {
3509 for (const char **alias = encoding->aliases; *alias; ++alias) {
3510 add_next_index_string(return_value, (char *)*alias);
3511 }
3512 }
3513 }
3514 /* }}} */
3515
/* Apply the kana conversion selected by `mode` to `input` and return the
 * result re-encoded in `encoding`.
 *
 * The input is decoded in batches of up to 64 codepoints. Each codepoint is
 * passed to mb_convert_kana_codepoint() together with the codepoint following
 * it, because a conversion may consume both. To make that lookahead work
 * across batches, the last codepoint of a batch is held back and reprocessed
 * as the first codepoint of the next batch, unless it was already consumed. */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion...
	 * if we are converting zenkaku kana to hankaku kana
	 * Make the buffer for converted kana big enough that we never need to
	 * perform bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	unsigned int buf_offset = 0;
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		uint32_t *converted = converted_buf;
		/* If one codepoint has been left in wchar_buf[0] to be reprocessed from the
		 * previous iteration, don't overwrite it */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + buf_offset, 64 - buf_offset, &state);
		out_len += buf_offset;
		ZEND_ASSERT(out_len <= 64);

		if (!out_len) {
			if (in_len) {
				/* Nothing decoded yet, but there is more input to read */
				continue;
			}
			/* The input ended without yielding a codepoint in this batch.
			 * Earlier batches were emitted with the final-batch flag unset,
			 * so still make the closing from_wchar() call (with zero
			 * codepoints) rather than leaving the loop without it.
			 * NOTE(review): presumably this is where stateful encoders write
			 * their trailing bytes -- confirm against from_wchar(). */
			goto emit_converted_kana;
		}

		for (size_t i = 0; i < out_len-1; i++) {
			uint32_t second = 0;
			bool consumed = false;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
			if (second) {
				*converted++ = second;
			}
			if (consumed) {
				i++;
				if (i == out_len-1) {
					/* We consumed two codepoints at the very end of the wchar buffer
					 * So there is nothing remaining to reprocess on the next iteration */
					buf_offset = 0;
					goto emit_converted_kana;
				}
			}
		}

		if (!in_len) {
			/* This is the last iteration, so we need to process the final codepoint now */
			uint32_t second = 0;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[out_len-1], 0, NULL, &second, mode);
			if (second) {
				*converted++ = second;
			}
		} else {
			/* Reprocess the last codepoint on the next iteration */
			wchar_buf[0] = wchar_buf[out_len-1];
			buf_offset = 1;
		}

emit_converted_kana:
		/* Last argument is true only for the final batch */
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3580
/* Option letters accepted by mb_convert_kana(). The letter at index i selects
 * bit i of the option bit vector. Indexes 0-7 hold upper-case letters and
 * indexes 8-15 the matching lower-case letters, so index i and index i+8 form
 * a pair; index 16 ('V') stands alone. */
char mb_convert_kana_flags[17] = {
	/*  0..3  */ 'A', 'R', 'N', 'S',
	/*  4..7  */ 'K', 'H', 'M', 'C',
	/*  8..11 */ 'a', 'r', 'n', 's',
	/* 12..15 */ 'k', 'h', 'm', 'c',
	/* 16     */ 'V'
};
3586
3587 /* Conversion between full-width characters and half-width characters (Japanese) */
PHP_FUNCTION(mb_convert_kana)3588 PHP_FUNCTION(mb_convert_kana)
3589 {
3590 unsigned int opt;
3591 char *optstr = NULL;
3592 size_t optstr_len;
3593 zend_string *encname = NULL, *str;
3594
3595 ZEND_PARSE_PARAMETERS_START(1, 3)
3596 Z_PARAM_STR(str)
3597 Z_PARAM_OPTIONAL
3598 Z_PARAM_STRING(optstr, optstr_len)
3599 Z_PARAM_STR_OR_NULL(encname)
3600 ZEND_PARSE_PARAMETERS_END();
3601
3602 if (optstr != NULL) {
3603 char *p = optstr, *e = p + optstr_len;
3604 opt = 0;
3605 next_option:
3606 while (p < e) {
3607 /* Walk through option string and convert to bit vector
3608 * See translit_kana_jisx0201_jisx0208.h for the values used */
3609 char c = *p++;
3610 if (c == 'A') {
3611 opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
3612 } else if (c == 'a') {
3613 opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
3614 } else {
3615 for (size_t i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
3616 if (c == mb_convert_kana_flags[i]) {
3617 opt |= (1 << i);
3618 goto next_option;
3619 }
3620 }
3621
3622 zend_argument_value_error(2, "contains invalid flag: '%c'", c);
3623 RETURN_THROWS();
3624 }
3625 }
3626
3627 /* Check for illegal combinations of options */
3628 if (((opt & 0xFF00) >> 8) & opt) {
3629 /* It doesn't make sense to convert the same type of characters from halfwidth to
3630 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
3631 * FW hiragana to FW katakana and then back again. */
3632 int badflag = ((opt & 0xFF00) >> 8) & opt, i;
3633 for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
3634 char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
3635 if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
3636 flag1 = 'A';
3637 if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
3638 flag2 = 'a';
3639 zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
3640 RETURN_THROWS();
3641 }
3642
3643 if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
3644 /* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
3645 zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
3646 RETURN_THROWS();
3647 }
3648
3649 /* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
3650 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
3651 * more than one of these */
3652 if (opt & MBFL_ZEN2HAN_HIRAGANA) {
3653 if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3654 zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
3655 RETURN_THROWS();
3656 } else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3657 zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
3658 RETURN_THROWS();
3659 }
3660 } else if (opt & MBFL_ZEN2HAN_KATAKANA) {
3661 if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3662 zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
3663 RETURN_THROWS();
3664 } else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3665 zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
3666 RETURN_THROWS();
3667 }
3668 }
3669 } else {
3670 opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
3671 }
3672
3673 const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
3674 if (!enc) {
3675 RETURN_THROWS();
3676 }
3677
3678 RETVAL_STR(jp_kana_convert(str, enc, opt));
3679 }
3680
/* Count the strings reachable from `var`, descending into arrays and objects.
 *
 * References are followed. A refcounted container which is already marked as
 * being visited (i.e. a recursive structure) contributes 0; no error is
 * reported from here. Values of any other type contribute 0. */
static unsigned int mb_recursive_count_strings(zval *var)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		return 1;
	}
	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		return 0;
	}

	/* Only refcounted containers carry the recursion-protection flag */
	const bool guarded = Z_REFCOUNTED_P(var);
	if (guarded) {
		if (Z_IS_RECURSIVE_P(var)) {
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	unsigned int total = 0;
	HashTable *children = HASH_OF(var);
	if (children != NULL) {
		zval *child;
		ZEND_HASH_FOREACH_VAL_IND(children, child) {
			total += mb_recursive_count_strings(child);
		} ZEND_HASH_FOREACH_END();
	}

	if (guarded) {
		Z_UNPROTECT_RECURSION_P(var);
	}
	return total;
}
3711
/* Collect pointers to (and lengths of) all strings reachable from `var`,
 * descending into arrays and objects.
 *
 * var      - value to scan (references are followed)
 * val_list - output array receiving a pointer to each string's bytes
 * len_list - output array receiving each string's length
 * count    - in/out: index of the next free slot in both output arrays
 *
 * The output arrays are not bounds-checked here; the caller must size them
 * beforehand (mb_convert_variables does so with mb_recursive_count_strings).
 *
 * Returns true if a recursive structure was encountered, false otherwise. */
static bool mb_recursive_find_strings(zval *var, const unsigned char **val_list, size_t *len_list, unsigned int *count)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		val_list[*count] = (const unsigned char*)Z_STRVAL_P(var);
		len_list[*count] = Z_STRLEN_P(var);
		(*count)++;
	} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
		if (Z_REFCOUNTED_P(var)) {
			if (Z_IS_RECURSIVE_P(var)) {
				return true;
			}
			Z_PROTECT_RECURSION_P(var);
		}

		HashTable *ht = HASH_OF(var);
		if (ht != NULL) {
			zval *entry;
			ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
				if (mb_recursive_find_strings(entry, val_list, len_list, count)) {
					if (Z_REFCOUNTED_P(var)) {
						Z_UNPROTECT_RECURSION_P(var);
					}
					/* Always propagate the recursion signal to the caller, whether
					 * or not this container carries a protection flag (this matches
					 * mb_recursive_convert_variable) */
					return true;
				}
			} ZEND_HASH_FOREACH_END();
		}

		if (Z_REFCOUNTED_P(var)) {
			Z_UNPROTECT_RECURSION_P(var);
		}
	}

	return false;
}
3748
/* Convert, in place, every string reachable from `var` from `from_encoding` to
 * `to_encoding`, descending into arrays and objects.
 *
 * A string is replaced in the zval which was passed in (before following any
 * reference). Arrays are separated before being modified.
 *
 * Returns true if a recursive structure was encountered (in which case
 * conversion stops early), false otherwise. */
static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
{
	zval *slot = var; /* the zval as passed in, before dereferencing */
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		zend_string *converted = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
		zval_ptr_dtor(slot);
		ZVAL_STR(slot, converted);
		return false;
	}

	const bool is_array = (Z_TYPE_P(var) == IS_ARRAY);
	if (!is_array && Z_TYPE_P(var) != IS_OBJECT) {
		/* Nothing to convert in scalars, null, etc. */
		return false;
	}

	if (is_array) {
		SEPARATE_ARRAY(var);
	}

	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return true;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	bool found_recursion = false;
	HashTable *children = HASH_OF(var);
	if (children != NULL) {
		zval *child;
		ZEND_HASH_FOREACH_VAL_IND(children, child) {
			if (mb_recursive_convert_variable(child, from_encoding, to_encoding)) {
				found_recursion = true;
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return found_recursion;
}
3790
PHP_FUNCTION(mb_convert_variables)3791 PHP_FUNCTION(mb_convert_variables)
3792 {
3793 zval *args;
3794 zend_string *to_enc_str;
3795 zend_string *from_enc_str;
3796 HashTable *from_enc_ht;
3797 const mbfl_encoding *from_encoding, *to_encoding;
3798 uint32_t argc;
3799 size_t elistsz;
3800 const mbfl_encoding **elist;
3801
3802 ZEND_PARSE_PARAMETERS_START(3, -1)
3803 Z_PARAM_STR(to_enc_str)
3804 Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3805 Z_PARAM_VARIADIC('+', args, argc)
3806 ZEND_PARSE_PARAMETERS_END();
3807
3808 /* new encoding */
3809 to_encoding = php_mb_get_encoding(to_enc_str, 1);
3810 if (!to_encoding) {
3811 RETURN_THROWS();
3812 }
3813
3814 from_encoding = MBSTRG(current_internal_encoding);
3815
3816 bool order_significant = true;
3817
3818 /* pre-conversion encoding */
3819 if (from_enc_ht) {
3820 if (from_enc_ht == MBSTRG(all_encodings_list)) {
3821 /* If entire list of supported encodings returned by `mb_list_encodings` is passed
3822 * in, then don't treat the order of the list as significant */
3823 order_significant = false;
3824 }
3825 if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3826 RETURN_THROWS();
3827 }
3828 } else {
3829 if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2) == FAILURE) {
3830 RETURN_THROWS();
3831 }
3832 }
3833
3834 if (elistsz == 0) {
3835 efree(ZEND_VOIDP(elist));
3836 zend_argument_value_error(2, "must specify at least one encoding");
3837 RETURN_THROWS();
3838 }
3839
3840 if (elistsz == 1) {
3841 from_encoding = *elist;
3842 } else {
3843 /* auto detect */
3844 unsigned int num = 0;
3845 for (size_t n = 0; n < argc; n++) {
3846 zval *zv = &args[n];
3847 num += mb_recursive_count_strings(zv);
3848 }
3849 const unsigned char **val_list = (const unsigned char**)ecalloc(num, sizeof(char *));
3850 size_t *len_list = (size_t*)ecalloc(num, sizeof(size_t));
3851 unsigned int i = 0;
3852 for (size_t n = 0; n < argc; n++) {
3853 zval *zv = &args[n];
3854 if (mb_recursive_find_strings(zv, val_list, len_list, &i)) {
3855 efree(ZEND_VOIDP(elist));
3856 efree(ZEND_VOIDP(val_list));
3857 efree(len_list);
3858 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3859 RETURN_FALSE;
3860 }
3861 }
3862 from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant);
3863 efree(ZEND_VOIDP(val_list));
3864 efree(len_list);
3865 if (!from_encoding) {
3866 php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3867 efree(ZEND_VOIDP(elist));
3868 RETURN_FALSE;
3869 }
3870
3871 }
3872
3873 efree(ZEND_VOIDP(elist));
3874
3875 /* convert */
3876 for (size_t n = 0; n < argc; n++) {
3877 zval *zv = &args[n];
3878 ZVAL_DEREF(zv);
3879 if (mb_recursive_convert_variable(zv, from_encoding, to_encoding)) {
3880 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3881 RETURN_FALSE;
3882 }
3883 }
3884
3885 RETURN_STRING(from_encoding->name);
3886 }
3887
3888 /* HTML numeric entities */
3889
3890 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
/* Flatten a PHP array of integers into a newly allocated array of uint32_t.
 *
 * target_hash         - the PHP array (conversion map)
 * conversion_map_size - out: number of elements in `target_hash`
 *
 * The number of elements must be a multiple of 4 and each value must be
 * convertible to an integer; otherwise a ValueError for argument 2 is thrown
 * and NULL is returned. On success the caller must efree() the result. */
static uint32_t *make_conversion_map(HashTable *target_hash, size_t *conversion_map_size)
{
	const size_t count = zend_hash_num_elements(target_hash);
	*conversion_map_size = count;

	if ((count % 4) != 0) {
		zend_argument_value_error(2, "must have a multiple of 4 elements");
		return NULL;
	}

	uint32_t *map = (uint32_t*)safe_emalloc(count, sizeof(uint32_t), 0);
	size_t filled = 0;
	zval *item;

	ZEND_HASH_FOREACH_VAL(target_hash, item) {
		bool failed = true;
		zend_long value = zval_try_get_long(item, &failed);
		if (failed) {
			efree(map);
			zend_argument_value_error(2, "must only be composed of values of type int");
			return NULL;
		}
		/* Stored as uint32_t; wider values are truncated by the assignment */
		map[filled++] = value;
	} ZEND_HASH_FOREACH_END();

	return map;
}
3917
/* Look up wchar `w` in a conversion map and compute the number to emit in its
 * HTML numeric entity.
 *
 * The map is a flat array of 4-element records: { low, high, offset, mask }.
 * For the first record with low <= w <= high, `*retval` is set to
 * (w + offset) & mask and true is returned. If no record matches, `*retval`
 * is left untouched and false is returned. */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t i = 0; i < conversion_map_size; i += 4) {
		const uint32_t *range = convmap + i;

		if (w < range[0] || w > range[1]) {
			/* Outside this range; try the next one */
			continue;
		}

		/* This wchar falls inside one of the ranges which should be
		 * converted to HTML entities */
		*retval = (w + range[2]) & range[3];
		return true;
	}

	/* None of the ranges matched */
	return false;
}
3939
/* Replace every character of `input` which is covered by the conversion map
 * with an HTML numeric entity ("&#NNN;", or "&#xHHH;" when `hex` is true) and
 * return the result as a new string in the same encoding.
 *
 * The input is decoded to wchars in batches of up to 32, the batch is rewritten
 * into a second wchar buffer, and that buffer is encoded back to `encoding`. */
static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size, bool hex)
{
	/* Each wchar which we get from decoding the input string may become up to
	 * 13 wchars when we convert it to an HTML entity */
	uint32_t decoded[32], rewritten[32 * 13];
	unsigned char digits[16]; /* For converting wchars to hex/decimal string */
	const uint32_t radix = hex ? 16 : 10;

	unsigned char *src = (unsigned char*)ZSTR_VAL(input);
	size_t remaining = ZSTR_LEN(input);
	unsigned int state = 0;

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, remaining, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (remaining) {
		/* Convert input string to wchars, up to 32 at a time */
		size_t n_decoded = encoding->to_wchar(&src, &remaining, decoded, 32, &state);
		ZEND_ASSERT(n_decoded <= 32);
		uint32_t *out = rewritten;

		/* Run through wchars and see if any of them fall into the ranges
		 * which we want to convert to HTML entities */
		for (const uint32_t *cur = decoded, *stop = decoded + n_decoded; cur < stop; cur++) {
			uint32_t code;

			if (!html_numeric_entity_convert(*cur, convmap, conversion_map_size, &code)) {
				/* Not covered by the map; pass through unchanged */
				*out++ = *cur;
				continue;
			}

			*out++ = '&';
			*out++ = '#';
			if (hex) {
				*out++ = 'x';
			}

			/* Convert number to decimal/hex string; digits are produced from
			 * least significant to most significant, so fill the scratch buffer
			 * from its end. A value of zero still yields a single '0'. */
			unsigned char *d = digits + sizeof(digits);
			do {
				*(--d) = "0123456789ABCDEF"[code % radix];
				code /= radix;
			} while (code > 0);

			while (d < digits + sizeof(digits)) {
				*out++ = *d++;
			}

			*out++ = ';';
		}

		ZEND_ASSERT(out <= rewritten + sizeof(rewritten)/sizeof(*rewritten));
		/* The last argument tells the encoder whether this is the final batch */
		encoding->from_wchar(rewritten, out - rewritten, &buf, !remaining);
	}

	return mb_convert_buf_result(&buf, encoding);
}
4005
4006 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)4007 PHP_FUNCTION(mb_encode_numericentity)
4008 {
4009 zend_string *encoding = NULL, *str;
4010 size_t conversion_map_size;
4011 HashTable *target_hash;
4012 bool is_hex = false;
4013
4014 ZEND_PARSE_PARAMETERS_START(2, 4)
4015 Z_PARAM_STR(str)
4016 Z_PARAM_ARRAY_HT(target_hash)
4017 Z_PARAM_OPTIONAL
4018 Z_PARAM_STR_OR_NULL(encoding)
4019 Z_PARAM_BOOL(is_hex)
4020 ZEND_PARSE_PARAMETERS_END();
4021
4022 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4023 if (!enc) {
4024 RETURN_THROWS();
4025 }
4026
4027 uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4028 if (convmap == NULL) {
4029 RETURN_THROWS();
4030 }
4031
4032 RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, conversion_map_size, is_hex));
4033 efree(convmap);
4034 }
4035 /* }}} */
4036
/* Map the number found in an HTML numeric entity back to a code point.
 *
 * The map is a flat array of 4-element records: { low, high, offset, mask }
 * (the mask is not consulted here). For the first record where
 * low <= number - offset <= high, `*retval` is set to number - offset and true
 * is returned. The subtraction is done in uint32_t, so it wraps around if
 * offset > number. If no record matches, `*retval` is left untouched and false
 * is returned. */
static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t i = 0; i < conversion_map_size; i += 4) {
		const uint32_t lowest = convmap[i];
		const uint32_t highest = convmap[i + 1];
		const uint32_t shift = convmap[i + 2];
		const uint32_t candidate = number - shift;

		if (candidate < lowest || candidate > highest) {
			continue;
		}

		*retval = candidate;
		return true;
	}

	return false;
}
4054
4055 #define DEC_ENTITY_MINLEN 3 /* For "&#" and 1 decimal digit */
4056 #define HEX_ENTITY_MINLEN 4 /* For "&#x" and 1 hexadecimal digit */
4057 #define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
4058 #define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
4059
html_numeric_entity_decode(zend_string * input,const mbfl_encoding * encoding,uint32_t * convmap,size_t conversion_map_size)4060 static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size)
4061 {
4062 uint32_t wchar_buf[128], converted_buf[128];
4063
4064 unsigned int state = 0;
4065 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
4066 size_t in_len = ZSTR_LEN(input);
4067
4068 mb_convert_buf buf;
4069 mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
4070
4071 /* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
4072 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
4073 * 2nd 'converted' buffer.
4074 *
4075 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
4076 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
4077 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
4078 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
4079 * to store the next batch of wchars after it.
4080 *
4081 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
4082 * wchars from the 1st buffer to the 2nd one.
4083 *
4084 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
4085 * have to do bounds checks when writing wchars into it.
4086 */
4087
4088 unsigned int wchar_buf_offset = 0;
4089
4090 while (in_len) {
4091 /* Leave space for sentinel at the end of the buffer */
4092 size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
4093 out_len += wchar_buf_offset;
4094 ZEND_ASSERT(out_len <= 127);
4095 wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */
4096
4097 uint32_t *p, *converted;
4098
4099 /* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
4100 * be there (in `wchar_buf[0]`), so don't bother in that case */
4101 if (wchar_buf_offset == 0) {
4102 p = wchar_buf;
4103 while (*p != '&')
4104 p++;
4105 if (p == wchar_buf + out_len) {
4106 /* No HTML entities in this buffer */
4107 encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
4108 continue;
4109 }
4110
4111 /* Copy over the prefix with no & which we already scanned */
4112 memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
4113 converted = converted_buf + (p - wchar_buf);
4114 } else {
4115 p = wchar_buf;
4116 converted = converted_buf;
4117 }
4118
4119 found_ampersand:
4120 ZEND_ASSERT(*p == '&');
4121 uint32_t *p2 = p;
4122
4123 /* These tests can't overrun end of buffer, because we have a '&' sentinel there */
4124 if (*++p2 == '#') {
4125 if (*++p2 == 'x') {
4126 /* Possible hex entity */
4127 uint32_t w = *++p2;
4128 while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
4129 w = *++p2;
4130 if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
4131 /* We hit the end of the buffer while reading digits, and
4132 * more wchars are still coming in the next buffer
4133 * Reprocess this identity on next iteration */
4134 memmove(wchar_buf, p, (p2 - p) * 4);
4135 wchar_buf_offset = p2 - p;
4136 goto process_converted_wchars;
4137 } else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
4138 /* Invalid entity (too long or "&#x" only) */
4139 memcpy(converted, p, (p2 - p) * 4);
4140 converted += p2 - p;
4141 } else {
4142 /* Valid hexadecimal entity */
4143 uint32_t value = 0, *p3 = p + 3;
4144 while (p3 < p2) {
4145 w = *p3++;
4146 if (w <= '9') {
4147 value = (value * 16) + (w - '0');
4148 } else if (w >= 'a') {
4149 value = (value * 16) + 10 + (w - 'a');
4150 } else {
4151 value = (value * 16) + 10 + (w - 'A');
4152 }
4153 }
4154 if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
4155 converted++;
4156 if (*p2 == ';')
4157 p2++;
4158 } else {
4159 memcpy(converted, p, (p2 - p) * 4);
4160 converted += p2 - p;
4161 }
4162 }
4163 } else {
4164 /* Possible decimal entity */
4165 uint32_t w = *p2;
4166 while (w >= '0' && w <= '9')
4167 w = *++p2;
4168 if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
4169 /* The number of digits was legal (no more than 10 decimal digits)
4170 * Reprocess this identity on next iteration of main loop */
4171 memmove(wchar_buf, p, (p2 - p) * 4);
4172 wchar_buf_offset = p2 - p;
4173 goto process_converted_wchars;
4174 } else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
4175 /* Invalid entity (too long or "&#" only) */
4176 memcpy(converted, p, (p2 - p) * 4);
4177 converted += p2 - p;
4178 } else {
4179 /* Valid decimal entity */
4180 uint32_t value = 0, *p3 = p + 2;
4181 while (p3 < p2) {
4182 /* If unsigned integer overflow would occur in the below
4183 * multiplication by 10, this entity is no good
4184 * 0x19999999 is 1/10th of 0xFFFFFFFF */
4185 if (value > 0x19999999) {
4186 memcpy(converted, p, (p2 - p) * 4);
4187 converted += p2 - p;
4188 goto decimal_entity_too_big;
4189 }
4190 value = (value * 10) + (*p3++ - '0');
4191 }
4192 if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
4193 converted++;
4194 if (*p2 == ';')
4195 p2++;
4196 } else {
4197 memcpy(converted, p, (p2 - p) * 4);
4198 converted += p2 - p;
4199 }
4200 }
4201 }
4202 } else if ((p2 == wchar_buf + out_len) && in_len) {
4203 /* Corner case: & at end of buffer */
4204 wchar_buf[0] = '&';
4205 wchar_buf_offset = 1;
4206 goto process_converted_wchars;
4207 } else {
4208 *converted++ = '&';
4209 }
4210 decimal_entity_too_big:
4211
4212 /* Starting to scan a new section of the wchar buffer
4213 * 'p2' is pointing at the next wchar which needs to be processed */
4214 p = p2;
4215 while (*p2 != '&')
4216 p2++;
4217
4218 if (p2 > p) {
4219 memcpy(converted, p, (p2 - p) * 4);
4220 converted += p2 - p;
4221 p = p2;
4222 }
4223
4224 if (p < wchar_buf + out_len)
4225 goto found_ampersand;
4226
4227 /* We do not have any wchars remaining at the end of this buffer which
4228 * we need to reprocess on the next call */
4229 wchar_buf_offset = 0;
4230 process_converted_wchars:
4231 ZEND_ASSERT(converted <= converted_buf + 128);
4232 encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
4233 }
4234
4235 return mb_convert_buf_result(&buf, encoding);
4236 }
4237
4238 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)4239 PHP_FUNCTION(mb_decode_numericentity)
4240 {
4241 zend_string *encoding = NULL, *str;
4242 size_t conversion_map_size;
4243 HashTable *target_hash;
4244
4245 ZEND_PARSE_PARAMETERS_START(2, 3)
4246 Z_PARAM_STR(str)
4247 Z_PARAM_ARRAY_HT(target_hash)
4248 Z_PARAM_OPTIONAL
4249 Z_PARAM_STR_OR_NULL(encoding)
4250 ZEND_PARSE_PARAMETERS_END();
4251
4252 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4253 if (!enc) {
4254 RETURN_THROWS();
4255 }
4256
4257 uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4258 if (convmap == NULL) {
4259 RETURN_THROWS();
4260 }
4261
4262 RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, conversion_map_size));
4263 efree(convmap);
4264 }
4265 /* }}} */
4266
4267 /* {{{ Sends an email message with MIME scheme */
4268 #define CRLF "\r\n"
4269
_php_mbstr_parse_mail_headers(HashTable * ht,const char * str,size_t str_len)4270 static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
4271 {
4272 const char *ps;
4273 size_t icnt;
4274 int state = 0;
4275 int crlf_state = -1;
4276 char *token = NULL;
4277 size_t token_pos = 0;
4278 zend_string *fld_name, *fld_val;
4279
4280 ps = str;
4281 icnt = str_len;
4282 fld_name = fld_val = NULL;
4283
4284 /*
4285 * C o n t e n t - T y p e : t e x t / h t m l \r\n
4286 * ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
4287 * state 0 1 2 3
4288 *
4289 * C o n t e n t - T y p e : t e x t / h t m l \r\n
4290 * ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
4291 * crlf_state -1 0 1 -1
4292 *
4293 */
4294
4295 while (icnt > 0) {
4296 switch (*ps) {
4297 case ':':
4298 if (crlf_state == 1) {
4299 token_pos++;
4300 }
4301
4302 if (state == 0 || state == 1) {
4303 if(token && token_pos > 0) {
4304 fld_name = zend_string_init(token, token_pos, 0);
4305 }
4306 state = 2;
4307 } else {
4308 token_pos++;
4309 }
4310
4311 crlf_state = 0;
4312 break;
4313
4314 case '\n':
4315 if (crlf_state == -1) {
4316 goto out;
4317 }
4318 crlf_state = -1;
4319 break;
4320
4321 case '\r':
4322 if (crlf_state == 1) {
4323 token_pos++;
4324 } else {
4325 crlf_state = 1;
4326 }
4327 break;
4328
4329 case ' ': case '\t':
4330 if (crlf_state == -1) {
4331 if (state == 3) {
4332 /* continuing from the previous line */
4333 state = 4;
4334 } else {
4335 /* simply skipping this new line */
4336 state = 5;
4337 }
4338 } else {
4339 if (crlf_state == 1) {
4340 token_pos++;
4341 }
4342 if (state == 1 || state == 3) {
4343 token_pos++;
4344 }
4345 }
4346 crlf_state = 0;
4347 break;
4348
4349 default:
4350 switch (state) {
4351 case 0:
4352 token = (char*)ps;
4353 token_pos = 0;
4354 state = 1;
4355 break;
4356
4357 case 2:
4358 if (crlf_state != -1) {
4359 token = (char*)ps;
4360 token_pos = 0;
4361
4362 state = 3;
4363 break;
4364 }
4365 ZEND_FALLTHROUGH;
4366
4367 case 3:
4368 if (crlf_state == -1) {
4369 if(token && token_pos > 0) {
4370 fld_val = zend_string_init(token, token_pos, 0);
4371 }
4372
4373 if (fld_name != NULL && fld_val != NULL) {
4374 zval val;
4375 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4376 ZVAL_STR(&val, fld_val);
4377
4378 zend_hash_update(ht, fld_name, &val);
4379
4380 zend_string_release_ex(fld_name, 0);
4381 }
4382
4383 fld_name = fld_val = NULL;
4384 token = (char*)ps;
4385 token_pos = 0;
4386
4387 state = 1;
4388 }
4389 break;
4390
4391 case 4:
4392 token_pos++;
4393 state = 3;
4394 break;
4395 }
4396
4397 if (crlf_state == 1) {
4398 token_pos++;
4399 }
4400
4401 token_pos++;
4402
4403 crlf_state = 0;
4404 break;
4405 }
4406 ps++, icnt--;
4407 }
4408 out:
4409 if (state == 2) {
4410 token = "";
4411 token_pos = 0;
4412
4413 state = 3;
4414 }
4415 if (state == 3) {
4416 if(token && token_pos > 0) {
4417 fld_val = zend_string_init(token, token_pos, 0);
4418 }
4419 if (fld_name != NULL && fld_val != NULL) {
4420 zval val;
4421 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4422 ZVAL_STR(&val, fld_val);
4423 zend_hash_update(ht, fld_name, &val);
4424
4425 zend_string_release_ex(fld_name, 0);
4426 }
4427 }
4428 return state;
4429 }
4430
PHP_FUNCTION(mb_send_mail)4431 PHP_FUNCTION(mb_send_mail)
4432 {
4433 char *to;
4434 size_t to_len;
4435 char *message;
4436 size_t message_len;
4437 zend_string *subject;
4438 zend_string *extra_cmd = NULL;
4439 HashTable *headers_ht = NULL;
4440 zend_string *str_headers = NULL;
4441 size_t i;
4442 char *to_r = NULL;
4443 bool suppress_content_type = false;
4444 bool suppress_content_transfer_encoding = false;
4445
4446 char *p;
4447 enum mbfl_no_encoding;
4448 const mbfl_encoding *tran_cs, /* transfer text charset */
4449 *head_enc, /* header transfer encoding */
4450 *body_enc; /* body transfer encoding */
4451 const mbfl_language *lang;
4452 HashTable ht_headers;
4453 zval *s;
4454
4455 /* character-set, transfer-encoding */
4456 tran_cs = &mbfl_encoding_utf8;
4457 head_enc = &mbfl_encoding_base64;
4458 body_enc = &mbfl_encoding_base64;
4459 lang = mbfl_no2language(MBSTRG(language));
4460 if (lang != NULL) {
4461 tran_cs = mbfl_no2encoding(lang->mail_charset);
4462 head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4463 body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4464 }
4465
4466 ZEND_PARSE_PARAMETERS_START(3, 5)
4467 Z_PARAM_PATH(to, to_len)
4468 Z_PARAM_PATH_STR(subject)
4469 Z_PARAM_PATH(message, message_len)
4470 Z_PARAM_OPTIONAL
4471 Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4472 Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4473 ZEND_PARSE_PARAMETERS_END();
4474
4475 if (str_headers) {
4476 if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4477 zend_argument_value_error(4, "must not contain any null bytes");
4478 RETURN_THROWS();
4479 }
4480 str_headers = php_trim(str_headers, NULL, 0, 2);
4481 } else if (headers_ht) {
4482 str_headers = php_mail_build_headers(headers_ht);
4483 if (EG(exception)) {
4484 RETURN_THROWS();
4485 }
4486 }
4487
4488 zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4489
4490 if (str_headers != NULL) {
4491 _php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4492 }
4493
4494 if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4495 char *tmp;
4496 char *param_name;
4497 char *charset = NULL;
4498
4499 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4500 p = strchr(Z_STRVAL_P(s), ';');
4501
4502 if (p != NULL) {
4503 /* skipping the padded spaces */
4504 do {
4505 ++p;
4506 } while (*p == ' ' || *p == '\t');
4507
4508 if (*p != '\0') {
4509 if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4510 if (strcasecmp(param_name, "charset") == 0) {
4511 const mbfl_encoding *_tran_cs = tran_cs;
4512
4513 charset = php_strtok_r(NULL, "= \"", &tmp);
4514 if (charset != NULL) {
4515 _tran_cs = mbfl_name2encoding(charset);
4516 }
4517
4518 if (!_tran_cs) {
4519 php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4520 _tran_cs = &mbfl_encoding_ascii;
4521 }
4522 tran_cs = _tran_cs;
4523 }
4524 }
4525 }
4526 }
4527 suppress_content_type = true;
4528 }
4529
4530 if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4531 const mbfl_encoding *_body_enc;
4532
4533 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4534 _body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4535 switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4536 case mbfl_no_encoding_base64:
4537 case mbfl_no_encoding_7bit:
4538 case mbfl_no_encoding_8bit:
4539 body_enc = _body_enc;
4540 break;
4541
4542 default:
4543 php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4544 body_enc = &mbfl_encoding_8bit;
4545 break;
4546 }
4547 suppress_content_transfer_encoding = true;
4548 }
4549
4550 /* To: */
4551 if (to_len > 0) {
4552 to_r = estrndup(to, to_len);
4553 for (; to_len; to_len--) {
4554 if (!isspace((unsigned char) to_r[to_len - 1])) {
4555 break;
4556 }
4557 to_r[to_len - 1] = '\0';
4558 }
4559 for (i = 0; to_r[i]; i++) {
4560 if (iscntrl((unsigned char) to_r[i])) {
4561 /* According to RFC 822, section 3.1.1 long headers may be separated into
4562 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4563 * To prevent these separators from being replaced with a space, we skip over them. */
4564 if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4565 i += 2;
4566 while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4567 i++;
4568 }
4569 continue;
4570 }
4571
4572 to_r[i] = ' ';
4573 }
4574 }
4575 } else {
4576 to_r = to;
4577 }
4578
4579 /* Subject: */
4580 const mbfl_encoding *enc = MBSTRG(current_internal_encoding);
4581 if (enc == &mbfl_encoding_pass) {
4582 enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4583 }
4584 const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4585 size_t line_sep_len = strlen(line_sep);
4586
4587 subject = mb_mime_header_encode(subject, enc, tran_cs, head_enc == &mbfl_encoding_base64, (char*)line_sep, line_sep_len, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4588
4589 /* message body */
4590 const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding);
4591 if (msg_enc == &mbfl_encoding_pass) {
4592 msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4593 }
4594
4595 unsigned int num_errors = 0;
4596 zend_string *tmpstr = mb_fast_convert((unsigned char*)message, message_len, msg_enc, tran_cs, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4597 zend_string *conv = mb_fast_convert((unsigned char*)ZSTR_VAL(tmpstr), ZSTR_LEN(tmpstr), &mbfl_encoding_8bit, body_enc, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4598 zend_string_free(tmpstr);
4599 message = ZSTR_VAL(conv);
4600
4601 /* other headers */
4602 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4603 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4604 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4605 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4606
4607 smart_str str = {0};
4608 bool empty = true;
4609
4610 if (str_headers != NULL) {
4611 /* Strip trailing CRLF from `str_headers`; we will add CRLF back if necessary */
4612 size_t len = ZSTR_LEN(str_headers);
4613 if (ZSTR_VAL(str_headers)[len-1] == '\n') {
4614 len--;
4615 }
4616 if (ZSTR_VAL(str_headers)[len-1] == '\r') {
4617 len--;
4618 }
4619 smart_str_appendl(&str, ZSTR_VAL(str_headers), len);
4620 empty = false;
4621 zend_string_release_ex(str_headers, 0);
4622 }
4623
4624 if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4625 if (!empty) {
4626 smart_str_appendl(&str, line_sep, line_sep_len);
4627 }
4628 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4629 empty = false;
4630 }
4631
4632 if (!suppress_content_type) {
4633 if (!empty) {
4634 smart_str_appendl(&str, line_sep, line_sep_len);
4635 }
4636 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4637
4638 p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4639 if (p != NULL) {
4640 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4641 smart_str_appends(&str, p);
4642 }
4643 empty = false;
4644 }
4645
4646 if (!suppress_content_transfer_encoding) {
4647 if (!empty) {
4648 smart_str_appendl(&str, line_sep, line_sep_len);
4649 }
4650 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4651 p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4652 if (p == NULL) {
4653 p = "7bit";
4654 }
4655 smart_str_appends(&str, p);
4656 }
4657
4658 str_headers = smart_str_extract(&str);
4659
4660 zend_string *force_extra_parameters = zend_ini_str_ex("mail.force_extra_parameters", strlen("mail.force_extra_parameters"), false, NULL);
4661 if (force_extra_parameters) {
4662 extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4663 } else if (extra_cmd) {
4664 extra_cmd = php_escape_shell_cmd(extra_cmd);
4665 }
4666
4667 RETVAL_BOOL(php_mail(to_r, ZSTR_VAL(subject), message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL));
4668
4669 if (extra_cmd) {
4670 zend_string_release_ex(extra_cmd, 0);
4671 }
4672 if (to_r != to) {
4673 efree(to_r);
4674 }
4675 zend_string_release(subject);
4676 zend_string_free(conv);
4677 zend_hash_destroy(&ht_headers);
4678 if (str_headers) {
4679 zend_string_release_ex(str_headers, 0);
4680 }
4681 }
4682
4683 #undef CRLF
4684 #undef MAIL_ASCIIZ_CHECK_MBSTRING
4685 #undef PHP_MBSTR_MAIL_MIME_HEADER1
4686 #undef PHP_MBSTR_MAIL_MIME_HEADER2
4687 #undef PHP_MBSTR_MAIL_MIME_HEADER3
4688 #undef PHP_MBSTR_MAIL_MIME_HEADER4
4689 /* }}} */
4690
4691 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)4692 PHP_FUNCTION(mb_get_info)
4693 {
4694 zend_string *type = NULL;
4695 size_t n;
4696 char *name;
4697 zval row;
4698 const mbfl_encoding **entry;
4699 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
4700
4701 ZEND_ASSERT(lang);
4702
4703 ZEND_PARSE_PARAMETERS_START(0, 1)
4704 Z_PARAM_OPTIONAL
4705 Z_PARAM_STR(type)
4706 ZEND_PARSE_PARAMETERS_END();
4707
4708 if (!type || zend_string_equals_literal_ci(type, "all")) {
4709 array_init(return_value);
4710 if (MBSTRG(current_internal_encoding)) {
4711 add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4712 }
4713 if (MBSTRG(http_input_identify)) {
4714 add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4715 }
4716 if (MBSTRG(current_http_output_encoding)) {
4717 add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4718 }
4719
4720 add_assoc_str(return_value, "http_output_conv_mimetypes",
4721 zend_ini_str("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)
4722 );
4723
4724 name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4725 add_assoc_string(return_value, "mail_charset", name);
4726
4727 name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4728 add_assoc_string(return_value, "mail_header_encoding", name);
4729
4730 name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4731 add_assoc_string(return_value, "mail_body_encoding", name);
4732
4733 add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4734
4735 if (MBSTRG(encoding_translation)) {
4736 add_assoc_string(return_value, "encoding_translation", "On");
4737 } else {
4738 add_assoc_string(return_value, "encoding_translation", "Off");
4739 }
4740
4741 name = (char *)mbfl_no_language2name(MBSTRG(language));
4742 add_assoc_string(return_value, "language", name);
4743
4744 // TODO Seems to always have one entry at least?
4745 n = MBSTRG(current_detect_order_list_size);
4746 entry = MBSTRG(current_detect_order_list);
4747 if (n > 0) {
4748 size_t i;
4749 array_init(&row);
4750 for (i = 0; i < n; i++) {
4751 add_next_index_string(&row, (*entry)->name);
4752 entry++;
4753 }
4754 add_assoc_zval(return_value, "detect_order", &row);
4755 }
4756 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4757 add_assoc_string(return_value, "substitute_character", "none");
4758 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4759 add_assoc_string(return_value, "substitute_character", "long");
4760 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4761 add_assoc_string(return_value, "substitute_character", "entity");
4762 } else {
4763 add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4764 }
4765 if (MBSTRG(strict_detection)) {
4766 add_assoc_string(return_value, "strict_detection", "On");
4767 } else {
4768 add_assoc_string(return_value, "strict_detection", "Off");
4769 }
4770 } else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4771 ZEND_ASSERT(MBSTRG(current_internal_encoding));
4772 RETURN_STRING((char *)MBSTRG(current_internal_encoding)->name);
4773 } else if (zend_string_equals_literal_ci(type, "http_input")) {
4774 if (MBSTRG(http_input_identify)) {
4775 RETURN_STRING((char *)MBSTRG(http_input_identify)->name);
4776 }
4777 RETURN_NULL();
4778 } else if (zend_string_equals_literal_ci(type, "http_output")) {
4779 ZEND_ASSERT(MBSTRG(current_http_output_encoding));
4780 RETURN_STRING((char *)MBSTRG(current_http_output_encoding)->name);
4781 } else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4782 RETURN_STR(
4783 zend_ini_str(
4784 "mbstring.http_output_conv_mimetypes",
4785 sizeof("mbstring.http_output_conv_mimetypes") - 1,
4786 false
4787 )
4788 );
4789 } else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4790 name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4791 RETURN_STRING(name);
4792 } else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4793 name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4794 RETURN_STRING(name);
4795 } else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4796 name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4797 RETURN_STRING(name);
4798 } else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4799 RETURN_LONG(MBSTRG(illegalchars));
4800 } else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4801 if (MBSTRG(encoding_translation)) {
4802 RETURN_STRING("On");
4803 } else {
4804 RETURN_STRING("Off");
4805 }
4806 } else if (zend_string_equals_literal_ci(type, "language")) {
4807 name = (char *)mbfl_no_language2name(MBSTRG(language));
4808 RETURN_STRING(name);
4809 } else if (zend_string_equals_literal_ci(type, "detect_order")) {
4810 // TODO Seems to always have one entry at least?
4811 n = MBSTRG(current_detect_order_list_size);
4812 entry = MBSTRG(current_detect_order_list);
4813 if (n > 0) {
4814 size_t i;
4815 array_init(return_value);
4816 for (i = 0; i < n; i++) {
4817 add_next_index_string(return_value, (*entry)->name);
4818 entry++;
4819 }
4820 }
4821 } else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4822 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4823 RETURN_STRING("none");
4824 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4825 RETURN_STRING("long");
4826 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4827 RETURN_STRING("entity");
4828 } else {
4829 RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
4830 }
4831 } else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4832 if (MBSTRG(strict_detection)) {
4833 RETURN_STRING("On");
4834 } else {
4835 RETURN_STRING("Off");
4836 }
4837 } else {
4838 php_error_docref(NULL, E_WARNING, "argument #1 ($type) must be a valid type");
4839 RETURN_FALSE;
4840 }
4841 }
4842 /* }}} */
4843
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)4844 MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4845 {
4846 uint32_t wchar_buf[128];
4847 unsigned char *in = (unsigned char*)input;
4848 unsigned int state = 0;
4849
4850 if (encoding->check != NULL) {
4851 return encoding->check(in, length);
4852 }
4853
4854 /* If the input string is not encoded in the given encoding, there is a significant chance
4855 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4856 * buffer of 128 codepoints, convert and check just a few codepoints first */
4857 size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4858 ZEND_ASSERT(out_len <= 8);
4859 for (unsigned int i = 0; i < out_len; i++) {
4860 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4861 return false;
4862 }
4863 }
4864
4865 while (length) {
4866 out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4867 ZEND_ASSERT(out_len <= 128);
4868 for (unsigned int i = 0; i < out_len; i++) {
4869 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4870 return false;
4871 }
4872 }
4873 }
4874
4875 return true;
4876 }
4877
4878 /* MSVC 32-bit has issues with 64-bit intrinsics.
4879 * (Bad 7/8-byte UTF-8 strings would be wrongly passed through as 'valid')
4880 * It seems this is caused by a bug in MS Visual C++
4881 * Ref: https://stackoverflow.com/questions/37509129/potential-bug-in-visual-studio-c-compiler-or-in-intel-intrinsics-avx2-mm256-s */
4882 #if defined(PHP_WIN32) && !defined(__clang__) && defined(_MSC_VER) && defined(_M_IX86)
4883 # define MBSTRING_BROKEN_X86_MSVC_INTRINSICS
4884 #endif
4885
4886 /* If we are building an AVX2-only binary, don't compile the next function */
4887 #ifndef ZEND_INTRIN_AVX2_NATIVE
4888
/* SSE2-based function for validating UTF-8 strings
 * A faster implementation which uses AVX2 instructions follows
 *
 * Returns true if the content of `str` passes all the checks below, or false as soon
 * as an invalid byte or byte sequence is found
 *
 * If `__SSE2__` is not defined, a scalar byte-by-byte validator is compiled instead
 * of the vectorized code */
static bool mb_fast_check_utf8_default(zend_string *str)
{
	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
# ifdef __SSE2__
	/* `e` points 1 byte past the last full 16-byte block of string content
	 * Note that we include the terminating null byte which is included in each zend_string
	 * as part of the content to check; this ensures that multi-byte characters which are
	 * truncated abruptly at the end of the string will be detected as invalid */
	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m128i) - 1));

	/* For checking for illegal bytes 0xF5-FF */
	const __m128i over_f5 = _mm_set1_epi8(-117);
	/* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
	const __m128i over_9f = _mm_set1_epi8(-97);
	/* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
	const __m128i over_8f = _mm_set1_epi8(-113);
	/* For checking for illegal bytes 0xC0-C1 */
	const __m128i find_c0 = _mm_set1_epi8(-64);
	const __m128i c0_to_c1 = _mm_set1_epi8(-126);
	/* For checking structure of continuation bytes */
	const __m128i find_e0 = _mm_set1_epi8(-32);
	const __m128i find_f0 = _mm_set1_epi8(-16);

	/* `last_block` holds the previously checked 16 bytes, so that characters which
	 * straddle a block boundary can be validated; it starts out as all zeroes */
	__m128i last_block = _mm_setzero_si128();
	__m128i operand;

	while (p < e) {
		operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */

		/* The tail-handling code after the loop also jumps here, with `operand` holding
		 * the zero-padded trailing bytes of the string */
check_operand:
		/* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
		if (!_mm_movemask_epi8(operand)) {
			/* Even if this block only contains single-byte characters, there may have been a
			 * multi-byte character at the end of the previous block, which was supposed to
			 * have continuation bytes in this block
			 * This bitmask will pick out a 2/3/4-byte character starting from the last byte of
			 * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
			 * from the 3rd last */
			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
			if (_mm_movemask_epi8(bad)) {
				return false;
			}

			/* Consume as many full blocks of single-byte characters as we can */
			while (true) {
				p += sizeof(__m128i);
				if (p >= e) {
					goto finish_up_remaining_bytes;
				}
				operand = _mm_loadu_si128((__m128i*)p);
				if (_mm_movemask_epi8(operand)) {
					break;
				}
			}
		}

		/* Check for >= 0xF5, which are illegal byte values in UTF-8
		 * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
		 * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
		 * Then a single signed compare will pick out any bad bytes
		 * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
		__m128i bad = _mm_cmplt_epi8(_mm_add_epi8(operand, over_f5), over_f5);

		/* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
		 * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
		 * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
		 * We can check for both problems at once by generating a vector where each byte < 0xA0
		 * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
		 * Shift the original block right by one byte, and compare the shifted block with the bitmask */
		__m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15));
		__m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f)));
		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask1));

		/* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
		 * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
		 * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
		 * Build the bitmask and compare it with the shifted block */
		__m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f)));
		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask2));

		/* Check for overlong 2-byte code units
		 * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
		 * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
		 * byte range, do a signed compare to pick out any bad bytes */
		bad = _mm_or_si128(bad, _mm_cmplt_epi8(_mm_add_epi8(operand, find_c0), c0_to_c1));

		/* Check structure of continuation bytes
		 * A UTF-8 byte should be a continuation byte if, and only if, it is:
		 * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
		 * 2) 2 bytes after the start of a 3-byte or 4-byte character
		 * 3) 3 bytes after the start of a 4-byte character
		 * We build 3 bitmasks with 0xFF in each such position, and OR them together to
		 * get a single bitmask with 0xFF in each position where a continuation byte should be */
		__m128i cont_mask = _mm_cmpeq_epi8(_mm_and_si128(operand2, find_c0), find_c0);
		__m128i operand3 = _mm_or_si128(_mm_slli_si128(operand, 2), _mm_srli_si128(last_block, 14));
		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand3, find_e0), find_e0));
		__m128i operand4 = _mm_or_si128(_mm_slli_si128(operand, 3), _mm_srli_si128(last_block, 13));
		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand4, find_f0), find_f0));

		/* Now, use a signed comparison to get another bitmask with 0xFF in each position where
		 * a continuation byte actually is
		 * XOR those two bitmasks together; if everything is good, the result should be zero
		 * However, if a byte which should have been a continuation wasn't, or if a byte which
		 * shouldn't have been a continuation was, we will get 0xFF in that position */
		__m128i continuation = _mm_cmplt_epi8(operand, find_c0);
		bad = _mm_or_si128(bad, _mm_xor_si128(continuation, cont_mask));

		/* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
		 * If that value is non-zero, then we found a bad byte somewhere! */
		if (_mm_movemask_epi8(bad)) {
			return false;
		}

		last_block = operand;
		p += sizeof(__m128i);
	}

finish_up_remaining_bytes:
	/* Finish up 1-15 remaining bytes
	 * After the tail operand has gone through `check_operand`, `p` has been advanced
	 * past `e`, so the `p == e` test below fails and we fall through to `return true` */
	if (p == e) {
		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m128i) - 1); /* Not including terminating null */

		/* Crazy hack here for cases where 5, 6, or 9 or more bytes are remaining...
		 * We want to use the above vectorized code to check a block of less than 16 bytes,
		 * but there is no good way to read a variable number of bytes into an XMM register
		 * However, we know that these bytes are part of a zend_string, and a zend_string has some
		 * 'header' fields which occupy the memory just before its content
		 * And, those header fields occupy more than 16 bytes...
		 * So if we go back some bytes from `p`, and load 16 bytes from there, we may pick up
		 * some 'junk' bytes from before `p` (possibly from the zend_string header fields), but
		 * we will get the bytes we wanted in the tail end of our XMM register, and this will
		 * never cause a segfault.
		 * Then, we do a byte-wise shift (_mm_srli_si128) to get rid of the unwanted bytes
		 * Conveniently, the same shift also zero-fills the tail end of the XMM register
		 *
		 * The following `switch` looks useless, but it's not
		 * The PSRLDQ instruction used for the 128-bit byte shift requires an immediate (literal)
		 * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
		 */
		switch (remaining_bytes) {
		case 0: ;
			/* Only the terminating null is left; just make sure the previous block
			 * did not end with an incomplete multi-byte character */
			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
			return _mm_movemask_epi8(bad) == 0;
		case 1:
		case 2:
			/* Read 2 bytes at `p`; the rest of the register is zero */
			operand = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, *((uint16_t*)p));
			goto check_operand;
		case 3:
		case 4:
			/* Read 4 bytes at `p`; the rest of the register is zero */
			operand = _mm_set_epi32(0, 0, 0, *((uint32_t*)p));
			goto check_operand;
		case 5:
			/* Load from 10 bytes before `p` and shift by 10: keeps the 6 bytes at `p` */
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10);
			goto check_operand;
		case 6:
			/* Keeps the 7 bytes at `p` */
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9);
			goto check_operand;
		case 7:
		case 8:
			/* Read 8 bytes at `p`; the rest of the register is zero */
#ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
			operand = _mm_set_epi32(0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
#else
			operand = _mm_set_epi64x(0, *((uint64_t*)p));
#endif
			goto check_operand;
		case 9:
			/* Cases 9-14: load from (16 - N - 1) bytes before `p` and shift by the same
			 * distance, which keeps the N + 1 bytes at `p` */
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6);
			goto check_operand;
		case 10:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5);
			goto check_operand;
		case 11:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4);
			goto check_operand;
		case 12:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3);
			goto check_operand;
		case 13:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2);
			goto check_operand;
		case 14:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1);
			goto check_operand;
		case 15:
			/* No trailing bytes are left which need to be checked
			 * We get 15 because we did not include the terminating null when
			 * calculating `remaining_bytes`, so the value wraps around */
			return true;
		}

		ZEND_UNREACHABLE();
	}

	return true;
# else
	/* This UTF-8 validation function is derived from PCRE2 */
	size_t length = ZSTR_LEN(str);
	/* Table of the number of extra bytes, indexed by the first byte masked with
	   0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
	static const uint8_t utf8_table[] = {
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
		3,3,3,3,3,3,3,3
	};

	/* `length` counts the bytes not yet consumed; `p` is advanced once per loop
	 * iteration, plus once for every continuation byte examined in the body */
	for (; length > 0; p++) {
		uint32_t d;
		unsigned char c = *p;
		length--;

		if (c < 128) {
			/* ASCII character */
			continue;
		}

		if (c < 0xc0) {
			/* Isolated 10xx xxxx byte */
			return false;
		}

		/* Bytes 0xF5-FF are never valid as the first byte of a character */
		if (c >= 0xf5) {
			return false;
		}

		uint32_t ab = utf8_table[c & 0x3f]; /* Number of additional bytes (1-3) */
		if (length < ab) {
			/* Missing bytes */
			return false;
		}
		length -= ab;

		/* Check top bits in the second byte */
		if (((d = *(++p)) & 0xc0) != 0x80) {
			return false;
		}

		/* For each length, check that the remaining bytes start with the 0x80 bit
		 * set and not the 0x40 bit. Then check for an overlong sequence, and for the
		 * excluded range 0xd800 to 0xdfff. */
		switch (ab) {
		case 1:
			/* 2-byte character. No further bytes to check for 0x80. Check first byte
			 * for xx00 000x (overlong sequence). */
			if ((c & 0x3e) == 0) {
				return false;
			}
			break;

		case 2:
			/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes for
			 * 1110 0000, xx0x xxxx (overlong sequence) or 1110 1101, 1010 xxxx (0xd800-0xdfff) */
			if ((*(++p) & 0xc0) != 0x80 || (c == 0xe0 && (d & 0x20) == 0) || (c == 0xed && d >= 0xa0)) {
				return false;
			}
			break;

		case 3:
			/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
			 * bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
			 * character greater than 0x0010ffff (f4 8f bf bf)
			 * (`c > 0xf4` cannot be true here, since bytes >= 0xf5 were rejected above) */
			if ((*(++p) & 0xc0) != 0x80 || (*(++p) & 0xc0) != 0x80 || (c == 0xf0 && (d & 0x30) == 0) || (c > 0xf4 || (c == 0xf4 && d > 0x8f))) {
				return false;
			}
			break;

		EMPTY_SWITCH_DEFAULT_CASE();
		}
	}

	return true;
# endif
}
5165
5166 #endif /* #ifndef ZEND_INTRIN_AVX2_NATIVE */
5167
5168 #ifdef ZEND_INTRIN_AVX2_NATIVE
5169
5170 /* We are building AVX2-only binary */
5171 # include <immintrin.h>
5172 # define mb_fast_check_utf8 mb_fast_check_utf8_avx2
5173
5174 #elif defined(ZEND_INTRIN_AVX2_RESOLVER)
5175
5176 /* We are building binary which works with or without AVX2; whether or not to use
5177 * AVX2-accelerated functions will be determined at runtime */
5178 # include <immintrin.h>
5179 # include "Zend/zend_cpuinfo.h"
5180
5181 # ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
5182 /* Dynamic linker will decide whether or not to use AVX2-based functions and
5183 * resolve symbols accordingly */
5184
5185 ZEND_INTRIN_AVX2_FUNC_DECL(bool mb_fast_check_utf8_avx2(zend_string *str));
5186
5187 bool mb_fast_check_utf8(zend_string *str) __attribute__((ifunc("resolve_check_utf8")));
5188
5189 typedef bool (*check_utf8_func_t)(zend_string*);
5190
5191 ZEND_NO_SANITIZE_ADDRESS
5192 ZEND_ATTRIBUTE_UNUSED
resolve_check_utf8(void)5193 static check_utf8_func_t resolve_check_utf8(void)
5194 {
5195 if (zend_cpu_supports_avx2()) {
5196 return mb_fast_check_utf8_avx2;
5197 }
5198 return mb_fast_check_utf8_default;
5199 }
5200
5201 # else /* ZEND_INTRIN_AVX2_FUNC_PTR */
5202 /* We are compiling for a target where the dynamic linker will not be able to
5203 * resolve symbols according to whether the host supports AVX2 or not; so instead,
5204 * we can make calls go through a function pointer and set the function pointer
5205 * on module load */
5206
5207 #ifdef HAVE_FUNC_ATTRIBUTE_TARGET
5208 static bool mb_fast_check_utf8_avx2(zend_string *str) __attribute__((target("avx2")));
5209 #else
5210 static bool mb_fast_check_utf8_avx2(zend_string *str);
5211 #endif
5212
5213 static bool (*check_utf8_ptr)(zend_string *str) = NULL;
5214
/* Validates `str` through whichever implementation `check_utf8_ptr` points at and
 * passes its result back unchanged */
static bool mb_fast_check_utf8(zend_string *str)
{
	return (*check_utf8_ptr)(str);
}
5219
init_check_utf8(void)5220 static void init_check_utf8(void)
5221 {
5222 if (zend_cpu_supports_avx2()) {
5223 check_utf8_ptr = mb_fast_check_utf8_avx2;
5224 } else {
5225 check_utf8_ptr = mb_fast_check_utf8_default;
5226 }
5227 }
5228 # endif
5229
5230 #else
5231
5232 /* No AVX2 support */
5233 #define mb_fast_check_utf8 mb_fast_check_utf8_default
5234
5235 #endif
5236
5237 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
5238
5239 /* GCC prior to version 8 does not define all intrinsics. See GH-11514.
5240 * Use a workaround from https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values */
5241 #if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 8
5242 # define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
5243 #endif
5244
5245 /* Take (256-bit) `hi` and `lo` as a 512-bit value, shift down by some
5246 * number of bytes, then take the low 256 bits
5247 * This is used to take some number of trailing bytes from the previous 32-byte
5248 * block followed by some number of leading bytes from the current 32-byte block
5249 *
5250 * _mm256_alignr_epi8 (VPALIGNR) is used to shift out bytes from a 256-bit
5251 * YMM register while shifting in bytes from another YMM register... but
5252 * it works separately on respective 128-bit halves of the YMM registers,
5253 * which is not what we want.
 * To make it work as desired, we first do _mm256_permute2x128_si256
 * (VPERM2I128) with selector 33 (0x21) to combine the high 128 bits of the
 * previous block (`hi`) and the low 128 bits of the current block (`lo`)
 * in one YMM register.
 * Then VPALIGNR will do what is needed. */
5258 #define _mm256_shift_epi8(hi, lo, shift) _mm256_alignr_epi8(lo, _mm256_permute2x128_si256(hi, lo, 33), 16 - shift)
5259
5260 /* AVX2-based UTF-8 validation function; validates text in 32-byte chunks
5261 *
5262 * Some parts of this function are the same as `mb_fast_check_utf8`; code comments
5263 * are not repeated, so consult `mb_fast_check_utf8` for information on uncommented
5264 * sections. */
5265 #ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
mb_fast_check_utf8_avx2(zend_string * str)5266 ZEND_API bool mb_fast_check_utf8_avx2(zend_string *str)
5267 #else
5268 static bool mb_fast_check_utf8_avx2(zend_string *str)
5269 #endif
5270 {
5271 unsigned char *p = (unsigned char*)ZSTR_VAL(str);
5272 unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m256i) - 1));
5273
5274 /* The algorithm used here for UTF-8 validation is partially adapted from the
5275 * paper "Validating UTF-8 In Less Than One Instruction Per Byte", by John Keiser
5276 * and Daniel Lemire.
5277 * Ref: https://arxiv.org/pdf/2010.03090.pdf
5278 *
5279 * Most types of invalid UTF-8 text can be detected by examining pairs of
5280 * successive bytes. Specifically:
5281 *
5282 * • Overlong 2-byte code units start with 0xC0 or 0xC1.
5283 * No valid UTF-8 string ever uses these byte values.
5284 * • Overlong 3-byte code units start with 0xE0, followed by a byte < 0xA0.
5285 * • Overlong 4-byte code units start with 0xF0, followed by a byte < 0x90.
5286 * • 5-byte or 6-byte code units, which should never be used, start with
5287 * 0xF8-FE.
5288 * • A codepoint value higher than U+10FFFF, which is the highest value for
5289 * any Unicode codepoint, would either start with 0xF4, followed by a
5290 * byte >= 0x90, or else would start with 0xF5-F7, followed by any value.
5291 * • A codepoint value from U+D800-DFFF, which are reserved and should never
5292 * be used, would start with 0xED, followed by a byte >= 0xA0.
5293 * • The byte value 0xFF is also illegal and is never used in valid UTF-8.
5294 *
5295 * To detect all these problems, for each pair of successive bytes, we do
5296 * table lookups using the high nibble of the first byte, the low nibble of
5297 * the first byte, and the high nibble of the second byte. Each table lookup
5298 * retrieves a bitmask, in which each 1 bit indicates a possible invalid
5299 * combination; AND those three bitmasks together, and any 1 bit in the result
5300 * will indicate an actual invalid byte combination was found.
5301 */
5302
5303 #define BAD_BYTE 0x1
5304 #define OVERLONG_2BYTE 0x2
5305 #define _1BYTE (BAD_BYTE | OVERLONG_2BYTE)
5306 #define OVERLONG_3BYTE 0x4
5307 #define SURROGATE 0x8
5308 #define OVERLONG_4BYTE 0x10
5309 #define INVALID_CP 0x20
5310
5311 /* Each of these are 16-entry tables, repeated twice; this is required by the
5312 * VPSHUFB instruction which we use to perform 32 table lookups in parallel
5313 * The first entry is for 0xF, the second is for 0xE, and so on down to 0x0
5314 *
5315 * So, for example, notice that the 4th entry in the 1st table is OVERLONG_2BYTE;
5316 * that means that high nibble 0xC is consistent with the byte pair being part of
5317 * an overlong 2-byte code unit */
5318 const __m256i bad_hi_nibble2 = _mm256_set_epi8(
5319 BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5320 0, 0, 0, 0,
5321 0, 0, 0, 0,
5322 0, 0, 0, 0,
5323 BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5324 0, 0, 0, 0,
5325 0, 0, 0, 0,
5326 0, 0, 0, 0);
5327 const __m256i bad_lo_nibble2 = _mm256_set_epi8(
5328 BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5329 BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5330 BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5331 0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5332 BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5333 BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5334 BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5335 0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5336 const __m256i bad_hi_nibble = _mm256_set_epi8(
5337 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5338 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5339 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5340 _1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5341 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5342 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5343 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5344 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5345 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5346 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5347 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5348 _1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5349 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5350 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5351 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5352 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5353
5354 const __m256i find_continuation = _mm256_set1_epi8(-64);
5355 const __m256i _b = _mm256_set1_epi8(0xB);
5356 const __m256i _d = _mm256_set1_epi8(0xD);
5357 const __m256i _f = _mm256_set1_epi8(0xF);
5358
5359 __m256i last_hi_nibbles = _mm256_setzero_si256(), last_lo_nibbles = _mm256_setzero_si256();
5360 __m256i operand;
5361
5362 while (p < e) {
5363 operand = _mm256_loadu_si256((__m256i*)p);
5364
5365 check_operand:
5366 if (!_mm256_movemask_epi8(operand)) {
5367 /* Entire 32-byte block is ASCII characters; the only thing we need to validate is that
5368 * the previous block didn't end with an incomplete multi-byte character
5369 * (This will also confirm that the previous block didn't end with a bad byte like 0xFF) */
5370 __m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5371 __m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5372 if (_mm256_movemask_epi8(bad)) {
5373 return false;
5374 }
5375
5376 /* Consume as many full blocks of single-byte characters as we can */
5377 while (true) {
5378 p += sizeof(__m256i);
5379 if (p >= e) {
5380 goto finish_up_remaining_bytes;
5381 }
5382 operand = _mm256_loadu_si256((__m256i*)p);
5383 if (_mm256_movemask_epi8(operand)) {
5384 break;
5385 }
5386 }
5387 }
5388
5389 __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(operand, 4), _f);
5390 __m256i lo_nibbles = _mm256_and_si256(operand, _f);
5391
5392 __m256i lo_nibbles2 = _mm256_shift_epi8(last_lo_nibbles, lo_nibbles, 1);
5393 __m256i hi_nibbles2 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 1);
5394
5395 /* Do parallel table lookups in all 3 tables */
5396 __m256i bad = _mm256_cmpgt_epi8(
5397 _mm256_and_si256(
5398 _mm256_and_si256(
5399 _mm256_shuffle_epi8(bad_lo_nibble2, lo_nibbles2),
5400 _mm256_shuffle_epi8(bad_hi_nibble2, hi_nibbles2)),
5401 _mm256_shuffle_epi8(bad_hi_nibble, hi_nibbles)),
5402 _mm256_setzero_si256());
5403
5404 __m256i cont_mask = _mm256_cmpgt_epi8(hi_nibbles2, _b);
5405 __m256i hi_nibbles3 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 2);
5406 cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpgt_epi8(hi_nibbles3, _d));
5407 __m256i hi_nibbles4 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 3);
5408 cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpeq_epi8(hi_nibbles4, _f));
5409
5410 __m256i continuation = _mm256_cmpgt_epi8(find_continuation, operand);
5411 bad = _mm256_or_si256(bad, _mm256_xor_si256(continuation, cont_mask));
5412
5413 if (_mm256_movemask_epi8(bad)) {
5414 return false;
5415 }
5416
5417 last_hi_nibbles = hi_nibbles;
5418 last_lo_nibbles = lo_nibbles;
5419 p += sizeof(__m256i);
5420 }
5421
5422 finish_up_remaining_bytes:
5423 if (p == e) {
5424 uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m256i) - 1); /* Not including terminating null */
5425
5426 switch (remaining_bytes) {
5427 case 0: ;
5428 /* No actual data bytes are remaining */
5429 __m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5430 __m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5431 return _mm256_movemask_epi8(bad) == 0;
5432 case 1:
5433 case 2:
5434 operand = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, *((int16_t*)p));
5435 goto check_operand;
5436 case 3:
5437 case 4:
5438 operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, *((int32_t*)p));
5439 goto check_operand;
5440 case 5:
5441 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10));
5442 goto check_operand;
5443 case 6:
5444 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9));
5445 goto check_operand;
5446 case 7:
5447 case 8:
5448 #ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
5449 operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
5450 #else
5451 operand = _mm256_set_epi64x(0, 0, 0, *((int64_t*)p));
5452 #endif
5453 goto check_operand;
5454 case 9:
5455 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6));
5456 goto check_operand;
5457 case 10:
5458 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5));
5459 goto check_operand;
5460 case 11:
5461 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4));
5462 goto check_operand;
5463 case 12:
5464 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3));
5465 goto check_operand;
5466 case 13:
5467 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2));
5468 goto check_operand;
5469 case 14:
5470 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1));
5471 goto check_operand;
5472 case 15:
5473 case 16:
5474 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_loadu_si128((__m128i*)p));
5475 goto check_operand;
5476 case 17:
5477 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 2)), 14), _mm_loadu_si128((__m128i*)p));
5478 goto check_operand;
5479 case 18:
5480 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 3)), 13), _mm_loadu_si128((__m128i*)p));
5481 goto check_operand;
5482 case 19:
5483 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 4)), 12), _mm_loadu_si128((__m128i*)p));
5484 goto check_operand;
5485 case 20:
5486 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 5)), 11), _mm_loadu_si128((__m128i*)p));
5487 goto check_operand;
5488 case 21:
5489 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 6)), 10), _mm_loadu_si128((__m128i*)p));
5490 goto check_operand;
5491 case 22:
5492 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 7)), 9), _mm_loadu_si128((__m128i*)p));
5493 goto check_operand;
5494 case 23:
5495 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 8)), 8), _mm_loadu_si128((__m128i*)p));
5496 goto check_operand;
5497 case 24:
5498 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 9)), 7), _mm_loadu_si128((__m128i*)p));
5499 goto check_operand;
5500 case 25:
5501 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 10)), 6), _mm_loadu_si128((__m128i*)p));
5502 goto check_operand;
5503 case 26:
5504 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 11)), 5), _mm_loadu_si128((__m128i*)p));
5505 goto check_operand;
5506 case 27:
5507 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 12)), 4), _mm_loadu_si128((__m128i*)p));
5508 goto check_operand;
5509 case 28:
5510 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 13)), 3), _mm_loadu_si128((__m128i*)p));
5511 goto check_operand;
5512 case 29:
5513 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 14)), 2), _mm_loadu_si128((__m128i*)p));
5514 goto check_operand;
5515 case 30:
5516 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 15)), 1), _mm_loadu_si128((__m128i*)p));
5517 goto check_operand;
5518 case 31:
5519 return true;
5520 }
5521
5522 ZEND_UNREACHABLE();
5523 }
5524
5525 return true;
5526 }
5527
5528 #endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
5529
/* Check whether `str` is valid in `encoding`.
 *
 * UTF-8 gets special treatment: a string already flagged as valid UTF-8 is accepted without
 * being rescanned; otherwise it is scanned by mb_fast_check_utf8() and, when that succeeds and
 * the string is not interned, the IS_STR_VALID_UTF8 flag is set on it.
 * Every other encoding is delegated to php_mb_check_encoding().
 *
 * Returns true if the string is valid, false otherwise. */
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
{
	if (encoding != &mbfl_encoding_utf8) {
		return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
	}

	/* Validity was already established earlier; nothing to scan */
	if (ZSTR_IS_VALID_UTF8(str)) {
		return true;
	}

	if (!mb_fast_check_utf8(str)) {
		return false;
	}

	/* Record the positive result on the string itself (interned strings are left untouched) */
	if (!ZSTR_IS_INTERNED(str)) {
		GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
	}
	return true;
}
5545
/* Check every key and value of `vars` (descending into nested arrays) against `encoding`.
 *
 * - String keys and string values must pass mb_check_str_encoding().
 * - Nested arrays are checked recursively.
 * - Integers, floats, null and booleans are accepted as they are.
 * - Any other value type makes the result false.
 *
 * An invalid key stops the iteration immediately. An invalid value only records the failure;
 * the remaining entries are still visited.
 * If `vars` is already marked as being traversed, a warning is emitted and false is returned.
 *
 * Returns true only if nothing invalid was found. */
static bool php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	bool valid = true;
	zval *entry;
	zend_string *key;
	zend_long idx;

	/* `idx` is only needed to satisfy the iteration macro; keep compilers quiet about it */
	(void)(idx);

	if (GC_IS_RECURSIVE(vars)) {
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return false;
	}

	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
		ZVAL_DEREF(entry);

		if (key && !mb_check_str_encoding(key, encoding)) {
			valid = false;
			break; /* leaves the hash iteration */
		}

		switch (Z_TYPE_P(entry)) {
			case IS_NULL:
			case IS_FALSE:
			case IS_TRUE:
			case IS_LONG:
			case IS_DOUBLE:
				/* Scalars without text content are always acceptable */
				break;
			case IS_ARRAY:
				if (!php_mb_check_encoding_recursive(Z_ARRVAL_P(entry), encoding)) {
					valid = false;
				}
				break;
			case IS_STRING:
				if (!mb_check_str_encoding(Z_STR_P(entry), encoding)) {
					valid = false;
				}
				break;
			default:
				/* Any other value type is treated as a failure */
				valid = false;
				break;
		}
	} ZEND_HASH_FOREACH_END();
	GC_TRY_UNPROTECT_RECURSION(vars);

	return valid;
}
5596
5597 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)5598 PHP_FUNCTION(mb_check_encoding)
5599 {
5600 zend_string *input_str = NULL, *enc = NULL;
5601 HashTable *input_ht = NULL;
5602 const mbfl_encoding *encoding;
5603
5604 ZEND_PARSE_PARAMETERS_START(0, 2)
5605 Z_PARAM_OPTIONAL
5606 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
5607 Z_PARAM_STR_OR_NULL(enc)
5608 ZEND_PARSE_PARAMETERS_END();
5609
5610 encoding = php_mb_get_encoding(enc, 2);
5611 if (!encoding) {
5612 RETURN_THROWS();
5613 }
5614
5615 if (input_ht) {
5616 RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
5617 } else if (input_str) {
5618 RETURN_BOOL(mb_check_str_encoding(input_str, encoding));
5619 } else {
5620 php_error_docref(NULL, E_DEPRECATED,
5621 "Calling mb_check_encoding() without argument is deprecated");
5622
5623 /* FIXME: Actually check all inputs, except $_FILES file content. */
5624 RETURN_BOOL(MBSTRG(illegalchars) == 0);
5625 }
5626 }
5627 /* }}} */
5628
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)5629 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
5630 const uint32_t enc_name_arg_num)
5631 {
5632 const mbfl_encoding *enc;
5633 enum mbfl_no_encoding no_enc;
5634
5635 ZEND_ASSERT(str_len > 0);
5636
5637 enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
5638 if (!enc) {
5639 return -2;
5640 }
5641
5642 no_enc = enc->no_encoding;
5643 if (php_mb_is_unsupported_no_encoding(no_enc)) {
5644 zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
5645 return -2;
5646 }
5647
5648 /* Some legacy text encodings have a minimum required wchar buffer size;
5649 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
5650 uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
5651 unsigned int state = 0;
5652 size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
5653 ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
5654
5655 if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
5656 return -1;
5657 }
5658 return wchar_buf[0];
5659 }
5660
5661 /* {{{ */
PHP_FUNCTION(mb_ord)5662 PHP_FUNCTION(mb_ord)
5663 {
5664 char *str;
5665 size_t str_len;
5666 zend_string *enc = NULL;
5667 zend_long cp;
5668
5669 ZEND_PARSE_PARAMETERS_START(1, 2)
5670 Z_PARAM_STRING(str, str_len)
5671 Z_PARAM_OPTIONAL
5672 Z_PARAM_STR_OR_NULL(enc)
5673 ZEND_PARSE_PARAMETERS_END();
5674
5675 if (str_len == 0) {
5676 zend_argument_must_not_be_empty_error(1);
5677 RETURN_THROWS();
5678 }
5679
5680 cp = php_mb_ord(str, str_len, enc, 2);
5681
5682 if (0 > cp) {
5683 if (cp == -2) {
5684 RETURN_THROWS();
5685 }
5686 RETURN_FALSE;
5687 }
5688
5689 RETURN_LONG(cp);
5690 }
5691 /* }}} */
5692
/* Build a string holding the single code point `cp`, encoded in the encoding named by
 * `enc_name`. `enc_name_arg_num` is forwarded to php_mb_get_encoding().
 *
 * Returns NULL when:
 *   - the encoding cannot be resolved, or is not supported by mb_chr() (a ValueError is raised
 *     here in the unsupported case),
 *   - `cp` is outside 0..0x10FFFF,
 *   - the target is UTF-8 and `cp` lies in 0xD800..0xDFFF,
 *   - the conversion from UCS-4BE increased MBSTRG(illegalchars).
 * Otherwise returns the encoded string. */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (enc == NULL) {
		return NULL;
	}

	enum mbfl_no_encoding no_enc = enc->no_encoding;
	if (php_mb_is_unsupported_no_encoding(no_enc)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", enc->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	if (php_mb_is_no_encoding_utf8(no_enc)) {
		/* UTF-8 is encoded by hand, without going through the conversion machinery */
		if (cp >= 0xd800 && cp <= 0xdfff) {
			return NULL;
		}

		if (cp < 0x80) {
			return ZSTR_CHAR(cp);
		}

		zend_string *utf8;
		char *out;

		/* Emit the lead byte and every continuation byte except the last one */
		if (cp < 0x800) {
			utf8 = zend_string_alloc(2, 0);
			out = ZSTR_VAL(utf8);
			*out++ = 0xc0 | (cp >> 6);
		} else if (cp < 0x10000) {
			utf8 = zend_string_alloc(3, 0);
			out = ZSTR_VAL(utf8);
			*out++ = 0xe0 | (cp >> 12);
			*out++ = 0x80 | ((cp >> 6) & 0x3f);
		} else {
			utf8 = zend_string_alloc(4, 0);
			out = ZSTR_VAL(utf8);
			*out++ = 0xf0 | (cp >> 18);
			*out++ = 0x80 | ((cp >> 12) & 0x3f);
			*out++ = 0x80 | ((cp >> 6) & 0x3f);
		}

		/* The final continuation byte and the terminator are common to all lengths */
		*out++ = 0x80 | (cp & 0x3f);
		*out = 0;

		return utf8;
	}

	/* Any other encoding: serialize the code point as big-endian UCS-4 and convert from that */
	char ucs4be[4];
	ucs4be[3] = cp & 0xff;
	ucs4be[2] = (cp >> 8) & 0xff;
	ucs4be[1] = (cp >> 16) & 0xff;
	ucs4be[0] = (cp >> 24) & 0xff;

	/* Zero the illegal-character counter for the duration of the conversion so that a non-zero
	 * value afterwards can only have been caused by this code point; restore it before returning */
	long orig_illegalchars = MBSTRG(illegalchars);
	MBSTRG(illegalchars) = 0;

	zend_string *converted = php_mb_convert_encoding_ex(ucs4be, 4, enc, &mbfl_encoding_ucs4be);
	if (MBSTRG(illegalchars) != 0) {
		zend_string_release(converted);
		converted = NULL;
	}

	MBSTRG(illegalchars) = orig_illegalchars;
	return converted;
}
5762
5763 /* {{{ */
PHP_FUNCTION(mb_chr)5764 PHP_FUNCTION(mb_chr)
5765 {
5766 zend_long cp;
5767 zend_string *enc = NULL;
5768
5769 ZEND_PARSE_PARAMETERS_START(1, 2)
5770 Z_PARAM_LONG(cp)
5771 Z_PARAM_OPTIONAL
5772 Z_PARAM_STR_OR_NULL(enc)
5773 ZEND_PARSE_PARAMETERS_END();
5774
5775 zend_string* ret = php_mb_chr(cp, enc, 2);
5776 if (ret == NULL) {
5777 RETURN_FALSE;
5778 }
5779
5780 RETURN_STR(ret);
5781 }
5782 /* }}} */
5783
PHP_FUNCTION(mb_str_pad)5784 PHP_FUNCTION(mb_str_pad)
5785 {
5786 zend_string *input, *encoding_str = NULL, *pad = ZSTR_CHAR(' ');
5787 zend_long pad_to_length;
5788 zend_long pad_type_val = PHP_STR_PAD_RIGHT;
5789
5790 ZEND_PARSE_PARAMETERS_START(2, 5)
5791 Z_PARAM_STR(input)
5792 Z_PARAM_LONG(pad_to_length)
5793 Z_PARAM_OPTIONAL
5794 Z_PARAM_STR(pad)
5795 Z_PARAM_LONG(pad_type_val)
5796 Z_PARAM_STR_OR_NULL(encoding_str)
5797 ZEND_PARSE_PARAMETERS_END();
5798
5799 const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
5800 if (!encoding) {
5801 RETURN_THROWS();
5802 }
5803
5804 size_t input_length = mb_get_strlen(input, encoding);
5805
5806 /* If resulting string turns out to be shorter than input string,
5807 we simply copy the input and return. */
5808 if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
5809 RETURN_STR_COPY(input);
5810 }
5811
5812 if (ZSTR_LEN(pad) == 0) {
5813 zend_argument_must_not_be_empty_error(3);
5814 RETURN_THROWS();
5815 }
5816
5817 if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
5818 zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
5819 RETURN_THROWS();
5820 }
5821
5822 size_t pad_length = mb_get_strlen(pad, encoding);
5823
5824 size_t num_mb_pad_chars = pad_to_length - input_length;
5825
5826 /* We need to figure out the left/right padding lengths. */
5827 size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
5828 switch (pad_type_val) {
5829 case PHP_STR_PAD_RIGHT:
5830 right_pad = num_mb_pad_chars;
5831 break;
5832
5833 case PHP_STR_PAD_LEFT:
5834 left_pad = num_mb_pad_chars;
5835 break;
5836
5837 case PHP_STR_PAD_BOTH:
5838 left_pad = num_mb_pad_chars / 2;
5839 right_pad = num_mb_pad_chars - left_pad;
5840 break;
5841 }
5842
5843 /* How many full block copies need to happen, and how many characters are then left over? */
5844 size_t full_left_pad_copies = left_pad / pad_length;
5845 size_t full_right_pad_copies = right_pad / pad_length;
5846 size_t remaining_left_pad_chars = left_pad % pad_length;
5847 size_t remaining_right_pad_chars = right_pad % pad_length;
5848
5849 if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
5850 goto overflow_no_release;
5851 }
5852
5853 /* Compute the number of bytes required for the padding */
5854 size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
5855 size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);
5856
5857 /* No special fast-path handling necessary for zero-length pads because these functions will not
5858 * allocate memory in case a zero-length pad is required. */
5859 zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
5860 zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);
5861
5862 if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
5863 || full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
5864 goto overflow;
5865 }
5866
5867 size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
5868 size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);
5869
5870 if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
5871 || ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
5872 goto overflow;
5873 }
5874
5875 zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
5876 char *buffer = ZSTR_VAL(result);
5877
5878 /* First we pad the left. */
5879 for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5880 memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5881 }
5882 memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
5883 buffer += ZSTR_LEN(remaining_left_pad_str);
5884
5885 /* Then we copy the input string. */
5886 memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
5887 buffer += ZSTR_LEN(input);
5888
5889 /* Finally, we pad on the right. */
5890 for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5891 memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5892 }
5893 memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));
5894
5895 ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';
5896
5897 zend_string_release_ex(remaining_left_pad_str, false);
5898 zend_string_release_ex(remaining_right_pad_str, false);
5899
5900 RETURN_NEW_STR(result);
5901
5902 overflow:
5903 zend_string_release_ex(remaining_left_pad_str, false);
5904 zend_string_release_ex(remaining_right_pad_str, false);
5905 overflow_no_release:
5906 zend_throw_error(NULL, "String size overflow");
5907 RETURN_THROWS();
5908 }
5909
5910 /* {{{ */
PHP_FUNCTION(mb_scrub)5911 PHP_FUNCTION(mb_scrub)
5912 {
5913 zend_string *str, *enc_name = NULL;
5914
5915 ZEND_PARSE_PARAMETERS_START(1, 2)
5916 Z_PARAM_STR(str)
5917 Z_PARAM_OPTIONAL
5918 Z_PARAM_STR_OR_NULL(enc_name)
5919 ZEND_PARSE_PARAMETERS_END();
5920
5921 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
5922 if (!enc) {
5923 RETURN_THROWS();
5924 }
5925
5926 if (enc == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
5927 /* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
5928 RETURN_STR_COPY(str);
5929 }
5930
5931 RETURN_STR(php_mb_convert_encoding_ex(ZSTR_VAL(str), ZSTR_LEN(str), enc, enc));
5932 }
5933 /* }}} */
5934
5935 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)5936 static void php_mb_populate_current_detect_order_list(void)
5937 {
5938 const mbfl_encoding **entry = 0;
5939 size_t nentries;
5940
5941 if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
5942 nentries = MBSTRG(detect_order_list_size);
5943 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5944 memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
5945 } else {
5946 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
5947 size_t i;
5948 nentries = MBSTRG(default_detect_order_list_size);
5949 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5950 for (i = 0; i < nentries; i++) {
5951 entry[i] = mbfl_no2encoding(src[i]);
5952 }
5953 }
5954 MBSTRG(current_detect_order_list) = entry;
5955 MBSTRG(current_detect_order_list_size) = nentries;
5956 }
5957 /* }}} */
5958
5959 /* {{{ static int php_mb_encoding_translation() */
php_mb_encoding_translation(void)5960 static int php_mb_encoding_translation(void)
5961 {
5962 return MBSTRG(encoding_translation);
5963 }
5964 /* }}} */
5965
php_mb_mbchar_bytes(const char * s,const mbfl_encoding * enc)5966 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
5967 {
5968 if (enc) {
5969 if (enc->mblen_table) {
5970 if (s) {
5971 return enc->mblen_table[*(unsigned char *)s];
5972 }
5973 } else if (enc->flag & MBFL_ENCTYPE_WCS2) {
5974 return 2;
5975 } else if (enc->flag & MBFL_ENCTYPE_WCS4) {
5976 return 4;
5977 }
5978 }
5979 return 1;
5980 }
5981
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)5982 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
5983 {
5984 const char *p = s;
5985 char *last=NULL;
5986
5987 if (nbytes == (size_t)-1) {
5988 size_t nb = 0;
5989
5990 while (*p != '\0') {
5991 if (nb == 0) {
5992 if ((unsigned char)*p == (unsigned char)c) {
5993 last = (char *)p;
5994 }
5995 nb = php_mb_mbchar_bytes(p, enc);
5996 if (nb == 0) {
5997 return NULL; /* something is going wrong! */
5998 }
5999 }
6000 --nb;
6001 ++p;
6002 }
6003 } else {
6004 size_t bcnt = nbytes;
6005 size_t nbytes_char;
6006 while (bcnt > 0) {
6007 if ((unsigned char)*p == (unsigned char)c) {
6008 last = (char *)p;
6009 }
6010 nbytes_char = php_mb_mbchar_bytes(p, enc);
6011 if (bcnt < nbytes_char) {
6012 return NULL;
6013 }
6014 p += nbytes_char;
6015 bcnt -= nbytes_char;
6016 }
6017 }
6018 return last;
6019 }
6020
php_mb_stripos(bool mode,zend_string * haystack,zend_string * needle,zend_long offset,const mbfl_encoding * enc)6021 MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc)
6022 {
6023 /* We're using simple case-folding here, because we'd have to deal with remapping of
6024 * offsets otherwise. */
6025 zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
6026 zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
6027
6028 size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
6029
6030 zend_string_free(haystack_conv);
6031 zend_string_free(needle_conv);
6032
6033 return n;
6034 }
6035
php_mb_gpc_get_detect_order(const zend_encoding *** list,size_t * list_size)6036 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
6037 {
6038 *list = (const zend_encoding **)MBSTRG(http_input_list);
6039 *list_size = MBSTRG(http_input_list_size);
6040 }
6041 /* }}} */
6042
php_mb_gpc_set_input_encoding(const zend_encoding * encoding)6043 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
6044 {
6045 MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
6046 }
6047 /* }}} */
6048
/* Base64 alphabet, indexed by 6-bit value (0..63).
 * The string literal's implicit terminator supplies the 65th element (0x00), so the array
 * holds 65 bytes in total. */
static const unsigned char base64_table[] =
	"ABCDEFGHIJKLM"
	"NOPQRSTUVWXYZ"
	"abcdefghijklm"
	"nopqrstuvwxyz"
	"0123456789+/";
6061
/* Compute how many bytes the current contents of `tmpbuf` will occupy once transfer-encoded.
 *
 * base64 == true:  4 output bytes for every (started) group of 3 input bytes.
 * base64 == false: 3 output bytes for each byte which is above 0x7F, is '=', or is flagged in
 *                  mime_char_needs_qencode[]; 1 output byte for every other byte. */
static size_t transfer_encoded_size(mb_convert_buf *tmpbuf, bool base64)
{
	if (base64) {
		return ((mb_convert_buf_len(tmpbuf) + 2) / 3) * 4;
	}

	size_t total = 0;
	for (unsigned char *cur = (unsigned char*)ZSTR_VAL(tmpbuf->str); cur < tmpbuf->out; cur++) {
		unsigned char byte = *cur;
		if (byte > 0x7F || byte == '=' || mime_char_needs_qencode[byte]) {
			total += 3; /* "=XX" escape */
		} else {
			total += 1; /* literal */
		}
	}
	return total;
}
6076
/* Transfer-encode the bytes held in `tmpbuf` and append the encoded form to `outbuf`.
 *
 * base64 == true:  the bytes are Base64-encoded using base64_table, with '=' padding when the
 *                  byte count is not a multiple of 3.
 * base64 == false: bytes above 0x7F, '=' and bytes flagged in mime_char_needs_qencode[] are
 *                  written as '=' followed by two uppercase hex digits; other bytes are copied.
 *
 * Afterwards mb_convert_buf_reset(tmpbuf, 0) is called on `tmpbuf`. */
static void transfer_encode_mime_bytes(mb_convert_buf *tmpbuf, mb_convert_buf *outbuf, bool base64)
{
	unsigned char *out, *limit;
	/* Presumably loads the output cursor/limit of `outbuf` into the locals (macro defined elsewhere) */
	MB_CONVERT_BUF_LOAD(outbuf, out, limit);
	/* [p, e) is the range of bytes to encode */
	unsigned char *p = (unsigned char*)ZSTR_VAL(tmpbuf->str), *e = tmpbuf->out;

	if (base64) {
		/* Request room for 4 output bytes per (started) group of 3 input bytes */
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, ((e - p) + 2) / 3 * 4);
		/* Full groups: 3 input bytes -> 24 bits -> four 6-bit table indexes */
		while ((e - p) >= 3) {
			unsigned char a = *p++;
			unsigned char b = *p++;
			unsigned char c = *p++;
			uint32_t bits = (a << 16) | (b << 8) | c;
			out = mb_convert_buf_add4(out,
				base64_table[(bits >> 18) & 0x3F],
				base64_table[(bits >> 12) & 0x3F],
				base64_table[(bits >> 6) & 0x3F],
				base64_table[bits & 0x3F]);
		}
		/* Trailing partial group: 1 or 2 bytes left over */
		if (p != e) {
			if ((e - p) == 1) {
				/* 8 bits -> 2 symbols (low 2 bits shifted into the top of the second) + "==" */
				uint32_t bits = *p++;
				out = mb_convert_buf_add4(out, base64_table[(bits >> 2) & 0x3F], base64_table[(bits & 0x3) << 4], '=', '=');
			} else {
				/* 16 bits -> 3 symbols (low 4 bits shifted into the top of the third) + "=" */
				unsigned char a = *p++;
				unsigned char b = *p++;
				uint32_t bits = (a << 8) | b;
				out = mb_convert_buf_add4(out, base64_table[(bits >> 10) & 0x3F], base64_table[(bits >> 4) & 0x3F], base64_table[(bits & 0xF) << 2], '=');
			}
		}
	} else {
		/* Worst case: every input byte becomes a 3-byte "=XX" escape */
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, (e - p) * 3);
		while (p < e) {
			unsigned char c = *p++;
			if (c > 0x7F || c == '=' || mime_char_needs_qencode[c]) {
				out = mb_convert_buf_add3(out, '=', "0123456789ABCDEF"[(c >> 4) & 0xF], "0123456789ABCDEF"[c & 0xF]);
			} else {
				out = mb_convert_buf_add(out, c);
			}
		}
	}

	/* All pending bytes have been consumed */
	mb_convert_buf_reset(tmpbuf, 0);
	/* Presumably writes the updated cursor/limit back into `outbuf` (macro defined elsewhere) */
	MB_CONVERT_BUF_STORE(outbuf, out, limit);
}
6122
6123 #define MBSTRING_HEADER_ENC_WCHAR_BUFSIZE 90
6124
mb_mime_header_encode(zend_string * input,const mbfl_encoding * incode,const mbfl_encoding * outcode,bool base64,char * linefeed,size_t linefeed_len,zend_long indent)6125 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent)
6126 {
6127 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
6128 size_t in_len = ZSTR_LEN(input);
6129
6130 ZEND_ASSERT(outcode->mime_name != NULL);
6131 ZEND_ASSERT(outcode->mime_name[0] != '\0');
6132
6133 if (!in_len) {
6134 return zend_empty_string;
6135 }
6136
6137 if (indent < 0 || indent >= 74) {
6138 indent = 0;
6139 }
6140
6141 if (linefeed_len > 8) {
6142 linefeed_len = 8;
6143 }
6144 /* Maintain legacy behavior as regards embedded NUL (zero) bytes in linefeed string */
6145 for (size_t i = 0; i < linefeed_len; i++) {
6146 if (linefeed[i] == '\0') {
6147 linefeed_len = i;
6148 break;
6149 }
6150 }
6151
6152 unsigned int state = 0;
6153 /* wchar_buf should be big enough that when it is full, we definitely have enough
6154 * wchars to fill an entire line of output */
6155 uint32_t wchar_buf[MBSTRING_HEADER_ENC_WCHAR_BUFSIZE];
6156 uint32_t *p, *e;
6157 /* What part of wchar_buf is filled with still-unprocessed data which should not
6158 * be overwritten? */
6159 unsigned int offset = 0;
6160 size_t line_start = 0;
6161
6162 /* If the entire input string is ASCII with no spaces (except possibly leading
6163 * spaces), just pass it through unchanged */
6164 bool checking_leading_spaces = true;
6165 while (in_len) {
6166 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE, &state);
6167 p = wchar_buf;
6168 e = wchar_buf + out_len;
6169
6170 while (p < e) {
6171 uint32_t w = *p++;
6172 if (checking_leading_spaces) {
6173 if (w == ' ') {
6174 continue;
6175 } else {
6176 checking_leading_spaces = false;
6177 }
6178 }
6179 if (w < 0x21 || w > 0x7E || w == '=' || w == '?' || w == '_') {
6180 /* We cannot simply pass input string through unchanged; start again */
6181 in = (unsigned char*)ZSTR_VAL(input);
6182 in_len = ZSTR_LEN(input);
6183 goto no_passthrough;
6184 }
6185 }
6186 }
6187
6188 return zend_string_copy(input); /* This just increments refcount */
6189
6190 no_passthrough: ;
6191
6192 mb_convert_buf buf;
6193 mb_convert_buf_init(&buf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6194
6195 /* Encode some prefix of the input string as plain ASCII if possible
6196 * If we find it necessary to switch to Base64/QPrint encoding, we will
6197 * do so all the way to the end of the string */
6198 while (in_len) {
6199 /* Decode part of the input string, refill wchar_buf */
6200 ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
6201 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
6202 ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
6203 p = wchar_buf;
6204 e = wchar_buf + offset + out_len;
6205 /* ASCII output is broken into space-delimited 'words'
6206 * If we find a non-ASCII character in the middle of a word, we will
6207 * transfer-encode the entire word */
6208 uint32_t *word_start = p;
6209
6210 /* Don't consider adding line feed for spaces at the beginning of a word */
6211 while (p < e && *p == ' ' && (p - word_start) <= 74) {
6212 p++;
6213 }
6214
6215 while (p < e) {
6216 uint32_t w = *p++;
6217
6218 if (w < 0x20 || w > 0x7E || w == '?' || w == '=' || w == '_' || (w == ' ' && (p - word_start) > 74)) {
6219 /* Non-ASCII character (or line too long); switch to Base64/QPrint encoding
6220 * If we are already too far along on a line to include Base64/QPrint encoded data
6221 * on the same line (without overrunning max line length), then add a line feed
6222 * right now */
6223 feed_and_mime_encode:
6224 if (mb_convert_buf_len(&buf) - line_start + indent + strlen(outcode->mime_name) > 55) {
6225 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6226 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6227 buf.out = mb_convert_buf_add(buf.out, ' ');
6228 indent = 0;
6229 line_start = mb_convert_buf_len(&buf);
6230 } else if (mb_convert_buf_len(&buf) > 0) {
6231 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 1);
6232 buf.out = mb_convert_buf_add(buf.out, ' ');
6233 }
6234 p = word_start; /* Back up to where MIME encoding of input chars should start */
6235 goto mime_encoding_needed;
6236 } else if (w == ' ') {
6237 /* When we see a space, check whether we should insert a line break */
6238 if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 75) {
6239 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6240 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6241 buf.out = mb_convert_buf_add(buf.out, ' ');
6242 indent = 0;
6243 line_start = mb_convert_buf_len(&buf);
6244 } else if (mb_convert_buf_len(&buf) > 0) {
6245 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6246 buf.out = mb_convert_buf_add(buf.out, ' ');
6247 }
6248 /* Output one (space-delimited) word as plain ASCII */
6249 while (word_start < p-1) {
6250 buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6251 }
6252 word_start++;
6253 while (p < e && *p == ' ') {
6254 p++;
6255 }
6256 }
6257 }
6258
6259 if (in_len) {
6260 /* Copy chars which are part of an incomplete 'word' to the beginning
6261 * of wchar_buf and reprocess them on the next iteration.
6262 * But first make sure that the incomplete 'word' isn't so big that
6263 * there will be no space to add any more decoded wchars in the buffer
6264 * (which could lead to an infinite loop) */
6265 if ((word_start - wchar_buf) < MBSTRING_MIN_WCHAR_BUFSIZE) {
6266 goto feed_and_mime_encode;
6267 }
6268 offset = e - word_start;
6269 if (offset) {
6270 memmove(wchar_buf, word_start, offset * sizeof(uint32_t));
6271 }
6272 } else {
6273 /* We have reached the end of the input string while still in 'ASCII mode';
6274 * process any trailing ASCII chars which were not followed by a space */
6275 if (word_start < e && mb_convert_buf_len(&buf) > 0) {
6276 /* The whole input string was not just one big ASCII 'word' with no spaces
6277 * consider adding a line feed if necessary to prevent output lines from
6278 * being too long */
6279 if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 74) {
6280 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6281 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6282 buf.out = mb_convert_buf_add(buf.out, ' ');
6283 } else {
6284 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6285 buf.out = mb_convert_buf_add(buf.out, ' ');
6286 }
6287 }
6288 while (word_start < e) {
6289 buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6290 }
6291 }
6292 }
6293
6294 /* Ensure output string is marked as valid UTF-8 (ASCII strings are always 'valid UTF-8') */
6295 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6296
6297 mime_encoding_needed: ;
6298
6299 /* We will generate the output line by line, first converting wchars to bytes
6300 * in the requested output encoding, then transfer-encoding those bytes as
6301 * Base64 or QPrint
6302 * 'tmpbuf' will receive the bytes which need to be transfer-encoded before
6303 * sending them to 'buf' */
6304 mb_convert_buf tmpbuf;
6305 mb_convert_buf_init(&tmpbuf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6306
6307 /* Do we need to refill wchar_buf to make sure we don't run out of wchars
6308 * in the middle of a line? */
6309 offset = e - p;
6310 if (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset < MBSTRING_MIN_WCHAR_BUFSIZE) {
6311 goto start_new_line;
6312 }
6313 memmove(wchar_buf, p, offset * sizeof(uint32_t));
6314
6315 while(true) {
6316 refill_wchar_buf: ;
6317 ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
6318 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
6319 ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
6320 p = wchar_buf;
6321 e = wchar_buf + offset + out_len;
6322
6323 start_new_line: ;
6324 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, strlen(outcode->mime_name) + 5);
6325 buf.out = mb_convert_buf_add2(buf.out, '=', '?');
6326 buf.out = mb_convert_buf_appends(buf.out, outcode->mime_name);
6327 buf.out = mb_convert_buf_add3(buf.out, '?', base64 ? 'B' : 'Q', '?');
6328
6329 /* How many wchars should we try converting to Base64/QPrint-encoded bytes?
6330 * We do something like a 'binary search' to find the greatest number which
6331 * can be included on this line without exceeding max line length */
6332 unsigned int n = 12;
6333 size_t space_available = 73 - indent - (mb_convert_buf_len(&buf) - line_start);
6334
6335 while (true) {
6336 ZEND_ASSERT(p < e);
6337
6338 /* Remember where we were in process of generating output, so we can back
6339 * up if necessary */
6340 size_t tmppos = mb_convert_buf_len(&tmpbuf);
6341 unsigned int tmpstate = tmpbuf.state;
6342
6343 /* Try encoding 'n' wchars in output text encoding and sending output
6344 * bytes to 'tmpbuf'. Hopefully this is not too many to fit on the
6345 * current line. */
6346 n = MIN(n, e - p);
6347 outcode->from_wchar(p, n, &tmpbuf, false);
6348
6349 /* For some output text encodings, there may be a few ending bytes
6350 * which need to be emitted to output before we break a line.
6351 * Again, remember where we were so we can back up */
6352 size_t tmppos2 = mb_convert_buf_len(&tmpbuf);
6353 unsigned int tmpstate2 = tmpbuf.state;
6354 outcode->from_wchar(NULL, 0, &tmpbuf, true);
6355
6356 if (transfer_encoded_size(&tmpbuf, base64) <= space_available || (n == 1 && tmppos == 0)) {
6357 /* If we convert 'n' more wchars on the current line, it will not
6358 * overflow the maximum line length */
6359 p += n;
6360
6361 if (p == e) {
6362 /* We are done; we shouldn't reach here if there is more remaining
6363 * of the input string which needs to be processed */
6364 ZEND_ASSERT(!in_len);
6365 transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6366 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 2);
6367 buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6368 mb_convert_buf_free(&tmpbuf);
6369 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6370 } else {
6371 /* It's possible that more chars might fit on the current line,
6372 * so back up to where we were before emitting any ending bytes */
6373 mb_convert_buf_reset(&tmpbuf, tmppos2);
6374 tmpbuf.state = tmpstate2;
6375 }
6376 } else {
6377 /* Converting 'n' more wchars on this line would be too much.
6378 * Back up to where we were before we tried that. */
6379 mb_convert_buf_reset(&tmpbuf, tmppos);
6380 tmpbuf.state = tmpstate;
6381
6382 if (n == 1) {
6383 /* We have found the exact number of chars which will fit on the
6384 * current line. Finish up and move to a new line. */
6385 outcode->from_wchar(NULL, 0, &tmpbuf, true);
6386 transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6387 tmpbuf.state = 0;
6388
6389 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 3 + linefeed_len);
6390 buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6391
6392 indent = 0; /* Indent argument must only affect the first line */
6393
6394 if (in_len || p < e) {
6395 /* We still have more input to process */
6396 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6397 buf.out = mb_convert_buf_add(buf.out, ' ');
6398 line_start = mb_convert_buf_len(&buf);
6399 offset = e - p;
6400 if (in_len && (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset >= MBSTRING_MIN_WCHAR_BUFSIZE)) {
6401 /* Copy any remaining wchars to beginning of buffer and refill
6402 * the rest of the buffer */
6403 memmove(wchar_buf, p, offset * sizeof(uint32_t));
6404 goto refill_wchar_buf;
6405 }
6406 goto start_new_line;
6407 } else {
6408 /* We are done! */
6409 mb_convert_buf_free(&tmpbuf);
6410 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6411 }
6412 } else {
6413 /* Try a smaller number of wchars */
6414 n = MAX(n >> 1, 1);
6415 }
6416 }
6417 }
6418 }
6419 }
6420
PHP_FUNCTION(mb_encode_mimeheader)6421 PHP_FUNCTION(mb_encode_mimeheader)
6422 {
6423 const mbfl_encoding *charset = &mbfl_encoding_pass;
6424 zend_string *str, *charset_name = NULL, *transenc_name = NULL;
6425 char *linefeed = "\r\n";
6426 size_t linefeed_len = 2;
6427 zend_long indent = 0;
6428 bool base64 = true;
6429
6430 ZEND_PARSE_PARAMETERS_START(1, 5)
6431 Z_PARAM_STR(str)
6432 Z_PARAM_OPTIONAL
6433 Z_PARAM_STR(charset_name)
6434 Z_PARAM_STR(transenc_name)
6435 Z_PARAM_STRING(linefeed, linefeed_len)
6436 Z_PARAM_LONG(indent)
6437 ZEND_PARSE_PARAMETERS_END();
6438
6439 if (charset_name != NULL) {
6440 charset = php_mb_get_encoding(charset_name, 2);
6441 if (!charset) {
6442 RETURN_THROWS();
6443 } else if (charset->mime_name == NULL || charset->mime_name[0] == '\0' || charset == &mbfl_encoding_qprint) {
6444 zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
6445 RETURN_THROWS();
6446 }
6447 } else {
6448 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
6449 if (lang != NULL) {
6450 charset = mbfl_no2encoding(lang->mail_charset);
6451 const mbfl_encoding *transenc = mbfl_no2encoding(lang->mail_header_encoding);
6452 char t = transenc->name[0];
6453 if (t == 'Q' || t == 'q') {
6454 base64 = false;
6455 }
6456 }
6457 }
6458
6459 if (transenc_name != NULL && ZSTR_LEN(transenc_name) > 0) {
6460 char t = ZSTR_VAL(transenc_name)[0];
6461 if (t == 'Q' || t == 'q') {
6462 base64 = false;
6463 }
6464 }
6465
6466 RETURN_STR(mb_mime_header_encode(str, MBSTRG(current_internal_encoding), charset, base64, linefeed, linefeed_len, indent));
6467 }
6468
/* Translate one character of the Base64 alphabet into its 6-bit value.
 *
 * Returns 0-25 for 'A'-'Z', 26-51 for 'a'-'z', 52-61 for '0'-'9',
 * 62 for '+' and 63 for '/'.
 * Returns -1 for every other byte (including the '=' padding character). */
static int8_t decode_base64(unsigned char c)
{
	if (c >= 'A' && c <= 'Z') {
		return c - 'A';
	}
	if (c >= 'a' && c <= 'z') {
		return c - 'a' + 26;
	}
	if (c >= '0' && c <= '9') {
		return c - '0' + 52;
	}

	switch (c) {
	case '+':
		return 62;
	case '/':
		return 63;
	default:
		return -1;
	}
}
6484
/* Lookup table for the hexadecimal digits of a Quoted-Printable "=XX" escape
 * Indexed by byte value; holds 0-15 for '0'-'9', 'A'-'F' and 'a'-'f',
 * and -1 for every byte which is not a hexadecimal digit.
 * The table is never modified, so it is declared const. */
static const int8_t qprint_map[] = {
	/* 0x00 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x30 */  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1, /* '0'-'9' */
	/* 0x40 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 'A'-'F' */
	/* 0x50 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x60 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 'a'-'f' */
	/* 0x70 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x80 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x90 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xA0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xB0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xC0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xD0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xE0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xF0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
6503
6504 /* Decode MIME encoded word as defined in RFC 2047 */
mime_header_decode_encoded_word(unsigned char * p,unsigned char * e,const mbfl_encoding * outcode,mb_convert_buf * outbuf,unsigned int * state)6505 static unsigned char* mime_header_decode_encoded_word(unsigned char *p, unsigned char *e, const mbfl_encoding *outcode, mb_convert_buf *outbuf, unsigned int *state)
6506 {
6507 if ((e - p) < 6) {
6508 return NULL;
6509 }
6510
6511 ZEND_ASSERT(p[0] == '=');
6512 ZEND_ASSERT(p[1] == '?');
6513 p += 2;
6514
6515 unsigned char *charset = p;
6516 unsigned char *charset_end = memchr(charset, '?', e - charset);
6517 if (charset_end == NULL) {
6518 return NULL;
6519 }
6520
6521 unsigned char *encoding = charset_end + 1;
6522 p = encoding + 1;
6523 if (p >= e || *p++ != '?') {
6524 return NULL;
6525 }
6526
6527 char *charset_name = estrndup((const char*)charset, charset_end - charset);
6528 const mbfl_encoding *incode = mbfl_name2encoding(charset_name);
6529 efree(charset_name);
6530 if (incode == NULL) {
6531 return NULL;
6532 }
6533
6534 unsigned char *end_marker = (unsigned char*)zend_memnstr((const char*)p, "?=", 2, (const char*)e);
6535 if (end_marker) {
6536 e = end_marker;
6537 } else if (p < e && *(e-1) == '?') {
6538 /* If encoded word is not properly terminated, but last byte is '?',
6539 * take that as a terminator (legacy behavior) */
6540 e--;
6541 }
6542
6543 unsigned char *buf = emalloc(e - p), *bufp = buf;
6544 if (*encoding == 'Q' || *encoding == 'q') {
6545 /* Fill `buf` with bytes from decoding QPrint */
6546 while (p < e) {
6547 unsigned char c = *p++;
6548 if (c == '_') {
6549 *bufp++ = ' ';
6550 continue;
6551 } else if (c == '=' && (e - p) >= 2) {
6552 unsigned char c2 = *p++;
6553 unsigned char c3 = *p++;
6554 if (qprint_map[c2] >= 0 && qprint_map[c3] >= 0) {
6555 *bufp++ = (qprint_map[c2] << 4) | (qprint_map[c3] & 0xF);
6556 continue;
6557 } else if (c2 == '\r') {
6558 if (c3 != '\n') {
6559 p--;
6560 }
6561 continue;
6562 } else if (c2 == '\n') {
6563 p--;
6564 continue;
6565 }
6566 }
6567 *bufp++ = c;
6568 }
6569 } else if (*encoding == 'B' || *encoding == 'b') {
6570 /* Fill `buf` with bytes from decoding Base64 */
6571 unsigned int bits = 0, cache = 0;
6572 while (p < e) {
6573 unsigned char c = *p++;
6574 if (c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '=') {
6575 continue;
6576 }
6577 int8_t decoded = decode_base64(c);
6578 if (decoded == -1) {
6579 *bufp++ = '?';
6580 continue;
6581 }
6582 bits += 6;
6583 cache = (cache << 6) | (decoded & 0x3F);
6584 if (bits == 24) {
6585 *bufp++ = (cache >> 16) & 0xFF;
6586 *bufp++ = (cache >> 8) & 0xFF;
6587 *bufp++ = cache & 0xFF;
6588 bits = cache = 0;
6589 }
6590 }
6591 if (bits == 18) {
6592 *bufp++ = (cache >> 10) & 0xFF;
6593 *bufp++ = (cache >> 2) & 0xFF;
6594 } else if (bits == 12) {
6595 *bufp++ = (cache >> 4) & 0xFF;
6596 }
6597 } else {
6598 efree(buf);
6599 return NULL;
6600 }
6601
6602 size_t in_len = bufp - buf;
6603 uint32_t wchar_buf[128];
6604
6605 bufp = buf;
6606 while (in_len) {
6607 size_t out_len = incode->to_wchar(&bufp, &in_len, wchar_buf, 128, state);
6608 ZEND_ASSERT(out_len <= 128);
6609 outcode->from_wchar(wchar_buf, out_len, outbuf, false);
6610 }
6611
6612 efree(buf);
6613 return e + 2;
6614 }
6615
/* Decode a MIME header value which may contain RFC 2047 encoded words
 *
 * input    Header text; encoded words are decoded, everything else is
 *          treated as ASCII
 * outcode  Text encoding of the returned string
 *
 * A run of whitespace which starts with CR or LF is collapsed into a single
 * space, except that it is dropped when it separates two encoded words or
 * when it comes at the very end of the input.
 * Returns a new string built from the conversion buffer. */
static zend_string* mb_mime_header_decode(zend_string *input, const mbfl_encoding *outcode)
{
	unsigned char *p = (unsigned char*)ZSTR_VAL(input), *e = p + ZSTR_LEN(input);
	unsigned int state = 0;
	bool space_pending = false;

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, ZSTR_LEN(input), '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	while (p < e) {
		unsigned char c = *p;

		/* Check the number of remaining bytes before looking at the byte after `c` */
		if (c == '=' && (e - p) >= 6 && *(p + 1) == '?') {
			/* Does this look like a MIME encoded word? If so, try to decode it as one */
			unsigned char *incode_end = memchr(p + 2, '?', e - p - 2);
			if (incode_end && (e - incode_end) >= 3) {
				unsigned char *temp = mime_header_decode_encoded_word(p, e, outcode, &buf, &state);
				if (temp) {
					p = temp;
					/* Decoding of MIME encoded word was successful;
					 * Try to collapse a run of whitespace */
					if (p < e && (*p == '\n' || *p == '\r')) {
						do {
							p++;
						} while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
						/* We will only actually output a space if this is not immediately followed
						 * by another valid encoded word */
						space_pending = true;
					}
					continue;
				}
			}
		}

		/* What follows is not a valid encoded word, so the whitespace which
		 * was skipped after the previous encoded word does become a space */
		if (space_pending) {
			uint32_t space = ' ';
			outcode->from_wchar(&space, 1, &buf, false);
			space_pending = false;
		}

		/* Consume a run of plain ASCII characters */
		if (c != '\n' && c != '\r') {
			/* The run always includes the byte at `p`, even if it is an '='
			 * which did not start a valid encoded word; it then extends up
			 * to the next '=', CR or LF */
			unsigned char *end = p + 1;
			while (end < e && (*end != '=' && *end != '\n' && *end != '\r')) {
				end++;
			}
			uint32_t wchar_buf[128];
			size_t in_len = end - p;
			/* `p` and `in_len` are passed by address; this loop relies on
			 * to_wchar consuming input through them */
			while (in_len) {
				size_t out_len = mbfl_encoding_ascii.to_wchar(&p, &in_len, wchar_buf, 128, &state);
				ZEND_ASSERT(out_len <= 128);
				outcode->from_wchar(wchar_buf, out_len, &buf, false);
			}
		}
		/* Collapse a run of whitespace into a single space */
		if (p < e && (*p == '\n' || *p == '\r')) {
			do {
				p++;
			} while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
			if (p < e) {
				/* Emulating legacy behavior of mb_decode_mimeheader here;
				 * a run of whitespace is not converted to a space at the very
				 * end of the input string */
				uint32_t space = ' ';
				outcode->from_wchar(&space, 1, &buf, false);
			}
		}
	}

	/* Final call with no input and the 'end' flag set, so the output encoding
	 * can emit any closing bytes it needs */
	outcode->from_wchar(NULL, 0, &buf, true);

	return mb_convert_buf_result(&buf, outcode);
}
6689
PHP_FUNCTION(mb_decode_mimeheader)6690 PHP_FUNCTION(mb_decode_mimeheader)
6691 {
6692 zend_string *str;
6693
6694 ZEND_PARSE_PARAMETERS_START(1, 1)
6695 Z_PARAM_STR(str)
6696 ZEND_PARSE_PARAMETERS_END();
6697
6698 RETURN_STR(mb_mime_header_decode(str, MBSTRG(current_internal_encoding)));
6699 }
6700