1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
14 | Rui Hirokawa <hirokawa@php.net> |
15 | Hironori Sato <satoh@jpnnet.com> |
16 | Shigeru Kanemoto <sgk@happysize.co.jp> |
17 +----------------------------------------------------------------------+
18 */
19
20 /* {{{ includes */
21 #include <limits.h>
22
23 #include "libmbfl/config.h"
24 #include "php.h"
25 #include "php_ini.h"
26 #include "php_variables.h"
27 #include "mbstring.h"
28 #include "ext/standard/php_string.h"
29 #include "ext/standard/php_mail.h"
30 #include "ext/standard/exec.h"
31 #include "main/php_output.h"
32 #include "ext/standard/info.h"
33 #include "ext/pcre/php_pcre.h"
34
35 #include "libmbfl/mbfl/mbfilter_8bit.h"
36 #include "libmbfl/mbfl/mbfilter_pass.h"
37 #include "libmbfl/mbfl/eaw_table.h"
38 #include "libmbfl/filters/mbfilter_base64.h"
39 #include "libmbfl/filters/mbfilter_cjk.h"
40 #include "libmbfl/filters/mbfilter_qprint.h"
41 #include "libmbfl/filters/mbfilter_htmlent.h"
42 #include "libmbfl/filters/mbfilter_uuencode.h"
43 #include "libmbfl/filters/mbfilter_ucs4.h"
44 #include "libmbfl/filters/mbfilter_utf16.h"
45 #include "libmbfl/filters/mbfilter_singlebyte.h"
46 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
47 #include "libmbfl/filters/unicode_prop.h"
48
49 #include "php_globals.h"
50 #include "rfc1867.h"
51 #include "php_content_types.h"
52 #include "SAPI.h"
53 #include "php_unicode.h"
54 #include "TSRM.h"
55
56 #include "mb_gpc.h"
57
58 #ifdef HAVE_MBREGEX
59 # include "php_mbregex.h"
60 #endif
61
62 #include "zend_smart_str.h"
63 #include "zend_multibyte.h"
64 #include "mbstring_arginfo.h"
65
66 #include "rare_cp_bitvec.h"
67
68 #ifdef __SSE2__
69 #include <emmintrin.h>
70 #endif
71
72 #ifdef __SSE3__
73 #include <immintrin.h>
74 #include <pmmintrin.h>
75 #endif
76
77 /* }}} */
78
79 /* {{{ prototypes */
80 ZEND_DECLARE_MODULE_GLOBALS(mbstring)
81
82 static PHP_GINIT_FUNCTION(mbstring);
83 static PHP_GSHUTDOWN_FUNCTION(mbstring);
84
85 static void php_mb_populate_current_detect_order_list(void);
86
87 static int php_mb_encoding_translation(void);
88
89 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);
90
91 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
92
93 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
94
95 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
96
97 static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);
98
99 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);
100
101 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);
102
103 /* See mbfilter_cp5022x.c */
104 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
105 /* }}} */
106
107 /* {{{ php_mb_default_identify_list */
108 typedef struct _php_mb_nls_ident_list {
109 enum mbfl_no_language lang;
110 const enum mbfl_no_encoding *list;
111 size_t list_size;
112 } php_mb_nls_ident_list;
113
114 static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
115 mbfl_no_encoding_ascii,
116 mbfl_no_encoding_jis,
117 mbfl_no_encoding_utf8,
118 mbfl_no_encoding_euc_jp,
119 mbfl_no_encoding_sjis
120 };
121
122 static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
123 mbfl_no_encoding_ascii,
124 mbfl_no_encoding_utf8,
125 mbfl_no_encoding_euc_cn,
126 mbfl_no_encoding_cp936
127 };
128
129 static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
130 mbfl_no_encoding_ascii,
131 mbfl_no_encoding_utf8,
132 mbfl_no_encoding_euc_tw,
133 mbfl_no_encoding_big5
134 };
135
136 static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
137 mbfl_no_encoding_ascii,
138 mbfl_no_encoding_utf8,
139 mbfl_no_encoding_euc_kr,
140 mbfl_no_encoding_uhc
141 };
142
143 static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
144 mbfl_no_encoding_ascii,
145 mbfl_no_encoding_utf8,
146 mbfl_no_encoding_koi8r,
147 mbfl_no_encoding_cp1251,
148 mbfl_no_encoding_cp866
149 };
150
151 static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
152 mbfl_no_encoding_ascii,
153 mbfl_no_encoding_utf8,
154 mbfl_no_encoding_armscii8
155 };
156
157 static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
158 mbfl_no_encoding_ascii,
159 mbfl_no_encoding_utf8,
160 mbfl_no_encoding_cp1254,
161 mbfl_no_encoding_8859_9
162 };
163
164 static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
165 mbfl_no_encoding_ascii,
166 mbfl_no_encoding_utf8,
167 mbfl_no_encoding_koi8u
168 };
169
170 static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
171 mbfl_no_encoding_ascii,
172 mbfl_no_encoding_utf8
173 };
174
175
176 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
177 { mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
178 { mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
179 { mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
180 { mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
181 { mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
182 { mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
183 { mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
184 { mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
185 { mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
186 };
187
188 /* }}} */
189
190 /* {{{ mbstring_deps[] */
/* Module dependency table referenced by mbstring_module_entry:
 * declares "pcre" as a required module. */
static const zend_module_dep mbstring_deps[] = {
	ZEND_MOD_REQUIRED("pcre")
	ZEND_MOD_END
};
195 /* }}} */
196
197 /* {{{ zend_module_entry mbstring_module_entry */
/* Module descriptor for "mbstring". It wires together the dependency table
 * (mbstring_deps), the function table (ext_functions), the MINIT / MSHUTDOWN /
 * RINIT / RSHUTDOWN / MINFO handlers, the extension version, and the module
 * globals together with their GINIT / GSHUTDOWN hooks. */
zend_module_entry mbstring_module_entry = {
	STANDARD_MODULE_HEADER_EX,
	NULL,
	mbstring_deps,
	"mbstring",
	ext_functions,
	PHP_MINIT(mbstring),
	PHP_MSHUTDOWN(mbstring),
	PHP_RINIT(mbstring),
	PHP_RSHUTDOWN(mbstring),
	PHP_MINFO(mbstring),
	PHP_MBSTRING_VERSION,
	PHP_MODULE_GLOBALS(mbstring),
	PHP_GINIT(mbstring),
	PHP_GSHUTDOWN(mbstring),
	NULL,
	STANDARD_MODULE_PROPERTIES_EX
};
216 /* }}} */
217
218 /* {{{ static sapi_post_entry php_post_entries[] */
/* POST content-type handlers that use the standard form handler
 * (php_std_post_handler). OnUpdate_mbstring_encoding_translation registers
 * this table when mbstring.encoding_translation is off and unregisters it
 * when the setting is on. The table ends with an all-NULL sentinel entry. */
static const sapi_post_entry php_post_entries[] = {
	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data,	php_std_post_handler },
	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
	{ NULL, 0, NULL, NULL }
};
224 /* }}} */
225
/* Only when COMPILE_DL_MBSTRING is defined: emit ZEND_GET_MODULE(mbstring),
 * preceded by ZEND_TSRMLS_CACHE_DEFINE() in ZTS builds. */
#ifdef COMPILE_DL_MBSTRING
#ifdef ZTS
ZEND_TSRMLS_CACHE_DEFINE()
#endif
ZEND_GET_MODULE(mbstring)
#endif
232
233 /* {{{ static sapi_post_entry mbstr_post_entries[] */
/* POST content-type handlers that use mbstring's own form handler
 * (php_mb_post_handler) for the default content type.
 * OnUpdate_mbstring_encoding_translation registers this table when
 * mbstring.encoding_translation is on and unregisters it when the setting is
 * off. The table ends with an all-NULL sentinel entry. */
static const sapi_post_entry mbstr_post_entries[] = {
	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
	{ MULTIPART_CONTENT_TYPE,    sizeof(MULTIPART_CONTENT_TYPE)-1,    NULL,                         rfc1867_post_handler },
	{ NULL, 0, NULL, NULL }
};
239 /* }}} */
240
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)241 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
242 if (encoding_name) {
243 const mbfl_encoding *encoding;
244 zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
245 if (last_encoding_name && (last_encoding_name == encoding_name
246 || zend_string_equals_ci(encoding_name, last_encoding_name))) {
247 return MBSTRG(last_used_encoding);
248 }
249
250 encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
251 if (!encoding) {
252 zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
253 return NULL;
254 } else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
255 if (encoding == &mbfl_encoding_base64) {
256 php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
257 } else if (encoding == &mbfl_encoding_qprint) {
258 php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
259 } else if (encoding == &mbfl_encoding_html_ent) {
260 php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
261 } else if (encoding == &mbfl_encoding_uuencode) {
262 php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
263 }
264 }
265
266 if (last_encoding_name) {
267 zend_string_release(last_encoding_name);
268 }
269 MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
270 MBSTRG(last_used_encoding) = encoding;
271 return encoding;
272 } else {
273 return MBSTRG(current_internal_encoding);
274 }
275 }
276
php_mb_get_encoding_or_pass(const char * encoding_name,size_t encoding_name_len)277 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name, size_t encoding_name_len) {
278 if (strncmp(encoding_name, "pass", encoding_name_len) == 0) {
279 return &mbfl_encoding_pass;
280 }
281
282 return mbfl_name2encoding_ex(encoding_name, encoding_name_len);
283 }
284
/* Count the ',' bytes in the half-open range [p, end). */
static size_t count_commas(const char *p, const char *end) {
	size_t total = 0;
	const char *hit = memchr(p, ',', end - p);

	while (hit != NULL) {
		total++;
		/* Resume the search just past the comma that was found */
		hit++;
		hit = memchr(hit, ',', end - hit);
	}
	return total;
}
293
294 /* {{{ static zend_result php_mb_parse_encoding_list()
295 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
296 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
297 */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num)298 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
299 const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num)
300 {
301 if (value == NULL || value_length == 0) {
302 *return_list = NULL;
303 *return_size = 0;
304 return SUCCESS;
305 } else {
306 bool included_auto;
307 size_t n, size;
308 const char *p1, *endp, *tmpstr;
309 const mbfl_encoding **entry, **list;
310
311 if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
312 tmpstr = value + 1;
313 value_length -= 2;
314 } else {
315 tmpstr = value;
316 }
317
318 endp = tmpstr + value_length;
319 size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
320 list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
321 entry = list;
322 n = 0;
323 included_auto = 0;
324 p1 = tmpstr;
325 while (1) {
326 const char *comma = memchr(p1, ',', endp - p1);
327 const char *p = comma ? comma : endp;
328 /* trim spaces */
329 while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
330 p1++;
331 }
332 p--;
333 while (p > p1 && (*p == ' ' || *p == '\t')) {
334 p--;
335 }
336 size_t p1_length = p - p1 + 1;
337 /* convert to the encoding number and check encoding */
338 if (strncasecmp(p1, "auto", p1_length) == 0) {
339 if (!included_auto) {
340 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
341 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
342 size_t i;
343 included_auto = 1;
344 for (i = 0; i < identify_list_size; i++) {
345 *entry++ = mbfl_no2encoding(*src++);
346 n++;
347 }
348 }
349 } else {
350 const mbfl_encoding *encoding = mbfl_name2encoding_ex(p1, p1_length);
351 if (!encoding) {
352 /* Called from an INI setting modification */
353 if (arg_num == 0) {
354 php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%.*s\"", (int) p1_length, p1);
355 } else {
356 zend_argument_value_error(arg_num, "contains invalid encoding \"%.*s\"", (int) p1_length, p1);
357 }
358 pefree(ZEND_VOIDP(list), persistent);
359 return FAILURE;
360 }
361
362 *entry++ = encoding;
363 n++;
364 }
365 if (n >= size || comma == NULL) {
366 break;
367 }
368 p1 = comma + 1;
369 }
370 *return_list = list;
371 *return_size = n;
372 }
373
374 return SUCCESS;
375 }
376 /* }}} */
377
378 /* {{{
379 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
380 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
381 */
php_mb_parse_encoding_array(HashTable * target_hash,const mbfl_encoding *** return_list,size_t * return_size,uint32_t arg_num)382 static zend_result php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
383 size_t *return_size, uint32_t arg_num)
384 {
385 /* Allocate enough space to include the default detect order if "auto" is used. */
386 size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
387 const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
388 const mbfl_encoding **entry = list;
389 bool included_auto = 0;
390 size_t n = 0;
391 zval *hash_entry;
392 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
393 zend_string *encoding_str = zval_try_get_string(hash_entry);
394 if (UNEXPECTED(!encoding_str)) {
395 efree(ZEND_VOIDP(list));
396 return FAILURE;
397 }
398
399 if (zend_string_equals_literal_ci(encoding_str, "auto")) {
400 if (!included_auto) {
401 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
402 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
403 size_t j;
404
405 included_auto = 1;
406 for (j = 0; j < identify_list_size; j++) {
407 *entry++ = mbfl_no2encoding(*src++);
408 n++;
409 }
410 }
411 } else {
412 const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
413 if (encoding) {
414 *entry++ = encoding;
415 n++;
416 } else {
417 zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
418 zend_string_release(encoding_str);
419 efree(ZEND_VOIDP(list));
420 return FAILURE;
421 }
422 }
423 zend_string_release(encoding_str);
424 } ZEND_HASH_FOREACH_END();
425 *return_list = list;
426 *return_size = n;
427 return SUCCESS;
428 }
429 /* }}} */
430
431 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)432 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
433 {
434 return (const zend_encoding*)mbfl_name2encoding(encoding_name);
435 }
436
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)437 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
438 {
439 return ((const mbfl_encoding *)encoding)->name;
440 }
441
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)442 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
443 {
444 const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
445 return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
446 }
447
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)448 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
449 {
450 if (!list) {
451 list = (const zend_encoding**)MBSTRG(current_detect_order_list);
452 list_size = MBSTRG(current_detect_order_list_size);
453 }
454 if (list_size == 1 && ((mbfl_encoding*)*list) == &mbfl_encoding_pass) {
455 /* Emulate behavior of previous implementation; it would never return "pass"
456 * from an encoding auto-detection operation */
457 return NULL;
458 }
459 return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding**)list, list_size, false, false);
460 }
461
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)462 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
463 {
464 unsigned int num_errors = 0;
465 zend_string *result = mb_fast_convert((unsigned char*)from, from_length, (const mbfl_encoding*)encoding_from, (const mbfl_encoding*)encoding_to, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
466
467 *to_length = ZSTR_LEN(result);
468 *to = emalloc(ZSTR_LEN(result) + 1); /* Include terminating null byte */
469 memcpy(*to, ZSTR_VAL(result), ZSTR_LEN(result) + 1);
470 zend_string_free(result);
471
472 return from_length;
473 }
474
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)475 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
476 {
477 return php_mb_parse_encoding_list(
478 encoding_list, encoding_list_len,
479 (const mbfl_encoding ***)return_list, return_size,
480 persistent, /* arg_num */ 0);
481 }
482
php_mb_zend_internal_encoding_getter(void)483 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
484 {
485 return (const zend_encoding *)MBSTRG(internal_encoding);
486 }
487
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)488 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
489 {
490 MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
491 return SUCCESS;
492 }
493
/* Table of mbstring's zend_multibyte hooks: the provider name "mbstring"
 * followed by the fetcher, name getter, lexer compatibility checker, detector,
 * converter, list parser, and internal encoding getter/setter defined above. */
static zend_multibyte_functions php_mb_zend_multibyte_functions = {
	"mbstring",
	php_mb_zend_encoding_fetcher,
	php_mb_zend_encoding_name_getter,
	php_mb_zend_encoding_lexer_compatibility_checker,
	php_mb_zend_encoding_detector,
	php_mb_zend_encoding_converter,
	php_mb_zend_encoding_list_parser,
	php_mb_zend_internal_encoding_getter,
	php_mb_zend_internal_encoding_setter
};
505 /* }}} */
506
507 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)508 static void *_php_mb_compile_regex(const char *pattern)
509 {
510 pcre2_code *retval;
511 PCRE2_SIZE err_offset;
512 int errnum;
513
514 if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
515 PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
516 PCRE2_UCHAR err_str[128];
517 pcre2_get_error_message(errnum, err_str, sizeof(err_str));
518 php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
519 }
520 return retval;
521 }
522 /* }}} */
523
524 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)525 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
526 {
527 int res;
528
529 pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
530 if (NULL == match_data) {
531 pcre2_code_free(opaque);
532 php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
533 return FAILURE;
534 }
535 res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
536 php_pcre_free_match_data(match_data);
537
538 return res;
539 }
540 /* }}} */
541
542 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)543 static void _php_mb_free_regex(void *opaque)
544 {
545 pcre2_code_free(opaque);
546 }
547 /* }}} */
548
549 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)550 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
551 {
552 size_t i;
553
554 *plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
555 *plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
556
557 for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
558 if (php_mb_default_identify_list[i].lang == lang) {
559 *plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
560 *plist_size = php_mb_default_identify_list[i].list_size;
561 return 1;
562 }
563 }
564 return 0;
565 }
566 /* }}} */
567
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)568 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
569 {
570 char *result = emalloc(len + 2);
571 char *resp = result;
572 size_t i;
573
574 for (i = 0; i < len && start[i] != quote; ++i) {
575 if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
576 *resp++ = start[++i];
577 } else {
578 size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
579
580 while (j-- > 0 && i < len) {
581 *resp++ = start[i++];
582 }
583 --i;
584 }
585 }
586
587 *resp = '\0';
588 return result;
589 }
590
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)591 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
592 {
593 char *pos = *line, quote;
594 char *res;
595
596 while (*pos && *pos != stop) {
597 if ((quote = *pos) == '"' || quote == '\'') {
598 ++pos;
599 while (*pos && *pos != quote) {
600 if (*pos == '\\' && pos[1] && pos[1] == quote) {
601 pos += 2;
602 } else {
603 ++pos;
604 }
605 }
606 if (*pos) {
607 ++pos;
608 }
609 } else {
610 pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
611
612 }
613 }
614 if (*pos == '\0') {
615 res = estrdup(*line);
616 *line += strlen(*line);
617 return res;
618 }
619
620 res = estrndup(*line, pos - *line);
621
622 while (*pos == stop) {
623 pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
624 }
625
626 *line = pos;
627 return res;
628 }
629 /* }}} */
630
/* Extract one configuration-style word from str as a new allocated string.
 *
 * Leading whitespace is skipped. An empty remainder yields an empty string.
 * If the word starts with a single or double quote, the text after that quote
 * is handed to php_mb_rfc1867_substring_conf() with the quote as terminator;
 * otherwise the word runs up to the next whitespace character. */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	for (; *str && isspace(*(unsigned char *)str); ++str) {
		/* skip leading whitespace */
	}

	if (*str == '\0') {
		return estrdup("");
	}

	if (*str != '"' && *str != '\'') {
		/* Unquoted word: ends at the first whitespace character */
		char *word_end = str;

		while (*word_end && !isspace(*(unsigned char *)word_end)) {
			++word_end;
		}
		return php_mb_rfc1867_substring_conf(encoding, str, word_end - str, 0);
	}

	/* Quoted word: pass everything after the opening quote */
	const char quote = *str;
	char *body = str + 1;

	return php_mb_rfc1867_substring_conf(encoding, body, strlen(body), quote);
}
655 /* }}} */
656
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)657 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
658 {
659 char *s, *s2;
660 const size_t filename_len = strlen(filename);
661
662 /* The \ check should technically be needed for win32 systems only where
663 * it is a valid path separator. However, IE in all it's wisdom always sends
664 * the full path of the file on the user's filesystem, which means that unless
665 * the user does basename() they get a bogus file name. Until IE's user base drops
666 * to nill or problem is fixed this code must remain enabled for all systems. */
667 s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
668 s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
669
670 if (s && s2) {
671 if (s > s2) {
672 return ++s;
673 } else {
674 return ++s2;
675 }
676 } else if (s) {
677 return ++s;
678 } else if (s2) {
679 return ++s2;
680 } else {
681 return filename;
682 }
683 }
684 /* }}} */
685
686 /* {{{ php.ini directive handler */
687 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)688 static PHP_INI_MH(OnUpdate_mbstring_language)
689 {
690 enum mbfl_no_language no_language;
691
692 no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
693 if (no_language == mbfl_no_language_invalid) {
694 MBSTRG(language) = mbfl_no_language_neutral;
695 return FAILURE;
696 }
697 MBSTRG(language) = no_language;
698 php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
699 return SUCCESS;
700 }
701 /* }}} */
702
703 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)704 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
705 {
706 const mbfl_encoding **list;
707 size_t size;
708
709 if (!new_value) {
710 if (MBSTRG(detect_order_list)) {
711 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
712 }
713 MBSTRG(detect_order_list) = NULL;
714 MBSTRG(detect_order_list_size) = 0;
715 return SUCCESS;
716 }
717
718 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
719 return FAILURE;
720 }
721
722 if (MBSTRG(detect_order_list)) {
723 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
724 }
725 MBSTRG(detect_order_list) = list;
726 MBSTRG(detect_order_list_size) = size;
727 return SUCCESS;
728 }
729 /* }}} */
730
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)731 static zend_result _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
732 const mbfl_encoding **list;
733 size_t size;
734 if (new_value_length == 4 && strncmp(new_value, "pass", 4) == 0) {
735 list = (const mbfl_encoding**)pecalloc(1, sizeof(mbfl_encoding*), 1);
736 *list = &mbfl_encoding_pass;
737 size = 1;
738 } else if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
739 return FAILURE;
740 }
741 if (MBSTRG(http_input_list)) {
742 pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
743 }
744 MBSTRG(http_input_list) = list;
745 MBSTRG(http_input_list_size) = size;
746 return SUCCESS;
747 }
748
749 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)750 static PHP_INI_MH(OnUpdate_mbstring_http_input)
751 {
752 if (new_value) {
753 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
754 }
755
756 if (!new_value || !ZSTR_LEN(new_value)) {
757 const char *encoding = php_get_input_encoding();
758 MBSTRG(http_input_set) = 0;
759 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
760 return SUCCESS;
761 }
762
763 MBSTRG(http_input_set) = 1;
764 return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
765 }
766 /* }}} */
767
_php_mb_ini_mbstring_http_output_set(const char * new_value,size_t length)768 static zend_result _php_mb_ini_mbstring_http_output_set(const char *new_value, size_t length) {
769 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value, length);
770 if (!encoding) {
771 return FAILURE;
772 }
773
774 MBSTRG(http_output_encoding) = encoding;
775 MBSTRG(current_http_output_encoding) = encoding;
776 return SUCCESS;
777 }
778
779 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)780 static PHP_INI_MH(OnUpdate_mbstring_http_output)
781 {
782 if (new_value) {
783 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
784 }
785
786 if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
787 const char *encoding = php_get_output_encoding();
788 MBSTRG(http_output_set) = 0;
789 _php_mb_ini_mbstring_http_output_set(encoding, strlen(encoding));
790 return SUCCESS;
791 }
792
793 MBSTRG(http_output_set) = 1;
794 return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
795 }
796 /* }}} */
797
798 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)799 static zend_result _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
800 {
801 const mbfl_encoding *encoding;
802
803 if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
804 /* falls back to UTF-8 if an unknown encoding name is given */
805 if (new_value) {
806 php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
807 }
808 encoding = &mbfl_encoding_utf8;
809 }
810 MBSTRG(internal_encoding) = encoding;
811 MBSTRG(current_internal_encoding) = encoding;
812 #ifdef HAVE_MBREGEX
813 {
814 const char *enc_name = new_value;
815 if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
816 /* falls back to UTF-8 if an unknown encoding name is given */
817 enc_name = "UTF-8";
818 php_mb_regex_set_default_mbctype(enc_name);
819 }
820 php_mb_regex_set_mbctype(new_value);
821 }
822 #endif
823 return SUCCESS;
824 }
825 /* }}} */
826
827 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)828 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
829 {
830 if (new_value) {
831 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
832 }
833
834 if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
835 return FAILURE;
836 }
837
838 if (new_value && ZSTR_LEN(new_value)) {
839 MBSTRG(internal_encoding_set) = 1;
840 return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
841 } else {
842 const char *encoding = php_get_internal_encoding();
843 MBSTRG(internal_encoding_set) = 0;
844 return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
845 }
846 }
847 /* }}} */
848
849 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)850 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
851 {
852 if (new_value != NULL) {
853 if (zend_string_equals_literal_ci(new_value, "none")) {
854 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
855 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
856 } else if (zend_string_equals_literal_ci(new_value, "long")) {
857 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
858 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
859 } else if (zend_string_equals_literal_ci(new_value, "entity")) {
860 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
861 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
862 } else {
863 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
864 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
865 if (ZSTR_LEN(new_value) > 0) {
866 char *endptr = NULL;
867 int c = strtol(ZSTR_VAL(new_value), &endptr, 0);
868
869 if (*endptr == '\0') {
870 MBSTRG(filter_illegal_substchar) = c;
871 MBSTRG(current_filter_illegal_substchar) = c;
872 }
873 }
874 }
875 } else {
876 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
877 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
878 MBSTRG(filter_illegal_substchar) = '?';
879 MBSTRG(current_filter_illegal_substchar) = '?';
880 }
881
882 return SUCCESS;
883 }
884 /* }}} */
885
886 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)887 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
888 {
889 if (new_value == NULL) {
890 return FAILURE;
891 }
892
893 OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
894
895 if (MBSTRG(encoding_translation)) {
896 sapi_unregister_post_entry(php_post_entries);
897 sapi_register_post_entries(mbstr_post_entries);
898 } else {
899 sapi_unregister_post_entry(mbstr_post_entries);
900 sapi_register_post_entries(php_post_entries);
901 }
902
903 return SUCCESS;
904 }
905 /* }}} */
906
907 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)908 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
909 {
910 zend_string *tmp;
911 void *re = NULL;
912
913 if (!new_value) {
914 new_value = entry->orig_value;
915 }
916 tmp = php_trim(new_value, NULL, 0, 3);
917
918 if (ZSTR_LEN(tmp) > 0) {
919 if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
920 zend_string_release_ex(tmp, 0);
921 return FAILURE;
922 }
923 }
924
925 if (MBSTRG(http_output_conv_mimetypes)) {
926 _php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
927 }
928
929 MBSTRG(http_output_conv_mimetypes) = re;
930
931 zend_string_release_ex(tmp, 0);
932 return SUCCESS;
933 }
934 /* }}} */
935 /* }}} */
936
/* {{{ php.ini directive registration
 * Each entry lists: directive name, default value, the scopes in which it
 * may be changed, and the handler invoked on update. STD_* entries also
 * name the zend_mbstring_globals field that stores the value. */
PHP_INI_BEGIN()
	/* Encoding/language selection; each has a dedicated handler */
	PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
	PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
	PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
	PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
	/* Deprecated: the handler raises E_DEPRECATED whenever a value is given */
	STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
	PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)

	/* Only changeable at system / per-directory level; the handler swaps
	 * the registered SAPI POST entries */
	STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
		PHP_INI_SYSTEM | PHP_INI_PERDIR,
		OnUpdate_mbstring_encoding_translation,
		encoding_translation, zend_mbstring_globals, mbstring_globals)
	/* Regular expression matched against the response MIME type by
	 * mb_output_handler */
	PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
		"^(text/|application/xhtml\\+xml)",
		PHP_INI_ALL,
		OnUpdate_mbstring_http_output_conv_mimetypes)

	STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
		PHP_INI_ALL,
		OnUpdateBool,
		strict_detection, zend_mbstring_globals, mbstring_globals)
#ifdef HAVE_MBREGEX
	/* Limits for the mbregex engine; only present when mbregex is compiled in */
	STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
	STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
#endif
PHP_INI_END()
/* }}} */
965
966 static void mbstring_internal_encoding_changed_hook(void) {
967 /* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
968 if (!MBSTRG(internal_encoding_set)) {
969 const char *encoding = php_get_internal_encoding();
970 _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
971 }
972
973 if (!MBSTRG(http_output_set)) {
974 const char *encoding = php_get_output_encoding();
975 _php_mb_ini_mbstring_http_output_set(encoding, strlen(encoding));
976 }
977
978 if (!MBSTRG(http_input_set)) {
979 const char *encoding = php_get_input_encoding();
980 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
981 }
982 }
983
984 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)985 static PHP_GINIT_FUNCTION(mbstring)
986 {
987 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
988 ZEND_TSRMLS_CACHE_UPDATE();
989 #endif
990
991 mbstring_globals->language = mbfl_no_language_uni;
992 mbstring_globals->internal_encoding = NULL;
993 mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
994 mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
995 mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
996 mbstring_globals->http_input_identify = NULL;
997 mbstring_globals->http_input_identify_get = NULL;
998 mbstring_globals->http_input_identify_post = NULL;
999 mbstring_globals->http_input_identify_cookie = NULL;
1000 mbstring_globals->http_input_identify_string = NULL;
1001 mbstring_globals->http_input_list = NULL;
1002 mbstring_globals->http_input_list_size = 0;
1003 mbstring_globals->detect_order_list = NULL;
1004 mbstring_globals->detect_order_list_size = 0;
1005 mbstring_globals->current_detect_order_list = NULL;
1006 mbstring_globals->current_detect_order_list_size = 0;
1007 mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1008 mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1009 mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1010 mbstring_globals->filter_illegal_substchar = '?';
1011 mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1012 mbstring_globals->current_filter_illegal_substchar = '?';
1013 mbstring_globals->illegalchars = 0;
1014 mbstring_globals->encoding_translation = 0;
1015 mbstring_globals->strict_detection = 0;
1016 mbstring_globals->outconv_enabled = false;
1017 mbstring_globals->outconv_state = 0;
1018 mbstring_globals->http_output_conv_mimetypes = NULL;
1019 #ifdef HAVE_MBREGEX
1020 mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1021 #endif
1022 mbstring_globals->last_used_encoding_name = NULL;
1023 mbstring_globals->last_used_encoding = NULL;
1024 mbstring_globals->internal_encoding_set = 0;
1025 mbstring_globals->http_output_set = 0;
1026 mbstring_globals->http_input_set = 0;
1027 mbstring_globals->all_encodings_list = NULL;
1028 }
1029 /* }}} */
1030
1031 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1032 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1033 {
1034 if (mbstring_globals->http_input_list) {
1035 free(ZEND_VOIDP(mbstring_globals->http_input_list));
1036 }
1037 if (mbstring_globals->detect_order_list) {
1038 free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1039 }
1040 if (mbstring_globals->http_output_conv_mimetypes) {
1041 _php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1042 }
1043 #ifdef HAVE_MBREGEX
1044 php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1045 #endif
1046 }
1047 /* }}} */
1048
#ifdef ZEND_INTRIN_AVX2_FUNC_PTR
/* Forward declaration: called from PHP_MINIT_FUNCTION(mbstring) below, only
 * when ZEND_INTRIN_AVX2_FUNC_PTR is defined. */
static void init_check_utf8(void);
#endif
1052
/* {{{ PHP_MINIT_FUNCTION(mbstring)
 * Module start-up: registers the INI entries, installs the
 * internal-encoding-changed hook, registers the SAPI data/POST handlers,
 * initialises mbregex (if compiled in), registers the extension's symbols,
 * and hooks mbstring into the engine's multibyte and RFC 1867 (file upload)
 * callbacks. Returns FAILURE only if zend_multibyte_set_functions() fails. */
PHP_MINIT_FUNCTION(mbstring)
{
#if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
	ZEND_TSRMLS_CACHE_UPDATE();
#endif

	REGISTER_INI_ENTRIES();

	/* We assume that we're the only user of the hook. */
	ZEND_ASSERT(php_internal_encoding_changed == NULL);
	php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
	/* Run the hook once now so encodings not set explicitly pick up the
	 * engine-wide values */
	mbstring_internal_encoding_changed_hook();

	/* This is a global handler. Should not be set in a per-request handler. */
	sapi_register_treat_data(mbstr_treat_data);

	/* Post handlers are stored in the thread-local context. */
	if (MBSTRG(encoding_translation)) {
		sapi_register_post_entries(mbstr_post_entries);
	}

#ifdef HAVE_MBREGEX
	PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
#endif

	register_mbstring_symbols(module_number);

	if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
		return FAILURE;
	}

	/* Callbacks used by the RFC 1867 (multipart/form-data) upload parser */
	php_rfc1867_set_multibyte_callbacks(
		php_mb_encoding_translation,
		php_mb_gpc_get_detect_order,
		php_mb_gpc_set_input_encoding,
		php_mb_rfc1867_getword,
		php_mb_rfc1867_getword_conf,
		php_mb_rfc1867_basename);

#ifdef ZEND_INTRIN_AVX2_FUNC_PTR
	/* NOTE(review): presumably these select AVX2 vs. fallback implementations
	 * at runtime - confirm against their definitions */
	init_check_utf8();
	init_convert_utf16();
#endif

	return SUCCESS;
}
/* }}} */
1101
/* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring)
 * Module shutdown: unregisters the INI entries, restores the engine's
 * multibyte functions, shuts down mbregex (if compiled in) and removes the
 * internal-encoding-changed hook installed in MINIT. Always returns SUCCESS. */
PHP_MSHUTDOWN_FUNCTION(mbstring)
{
	UNREGISTER_INI_ENTRIES();

	zend_multibyte_restore_functions();

#ifdef HAVE_MBREGEX
	PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
#endif

	/* Release the hook so the MINIT assertion holds on a later start-up */
	php_internal_encoding_changed = NULL;

	return SUCCESS;
}
/* }}} */
1118
/* {{{ PHP_RINIT_FUNCTION(mbstring)
 * Request start-up: resets every "current_*" (per-request) setting to its
 * configured value, clears the illegal-character counter, rebuilds the
 * current detect-order list, and tells the engine which internal encoding
 * is configured. Always returns SUCCESS. */
PHP_RINIT_FUNCTION(mbstring)
{
	/* Per-request settings start from the configured ones */
	MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
	MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
	MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
	MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);

	MBSTRG(illegalchars) = 0;

	php_mb_populate_current_detect_order_list();

#ifdef HAVE_MBREGEX
	PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
#endif
	zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));

	return SUCCESS;
}
/* }}} */
1139
1140 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1141 PHP_RSHUTDOWN_FUNCTION(mbstring)
1142 {
1143 if (MBSTRG(current_detect_order_list) != NULL) {
1144 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1145 MBSTRG(current_detect_order_list) = NULL;
1146 MBSTRG(current_detect_order_list_size) = 0;
1147 }
1148
1149 /* clear http input identification. */
1150 MBSTRG(http_input_identify) = NULL;
1151 MBSTRG(http_input_identify_post) = NULL;
1152 MBSTRG(http_input_identify_get) = NULL;
1153 MBSTRG(http_input_identify_cookie) = NULL;
1154 MBSTRG(http_input_identify_string) = NULL;
1155
1156 if (MBSTRG(last_used_encoding_name)) {
1157 zend_string_release(MBSTRG(last_used_encoding_name));
1158 MBSTRG(last_used_encoding_name) = NULL;
1159 }
1160
1161 MBSTRG(internal_encoding_set) = 0;
1162 MBSTRG(http_output_set) = 0;
1163 MBSTRG(http_input_set) = 0;
1164
1165 MBSTRG(outconv_enabled) = false;
1166 MBSTRG(outconv_state) = 0;
1167
1168 if (MBSTRG(all_encodings_list)) {
1169 GC_DELREF(MBSTRG(all_encodings_list));
1170 zend_array_destroy(MBSTRG(all_encodings_list));
1171 MBSTRG(all_encodings_list) = NULL;
1172 }
1173
1174 #ifdef HAVE_MBREGEX
1175 PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1176 #endif
1177
1178 return SUCCESS;
1179 }
1180 /* }}} */
1181
1182 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1183 PHP_MINFO_FUNCTION(mbstring)
1184 {
1185 php_info_print_table_start();
1186 php_info_print_table_row(2, "Multibyte Support", "enabled");
1187 php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1188 php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1189 {
1190 char tmp[256];
1191 snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1192 php_info_print_table_row(2, "libmbfl version", tmp);
1193 }
1194 php_info_print_table_end();
1195
1196 php_info_print_table_start();
1197 php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1198 php_info_print_table_end();
1199
1200 #ifdef HAVE_MBREGEX
1201 PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1202 #endif
1203
1204 DISPLAY_INI_ENTRIES();
1205 }
1206 /* }}} */
1207
1208 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1209 PHP_FUNCTION(mb_language)
1210 {
1211 zend_string *name = NULL;
1212
1213 ZEND_PARSE_PARAMETERS_START(0, 1)
1214 Z_PARAM_OPTIONAL
1215 Z_PARAM_STR_OR_NULL(name)
1216 ZEND_PARSE_PARAMETERS_END();
1217
1218 if (name == NULL) {
1219 RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1220 } else {
1221 zend_string *ini_name = ZSTR_INIT_LITERAL("mbstring.language", 0);
1222 if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1223 zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1224 zend_string_release_ex(ini_name, 0);
1225 RETURN_THROWS();
1226 }
1227 // TODO Make return void
1228 RETVAL_TRUE;
1229 zend_string_release_ex(ini_name, 0);
1230 }
1231 }
1232 /* }}} */
1233
1234 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1235 PHP_FUNCTION(mb_internal_encoding)
1236 {
1237 char *name = NULL;
1238 size_t name_len;
1239 const mbfl_encoding *encoding;
1240
1241 ZEND_PARSE_PARAMETERS_START(0, 1)
1242 Z_PARAM_OPTIONAL
1243 Z_PARAM_STRING_OR_NULL(name, name_len)
1244 ZEND_PARSE_PARAMETERS_END();
1245
1246 if (name == NULL) {
1247 ZEND_ASSERT(MBSTRG(current_internal_encoding));
1248 RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1249 } else {
1250 encoding = mbfl_name2encoding(name);
1251 if (!encoding) {
1252 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1253 RETURN_THROWS();
1254 } else {
1255 MBSTRG(current_internal_encoding) = encoding;
1256 MBSTRG(internal_encoding_set) = 1;
1257 /* TODO Return old encoding */
1258 RETURN_TRUE;
1259 }
1260 }
1261 }
1262 /* }}} */
1263
1264 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1265 PHP_FUNCTION(mb_http_input)
1266 {
1267 char *type = NULL;
1268 size_t type_len = 0, n;
1269 const mbfl_encoding **entry;
1270 const mbfl_encoding *encoding;
1271
1272 ZEND_PARSE_PARAMETERS_START(0, 1)
1273 Z_PARAM_OPTIONAL
1274 Z_PARAM_STRING_OR_NULL(type, type_len)
1275 ZEND_PARSE_PARAMETERS_END();
1276
1277 if (type == NULL) {
1278 encoding = MBSTRG(http_input_identify);
1279 } else if (type_len != 1) {
1280 zend_argument_value_error(1,
1281 "must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1282 RETURN_THROWS();
1283 } else {
1284 switch (*type) {
1285 case 'G':
1286 case 'g':
1287 encoding = MBSTRG(http_input_identify_get);
1288 break;
1289 case 'P':
1290 case 'p':
1291 encoding = MBSTRG(http_input_identify_post);
1292 break;
1293 case 'C':
1294 case 'c':
1295 encoding = MBSTRG(http_input_identify_cookie);
1296 break;
1297 case 'S':
1298 case 's':
1299 encoding = MBSTRG(http_input_identify_string);
1300 break;
1301 case 'I':
1302 case 'i':
1303 entry = MBSTRG(http_input_list);
1304 n = MBSTRG(http_input_list_size);
1305 array_init(return_value);
1306 for (size_t i = 0; i < n; i++, entry++) {
1307 add_next_index_string(return_value, (*entry)->name);
1308 }
1309 return;
1310 case 'L':
1311 case 'l':
1312 entry = MBSTRG(http_input_list);
1313 n = MBSTRG(http_input_list_size);
1314 if (n == 0) {
1315 RETURN_FALSE;
1316 }
1317
1318 smart_str result = {0};
1319 for (size_t i = 0; i < n; i++, entry++) {
1320 if (i > 0) {
1321 smart_str_appendc(&result, ',');
1322 }
1323 smart_str_appends(&result, (*entry)->name);
1324 }
1325 RETURN_STR(smart_str_extract(&result));
1326 default:
1327 zend_argument_value_error(1,
1328 "must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1329 RETURN_THROWS();
1330 }
1331 }
1332
1333 if (encoding) {
1334 RETURN_STRING(encoding->name);
1335 } else {
1336 RETURN_FALSE;
1337 }
1338 }
1339 /* }}} */
1340
1341 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1342 PHP_FUNCTION(mb_http_output)
1343 {
1344 char *name = NULL;
1345 size_t name_len;
1346
1347 ZEND_PARSE_PARAMETERS_START(0, 1)
1348 Z_PARAM_OPTIONAL
1349 Z_PARAM_PATH_OR_NULL(name, name_len) /* For null byte check */
1350 ZEND_PARSE_PARAMETERS_END();
1351
1352 if (name == NULL) {
1353 ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1354 RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1355 } else {
1356 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name, name_len);
1357 if (!encoding) {
1358 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1359 RETURN_THROWS();
1360 } else {
1361 MBSTRG(http_output_set) = 1;
1362 MBSTRG(current_http_output_encoding) = encoding;
1363 /* TODO Return previous encoding? */
1364 RETURN_TRUE;
1365 }
1366 }
1367 }
1368 /* }}} */
1369
1370 /* {{{ Sets the current detect_order or Return the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1371 PHP_FUNCTION(mb_detect_order)
1372 {
1373 zend_string *order_str = NULL;
1374 HashTable *order_ht = NULL;
1375
1376 ZEND_PARSE_PARAMETERS_START(0, 1)
1377 Z_PARAM_OPTIONAL
1378 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1379 ZEND_PARSE_PARAMETERS_END();
1380
1381 if (!order_str && !order_ht) {
1382 size_t n = MBSTRG(current_detect_order_list_size);
1383 const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1384 array_init(return_value);
1385 for (size_t i = 0; i < n; i++) {
1386 add_next_index_string(return_value, (*entry)->name);
1387 entry++;
1388 }
1389 } else {
1390 const mbfl_encoding **list;
1391 size_t size;
1392 if (order_ht) {
1393 if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1394 RETURN_THROWS();
1395 }
1396 } else {
1397 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1)) {
1398 RETURN_THROWS();
1399 }
1400 }
1401
1402 if (size == 0) {
1403 efree(ZEND_VOIDP(list));
1404 zend_argument_value_error(1, "must specify at least one encoding");
1405 RETURN_THROWS();
1406 }
1407
1408 if (MBSTRG(current_detect_order_list)) {
1409 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1410 }
1411 MBSTRG(current_detect_order_list) = list;
1412 MBSTRG(current_detect_order_list_size) = size;
1413 RETURN_TRUE;
1414 }
1415 }
1416 /* }}} */
1417
/* Returns true if `cp` may be used as a substitute character.
 *
 * A value is accepted when it lies in the Unicode range (0 .. 0x10FFFF) and
 * is not a surrogate (0xD800 .. 0xDFFF); surrogates are never valid on their
 * own and only a single substitute character is allowed.
 *
 * As we do not know the target encoding of the conversion operation that is
 * going to use the substitution character, we cannot check whether the
 * codepoint is actually mapped in that encoding; everything else is accepted. */
static inline bool php_mb_check_code_point(zend_long cp)
{
	const bool in_unicode_range = (cp >= 0 && cp <= 0x10ffff);
	const bool is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);

	return in_unicode_range && !is_surrogate;
}
1436
1437 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1438 PHP_FUNCTION(mb_substitute_character)
1439 {
1440 zend_string *substitute_character = NULL;
1441 zend_long substitute_codepoint;
1442 bool substitute_is_null = 1;
1443
1444 ZEND_PARSE_PARAMETERS_START(0, 1)
1445 Z_PARAM_OPTIONAL
1446 Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1447 ZEND_PARSE_PARAMETERS_END();
1448
1449 if (substitute_is_null) {
1450 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1451 RETURN_STRING("none");
1452 }
1453 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1454 RETURN_STRING("long");
1455 }
1456 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1457 RETURN_STRING("entity");
1458 }
1459 RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1460 }
1461
1462 if (substitute_character != NULL) {
1463 if (zend_string_equals_literal_ci(substitute_character, "none")) {
1464 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1465 RETURN_TRUE;
1466 }
1467 if (zend_string_equals_literal_ci(substitute_character, "long")) {
1468 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1469 RETURN_TRUE;
1470 }
1471 if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1472 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1473 RETURN_TRUE;
1474 }
1475 /* Invalid string value */
1476 zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1477 RETURN_THROWS();
1478 }
1479 /* Integer codepoint passed */
1480 if (!php_mb_check_code_point(substitute_codepoint)) {
1481 zend_argument_value_error(1, "is not a valid codepoint");
1482 RETURN_THROWS();
1483 }
1484
1485 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1486 MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1487 RETURN_TRUE;
1488 }
1489 /* }}} */
1490
1491 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1492 PHP_FUNCTION(mb_preferred_mime_name)
1493 {
1494 char *name = NULL;
1495 size_t name_len;
1496
1497 ZEND_PARSE_PARAMETERS_START(1, 1)
1498 Z_PARAM_STRING(name, name_len)
1499 ZEND_PARSE_PARAMETERS_END();
1500
1501 const mbfl_encoding *enc = mbfl_name2encoding(name);
1502 if (enc == NULL) {
1503 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1504 RETURN_THROWS();
1505 }
1506
1507 const char *preferred_name = mbfl_encoding_preferred_mime_name(enc);
1508 if (preferred_name == NULL || *preferred_name == '\0') {
1509 php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1510 RETVAL_FALSE;
1511 } else {
1512 RETVAL_STRING((char *)preferred_name);
1513 }
1514 }
1515 /* }}} */
1516
1517 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1518 PHP_FUNCTION(mb_parse_str)
1519 {
1520 zval *track_vars_array = NULL;
1521 char *encstr;
1522 size_t encstr_len;
1523 php_mb_encoding_handler_info_t info;
1524 const mbfl_encoding *detected;
1525
1526 ZEND_PARSE_PARAMETERS_START(2, 2)
1527 Z_PARAM_STRING(encstr, encstr_len)
1528 Z_PARAM_ZVAL(track_vars_array)
1529 ZEND_PARSE_PARAMETERS_END();
1530
1531 track_vars_array = zend_try_array_init(track_vars_array);
1532 if (!track_vars_array) {
1533 RETURN_THROWS();
1534 }
1535
1536 encstr = estrndup(encstr, encstr_len);
1537
1538 info.data_type = PARSE_STRING;
1539 info.separator = PG(arg_separator).input;
1540 info.report_errors = true;
1541 info.to_encoding = MBSTRG(current_internal_encoding);
1542 info.from_encodings = MBSTRG(http_input_list);
1543 info.num_from_encodings = MBSTRG(http_input_list_size);
1544
1545 detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1546
1547 MBSTRG(http_input_identify) = detected;
1548
1549 RETVAL_BOOL(detected);
1550
1551 if (encstr != NULL) efree(encstr);
1552 }
1553 /* }}} */
1554
/* Output-buffer callback: converts a chunk of output from the current
 * internal encoding to the current HTTP output encoding.
 *
 * Parameters: the buffered output chunk and the output-handler status flags.
 * Returns the chunk unchanged when the output encoding is "pass" or when
 * conversion was not enabled on the START call; otherwise returns the
 * converted chunk. Conversion state is carried between calls in
 * MBSTRG(outconv_state) and reset after the END chunk. */
PHP_FUNCTION(mb_output_handler)
{
	zend_string *str;
	zend_long arg_status;

	ZEND_PARSE_PARAMETERS_START(2, 2)
		Z_PARAM_STR(str)
		Z_PARAM_LONG(arg_status)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *encoding = MBSTRG(current_http_output_encoding);
	if (encoding == &mbfl_encoding_pass) {
		/* Pass-through output encoding: nothing to convert */
		RETURN_STR_COPY(str);
	}

	/* First chunk: decide whether this response should be converted at all */
	if (arg_status & PHP_OUTPUT_HANDLER_START) {
		bool free_mimetype = false;
		char *mimetype = NULL;

		/* Analyze mime type */
		if (SG(sapi_headers).mimetype && _php_mb_match_regex(MBSTRG(http_output_conv_mimetypes), SG(sapi_headers).mimetype, strlen(SG(sapi_headers).mimetype))) {
			/* An explicit MIME type matching the configured pattern: keep
			 * only the part before any ';' parameter, in a private copy */
			char *s;
			if ((s = strchr(SG(sapi_headers).mimetype, ';')) == NULL) {
				mimetype = estrdup(SG(sapi_headers).mimetype);
			} else {
				mimetype = estrndup(SG(sapi_headers).mimetype, s - SG(sapi_headers).mimetype);
			}
			free_mimetype = true;
		} else if (SG(sapi_headers).send_default_content_type) {
			/* No usable explicit type: fall back to the default MIME type
			 * (not owned by us, so not freed below) */
			mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
		}

		/* If content-type is not yet set, set it and enable conversion */
		if (SG(sapi_headers).send_default_content_type || free_mimetype) {
			const char *charset = encoding->mime_name;
			if (charset) {
				char *p;
				size_t len = spprintf(&p, 0, "Content-Type: %s; charset=%s", mimetype, charset);
				/* NOTE(review): p is not freed here; presumably
				 * sapi_add_header(..., 0) takes ownership - confirm */
				if (sapi_add_header(p, len, 0) != FAILURE) {
					SG(sapi_headers).send_default_content_type = 0;
				}
			}

			MBSTRG(outconv_enabled) = true;
		}

		if (free_mimetype) {
			efree(mimetype);
		}
	}

	if (!MBSTRG(outconv_enabled)) {
		RETURN_STR_COPY(str);
	}

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, ZSTR_LEN(str), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
	size_t in_len = ZSTR_LEN(str);
	bool last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);

	/* Decode up to 128 codepoints at a time and re-encode them; the final
	 * from_wchar call of the final chunk is told that the input has ended */
	while (in_len) {
		size_t out_len = MBSTRG(current_internal_encoding)->to_wchar(&in, &in_len, wchar_buf, 128, &MBSTRG(outconv_state));
		ZEND_ASSERT(out_len <= 128);
		encoding->from_wchar(wchar_buf, out_len, &buf, !in_len && last_feed);
	}

	MBSTRG(illegalchars) += buf.errors;
	RETVAL_STR(mb_convert_buf_result_raw(&buf));

	if (last_feed) {
		/* End of output: reset the streaming state for the next use */
		MBSTRG(outconv_enabled) = false;
		MBSTRG(outconv_state) = 0;
	}
}
1632
PHP_FUNCTION(mb_str_split)1633 PHP_FUNCTION(mb_str_split)
1634 {
1635 zend_string *str, *encoding = NULL;
1636 zend_long split_len = 1;
1637
1638 ZEND_PARSE_PARAMETERS_START(1, 3)
1639 Z_PARAM_STR(str)
1640 Z_PARAM_OPTIONAL
1641 Z_PARAM_LONG(split_len)
1642 Z_PARAM_STR_OR_NULL(encoding)
1643 ZEND_PARSE_PARAMETERS_END();
1644
1645 if (split_len <= 0) {
1646 zend_argument_value_error(2, "must be greater than 0");
1647 RETURN_THROWS();
1648 } else if (split_len > UINT_MAX / 4) {
1649 zend_argument_value_error(2, "is too large");
1650 RETURN_THROWS();
1651 }
1652
1653 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
1654 if (!enc) {
1655 RETURN_THROWS();
1656 }
1657
1658 if (ZSTR_LEN(str) == 0) {
1659 RETURN_EMPTY_ARRAY();
1660 }
1661
1662 unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
1663
1664 unsigned int char_len = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
1665 if (char_len) {
1666 unsigned int chunk_len = char_len * split_len;
1667 unsigned int chunks = ((ZSTR_LEN(str) / chunk_len) + split_len - 1) / split_len; /* round up */
1668 array_init_size(return_value, chunks);
1669 while (p < e) {
1670 add_next_index_stringl(return_value, (const char*)p, MIN(chunk_len, e - p));
1671 p += chunk_len;
1672 }
1673 } else if (enc->mblen_table) {
1674 unsigned char const *mbtab = enc->mblen_table;
1675
1676 /* Assume that we have 1-byte characters */
1677 array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1678
1679 while (p < e) {
1680 unsigned char *chunk = p; /* start of chunk */
1681
1682 for (int char_count = 0; char_count < split_len && p < e; char_count++) {
1683 p += mbtab[*p];
1684 }
1685 if (p > e) {
1686 p = e; /* ensure chunk is in bounds */
1687 }
1688 add_next_index_stringl(return_value, (const char*)chunk, p - chunk);
1689 }
1690 } else {
1691 /* Assume that we have 1-byte characters */
1692 array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1693
1694 uint32_t wchar_buf[128];
1695 size_t in_len = ZSTR_LEN(str);
1696 unsigned int state = 0, char_count = 0;
1697
1698 mb_convert_buf buf;
1699
1700 while (in_len) {
1701 size_t out_len = enc->to_wchar(&p, &in_len, wchar_buf, 128, &state);
1702 ZEND_ASSERT(out_len <= 128);
1703 size_t i = 0;
1704
1705 /* Is there some output remaining from the previous iteration? */
1706 if (char_count) {
1707 if (out_len >= split_len - char_count) {
1708 /* Finish off an incomplete chunk from previous iteration
1709 * ('buf' was already initialized; we don't need to do it again) */
1710 enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
1711 i += split_len - char_count;
1712 char_count = 0;
1713 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1714 } else {
1715 /* Output from this iteration is not enough to finish the next chunk;
1716 * output what we can, and leave 'buf' to be used again on next iteration */
1717 enc->from_wchar(wchar_buf, out_len, &buf, !in_len);
1718 char_count += out_len;
1719 continue;
1720 }
1721 }
1722
1723 while (i < out_len) {
1724 /* Prepare for the next chunk */
1725 mb_convert_buf_init(&buf, split_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1726
1727 if (out_len - i >= split_len) {
1728 enc->from_wchar(wchar_buf + i, split_len, &buf, true);
1729 i += split_len;
1730 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1731 } else {
1732 /* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
1733 * leave them for the next iteration */
1734 enc->from_wchar(wchar_buf + i, out_len - i, &buf, !in_len);
1735 char_count = out_len - i;
1736 break;
1737 }
1738 }
1739 }
1740
1741 if (char_count) {
1742 /* The main loop above has finished processing the input string, but
1743 * has left a partial chunk in 'buf' */
1744 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1745 }
1746 }
1747 }
1748
#ifdef __SSE2__
/* Horizontal sum of the 16 unsigned bytes held in a 128-bit XMM register; the total
 * is returned in an ordinary scalar.
 * Technique credited to StackOverflow user 'Paul R' (https://stackoverflow.com/users/253056/paul-r)
 * See: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector */
static inline uint32_t _mm_sum_epu8(const __m128i v)
{
	/* There is no dedicated "add up every byte" instruction. _mm_sad_epu8 computes the
	 * absolute differences between corresponding bytes of its two operands and adds
	 * them up, separately for the low and the high 8 bytes. With an all-zero second
	 * operand, each "difference" is simply the byte value from `v`, so we end up with
	 * two 16-bit partial sums: one in the bottom half of the register, one in the top half */
	const __m128i partial_sums = _mm_sad_epu8(v, _mm_setzero_si128());

	/* _mm_cvtsi128_si32 reads the partial sum sitting in the lowest 32 bits;
	 * _mm_extract_epi16 picks 16-bit lane 4, which is where the upper half begins */
	const uint32_t low_sum = _mm_cvtsi128_si32(partial_sums);
	const uint32_t high_sum = _mm_extract_epi16(partial_sums, 4);

	return low_sum + high_sum;
}
#endif
1770
/* Count the codepoints in `len` bytes of text starting at `p`
 * The text is assumed to be valid UTF-8
 *
 * Every UTF-8 byte starts a new codepoint except for continuation bytes (0x80-0xBF).
 * Read as signed 8-bit integers, continuation bytes are exactly the values below -64.
 * So the codepoint count is the byte length minus the number of continuation bytes */
static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
{
	unsigned char *e = p + len;

#ifdef __SSE2__
	if (len >= sizeof(__m128i)) {
		/* Highest address from which a full 16-byte load still stays inside the string */
		unsigned char *last_full_block = e - sizeof(__m128i);

		const __m128i continuation_limit = _mm_set1_epi8(-64);
		const __m128i one_per_lane = _mm_set1_epi8(1);
		__m128i lane_counts = _mm_setzero_si128(); /* 16 independent 8-bit continuation-byte counters */

		/* Each 8-bit counter grows by at most 1 per iteration, so it is safe to run
		 * 255 iterations before the counters must be drained into `len` */
		unsigned char iterations_until_drain = 255;

		do {
			__m128i block = _mm_loadu_si128((__m128i*)p); /* Unaligned load of 16 bytes */
			__m128i is_continuation = _mm_cmplt_epi8(block, continuation_limit); /* 0xFF in each matching lane */
			lane_counts = _mm_add_epi8(lane_counts, _mm_and_si128(is_continuation, one_per_lane));
			p += sizeof(__m128i);

			if (--iterations_until_drain == 0) {
				len -= _mm_sum_epu8(lane_counts);
				lane_counts = _mm_setzero_si128();
				iterations_until_drain = 255;
			}
		} while (p <= last_full_block);

		/* Drain whatever is still held in the 16 counters */
		len -= _mm_sum_epu8(lane_counts);
	}
#endif

	/* Scalar pass over the bytes not handled above (the whole string if SSE2 is unavailable) */
	for (; p < e; p++) {
		signed char byte = *p;
		if (byte < -64) {
			len--;
		}
	}

	return len;
}
1820
/* Return the number of codepoints in `string` when it is interpreted as `encoding` */
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
{
	/* Non-zero only for fixed-width encodings; used directly as the divisor,
	 * i.e. as the number of bytes per character */
	unsigned int bytes_per_char = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (bytes_per_char) {
		return ZSTR_LEN(string) / bytes_per_char;
	}

	/* Strings already flagged as valid UTF-8 can be counted without decoding */
	if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && ZSTR_IS_VALID_UTF8(string)) {
		return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
	}

	/* General case: decode the input in chunks of up to 128 codepoints and
	 * add up how many codepoints each chunk produced */
	uint32_t decoded[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total = 0;

	while (bytes_left > 0) {
		total += encoding->to_wchar(&cursor, &bytes_left, decoded, 128, &state);
	}

	return total;
}
1842
/* {{{ Get character numbers of a string
 * Arguments: the string, and optionally an encoding name (may be null)
 * The encoding is resolved by php_mb_get_encoding(); if that fails, we return
 * via RETURN_THROWS(). Otherwise the count from mb_get_strlen() is returned */
PHP_FUNCTION(mb_strlen)
{
	zend_string *string, *enc_name = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 2)
		Z_PARAM_STR(string)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR_OR_NULL(enc_name)
	ZEND_PARSE_PARAMETERS_END();

	/* 2 = position of the encoding argument */
	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
	if (!enc) {
		RETURN_THROWS();
	}

	RETVAL_LONG(mb_get_strlen(string, enc));
}
/* }}} */
1862
1863 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)1864 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
1865 {
1866 return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
1867 }
1868
offset_to_pointer_utf8(unsigned char * str,unsigned char * end,ssize_t offset)1869 static unsigned char* offset_to_pointer_utf8(unsigned char *str, unsigned char *end, ssize_t offset) {
1870 if (offset < 0) {
1871 unsigned char *pos = end;
1872 while (offset < 0) {
1873 if (pos <= str) {
1874 return NULL;
1875 }
1876
1877 unsigned char c = *--pos;
1878 if (c < 0x80 || (c & 0xC0) != 0x80) {
1879 offset++;
1880 }
1881 }
1882 return pos;
1883 } else {
1884 const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
1885 unsigned char *pos = str;
1886 while (offset-- > 0) {
1887 if (pos >= end) {
1888 return NULL;
1889 }
1890 pos += u8_tbl[*pos];
1891 }
1892 return pos;
1893 }
1894 }
1895
/* Inverse of offset_to_pointer_utf8 for non-negative offsets: convert the byte
 * pointer `pos` into a codepoint offset from `start`, by counting the codepoints
 * in the byte range [start, pos) with mb_fast_strlen_utf8() */
static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) {
	return mb_fast_strlen_utf8(start, pos - start);
}
1899
/* Find the position of `needle` in `haystack`, both encoded as `enc`
 *
 * The search is performed on UTF-8 text: unless `enc` is already a UTF-8 variant,
 * temporary UTF-8 copies of both strings are created and freed before returning.
 *
 * offset:  codepoint offset at which the search is anchored; negative values count
 *          back from the end of the haystack
 * reverse: false to find the first occurrence, true to find the last one
 *
 * Returns the codepoint index of the match, or MBFL_ERROR_OFFSET if `offset` is
 * outside the haystack, or MBFL_ERROR_NOT_FOUND if there is no match */
static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse)
{
	size_t result;
	zend_string *haystack_u8 = NULL, *needle_u8 = NULL;
	unsigned char *offset_pointer;

	if (!php_mb_is_no_encoding_utf8(enc->no_encoding)) {
		unsigned int num_errors = 0;
		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
	} else {
		haystack_u8 = haystack;
		needle_u8 = needle;
	}

	offset_pointer = offset_to_pointer_utf8((unsigned char*)ZSTR_VAL(haystack_u8), (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8), offset);
	if (!offset_pointer) {
		result = MBFL_ERROR_OFFSET;
		goto out;
	}

	result = MBFL_ERROR_NOT_FOUND;
	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
		goto out;
	}

	const char *found_pos;
	if (!reverse) {
		/* First occurrence at or after the offset */
		found_pos = zend_memnstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8));
	} else if (offset >= 0) {
		/* Last occurrence at or after the offset */
		found_pos = zend_memnrstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8));
	} else {
		/* Negative offset, searching backwards: the searched region starts at the
		 * beginning of the haystack and ends one needle-length (in codepoints) after
		 * the offset position, clamped to the end of the haystack.
		 * The needle's codepoints must be counted on the UTF-8 copy; the original
		 * `needle` may be in some other encoding, and applying the UTF-8 counter to
		 * it would yield a wrong length and thus a wrong search region */
		size_t needle_len = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(needle_u8), (unsigned char*)ZSTR_VAL(needle_u8) + ZSTR_LEN(needle_u8));
		offset_pointer = offset_to_pointer_utf8(offset_pointer, (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8), needle_len);
		if (!offset_pointer) {
			offset_pointer = (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8);
		}

		found_pos = zend_memnrstr(ZSTR_VAL(haystack_u8), ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)offset_pointer);
	}

	if (found_pos) {
		/* Convert the byte position of the match back into a codepoint index */
		result = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(haystack_u8), (unsigned char*)found_pos);
	}

out:
	/* Only free the UTF-8 strings if they are temporary copies */
	if (haystack_u8 != haystack) {
		zend_string_free(haystack_u8);
	}
	if (needle_u8 != needle) {
		zend_string_free(needle_u8);
	}
	return result;
}
1954
handle_strpos_error(size_t error)1955 static void handle_strpos_error(size_t error) {
1956 switch (error) {
1957 case MBFL_ERROR_NOT_FOUND:
1958 break;
1959 case MBFL_ERROR_ENCODING:
1960 php_error_docref(NULL, E_WARNING, "Conversion error");
1961 break;
1962 case MBFL_ERROR_OFFSET:
1963 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1964 break;
1965 default:
1966 zend_value_error("mb_strpos(): Unknown error");
1967 break;
1968 }
1969 }
1970
PHP_FUNCTION(mb_strpos)1971 PHP_FUNCTION(mb_strpos)
1972 {
1973 zend_long offset = 0;
1974 zend_string *needle, *haystack;
1975 zend_string *enc_name = NULL;
1976
1977 ZEND_PARSE_PARAMETERS_START(2, 4)
1978 Z_PARAM_STR(haystack)
1979 Z_PARAM_STR(needle)
1980 Z_PARAM_OPTIONAL
1981 Z_PARAM_LONG(offset)
1982 Z_PARAM_STR_OR_NULL(enc_name)
1983 ZEND_PARSE_PARAMETERS_END();
1984
1985 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
1986 if (!enc) {
1987 RETURN_THROWS();
1988 }
1989
1990 size_t n = mb_find_strpos(haystack, needle, enc, offset, false);
1991 if (!mbfl_is_error(n)) {
1992 RETVAL_LONG(n);
1993 } else {
1994 handle_strpos_error(n);
1995 RETVAL_FALSE;
1996 }
1997 }
1998
1999 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)2000 PHP_FUNCTION(mb_strrpos)
2001 {
2002 zend_long offset = 0;
2003 zend_string *needle, *haystack;
2004 zend_string *enc_name = NULL;
2005
2006 ZEND_PARSE_PARAMETERS_START(2, 4)
2007 Z_PARAM_STR(haystack)
2008 Z_PARAM_STR(needle)
2009 Z_PARAM_OPTIONAL
2010 Z_PARAM_LONG(offset)
2011 Z_PARAM_STR_OR_NULL(enc_name)
2012 ZEND_PARSE_PARAMETERS_END();
2013
2014 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
2015 if (!enc) {
2016 RETURN_THROWS();
2017 }
2018
2019 size_t n = mb_find_strpos(haystack, needle, enc, offset, true);
2020 if (!mbfl_is_error(n)) {
2021 RETVAL_LONG(n);
2022 } else {
2023 handle_strpos_error(n);
2024 RETVAL_FALSE;
2025 }
2026 }
2027 /* }}} */
2028
2029 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)2030 PHP_FUNCTION(mb_stripos)
2031 {
2032 zend_long offset = 0;
2033 zend_string *haystack, *needle;
2034 zend_string *from_encoding = NULL;
2035
2036 ZEND_PARSE_PARAMETERS_START(2, 4)
2037 Z_PARAM_STR(haystack)
2038 Z_PARAM_STR(needle)
2039 Z_PARAM_OPTIONAL
2040 Z_PARAM_LONG(offset)
2041 Z_PARAM_STR_OR_NULL(from_encoding)
2042 ZEND_PARSE_PARAMETERS_END();
2043
2044 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2045 if (!enc) {
2046 RETURN_THROWS();
2047 }
2048
2049 size_t n = php_mb_stripos(false, haystack, needle, offset, enc);
2050
2051 if (!mbfl_is_error(n)) {
2052 RETVAL_LONG(n);
2053 } else {
2054 handle_strpos_error(n);
2055 RETVAL_FALSE;
2056 }
2057 }
2058 /* }}} */
2059
2060 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)2061 PHP_FUNCTION(mb_strripos)
2062 {
2063 zend_long offset = 0;
2064 zend_string *haystack, *needle;
2065 zend_string *from_encoding = NULL;
2066
2067 ZEND_PARSE_PARAMETERS_START(2, 4)
2068 Z_PARAM_STR(haystack)
2069 Z_PARAM_STR(needle)
2070 Z_PARAM_OPTIONAL
2071 Z_PARAM_LONG(offset)
2072 Z_PARAM_STR_OR_NULL(from_encoding)
2073 ZEND_PARSE_PARAMETERS_END();
2074
2075 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2076 if (!enc) {
2077 RETURN_THROWS();
2078 }
2079
2080 size_t n = php_mb_stripos(true, haystack, needle, offset, enc);
2081
2082 if (!mbfl_is_error(n)) {
2083 RETVAL_LONG(n);
2084 } else {
2085 handle_strpos_error(n);
2086 RETVAL_FALSE;
2087 }
2088 }
2089 /* }}} */
2090
mb_get_substr_slow(unsigned char * in,size_t in_len,size_t from,size_t len,const mbfl_encoding * enc)2091 static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t from, size_t len, const mbfl_encoding *enc)
2092 {
2093 uint32_t wchar_buf[128];
2094 unsigned int state = 0;
2095
2096 mb_convert_buf buf;
2097 mb_convert_buf_init(&buf, MIN(len, in_len - from), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2098
2099 while (in_len && len) {
2100 size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2101 ZEND_ASSERT(out_len <= 128);
2102
2103 if (from >= out_len) {
2104 from -= out_len;
2105 } else {
2106 size_t needed_codepoints = MIN(out_len - from, len);
2107 enc->from_wchar(wchar_buf + from, needed_codepoints, &buf, !in_len || out_len >= len);
2108 from = 0;
2109 len -= needed_codepoints;
2110 }
2111 }
2112
2113 return mb_convert_buf_result(&buf, enc);
2114 }
2115
/* Return up to `len` codepoints of `input`, starting at codepoint index `from`
 *
 * Fixed-width encodings are handled by copying the right byte range directly;
 * all other encodings go through mb_get_substr_slow */
static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
		/* Other than MacJapanese, no supported text encoding decodes to
		 * more than one codepoint per byte
		 * So if the number of codepoints to skip >= number of input bytes,
		 * then definitely the output should be empty */
		return zend_empty_string;
	}

	/* Does each codepoint have a fixed byte width? */
	unsigned int flag = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (flag) {
		/* The value of the flag is 2 if each codepoint takes 2 bytes, or 4 if 4 bytes
		 *
		 * `from` and `len` are caller-supplied codepoint counts and may be arbitrarily
		 * large, so multiplying them by the character width could wrap around (a
		 * wrapped `len` of zero would wrongly produce an empty string). Compare
		 * against quotients first, and only multiply values known to fit */

		/* Same as `from * flag >= in_len`: does the start lie at or past the end? */
		if (from >= in_len / flag + (in_len % flag != 0)) {
			return zend_empty_string;
		}
		size_t start = from * flag; /* < in_len, cannot overflow */
		in += start;
		in_len -= start;

		/* Same as `len * flag > in_len`: if more is requested than remains, take
		 * all the remaining bytes */
		size_t byte_len = (len > in_len / flag) ? in_len : len * flag;
		return zend_string_init_fast((const char*)in, byte_len);
	}

	return mb_get_substr_slow(in, in_len, from, len, enc);
}
2148
2149 #define MB_STRSTR 1
2150 #define MB_STRRCHR 2
2151 #define MB_STRISTR 3
2152 #define MB_STRRICHR 4
2153
/* Shared implementation of mb_strstr, mb_strrchr, mb_stristr and mb_strrichr
 * `variant` (one of the MB_STR* constants) selects the search direction and whether
 * letter case is ignored. On a match, returns the part of the haystack before the
 * match (if the `part` argument is true) or from the match onwards; otherwise false */
static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
{
	zend_string *haystack, *needle;
	zend_string *encoding_name = NULL;
	bool part = false;

	ZEND_PARSE_PARAMETERS_START(2, 4)
		Z_PARAM_STR(haystack)
		Z_PARAM_STR(needle)
		Z_PARAM_OPTIONAL
		Z_PARAM_BOOL(part)
		Z_PARAM_STR_OR_NULL(encoding_name)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding_name, 4);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	/* Decode the variant into its two independent properties */
	const bool search_backwards = (variant == MB_STRRCHR || variant == MB_STRRICHR);
	const bool ignore_case = (variant == MB_STRISTR || variant == MB_STRRICHR);

	size_t match_pos = ignore_case
		? php_mb_stripos(search_backwards, haystack, needle, 0, enc)
		: mb_find_strpos(haystack, needle, enc, 0, search_backwards);

	if (mbfl_is_error(match_pos)) {
		// FIXME use handle_strpos_error(n)
		RETURN_FALSE;
	}

	if (part) {
		/* Everything before the match */
		RETVAL_STR(mb_get_substr(haystack, 0, match_pos, enc));
	} else {
		/* The match and everything after it */
		RETVAL_STR(mb_get_substr(haystack, match_pos, MBFL_SUBSTR_UNTIL_END, enc));
	}
}
2195
/* {{{ Finds first occurrence of a string within another
 * Argument parsing and the search itself are done by php_mb_strstr_variants,
 * which is called here with the MB_STRSTR variant */
PHP_FUNCTION(mb_strstr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
}
/* }}} */
2202
/* {{{ Finds the last occurrence of a string within another
 * Argument parsing and the search itself are done by php_mb_strstr_variants,
 * which is called here with the MB_STRRCHR variant */
PHP_FUNCTION(mb_strrchr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
}
/* }}} */
2209
/* {{{ Finds first occurrence of a string within another, case insensitive
 * Argument parsing and the search itself are done by php_mb_strstr_variants,
 * which is called here with the MB_STRISTR variant */
PHP_FUNCTION(mb_stristr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
}
/* }}} */
2216
/* {{{ Finds the last occurrence of a string within another, case insensitive
 * Argument parsing and the search itself are done by php_mb_strstr_variants,
 * which is called here with the MB_STRRICHR variant */
PHP_FUNCTION(mb_strrichr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
}
/* }}} */
2223
2224 #undef MB_STRSTR
2225 #undef MB_STRRCHR
2226 #undef MB_STRISTR
2227 #undef MB_STRRICHR
2228
PHP_FUNCTION(mb_substr_count)2229 PHP_FUNCTION(mb_substr_count)
2230 {
2231 zend_string *haystack, *needle, *enc_name = NULL, *haystack_u8 = NULL, *needle_u8 = NULL;
2232
2233 ZEND_PARSE_PARAMETERS_START(2, 3)
2234 Z_PARAM_STR(haystack)
2235 Z_PARAM_STR(needle)
2236 Z_PARAM_OPTIONAL
2237 Z_PARAM_STR_OR_NULL(enc_name)
2238 ZEND_PARSE_PARAMETERS_END();
2239
2240 if (ZSTR_LEN(needle) == 0) {
2241 zend_argument_must_not_be_empty_error(2);
2242 RETURN_THROWS();
2243 }
2244
2245 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
2246 if (!enc) {
2247 RETURN_THROWS();
2248 }
2249
2250 if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
2251 /* No need to do any conversion if haystack/needle are already known-valid UTF-8
2252 * (If they are not valid, then not passing them through conversion filters could affect output) */
2253 if (ZSTR_IS_VALID_UTF8(haystack)) {
2254 haystack_u8 = haystack;
2255 } else {
2256 unsigned int num_errors = 0;
2257 haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2258 if (!num_errors && !ZSTR_IS_INTERNED(haystack)) {
2259 GC_ADD_FLAGS(haystack, IS_STR_VALID_UTF8);
2260 }
2261 }
2262
2263 if (ZSTR_IS_VALID_UTF8(needle)) {
2264 needle_u8 = needle;
2265 } else {
2266 unsigned int num_errors = 0;
2267 needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2268 if (!num_errors && !ZSTR_IS_INTERNED(needle)) {
2269 GC_ADD_FLAGS(needle, IS_STR_VALID_UTF8);
2270 }
2271 }
2272 } else {
2273 unsigned int num_errors = 0;
2274 haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2275 needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2276 /* A string with >0 bytes may convert to 0 codepoints; for example, the contents
2277 * may be only escape sequences */
2278 if (ZSTR_LEN(needle_u8) == 0) {
2279 zend_string_free(haystack_u8);
2280 zend_string_free(needle_u8);
2281 zend_argument_must_not_be_empty_error(2);
2282 RETURN_THROWS();
2283 }
2284 }
2285
2286 size_t result = 0;
2287
2288 if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
2289 goto out;
2290 }
2291
2292 const char *p = ZSTR_VAL(haystack_u8), *e = p + ZSTR_LEN(haystack_u8);
2293 while (true) {
2294 p = zend_memnstr(p, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), e);
2295 if (!p) {
2296 break;
2297 }
2298 p += ZSTR_LEN(needle_u8);
2299 result++;
2300 }
2301
2302 out:
2303 if (haystack_u8 != haystack) {
2304 zend_string_free(haystack_u8);
2305 }
2306 if (needle_u8 != needle) {
2307 zend_string_free(needle_u8);
2308 }
2309
2310 RETVAL_LONG(result);
2311 }
2312
2313 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2314 PHP_FUNCTION(mb_substr)
2315 {
2316 zend_string *str, *encoding = NULL;
2317 zend_long from, len;
2318 size_t real_from, real_len;
2319 bool len_is_null = true;
2320
2321 ZEND_PARSE_PARAMETERS_START(2, 4)
2322 Z_PARAM_STR(str)
2323 Z_PARAM_LONG(from)
2324 Z_PARAM_OPTIONAL
2325 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2326 Z_PARAM_STR_OR_NULL(encoding)
2327 ZEND_PARSE_PARAMETERS_END();
2328
2329 if (from == ZEND_LONG_MIN) {
2330 zend_argument_value_error(2, "must be between " ZEND_LONG_FMT " and " ZEND_LONG_FMT, (ZEND_LONG_MIN + 1), ZEND_LONG_MAX);
2331 RETURN_THROWS();
2332 }
2333
2334 if (!len_is_null && len == ZEND_LONG_MIN) {
2335 zend_argument_value_error(3, "must be between " ZEND_LONG_FMT " and " ZEND_LONG_FMT, (ZEND_LONG_MIN + 1), ZEND_LONG_MAX);
2336 RETURN_THROWS();
2337 }
2338
2339 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2340 if (!enc) {
2341 RETURN_THROWS();
2342 }
2343
2344 size_t mblen = 0;
2345 if (from < 0 || (!len_is_null && len < 0)) {
2346 mblen = mb_get_strlen(str, enc);
2347 }
2348
2349 /* if "from" position is negative, count start position from the end
2350 * of the string */
2351 if (from >= 0) {
2352 real_from = (size_t) from;
2353 } else if (-from < mblen) {
2354 real_from = mblen + from;
2355 } else {
2356 real_from = 0;
2357 }
2358
2359 /* if "length" position is negative, set it to the length
2360 * needed to stop that many chars from the end of the string */
2361 if (len_is_null) {
2362 real_len = MBFL_SUBSTR_UNTIL_END;
2363 } else if (len >= 0) {
2364 real_len = (size_t) len;
2365 } else if (real_from < mblen && -len < mblen - real_from) {
2366 real_len = (mblen - real_from) + len;
2367 } else {
2368 real_len = 0;
2369 }
2370
2371 RETVAL_STR(mb_get_substr(str, real_from, real_len, enc));
2372 }
2373 /* }}} */
2374
2375 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2376 PHP_FUNCTION(mb_strcut)
2377 {
2378 zend_string *encoding = NULL;
2379 char *string_val;
2380 zend_long from, len;
2381 bool len_is_null = true;
2382 mbfl_string string, result, *ret;
2383
2384 ZEND_PARSE_PARAMETERS_START(2, 4)
2385 Z_PARAM_STRING(string_val, string.len)
2386 Z_PARAM_LONG(from)
2387 Z_PARAM_OPTIONAL
2388 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2389 Z_PARAM_STR_OR_NULL(encoding)
2390 ZEND_PARSE_PARAMETERS_END();
2391
2392 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2393 if (!enc) {
2394 RETURN_THROWS();
2395 }
2396
2397 string.val = (unsigned char*)string_val;
2398 string.encoding = enc;
2399
2400 if (len_is_null) {
2401 len = string.len;
2402 }
2403
2404 /* if "from" position is negative, count start position from the end
2405 * of the string */
2406 if (from < 0) {
2407 from = string.len + from;
2408 if (from < 0) {
2409 from = 0;
2410 }
2411 }
2412
2413 /* if "length" position is negative, set it to the length
2414 * needed to stop that many chars from the end of the string */
2415 if (len < 0) {
2416 len = (string.len - from) + len;
2417 if (len < 0) {
2418 len = 0;
2419 }
2420 }
2421
2422 if (from > string.len || len == 0) {
2423 RETURN_EMPTY_STRING();
2424 }
2425
2426 if (enc->cut) {
2427 RETURN_STR(enc->cut(string.val, from, len, string.val + string.len));
2428 }
2429
2430 unsigned int char_len = string.encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
2431 if (char_len) {
2432 /* Round `from` down to a multiple of `char_len`; works because `char_len` is a power of 2 */
2433 from &= -char_len;
2434 if (len > string.len - from) {
2435 len = string.len - from;
2436 }
2437 RETURN_STR(zend_string_init_fast((const char*)(string.val + from), len & -char_len));
2438 }
2439
2440 if (enc->mblen_table) {
2441 const unsigned char *mbtab = enc->mblen_table;
2442 const unsigned char *p, *q, *end;
2443 int m = 0;
2444 /* Search for start position */
2445 for (p = (const unsigned char*)string.val, q = p + from; p < q; p += (m = mbtab[*p]));
2446 if (p > q) {
2447 p -= m;
2448 }
2449 const unsigned char *start = p;
2450 /* Search for end position */
2451 if (len >= string.len - (start - (const unsigned char*)string.val)) {
2452 end = (const unsigned char*)(string.val + string.len);
2453 } else {
2454 for (q = p + len; p < q; p += (m = mbtab[*p]));
2455 if (p > q) {
2456 p -= m;
2457 }
2458 end = p;
2459 }
2460 RETURN_STR(zend_string_init_fast((const char*)start, end - start));
2461 }
2462
2463 ret = mbfl_strcut(&string, &result, from, len);
2464 ZEND_ASSERT(ret != NULL);
2465 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2466 efree(ret->val);
2467 }
2468 /* }}} */
2469
2470 /* Some East Asian characters, when printed at a terminal (or the like), require double
2471 * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2472 static size_t character_width(uint32_t c)
2473 {
2474 if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2475 return 1;
2476 }
2477
2478 /* Do a binary search to see if we fall in any of the fullwidth ranges */
2479 unsigned int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2480 while (lo < hi) {
2481 unsigned int probe = (lo + hi) / 2;
2482 if (c < mbfl_eaw_table[probe].begin) {
2483 hi = probe;
2484 } else if (c > mbfl_eaw_table[probe].end) {
2485 lo = probe + 1;
2486 } else {
2487 return 2;
2488 }
2489 }
2490
2491 return 1;
2492 }
2493
/* Compute the display width of `string` (encoded as `enc`), as the sum of
 * character_width() over all of its codepoints */
static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
{
	uint32_t decoded[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total_width = 0;

	while (bytes_left > 0) {
		/* Decode the next chunk of up to 128 codepoints */
		size_t n_decoded = enc->to_wchar(&cursor, &bytes_left, decoded, 128, &state);
		ZEND_ASSERT(n_decoded <= 128);

		/* NOTE: 'bad input' marker will be counted as 1 unit of width
		 * If text conversion is performed with an ordinary ASCII character as
		 * the 'replacement character', this will give us the correct display width. */
		for (size_t i = 0; i < n_decoded; i++) {
			total_width += character_width(decoded[i]);
		}
	}

	return total_width;
}
2516
/* Gets terminal width of a string
 * Arguments: the string, and optionally an encoding name (may be null)
 * The encoding is resolved by php_mb_get_encoding(); if that fails, we return
 * via RETURN_THROWS(). Otherwise the width from mb_get_strwidth() is returned */
PHP_FUNCTION(mb_strwidth)
{
	zend_string *string, *enc_name = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 2)
		Z_PARAM_STR(string)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR_OR_NULL(enc_name)
	ZEND_PARSE_PARAMETERS_END();

	/* 2 = position of the encoding argument */
	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
	if (!enc) {
		RETURN_THROWS();
	}

	RETVAL_LONG(mb_get_strwidth(string, enc));
}
2535
/* Implementation of mb_strimwidth: skip the first `from` codepoints of `input`, then
 * return as much of the rest as fits in `width` units of display width
 *
 * If the rest fits completely, it is returned without the trim marker. Otherwise
 * the text is cut short so that it fits together with `marker`, which is appended.
 * If the marker alone is at least as wide as `width`, just the marker is returned.
 *
 * This works in up to two passes over the input: the first pass only measures, to
 * find out whether truncation is needed; the second one builds the truncated string */
static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, size_t from, size_t width)
{
	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);
	unsigned int state = 0;
	size_t remaining_width = width;
	size_t to_skip = from;
	size_t out_len = 0;
	bool first_call = true, input_err = false;
	mb_convert_buf buf;

	/* First pass: decode the input chunk by chunk and check whether everything
	 * after the first `from` codepoints fits in `width` */
	while (in_len) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

		if (out_len <= to_skip) {
			/* This whole chunk is part of the skipped prefix */
			to_skip -= out_len;
		} else {
			for (size_t i = to_skip; i < out_len; i++) {
				uint32_t w = wchar_buf[i];
				size_t current_w_width = character_width(w);

				/* Remember if any erroneous byte sequence was decoded */
				input_err |= (w == MBFL_BAD_INPUT);

				if (remaining_width < current_w_width) {
					/* This character does not fit any more; the output must be truncated */
					size_t marker_width = mb_get_strwidth(marker, enc);

					/* The trim marker is larger than the desired string width */
					if (width <= marker_width) {
						return zend_string_copy(marker);
					}

					/* We need to truncate string and append trim marker */
					width -= marker_width;
					/* 'width' is now the amount we want to take from 'input' */
					mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

					if (first_call) {
						/* We can use the buffer of wchars which we have right now;
						 * no need to convert again */
						goto dont_restart_conversion;
					} else {
						goto restart_conversion;
					}
				}
				remaining_width -= current_w_width;
			}
			to_skip = 0;
		}
		first_call = false;
	}

	/* The input string fits in the requested width; we don't need to append the trim marker
	 * However, if the string contains erroneous byte sequences, those should be converted
	 * to error markers */
	if (!input_err) {
		if (from == 0) {
			/* This just increments the string's refcount; it doesn't really 'copy' it */
			return zend_string_copy(input);
		} else {
			return mb_get_substr(input, from, MBFL_SUBSTR_UNTIL_END, enc);
		}
	} else {
		/* We can't use `mb_get_substr`, because it uses the fastest method possible of
		 * picking out a substring, which may not include converting erroneous byte
		 * sequences to error markers */
		return mb_get_substr_slow((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), from, MBFL_SUBSTR_UNTIL_END, enc);
	}

	/* The input string is too wide; we need to build a new string which
	 * includes some portion of the input string, with the trim marker
	 * concatenated onto it */
restart_conversion:
	/* Second pass: start decoding again from the beginning of the input */
	in = (unsigned char*)ZSTR_VAL(input);
	in_len = ZSTR_LEN(input);
	state = 0;

	while (true) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

		/* Entry point when the first pass stopped in the very first chunk;
		 * `wchar_buf` and `out_len` still describe that chunk */
dont_restart_conversion:
		if (out_len <= from) {
			/* Still inside the prefix which is to be skipped */
			from -= out_len;
		} else {
			for (size_t i = from; i < out_len; i++) {
				size_t current_wchar_char_width = character_width(wchar_buf[i]);
				if (width < current_wchar_char_width) {
					/* wchar_buf[i] is the first character which does not fit; emit the
					 * ones before it and finish */
					enc->from_wchar(wchar_buf + from, i - from, &buf, true);
					goto append_trim_marker;
				}
				width -= current_wchar_char_width;
			}
			/* The whole chunk fits. The first pass found that the text does not fit
			 * as a whole, so there must be more input to come */
			ZEND_ASSERT(in_len > 0);
			enc->from_wchar(wchar_buf + from, out_len - from, &buf, false);
			from = 0;
		}
	}

append_trim_marker:
	if (ZSTR_LEN(marker) > 0) {
		/* Make sure there is room for the marker, then copy its bytes in unchanged */
		MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
		buf.out = zend_mempcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
	}

	/* Even if `enc` is UTF-8, don't mark the output string as valid UTF-8, because
	 * we have no guarantee that the trim marker string is valid UTF-8 */
	return mb_convert_buf_result_raw(&buf);
}
2646
/* Trim the string to terminal width; optionally add a 'trim marker' if it was truncated
 * Arguments: string, start position (in characters), width, and optionally the trim
 * marker and an encoding name
 * A negative start position is counted from the end of the string. A negative width
 * is deprecated; it is counted back from the display width of the part of the
 * string after the start position */
PHP_FUNCTION(mb_strimwidth)
{
	zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
	zend_long from, width;

	ZEND_PARSE_PARAMETERS_START(3, 5)
		Z_PARAM_STR(str)
		Z_PARAM_LONG(from)
		Z_PARAM_LONG(width)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR(trimmarker)
		Z_PARAM_STR_OR_NULL(encoding)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
	if (!enc) {
		RETURN_THROWS();
	}

	/* The character count is only needed to resolve or validate a non-zero start */
	if (from != 0) {
		size_t str_len = mb_get_strlen(str, enc);
		if (from < 0) {
			/* Negative start position: count from the end of the string */
			from += str_len;
		}
		if (from < 0 || from > str_len) {
			zend_argument_value_error(2, "is out of range");
			RETURN_THROWS();
		}
	}

	if (width < 0) {
		php_error_docref(NULL, E_DEPRECATED,
			"passing a negative integer to argument #3 ($width) is deprecated");
		/* Make the width relative to the display width of the whole string... */
		width += mb_get_strwidth(str, enc);

		if (from > 0) {
			/* ...minus the display width of the skipped prefix */
			zend_string *trimmed = mb_get_substr(str, 0, from, enc);
			width -= mb_get_strwidth(trimmed, enc);
			zend_string_free(trimmed);
		}

		if (width < 0) {
			zend_argument_value_error(3, "is out of range");
			RETURN_THROWS();
		}
	}

	RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
}
2697
2698
2699 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2700 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2701 {
2702 return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2703 || (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2704 || (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2705 || (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2706 }
2707
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2708 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2709 {
2710 unsigned int num_errors = 0;
2711 zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2712 MBSTRG(illegalchars) += num_errors;
2713 return result;
2714 }
2715
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2716 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2717 {
2718 const mbfl_encoding *from_encoding;
2719
2720 /* pre-conversion encoding */
2721 ZEND_ASSERT(num_from_encodings >= 1);
2722 if (num_from_encodings == 1) {
2723 from_encoding = *from_encodings;
2724 } else {
2725 /* auto detect */
2726 from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true);
2727 if (!from_encoding) {
2728 php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2729 return NULL;
2730 }
2731 }
2732
2733 return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2734 }
2735
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2736 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2737 {
2738 HashTable *output, *chash;
2739 zend_long idx;
2740 zend_string *key;
2741 zval *entry, entry_tmp;
2742
2743 if (!input) {
2744 return NULL;
2745 }
2746
2747 if (GC_IS_RECURSIVE(input)) {
2748 GC_UNPROTECT_RECURSION(input);
2749 php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2750 return NULL;
2751 }
2752 GC_TRY_PROTECT_RECURSION(input);
2753 output = zend_new_array(zend_hash_num_elements(input));
2754 ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2755 /* convert key */
2756 if (key) {
2757 zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2758 if (!converted_key) {
2759 continue;
2760 }
2761 key = converted_key;
2762 }
2763 /* convert value */
2764 ZEND_ASSERT(entry);
2765 try_again:
2766 switch(Z_TYPE_P(entry)) {
2767 case IS_STRING: {
2768 zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2769 if (!converted_key) {
2770 if (key) {
2771 zend_string_release(key);
2772 }
2773 continue;
2774 }
2775 ZVAL_STR(&entry_tmp, converted_key);
2776 break;
2777 }
2778 case IS_NULL:
2779 case IS_TRUE:
2780 case IS_FALSE:
2781 case IS_LONG:
2782 case IS_DOUBLE:
2783 ZVAL_COPY(&entry_tmp, entry);
2784 break;
2785 case IS_ARRAY:
2786 chash = php_mb_convert_encoding_recursive(
2787 Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2788 if (chash) {
2789 ZVAL_ARR(&entry_tmp, chash);
2790 } else {
2791 ZVAL_EMPTY_ARRAY(&entry_tmp);
2792 }
2793 break;
2794 case IS_REFERENCE:
2795 entry = Z_REFVAL_P(entry);
2796 goto try_again;
2797 case IS_OBJECT:
2798 default:
2799 if (key) {
2800 zend_string_release(key);
2801 }
2802 php_error_docref(NULL, E_WARNING, "Object is not supported");
2803 continue;
2804 }
2805 if (key) {
2806 zend_hash_add(output, key, &entry_tmp);
2807 zend_string_release(key);
2808 } else {
2809 zend_hash_index_add(output, idx, &entry_tmp);
2810 }
2811 } ZEND_HASH_FOREACH_END();
2812 GC_TRY_UNPROTECT_RECURSION(input);
2813
2814 return output;
2815 }
2816 /* }}} */
2817
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2818 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2819 {
2820 /* mbstring supports some 'text encodings' which aren't really text encodings
2821 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2822 * These should never be returned by `mb_detect_encoding`. */
2823 unsigned int shift = 0;
2824 for (unsigned int i = 0; i < *size; i++) {
2825 const mbfl_encoding *encoding = elist[i];
2826 if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2827 shift++; /* Remove this encoding from the list */
2828 } else if (shift) {
2829 elist[i - shift] = encoding;
2830 }
2831 }
2832 *size -= shift;
2833 }
2834
2835 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2836 PHP_FUNCTION(mb_convert_encoding)
2837 {
2838 zend_string *to_encoding_name;
2839 zend_string *input_str, *from_encodings_str = NULL;
2840 HashTable *input_ht, *from_encodings_ht = NULL;
2841 const mbfl_encoding **from_encodings;
2842 size_t num_from_encodings;
2843 bool free_from_encodings = false;
2844
2845 ZEND_PARSE_PARAMETERS_START(2, 3)
2846 Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2847 Z_PARAM_STR(to_encoding_name)
2848 Z_PARAM_OPTIONAL
2849 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2850 ZEND_PARSE_PARAMETERS_END();
2851
2852 const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2853 if (!to_encoding) {
2854 RETURN_THROWS();
2855 }
2856
2857 if (from_encodings_ht) {
2858 if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2859 RETURN_THROWS();
2860 }
2861 free_from_encodings = true;
2862 } else if (from_encodings_str) {
2863 if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2864 &from_encodings, &num_from_encodings,
2865 /* persistent */ 0, /* arg_num */ 3) == FAILURE) {
2866 RETURN_THROWS();
2867 }
2868 free_from_encodings = true;
2869 } else {
2870 from_encodings = &MBSTRG(current_internal_encoding);
2871 num_from_encodings = 1;
2872 }
2873
2874 if (num_from_encodings > 1) {
2875 remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2876 }
2877
2878 if (!num_from_encodings) {
2879 efree(ZEND_VOIDP(from_encodings));
2880 zend_argument_value_error(3, "must specify at least one encoding");
2881 RETURN_THROWS();
2882 }
2883
2884 if (input_str) {
2885 zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2886 if (ret != NULL) {
2887 RETVAL_STR(ret);
2888 } else {
2889 RETVAL_FALSE;
2890 }
2891 } else {
2892 HashTable *tmp;
2893 tmp = php_mb_convert_encoding_recursive(
2894 input_ht, to_encoding, from_encodings, num_from_encodings);
2895 RETVAL_ARR(tmp);
2896 }
2897
2898 if (free_from_encodings) {
2899 efree(ZEND_VOIDP(from_encodings));
2900 }
2901 }
2902 /* }}} */
2903
mbstring_convert_case(php_case_mode case_mode,const char * str,size_t str_len,const mbfl_encoding * enc)2904 static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
2905 {
2906 return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2907 }
2908
PHP_FUNCTION(mb_convert_case)2909 PHP_FUNCTION(mb_convert_case)
2910 {
2911 zend_string *str, *from_encoding = NULL;
2912 zend_long case_mode = 0;
2913
2914 ZEND_PARSE_PARAMETERS_START(2, 3)
2915 Z_PARAM_STR(str)
2916 Z_PARAM_LONG(case_mode)
2917 Z_PARAM_OPTIONAL
2918 Z_PARAM_STR_OR_NULL(from_encoding)
2919 ZEND_PARSE_PARAMETERS_END();
2920
2921 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2922 if (!enc) {
2923 RETURN_THROWS();
2924 }
2925
2926 if (case_mode < 0 || case_mode >= PHP_UNICODE_CASE_MODE_MAX) {
2927 zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2928 RETURN_THROWS();
2929 }
2930
2931 RETURN_STR(mbstring_convert_case(case_mode, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2932 }
2933
PHP_FUNCTION(mb_strtoupper)2934 PHP_FUNCTION(mb_strtoupper)
2935 {
2936 zend_string *str, *from_encoding = NULL;
2937
2938 ZEND_PARSE_PARAMETERS_START(1, 2)
2939 Z_PARAM_STR(str)
2940 Z_PARAM_OPTIONAL
2941 Z_PARAM_STR_OR_NULL(from_encoding)
2942 ZEND_PARSE_PARAMETERS_END();
2943
2944 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2945 if (!enc) {
2946 RETURN_THROWS();
2947 }
2948
2949 RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_UPPER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2950 }
2951
PHP_FUNCTION(mb_strtolower)2952 PHP_FUNCTION(mb_strtolower)
2953 {
2954 zend_string *str, *from_encoding = NULL;
2955
2956 ZEND_PARSE_PARAMETERS_START(1, 2)
2957 Z_PARAM_STR(str)
2958 Z_PARAM_OPTIONAL
2959 Z_PARAM_STR_OR_NULL(from_encoding)
2960 ZEND_PARSE_PARAMETERS_END();
2961
2962 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2963 if (!enc) {
2964 RETURN_THROWS();
2965 }
2966
2967 RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2968 }
2969
/* Shared implementation of mb_ucfirst() and mb_lcfirst().
 * Case-converts the first character of the string argument using `mode` and
 * returns it followed by the untouched remainder. When the conversion leaves
 * the first character unchanged, the original string itself is returned. */
static void php_mb_ulcfirst(INTERNAL_FUNCTION_PARAMETERS, php_case_mode mode)
{
	zend_string *str, *encoding_name = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 2)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR_OR_NULL(encoding_name)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding_name, 2);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	zend_string *lead = mb_get_substr(str, 0, 1, enc);
	zend_string *converted_lead = mbstring_convert_case(mode, ZSTR_VAL(lead), ZSTR_LEN(lead), enc);
	const bool unchanged = zend_string_equals(lead, converted_lead);
	zend_string_release_ex(lead, false);

	if (unchanged) {
		/* Nothing to rebuild; hand back another reference to the input */
		zend_string_release_ex(converted_lead, false);
		RETURN_STR(zend_string_copy(str));
	}

	zend_string *rest = mb_get_substr(str, 1, MBFL_SUBSTR_UNTIL_END, enc);
	zend_string *joined = zend_string_concat2(
		ZSTR_VAL(converted_lead), ZSTR_LEN(converted_lead),
		ZSTR_VAL(rest), ZSTR_LEN(rest));

	zend_string_release_ex(converted_lead, false);
	zend_string_release_ex(rest, false);

	RETVAL_STR(joined);
}
3003
PHP_FUNCTION(mb_ucfirst)3004 PHP_FUNCTION(mb_ucfirst)
3005 {
3006 php_mb_ulcfirst(INTERNAL_FUNCTION_PARAM_PASSTHRU, PHP_UNICODE_CASE_TITLE);
3007 }
3008
PHP_FUNCTION(mb_lcfirst)3009 PHP_FUNCTION(mb_lcfirst)
3010 {
3011 php_mb_ulcfirst(INTERNAL_FUNCTION_PARAM_PASSTHRU, PHP_UNICODE_CASE_LOWER);
3012 }
3013
/* Which end(s) of the string to trim. The values are bit flags which are
 * tested with `&`, so "both" is the union of the left and right flags. */
typedef enum {
	MB_LTRIM = 1,
	MB_RTRIM = 2,
	MB_BOTH_TRIM = MB_LTRIM | MB_RTRIM
} mb_trim_mode;
3019
/* Is codepoint `w` one of the characters to trim?
 * If `ht` is given, `w` is looked up as an integer key in it; otherwise the
 * `default_chars` array of `default_chars_length` codepoints is searched. */
static bool is_trim_wchar(uint32_t w, const HashTable *ht, const uint32_t *default_chars, size_t default_chars_length)
{
	if (ht != NULL) {
		return zend_hash_index_exists(ht, w);
	}

	size_t remaining = default_chars_length;
	while (remaining--) {
		if (default_chars[remaining] == w) {
			return true;
		}
	}
	return false;
}
3033
/* Decode `str` in chunks of up to 128 codepoints, counting how many leading
 * and/or trailing codepoints (depending on `mode`) are trim characters
 * according to is_trim_wchar(). Returns `str` itself (with an added
 * reference) when nothing is to be trimmed, otherwise the matching substring
 * from mb_get_substr(). */
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, const uint32_t *default_chars, size_t default_chars_length, mb_trim_mode mode, const mbfl_encoding *enc)
{
	uint32_t wchar_buf[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(str);
	size_t bytes_left = ZSTR_LEN(str);
	unsigned int state = 0;

	size_t leading = 0;       /* trim characters at the start of the string */
	size_t trailing = 0;      /* trim characters in the current run at the end */
	size_t decoded_total = 0; /* codepoints decoded so far */

	/* True until the first non-trim character is seen (left trimming only) */
	bool in_leading_run = (mode & MB_LTRIM) != 0;
	const bool count_trailing = (mode & MB_RTRIM) != 0;

	while (bytes_left) {
		size_t decoded = enc->to_wchar(&cursor, &bytes_left, wchar_buf, 128, &state);
		ZEND_ASSERT(decoded <= 128);
		decoded_total += decoded;

		for (size_t i = 0; i < decoded; i++) {
			if (!is_trim_wchar(wchar_buf[i], what_ht, default_chars, default_chars_length)) {
				/* A kept character ends the leading run and resets the trailing run */
				in_leading_run = false;
				trailing = 0;
				continue;
			}
			if (in_leading_run) {
				leading += 1;
			}
			if (count_trailing) {
				trailing += 1;
			}
		}
	}

	if (leading == 0 && trailing == 0) {
		return zend_string_copy(str);
	}
	return mb_get_substr(str, leading, decoded_total - (trailing + leading), enc);
}
3073
/* Trim `str` using the built-in list of trim codepoints below.
 * The list is loaded into a temporary hash table keyed by codepoint, which is
 * handed to trim_each_wchar() and destroyed afterwards. */
static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
{
	const uint32_t default_trim_set[] = {
		0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
		0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
		0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
		0x85, 0x180E
	};
	const size_t default_trim_count = sizeof(default_trim_set) / sizeof(default_trim_set[0]);

	HashTable trim_set;
	zval marker;

	/* The stored value is irrelevant; only key presence is tested */
	ZVAL_TRUE(&marker);
	zend_hash_init(&trim_set, default_trim_count, NULL, NULL, false);

	const uint32_t *codepoint = default_trim_set;
	for (size_t remaining = default_trim_count; remaining > 0; remaining--, codepoint++) {
		zend_hash_index_add_new(&trim_set, *codepoint, &marker);
	}

	zend_string *trimmed = trim_each_wchar(str, &trim_set, NULL, 0, mode, enc);
	zend_hash_destroy(&trim_set);

	return trimmed;
}
3098
/* Trim `str` using the characters contained in `what` (decoded with `enc`).
 *
 * If the first decoded chunk of `what` holds at most 4 codepoints, those are
 * passed to trim_each_wchar() directly as a small array. Otherwise all decoded
 * codepoints are collected into a temporary hash table keyed by codepoint.
 * An empty `what` returns `str` unchanged (with an added reference). */
static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
{
	uint32_t decoded_buf[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(what);
	size_t bytes_left = ZSTR_LEN(what);
	unsigned int state = 0;
	HashTable trim_set;
	zval marker;
	bool have_trim_set = false;

	while (bytes_left) {
		size_t decoded = enc->to_wchar(&cursor, &bytes_left, decoded_buf, 128, &state);
		ZEND_ASSERT(decoded <= 128);

		if (!have_trim_set) {
			if (decoded <= 4) {
				/* Few characters: a linear scan of the buffer is used instead of a hash table */
				return trim_each_wchar(str, NULL, decoded_buf, decoded, mode, enc);
			}
			have_trim_set = true;
			ZVAL_TRUE(&marker);
			zend_hash_init(&trim_set, bytes_left, NULL, NULL, false);
		}

		for (size_t i = 0; i < decoded; i++) {
			zend_hash_index_add(&trim_set, decoded_buf[i], &marker);
		}
	}

	if (UNEXPECTED(!have_trim_set)) {
		/* This is only possible if what is empty */
		return zend_string_copy(str);
	}

	zend_string *trimmed = trim_each_wchar(str, &trim_set, NULL, 0, mode, enc);
	zend_hash_destroy(&trim_set);

	return trimmed;
}
3138
/* Shared implementation of mb_trim(), mb_ltrim() and mb_rtrim().
 * Arguments: string [, characters-to-trim [, encoding]]. When the second
 * argument is omitted or null, the built-in trim character list is used. */
static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
{
	zend_string *str;
	zend_string *what = NULL;
	zend_string *encoding = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 3)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR_OR_NULL(what)
		Z_PARAM_STR_OR_NULL(encoding)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	zend_string *trimmed = (what != NULL)
		? mb_trim_what_chars(str, what, mode, enc)
		: mb_trim_default_chars(str, mode, enc);

	RETURN_STR(trimmed);
}
3163
PHP_FUNCTION(mb_trim)3164 PHP_FUNCTION(mb_trim)
3165 {
3166 php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_BOTH_TRIM);
3167 }
3168
PHP_FUNCTION(mb_ltrim)3169 PHP_FUNCTION(mb_ltrim)
3170 {
3171 php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_LTRIM);
3172 }
3173
PHP_FUNCTION(mb_rtrim)3174 PHP_FUNCTION(mb_rtrim)
3175 {
3176 php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
3177 }
3178
duplicate_elist(const mbfl_encoding ** elist,size_t size)3179 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
3180 {
3181 const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
3182 memcpy(ZEND_VOIDP(new_elist), elist, size * sizeof(mbfl_encoding*));
3183 return new_elist;
3184 }
3185
estimate_demerits(uint32_t w)3186 static unsigned int estimate_demerits(uint32_t w)
3187 {
3188 /* Receive wchars decoded from input string using candidate encoding.
3189 * Give the candidate many 'demerits' for each 'rare' codepoint found,
3190 * a smaller number for each ASCII punctuation character, and 1 for
3191 * all other codepoints.
3192 *
3193 * The 'common' codepoints should cover the vast majority of
3194 * codepoints we are likely to see in practice, while only covering
3195 * a small minority of the entire Unicode encoding space. Why?
3196 * Well, if the test string happens to be valid in an incorrect
3197 * candidate encoding, the bogus codepoints which it decodes to will
3198 * be more or less random. By treating the majority of codepoints as
3199 * 'rare', we ensure that in almost all such cases, the bogus
3200 * codepoints will include plenty of 'rares', thus giving the
3201 * incorrect candidate encoding lots of demerits. See
3202 * common_codepoints.txt for the actual list used.
3203 *
3204 * So, why give extra demerits for ASCII punctuation characters? It's
3205 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
3206 * which deliberately only use bytes in the ASCII range. When
3207 * misinterpreted as ASCII/UTF-8, strings in these encodings will
3208 * have an unusually high number of ASCII punctuation characters.
3209 * So giving extra demerits for such characters will improve
3210 * detection accuracy for UTF-7 and similar encodings.
3211 *
3212 * Finally, why 1 demerit for all other characters? That penalizes
3213 * long strings, meaning we will tend to choose a candidate encoding
3214 * in which the test string decodes to a smaller number of
3215 * codepoints. That prevents single-byte encodings in which almost
3216 * every possible input byte decodes to a 'common' codepoint from
3217 * being favored too much. */
3218 if (w > 0xFFFF) {
3219 return 40;
3220 } else if (w >= 0x21 && w <= 0x2F) {
3221 return 6;
3222 } else if ((rare_codepoint_bitvec[w >> 5] >> (w & 0x1F)) & 1) {
3223 return 30;
3224 } else {
3225 return 1;
3226 }
3227 return 0;
3228 }
3229
/* Per-encoding bookkeeping used while guessing the encoding of one or more
 * strings (see init_candidate_array, start_string and count_demerits). */
struct candidate {
	const mbfl_encoding *enc;   /* The candidate encoding itself */
	const unsigned char *in;    /* Input not yet decoded with `enc`; advanced by enc->to_wchar */
	size_t in_len;              /* Number of bytes remaining at `in` */
	uint64_t demerits; /* Wide bit size to prevent overflow */
	unsigned int state;         /* Decoder state handed to enc->to_wchar between calls */
	float multiplier;           /* Weight applied to `demerits` at the end of count_demerits */
};
3238
init_candidate_array(struct candidate * array,size_t length,const mbfl_encoding ** encodings,const unsigned char ** in,size_t * in_len,size_t n,bool strict,bool order_significant)3239 static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant)
3240 {
3241 size_t j = 0;
3242
3243 for (size_t i = 0; i < length; i++) {
3244 const mbfl_encoding *enc = encodings[i];
3245
3246 array[j].enc = enc;
3247 array[j].state = 0;
3248 array[j].demerits = 0;
3249
3250 /* If any candidate encodings have specialized validation functions, use them
3251 * to eliminate as many candidates as possible */
3252 if (enc->check != NULL) {
3253 for (size_t k = 0; k < n; k++) {
3254 if (!enc->check((unsigned char*)in[k], in_len[k])) {
3255 if (strict) {
3256 goto skip_to_next;
3257 } else {
3258 array[j].demerits += 500;
3259 }
3260 }
3261 }
3262 }
3263
3264 /* This multiplier can optionally be used to make candidate encodings listed
3265 * first more likely to be chosen. It is a weight factor which multiplies
3266 * the number of demerits counted for each candidate. */
3267 array[j].multiplier = order_significant ? 1.0 + ((0.3 * i) / length) : 1.0;
3268 j++;
3269 skip_to_next: ;
3270 }
3271
3272 return j;
3273 }
3274
start_string(struct candidate * array,size_t length,const unsigned char * in,size_t in_len)3275 static void start_string(struct candidate *array, size_t length, const unsigned char *in, size_t in_len)
3276 {
3277 for (size_t i = 0; i < length; i++) {
3278 const mbfl_encoding *enc = array[i].enc;
3279
3280 array[i].in = in;
3281 array[i].in_len = in_len;
3282
3283 /* Skip byte order mark for UTF-8, UTF-16BE, or UTF-16LE */
3284 if (enc == &mbfl_encoding_utf8) {
3285 if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
3286 array[i].in_len -= 3;
3287 array[i].in += 3;
3288 }
3289 } else if (enc == &mbfl_encoding_utf16be) {
3290 if (in_len >= 2 && in[0] == 0xFE && in[1] == 0xFF) {
3291 array[i].in_len -= 2;
3292 array[i].in += 2;
3293 }
3294 } else if (enc == &mbfl_encoding_utf16le) {
3295 if (in_len >= 2 && in[0] == 0xFF && in[1] == 0xFE) {
3296 array[i].in_len -= 2;
3297 array[i].in += 2;
3298 }
3299 }
3300 }
3301 }
3302
/* Decode the current input string with every candidate in `array`, in chunks
 * of up to 128 codepoints, adding demerits to each candidate as it goes.
 *
 * In strict mode, a candidate which yields MBFL_BAD_INPUT is removed from
 * `array`; in non-strict mode it receives 1000 demerits per bad codepoint.
 * In non-strict mode decoding stops as soon as only one candidate is left.
 * Finally each candidate's demerits are scaled by its multiplier.
 *
 * Returns the number of candidates remaining in `array`. */
static size_t count_demerits(struct candidate *array, size_t length, bool strict)
{
	uint32_t wchar_buf[128];
	unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */

	/* Candidates with no input at all are finished from the start */
	for (size_t i = 0; i < length; i++) {
		if (array[i].in_len == 0) {
			finished++;
		}
	}

	while ((strict || length > 1) && finished < length) {
		/* Iterate in reverse order to avoid moving candidates that can be eliminated. */
		for (size_t i = length - 1; i != (size_t)-1; i--) {
			/* Do we still have more input to process for this candidate encoding? */
			if (array[i].in_len) {
				const mbfl_encoding *enc = array[i].enc;
				size_t out_len = enc->to_wchar((unsigned char**)&array[i].in, &array[i].in_len, wchar_buf, 128, &array[i].state);
				ZEND_ASSERT(out_len <= 128);
				/* Check this batch of decoded codepoints; are there any error markers?
				 * Also sum up the number of demerits */
				while (out_len) {
					uint32_t w = wchar_buf[--out_len];
					if (w == MBFL_BAD_INPUT) {
						if (strict) {
							/* This candidate encoding is not valid, eliminate it from consideration */
							length--;
							if (i < length) {
								/* The eliminated candidate was not the last one in the list;
								 * shift the candidates after it down to close the gap. Those were
								 * already visited in this pass, since we iterate in reverse. */
								memmove(&array[i], &array[i+1], (length - i) * sizeof(struct candidate));
							}
							goto try_next_encoding;
						} else {
							array[i].demerits += 1000;
						}
					} else {
						array[i].demerits += estimate_demerits(w);
					}
				}
				if (array[i].in_len == 0) {
					finished++;
				}
			}
try_next_encoding:;
		}
	}

	/* Apply the per-candidate weight (see init_candidate_array) */
	for (size_t i = 0; i < length; i++) {
		array[i].demerits *= array[i].multiplier;
	}

	return length;
}
3356
mb_guess_encoding_for_strings(const unsigned char ** strings,size_t * str_lengths,size_t n,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3357 MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3358 {
3359 if (elist_size == 0) {
3360 return NULL;
3361 }
3362 if (elist_size == 1) {
3363 if (strict) {
3364 while (n--) {
3365 if (!php_mb_check_encoding((const char*)strings[n], str_lengths[n], *elist)) {
3366 return NULL;
3367 }
3368 }
3369 }
3370 return *elist;
3371 }
3372 if (n == 1 && *str_lengths == 0) {
3373 return *elist;
3374 }
3375
3376 /* Allocate on stack; when we return, this array is automatically freed */
3377 struct candidate *array = alloca(elist_size * sizeof(struct candidate));
3378 elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant);
3379
3380 while (n--) {
3381 start_string(array, elist_size, strings[n], str_lengths[n]);
3382 elist_size = count_demerits(array, elist_size, strict);
3383 if (elist_size == 0) {
3384 /* All candidates were eliminated */
3385 return NULL;
3386 }
3387 }
3388
3389 /* See which remaining candidate encoding has the least demerits */
3390 unsigned int best = 0;
3391 for (unsigned int i = 1; i < elist_size; i++) {
3392 if (array[i].demerits < array[best].demerits) {
3393 best = i;
3394 }
3395 }
3396 return array[best].enc;
3397 }
3398
3399 /* When doing 'strict' detection, any string which is invalid in the candidate encoding
3400 * is rejected. With non-strict detection, we just continue, but apply demerits for
3401 * each invalid byte sequence */
mb_guess_encoding(unsigned char * in,size_t in_len,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3402 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3403 {
3404 return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant);
3405 }
3406
3407 /* {{{ Encodings of the given string is returned (as a string) */
PHP_FUNCTION(mb_detect_encoding)3408 PHP_FUNCTION(mb_detect_encoding)
3409 {
3410 zend_string *str, *encoding_str = NULL;
3411 HashTable *encoding_ht = NULL;
3412 bool strict = false;
3413 const mbfl_encoding *ret, **elist;
3414 size_t size;
3415
3416 ZEND_PARSE_PARAMETERS_START(1, 3)
3417 Z_PARAM_STR(str)
3418 Z_PARAM_OPTIONAL
3419 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
3420 Z_PARAM_BOOL(strict)
3421 ZEND_PARSE_PARAMETERS_END();
3422
3423 /* Should we pay attention to the order of the provided candidate encodings and prefer
3424 * the earlier ones (if more than one candidate encoding matches)?
3425 * If the entire list of supported encodings returned by `mb_list_encodings` is passed
3426 * in, then don't treat the order as significant */
3427 bool order_significant = true;
3428
3429 /* make encoding list */
3430 if (encoding_ht) {
3431 if (encoding_ht == MBSTRG(all_encodings_list)) {
3432 order_significant = false;
3433 }
3434 if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
3435 RETURN_THROWS();
3436 }
3437 } else if (encoding_str) {
3438 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2)) {
3439 RETURN_THROWS();
3440 }
3441 } else {
3442 elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
3443 size = MBSTRG(current_detect_order_list_size);
3444 }
3445
3446 if (size == 0) {
3447 efree(ZEND_VOIDP(elist));
3448 zend_argument_value_error(2, "must specify at least one encoding");
3449 RETURN_THROWS();
3450 }
3451
3452 remove_non_encodings_from_elist(elist, &size);
3453 if (size == 0) {
3454 efree(ZEND_VOIDP(elist));
3455 RETURN_FALSE;
3456 }
3457
3458 if (ZEND_NUM_ARGS() < 3) {
3459 strict = MBSTRG(strict_detection);
3460 }
3461
3462 if (size == 1 && *elist == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
3463 ret = &mbfl_encoding_utf8;
3464 } else {
3465 ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant);
3466 }
3467
3468 efree(ZEND_VOIDP(elist));
3469
3470 if (ret == NULL) {
3471 RETURN_FALSE;
3472 }
3473
3474 RETVAL_STRING((char *)ret->name);
3475 }
3476 /* }}} */
3477
3478 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)3479 PHP_FUNCTION(mb_list_encodings)
3480 {
3481 ZEND_PARSE_PARAMETERS_NONE();
3482
3483 if (MBSTRG(all_encodings_list) == NULL) {
3484 /* Initialize shared array of supported encoding names
3485 * This is done so that we can check if `mb_list_encodings()` is being
3486 * passed to other mbstring functions using a cheap pointer equality check */
3487 HashTable *array = emalloc(sizeof(HashTable));
3488 zend_hash_init(array, 80, NULL, zval_ptr_dtor_str, false);
3489 for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
3490 zval tmp;
3491 ZVAL_STRING(&tmp, (*encodings)->name);
3492 zend_hash_next_index_insert(array, &tmp);
3493 }
3494 MBSTRG(all_encodings_list) = array;
3495 }
3496
3497 GC_ADDREF(MBSTRG(all_encodings_list));
3498 RETURN_ARR(MBSTRG(all_encodings_list));
3499 }
3500 /* }}} */
3501
3502 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)3503 PHP_FUNCTION(mb_encoding_aliases)
3504 {
3505 const mbfl_encoding *encoding;
3506 zend_string *encoding_name = NULL;
3507
3508 ZEND_PARSE_PARAMETERS_START(1, 1)
3509 Z_PARAM_STR(encoding_name)
3510 ZEND_PARSE_PARAMETERS_END();
3511
3512 encoding = php_mb_get_encoding(encoding_name, 1);
3513 if (!encoding) {
3514 RETURN_THROWS();
3515 }
3516
3517 array_init(return_value);
3518 if (encoding->aliases != NULL) {
3519 for (const char **alias = encoding->aliases; *alias; ++alias) {
3520 add_next_index_string(return_value, (char *)*alias);
3521 }
3522 }
3523 }
3524 /* }}} */
3525
/* Apply the kana conversion selected by the `mode` bit vector to `input`,
 * which is both decoded from and re-encoded to `encoding`.
 *
 * The input is decoded in chunks of up to 64 codepoints. Each codepoint is
 * passed to mb_convert_kana_codepoint() together with the codepoint which
 * follows it, so the last codepoint of a chunk is held back and reprocessed at
 * the start of the next chunk, unless it was already consumed together with
 * its predecessor. Returns the converted string. */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion...
	 * if we are converting zenkaku kana to hankaku kana
	 * Make the buffer for converted kana big enough that we never need to
	 * perform bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	unsigned int buf_offset = 0; /* 1 if wchar_buf[0] holds a codepoint carried over from the previous chunk */
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		uint32_t *converted = converted_buf;
		/* If one codepoint has been left in wchar_buf[0] to be reprocessed from the
		 * previous iteration, don't overwrite it */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + buf_offset, 64 - buf_offset, &state);
		out_len += buf_offset;
		ZEND_ASSERT(out_len <= 64);

		if (!out_len) {
			continue;
		}

		/* Convert every codepoint except the last one, which has no successor yet */
		for (size_t i = 0; i < out_len-1; i++) {
			uint32_t second = 0;
			bool consumed = false;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
			if (second) {
				*converted++ = second;
			}
			if (consumed) {
				/* The following codepoint was used up as well; skip it */
				i++;
				if (i == out_len-1) {
					/* We consumed two codepoints at the very end of the wchar buffer
					 * So there is nothing remaining to reprocess on the next iteration */
					buf_offset = 0;
					goto emit_converted_kana;
				}
			}
		}

		if (!in_len) {
			/* This is the last iteration, so we need to process the final codepoint now */
			uint32_t second = 0;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[out_len-1], 0, NULL, &second, mode);
			if (second) {
				*converted++ = second;
			}
		} else {
			/* Reprocess the last codepoint on the next iteration */
			wchar_buf[0] = wchar_buf[out_len-1];
			buf_offset = 1;
		}

emit_converted_kana:
		/* The last argument is true only once all input has been read */
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3590
/* Option letters accepted by mb_convert_kana(). The position of a letter in
 * this table is the bit which it sets in the option bit vector (letter at
 * index i sets bit 1 << i). Indexes 0-7 hold uppercase letters and indexes
 * 8-15 hold the matching lowercase letters; 'V' is at index 16. */
char mb_convert_kana_flags[17] = {
	'A', 'R', 'N', 'S', 'K', 'H', 'M', 'C',
	'a', 'r', 'n', 's', 'k', 'h', 'm', 'c',
	'V'
};
3596
3597 /* Conversion between full-width characters and half-width characters (Japanese) */
PHP_FUNCTION(mb_convert_kana)3598 PHP_FUNCTION(mb_convert_kana)
3599 {
3600 unsigned int opt;
3601 char *optstr = NULL;
3602 size_t optstr_len;
3603 zend_string *encname = NULL, *str;
3604
3605 ZEND_PARSE_PARAMETERS_START(1, 3)
3606 Z_PARAM_STR(str)
3607 Z_PARAM_OPTIONAL
3608 Z_PARAM_STRING(optstr, optstr_len)
3609 Z_PARAM_STR_OR_NULL(encname)
3610 ZEND_PARSE_PARAMETERS_END();
3611
3612 if (optstr != NULL) {
3613 char *p = optstr, *e = p + optstr_len;
3614 opt = 0;
3615 next_option:
3616 while (p < e) {
3617 /* Walk through option string and convert to bit vector
3618 * See translit_kana_jisx0201_jisx0208.h for the values used */
3619 char c = *p++;
3620 if (c == 'A') {
3621 opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
3622 } else if (c == 'a') {
3623 opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
3624 } else {
3625 for (size_t i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
3626 if (c == mb_convert_kana_flags[i]) {
3627 opt |= (1 << i);
3628 goto next_option;
3629 }
3630 }
3631
3632 zend_argument_value_error(2, "contains invalid flag: '%c'", c);
3633 RETURN_THROWS();
3634 }
3635 }
3636
3637 /* Check for illegal combinations of options */
3638 if (((opt & 0xFF00) >> 8) & opt) {
3639 /* It doesn't make sense to convert the same type of characters from halfwidth to
3640 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
3641 * FW hiragana to FW katakana and then back again. */
3642 int badflag = ((opt & 0xFF00) >> 8) & opt, i;
3643 for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
3644 char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
3645 if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
3646 flag1 = 'A';
3647 if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
3648 flag2 = 'a';
3649 zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
3650 RETURN_THROWS();
3651 }
3652
3653 if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
3654 /* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
3655 zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
3656 RETURN_THROWS();
3657 }
3658
3659 /* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
3660 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
3661 * more than one of these */
3662 if (opt & MBFL_ZEN2HAN_HIRAGANA) {
3663 if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3664 zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
3665 RETURN_THROWS();
3666 } else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3667 zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
3668 RETURN_THROWS();
3669 }
3670 } else if (opt & MBFL_ZEN2HAN_KATAKANA) {
3671 if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3672 zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
3673 RETURN_THROWS();
3674 } else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3675 zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
3676 RETURN_THROWS();
3677 }
3678 }
3679 } else {
3680 opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
3681 }
3682
3683 const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
3684 if (!enc) {
3685 RETURN_THROWS();
3686 }
3687
3688 RETVAL_STR(jp_kana_convert(str, enc, opt));
3689 }
3690
/* Count the string values reachable from `var`, descending into arrays and objects.
 * A container which is already being visited (recursion guard set) contributes nothing. */
static unsigned int mb_recursive_count_strings(zval *var)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		return 1;
	}
	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Scalars other than strings are not counted */
		return 0;
	}

	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	unsigned int total = 0;
	HashTable *members = HASH_OF(var);
	if (members != NULL) {
		zval *member;
		ZEND_HASH_FOREACH_VAL_IND(members, member) {
			total += mb_recursive_count_strings(member);
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return total;
}
3721
mb_recursive_find_strings(zval * var,const unsigned char ** val_list,size_t * len_list,unsigned int * count)3722 static bool mb_recursive_find_strings(zval *var, const unsigned char **val_list, size_t *len_list, unsigned int *count)
3723 {
3724 ZVAL_DEREF(var);
3725
3726 if (Z_TYPE_P(var) == IS_STRING) {
3727 val_list[*count] = (const unsigned char*)Z_STRVAL_P(var);
3728 len_list[*count] = Z_STRLEN_P(var);
3729 (*count)++;
3730 } else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
3731 if (Z_REFCOUNTED_P(var)) {
3732 if (Z_IS_RECURSIVE_P(var)) {
3733 return true;
3734 }
3735 Z_PROTECT_RECURSION_P(var);
3736 }
3737
3738 HashTable *ht = HASH_OF(var);
3739 if (ht != NULL) {
3740 zval *entry;
3741 ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
3742 if (mb_recursive_find_strings(entry, val_list, len_list, count)) {
3743 if (Z_REFCOUNTED_P(var)) {
3744 Z_UNPROTECT_RECURSION_P(var);
3745 return true;
3746 }
3747 }
3748 } ZEND_HASH_FOREACH_END();
3749 }
3750
3751 if (Z_REFCOUNTED_P(var)) {
3752 Z_UNPROTECT_RECURSION_P(var);
3753 }
3754 }
3755
3756 return false;
3757 }
3758
mb_recursive_convert_variable(zval * var,const mbfl_encoding * from_encoding,const mbfl_encoding * to_encoding)3759 static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
3760 {
3761 zval *entry, *orig_var;
3762
3763 orig_var = var;
3764 ZVAL_DEREF(var);
3765
3766 if (Z_TYPE_P(var) == IS_STRING) {
3767 zend_string *ret = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
3768 zval_ptr_dtor(orig_var);
3769 ZVAL_STR(orig_var, ret);
3770 } else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
3771 if (Z_TYPE_P(var) == IS_ARRAY) {
3772 SEPARATE_ARRAY(var);
3773 }
3774 if (Z_REFCOUNTED_P(var)) {
3775 if (Z_IS_RECURSIVE_P(var)) {
3776 return true;
3777 }
3778 Z_PROTECT_RECURSION_P(var);
3779 }
3780
3781 HashTable *ht = HASH_OF(var);
3782 if (ht != NULL) {
3783 ZEND_HASH_FOREACH_VAL(ht, entry) {
3784 /* Can be a typed property declaration, in which case we need to remove the reference from the source list.
3785 * Just using ZEND_TRY_ASSIGN_STRINGL is not sufficient because that would not unwrap the reference
3786 * and change values through references (see bug #26639). */
3787 if (Z_TYPE_P(entry) == IS_INDIRECT) {
3788 ZEND_ASSERT(Z_TYPE_P(var) == IS_OBJECT);
3789
3790 entry = Z_INDIRECT_P(entry);
3791 if (Z_ISREF_P(entry) && Z_TYPE_P(Z_REFVAL_P(entry)) == IS_STRING) {
3792 zend_property_info *info = zend_get_typed_property_info_for_slot(Z_OBJ_P(var), entry);
3793 if (info) {
3794 ZEND_REF_DEL_TYPE_SOURCE(Z_REF_P(entry), info);
3795 }
3796 }
3797 }
3798
3799 if (mb_recursive_convert_variable(entry, from_encoding, to_encoding)) {
3800 if (Z_REFCOUNTED_P(var)) {
3801 Z_UNPROTECT_RECURSION_P(var);
3802 }
3803 return true;
3804 }
3805 } ZEND_HASH_FOREACH_END();
3806 }
3807
3808 if (Z_REFCOUNTED_P(var)) {
3809 Z_UNPROTECT_RECURSION_P(var);
3810 }
3811 }
3812
3813 return false;
3814 }
3815
PHP_FUNCTION(mb_convert_variables)3816 PHP_FUNCTION(mb_convert_variables)
3817 {
3818 zval *args;
3819 zend_string *to_enc_str;
3820 zend_string *from_enc_str;
3821 HashTable *from_enc_ht;
3822 const mbfl_encoding *from_encoding, *to_encoding;
3823 uint32_t argc;
3824 size_t elistsz;
3825 const mbfl_encoding **elist;
3826
3827 ZEND_PARSE_PARAMETERS_START(3, -1)
3828 Z_PARAM_STR(to_enc_str)
3829 Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3830 Z_PARAM_VARIADIC('+', args, argc)
3831 ZEND_PARSE_PARAMETERS_END();
3832
3833 /* new encoding */
3834 to_encoding = php_mb_get_encoding(to_enc_str, 1);
3835 if (!to_encoding) {
3836 RETURN_THROWS();
3837 }
3838
3839 from_encoding = MBSTRG(current_internal_encoding);
3840
3841 bool order_significant = true;
3842
3843 /* pre-conversion encoding */
3844 if (from_enc_ht) {
3845 if (from_enc_ht == MBSTRG(all_encodings_list)) {
3846 /* If entire list of supported encodings returned by `mb_list_encodings` is passed
3847 * in, then don't treat the order of the list as significant */
3848 order_significant = false;
3849 }
3850 if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3851 RETURN_THROWS();
3852 }
3853 } else {
3854 if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2) == FAILURE) {
3855 RETURN_THROWS();
3856 }
3857 }
3858
3859 if (elistsz == 0) {
3860 efree(ZEND_VOIDP(elist));
3861 zend_argument_value_error(2, "must specify at least one encoding");
3862 RETURN_THROWS();
3863 }
3864
3865 if (elistsz == 1) {
3866 from_encoding = *elist;
3867 } else {
3868 /* auto detect */
3869 unsigned int num = 0;
3870 for (size_t n = 0; n < argc; n++) {
3871 zval *zv = &args[n];
3872 num += mb_recursive_count_strings(zv);
3873 }
3874 const unsigned char **val_list = (const unsigned char**)ecalloc(num, sizeof(char *));
3875 size_t *len_list = (size_t*)ecalloc(num, sizeof(size_t));
3876 unsigned int i = 0;
3877 for (size_t n = 0; n < argc; n++) {
3878 zval *zv = &args[n];
3879 if (mb_recursive_find_strings(zv, val_list, len_list, &i)) {
3880 efree(ZEND_VOIDP(elist));
3881 efree(ZEND_VOIDP(val_list));
3882 efree(len_list);
3883 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3884 RETURN_FALSE;
3885 }
3886 }
3887 from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant);
3888 efree(ZEND_VOIDP(val_list));
3889 efree(len_list);
3890 if (!from_encoding) {
3891 php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3892 efree(ZEND_VOIDP(elist));
3893 RETURN_FALSE;
3894 }
3895
3896 }
3897
3898 efree(ZEND_VOIDP(elist));
3899
3900 /* convert */
3901 for (size_t n = 0; n < argc; n++) {
3902 zval *zv = &args[n];
3903 ZVAL_DEREF(zv);
3904 if (mb_recursive_convert_variable(zv, from_encoding, to_encoding)) {
3905 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3906 RETURN_FALSE;
3907 }
3908 }
3909
3910 RETURN_STRING(from_encoding->name);
3911 }
3912
3913 /* HTML numeric entities */
3914
3915 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
make_conversion_map(HashTable * target_hash,size_t * conversion_map_size)3916 static uint32_t *make_conversion_map(HashTable *target_hash, size_t *conversion_map_size)
3917 {
3918 zval *hash_entry;
3919
3920 size_t n_elems = *conversion_map_size = zend_hash_num_elements(target_hash);
3921 if (n_elems % 4 != 0) {
3922 zend_argument_value_error(2, "must have a multiple of 4 elements");
3923 return NULL;
3924 }
3925
3926 uint32_t *convmap = (uint32_t*)safe_emalloc(n_elems, sizeof(uint32_t), 0);
3927 uint32_t *mapelm = convmap;
3928
3929 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
3930 bool failed = true;
3931 zend_long tmp = zval_try_get_long(hash_entry, &failed);
3932 if (failed) {
3933 efree(convmap);
3934 zend_argument_value_error(2, "must only be composed of values of type int");
3935 return NULL;
3936 }
3937 *mapelm++ = tmp;
3938 } ZEND_HASH_FOREACH_END();
3939
3940 return convmap;
3941 }
3942
/* Look up codepoint `w` in the conversion map.
 *
 * The map is a flat array of 4-element groups: { low, high, offset, mask }.
 * For the first group with low <= w <= high, `(w + offset) & mask` is stored in
 * `*retval` and true is returned. If no group matches, `*retval` is left untouched
 * and false is returned. */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t idx = 0; idx < conversion_map_size; idx += 4) {
		const uint32_t *range = convmap + idx;

		if (w < range[0] || w > range[1]) {
			/* Outside this range; try the next one */
			continue;
		}

		/* This wchar should be converted to an HTML entity */
		*retval = (w + range[2]) & range[3];
		return true;
	}

	/* None of the ranges matched */
	return false;
}
3964
/* Re-encode `input`, replacing every codepoint which matches the conversion map with
 * an HTML numeric entity ("&#NNN;", or "&#xHHH;" when `hex` is true).
 * Codepoints which do not match any range are passed through unchanged. */
static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size, bool hex)
{
	/* Each wchar which we get from decoding the input string may become up to
	 * 13 wchars when we convert it to an HTML entity */
	uint32_t wchar_buf[32], converted_buf[32 * 13];
	unsigned char entity[16]; /* Digits are written backwards from the end of this buffer */
	unsigned char *entity_end = entity + sizeof(entity);

	const char *digit_chars = hex ? "0123456789ABCDEF" : "0123456789";
	const uint32_t radix = hex ? 16 : 10;

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		/* Convert input string to wchars, up to 32 at a time */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf, 32, &state);
		ZEND_ASSERT(out_len <= 32);
		uint32_t *out = converted_buf;

		for (size_t i = 0; i < out_len; i++) {
			uint32_t w = wchar_buf[i];

			if (!html_numeric_entity_convert(w, convmap, conversion_map_size, &w)) {
				/* Not in any of the ranges; copy as is */
				*out++ = w;
				continue;
			}

			*out++ = '&';
			*out++ = '#';
			if (hex) {
				*out++ = 'x';
			}

			/* Render the number least-significant digit first; using do/while
			 * means zero is rendered as a single '0' */
			unsigned char *digit = entity_end;
			do {
				*(--digit) = digit_chars[w % radix];
				w /= radix;
			} while (w > 0);

			while (digit < entity_end) {
				*out++ = *digit++;
			}

			*out++ = ';';
		}

		ZEND_ASSERT(out <= converted_buf + sizeof(converted_buf)/sizeof(*converted_buf));
		encoding->from_wchar(converted_buf, out - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
4030
4031 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)4032 PHP_FUNCTION(mb_encode_numericentity)
4033 {
4034 zend_string *encoding = NULL, *str;
4035 size_t conversion_map_size;
4036 HashTable *target_hash;
4037 bool is_hex = false;
4038
4039 ZEND_PARSE_PARAMETERS_START(2, 4)
4040 Z_PARAM_STR(str)
4041 Z_PARAM_ARRAY_HT(target_hash)
4042 Z_PARAM_OPTIONAL
4043 Z_PARAM_STR_OR_NULL(encoding)
4044 Z_PARAM_BOOL(is_hex)
4045 ZEND_PARSE_PARAMETERS_END();
4046
4047 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4048 if (!enc) {
4049 RETURN_THROWS();
4050 }
4051
4052 uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4053 if (convmap == NULL) {
4054 RETURN_THROWS();
4055 }
4056
4057 RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, conversion_map_size, is_hex));
4058 efree(convmap);
4059 }
4060 /* }}} */
4061
/* Map the numeric value of an HTML entity back to a codepoint.
 *
 * The map is a flat array of 4-element groups: { low, high, offset, mask }; the mask
 * is not used here. For the first group where `number - offset` (computed with unsigned
 * wraparound) lies in [low, high], that value is stored in `*retval` and true is returned.
 * Otherwise `*retval` is left untouched and false is returned. */
static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t idx = 0; idx < conversion_map_size; idx += 4) {
		uint32_t candidate = number - convmap[idx + 2];

		if (candidate < convmap[idx] || candidate > convmap[idx + 1]) {
			continue;
		}

		*retval = candidate;
		return true;
	}

	return false;
}
4079
4080 #define DEC_ENTITY_MINLEN 3 /* For "&#" and 1 decimal digit */
4081 #define HEX_ENTITY_MINLEN 4 /* For "&#x" and 1 hexadecimal digit */
4082 #define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
4083 #define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
4084
html_numeric_entity_decode(zend_string * input,const mbfl_encoding * encoding,uint32_t * convmap,size_t conversion_map_size)4085 static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size)
4086 {
4087 uint32_t wchar_buf[128], converted_buf[128];
4088
4089 unsigned int state = 0;
4090 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
4091 size_t in_len = ZSTR_LEN(input);
4092
4093 mb_convert_buf buf;
4094 mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
4095
4096 /* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
4097 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
4098 * 2nd 'converted' buffer.
4099 *
4100 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
4101 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
4102 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
4103 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
4104 * to store the next batch of wchars after it.
4105 *
4106 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
4107 * wchars from the 1st buffer to the 2nd one.
4108 *
4109 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
4110 * have to do bounds checks when writing wchars into it.
4111 */
4112
4113 unsigned int wchar_buf_offset = 0;
4114
4115 while (in_len) {
4116 /* Leave space for sentinel at the end of the buffer */
4117 size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
4118 out_len += wchar_buf_offset;
4119 ZEND_ASSERT(out_len <= 127);
4120 wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */
4121
4122 uint32_t *p, *converted;
4123
4124 /* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
4125 * be there (in `wchar_buf[0]`), so don't bother in that case */
4126 if (wchar_buf_offset == 0) {
4127 p = wchar_buf;
4128 while (*p != '&')
4129 p++;
4130 if (p == wchar_buf + out_len) {
4131 /* No HTML entities in this buffer */
4132 encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
4133 continue;
4134 }
4135
4136 /* Copy over the prefix with no & which we already scanned */
4137 memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
4138 converted = converted_buf + (p - wchar_buf);
4139 } else {
4140 p = wchar_buf;
4141 converted = converted_buf;
4142 }
4143
4144 found_ampersand:
4145 ZEND_ASSERT(*p == '&');
4146 uint32_t *p2 = p;
4147
4148 /* These tests can't overrun end of buffer, because we have a '&' sentinel there */
4149 if (*++p2 == '#') {
4150 if (*++p2 == 'x') {
4151 /* Possible hex entity */
4152 uint32_t w = *++p2;
4153 while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
4154 w = *++p2;
4155 if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
4156 /* We hit the end of the buffer while reading digits, and
4157 * more wchars are still coming in the next buffer
4158 * Reprocess this identity on next iteration */
4159 memmove(wchar_buf, p, (p2 - p) * 4);
4160 wchar_buf_offset = p2 - p;
4161 goto process_converted_wchars;
4162 } else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
4163 /* Invalid entity (too long or "&#x" only) */
4164 memcpy(converted, p, (p2 - p) * 4);
4165 converted += p2 - p;
4166 } else {
4167 /* Valid hexadecimal entity */
4168 uint32_t value = 0, *p3 = p + 3;
4169 while (p3 < p2) {
4170 w = *p3++;
4171 if (w <= '9') {
4172 value = (value * 16) + (w - '0');
4173 } else if (w >= 'a') {
4174 value = (value * 16) + 10 + (w - 'a');
4175 } else {
4176 value = (value * 16) + 10 + (w - 'A');
4177 }
4178 }
4179 if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
4180 converted++;
4181 if (*p2 == ';')
4182 p2++;
4183 } else {
4184 memcpy(converted, p, (p2 - p) * 4);
4185 converted += p2 - p;
4186 }
4187 }
4188 } else {
4189 /* Possible decimal entity */
4190 uint32_t w = *p2;
4191 while (w >= '0' && w <= '9')
4192 w = *++p2;
4193 if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
4194 /* The number of digits was legal (no more than 10 decimal digits)
4195 * Reprocess this identity on next iteration of main loop */
4196 memmove(wchar_buf, p, (p2 - p) * 4);
4197 wchar_buf_offset = p2 - p;
4198 goto process_converted_wchars;
4199 } else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
4200 /* Invalid entity (too long or "&#" only) */
4201 memcpy(converted, p, (p2 - p) * 4);
4202 converted += p2 - p;
4203 } else {
4204 /* Valid decimal entity */
4205 uint32_t value = 0, *p3 = p + 2;
4206 while (p3 < p2) {
4207 /* If unsigned integer overflow would occur in the below
4208 * multiplication by 10, this entity is no good
4209 * 0x19999999 is 1/10th of 0xFFFFFFFF */
4210 if (value > 0x19999999) {
4211 memcpy(converted, p, (p2 - p) * 4);
4212 converted += p2 - p;
4213 goto decimal_entity_too_big;
4214 }
4215 value = (value * 10) + (*p3++ - '0');
4216 }
4217 if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
4218 converted++;
4219 if (*p2 == ';')
4220 p2++;
4221 } else {
4222 memcpy(converted, p, (p2 - p) * 4);
4223 converted += p2 - p;
4224 }
4225 }
4226 }
4227 } else if ((p2 == wchar_buf + out_len) && in_len) {
4228 /* Corner case: & at end of buffer */
4229 wchar_buf[0] = '&';
4230 wchar_buf_offset = 1;
4231 goto process_converted_wchars;
4232 } else {
4233 *converted++ = '&';
4234 }
4235 decimal_entity_too_big:
4236
4237 /* Starting to scan a new section of the wchar buffer
4238 * 'p2' is pointing at the next wchar which needs to be processed */
4239 p = p2;
4240 while (*p2 != '&')
4241 p2++;
4242
4243 if (p2 > p) {
4244 memcpy(converted, p, (p2 - p) * 4);
4245 converted += p2 - p;
4246 p = p2;
4247 }
4248
4249 if (p < wchar_buf + out_len)
4250 goto found_ampersand;
4251
4252 /* We do not have any wchars remaining at the end of this buffer which
4253 * we need to reprocess on the next call */
4254 wchar_buf_offset = 0;
4255 process_converted_wchars:
4256 ZEND_ASSERT(converted <= converted_buf + 128);
4257 encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
4258 }
4259
4260 return mb_convert_buf_result(&buf, encoding);
4261 }
4262
4263 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)4264 PHP_FUNCTION(mb_decode_numericentity)
4265 {
4266 zend_string *encoding = NULL, *str;
4267 size_t conversion_map_size;
4268 HashTable *target_hash;
4269
4270 ZEND_PARSE_PARAMETERS_START(2, 3)
4271 Z_PARAM_STR(str)
4272 Z_PARAM_ARRAY_HT(target_hash)
4273 Z_PARAM_OPTIONAL
4274 Z_PARAM_STR_OR_NULL(encoding)
4275 ZEND_PARSE_PARAMETERS_END();
4276
4277 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4278 if (!enc) {
4279 RETURN_THROWS();
4280 }
4281
4282 uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4283 if (convmap == NULL) {
4284 RETURN_THROWS();
4285 }
4286
4287 RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, conversion_map_size));
4288 efree(convmap);
4289 }
4290 /* }}} */
4291
4292 /* {{{ Sends an email message with MIME scheme */
4293 #define CRLF "\r\n"
4294
_php_mbstr_parse_mail_headers(HashTable * ht,const char * str,size_t str_len)4295 static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
4296 {
4297 const char *ps;
4298 size_t icnt;
4299 int state = 0;
4300 int crlf_state = -1;
4301 char *token = NULL;
4302 size_t token_pos = 0;
4303 zend_string *fld_name, *fld_val;
4304
4305 ps = str;
4306 icnt = str_len;
4307 fld_name = fld_val = NULL;
4308
4309 /*
4310 * C o n t e n t - T y p e : t e x t / h t m l \r\n
4311 * ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
4312 * state 0 1 2 3
4313 *
4314 * C o n t e n t - T y p e : t e x t / h t m l \r\n
4315 * ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
4316 * crlf_state -1 0 1 -1
4317 *
4318 */
4319
4320 while (icnt > 0) {
4321 switch (*ps) {
4322 case ':':
4323 if (crlf_state == 1) {
4324 token_pos++;
4325 }
4326
4327 if (state == 0 || state == 1) {
4328 if(token && token_pos > 0) {
4329 fld_name = zend_string_init(token, token_pos, 0);
4330 }
4331 state = 2;
4332 } else {
4333 token_pos++;
4334 }
4335
4336 crlf_state = 0;
4337 break;
4338
4339 case '\n':
4340 if (crlf_state == -1) {
4341 goto out;
4342 }
4343 crlf_state = -1;
4344 break;
4345
4346 case '\r':
4347 if (crlf_state == 1) {
4348 token_pos++;
4349 } else {
4350 crlf_state = 1;
4351 }
4352 break;
4353
4354 case ' ': case '\t':
4355 if (crlf_state == -1) {
4356 if (state == 3) {
4357 /* continuing from the previous line */
4358 state = 4;
4359 } else {
4360 /* simply skipping this new line */
4361 state = 5;
4362 }
4363 } else {
4364 if (crlf_state == 1) {
4365 token_pos++;
4366 }
4367 if (state == 1 || state == 3) {
4368 token_pos++;
4369 }
4370 }
4371 crlf_state = 0;
4372 break;
4373
4374 default:
4375 switch (state) {
4376 case 0:
4377 token = (char*)ps;
4378 token_pos = 0;
4379 state = 1;
4380 break;
4381
4382 case 2:
4383 if (crlf_state != -1) {
4384 token = (char*)ps;
4385 token_pos = 0;
4386
4387 state = 3;
4388 break;
4389 }
4390 ZEND_FALLTHROUGH;
4391
4392 case 3:
4393 if (crlf_state == -1) {
4394 if(token && token_pos > 0) {
4395 fld_val = zend_string_init(token, token_pos, 0);
4396 }
4397
4398 if (fld_name != NULL && fld_val != NULL) {
4399 zval val;
4400 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4401 ZVAL_STR(&val, fld_val);
4402
4403 zend_hash_update(ht, fld_name, &val);
4404
4405 zend_string_release_ex(fld_name, 0);
4406 }
4407
4408 fld_name = fld_val = NULL;
4409 token = (char*)ps;
4410 token_pos = 0;
4411
4412 state = 1;
4413 }
4414 break;
4415
4416 case 4:
4417 token_pos++;
4418 state = 3;
4419 break;
4420 }
4421
4422 if (crlf_state == 1) {
4423 token_pos++;
4424 }
4425
4426 token_pos++;
4427
4428 crlf_state = 0;
4429 break;
4430 }
4431 ps++, icnt--;
4432 }
4433 out:
4434 if (state == 2) {
4435 token = "";
4436 token_pos = 0;
4437
4438 state = 3;
4439 }
4440 if (state == 3) {
4441 if(token && token_pos > 0) {
4442 fld_val = zend_string_init(token, token_pos, 0);
4443 }
4444 if (fld_name != NULL && fld_val != NULL) {
4445 zval val;
4446 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4447 ZVAL_STR(&val, fld_val);
4448 zend_hash_update(ht, fld_name, &val);
4449
4450 zend_string_release_ex(fld_name, 0);
4451 }
4452 }
4453 return state;
4454 }
4455
PHP_FUNCTION(mb_send_mail)4456 PHP_FUNCTION(mb_send_mail)
4457 {
4458 char *to;
4459 size_t to_len;
4460 char *message;
4461 size_t message_len;
4462 zend_string *subject;
4463 zend_string *extra_cmd = NULL;
4464 HashTable *headers_ht = NULL;
4465 zend_string *str_headers = NULL;
4466 size_t i;
4467 char *to_r = NULL;
4468 bool suppress_content_type = false;
4469 bool suppress_content_transfer_encoding = false;
4470
4471 char *p;
4472 enum mbfl_no_encoding;
4473 const mbfl_encoding *tran_cs, /* transfer text charset */
4474 *head_enc, /* header transfer encoding */
4475 *body_enc; /* body transfer encoding */
4476 const mbfl_language *lang;
4477 HashTable ht_headers;
4478 zval *s;
4479
4480 /* character-set, transfer-encoding */
4481 tran_cs = &mbfl_encoding_utf8;
4482 head_enc = &mbfl_encoding_base64;
4483 body_enc = &mbfl_encoding_base64;
4484 lang = mbfl_no2language(MBSTRG(language));
4485 if (lang != NULL) {
4486 tran_cs = mbfl_no2encoding(lang->mail_charset);
4487 head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4488 body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4489 }
4490
4491 ZEND_PARSE_PARAMETERS_START(3, 5)
4492 Z_PARAM_PATH(to, to_len)
4493 Z_PARAM_PATH_STR(subject)
4494 Z_PARAM_PATH(message, message_len)
4495 Z_PARAM_OPTIONAL
4496 Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4497 Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4498 ZEND_PARSE_PARAMETERS_END();
4499
4500 if (str_headers) {
4501 if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4502 zend_argument_value_error(4, "must not contain any null bytes");
4503 RETURN_THROWS();
4504 }
4505 str_headers = php_trim(str_headers, NULL, 0, 2);
4506 } else if (headers_ht) {
4507 str_headers = php_mail_build_headers(headers_ht);
4508 if (EG(exception)) {
4509 RETURN_THROWS();
4510 }
4511 }
4512
4513 zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4514
4515 if (str_headers != NULL) {
4516 _php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4517 }
4518
4519 if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4520 char *tmp;
4521 char *param_name;
4522 char *charset = NULL;
4523
4524 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4525 p = strchr(Z_STRVAL_P(s), ';');
4526
4527 if (p != NULL) {
4528 /* skipping the padded spaces */
4529 do {
4530 ++p;
4531 } while (*p == ' ' || *p == '\t');
4532
4533 if (*p != '\0') {
4534 if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4535 if (strcasecmp(param_name, "charset") == 0) {
4536 const mbfl_encoding *_tran_cs = tran_cs;
4537
4538 charset = php_strtok_r(NULL, "= \"", &tmp);
4539 if (charset != NULL) {
4540 _tran_cs = mbfl_name2encoding(charset);
4541 }
4542
4543 if (!_tran_cs) {
4544 php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4545 _tran_cs = &mbfl_encoding_ascii;
4546 }
4547 tran_cs = _tran_cs;
4548 }
4549 }
4550 }
4551 }
4552 suppress_content_type = true;
4553 }
4554
4555 if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4556 const mbfl_encoding *_body_enc;
4557
4558 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4559 _body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4560 switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4561 case mbfl_no_encoding_base64:
4562 case mbfl_no_encoding_7bit:
4563 case mbfl_no_encoding_8bit:
4564 body_enc = _body_enc;
4565 break;
4566
4567 default:
4568 php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4569 body_enc = &mbfl_encoding_8bit;
4570 break;
4571 }
4572 suppress_content_transfer_encoding = true;
4573 }
4574
4575 /* To: */
4576 if (to_len > 0) {
4577 to_r = estrndup(to, to_len);
4578 for (; to_len; to_len--) {
4579 if (!isspace((unsigned char) to_r[to_len - 1])) {
4580 break;
4581 }
4582 to_r[to_len - 1] = '\0';
4583 }
4584 for (i = 0; to_r[i]; i++) {
4585 if (iscntrl((unsigned char) to_r[i])) {
4586 /* According to RFC 822, section 3.1.1 long headers may be separated into
4587 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4588 * To prevent these separators from being replaced with a space, we skip over them. */
4589 if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4590 i += 2;
4591 while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4592 i++;
4593 }
4594 continue;
4595 }
4596
4597 to_r[i] = ' ';
4598 }
4599 }
4600 } else {
4601 to_r = to;
4602 }
4603
4604 /* Subject: */
4605 const mbfl_encoding *enc = MBSTRG(current_internal_encoding);
4606 if (enc == &mbfl_encoding_pass) {
4607 enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4608 }
4609 const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4610 size_t line_sep_len = strlen(line_sep);
4611
4612 subject = mb_mime_header_encode(subject, enc, tran_cs, head_enc == &mbfl_encoding_base64, (char*)line_sep, line_sep_len, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4613
4614 /* message body */
4615 const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding);
4616 if (msg_enc == &mbfl_encoding_pass) {
4617 msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4618 }
4619
4620 unsigned int num_errors = 0;
4621 zend_string *tmpstr = mb_fast_convert((unsigned char*)message, message_len, msg_enc, tran_cs, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4622 zend_string *conv = mb_fast_convert((unsigned char*)ZSTR_VAL(tmpstr), ZSTR_LEN(tmpstr), &mbfl_encoding_8bit, body_enc, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4623 zend_string_free(tmpstr);
4624 message = ZSTR_VAL(conv);
4625
4626 /* other headers */
4627 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4628 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4629 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4630 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4631
4632 smart_str str = {0};
4633 bool empty = true;
4634
4635 if (str_headers != NULL && ZSTR_LEN(str_headers) > 0) {
4636 /* Strip trailing CRLF from `str_headers`; we will add CRLF back if necessary */
4637 size_t len = ZSTR_LEN(str_headers);
4638 if (ZSTR_VAL(str_headers)[len-1] == '\n') {
4639 len--;
4640 }
4641 if (ZSTR_VAL(str_headers)[len-1] == '\r') {
4642 len--;
4643 }
4644 smart_str_appendl(&str, ZSTR_VAL(str_headers), len);
4645 empty = false;
4646 zend_string_release_ex(str_headers, 0);
4647 }
4648
4649 if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4650 if (!empty) {
4651 smart_str_appendl(&str, line_sep, line_sep_len);
4652 }
4653 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4654 empty = false;
4655 }
4656
4657 if (!suppress_content_type) {
4658 if (!empty) {
4659 smart_str_appendl(&str, line_sep, line_sep_len);
4660 }
4661 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4662
4663 p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4664 if (p != NULL) {
4665 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4666 smart_str_appends(&str, p);
4667 }
4668 empty = false;
4669 }
4670
4671 if (!suppress_content_transfer_encoding) {
4672 if (!empty) {
4673 smart_str_appendl(&str, line_sep, line_sep_len);
4674 }
4675 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4676 p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4677 if (p == NULL) {
4678 p = "7bit";
4679 }
4680 smart_str_appends(&str, p);
4681 }
4682
4683 str_headers = smart_str_extract(&str);
4684
4685 zend_string *force_extra_parameters = zend_ini_str_ex("mail.force_extra_parameters", strlen("mail.force_extra_parameters"), false, NULL);
4686 if (force_extra_parameters) {
4687 extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4688 } else if (extra_cmd) {
4689 extra_cmd = php_escape_shell_cmd(extra_cmd);
4690 }
4691
4692 RETVAL_BOOL(php_mail(to_r, ZSTR_VAL(subject), message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL));
4693
4694 if (extra_cmd) {
4695 zend_string_release_ex(extra_cmd, 0);
4696 }
4697 if (to_r != to) {
4698 efree(to_r);
4699 }
4700 zend_string_release(subject);
4701 zend_string_free(conv);
4702 zend_hash_destroy(&ht_headers);
4703 if (str_headers) {
4704 zend_string_release_ex(str_headers, 0);
4705 }
4706 }
4707
4708 #undef CRLF
4709 #undef MAIL_ASCIIZ_CHECK_MBSTRING
4710 #undef PHP_MBSTR_MAIL_MIME_HEADER1
4711 #undef PHP_MBSTR_MAIL_MIME_HEADER2
4712 #undef PHP_MBSTR_MAIL_MIME_HEADER3
4713 #undef PHP_MBSTR_MAIL_MIME_HEADER4
4714 /* }}} */
4715
4716 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)4717 PHP_FUNCTION(mb_get_info)
4718 {
4719 zend_string *type = NULL;
4720 size_t n;
4721 char *name;
4722 zval row;
4723 const mbfl_encoding **entry;
4724 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
4725
4726 ZEND_ASSERT(lang);
4727
4728 ZEND_PARSE_PARAMETERS_START(0, 1)
4729 Z_PARAM_OPTIONAL
4730 Z_PARAM_STR(type)
4731 ZEND_PARSE_PARAMETERS_END();
4732
4733 if (!type || zend_string_equals_literal_ci(type, "all")) {
4734 array_init(return_value);
4735 if (MBSTRG(current_internal_encoding)) {
4736 add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4737 }
4738 if (MBSTRG(http_input_identify)) {
4739 add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4740 }
4741 if (MBSTRG(current_http_output_encoding)) {
4742 add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4743 }
4744
4745 add_assoc_str(return_value, "http_output_conv_mimetypes",
4746 zend_ini_str("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)
4747 );
4748
4749 name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4750 add_assoc_string(return_value, "mail_charset", name);
4751
4752 name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4753 add_assoc_string(return_value, "mail_header_encoding", name);
4754
4755 name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4756 add_assoc_string(return_value, "mail_body_encoding", name);
4757
4758 add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4759
4760 if (MBSTRG(encoding_translation)) {
4761 add_assoc_string(return_value, "encoding_translation", "On");
4762 } else {
4763 add_assoc_string(return_value, "encoding_translation", "Off");
4764 }
4765
4766 name = (char *)mbfl_no_language2name(MBSTRG(language));
4767 add_assoc_string(return_value, "language", name);
4768
4769 // TODO Seems to always have one entry at least?
4770 n = MBSTRG(current_detect_order_list_size);
4771 entry = MBSTRG(current_detect_order_list);
4772 if (n > 0) {
4773 size_t i;
4774 array_init(&row);
4775 for (i = 0; i < n; i++) {
4776 add_next_index_string(&row, (*entry)->name);
4777 entry++;
4778 }
4779 add_assoc_zval(return_value, "detect_order", &row);
4780 }
4781 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4782 add_assoc_string(return_value, "substitute_character", "none");
4783 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4784 add_assoc_string(return_value, "substitute_character", "long");
4785 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4786 add_assoc_string(return_value, "substitute_character", "entity");
4787 } else {
4788 add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4789 }
4790 if (MBSTRG(strict_detection)) {
4791 add_assoc_string(return_value, "strict_detection", "On");
4792 } else {
4793 add_assoc_string(return_value, "strict_detection", "Off");
4794 }
4795 } else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4796 ZEND_ASSERT(MBSTRG(current_internal_encoding));
4797 RETURN_STRING((char *)MBSTRG(current_internal_encoding)->name);
4798 } else if (zend_string_equals_literal_ci(type, "http_input")) {
4799 if (MBSTRG(http_input_identify)) {
4800 RETURN_STRING((char *)MBSTRG(http_input_identify)->name);
4801 }
4802 RETURN_NULL();
4803 } else if (zend_string_equals_literal_ci(type, "http_output")) {
4804 ZEND_ASSERT(MBSTRG(current_http_output_encoding));
4805 RETURN_STRING((char *)MBSTRG(current_http_output_encoding)->name);
4806 } else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4807 RETURN_STR(
4808 zend_ini_str(
4809 "mbstring.http_output_conv_mimetypes",
4810 sizeof("mbstring.http_output_conv_mimetypes") - 1,
4811 false
4812 )
4813 );
4814 } else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4815 name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4816 RETURN_STRING(name);
4817 } else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4818 name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding);
4819 RETURN_STRING(name);
4820 } else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4821 name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding);
4822 RETURN_STRING(name);
4823 } else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4824 RETURN_LONG(MBSTRG(illegalchars));
4825 } else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4826 if (MBSTRG(encoding_translation)) {
4827 RETURN_STRING("On");
4828 } else {
4829 RETURN_STRING("Off");
4830 }
4831 } else if (zend_string_equals_literal_ci(type, "language")) {
4832 name = (char *)mbfl_no_language2name(MBSTRG(language));
4833 RETURN_STRING(name);
4834 } else if (zend_string_equals_literal_ci(type, "detect_order")) {
4835 // TODO Seems to always have one entry at least?
4836 n = MBSTRG(current_detect_order_list_size);
4837 entry = MBSTRG(current_detect_order_list);
4838 if (n > 0) {
4839 size_t i;
4840 array_init(return_value);
4841 for (i = 0; i < n; i++) {
4842 add_next_index_string(return_value, (*entry)->name);
4843 entry++;
4844 }
4845 }
4846 } else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4847 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4848 RETURN_STRING("none");
4849 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4850 RETURN_STRING("long");
4851 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4852 RETURN_STRING("entity");
4853 } else {
4854 RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
4855 }
4856 } else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4857 if (MBSTRG(strict_detection)) {
4858 RETURN_STRING("On");
4859 } else {
4860 RETURN_STRING("Off");
4861 }
4862 } else {
4863 php_error_docref(NULL, E_WARNING, "argument #1 ($type) must be a valid type");
4864 RETURN_FALSE;
4865 }
4866 }
4867 /* }}} */
4868
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)4869 MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4870 {
4871 uint32_t wchar_buf[128];
4872 unsigned char *in = (unsigned char*)input;
4873 unsigned int state = 0;
4874
4875 if (encoding->check != NULL) {
4876 return encoding->check(in, length);
4877 }
4878
4879 /* If the input string is not encoded in the given encoding, there is a significant chance
4880 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4881 * buffer of 128 codepoints, convert and check just a few codepoints first */
4882 size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4883 ZEND_ASSERT(out_len <= 8);
4884 for (unsigned int i = 0; i < out_len; i++) {
4885 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4886 return false;
4887 }
4888 }
4889
4890 while (length) {
4891 out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4892 ZEND_ASSERT(out_len <= 128);
4893 for (unsigned int i = 0; i < out_len; i++) {
4894 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4895 return false;
4896 }
4897 }
4898 }
4899
4900 return true;
4901 }
4902
4903 /* MSVC 32-bit has issues with 64-bit intrinsics.
4904 * (Bad 7/8-byte UTF-8 strings would be wrongly passed through as 'valid')
4905 * It seems this is caused by a bug in MS Visual C++
4906 * Ref: https://stackoverflow.com/questions/37509129/potential-bug-in-visual-studio-c-compiler-or-in-intel-intrinsics-avx2-mm256-s */
4907 #if defined(PHP_WIN32) && !defined(__clang__) && defined(_MSC_VER) && defined(_M_IX86)
4908 # define MBSTRING_BROKEN_X86_MSVC_INTRINSICS
4909 #endif
4910
4911 /* If we are building an AVX2-only binary, don't compile the next function */
4912 #ifndef ZEND_INTRIN_AVX2_NATIVE
4913
/* SSE2-based function for validating UTF-8 strings
 * A faster implementation which uses AVX2 instructions follows
 *
 * Returns true if the content of `str` is valid UTF-8, false otherwise.
 *
 * When compiled with __SSE2__ defined, the string is examined in 16-byte blocks
 * using vector instructions; otherwise a scalar, byte-by-byte validator (derived
 * from PCRE2) is used. Both variants reject stray or missing continuation bytes,
 * overlong encodings, codepoints U+D800-DFFF, codepoints above U+10FFFF, and
 * bytes >= 0xF5. */
static bool mb_fast_check_utf8_default(zend_string *str)
{
	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
# ifdef __SSE2__
	/* `e` points 1 byte past the last full 16-byte block of string content
	 * Note that we include the terminating null byte which is included in each zend_string
	 * as part of the content to check; this ensures that multi-byte characters which are
	 * truncated abruptly at the end of the string will be detected as invalid */
	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m128i) - 1));

	/* For checking for illegal bytes 0xF5-FF */
	const __m128i over_f5 = _mm_set1_epi8(-117);
	/* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
	const __m128i over_9f = _mm_set1_epi8(-97);
	/* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
	const __m128i over_8f = _mm_set1_epi8(-113);
	/* For checking for illegal bytes 0xC0-C1 */
	const __m128i find_c0 = _mm_set1_epi8(-64);
	const __m128i c0_to_c1 = _mm_set1_epi8(-126);
	/* For checking structure of continuation bytes */
	const __m128i find_e0 = _mm_set1_epi8(-32);
	const __m128i find_f0 = _mm_set1_epi8(-16);

	/* `last_block` holds the previously checked 16 bytes, so that multi-byte characters
	 * which straddle a block boundary can be validated; it starts out as all zeroes */
	__m128i last_block = _mm_setzero_si128();
	__m128i operand;

	while (p < e) {
		operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */

		/* The code which handles trailing bytes (at the bottom of this function) also jumps
		 * here, after building an `operand` which holds those bytes padded with zeroes */
check_operand:
		/* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
		if (!_mm_movemask_epi8(operand)) {
			/* Even if this block only contains single-byte characters, there may have been a
			 * multi-byte character at the end of the previous block, which was supposed to
			 * have continuation bytes in this block
			 * This bitmask will pick out a 2/3/4-byte character starting from the last byte of
			 * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
			 * from the 3rd last */
			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
			if (_mm_movemask_epi8(bad)) {
				return false;
			}

			/* Consume as many full blocks of single-byte characters as we can */
			while (true) {
				p += sizeof(__m128i);
				if (p >= e) {
					goto finish_up_remaining_bytes;
				}
				operand = _mm_loadu_si128((__m128i*)p);
				if (_mm_movemask_epi8(operand)) {
					break;
				}
			}
		}

		/* Check for >= 0xF5, which are illegal byte values in UTF-8
		 * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
		 * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
		 * Then a single signed compare will pick out any bad bytes
		 * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
		__m128i bad = _mm_cmplt_epi8(_mm_add_epi8(operand, over_f5), over_f5);

		/* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
		 * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
		 * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
		 * We can check for both problems at once by generating a vector where each byte < 0xA0
		 * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
		 * Shift the original block right by one byte, and compare the shifted block with the bitmask
		 * (`operand2` is `operand` moved along by one byte position, with the last byte of
		 * `last_block` filling the vacated position; so each byte of `operand2` is the byte
		 * which comes just before the corresponding byte of `operand`) */
		__m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15));
		__m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f)));
		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask1));

		/* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
		 * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
		 * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
		 * Build the bitmask and compare it with the shifted block */
		__m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f)));
		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask2));

		/* Check for overlong 2-byte code units
		 * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
		 * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
		 * byte range, do a signed compare to pick out any bad bytes */
		bad = _mm_or_si128(bad, _mm_cmplt_epi8(_mm_add_epi8(operand, find_c0), c0_to_c1));

		/* Check structure of continuation bytes
		 * A UTF-8 byte should be a continuation byte if, and only if, it is:
		 * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
		 * 2) 2 bytes after the start of a 3-byte or 4-byte character
		 * 3) 3 bytes after the start of a 4-byte character
		 * We build 3 bitmasks with 0xFF in each such position, and OR them together to
		 * get a single bitmask with 0xFF in each position where a continuation byte should be */
		__m128i cont_mask = _mm_cmpeq_epi8(_mm_and_si128(operand2, find_c0), find_c0);
		__m128i operand3 = _mm_or_si128(_mm_slli_si128(operand, 2), _mm_srli_si128(last_block, 14));
		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand3, find_e0), find_e0));
		__m128i operand4 = _mm_or_si128(_mm_slli_si128(operand, 3), _mm_srli_si128(last_block, 13));
		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand4, find_f0), find_f0));

		/* Now, use a signed comparison to get another bitmask with 0xFF in each position where
		 * a continuation byte actually is
		 * XOR those two bitmasks together; if everything is good, the result should be zero
		 * However, if a byte which should have been a continuation wasn't, or if a byte which
		 * shouldn't have been a continuation was, we will get 0xFF in that position */
		__m128i continuation = _mm_cmplt_epi8(operand, find_c0);
		bad = _mm_or_si128(bad, _mm_xor_si128(continuation, cont_mask));

		/* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
		 * If that value is non-zero, then we found a bad byte somewhere! */
		if (_mm_movemask_epi8(bad)) {
			return false;
		}

		last_block = operand;
		p += sizeof(__m128i);
	}

finish_up_remaining_bytes:
	/* Finish up 1-15 remaining bytes
	 * We only get `p == e` the first time we arrive here; after the trailing bytes have
	 * been checked via `check_operand`, `p` has been advanced past `e`, so we skip this
	 * section and return true */
	if (p == e) {
		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m128i) - 1); /* Not including terminating null */

		/* Crazy hack here for the cases below which load from an address before `p`
		 * (5, 6, or 9-14 bytes remaining)...
		 * We want to use the above vectorized code to check a block of less than 16 bytes,
		 * but there is no good way to read a variable number of bytes into an XMM register
		 * However, we know that these bytes are part of a zend_string, and a zend_string has some
		 * 'header' fields which occupy the memory just before its content
		 * And, those header fields occupy more than 16 bytes...
		 * So if we go back 16 bytes from the end of the zend_string content, and load 16 bytes from there,
		 * we may pick up some 'junk' bytes from the zend_string header fields, but we will get the 1-15
		 * bytes we wanted in the tail end of our XMM register, and this will never cause a segfault.
		 * Then, we shift the register contents by whole bytes to get rid of the unwanted bytes
		 * (PSRLDQ is a right shift in terms of the 128-bit register value; it looks like a left
		 * shift if the bytes are written out in memory order)
		 * Conveniently, the same shift also zero-fills the tail end of the XMM register
		 *
		 * In those cases, the 16-byte load ends just after the terminating null byte, so the
		 * null byte is checked along with the trailing bytes
		 *
		 * The following `switch` looks useless, but it's not
		 * The PSRLDQ instruction used for the 128-bit byte shift requires an immediate (literal)
		 * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
		 */
		switch (remaining_bytes) {
		case 0: ;
			/* No trailing bytes; just make sure that the last full block did not end
			 * with the start of an incomplete multi-byte character */
			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
			return _mm_movemask_epi8(bad) == 0;
		case 1:
		case 2:
			/* Load 2 bytes (when only 1 byte remains, the 2nd is the terminating null) */
			operand = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, *((uint16_t*)p));
			goto check_operand;
		case 3:
		case 4:
			/* Load 4 bytes (when only 3 bytes remain, the 4th is the terminating null) */
			operand = _mm_set_epi32(0, 0, 0, *((uint32_t*)p));
			goto check_operand;
		case 5:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10);
			goto check_operand;
		case 6:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9);
			goto check_operand;
		case 7:
		case 8:
			/* Load 8 bytes (when only 7 bytes remain, the 8th is the terminating null) */
#ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
			operand = _mm_set_epi32(0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
#else
			operand = _mm_set_epi64x(0, *((uint64_t*)p));
#endif
			goto check_operand;
		case 9:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6);
			goto check_operand;
		case 10:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5);
			goto check_operand;
		case 11:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4);
			goto check_operand;
		case 12:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3);
			goto check_operand;
		case 13:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2);
			goto check_operand;
		case 14:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1);
			goto check_operand;
		case 15:
			/* No trailing bytes are left which need to be checked
			 * We get 15 because we did not include the terminating null when
			 * calculating `remaining_bytes`, so the value wraps around */
			return true;
		}

		ZEND_UNREACHABLE();
	}

	return true;
# else
	/* This UTF-8 validation function is derived from PCRE2 */
	size_t length = ZSTR_LEN(str);
	/* Table of the number of extra bytes, indexed by the first byte masked with
	   0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
	static const uint8_t utf8_table[] = {
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
		3,3,3,3,3,3,3,3
	};

	/* `length` counts the bytes which have not been consumed yet; `p` points at the
	 * first byte of the character being examined */
	for (; length > 0; p++) {
		uint32_t d;
		unsigned char c = *p;
		length--;

		if (c < 128) {
			/* ASCII character */
			continue;
		}

		if (c < 0xc0) {
			/* Isolated 10xx xxxx byte */
			return false;
		}

		if (c >= 0xf5) {
			/* Byte values 0xF5-FF can never start a character */
			return false;
		}

		uint32_t ab = utf8_table[c & 0x3f]; /* Number of additional bytes (1-3) */
		if (length < ab) {
			/* Missing bytes */
			return false;
		}
		length -= ab;

		/* Check top bits in the second byte */
		if (((d = *(++p)) & 0xc0) != 0x80) {
			return false;
		}

		/* For each length, check that the remaining bytes start with the 0x80 bit
		 * set and not the 0x40 bit. Then check for an overlong sequence, and for the
		 * excluded range 0xd800 to 0xdfff. */
		switch (ab) {
		case 1:
			/* 2-byte character. No further bytes to check for 0x80. Check first byte
			 * for xx00 000x (overlong sequence). */
			if ((c & 0x3e) == 0) {
				return false;
			}
			break;

		case 2:
			/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes for
			 * 1110 0000, xx0x xxxx (overlong sequence) or 1110 1101, 1010 xxxx (0xd800-0xdfff) */
			if ((*(++p) & 0xc0) != 0x80 || (c == 0xe0 && (d & 0x20) == 0) || (c == 0xed && d >= 0xa0)) {
				return false;
			}
			break;

		case 3:
			/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
			 * bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
			 * character greater than 0x0010ffff (f4 8f bf bf) */
			if ((*(++p) & 0xc0) != 0x80 || (*(++p) & 0xc0) != 0x80 || (c == 0xf0 && (d & 0x30) == 0) || (c > 0xf4 || (c == 0xf4 && d > 0x8f))) {
				return false;
			}
			break;

		EMPTY_SWITCH_DEFAULT_CASE();
		}
	}

	return true;
# endif
}
5190
5191 #endif /* #ifndef ZEND_INTRIN_AVX2_NATIVE */
5192
5193 #ifdef ZEND_INTRIN_AVX2_NATIVE
5194
5195 /* We are building AVX2-only binary */
5196 # include <immintrin.h>
5197 # define mb_fast_check_utf8 mb_fast_check_utf8_avx2
5198
5199 #elif defined(ZEND_INTRIN_AVX2_RESOLVER)
5200
5201 /* We are building binary which works with or without AVX2; whether or not to use
5202 * AVX2-accelerated functions will be determined at runtime */
5203 # include <immintrin.h>
5204 # include "Zend/zend_cpuinfo.h"
5205
5206 # ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
5207 /* Dynamic linker will decide whether or not to use AVX2-based functions and
5208 * resolve symbols accordingly */
5209
5210 ZEND_INTRIN_AVX2_FUNC_DECL(bool mb_fast_check_utf8_avx2(zend_string *str));
5211
5212 bool mb_fast_check_utf8(zend_string *str) __attribute__((ifunc("resolve_check_utf8")));
5213
5214 typedef bool (*check_utf8_func_t)(zend_string*);
5215
5216 ZEND_NO_SANITIZE_ADDRESS
5217 ZEND_ATTRIBUTE_UNUSED
resolve_check_utf8(void)5218 static check_utf8_func_t resolve_check_utf8(void)
5219 {
5220 if (zend_cpu_supports_avx2()) {
5221 return mb_fast_check_utf8_avx2;
5222 }
5223 return mb_fast_check_utf8_default;
5224 }
5225
5226 # else /* ZEND_INTRIN_AVX2_FUNC_PTR */
5227 /* We are compiling for a target where the dynamic linker will not be able to
5228 * resolve symbols according to whether the host supports AVX2 or not; so instead,
5229 * we can make calls go through a function pointer and set the function pointer
5230 * on module load */
5231
5232 #ifdef HAVE_FUNC_ATTRIBUTE_TARGET
5233 static bool mb_fast_check_utf8_avx2(zend_string *str) __attribute__((target("avx2")));
5234 #else
5235 static bool mb_fast_check_utf8_avx2(zend_string *str);
5236 #endif
5237
5238 static bool (*check_utf8_ptr)(zend_string *str) = NULL;
5239
/* Validate `str` as UTF-8 by forwarding to whichever implementation
 * `check_utf8_ptr` currently points at (it is assigned by `init_check_utf8`) */
static bool mb_fast_check_utf8(zend_string *str)
{
	return (*check_utf8_ptr)(str);
}
5244
init_check_utf8(void)5245 static void init_check_utf8(void)
5246 {
5247 if (zend_cpu_supports_avx2()) {
5248 check_utf8_ptr = mb_fast_check_utf8_avx2;
5249 } else {
5250 check_utf8_ptr = mb_fast_check_utf8_default;
5251 }
5252 }
5253 # endif
5254
5255 #else
5256
5257 /* No AVX2 support */
5258 #define mb_fast_check_utf8 mb_fast_check_utf8_default
5259
5260 #endif
5261
5262 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
5263
5264 /* GCC prior to version 8 does not define all intrinsics. See GH-11514.
5265 * Use a workaround from https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values */
5266 #if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 8
5267 # define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
5268 #endif
5269
/* Build a 256-bit value out of the last `shift` bytes of `hi`, followed by the
 * first (32 - `shift`) bytes of `lo` (speaking in terms of memory order)
 * This is used to take some number of trailing bytes from the previous 32-byte
 * block followed by some number of leading bytes from the current 32-byte block;
 * pass the previous block as `hi` and the current block as `lo`
 * `shift` must be a compile-time constant, since it becomes an instruction immediate
 *
 * _mm256_alignr_epi8 (VPALIGNR) is used to shift out bytes from a 256-bit
 * YMM register while shifting in bytes from another YMM register... but
 * it works separately on respective 128-bit halves of the YMM registers,
 * which is not what we want.
 * To make it work as desired, we first do _mm256_permute2x128_si256
 * (VPERM2I128) to combine the high 128 bits of the previous block and
 * the low 128 bits of the current block in one YMM register.
 * Then VPALIGNR will do what is needed.
 *
 * All macro arguments are parenthesized so that arbitrary expressions can be
 * passed without precedence surprises. */
#define _mm256_shift_epi8(hi, lo, shift) _mm256_alignr_epi8((lo), _mm256_permute2x128_si256((hi), (lo), 33), 16 - (shift))
5284
5285 /* AVX2-based UTF-8 validation function; validates text in 32-byte chunks
5286 *
 * Some parts of this function are the same as `mb_fast_check_utf8_default`; code comments
 * are not repeated, so consult `mb_fast_check_utf8_default` for information on uncommented
 * sections. */
5290 #ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
mb_fast_check_utf8_avx2(zend_string * str)5291 ZEND_API bool mb_fast_check_utf8_avx2(zend_string *str)
5292 #else
5293 static bool mb_fast_check_utf8_avx2(zend_string *str)
5294 #endif
5295 {
5296 unsigned char *p = (unsigned char*)ZSTR_VAL(str);
5297 unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m256i) - 1));
5298
5299 /* The algorithm used here for UTF-8 validation is partially adapted from the
5300 * paper "Validating UTF-8 In Less Than One Instruction Per Byte", by John Keiser
5301 * and Daniel Lemire.
5302 * Ref: https://arxiv.org/pdf/2010.03090.pdf
5303 *
5304 * Most types of invalid UTF-8 text can be detected by examining pairs of
5305 * successive bytes. Specifically:
5306 *
5307 * • Overlong 2-byte code units start with 0xC0 or 0xC1.
5308 * No valid UTF-8 string ever uses these byte values.
5309 * • Overlong 3-byte code units start with 0xE0, followed by a byte < 0xA0.
5310 * • Overlong 4-byte code units start with 0xF0, followed by a byte < 0x90.
5311 * • 5-byte or 6-byte code units, which should never be used, start with
5312 * 0xF8-FE.
5313 * • A codepoint value higher than U+10FFFF, which is the highest value for
5314 * any Unicode codepoint, would either start with 0xF4, followed by a
5315 * byte >= 0x90, or else would start with 0xF5-F7, followed by any value.
5316 * • A codepoint value from U+D800-DFFF, which are reserved and should never
5317 * be used, would start with 0xED, followed by a byte >= 0xA0.
5318 * • The byte value 0xFF is also illegal and is never used in valid UTF-8.
5319 *
5320 * To detect all these problems, for each pair of successive bytes, we do
5321 * table lookups using the high nibble of the first byte, the low nibble of
5322 * the first byte, and the high nibble of the second byte. Each table lookup
5323 * retrieves a bitmask, in which each 1 bit indicates a possible invalid
5324 * combination; AND those three bitmasks together, and any 1 bit in the result
5325 * will indicate an actual invalid byte combination was found.
5326 */
5327
5328 #define BAD_BYTE 0x1
5329 #define OVERLONG_2BYTE 0x2
5330 #define _1BYTE (BAD_BYTE | OVERLONG_2BYTE)
5331 #define OVERLONG_3BYTE 0x4
5332 #define SURROGATE 0x8
5333 #define OVERLONG_4BYTE 0x10
5334 #define INVALID_CP 0x20
5335
5336 /* Each of these are 16-entry tables, repeated twice; this is required by the
5337 * VPSHUFB instruction which we use to perform 32 table lookups in parallel
5338 * The first entry is for 0xF, the second is for 0xE, and so on down to 0x0
5339 *
5340 * So, for example, notice that the 4th entry in the 1st table is OVERLONG_2BYTE;
5341 * that means that high nibble 0xC is consistent with the byte pair being part of
5342 * an overlong 2-byte code unit */
5343 const __m256i bad_hi_nibble2 = _mm256_set_epi8(
5344 BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5345 0, 0, 0, 0,
5346 0, 0, 0, 0,
5347 0, 0, 0, 0,
5348 BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5349 0, 0, 0, 0,
5350 0, 0, 0, 0,
5351 0, 0, 0, 0);
5352 const __m256i bad_lo_nibble2 = _mm256_set_epi8(
5353 BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5354 BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5355 BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5356 0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5357 BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5358 BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5359 BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5360 0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5361 const __m256i bad_hi_nibble = _mm256_set_epi8(
5362 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5363 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5364 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5365 _1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5366 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5367 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5368 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5369 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5370 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5371 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5372 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5373 _1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5374 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5375 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5376 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5377 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5378
5379 const __m256i find_continuation = _mm256_set1_epi8(-64);
5380 const __m256i _b = _mm256_set1_epi8(0xB);
5381 const __m256i _d = _mm256_set1_epi8(0xD);
5382 const __m256i _f = _mm256_set1_epi8(0xF);
5383
5384 __m256i last_hi_nibbles = _mm256_setzero_si256(), last_lo_nibbles = _mm256_setzero_si256();
5385 __m256i operand;
5386
5387 while (p < e) {
5388 operand = _mm256_loadu_si256((__m256i*)p);
5389
5390 check_operand:
5391 if (!_mm256_movemask_epi8(operand)) {
5392 /* Entire 32-byte block is ASCII characters; the only thing we need to validate is that
5393 * the previous block didn't end with an incomplete multi-byte character
5394 * (This will also confirm that the previous block didn't end with a bad byte like 0xFF) */
5395 __m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5396 __m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5397 if (_mm256_movemask_epi8(bad)) {
5398 return false;
5399 }
5400
5401 /* Consume as many full blocks of single-byte characters as we can */
5402 while (true) {
5403 p += sizeof(__m256i);
5404 if (p >= e) {
5405 goto finish_up_remaining_bytes;
5406 }
5407 operand = _mm256_loadu_si256((__m256i*)p);
5408 if (_mm256_movemask_epi8(operand)) {
5409 break;
5410 }
5411 }
5412 }
5413
5414 __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(operand, 4), _f);
5415 __m256i lo_nibbles = _mm256_and_si256(operand, _f);
5416
5417 __m256i lo_nibbles2 = _mm256_shift_epi8(last_lo_nibbles, lo_nibbles, 1);
5418 __m256i hi_nibbles2 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 1);
5419
5420 /* Do parallel table lookups in all 3 tables */
5421 __m256i bad = _mm256_cmpgt_epi8(
5422 _mm256_and_si256(
5423 _mm256_and_si256(
5424 _mm256_shuffle_epi8(bad_lo_nibble2, lo_nibbles2),
5425 _mm256_shuffle_epi8(bad_hi_nibble2, hi_nibbles2)),
5426 _mm256_shuffle_epi8(bad_hi_nibble, hi_nibbles)),
5427 _mm256_setzero_si256());
5428
5429 __m256i cont_mask = _mm256_cmpgt_epi8(hi_nibbles2, _b);
5430 __m256i hi_nibbles3 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 2);
5431 cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpgt_epi8(hi_nibbles3, _d));
5432 __m256i hi_nibbles4 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 3);
5433 cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpeq_epi8(hi_nibbles4, _f));
5434
5435 __m256i continuation = _mm256_cmpgt_epi8(find_continuation, operand);
5436 bad = _mm256_or_si256(bad, _mm256_xor_si256(continuation, cont_mask));
5437
5438 if (_mm256_movemask_epi8(bad)) {
5439 return false;
5440 }
5441
5442 last_hi_nibbles = hi_nibbles;
5443 last_lo_nibbles = lo_nibbles;
5444 p += sizeof(__m256i);
5445 }
5446
5447 finish_up_remaining_bytes:
5448 if (p == e) {
5449 uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m256i) - 1); /* Not including terminating null */
5450
5451 switch (remaining_bytes) {
5452 case 0: ;
5453 /* No actual data bytes are remaining */
5454 __m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5455 __m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5456 return _mm256_movemask_epi8(bad) == 0;
5457 case 1:
5458 case 2:
5459 operand = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, *((int16_t*)p));
5460 goto check_operand;
5461 case 3:
5462 case 4:
5463 operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, *((int32_t*)p));
5464 goto check_operand;
5465 case 5:
5466 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10));
5467 goto check_operand;
5468 case 6:
5469 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9));
5470 goto check_operand;
5471 case 7:
5472 case 8:
5473 #ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
5474 operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
5475 #else
5476 operand = _mm256_set_epi64x(0, 0, 0, *((int64_t*)p));
5477 #endif
5478 goto check_operand;
5479 case 9:
5480 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6));
5481 goto check_operand;
5482 case 10:
5483 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5));
5484 goto check_operand;
5485 case 11:
5486 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4));
5487 goto check_operand;
5488 case 12:
5489 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3));
5490 goto check_operand;
5491 case 13:
5492 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2));
5493 goto check_operand;
5494 case 14:
5495 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1));
5496 goto check_operand;
5497 case 15:
5498 case 16:
5499 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_loadu_si128((__m128i*)p));
5500 goto check_operand;
5501 case 17:
5502 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 2)), 14), _mm_loadu_si128((__m128i*)p));
5503 goto check_operand;
5504 case 18:
5505 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 3)), 13), _mm_loadu_si128((__m128i*)p));
5506 goto check_operand;
5507 case 19:
5508 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 4)), 12), _mm_loadu_si128((__m128i*)p));
5509 goto check_operand;
5510 case 20:
5511 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 5)), 11), _mm_loadu_si128((__m128i*)p));
5512 goto check_operand;
5513 case 21:
5514 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 6)), 10), _mm_loadu_si128((__m128i*)p));
5515 goto check_operand;
5516 case 22:
5517 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 7)), 9), _mm_loadu_si128((__m128i*)p));
5518 goto check_operand;
5519 case 23:
5520 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 8)), 8), _mm_loadu_si128((__m128i*)p));
5521 goto check_operand;
5522 case 24:
5523 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 9)), 7), _mm_loadu_si128((__m128i*)p));
5524 goto check_operand;
5525 case 25:
5526 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 10)), 6), _mm_loadu_si128((__m128i*)p));
5527 goto check_operand;
5528 case 26:
5529 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 11)), 5), _mm_loadu_si128((__m128i*)p));
5530 goto check_operand;
5531 case 27:
5532 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 12)), 4), _mm_loadu_si128((__m128i*)p));
5533 goto check_operand;
5534 case 28:
5535 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 13)), 3), _mm_loadu_si128((__m128i*)p));
5536 goto check_operand;
5537 case 29:
5538 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 14)), 2), _mm_loadu_si128((__m128i*)p));
5539 goto check_operand;
5540 case 30:
5541 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 15)), 1), _mm_loadu_si128((__m128i*)p));
5542 goto check_operand;
5543 case 31:
5544 return true;
5545 }
5546
5547 ZEND_UNREACHABLE();
5548 }
5549
5550 return true;
5551 }
5552
5553 #endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
5554
/* Report whether `str` is valid in `encoding`.
 *
 * For UTF-8, a string for which ZSTR_IS_VALID_UTF8() already holds is accepted without
 * rescanning; otherwise the string is scanned with mb_fast_check_utf8(), and a successful
 * result is recorded on the string via IS_STR_VALID_UTF8 (interned strings are left
 * untouched). All other encodings are delegated to php_mb_check_encoding(). */
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
{
	if (encoding != &mbfl_encoding_utf8) {
		return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
	}

	if (ZSTR_IS_VALID_UTF8(str)) {
		return true;
	}

	bool is_valid = mb_fast_check_utf8(str);
	if (is_valid && !ZSTR_IS_INTERNED(str)) {
		GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
	}
	return is_valid;
}
5570
/* Verify that every string key and every string value stored in `vars` is valid in
 * `encoding`, descending into nested arrays.
 *
 * Integers, floats, null and booleans are accepted as they are; any other value type
 * makes the result false. If `vars` is already flagged as being traversed, an E_WARNING
 * is raised and false is returned. */
static bool php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	zend_long num_key;
	zend_string *str_key;
	zval *value;
	bool all_valid = true;

	(void)(num_key); /* Only written by the iteration macro; silence "unused" warnings */

	if (GC_IS_RECURSIVE(vars)) {
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return false;
	}

	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_KEY_VAL(vars, num_key, str_key, value) {
		ZVAL_DEREF(value);

		/* An invalid string key ends the traversal right away */
		if (str_key && !mb_check_str_encoding(str_key, encoding)) {
			all_valid = false;
			break;
		}

		/* Every `break` below only leaves the switch, so the remaining entries are
		 * still visited after an invalid value has been found */
		switch (Z_TYPE_P(value)) {
			case IS_STRING:
				if (!mb_check_str_encoding(Z_STR_P(value), encoding)) {
					all_valid = false;
				}
				break;
			case IS_ARRAY:
				if (!php_mb_check_encoding_recursive(Z_ARRVAL_P(value), encoding)) {
					all_valid = false;
				}
				break;
			case IS_LONG:
			case IS_DOUBLE:
			case IS_NULL:
			case IS_TRUE:
			case IS_FALSE:
				/* Scalars without string content need no validation */
				break;
			default:
				/* Any other type is treated as an error */
				all_valid = false;
				break;
		}
	} ZEND_HASH_FOREACH_END();
	GC_TRY_UNPROTECT_RECURSION(vars);

	return all_valid;
}
5621
5622 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)5623 PHP_FUNCTION(mb_check_encoding)
5624 {
5625 zend_string *input_str = NULL, *enc = NULL;
5626 HashTable *input_ht = NULL;
5627 const mbfl_encoding *encoding;
5628
5629 ZEND_PARSE_PARAMETERS_START(0, 2)
5630 Z_PARAM_OPTIONAL
5631 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
5632 Z_PARAM_STR_OR_NULL(enc)
5633 ZEND_PARSE_PARAMETERS_END();
5634
5635 encoding = php_mb_get_encoding(enc, 2);
5636 if (!encoding) {
5637 RETURN_THROWS();
5638 }
5639
5640 if (input_ht) {
5641 RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
5642 } else if (input_str) {
5643 RETURN_BOOL(mb_check_str_encoding(input_str, encoding));
5644 } else {
5645 php_error_docref(NULL, E_DEPRECATED,
5646 "Calling mb_check_encoding() without argument is deprecated");
5647
5648 /* FIXME: Actually check all inputs, except $_FILES file content. */
5649 RETURN_BOOL(MBSTRG(illegalchars) == 0);
5650 }
5651 }
5652 /* }}} */
5653
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)5654 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
5655 const uint32_t enc_name_arg_num)
5656 {
5657 const mbfl_encoding *enc;
5658 enum mbfl_no_encoding no_enc;
5659
5660 ZEND_ASSERT(str_len > 0);
5661
5662 enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
5663 if (!enc) {
5664 return -2;
5665 }
5666
5667 no_enc = enc->no_encoding;
5668 if (php_mb_is_unsupported_no_encoding(no_enc)) {
5669 zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
5670 return -2;
5671 }
5672
5673 /* Some legacy text encodings have a minimum required wchar buffer size;
5674 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
5675 uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
5676 unsigned int state = 0;
5677 size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
5678 ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
5679
5680 if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
5681 return -1;
5682 }
5683 return wchar_buf[0];
5684 }
5685
5686 /* {{{ */
PHP_FUNCTION(mb_ord)5687 PHP_FUNCTION(mb_ord)
5688 {
5689 char *str;
5690 size_t str_len;
5691 zend_string *enc = NULL;
5692 zend_long cp;
5693
5694 ZEND_PARSE_PARAMETERS_START(1, 2)
5695 Z_PARAM_STRING(str, str_len)
5696 Z_PARAM_OPTIONAL
5697 Z_PARAM_STR_OR_NULL(enc)
5698 ZEND_PARSE_PARAMETERS_END();
5699
5700 if (str_len == 0) {
5701 zend_argument_must_not_be_empty_error(1);
5702 RETURN_THROWS();
5703 }
5704
5705 cp = php_mb_ord(str, str_len, enc, 2);
5706
5707 if (0 > cp) {
5708 if (cp == -2) {
5709 RETURN_THROWS();
5710 }
5711 RETURN_FALSE;
5712 }
5713
5714 RETURN_LONG(cp);
5715 }
5716 /* }}} */
5717
/* Encode code point `cp` as a string in the encoding named by `enc_name`.
 *
 * Returns NULL when the encoding cannot be resolved or is unsupported (a ValueError is
 * raised here for the unsupported case), when `cp` lies outside 0..0x10FFFF, when the
 * target is UTF-8 and `cp` is a surrogate, or when the conversion reports illegal
 * characters. UTF-8 output is assembled by hand; every other encoding goes through a
 * conversion from a 4-byte big-endian UCS-4 buffer. */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (enc == NULL) {
		return NULL;
	}

	enum mbfl_no_encoding no_enc = enc->no_encoding;
	if (php_mb_is_unsupported_no_encoding(no_enc)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", enc->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	if (php_mb_is_no_encoding_utf8(no_enc)) {
		/* Surrogate code points (U+D800..U+DFFF) are rejected for UTF-8 */
		if (cp >= 0xd800 && cp <= 0xdfff) {
			return NULL;
		}

		if (cp < 0x80) {
			return ZSTR_CHAR(cp);
		}

		zend_string *utf8;
		char *out;
		if (cp < 0x800) {
			utf8 = zend_string_alloc(2, 0);
			out = ZSTR_VAL(utf8);
			out[0] = 0xc0 | (cp >> 6);
			out[1] = 0x80 | (cp & 0x3f);
			out[2] = 0;
		} else if (cp < 0x10000) {
			utf8 = zend_string_alloc(3, 0);
			out = ZSTR_VAL(utf8);
			out[0] = 0xe0 | (cp >> 12);
			out[1] = 0x80 | ((cp >> 6) & 0x3f);
			out[2] = 0x80 | (cp & 0x3f);
			out[3] = 0;
		} else {
			utf8 = zend_string_alloc(4, 0);
			out = ZSTR_VAL(utf8);
			out[0] = 0xf0 | (cp >> 18);
			out[1] = 0x80 | ((cp >> 12) & 0x3f);
			out[2] = 0x80 | ((cp >> 6) & 0x3f);
			out[3] = 0x80 | (cp & 0x3f);
			out[4] = 0;
		}
		return utf8;
	}

	/* Serialize the code point as big-endian UCS-4 and convert it to the target encoding */
	char ucs4be[4];
	ucs4be[0] = (cp >> 24) & 0xff;
	ucs4be[1] = (cp >> 16) & 0xff;
	ucs4be[2] = (cp >> 8) & 0xff;
	ucs4be[3] = cp & 0xff;

	/* Zero the global illegal character counter for the duration of the conversion so
	 * that a failure of this particular conversion can be detected; restore it after */
	long orig_illegalchars = MBSTRG(illegalchars);
	MBSTRG(illegalchars) = 0;
	zend_string *converted = php_mb_convert_encoding_ex(ucs4be, 4, enc, &mbfl_encoding_ucs4be);

	if (MBSTRG(illegalchars) != 0) {
		zend_string_release(converted);
		converted = NULL;
	}

	MBSTRG(illegalchars) = orig_illegalchars;
	return converted;
}
5787
5788 /* {{{ */
PHP_FUNCTION(mb_chr)5789 PHP_FUNCTION(mb_chr)
5790 {
5791 zend_long cp;
5792 zend_string *enc = NULL;
5793
5794 ZEND_PARSE_PARAMETERS_START(1, 2)
5795 Z_PARAM_LONG(cp)
5796 Z_PARAM_OPTIONAL
5797 Z_PARAM_STR_OR_NULL(enc)
5798 ZEND_PARSE_PARAMETERS_END();
5799
5800 zend_string* ret = php_mb_chr(cp, enc, 2);
5801 if (ret == NULL) {
5802 RETURN_FALSE;
5803 }
5804
5805 RETURN_STR(ret);
5806 }
5807 /* }}} */
5808
PHP_FUNCTION(mb_str_pad)5809 PHP_FUNCTION(mb_str_pad)
5810 {
5811 zend_string *input, *encoding_str = NULL, *pad = ZSTR_CHAR(' ');
5812 zend_long pad_to_length;
5813 zend_long pad_type_val = PHP_STR_PAD_RIGHT;
5814
5815 ZEND_PARSE_PARAMETERS_START(2, 5)
5816 Z_PARAM_STR(input)
5817 Z_PARAM_LONG(pad_to_length)
5818 Z_PARAM_OPTIONAL
5819 Z_PARAM_STR(pad)
5820 Z_PARAM_LONG(pad_type_val)
5821 Z_PARAM_STR_OR_NULL(encoding_str)
5822 ZEND_PARSE_PARAMETERS_END();
5823
5824 const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
5825 if (!encoding) {
5826 RETURN_THROWS();
5827 }
5828
5829 size_t input_length = mb_get_strlen(input, encoding);
5830
5831 /* If resulting string turns out to be shorter than input string,
5832 we simply copy the input and return. */
5833 if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
5834 RETURN_STR_COPY(input);
5835 }
5836
5837 if (ZSTR_LEN(pad) == 0) {
5838 zend_argument_must_not_be_empty_error(3);
5839 RETURN_THROWS();
5840 }
5841
5842 if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
5843 zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
5844 RETURN_THROWS();
5845 }
5846
5847 size_t pad_length = mb_get_strlen(pad, encoding);
5848
5849 size_t num_mb_pad_chars = pad_to_length - input_length;
5850
5851 /* We need to figure out the left/right padding lengths. */
5852 size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
5853 switch (pad_type_val) {
5854 case PHP_STR_PAD_RIGHT:
5855 right_pad = num_mb_pad_chars;
5856 break;
5857
5858 case PHP_STR_PAD_LEFT:
5859 left_pad = num_mb_pad_chars;
5860 break;
5861
5862 case PHP_STR_PAD_BOTH:
5863 left_pad = num_mb_pad_chars / 2;
5864 right_pad = num_mb_pad_chars - left_pad;
5865 break;
5866 }
5867
5868 /* How many full block copies need to happen, and how many characters are then left over? */
5869 size_t full_left_pad_copies = left_pad / pad_length;
5870 size_t full_right_pad_copies = right_pad / pad_length;
5871 size_t remaining_left_pad_chars = left_pad % pad_length;
5872 size_t remaining_right_pad_chars = right_pad % pad_length;
5873
5874 if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
5875 goto overflow_no_release;
5876 }
5877
5878 /* Compute the number of bytes required for the padding */
5879 size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
5880 size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);
5881
5882 /* No special fast-path handling necessary for zero-length pads because these functions will not
5883 * allocate memory in case a zero-length pad is required. */
5884 zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
5885 zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);
5886
5887 if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
5888 || full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
5889 goto overflow;
5890 }
5891
5892 size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
5893 size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);
5894
5895 if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
5896 || ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
5897 goto overflow;
5898 }
5899
5900 zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
5901 char *buffer = ZSTR_VAL(result);
5902
5903 /* First we pad the left. */
5904 for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5905 memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5906 }
5907 memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
5908 buffer += ZSTR_LEN(remaining_left_pad_str);
5909
5910 /* Then we copy the input string. */
5911 memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
5912 buffer += ZSTR_LEN(input);
5913
5914 /* Finally, we pad on the right. */
5915 for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5916 memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5917 }
5918 memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));
5919
5920 ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';
5921
5922 zend_string_release_ex(remaining_left_pad_str, false);
5923 zend_string_release_ex(remaining_right_pad_str, false);
5924
5925 RETURN_NEW_STR(result);
5926
5927 overflow:
5928 zend_string_release_ex(remaining_left_pad_str, false);
5929 zend_string_release_ex(remaining_right_pad_str, false);
5930 overflow_no_release:
5931 zend_throw_error(NULL, "String size overflow");
5932 RETURN_THROWS();
5933 }
5934
5935 /* {{{ */
PHP_FUNCTION(mb_scrub)5936 PHP_FUNCTION(mb_scrub)
5937 {
5938 zend_string *str, *enc_name = NULL;
5939
5940 ZEND_PARSE_PARAMETERS_START(1, 2)
5941 Z_PARAM_STR(str)
5942 Z_PARAM_OPTIONAL
5943 Z_PARAM_STR_OR_NULL(enc_name)
5944 ZEND_PARSE_PARAMETERS_END();
5945
5946 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
5947 if (!enc) {
5948 RETURN_THROWS();
5949 }
5950
5951 if (enc == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
5952 /* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
5953 RETURN_STR_COPY(str);
5954 }
5955
5956 RETURN_STR(php_mb_convert_encoding_ex(ZSTR_VAL(str), ZSTR_LEN(str), enc, enc));
5957 }
5958 /* }}} */
5959
5960 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)5961 static void php_mb_populate_current_detect_order_list(void)
5962 {
5963 const mbfl_encoding **entry = 0;
5964 size_t nentries;
5965
5966 if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
5967 nentries = MBSTRG(detect_order_list_size);
5968 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5969 memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
5970 } else {
5971 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
5972 size_t i;
5973 nentries = MBSTRG(default_detect_order_list_size);
5974 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5975 for (i = 0; i < nentries; i++) {
5976 entry[i] = mbfl_no2encoding(src[i]);
5977 }
5978 }
5979 MBSTRG(current_detect_order_list) = entry;
5980 MBSTRG(current_detect_order_list_size) = nentries;
5981 }
5982 /* }}} */
5983
5984 /* {{{ static int php_mb_encoding_translation() */
php_mb_encoding_translation(void)5985 static int php_mb_encoding_translation(void)
5986 {
5987 return MBSTRG(encoding_translation);
5988 }
5989 /* }}} */
5990
php_mb_mbchar_bytes(const char * s,const mbfl_encoding * enc)5991 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
5992 {
5993 if (enc) {
5994 if (enc->mblen_table) {
5995 if (s) {
5996 return enc->mblen_table[*(unsigned char *)s];
5997 }
5998 } else if (enc->flag & MBFL_ENCTYPE_WCS2) {
5999 return 2;
6000 } else if (enc->flag & MBFL_ENCTYPE_WCS4) {
6001 return 4;
6002 }
6003 }
6004 return 1;
6005 }
6006
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)6007 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
6008 {
6009 const char *p = s;
6010 char *last=NULL;
6011
6012 if (nbytes == (size_t)-1) {
6013 size_t nb = 0;
6014
6015 while (*p != '\0') {
6016 if (nb == 0) {
6017 if ((unsigned char)*p == (unsigned char)c) {
6018 last = (char *)p;
6019 }
6020 nb = php_mb_mbchar_bytes(p, enc);
6021 if (nb == 0) {
6022 return NULL; /* something is going wrong! */
6023 }
6024 }
6025 --nb;
6026 ++p;
6027 }
6028 } else {
6029 size_t bcnt = nbytes;
6030 size_t nbytes_char;
6031 while (bcnt > 0) {
6032 if ((unsigned char)*p == (unsigned char)c) {
6033 last = (char *)p;
6034 }
6035 nbytes_char = php_mb_mbchar_bytes(p, enc);
6036 if (bcnt < nbytes_char) {
6037 return NULL;
6038 }
6039 p += nbytes_char;
6040 bcnt -= nbytes_char;
6041 }
6042 }
6043 return last;
6044 }
6045
php_mb_stripos(bool mode,zend_string * haystack,zend_string * needle,zend_long offset,const mbfl_encoding * enc)6046 MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc)
6047 {
6048 /* We're using simple case-folding here, because we'd have to deal with remapping of
6049 * offsets otherwise. */
6050 zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
6051 zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
6052
6053 size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
6054
6055 zend_string_free(haystack_conv);
6056 zend_string_free(needle_conv);
6057
6058 return n;
6059 }
6060
php_mb_gpc_get_detect_order(const zend_encoding *** list,size_t * list_size)6061 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
6062 {
6063 *list = (const zend_encoding **)MBSTRG(http_input_list);
6064 *list_size = MBSTRG(http_input_list_size);
6065 }
6066 /* }}} */
6067
php_mb_gpc_set_input_encoding(const zend_encoding * encoding)6068 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
6069 {
6070 MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
6071 }
6072 /* }}} */
6073
/* Base64 alphabet: index 0-25 are 'A'-'Z', 26-51 are 'a'-'z', 52-61 are '0'-'9',
 * 62 is '+' and 63 is '/'. The string literal's terminating NUL supplies the 65th
 * entry (0x00), so the array has 65 bytes in total. */
static const unsigned char base64_table[] =
	"ABCDEFGHIJKLM"
	"NOPQRSTUVWXYZ"
	"abcdefghijklm"
	"nopqrstuvwxyz"
	"0123456789+/";
6086
/* Compute how many bytes the content currently held in `tmpbuf` will occupy once it is
 * transfer-encoded.
 *
 * Base64 yields 4 output bytes for every (started) group of 3 input bytes. Otherwise
 * each byte counts as 3 when it is above 0x7F, is '=', or is flagged in
 * mime_char_needs_qencode, and as 1 in all other cases. */
static size_t transfer_encoded_size(mb_convert_buf *tmpbuf, bool base64)
{
	if (base64) {
		return ((mb_convert_buf_len(tmpbuf) + 2) / 3) * 4;
	}

	size_t total = 0;
	for (unsigned char *cur = (unsigned char*)ZSTR_VAL(tmpbuf->str); cur < tmpbuf->out; cur++) {
		unsigned char c = *cur;
		if (c > 0x7F || c == '=' || mime_char_needs_qencode[c]) {
			total += 3;
		} else {
			total += 1;
		}
	}
	return total;
}
6101
/* Transfer-encode the bytes currently held in `tmpbuf` and append the result to `outbuf`.
 *
 * With `base64` set, the bytes are emitted as Base64 with '=' padding. Otherwise, every
 * byte which is above 0x7F, is '=', or is flagged in mime_char_needs_qencode is written
 * as '=' followed by two uppercase hex digits, and all other bytes are copied verbatim.
 * Afterwards mb_convert_buf_reset(tmpbuf, 0) is called on the temporary buffer. */
static void transfer_encode_mime_bytes(mb_convert_buf *tmpbuf, mb_convert_buf *outbuf, bool base64)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(outbuf, out, limit);

	unsigned char *src = (unsigned char*)ZSTR_VAL(tmpbuf->str);
	unsigned char *src_end = tmpbuf->out;

	if (!base64) {
		static const char hex_digits[] = "0123456789ABCDEF";

		/* Worst case: every input byte becomes a 3-byte escape */
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, (src_end - src) * 3);
		for (; src < src_end; src++) {
			unsigned char c = *src;
			if (c > 0x7F || c == '=' || mime_char_needs_qencode[c]) {
				out = mb_convert_buf_add3(out, '=', hex_digits[(c >> 4) & 0xF], hex_digits[c & 0xF]);
			} else {
				out = mb_convert_buf_add(out, c);
			}
		}
	} else {
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, ((src_end - src) + 2) / 3 * 4);

		/* Complete 3-byte groups map to 4 Base64 characters each */
		for (; (src_end - src) >= 3; src += 3) {
			uint32_t bits = (src[0] << 16) | (src[1] << 8) | src[2];
			out = mb_convert_buf_add4(out,
				base64_table[(bits >> 18) & 0x3F],
				base64_table[(bits >> 12) & 0x3F],
				base64_table[(bits >> 6) & 0x3F],
				base64_table[bits & 0x3F]);
		}

		/* A trailing group of 1 or 2 bytes is padded with '=' */
		if ((src_end - src) == 1) {
			uint32_t bits = src[0];
			out = mb_convert_buf_add4(out,
				base64_table[(bits >> 2) & 0x3F],
				base64_table[(bits & 0x3) << 4],
				'=', '=');
		} else if (src != src_end) {
			uint32_t bits = (src[0] << 8) | src[1];
			out = mb_convert_buf_add4(out,
				base64_table[(bits >> 10) & 0x3F],
				base64_table[(bits >> 4) & 0x3F],
				base64_table[(bits & 0xF) << 2],
				'=');
		}
	}

	mb_convert_buf_reset(tmpbuf, 0);
	MB_CONVERT_BUF_STORE(outbuf, out, limit);
}
6147
/* Number of wchar (uint32_t) slots in the decoding buffer used by mb_mime_header_encode() */
#define MBSTRING_HEADER_ENC_WCHAR_BUFSIZE 90
6149
mb_mime_header_encode(zend_string * input,const mbfl_encoding * incode,const mbfl_encoding * outcode,bool base64,char * linefeed,size_t linefeed_len,zend_long indent)6150 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent)
6151 {
6152 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
6153 size_t in_len = ZSTR_LEN(input);
6154
6155 ZEND_ASSERT(outcode->mime_name != NULL);
6156 ZEND_ASSERT(outcode->mime_name[0] != '\0');
6157
6158 if (!in_len) {
6159 return zend_empty_string;
6160 }
6161
6162 if (indent < 0 || indent >= 74) {
6163 indent = 0;
6164 }
6165
6166 if (linefeed_len > 8) {
6167 linefeed_len = 8;
6168 }
6169 /* Maintain legacy behavior as regards embedded NUL (zero) bytes in linefeed string */
6170 for (size_t i = 0; i < linefeed_len; i++) {
6171 if (linefeed[i] == '\0') {
6172 linefeed_len = i;
6173 break;
6174 }
6175 }
6176
6177 unsigned int state = 0;
6178 /* wchar_buf should be big enough that when it is full, we definitely have enough
6179 * wchars to fill an entire line of output */
6180 uint32_t wchar_buf[MBSTRING_HEADER_ENC_WCHAR_BUFSIZE];
6181 uint32_t *p, *e;
6182 /* What part of wchar_buf is filled with still-unprocessed data which should not
6183 * be overwritten? */
6184 unsigned int offset = 0;
6185 size_t line_start = 0;
6186
6187 /* If the entire input string is ASCII with no spaces (except possibly leading
6188 * spaces), just pass it through unchanged */
6189 bool checking_leading_spaces = true;
6190 while (in_len) {
6191 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE, &state);
6192 p = wchar_buf;
6193 e = wchar_buf + out_len;
6194
6195 while (p < e) {
6196 uint32_t w = *p++;
6197 if (checking_leading_spaces) {
6198 if (w == ' ') {
6199 continue;
6200 } else {
6201 checking_leading_spaces = false;
6202 }
6203 }
6204 if (w < 0x21 || w > 0x7E || w == '=' || w == '?' || w == '_') {
6205 /* We cannot simply pass input string through unchanged; start again */
6206 in = (unsigned char*)ZSTR_VAL(input);
6207 in_len = ZSTR_LEN(input);
6208 goto no_passthrough;
6209 }
6210 }
6211 }
6212
6213 return zend_string_copy(input); /* This just increments refcount */
6214
6215 no_passthrough: ;
6216
6217 mb_convert_buf buf;
6218 mb_convert_buf_init(&buf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6219
6220 /* Encode some prefix of the input string as plain ASCII if possible
6221 * If we find it necessary to switch to Base64/QPrint encoding, we will
6222 * do so all the way to the end of the string */
6223 while (in_len) {
6224 /* Decode part of the input string, refill wchar_buf */
6225 ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
6226 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
6227 ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
6228 p = wchar_buf;
6229 e = wchar_buf + offset + out_len;
6230 /* ASCII output is broken into space-delimited 'words'
6231 * If we find a non-ASCII character in the middle of a word, we will
6232 * transfer-encode the entire word */
6233 uint32_t *word_start = p;
6234
6235 /* Don't consider adding line feed for spaces at the beginning of a word */
6236 while (p < e && *p == ' ' && (p - word_start) <= 74) {
6237 p++;
6238 }
6239
6240 while (p < e) {
6241 uint32_t w = *p++;
6242
6243 if (w < 0x20 || w > 0x7E || w == '?' || w == '=' || w == '_' || (w == ' ' && (p - word_start) > 74)) {
6244 /* Non-ASCII character (or line too long); switch to Base64/QPrint encoding
6245 * If we are already too far along on a line to include Base64/QPrint encoded data
6246 * on the same line (without overrunning max line length), then add a line feed
6247 * right now */
6248 feed_and_mime_encode:
6249 if (mb_convert_buf_len(&buf) - line_start + indent + strlen(outcode->mime_name) > 55) {
6250 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6251 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6252 buf.out = mb_convert_buf_add(buf.out, ' ');
6253 indent = 0;
6254 line_start = mb_convert_buf_len(&buf);
6255 } else if (mb_convert_buf_len(&buf) > 0) {
6256 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 1);
6257 buf.out = mb_convert_buf_add(buf.out, ' ');
6258 }
6259 p = word_start; /* Back up to where MIME encoding of input chars should start */
6260 goto mime_encoding_needed;
6261 } else if (w == ' ') {
6262 /* When we see a space, check whether we should insert a line break */
6263 if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 75) {
6264 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6265 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6266 buf.out = mb_convert_buf_add(buf.out, ' ');
6267 indent = 0;
6268 line_start = mb_convert_buf_len(&buf);
6269 } else if (mb_convert_buf_len(&buf) > 0) {
6270 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6271 buf.out = mb_convert_buf_add(buf.out, ' ');
6272 }
6273 /* Output one (space-delimited) word as plain ASCII */
6274 while (word_start < p-1) {
6275 buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6276 }
6277 word_start++;
6278 while (p < e && *p == ' ') {
6279 p++;
6280 }
6281 }
6282 }
6283
6284 if (in_len) {
6285 /* Copy chars which are part of an incomplete 'word' to the beginning
6286 * of wchar_buf and reprocess them on the next iteration.
6287 * But first make sure that the incomplete 'word' isn't so big that
6288 * there will be no space to add any more decoded wchars in the buffer
6289 * (which could lead to an infinite loop) */
6290 if ((word_start - wchar_buf) < MBSTRING_MIN_WCHAR_BUFSIZE) {
6291 goto feed_and_mime_encode;
6292 }
6293 offset = e - word_start;
6294 if (offset) {
6295 memmove(wchar_buf, word_start, offset * sizeof(uint32_t));
6296 }
6297 } else {
6298 /* We have reached the end of the input string while still in 'ASCII mode';
6299 * process any trailing ASCII chars which were not followed by a space */
6300 if (word_start < e && mb_convert_buf_len(&buf) > 0) {
6301 /* The whole input string was not just one big ASCII 'word' with no spaces
6302 * consider adding a line feed if necessary to prevent output lines from
6303 * being too long */
6304 if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 74) {
6305 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6306 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6307 buf.out = mb_convert_buf_add(buf.out, ' ');
6308 } else {
6309 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6310 buf.out = mb_convert_buf_add(buf.out, ' ');
6311 }
6312 }
6313 while (word_start < e) {
6314 buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6315 }
6316 }
6317 }
6318
6319 /* Ensure output string is marked as valid UTF-8 (ASCII strings are always 'valid UTF-8') */
6320 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6321
6322 mime_encoding_needed: ;
6323
6324 /* We will generate the output line by line, first converting wchars to bytes
6325 * in the requested output encoding, then transfer-encoding those bytes as
6326 * Base64 or QPrint
6327 * 'tmpbuf' will receive the bytes which need to be transfer-encoded before
6328 * sending them to 'buf' */
6329 mb_convert_buf tmpbuf;
6330 mb_convert_buf_init(&tmpbuf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6331
6332 /* Do we need to refill wchar_buf to make sure we don't run out of wchars
6333 * in the middle of a line? */
6334 offset = e - p;
6335 if (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset < MBSTRING_MIN_WCHAR_BUFSIZE) {
6336 goto start_new_line;
6337 }
6338 memmove(wchar_buf, p, offset * sizeof(uint32_t));
6339
6340 while(true) {
6341 refill_wchar_buf: ;
6342 ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
6343 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
6344 ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
6345 p = wchar_buf;
6346 e = wchar_buf + offset + out_len;
6347
6348 start_new_line: ;
6349 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, strlen(outcode->mime_name) + 5);
6350 buf.out = mb_convert_buf_add2(buf.out, '=', '?');
6351 buf.out = mb_convert_buf_appends(buf.out, outcode->mime_name);
6352 buf.out = mb_convert_buf_add3(buf.out, '?', base64 ? 'B' : 'Q', '?');
6353
6354 /* How many wchars should we try converting to Base64/QPrint-encoded bytes?
6355 * We do something like a 'binary search' to find the greatest number which
6356 * can be included on this line without exceeding max line length */
6357 unsigned int n = 12;
6358 size_t space_available = 73 - indent - (mb_convert_buf_len(&buf) - line_start);
6359
6360 while (true) {
6361 ZEND_ASSERT(p < e);
6362
6363 /* Remember where we were in process of generating output, so we can back
6364 * up if necessary */
6365 size_t tmppos = mb_convert_buf_len(&tmpbuf);
6366 unsigned int tmpstate = tmpbuf.state;
6367
6368 /* Try encoding 'n' wchars in output text encoding and sending output
6369 * bytes to 'tmpbuf'. Hopefully this is not too many to fit on the
6370 * current line. */
6371 n = MIN(n, e - p);
6372 outcode->from_wchar(p, n, &tmpbuf, false);
6373
6374 /* For some output text encodings, there may be a few ending bytes
6375 * which need to be emitted to output before we break a line.
6376 * Again, remember where we were so we can back up */
6377 size_t tmppos2 = mb_convert_buf_len(&tmpbuf);
6378 unsigned int tmpstate2 = tmpbuf.state;
6379 outcode->from_wchar(NULL, 0, &tmpbuf, true);
6380
6381 if (transfer_encoded_size(&tmpbuf, base64) <= space_available || (n == 1 && tmppos == 0)) {
6382 /* If we convert 'n' more wchars on the current line, it will not
6383 * overflow the maximum line length */
6384 p += n;
6385
6386 if (p == e) {
6387 /* We are done; we shouldn't reach here if there is more remaining
6388 * of the input string which needs to be processed */
6389 ZEND_ASSERT(!in_len);
6390 transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6391 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 2);
6392 buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6393 mb_convert_buf_free(&tmpbuf);
6394 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6395 } else {
6396 /* It's possible that more chars might fit on the current line,
6397 * so back up to where we were before emitting any ending bytes */
6398 mb_convert_buf_reset(&tmpbuf, tmppos2);
6399 tmpbuf.state = tmpstate2;
6400 }
6401 } else {
6402 /* Converting 'n' more wchars on this line would be too much.
6403 * Back up to where we were before we tried that. */
6404 mb_convert_buf_reset(&tmpbuf, tmppos);
6405 tmpbuf.state = tmpstate;
6406
6407 if (n == 1) {
6408 /* We have found the exact number of chars which will fit on the
6409 * current line. Finish up and move to a new line. */
6410 outcode->from_wchar(NULL, 0, &tmpbuf, true);
6411 transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6412 tmpbuf.state = 0;
6413
6414 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 3 + linefeed_len);
6415 buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6416
6417 indent = 0; /* Indent argument must only affect the first line */
6418
6419 if (in_len || p < e) {
6420 /* We still have more input to process */
6421 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6422 buf.out = mb_convert_buf_add(buf.out, ' ');
6423 line_start = mb_convert_buf_len(&buf);
6424 offset = e - p;
6425 if (in_len && (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset >= MBSTRING_MIN_WCHAR_BUFSIZE)) {
6426 /* Copy any remaining wchars to beginning of buffer and refill
6427 * the rest of the buffer */
6428 memmove(wchar_buf, p, offset * sizeof(uint32_t));
6429 goto refill_wchar_buf;
6430 }
6431 goto start_new_line;
6432 } else {
6433 /* We are done! */
6434 mb_convert_buf_free(&tmpbuf);
6435 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6436 }
6437 } else {
6438 /* Try a smaller number of wchars */
6439 n = MAX(n >> 1, 1);
6440 }
6441 }
6442 }
6443 }
6444 }
6445
PHP_FUNCTION(mb_encode_mimeheader)6446 PHP_FUNCTION(mb_encode_mimeheader)
6447 {
6448 const mbfl_encoding *charset = &mbfl_encoding_pass;
6449 zend_string *str, *charset_name = NULL, *transenc_name = NULL;
6450 char *linefeed = "\r\n";
6451 size_t linefeed_len = 2;
6452 zend_long indent = 0;
6453 bool base64 = true;
6454
6455 ZEND_PARSE_PARAMETERS_START(1, 5)
6456 Z_PARAM_STR(str)
6457 Z_PARAM_OPTIONAL
6458 Z_PARAM_STR(charset_name)
6459 Z_PARAM_STR(transenc_name)
6460 Z_PARAM_STRING(linefeed, linefeed_len)
6461 Z_PARAM_LONG(indent)
6462 ZEND_PARSE_PARAMETERS_END();
6463
6464 if (charset_name != NULL) {
6465 charset = php_mb_get_encoding(charset_name, 2);
6466 if (!charset) {
6467 RETURN_THROWS();
6468 } else if (charset->mime_name == NULL || charset->mime_name[0] == '\0' || charset == &mbfl_encoding_qprint) {
6469 zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
6470 RETURN_THROWS();
6471 }
6472 } else {
6473 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
6474 if (lang != NULL) {
6475 charset = mbfl_no2encoding(lang->mail_charset);
6476 const mbfl_encoding *transenc = mbfl_no2encoding(lang->mail_header_encoding);
6477 char t = transenc->name[0];
6478 if (t == 'Q' || t == 'q') {
6479 base64 = false;
6480 }
6481 }
6482 }
6483
6484 if (transenc_name != NULL && ZSTR_LEN(transenc_name) > 0) {
6485 char t = ZSTR_VAL(transenc_name)[0];
6486 if (t == 'Q' || t == 'q') {
6487 base64 = false;
6488 }
6489 }
6490
6491 RETURN_STR(mb_mime_header_encode(str, MBSTRG(current_internal_encoding), charset, base64, linefeed, linefeed_len, indent));
6492 }
6493
/* Translate one character of the standard Base64 alphabet into its 6-bit
 * value (0-63). Returns -1 for any byte which is not part of the alphabet
 * (this includes the '=' padding character). */
static int8_t decode_base64(unsigned char c)
{
	/* The two punctuation symbols sit at the very end of the alphabet */
	switch (c) {
	case '+':
		return 62;
	case '/':
		return 63;
	default:
		break;
	}

	/* The remaining symbols form three contiguous runs: A-Z, a-z, 0-9.
	 * Subtracting the first symbol of a run as an unsigned value makes any
	 * character below that run wrap to a large number, so one comparison
	 * checks both bounds. */
	unsigned int upper = (unsigned int)c - 'A';
	if (upper < 26) {
		return (int8_t)upper;
	}

	unsigned int lower = (unsigned int)c - 'a';
	if (lower < 26) {
		return (int8_t)(lower + 26);
	}

	unsigned int digit = (unsigned int)c - '0';
	if (digit < 10) {
		return (int8_t)(digit + 52);
	}

	return -1;
}
6509
/* Lookup table for decoding the two hexadecimal digits which follow '=' in
 * Quoted-Printable text. Indexed by byte value; each entry is the numeric
 * value (0-15) of that byte when it is a hex digit ('0'-'9', 'A'-'F' or
 * 'a'-'f'), or -1 when it is not. The table is never written to, hence const. */
static const int8_t qprint_map[] = {
	/* 0x00 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x30 */  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
	/* 0x40 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x50 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x60 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x70 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x80 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0x90 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xA0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xB0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xC0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xD0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xE0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xF0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
6528
6529 /* Decode MIME encoded word as defined in RFC 2047 */
mime_header_decode_encoded_word(unsigned char * p,unsigned char * e,const mbfl_encoding * outcode,mb_convert_buf * outbuf,unsigned int * state)6530 static unsigned char* mime_header_decode_encoded_word(unsigned char *p, unsigned char *e, const mbfl_encoding *outcode, mb_convert_buf *outbuf, unsigned int *state)
6531 {
6532 if ((e - p) < 6) {
6533 return NULL;
6534 }
6535
6536 ZEND_ASSERT(p[0] == '=');
6537 ZEND_ASSERT(p[1] == '?');
6538 p += 2;
6539
6540 unsigned char *charset = p;
6541 unsigned char *charset_end = memchr(charset, '?', e - charset);
6542 if (charset_end == NULL) {
6543 return NULL;
6544 }
6545
6546 unsigned char *encoding = charset_end + 1;
6547 p = encoding + 1;
6548 if (p >= e || *p++ != '?') {
6549 return NULL;
6550 }
6551
6552 char *charset_name = estrndup((const char*)charset, charset_end - charset);
6553 const mbfl_encoding *incode = mbfl_name2encoding(charset_name);
6554 efree(charset_name);
6555 if (incode == NULL) {
6556 return NULL;
6557 }
6558
6559 unsigned char *end_marker = (unsigned char*)zend_memnstr((const char*)p, "?=", 2, (const char*)e);
6560 if (end_marker) {
6561 e = end_marker;
6562 } else if (p < e && *(e-1) == '?') {
6563 /* If encoded word is not properly terminated, but last byte is '?',
6564 * take that as a terminator (legacy behavior) */
6565 e--;
6566 }
6567
6568 unsigned char *buf = emalloc(e - p), *bufp = buf;
6569 if (*encoding == 'Q' || *encoding == 'q') {
6570 /* Fill `buf` with bytes from decoding QPrint */
6571 while (p < e) {
6572 unsigned char c = *p++;
6573 if (c == '_') {
6574 *bufp++ = ' ';
6575 continue;
6576 } else if (c == '=' && (e - p) >= 2) {
6577 unsigned char c2 = *p++;
6578 unsigned char c3 = *p++;
6579 if (qprint_map[c2] >= 0 && qprint_map[c3] >= 0) {
6580 *bufp++ = (qprint_map[c2] << 4) | (qprint_map[c3] & 0xF);
6581 continue;
6582 } else if (c2 == '\r') {
6583 if (c3 != '\n') {
6584 p--;
6585 }
6586 continue;
6587 } else if (c2 == '\n') {
6588 p--;
6589 continue;
6590 }
6591 }
6592 *bufp++ = c;
6593 }
6594 } else if (*encoding == 'B' || *encoding == 'b') {
6595 /* Fill `buf` with bytes from decoding Base64 */
6596 unsigned int bits = 0, cache = 0;
6597 while (p < e) {
6598 unsigned char c = *p++;
6599 if (c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '=') {
6600 continue;
6601 }
6602 int8_t decoded = decode_base64(c);
6603 if (decoded == -1) {
6604 *bufp++ = '?';
6605 continue;
6606 }
6607 bits += 6;
6608 cache = (cache << 6) | (decoded & 0x3F);
6609 if (bits == 24) {
6610 *bufp++ = (cache >> 16) & 0xFF;
6611 *bufp++ = (cache >> 8) & 0xFF;
6612 *bufp++ = cache & 0xFF;
6613 bits = cache = 0;
6614 }
6615 }
6616 if (bits == 18) {
6617 *bufp++ = (cache >> 10) & 0xFF;
6618 *bufp++ = (cache >> 2) & 0xFF;
6619 } else if (bits == 12) {
6620 *bufp++ = (cache >> 4) & 0xFF;
6621 }
6622 } else {
6623 efree(buf);
6624 return NULL;
6625 }
6626
6627 size_t in_len = bufp - buf;
6628 uint32_t wchar_buf[128];
6629
6630 bufp = buf;
6631 while (in_len) {
6632 size_t out_len = incode->to_wchar(&bufp, &in_len, wchar_buf, 128, state);
6633 ZEND_ASSERT(out_len <= 128);
6634 outcode->from_wchar(wchar_buf, out_len, outbuf, false);
6635 }
6636
6637 efree(buf);
6638 return e + 2;
6639 }
6640
/* Decode a MIME header value: every valid encoded word ("=?charset?B|Q?text?=")
 * in `input` is decoded and converted to `outcode`; all other text is treated
 * as ASCII and converted to `outcode`.
 *
 * Runs of whitespace which begin with CR or LF are collapsed:
 *  - after plain text, such a run becomes a single space, unless it is at the
 *    very end of the input, in which case it is dropped
 *  - after an encoded word, such a run becomes a single space only if more
 *    input follows that is not itself decoded as an encoded word
 *
 * Returns a new string built from the conversion buffer by
 * mb_convert_buf_result. */
static zend_string* mb_mime_header_decode(zend_string *input, const mbfl_encoding *outcode)
{
	unsigned char *p = (unsigned char*)ZSTR_VAL(input), *e = p + ZSTR_LEN(input);
	unsigned int state = 0;
	bool space_pending = false;

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, ZSTR_LEN(input), '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	while (p < e) {
		unsigned char c = *p;

		/* Check the remaining length first, so that p[1] is only examined when
		 * it is known to lie within the input */
		if ((e - p) >= 6 && c == '=' && p[1] == '?') {
			/* Does this look like a MIME encoded word? If so, try to decode it as one */
			unsigned char *incode_end = memchr(p + 2, '?', e - p - 2);
			if (incode_end && (e - incode_end) >= 3) {
				unsigned char *temp = mime_header_decode_encoded_word(p, e, outcode, &buf, &state);
				if (temp) {
					p = temp;
					/* Decoding of MIME encoded word was successful;
					 * Try to collapse a run of whitespace */
					if (p < e && (*p == '\n' || *p == '\r')) {
						do {
							p++;
						} while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
						/* We will only actually output a space if this is not immediately followed
						 * by another valid encoded word */
						space_pending = true;
					}
					continue;
				}
			}
		}

		/* What follows is not an encoded word, so the space which was held
		 * back after the previous encoded word is emitted now */
		if (space_pending) {
			uint32_t space = ' ';
			outcode->from_wchar(&space, 1, &buf, false);
			space_pending = false;
		}

		/* Consume a run of plain ASCII characters */
		if (c != '\n' && c != '\r') {
			/* The run always includes the current byte (even if it is '=') and
			 * stops before the next '=', CR or LF */
			unsigned char *end = p + 1;
			while (end < e && (*end != '=' && *end != '\n' && *end != '\r')) {
				end++;
			}
			uint32_t wchar_buf[128];
			size_t in_len = end - p;
			while (in_len) {
				/* to_wchar is handed &p and &in_len and is relied on to update both */
				size_t out_len = mbfl_encoding_ascii.to_wchar(&p, &in_len, wchar_buf, 128, &state);
				ZEND_ASSERT(out_len <= 128);
				outcode->from_wchar(wchar_buf, out_len, &buf, false);
			}
		}
		/* Collapse a run of whitespace into a single space */
		if (p < e && (*p == '\n' || *p == '\r')) {
			do {
				p++;
			} while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
			if (p < e) {
				/* Emulating legacy behavior of mb_decode_mimeheader here;
				 * a run of whitespace is not converted to a space at the very
				 * end of the input string */
				uint32_t space = ' ';
				outcode->from_wchar(&space, 1, &buf, false);
			}
		}
	}

	/* Signal end of input to the output encoding */
	outcode->from_wchar(NULL, 0, &buf, true);

	return mb_convert_buf_result(&buf, outcode);
}
6714
PHP_FUNCTION(mb_decode_mimeheader)6715 PHP_FUNCTION(mb_decode_mimeheader)
6716 {
6717 zend_string *str;
6718
6719 ZEND_PARSE_PARAMETERS_START(1, 1)
6720 Z_PARAM_STR(str)
6721 ZEND_PARSE_PARAMETERS_END();
6722
6723 RETURN_STR(mb_mime_header_decode(str, MBSTRG(current_internal_encoding)));
6724 }
6725