1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
14 | Rui Hirokawa <hirokawa@php.net> |
15 | Hironori Sato <satoh@jpnnet.com> |
16 | Shigeru Kanemoto <sgk@happysize.co.jp> |
17 +----------------------------------------------------------------------+
18 */
19
20 /* {{{ includes */
21 #include <limits.h>
22
23 #include "libmbfl/config.h"
24 #include "php.h"
25 #include "php_ini.h"
26 #include "php_variables.h"
27 #include "mbstring.h"
28 #include "ext/standard/php_string.h"
29 #include "ext/standard/php_mail.h"
30 #include "ext/standard/exec.h"
31 #include "ext/standard/url.h"
32 #include "main/php_output.h"
33 #include "ext/standard/info.h"
34 #include "ext/pcre/php_pcre.h"
35
36 #include "libmbfl/mbfl/mbfilter_8bit.h"
37 #include "libmbfl/mbfl/mbfilter_pass.h"
38 #include "libmbfl/mbfl/mbfilter_wchar.h"
39 #include "libmbfl/mbfl/eaw_table.h"
40 #include "libmbfl/filters/mbfilter_base64.h"
41 #include "libmbfl/filters/mbfilter_cjk.h"
42 #include "libmbfl/filters/mbfilter_qprint.h"
43 #include "libmbfl/filters/mbfilter_htmlent.h"
44 #include "libmbfl/filters/mbfilter_uuencode.h"
45 #include "libmbfl/filters/mbfilter_ucs4.h"
46 #include "libmbfl/filters/mbfilter_utf8.h"
47 #include "libmbfl/filters/mbfilter_utf16.h"
48 #include "libmbfl/filters/mbfilter_singlebyte.h"
49 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
50 #include "libmbfl/filters/unicode_prop.h"
51
52 #include "php_variables.h"
53 #include "php_globals.h"
54 #include "rfc1867.h"
55 #include "php_content_types.h"
56 #include "SAPI.h"
57 #include "php_unicode.h"
58 #include "TSRM.h"
59
60 #include "mb_gpc.h"
61
62 #ifdef HAVE_MBREGEX
63 # include "php_mbregex.h"
64 #endif
65
66 #include "zend_smart_str.h"
67 #include "zend_multibyte.h"
68 #include "mbstring_arginfo.h"
69
70 #include "rare_cp_bitvec.h"
71
72 /* }}} */
73
74 /* {{{ prototypes */
75 ZEND_DECLARE_MODULE_GLOBALS(mbstring)
76
77 static PHP_GINIT_FUNCTION(mbstring);
78 static PHP_GSHUTDOWN_FUNCTION(mbstring);
79
80 static void php_mb_populate_current_detect_order_list(void);
81
82 static int php_mb_encoding_translation(void);
83
84 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);
85
86 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
87
88 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
89
90 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
91
92 static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);
93
94 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);
95
96 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);
97
98 /* See mbfilter_cp5022x.c */
99 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
100 /* }}} */
101
102 /* {{{ php_mb_default_identify_list */
103 typedef struct _php_mb_nls_ident_list {
104 enum mbfl_no_language lang;
105 const enum mbfl_no_encoding *list;
106 size_t list_size;
107 } php_mb_nls_ident_list;
108
109 static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
110 mbfl_no_encoding_ascii,
111 mbfl_no_encoding_jis,
112 mbfl_no_encoding_utf8,
113 mbfl_no_encoding_euc_jp,
114 mbfl_no_encoding_sjis
115 };
116
117 static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
118 mbfl_no_encoding_ascii,
119 mbfl_no_encoding_utf8,
120 mbfl_no_encoding_euc_cn,
121 mbfl_no_encoding_cp936
122 };
123
124 static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
125 mbfl_no_encoding_ascii,
126 mbfl_no_encoding_utf8,
127 mbfl_no_encoding_euc_tw,
128 mbfl_no_encoding_big5
129 };
130
131 static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
132 mbfl_no_encoding_ascii,
133 mbfl_no_encoding_utf8,
134 mbfl_no_encoding_euc_kr,
135 mbfl_no_encoding_uhc
136 };
137
138 static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
139 mbfl_no_encoding_ascii,
140 mbfl_no_encoding_utf8,
141 mbfl_no_encoding_koi8r,
142 mbfl_no_encoding_cp1251,
143 mbfl_no_encoding_cp866
144 };
145
146 static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
147 mbfl_no_encoding_ascii,
148 mbfl_no_encoding_utf8,
149 mbfl_no_encoding_armscii8
150 };
151
152 static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
153 mbfl_no_encoding_ascii,
154 mbfl_no_encoding_utf8,
155 mbfl_no_encoding_cp1254,
156 mbfl_no_encoding_8859_9
157 };
158
159 static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
160 mbfl_no_encoding_ascii,
161 mbfl_no_encoding_utf8,
162 mbfl_no_encoding_koi8u
163 };
164
165 static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
166 mbfl_no_encoding_ascii,
167 mbfl_no_encoding_utf8
168 };
169
170
171 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
172 { mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
173 { mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
174 { mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
175 { mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
176 { mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
177 { mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
178 { mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
179 { mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
180 { mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
181 };
182
183 /* }}} */
184
185 /* {{{ mbstring_deps[] */
186 static const zend_module_dep mbstring_deps[] = {
187 ZEND_MOD_REQUIRED("pcre")
188 ZEND_MOD_END
189 };
190 /* }}} */
191
192 /* {{{ zend_module_entry mbstring_module_entry */
193 zend_module_entry mbstring_module_entry = {
194 STANDARD_MODULE_HEADER_EX,
195 NULL,
196 mbstring_deps,
197 "mbstring",
198 ext_functions,
199 PHP_MINIT(mbstring),
200 PHP_MSHUTDOWN(mbstring),
201 PHP_RINIT(mbstring),
202 PHP_RSHUTDOWN(mbstring),
203 PHP_MINFO(mbstring),
204 PHP_MBSTRING_VERSION,
205 PHP_MODULE_GLOBALS(mbstring),
206 PHP_GINIT(mbstring),
207 PHP_GSHUTDOWN(mbstring),
208 NULL,
209 STANDARD_MODULE_PROPERTIES_EX
210 };
211 /* }}} */
212
213 /* {{{ static sapi_post_entry php_post_entries[] */
214 static const sapi_post_entry php_post_entries[] = {
215 { DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_std_post_handler },
216 { MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
217 { NULL, 0, NULL, NULL }
218 };
219 /* }}} */
220
/* Emitted only when COMPILE_DL_MBSTRING is defined. */
#if defined(COMPILE_DL_MBSTRING)
# if defined(ZTS)
ZEND_TSRMLS_CACHE_DEFINE()
# endif
ZEND_GET_MODULE(mbstring)
#endif
227
228 /* {{{ static sapi_post_entry mbstr_post_entries[] */
229 static const sapi_post_entry mbstr_post_entries[] = {
230 { DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
231 { MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
232 { NULL, 0, NULL, NULL }
233 };
234 /* }}} */
235
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)236 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
237 if (encoding_name) {
238 const mbfl_encoding *encoding;
239 zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
240 if (last_encoding_name && (last_encoding_name == encoding_name
241 || zend_string_equals_ci(encoding_name, last_encoding_name))) {
242 return MBSTRG(last_used_encoding);
243 }
244
245 encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
246 if (!encoding) {
247 zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
248 return NULL;
249 } else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
250 if (encoding == &mbfl_encoding_base64) {
251 php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
252 } else if (encoding == &mbfl_encoding_qprint) {
253 php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
254 } else if (encoding == &mbfl_encoding_html_ent) {
255 php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
256 } else if (encoding == &mbfl_encoding_uuencode) {
257 php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
258 }
259 }
260
261 if (last_encoding_name) {
262 zend_string_release(last_encoding_name);
263 }
264 MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
265 MBSTRG(last_used_encoding) = encoding;
266 return encoding;
267 } else {
268 return MBSTRG(current_internal_encoding);
269 }
270 }
271
php_mb_get_encoding_or_pass(const char * encoding_name)272 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name) {
273 if (strcmp(encoding_name, "pass") == 0) {
274 return &mbfl_encoding_pass;
275 }
276
277 return mbfl_name2encoding(encoding_name);
278 }
279
/* Count the ',' bytes in the half-open range [p, end). */
static size_t count_commas(const char *p, const char *end) {
	size_t total = 0;
	const char *hit = memchr(p, ',', end - p);

	while (hit != NULL) {
		total++;
		hit++; /* resume the search just past this comma */
		hit = memchr(hit, ',', end - hit);
	}
	return total;
}
288
289 /* {{{ static zend_result php_mb_parse_encoding_list()
290 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
291 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
292 */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num)293 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
294 const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num)
295 {
296 if (value == NULL || value_length == 0) {
297 *return_list = NULL;
298 *return_size = 0;
299 return SUCCESS;
300 } else {
301 bool included_auto;
302 size_t n, size;
303 char *p1, *endp, *tmpstr;
304 const mbfl_encoding **entry, **list;
305
306 /* copy the value string for work */
307 if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
308 tmpstr = (char *)estrndup(value+1, value_length-2);
309 value_length -= 2;
310 } else {
311 tmpstr = (char *)estrndup(value, value_length);
312 }
313
314 endp = tmpstr + value_length;
315 size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
316 list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
317 entry = list;
318 n = 0;
319 included_auto = 0;
320 p1 = tmpstr;
321 while (1) {
322 char *comma = memchr(p1, ',', endp - p1);
323 char *p = comma ? comma : endp;
324 *p = '\0';
325 /* trim spaces */
326 while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
327 p1++;
328 }
329 p--;
330 while (p > p1 && (*p == ' ' || *p == '\t')) {
331 *p = '\0';
332 p--;
333 }
334 /* convert to the encoding number and check encoding */
335 if (strcasecmp(p1, "auto") == 0) {
336 if (!included_auto) {
337 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
338 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
339 size_t i;
340 included_auto = 1;
341 for (i = 0; i < identify_list_size; i++) {
342 *entry++ = mbfl_no2encoding(*src++);
343 n++;
344 }
345 }
346 } else {
347 const mbfl_encoding *encoding = mbfl_name2encoding(p1);
348 if (!encoding) {
349 /* Called from an INI setting modification */
350 if (arg_num == 0) {
351 php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%s\"", p1);
352 } else {
353 zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", p1);
354 }
355 efree(tmpstr);
356 pefree(ZEND_VOIDP(list), persistent);
357 return FAILURE;
358 }
359
360 *entry++ = encoding;
361 n++;
362 }
363 if (n >= size || comma == NULL) {
364 break;
365 }
366 p1 = comma + 1;
367 }
368 *return_list = list;
369 *return_size = n;
370 efree(tmpstr);
371 }
372
373 return SUCCESS;
374 }
375 /* }}} */
376
377 /* {{{ static int php_mb_parse_encoding_array()
378 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
379 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
380 */
php_mb_parse_encoding_array(HashTable * target_hash,const mbfl_encoding *** return_list,size_t * return_size,uint32_t arg_num)381 static int php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
382 size_t *return_size, uint32_t arg_num)
383 {
384 /* Allocate enough space to include the default detect order if "auto" is used. */
385 size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
386 const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
387 const mbfl_encoding **entry = list;
388 bool included_auto = 0;
389 size_t n = 0;
390 zval *hash_entry;
391 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
392 zend_string *encoding_str = zval_try_get_string(hash_entry);
393 if (UNEXPECTED(!encoding_str)) {
394 efree(ZEND_VOIDP(list));
395 return FAILURE;
396 }
397
398 if (zend_string_equals_literal_ci(encoding_str, "auto")) {
399 if (!included_auto) {
400 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
401 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
402 size_t j;
403
404 included_auto = 1;
405 for (j = 0; j < identify_list_size; j++) {
406 *entry++ = mbfl_no2encoding(*src++);
407 n++;
408 }
409 }
410 } else {
411 const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
412 if (encoding) {
413 *entry++ = encoding;
414 n++;
415 } else {
416 zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
417 zend_string_release(encoding_str);
418 efree(ZEND_VOIDP(list));
419 return FAILURE;
420 }
421 }
422 zend_string_release(encoding_str);
423 } ZEND_HASH_FOREACH_END();
424 *return_list = list;
425 *return_size = n;
426 return SUCCESS;
427 }
428 /* }}} */
429
430 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)431 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
432 {
433 return (const zend_encoding*)mbfl_name2encoding(encoding_name);
434 }
435
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)436 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
437 {
438 return ((const mbfl_encoding *)encoding)->name;
439 }
440
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)441 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
442 {
443 const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
444 return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
445 }
446
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)447 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
448 {
449 if (!list) {
450 list = (const zend_encoding**)MBSTRG(current_detect_order_list);
451 list_size = MBSTRG(current_detect_order_list_size);
452 }
453 if (list_size == 1 && ((mbfl_encoding*)*list) == &mbfl_encoding_pass) {
454 /* Emulate behavior of previous implementation; it would never return "pass"
455 * from an encoding auto-detection operation */
456 return NULL;
457 }
458 return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding**)list, list_size, false, false);
459 }
460
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)461 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
462 {
463 unsigned int num_errors = 0;
464 zend_string *result = mb_fast_convert((unsigned char*)from, from_length, (const mbfl_encoding*)encoding_from, (const mbfl_encoding*)encoding_to, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
465
466 *to_length = ZSTR_LEN(result);
467 *to = emalloc(ZSTR_LEN(result) + 1); /* Include terminating null byte */
468 memcpy(*to, ZSTR_VAL(result), ZSTR_LEN(result) + 1);
469 zend_string_free(result);
470
471 return from_length;
472 }
473
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)474 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
475 {
476 return php_mb_parse_encoding_list(
477 encoding_list, encoding_list_len,
478 (const mbfl_encoding ***)return_list, return_size,
479 persistent, /* arg_num */ 0);
480 }
481
php_mb_zend_internal_encoding_getter(void)482 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
483 {
484 return (const zend_encoding *)MBSTRG(internal_encoding);
485 }
486
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)487 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
488 {
489 MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
490 return SUCCESS;
491 }
492
493 static zend_multibyte_functions php_mb_zend_multibyte_functions = {
494 "mbstring",
495 php_mb_zend_encoding_fetcher,
496 php_mb_zend_encoding_name_getter,
497 php_mb_zend_encoding_lexer_compatibility_checker,
498 php_mb_zend_encoding_detector,
499 php_mb_zend_encoding_converter,
500 php_mb_zend_encoding_list_parser,
501 php_mb_zend_internal_encoding_getter,
502 php_mb_zend_internal_encoding_setter
503 };
504 /* }}} */
505
506 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)507 static void *_php_mb_compile_regex(const char *pattern)
508 {
509 pcre2_code *retval;
510 PCRE2_SIZE err_offset;
511 int errnum;
512
513 if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
514 PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
515 PCRE2_UCHAR err_str[128];
516 pcre2_get_error_message(errnum, err_str, sizeof(err_str));
517 php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
518 }
519 return retval;
520 }
521 /* }}} */
522
523 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)524 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
525 {
526 int res;
527
528 pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
529 if (NULL == match_data) {
530 pcre2_code_free(opaque);
531 php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
532 return FAILURE;
533 }
534 res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
535 php_pcre_free_match_data(match_data);
536
537 return res;
538 }
539 /* }}} */
540
541 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)542 static void _php_mb_free_regex(void *opaque)
543 {
544 pcre2_code_free(opaque);
545 }
546 /* }}} */
547
548 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)549 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
550 {
551 size_t i;
552
553 *plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
554 *plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
555
556 for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
557 if (php_mb_default_identify_list[i].lang == lang) {
558 *plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
559 *plist_size = php_mb_default_identify_list[i].list_size;
560 return 1;
561 }
562 }
563 return 0;
564 }
565 /* }}} */
566
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)567 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
568 {
569 char *result = emalloc(len + 2);
570 char *resp = result;
571 size_t i;
572
573 for (i = 0; i < len && start[i] != quote; ++i) {
574 if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
575 *resp++ = start[++i];
576 } else {
577 size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
578
579 while (j-- > 0 && i < len) {
580 *resp++ = start[i++];
581 }
582 --i;
583 }
584 }
585
586 *resp = '\0';
587 return result;
588 }
589
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)590 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
591 {
592 char *pos = *line, quote;
593 char *res;
594
595 while (*pos && *pos != stop) {
596 if ((quote = *pos) == '"' || quote == '\'') {
597 ++pos;
598 while (*pos && *pos != quote) {
599 if (*pos == '\\' && pos[1] && pos[1] == quote) {
600 pos += 2;
601 } else {
602 ++pos;
603 }
604 }
605 if (*pos) {
606 ++pos;
607 }
608 } else {
609 pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
610
611 }
612 }
613 if (*pos == '\0') {
614 res = estrdup(*line);
615 *line += strlen(*line);
616 return res;
617 }
618
619 res = estrndup(*line, pos - *line);
620
621 while (*pos == stop) {
622 pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
623 }
624
625 *line = pos;
626 return res;
627 }
628 /* }}} */
629
/* Extract one value from `str` after skipping leading whitespace.
 * A value starting with ' or " extends to the matching unescaped quote;
 * otherwise it extends to the next whitespace character. Returns a newly
 * allocated string; an empty string when only whitespace remains. */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	while (*str && isspace((unsigned char)*str)) {
		++str;
	}

	if (*str == '\0') {
		return estrdup("");
	}

	const char first = *str;
	if (first == '"' || first == '\'') {
		char *body = str + 1;
		return php_mb_rfc1867_substring_conf(encoding, body, strlen(body), first);
	}

	size_t word_len = 0;
	while (str[word_len] && !isspace((unsigned char)str[word_len])) {
		++word_len;
	}
	return php_mb_rfc1867_substring_conf(encoding, str, word_len, 0);
}
654 /* }}} */
655
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)656 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
657 {
658 char *s, *s2;
659 const size_t filename_len = strlen(filename);
660
661 /* The \ check should technically be needed for win32 systems only where
662 * it is a valid path separator. However, IE in all it's wisdom always sends
663 * the full path of the file on the user's filesystem, which means that unless
664 * the user does basename() they get a bogus file name. Until IE's user base drops
665 * to nill or problem is fixed this code must remain enabled for all systems. */
666 s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
667 s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
668
669 if (s && s2) {
670 if (s > s2) {
671 return ++s;
672 } else {
673 return ++s2;
674 }
675 } else if (s) {
676 return ++s;
677 } else if (s2) {
678 return ++s2;
679 } else {
680 return filename;
681 }
682 }
683 /* }}} */
684
685 /* {{{ php.ini directive handler */
686 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)687 static PHP_INI_MH(OnUpdate_mbstring_language)
688 {
689 enum mbfl_no_language no_language;
690
691 no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
692 if (no_language == mbfl_no_language_invalid) {
693 MBSTRG(language) = mbfl_no_language_neutral;
694 return FAILURE;
695 }
696 MBSTRG(language) = no_language;
697 php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
698 return SUCCESS;
699 }
700 /* }}} */
701
702 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)703 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
704 {
705 const mbfl_encoding **list;
706 size_t size;
707
708 if (!new_value) {
709 if (MBSTRG(detect_order_list)) {
710 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
711 }
712 MBSTRG(detect_order_list) = NULL;
713 MBSTRG(detect_order_list_size) = 0;
714 return SUCCESS;
715 }
716
717 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
718 return FAILURE;
719 }
720
721 if (MBSTRG(detect_order_list)) {
722 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
723 }
724 MBSTRG(detect_order_list) = list;
725 MBSTRG(detect_order_list_size) = size;
726 return SUCCESS;
727 }
728 /* }}} */
729
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)730 static int _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
731 const mbfl_encoding **list;
732 size_t size;
733 if (new_value_length == 4 && strncmp(new_value, "pass", 4) == 0) {
734 list = (const mbfl_encoding**)pecalloc(1, sizeof(mbfl_encoding*), 1);
735 *list = &mbfl_encoding_pass;
736 size = 1;
737 } else if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
738 return FAILURE;
739 }
740 if (MBSTRG(http_input_list)) {
741 pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
742 }
743 MBSTRG(http_input_list) = list;
744 MBSTRG(http_input_list_size) = size;
745 return SUCCESS;
746 }
747
748 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)749 static PHP_INI_MH(OnUpdate_mbstring_http_input)
750 {
751 if (new_value) {
752 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
753 }
754
755 if (!new_value || !ZSTR_LEN(new_value)) {
756 const char *encoding = php_get_input_encoding();
757 MBSTRG(http_input_set) = 0;
758 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
759 return SUCCESS;
760 }
761
762 MBSTRG(http_input_set) = 1;
763 return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
764 }
765 /* }}} */
766
_php_mb_ini_mbstring_http_output_set(const char * new_value)767 static int _php_mb_ini_mbstring_http_output_set(const char *new_value) {
768 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value);
769 if (!encoding) {
770 return FAILURE;
771 }
772
773 MBSTRG(http_output_encoding) = encoding;
774 MBSTRG(current_http_output_encoding) = encoding;
775 return SUCCESS;
776 }
777
778 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)779 static PHP_INI_MH(OnUpdate_mbstring_http_output)
780 {
781 if (new_value) {
782 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
783 }
784
785 if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
786 MBSTRG(http_output_set) = 0;
787 _php_mb_ini_mbstring_http_output_set(php_get_output_encoding());
788 return SUCCESS;
789 }
790
791 MBSTRG(http_output_set) = 1;
792 return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value));
793 }
794 /* }}} */
795
796 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)797 static int _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
798 {
799 const mbfl_encoding *encoding;
800
801 if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
802 /* falls back to UTF-8 if an unknown encoding name is given */
803 if (new_value) {
804 php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
805 }
806 encoding = &mbfl_encoding_utf8;
807 }
808 MBSTRG(internal_encoding) = encoding;
809 MBSTRG(current_internal_encoding) = encoding;
810 #ifdef HAVE_MBREGEX
811 {
812 const char *enc_name = new_value;
813 if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
814 /* falls back to UTF-8 if an unknown encoding name is given */
815 enc_name = "UTF-8";
816 php_mb_regex_set_default_mbctype(enc_name);
817 }
818 php_mb_regex_set_mbctype(new_value);
819 }
820 #endif
821 return SUCCESS;
822 }
823 /* }}} */
824
825 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)826 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
827 {
828 if (new_value) {
829 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
830 }
831
832 if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
833 return FAILURE;
834 }
835
836 if (new_value && ZSTR_LEN(new_value)) {
837 MBSTRG(internal_encoding_set) = 1;
838 return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
839 } else {
840 const char *encoding = php_get_internal_encoding();
841 MBSTRG(internal_encoding_set) = 0;
842 return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
843 }
844 }
845 /* }}} */
846
847 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)848 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
849 {
850 int c;
851 char *endptr = NULL;
852
853 if (new_value != NULL) {
854 if (zend_string_equals_literal_ci(new_value, "none")) {
855 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
856 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
857 } else if (zend_string_equals_literal_ci(new_value, "long")) {
858 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
859 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
860 } else if (zend_string_equals_literal_ci(new_value, "entity")) {
861 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
862 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
863 } else {
864 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
865 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
866 if (ZSTR_LEN(new_value) > 0) {
867 c = strtol(ZSTR_VAL(new_value), &endptr, 0);
868 if (*endptr == '\0') {
869 MBSTRG(filter_illegal_substchar) = c;
870 MBSTRG(current_filter_illegal_substchar) = c;
871 }
872 }
873 }
874 } else {
875 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
876 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
877 MBSTRG(filter_illegal_substchar) = '?';
878 MBSTRG(current_filter_illegal_substchar) = '?';
879 }
880
881 return SUCCESS;
882 }
883 /* }}} */
884
885 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)886 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
887 {
888 if (new_value == NULL) {
889 return FAILURE;
890 }
891
892 OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
893
894 if (MBSTRG(encoding_translation)) {
895 sapi_unregister_post_entry(php_post_entries);
896 sapi_register_post_entries(mbstr_post_entries);
897 } else {
898 sapi_unregister_post_entry(mbstr_post_entries);
899 sapi_register_post_entries(php_post_entries);
900 }
901
902 return SUCCESS;
903 }
904 /* }}} */
905
906 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)907 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
908 {
909 zend_string *tmp;
910 void *re = NULL;
911
912 if (!new_value) {
913 new_value = entry->orig_value;
914 }
915 tmp = php_trim(new_value, NULL, 0, 3);
916
917 if (ZSTR_LEN(tmp) > 0) {
918 if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
919 zend_string_release_ex(tmp, 0);
920 return FAILURE;
921 }
922 }
923
924 if (MBSTRG(http_output_conv_mimetypes)) {
925 _php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
926 }
927
928 MBSTRG(http_output_conv_mimetypes) = re;
929
930 zend_string_release_ex(tmp, 0);
931 return SUCCESS;
932 }
933 /* }}} */
934 /* }}} */
935
/* {{{ php.ini directive registration
 * Each entry lists: directive name, default value, the stages at which it may
 * be changed, and its update handler. STD_* entries additionally name the
 * field of zend_mbstring_globals that the handler writes to. */
PHP_INI_BEGIN()
	PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
	PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
	PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
	PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
	/* The handler for this directive emits E_DEPRECATED whenever a value is given */
	STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
	PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)

	/* Not changeable at runtime: only PHP_INI_SYSTEM and PHP_INI_PERDIR are allowed */
	STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
		PHP_INI_SYSTEM | PHP_INI_PERDIR,
		OnUpdate_mbstring_encoding_translation,
		encoding_translation, zend_mbstring_globals, mbstring_globals)
	/* The value is compiled as a regular expression by its handler */
	PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
		"^(text/|application/xhtml\\+xml)",
		PHP_INI_ALL,
		OnUpdate_mbstring_http_output_conv_mimetypes)

	STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
		PHP_INI_ALL,
		OnUpdateBool,
		strict_detection, zend_mbstring_globals, mbstring_globals)
#ifdef HAVE_MBREGEX
	/* Limits for the mbregex engine; only registered when mbregex support is compiled in */
	STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
	STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
#endif
PHP_INI_END()
963 /* }}} */
964
965 static void mbstring_internal_encoding_changed_hook(void) {
966 /* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
967 if (!MBSTRG(internal_encoding_set)) {
968 const char *encoding = php_get_internal_encoding();
969 _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
970 }
971
972 if (!MBSTRG(http_output_set)) {
973 const char *encoding = php_get_output_encoding();
974 _php_mb_ini_mbstring_http_output_set(encoding);
975 }
976
977 if (!MBSTRG(http_input_set)) {
978 const char *encoding = php_get_input_encoding();
979 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
980 }
981 }
982
983 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)984 static PHP_GINIT_FUNCTION(mbstring)
985 {
986 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
987 ZEND_TSRMLS_CACHE_UPDATE();
988 #endif
989
990 mbstring_globals->language = mbfl_no_language_uni;
991 mbstring_globals->internal_encoding = NULL;
992 mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
993 mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
994 mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
995 mbstring_globals->http_input_identify = NULL;
996 mbstring_globals->http_input_identify_get = NULL;
997 mbstring_globals->http_input_identify_post = NULL;
998 mbstring_globals->http_input_identify_cookie = NULL;
999 mbstring_globals->http_input_identify_string = NULL;
1000 mbstring_globals->http_input_list = NULL;
1001 mbstring_globals->http_input_list_size = 0;
1002 mbstring_globals->detect_order_list = NULL;
1003 mbstring_globals->detect_order_list_size = 0;
1004 mbstring_globals->current_detect_order_list = NULL;
1005 mbstring_globals->current_detect_order_list_size = 0;
1006 mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1007 mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1008 mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1009 mbstring_globals->filter_illegal_substchar = '?';
1010 mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1011 mbstring_globals->current_filter_illegal_substchar = '?';
1012 mbstring_globals->illegalchars = 0;
1013 mbstring_globals->encoding_translation = 0;
1014 mbstring_globals->strict_detection = 0;
1015 mbstring_globals->outconv_enabled = false;
1016 mbstring_globals->outconv_state = 0;
1017 mbstring_globals->http_output_conv_mimetypes = NULL;
1018 #ifdef HAVE_MBREGEX
1019 mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1020 #endif
1021 mbstring_globals->last_used_encoding_name = NULL;
1022 mbstring_globals->last_used_encoding = NULL;
1023 mbstring_globals->internal_encoding_set = 0;
1024 mbstring_globals->http_output_set = 0;
1025 mbstring_globals->http_input_set = 0;
1026 mbstring_globals->all_encodings_list = NULL;
1027 }
1028 /* }}} */
1029
1030 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1031 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1032 {
1033 if (mbstring_globals->http_input_list) {
1034 free(ZEND_VOIDP(mbstring_globals->http_input_list));
1035 }
1036 if (mbstring_globals->detect_order_list) {
1037 free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1038 }
1039 if (mbstring_globals->http_output_conv_mimetypes) {
1040 _php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1041 }
1042 #ifdef HAVE_MBREGEX
1043 php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1044 #endif
1045 }
1046 /* }}} */
1047
1048 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1049 static void init_check_utf8(void);
1050 #endif
1051
/* {{{ PHP_MINIT_FUNCTION(mbstring)
 * Module startup. Registers the ini entries, installs the encoding-changed
 * hook, registers the SAPI input handlers, the constants/symbols, the engine's
 * multibyte callbacks and the RFC 1867 (file upload) callbacks.
 * Returns FAILURE only if the engine refuses the multibyte function table. */
PHP_MINIT_FUNCTION(mbstring)
{
#if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
	ZEND_TSRMLS_CACHE_UPDATE();
#endif

	REGISTER_INI_ENTRIES();

	/* We assume that we're the only user of the hook. */
	ZEND_ASSERT(php_internal_encoding_changed == NULL);
	php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
	/* Run the hook once now so that any encoding not explicitly set through an
	 * mbstring ini entry picks up the corresponding core setting. */
	mbstring_internal_encoding_changed_hook();

	/* This is a global handler. Should not be set in a per-request handler. */
	sapi_register_treat_data(mbstr_treat_data);

	/* Post handlers are stored in the thread-local context. */
	if (MBSTRG(encoding_translation)) {
		sapi_register_post_entries(mbstr_post_entries);
	}

#ifdef HAVE_MBREGEX
	PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
#endif

	register_mbstring_symbols(module_number);

	if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
		return FAILURE;
	}

	php_rfc1867_set_multibyte_callbacks(
		php_mb_encoding_translation,
		php_mb_gpc_get_detect_order,
		php_mb_gpc_set_input_encoding,
		php_mb_rfc1867_getword,
		php_mb_rfc1867_getword_conf,
		php_mb_rfc1867_basename);

#ifdef ZEND_INTRIN_AVX2_FUNC_PTR
	/* NOTE(review): presumably these select AVX2 implementations at runtime via
	 * function pointers -- confirm against their definitions. */
	init_check_utf8();
	init_convert_utf16();
#endif

	return SUCCESS;
}
1099 /* }}} */
1100
/* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring)
 * Module shutdown. Unregisters the ini entries, restores the engine's
 * multibyte function table, shuts down mbregex (when compiled in) and removes
 * the encoding-changed hook installed at module startup. Always returns SUCCESS. */
PHP_MSHUTDOWN_FUNCTION(mbstring)
{
	UNREGISTER_INI_ENTRIES();

	zend_multibyte_restore_functions();

#ifdef HAVE_MBREGEX
	PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
#endif

	/* Clear the hook so that it no longer points at this module's function */
	php_internal_encoding_changed = NULL;

	return SUCCESS;
}
1116 /* }}} */
1117
/* {{{ PHP_RINIT_FUNCTION(mbstring)
 * Request startup. Resets the per-request ("current_*") settings to the
 * configured values, clears the illegal-character counter, rebuilds the
 * per-request detect order list and tells the engine which internal encoding
 * is configured. Always returns SUCCESS. */
PHP_RINIT_FUNCTION(mbstring)
{
	/* Runtime changes made during a previous request must not leak into this one */
	MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
	MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
	MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
	MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);

	MBSTRG(illegalchars) = 0;

	php_mb_populate_current_detect_order_list();

#ifdef HAVE_MBREGEX
	PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
#endif
	zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));

	return SUCCESS;
}
1137 /* }}} */
1138
1139 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1140 PHP_RSHUTDOWN_FUNCTION(mbstring)
1141 {
1142 if (MBSTRG(current_detect_order_list) != NULL) {
1143 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1144 MBSTRG(current_detect_order_list) = NULL;
1145 MBSTRG(current_detect_order_list_size) = 0;
1146 }
1147
1148 /* clear http input identification. */
1149 MBSTRG(http_input_identify) = NULL;
1150 MBSTRG(http_input_identify_post) = NULL;
1151 MBSTRG(http_input_identify_get) = NULL;
1152 MBSTRG(http_input_identify_cookie) = NULL;
1153 MBSTRG(http_input_identify_string) = NULL;
1154
1155 if (MBSTRG(last_used_encoding_name)) {
1156 zend_string_release(MBSTRG(last_used_encoding_name));
1157 MBSTRG(last_used_encoding_name) = NULL;
1158 }
1159
1160 MBSTRG(internal_encoding_set) = 0;
1161 MBSTRG(http_output_set) = 0;
1162 MBSTRG(http_input_set) = 0;
1163
1164 MBSTRG(outconv_enabled) = false;
1165 MBSTRG(outconv_state) = 0;
1166
1167 if (MBSTRG(all_encodings_list)) {
1168 GC_DELREF(MBSTRG(all_encodings_list));
1169 zend_array_destroy(MBSTRG(all_encodings_list));
1170 MBSTRG(all_encodings_list) = NULL;
1171 }
1172
1173 #ifdef HAVE_MBREGEX
1174 PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1175 #endif
1176
1177 return SUCCESS;
1178 }
1179 /* }}} */
1180
1181 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1182 PHP_MINFO_FUNCTION(mbstring)
1183 {
1184 php_info_print_table_start();
1185 php_info_print_table_row(2, "Multibyte Support", "enabled");
1186 php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1187 php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1188 {
1189 char tmp[256];
1190 snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1191 php_info_print_table_row(2, "libmbfl version", tmp);
1192 }
1193 php_info_print_table_end();
1194
1195 php_info_print_table_start();
1196 php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1197 php_info_print_table_end();
1198
1199 #ifdef HAVE_MBREGEX
1200 PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1201 #endif
1202
1203 DISPLAY_INI_ENTRIES();
1204 }
1205 /* }}} */
1206
1207 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1208 PHP_FUNCTION(mb_language)
1209 {
1210 zend_string *name = NULL;
1211
1212 ZEND_PARSE_PARAMETERS_START(0, 1)
1213 Z_PARAM_OPTIONAL
1214 Z_PARAM_STR_OR_NULL(name)
1215 ZEND_PARSE_PARAMETERS_END();
1216
1217 if (name == NULL) {
1218 RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1219 } else {
1220 zend_string *ini_name = ZSTR_INIT_LITERAL("mbstring.language", 0);
1221 if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1222 zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1223 zend_string_release_ex(ini_name, 0);
1224 RETURN_THROWS();
1225 }
1226 // TODO Make return void
1227 RETVAL_TRUE;
1228 zend_string_release_ex(ini_name, 0);
1229 }
1230 }
1231 /* }}} */
1232
1233 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1234 PHP_FUNCTION(mb_internal_encoding)
1235 {
1236 char *name = NULL;
1237 size_t name_len;
1238 const mbfl_encoding *encoding;
1239
1240 ZEND_PARSE_PARAMETERS_START(0, 1)
1241 Z_PARAM_OPTIONAL
1242 Z_PARAM_STRING_OR_NULL(name, name_len)
1243 ZEND_PARSE_PARAMETERS_END();
1244
1245 if (name == NULL) {
1246 ZEND_ASSERT(MBSTRG(current_internal_encoding));
1247 RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1248 } else {
1249 encoding = mbfl_name2encoding(name);
1250 if (!encoding) {
1251 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1252 RETURN_THROWS();
1253 } else {
1254 MBSTRG(current_internal_encoding) = encoding;
1255 MBSTRG(internal_encoding_set) = 1;
1256 /* TODO Return old encoding */
1257 RETURN_TRUE;
1258 }
1259 }
1260 }
1261 /* }}} */
1262
1263 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1264 PHP_FUNCTION(mb_http_input)
1265 {
1266 char *type = NULL;
1267 size_t type_len = 0, n;
1268 const mbfl_encoding **entry;
1269 const mbfl_encoding *encoding;
1270
1271 ZEND_PARSE_PARAMETERS_START(0, 1)
1272 Z_PARAM_OPTIONAL
1273 Z_PARAM_STRING_OR_NULL(type, type_len)
1274 ZEND_PARSE_PARAMETERS_END();
1275
1276 if (type == NULL) {
1277 encoding = MBSTRG(http_input_identify);
1278 } else {
1279 switch (*type) {
1280 case 'G':
1281 case 'g':
1282 encoding = MBSTRG(http_input_identify_get);
1283 break;
1284 case 'P':
1285 case 'p':
1286 encoding = MBSTRG(http_input_identify_post);
1287 break;
1288 case 'C':
1289 case 'c':
1290 encoding = MBSTRG(http_input_identify_cookie);
1291 break;
1292 case 'S':
1293 case 's':
1294 encoding = MBSTRG(http_input_identify_string);
1295 break;
1296 case 'I':
1297 case 'i':
1298 entry = MBSTRG(http_input_list);
1299 n = MBSTRG(http_input_list_size);
1300 array_init(return_value);
1301 for (size_t i = 0; i < n; i++, entry++) {
1302 add_next_index_string(return_value, (*entry)->name);
1303 }
1304 return;
1305 case 'L':
1306 case 'l':
1307 entry = MBSTRG(http_input_list);
1308 n = MBSTRG(http_input_list_size);
1309 if (n == 0) {
1310 RETURN_FALSE;
1311 }
1312
1313 smart_str result = {0};
1314 for (size_t i = 0; i < n; i++, entry++) {
1315 if (i > 0) {
1316 smart_str_appendc(&result, ',');
1317 }
1318 smart_str_appends(&result, (*entry)->name);
1319 }
1320 RETURN_STR(smart_str_extract(&result));
1321 default:
1322 zend_argument_value_error(1,
1323 "must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1324 RETURN_THROWS();
1325 }
1326 }
1327
1328 if (encoding) {
1329 RETURN_STRING(encoding->name);
1330 } else {
1331 RETURN_FALSE;
1332 }
1333 }
1334 /* }}} */
1335
1336 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1337 PHP_FUNCTION(mb_http_output)
1338 {
1339 char *name = NULL;
1340 size_t name_len;
1341
1342 ZEND_PARSE_PARAMETERS_START(0, 1)
1343 Z_PARAM_OPTIONAL
1344 Z_PARAM_STRING_OR_NULL(name, name_len)
1345 ZEND_PARSE_PARAMETERS_END();
1346
1347 if (name == NULL) {
1348 ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1349 RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1350 } else {
1351 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name);
1352 if (!encoding) {
1353 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1354 RETURN_THROWS();
1355 } else {
1356 MBSTRG(http_output_set) = 1;
1357 MBSTRG(current_http_output_encoding) = encoding;
1358 /* TODO Return previous encoding? */
1359 RETURN_TRUE;
1360 }
1361 }
1362 }
1363 /* }}} */
1364
1365 /* {{{ Sets the current detect_order or Return the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1366 PHP_FUNCTION(mb_detect_order)
1367 {
1368 zend_string *order_str = NULL;
1369 HashTable *order_ht = NULL;
1370
1371 ZEND_PARSE_PARAMETERS_START(0, 1)
1372 Z_PARAM_OPTIONAL
1373 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1374 ZEND_PARSE_PARAMETERS_END();
1375
1376 if (!order_str && !order_ht) {
1377 size_t n = MBSTRG(current_detect_order_list_size);
1378 const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1379 array_init(return_value);
1380 for (size_t i = 0; i < n; i++) {
1381 add_next_index_string(return_value, (*entry)->name);
1382 entry++;
1383 }
1384 } else {
1385 const mbfl_encoding **list;
1386 size_t size;
1387 if (order_ht) {
1388 if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1389 RETURN_THROWS();
1390 }
1391 } else {
1392 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1)) {
1393 RETURN_THROWS();
1394 }
1395 }
1396
1397 if (size == 0) {
1398 efree(ZEND_VOIDP(list));
1399 zend_argument_value_error(1, "must specify at least one encoding");
1400 RETURN_THROWS();
1401 }
1402
1403 if (MBSTRG(current_detect_order_list)) {
1404 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1405 }
1406 MBSTRG(current_detect_order_list) = list;
1407 MBSTRG(current_detect_order_list_size) = size;
1408 RETURN_TRUE;
1409 }
1410 }
1411 /* }}} */
1412
/* Returns 1 if `cp` may be used as a substitute character, 0 otherwise.
 *
 * Rejected are values outside the Unicode range (negative, or 0x110000 and
 * above) and surrogates (0xD800-0xDFFF), which are never valid on their own;
 * only a single substitute character is allowed.
 *
 * Whether the codepoint can actually be represented in the target encoding of
 * a later conversion is not known at this point, so everything else has to be
 * accepted. */
static inline int php_mb_check_code_point(zend_long cp)
{
	const int in_unicode_range = (cp >= 0 && cp < 0x110000);
	const int is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);

	return (in_unicode_range && !is_surrogate) ? 1 : 0;
}
1431
1432 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1433 PHP_FUNCTION(mb_substitute_character)
1434 {
1435 zend_string *substitute_character = NULL;
1436 zend_long substitute_codepoint;
1437 bool substitute_is_null = 1;
1438
1439 ZEND_PARSE_PARAMETERS_START(0, 1)
1440 Z_PARAM_OPTIONAL
1441 Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1442 ZEND_PARSE_PARAMETERS_END();
1443
1444 if (substitute_is_null) {
1445 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1446 RETURN_STRING("none");
1447 }
1448 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1449 RETURN_STRING("long");
1450 }
1451 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1452 RETURN_STRING("entity");
1453 }
1454 RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1455 }
1456
1457 if (substitute_character != NULL) {
1458 if (zend_string_equals_literal_ci(substitute_character, "none")) {
1459 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1460 RETURN_TRUE;
1461 }
1462 if (zend_string_equals_literal_ci(substitute_character, "long")) {
1463 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1464 RETURN_TRUE;
1465 }
1466 if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1467 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1468 RETURN_TRUE;
1469 }
1470 /* Invalid string value */
1471 zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1472 RETURN_THROWS();
1473 }
1474 /* Integer codepoint passed */
1475 if (!php_mb_check_code_point(substitute_codepoint)) {
1476 zend_argument_value_error(1, "is not a valid codepoint");
1477 RETURN_THROWS();
1478 }
1479
1480 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1481 MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1482 RETURN_TRUE;
1483 }
1484 /* }}} */
1485
1486 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1487 PHP_FUNCTION(mb_preferred_mime_name)
1488 {
1489 char *name = NULL;
1490 size_t name_len;
1491
1492 ZEND_PARSE_PARAMETERS_START(1, 1)
1493 Z_PARAM_STRING(name, name_len)
1494 ZEND_PARSE_PARAMETERS_END();
1495
1496 const mbfl_encoding *enc = mbfl_name2encoding(name);
1497 if (enc == NULL) {
1498 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1499 RETURN_THROWS();
1500 }
1501
1502 const char *preferred_name = mbfl_encoding_preferred_mime_name(enc);
1503 if (preferred_name == NULL || *preferred_name == '\0') {
1504 php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1505 RETVAL_FALSE;
1506 } else {
1507 RETVAL_STRING((char *)preferred_name);
1508 }
1509 }
1510 /* }}} */
1511
1512 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1513 PHP_FUNCTION(mb_parse_str)
1514 {
1515 zval *track_vars_array = NULL;
1516 char *encstr;
1517 size_t encstr_len;
1518 php_mb_encoding_handler_info_t info;
1519 const mbfl_encoding *detected;
1520
1521 ZEND_PARSE_PARAMETERS_START(2, 2)
1522 Z_PARAM_STRING(encstr, encstr_len)
1523 Z_PARAM_ZVAL(track_vars_array)
1524 ZEND_PARSE_PARAMETERS_END();
1525
1526 track_vars_array = zend_try_array_init(track_vars_array);
1527 if (!track_vars_array) {
1528 RETURN_THROWS();
1529 }
1530
1531 encstr = estrndup(encstr, encstr_len);
1532
1533 info.data_type = PARSE_STRING;
1534 info.separator = PG(arg_separator).input;
1535 info.report_errors = true;
1536 info.to_encoding = MBSTRG(current_internal_encoding);
1537 info.from_encodings = MBSTRG(http_input_list);
1538 info.num_from_encodings = MBSTRG(http_input_list_size);
1539
1540 detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1541
1542 MBSTRG(http_input_identify) = detected;
1543
1544 RETVAL_BOOL(detected);
1545
1546 if (encstr != NULL) efree(encstr);
1547 }
1548 /* }}} */
1549
PHP_FUNCTION(mb_output_handler)1550 PHP_FUNCTION(mb_output_handler)
1551 {
1552 zend_string *str;
1553 zend_long arg_status;
1554
1555 ZEND_PARSE_PARAMETERS_START(2, 2)
1556 Z_PARAM_STR(str)
1557 Z_PARAM_LONG(arg_status)
1558 ZEND_PARSE_PARAMETERS_END();
1559
1560 const mbfl_encoding *encoding = MBSTRG(current_http_output_encoding);
1561 if (encoding == &mbfl_encoding_pass) {
1562 RETURN_STR_COPY(str);
1563 }
1564
1565 if (arg_status & PHP_OUTPUT_HANDLER_START) {
1566 bool free_mimetype = false;
1567 char *mimetype = NULL;
1568
1569 /* Analyze mime type */
1570 if (SG(sapi_headers).mimetype && _php_mb_match_regex(MBSTRG(http_output_conv_mimetypes), SG(sapi_headers).mimetype, strlen(SG(sapi_headers).mimetype))) {
1571 char *s;
1572 if ((s = strchr(SG(sapi_headers).mimetype, ';')) == NULL) {
1573 mimetype = estrdup(SG(sapi_headers).mimetype);
1574 } else {
1575 mimetype = estrndup(SG(sapi_headers).mimetype, s - SG(sapi_headers).mimetype);
1576 }
1577 free_mimetype = true;
1578 } else if (SG(sapi_headers).send_default_content_type) {
1579 mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1580 }
1581
1582 /* If content-type is not yet set, set it and enable conversion */
1583 if (SG(sapi_headers).send_default_content_type || free_mimetype) {
1584 const char *charset = encoding->mime_name;
1585 if (charset) {
1586 char *p;
1587 size_t len = spprintf(&p, 0, "Content-Type: %s; charset=%s", mimetype, charset);
1588 if (sapi_add_header(p, len, 0) != FAILURE) {
1589 SG(sapi_headers).send_default_content_type = 0;
1590 }
1591 }
1592
1593 MBSTRG(outconv_enabled) = true;
1594 }
1595
1596 if (free_mimetype) {
1597 efree(mimetype);
1598 }
1599 }
1600
1601 if (!MBSTRG(outconv_enabled)) {
1602 RETURN_STR_COPY(str);
1603 }
1604
1605 mb_convert_buf buf;
1606 mb_convert_buf_init(&buf, ZSTR_LEN(str), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1607
1608 uint32_t wchar_buf[128];
1609 unsigned char *in = (unsigned char*)ZSTR_VAL(str);
1610 size_t in_len = ZSTR_LEN(str);
1611 bool last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1612
1613 while (in_len) {
1614 size_t out_len = MBSTRG(current_internal_encoding)->to_wchar(&in, &in_len, wchar_buf, 128, &MBSTRG(outconv_state));
1615 ZEND_ASSERT(out_len <= 128);
1616 encoding->from_wchar(wchar_buf, out_len, &buf, !in_len && last_feed);
1617 }
1618
1619 MBSTRG(illegalchars) += buf.errors;
1620 RETVAL_STR(mb_convert_buf_result_raw(&buf));
1621
1622 if (last_feed) {
1623 MBSTRG(outconv_enabled) = false;
1624 MBSTRG(outconv_state) = 0;
1625 }
1626 }
1627
PHP_FUNCTION(mb_str_split)1628 PHP_FUNCTION(mb_str_split)
1629 {
1630 zend_string *str, *encoding = NULL;
1631 zend_long split_len = 1;
1632
1633 ZEND_PARSE_PARAMETERS_START(1, 3)
1634 Z_PARAM_STR(str)
1635 Z_PARAM_OPTIONAL
1636 Z_PARAM_LONG(split_len)
1637 Z_PARAM_STR_OR_NULL(encoding)
1638 ZEND_PARSE_PARAMETERS_END();
1639
1640 if (split_len <= 0) {
1641 zend_argument_value_error(2, "must be greater than 0");
1642 RETURN_THROWS();
1643 } else if (split_len > UINT_MAX / 4) {
1644 zend_argument_value_error(2, "is too large");
1645 RETURN_THROWS();
1646 }
1647
1648 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
1649 if (!enc) {
1650 RETURN_THROWS();
1651 }
1652
1653 if (ZSTR_LEN(str) == 0) {
1654 RETURN_EMPTY_ARRAY();
1655 }
1656
1657 unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
1658
1659 unsigned int char_len = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
1660 if (char_len) {
1661 unsigned int chunk_len = char_len * split_len;
1662 unsigned int chunks = ((ZSTR_LEN(str) / chunk_len) + split_len - 1) / split_len; /* round up */
1663 array_init_size(return_value, chunks);
1664 while (p < e) {
1665 add_next_index_stringl(return_value, (const char*)p, MIN(chunk_len, e - p));
1666 p += chunk_len;
1667 }
1668 } else if (enc->mblen_table) {
1669 unsigned char const *mbtab = enc->mblen_table;
1670
1671 /* Assume that we have 1-byte characters */
1672 array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1673
1674 while (p < e) {
1675 unsigned char *chunk = p; /* start of chunk */
1676
1677 for (int char_count = 0; char_count < split_len && p < e; char_count++) {
1678 p += mbtab[*p];
1679 }
1680 if (p > e) {
1681 p = e; /* ensure chunk is in bounds */
1682 }
1683 add_next_index_stringl(return_value, (const char*)chunk, p - chunk);
1684 }
1685 } else {
1686 /* Assume that we have 1-byte characters */
1687 array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1688
1689 uint32_t wchar_buf[128];
1690 size_t in_len = ZSTR_LEN(str);
1691 unsigned int state = 0, char_count = 0;
1692
1693 mb_convert_buf buf;
1694
1695 while (in_len) {
1696 size_t out_len = enc->to_wchar(&p, &in_len, wchar_buf, 128, &state);
1697 ZEND_ASSERT(out_len <= 128);
1698 size_t i = 0;
1699
1700 /* Is there some output remaining from the previous iteration? */
1701 if (char_count) {
1702 if (out_len >= split_len - char_count) {
1703 /* Finish off an incomplete chunk from previous iteration
1704 * ('buf' was already initialized; we don't need to do it again) */
1705 enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
1706 i += split_len - char_count;
1707 char_count = 0;
1708 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1709 } else {
1710 /* Output from this iteration is not enough to finish the next chunk;
1711 * output what we can, and leave 'buf' to be used again on next iteration */
1712 enc->from_wchar(wchar_buf, out_len, &buf, !in_len);
1713 char_count += out_len;
1714 continue;
1715 }
1716 }
1717
1718 while (i < out_len) {
1719 /* Prepare for the next chunk */
1720 mb_convert_buf_init(&buf, split_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1721
1722 if (out_len - i >= split_len) {
1723 enc->from_wchar(wchar_buf + i, split_len, &buf, true);
1724 i += split_len;
1725 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1726 } else {
1727 /* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
1728 * leave them for the next iteration */
1729 enc->from_wchar(wchar_buf + i, out_len - i, &buf, !in_len);
1730 char_count = out_len - i;
1731 break;
1732 }
1733 }
1734 }
1735
1736 if (char_count) {
1737 /* The main loop above has finished processing the input string, but
1738 * has left a partial chunk in 'buf' */
1739 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1740 }
1741 }
1742 }
1743
1744 #ifdef __SSE2__
/* Thanks to StackOverflow user 'Paul R' (https://stackoverflow.com/users/253056/paul-r)
 * From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector
 *
 * Horizontal sum: treats the 128-bit register `v` as sixteen unsigned 8-bit
 * integers and returns their total in an ordinary scalar. */
static inline uint32_t _mm_sum_epu8(const __m128i v)
{
	/* There is no dedicated instruction for summing the bytes of a register.
	 * _mm_sad_epu8 computes the absolute differences between corresponding
	 * bytes of its two operands and adds them up separately for the lower and
	 * the upper eight bytes, leaving two 16-bit sums, one at the bottom of each
	 * 64-bit half of the result. Against an all-zero operand, each
	 * "difference" is simply the byte value from `v`. */
	const __m128i zero = _mm_setzero_si128();
	const __m128i half_sums = _mm_sad_epu8(v, zero);

	/* Pull the two partial sums out into scalar registers: the low 32 bits hold
	 * the sum of bytes 0-7, 16-bit word 4 holds the sum of bytes 8-15. */
	const uint32_t lower_half = _mm_cvtsi128_si32(half_sums);
	const uint32_t upper_half = _mm_extract_epi16(half_sums, 4);

	return lower_half + upper_half;
}
1764 #endif
1765
1766 /* This assumes that `string` is valid UTF-8
1767 * In UTF-8, the only bytes which do not start a new codepoint are 0x80-0xBF (continuation bytes)
1768 * Interpreted as signed integers, those are all byte values less than -64
1769 * A fast way to get the length of a UTF-8 string is to start with its byte length,
1770 * then subtract off the number of continuation bytes */
static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
{
	/* `len` starts out as the byte length and is decremented once for every
	 * continuation byte found, so that it ends up as the codepoint count */
	unsigned char *end = p + len;

#ifdef __SSE2__
	if (len >= sizeof(__m128i)) {
		/* Highest address from which a full 16-byte load still stays inside the string */
		unsigned char *last_block = end - sizeof(__m128i);

		const __m128i cont_limit = _mm_set1_epi8(-64);
		const __m128i ones = _mm_set1_epi8(1);
		/* Sixteen independent 8-bit tallies of continuation bytes, one per byte lane */
		__m128i tallies = _mm_setzero_si128();
		int blocks_until_flush = 255;

		do {
			__m128i block = _mm_loadu_si128((__m128i*)p); /* Unaligned load of 16 bytes */
			__m128i is_cont = _mm_cmplt_epi8(block, cont_limit); /* All-ones in each lane holding a continuation byte */
			tallies = _mm_add_epi8(tallies, _mm_and_si128(is_cont, ones)); /* Add 1 to the tally of each such lane */

			/* Each lane gains at most 1 per block, so an 8-bit tally cannot overflow
			 * within 255 blocks; after that many, drain the tallies into `len` */
			blocks_until_flush--;
			if (blocks_until_flush == 0) {
				len -= _mm_sum_epu8(tallies);
				tallies = _mm_setzero_si128();
				blocks_until_flush = 255;
			}

			p += sizeof(__m128i);
		} while (p <= last_block);

		/* Drain whatever has accumulated since the last flush */
		len -= _mm_sum_epu8(tallies);
	}
#endif

	/* Scalar pass over whatever the vector loop did not consume
	 * (the entire string, if the vector loop did not run) */
	for (; p < end; p++) {
		signed char c = *p;
		if (c < -64) {
			len--;
		}
	}

	return len;
}
1815
/* Count the codepoints in `string`, interpreting its bytes as text in `encoding` */
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
{
	/* Non-zero only for encodings flagged as fixed-width; the masked flag value
	 * itself is used as the number of bytes per codepoint */
	unsigned int bytes_per_char = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (bytes_per_char != 0) {
		return ZSTR_LEN(string) / bytes_per_char;
	}

	/* UTF-8 strings already flagged as valid can be counted without decoding them */
	if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && (GC_FLAGS(string) & IS_STR_VALID_UTF8)) {
		return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
	}

	/* General case: decode the string 128 codepoints at a time and add up how many
	 * codepoints each call produced; the decoded values themselves are discarded */
	uint32_t wchar_buf[128];
	unsigned char *src = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t codepoints = 0;

	while (bytes_left > 0) {
		codepoints += encoding->to_wchar(&src, &bytes_left, wchar_buf, 128, &state);
	}

	return codepoints;
}
1837
1838 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1839 PHP_FUNCTION(mb_strlen)
1840 {
1841 zend_string *string, *enc_name = NULL;
1842
1843 ZEND_PARSE_PARAMETERS_START(1, 2)
1844 Z_PARAM_STR(string)
1845 Z_PARAM_OPTIONAL
1846 Z_PARAM_STR_OR_NULL(enc_name)
1847 ZEND_PARSE_PARAMETERS_END();
1848
1849 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1850 if (!enc) {
1851 RETURN_THROWS();
1852 }
1853
1854 RETVAL_LONG(mb_get_strlen(string, enc));
1855 }
1856 /* }}} */
1857
1858 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)1859 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
1860 {
1861 return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
1862 }
1863
offset_to_pointer_utf8(unsigned char * str,unsigned char * end,ssize_t offset)1864 static unsigned char* offset_to_pointer_utf8(unsigned char *str, unsigned char *end, ssize_t offset) {
1865 if (offset < 0) {
1866 unsigned char *pos = end;
1867 while (offset < 0) {
1868 if (pos <= str) {
1869 return NULL;
1870 }
1871
1872 unsigned char c = *--pos;
1873 if (c < 0x80 || (c & 0xC0) != 0x80) {
1874 offset++;
1875 }
1876 }
1877 return pos;
1878 } else {
1879 const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
1880 unsigned char *pos = str;
1881 while (offset-- > 0) {
1882 if (pos >= end) {
1883 return NULL;
1884 }
1885 pos += u8_tbl[*pos];
1886 }
1887 return pos;
1888 }
1889 }
1890
/* Inverse of offset_to_pointer_utf8 for non-negative offsets: the number of
 * codepoints in the UTF-8 text between `start` and `pos` */
static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) {
	size_t prefix_bytes = pos - start;
	return mb_fast_strlen_utf8(start, prefix_bytes);
}
1894
/* Find `needle` in `haystack` (both in encoding `enc`), starting from codepoint `offset`
 * (negative offsets count back from the end). With `reverse`, the last occurrence is
 * found rather than the first.
 *
 * The search itself is a byte-wise search on UTF-8 text; input in any other encoding is
 * converted to UTF-8 first.
 *
 * Returns the codepoint index of the match, MBFL_ERROR_OFFSET if `offset` lies outside
 * the haystack, or MBFL_ERROR_NOT_FOUND if there is no match. */
static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse)
{
	size_t result;
	zend_string *haystack_u8 = NULL, *needle_u8 = NULL;
	unsigned char *offset_pointer;

	if (!php_mb_is_no_encoding_utf8(enc->no_encoding)) {
		unsigned int num_errors = 0;
		haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
		needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
	} else {
		/* Already UTF-8: search the caller's strings directly, with no copies made */
		haystack_u8 = haystack;
		needle_u8 = needle;
	}

	offset_pointer = offset_to_pointer_utf8((unsigned char*)ZSTR_VAL(haystack_u8), (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8), offset);
	if (!offset_pointer) {
		result = MBFL_ERROR_OFFSET;
		goto out;
	}

	result = MBFL_ERROR_NOT_FOUND;
	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
		goto out;
	}

	const char *found_pos;
	if (!reverse) {
		found_pos = zend_memnstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8));
	} else if (offset >= 0) {
		found_pos = zend_memnrstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8));
	} else {
		/* Reverse search with a negative offset: a match may begin at the offset position
		 * and extend beyond it, so the search window ends one needle-length (counted in
		 * codepoints) after the offset position.
		 * The needle's codepoint count must be taken from its UTF-8 form; the UTF-8
		 * length routine gives meaningless results on bytes in any other encoding */
		size_t needle_len = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(needle_u8), (unsigned char*)ZSTR_VAL(needle_u8) + ZSTR_LEN(needle_u8));
		offset_pointer = offset_to_pointer_utf8(offset_pointer, (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8), needle_len);
		if (!offset_pointer) {
			/* Fewer than `needle_len` characters remain; let the window run to the end */
			offset_pointer = (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8);
		}

		found_pos = zend_memnrstr(ZSTR_VAL(haystack_u8), ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)offset_pointer);
	}

	if (found_pos) {
		/* Convert the byte position of the match back into a codepoint index */
		result = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(haystack_u8), (unsigned char*)found_pos);
	}

out:
	/* Only free strings which were created by the conversion above */
	if (haystack_u8 != haystack) {
		zend_string_free(haystack_u8);
	}
	if (needle_u8 != needle) {
		zend_string_free(needle_u8);
	}
	return result;
}
1949
handle_strpos_error(size_t error)1950 static void handle_strpos_error(size_t error) {
1951 switch (error) {
1952 case MBFL_ERROR_NOT_FOUND:
1953 break;
1954 case MBFL_ERROR_ENCODING:
1955 php_error_docref(NULL, E_WARNING, "Conversion error");
1956 break;
1957 case MBFL_ERROR_OFFSET:
1958 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1959 break;
1960 default:
1961 zend_value_error("mb_strpos(): Unknown error");
1962 break;
1963 }
1964 }
1965
PHP_FUNCTION(mb_strpos)1966 PHP_FUNCTION(mb_strpos)
1967 {
1968 zend_long offset = 0;
1969 zend_string *needle, *haystack;
1970 zend_string *enc_name = NULL;
1971
1972 ZEND_PARSE_PARAMETERS_START(2, 4)
1973 Z_PARAM_STR(haystack)
1974 Z_PARAM_STR(needle)
1975 Z_PARAM_OPTIONAL
1976 Z_PARAM_LONG(offset)
1977 Z_PARAM_STR_OR_NULL(enc_name)
1978 ZEND_PARSE_PARAMETERS_END();
1979
1980 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
1981 if (!enc) {
1982 RETURN_THROWS();
1983 }
1984
1985 size_t n = mb_find_strpos(haystack, needle, enc, offset, false);
1986 if (!mbfl_is_error(n)) {
1987 RETVAL_LONG(n);
1988 } else {
1989 handle_strpos_error(n);
1990 RETVAL_FALSE;
1991 }
1992 }
1993
1994 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)1995 PHP_FUNCTION(mb_strrpos)
1996 {
1997 zend_long offset = 0;
1998 zend_string *needle, *haystack;
1999 zend_string *enc_name = NULL;
2000
2001 ZEND_PARSE_PARAMETERS_START(2, 4)
2002 Z_PARAM_STR(haystack)
2003 Z_PARAM_STR(needle)
2004 Z_PARAM_OPTIONAL
2005 Z_PARAM_LONG(offset)
2006 Z_PARAM_STR_OR_NULL(enc_name)
2007 ZEND_PARSE_PARAMETERS_END();
2008
2009 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
2010 if (!enc) {
2011 RETURN_THROWS();
2012 }
2013
2014 size_t n = mb_find_strpos(haystack, needle, enc, offset, true);
2015 if (!mbfl_is_error(n)) {
2016 RETVAL_LONG(n);
2017 } else {
2018 handle_strpos_error(n);
2019 RETVAL_FALSE;
2020 }
2021 }
2022 /* }}} */
2023
2024 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)2025 PHP_FUNCTION(mb_stripos)
2026 {
2027 zend_long offset = 0;
2028 zend_string *haystack, *needle;
2029 zend_string *from_encoding = NULL;
2030
2031 ZEND_PARSE_PARAMETERS_START(2, 4)
2032 Z_PARAM_STR(haystack)
2033 Z_PARAM_STR(needle)
2034 Z_PARAM_OPTIONAL
2035 Z_PARAM_LONG(offset)
2036 Z_PARAM_STR_OR_NULL(from_encoding)
2037 ZEND_PARSE_PARAMETERS_END();
2038
2039 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2040 if (!enc) {
2041 RETURN_THROWS();
2042 }
2043
2044 size_t n = php_mb_stripos(false, haystack, needle, offset, enc);
2045
2046 if (!mbfl_is_error(n)) {
2047 RETVAL_LONG(n);
2048 } else {
2049 handle_strpos_error(n);
2050 RETVAL_FALSE;
2051 }
2052 }
2053 /* }}} */
2054
2055 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)2056 PHP_FUNCTION(mb_strripos)
2057 {
2058 zend_long offset = 0;
2059 zend_string *haystack, *needle;
2060 zend_string *from_encoding = NULL;
2061
2062 ZEND_PARSE_PARAMETERS_START(2, 4)
2063 Z_PARAM_STR(haystack)
2064 Z_PARAM_STR(needle)
2065 Z_PARAM_OPTIONAL
2066 Z_PARAM_LONG(offset)
2067 Z_PARAM_STR_OR_NULL(from_encoding)
2068 ZEND_PARSE_PARAMETERS_END();
2069
2070 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2071 if (!enc) {
2072 RETURN_THROWS();
2073 }
2074
2075 size_t n = php_mb_stripos(true, haystack, needle, offset, enc);
2076
2077 if (!mbfl_is_error(n)) {
2078 RETVAL_LONG(n);
2079 } else {
2080 handle_strpos_error(n);
2081 RETVAL_FALSE;
2082 }
2083 }
2084 /* }}} */
2085
mb_get_substr_slow(unsigned char * in,size_t in_len,size_t from,size_t len,const mbfl_encoding * enc)2086 static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t from, size_t len, const mbfl_encoding *enc)
2087 {
2088 uint32_t wchar_buf[128];
2089 unsigned int state = 0;
2090
2091 mb_convert_buf buf;
2092 mb_convert_buf_init(&buf, MIN(len, in_len - from), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2093
2094 while (in_len && len) {
2095 size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2096 ZEND_ASSERT(out_len <= 128);
2097
2098 if (from >= out_len) {
2099 from -= out_len;
2100 } else {
2101 size_t needed_codepoints = MIN(out_len - from, len);
2102 enc->from_wchar(wchar_buf + from, needed_codepoints, &buf, !in_len || out_len >= len);
2103 from = 0;
2104 len -= needed_codepoints;
2105 }
2106 }
2107
2108 return mb_convert_buf_result(&buf, enc);
2109 }
2110
/* Return the substring of `input` (in encoding `enc`) which starts at codepoint index
 * `from` and is at most `len` codepoints long. Fixed-width encodings are handled by
 * direct byte slicing; everything else goes through mb_get_substr_slow. */
static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
		/* Other than MacJapanese, no supported text encoding decodes to
		 * more than one codepoint per byte
		 * So if the number of codepoints to skip >= number of input bytes,
		 * then definitely the output should be empty */
		return zend_empty_string;
	}

	/* Does each codepoint have a fixed byte width? */
	unsigned int flag = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
	if (flag) {
		/* The value of the flag is 2 if each codepoint takes 2 bytes, or 4 if 4 bytes
		 * Convert codepoint counts to byte counts, saturating at SIZE_MAX rather than
		 * wrapping around; a wrapped `len` could become small (even zero) and cut the
		 * result short */
		from = (from > SIZE_MAX / flag) ? SIZE_MAX : from * flag;
		len = (len > SIZE_MAX / flag) ? SIZE_MAX : len * flag;
		if (from >= in_len) {
			return zend_empty_string;
		}
		in += from;
		in_len -= from;
		if (len > in_len) {
			len = in_len;
		}
		return zend_string_init_fast((const char*)in, len);
	}

	return mb_get_substr_slow(in, in_len, from, len, enc);
}
2143
2144 #define MB_STRSTR 1
2145 #define MB_STRRCHR 2
2146 #define MB_STRISTR 3
2147 #define MB_STRRICHR 4
2148
/* Shared implementation of mb_strstr, mb_strrchr, mb_stristr and mb_strrichr.
 * `variant` is one of the MB_STR* constants; it selects the search direction and
 * whether the search is case-insensitive. */
static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
{
	zend_string *haystack, *needle;
	zend_string *encoding_name = NULL;
	bool part = false;

	ZEND_PARSE_PARAMETERS_START(2, 4)
		Z_PARAM_STR(haystack)
		Z_PARAM_STR(needle)
		Z_PARAM_OPTIONAL
		Z_PARAM_BOOL(part)
		Z_PARAM_STR_OR_NULL(encoding_name)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding_name, 4);
	if (!enc) {
		RETURN_THROWS();
	}

	const bool search_backwards = (variant == MB_STRRCHR || variant == MB_STRRICHR);
	const bool ignore_case = (variant == MB_STRISTR || variant == MB_STRRICHR);

	size_t n = ignore_case
		? php_mb_stripos(search_backwards, haystack, needle, 0, enc)
		: mb_find_strpos(haystack, needle, enc, 0, search_backwards);

	if (mbfl_is_error(n)) {
		// FIXME use handle_strpos_error(n)
		RETURN_FALSE;
	}

	if (part) {
		/* Portion of the haystack which comes before the match */
		RETVAL_STR(mb_get_substr(haystack, 0, n, enc));
	} else {
		/* Portion of the haystack from the start of the match to the end */
		RETVAL_STR(mb_get_substr(haystack, n, MBFL_SUBSTR_UNTIL_END, enc));
	}
}
2190
2191 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(mb_strstr)
{
	/* All work is done by the shared implementation; MB_STRSTR selects the
	 * forward, case-sensitive search */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
}
2196 /* }}} */
2197
2198 /* {{{ Finds the last occurrence of a character in a string within another */
PHP_FUNCTION(mb_strrchr)
{
	/* All work is done by the shared implementation; MB_STRRCHR selects the
	 * reverse, case-sensitive search */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
}
2203 /* }}} */
2204
2205 /* {{{ Finds first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stristr)
{
	/* All work is done by the shared implementation; MB_STRISTR selects the
	 * forward, case-insensitive search */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
}
2210 /* }}} */
2211
2212 /* {{{ Finds the last occurrence of a character in a string within another, case insensitive */
PHP_FUNCTION(mb_strrichr)
{
	/* All work is done by the shared implementation; MB_STRRICHR selects the
	 * reverse, case-insensitive search */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
}
2217 /* }}} */
2218
2219 #undef MB_STRSTR
2220 #undef MB_STRRCHR
2221 #undef MB_STRISTR
2222 #undef MB_STRRICHR
2223
PHP_FUNCTION(mb_substr_count)2224 PHP_FUNCTION(mb_substr_count)
2225 {
2226 zend_string *haystack, *needle, *enc_name = NULL, *haystack_u8 = NULL, *needle_u8 = NULL;
2227
2228 ZEND_PARSE_PARAMETERS_START(2, 3)
2229 Z_PARAM_STR(haystack)
2230 Z_PARAM_STR(needle)
2231 Z_PARAM_OPTIONAL
2232 Z_PARAM_STR_OR_NULL(enc_name)
2233 ZEND_PARSE_PARAMETERS_END();
2234
2235 if (ZSTR_LEN(needle) == 0) {
2236 zend_argument_value_error(2, "must not be empty");
2237 RETURN_THROWS();
2238 }
2239
2240 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
2241 if (!enc) {
2242 RETURN_THROWS();
2243 }
2244
2245 if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
2246 /* No need to do any conversion if haystack/needle are already known-valid UTF-8
2247 * (If they are not valid, then not passing them through conversion filters could affect output) */
2248 if (GC_FLAGS(haystack) & IS_STR_VALID_UTF8) {
2249 haystack_u8 = haystack;
2250 } else {
2251 unsigned int num_errors = 0;
2252 haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2253 if (!num_errors && !ZSTR_IS_INTERNED(haystack)) {
2254 GC_ADD_FLAGS(haystack, IS_STR_VALID_UTF8);
2255 }
2256 }
2257
2258 if (GC_FLAGS(needle) & IS_STR_VALID_UTF8) {
2259 needle_u8 = needle;
2260 } else {
2261 unsigned int num_errors = 0;
2262 needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2263 if (!num_errors && !ZSTR_IS_INTERNED(needle)) {
2264 GC_ADD_FLAGS(needle, IS_STR_VALID_UTF8);
2265 }
2266 }
2267 } else {
2268 unsigned int num_errors = 0;
2269 haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2270 needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2271 /* A string with >0 bytes may convert to 0 codepoints; for example, the contents
2272 * may be only escape sequences */
2273 if (ZSTR_LEN(needle_u8) == 0) {
2274 zend_string_free(haystack_u8);
2275 zend_string_free(needle_u8);
2276 zend_argument_value_error(2, "must not be empty");
2277 RETURN_THROWS();
2278 }
2279 }
2280
2281 size_t result = 0;
2282
2283 if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
2284 goto out;
2285 }
2286
2287 const char *p = ZSTR_VAL(haystack_u8), *e = p + ZSTR_LEN(haystack_u8);
2288 while (true) {
2289 p = zend_memnstr(p, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), e);
2290 if (!p) {
2291 break;
2292 }
2293 p += ZSTR_LEN(needle_u8);
2294 result++;
2295 }
2296
2297 out:
2298 if (haystack_u8 != haystack) {
2299 zend_string_free(haystack_u8);
2300 }
2301 if (needle_u8 != needle) {
2302 zend_string_free(needle_u8);
2303 }
2304
2305 RETVAL_LONG(result);
2306 }
2307
2308 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2309 PHP_FUNCTION(mb_substr)
2310 {
2311 zend_string *str, *encoding = NULL;
2312 zend_long from, len;
2313 size_t real_from, real_len;
2314 bool len_is_null = true;
2315
2316 ZEND_PARSE_PARAMETERS_START(2, 4)
2317 Z_PARAM_STR(str)
2318 Z_PARAM_LONG(from)
2319 Z_PARAM_OPTIONAL
2320 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2321 Z_PARAM_STR_OR_NULL(encoding)
2322 ZEND_PARSE_PARAMETERS_END();
2323
2324 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2325 if (!enc) {
2326 RETURN_THROWS();
2327 }
2328
2329 size_t mblen = 0;
2330 if (from < 0 || (!len_is_null && len < 0)) {
2331 mblen = mb_get_strlen(str, enc);
2332 }
2333
2334 /* if "from" position is negative, count start position from the end
2335 * of the string */
2336 if (from >= 0) {
2337 real_from = (size_t) from;
2338 } else if (-from < mblen) {
2339 real_from = mblen + from;
2340 } else {
2341 real_from = 0;
2342 }
2343
2344 /* if "length" position is negative, set it to the length
2345 * needed to stop that many chars from the end of the string */
2346 if (len_is_null) {
2347 real_len = MBFL_SUBSTR_UNTIL_END;
2348 } else if (len >= 0) {
2349 real_len = (size_t) len;
2350 } else if (real_from < mblen && -len < mblen - real_from) {
2351 real_len = (mblen - real_from) + len;
2352 } else {
2353 real_len = 0;
2354 }
2355
2356 RETVAL_STR(mb_get_substr(str, real_from, real_len, enc));
2357 }
2358 /* }}} */
2359
2360 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2361 PHP_FUNCTION(mb_strcut)
2362 {
2363 zend_string *encoding = NULL;
2364 char *string_val;
2365 zend_long from, len;
2366 bool len_is_null = 1;
2367 mbfl_string string, result, *ret;
2368
2369 ZEND_PARSE_PARAMETERS_START(2, 4)
2370 Z_PARAM_STRING(string_val, string.len)
2371 Z_PARAM_LONG(from)
2372 Z_PARAM_OPTIONAL
2373 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2374 Z_PARAM_STR_OR_NULL(encoding)
2375 ZEND_PARSE_PARAMETERS_END();
2376
2377 string.val = (unsigned char*)string_val;
2378 string.encoding = php_mb_get_encoding(encoding, 4);
2379 if (!string.encoding) {
2380 RETURN_THROWS();
2381 }
2382
2383 if (len_is_null) {
2384 len = string.len;
2385 }
2386
2387 /* if "from" position is negative, count start position from the end
2388 * of the string
2389 */
2390 if (from < 0) {
2391 from = string.len + from;
2392 if (from < 0) {
2393 from = 0;
2394 }
2395 }
2396
2397 /* if "length" position is negative, set it to the length
2398 * needed to stop that many chars from the end of the string
2399 */
2400 if (len < 0) {
2401 len = (string.len - from) + len;
2402 if (len < 0) {
2403 len = 0;
2404 }
2405 }
2406
2407 if (from > string.len) {
2408 RETURN_EMPTY_STRING();
2409 }
2410
2411 ret = mbfl_strcut(&string, &result, from, len);
2412 ZEND_ASSERT(ret != NULL);
2413
2414 // TODO: avoid reallocation ???
2415 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2416 efree(ret->val);
2417 }
2418 /* }}} */
2419
2420 /* Some East Asian characters, when printed at a terminal (or the like), require double
2421 * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2422 static size_t character_width(uint32_t c)
2423 {
2424 if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2425 return 1;
2426 }
2427
2428 /* Do a binary search to see if we fall in any of the fullwidth ranges */
2429 int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2430 while (lo < hi) {
2431 int probe = (lo + hi) / 2;
2432 if (c < mbfl_eaw_table[probe].begin) {
2433 hi = probe;
2434 } else if (c > mbfl_eaw_table[probe].end) {
2435 lo = probe + 1;
2436 } else {
2437 return 2;
2438 }
2439 }
2440
2441 return 1;
2442 }
2443
/* Total display width of `string` (in encoding `enc`): the sum of
 * character_width() over all of its decoded codepoints */
static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
{
	uint32_t wchar_buf[128];
	unsigned char *src = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total = 0;

	/* Decode up to 128 codepoints at a time until the input is used up */
	while (bytes_left > 0) {
		size_t decoded = enc->to_wchar(&src, &bytes_left, wchar_buf, 128, &state);
		ZEND_ASSERT(decoded <= 128);

		for (size_t i = 0; i < decoded; i++) {
			/* NOTE: the 'bad input' marker is counted as 1 unit of width
			 * When text conversion is done with an ordinary ASCII character as the
			 * 'replacement character', this gives the correct display width. */
			total += character_width(wchar_buf[i]);
		}
	}

	return total;
}
2466
2467 /* Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2468 PHP_FUNCTION(mb_strwidth)
2469 {
2470 zend_string *string, *enc_name = NULL;
2471
2472 ZEND_PARSE_PARAMETERS_START(1, 2)
2473 Z_PARAM_STR(string)
2474 Z_PARAM_OPTIONAL
2475 Z_PARAM_STR_OR_NULL(enc_name)
2476 ZEND_PARSE_PARAMETERS_END();
2477
2478 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2479 if (!enc) {
2480 RETURN_THROWS();
2481 }
2482
2483 RETVAL_LONG(mb_get_strwidth(string, enc));
2484 }
2485
/* Implementation of mb_strimwidth: take the part of `input` which starts at codepoint
 * index `from`, and cut it down so that it fits in a display width of `width`.
 * If something had to be cut off, `marker` is appended, and the amount taken from
 * `input` is reduced to make room for the marker's own width.
 *
 * This works in two phases:
 * 1) Scan `input`, adding up character widths, to find out whether it fits
 * 2) Only if it does not fit: build the output from a prefix of `input` plus `marker` */
static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, unsigned int from, int width)
{
	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);
	unsigned int state = 0;
	int remaining_width = width;
	unsigned int to_skip = from;
	size_t out_len = 0;
	bool first_call = true, input_err = false;
	mb_convert_buf buf;

	/* Phase 1: decode the input in chunks of up to 128 codepoints. The first `from`
	 * codepoints are skipped; the width of each one after that is deducted from
	 * `remaining_width` */
	while (in_len) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

		if (out_len <= to_skip) {
			/* Everything in this chunk comes before the starting position */
			to_skip -= out_len;
		} else {
			for (int i = to_skip; i < out_len; i++) {
				uint32_t w = wchar_buf[i];
				/* Remember if any invalid input was seen; this decides how the result
				 * is produced if the string turns out to fit */
				input_err |= (w == MBFL_BAD_INPUT);
				remaining_width -= character_width(w);
				if (remaining_width < 0) {
					/* We need to truncate string and append trim marker */
					width -= mb_get_strwidth(marker, enc);
					/* 'width' is now the amount we want to take from 'input' */
					if (width <= 0) {
						/* No room for anything but the marker itself */
						return zend_string_copy(marker);
					}
					mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

					if (first_call) {
						/* We can use the buffer of wchars which we have right now;
						 * no need to convert again */
						goto dont_restart_conversion;
					} else {
						goto restart_conversion;
					}
				}
			}
			to_skip = 0;
		}
		first_call = false;
	}

	/* The input string fits in the requested width; we don't need to append the trim marker
	 * However, if the string contains erroneous byte sequences, those should be converted
	 * to error markers */
	if (!input_err) {
		if (from == 0) {
			/* This just increments the string's refcount; it doesn't really 'copy' it */
			return zend_string_copy(input);
		} else {
			return mb_get_substr(input, from, MBFL_SUBSTR_UNTIL_END, enc);
		}
	} else {
		/* We can't use `mb_get_substr`, because it uses the fastest method possible of
		 * picking out a substring, which may not include converting erroneous byte
		 * sequences to error markers */
		return mb_get_substr_slow((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), from, MBFL_SUBSTR_UNTIL_END, enc);
	}

	/* The input string is too wide; we need to build a new string which
	 * includes some portion of the input string, with the trim marker
	 * concatenated onto it */
restart_conversion:
	/* Phase 2 (only reachable by `goto`): start decoding again from the beginning of the input */
	in = (unsigned char*)ZSTR_VAL(input);
	in_len = ZSTR_LEN(input);
	state = 0;

	while (true) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

dont_restart_conversion:
		/* When entered by `goto` from phase 1, `wchar_buf` and `out_len` still hold the
		 * first chunk of the input, so it does not need to be decoded a second time */
		if (out_len <= from) {
			from -= out_len;
		} else {
			for (int i = from; i < out_len; i++) {
				width -= character_width(wchar_buf[i]);
				if (width < 0) {
					/* Codepoint `i` is the first which does not fit: emit those before it
					 * as the final batch, then go and add the marker */
					enc->from_wchar(wchar_buf + from, i - from, &buf, true);
					goto append_trim_marker;
				}
			}
			/* Phase 1 established that the text does not fit, so the cut-off point
			 * must come before the input runs out */
			ZEND_ASSERT(in_len > 0);
			enc->from_wchar(wchar_buf + from, out_len - from, &buf, false);
			from = 0;
		}
	}

append_trim_marker:
	/* The marker's bytes are appended as they are, without passing through `enc` */
	if (ZSTR_LEN(marker) > 0) {
		MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
		memcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
		buf.out += ZSTR_LEN(marker);
	}

	/* Even if `enc` is UTF-8, don't mark the output string as valid UTF-8, because
	 * we have no guarantee that the trim marker string is valid UTF-8 */
	return mb_convert_buf_result_raw(&buf);
}
2589
/* Trim the string to terminal width; optionally add a 'trim marker' if it was truncated */
PHP_FUNCTION(mb_strimwidth)2591 PHP_FUNCTION(mb_strimwidth)
2592 {
2593 zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2594 zend_long from, width;
2595
2596 ZEND_PARSE_PARAMETERS_START(3, 5)
2597 Z_PARAM_STR(str)
2598 Z_PARAM_LONG(from)
2599 Z_PARAM_LONG(width)
2600 Z_PARAM_OPTIONAL
2601 Z_PARAM_STR(trimmarker)
2602 Z_PARAM_STR_OR_NULL(encoding)
2603 ZEND_PARSE_PARAMETERS_END();
2604
2605 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2606 if (!enc) {
2607 RETURN_THROWS();
2608 }
2609
2610 if (from != 0) {
2611 size_t str_len = mb_get_strlen(str, enc);
2612 if (from < 0) {
2613 from += str_len;
2614 }
2615 if (from < 0 || from > str_len) {
2616 zend_argument_value_error(2, "is out of range");
2617 RETURN_THROWS();
2618 }
2619 }
2620
2621 if (width < 0) {
2622 php_error_docref(NULL, E_DEPRECATED,
2623 "passing a negative integer to argument #3 ($width) is deprecated");
2624 width += mb_get_strwidth(str, enc);
2625
2626 if (from > 0) {
2627 zend_string *trimmed = mb_get_substr(str, 0, from, enc);
2628 width -= mb_get_strwidth(trimmed, enc);
2629 zend_string_free(trimmed);
2630 }
2631
2632 if (width < 0) {
2633 zend_argument_value_error(3, "is out of range");
2634 RETURN_THROWS();
2635 }
2636 }
2637
2638 RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2639 }
2640
2641
2642 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2643 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2644 {
2645 return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2646 || (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2647 || (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2648 || (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2649 }
2650
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2651 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2652 {
2653 unsigned int num_errors = 0;
2654 zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2655 MBSTRG(illegalchars) += num_errors;
2656 return result;
2657 }
2658
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2659 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2660 {
2661 const mbfl_encoding *from_encoding;
2662
2663 /* pre-conversion encoding */
2664 ZEND_ASSERT(num_from_encodings >= 1);
2665 if (num_from_encodings == 1) {
2666 from_encoding = *from_encodings;
2667 } else {
2668 /* auto detect */
2669 from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true);
2670 if (!from_encoding) {
2671 php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2672 return NULL;
2673 }
2674 }
2675
2676 return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2677 }
2678
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2679 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2680 {
2681 HashTable *output, *chash;
2682 zend_long idx;
2683 zend_string *key;
2684 zval *entry, entry_tmp;
2685
2686 if (!input) {
2687 return NULL;
2688 }
2689
2690 if (GC_IS_RECURSIVE(input)) {
2691 GC_UNPROTECT_RECURSION(input);
2692 php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2693 return NULL;
2694 }
2695 GC_TRY_PROTECT_RECURSION(input);
2696 output = zend_new_array(zend_hash_num_elements(input));
2697 ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2698 /* convert key */
2699 if (key) {
2700 zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2701 if (!converted_key) {
2702 continue;
2703 }
2704 key = converted_key;
2705 }
2706 /* convert value */
2707 ZEND_ASSERT(entry);
2708 try_again:
2709 switch(Z_TYPE_P(entry)) {
2710 case IS_STRING: {
2711 zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2712 if (!converted_key) {
2713 if (key) {
2714 zend_string_release(key);
2715 }
2716 continue;
2717 }
2718 ZVAL_STR(&entry_tmp, converted_key);
2719 break;
2720 }
2721 case IS_NULL:
2722 case IS_TRUE:
2723 case IS_FALSE:
2724 case IS_LONG:
2725 case IS_DOUBLE:
2726 ZVAL_COPY(&entry_tmp, entry);
2727 break;
2728 case IS_ARRAY:
2729 chash = php_mb_convert_encoding_recursive(
2730 Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2731 if (chash) {
2732 ZVAL_ARR(&entry_tmp, chash);
2733 } else {
2734 ZVAL_EMPTY_ARRAY(&entry_tmp);
2735 }
2736 break;
2737 case IS_REFERENCE:
2738 entry = Z_REFVAL_P(entry);
2739 goto try_again;
2740 case IS_OBJECT:
2741 default:
2742 if (key) {
2743 zend_string_release(key);
2744 }
2745 php_error_docref(NULL, E_WARNING, "Object is not supported");
2746 continue;
2747 }
2748 if (key) {
2749 zend_hash_add(output, key, &entry_tmp);
2750 zend_string_release(key);
2751 } else {
2752 zend_hash_index_add(output, idx, &entry_tmp);
2753 }
2754 } ZEND_HASH_FOREACH_END();
2755 GC_TRY_UNPROTECT_RECURSION(input);
2756
2757 return output;
2758 }
2759 /* }}} */
2760
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2761 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2762 {
2763 /* mbstring supports some 'text encodings' which aren't really text encodings
2764 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2765 * These should never be returned by `mb_detect_encoding`. */
2766 int shift = 0;
2767 for (int i = 0; i < *size; i++) {
2768 const mbfl_encoding *encoding = elist[i];
2769 if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2770 shift++; /* Remove this encoding from the list */
2771 } else if (shift) {
2772 elist[i - shift] = encoding;
2773 }
2774 }
2775 *size -= shift;
2776 }
2777
2778 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2779 PHP_FUNCTION(mb_convert_encoding)
2780 {
2781 zend_string *to_encoding_name;
2782 zend_string *input_str, *from_encodings_str = NULL;
2783 HashTable *input_ht, *from_encodings_ht = NULL;
2784 const mbfl_encoding **from_encodings;
2785 size_t num_from_encodings;
2786 bool free_from_encodings = false;
2787
2788 ZEND_PARSE_PARAMETERS_START(2, 3)
2789 Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2790 Z_PARAM_STR(to_encoding_name)
2791 Z_PARAM_OPTIONAL
2792 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2793 ZEND_PARSE_PARAMETERS_END();
2794
2795 const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2796 if (!to_encoding) {
2797 RETURN_THROWS();
2798 }
2799
2800 if (from_encodings_ht) {
2801 if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2802 RETURN_THROWS();
2803 }
2804 free_from_encodings = true;
2805 } else if (from_encodings_str) {
2806 if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2807 &from_encodings, &num_from_encodings,
2808 /* persistent */ 0, /* arg_num */ 3) == FAILURE) {
2809 RETURN_THROWS();
2810 }
2811 free_from_encodings = true;
2812 } else {
2813 from_encodings = &MBSTRG(current_internal_encoding);
2814 num_from_encodings = 1;
2815 }
2816
2817 if (num_from_encodings > 1) {
2818 remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2819 }
2820
2821 if (!num_from_encodings) {
2822 efree(ZEND_VOIDP(from_encodings));
2823 zend_argument_value_error(3, "must specify at least one encoding");
2824 RETURN_THROWS();
2825 }
2826
2827 if (input_str) {
2828 zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2829 if (ret != NULL) {
2830 RETVAL_STR(ret);
2831 } else {
2832 RETVAL_FALSE;
2833 }
2834 } else {
2835 HashTable *tmp;
2836 tmp = php_mb_convert_encoding_recursive(
2837 input_ht, to_encoding, from_encodings, num_from_encodings);
2838 RETVAL_ARR(tmp);
2839 }
2840
2841 if (free_from_encodings) {
2842 efree(ZEND_VOIDP(from_encodings));
2843 }
2844 }
2845 /* }}} */
2846
mbstring_convert_case(php_case_mode case_mode,const char * str,size_t str_len,const mbfl_encoding * enc)2847 static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
2848 {
2849 return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2850 }
2851
PHP_FUNCTION(mb_convert_case)2852 PHP_FUNCTION(mb_convert_case)
2853 {
2854 zend_string *str, *from_encoding = NULL;
2855 zend_long case_mode = 0;
2856
2857 ZEND_PARSE_PARAMETERS_START(2, 3)
2858 Z_PARAM_STR(str)
2859 Z_PARAM_LONG(case_mode)
2860 Z_PARAM_OPTIONAL
2861 Z_PARAM_STR_OR_NULL(from_encoding)
2862 ZEND_PARSE_PARAMETERS_END();
2863
2864 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2865 if (!enc) {
2866 RETURN_THROWS();
2867 }
2868
2869 if (case_mode < 0 || case_mode >= PHP_UNICODE_CASE_MODE_MAX) {
2870 zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2871 RETURN_THROWS();
2872 }
2873
2874 RETURN_STR(mbstring_convert_case(case_mode, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2875 }
2876
PHP_FUNCTION(mb_strtoupper)2877 PHP_FUNCTION(mb_strtoupper)
2878 {
2879 zend_string *str, *from_encoding = NULL;
2880
2881 ZEND_PARSE_PARAMETERS_START(1, 2)
2882 Z_PARAM_STR(str)
2883 Z_PARAM_OPTIONAL
2884 Z_PARAM_STR_OR_NULL(from_encoding)
2885 ZEND_PARSE_PARAMETERS_END();
2886
2887 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2888 if (!enc) {
2889 RETURN_THROWS();
2890 }
2891
2892 RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_UPPER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2893 }
2894
PHP_FUNCTION(mb_strtolower)2895 PHP_FUNCTION(mb_strtolower)
2896 {
2897 zend_string *str, *from_encoding = NULL;
2898
2899 ZEND_PARSE_PARAMETERS_START(1, 2)
2900 Z_PARAM_STR(str)
2901 Z_PARAM_OPTIONAL
2902 Z_PARAM_STR_OR_NULL(from_encoding)
2903 ZEND_PARSE_PARAMETERS_END();
2904
2905 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2906 if (!enc) {
2907 RETURN_THROWS();
2908 }
2909
2910 RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2911 }
2912
duplicate_elist(const mbfl_encoding ** elist,size_t size)2913 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
2914 {
2915 const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
2916 memcpy(ZEND_VOIDP(new_elist), elist, size * sizeof(mbfl_encoding*));
2917 return new_elist;
2918 }
2919
estimate_demerits(uint32_t w)2920 static unsigned int estimate_demerits(uint32_t w)
2921 {
2922 /* Receive wchars decoded from input string using candidate encoding.
2923 * Give the candidate many 'demerits' for each 'rare' codepoint found,
2924 * a smaller number for each ASCII punctuation character, and 1 for
2925 * all other codepoints.
2926 *
2927 * The 'common' codepoints should cover the vast majority of
2928 * codepoints we are likely to see in practice, while only covering
2929 * a small minority of the entire Unicode encoding space. Why?
2930 * Well, if the test string happens to be valid in an incorrect
2931 * candidate encoding, the bogus codepoints which it decodes to will
2932 * be more or less random. By treating the majority of codepoints as
2933 * 'rare', we ensure that in almost all such cases, the bogus
2934 * codepoints will include plenty of 'rares', thus giving the
2935 * incorrect candidate encoding lots of demerits. See
2936 * common_codepoints.txt for the actual list used.
2937 *
2938 * So, why give extra demerits for ASCII punctuation characters? It's
2939 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
2940 * which deliberately only use bytes in the ASCII range. When
2941 * misinterpreted as ASCII/UTF-8, strings in these encodings will
2942 * have an unusually high number of ASCII punctuation characters.
2943 * So giving extra demerits for such characters will improve
2944 * detection accuracy for UTF-7 and similar encodings.
2945 *
2946 * Finally, why 1 demerit for all other characters? That penalizes
2947 * long strings, meaning we will tend to choose a candidate encoding
2948 * in which the test string decodes to a smaller number of
2949 * codepoints. That prevents single-byte encodings in which almost
2950 * every possible input byte decodes to a 'common' codepoint from
2951 * being favored too much. */
2952 if (w > 0xFFFF) {
2953 return 40;
2954 } else if (w >= 0x21 && w <= 0x2F) {
2955 return 6;
2956 } else if ((rare_codepoint_bitvec[w >> 5] >> (w & 0x1F)) & 1) {
2957 return 30;
2958 } else {
2959 return 1;
2960 }
2961 return 0;
2962 }
2963
2964 struct candidate {
2965 const mbfl_encoding *enc;
2966 const unsigned char *in;
2967 size_t in_len;
2968 uint64_t demerits; /* Wide bit size to prevent overflow */
2969 unsigned int state;
2970 float multiplier;
2971 };
2972
init_candidate_array(struct candidate * array,size_t length,const mbfl_encoding ** encodings,const unsigned char ** in,size_t * in_len,size_t n,bool strict,bool order_significant)2973 static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant)
2974 {
2975 size_t j = 0;
2976
2977 for (size_t i = 0; i < length; i++) {
2978 const mbfl_encoding *enc = encodings[i];
2979
2980 array[j].enc = enc;
2981 array[j].state = 0;
2982 array[j].demerits = 0;
2983
2984 /* If any candidate encodings have specialized validation functions, use them
2985 * to eliminate as many candidates as possible */
2986 if (enc->check != NULL) {
2987 for (size_t k = 0; k < n; k++) {
2988 if (!enc->check((unsigned char*)in[k], in_len[k])) {
2989 if (strict) {
2990 goto skip_to_next;
2991 } else {
2992 array[j].demerits += 500;
2993 }
2994 }
2995 }
2996 }
2997
2998 /* This multiplier can optionally be used to make candidate encodings listed
2999 * first more likely to be chosen. It is a weight factor which multiplies
3000 * the number of demerits counted for each candidate. */
3001 array[j].multiplier = order_significant ? 1.0 + ((0.3 * i) / length) : 1.0;
3002 j++;
3003 skip_to_next: ;
3004 }
3005
3006 return j;
3007 }
3008
start_string(struct candidate * array,size_t length,const unsigned char * in,size_t in_len)3009 static void start_string(struct candidate *array, size_t length, const unsigned char *in, size_t in_len)
3010 {
3011 for (size_t i = 0; i < length; i++) {
3012 const mbfl_encoding *enc = array[i].enc;
3013
3014 array[i].in = in;
3015 array[i].in_len = in_len;
3016
3017 /* Skip byte order mark for UTF-8, UTF-16BE, or UTF-16LE */
3018 if (enc == &mbfl_encoding_utf8) {
3019 if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
3020 array[i].in_len -= 3;
3021 array[i].in += 3;
3022 }
3023 } else if (enc == &mbfl_encoding_utf16be) {
3024 if (in_len >= 2 && in[0] == 0xFE && in[1] == 0xFF) {
3025 array[i].in_len -= 2;
3026 array[i].in += 2;
3027 }
3028 } else if (enc == &mbfl_encoding_utf16le) {
3029 if (in_len >= 2 && in[0] == 0xFF && in[1] == 0xFE) {
3030 array[i].in_len -= 2;
3031 array[i].in += 2;
3032 }
3033 }
3034 }
3035 }
3036
count_demerits(struct candidate * array,size_t length,bool strict)3037 static size_t count_demerits(struct candidate *array, size_t length, bool strict)
3038 {
3039 uint32_t wchar_buf[128];
3040 unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */
3041
3042 for (size_t i = 0; i < length; i++) {
3043 if (array[i].in_len == 0) {
3044 finished++;
3045 }
3046 }
3047
3048 while ((strict || length > 1) && finished < length) {
3049 /* Iterate in reverse order to avoid moving candidates that can be eliminated. */
3050 for (size_t i = length - 1; i != (size_t)-1; i--) {
3051 /* Do we still have more input to process for this candidate encoding? */
3052 if (array[i].in_len) {
3053 const mbfl_encoding *enc = array[i].enc;
3054 size_t out_len = enc->to_wchar((unsigned char**)&array[i].in, &array[i].in_len, wchar_buf, 128, &array[i].state);
3055 ZEND_ASSERT(out_len <= 128);
3056 /* Check this batch of decoded codepoints; are there any error markers?
3057 * Also sum up the number of demerits */
3058 while (out_len) {
3059 uint32_t w = wchar_buf[--out_len];
3060 if (w == MBFL_BAD_INPUT) {
3061 if (strict) {
3062 /* This candidate encoding is not valid, eliminate it from consideration */
3063 length--;
3064 if (i < length) {
3065 /* The eliminated candidate was the last valid one in the list */
3066 memmove(&array[i], &array[i+1], (length - i) * sizeof(struct candidate));
3067 }
3068 goto try_next_encoding;
3069 } else {
3070 array[i].demerits += 1000;
3071 }
3072 } else {
3073 array[i].demerits += estimate_demerits(w);
3074 }
3075 }
3076 if (array[i].in_len == 0) {
3077 finished++;
3078 }
3079 }
3080 try_next_encoding:;
3081 }
3082 }
3083
3084 for (size_t i = 0; i < length; i++) {
3085 array[i].demerits *= array[i].multiplier;
3086 }
3087
3088 return length;
3089 }
3090
mb_guess_encoding_for_strings(const unsigned char ** strings,size_t * str_lengths,size_t n,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3091 MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3092 {
3093 if (elist_size == 0) {
3094 return NULL;
3095 }
3096 if (elist_size == 1) {
3097 if (strict) {
3098 while (n--) {
3099 if (!php_mb_check_encoding((const char*)strings[n], str_lengths[n], *elist)) {
3100 return NULL;
3101 }
3102 }
3103 }
3104 return *elist;
3105 }
3106 if (n == 1 && *str_lengths == 0) {
3107 return *elist;
3108 }
3109
3110 /* Allocate on stack; when we return, this array is automatically freed */
3111 struct candidate *array = alloca(elist_size * sizeof(struct candidate));
3112 elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant);
3113
3114 while (n--) {
3115 start_string(array, elist_size, strings[n], str_lengths[n]);
3116 elist_size = count_demerits(array, elist_size, strict);
3117 if (elist_size == 0) {
3118 /* All candidates were eliminated */
3119 return NULL;
3120 }
3121 }
3122
3123 /* See which remaining candidate encoding has the least demerits */
3124 unsigned int best = 0;
3125 for (unsigned int i = 1; i < elist_size; i++) {
3126 if (array[i].demerits < array[best].demerits) {
3127 best = i;
3128 }
3129 }
3130 return array[best].enc;
3131 }
3132
3133 /* When doing 'strict' detection, any string which is invalid in the candidate encoding
3134 * is rejected. With non-strict detection, we just continue, but apply demerits for
3135 * each invalid byte sequence */
mb_guess_encoding(unsigned char * in,size_t in_len,const mbfl_encoding ** elist,unsigned int elist_size,bool strict,bool order_significant)3136 static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3137 {
3138 return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant);
3139 }
3140
/* {{{ Returns the detected encoding of the given string (as a string) */
PHP_FUNCTION(mb_detect_encoding)3142 PHP_FUNCTION(mb_detect_encoding)
3143 {
3144 zend_string *str, *encoding_str = NULL;
3145 HashTable *encoding_ht = NULL;
3146 bool strict = false;
3147 const mbfl_encoding *ret, **elist;
3148 size_t size;
3149
3150 ZEND_PARSE_PARAMETERS_START(1, 3)
3151 Z_PARAM_STR(str)
3152 Z_PARAM_OPTIONAL
3153 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
3154 Z_PARAM_BOOL(strict)
3155 ZEND_PARSE_PARAMETERS_END();
3156
3157 /* Should we pay attention to the order of the provided candidate encodings and prefer
3158 * the earlier ones (if more than one candidate encoding matches)?
3159 * If the entire list of supported encodings returned by `mb_list_encodings` is passed
3160 * in, then don't treat the order as significant */
3161 bool order_significant = true;
3162
3163 /* make encoding list */
3164 if (encoding_ht) {
3165 if (encoding_ht == MBSTRG(all_encodings_list)) {
3166 order_significant = false;
3167 }
3168 if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
3169 RETURN_THROWS();
3170 }
3171 } else if (encoding_str) {
3172 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2)) {
3173 RETURN_THROWS();
3174 }
3175 } else {
3176 elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
3177 size = MBSTRG(current_detect_order_list_size);
3178 }
3179
3180 if (size == 0) {
3181 efree(ZEND_VOIDP(elist));
3182 zend_argument_value_error(2, "must specify at least one encoding");
3183 RETURN_THROWS();
3184 }
3185
3186 remove_non_encodings_from_elist(elist, &size);
3187 if (size == 0) {
3188 efree(ZEND_VOIDP(elist));
3189 RETURN_FALSE;
3190 }
3191
3192 if (ZEND_NUM_ARGS() < 3) {
3193 strict = MBSTRG(strict_detection);
3194 }
3195
3196 if (size == 1 && *elist == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) {
3197 ret = &mbfl_encoding_utf8;
3198 } else {
3199 ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant);
3200 }
3201
3202 efree(ZEND_VOIDP(elist));
3203
3204 if (ret == NULL) {
3205 RETURN_FALSE;
3206 }
3207
3208 RETVAL_STRING((char *)ret->name);
3209 }
3210 /* }}} */
3211
3212 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)3213 PHP_FUNCTION(mb_list_encodings)
3214 {
3215 ZEND_PARSE_PARAMETERS_NONE();
3216
3217 if (MBSTRG(all_encodings_list) == NULL) {
3218 /* Initialize shared array of supported encoding names
3219 * This is done so that we can check if `mb_list_encodings()` is being
3220 * passed to other mbstring functions using a cheap pointer equality check */
3221 HashTable *array = emalloc(sizeof(HashTable));
3222 zend_hash_init(array, 80, NULL, zval_ptr_dtor_str, false);
3223 for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
3224 zval tmp;
3225 ZVAL_STRING(&tmp, (*encodings)->name);
3226 zend_hash_next_index_insert(array, &tmp);
3227 }
3228 MBSTRG(all_encodings_list) = array;
3229 }
3230
3231 GC_ADDREF(MBSTRG(all_encodings_list));
3232 RETURN_ARR(MBSTRG(all_encodings_list));
3233 }
3234 /* }}} */
3235
3236 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)3237 PHP_FUNCTION(mb_encoding_aliases)
3238 {
3239 const mbfl_encoding *encoding;
3240 zend_string *encoding_name = NULL;
3241
3242 ZEND_PARSE_PARAMETERS_START(1, 1)
3243 Z_PARAM_STR(encoding_name)
3244 ZEND_PARSE_PARAMETERS_END();
3245
3246 encoding = php_mb_get_encoding(encoding_name, 1);
3247 if (!encoding) {
3248 RETURN_THROWS();
3249 }
3250
3251 array_init(return_value);
3252 if (encoding->aliases != NULL) {
3253 for (const char **alias = encoding->aliases; *alias; ++alias) {
3254 add_next_index_string(return_value, (char *)*alias);
3255 }
3256 }
3257 }
3258 /* }}} */
3259
/* Apply the kana conversion selected by `mode` to `input` (encoded in
 * `encoding`) and return the converted string in the same encoding.
 *
 * The input is decoded in batches. Since converting one codepoint needs to
 * look at the codepoint which follows it, the last codepoint of a batch is
 * carried over to the front of the next batch unless it has already been
 * consumed together with its predecessor. */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion...
	 * if we are converting zenkaku kana to hankaku kana
	 * Make the buffer for converted kana big enough that we never need to
	 * perform bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	unsigned int carried = 0; /* 1 if wchar_buf[0] holds a codepoint left over from the previous batch */
	unsigned int state = 0;
	unsigned char *src = (unsigned char*)ZSTR_VAL(input);
	size_t remaining = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, remaining, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (remaining) {
		uint32_t *out = converted_buf;

		/* Decode after any carried-over codepoint so that it is not overwritten */
		size_t n = encoding->to_wchar(&src, &remaining, wchar_buf + carried, 64 - carried, &state);
		n += carried;
		ZEND_ASSERT(n <= 64);

		if (n == 0) {
			continue;
		}

		/* Convert every codepoint which has a successor in this batch.
		 * Afterwards, pos == n-1 means the last codepoint is still unprocessed,
		 * while pos == n means it was consumed together with its predecessor. */
		size_t pos = 0;
		while (pos < n - 1) {
			uint32_t second = 0;
			bool consumed = false;
			*out++ = mb_convert_kana_codepoint(wchar_buf[pos], wchar_buf[pos+1], &consumed, &second, mode);
			if (second) {
				*out++ = second;
			}
			pos += consumed ? 2 : 1;
		}

		if (pos == n) {
			/* We consumed two codepoints at the very end of the wchar buffer
			 * So there is nothing remaining to reprocess on the next iteration */
			carried = 0;
		} else if (remaining) {
			/* Reprocess the last codepoint on the next iteration */
			wchar_buf[0] = wchar_buf[n-1];
			carried = 1;
		} else {
			/* This is the last iteration, so we need to process the final codepoint now */
			uint32_t second = 0;
			*out++ = mb_convert_kana_codepoint(wchar_buf[n-1], 0, NULL, &second, mode);
			if (second) {
				*out++ = second;
			}
		}

		/* Last argument is true only once all of the input has been decoded */
		encoding->from_wchar(converted_buf, out - converted_buf, &buf, !remaining);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3324
/* Option letters accepted by mb_convert_kana. The position of a letter in
 * this table is the bit which mb_convert_kana sets for it in the conversion
 * mode; each letter in the first row has its lowercase counterpart 8 positions
 * further on. */
char mb_convert_kana_flags[17] = {
	/* bits 0-7 */
	'A', 'R', 'N', 'S', 'K', 'H', 'M', 'C',
	/* bits 8-15 */
	'a', 'r', 'n', 's', 'k', 'h', 'm', 'c',
	/* bit 16 */
	'V'
};
3330
3331 /* Conversion between full-width characters and half-width characters (Japanese) */
PHP_FUNCTION(mb_convert_kana)3332 PHP_FUNCTION(mb_convert_kana)
3333 {
3334 unsigned int opt;
3335 char *optstr = NULL;
3336 size_t optstr_len;
3337 zend_string *encname = NULL, *str;
3338
3339 ZEND_PARSE_PARAMETERS_START(1, 3)
3340 Z_PARAM_STR(str)
3341 Z_PARAM_OPTIONAL
3342 Z_PARAM_STRING(optstr, optstr_len)
3343 Z_PARAM_STR_OR_NULL(encname)
3344 ZEND_PARSE_PARAMETERS_END();
3345
3346 if (optstr != NULL) {
3347 char *p = optstr, *e = p + optstr_len;
3348 opt = 0;
3349 next_option:
3350 while (p < e) {
3351 /* Walk through option string and convert to bit vector
3352 * See translit_kana_jisx0201_jisx0208.h for the values used */
3353 char c = *p++;
3354 if (c == 'A') {
3355 opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
3356 } else if (c == 'a') {
3357 opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
3358 } else {
3359 for (int i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
3360 if (c == mb_convert_kana_flags[i]) {
3361 opt |= (1 << i);
3362 goto next_option;
3363 }
3364 }
3365
3366 zend_argument_value_error(2, "contains invalid flag: '%c'", c);
3367 RETURN_THROWS();
3368 }
3369 }
3370
3371 /* Check for illegal combinations of options */
3372 if (((opt & 0xFF00) >> 8) & opt) {
3373 /* It doesn't make sense to convert the same type of characters from halfwidth to
3374 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
3375 * FW hiragana to FW katakana and then back again. */
3376 int badflag = ((opt & 0xFF00) >> 8) & opt, i;
3377 for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
3378 char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
3379 if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
3380 flag1 = 'A';
3381 if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
3382 flag2 = 'a';
3383 zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
3384 RETURN_THROWS();
3385 }
3386
3387 if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
3388 /* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
3389 zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
3390 RETURN_THROWS();
3391 }
3392
3393 /* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
3394 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
3395 * more than one of these */
3396 if (opt & MBFL_ZEN2HAN_HIRAGANA) {
3397 if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3398 zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
3399 RETURN_THROWS();
3400 } else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3401 zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
3402 RETURN_THROWS();
3403 }
3404 } else if (opt & MBFL_ZEN2HAN_KATAKANA) {
3405 if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3406 zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
3407 RETURN_THROWS();
3408 } else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3409 zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
3410 RETURN_THROWS();
3411 }
3412 }
3413 } else {
3414 opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
3415 }
3416
3417 const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
3418 if (!enc) {
3419 RETURN_THROWS();
3420 }
3421
3422 RETVAL_STR(jp_kana_convert(str, enc, opt));
3423 }
3424
/* Count the string values reachable from `var`, descending into arrays and
 * objects. A container which is already being traversed contributes 0. */
static unsigned int mb_recursive_count_strings(zval *var)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		return 1;
	}
	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		return 0;
	}

	/* Mark the container so that a cycle leading back to it is detected */
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	unsigned int total = 0;
	HashTable *members = HASH_OF(var);
	if (members != NULL) {
		zval *member;
		ZEND_HASH_FOREACH_VAL_IND(members, member) {
			total += mb_recursive_count_strings(member);
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return total;
}
3455
/* Collect pointers to, and lengths of, every string value reachable from
 * `var` into `val_list` / `len_list`, starting at index `*count` and advancing
 * `*count` for each string stored. The caller must have sized both arrays
 * beforehand (see mb_recursive_count_strings).
 *
 * Returns true if a recursive reference was encountered, false otherwise. */
static bool mb_recursive_find_strings(zval *var, const unsigned char **val_list, size_t *len_list, unsigned int *count)
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		val_list[*count] = (const unsigned char*)Z_STRVAL_P(var);
		len_list[*count] = Z_STRLEN_P(var);
		(*count)++;
	} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
		if (Z_REFCOUNTED_P(var)) {
			if (Z_IS_RECURSIVE_P(var)) {
				return true;
			}
			Z_PROTECT_RECURSION_P(var);
		}

		HashTable *ht = HASH_OF(var);
		if (ht != NULL) {
			zval *entry;
			ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
				if (mb_recursive_find_strings(entry, val_list, len_list, count)) {
					if (Z_REFCOUNTED_P(var)) {
						Z_UNPROTECT_RECURSION_P(var);
					}
					/* Always propagate the failure, whether or not this
					 * container is refcounted (same as
					 * mb_recursive_convert_variable) */
					return true;
				}
			} ZEND_HASH_FOREACH_END();
		}

		if (Z_REFCOUNTED_P(var)) {
			Z_UNPROTECT_RECURSION_P(var);
		}
	}

	return false;
}
3492
/* Convert, in place, every string reachable from `var` from `from_encoding`
 * to `to_encoding`, descending into arrays (which are separated first) and
 * objects.
 *
 * Returns true if a recursive reference was encountered, false otherwise. */
static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
{
	zval *slot = var; /* The zval as passed in, before following any reference */
	ZVAL_DEREF(var);

	switch (Z_TYPE_P(var)) {
		case IS_STRING: {
			zend_string *converted = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
			/* The converted string replaces whatever the original slot held */
			zval_ptr_dtor(slot);
			ZVAL_STR(slot, converted);
			return false;
		}
		case IS_ARRAY:
			SEPARATE_ARRAY(var);
			break;
		case IS_OBJECT:
			break;
		default:
			/* Other types are left untouched */
			return false;
	}

	/* Mark the container so that a cycle leading back to it is detected */
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return true;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	HashTable *members = HASH_OF(var);
	if (members != NULL) {
		zval *member;
		ZEND_HASH_FOREACH_VAL_IND(members, member) {
			if (!mb_recursive_convert_variable(member, from_encoding, to_encoding)) {
				continue;
			}
			if (Z_REFCOUNTED_P(var)) {
				Z_UNPROTECT_RECURSION_P(var);
			}
			return true;
		} ZEND_HASH_FOREACH_END();
	}

	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}

	return false;
}
3534
PHP_FUNCTION(mb_convert_variables)3535 PHP_FUNCTION(mb_convert_variables)
3536 {
3537 zval *args;
3538 zend_string *to_enc_str;
3539 zend_string *from_enc_str;
3540 HashTable *from_enc_ht;
3541 const mbfl_encoding *from_encoding, *to_encoding;
3542 uint32_t argc;
3543 size_t elistsz;
3544 const mbfl_encoding **elist;
3545
3546 ZEND_PARSE_PARAMETERS_START(3, -1)
3547 Z_PARAM_STR(to_enc_str)
3548 Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3549 Z_PARAM_VARIADIC('+', args, argc)
3550 ZEND_PARSE_PARAMETERS_END();
3551
3552 /* new encoding */
3553 to_encoding = php_mb_get_encoding(to_enc_str, 1);
3554 if (!to_encoding) {
3555 RETURN_THROWS();
3556 }
3557
3558 from_encoding = MBSTRG(current_internal_encoding);
3559
3560 bool order_significant = true;
3561
3562 /* pre-conversion encoding */
3563 if (from_enc_ht) {
3564 if (from_enc_ht == MBSTRG(all_encodings_list)) {
3565 /* If entire list of supported encodings returned by `mb_list_encodings` is passed
3566 * in, then don't treat the order of the list as significant */
3567 order_significant = false;
3568 }
3569 if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3570 RETURN_THROWS();
3571 }
3572 } else {
3573 if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2) == FAILURE) {
3574 RETURN_THROWS();
3575 }
3576 }
3577
3578 if (elistsz == 0) {
3579 efree(ZEND_VOIDP(elist));
3580 zend_argument_value_error(2, "must specify at least one encoding");
3581 RETURN_THROWS();
3582 }
3583
3584 if (elistsz == 1) {
3585 from_encoding = *elist;
3586 } else {
3587 /* auto detect */
3588 unsigned int num = 0;
3589 for (size_t n = 0; n < argc; n++) {
3590 zval *zv = &args[n];
3591 num += mb_recursive_count_strings(zv);
3592 }
3593 const unsigned char **val_list = (const unsigned char**)ecalloc(num, sizeof(char *));
3594 size_t *len_list = (size_t*)ecalloc(num, sizeof(size_t));
3595 unsigned int i = 0;
3596 for (size_t n = 0; n < argc; n++) {
3597 zval *zv = &args[n];
3598 if (mb_recursive_find_strings(zv, val_list, len_list, &i)) {
3599 efree(ZEND_VOIDP(elist));
3600 efree(ZEND_VOIDP(val_list));
3601 efree(len_list);
3602 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3603 RETURN_FALSE;
3604 }
3605 }
3606 from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant);
3607 efree(ZEND_VOIDP(val_list));
3608 efree(len_list);
3609 if (!from_encoding) {
3610 php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3611 efree(ZEND_VOIDP(elist));
3612 RETURN_FALSE;
3613 }
3614
3615 }
3616
3617 efree(ZEND_VOIDP(elist));
3618
3619 /* convert */
3620 for (size_t n = 0; n < argc; n++) {
3621 zval *zv = &args[n];
3622 ZVAL_DEREF(zv);
3623 if (mb_recursive_convert_variable(zv, from_encoding, to_encoding)) {
3624 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3625 RETURN_FALSE;
3626 }
3627 }
3628
3629 RETURN_STRING(from_encoding->name);
3630 }
3631
3632 /* HTML numeric entities */
3633
3634 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
/* Flatten the values of `target_hash` into a newly allocated uint32_t array.
 * The element count must be a multiple of 4; otherwise an argument error for
 * argument 2 is raised and NULL is returned.
 * On success `*convmap_size` receives the number of 4-element groups and the
 * caller owns the returned array. */
static uint32_t *make_conversion_map(HashTable *target_hash, int *convmap_size)
{
	int count = zend_hash_num_elements(target_hash);
	if (count % 4 != 0) {
		zend_argument_value_error(2, "must have a multiple of 4 elements");
		return NULL;
	}

	uint32_t *map = (uint32_t*)safe_emalloc(count, sizeof(uint32_t), 0);
	size_t pos = 0;
	zval *item;

	ZEND_HASH_FOREACH_VAL(target_hash, item) {
		map[pos] = zval_get_long(item);
		pos++;
	} ZEND_HASH_FOREACH_END();

	*convmap_size = count / 4;
	return map;
}
3655
/* Look up `w` in a conversion map made of `mapsize` groups of four values:
 * { low, high, offset, mask }. The first group with low <= w <= high wins;
 * `*retval` is then set to (w + offset) & mask and true is returned.
 * If no group matches, `*retval` is left untouched and false is returned. */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, int mapsize, uint32_t *retval)
{
	for (int group = 0; group < mapsize; group++) {
		const uint32_t *range = convmap + (size_t)group * 4;

		if (w < range[0] || w > range[1]) {
			continue; /* Outside this range; try the next one */
		}

		/* This wchar falls inside one of the ranges which should be
		 * converted to HTML entities */
		*retval = (w + range[2]) & range[3];
		return true;
	}

	/* None of the ranges matched */
	return false;
}
3677
/* Replace every codepoint of `input` which matches the conversion map with an
 * HTML numeric entity ("&#NNN;", or "&#xHHH;" with uppercase hex digits when
 * `hex` is true); all other codepoints are passed through unchanged.
 *
 * `input` is decoded with `encoding->to_wchar`, and the rewritten codepoints
 * are re-encoded with `encoding->from_wchar`. `convmap` holds `mapsize`
 * 4-element entries as consumed by html_numeric_entity_convert().
 * Returns the string produced by mb_convert_buf_result(). */
static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, int mapsize, bool hex)
{
	/* Each wchar which we get from decoding the input string may become up to
	 * 13 wchars when we convert it to an HTML entity */
	uint32_t wchar_buf[32], converted_buf[32 * 13];
	/* Scratch space in which the digits of one number are built back-to-front */
	unsigned char digits[16];
	unsigned char *const digits_end = digits + sizeof(digits);
	const uint32_t radix = hex ? 16 : 10;

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		/* Convert input string to wchars, up to 32 at a time */
		size_t n_wchars = encoding->to_wchar(&in, &in_len, wchar_buf, 32, &state);
		ZEND_ASSERT(n_wchars <= 32);

		uint32_t *out = converted_buf;
		const uint32_t *src_end = wchar_buf + n_wchars;

		for (const uint32_t *src = wchar_buf; src < src_end; src++) {
			uint32_t w = *src;

			if (!html_numeric_entity_convert(w, convmap, mapsize, &w)) {
				/* Not in any of the ranges; copy through untouched */
				*out++ = w;
				continue;
			}

			*out++ = '&';
			*out++ = '#';
			if (hex) {
				*out++ = 'x';
			}

			/* Generate digits from least to most significant; using do/while
			 * means that a value of zero still yields a single '0' */
			unsigned char *d = digits_end;
			do {
				*(--d) = "0123456789ABCDEF"[w % radix];
				w /= radix;
			} while (w > 0);

			while (d < digits_end) {
				*out++ = *d++;
			}

			*out++ = ';';
		}

		ZEND_ASSERT(out <= converted_buf + sizeof(converted_buf)/sizeof(*converted_buf));
		/* `!in_len` tells the output routine whether this is the final batch */
		encoding->from_wchar(converted_buf, out - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3743
3744 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)3745 PHP_FUNCTION(mb_encode_numericentity)
3746 {
3747 zend_string *encoding = NULL, *str;
3748 int mapsize;
3749 HashTable *target_hash;
3750 bool is_hex = false;
3751
3752 ZEND_PARSE_PARAMETERS_START(2, 4)
3753 Z_PARAM_STR(str)
3754 Z_PARAM_ARRAY_HT(target_hash)
3755 Z_PARAM_OPTIONAL
3756 Z_PARAM_STR_OR_NULL(encoding)
3757 Z_PARAM_BOOL(is_hex)
3758 ZEND_PARSE_PARAMETERS_END();
3759
3760 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3761 if (!enc) {
3762 RETURN_THROWS();
3763 }
3764
3765 uint32_t *convmap = make_conversion_map(target_hash, &mapsize);
3766 if (convmap == NULL) {
3767 RETURN_THROWS();
3768 }
3769
3770 RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, mapsize, is_hex));
3771 efree(convmap);
3772 }
3773 /* }}} */
3774
/* Reverse lookup used when decoding entities: given the `number` written in
 * an entity, find the codepoint it stands for.
 *
 * For each of the `mapsize` 4-element map entries (low codepoint, high
 * codepoint, offset, mask), the entry's offset is subtracted from `number`
 * using unsigned 32-bit arithmetic. The first entry for which the result lies
 * within [low, high] wins: the result is stored in `*retval` and true is
 * returned. The mask element is not consulted. If nothing matches, returns
 * false and leaves `*retval` untouched. */
static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, int mapsize, uint32_t *retval)
{
	for (int entry = 0; entry < mapsize; entry++) {
		const uint32_t *range = convmap + (entry * 4);
		uint32_t candidate = number - range[2];

		if (range[0] <= candidate && candidate <= range[1]) {
			*retval = candidate;
			return true;
		}
	}

	return false;
}
3792
3793 #define DEC_ENTITY_MINLEN 3 /* For "&#" and 1 decimal digit */
3794 #define HEX_ENTITY_MINLEN 4 /* For "&#x" and 1 hexadecimal digit */
3795 #define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
3796 #define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
3797
/* Replace HTML numeric entities ("&#NNN;" decimal, "&#xHHH;" hexadecimal; the
 * trailing ';' is optional) in `input` with the codepoints they stand for.
 *
 * `input` is decoded with `encoding->to_wchar` and the result re-encoded with
 * `encoding->from_wchar`. An entity is only replaced if
 * html_numeric_entity_deconvert() finds its number in the conversion map
 * (`convmap`, `mapsize` 4-element entries). Anything else which merely looks
 * like an entity (no digits, too many digits, a number which does not fit in
 * 32 bits, or a number not covered by the map) is copied through verbatim.
 * Returns the string produced by mb_convert_buf_result(). */
static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, int mapsize)
{
	uint32_t wchar_buf[128], converted_buf[128];

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	/* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
	 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
	 * 2nd 'converted' buffer.
	 *
	 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
	 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
	 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
	 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
	 * to store the next batch of wchars after it.
	 *
	 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
	 * wchars from the 1st buffer to the 2nd one.
	 *
	 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
	 * have to do bounds checks when writing wchars into it.
	 */

	/* Number of wchars carried over from the previous buffer (a possibly truncated entity) */
	unsigned int wchar_buf_offset = 0;

	while (in_len) {
		/* Leave space for sentinel at the end of the buffer */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
		out_len += wchar_buf_offset;
		ZEND_ASSERT(out_len <= 127);
		wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */

		uint32_t *p, *converted;

		/* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
		 * be there (in `wchar_buf[0]`), so don't bother in that case */
		if (wchar_buf_offset == 0) {
			p = wchar_buf;
			while (*p != '&')
				p++;
			if (p == wchar_buf + out_len) {
				/* No HTML entities in this buffer */
				encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
				continue;
			}

			/* Copy over the prefix with no & which we already scanned */
			memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
			converted = converted_buf + (p - wchar_buf);
		} else {
			p = wchar_buf;
			converted = converted_buf;
		}

found_ampersand:
		ZEND_ASSERT(*p == '&');
		uint32_t *p2 = p;

		/* These tests can't overrun end of buffer, because we have a '&' sentinel there */
		if (*++p2 == '#') {
			if (*++p2 == 'x') {
				/* Possible hex entity */
				uint32_t w = *++p2;
				while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
					w = *++p2;
				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
					/* We hit the end of the buffer while reading digits, and
					 * more wchars are still coming in the next buffer
					 * Reprocess this entity on next iteration */
					memmove(wchar_buf, p, (p2 - p) * 4);
					wchar_buf_offset = p2 - p;
					goto process_converted_wchars;
				} else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
					/* Invalid entity (too long or "&#x" only) */
					memcpy(converted, p, (p2 - p) * 4);
					converted += p2 - p;
				} else {
					/* Valid hexadecimal entity
					 * At most 8 hex digits, so `value` cannot overflow 32 bits */
					uint32_t value = 0, *p3 = p + 3;
					while (p3 < p2) {
						w = *p3++;
						if (w <= '9') {
							value = (value * 16) + (w - '0');
						} else if (w >= 'a') {
							value = (value * 16) + 10 + (w - 'a');
						} else {
							value = (value * 16) + 10 + (w - 'A');
						}
					}
					if (html_numeric_entity_deconvert(value, convmap, mapsize, converted)) {
						converted++;
						/* Swallow the optional terminating semicolon */
						if (*p2 == ';')
							p2++;
					} else {
						memcpy(converted, p, (p2 - p) * 4);
						converted += p2 - p;
					}
				}
			} else {
				/* Possible decimal entity */
				uint32_t w = *p2;
				while (w >= '0' && w <= '9')
					w = *++p2;
				if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
					/* The number of digits was legal (no more than 10 decimal digits)
					 * Reprocess this entity on next iteration of main loop */
					memmove(wchar_buf, p, (p2 - p) * 4);
					wchar_buf_offset = p2 - p;
					goto process_converted_wchars;
				} else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
					/* Invalid entity (too long or "&#" only) */
					memcpy(converted, p, (p2 - p) * 4);
					converted += p2 - p;
				} else {
					/* Valid decimal entity */
					uint32_t value = 0, *p3 = p + 2;
					while (p3 < p2) {
						uint32_t digit = *p3++ - '0';
						/* If `value * 10 + digit` would not fit in 32 bits, this
						 * entity is no good. The check must take the digit into
						 * account: 0x19999999 * 10 is 0xFFFFFFFA, so a final digit
						 * of 6-9 would wrap around even though the multiplication
						 * itself does not overflow */
						if (value > (0xFFFFFFFFu - digit) / 10) {
							memcpy(converted, p, (p2 - p) * 4);
							converted += p2 - p;
							goto decimal_entity_too_big;
						}
						value = (value * 10) + digit;
					}
					if (html_numeric_entity_deconvert(value, convmap, mapsize, converted)) {
						converted++;
						/* Swallow the optional terminating semicolon */
						if (*p2 == ';')
							p2++;
					} else {
						memcpy(converted, p, (p2 - p) * 4);
						converted += p2 - p;
					}
				}
			}
		} else if ((p2 == wchar_buf + out_len) && in_len) {
			/* Corner case: & at end of buffer */
			wchar_buf[0] = '&';
			wchar_buf_offset = 1;
			goto process_converted_wchars;
		} else {
			*converted++ = '&';
		}
decimal_entity_too_big:

		/* Starting to scan a new section of the wchar buffer
		 * 'p2' is pointing at the next wchar which needs to be processed */
		p = p2;
		while (*p2 != '&')
			p2++;

		if (p2 > p) {
			memcpy(converted, p, (p2 - p) * 4);
			converted += p2 - p;
			p = p2;
		}

		/* If the & we stopped at is not the sentinel, it starts another candidate entity */
		if (p < wchar_buf + out_len)
			goto found_ampersand;

		/* We do not have any wchars remaining at the end of this buffer which
		 * we need to reprocess on the next call */
		wchar_buf_offset = 0;
process_converted_wchars:
		ZEND_ASSERT(converted <= converted_buf + 128);
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf, encoding);
}
3975
3976 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)3977 PHP_FUNCTION(mb_decode_numericentity)
3978 {
3979 zend_string *encoding = NULL, *str;
3980 int mapsize;
3981 HashTable *target_hash;
3982
3983 ZEND_PARSE_PARAMETERS_START(2, 3)
3984 Z_PARAM_STR(str)
3985 Z_PARAM_ARRAY_HT(target_hash)
3986 Z_PARAM_OPTIONAL
3987 Z_PARAM_STR_OR_NULL(encoding)
3988 ZEND_PARSE_PARAMETERS_END();
3989
3990 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3991 if (!enc) {
3992 RETURN_THROWS();
3993 }
3994
3995 uint32_t *convmap = make_conversion_map(target_hash, &mapsize);
3996 if (convmap == NULL) {
3997 RETURN_THROWS();
3998 }
3999
4000 RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, mapsize));
4001 efree(convmap);
4002 }
4003 /* }}} */
4004
4005 /* {{{ Sends an email message with MIME scheme */
4006 #define CRLF "\r\n"
4007
_php_mbstr_parse_mail_headers(HashTable * ht,const char * str,size_t str_len)4008 static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
4009 {
4010 const char *ps;
4011 size_t icnt;
4012 int state = 0;
4013 int crlf_state = -1;
4014 char *token = NULL;
4015 size_t token_pos = 0;
4016 zend_string *fld_name, *fld_val;
4017
4018 ps = str;
4019 icnt = str_len;
4020 fld_name = fld_val = NULL;
4021
4022 /*
4023 * C o n t e n t - T y p e : t e x t / h t m l \r\n
4024 * ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
4025 * state 0 1 2 3
4026 *
4027 * C o n t e n t - T y p e : t e x t / h t m l \r\n
4028 * ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
4029 * crlf_state -1 0 1 -1
4030 *
4031 */
4032
4033 while (icnt > 0) {
4034 switch (*ps) {
4035 case ':':
4036 if (crlf_state == 1) {
4037 token_pos++;
4038 }
4039
4040 if (state == 0 || state == 1) {
4041 if(token && token_pos > 0) {
4042 fld_name = zend_string_init(token, token_pos, 0);
4043 }
4044 state = 2;
4045 } else {
4046 token_pos++;
4047 }
4048
4049 crlf_state = 0;
4050 break;
4051
4052 case '\n':
4053 if (crlf_state == -1) {
4054 goto out;
4055 }
4056 crlf_state = -1;
4057 break;
4058
4059 case '\r':
4060 if (crlf_state == 1) {
4061 token_pos++;
4062 } else {
4063 crlf_state = 1;
4064 }
4065 break;
4066
4067 case ' ': case '\t':
4068 if (crlf_state == -1) {
4069 if (state == 3) {
4070 /* continuing from the previous line */
4071 state = 4;
4072 } else {
4073 /* simply skipping this new line */
4074 state = 5;
4075 }
4076 } else {
4077 if (crlf_state == 1) {
4078 token_pos++;
4079 }
4080 if (state == 1 || state == 3) {
4081 token_pos++;
4082 }
4083 }
4084 crlf_state = 0;
4085 break;
4086
4087 default:
4088 switch (state) {
4089 case 0:
4090 token = (char*)ps;
4091 token_pos = 0;
4092 state = 1;
4093 break;
4094
4095 case 2:
4096 if (crlf_state != -1) {
4097 token = (char*)ps;
4098 token_pos = 0;
4099
4100 state = 3;
4101 break;
4102 }
4103 ZEND_FALLTHROUGH;
4104
4105 case 3:
4106 if (crlf_state == -1) {
4107 if(token && token_pos > 0) {
4108 fld_val = zend_string_init(token, token_pos, 0);
4109 }
4110
4111 if (fld_name != NULL && fld_val != NULL) {
4112 zval val;
4113 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4114 ZVAL_STR(&val, fld_val);
4115
4116 zend_hash_update(ht, fld_name, &val);
4117
4118 zend_string_release_ex(fld_name, 0);
4119 }
4120
4121 fld_name = fld_val = NULL;
4122 token = (char*)ps;
4123 token_pos = 0;
4124
4125 state = 1;
4126 }
4127 break;
4128
4129 case 4:
4130 token_pos++;
4131 state = 3;
4132 break;
4133 }
4134
4135 if (crlf_state == 1) {
4136 token_pos++;
4137 }
4138
4139 token_pos++;
4140
4141 crlf_state = 0;
4142 break;
4143 }
4144 ps++, icnt--;
4145 }
4146 out:
4147 if (state == 2) {
4148 token = "";
4149 token_pos = 0;
4150
4151 state = 3;
4152 }
4153 if (state == 3) {
4154 if(token && token_pos > 0) {
4155 fld_val = zend_string_init(token, token_pos, 0);
4156 }
4157 if (fld_name != NULL && fld_val != NULL) {
4158 zval val;
4159 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4160 ZVAL_STR(&val, fld_val);
4161 zend_hash_update(ht, fld_name, &val);
4162
4163 zend_string_release_ex(fld_name, 0);
4164 }
4165 }
4166 return state;
4167 }
4168
PHP_FUNCTION(mb_send_mail)4169 PHP_FUNCTION(mb_send_mail)
4170 {
4171 char *to;
4172 size_t to_len;
4173 char *message;
4174 size_t message_len;
4175 zend_string *subject;
4176 zend_string *extra_cmd = NULL;
4177 HashTable *headers_ht = NULL;
4178 zend_string *str_headers = NULL;
4179 size_t i;
4180 char *to_r = NULL;
4181 char *force_extra_parameters = INI_STR("mail.force_extra_parameters");
4182 bool suppress_content_type = false;
4183 bool suppress_content_transfer_encoding = false;
4184
4185 char *p;
4186 enum mbfl_no_encoding;
4187 const mbfl_encoding *tran_cs, /* transfer text charset */
4188 *head_enc, /* header transfer encoding */
4189 *body_enc; /* body transfer encoding */
4190 const mbfl_language *lang;
4191 HashTable ht_headers;
4192 zval *s;
4193
4194 /* character-set, transfer-encoding */
4195 tran_cs = &mbfl_encoding_utf8;
4196 head_enc = &mbfl_encoding_base64;
4197 body_enc = &mbfl_encoding_base64;
4198 lang = mbfl_no2language(MBSTRG(language));
4199 if (lang != NULL) {
4200 tran_cs = mbfl_no2encoding(lang->mail_charset);
4201 head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4202 body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4203 }
4204
4205 ZEND_PARSE_PARAMETERS_START(3, 5)
4206 Z_PARAM_PATH(to, to_len)
4207 Z_PARAM_PATH_STR(subject)
4208 Z_PARAM_PATH(message, message_len)
4209 Z_PARAM_OPTIONAL
4210 Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4211 Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4212 ZEND_PARSE_PARAMETERS_END();
4213
4214 if (str_headers) {
4215 if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4216 zend_argument_value_error(4, "must not contain any null bytes");
4217 RETURN_THROWS();
4218 }
4219 str_headers = php_trim(str_headers, NULL, 0, 2);
4220 } else if (headers_ht) {
4221 str_headers = php_mail_build_headers(headers_ht);
4222 if (EG(exception)) {
4223 RETURN_THROWS();
4224 }
4225 }
4226
4227 zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4228
4229 if (str_headers != NULL) {
4230 _php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4231 }
4232
4233 if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4234 char *tmp;
4235 char *param_name;
4236 char *charset = NULL;
4237
4238 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4239 p = strchr(Z_STRVAL_P(s), ';');
4240
4241 if (p != NULL) {
4242 /* skipping the padded spaces */
4243 do {
4244 ++p;
4245 } while (*p == ' ' || *p == '\t');
4246
4247 if (*p != '\0') {
4248 if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4249 if (strcasecmp(param_name, "charset") == 0) {
4250 const mbfl_encoding *_tran_cs = tran_cs;
4251
4252 charset = php_strtok_r(NULL, "= \"", &tmp);
4253 if (charset != NULL) {
4254 _tran_cs = mbfl_name2encoding(charset);
4255 }
4256
4257 if (!_tran_cs) {
4258 php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4259 _tran_cs = &mbfl_encoding_ascii;
4260 }
4261 tran_cs = _tran_cs;
4262 }
4263 }
4264 }
4265 }
4266 suppress_content_type = true;
4267 }
4268
4269 if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4270 const mbfl_encoding *_body_enc;
4271
4272 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4273 _body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4274 switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4275 case mbfl_no_encoding_base64:
4276 case mbfl_no_encoding_7bit:
4277 case mbfl_no_encoding_8bit:
4278 body_enc = _body_enc;
4279 break;
4280
4281 default:
4282 php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4283 body_enc = &mbfl_encoding_8bit;
4284 break;
4285 }
4286 suppress_content_transfer_encoding = true;
4287 }
4288
4289 /* To: */
4290 if (to_len > 0) {
4291 to_r = estrndup(to, to_len);
4292 for (; to_len; to_len--) {
4293 if (!isspace((unsigned char) to_r[to_len - 1])) {
4294 break;
4295 }
4296 to_r[to_len - 1] = '\0';
4297 }
4298 for (i = 0; to_r[i]; i++) {
4299 if (iscntrl((unsigned char) to_r[i])) {
4300 /* According to RFC 822, section 3.1.1 long headers may be separated into
4301 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4302 * To prevent these separators from being replaced with a space, we skip over them. */
4303 if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4304 i += 2;
4305 while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4306 i++;
4307 }
4308 continue;
4309 }
4310
4311 to_r[i] = ' ';
4312 }
4313 }
4314 } else {
4315 to_r = to;
4316 }
4317
4318 /* Subject: */
4319 const mbfl_encoding *enc = MBSTRG(current_internal_encoding);
4320 if (enc == &mbfl_encoding_pass) {
4321 enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4322 }
4323 const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4324 size_t line_sep_len = strlen(line_sep);
4325
4326 subject = mb_mime_header_encode(subject, enc, tran_cs, head_enc == &mbfl_encoding_base64, (char*)line_sep, line_sep_len, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4327
4328 /* message body */
4329 const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding);
4330 if (msg_enc == &mbfl_encoding_pass) {
4331 msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4332 }
4333
4334 unsigned int num_errors = 0;
4335 zend_string *tmpstr = mb_fast_convert((unsigned char*)message, message_len, msg_enc, tran_cs, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4336 zend_string *conv = mb_fast_convert((unsigned char*)ZSTR_VAL(tmpstr), ZSTR_LEN(tmpstr), &mbfl_encoding_8bit, body_enc, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4337 zend_string_free(tmpstr);
4338 message = ZSTR_VAL(conv);
4339
4340 /* other headers */
4341 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4342 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4343 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4344 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4345
4346 smart_str str = {0};
4347 bool empty = true;
4348
4349 if (str_headers != NULL) {
4350 /* Strip trailing CRLF from `str_headers`; we will add CRLF back if necessary */
4351 size_t len = ZSTR_LEN(str_headers);
4352 if (ZSTR_VAL(str_headers)[len-1] == '\n') {
4353 len--;
4354 }
4355 if (ZSTR_VAL(str_headers)[len-1] == '\r') {
4356 len--;
4357 }
4358 smart_str_appendl(&str, ZSTR_VAL(str_headers), len);
4359 empty = false;
4360 zend_string_release_ex(str_headers, 0);
4361 }
4362
4363 if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4364 if (!empty) {
4365 smart_str_appendl(&str, line_sep, line_sep_len);
4366 }
4367 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4368 empty = false;
4369 }
4370
4371 if (!suppress_content_type) {
4372 if (!empty) {
4373 smart_str_appendl(&str, line_sep, line_sep_len);
4374 }
4375 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4376
4377 p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4378 if (p != NULL) {
4379 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4380 smart_str_appends(&str, p);
4381 }
4382 empty = false;
4383 }
4384
4385 if (!suppress_content_transfer_encoding) {
4386 if (!empty) {
4387 smart_str_appendl(&str, line_sep, line_sep_len);
4388 }
4389 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4390 p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4391 if (p == NULL) {
4392 p = "7bit";
4393 }
4394 smart_str_appends(&str, p);
4395 }
4396
4397 str_headers = smart_str_extract(&str);
4398
4399 if (force_extra_parameters) {
4400 extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4401 } else if (extra_cmd) {
4402 extra_cmd = php_escape_shell_cmd(ZSTR_VAL(extra_cmd));
4403 }
4404
4405 RETVAL_BOOL(php_mail(to_r, ZSTR_VAL(subject), message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL));
4406
4407 if (extra_cmd) {
4408 zend_string_release_ex(extra_cmd, 0);
4409 }
4410 if (to_r != to) {
4411 efree(to_r);
4412 }
4413 zend_string_release(subject);
4414 zend_string_free(conv);
4415 zend_hash_destroy(&ht_headers);
4416 if (str_headers) {
4417 zend_string_release_ex(str_headers, 0);
4418 }
4419 }
4420
4421 #undef CRLF
4422 #undef MAIL_ASCIIZ_CHECK_MBSTRING
4423 #undef PHP_MBSTR_MAIL_MIME_HEADER1
4424 #undef PHP_MBSTR_MAIL_MIME_HEADER2
4425 #undef PHP_MBSTR_MAIL_MIME_HEADER3
4426 #undef PHP_MBSTR_MAIL_MIME_HEADER4
4427 /* }}} */
4428
4429 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)4430 PHP_FUNCTION(mb_get_info)
4431 {
4432 zend_string *type = NULL;
4433 size_t n;
4434 char *name;
4435 zval row;
4436 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
4437 const mbfl_encoding **entry;
4438
4439 ZEND_PARSE_PARAMETERS_START(0, 1)
4440 Z_PARAM_OPTIONAL
4441 Z_PARAM_STR(type)
4442 ZEND_PARSE_PARAMETERS_END();
4443
4444 if (!type || zend_string_equals_literal_ci(type, "all")) {
4445 array_init(return_value);
4446 if (MBSTRG(current_internal_encoding)) {
4447 add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4448 }
4449 if (MBSTRG(http_input_identify)) {
4450 add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4451 }
4452 if (MBSTRG(current_http_output_encoding)) {
4453 add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4454 }
4455 if ((name = (char *)zend_ini_string("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)) != NULL) {
4456 add_assoc_string(return_value, "http_output_conv_mimetypes", name);
4457 }
4458 if (lang != NULL) {
4459 if ((name = (char *)mbfl_no_encoding2name(lang->mail_charset)) != NULL) {
4460 add_assoc_string(return_value, "mail_charset", name);
4461 }
4462 if ((name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding)) != NULL) {
4463 add_assoc_string(return_value, "mail_header_encoding", name);
4464 }
4465 if ((name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding)) != NULL) {
4466 add_assoc_string(return_value, "mail_body_encoding", name);
4467 }
4468 }
4469 add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4470 if (MBSTRG(encoding_translation)) {
4471 add_assoc_string(return_value, "encoding_translation", "On");
4472 } else {
4473 add_assoc_string(return_value, "encoding_translation", "Off");
4474 }
4475 if ((name = (char *)mbfl_no_language2name(MBSTRG(language))) != NULL) {
4476 add_assoc_string(return_value, "language", name);
4477 }
4478 n = MBSTRG(current_detect_order_list_size);
4479 entry = MBSTRG(current_detect_order_list);
4480 if (n > 0) {
4481 size_t i;
4482 array_init(&row);
4483 for (i = 0; i < n; i++) {
4484 add_next_index_string(&row, (*entry)->name);
4485 entry++;
4486 }
4487 add_assoc_zval(return_value, "detect_order", &row);
4488 }
4489 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4490 add_assoc_string(return_value, "substitute_character", "none");
4491 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4492 add_assoc_string(return_value, "substitute_character", "long");
4493 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4494 add_assoc_string(return_value, "substitute_character", "entity");
4495 } else {
4496 add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4497 }
4498 if (MBSTRG(strict_detection)) {
4499 add_assoc_string(return_value, "strict_detection", "On");
4500 } else {
4501 add_assoc_string(return_value, "strict_detection", "Off");
4502 }
4503 } else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4504 if (MBSTRG(current_internal_encoding)) {
4505 RETVAL_STRING((char *)MBSTRG(current_internal_encoding)->name);
4506 }
4507 } else if (zend_string_equals_literal_ci(type, "http_input")) {
4508 if (MBSTRG(http_input_identify)) {
4509 RETVAL_STRING((char *)MBSTRG(http_input_identify)->name);
4510 }
4511 } else if (zend_string_equals_literal_ci(type, "http_output")) {
4512 if (MBSTRG(current_http_output_encoding)) {
4513 RETVAL_STRING((char *)MBSTRG(current_http_output_encoding)->name);
4514 }
4515 } else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4516 if ((name = (char *)zend_ini_string("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)) != NULL) {
4517 RETVAL_STRING(name);
4518 }
4519 } else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4520 if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_charset)) != NULL) {
4521 RETVAL_STRING(name);
4522 }
4523 } else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4524 if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding)) != NULL) {
4525 RETVAL_STRING(name);
4526 }
4527 } else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4528 if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding)) != NULL) {
4529 RETVAL_STRING(name);
4530 }
4531 } else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4532 RETVAL_LONG(MBSTRG(illegalchars));
4533 } else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4534 if (MBSTRG(encoding_translation)) {
4535 RETVAL_STRING("On");
4536 } else {
4537 RETVAL_STRING("Off");
4538 }
4539 } else if (zend_string_equals_literal_ci(type, "language")) {
4540 if ((name = (char *)mbfl_no_language2name(MBSTRG(language))) != NULL) {
4541 RETVAL_STRING(name);
4542 }
4543 } else if (zend_string_equals_literal_ci(type, "detect_order")) {
4544 n = MBSTRG(current_detect_order_list_size);
4545 entry = MBSTRG(current_detect_order_list);
4546 if (n > 0) {
4547 size_t i;
4548 array_init(return_value);
4549 for (i = 0; i < n; i++) {
4550 add_next_index_string(return_value, (*entry)->name);
4551 entry++;
4552 }
4553 }
4554 } else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4555 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4556 RETVAL_STRING("none");
4557 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4558 RETVAL_STRING("long");
4559 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4560 RETVAL_STRING("entity");
4561 } else {
4562 RETVAL_LONG(MBSTRG(current_filter_illegal_substchar));
4563 }
4564 } else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4565 if (MBSTRG(strict_detection)) {
4566 RETVAL_STRING("On");
4567 } else {
4568 RETVAL_STRING("Off");
4569 }
4570 } else {
4571 // TODO Convert to ValueError
4572 RETURN_FALSE;
4573 }
4574 }
4575 /* }}} */
4576
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)4577 MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4578 {
4579 uint32_t wchar_buf[128];
4580 unsigned char *in = (unsigned char*)input;
4581 unsigned int state = 0;
4582
4583 if (encoding->check != NULL) {
4584 return encoding->check(in, length);
4585 }
4586
4587 /* If the input string is not encoded in the given encoding, there is a significant chance
4588 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4589 * buffer of 128 codepoints, convert and check just a few codepoints first */
4590 size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4591 ZEND_ASSERT(out_len <= 8);
4592 for (int i = 0; i < out_len; i++) {
4593 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4594 return false;
4595 }
4596 }
4597
4598 while (length) {
4599 out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4600 ZEND_ASSERT(out_len <= 128);
4601 for (int i = 0; i < out_len; i++) {
4602 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4603 return false;
4604 }
4605 }
4606 }
4607
4608 return true;
4609 }
4610
4611 /* MSVC 32-bit has issues with 64-bit intrinsics.
4612 * (Bad 7/8-byte UTF-8 strings would be wrongly passed through as 'valid')
4613 * It seems this is caused by a bug in MS Visual C++
4614 * Ref: https://stackoverflow.com/questions/37509129/potential-bug-in-visual-studio-c-compiler-or-in-intel-intrinsics-avx2-mm256-s */
4615 #if defined(PHP_WIN32) && !defined(__clang__) && defined(_MSC_VER) && defined(_M_IX86)
4616 # define MBSTRING_BROKEN_X86_MSVC_INTRINSICS
4617 #endif
4618
4619 /* If we are building an AVX2-only binary, don't compile the next function */
4620 #ifndef ZEND_INTRIN_AVX2_NATIVE
4621
/* SSE2-based function for validating UTF-8 strings
 * A faster implementation which uses AVX2 instructions follows
 *
 * Returns true if the content of `str` passes all the UTF-8 validity checks below, and
 * false as soon as any check fails
 *
 * When __SSE2__ is defined, the string is examined in 16-byte blocks, with the final
 * partial block handled by the `switch` near the end of the function
 * When __SSE2__ is not defined, a scalar byte-by-byte validator derived from PCRE2 is
 * used instead */
static bool mb_fast_check_utf8_default(zend_string *str)
{
	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
# ifdef __SSE2__
	/* `e` points 1 byte past the last full 16-byte block of string content
	 * Note that we include the terminating null byte which is included in each zend_string
	 * as part of the content to check; this ensures that multi-byte characters which are
	 * truncated abruptly at the end of the string will be detected as invalid */
	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m128i) - 1));

	/* The constants below are written as signed bytes because SSE2 only offers signed
	 * byte comparisons; e.g. -117 is 0x8B, -97 is 0x9F, -113 is 0x8F, -64 is 0xC0,
	 * -126 is 0x82, -32 is 0xE0 and -16 is 0xF0 when viewed as unsigned bytes */

	/* For checking for illegal bytes 0xF5-FF */
	const __m128i over_f5 = _mm_set1_epi8(-117);
	/* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
	const __m128i over_9f = _mm_set1_epi8(-97);
	/* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
	const __m128i over_8f = _mm_set1_epi8(-113);
	/* For checking for illegal bytes 0xC0-C1 */
	const __m128i find_c0 = _mm_set1_epi8(-64);
	const __m128i c0_to_c1 = _mm_set1_epi8(-126);
	/* For checking structure of continuation bytes */
	const __m128i find_e0 = _mm_set1_epi8(-32);
	const __m128i find_f0 = _mm_set1_epi8(-16);

	/* `last_block` holds the most recent block which went through the full set of checks;
	 * its trailing bytes are needed to validate the leading bytes of the next block */
	__m128i last_block = _mm_setzero_si128();
	__m128i operand;

	while (p < e) {
		operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */

		/* The code which handles the final partial block (below) also jumps here, after
		 * building a zero-padded `operand` from the remaining bytes */
check_operand:
		/* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
		if (!_mm_movemask_epi8(operand)) {
			/* Even if this block only contains single-byte characters, there may have been a
			 * multi-byte character at the end of the previous block, which was supposed to
			 * have continuation bytes in this block
			 * This bitmask will pick out a 2/3/4-byte character starting from the last byte of
			 * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
			 * from the 3rd last */
			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
			if (_mm_movemask_epi8(bad)) {
				return false;
			}

			/* Consume as many full blocks of single-byte characters as we can
			 * Note that `last_block` is not updated while skipping over these blocks */
			while (true) {
				p += sizeof(__m128i);
				if (p >= e) {
					goto finish_up_remaining_bytes;
				}
				operand = _mm_loadu_si128((__m128i*)p);
				if (_mm_movemask_epi8(operand)) {
					break;
				}
			}
		}

		/* Check for >= 0xF5, which are illegal byte values in UTF-8
		 * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
		 * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
		 * Then a single signed compare will pick out any bad bytes
		 * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
		__m128i bad = _mm_cmplt_epi8(_mm_add_epi8(operand, over_f5), over_f5);

		/* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
		 * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
		 * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
		 * We can check for both problems at once by generating a vector where each byte < 0xA0
		 * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
		 * Shift the original block by one byte position (bringing in the last byte of the
		 * previous block), so that each position of `operand2` holds the byte which precedes
		 * the corresponding byte of `operand`, and compare the shifted block with the bitmask */
		__m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15));
		__m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f)));
		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask1));

		/* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
		 * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
		 * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
		 * Build the bitmask and compare it with the shifted block */
		__m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f)));
		bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask2));

		/* Check for overlong 2-byte code units
		 * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
		 * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
		 * byte range, do a signed compare to pick out any bad bytes */
		bad = _mm_or_si128(bad, _mm_cmplt_epi8(_mm_add_epi8(operand, find_c0), c0_to_c1));

		/* Check structure of continuation bytes
		 * A UTF-8 byte should be a continuation byte if, and only if, it is:
		 * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
		 * 2) 2 bytes after the start of a 3-byte or 4-byte character
		 * 3) 3 bytes after the start of a 4-byte character
		 * We build 3 bitmasks with 0xFF in each such position, and OR them together to
		 * get a single bitmask with 0xFF in each position where a continuation byte should be
		 * (`operand3` and `operand4` hold the bytes 2 and 3 positions back, respectively) */
		__m128i cont_mask = _mm_cmpeq_epi8(_mm_and_si128(operand2, find_c0), find_c0);
		__m128i operand3 = _mm_or_si128(_mm_slli_si128(operand, 2), _mm_srli_si128(last_block, 14));
		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand3, find_e0), find_e0));
		__m128i operand4 = _mm_or_si128(_mm_slli_si128(operand, 3), _mm_srli_si128(last_block, 13));
		cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand4, find_f0), find_f0));

		/* Now, use a signed comparison to get another bitmask with 0xFF in each position where
		 * a continuation byte actually is
		 * XOR those two bitmasks together; if everything is good, the result should be zero
		 * However, if a byte which should have been a continuation wasn't, or if a byte which
		 * shouldn't have been a continuation was, we will get 0xFF in that position */
		__m128i continuation = _mm_cmplt_epi8(operand, find_c0);
		bad = _mm_or_si128(bad, _mm_xor_si128(continuation, cont_mask));

		/* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
		 * If that value is non-zero, then we found a bad byte somewhere! */
		if (_mm_movemask_epi8(bad)) {
			return false;
		}

		last_block = operand;
		p += sizeof(__m128i);
	}

finish_up_remaining_bytes:
	/* Finish up 1-15 remaining bytes
	 * `p == e` only holds the first time we get here; after the partial block has been
	 * checked via `goto check_operand`, `p` has moved past `e` and we fall through to
	 * the final `return true` */
	if (p == e) {
		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m128i) - 1); /* Not including terminating null */

		/* Crazy hack here for cases where 5, 6, or 9-14 bytes are remaining...
		 * We want to use the above vectorized code to check a block of less than 16 bytes,
		 * but there is no good way to read a variable number of bytes into an XMM register
		 * However, we know that these bytes are part of a zend_string, and a zend_string has some
		 * 'header' fields which occupy the memory just before its content
		 * And, those header fields occupy more than 16 bytes...
		 * So if we go back from `p` far enough that a 16-byte load ends exactly at the string's
		 * terminating null, and load 16 bytes from there, we may pick up some 'junk' bytes
		 * (from earlier string content or from the zend_string header fields), but we will get
		 * the bytes we wanted in the tail end of our XMM register, and this will never cause
		 * a segfault.
		 * Then, we do a byte-wise shift (PSRLDQ, a logical right shift of the register) to
		 * get rid of the unwanted bytes and move the wanted ones to the start of the register
		 * Conveniently, the same shift also zero-fills the tail end of the XMM register
		 *
		 * For the other lengths (1-4, 7-8), a single 2, 4 or 8-byte scalar read starting at `p`
		 * picks up the remaining bytes (plus the terminating null when the length is odd)
		 *
		 * The following `switch` looks useless, but it's not
		 * The PSRLDQ instruction used for the 128-bit byte shift requires an immediate (literal)
		 * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
		 */
		switch (remaining_bytes) {
		case 0: ;
			/* No data bytes remain; just make sure that the last checked block did not
			 * end with the start of a multi-byte character */
			__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
			__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
			return _mm_movemask_epi8(bad) == 0;
		case 1:
		case 2:
			operand = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, *((uint16_t*)p));
			goto check_operand;
		case 3:
		case 4:
			operand = _mm_set_epi32(0, 0, 0, *((uint32_t*)p));
			goto check_operand;
		case 5:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10);
			goto check_operand;
		case 6:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9);
			goto check_operand;
		case 7:
		case 8:
#ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
			operand = _mm_set_epi32(0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
#else
			operand = _mm_set_epi64x(0, *((uint64_t*)p));
#endif
			goto check_operand;
		case 9:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6);
			goto check_operand;
		case 10:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5);
			goto check_operand;
		case 11:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4);
			goto check_operand;
		case 12:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3);
			goto check_operand;
		case 13:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2);
			goto check_operand;
		case 14:
			operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1);
			goto check_operand;
		case 15:
			/* No trailing bytes are left which need to be checked
			 * We get 15 because we did not include the terminating null when
			 * calculating `remaining_bytes`, so the value wraps around */
			return true;
		}

		ZEND_UNREACHABLE();
	}

	return true;
# else
	/* This UTF-8 validation function is derived from PCRE2 */
	size_t length = ZSTR_LEN(str); /* Number of bytes which have not been consumed yet */
	/* Table of the number of extra bytes, indexed by the first byte masked with
	   0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
	static const uint8_t utf8_table[] = {
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
		3,3,3,3,3,3,3,3
	};

	for (; length > 0; p++) {
		uint32_t d; /* Second byte of the current multi-byte character */
		unsigned char c = *p; /* First byte of the current character */
		length--;

		if (c < 128) {
			/* ASCII character */
			continue;
		}

		if (c < 0xc0) {
			/* Isolated 10xx xxxx byte */
			return false;
		}

		if (c >= 0xf5) {
			/* Byte values 0xF5-FF never appear in valid UTF-8 */
			return false;
		}

		uint32_t ab = utf8_table[c & 0x3f]; /* Number of additional bytes (1-3) */
		if (length < ab) {
			/* Missing bytes */
			return false;
		}
		length -= ab;

		/* Check top bits in the second byte */
		if (((d = *(++p)) & 0xc0) != 0x80) {
			return false;
		}

		/* For each length, check that the remaining bytes start with the 0x80 bit
		 * set and not the 0x40 bit. Then check for an overlong sequence, and for the
		 * excluded range 0xd800 to 0xdfff. */
		switch (ab) {
		case 1:
			/* 2-byte character. No further bytes to check for 0x80. Check first byte
			 * for xx00 000x (overlong sequence). */
			if ((c & 0x3e) == 0) {
				return false;
			}
			break;

		case 2:
			/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes for
			 * 1110 0000, xx0x xxxx (overlong sequence) or 1110 1101, 1010 xxxx (0xd800-0xdfff) */
			if ((*(++p) & 0xc0) != 0x80 || (c == 0xe0 && (d & 0x20) == 0) || (c == 0xed && d >= 0xa0)) {
				return false;
			}
			break;

		case 3:
			/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
			 * bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
			 * character greater than 0x0010ffff (f4 8f bf bf)
			 * (The `c > 0xf4` test can never be true here, since any `c >= 0xf5` was
			 * already rejected above) */
			if ((*(++p) & 0xc0) != 0x80 || (*(++p) & 0xc0) != 0x80 || (c == 0xf0 && (d & 0x30) == 0) || (c > 0xf4 || (c == 0xf4 && d > 0x8f))) {
				return false;
			}
			break;

		EMPTY_SWITCH_DEFAULT_CASE();
		}
	}

	return true;
# endif
}
4898
4899 #endif /* #ifndef ZEND_INTRIN_AVX2_NATIVE */
4900
4901 #ifdef ZEND_INTRIN_AVX2_NATIVE
4902
4903 /* We are building AVX2-only binary */
4904 # include <immintrin.h>
4905 # define mb_fast_check_utf8 mb_fast_check_utf8_avx2
4906
4907 #elif defined(ZEND_INTRIN_AVX2_RESOLVER)
4908
4909 /* We are building binary which works with or without AVX2; whether or not to use
4910 * AVX2-accelerated functions will be determined at runtime */
4911 # include <immintrin.h>
4912 # include "Zend/zend_cpuinfo.h"
4913
4914 # ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
4915 /* Dynamic linker will decide whether or not to use AVX2-based functions and
4916 * resolve symbols accordingly */
4917
4918 ZEND_INTRIN_AVX2_FUNC_DECL(bool mb_fast_check_utf8_avx2(zend_string *str));
4919
4920 bool mb_fast_check_utf8(zend_string *str) __attribute__((ifunc("resolve_check_utf8")));
4921
4922 typedef bool (*check_utf8_func_t)(zend_string*);
4923
4924 ZEND_NO_SANITIZE_ADDRESS
4925 ZEND_ATTRIBUTE_UNUSED
resolve_check_utf8(void)4926 static check_utf8_func_t resolve_check_utf8(void)
4927 {
4928 if (zend_cpu_supports_avx2()) {
4929 return mb_fast_check_utf8_avx2;
4930 }
4931 return mb_fast_check_utf8_default;
4932 }
4933
4934 # else /* ZEND_INTRIN_AVX2_FUNC_PTR */
4935 /* We are compiling for a target where the dynamic linker will not be able to
4936 * resolve symbols according to whether the host supports AVX2 or not; so instead,
4937 * we can make calls go through a function pointer and set the function pointer
4938 * on module load */
4939
4940 #ifdef HAVE_FUNC_ATTRIBUTE_TARGET
4941 static bool mb_fast_check_utf8_avx2(zend_string *str) __attribute__((target("avx2")));
4942 #else
4943 static bool mb_fast_check_utf8_avx2(zend_string *str);
4944 #endif
4945
4946 static bool (*check_utf8_ptr)(zend_string *str) = NULL;
4947
/* Validate `str` as UTF-8 by calling through `check_utf8_ptr`
 * The pointer starts out as NULL and is assigned by init_check_utf8() */
static bool mb_fast_check_utf8(zend_string *str)
{
	bool is_valid = (*check_utf8_ptr)(str);
	return is_valid;
}
4952
init_check_utf8(void)4953 static void init_check_utf8(void)
4954 {
4955 if (zend_cpu_supports_avx2()) {
4956 check_utf8_ptr = mb_fast_check_utf8_avx2;
4957 } else {
4958 check_utf8_ptr = mb_fast_check_utf8_default;
4959 }
4960 }
4961 # endif
4962
4963 #else
4964
4965 /* No AVX2 support */
4966 #define mb_fast_check_utf8 mb_fast_check_utf8_default
4967
4968 #endif
4969
4970 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
4971
4972 /* GCC prior to version 8 does not define all intrinsics. See GH-11514.
4973 * Use a workaround from https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values */
4974 #if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 8
4975 # define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
4976 #endif
4977
/* Build a 256-bit value consisting of the last `shift` bytes of `hi` followed by
 * the first (32 - `shift`) bytes of `lo`
 * This is used to take some number of trailing bytes from the previous 32-byte
 * block (passed as `hi`) followed by some number of leading bytes from the current
 * 32-byte block (passed as `lo`)
 *
 * `shift` must be a compile-time constant, since it ends up as the immediate operand
 * of VPALIGNR; the way the macro is used in this file, it is always 1, 2, or 3.
 *
 * _mm256_alignr_epi8 (VPALIGNR) is used to shift out bytes from a 256-bit
 * YMM register while shifting in bytes from another YMM register... but
 * it works separately on respective 128-bit halves of the YMM registers,
 * which is not what we want.
 * To make it work as desired, we first do _mm256_permute2x128_si256
 * (VPERM2I128, with immediate 33 == 0x21) to combine the high 128 bits of the
 * previous block and the low 128 bits of the current block in one YMM register.
 * Then VPALIGNR will do what is needed.
 *
 * All macro arguments are parenthesized so that expressions can safely be passed
 * (notably, `16 - (shift)` stays correct if `shift` is something like `n + 1`). */
#define _mm256_shift_epi8(hi, lo, shift) _mm256_alignr_epi8((lo), _mm256_permute2x128_si256((hi), (lo), 33), 16 - (shift))
4992
/* AVX2-based UTF-8 validation function; validates text in 32-byte chunks
 *
 * Returns true if the content of `str` passes all checks, false as soon as a check fails.
 *
 * Some parts of this function are the same as `mb_fast_check_utf8_default`; code comments
 * are not repeated, so consult `mb_fast_check_utf8_default` for information on uncommented
 * sections. */
#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
ZEND_API bool mb_fast_check_utf8_avx2(zend_string *str)
#else
static bool mb_fast_check_utf8_avx2(zend_string *str)
#endif
{
	unsigned char *p = (unsigned char*)ZSTR_VAL(str);
	/* `e` points 1 byte past the last full 32-byte block, counting the terminating
	 * null byte as part of the content to check */
	unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m256i) - 1));

	/* The algorithm used here for UTF-8 validation is partially adapted from the
	 * paper "Validating UTF-8 In Less Than One Instruction Per Byte", by John Keiser
	 * and Daniel Lemire.
	 * Ref: https://arxiv.org/pdf/2010.03090.pdf
	 *
	 * Most types of invalid UTF-8 text can be detected by examining pairs of
	 * successive bytes. Specifically:
	 *
	 * • Overlong 2-byte code units start with 0xC0 or 0xC1.
	 *   No valid UTF-8 string ever uses these byte values.
	 * • Overlong 3-byte code units start with 0xE0, followed by a byte < 0xA0.
	 * • Overlong 4-byte code units start with 0xF0, followed by a byte < 0x90.
	 * • 5-byte or 6-byte code units, which should never be used, start with
	 *   0xF8-FE.
	 * • A codepoint value higher than U+10FFFF, which is the highest value for
	 *   any Unicode codepoint, would either start with 0xF4, followed by a
	 *   byte >= 0x90, or else would start with 0xF5-F7, followed by any value.
	 * • A codepoint value from U+D800-DFFF, which are reserved and should never
	 *   be used, would start with 0xED, followed by a byte >= 0xA0.
	 * • The byte value 0xFF is also illegal and is never used in valid UTF-8.
	 *
	 * To detect all these problems, for each pair of successive bytes, we do
	 * table lookups using the high nibble of the first byte, the low nibble of
	 * the first byte, and the high nibble of the second byte. Each table lookup
	 * retrieves a bitmask, in which each 1 bit indicates a possible invalid
	 * combination; AND those three bitmasks together, and any 1 bit in the result
	 * will indicate an actual invalid byte combination was found.
	 */

	/* One bit per category of error; these are the bit values stored in the lookup tables
	 * NOTE(review): these macros are not #undef'd in this function, so they remain
	 * defined for whatever follows it in the translation unit */
#define BAD_BYTE 0x1
#define OVERLONG_2BYTE 0x2
#define _1BYTE (BAD_BYTE | OVERLONG_2BYTE)
#define OVERLONG_3BYTE 0x4
#define SURROGATE 0x8
#define OVERLONG_4BYTE 0x10
#define INVALID_CP 0x20

	/* Each of these are 16-entry tables, repeated twice; this is required by the
	 * VPSHUFB instruction which we use to perform 32 table lookups in parallel
	 * The first entry is for 0xF, the second is for 0xE, and so on down to 0x0
	 *
	 * So, for example, notice that the 4th entry in the 1st table is OVERLONG_2BYTE;
	 * that means that high nibble 0xC is consistent with the byte pair being part of
	 * an overlong 2-byte code unit
	 *
	 * `bad_hi_nibble2` and `bad_lo_nibble2` are indexed by the high and low nibble of the
	 * first byte of each pair; `bad_hi_nibble` is indexed by the high nibble of the
	 * second byte of each pair */
	const __m256i bad_hi_nibble2 = _mm256_set_epi8(
		BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
		0, 0, 0, 0,
		0, 0, 0, 0,
		0, 0, 0, 0,
		BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
		0, 0, 0, 0,
		0, 0, 0, 0,
		0, 0, 0, 0);
	const __m256i bad_lo_nibble2 = _mm256_set_epi8(
		BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
		BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
		BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
		0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
		BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
		BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
		0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
	const __m256i bad_hi_nibble = _mm256_set_epi8(
		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
		_1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
		_1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
		_1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
		_1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);

	/* -64 is 0xC0 as an unsigned byte; bytes which are less than it in a signed
	 * comparison are 0x80-0xBF, i.e. continuation bytes */
	const __m256i find_continuation = _mm256_set1_epi8(-64);
	const __m256i _b = _mm256_set1_epi8(0xB);
	const __m256i _d = _mm256_set1_epi8(0xD);
	const __m256i _f = _mm256_set1_epi8(0xF);

	/* High and low nibbles of each byte in the most recent fully-checked block */
	__m256i last_hi_nibbles = _mm256_setzero_si256(), last_lo_nibbles = _mm256_setzero_si256();
	__m256i operand;

	while (p < e) {
		operand = _mm256_loadu_si256((__m256i*)p);

		/* The code which handles the final partial block (below) also jumps here, after
		 * building a zero-padded `operand` from the remaining bytes */
check_operand:
		if (!_mm256_movemask_epi8(operand)) {
			/* Entire 32-byte block is ASCII characters; the only thing we need to validate is that
			 * the previous block didn't end with an incomplete multi-byte character
			 * (This will also confirm that the previous block didn't end with a bad byte like 0xFF)
			 *
			 * The comparison flags a high nibble > 0xB in the last byte, > 0xD in the 2nd last
			 * byte, or > 0xE in the 3rd last byte; a nibble can never be > 127, so all other
			 * positions are ignored */
			__m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
			__m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
			if (_mm256_movemask_epi8(bad)) {
				return false;
			}

			/* Consume as many full blocks of single-byte characters as we can */
			while (true) {
				p += sizeof(__m256i);
				if (p >= e) {
					goto finish_up_remaining_bytes;
				}
				operand = _mm256_loadu_si256((__m256i*)p);
				if (_mm256_movemask_epi8(operand)) {
					break;
				}
			}
		}

		/* Split each byte into its high and low nibble (each in the range 0x0-0xF) */
		__m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(operand, 4), _f);
		__m256i lo_nibbles = _mm256_and_si256(operand, _f);

		/* Nibbles of the byte which precedes each byte of `operand` */
		__m256i lo_nibbles2 = _mm256_shift_epi8(last_lo_nibbles, lo_nibbles, 1);
		__m256i hi_nibbles2 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 1);

		/* Do parallel table lookups in all 3 tables */
		__m256i bad = _mm256_cmpgt_epi8(
			_mm256_and_si256(
				_mm256_and_si256(
					_mm256_shuffle_epi8(bad_lo_nibble2, lo_nibbles2),
					_mm256_shuffle_epi8(bad_hi_nibble2, hi_nibbles2)),
				_mm256_shuffle_epi8(bad_hi_nibble, hi_nibbles)),
			_mm256_setzero_si256());

		/* Build a mask of the positions where a continuation byte is expected:
		 * 1 byte after a byte with high nibble > 0xB, 2 bytes after a byte with high
		 * nibble > 0xD, or 3 bytes after a byte with high nibble 0xF */
		__m256i cont_mask = _mm256_cmpgt_epi8(hi_nibbles2, _b);
		__m256i hi_nibbles3 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 2);
		cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpgt_epi8(hi_nibbles3, _d));
		__m256i hi_nibbles4 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 3);
		cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpeq_epi8(hi_nibbles4, _f));

		/* Mask of the positions where a continuation byte actually is; any position where
		 * the expected and actual masks differ is bad */
		__m256i continuation = _mm256_cmpgt_epi8(find_continuation, operand);
		bad = _mm256_or_si256(bad, _mm256_xor_si256(continuation, cont_mask));

		if (_mm256_movemask_epi8(bad)) {
			return false;
		}

		last_hi_nibbles = hi_nibbles;
		last_lo_nibbles = lo_nibbles;
		p += sizeof(__m256i);
	}

finish_up_remaining_bytes:
	/* `p == e` only holds the first time we get here; after the final partial block has
	 * been checked via `goto check_operand`, `p` has moved past `e` */
	if (p == e) {
		uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m256i) - 1); /* Not including terminating null */

		/* Build a zero-padded `operand` holding the remaining bytes, then jump back into
		 * the loop body to check it
		 * • 1-4 and 7-8 bytes: a single 2, 4 or 8-byte scalar read starting at `p`
		 * • 5, 6 and 9-14 bytes: a 16-byte load which starts before `p` and ends at the
		 *   terminating null, shifted so that the wanted bytes start at byte 0
		 * • 15-16 bytes: a 16-byte load starting at `p`
		 * • 17-30 bytes: low half is a 16-byte load starting at `p`; high half is another
		 *   16-byte load which ends at the terminating null, shifted in the same way
		 * In each case the upper part of the register is zero-filled */
		switch (remaining_bytes) {
		case 0: ;
			/* No actual data bytes are remaining */
			__m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
			__m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
			return _mm256_movemask_epi8(bad) == 0;
		case 1:
		case 2:
			operand = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, *((int16_t*)p));
			goto check_operand;
		case 3:
		case 4:
			operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, *((int32_t*)p));
			goto check_operand;
		case 5:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10));
			goto check_operand;
		case 6:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9));
			goto check_operand;
		case 7:
		case 8:
#ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
			operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
#else
			operand = _mm256_set_epi64x(0, 0, 0, *((int64_t*)p));
#endif
			goto check_operand;
		case 9:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6));
			goto check_operand;
		case 10:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5));
			goto check_operand;
		case 11:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4));
			goto check_operand;
		case 12:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3));
			goto check_operand;
		case 13:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2));
			goto check_operand;
		case 14:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1));
			goto check_operand;
		case 15:
		case 16:
			operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 17:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 2)), 14), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 18:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 3)), 13), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 19:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 4)), 12), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 20:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 5)), 11), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 21:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 6)), 10), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 22:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 7)), 9), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 23:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 8)), 8), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 24:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 9)), 7), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 25:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 10)), 6), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 26:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 11)), 5), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 27:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 12)), 4), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 28:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 13)), 3), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 29:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 14)), 2), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 30:
			operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 15)), 1), _mm_loadu_si128((__m128i*)p));
			goto check_operand;
		case 31:
			/* String length + terminating null is a multiple of 32, so everything was
			 * already checked in the main loop */
			return true;
		}

		ZEND_UNREACHABLE();
	}

	return true;
}
5260
5261 #endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
5262
/* Check whether the content of `str` is valid in `encoding`
 *
 * For UTF-8, the result of a successful check is remembered by setting the
 * IS_STR_VALID_UTF8 flag on the string (unless it is interned), and a string
 * which already carries that flag is accepted without being scanned again.
 * All other encodings are handed to php_mb_check_encoding(). */
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
{
	if (encoding != &mbfl_encoding_utf8) {
		return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
	}

	/* Already known to be valid UTF-8 */
	if (GC_FLAGS(str) & IS_STR_VALID_UTF8) {
		return true;
	}

	if (!mb_fast_check_utf8(str)) {
		return false;
	}

	if (!ZSTR_IS_INTERNED(str)) {
		GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
	}
	return true;
}
5278
/* Check that every string key and every value in `vars` is valid in `encoding`
 *
 * String values and string keys are checked with mb_check_str_encoding(); nested
 * arrays are checked recursively; int, float, null and bool values are accepted as-is;
 * a value of any other type makes the whole array invalid.
 * If `vars` is already marked as being visited (circular reference), a warning is
 * raised and the array is treated as invalid.
 *
 * Returns 1 if everything is valid, 0 otherwise. Scanning stops at the first
 * invalid key or value. */
static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	zend_long idx;
	zend_string *key;
	zval *entry;
	int valid = 1;

	(void)(idx); /* Suppress spurious compiler warning that `idx` is not used */

	if (GC_IS_RECURSIVE(vars)) {
		/* NOTE(review): the double negative in this message is user-visible output and
		 * is intentionally left untouched here */
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return 0;
	}
	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
		ZVAL_DEREF(entry);
		if (key && !mb_check_str_encoding(key, encoding)) {
			valid = 0;
			break;
		}
		switch (Z_TYPE_P(entry)) {
			case IS_STRING:
				valid = mb_check_str_encoding(Z_STR_P(entry), encoding) ? 1 : 0;
				break;
			case IS_ARRAY:
				valid = php_mb_check_encoding_recursive(Z_ARRVAL_P(entry), encoding) ? 1 : 0;
				break;
			case IS_LONG:
			case IS_DOUBLE:
			case IS_NULL:
			case IS_TRUE:
			case IS_FALSE:
				break;
			default:
				/* Other types are error. */
				valid = 0;
				break;
		}
		/* A `break` inside the `switch` above only leaves the `switch`; leave the
		 * loop explicitly so that no further entries are scanned once the result
		 * is known */
		if (!valid) {
			break;
		}
	} ZEND_HASH_FOREACH_END();
	GC_TRY_UNPROTECT_RECURSION(vars);
	return valid;
}
5329
5330 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)5331 PHP_FUNCTION(mb_check_encoding)
5332 {
5333 zend_string *input_str = NULL, *enc = NULL;
5334 HashTable *input_ht = NULL;
5335 const mbfl_encoding *encoding;
5336
5337 ZEND_PARSE_PARAMETERS_START(0, 2)
5338 Z_PARAM_OPTIONAL
5339 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
5340 Z_PARAM_STR_OR_NULL(enc)
5341 ZEND_PARSE_PARAMETERS_END();
5342
5343 encoding = php_mb_get_encoding(enc, 2);
5344 if (!encoding) {
5345 RETURN_THROWS();
5346 }
5347
5348 if (input_ht) {
5349 RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
5350 } else if (input_str) {
5351 RETURN_BOOL(mb_check_str_encoding(input_str, encoding));
5352 } else {
5353 php_error_docref(NULL, E_DEPRECATED,
5354 "Calling mb_check_encoding() without argument is deprecated");
5355
5356 /* FIXME: Actually check all inputs, except $_FILES file content. */
5357 RETURN_BOOL(MBSTRG(illegalchars) == 0);
5358 }
5359 }
5360 /* }}} */
5361
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)5362 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
5363 const uint32_t enc_name_arg_num)
5364 {
5365 const mbfl_encoding *enc;
5366 enum mbfl_no_encoding no_enc;
5367
5368 ZEND_ASSERT(str_len > 0);
5369
5370 enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
5371 if (!enc) {
5372 return -2;
5373 }
5374
5375 no_enc = enc->no_encoding;
5376 if (php_mb_is_unsupported_no_encoding(no_enc)) {
5377 zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
5378 return -2;
5379 }
5380
5381 /* Some legacy text encodings have a minimum required wchar buffer size;
5382 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
5383 uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
5384 unsigned int state = 0;
5385 size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
5386 ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
5387
5388 if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
5389 return -1;
5390 }
5391 return wchar_buf[0];
5392 }
5393
5394 /* {{{ */
PHP_FUNCTION(mb_ord)5395 PHP_FUNCTION(mb_ord)
5396 {
5397 char *str;
5398 size_t str_len;
5399 zend_string *enc = NULL;
5400 zend_long cp;
5401
5402 ZEND_PARSE_PARAMETERS_START(1, 2)
5403 Z_PARAM_STRING(str, str_len)
5404 Z_PARAM_OPTIONAL
5405 Z_PARAM_STR_OR_NULL(enc)
5406 ZEND_PARSE_PARAMETERS_END();
5407
5408 if (str_len == 0) {
5409 zend_argument_value_error(1, "must not be empty");
5410 RETURN_THROWS();
5411 }
5412
5413 cp = php_mb_ord(str, str_len, enc, 2);
5414
5415 if (0 > cp) {
5416 if (cp == -2) {
5417 RETURN_THROWS();
5418 }
5419 RETURN_FALSE;
5420 }
5421
5422 RETURN_LONG(cp);
5423 }
5424 /* }}} */
5425
/* Build a string holding the single code point `cp` in the encoding named by
 * `enc_name`.
 *
 * Returns NULL when:
 *   - the encoding cannot be resolved,
 *   - the encoding is one of the unsupported ones (a ValueError is raised),
 *   - `cp` is outside 0..0x10ffff,
 *   - the target is UTF-8 and `cp` lies in 0xd800..0xdfff, or
 *   - the conversion from UCS-4BE flagged illegal characters.
 *
 * UTF-8 output is produced directly; every other encoding goes through
 * php_mb_convert_encoding_ex() from a 4-byte big-endian buffer. The
 * `illegalchars` global is saved and restored around that conversion. */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (enc == NULL) {
		return NULL;
	}

	const enum mbfl_no_encoding no_enc = enc->no_encoding;
	if (php_mb_is_unsupported_no_encoding(no_enc)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", enc->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	if (php_mb_is_no_encoding_utf8(no_enc)) {
		/* Surrogate range is rejected for UTF-8 */
		if (cp >= 0xd800 && cp <= 0xdfff) {
			return NULL;
		}

		if (cp < 0x80) {
			return ZSTR_CHAR(cp);
		}

		zend_string *utf8;
		char *out;
		if (cp < 0x800) {
			utf8 = zend_string_alloc(2, 0);
			out = ZSTR_VAL(utf8);
			out[0] = 0xc0 | (cp >> 6);
			out[1] = 0x80 | (cp & 0x3f);
			out[2] = 0;
		} else if (cp < 0x10000) {
			utf8 = zend_string_alloc(3, 0);
			out = ZSTR_VAL(utf8);
			out[0] = 0xe0 | (cp >> 12);
			out[1] = 0x80 | ((cp >> 6) & 0x3f);
			out[2] = 0x80 | (cp & 0x3f);
			out[3] = 0;
		} else {
			utf8 = zend_string_alloc(4, 0);
			out = ZSTR_VAL(utf8);
			out[0] = 0xf0 | (cp >> 18);
			out[1] = 0x80 | ((cp >> 12) & 0x3f);
			out[2] = 0x80 | ((cp >> 6) & 0x3f);
			out[3] = 0x80 | (cp & 0x3f);
			out[4] = 0;
		}
		return utf8;
	}

	/* Serialize the code point as 4 big-endian bytes */
	char ucs4be[4];
	for (int i = 0; i < 4; i++) {
		ucs4be[i] = (cp >> (24 - 8 * i)) & 0xff;
	}

	const long saved_illegalchars = MBSTRG(illegalchars);
	MBSTRG(illegalchars) = 0;
	zend_string *converted = php_mb_convert_encoding_ex(ucs4be, 4, enc, &mbfl_encoding_ucs4be);
	const bool conversion_failed = MBSTRG(illegalchars) != 0;
	MBSTRG(illegalchars) = saved_illegalchars;

	if (conversion_failed) {
		zend_string_release(converted);
		return NULL;
	}
	return converted;
}
5495
5496 /* {{{ */
PHP_FUNCTION(mb_chr)5497 PHP_FUNCTION(mb_chr)
5498 {
5499 zend_long cp;
5500 zend_string *enc = NULL;
5501
5502 ZEND_PARSE_PARAMETERS_START(1, 2)
5503 Z_PARAM_LONG(cp)
5504 Z_PARAM_OPTIONAL
5505 Z_PARAM_STR_OR_NULL(enc)
5506 ZEND_PARSE_PARAMETERS_END();
5507
5508 zend_string* ret = php_mb_chr(cp, enc, 2);
5509 if (ret == NULL) {
5510 RETURN_FALSE;
5511 }
5512
5513 RETURN_STR(ret);
5514 }
5515 /* }}} */
5516
PHP_FUNCTION(mb_str_pad)5517 PHP_FUNCTION(mb_str_pad)
5518 {
5519 zend_string *input, *encoding_str = NULL, *pad = ZSTR_CHAR(' ');
5520 zend_long pad_to_length;
5521 zend_long pad_type_val = PHP_STR_PAD_RIGHT;
5522
5523 ZEND_PARSE_PARAMETERS_START(2, 5)
5524 Z_PARAM_STR(input)
5525 Z_PARAM_LONG(pad_to_length)
5526 Z_PARAM_OPTIONAL
5527 Z_PARAM_STR(pad)
5528 Z_PARAM_LONG(pad_type_val)
5529 Z_PARAM_STR_OR_NULL(encoding_str)
5530 ZEND_PARSE_PARAMETERS_END();
5531
5532 const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
5533 if (!encoding) {
5534 RETURN_THROWS();
5535 }
5536
5537 size_t input_length = mb_get_strlen(input, encoding);
5538
5539 /* If resulting string turns out to be shorter than input string,
5540 we simply copy the input and return. */
5541 if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
5542 RETURN_STR_COPY(input);
5543 }
5544
5545 if (ZSTR_LEN(pad) == 0) {
5546 zend_argument_value_error(3, "must be a non-empty string");
5547 RETURN_THROWS();
5548 }
5549
5550 if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
5551 zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
5552 RETURN_THROWS();
5553 }
5554
5555 size_t pad_length = mb_get_strlen(pad, encoding);
5556
5557 size_t num_mb_pad_chars = pad_to_length - input_length;
5558
5559 /* We need to figure out the left/right padding lengths. */
5560 size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
5561 switch (pad_type_val) {
5562 case PHP_STR_PAD_RIGHT:
5563 right_pad = num_mb_pad_chars;
5564 break;
5565
5566 case PHP_STR_PAD_LEFT:
5567 left_pad = num_mb_pad_chars;
5568 break;
5569
5570 case PHP_STR_PAD_BOTH:
5571 left_pad = num_mb_pad_chars / 2;
5572 right_pad = num_mb_pad_chars - left_pad;
5573 break;
5574 }
5575
5576 /* How many full block copies need to happen, and how many characters are then left over? */
5577 size_t full_left_pad_copies = left_pad / pad_length;
5578 size_t full_right_pad_copies = right_pad / pad_length;
5579 size_t remaining_left_pad_chars = left_pad % pad_length;
5580 size_t remaining_right_pad_chars = right_pad % pad_length;
5581
5582 if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
5583 goto overflow_no_release;
5584 }
5585
5586 /* Compute the number of bytes required for the padding */
5587 size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
5588 size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);
5589
5590 /* No special fast-path handling necessary for zero-length pads because these functions will not
5591 * allocate memory in case a zero-length pad is required. */
5592 zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
5593 zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);
5594
5595 if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
5596 || full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
5597 goto overflow;
5598 }
5599
5600 size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
5601 size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);
5602
5603 if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
5604 || ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
5605 goto overflow;
5606 }
5607
5608 zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
5609 char *buffer = ZSTR_VAL(result);
5610
5611 /* First we pad the left. */
5612 for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5613 memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5614 }
5615 memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
5616 buffer += ZSTR_LEN(remaining_left_pad_str);
5617
5618 /* Then we copy the input string. */
5619 memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
5620 buffer += ZSTR_LEN(input);
5621
5622 /* Finally, we pad on the right. */
5623 for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5624 memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5625 }
5626 memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));
5627
5628 ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';
5629
5630 zend_string_release_ex(remaining_left_pad_str, false);
5631 zend_string_release_ex(remaining_right_pad_str, false);
5632
5633 RETURN_NEW_STR(result);
5634
5635 overflow:
5636 zend_string_release_ex(remaining_left_pad_str, false);
5637 zend_string_release_ex(remaining_right_pad_str, false);
5638 overflow_no_release:
5639 zend_throw_error(NULL, "String size overflow");
5640 RETURN_THROWS();
5641 }
5642
5643 /* {{{ */
PHP_FUNCTION(mb_scrub)5644 PHP_FUNCTION(mb_scrub)
5645 {
5646 zend_string *str, *enc_name = NULL;
5647
5648 ZEND_PARSE_PARAMETERS_START(1, 2)
5649 Z_PARAM_STR(str)
5650 Z_PARAM_OPTIONAL
5651 Z_PARAM_STR_OR_NULL(enc_name)
5652 ZEND_PARSE_PARAMETERS_END();
5653
5654 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
5655 if (!enc) {
5656 RETURN_THROWS();
5657 }
5658
5659 if (enc == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) {
5660 /* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
5661 RETURN_STR_COPY(str);
5662 }
5663
5664 RETURN_STR(php_mb_convert_encoding_ex(ZSTR_VAL(str), ZSTR_LEN(str), enc, enc));
5665 }
5666 /* }}} */
5667
5668 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)5669 static void php_mb_populate_current_detect_order_list(void)
5670 {
5671 const mbfl_encoding **entry = 0;
5672 size_t nentries;
5673
5674 if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
5675 nentries = MBSTRG(detect_order_list_size);
5676 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5677 memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
5678 } else {
5679 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
5680 size_t i;
5681 nentries = MBSTRG(default_detect_order_list_size);
5682 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5683 for (i = 0; i < nentries; i++) {
5684 entry[i] = mbfl_no2encoding(src[i]);
5685 }
5686 }
5687 MBSTRG(current_detect_order_list) = entry;
5688 MBSTRG(current_detect_order_list_size) = nentries;
5689 }
5690 /* }}} */
5691
/* {{{ static int php_mb_encoding_translation()
 * Returns the current value of the mbstring `encoding_translation` global. */
static int php_mb_encoding_translation(void)
{
	return MBSTRG(encoding_translation);
}
/* }}} */
5698
php_mb_mbchar_bytes(const char * s,const mbfl_encoding * enc)5699 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
5700 {
5701 if (enc) {
5702 if (enc->mblen_table) {
5703 if (s) {
5704 return enc->mblen_table[*(unsigned char *)s];
5705 }
5706 } else if (enc->flag & MBFL_ENCTYPE_WCS2) {
5707 return 2;
5708 } else if (enc->flag & MBFL_ENCTYPE_WCS4) {
5709 return 4;
5710 }
5711 }
5712 return 1;
5713 }
5714
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)5715 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
5716 {
5717 const char *p = s;
5718 char *last=NULL;
5719
5720 if (nbytes == (size_t)-1) {
5721 size_t nb = 0;
5722
5723 while (*p != '\0') {
5724 if (nb == 0) {
5725 if ((unsigned char)*p == (unsigned char)c) {
5726 last = (char *)p;
5727 }
5728 nb = php_mb_mbchar_bytes(p, enc);
5729 if (nb == 0) {
5730 return NULL; /* something is going wrong! */
5731 }
5732 }
5733 --nb;
5734 ++p;
5735 }
5736 } else {
5737 size_t bcnt = nbytes;
5738 size_t nbytes_char;
5739 while (bcnt > 0) {
5740 if ((unsigned char)*p == (unsigned char)c) {
5741 last = (char *)p;
5742 }
5743 nbytes_char = php_mb_mbchar_bytes(p, enc);
5744 if (bcnt < nbytes_char) {
5745 return NULL;
5746 }
5747 p += nbytes_char;
5748 bcnt -= nbytes_char;
5749 }
5750 }
5751 return last;
5752 }
5753
php_mb_stripos(bool mode,zend_string * haystack,zend_string * needle,zend_long offset,const mbfl_encoding * enc)5754 MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc)
5755 {
5756 /* We're using simple case-folding here, because we'd have to deal with remapping of
5757 * offsets otherwise. */
5758 zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
5759 zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, 0);
5760
5761 size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
5762
5763 zend_string_free(haystack_conv);
5764 zend_string_free(needle_conv);
5765
5766 return n;
5767 }
5768
php_mb_gpc_get_detect_order(const zend_encoding *** list,size_t * list_size)5769 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
5770 {
5771 *list = (const zend_encoding **)MBSTRG(http_input_list);
5772 *list_size = MBSTRG(http_input_list_size);
5773 }
5774 /* }}} */
5775
/* Stores `encoding` (cast to an mbfl_encoding pointer) in the mbstring
 * `http_input_identify` global. */
static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
{
	MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
}
/* }}} */
5781
/* Base64 alphabet, indexed by 6-bit value (0..63). The string literal's
 * terminating NUL supplies the 65th element, which is 0. */
static const unsigned char base64_table[] =
	"ABCDEFGHIJKLM"
	"NOPQRSTUVWXYZ"
	"abcdefghijklm"
	"nopqrstuvwxyz"
	"0123456789+/";
5794
/* Compute how many bytes the current contents of `tmpbuf` will occupy once
 * transfer-encoded: 4 output bytes per (rounded-up) group of 3 input bytes for
 * Base64; otherwise 3 bytes for every byte which must be escaped (above 0x7F,
 * '=', or flagged in mime_char_needs_qencode) and 1 for every other byte. */
static size_t transfer_encoded_size(mb_convert_buf *tmpbuf, bool base64)
{
	if (base64) {
		return ((mb_convert_buf_len(tmpbuf) + 2) / 3) * 4;
	}

	size_t total = 0;
	for (unsigned char *cur = (unsigned char*)ZSTR_VAL(tmpbuf->str); cur < tmpbuf->out; cur++) {
		unsigned char byte = *cur;
		/* The range check comes first so the table is only indexed with 0..0x7F */
		if (byte > 0x7F || byte == '=' || mime_char_needs_qencode[byte]) {
			total += 3;
		} else {
			total += 1;
		}
	}
	return total;
}
5809
/* Transfer-encode every byte currently held in `tmpbuf` (as Base64 when
 * `base64` is true, otherwise as '=XX' escapes), append the result to
 * `outbuf`, and reset `tmpbuf` to empty. */
static void transfer_encode_mime_bytes(mb_convert_buf *tmpbuf, mb_convert_buf *outbuf, bool base64)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(outbuf, out, limit);
	unsigned char *src = (unsigned char*)ZSTR_VAL(tmpbuf->str);
	unsigned char *src_end = tmpbuf->out;

	if (!base64) {
		static const char hex_digits[] = "0123456789ABCDEF";

		/* Worst case: every byte becomes a 3-byte escape */
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, (src_end - src) * 3);
		for (; src < src_end; src++) {
			unsigned char byte = *src;
			if (byte > 0x7F || byte == '=' || mime_char_needs_qencode[byte]) {
				out = mb_convert_buf_add3(out, '=', hex_digits[(byte >> 4) & 0xF], hex_digits[byte & 0xF]);
			} else {
				out = mb_convert_buf_add(out, byte);
			}
		}
	} else {
		MB_CONVERT_BUF_ENSURE(outbuf, out, limit, ((src_end - src) + 2) / 3 * 4);

		/* Full 3-byte groups -> 4 output characters each */
		for (; (src_end - src) >= 3; src += 3) {
			uint32_t group = (src[0] << 16) | (src[1] << 8) | src[2];
			out = mb_convert_buf_add4(out,
				base64_table[(group >> 18) & 0x3F],
				base64_table[(group >> 12) & 0x3F],
				base64_table[(group >> 6) & 0x3F],
				base64_table[group & 0x3F]);
		}

		/* Trailing 1 or 2 bytes are padded with '=' */
		switch (src_end - src) {
			case 1: {
				uint32_t tail = src[0];
				out = mb_convert_buf_add4(out, base64_table[(tail >> 2) & 0x3F], base64_table[(tail & 0x3) << 4], '=', '=');
				break;
			}
			case 2: {
				uint32_t tail = (src[0] << 8) | src[1];
				out = mb_convert_buf_add4(out, base64_table[(tail >> 10) & 0x3F], base64_table[(tail >> 4) & 0x3F], base64_table[(tail & 0xF) << 2], '=');
				break;
			}
			default:
				break;
		}
	}

	mb_convert_buf_reset(tmpbuf, 0);
	MB_CONVERT_BUF_STORE(outbuf, out, limit);
}
5855
/* Number of wchars in the decode buffer used by mb_mime_header_encode() */
#define MBSTRING_HEADER_ENC_WCHAR_BUFSIZE 90

/* Encode `input` (text in encoding `incode`) for use in a MIME header.
 *
 * - An empty input yields the empty string.
 * - If, apart from leading spaces, every decoded character is in 0x21..0x7E
 *   and is none of '=', '?', '_', the input is returned unchanged (only its
 *   refcount is incremented).
 * - Otherwise a leading run of space-delimited ASCII words is emitted as-is,
 *   and from the first word which cannot be emitted that way to the end of the
 *   input, text is converted to `outcode` and written as encoded words of the
 *   form "=?<mime_name>?B?...?=" (Base64, when `base64` is true) or
 *   "=?<mime_name>?Q?...?=", split across lines.
 *
 * `linefeed`/`linefeed_len` give the line separator; at most 8 bytes of it are
 * used and it is cut at the first NUL byte. `indent` is taken into account in
 * the line length computations until the first line break; values below 0 or
 * of 74 and above are treated as 0.
 *
 * `outcode->mime_name` must be a non-empty string (asserted). */
static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	ZEND_ASSERT(outcode->mime_name != NULL);
	ZEND_ASSERT(outcode->mime_name[0] != '\0');

	if (!in_len) {
		return zend_empty_string;
	}

	if (indent < 0 || indent >= 74) {
		indent = 0;
	}

	if (linefeed_len > 8) {
		linefeed_len = 8;
	}
	/* Maintain legacy behavior as regards embedded NUL (zero) bytes in linefeed string */
	for (size_t i = 0; i < linefeed_len; i++) {
		if (linefeed[i] == '\0') {
			linefeed_len = i;
			break;
		}
	}

	unsigned int state = 0;
	/* wchar_buf should be big enough that when it is full, we definitely have enough
	 * wchars to fill an entire line of output */
	uint32_t wchar_buf[MBSTRING_HEADER_ENC_WCHAR_BUFSIZE];
	uint32_t *p, *e;
	/* What part of wchar_buf is filled with still-unprocessed data which should not
	 * be overwritten? */
	unsigned int offset = 0;
	/* Length of the output buffer at the point where the current output line began */
	size_t line_start = 0;

	/* If the entire input string is ASCII with no spaces (except possibly leading
	 * spaces), just pass it through unchanged */
	bool checking_leading_spaces = true;
	while (in_len) {
		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE, &state);
		p = wchar_buf;
		e = wchar_buf + out_len;

		while (p < e) {
			uint32_t w = *p++;
			if (checking_leading_spaces) {
				if (w == ' ') {
					continue;
				} else {
					checking_leading_spaces = false;
				}
			}
			if (w < 0x21 || w > 0x7E || w == '=' || w == '?' || w == '_') {
				/* We cannot simply pass input string through unchanged; start again */
				in = (unsigned char*)ZSTR_VAL(input);
				in_len = ZSTR_LEN(input);
				goto no_passthrough;
			}
		}
	}

	return zend_string_copy(input); /* This just increments refcount */

no_passthrough: ;

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	/* Encode some prefix of the input string as plain ASCII if possible
	 * If we find it necessary to switch to Base64/QPrint encoding, we will
	 * do so all the way to the end of the string */
	while (in_len) {
		/* Decode part of the input string, refill wchar_buf */
		ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
		ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
		p = wchar_buf;
		e = wchar_buf + offset + out_len;
		/* ASCII output is broken into space-delimited 'words'
		 * If we find a non-ASCII character in the middle of a word, we will
		 * transfer-encode the entire word */
		uint32_t *word_start = p;

		/* Don't consider adding line feed for spaces at the beginning of a word */
		while (p < e && *p == ' ' && (p - word_start) <= 74) {
			p++;
		}

		while (p < e) {
			uint32_t w = *p++;

			if (w < 0x20 || w > 0x7E || w == '?' || w == '=' || w == '_' || (w == ' ' && (p - word_start) > 74)) {
				/* Non-ASCII character (or line too long); switch to Base64/QPrint encoding
				 * If we are already too far along on a line to include Base64/QPrint encoded data
				 * on the same line (without overrunning max line length), then add a line feed
				 * right now */
feed_and_mime_encode:
				if (mb_convert_buf_len(&buf) - line_start + indent + strlen(outcode->mime_name) > 55) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
					buf.out = mb_convert_buf_add(buf.out, ' ');
					indent = 0;
					line_start = mb_convert_buf_len(&buf);
				} else if (mb_convert_buf_len(&buf) > 0) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 1);
					buf.out = mb_convert_buf_add(buf.out, ' ');
				}
				p = word_start; /* Back up to where MIME encoding of input chars should start */
				goto mime_encoding_needed;
			} else if (w == ' ') {
				/* When we see a space, check whether we should insert a line break */
				if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 75) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
					buf.out = mb_convert_buf_add(buf.out, ' ');
					indent = 0;
					line_start = mb_convert_buf_len(&buf);
				} else if (mb_convert_buf_len(&buf) > 0) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
					buf.out = mb_convert_buf_add(buf.out, ' ');
				}
				/* Output one (space-delimited) word as plain ASCII */
				while (word_start < p-1) {
					buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
				}
				/* Step over the delimiting space itself, then any further spaces */
				word_start++;
				while (p < e && *p == ' ') {
					p++;
				}
			}
		}

		if (in_len) {
			/* Copy chars which are part of an incomplete 'word' to the beginning
			 * of wchar_buf and reprocess them on the next iteration.
			 * But first make sure that the incomplete 'word' isn't so big that
			 * there will be no space to add any more decoded wchars in the buffer
			 * (which could lead to an infinite loop) */
			if ((word_start - wchar_buf) < MBSTRING_MIN_WCHAR_BUFSIZE) {
				goto feed_and_mime_encode;
			}
			offset = e - word_start;
			if (offset) {
				memmove(wchar_buf, word_start, offset * sizeof(uint32_t));
			}
		} else {
			/* We have reached the end of the input string while still in 'ASCII mode';
			 * process any trailing ASCII chars which were not followed by a space */
			if (word_start < e && mb_convert_buf_len(&buf) > 0) {
				/* The whole input string was not just one big ASCII 'word' with no spaces
				 * consider adding a line feed if necessary to prevent output lines from
				 * being too long */
				if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 74) {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
					buf.out = mb_convert_buf_add(buf.out, ' ');
				} else {
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
					buf.out = mb_convert_buf_add(buf.out, ' ');
				}
			}
			while (word_start < e) {
				buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
			}
		}
	}

	/* Ensure output string is marked as valid UTF-8 (ASCII strings are always 'valid UTF-8') */
	return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);

mime_encoding_needed: ;

	/* We will generate the output line by line, first converting wchars to bytes
	 * in the requested output encoding, then transfer-encoding those bytes as
	 * Base64 or QPrint
	 * 'tmpbuf' will receive the bytes which need to be transfer-encoded before
	 * sending them to 'buf' */
	mb_convert_buf tmpbuf;
	mb_convert_buf_init(&tmpbuf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	/* Do we need to refill wchar_buf to make sure we don't run out of wchars
	 * in the middle of a line? */
	offset = e - p;
	if (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset < MBSTRING_MIN_WCHAR_BUFSIZE) {
		/* Not enough free room to decode more input; work with what is buffered */
		goto start_new_line;
	}
	memmove(wchar_buf, p, offset * sizeof(uint32_t));

	while(true) {
refill_wchar_buf: ;
		ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE);
		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
		ZEND_ASSERT(out_len <= MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset);
		p = wchar_buf;
		e = wchar_buf + offset + out_len;

start_new_line: ;
		/* Emit the encoded-word prefix: "=?" + mime_name + "?B?" or "?Q?" */
		MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, strlen(outcode->mime_name) + 5);
		buf.out = mb_convert_buf_add2(buf.out, '=', '?');
		buf.out = mb_convert_buf_appends(buf.out, outcode->mime_name);
		buf.out = mb_convert_buf_add3(buf.out, '?', base64 ? 'B' : 'Q', '?');

		/* How many wchars should we try converting to Base64/QPrint-encoded bytes?
		 * We do something like a 'binary search' to find the greatest number which
		 * can be included on this line without exceeding max line length */
		unsigned int n = 12;
		size_t space_available = 73 - indent - (mb_convert_buf_len(&buf) - line_start);

		while (true) {
			ZEND_ASSERT(p < e);

			/* Remember where we were in process of generating output, so we can back
			 * up if necessary */
			size_t tmppos = mb_convert_buf_len(&tmpbuf);
			unsigned int tmpstate = tmpbuf.state;

			/* Try encoding 'n' wchars in output text encoding and sending output
			 * bytes to 'tmpbuf'. Hopefully this is not too many to fit on the
			 * current line. */
			n = MIN(n, e - p);
			outcode->from_wchar(p, n, &tmpbuf, false);

			/* For some output text encodings, there may be a few ending bytes
			 * which need to be emitted to output before we break a line.
			 * Again, remember where we were so we can back up */
			size_t tmppos2 = mb_convert_buf_len(&tmpbuf);
			unsigned int tmpstate2 = tmpbuf.state;
			outcode->from_wchar(NULL, 0, &tmpbuf, true);

			/* The second condition accepts a single wchar on an otherwise empty
			 * line even if it is too long, so that progress is always made */
			if (transfer_encoded_size(&tmpbuf, base64) <= space_available || (n == 1 && tmppos == 0)) {
				/* If we convert 'n' more wchars on the current line, it will not
				 * overflow the maximum line length */
				p += n;

				if (p == e) {
					/* We are done; we shouldn't reach here if there is more remaining
					 * of the input string which needs to be processed */
					ZEND_ASSERT(!in_len);
					transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 2);
					buf.out = mb_convert_buf_add2(buf.out, '?', '=');
					mb_convert_buf_free(&tmpbuf);
					return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
				} else {
					/* It's possible that more chars might fit on the current line,
					 * so back up to where we were before emitting any ending bytes */
					mb_convert_buf_reset(&tmpbuf, tmppos2);
					tmpbuf.state = tmpstate2;
				}
			} else {
				/* Converting 'n' more wchars on this line would be too much.
				 * Back up to where we were before we tried that. */
				mb_convert_buf_reset(&tmpbuf, tmppos);
				tmpbuf.state = tmpstate;

				if (n == 1) {
					/* We have found the exact number of chars which will fit on the
					 * current line. Finish up and move to a new line. */
					outcode->from_wchar(NULL, 0, &tmpbuf, true);
					transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
					tmpbuf.state = 0;

					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 3 + linefeed_len);
					buf.out = mb_convert_buf_add2(buf.out, '?', '=');

					indent = 0; /* Indent argument must only affect the first line */

					if (in_len || p < e) {
						/* We still have more input to process */
						buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
						buf.out = mb_convert_buf_add(buf.out, ' ');
						line_start = mb_convert_buf_len(&buf);
						offset = e - p;
						if (in_len && (MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset >= MBSTRING_MIN_WCHAR_BUFSIZE)) {
							/* Copy any remaining wchars to beginning of buffer and refill
							 * the rest of the buffer */
							memmove(wchar_buf, p, offset * sizeof(uint32_t));
							goto refill_wchar_buf;
						}
						goto start_new_line;
					} else {
						/* We are done! */
						mb_convert_buf_free(&tmpbuf);
						return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
					}
				} else {
					/* Try a smaller number of wchars */
					n = MAX(n >> 1, 1);
				}
			}
		}
	}
}
6153
PHP_FUNCTION(mb_encode_mimeheader)6154 PHP_FUNCTION(mb_encode_mimeheader)
6155 {
6156 const mbfl_encoding *charset = &mbfl_encoding_pass;
6157 zend_string *str, *charset_name = NULL, *transenc_name = NULL;
6158 char *linefeed = "\r\n";
6159 size_t linefeed_len = 2;
6160 zend_long indent = 0;
6161 bool base64 = true;
6162
6163 ZEND_PARSE_PARAMETERS_START(1, 5)
6164 Z_PARAM_STR(str)
6165 Z_PARAM_OPTIONAL
6166 Z_PARAM_STR(charset_name)
6167 Z_PARAM_STR(transenc_name)
6168 Z_PARAM_STRING(linefeed, linefeed_len)
6169 Z_PARAM_LONG(indent)
6170 ZEND_PARSE_PARAMETERS_END();
6171
6172 if (charset_name != NULL) {
6173 charset = php_mb_get_encoding(charset_name, 2);
6174 if (!charset) {
6175 RETURN_THROWS();
6176 } else if (charset->mime_name == NULL || charset->mime_name[0] == '\0' || charset == &mbfl_encoding_qprint) {
6177 zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
6178 RETURN_THROWS();
6179 }
6180 } else {
6181 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
6182 if (lang != NULL) {
6183 charset = mbfl_no2encoding(lang->mail_charset);
6184 const mbfl_encoding *transenc = mbfl_no2encoding(lang->mail_header_encoding);
6185 char t = transenc->name[0];
6186 if (t == 'Q' || t == 'q') {
6187 base64 = false;
6188 }
6189 }
6190 }
6191
6192 if (transenc_name != NULL && ZSTR_LEN(transenc_name) > 0) {
6193 char t = ZSTR_VAL(transenc_name)[0];
6194 if (t == 'Q' || t == 'q') {
6195 base64 = false;
6196 }
6197 }
6198
6199 RETURN_STR(mb_mime_header_encode(str, MBSTRG(current_internal_encoding), charset, base64, linefeed, linefeed_len, indent));
6200 }
6201
/* Map one Base64 alphabet character to its 6-bit value:
 * 'A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62, '/' -> 63.
 * Any other byte (including '=') yields -1. */
static int8_t decode_base64(unsigned char c)
{
	if (c == '+') {
		return 62;
	}
	if (c == '/') {
		return 63;
	}
	if (c >= '0' && c <= '9') {
		return c - '0' + 52;
	}
	if (c >= 'a' && c <= 'z') {
		return c - 'a' + 26;
	}
	if (c >= 'A' && c <= 'Z') {
		return c - 'A';
	}
	return -1;
}
6217
/* Hex digit lookup, indexed by byte value: '0'-'9' -> 0..9, 'A'-'F' and
 * 'a'-'f' -> 10..15, every other byte -> -1.
 * Declared const: this is a read-only table and must never be modified. */
static const int8_t qprint_map[] = {
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
6236
6237 /* Decode MIME encoded word as defined in RFC 2047 */
mime_header_decode_encoded_word(unsigned char * p,unsigned char * e,const mbfl_encoding * outcode,mb_convert_buf * outbuf,unsigned int * state)6238 static unsigned char* mime_header_decode_encoded_word(unsigned char *p, unsigned char *e, const mbfl_encoding *outcode, mb_convert_buf *outbuf, unsigned int *state)
6239 {
6240 if ((e - p) < 6) {
6241 return NULL;
6242 }
6243
6244 ZEND_ASSERT(p[0] == '=');
6245 ZEND_ASSERT(p[1] == '?');
6246 p += 2;
6247
6248 unsigned char *charset = p;
6249 unsigned char *charset_end = memchr(charset, '?', e - charset);
6250 if (charset_end == NULL) {
6251 return NULL;
6252 }
6253
6254 unsigned char *encoding = charset_end + 1;
6255 p = encoding + 1;
6256 if (p >= e || *p++ != '?') {
6257 return NULL;
6258 }
6259
6260 char *charset_name = estrndup((const char*)charset, charset_end - charset);
6261 const mbfl_encoding *incode = mbfl_name2encoding(charset_name);
6262 efree(charset_name);
6263 if (incode == NULL) {
6264 return NULL;
6265 }
6266
6267 unsigned char *end_marker = (unsigned char*)zend_memnstr((const char*)p, "?=", 2, (const char*)e);
6268 if (end_marker) {
6269 e = end_marker;
6270 } else if (p < e && *(e-1) == '?') {
6271 /* If encoded word is not properly terminated, but last byte is '?',
6272 * take that as a terminator (legacy behavior) */
6273 e--;
6274 }
6275
6276 unsigned char *buf = emalloc(e - p), *bufp = buf;
6277 if (*encoding == 'Q' || *encoding == 'q') {
6278 /* Fill `buf` with bytes from decoding QPrint */
6279 while (p < e) {
6280 unsigned char c = *p++;
6281 if (c == '_') {
6282 *bufp++ = ' ';
6283 continue;
6284 } else if (c == '=' && (e - p) >= 2) {
6285 unsigned char c2 = *p++;
6286 unsigned char c3 = *p++;
6287 if (qprint_map[c2] >= 0 && qprint_map[c3] >= 0) {
6288 *bufp++ = (qprint_map[c2] << 4) | (qprint_map[c3] & 0xF);
6289 continue;
6290 } else if (c2 == '\r') {
6291 if (c3 != '\n') {
6292 p--;
6293 }
6294 continue;
6295 } else if (c2 == '\n') {
6296 p--;
6297 continue;
6298 }
6299 }
6300 *bufp++ = c;
6301 }
6302 } else if (*encoding == 'B' || *encoding == 'b') {
6303 /* Fill `buf` with bytes from decoding Base64 */
6304 unsigned int bits = 0, cache = 0;
6305 while (p < e) {
6306 unsigned char c = *p++;
6307 if (c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '=') {
6308 continue;
6309 }
6310 int8_t decoded = decode_base64(c);
6311 if (decoded == -1) {
6312 *bufp++ = '?';
6313 continue;
6314 }
6315 bits += 6;
6316 cache = (cache << 6) | (decoded & 0x3F);
6317 if (bits == 24) {
6318 *bufp++ = (cache >> 16) & 0xFF;
6319 *bufp++ = (cache >> 8) & 0xFF;
6320 *bufp++ = cache & 0xFF;
6321 bits = cache = 0;
6322 }
6323 }
6324 if (bits == 18) {
6325 *bufp++ = (cache >> 10) & 0xFF;
6326 *bufp++ = (cache >> 2) & 0xFF;
6327 } else if (bits == 12) {
6328 *bufp++ = (cache >> 4) & 0xFF;
6329 }
6330 } else {
6331 efree(buf);
6332 return NULL;
6333 }
6334
6335 size_t in_len = bufp - buf;
6336 uint32_t wchar_buf[128];
6337
6338 bufp = buf;
6339 while (in_len) {
6340 size_t out_len = incode->to_wchar(&bufp, &in_len, wchar_buf, 128, state);
6341 ZEND_ASSERT(out_len <= 128);
6342 outcode->from_wchar(wchar_buf, out_len, outbuf, false);
6343 }
6344
6345 efree(buf);
6346 return e + 2;
6347 }
6348
/* Step over the CR or LF which `p` points at, and over any CR, LF, tab or
 * space characters which directly follow it, without going beyond `e`.
 * Returns the new position. */
static unsigned char* mime_header_skip_line_break(unsigned char *p, unsigned char *e)
{
	for (p++; p < e; p++) {
		if (*p != '\n' && *p != '\r' && *p != '\t' && *p != ' ') {
			break;
		}
	}
	return p;
}

/* Decode a whole MIME header value: every RFC 2047 encoded word found in
 * `input` is decoded with mime_header_decode_encoded_word(), and all other
 * text is passed through the ASCII `to_wchar` converter. Everything is
 * emitted in `outcode`.
 *
 * A CR or LF, together with the run of CR/LF/tab/space following it, is
 * replaced by a single space; except that directly after an encoded word the
 * space is only emitted if what follows is not another valid encoded word,
 * and no space is emitted for such a run at the very end of the input.
 *
 * Returns the converted string built by mb_convert_buf_result(). */
static zend_string* mb_mime_header_decode(zend_string *input, const mbfl_encoding *outcode)
{
	unsigned char *cur = (unsigned char*)ZSTR_VAL(input);
	unsigned char *limit = cur + ZSTR_LEN(input);
	unsigned int conv_state = 0;
	bool deferred_space = false;
	mb_convert_buf out;

	mb_convert_buf_init(&out, ZSTR_LEN(input), '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);

	while (cur < limit) {
		unsigned char first = *cur;

		/* Anything starting with "=?" and at least 6 bytes long might be an encoded word */
		if ((limit - cur) >= 6 && first == '=' && cur[1] == '?') {
			unsigned char *separator = memchr(cur + 2, '?', limit - cur - 2);
			if (separator != NULL && (limit - separator) >= 3) {
				unsigned char *after_word = mime_header_decode_encoded_word(cur, limit, outcode, &out, &conv_state);
				if (after_word != NULL) {
					/* It was an encoded word, and its content has been emitted */
					cur = after_word;
					if (cur < limit && (*cur == '\n' || *cur == '\r')) {
						cur = mime_header_skip_line_break(cur, limit);
						/* Hold the space back for now; it is dropped if the
						 * next thing we see is another valid encoded word */
						deferred_space = true;
					}
					continue;
				}
			}
		}

		/* Not an encoded word, so a space which was held back is now due */
		if (deferred_space) {
			uint32_t space_char = ' ';
			outcode->from_wchar(&space_char, 1, &out, false);
			deferred_space = false;
		}

		if (first != '\n' && first != '\r') {
			/* Pass through a run of ordinary text. The first byte is always
			 * taken (even if it is '='); the run then extends up to the next
			 * '=', CR or LF */
			unsigned char *run_end = cur + 1;
			while (run_end < limit && *run_end != '=' && *run_end != '\n' && *run_end != '\r') {
				run_end++;
			}

			uint32_t wchars[128];
			size_t remaining = run_end - cur;
			while (remaining) {
				size_t n_wchars = mbfl_encoding_ascii.to_wchar(&cur, &remaining, wchars, 128, &conv_state);
				ZEND_ASSERT(n_wchars <= 128);
				outcode->from_wchar(wchars, n_wchars, &out, false);
			}
		}

		if (cur < limit && (*cur == '\n' || *cur == '\r')) {
			/* A line break plus trailing whitespace becomes one space... */
			cur = mime_header_skip_line_break(cur, limit);
			if (cur < limit) {
				/* ...unless it sits at the very end of the input string,
				 * in which case it is dropped (this emulates the legacy
				 * behavior of mb_decode_mimeheader) */
				uint32_t space_char = ' ';
				outcode->from_wchar(&space_char, 1, &out, false);
			}
		}
	}

	/* Signal end of input to the output converter */
	outcode->from_wchar(NULL, 0, &out, true);

	return mb_convert_buf_result(&out, outcode);
}
6422
PHP_FUNCTION(mb_decode_mimeheader)6423 PHP_FUNCTION(mb_decode_mimeheader)
6424 {
6425 zend_string *str;
6426
6427 ZEND_PARSE_PARAMETERS_START(1, 1)
6428 Z_PARAM_STR(str)
6429 ZEND_PARSE_PARAMETERS_END();
6430
6431 RETURN_STR(mb_mime_header_decode(str, MBSTRG(current_internal_encoding)));
6432 }
6433