1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
14 | Rui Hirokawa <hirokawa@php.net> |
15 | Hironori Sato <satoh@jpnnet.com> |
16 | Shigeru Kanemoto <sgk@happysize.co.jp> |
17 +----------------------------------------------------------------------+
18 */
19
20 /* {{{ includes */
21 #include "libmbfl/config.h"
22 #include "php.h"
23 #include "php_ini.h"
24 #include "php_variables.h"
25 #include "mbstring.h"
26 #include "ext/standard/php_string.h"
27 #include "ext/standard/php_mail.h"
28 #include "ext/standard/exec.h"
29 #include "ext/standard/url.h"
30 #include "main/php_output.h"
31 #include "ext/standard/info.h"
32 #include "ext/pcre/php_pcre.h"
33
34 #include "libmbfl/mbfl/mbfilter_8bit.h"
35 #include "libmbfl/mbfl/mbfilter_pass.h"
36 #include "libmbfl/mbfl/mbfilter_wchar.h"
37 #include "libmbfl/mbfl/eaw_table.h"
38 #include "libmbfl/filters/mbfilter_base64.h"
39 #include "libmbfl/filters/mbfilter_qprint.h"
40 #include "libmbfl/filters/mbfilter_htmlent.h"
41 #include "libmbfl/filters/mbfilter_uuencode.h"
42 #include "libmbfl/filters/mbfilter_ucs4.h"
43 #include "libmbfl/filters/mbfilter_utf8.h"
44 #include "libmbfl/filters/mbfilter_singlebyte.h"
45 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
46
47 #include "php_variables.h"
48 #include "php_globals.h"
49 #include "rfc1867.h"
50 #include "php_content_types.h"
51 #include "SAPI.h"
52 #include "php_unicode.h"
53 #include "TSRM.h"
54
55 #include "mb_gpc.h"
56
57 #ifdef HAVE_MBREGEX
58 # include "php_mbregex.h"
59 #endif
60
61 #include "zend_multibyte.h"
62 #include "mbstring_arginfo.h"
63 /* }}} */
64
65 /* {{{ prototypes */
66 ZEND_DECLARE_MODULE_GLOBALS(mbstring)
67
68 static PHP_GINIT_FUNCTION(mbstring);
69 static PHP_GSHUTDOWN_FUNCTION(mbstring);
70
71 static void php_mb_populate_current_detect_order_list(void);
72
73 static int php_mb_encoding_translation(void);
74
75 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);
76
77 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
78
79 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
80
81 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
82
83 /* See mbfilter_cp5022x.c */
84 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
85 /* }}} */
86
87 /* {{{ php_mb_default_identify_list */
88 typedef struct _php_mb_nls_ident_list {
89 enum mbfl_no_language lang;
90 const enum mbfl_no_encoding *list;
91 size_t list_size;
92 } php_mb_nls_ident_list;
93
94 static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
95 mbfl_no_encoding_ascii,
96 mbfl_no_encoding_jis,
97 mbfl_no_encoding_utf8,
98 mbfl_no_encoding_euc_jp,
99 mbfl_no_encoding_sjis
100 };
101
102 static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
103 mbfl_no_encoding_ascii,
104 mbfl_no_encoding_utf8,
105 mbfl_no_encoding_euc_cn,
106 mbfl_no_encoding_cp936
107 };
108
109 static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
110 mbfl_no_encoding_ascii,
111 mbfl_no_encoding_utf8,
112 mbfl_no_encoding_euc_tw,
113 mbfl_no_encoding_big5
114 };
115
116 static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
117 mbfl_no_encoding_ascii,
118 mbfl_no_encoding_utf8,
119 mbfl_no_encoding_euc_kr,
120 mbfl_no_encoding_uhc
121 };
122
123 static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
124 mbfl_no_encoding_ascii,
125 mbfl_no_encoding_utf8,
126 mbfl_no_encoding_koi8r,
127 mbfl_no_encoding_cp1251,
128 mbfl_no_encoding_cp866
129 };
130
131 static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
132 mbfl_no_encoding_ascii,
133 mbfl_no_encoding_utf8,
134 mbfl_no_encoding_armscii8
135 };
136
137 static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
138 mbfl_no_encoding_ascii,
139 mbfl_no_encoding_utf8,
140 mbfl_no_encoding_cp1254,
141 mbfl_no_encoding_8859_9
142 };
143
144 static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
145 mbfl_no_encoding_ascii,
146 mbfl_no_encoding_utf8,
147 mbfl_no_encoding_koi8u
148 };
149
150 static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
151 mbfl_no_encoding_ascii,
152 mbfl_no_encoding_utf8
153 };
154
155
156 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
157 { mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
158 { mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
159 { mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
160 { mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
161 { mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
162 { mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
163 { mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
164 { mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
165 { mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
166 };
167
168 /* }}} */
169
170 /* {{{ mbstring_deps[] */
171 static const zend_module_dep mbstring_deps[] = {
172 ZEND_MOD_REQUIRED("pcre")
173 ZEND_MOD_END
174 };
175 /* }}} */
176
177 /* {{{ zend_module_entry mbstring_module_entry */
178 zend_module_entry mbstring_module_entry = {
179 STANDARD_MODULE_HEADER_EX,
180 NULL,
181 mbstring_deps,
182 "mbstring",
183 ext_functions,
184 PHP_MINIT(mbstring),
185 PHP_MSHUTDOWN(mbstring),
186 PHP_RINIT(mbstring),
187 PHP_RSHUTDOWN(mbstring),
188 PHP_MINFO(mbstring),
189 PHP_MBSTRING_VERSION,
190 PHP_MODULE_GLOBALS(mbstring),
191 PHP_GINIT(mbstring),
192 PHP_GSHUTDOWN(mbstring),
193 NULL,
194 STANDARD_MODULE_PROPERTIES_EX
195 };
196 /* }}} */
197
198 /* {{{ static sapi_post_entry php_post_entries[] */
199 static const sapi_post_entry php_post_entries[] = {
200 { DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_std_post_handler },
201 { MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
202 { NULL, 0, NULL, NULL }
203 };
204 /* }}} */
205
/* Only when built as a loadable module: define the TSRM cache (ZTS builds)
 * and emit the get_module() entry point. */
#ifdef COMPILE_DL_MBSTRING
# ifdef ZTS
ZEND_TSRMLS_CACHE_DEFINE()
# endif
ZEND_GET_MODULE(mbstring)
#endif
212
213 /* {{{ static sapi_post_entry mbstr_post_entries[] */
214 static const sapi_post_entry mbstr_post_entries[] = {
215 { DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
216 { MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
217 { NULL, 0, NULL, NULL }
218 };
219 /* }}} */
220
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)221 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
222 if (encoding_name) {
223 const mbfl_encoding *encoding;
224 zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
225 if (last_encoding_name && (last_encoding_name == encoding_name
226 || zend_string_equals_ci(encoding_name, last_encoding_name))) {
227 return MBSTRG(last_used_encoding);
228 }
229
230 encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
231 if (!encoding) {
232 zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
233 return NULL;
234 } else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
235 if (encoding == &mbfl_encoding_base64) {
236 php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
237 } else if (encoding == &mbfl_encoding_qprint) {
238 php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
239 } else if (encoding == &mbfl_encoding_html_ent) {
240 php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
241 } else if (encoding == &mbfl_encoding_uuencode) {
242 php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
243 }
244 }
245
246 if (last_encoding_name) {
247 zend_string_release(last_encoding_name);
248 }
249 MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
250 MBSTRG(last_used_encoding) = encoding;
251 return encoding;
252 } else {
253 return MBSTRG(current_internal_encoding);
254 }
255 }
256
php_mb_get_encoding_or_pass(const char * encoding_name)257 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name) {
258 if (strcmp(encoding_name, "pass") == 0) {
259 return &mbfl_encoding_pass;
260 }
261
262 return mbfl_name2encoding(encoding_name);
263 }
264
/* Number of ',' bytes in the half-open range [p, end). */
static size_t count_commas(const char *p, const char *end) {
	size_t total = 0;

	for (const char *hit = memchr(p, ',', end - p);
			hit != NULL;
			hit = memchr(hit + 1, ',', end - (hit + 1))) {
		total++;
	}
	return total;
}
273
274 /* {{{ static zend_result php_mb_parse_encoding_list()
275 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
276 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
277 */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num,bool allow_pass_encoding)278 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
279 const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num,
280 bool allow_pass_encoding)
281 {
282 if (value == NULL || value_length == 0) {
283 *return_list = NULL;
284 *return_size = 0;
285 return SUCCESS;
286 } else {
287 bool included_auto;
288 size_t n, size;
289 char *p1, *endp, *tmpstr;
290 const mbfl_encoding **entry, **list;
291
292 /* copy the value string for work */
293 if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
294 tmpstr = (char *)estrndup(value+1, value_length-2);
295 value_length -= 2;
296 } else {
297 tmpstr = (char *)estrndup(value, value_length);
298 }
299
300 endp = tmpstr + value_length;
301 size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
302 list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
303 entry = list;
304 n = 0;
305 included_auto = 0;
306 p1 = tmpstr;
307 while (1) {
308 char *comma = memchr(p1, ',', endp - p1);
309 char *p = comma ? comma : endp;
310 *p = '\0';
311 /* trim spaces */
312 while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
313 p1++;
314 }
315 p--;
316 while (p > p1 && (*p == ' ' || *p == '\t')) {
317 *p = '\0';
318 p--;
319 }
320 /* convert to the encoding number and check encoding */
321 if (strcasecmp(p1, "auto") == 0) {
322 if (!included_auto) {
323 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
324 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
325 size_t i;
326 included_auto = 1;
327 for (i = 0; i < identify_list_size; i++) {
328 *entry++ = mbfl_no2encoding(*src++);
329 n++;
330 }
331 }
332 } else {
333 const mbfl_encoding *encoding =
334 allow_pass_encoding ? php_mb_get_encoding_or_pass(p1) : mbfl_name2encoding(p1);
335 if (!encoding) {
336 /* Called from an INI setting modification */
337 if (arg_num == 0) {
338 php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%s\"", p1);
339 } else {
340 zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", p1);
341 }
342 efree(tmpstr);
343 pefree(ZEND_VOIDP(list), persistent);
344 return FAILURE;
345 }
346
347 *entry++ = encoding;
348 n++;
349 }
350 if (n >= size || comma == NULL) {
351 break;
352 }
353 p1 = comma + 1;
354 }
355 *return_list = list;
356 *return_size = n;
357 efree(tmpstr);
358 }
359
360 return SUCCESS;
361 }
362 /* }}} */
363
364 /* {{{ static int php_mb_parse_encoding_array()
365 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
366 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
367 */
php_mb_parse_encoding_array(HashTable * target_hash,const mbfl_encoding *** return_list,size_t * return_size,uint32_t arg_num)368 static int php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
369 size_t *return_size, uint32_t arg_num)
370 {
371 /* Allocate enough space to include the default detect order if "auto" is used. */
372 size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
373 const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
374 const mbfl_encoding **entry = list;
375 bool included_auto = 0;
376 size_t n = 0;
377 zval *hash_entry;
378 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
379 zend_string *encoding_str = zval_try_get_string(hash_entry);
380 if (UNEXPECTED(!encoding_str)) {
381 efree(ZEND_VOIDP(list));
382 return FAILURE;
383 }
384
385 if (zend_string_equals_literal_ci(encoding_str, "auto")) {
386 if (!included_auto) {
387 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
388 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
389 size_t j;
390
391 included_auto = 1;
392 for (j = 0; j < identify_list_size; j++) {
393 *entry++ = mbfl_no2encoding(*src++);
394 n++;
395 }
396 }
397 } else {
398 const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
399 if (encoding) {
400 *entry++ = encoding;
401 n++;
402 } else {
403 zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
404 zend_string_release(encoding_str);
405 efree(ZEND_VOIDP(list));
406 return FAILURE;
407 }
408 }
409 zend_string_release(encoding_str);
410 } ZEND_HASH_FOREACH_END();
411 *return_list = list;
412 *return_size = n;
413 return SUCCESS;
414 }
415 /* }}} */
416
417 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)418 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
419 {
420 return (const zend_encoding*)mbfl_name2encoding(encoding_name);
421 }
422
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)423 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
424 {
425 return ((const mbfl_encoding *)encoding)->name;
426 }
427
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)428 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
429 {
430 const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
431 return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
432 }
433
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)434 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
435 {
436 mbfl_string string;
437
438 if (!list) {
439 list = (const zend_encoding **)MBSTRG(current_detect_order_list);
440 list_size = MBSTRG(current_detect_order_list_size);
441 }
442
443 mbfl_string_init(&string);
444 string.val = (unsigned char *)arg_string;
445 string.len = arg_length;
446 return (const zend_encoding *) mbfl_identify_encoding(&string, (const mbfl_encoding **)list, list_size, 0);
447 }
448
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)449 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
450 {
451 mbfl_string string, result;
452 mbfl_buffer_converter *convd;
453
454 /* new encoding */
455 /* initialize string */
456 string.encoding = (const mbfl_encoding*)encoding_from;
457 string.val = (unsigned char*)from;
458 string.len = from_length;
459
460 /* initialize converter */
461 convd = mbfl_buffer_converter_new((const mbfl_encoding *)encoding_from, (const mbfl_encoding *)encoding_to, string.len);
462 if (convd == NULL) {
463 return (size_t) -1;
464 }
465
466 mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
467 mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
468
469 /* do it */
470 size_t loc = mbfl_buffer_converter_feed(convd, &string);
471
472 mbfl_buffer_converter_flush(convd);
473 mbfl_string_init(&result);
474 if (!mbfl_buffer_converter_result(convd, &result)) {
475 mbfl_buffer_converter_delete(convd);
476 return (size_t)-1;
477 }
478
479 *to = result.val;
480 *to_length = result.len;
481
482 mbfl_buffer_converter_delete(convd);
483
484 return loc;
485 }
486
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)487 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
488 {
489 return php_mb_parse_encoding_list(
490 encoding_list, encoding_list_len,
491 (const mbfl_encoding ***)return_list, return_size,
492 persistent, /* arg_num */ 0, /* allow_pass_encoding */ 1);
493 }
494
php_mb_zend_internal_encoding_getter(void)495 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
496 {
497 return (const zend_encoding *)MBSTRG(internal_encoding);
498 }
499
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)500 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
501 {
502 MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
503 return SUCCESS;
504 }
505
506 static zend_multibyte_functions php_mb_zend_multibyte_functions = {
507 "mbstring",
508 php_mb_zend_encoding_fetcher,
509 php_mb_zend_encoding_name_getter,
510 php_mb_zend_encoding_lexer_compatibility_checker,
511 php_mb_zend_encoding_detector,
512 php_mb_zend_encoding_converter,
513 php_mb_zend_encoding_list_parser,
514 php_mb_zend_internal_encoding_getter,
515 php_mb_zend_internal_encoding_setter
516 };
517 /* }}} */
518
519 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)520 static void *_php_mb_compile_regex(const char *pattern)
521 {
522 pcre2_code *retval;
523 PCRE2_SIZE err_offset;
524 int errnum;
525
526 if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
527 PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
528 PCRE2_UCHAR err_str[128];
529 pcre2_get_error_message(errnum, err_str, sizeof(err_str));
530 php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
531 }
532 return retval;
533 }
534 /* }}} */
535
536 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)537 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
538 {
539 int res;
540
541 pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
542 if (NULL == match_data) {
543 pcre2_code_free(opaque);
544 php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
545 return FAILURE;
546 }
547 res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
548 php_pcre_free_match_data(match_data);
549
550 return res;
551 }
552 /* }}} */
553
554 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)555 static void _php_mb_free_regex(void *opaque)
556 {
557 pcre2_code_free(opaque);
558 }
559 /* }}} */
560
561 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)562 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
563 {
564 size_t i;
565
566 *plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
567 *plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
568
569 for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
570 if (php_mb_default_identify_list[i].lang == lang) {
571 *plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
572 *plist_size = php_mb_default_identify_list[i].list_size;
573 return 1;
574 }
575 }
576 return 0;
577 }
578 /* }}} */
579
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)580 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
581 {
582 char *result = emalloc(len + 2);
583 char *resp = result;
584 size_t i;
585
586 for (i = 0; i < len && start[i] != quote; ++i) {
587 if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
588 *resp++ = start[++i];
589 } else {
590 size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
591
592 while (j-- > 0 && i < len) {
593 *resp++ = start[i++];
594 }
595 --i;
596 }
597 }
598
599 *resp = '\0';
600 return result;
601 }
602
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)603 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
604 {
605 char *pos = *line, quote;
606 char *res;
607
608 while (*pos && *pos != stop) {
609 if ((quote = *pos) == '"' || quote == '\'') {
610 ++pos;
611 while (*pos && *pos != quote) {
612 if (*pos == '\\' && pos[1] && pos[1] == quote) {
613 pos += 2;
614 } else {
615 ++pos;
616 }
617 }
618 if (*pos) {
619 ++pos;
620 }
621 } else {
622 pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
623
624 }
625 }
626 if (*pos == '\0') {
627 res = estrdup(*line);
628 *line += strlen(*line);
629 return res;
630 }
631
632 res = estrndup(*line, pos - *line);
633
634 while (*pos == stop) {
635 pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
636 }
637
638 *line = pos;
639 return res;
640 }
641 /* }}} */
642
/* Extract one configuration-style word from `str` into a new buffer.
 * Leading whitespace is skipped. A word opening with ' or " extends to the
 * matching unescaped quote; otherwise it extends to the next whitespace
 * byte. Input that is empty or all whitespace yields an empty string. */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	for (; *str && isspace(*(unsigned char *)str); ++str) {
		/* skip leading whitespace */
	}

	if (*str == '\0') {
		return estrdup("");
	}

	const char first = *str;
	if (first == '"' || first == '\'') {
		char *body = str + 1;

		return php_mb_rfc1867_substring_conf(encoding, body, strlen(body), first);
	}

	char *word_end = str;
	while (*word_end && !isspace(*(unsigned char *)word_end)) {
		++word_end;
	}
	return php_mb_rfc1867_substring_conf(encoding, str, word_end - str, 0);
}
667 /* }}} */
668
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)669 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
670 {
671 char *s, *s2;
672 const size_t filename_len = strlen(filename);
673
674 /* The \ check should technically be needed for win32 systems only where
675 * it is a valid path separator. However, IE in all it's wisdom always sends
676 * the full path of the file on the user's filesystem, which means that unless
677 * the user does basename() they get a bogus file name. Until IE's user base drops
678 * to nill or problem is fixed this code must remain enabled for all systems. */
679 s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
680 s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
681
682 if (s && s2) {
683 if (s > s2) {
684 return ++s;
685 } else {
686 return ++s2;
687 }
688 } else if (s) {
689 return ++s;
690 } else if (s2) {
691 return ++s2;
692 } else {
693 return filename;
694 }
695 }
696 /* }}} */
697
698 /* {{{ php.ini directive handler */
699 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)700 static PHP_INI_MH(OnUpdate_mbstring_language)
701 {
702 enum mbfl_no_language no_language;
703
704 no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
705 if (no_language == mbfl_no_language_invalid) {
706 MBSTRG(language) = mbfl_no_language_neutral;
707 return FAILURE;
708 }
709 MBSTRG(language) = no_language;
710 php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
711 return SUCCESS;
712 }
713 /* }}} */
714
715 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)716 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
717 {
718 const mbfl_encoding **list;
719 size_t size;
720
721 if (!new_value) {
722 if (MBSTRG(detect_order_list)) {
723 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
724 }
725 MBSTRG(detect_order_list) = NULL;
726 MBSTRG(detect_order_list_size) = 0;
727 return SUCCESS;
728 }
729
730 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0, /* allow_pass_encoding */ 0) || size == 0) {
731 return FAILURE;
732 }
733
734 if (MBSTRG(detect_order_list)) {
735 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
736 }
737 MBSTRG(detect_order_list) = list;
738 MBSTRG(detect_order_list_size) = size;
739 return SUCCESS;
740 }
741 /* }}} */
742
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)743 static int _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
744 const mbfl_encoding **list;
745 size_t size;
746 if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0, /* allow_pass_encoding */ 1) || size == 0) {
747 return FAILURE;
748 }
749 if (MBSTRG(http_input_list)) {
750 pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
751 }
752 MBSTRG(http_input_list) = list;
753 MBSTRG(http_input_list_size) = size;
754 return SUCCESS;
755 }
756
757 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)758 static PHP_INI_MH(OnUpdate_mbstring_http_input)
759 {
760 if (new_value) {
761 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
762 }
763
764 if (!new_value || !ZSTR_LEN(new_value)) {
765 const char *encoding = php_get_input_encoding();
766 MBSTRG(http_input_set) = 0;
767 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
768 return SUCCESS;
769 }
770
771 MBSTRG(http_input_set) = 1;
772 return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
773 }
774 /* }}} */
775
_php_mb_ini_mbstring_http_output_set(const char * new_value)776 static int _php_mb_ini_mbstring_http_output_set(const char *new_value) {
777 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value);
778 if (!encoding) {
779 return FAILURE;
780 }
781
782 MBSTRG(http_output_encoding) = encoding;
783 MBSTRG(current_http_output_encoding) = encoding;
784 return SUCCESS;
785 }
786
787 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)788 static PHP_INI_MH(OnUpdate_mbstring_http_output)
789 {
790 if (new_value) {
791 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
792 }
793
794 if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
795 MBSTRG(http_output_set) = 0;
796 _php_mb_ini_mbstring_http_output_set(php_get_output_encoding());
797 return SUCCESS;
798 }
799
800 MBSTRG(http_output_set) = 1;
801 return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value));
802 }
803 /* }}} */
804
805 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)806 static int _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
807 {
808 const mbfl_encoding *encoding;
809
810 if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
811 /* falls back to UTF-8 if an unknown encoding name is given */
812 if (new_value) {
813 php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
814 }
815 encoding = &mbfl_encoding_utf8;
816 }
817 MBSTRG(internal_encoding) = encoding;
818 MBSTRG(current_internal_encoding) = encoding;
819 #ifdef HAVE_MBREGEX
820 {
821 const char *enc_name = new_value;
822 if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
823 /* falls back to UTF-8 if an unknown encoding name is given */
824 enc_name = "UTF-8";
825 php_mb_regex_set_default_mbctype(enc_name);
826 }
827 php_mb_regex_set_mbctype(new_value);
828 }
829 #endif
830 return SUCCESS;
831 }
832 /* }}} */
833
834 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)835 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
836 {
837 if (new_value) {
838 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
839 }
840
841 if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
842 return FAILURE;
843 }
844
845 if (new_value && ZSTR_LEN(new_value)) {
846 MBSTRG(internal_encoding_set) = 1;
847 return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
848 } else {
849 const char *encoding = php_get_internal_encoding();
850 MBSTRG(internal_encoding_set) = 0;
851 return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
852 }
853 }
854 /* }}} */
855
856 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)857 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
858 {
859 int c;
860 char *endptr = NULL;
861
862 if (new_value != NULL) {
863 if (zend_string_equals_literal_ci(new_value, "none")) {
864 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
865 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
866 } else if (zend_string_equals_literal_ci(new_value, "long")) {
867 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
868 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
869 } else if (zend_string_equals_literal_ci(new_value, "entity")) {
870 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
871 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
872 } else {
873 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
874 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
875 if (ZSTR_LEN(new_value) > 0) {
876 c = strtol(ZSTR_VAL(new_value), &endptr, 0);
877 if (*endptr == '\0') {
878 MBSTRG(filter_illegal_substchar) = c;
879 MBSTRG(current_filter_illegal_substchar) = c;
880 }
881 }
882 }
883 } else {
884 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
885 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
886 MBSTRG(filter_illegal_substchar) = 0x3f; /* '?' */
887 MBSTRG(current_filter_illegal_substchar) = 0x3f; /* '?' */
888 }
889
890 return SUCCESS;
891 }
892 /* }}} */
893
894 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)895 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
896 {
897 if (new_value == NULL) {
898 return FAILURE;
899 }
900
901 OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
902
903 if (MBSTRG(encoding_translation)) {
904 sapi_unregister_post_entry(php_post_entries);
905 sapi_register_post_entries(mbstr_post_entries);
906 } else {
907 sapi_unregister_post_entry(mbstr_post_entries);
908 sapi_register_post_entries(php_post_entries);
909 }
910
911 return SUCCESS;
912 }
913 /* }}} */
914
/* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes) */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)916 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
917 {
918 zend_string *tmp;
919 void *re = NULL;
920
921 if (!new_value) {
922 new_value = entry->orig_value;
923 }
924 tmp = php_trim(new_value, NULL, 0, 3);
925
926 if (ZSTR_LEN(tmp) > 0) {
927 if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
928 zend_string_release_ex(tmp, 0);
929 return FAILURE;
930 }
931 }
932
933 if (MBSTRG(http_output_conv_mimetypes)) {
934 _php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
935 }
936
937 MBSTRG(http_output_conv_mimetypes) = re;
938
939 zend_string_release_ex(tmp, 0);
940 return SUCCESS;
941 }
942 /* }}} */
943 /* }}} */
944
945 /* {{{ php.ini directive registration */
946 PHP_INI_BEGIN()
947 PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
948 PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
949 PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
950 PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
951 STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
952 PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
953
954 STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
955 PHP_INI_SYSTEM | PHP_INI_PERDIR,
956 OnUpdate_mbstring_encoding_translation,
957 encoding_translation, zend_mbstring_globals, mbstring_globals)
958 PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
959 "^(text/|application/xhtml\\+xml)",
960 PHP_INI_ALL,
961 OnUpdate_mbstring_http_output_conv_mimetypes)
962
963 STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
964 PHP_INI_ALL,
965 OnUpdateBool,
966 strict_detection, zend_mbstring_globals, mbstring_globals)
967 #ifdef HAVE_MBREGEX
968 STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
969 STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
970 #endif
PHP_INI_END()971 PHP_INI_END()
972 /* }}} */
973
974 static void mbstring_internal_encoding_changed_hook(void) {
975 /* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
976 if (!MBSTRG(internal_encoding_set)) {
977 const char *encoding = php_get_internal_encoding();
978 _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
979 }
980
981 if (!MBSTRG(http_output_set)) {
982 const char *encoding = php_get_output_encoding();
983 _php_mb_ini_mbstring_http_output_set(encoding);
984 }
985
986 if (!MBSTRG(http_input_set)) {
987 const char *encoding = php_get_input_encoding();
988 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
989 }
990 }
991
992 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)993 static PHP_GINIT_FUNCTION(mbstring)
994 {
995 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
996 ZEND_TSRMLS_CACHE_UPDATE();
997 #endif
998
999 mbstring_globals->language = mbfl_no_language_uni;
1000 mbstring_globals->internal_encoding = NULL;
1001 mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
1002 mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
1003 mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
1004 mbstring_globals->http_input_identify = NULL;
1005 mbstring_globals->http_input_identify_get = NULL;
1006 mbstring_globals->http_input_identify_post = NULL;
1007 mbstring_globals->http_input_identify_cookie = NULL;
1008 mbstring_globals->http_input_identify_string = NULL;
1009 mbstring_globals->http_input_list = NULL;
1010 mbstring_globals->http_input_list_size = 0;
1011 mbstring_globals->detect_order_list = NULL;
1012 mbstring_globals->detect_order_list_size = 0;
1013 mbstring_globals->current_detect_order_list = NULL;
1014 mbstring_globals->current_detect_order_list_size = 0;
1015 mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1016 mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1017 mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1018 mbstring_globals->filter_illegal_substchar = 0x3f; /* '?' */
1019 mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1020 mbstring_globals->current_filter_illegal_substchar = 0x3f; /* '?' */
1021 mbstring_globals->illegalchars = 0;
1022 mbstring_globals->encoding_translation = 0;
1023 mbstring_globals->strict_detection = 0;
1024 mbstring_globals->outconv = NULL;
1025 mbstring_globals->http_output_conv_mimetypes = NULL;
1026 #ifdef HAVE_MBREGEX
1027 mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1028 #endif
1029 mbstring_globals->last_used_encoding_name = NULL;
1030 mbstring_globals->last_used_encoding = NULL;
1031 mbstring_globals->internal_encoding_set = 0;
1032 mbstring_globals->http_output_set = 0;
1033 mbstring_globals->http_input_set = 0;
1034 }
1035 /* }}} */
1036
1037 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1038 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1039 {
1040 if (mbstring_globals->http_input_list) {
1041 free(ZEND_VOIDP(mbstring_globals->http_input_list));
1042 }
1043 if (mbstring_globals->detect_order_list) {
1044 free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1045 }
1046 if (mbstring_globals->http_output_conv_mimetypes) {
1047 _php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1048 }
1049 #ifdef HAVE_MBREGEX
1050 php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1051 #endif
1052 }
1053 /* }}} */
1054
1055 /* {{{ PHP_MINIT_FUNCTION(mbstring) */
PHP_MINIT_FUNCTION(mbstring)1056 PHP_MINIT_FUNCTION(mbstring)
1057 {
1058 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
1059 ZEND_TSRMLS_CACHE_UPDATE();
1060 #endif
1061
1062 REGISTER_INI_ENTRIES();
1063
1064 /* We assume that we're the only user of the hook. */
1065 ZEND_ASSERT(php_internal_encoding_changed == NULL);
1066 php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
1067 mbstring_internal_encoding_changed_hook();
1068
1069 /* This is a global handler. Should not be set in a per-request handler. */
1070 sapi_register_treat_data(mbstr_treat_data);
1071
1072 /* Post handlers are stored in the thread-local context. */
1073 if (MBSTRG(encoding_translation)) {
1074 sapi_register_post_entries(mbstr_post_entries);
1075 }
1076
1077 #ifdef HAVE_MBREGEX
1078 PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1079 #endif
1080
1081 register_mbstring_symbols(module_number);
1082
1083 if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
1084 return FAILURE;
1085 }
1086
1087 php_rfc1867_set_multibyte_callbacks(
1088 php_mb_encoding_translation,
1089 php_mb_gpc_get_detect_order,
1090 php_mb_gpc_set_input_encoding,
1091 php_mb_rfc1867_getword,
1092 php_mb_rfc1867_getword_conf,
1093 php_mb_rfc1867_basename);
1094
1095 return SUCCESS;
1096 }
1097 /* }}} */
1098
1099 /* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
PHP_MSHUTDOWN_FUNCTION(mbstring)1100 PHP_MSHUTDOWN_FUNCTION(mbstring)
1101 {
1102 UNREGISTER_INI_ENTRIES();
1103
1104 zend_multibyte_restore_functions();
1105
1106 #ifdef HAVE_MBREGEX
1107 PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1108 #endif
1109
1110 php_internal_encoding_changed = NULL;
1111
1112 return SUCCESS;
1113 }
1114 /* }}} */
1115
1116 /* {{{ PHP_RINIT_FUNCTION(mbstring) */
PHP_RINIT_FUNCTION(mbstring)1117 PHP_RINIT_FUNCTION(mbstring)
1118 {
1119 MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
1120 MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
1121 MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
1122 MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);
1123
1124 MBSTRG(illegalchars) = 0;
1125
1126 php_mb_populate_current_detect_order_list();
1127
1128 #ifdef HAVE_MBREGEX
1129 PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1130 #endif
1131 zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));
1132
1133 return SUCCESS;
1134 }
1135 /* }}} */
1136
1137 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1138 PHP_RSHUTDOWN_FUNCTION(mbstring)
1139 {
1140 if (MBSTRG(current_detect_order_list) != NULL) {
1141 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1142 MBSTRG(current_detect_order_list) = NULL;
1143 MBSTRG(current_detect_order_list_size) = 0;
1144 }
1145 if (MBSTRG(outconv) != NULL) {
1146 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1147 mbfl_buffer_converter_delete(MBSTRG(outconv));
1148 MBSTRG(outconv) = NULL;
1149 }
1150
1151 /* clear http input identification. */
1152 MBSTRG(http_input_identify) = NULL;
1153 MBSTRG(http_input_identify_post) = NULL;
1154 MBSTRG(http_input_identify_get) = NULL;
1155 MBSTRG(http_input_identify_cookie) = NULL;
1156 MBSTRG(http_input_identify_string) = NULL;
1157
1158 if (MBSTRG(last_used_encoding_name)) {
1159 zend_string_release(MBSTRG(last_used_encoding_name));
1160 MBSTRG(last_used_encoding_name) = NULL;
1161 }
1162
1163 MBSTRG(internal_encoding_set) = 0;
1164 MBSTRG(http_output_set) = 0;
1165 MBSTRG(http_input_set) = 0;
1166
1167 #ifdef HAVE_MBREGEX
1168 PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1169 #endif
1170
1171 return SUCCESS;
1172 }
1173 /* }}} */
1174
1175 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1176 PHP_MINFO_FUNCTION(mbstring)
1177 {
1178 php_info_print_table_start();
1179 php_info_print_table_row(2, "Multibyte Support", "enabled");
1180 php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1181 php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1182 {
1183 char tmp[256];
1184 snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1185 php_info_print_table_row(2, "libmbfl version", tmp);
1186 }
1187 php_info_print_table_end();
1188
1189 php_info_print_table_start();
1190 php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1191 php_info_print_table_end();
1192
1193 #ifdef HAVE_MBREGEX
1194 PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1195 #endif
1196
1197 DISPLAY_INI_ENTRIES();
1198 }
1199 /* }}} */
1200
1201 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1202 PHP_FUNCTION(mb_language)
1203 {
1204 zend_string *name = NULL;
1205
1206 ZEND_PARSE_PARAMETERS_START(0, 1)
1207 Z_PARAM_OPTIONAL
1208 Z_PARAM_STR_OR_NULL(name)
1209 ZEND_PARSE_PARAMETERS_END();
1210
1211 if (name == NULL) {
1212 RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1213 } else {
1214 zend_string *ini_name = zend_string_init("mbstring.language", sizeof("mbstring.language") - 1, 0);
1215 if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1216 zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1217 zend_string_release_ex(ini_name, 0);
1218 RETURN_THROWS();
1219 }
1220 // TODO Make return void
1221 RETVAL_TRUE;
1222 zend_string_release_ex(ini_name, 0);
1223 }
1224 }
1225 /* }}} */
1226
1227 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1228 PHP_FUNCTION(mb_internal_encoding)
1229 {
1230 char *name = NULL;
1231 size_t name_len;
1232 const mbfl_encoding *encoding;
1233
1234 ZEND_PARSE_PARAMETERS_START(0, 1)
1235 Z_PARAM_OPTIONAL
1236 Z_PARAM_STRING_OR_NULL(name, name_len)
1237 ZEND_PARSE_PARAMETERS_END();
1238
1239 if (name == NULL) {
1240 ZEND_ASSERT(MBSTRG(current_internal_encoding));
1241 RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1242 } else {
1243 encoding = mbfl_name2encoding(name);
1244 if (!encoding) {
1245 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1246 RETURN_THROWS();
1247 } else {
1248 MBSTRG(current_internal_encoding) = encoding;
1249 MBSTRG(internal_encoding_set) = 1;
1250 /* TODO Return old encoding */
1251 RETURN_TRUE;
1252 }
1253 }
1254 }
1255 /* }}} */
1256
1257 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1258 PHP_FUNCTION(mb_http_input)
1259 {
1260 char *type = NULL;
1261 size_t type_len = 0, n;
1262 const mbfl_encoding **entry;
1263 const mbfl_encoding *encoding;
1264
1265 ZEND_PARSE_PARAMETERS_START(0, 1)
1266 Z_PARAM_OPTIONAL
1267 Z_PARAM_STRING_OR_NULL(type, type_len)
1268 ZEND_PARSE_PARAMETERS_END();
1269
1270 if (type == NULL) {
1271 encoding = MBSTRG(http_input_identify);
1272 } else {
1273 switch (*type) {
1274 case 'G':
1275 case 'g':
1276 encoding = MBSTRG(http_input_identify_get);
1277 break;
1278 case 'P':
1279 case 'p':
1280 encoding = MBSTRG(http_input_identify_post);
1281 break;
1282 case 'C':
1283 case 'c':
1284 encoding = MBSTRG(http_input_identify_cookie);
1285 break;
1286 case 'S':
1287 case 's':
1288 encoding = MBSTRG(http_input_identify_string);
1289 break;
1290 case 'I':
1291 case 'i':
1292 entry = MBSTRG(http_input_list);
1293 n = MBSTRG(http_input_list_size);
1294 array_init(return_value);
1295 for (size_t i = 0; i < n; i++, entry++) {
1296 add_next_index_string(return_value, (*entry)->name);
1297 }
1298 return;
1299 case 'L':
1300 case 'l':
1301 entry = MBSTRG(http_input_list);
1302 n = MBSTRG(http_input_list_size);
1303 if (n == 0) {
1304 RETURN_FALSE;
1305 }
1306 // TODO Use smart_str instead.
1307 mbfl_string result;
1308 mbfl_memory_device device;
1309 mbfl_memory_device_init(&device, n * 12, 0);
1310 for (size_t i = 0; i < n; i++, entry++) {
1311 mbfl_memory_device_strcat(&device, (*entry)->name);
1312 mbfl_memory_device_output(',', &device);
1313 }
1314 mbfl_memory_device_unput(&device); /* Remove trailing comma */
1315 mbfl_memory_device_result(&device, &result);
1316 RETVAL_STRINGL((const char*)result.val, result.len);
1317 mbfl_string_clear(&result);
1318 return;
1319 default:
1320 zend_argument_value_error(1,
1321 "must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1322 RETURN_THROWS();
1323 }
1324 }
1325
1326 if (encoding) {
1327 RETURN_STRING(encoding->name);
1328 } else {
1329 RETURN_FALSE;
1330 }
1331 }
1332 /* }}} */
1333
1334 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1335 PHP_FUNCTION(mb_http_output)
1336 {
1337 char *name = NULL;
1338 size_t name_len;
1339
1340 ZEND_PARSE_PARAMETERS_START(0, 1)
1341 Z_PARAM_OPTIONAL
1342 Z_PARAM_STRING_OR_NULL(name, name_len)
1343 ZEND_PARSE_PARAMETERS_END();
1344
1345 if (name == NULL) {
1346 ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1347 RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1348 } else {
1349 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name);
1350 if (!encoding) {
1351 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1352 RETURN_THROWS();
1353 } else {
1354 MBSTRG(http_output_set) = 1;
1355 MBSTRG(current_http_output_encoding) = encoding;
1356 /* TODO Return previous encoding? */
1357 RETURN_TRUE;
1358 }
1359 }
1360 }
1361 /* }}} */
1362
/* {{{ Sets the current detect_order or returns the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1364 PHP_FUNCTION(mb_detect_order)
1365 {
1366 zend_string *order_str = NULL;
1367 HashTable *order_ht = NULL;
1368
1369 ZEND_PARSE_PARAMETERS_START(0, 1)
1370 Z_PARAM_OPTIONAL
1371 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1372 ZEND_PARSE_PARAMETERS_END();
1373
1374 if (!order_str && !order_ht) {
1375 size_t n = MBSTRG(current_detect_order_list_size);
1376 const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1377 array_init(return_value);
1378 for (size_t i = 0; i < n; i++) {
1379 add_next_index_string(return_value, (*entry)->name);
1380 entry++;
1381 }
1382 } else {
1383 const mbfl_encoding **list;
1384 size_t size;
1385 if (order_ht) {
1386 if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1387 RETURN_THROWS();
1388 }
1389 } else {
1390 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1, /* allow_pass_encoding */ 0)) {
1391 RETURN_THROWS();
1392 }
1393 }
1394
1395 if (size == 0) {
1396 efree(ZEND_VOIDP(list));
1397 zend_argument_value_error(1, "must specify at least one encoding");
1398 RETURN_THROWS();
1399 }
1400
1401 if (MBSTRG(current_detect_order_list)) {
1402 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1403 }
1404 MBSTRG(current_detect_order_list) = list;
1405 MBSTRG(current_detect_order_list_size) = size;
1406 RETURN_TRUE;
1407 }
1408 }
1409 /* }}} */
1410
/* Returns 1 if cp may be used as a substitute character, 0 otherwise.
 *
 * A value is rejected when it lies outside the Unicode range
 * (0 .. 0x10FFFF) or when it is a surrogate code point (0xD800 .. 0xDFFF);
 * surrogates are never valid on their own and only a single substitute
 * character is allowed.
 *
 * As we do not know the target encoding of the conversion operation that is
 * going to use the substitution character, we cannot check whether the
 * codepoint is actually mapped in the given encoding at this point. Thus
 * everything else has to be accepted. */
static inline int php_mb_check_code_point(zend_long cp)
{
	const bool in_unicode_range = (cp >= 0 && cp < 0x110000);
	const bool is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);

	return (in_unicode_range && !is_surrogate) ? 1 : 0;
}
1429
1430 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1431 PHP_FUNCTION(mb_substitute_character)
1432 {
1433 zend_string *substitute_character = NULL;
1434 zend_long substitute_codepoint;
1435 bool substitute_is_null = 1;
1436
1437 ZEND_PARSE_PARAMETERS_START(0, 1)
1438 Z_PARAM_OPTIONAL
1439 Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1440 ZEND_PARSE_PARAMETERS_END();
1441
1442 if (substitute_is_null) {
1443 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1444 RETURN_STRING("none");
1445 }
1446 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1447 RETURN_STRING("long");
1448 }
1449 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1450 RETURN_STRING("entity");
1451 }
1452 RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1453 }
1454
1455 if (substitute_character != NULL) {
1456 if (zend_string_equals_literal_ci(substitute_character, "none")) {
1457 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1458 RETURN_TRUE;
1459 }
1460 if (zend_string_equals_literal_ci(substitute_character, "long")) {
1461 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1462 RETURN_TRUE;
1463 }
1464 if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1465 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1466 RETURN_TRUE;
1467 }
1468 /* Invalid string value */
1469 zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1470 RETURN_THROWS();
1471 }
1472 /* Integer codepoint passed */
1473 if (!php_mb_check_code_point(substitute_codepoint)) {
1474 zend_argument_value_error(1, "is not a valid codepoint");
1475 RETURN_THROWS();
1476 }
1477
1478 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1479 MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1480 RETURN_TRUE;
1481 }
1482 /* }}} */
1483
1484 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1485 PHP_FUNCTION(mb_preferred_mime_name)
1486 {
1487 enum mbfl_no_encoding no_encoding;
1488 char *name = NULL;
1489 size_t name_len;
1490
1491 ZEND_PARSE_PARAMETERS_START(1, 1)
1492 Z_PARAM_STRING(name, name_len)
1493 ZEND_PARSE_PARAMETERS_END();
1494
1495 no_encoding = mbfl_name2no_encoding(name);
1496 if (no_encoding == mbfl_no_encoding_invalid) {
1497 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1498 RETURN_THROWS();
1499 }
1500
1501 const char *preferred_name = mbfl_no2preferred_mime_name(no_encoding);
1502 if (preferred_name == NULL || *preferred_name == '\0') {
1503 php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1504 RETVAL_FALSE;
1505 } else {
1506 RETVAL_STRING((char *)preferred_name);
1507 }
1508 }
1509 /* }}} */
1510
1511 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1512 PHP_FUNCTION(mb_parse_str)
1513 {
1514 zval *track_vars_array = NULL;
1515 char *encstr;
1516 size_t encstr_len;
1517 php_mb_encoding_handler_info_t info;
1518 const mbfl_encoding *detected;
1519
1520 ZEND_PARSE_PARAMETERS_START(2, 2)
1521 Z_PARAM_STRING(encstr, encstr_len)
1522 Z_PARAM_ZVAL(track_vars_array)
1523 ZEND_PARSE_PARAMETERS_END();
1524
1525 track_vars_array = zend_try_array_init(track_vars_array);
1526 if (!track_vars_array) {
1527 RETURN_THROWS();
1528 }
1529
1530 encstr = estrndup(encstr, encstr_len);
1531
1532 info.data_type = PARSE_STRING;
1533 info.separator = PG(arg_separator).input;
1534 info.report_errors = true;
1535 info.to_encoding = MBSTRG(current_internal_encoding);
1536 info.from_encodings = MBSTRG(http_input_list);
1537 info.num_from_encodings = MBSTRG(http_input_list_size);
1538
1539 detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1540
1541 MBSTRG(http_input_identify) = detected;
1542
1543 RETVAL_BOOL(detected);
1544
1545 if (encstr != NULL) efree(encstr);
1546 }
1547 /* }}} */
1548
1549 /* {{{ Returns string in output buffer converted to the http_output encoding */
PHP_FUNCTION(mb_output_handler)1550 PHP_FUNCTION(mb_output_handler)
1551 {
1552 char *arg_string;
1553 size_t arg_string_len;
1554 zend_long arg_status;
1555 mbfl_string string, result;
1556 const char *charset;
1557 char *p;
1558 const mbfl_encoding *encoding;
1559 int last_feed;
1560 size_t len;
1561 unsigned char send_text_mimetype = 0;
1562 char *s, *mimetype = NULL;
1563
1564 ZEND_PARSE_PARAMETERS_START(2, 2)
1565 Z_PARAM_STRING(arg_string, arg_string_len)
1566 Z_PARAM_LONG(arg_status)
1567 ZEND_PARSE_PARAMETERS_END();
1568
1569 encoding = MBSTRG(current_http_output_encoding);
1570
1571 /* start phase only */
1572 if ((arg_status & PHP_OUTPUT_HANDLER_START) != 0) {
1573 /* delete the converter just in case. */
1574 if (MBSTRG(outconv)) {
1575 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1576 mbfl_buffer_converter_delete(MBSTRG(outconv));
1577 MBSTRG(outconv) = NULL;
1578 }
1579
1580 if (encoding == &mbfl_encoding_pass) {
1581 RETURN_STRINGL(arg_string, arg_string_len);
1582 }
1583
1584 /* analyze mime type */
1585 if (SG(sapi_headers).mimetype &&
1586 _php_mb_match_regex(
1587 MBSTRG(http_output_conv_mimetypes),
1588 SG(sapi_headers).mimetype,
1589 strlen(SG(sapi_headers).mimetype))) {
1590 if ((s = strchr(SG(sapi_headers).mimetype,';')) == NULL) {
1591 mimetype = estrdup(SG(sapi_headers).mimetype);
1592 } else {
1593 mimetype = estrndup(SG(sapi_headers).mimetype,s-SG(sapi_headers).mimetype);
1594 }
1595 send_text_mimetype = 1;
1596 } else if (SG(sapi_headers).send_default_content_type) {
1597 mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1598 }
1599
1600 /* if content-type is not yet set, set it and activate the converter */
1601 if (SG(sapi_headers).send_default_content_type || send_text_mimetype) {
1602 charset = encoding->mime_name;
1603 if (charset) {
1604 len = spprintf( &p, 0, "Content-Type: %s; charset=%s", mimetype, charset );
1605 if (sapi_add_header(p, len, 0) != FAILURE) {
1606 SG(sapi_headers).send_default_content_type = 0;
1607 }
1608 }
1609 /* activate the converter */
1610 MBSTRG(outconv) = mbfl_buffer_converter_new(MBSTRG(current_internal_encoding), encoding, 0);
1611 if (send_text_mimetype){
1612 efree(mimetype);
1613 }
1614 }
1615 }
1616
1617 /* just return if the converter is not activated. */
1618 if (MBSTRG(outconv) == NULL) {
1619 RETURN_STRINGL(arg_string, arg_string_len);
1620 }
1621
1622 /* flag */
1623 last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1624 /* mode */
1625 mbfl_buffer_converter_illegal_mode(MBSTRG(outconv), MBSTRG(current_filter_illegal_mode));
1626 mbfl_buffer_converter_illegal_substchar(MBSTRG(outconv), MBSTRG(current_filter_illegal_substchar));
1627
1628 /* feed the string */
1629 mbfl_string_init(&string);
1630 /* these are not needed. convd has encoding info.
1631 string.encoding = MBSTRG(current_internal_encoding);
1632 */
1633 string.val = (unsigned char *)arg_string;
1634 string.len = arg_string_len;
1635
1636 mbfl_buffer_converter_feed(MBSTRG(outconv), &string);
1637 if (last_feed) {
1638 mbfl_buffer_converter_flush(MBSTRG(outconv));
1639 }
1640 /* get the converter output, and return it */
1641 mbfl_buffer_converter_result(MBSTRG(outconv), &result);
1642
1643 // TODO: avoid reallocation ???
1644 RETVAL_STRINGL((char *)result.val, result.len); /* the string is already strdup()'ed */
1645 efree(result.val);
1646
1647 /* delete the converter if it is the last feed. */
1648 if (last_feed) {
1649 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1650 mbfl_buffer_converter_delete(MBSTRG(outconv));
1651 MBSTRG(outconv) = NULL;
1652 }
1653 }
1654 /* }}} */
1655
1656 /* {{{ Convert a multibyte string to an array. If split_length is specified,
1657 break the string down into chunks each split_length characters long. */
1658
1659 /* structure to pass split params to the callback */
1660 struct mbfl_split_params {
1661 zval *return_value; /* php function return value structure pointer */
1662 mbfl_string *result_string; /* string to store result chunk */
1663 size_t mb_chunk_length; /* actual chunk length in chars */
1664 size_t split_length; /* split length in chars */
1665 mbfl_convert_filter *next_filter; /* widechar to encoding converter */
1666 };
1667
1668 /* callback function to fill split array */
mbfl_split_output(int c,void * data)1669 static int mbfl_split_output(int c, void *data)
1670 {
1671 struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */
1672
1673 (*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */
1674
1675 if (params->split_length == ++params->mb_chunk_length) { /* if current chunk size reached defined chunk size or last char reached */
1676 mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */
1677 mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */
1678 mbfl_string *chunk = params->result_string;
1679 mbfl_memory_device_result(device, chunk); /* make chunk */
1680 add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */
1681 efree(chunk->val);
1682 params->mb_chunk_length = 0; /* reset mb_chunk size */
1683 }
1684
1685 return 0;
1686 }
1687
PHP_FUNCTION(mb_str_split)1688 PHP_FUNCTION(mb_str_split)
1689 {
1690 zend_string *str, *encoding = NULL;
1691 size_t mb_len, chunks, chunk_len;
1692 const char *p, *last; /* pointer for the string cursor and last string char */
1693 mbfl_string string, result_string;
1694 const mbfl_encoding *mbfl_encoding;
1695 zend_long split_length = 1;
1696
1697 ZEND_PARSE_PARAMETERS_START(1, 3)
1698 Z_PARAM_STR(str)
1699 Z_PARAM_OPTIONAL
1700 Z_PARAM_LONG(split_length)
1701 Z_PARAM_STR_OR_NULL(encoding)
1702 ZEND_PARSE_PARAMETERS_END();
1703
1704 if (split_length <= 0) {
1705 zend_argument_value_error(2, "must be greater than 0");
1706 RETURN_THROWS();
1707 }
1708
1709 /* fill mbfl_string structure */
1710 string.val = (unsigned char *) ZSTR_VAL(str);
1711 string.len = ZSTR_LEN(str);
1712 string.encoding = php_mb_get_encoding(encoding, 3);
1713 if (!string.encoding) {
1714 RETURN_THROWS();
1715 }
1716
1717 if (ZSTR_LEN(str) == 0) {
1718 RETURN_EMPTY_ARRAY();
1719 }
1720
1721 p = ZSTR_VAL(str); /* string cursor pointer */
1722 last = ZSTR_VAL(str) + ZSTR_LEN(str); /* last string char pointer */
1723
1724 mbfl_encoding = string.encoding;
1725
1726 /* first scenario: 1,2,4-bytes fixed width encodings (head part) */
1727 if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
1728 mb_len = string.len;
1729 chunk_len = (size_t)split_length; /* chunk length in bytes */
1730 } else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS2) { /* 2 bytes */
1731 mb_len = string.len / 2;
1732 chunk_len = split_length * 2;
1733 } else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS4) { /* 4 bytes */
1734 mb_len = string.len / 4;
1735 chunk_len = split_length * 4;
1736 } else if (mbfl_encoding->mblen_table != NULL) {
1737 /* second scenario: variable width encodings with length table */
1738 char unsigned const *mbtab = mbfl_encoding->mblen_table;
1739
1740 /* assume that we have 1-bytes characters */
1741 array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1742
1743 while (p < last) { /* split cycle work until the cursor has reached the last byte */
1744 char const *chunk_p = p; /* chunk first byte pointer */
1745 chunk_len = 0; /* chunk length in bytes */
1746 zend_long char_count;
1747
1748 for (char_count = 0; char_count < split_length && p < last; ++char_count) {
1749 char unsigned const m = mbtab[*(const unsigned char *)p]; /* single character length table */
1750 chunk_len += m;
1751 p += m;
1752 }
1753 if (p >= last) chunk_len -= p - last; /* check if chunk is in bounds */
1754 add_next_index_stringl(return_value, chunk_p, chunk_len);
1755 }
1756 return;
1757 } else {
1758 /* third scenario: other multibyte encodings */
1759 mbfl_convert_filter *filter, *decoder;
1760
1761 /* assume that we have 1-bytes characters */
1762 array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1763
1764 /* decoder filter to decode wchar to encoding */
1765 mbfl_memory_device device;
1766 mbfl_memory_device_init(&device, split_length + 1, 0);
1767
1768 decoder = mbfl_convert_filter_new(
1769 &mbfl_encoding_wchar,
1770 string.encoding,
1771 mbfl_memory_device_output,
1772 NULL,
1773 &device);
1774 /* assert that nothing is wrong with the decoder */
1775 ZEND_ASSERT(decoder != NULL);
1776
1777 /* wchar filter */
1778 mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */
1779 struct mbfl_split_params params = { /* init callback function params structure */
1780 .return_value = return_value,
1781 .result_string = &result_string,
1782 .mb_chunk_length = 0,
1783 .split_length = (size_t)split_length,
1784 .next_filter = decoder,
1785 };
1786
1787 filter = mbfl_convert_filter_new(
1788 string.encoding,
1789 &mbfl_encoding_wchar,
1790 mbfl_split_output,
1791 NULL,
1792 ¶ms);
1793 /* assert that nothing is wrong with the filter */
1794 ZEND_ASSERT(filter != NULL);
1795
1796 while (p < last - 1) { /* cycle each byte except last with callback function */
1797 (*filter->filter_function)(*p++, filter);
1798 }
1799 params.mb_chunk_length = split_length - 1; /* force to finish current chunk */
1800 (*filter->filter_function)(*p++, filter); /* process last char */
1801
1802 mbfl_convert_filter_delete(decoder);
1803 mbfl_convert_filter_delete(filter);
1804 mbfl_memory_device_clear(&device);
1805 return;
1806 }
1807
1808 /* first scenario: 1,2,4-bytes fixed width encodings (tail part) */
1809 chunks = (mb_len + split_length - 1) / split_length; /* (round up idiom) */
1810 array_init_size(return_value, chunks);
1811 if (chunks != 0) {
1812 zend_long i;
1813
1814 for (i = 0; i < chunks - 1; p += chunk_len, ++i) {
1815 add_next_index_stringl(return_value, p, chunk_len);
1816 }
1817 add_next_index_stringl(return_value, p, last - p);
1818 }
1819 }
1820 /* }}} */
1821
/* Count the characters in `string` when its bytes are interpreted as `encoding`.
 *
 * Strategy, cheapest first:
 *  - fixed-width encodings (1, 2 or 4 bytes per character): divide the byte length
 *  - encodings with a length table: hop from one leading byte to the next
 *  - anything else: decode to wchars in batches and add up the batch sizes
 */
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
{
	const size_t byte_len = ZSTR_LEN(string);

	if (encoding->flag & MBFL_ENCTYPE_SBCS) {
		return byte_len;
	}
	if (encoding->flag & MBFL_ENCTYPE_WCS2) {
		return byte_len / 2;
	}
	if (encoding->flag & MBFL_ENCTYPE_WCS4) {
		return byte_len / 4;
	}

	size_t count = 0;

	if (encoding->mblen_table) {
		/* The table is indexed by the first byte of a character and yields the
		 * number of bytes to advance to reach the next character */
		const unsigned char *lengths = encoding->mblen_table;
		const unsigned char *cursor = (const unsigned char*)ZSTR_VAL(string);
		const unsigned char *end = cursor + byte_len;

		for (; cursor < end; cursor += lengths[*cursor]) {
			count++;
		}
		return count;
	}

	/* Generic path: to_wchar() consumes input and reports how many wchars it
	 * produced (at most the 128 slots offered to it) */
	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(string);
	size_t in_len = byte_len;
	unsigned int state = 0;

	while (in_len) {
		count += encoding->to_wchar(&in, &in_len, wchar_buf, 128, &state);
	}

	return count;
}
1852
1853 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1854 PHP_FUNCTION(mb_strlen)
1855 {
1856 zend_string *string, *enc_name = NULL;
1857
1858 ZEND_PARSE_PARAMETERS_START(1, 2)
1859 Z_PARAM_STR(string)
1860 Z_PARAM_OPTIONAL
1861 Z_PARAM_STR_OR_NULL(enc_name)
1862 ZEND_PARSE_PARAMETERS_END();
1863
1864 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1865 if (!enc) {
1866 RETURN_THROWS();
1867 }
1868
1869 RETVAL_LONG(mb_get_strlen(string, enc));
1870 }
1871 /* }}} */
1872
handle_strpos_error(size_t error)1873 static void handle_strpos_error(size_t error) {
1874 switch (error) {
1875 case MBFL_ERROR_NOT_FOUND:
1876 break;
1877 case MBFL_ERROR_ENCODING:
1878 php_error_docref(NULL, E_WARNING, "Conversion error");
1879 break;
1880 case MBFL_ERROR_OFFSET:
1881 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1882 break;
1883 default:
1884 zend_value_error("mb_strpos(): Unknown error");
1885 break;
1886 }
1887 }
1888
1889 /* {{{ Find position of first occurrence of a string within another */
PHP_FUNCTION(mb_strpos)1890 PHP_FUNCTION(mb_strpos)
1891 {
1892 int reverse = 0;
1893 zend_long offset = 0;
1894 char *haystack_val, *needle_val;
1895 mbfl_string haystack, needle;
1896 zend_string *enc_name = NULL;
1897
1898 ZEND_PARSE_PARAMETERS_START(2, 4)
1899 Z_PARAM_STRING(haystack_val, haystack.len)
1900 Z_PARAM_STRING(needle_val, needle.len)
1901 Z_PARAM_OPTIONAL
1902 Z_PARAM_LONG(offset)
1903 Z_PARAM_STR_OR_NULL(enc_name)
1904 ZEND_PARSE_PARAMETERS_END();
1905
1906 haystack.val = (unsigned char*)haystack_val;
1907 needle.val = (unsigned char*)needle_val;
1908
1909 haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 4);
1910 if (!haystack.encoding) {
1911 RETURN_THROWS();
1912 }
1913
1914 size_t n = mbfl_strpos(&haystack, &needle, offset, reverse);
1915 if (!mbfl_is_error(n)) {
1916 RETVAL_LONG(n);
1917 } else {
1918 handle_strpos_error(n);
1919 RETVAL_FALSE;
1920 }
1921 }
1922 /* }}} */
1923
1924 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)1925 PHP_FUNCTION(mb_strrpos)
1926 {
1927 mbfl_string haystack, needle;
1928 char *haystack_val, *needle_val;
1929 zend_string *enc_name = NULL;
1930 zend_long offset = 0;
1931
1932 ZEND_PARSE_PARAMETERS_START(2, 4)
1933 Z_PARAM_STRING(haystack_val, haystack.len)
1934 Z_PARAM_STRING(needle_val, needle.len)
1935 Z_PARAM_OPTIONAL
1936 Z_PARAM_LONG(offset)
1937 Z_PARAM_STR_OR_NULL(enc_name)
1938 ZEND_PARSE_PARAMETERS_END();
1939
1940 haystack.val = (unsigned char*)haystack_val;
1941 needle.val = (unsigned char*)needle_val;
1942
1943 haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 4);
1944 if (!haystack.encoding) {
1945 RETURN_THROWS();
1946 }
1947
1948 size_t n = mbfl_strpos(&haystack, &needle, offset, 1);
1949 if (!mbfl_is_error(n)) {
1950 RETVAL_LONG(n);
1951 } else {
1952 handle_strpos_error(n);
1953 RETVAL_FALSE;
1954 }
1955 }
1956 /* }}} */
1957
1958 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)1959 PHP_FUNCTION(mb_stripos)
1960 {
1961 zend_long offset = 0;
1962 mbfl_string haystack, needle;
1963 char *haystack_val, *needle_val;
1964 zend_string *from_encoding = NULL;
1965
1966 ZEND_PARSE_PARAMETERS_START(2, 4)
1967 Z_PARAM_STRING(haystack_val, haystack.len)
1968 Z_PARAM_STRING(needle_val, needle.len)
1969 Z_PARAM_OPTIONAL
1970 Z_PARAM_LONG(offset)
1971 Z_PARAM_STR_OR_NULL(from_encoding)
1972 ZEND_PARSE_PARAMETERS_END();
1973
1974 haystack.val = (unsigned char*)haystack_val;
1975 needle.val = (unsigned char*)needle_val;
1976
1977 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
1978 if (!enc) {
1979 RETURN_THROWS();
1980 }
1981
1982 size_t n = php_mb_stripos(0, (char *)haystack.val, haystack.len, (char *)needle.val, needle.len, offset, enc);
1983
1984 if (!mbfl_is_error(n)) {
1985 RETVAL_LONG(n);
1986 } else {
1987 handle_strpos_error(n);
1988 RETVAL_FALSE;
1989 }
1990 }
1991 /* }}} */
1992
1993 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)1994 PHP_FUNCTION(mb_strripos)
1995 {
1996 zend_long offset = 0;
1997 mbfl_string haystack, needle;
1998 char *haystack_val, *needle_val;
1999 zend_string *from_encoding = NULL;
2000
2001 ZEND_PARSE_PARAMETERS_START(2, 4)
2002 Z_PARAM_STRING(haystack_val, haystack.len)
2003 Z_PARAM_STRING(needle_val, needle.len)
2004 Z_PARAM_OPTIONAL
2005 Z_PARAM_LONG(offset)
2006 Z_PARAM_STR_OR_NULL(from_encoding)
2007 ZEND_PARSE_PARAMETERS_END();
2008
2009 haystack.val = (unsigned char*)haystack_val;
2010 needle.val = (unsigned char*)needle_val;
2011
2012 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2013 if (!enc) {
2014 RETURN_THROWS();
2015 }
2016
2017 size_t n = php_mb_stripos(1, (char *)haystack.val, haystack.len, (char *)needle.val, needle.len, offset, enc);
2018
2019 if (!mbfl_is_error(n)) {
2020 RETVAL_LONG(n);
2021 } else {
2022 handle_strpos_error(n);
2023 RETVAL_FALSE;
2024 }
2025 }
2026 /* }}} */
2027
2028 #define MB_STRSTR 1
2029 #define MB_STRRCHR 2
2030 #define MB_STRISTR 3
2031 #define MB_STRRICHR 4
2032 /* {{{ php_mb_strstr_variants */
php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS,unsigned int variant)2033 static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
2034 {
2035 int reverse_mode = 0;
2036 size_t n;
2037 char *haystack_val, *needle_val;
2038 mbfl_string haystack, needle, result, *ret = NULL;
2039 zend_string *encoding_name = NULL;
2040 bool part = 0;
2041
2042 ZEND_PARSE_PARAMETERS_START(2, 4)
2043 Z_PARAM_STRING(haystack_val, haystack.len)
2044 Z_PARAM_STRING(needle_val, needle.len)
2045 Z_PARAM_OPTIONAL
2046 Z_PARAM_BOOL(part)
2047 Z_PARAM_STR_OR_NULL(encoding_name)
2048 ZEND_PARSE_PARAMETERS_END();
2049
2050 haystack.val = (unsigned char*)haystack_val;
2051 needle.val = (unsigned char*)needle_val;
2052 haystack.encoding = needle.encoding = php_mb_get_encoding(encoding_name, 4);
2053 if (!haystack.encoding) {
2054 RETURN_THROWS();
2055 }
2056
2057 if (variant == MB_STRRCHR || variant == MB_STRRICHR) { reverse_mode = 1; }
2058
2059 if (variant == MB_STRISTR || variant == MB_STRRICHR) {
2060 n = php_mb_stripos(reverse_mode, (char *)haystack.val, haystack.len, (char *)needle.val,
2061 needle.len, 0, needle.encoding);
2062 } else {
2063 n = mbfl_strpos(&haystack, &needle, 0, reverse_mode);
2064 }
2065
2066 if (!mbfl_is_error(n)) {
2067 if (part) {
2068 ret = mbfl_substr(&haystack, &result, 0, n);
2069 ZEND_ASSERT(ret != NULL);
2070 // TODO: avoid reallocation ???
2071 RETVAL_STRINGL((char *)ret->val, ret->len);
2072 efree(ret->val);
2073 } else {
2074 ret = mbfl_substr(&haystack, &result, n, MBFL_SUBSTR_UNTIL_END);
2075 ZEND_ASSERT(ret != NULL);
2076 // TODO: avoid reallocation ???
2077 RETVAL_STRINGL((char *)ret->val, ret->len);
2078 efree(ret->val);
2079 }
2080 } else {
2081 // FIXME use handle_strpos_error(n)
2082 RETVAL_FALSE;
2083 }
2084 }
2085
/* {{{ Finds first occurrence of a string within another
 * Thin wrapper: forwards its arguments to php_mb_strstr_variants() with the
 * MB_STRSTR variant (forward search through mbfl_strpos()). */
PHP_FUNCTION(mb_strstr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
}
/* }}} */
2092
/* {{{ Finds the last occurrence of a character in a string within another
 * Thin wrapper: forwards its arguments to php_mb_strstr_variants() with the
 * MB_STRRCHR variant (reverse search through mbfl_strpos()). */
PHP_FUNCTION(mb_strrchr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
}
/* }}} */
2099
/* {{{ Finds first occurrence of a string within another, case insensitive
 * Thin wrapper: forwards its arguments to php_mb_strstr_variants() with the
 * MB_STRISTR variant (forward search through php_mb_stripos()). */
PHP_FUNCTION(mb_stristr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
}
/* }}} */
2106
/* {{{ Finds the last occurrence of a character in a string within another, case insensitive
 * Thin wrapper: forwards its arguments to php_mb_strstr_variants() with the
 * MB_STRRICHR variant (reverse search through php_mb_stripos()). */
PHP_FUNCTION(mb_strrichr)
{
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
}
/* }}} */
2113
2114 #undef MB_STRSTR
2115 #undef MB_STRRCHR
2116 #undef MB_STRISTR
2117 #undef MB_STRRICHR
2118
2119 /* {{{ Count the number of substring occurrences */
PHP_FUNCTION(mb_substr_count)2120 PHP_FUNCTION(mb_substr_count)
2121 {
2122 mbfl_string haystack, needle;
2123 char *haystack_val, *needle_val;
2124 zend_string *enc_name = NULL;
2125
2126 ZEND_PARSE_PARAMETERS_START(2, 3)
2127 Z_PARAM_STRING(haystack_val, haystack.len)
2128 Z_PARAM_STRING(needle_val, needle.len)
2129 Z_PARAM_OPTIONAL
2130 Z_PARAM_STR_OR_NULL(enc_name)
2131 ZEND_PARSE_PARAMETERS_END();
2132
2133 haystack.val = (unsigned char*)haystack_val;
2134 needle.val = (unsigned char*)needle_val;
2135
2136 if (needle.len == 0) {
2137 zend_argument_value_error(2, "must not be empty");
2138 RETURN_THROWS();
2139 }
2140
2141 haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 3);
2142 if (!haystack.encoding) {
2143 RETURN_THROWS();
2144 }
2145
2146 size_t n = mbfl_substr_count(&haystack, &needle);
2147 /* An error can only occur if needle is empty,
2148 * an encoding error happens (which should not happen at this stage and is a bug)
2149 * or the haystack is more than sizeof(size_t) bytes
2150 * If one of these things occur this is a bug and should be flagged as such */
2151 ZEND_ASSERT(!mbfl_is_error(n));
2152 RETVAL_LONG(n);
2153 }
2154 /* }}} */
2155
2156 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2157 PHP_FUNCTION(mb_substr)
2158 {
2159 char *str;
2160 zend_string *encoding = NULL;
2161 zend_long from, len;
2162 size_t real_from, real_len;
2163 size_t str_len;
2164 bool len_is_null = 1;
2165 mbfl_string string, result, *ret;
2166
2167 ZEND_PARSE_PARAMETERS_START(2, 4)
2168 Z_PARAM_STRING(str, str_len)
2169 Z_PARAM_LONG(from)
2170 Z_PARAM_OPTIONAL
2171 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2172 Z_PARAM_STR_OR_NULL(encoding)
2173 ZEND_PARSE_PARAMETERS_END();
2174
2175 if (from == ZEND_LONG_MIN) {
2176 zend_argument_value_error(2, "must be between " ZEND_LONG_FMT " and " ZEND_LONG_FMT, (ZEND_LONG_MIN + 1), ZEND_LONG_MAX);
2177 RETURN_THROWS();
2178 }
2179
2180 if (!len_is_null && len == ZEND_LONG_MIN) {
2181 zend_argument_value_error(3, "must be between " ZEND_LONG_FMT " and " ZEND_LONG_FMT, (ZEND_LONG_MIN + 1), ZEND_LONG_MAX);
2182 RETURN_THROWS();
2183 }
2184
2185 string.encoding = php_mb_get_encoding(encoding, 4);
2186 if (!string.encoding) {
2187 RETURN_THROWS();
2188 }
2189
2190 string.val = (unsigned char *)str;
2191 string.len = str_len;
2192
2193 /* measures length */
2194 size_t mblen = 0;
2195 if (from < 0 || (!len_is_null && len < 0)) {
2196 mblen = mbfl_strlen(&string);
2197 }
2198
2199 /* if "from" position is negative, count start position from the end
2200 * of the string
2201 */
2202 if (from >= 0) {
2203 real_from = (size_t) from;
2204 } else if (-from < mblen) {
2205 real_from = mblen + from;
2206 } else {
2207 real_from = 0;
2208 }
2209
2210 /* if "length" position is negative, set it to the length
2211 * needed to stop that many chars from the end of the string
2212 */
2213 if (len_is_null) {
2214 real_len = MBFL_SUBSTR_UNTIL_END;
2215 } else if (len >= 0) {
2216 real_len = (size_t) len;
2217 } else if (real_from < mblen && -len < mblen - real_from) {
2218 real_len = (mblen - real_from) + len;
2219 } else {
2220 real_len = 0;
2221 }
2222
2223 ret = mbfl_substr(&string, &result, real_from, real_len);
2224 ZEND_ASSERT(ret != NULL);
2225
2226 // TODO: avoid reallocation ???
2227 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2228 efree(ret->val);
2229 }
2230 /* }}} */
2231
/* {{{ Returns part of a string
 * Arguments: string, start offset, optional nullable length, optional encoding name.
 * Unlike mb_substr, negative arguments are resolved against string.len, i.e. the
 * byte length delivered by parameter parsing, before mbfl_strcut() is called. */
PHP_FUNCTION(mb_strcut)
{
	zend_string *encoding = NULL;
	char *string_val;
	zend_long from, len;
	bool len_is_null = 1;
	mbfl_string string, result, *ret;

	ZEND_PARSE_PARAMETERS_START(2, 4)
		Z_PARAM_STRING(string_val, string.len)
		Z_PARAM_LONG(from)
		Z_PARAM_OPTIONAL
		Z_PARAM_LONG_OR_NULL(len, len_is_null)
		Z_PARAM_STR_OR_NULL(encoding)
	ZEND_PARSE_PARAMETERS_END();

	string.val = (unsigned char*)string_val;
	string.encoding = php_mb_get_encoding(encoding, 4);
	if (!string.encoding) {
		RETURN_THROWS();
	}

	/* No length given: allow up to the whole string */
	if (len_is_null) {
		len = string.len;
	}

	/* if "from" position is negative, count start position from the end
	 * of the string; a result that is still negative is clamped to 0
	 */
	if (from < 0) {
		from = string.len + from;
		if (from < 0) {
			from = 0;
		}
	}

	/* if "length" position is negative, set it to the length
	 * needed to stop that many bytes from the end of the string;
	 * a result that is still negative is clamped to 0
	 */
	if (len < 0) {
		len = (string.len - from) + len;
		if (len < 0) {
			len = 0;
		}
	}

	/* A start offset beyond the end of the string yields an empty result */
	if (from > string.len) {
		RETURN_EMPTY_STRING();
	}

	ret = mbfl_strcut(&string, &result, from, len);
	ZEND_ASSERT(ret != NULL);

	// TODO: avoid reallocation ???
	RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
	efree(ret->val);
}
/* }}} */
2291
2292 /* Some East Asian characters, when printed at a terminal (or the like), require double
2293 * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2294 static size_t character_width(uint32_t c)
2295 {
2296 if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2297 return 1;
2298 }
2299
2300 /* Do a binary search to see if we fall in any of the fullwidth ranges */
2301 int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2302 while (lo < hi) {
2303 int probe = (lo + hi) / 2;
2304 if (c < mbfl_eaw_table[probe].begin) {
2305 hi = probe;
2306 } else if (c > mbfl_eaw_table[probe].end) {
2307 lo = probe + 1;
2308 } else {
2309 return 2;
2310 }
2311 }
2312
2313 return 1;
2314 }
2315
/* Sum the display widths (1 or 2 units each, see character_width) of all the
 * codepoints which `string` decodes to under `enc`. */
static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
{
	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(string);
	size_t in_len = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total = 0;

	while (in_len) {
		size_t decoded = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(decoded <= 128);

		/* NOTE: 'bad input' marker will be counted as 1 unit of width
		 * If text conversion is performed with an ordinary ASCII character as
		 * the 'replacement character', this will give us the correct display width. */
		for (size_t i = 0; i < decoded; i++) {
			total += character_width(wchar_buf[i]);
		}
	}

	return total;
}
2338
2339 /* Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2340 PHP_FUNCTION(mb_strwidth)
2341 {
2342 zend_string *string, *enc_name = NULL;
2343
2344 ZEND_PARSE_PARAMETERS_START(1, 2)
2345 Z_PARAM_STR(string)
2346 Z_PARAM_OPTIONAL
2347 Z_PARAM_STR_OR_NULL(enc_name)
2348 ZEND_PARSE_PARAMETERS_END();
2349
2350 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2351 if (!enc) {
2352 RETURN_THROWS();
2353 }
2354
2355 RETVAL_LONG(mb_get_strwidth(string, enc));
2356 }
2357
2358 /* Cut 'n' codepoints from beginning of string
2359 * Remove this once mb_substr is implemented using the new conversion filters */
mb_drop_chars(zend_string * input,const mbfl_encoding * enc,size_t n)2360 static zend_string* mb_drop_chars(zend_string *input, const mbfl_encoding *enc, size_t n)
2361 {
2362 if (n >= ZSTR_LEN(input)) {
2363 /* No supported text encoding decodes to more than one codepoint per byte
2364 * So if the number of codepoints to drop >= number of input bytes,
2365 * then definitely the output should be empty
2366 * This also guards `ZSTR_LEN(input) - n` (below) from underflow */
2367 return zend_empty_string;
2368 }
2369
2370 uint32_t wchar_buf[128];
2371 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
2372 size_t in_len = ZSTR_LEN(input);
2373 unsigned int state = 0;
2374
2375 mb_convert_buf buf;
2376 mb_convert_buf_init(&buf, ZSTR_LEN(input) - n, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2377
2378 while (in_len) {
2379 size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2380 ZEND_ASSERT(out_len <= 128);
2381
2382 if (n >= out_len) {
2383 n -= out_len;
2384 } else {
2385 enc->from_wchar(wchar_buf + n, out_len - n, &buf, !in_len);
2386 n = 0;
2387 }
2388 }
2389
2390 return mb_convert_buf_result(&buf);
2391 }
2392
2393 /* Pick 'n' codepoints from beginning of string
2394 * Remove this once mb_substr is implemented using the new conversion filters */
mb_pick_chars(zend_string * input,const mbfl_encoding * enc,size_t n)2395 static zend_string* mb_pick_chars(zend_string *input, const mbfl_encoding *enc, size_t n)
2396 {
2397 uint32_t wchar_buf[128];
2398 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
2399 size_t in_len = ZSTR_LEN(input);
2400 unsigned int state = 0;
2401
2402 mb_convert_buf buf;
2403 mb_convert_buf_init(&buf, n, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2404
2405 while (in_len && n) {
2406 size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2407 ZEND_ASSERT(out_len <= 128);
2408
2409 enc->from_wchar(wchar_buf, MIN(out_len, n), &buf, !in_len || out_len >= n);
2410 n -= MIN(out_len, n);
2411 }
2412
2413 return mb_convert_buf_result(&buf);
2414 }
2415
/* Build the result of mb_strimwidth: skip the first `from` codepoints of `input`,
 * then keep as much of the rest as fits into `width` units of display width.
 *
 * If the remainder fits, it is returned without the trim marker. Otherwise the
 * marker's width is subtracted from `width`, the input is cut to that reduced
 * width, and the bytes of `marker` are appended.
 *
 * The function makes up to two passes over the input:
 *  1. a measuring pass which only determines whether truncation is needed
 *  2. a building pass which re-encodes the part to keep (entered through the
 *     labels below, and only when truncation is needed) */
static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, unsigned int from, int width)
{
	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);
	unsigned int state = 0;
	int remaining_width = width; /* budget used by the measuring pass */
	unsigned int to_skip = from; /* codepoints still to be skipped in the measuring pass */
	size_t out_len = 0;
	bool first_call = true, input_err = false; /* first_call: still on the first decoded batch */
	mb_convert_buf buf;

	/* Pass 1: measure */
	while (in_len) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

		if (out_len <= to_skip) {
			/* The whole batch lies before the starting position */
			to_skip -= out_len;
		} else {
			for (int i = to_skip; i < out_len; i++) {
				uint32_t w = wchar_buf[i];
				/* Remember bad input; it rules out returning `input` unchanged below */
				input_err |= (w == MBFL_BAD_INPUT);
				remaining_width -= character_width(w);
				if (remaining_width < 0) {
					/* We need to truncate string and append trim marker */
					width -= mb_get_strwidth(marker, enc);
					/* 'width' is now the amount we want to take from 'input' */
					if (width <= 0) {
						/* No room for anything but the marker itself */
						return zend_string_copy(marker);
					}
					mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

					if (first_call) {
						/* We can use the buffer of wchars which we have right now;
						 * no need to convert again */
						goto dont_restart_conversion;
					} else {
						goto restart_conversion;
					}
				}
			}
			to_skip = 0;
		}
		first_call = false;
	}

	/* The input string fits in the requested width; we don't need to append the trim marker
	 * However, if the string contains erroneous byte sequences, those should be converted
	 * to error markers */
	if (from == 0 && !input_err) {
		/* This just increments the string's refcount; it doesn't really 'copy' it */
		return zend_string_copy(input);
	}
	return mb_drop_chars(input, enc, from);

	/* The input string is too wide; we need to build a new string which
	 * includes some portion of the input string, with the trim marker
	 * concatenated onto it */
restart_conversion:
	/* Pass 2: rewind to the start of the input and decode it again */
	in = (unsigned char*)ZSTR_VAL(input);
	in_len = ZSTR_LEN(input);
	state = 0;

	while (true) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
		ZEND_ASSERT(out_len <= 128);

dont_restart_conversion:
		/* From here on `from` serves as the count of codepoints still to skip,
		 * and `width` as the budget left for the kept part */
		if (out_len <= from) {
			from -= out_len;
		} else {
			for (int i = from; i < out_len; i++) {
				width -= character_width(wchar_buf[i]);
				if (width < 0) {
					/* Codepoint i does not fit: flush [from, i) as the final chunk */
					enc->from_wchar(wchar_buf + from, i - from, &buf, true);
					goto append_trim_marker;
				}
			}
			/* Pass 1 established that the text is too wide, so the cut
			 * cannot have been missed with the input already exhausted */
			ZEND_ASSERT(in_len > 0);
			enc->from_wchar(wchar_buf + from, out_len - from, &buf, false);
			from = 0;
		}
	}

append_trim_marker:
	/* The marker's bytes are appended as they are, without any conversion */
	if (ZSTR_LEN(marker) > 0) {
		MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
		memcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
		buf.out += ZSTR_LEN(marker);
	}

	return mb_convert_buf_result(&buf);
}
2509
2510 /* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
PHP_FUNCTION(mb_strimwidth)2511 PHP_FUNCTION(mb_strimwidth)
2512 {
2513 zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2514 zend_long from, width;
2515
2516 ZEND_PARSE_PARAMETERS_START(3, 5)
2517 Z_PARAM_STR(str)
2518 Z_PARAM_LONG(from)
2519 Z_PARAM_LONG(width)
2520 Z_PARAM_OPTIONAL
2521 Z_PARAM_STR(trimmarker)
2522 Z_PARAM_STR_OR_NULL(encoding)
2523 ZEND_PARSE_PARAMETERS_END();
2524
2525 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2526 if (!enc) {
2527 RETURN_THROWS();
2528 }
2529
2530 if (from != 0) {
2531 size_t str_len = mb_get_strlen(str, enc);
2532 if (from < 0) {
2533 from += str_len;
2534 }
2535 if (from < 0 || from > str_len) {
2536 zend_argument_value_error(2, "is out of range");
2537 RETURN_THROWS();
2538 }
2539 }
2540
2541 if (width < 0) {
2542 width += mb_get_strwidth(str, enc);
2543
2544 if (from > 0) {
2545 zend_string *trimmed = mb_pick_chars(str, enc, from);
2546 width -= mb_get_strwidth(trimmed, enc);
2547 zend_string_free(trimmed);
2548 }
2549
2550 if (width < 0) {
2551 zend_argument_value_error(3, "is out of range");
2552 RETURN_THROWS();
2553 }
2554 }
2555
2556 RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2557 }
2558
2559
2560 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2561 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2562 {
2563 return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2564 || (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2565 || (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2566 || (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2567 }
2568
2569
2570 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)2571 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
2572 {
2573 return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
2574 }
2575
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2576 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2577 {
2578 unsigned int num_errors = 0;
2579 zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2580 MBSTRG(illegalchars) += num_errors;
2581 return result;
2582 }
2583
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2584 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2585 {
2586 const mbfl_encoding *from_encoding;
2587
2588 /* pre-conversion encoding */
2589 ZEND_ASSERT(num_from_encodings >= 1);
2590 if (num_from_encodings == 1) {
2591 from_encoding = *from_encodings;
2592 } else {
2593 /* auto detect */
2594 mbfl_string string;
2595 mbfl_string_init(&string);
2596 string.val = (unsigned char *)input;
2597 string.len = length;
2598 from_encoding = mbfl_identify_encoding(
2599 &string, from_encodings, num_from_encodings, MBSTRG(strict_detection));
2600 if (!from_encoding) {
2601 php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2602 return NULL;
2603 }
2604 }
2605
2606 return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2607 }
2608
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2609 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2610 {
2611 HashTable *output, *chash;
2612 zend_long idx;
2613 zend_string *key;
2614 zval *entry, entry_tmp;
2615
2616 if (!input) {
2617 return NULL;
2618 }
2619
2620 if (GC_IS_RECURSIVE(input)) {
2621 GC_UNPROTECT_RECURSION(input);
2622 php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2623 return NULL;
2624 }
2625 GC_TRY_PROTECT_RECURSION(input);
2626 output = zend_new_array(zend_hash_num_elements(input));
2627 ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2628 /* convert key */
2629 if (key) {
2630 zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2631 if (!converted_key) {
2632 continue;
2633 }
2634 key = converted_key;
2635 }
2636 /* convert value */
2637 ZEND_ASSERT(entry);
2638 try_again:
2639 switch(Z_TYPE_P(entry)) {
2640 case IS_STRING: {
2641 zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2642 if (!converted_key) {
2643 if (key) {
2644 zend_string_release(key);
2645 }
2646 continue;
2647 }
2648 ZVAL_STR(&entry_tmp, converted_key);
2649 break;
2650 }
2651 case IS_NULL:
2652 case IS_TRUE:
2653 case IS_FALSE:
2654 case IS_LONG:
2655 case IS_DOUBLE:
2656 ZVAL_COPY(&entry_tmp, entry);
2657 break;
2658 case IS_ARRAY:
2659 chash = php_mb_convert_encoding_recursive(
2660 Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2661 if (chash) {
2662 ZVAL_ARR(&entry_tmp, chash);
2663 } else {
2664 ZVAL_EMPTY_ARRAY(&entry_tmp);
2665 }
2666 break;
2667 case IS_REFERENCE:
2668 entry = Z_REFVAL_P(entry);
2669 goto try_again;
2670 case IS_OBJECT:
2671 default:
2672 if (key) {
2673 zend_string_release(key);
2674 }
2675 php_error_docref(NULL, E_WARNING, "Object is not supported");
2676 continue;
2677 }
2678 if (key) {
2679 zend_hash_add(output, key, &entry_tmp);
2680 zend_string_release(key);
2681 } else {
2682 zend_hash_index_add(output, idx, &entry_tmp);
2683 }
2684 } ZEND_HASH_FOREACH_END();
2685 GC_TRY_UNPROTECT_RECURSION(input);
2686
2687 return output;
2688 }
2689 /* }}} */
2690
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2691 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2692 {
2693 /* mbstring supports some 'text encodings' which aren't really text encodings
2694 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2695 * These should never be returned by `mb_detect_encoding`. */
2696 int shift = 0;
2697 for (int i = 0; i < *size; i++) {
2698 const mbfl_encoding *encoding = elist[i];
2699 if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2700 shift++; /* Remove this encoding from the list */
2701 } else if (shift) {
2702 elist[i - shift] = encoding;
2703 }
2704 }
2705 *size -= shift;
2706 }
2707
2708 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2709 PHP_FUNCTION(mb_convert_encoding)
2710 {
2711 zend_string *to_encoding_name;
2712 zend_string *input_str, *from_encodings_str = NULL;
2713 HashTable *input_ht, *from_encodings_ht = NULL;
2714 const mbfl_encoding **from_encodings;
2715 size_t num_from_encodings;
2716 bool free_from_encodings;
2717
2718 ZEND_PARSE_PARAMETERS_START(2, 3)
2719 Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2720 Z_PARAM_STR(to_encoding_name)
2721 Z_PARAM_OPTIONAL
2722 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2723 ZEND_PARSE_PARAMETERS_END();
2724
2725 const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2726 if (!to_encoding) {
2727 RETURN_THROWS();
2728 }
2729
2730 if (from_encodings_ht) {
2731 if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2732 RETURN_THROWS();
2733 }
2734 free_from_encodings = 1;
2735 } else if (from_encodings_str) {
2736 if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2737 &from_encodings, &num_from_encodings,
2738 /* persistent */ 0, /* arg_num */ 3, /* allow_pass_encoding */ 0) == FAILURE) {
2739 RETURN_THROWS();
2740 }
2741 free_from_encodings = 1;
2742 } else {
2743 from_encodings = &MBSTRG(current_internal_encoding);
2744 num_from_encodings = 1;
2745 free_from_encodings = 0;
2746 }
2747
2748 if (num_from_encodings > 1) {
2749 remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2750 }
2751
2752 if (!num_from_encodings) {
2753 efree(ZEND_VOIDP(from_encodings));
2754 zend_argument_value_error(3, "must specify at least one encoding");
2755 RETURN_THROWS();
2756 }
2757
2758 if (input_str) {
2759 zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2760 if (ret != NULL) {
2761 RETVAL_STR(ret);
2762 } else {
2763 RETVAL_FALSE;
2764 }
2765 } else {
2766 HashTable *tmp;
2767 tmp = php_mb_convert_encoding_recursive(
2768 input_ht, to_encoding, from_encodings, num_from_encodings);
2769 RETVAL_ARR(tmp);
2770 }
2771
2772 if (free_from_encodings) {
2773 efree(ZEND_VOIDP(from_encodings));
2774 }
2775 }
2776 /* }}} */
2777
/* Case-convert `str` via php_unicode_convert_case(), passing along the current
 * illegal-character mode and substitution character from the module globals. */
static char *mbstring_convert_case(
		int case_mode, const char *str, size_t str_len, size_t *ret_len,
		const mbfl_encoding *enc) {
	char *converted = php_unicode_convert_case(
		case_mode, str, str_len, ret_len, enc,
		MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
	return converted;
}
2785
2786 /* {{{ Returns a case-folded version of source_string */
PHP_FUNCTION(mb_convert_case)2787 PHP_FUNCTION(mb_convert_case)
2788 {
2789 zend_string *from_encoding = NULL;
2790 char *str;
2791 size_t str_len, ret_len;
2792 zend_long case_mode = 0;
2793
2794 ZEND_PARSE_PARAMETERS_START(2, 3)
2795 Z_PARAM_STRING(str, str_len)
2796 Z_PARAM_LONG(case_mode)
2797 Z_PARAM_OPTIONAL
2798 Z_PARAM_STR_OR_NULL(from_encoding)
2799 ZEND_PARSE_PARAMETERS_END();
2800
2801 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2802 if (!enc) {
2803 RETURN_THROWS();
2804 }
2805
2806 if (case_mode < 0 || case_mode > PHP_UNICODE_CASE_MODE_MAX) {
2807 zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2808 RETURN_THROWS();
2809 }
2810
2811 char *newstr = mbstring_convert_case(case_mode, str, str_len, &ret_len, enc);
2812 /* If newstr is NULL something went wrong in mbfl and this is a bug */
2813 ZEND_ASSERT(newstr != NULL);
2814
2815 // TODO: avoid reallocation ???
2816 RETVAL_STRINGL(newstr, ret_len);
2817 efree(newstr);
2818 }
2819 /* }}} */
2820
2821 /* {{{ Returns a upper cased version of source_string */
PHP_FUNCTION(mb_strtoupper)2822 PHP_FUNCTION(mb_strtoupper)
2823 {
2824 zend_string *from_encoding = NULL;
2825 char *str;
2826 size_t str_len, ret_len;
2827
2828 ZEND_PARSE_PARAMETERS_START(1, 2)
2829 Z_PARAM_STRING(str, str_len)
2830 Z_PARAM_OPTIONAL
2831 Z_PARAM_STR_OR_NULL(from_encoding)
2832 ZEND_PARSE_PARAMETERS_END();
2833
2834 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2835 if (!enc) {
2836 RETURN_THROWS();
2837 }
2838
2839 char *newstr = mbstring_convert_case(PHP_UNICODE_CASE_UPPER, str, str_len, &ret_len, enc);
2840 /* If newstr is NULL something went wrong in mbfl and this is a bug */
2841 ZEND_ASSERT(newstr != NULL);
2842
2843 // TODO: avoid reallocation ???
2844 RETVAL_STRINGL(newstr, ret_len);
2845 efree(newstr);
2846 }
2847 /* }}} */
2848
2849 /* {{{ Returns a lower cased version of source_string */
PHP_FUNCTION(mb_strtolower)2850 PHP_FUNCTION(mb_strtolower)
2851 {
2852 zend_string *from_encoding = NULL;
2853 char *str;
2854 size_t str_len;
2855 char *newstr;
2856 size_t ret_len;
2857 const mbfl_encoding *enc;
2858
2859 ZEND_PARSE_PARAMETERS_START(1, 2)
2860 Z_PARAM_STRING(str, str_len)
2861 Z_PARAM_OPTIONAL
2862 Z_PARAM_STR_OR_NULL(from_encoding)
2863 ZEND_PARSE_PARAMETERS_END();
2864
2865 enc = php_mb_get_encoding(from_encoding, 2);
2866 if (!enc) {
2867 RETURN_THROWS();
2868 }
2869
2870 newstr = mbstring_convert_case(PHP_UNICODE_CASE_LOWER, str, str_len, &ret_len, enc);
2871 /* If newstr is NULL something went wrong in mbfl and this is a bug */
2872 ZEND_ASSERT(newstr != NULL);
2873
2874 // TODO: avoid reallocation ???
2875 RETVAL_STRINGL(newstr, ret_len);
2876 efree(newstr);
2877 }
2878 /* }}} */
2879
/* Return a newly emalloc'ed copy of the first `size` entries of `elist`.
 * Only the array of pointers is copied; the caller must efree() the result. */
static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
{
	const mbfl_encoding **copy = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
	for (size_t i = 0; i < size; i++) {
		copy[i] = elist[i];
	}
	return copy;
}
2886
2887 /* {{{ Encodings of the given string is returned (as a string) */
PHP_FUNCTION(mb_detect_encoding)2888 PHP_FUNCTION(mb_detect_encoding)
2889 {
2890 char *str;
2891 size_t str_len;
2892 zend_string *encoding_str = NULL;
2893 HashTable *encoding_ht = NULL;
2894 bool strict = 0;
2895
2896 mbfl_string string;
2897 const mbfl_encoding *ret;
2898 const mbfl_encoding **elist;
2899 size_t size;
2900
2901 ZEND_PARSE_PARAMETERS_START(1, 3)
2902 Z_PARAM_STRING(str, str_len)
2903 Z_PARAM_OPTIONAL
2904 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
2905 Z_PARAM_BOOL(strict)
2906 ZEND_PARSE_PARAMETERS_END();
2907
2908 /* make encoding list */
2909 if (encoding_ht) {
2910 if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
2911 RETURN_THROWS();
2912 }
2913 } else if (encoding_str) {
2914 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2, /* allow_pass_encoding */ 0)) {
2915 RETURN_THROWS();
2916 }
2917 } else {
2918 elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
2919 size = MBSTRG(current_detect_order_list_size);
2920 }
2921
2922 if (size == 0) {
2923 efree(ZEND_VOIDP(elist));
2924 zend_argument_value_error(2, "must specify at least one encoding");
2925 RETURN_THROWS();
2926 }
2927
2928 remove_non_encodings_from_elist(elist, &size);
2929 if (size == 0) {
2930 efree(ZEND_VOIDP(elist));
2931 RETURN_FALSE;
2932 }
2933
2934 if (ZEND_NUM_ARGS() < 3) {
2935 strict = MBSTRG(strict_detection);
2936 }
2937
2938 if (strict && size == 1) {
2939 /* If there is only a single candidate encoding, mb_check_encoding is faster */
2940 ret = (php_mb_check_encoding(str, str_len, *elist)) ? *elist : NULL;
2941 } else {
2942 mbfl_string_init(&string);
2943 string.val = (unsigned char *)str;
2944 string.len = str_len;
2945 ret = mbfl_identify_encoding(&string, elist, size, strict);
2946 }
2947
2948 efree(ZEND_VOIDP(elist));
2949
2950 if (ret == NULL) {
2951 RETURN_FALSE;
2952 }
2953
2954 RETVAL_STRING((char *)ret->name);
2955 }
2956 /* }}} */
2957
2958 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)2959 PHP_FUNCTION(mb_list_encodings)
2960 {
2961 ZEND_PARSE_PARAMETERS_NONE();
2962
2963 array_init(return_value);
2964 for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
2965 add_next_index_string(return_value, (*encodings)->name);
2966 }
2967 }
2968 /* }}} */
2969
2970 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)2971 PHP_FUNCTION(mb_encoding_aliases)
2972 {
2973 const mbfl_encoding *encoding;
2974 zend_string *encoding_name = NULL;
2975
2976 ZEND_PARSE_PARAMETERS_START(1, 1)
2977 Z_PARAM_STR(encoding_name)
2978 ZEND_PARSE_PARAMETERS_END();
2979
2980 encoding = php_mb_get_encoding(encoding_name, 1);
2981 if (!encoding) {
2982 RETURN_THROWS();
2983 }
2984
2985 array_init(return_value);
2986 if (encoding->aliases != NULL) {
2987 for (const char **alias = encoding->aliases; *alias; ++alias) {
2988 add_next_index_string(return_value, (char *)*alias);
2989 }
2990 }
2991 }
2992 /* }}} */
2993
2994 /* {{{ Converts the string to MIME "encoded-word" in the format of =?charset?(B|Q)?encoded_string?= */
PHP_FUNCTION(mb_encode_mimeheader)2995 PHP_FUNCTION(mb_encode_mimeheader)
2996 {
2997 const mbfl_encoding *charset, *transenc;
2998 mbfl_string string, result, *ret;
2999 zend_string *charset_name = NULL;
3000 char *trans_enc_name = NULL, *string_val;
3001 size_t trans_enc_name_len;
3002 char *linefeed = "\r\n";
3003 size_t linefeed_len;
3004 zend_long indent = 0;
3005
3006 string.encoding = MBSTRG(current_internal_encoding);
3007
3008 ZEND_PARSE_PARAMETERS_START(1, 5)
3009 Z_PARAM_STRING(string_val, string.len)
3010 Z_PARAM_OPTIONAL
3011 Z_PARAM_STR(charset_name)
3012 Z_PARAM_STRING(trans_enc_name, trans_enc_name_len)
3013 Z_PARAM_STRING(linefeed, linefeed_len)
3014 Z_PARAM_LONG(indent)
3015 ZEND_PARSE_PARAMETERS_END();
3016
3017 string.val = (unsigned char*)string_val;
3018 charset = &mbfl_encoding_pass;
3019 transenc = &mbfl_encoding_base64;
3020
3021 if (charset_name != NULL) {
3022 charset = php_mb_get_encoding(charset_name, 2);
3023 if (!charset) {
3024 RETURN_THROWS();
3025 } else if (charset->mime_name == NULL || charset->mime_name[0] == '\0') {
3026 zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
3027 RETURN_THROWS();
3028 }
3029 } else {
3030 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
3031 if (lang != NULL) {
3032 charset = mbfl_no2encoding(lang->mail_charset);
3033 transenc = mbfl_no2encoding(lang->mail_header_encoding);
3034 }
3035 }
3036
3037 if (trans_enc_name != NULL) {
3038 if (*trans_enc_name == 'B' || *trans_enc_name == 'b') {
3039 transenc = &mbfl_encoding_base64;
3040 } else if (*trans_enc_name == 'Q' || *trans_enc_name == 'q') {
3041 transenc = &mbfl_encoding_qprint;
3042 }
3043 }
3044
3045 mbfl_string_init(&result);
3046 ret = mbfl_mime_header_encode(&string, &result, charset, transenc, linefeed, indent);
3047 ZEND_ASSERT(ret != NULL);
3048 // TODO: avoid reallocation ???
3049 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
3050 efree(ret->val);
3051 }
3052 /* }}} */
3053
3054 /* {{{ Decodes the MIME "encoded-word" in the string */
PHP_FUNCTION(mb_decode_mimeheader)3055 PHP_FUNCTION(mb_decode_mimeheader)
3056 {
3057 char *string_val;
3058 mbfl_string string, result, *ret;
3059
3060 string.encoding = MBSTRG(current_internal_encoding);
3061
3062 ZEND_PARSE_PARAMETERS_START(1, 1)
3063 Z_PARAM_STRING(string_val, string.len)
3064 ZEND_PARSE_PARAMETERS_END();
3065
3066 string.val = (unsigned char*)string_val;
3067 mbfl_string_init(&result);
3068 ret = mbfl_mime_header_decode(&string, &result, MBSTRG(current_internal_encoding));
3069 ZEND_ASSERT(ret != NULL);
3070 // TODO: avoid reallocation ???
3071 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
3072 efree(ret->val);
3073 }
3074 /* }}} */
3075
/* Apply the kana/width conversions selected by `mode` to `input` (text in `encoding`)
 * and return the result as a new string in the same encoding.
 *
 * The input is decoded to wchars in chunks of at most 64. Each wchar is passed to
 * mb_convert_kana_codepoint() together with the wchar that follows it, so the last
 * wchar of a chunk cannot be processed until the first wchar of the next chunk is
 * known; it is carried over in wchar_buf[0] (tracked by `buf_offset`). */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion...
	 * if we are converting zenkaku kana to hankaku kana
	 * Make the buffer for converted kana big enough that we never need to
	 * perform bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	/* Number of wchars (0 or 1) carried over at the start of wchar_buf */
	unsigned int buf_offset = 0;
	/* Decoder state, threaded through successive to_wchar() calls */
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		uint32_t *converted = converted_buf;
		/* If one codepoint has been left in wchar_buf[0] to be reprocessed from the
		 * previous iteration, don't overwrite it */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + buf_offset, 64 - buf_offset, &state);
		out_len += buf_offset;
		ZEND_ASSERT(out_len <= 64);

		/* Nothing decoded and nothing carried over: try again with the remaining input */
		if (!out_len) {
			continue;
		}

		/* Convert every wchar except the last one, using its successor as lookahead */
		for (int i = 0; i < out_len-1; i++) {
			uint32_t second = 0;
			bool consumed = false;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
			/* A non-zero `second` is an additional output wchar for this input */
			if (second) {
				*converted++ = second;
			}
			/* `consumed` means the lookahead wchar was used up as well, so skip it */
			if (consumed) {
				i++;
				if (i == out_len-1) {
					/* We consumed two codepoints at the very end of the wchar buffer
					 * So there is nothing remaining to reprocess on the next iteration */
					buf_offset = 0;
					goto emit_converted_kana;
				}
			}
		}

		if (!in_len) {
			/* This is the last iteration, so we need to process the final codepoint now */
			uint32_t second = 0;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[out_len-1], 0, NULL, &second, mode);
			if (second) {
				*converted++ = second;
			}
		} else {
			/* Reprocess the last codepoint on the next iteration */
			wchar_buf[0] = wchar_buf[out_len-1];
			buf_offset = 1;
		}

emit_converted_kana:
		/* Re-encode this chunk; the last argument is true only once all input is consumed */
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf);
}
3140
/* Option letters accepted in the mode string of mb_convert_kana().
 * The position of a letter in this table is the bit which mb_convert_kana() sets in
 * its option bit vector for that letter (index i => 1 << i). mb_convert_kana() also
 * pairs the letter at index i with the letter at index i+8 when it reports two
 * flags which must not be combined. */
char mb_convert_kana_flags[17] = {
	'A', 'R', 'N', 'S', 'K', 'H', 'M', 'C',
	'a', 'r', 'n', 's', 'k', 'h', 'm', 'c',
	'V'
};
3146
3147 /* Conversion between full-width characters and half-width characters (Japanese) */
PHP_FUNCTION(mb_convert_kana)3148 PHP_FUNCTION(mb_convert_kana)
3149 {
3150 unsigned int opt;
3151 char *optstr = NULL;
3152 size_t optstr_len;
3153 zend_string *encname = NULL, *str;
3154
3155 ZEND_PARSE_PARAMETERS_START(1, 3)
3156 Z_PARAM_STR(str)
3157 Z_PARAM_OPTIONAL
3158 Z_PARAM_STRING(optstr, optstr_len)
3159 Z_PARAM_STR_OR_NULL(encname)
3160 ZEND_PARSE_PARAMETERS_END();
3161
3162 if (optstr != NULL) {
3163 char *p = optstr, *e = p + optstr_len;
3164 opt = 0;
3165 next_option:
3166 while (p < e) {
3167 /* Walk through option string and convert to bit vector
3168 * See translit_kana_jisx0201_jisx0208.h for the values used */
3169 char c = *p++;
3170 if (c == 'A') {
3171 opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
3172 } else if (c == 'a') {
3173 opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
3174 } else {
3175 for (int i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
3176 if (c == mb_convert_kana_flags[i]) {
3177 opt |= (1 << i);
3178 goto next_option;
3179 }
3180 }
3181
3182 zend_argument_value_error(2, "contains invalid flag: '%c'", c);
3183 RETURN_THROWS();
3184 }
3185 }
3186
3187 /* Check for illegal combinations of options */
3188 if (((opt & 0xFF00) >> 8) & opt) {
3189 /* It doesn't make sense to convert the same type of characters from halfwidth to
3190 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
3191 * FW hiragana to FW katakana and then back again. */
3192 int badflag = ((opt & 0xFF00) >> 8) & opt, i;
3193 for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
3194 char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
3195 if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
3196 flag1 = 'A';
3197 if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
3198 flag2 = 'a';
3199 zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
3200 RETURN_THROWS();
3201 }
3202
3203 if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
3204 /* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
3205 zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
3206 RETURN_THROWS();
3207 }
3208
3209 /* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
3210 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
3211 * more than one of these */
3212 if (opt & MBFL_ZEN2HAN_HIRAGANA) {
3213 if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3214 zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
3215 RETURN_THROWS();
3216 } else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3217 zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
3218 RETURN_THROWS();
3219 }
3220 } else if (opt & MBFL_ZEN2HAN_KATAKANA) {
3221 if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3222 zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
3223 RETURN_THROWS();
3224 } else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3225 zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
3226 RETURN_THROWS();
3227 }
3228 }
3229 } else {
3230 opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
3231 }
3232
3233 const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
3234 if (!enc) {
3235 RETURN_THROWS();
3236 }
3237
3238 RETVAL_STR(jp_kana_convert(str, enc, opt));
3239 }
3240
/* Feed every string reachable from `var` (descending into arrays and objects) to the
 * encoding detector `identd`.
 * Returns 1 as soon as the detector reports that detection is complete, otherwise 0.
 * If an array/object which is already being visited is encountered, `*recursion_error`
 * is set to 1 and 0 is returned. */
static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, int *recursion_error) /* {{{ */
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		mbfl_string chunk;
		chunk.val = (unsigned char *)Z_STRVAL_P(var);
		chunk.len = Z_STRLEN_P(var);
		/* Non-zero from the detector means: complete detecting */
		return mbfl_encoding_detector_feed(identd, &chunk) ? 1 : 0;
	}

	/* Scalars other than strings carry nothing to feed */
	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		return 0;
	}

	/* Mark this container as "being visited" so that cycles are detected */
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			*recursion_error = 1;
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	int finished = 0;
	HashTable *members = HASH_OF(var);
	if (members != NULL) {
		zval *member;
		ZEND_HASH_FOREACH_VAL_IND(members, member) {
			if (mb_recursive_encoder_detector_feed(identd, member, recursion_error)) {
				finished = 1;
				break;
			}
			if (*recursion_error) {
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	/* Every way out of the traversal clears the "being visited" mark again */
	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}
	return finished;
} /* }}} */
3286
/* Convert, in place, every string reachable from `var` (descending into arrays and
 * objects) by running it through the buffer converter `convd`.
 * Returns 1 if an array/object which is already being visited was encountered
 * (recursive structure), otherwise 0. */
static int mb_recursive_convert_variable(mbfl_buffer_converter *convd, zval *var) /* {{{ */
{
	mbfl_string string, result, *ret;
	HashTable *ht;
	zval *entry, *orig_var;

	/* Remember the zval as passed in (before dereferencing): the converted string is
	 * written there, so a reference is replaced by a plain string rather than
	 * having its referent modified */
	orig_var = var;
	ZVAL_DEREF(var);
	if (Z_TYPE_P(var) == IS_STRING) {
		string.val = (unsigned char *)Z_STRVAL_P(var);
		string.len = Z_STRLEN_P(var);
		ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
		/* On a NULL result the original value is left untouched */
		if (ret != NULL) {
			zval_ptr_dtor(orig_var);
			// TODO: avoid reallocation ???
			ZVAL_STRINGL(orig_var, (const char *) ret->val, ret->len);
			efree(ret->val);
		}
	} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
		/* Arrays are separated first, so the elements modified below belong to this zval's own copy */
		if (Z_TYPE_P(var) == IS_ARRAY) {
			SEPARATE_ARRAY(var);
		}
		/* Mark this container as "being visited" so that cycles are detected */
		if (Z_REFCOUNTED_P(var)) {
			if (Z_IS_RECURSIVE_P(var)) {
				return 1;
			}
			Z_PROTECT_RECURSION_P(var);
		}

		ht = HASH_OF(var);
		if (ht != NULL) {
			ZEND_HASH_FOREACH_VAL(ht, entry) {
				/* Can be a typed property declaration, in which case we need to remove the reference from the source list.
				 * Just using ZEND_TRY_ASSIGN_STRINGL is not sufficient because that would not unwrap the reference
				 * and change values through references (see bug #26639). */
				if (Z_TYPE_P(entry) == IS_INDIRECT) {
					ZEND_ASSERT(Z_TYPE_P(var) == IS_OBJECT);

					entry = Z_INDIRECT_P(entry);
					if (Z_ISREF_P(entry) && Z_TYPE_P(Z_REFVAL_P(entry)) == IS_STRING) {
						zend_property_info *info = zend_get_typed_property_info_for_slot(Z_OBJ_P(var), entry);
						if (info) {
							ZEND_REF_DEL_TYPE_SOURCE(Z_REF_P(entry), info);
						}
					}
				}

				/* Propagate a recursion error upwards, clearing our own mark first */
				if (mb_recursive_convert_variable(convd, entry)) {
					if (Z_REFCOUNTED_P(var)) {
						Z_UNPROTECT_RECURSION_P(var);
					}
					return 1;
				}
			} ZEND_HASH_FOREACH_END();
		}

		if (Z_REFCOUNTED_P(var)) {
			Z_UNPROTECT_RECURSION_P(var);
		}
	}
	return 0;
} /* }}} */
3349
3350 /* {{{ Converts the string resource in variables to desired encoding */
PHP_FUNCTION(mb_convert_variables)3351 PHP_FUNCTION(mb_convert_variables)
3352 {
3353 zval *args;
3354 zend_string *to_enc_str;
3355 zend_string *from_enc_str;
3356 HashTable *from_enc_ht;
3357 mbfl_string string, result;
3358 const mbfl_encoding *from_encoding, *to_encoding;
3359 mbfl_encoding_detector *identd;
3360 mbfl_buffer_converter *convd;
3361 int n, argc;
3362 size_t elistsz;
3363 const mbfl_encoding **elist;
3364 int recursion_error = 0;
3365
3366 ZEND_PARSE_PARAMETERS_START(3, -1)
3367 Z_PARAM_STR(to_enc_str)
3368 Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3369 Z_PARAM_VARIADIC('+', args, argc)
3370 ZEND_PARSE_PARAMETERS_END();
3371
3372 /* new encoding */
3373 to_encoding = php_mb_get_encoding(to_enc_str, 1);
3374 if (!to_encoding) {
3375 RETURN_THROWS();
3376 }
3377
3378 /* initialize string */
3379 from_encoding = MBSTRG(current_internal_encoding);
3380 mbfl_string_init_set(&string, from_encoding);
3381 mbfl_string_init(&result);
3382
3383 /* pre-conversion encoding */
3384 if (from_enc_ht) {
3385 if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3386 RETURN_THROWS();
3387 }
3388 } else {
3389 if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2, /* allow_pass_encoding */ 0) == FAILURE) {
3390 RETURN_THROWS();
3391 }
3392 }
3393
3394 if (elistsz == 0) {
3395 efree(ZEND_VOIDP(elist));
3396 zend_argument_value_error(2, "must specify at least one encoding");
3397 RETURN_THROWS();
3398 }
3399
3400 if (elistsz == 1) {
3401 from_encoding = *elist;
3402 } else {
3403 /* auto detect */
3404 from_encoding = NULL;
3405 identd = mbfl_encoding_detector_new(elist, elistsz, MBSTRG(strict_detection));
3406 if (identd != NULL) {
3407 n = 0;
3408 while (n < argc) {
3409 if (mb_recursive_encoder_detector_feed(identd, &args[n], &recursion_error)) {
3410 break;
3411 }
3412 n++;
3413 }
3414 from_encoding = mbfl_encoding_detector_judge(identd);
3415 mbfl_encoding_detector_delete(identd);
3416 if (recursion_error) {
3417 efree(ZEND_VOIDP(elist));
3418 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3419 RETURN_FALSE;
3420 }
3421 }
3422
3423 if (!from_encoding) {
3424 php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3425 efree(ZEND_VOIDP(elist));
3426 RETURN_FALSE;
3427 }
3428 }
3429
3430 efree(ZEND_VOIDP(elist));
3431
3432 convd = mbfl_buffer_converter_new(from_encoding, to_encoding, 0);
3433 /* If this assertion fails this means some memory allocation failure which is a bug */
3434 ZEND_ASSERT(convd != NULL);
3435
3436 mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
3437 mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
3438
3439 /* convert */
3440 n = 0;
3441 while (n < argc) {
3442 zval *zv = &args[n];
3443
3444 ZVAL_DEREF(zv);
3445 recursion_error = mb_recursive_convert_variable(convd, zv);
3446 if (recursion_error) {
3447 break;
3448 }
3449 n++;
3450 }
3451
3452 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd);
3453 mbfl_buffer_converter_delete(convd);
3454
3455 if (recursion_error) {
3456 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3457 RETURN_FALSE;
3458 }
3459
3460 RETURN_STRING(from_encoding->name);
3461 }
3462 /* }}} */
3463
3464 /* HTML numeric entities */
3465
3466 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
make_conversion_map(HashTable * target_hash,int * convmap_size)3467 static uint32_t *make_conversion_map(HashTable *target_hash, int *convmap_size)
3468 {
3469 zval *hash_entry;
3470
3471 int n_elems = zend_hash_num_elements(target_hash);
3472 if (n_elems % 4 != 0) {
3473 zend_argument_value_error(2, "must have a multiple of 4 elements");
3474 return NULL;
3475 }
3476
3477 uint32_t *convmap = (uint32_t*)safe_emalloc(n_elems, sizeof(uint32_t), 0);
3478 uint32_t *mapelm = convmap;
3479
3480 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
3481 *mapelm++ = zval_get_long(hash_entry);
3482 } ZEND_HASH_FOREACH_END();
3483
3484 *convmap_size = n_elems / 4;
3485 return convmap;
3486 }
3487
/* Look up wchar `w` in a conversion map of `mapsize` entries, each entry being four
 * consecutive values: low code, high code, offset, mask.
 * For the first entry whose inclusive [low, high] range contains `w`, store
 * (w + offset) & mask in `*retval` and return true.
 * If no entry matches, return false and leave `*retval` untouched. */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, int mapsize, uint32_t *retval)
{
	for (int entry = 0; entry < mapsize; entry++) {
		const uint32_t *range = convmap + (entry * 4);

		if (w < range[0] || w > range[1]) {
			continue;
		}

		/* This wchar falls inside one of the ranges which should be
		 * converted to HTML entities */
		*retval = (w + range[2]) & range[3];
		return true;
	}

	/* None of the ranges matched */
	return false;
}
3509
/* Return a copy of `input` (text in `encoding`) in which every character matched by
 * the conversion map is replaced by an HTML numeric entity: "&#<decimal>;", or
 * "&#x<HEX>;" with upper-case digits when `hex` is true. All other characters are
 * passed through unchanged. */
static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, int mapsize, bool hex)
{
	/* Each wchar which we get from decoding the input string may become up to
	 * 13 wchars when we convert it to an HTML entity */
	uint32_t wchar_buf[32], converted_buf[32 * 13];
	unsigned char entity[16]; /* Scratch space for the digits, filled from the end */

	const uint32_t base = hex ? 16 : 10;
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		/* Convert input string to wchars, up to 32 at a time */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf, 32, &state);
		ZEND_ASSERT(out_len <= 32);
		uint32_t *out = converted_buf;

		for (size_t i = 0; i < out_len; i++) {
			uint32_t w = wchar_buf[i];

			/* On a match, `w` is replaced by the number to print in the entity */
			if (!html_numeric_entity_convert(w, convmap, mapsize, &w)) {
				*out++ = w;
				continue;
			}

			*out++ = '&';
			*out++ = '#';
			if (hex) {
				*out++ = 'x';
			}

			/* Produce the digits least-significant first, writing backwards from the
			 * end of `entity`; a value of zero yields the single digit '0' */
			unsigned char *digit = entity + sizeof(entity);
			do {
				*(--digit) = "0123456789ABCDEF"[w % base];
				w /= base;
			} while (w > 0);

			while (digit < entity + sizeof(entity)) {
				*out++ = *digit++;
			}

			*out++ = ';';
		}

		ZEND_ASSERT(out <= converted_buf + sizeof(converted_buf)/sizeof(*converted_buf));
		/* The last argument is true only once all input has been consumed */
		encoding->from_wchar(converted_buf, out - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf);
}
3575
3576 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)3577 PHP_FUNCTION(mb_encode_numericentity)
3578 {
3579 zend_string *encoding = NULL, *str;
3580 int mapsize;
3581 HashTable *target_hash;
3582 bool is_hex = false;
3583
3584 ZEND_PARSE_PARAMETERS_START(2, 4)
3585 Z_PARAM_STR(str)
3586 Z_PARAM_ARRAY_HT(target_hash)
3587 Z_PARAM_OPTIONAL
3588 Z_PARAM_STR_OR_NULL(encoding)
3589 Z_PARAM_BOOL(is_hex)
3590 ZEND_PARSE_PARAMETERS_END();
3591
3592 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3593 if (!enc) {
3594 RETURN_THROWS();
3595 }
3596
3597 uint32_t *convmap = make_conversion_map(target_hash, &mapsize);
3598 if (convmap == NULL) {
3599 RETURN_THROWS();
3600 }
3601
3602 RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, mapsize, is_hex));
3603 efree(convmap);
3604 }
3605 /* }}} */
3606
html_numeric_entity_deconvert(uint32_t number,uint32_t * convmap,int mapsize,uint32_t * retval)3607 static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, int mapsize, uint32_t *retval)
3608 {
3609 uint32_t *convmap_end = convmap + (mapsize * 4);
3610
3611 for (uint32_t *mapelm = convmap; mapelm < convmap_end; mapelm += 4) {
3612 uint32_t lo_code = mapelm[0];
3613 uint32_t hi_code = mapelm[1];
3614 uint32_t offset = mapelm[2];
3615 uint32_t codepoint = number - offset;
3616 if (codepoint >= lo_code && codepoint <= hi_code) {
3617 *retval = codepoint;
3618 return true;
3619 }
3620 }
3621
3622 return false;
3623 }
3624
/* Length limits for HTML numeric entities, as used by html_numeric_entity_decode().
 * Lengths are counted in wchars from the '&' up to, but not including, the first
 * wchar which is not a digit; the optional trailing ';' is therefore not counted. */
#define DEC_ENTITY_MINLEN 3  /* For "&#" and 1 decimal digit */
#define HEX_ENTITY_MINLEN 4  /* For "&#x" and 1 hexadecimal digit */
#define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
#define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
3629
html_numeric_entity_decode(zend_string * input,const mbfl_encoding * encoding,uint32_t * convmap,int mapsize)3630 static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, int mapsize)
3631 {
3632 uint32_t wchar_buf[128], converted_buf[128];
3633
3634 unsigned int state = 0;
3635 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
3636 size_t in_len = ZSTR_LEN(input);
3637
3638 mb_convert_buf buf;
3639 mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
3640
3641 /* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
3642 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
3643 * 2nd 'converted' buffer.
3644 *
3645 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
3646 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
3647 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
3648 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
3649 * to store the next batch of wchars after it.
3650 *
3651 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
3652 * wchars from the 1st buffer to the 2nd one.
3653 *
3654 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
3655 * have to do bounds checks when writing wchars into it.
3656 */
3657
3658 unsigned int wchar_buf_offset = 0;
3659
3660 while (in_len) {
3661 /* Leave space for sentinel at the end of the buffer */
3662 size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
3663 out_len += wchar_buf_offset;
3664 ZEND_ASSERT(out_len <= 127);
3665 wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */
3666
3667 uint32_t *p, *converted;
3668
3669 /* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
3670 * be there (in `wchar_buf[0]`), so don't bother in that case */
3671 if (wchar_buf_offset == 0) {
3672 p = wchar_buf;
3673 while (*p != '&')
3674 p++;
3675 if (p == wchar_buf + out_len) {
3676 /* No HTML entities in this buffer */
3677 encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
3678 continue;
3679 }
3680
3681 /* Copy over the prefix with no & which we already scanned */
3682 memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
3683 converted = converted_buf + (p - wchar_buf);
3684 } else {
3685 p = wchar_buf;
3686 converted = converted_buf;
3687 }
3688
3689 found_ampersand:
3690 ZEND_ASSERT(*p == '&');
3691 uint32_t *p2 = p;
3692
3693 /* These tests can't overrun end of buffer, because we have a '&' sentinel there */
3694 if (*++p2 == '#') {
3695 if (*++p2 == 'x') {
3696 /* Possible hex entity */
3697 uint32_t w = *++p2;
3698 while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
3699 w = *++p2;
3700 if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
					/* We hit the end of the buffer while reading digits, and
					 * more wchars are still coming in the next buffer
					 * Reprocess this entity on next iteration */
3704 memmove(wchar_buf, p, (p2 - p) * 4);
3705 wchar_buf_offset = p2 - p;
3706 goto process_converted_wchars;
3707 } else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
3708 /* Invalid entity (too long or "&#x" only) */
3709 memcpy(converted, p, (p2 - p) * 4);
3710 converted += p2 - p;
3711 } else {
3712 /* Valid hexadecimal entity */
3713 uint32_t value = 0, *p3 = p + 3;
3714 while (p3 < p2) {
3715 w = *p3++;
3716 if (w <= '9') {
3717 value = (value * 16) + (w - '0');
3718 } else if (w >= 'a') {
3719 value = (value * 16) + 10 + (w - 'a');
3720 } else {
3721 value = (value * 16) + 10 + (w - 'A');
3722 }
3723 }
3724 if (html_numeric_entity_deconvert(value, convmap, mapsize, converted)) {
3725 converted++;
3726 if (*p2 == ';')
3727 p2++;
3728 } else {
3729 memcpy(converted, p, (p2 - p) * 4);
3730 converted += p2 - p;
3731 }
3732 }
3733 } else {
3734 /* Possible decimal entity */
3735 uint32_t w = *p2;
3736 while (w >= '0' && w <= '9')
3737 w = *++p2;
3738 if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
					/* The number of digits was legal (no more than 10 decimal digits)
					 * Reprocess this entity on next iteration of main loop */
3741 memmove(wchar_buf, p, (p2 - p) * 4);
3742 wchar_buf_offset = p2 - p;
3743 goto process_converted_wchars;
3744 } else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
3745 /* Invalid entity (too long or "&#" only) */
3746 memcpy(converted, p, (p2 - p) * 4);
3747 converted += p2 - p;
3748 } else {
3749 /* Valid decimal entity */
3750 uint32_t value = 0, *p3 = p + 2;
3751 while (p3 < p2) {
3752 /* If unsigned integer overflow would occur in the below
3753 * multiplication by 10, this entity is no good
3754 * 0x19999999 is 1/10th of 0xFFFFFFFF */
3755 if (value > 0x19999999) {
3756 memcpy(converted, p, (p2 - p) * 4);
3757 converted += p2 - p;
3758 goto decimal_entity_too_big;
3759 }
3760 value = (value * 10) + (*p3++ - '0');
3761 }
3762 if (html_numeric_entity_deconvert(value, convmap, mapsize, converted)) {
3763 converted++;
3764 if (*p2 == ';')
3765 p2++;
3766 } else {
3767 memcpy(converted, p, (p2 - p) * 4);
3768 converted += p2 - p;
3769 }
3770 }
3771 }
3772 } else if ((p2 == wchar_buf + out_len) && in_len) {
3773 /* Corner case: & at end of buffer */
3774 wchar_buf[0] = '&';
3775 wchar_buf_offset = 1;
3776 goto process_converted_wchars;
3777 } else {
3778 *converted++ = '&';
3779 }
3780 decimal_entity_too_big:
3781
3782 /* Starting to scan a new section of the wchar buffer
3783 * 'p2' is pointing at the next wchar which needs to be processed */
3784 p = p2;
3785 while (*p2 != '&')
3786 p2++;
3787
3788 if (p2 > p) {
3789 memcpy(converted, p, (p2 - p) * 4);
3790 converted += p2 - p;
3791 p = p2;
3792 }
3793
3794 if (p < wchar_buf + out_len)
3795 goto found_ampersand;
3796
3797 /* We do not have any wchars remaining at the end of this buffer which
3798 * we need to reprocess on the next call */
3799 wchar_buf_offset = 0;
3800 process_converted_wchars:
3801 ZEND_ASSERT(converted <= converted_buf + 128);
3802 encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
3803 }
3804
3805 return mb_convert_buf_result(&buf);
3806 }
3807
3808 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)3809 PHP_FUNCTION(mb_decode_numericentity)
3810 {
3811 zend_string *encoding = NULL, *str;
3812 int mapsize;
3813 HashTable *target_hash;
3814
3815 ZEND_PARSE_PARAMETERS_START(2, 3)
3816 Z_PARAM_STR(str)
3817 Z_PARAM_ARRAY_HT(target_hash)
3818 Z_PARAM_OPTIONAL
3819 Z_PARAM_STR_OR_NULL(encoding)
3820 ZEND_PARSE_PARAMETERS_END();
3821
3822 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3823 if (!enc) {
3824 RETURN_THROWS();
3825 }
3826
3827 uint32_t *convmap = make_conversion_map(target_hash, &mapsize);
3828 if (convmap == NULL) {
3829 RETURN_THROWS();
3830 }
3831
3832 RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, mapsize));
3833 efree(convmap);
3834 }
3835 /* }}} */
3836
3837 /* {{{ Sends an email message with MIME scheme */
3838 #define CRLF "\r\n"
3839
_php_mbstr_parse_mail_headers(HashTable * ht,const char * str,size_t str_len)3840 static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
3841 {
3842 const char *ps;
3843 size_t icnt;
3844 int state = 0;
3845 int crlf_state = -1;
3846 char *token = NULL;
3847 size_t token_pos = 0;
3848 zend_string *fld_name, *fld_val;
3849
3850 ps = str;
3851 icnt = str_len;
3852 fld_name = fld_val = NULL;
3853
3854 /*
3855 * C o n t e n t - T y p e : t e x t / h t m l \r\n
3856 * ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
3857 * state 0 1 2 3
3858 *
3859 * C o n t e n t - T y p e : t e x t / h t m l \r\n
3860 * ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
3861 * crlf_state -1 0 1 -1
3862 *
3863 */
3864
3865 while (icnt > 0) {
3866 switch (*ps) {
3867 case ':':
3868 if (crlf_state == 1) {
3869 token_pos++;
3870 }
3871
3872 if (state == 0 || state == 1) {
3873 if(token && token_pos > 0) {
3874 fld_name = zend_string_init(token, token_pos, 0);
3875 }
3876 state = 2;
3877 } else {
3878 token_pos++;
3879 }
3880
3881 crlf_state = 0;
3882 break;
3883
3884 case '\n':
3885 if (crlf_state == -1) {
3886 goto out;
3887 }
3888 crlf_state = -1;
3889 break;
3890
3891 case '\r':
3892 if (crlf_state == 1) {
3893 token_pos++;
3894 } else {
3895 crlf_state = 1;
3896 }
3897 break;
3898
3899 case ' ': case '\t':
3900 if (crlf_state == -1) {
3901 if (state == 3) {
3902 /* continuing from the previous line */
3903 state = 4;
3904 } else {
3905 /* simply skipping this new line */
3906 state = 5;
3907 }
3908 } else {
3909 if (crlf_state == 1) {
3910 token_pos++;
3911 }
3912 if (state == 1 || state == 3) {
3913 token_pos++;
3914 }
3915 }
3916 crlf_state = 0;
3917 break;
3918
3919 default:
3920 switch (state) {
3921 case 0:
3922 token = (char*)ps;
3923 token_pos = 0;
3924 state = 1;
3925 break;
3926
3927 case 2:
3928 if (crlf_state != -1) {
3929 token = (char*)ps;
3930 token_pos = 0;
3931
3932 state = 3;
3933 break;
3934 }
3935 ZEND_FALLTHROUGH;
3936
3937 case 3:
3938 if (crlf_state == -1) {
3939 if(token && token_pos > 0) {
3940 fld_val = zend_string_init(token, token_pos, 0);
3941 }
3942
3943 if (fld_name != NULL && fld_val != NULL) {
3944 zval val;
3945 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
3946 ZVAL_STR(&val, fld_val);
3947
3948 zend_hash_update(ht, fld_name, &val);
3949
3950 zend_string_release_ex(fld_name, 0);
3951 }
3952
3953 fld_name = fld_val = NULL;
3954 token = (char*)ps;
3955 token_pos = 0;
3956
3957 state = 1;
3958 }
3959 break;
3960
3961 case 4:
3962 token_pos++;
3963 state = 3;
3964 break;
3965 }
3966
3967 if (crlf_state == 1) {
3968 token_pos++;
3969 }
3970
3971 token_pos++;
3972
3973 crlf_state = 0;
3974 break;
3975 }
3976 ps++, icnt--;
3977 }
3978 out:
3979 if (state == 2) {
3980 token = "";
3981 token_pos = 0;
3982
3983 state = 3;
3984 }
3985 if (state == 3) {
3986 if(token && token_pos > 0) {
3987 fld_val = zend_string_init(token, token_pos, 0);
3988 }
3989 if (fld_name != NULL && fld_val != NULL) {
3990 zval val;
3991 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
3992 ZVAL_STR(&val, fld_val);
3993 zend_hash_update(ht, fld_name, &val);
3994
3995 zend_string_release_ex(fld_name, 0);
3996 }
3997 }
3998 return state;
3999 }
4000
PHP_FUNCTION(mb_send_mail)4001 PHP_FUNCTION(mb_send_mail)
4002 {
4003 char *to;
4004 size_t to_len;
4005 char *message;
4006 size_t message_len;
4007 char *subject;
4008 size_t subject_len;
4009 zend_string *extra_cmd = NULL;
4010 HashTable *headers_ht = NULL;
4011 zend_string *str_headers = NULL;
4012 size_t n, i;
4013 char *to_r = NULL;
4014 char *force_extra_parameters = INI_STR("mail.force_extra_parameters");
4015 struct {
4016 int cnt_type:1;
4017 int cnt_trans_enc:1;
4018 } suppressed_hdrs = { 0, 0 };
4019
4020 char *message_buf = NULL, *subject_buf = NULL, *p;
4021 mbfl_string orig_str, conv_str;
4022 mbfl_string *pstr; /* pointer to mbfl string for return value */
4023 enum mbfl_no_encoding;
4024 const mbfl_encoding *tran_cs, /* transfer text charset */
4025 *head_enc, /* header transfer encoding */
4026 *body_enc; /* body transfer encoding */
4027 mbfl_memory_device device; /* automatic allocateable buffer for additional header */
4028 const mbfl_language *lang;
4029 int err = 0;
4030 HashTable ht_headers;
4031 zval *s;
4032 extern void mbfl_memory_device_unput(mbfl_memory_device *device);
4033
4034 /* initialize */
4035 mbfl_memory_device_init(&device, 0, 0);
4036 mbfl_string_init(&orig_str);
4037 mbfl_string_init(&conv_str);
4038
4039 /* character-set, transfer-encoding */
4040 tran_cs = &mbfl_encoding_utf8;
4041 head_enc = &mbfl_encoding_base64;
4042 body_enc = &mbfl_encoding_base64;
4043 lang = mbfl_no2language(MBSTRG(language));
4044 if (lang != NULL) {
4045 tran_cs = mbfl_no2encoding(lang->mail_charset);
4046 head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4047 body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4048 }
4049
4050 ZEND_PARSE_PARAMETERS_START(3, 5)
4051 Z_PARAM_PATH(to, to_len)
4052 Z_PARAM_PATH(subject, subject_len)
4053 Z_PARAM_PATH(message, message_len)
4054 Z_PARAM_OPTIONAL
4055 Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4056 Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4057 ZEND_PARSE_PARAMETERS_END();
4058
4059 if (str_headers) {
4060 if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4061 zend_argument_value_error(4, "must not contain any null bytes");
4062 RETURN_THROWS();
4063 }
4064 str_headers = php_trim(str_headers, NULL, 0, 2);
4065 } else if (headers_ht) {
4066 str_headers = php_mail_build_headers(headers_ht);
4067 if (EG(exception)) {
4068 RETURN_THROWS();
4069 }
4070 }
4071
4072 zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4073
4074 if (str_headers != NULL) {
4075 _php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4076 }
4077
4078 if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4079 char *tmp;
4080 char *param_name;
4081 char *charset = NULL;
4082
4083 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4084 p = strchr(Z_STRVAL_P(s), ';');
4085
4086 if (p != NULL) {
4087 /* skipping the padded spaces */
4088 do {
4089 ++p;
4090 } while (*p == ' ' || *p == '\t');
4091
4092 if (*p != '\0') {
4093 if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4094 if (strcasecmp(param_name, "charset") == 0) {
4095 const mbfl_encoding *_tran_cs = tran_cs;
4096
4097 charset = php_strtok_r(NULL, "= \"", &tmp);
4098 if (charset != NULL) {
4099 _tran_cs = mbfl_name2encoding(charset);
4100 }
4101
4102 if (!_tran_cs) {
4103 php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4104 _tran_cs = &mbfl_encoding_ascii;
4105 }
4106 tran_cs = _tran_cs;
4107 }
4108 }
4109 }
4110 }
4111 suppressed_hdrs.cnt_type = 1;
4112 }
4113
4114 if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4115 const mbfl_encoding *_body_enc;
4116
4117 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4118 _body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4119 switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4120 case mbfl_no_encoding_base64:
4121 case mbfl_no_encoding_7bit:
4122 case mbfl_no_encoding_8bit:
4123 body_enc = _body_enc;
4124 break;
4125
4126 default:
4127 php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4128 body_enc = &mbfl_encoding_8bit;
4129 break;
4130 }
4131 suppressed_hdrs.cnt_trans_enc = 1;
4132 }
4133
4134 /* To: */
4135 if (to_len > 0) {
4136 to_r = estrndup(to, to_len);
4137 for (; to_len; to_len--) {
4138 if (!isspace((unsigned char) to_r[to_len - 1])) {
4139 break;
4140 }
4141 to_r[to_len - 1] = '\0';
4142 }
4143 for (i = 0; to_r[i]; i++) {
4144 if (iscntrl((unsigned char) to_r[i])) {
4145 /* According to RFC 822, section 3.1.1 long headers may be separated into
4146 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4147 * To prevent these separators from being replaced with a space, we skip over them. */
4148 if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4149 i += 2;
4150 while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4151 i++;
4152 }
4153 continue;
4154 }
4155
4156 to_r[i] = ' ';
4157 }
4158 }
4159 } else {
4160 to_r = to;
4161 }
4162
4163 /* Subject: */
4164 orig_str.val = (unsigned char *)subject;
4165 orig_str.len = subject_len;
4166 orig_str.encoding = MBSTRG(current_internal_encoding);
4167 if (orig_str.encoding->no_encoding == mbfl_no_encoding_invalid
4168 || orig_str.encoding->no_encoding == mbfl_no_encoding_pass) {
4169 orig_str.encoding = mbfl_identify_encoding(&orig_str, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
4170 }
4171 const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4172 size_t line_sep_len = strlen(line_sep);
4173 pstr = mbfl_mime_header_encode(&orig_str, &conv_str, tran_cs, head_enc, line_sep, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4174 if (pstr != NULL) {
4175 subject_buf = subject = (char *)pstr->val;
4176 }
4177
4178 /* message body */
4179 orig_str.val = (unsigned char *)message;
4180 orig_str.len = message_len;
4181 orig_str.encoding = MBSTRG(current_internal_encoding);
4182
4183 if (orig_str.encoding->no_encoding == mbfl_no_encoding_invalid
4184 || orig_str.encoding->no_encoding == mbfl_no_encoding_pass) {
4185 orig_str.encoding = mbfl_identify_encoding(&orig_str, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
4186 }
4187
4188 pstr = NULL;
4189 {
4190 mbfl_string tmpstr;
4191
4192 if (mbfl_convert_encoding(&orig_str, &tmpstr, tran_cs) != NULL) {
4193 tmpstr.encoding = &mbfl_encoding_8bit;
4194 pstr = mbfl_convert_encoding(&tmpstr, &conv_str, body_enc);
4195 efree(tmpstr.val);
4196 }
4197 }
4198 if (pstr != NULL) {
4199 message_buf = message = (char *)pstr->val;
4200 }
4201
4202 /* other headers */
4203 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4204 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4205 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4206 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4207
4208 if (str_headers != NULL && ZSTR_LEN(str_headers) > 0) {
4209 p = ZSTR_VAL(str_headers);
4210 n = ZSTR_LEN(str_headers);
4211 mbfl_memory_device_strncat(&device, p, n);
4212 if (n > 0 && p[n - 1] != '\n') {
4213 mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4214 }
4215 zend_string_release_ex(str_headers, 0);
4216 }
4217
4218 if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4219 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4220 mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4221 }
4222
4223 if (!suppressed_hdrs.cnt_type) {
4224 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4225
4226 p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4227 if (p != NULL) {
4228 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4229 mbfl_memory_device_strcat(&device, p);
4230 }
4231 mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4232 }
4233 if (!suppressed_hdrs.cnt_trans_enc) {
4234 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4235 p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4236 if (p == NULL) {
4237 p = "7bit";
4238 }
4239 mbfl_memory_device_strcat(&device, p);
4240 mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4241 }
4242
4243 if (!PG(mail_mixed_lf_and_crlf)) {
4244 mbfl_memory_device_unput(&device);
4245 }
4246 mbfl_memory_device_unput(&device);
4247 mbfl_memory_device_output('\0', &device);
4248 str_headers = zend_string_init((char *)device.buffer, strlen((char *)device.buffer), 0);
4249
4250 if (force_extra_parameters) {
4251 extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4252 } else if (extra_cmd) {
4253 extra_cmd = php_escape_shell_cmd(ZSTR_VAL(extra_cmd));
4254 }
4255
4256 if (!err && php_mail(to_r, subject, message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL)) {
4257 RETVAL_TRUE;
4258 } else {
4259 RETVAL_FALSE;
4260 }
4261
4262 if (extra_cmd) {
4263 zend_string_release_ex(extra_cmd, 0);
4264 }
4265
4266 if (to_r != to) {
4267 efree(to_r);
4268 }
4269 if (subject_buf) {
4270 efree((void *)subject_buf);
4271 }
4272 if (message_buf) {
4273 efree((void *)message_buf);
4274 }
4275 mbfl_memory_device_clear(&device);
4276 zend_hash_destroy(&ht_headers);
4277 if (str_headers) {
4278 zend_string_release_ex(str_headers, 0);
4279 }
4280 }
4281
4282 #undef CRLF
4283 #undef MAIL_ASCIIZ_CHECK_MBSTRING
4284 #undef PHP_MBSTR_MAIL_MIME_HEADER1
4285 #undef PHP_MBSTR_MAIL_MIME_HEADER2
4286 #undef PHP_MBSTR_MAIL_MIME_HEADER3
4287 #undef PHP_MBSTR_MAIL_MIME_HEADER4
4288 /* }}} */
4289
4290 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)4291 PHP_FUNCTION(mb_get_info)
4292 {
4293 zend_string *type = NULL;
4294 size_t n;
4295 char *name;
4296 zval row;
4297 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
4298 const mbfl_encoding **entry;
4299
4300 ZEND_PARSE_PARAMETERS_START(0, 1)
4301 Z_PARAM_OPTIONAL
4302 Z_PARAM_STR(type)
4303 ZEND_PARSE_PARAMETERS_END();
4304
4305 if (!type || zend_string_equals_literal_ci(type, "all")) {
4306 array_init(return_value);
4307 if (MBSTRG(current_internal_encoding)) {
4308 add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4309 }
4310 if (MBSTRG(http_input_identify)) {
4311 add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4312 }
4313 if (MBSTRG(current_http_output_encoding)) {
4314 add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4315 }
4316 if ((name = (char *)zend_ini_string("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)) != NULL) {
4317 add_assoc_string(return_value, "http_output_conv_mimetypes", name);
4318 }
4319 if (lang != NULL) {
4320 if ((name = (char *)mbfl_no_encoding2name(lang->mail_charset)) != NULL) {
4321 add_assoc_string(return_value, "mail_charset", name);
4322 }
4323 if ((name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding)) != NULL) {
4324 add_assoc_string(return_value, "mail_header_encoding", name);
4325 }
4326 if ((name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding)) != NULL) {
4327 add_assoc_string(return_value, "mail_body_encoding", name);
4328 }
4329 }
4330 add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4331 if (MBSTRG(encoding_translation)) {
4332 add_assoc_string(return_value, "encoding_translation", "On");
4333 } else {
4334 add_assoc_string(return_value, "encoding_translation", "Off");
4335 }
4336 if ((name = (char *)mbfl_no_language2name(MBSTRG(language))) != NULL) {
4337 add_assoc_string(return_value, "language", name);
4338 }
4339 n = MBSTRG(current_detect_order_list_size);
4340 entry = MBSTRG(current_detect_order_list);
4341 if (n > 0) {
4342 size_t i;
4343 array_init(&row);
4344 for (i = 0; i < n; i++) {
4345 add_next_index_string(&row, (*entry)->name);
4346 entry++;
4347 }
4348 add_assoc_zval(return_value, "detect_order", &row);
4349 }
4350 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4351 add_assoc_string(return_value, "substitute_character", "none");
4352 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4353 add_assoc_string(return_value, "substitute_character", "long");
4354 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4355 add_assoc_string(return_value, "substitute_character", "entity");
4356 } else {
4357 add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4358 }
4359 if (MBSTRG(strict_detection)) {
4360 add_assoc_string(return_value, "strict_detection", "On");
4361 } else {
4362 add_assoc_string(return_value, "strict_detection", "Off");
4363 }
4364 } else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4365 if (MBSTRG(current_internal_encoding)) {
4366 RETVAL_STRING((char *)MBSTRG(current_internal_encoding)->name);
4367 }
4368 } else if (zend_string_equals_literal_ci(type, "http_input")) {
4369 if (MBSTRG(http_input_identify)) {
4370 RETVAL_STRING((char *)MBSTRG(http_input_identify)->name);
4371 }
4372 } else if (zend_string_equals_literal_ci(type, "http_output")) {
4373 if (MBSTRG(current_http_output_encoding)) {
4374 RETVAL_STRING((char *)MBSTRG(current_http_output_encoding)->name);
4375 }
4376 } else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4377 if ((name = (char *)zend_ini_string("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)) != NULL) {
4378 RETVAL_STRING(name);
4379 }
4380 } else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4381 if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_charset)) != NULL) {
4382 RETVAL_STRING(name);
4383 }
4384 } else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4385 if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding)) != NULL) {
4386 RETVAL_STRING(name);
4387 }
4388 } else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4389 if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding)) != NULL) {
4390 RETVAL_STRING(name);
4391 }
4392 } else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4393 RETVAL_LONG(MBSTRG(illegalchars));
4394 } else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4395 if (MBSTRG(encoding_translation)) {
4396 RETVAL_STRING("On");
4397 } else {
4398 RETVAL_STRING("Off");
4399 }
4400 } else if (zend_string_equals_literal_ci(type, "language")) {
4401 if ((name = (char *)mbfl_no_language2name(MBSTRG(language))) != NULL) {
4402 RETVAL_STRING(name);
4403 }
4404 } else if (zend_string_equals_literal_ci(type, "detect_order")) {
4405 n = MBSTRG(current_detect_order_list_size);
4406 entry = MBSTRG(current_detect_order_list);
4407 if (n > 0) {
4408 size_t i;
4409 array_init(return_value);
4410 for (i = 0; i < n; i++) {
4411 add_next_index_string(return_value, (*entry)->name);
4412 entry++;
4413 }
4414 }
4415 } else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4416 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4417 RETVAL_STRING("none");
4418 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4419 RETVAL_STRING("long");
4420 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4421 RETVAL_STRING("entity");
4422 } else {
4423 RETVAL_LONG(MBSTRG(current_filter_illegal_substchar));
4424 }
4425 } else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4426 if (MBSTRG(strict_detection)) {
4427 RETVAL_STRING("On");
4428 } else {
4429 RETVAL_STRING("Off");
4430 }
4431 } else {
4432 // TODO Convert to ValueError
4433 RETURN_FALSE;
4434 }
4435 }
4436 /* }}} */
4437
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)4438 MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4439 {
4440 uint32_t wchar_buf[128];
4441 unsigned char *in = (unsigned char*)input;
4442 unsigned int state = 0;
4443
4444 if (encoding->check != NULL) {
4445 return encoding->check(in, length);
4446 }
4447
4448 /* If the input string is not encoded in the given encoding, there is a significant chance
4449 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4450 * buffer of 128 codepoints, convert and check just a few codepoints first */
4451 size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4452 ZEND_ASSERT(out_len <= 8);
4453 for (int i = 0; i < out_len; i++) {
4454 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4455 return 0;
4456 }
4457 }
4458
4459 while (length) {
4460 out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4461 ZEND_ASSERT(out_len <= 128);
4462 for (int i = 0; i < out_len; i++) {
4463 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4464 return 0;
4465 }
4466 }
4467 }
4468
4469 return 1;
4470 }
4471
/* Check every string key and every value of `vars` against `encoding`,
 * descending into nested arrays.
 *
 * Returns 1 when everything is valid, 0 otherwise. Strings and array keys are
 * validated with php_mb_check_encoding(); integers, floats, null and booleans are
 * accepted as they are; any other value type makes the result invalid.
 * An array which is already being visited produces a warning and counts as
 * invalid.
 *
 * The scan stops at the first invalid key or value, since the result can no
 * longer change after that. */
static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	zend_long idx;
	zend_string *key;
	zval *entry;
	int valid = 1;

	(void)(idx); /* Suppress spurious compiler warning that `idx` is not used */

	if (GC_IS_RECURSIVE(vars)) {
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return 0;
	}
	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
		ZVAL_DEREF(entry);
		if (key && !php_mb_check_encoding(ZSTR_VAL(key), ZSTR_LEN(key), encoding)) {
			valid = 0;
			break;
		}
		switch (Z_TYPE_P(entry)) {
			case IS_STRING:
				if (!php_mb_check_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), encoding)) {
					valid = 0;
				}
				break;
			case IS_ARRAY:
				if (!php_mb_check_encoding_recursive(Z_ARRVAL_P(entry), encoding)) {
					valid = 0;
				}
				break;
			case IS_LONG:
			case IS_DOUBLE:
			case IS_NULL:
			case IS_TRUE:
			case IS_FALSE:
				break;
			default:
				/* Other types are error. */
				valid = 0;
				break;
		}
		if (!valid) {
			/* A `break` inside the switch only leaves the switch; leave the
			 * loop here. The recursion guard is still released below. */
			break;
		}
	} ZEND_HASH_FOREACH_END();
	GC_TRY_UNPROTECT_RECURSION(vars);
	return valid;
}
4522
4523 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)4524 PHP_FUNCTION(mb_check_encoding)
4525 {
4526 zend_string *input_str = NULL, *enc = NULL;
4527 HashTable *input_ht = NULL;
4528 const mbfl_encoding *encoding;
4529
4530 ZEND_PARSE_PARAMETERS_START(0, 2)
4531 Z_PARAM_OPTIONAL
4532 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
4533 Z_PARAM_STR_OR_NULL(enc)
4534 ZEND_PARSE_PARAMETERS_END();
4535
4536 encoding = php_mb_get_encoding(enc, 2);
4537 if (!encoding) {
4538 RETURN_THROWS();
4539 }
4540
4541 if (input_ht) {
4542 RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
4543 } else if (input_str) {
4544 RETURN_BOOL(php_mb_check_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), encoding));
4545 } else {
4546 php_error_docref(NULL, E_DEPRECATED,
4547 "Calling mb_check_encoding() without argument is deprecated");
4548
4549 /* FIXME: Actually check all inputs, except $_FILES file content. */
4550 RETURN_BOOL(MBSTRG(illegalchars) == 0);
4551 }
4552 }
4553 /* }}} */
4554
4555
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)4556 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
4557 const uint32_t enc_name_arg_num)
4558 {
4559 const mbfl_encoding *enc;
4560 enum mbfl_no_encoding no_enc;
4561
4562 ZEND_ASSERT(str_len > 0);
4563
4564 enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
4565 if (!enc) {
4566 return -2;
4567 }
4568
4569 no_enc = enc->no_encoding;
4570 if (php_mb_is_unsupported_no_encoding(no_enc)) {
4571 zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
4572 return -2;
4573 }
4574
4575 /* Some legacy text encodings have a minimum required wchar buffer size;
4576 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
4577 uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
4578 unsigned int state = 0;
4579 size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
4580 ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
4581
4582 if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
4583 return -1;
4584 }
4585 return wchar_buf[0];
4586 }
4587
4588
4589 /* {{{ */
PHP_FUNCTION(mb_ord)4590 PHP_FUNCTION(mb_ord)
4591 {
4592 char *str;
4593 size_t str_len;
4594 zend_string *enc = NULL;
4595 zend_long cp;
4596
4597 ZEND_PARSE_PARAMETERS_START(1, 2)
4598 Z_PARAM_STRING(str, str_len)
4599 Z_PARAM_OPTIONAL
4600 Z_PARAM_STR_OR_NULL(enc)
4601 ZEND_PARSE_PARAMETERS_END();
4602
4603 if (str_len == 0) {
4604 zend_argument_value_error(1, "must not be empty");
4605 RETURN_THROWS();
4606 }
4607
4608 cp = php_mb_ord(str, str_len, enc, 2);
4609
4610 if (0 > cp) {
4611 if (cp == -2) {
4612 RETURN_THROWS();
4613 }
4614 RETURN_FALSE;
4615 }
4616
4617 RETURN_LONG(cp);
4618 }
4619 /* }}} */
4620
4621
/* Encode codepoint `cp` as a string in the encoding named by `enc_name`.
 *
 * Returns a new string, or NULL when the encoding lookup failed, the encoding is
 * unsupported (a ValueError is raised here), `cp` is outside 0..0x10FFFF, or the
 * codepoint cannot be represented. `enc_name_arg_num` is passed through to
 * php_mb_get_encoding().
 *
 * UTF-8 output is produced directly; every other encoding goes through a
 * conversion from a 4-byte big-endian UCS-4 buffer. */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (enc == NULL) {
		return NULL;
	}

	enum mbfl_no_encoding enc_id = enc->no_encoding;
	if (php_mb_is_unsupported_no_encoding(enc_id)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", enc->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	if (php_mb_is_no_encoding_utf8(enc_id)) {
		/* Surrogates have no UTF-8 representation */
		if (cp >= 0xd800 && cp <= 0xdfff) {
			return NULL;
		}

		if (cp < 0x80) {
			return ZSTR_CHAR(cp);
		}

		size_t nbytes = (cp < 0x800) ? 2 : ((cp < 0x10000) ? 3 : 4);
		zend_string *utf8 = zend_string_alloc(nbytes, 0);
		char *out = ZSTR_VAL(utf8);

		/* Lead byte, plus all continuation bytes except the last */
		if (nbytes == 2) {
			*out++ = 0xc0 | (cp >> 6);
		} else if (nbytes == 3) {
			*out++ = 0xe0 | (cp >> 12);
			*out++ = 0x80 | ((cp >> 6) & 0x3f);
		} else {
			*out++ = 0xf0 | (cp >> 18);
			*out++ = 0x80 | ((cp >> 12) & 0x3f);
			*out++ = 0x80 | ((cp >> 6) & 0x3f);
		}
		/* Last continuation byte and terminator are the same for all lengths */
		*out++ = 0x80 | (cp & 0x3f);
		*out = 0;

		return utf8;
	}

	char ucs4be[4];
	ucs4be[0] = (cp >> 24) & 0xff;
	ucs4be[1] = (cp >> 16) & 0xff;
	ucs4be[2] = (cp >> 8) & 0xff;
	ucs4be[3] = cp & 0xff;

	/* Zero the global illegal-character counter around the conversion so that a
	 * non-zero count afterwards can only come from this codepoint; the previous
	 * count is put back before returning */
	long saved_illegalchars = MBSTRG(illegalchars);
	MBSTRG(illegalchars) = 0;

	zend_string *converted = php_mb_convert_encoding_ex(ucs4be, 4, enc, &mbfl_encoding_ucs4be);
	if (MBSTRG(illegalchars) != 0) {
		zend_string_release(converted);
		converted = NULL;
	}

	MBSTRG(illegalchars) = saved_illegalchars;
	return converted;
}
4691
4692
4693 /* {{{ */
PHP_FUNCTION(mb_chr)4694 PHP_FUNCTION(mb_chr)
4695 {
4696 zend_long cp;
4697 zend_string *enc = NULL;
4698
4699 ZEND_PARSE_PARAMETERS_START(1, 2)
4700 Z_PARAM_LONG(cp)
4701 Z_PARAM_OPTIONAL
4702 Z_PARAM_STR_OR_NULL(enc)
4703 ZEND_PARSE_PARAMETERS_END();
4704
4705 zend_string* ret = php_mb_chr(cp, enc, 2);
4706 if (ret == NULL) {
4707 RETURN_FALSE;
4708 }
4709
4710 RETURN_STR(ret);
4711 }
4712 /* }}} */
4713
4714 /* {{{ */
PHP_FUNCTION(mb_scrub)4715 PHP_FUNCTION(mb_scrub)
4716 {
4717 char* str;
4718 size_t str_len;
4719 zend_string *enc_name = NULL;
4720
4721 ZEND_PARSE_PARAMETERS_START(1, 2)
4722 Z_PARAM_STRING(str, str_len)
4723 Z_PARAM_OPTIONAL
4724 Z_PARAM_STR_OR_NULL(enc_name)
4725 ZEND_PARSE_PARAMETERS_END();
4726
4727 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
4728 if (!enc) {
4729 RETURN_THROWS();
4730 }
4731
4732 RETURN_STR(php_mb_convert_encoding_ex(str, str_len, enc, enc));
4733 }
4734 /* }}} */
4735
4736
4737 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)4738 static void php_mb_populate_current_detect_order_list(void)
4739 {
4740 const mbfl_encoding **entry = 0;
4741 size_t nentries;
4742
4743 if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
4744 nentries = MBSTRG(detect_order_list_size);
4745 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
4746 memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
4747 } else {
4748 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
4749 size_t i;
4750 nentries = MBSTRG(default_detect_order_list_size);
4751 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
4752 for (i = 0; i < nentries; i++) {
4753 entry[i] = mbfl_no2encoding(src[i]);
4754 }
4755 }
4756 MBSTRG(current_detect_order_list) = entry;
4757 MBSTRG(current_detect_order_list_size) = nentries;
4758 }
4759 /* }}} */
4760
4761 /* {{{ static int php_mb_encoding_translation() */
php_mb_encoding_translation(void)4762 static int php_mb_encoding_translation(void)
4763 {
4764 return MBSTRG(encoding_translation);
4765 }
4766 /* }}} */
4767
php_mb_mbchar_bytes(const char * s,const mbfl_encoding * enc)4768 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
4769 {
4770 if (enc) {
4771 if (enc->mblen_table) {
4772 if (s) {
4773 return enc->mblen_table[*(unsigned char *)s];
4774 }
4775 } else if (enc->flag & MBFL_ENCTYPE_WCS2) {
4776 return 2;
4777 } else if (enc->flag & MBFL_ENCTYPE_WCS4) {
4778 return 4;
4779 }
4780 }
4781 return 1;
4782 }
4783
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)4784 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
4785 {
4786 const char *p = s;
4787 char *last=NULL;
4788
4789 if (nbytes == (size_t)-1) {
4790 size_t nb = 0;
4791
4792 while (*p != '\0') {
4793 if (nb == 0) {
4794 if ((unsigned char)*p == (unsigned char)c) {
4795 last = (char *)p;
4796 }
4797 nb = php_mb_mbchar_bytes(p, enc);
4798 if (nb == 0) {
4799 return NULL; /* something is going wrong! */
4800 }
4801 }
4802 --nb;
4803 ++p;
4804 }
4805 } else {
4806 size_t bcnt = nbytes;
4807 size_t nbytes_char;
4808 while (bcnt > 0) {
4809 if ((unsigned char)*p == (unsigned char)c) {
4810 last = (char *)p;
4811 }
4812 nbytes_char = php_mb_mbchar_bytes(p, enc);
4813 if (bcnt < nbytes_char) {
4814 return NULL;
4815 }
4816 p += nbytes_char;
4817 bcnt -= nbytes_char;
4818 }
4819 }
4820 return last;
4821 }
4822
4823 /* {{{ MBSTRING_API int php_mb_stripos() */
php_mb_stripos(int mode,const char * old_haystack,size_t old_haystack_len,const char * old_needle,size_t old_needle_len,zend_long offset,const mbfl_encoding * enc)4824 MBSTRING_API size_t php_mb_stripos(int mode, const char *old_haystack, size_t old_haystack_len, const char *old_needle, size_t old_needle_len, zend_long offset, const mbfl_encoding *enc)
4825 {
4826 size_t n = (size_t) -1;
4827 mbfl_string haystack, needle;
4828
4829 mbfl_string_init_set(&haystack, enc);
4830 mbfl_string_init_set(&needle, enc);
4831
4832 do {
4833 /* We're using simple case-folding here, because we'd have to deal with remapping of
4834 * offsets otherwise. */
4835
4836 size_t len = 0;
4837 haystack.val = (unsigned char *)mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char *)old_haystack, old_haystack_len, &len, enc);
4838 haystack.len = len;
4839
4840 if (!haystack.val) {
4841 break;
4842 }
4843
4844 if (haystack.len == 0) {
4845 break;
4846 }
4847
4848 needle.val = (unsigned char *)mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char *)old_needle, old_needle_len, &len, enc);
4849 needle.len = len;
4850
4851 if (!needle.val) {
4852 break;
4853 }
4854
4855 n = mbfl_strpos(&haystack, &needle, offset, mode);
4856 } while(0);
4857
4858 if (haystack.val) {
4859 efree(haystack.val);
4860 }
4861
4862 if (needle.val) {
4863 efree(needle.val);
4864 }
4865
4866 return n;
4867 }
4868 /* }}} */
4869
php_mb_gpc_get_detect_order(const zend_encoding *** list,size_t * list_size)4870 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
4871 {
4872 *list = (const zend_encoding **)MBSTRG(http_input_list);
4873 *list_size = MBSTRG(http_input_list_size);
4874 }
4875 /* }}} */
4876
php_mb_gpc_set_input_encoding(const zend_encoding * encoding)4877 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
4878 {
4879 MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
4880 }
4881 /* }}} */
4882