1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
14 | Rui Hirokawa <hirokawa@php.net> |
15 | Hironori Sato <satoh@jpnnet.com> |
16 | Shigeru Kanemoto <sgk@happysize.co.jp> |
17 +----------------------------------------------------------------------+
18 */
19
20 /* {{{ includes */
21 #include "libmbfl/config.h"
22 #include "php.h"
23 #include "php_ini.h"
24 #include "php_variables.h"
25 #include "mbstring.h"
26 #include "ext/standard/php_string.h"
27 #include "ext/standard/php_mail.h"
28 #include "ext/standard/exec.h"
29 #include "ext/standard/url.h"
30 #include "main/php_output.h"
31 #include "ext/standard/info.h"
32 #include "ext/pcre/php_pcre.h"
33
34 #include "libmbfl/mbfl/mbfilter_8bit.h"
35 #include "libmbfl/mbfl/mbfilter_pass.h"
36 #include "libmbfl/mbfl/mbfilter_wchar.h"
37 #include "libmbfl/mbfl/eaw_table.h"
38 #include "libmbfl/filters/mbfilter_base64.h"
39 #include "libmbfl/filters/mbfilter_qprint.h"
40 #include "libmbfl/filters/mbfilter_htmlent.h"
41 #include "libmbfl/filters/mbfilter_uuencode.h"
42 #include "libmbfl/filters/mbfilter_ucs4.h"
43 #include "libmbfl/filters/mbfilter_utf8.h"
44 #include "libmbfl/filters/mbfilter_singlebyte.h"
45 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
46
47 #include "php_variables.h"
48 #include "php_globals.h"
49 #include "rfc1867.h"
50 #include "php_content_types.h"
51 #include "SAPI.h"
52 #include "php_unicode.h"
53 #include "TSRM.h"
54
55 #include "mb_gpc.h"
56
57 #ifdef HAVE_MBREGEX
58 # include "php_mbregex.h"
59 #endif
60
61 #include "zend_multibyte.h"
62 #include "mbstring_arginfo.h"
63 /* }}} */
64
65 /* {{{ prototypes */
66 ZEND_DECLARE_MODULE_GLOBALS(mbstring)
67
68 static PHP_GINIT_FUNCTION(mbstring);
69 static PHP_GSHUTDOWN_FUNCTION(mbstring);
70
71 static void php_mb_populate_current_detect_order_list(void);
72
73 static int php_mb_encoding_translation(void);
74
75 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);
76
77 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
78
79 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
80
81 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
82
83 /* See mbfilter_cp5022x.c */
84 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
85 /* }}} */
86
87 /* {{{ php_mb_default_identify_list */
88 typedef struct _php_mb_nls_ident_list {
89 enum mbfl_no_language lang;
90 const enum mbfl_no_encoding *list;
91 size_t list_size;
92 } php_mb_nls_ident_list;
93
94 static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
95 mbfl_no_encoding_ascii,
96 mbfl_no_encoding_jis,
97 mbfl_no_encoding_utf8,
98 mbfl_no_encoding_euc_jp,
99 mbfl_no_encoding_sjis
100 };
101
102 static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
103 mbfl_no_encoding_ascii,
104 mbfl_no_encoding_utf8,
105 mbfl_no_encoding_euc_cn,
106 mbfl_no_encoding_cp936
107 };
108
109 static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
110 mbfl_no_encoding_ascii,
111 mbfl_no_encoding_utf8,
112 mbfl_no_encoding_euc_tw,
113 mbfl_no_encoding_big5
114 };
115
116 static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
117 mbfl_no_encoding_ascii,
118 mbfl_no_encoding_utf8,
119 mbfl_no_encoding_euc_kr,
120 mbfl_no_encoding_uhc
121 };
122
123 static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
124 mbfl_no_encoding_ascii,
125 mbfl_no_encoding_utf8,
126 mbfl_no_encoding_koi8r,
127 mbfl_no_encoding_cp1251,
128 mbfl_no_encoding_cp866
129 };
130
131 static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
132 mbfl_no_encoding_ascii,
133 mbfl_no_encoding_utf8,
134 mbfl_no_encoding_armscii8
135 };
136
137 static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
138 mbfl_no_encoding_ascii,
139 mbfl_no_encoding_utf8,
140 mbfl_no_encoding_cp1254,
141 mbfl_no_encoding_8859_9
142 };
143
144 static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
145 mbfl_no_encoding_ascii,
146 mbfl_no_encoding_utf8,
147 mbfl_no_encoding_koi8u
148 };
149
150 static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
151 mbfl_no_encoding_ascii,
152 mbfl_no_encoding_utf8
153 };
154
155
156 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
157 { mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
158 { mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
159 { mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
160 { mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
161 { mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
162 { mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
163 { mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
164 { mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
165 { mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
166 };
167
168 /* }}} */
169
170 /* {{{ mbstring_deps[] */
171 static const zend_module_dep mbstring_deps[] = {
172 ZEND_MOD_REQUIRED("pcre")
173 ZEND_MOD_END
174 };
175 /* }}} */
176
177 /* {{{ zend_module_entry mbstring_module_entry */
178 zend_module_entry mbstring_module_entry = {
179 STANDARD_MODULE_HEADER_EX,
180 NULL,
181 mbstring_deps,
182 "mbstring",
183 ext_functions,
184 PHP_MINIT(mbstring),
185 PHP_MSHUTDOWN(mbstring),
186 PHP_RINIT(mbstring),
187 PHP_RSHUTDOWN(mbstring),
188 PHP_MINFO(mbstring),
189 PHP_MBSTRING_VERSION,
190 PHP_MODULE_GLOBALS(mbstring),
191 PHP_GINIT(mbstring),
192 PHP_GSHUTDOWN(mbstring),
193 NULL,
194 STANDARD_MODULE_PROPERTIES_EX
195 };
196 /* }}} */
197
198 /* {{{ static sapi_post_entry php_post_entries[] */
199 static const sapi_post_entry php_post_entries[] = {
200 { DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_std_post_handler },
201 { MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
202 { NULL, 0, NULL, NULL }
203 };
204 /* }}} */
205
/* Only emitted when COMPILE_DL_MBSTRING is defined. */
#ifdef COMPILE_DL_MBSTRING
# ifdef ZTS
ZEND_TSRMLS_CACHE_DEFINE()
# endif
ZEND_GET_MODULE(mbstring)
#endif
212
213 /* {{{ static sapi_post_entry mbstr_post_entries[] */
214 static const sapi_post_entry mbstr_post_entries[] = {
215 { DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
216 { MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
217 { NULL, 0, NULL, NULL }
218 };
219 /* }}} */
220
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)221 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
222 if (encoding_name) {
223 const mbfl_encoding *encoding;
224 zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
225 if (last_encoding_name && (last_encoding_name == encoding_name
226 || zend_string_equals_ci(encoding_name, last_encoding_name))) {
227 return MBSTRG(last_used_encoding);
228 }
229
230 encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
231 if (!encoding) {
232 zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
233 return NULL;
234 } else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
235 if (encoding == &mbfl_encoding_base64) {
236 php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
237 } else if (encoding == &mbfl_encoding_qprint) {
238 php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
239 } else if (encoding == &mbfl_encoding_html_ent) {
240 php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
241 } else if (encoding == &mbfl_encoding_uuencode) {
242 php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
243 }
244 }
245
246 if (last_encoding_name) {
247 zend_string_release(last_encoding_name);
248 }
249 MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
250 MBSTRG(last_used_encoding) = encoding;
251 return encoding;
252 } else {
253 return MBSTRG(current_internal_encoding);
254 }
255 }
256
php_mb_get_encoding_or_pass(const char * encoding_name)257 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name) {
258 if (strcmp(encoding_name, "pass") == 0) {
259 return &mbfl_encoding_pass;
260 }
261
262 return mbfl_name2encoding(encoding_name);
263 }
264
count_commas(const char * p,const char * end)265 static size_t count_commas(const char *p, const char *end) {
266 size_t count = 0;
267 while ((p = memchr(p, ',', end - p))) {
268 count++;
269 p++;
270 }
271 return count;
272 }
273
274 /* {{{ static zend_result php_mb_parse_encoding_list()
275 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
276 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
277 */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num,bool allow_pass_encoding)278 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
279 const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num,
280 bool allow_pass_encoding)
281 {
282 if (value == NULL || value_length == 0) {
283 *return_list = NULL;
284 *return_size = 0;
285 return SUCCESS;
286 } else {
287 bool included_auto;
288 size_t n, size;
289 char *p1, *endp, *tmpstr;
290 const mbfl_encoding **entry, **list;
291
292 /* copy the value string for work */
293 if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
294 tmpstr = (char *)estrndup(value+1, value_length-2);
295 value_length -= 2;
296 } else {
297 tmpstr = (char *)estrndup(value, value_length);
298 }
299
300 endp = tmpstr + value_length;
301 size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
302 list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
303 entry = list;
304 n = 0;
305 included_auto = 0;
306 p1 = tmpstr;
307 while (1) {
308 char *comma = memchr(p1, ',', endp - p1);
309 char *p = comma ? comma : endp;
310 *p = '\0';
311 /* trim spaces */
312 while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
313 p1++;
314 }
315 p--;
316 while (p > p1 && (*p == ' ' || *p == '\t')) {
317 *p = '\0';
318 p--;
319 }
320 /* convert to the encoding number and check encoding */
321 if (strcasecmp(p1, "auto") == 0) {
322 if (!included_auto) {
323 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
324 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
325 size_t i;
326 included_auto = 1;
327 for (i = 0; i < identify_list_size; i++) {
328 *entry++ = mbfl_no2encoding(*src++);
329 n++;
330 }
331 }
332 } else {
333 const mbfl_encoding *encoding =
334 allow_pass_encoding ? php_mb_get_encoding_or_pass(p1) : mbfl_name2encoding(p1);
335 if (!encoding) {
336 /* Called from an INI setting modification */
337 if (arg_num == 0) {
338 php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%s\"", p1);
339 } else {
340 zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", p1);
341 }
342 efree(tmpstr);
343 pefree(ZEND_VOIDP(list), persistent);
344 return FAILURE;
345 }
346
347 *entry++ = encoding;
348 n++;
349 }
350 if (n >= size || comma == NULL) {
351 break;
352 }
353 p1 = comma + 1;
354 }
355 *return_list = list;
356 *return_size = n;
357 efree(tmpstr);
358 }
359
360 return SUCCESS;
361 }
362 /* }}} */
363
364 /* {{{ static int php_mb_parse_encoding_array()
365 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
366 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
367 */
php_mb_parse_encoding_array(HashTable * target_hash,const mbfl_encoding *** return_list,size_t * return_size,uint32_t arg_num)368 static int php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
369 size_t *return_size, uint32_t arg_num)
370 {
371 /* Allocate enough space to include the default detect order if "auto" is used. */
372 size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
373 const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
374 const mbfl_encoding **entry = list;
375 bool included_auto = 0;
376 size_t n = 0;
377 zval *hash_entry;
378 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
379 zend_string *encoding_str = zval_try_get_string(hash_entry);
380 if (UNEXPECTED(!encoding_str)) {
381 efree(ZEND_VOIDP(list));
382 return FAILURE;
383 }
384
385 if (zend_string_equals_literal_ci(encoding_str, "auto")) {
386 if (!included_auto) {
387 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
388 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
389 size_t j;
390
391 included_auto = 1;
392 for (j = 0; j < identify_list_size; j++) {
393 *entry++ = mbfl_no2encoding(*src++);
394 n++;
395 }
396 }
397 } else {
398 const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
399 if (encoding) {
400 *entry++ = encoding;
401 n++;
402 } else {
403 zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
404 zend_string_release(encoding_str);
405 efree(ZEND_VOIDP(list));
406 return FAILURE;
407 }
408 }
409 zend_string_release(encoding_str);
410 } ZEND_HASH_FOREACH_END();
411 *return_list = list;
412 *return_size = n;
413 return SUCCESS;
414 }
415 /* }}} */
416
417 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)418 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
419 {
420 return (const zend_encoding*)mbfl_name2encoding(encoding_name);
421 }
422
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)423 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
424 {
425 return ((const mbfl_encoding *)encoding)->name;
426 }
427
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)428 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
429 {
430 const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
431 return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
432 }
433
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)434 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
435 {
436 mbfl_string string;
437
438 if (!list) {
439 list = (const zend_encoding **)MBSTRG(current_detect_order_list);
440 list_size = MBSTRG(current_detect_order_list_size);
441 }
442
443 mbfl_string_init(&string);
444 string.val = (unsigned char *)arg_string;
445 string.len = arg_length;
446 return (const zend_encoding *) mbfl_identify_encoding(&string, (const mbfl_encoding **)list, list_size, 0);
447 }
448
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)449 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
450 {
451 mbfl_string string, result;
452 mbfl_buffer_converter *convd;
453
454 /* new encoding */
455 /* initialize string */
456 string.encoding = (const mbfl_encoding*)encoding_from;
457 string.val = (unsigned char*)from;
458 string.len = from_length;
459
460 /* initialize converter */
461 convd = mbfl_buffer_converter_new((const mbfl_encoding *)encoding_from, (const mbfl_encoding *)encoding_to, string.len);
462 if (convd == NULL) {
463 return (size_t) -1;
464 }
465
466 mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
467 mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
468
469 /* do it */
470 size_t loc = mbfl_buffer_converter_feed(convd, &string);
471
472 mbfl_buffer_converter_flush(convd);
473 mbfl_string_init(&result);
474 if (!mbfl_buffer_converter_result(convd, &result)) {
475 mbfl_buffer_converter_delete(convd);
476 return (size_t)-1;
477 }
478
479 *to = result.val;
480 *to_length = result.len;
481
482 mbfl_buffer_converter_delete(convd);
483
484 return loc;
485 }
486
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)487 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
488 {
489 return php_mb_parse_encoding_list(
490 encoding_list, encoding_list_len,
491 (const mbfl_encoding ***)return_list, return_size,
492 persistent, /* arg_num */ 0, /* allow_pass_encoding */ 1);
493 }
494
php_mb_zend_internal_encoding_getter(void)495 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
496 {
497 return (const zend_encoding *)MBSTRG(internal_encoding);
498 }
499
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)500 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
501 {
502 MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
503 return SUCCESS;
504 }
505
506 static zend_multibyte_functions php_mb_zend_multibyte_functions = {
507 "mbstring",
508 php_mb_zend_encoding_fetcher,
509 php_mb_zend_encoding_name_getter,
510 php_mb_zend_encoding_lexer_compatibility_checker,
511 php_mb_zend_encoding_detector,
512 php_mb_zend_encoding_converter,
513 php_mb_zend_encoding_list_parser,
514 php_mb_zend_internal_encoding_getter,
515 php_mb_zend_internal_encoding_setter
516 };
517 /* }}} */
518
519 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)520 static void *_php_mb_compile_regex(const char *pattern)
521 {
522 pcre2_code *retval;
523 PCRE2_SIZE err_offset;
524 int errnum;
525
526 if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
527 PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
528 PCRE2_UCHAR err_str[128];
529 pcre2_get_error_message(errnum, err_str, sizeof(err_str));
530 php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
531 }
532 return retval;
533 }
534 /* }}} */
535
536 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)537 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
538 {
539 int res;
540
541 pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
542 if (NULL == match_data) {
543 pcre2_code_free(opaque);
544 php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
545 return FAILURE;
546 }
547 res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
548 php_pcre_free_match_data(match_data);
549
550 return res;
551 }
552 /* }}} */
553
554 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)555 static void _php_mb_free_regex(void *opaque)
556 {
557 pcre2_code_free(opaque);
558 }
559 /* }}} */
560
561 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)562 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
563 {
564 size_t i;
565
566 *plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
567 *plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
568
569 for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
570 if (php_mb_default_identify_list[i].lang == lang) {
571 *plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
572 *plist_size = php_mb_default_identify_list[i].list_size;
573 return 1;
574 }
575 }
576 return 0;
577 }
578 /* }}} */
579
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)580 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
581 {
582 char *result = emalloc(len + 2);
583 char *resp = result;
584 size_t i;
585
586 for (i = 0; i < len && start[i] != quote; ++i) {
587 if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
588 *resp++ = start[++i];
589 } else {
590 size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
591
592 while (j-- > 0 && i < len) {
593 *resp++ = start[i++];
594 }
595 --i;
596 }
597 }
598
599 *resp = '\0';
600 return result;
601 }
602
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)603 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
604 {
605 char *pos = *line, quote;
606 char *res;
607
608 while (*pos && *pos != stop) {
609 if ((quote = *pos) == '"' || quote == '\'') {
610 ++pos;
611 while (*pos && *pos != quote) {
612 if (*pos == '\\' && pos[1] && pos[1] == quote) {
613 pos += 2;
614 } else {
615 ++pos;
616 }
617 }
618 if (*pos) {
619 ++pos;
620 }
621 } else {
622 pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
623
624 }
625 }
626 if (*pos == '\0') {
627 res = estrdup(*line);
628 *line += strlen(*line);
629 return res;
630 }
631
632 res = estrndup(*line, pos - *line);
633
634 while (*pos == stop) {
635 pos += php_mb_mbchar_bytes(pos, (const mbfl_encoding *)encoding);
636 }
637
638 *line = pos;
639 return res;
640 }
641 /* }}} */
642
/* Extract one configuration-style word from str into a new string.
 *
 * Leading whitespace is skipped. If the word starts with ' or ", the result is
 * the text up to the matching unescaped quote; otherwise it is the text up to
 * the next whitespace byte. Escapes are processed by
 * php_mb_rfc1867_substring_conf(). Returns "" if only whitespace remains. */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	for (; *str && isspace(*(unsigned char *)str); ++str) {
		/* skip leading whitespace */
	}

	if (*str == '\0') {
		return estrdup("");
	}

	const char first = *str;
	if (first == '"' || first == '\'') {
		char *quoted = str + 1;

		return php_mb_rfc1867_substring_conf(encoding, quoted, strlen(quoted), first);
	}

	size_t word_len = 0;
	while (str[word_len] && !isspace((unsigned char)str[word_len])) {
		++word_len;
	}
	return php_mb_rfc1867_substring_conf(encoding, str, word_len, 0);
}
/* }}} */
668
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)669 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
670 {
671 char *s, *s2;
672 const size_t filename_len = strlen(filename);
673
674 /* The \ check should technically be needed for win32 systems only where
675 * it is a valid path separator. However, IE in all it's wisdom always sends
676 * the full path of the file on the user's filesystem, which means that unless
677 * the user does basename() they get a bogus file name. Until IE's user base drops
678 * to nill or problem is fixed this code must remain enabled for all systems. */
679 s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
680 s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
681
682 if (s && s2) {
683 if (s > s2) {
684 return ++s;
685 } else {
686 return ++s2;
687 }
688 } else if (s) {
689 return ++s;
690 } else if (s2) {
691 return ++s2;
692 } else {
693 return filename;
694 }
695 }
696 /* }}} */
697
698 /* {{{ php.ini directive handler */
699 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)700 static PHP_INI_MH(OnUpdate_mbstring_language)
701 {
702 enum mbfl_no_language no_language;
703
704 no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
705 if (no_language == mbfl_no_language_invalid) {
706 MBSTRG(language) = mbfl_no_language_neutral;
707 return FAILURE;
708 }
709 MBSTRG(language) = no_language;
710 php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
711 return SUCCESS;
712 }
713 /* }}} */
714
715 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)716 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
717 {
718 const mbfl_encoding **list;
719 size_t size;
720
721 if (!new_value) {
722 if (MBSTRG(detect_order_list)) {
723 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
724 }
725 MBSTRG(detect_order_list) = NULL;
726 MBSTRG(detect_order_list_size) = 0;
727 return SUCCESS;
728 }
729
730 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0, /* allow_pass_encoding */ 0) || size == 0) {
731 return FAILURE;
732 }
733
734 if (MBSTRG(detect_order_list)) {
735 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
736 }
737 MBSTRG(detect_order_list) = list;
738 MBSTRG(detect_order_list_size) = size;
739 return SUCCESS;
740 }
741 /* }}} */
742
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)743 static int _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
744 const mbfl_encoding **list;
745 size_t size;
746 if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0, /* allow_pass_encoding */ 1) || size == 0) {
747 return FAILURE;
748 }
749 if (MBSTRG(http_input_list)) {
750 pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
751 }
752 MBSTRG(http_input_list) = list;
753 MBSTRG(http_input_list_size) = size;
754 return SUCCESS;
755 }
756
757 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)758 static PHP_INI_MH(OnUpdate_mbstring_http_input)
759 {
760 if (new_value) {
761 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
762 }
763
764 if (!new_value || !ZSTR_LEN(new_value)) {
765 const char *encoding = php_get_input_encoding();
766 MBSTRG(http_input_set) = 0;
767 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
768 return SUCCESS;
769 }
770
771 MBSTRG(http_input_set) = 1;
772 return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
773 }
774 /* }}} */
775
_php_mb_ini_mbstring_http_output_set(const char * new_value)776 static int _php_mb_ini_mbstring_http_output_set(const char *new_value) {
777 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value);
778 if (!encoding) {
779 return FAILURE;
780 }
781
782 MBSTRG(http_output_encoding) = encoding;
783 MBSTRG(current_http_output_encoding) = encoding;
784 return SUCCESS;
785 }
786
787 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)788 static PHP_INI_MH(OnUpdate_mbstring_http_output)
789 {
790 if (new_value) {
791 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
792 }
793
794 if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
795 MBSTRG(http_output_set) = 0;
796 _php_mb_ini_mbstring_http_output_set(php_get_output_encoding());
797 return SUCCESS;
798 }
799
800 MBSTRG(http_output_set) = 1;
801 return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value));
802 }
803 /* }}} */
804
805 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)806 static int _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
807 {
808 const mbfl_encoding *encoding;
809
810 if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
811 /* falls back to UTF-8 if an unknown encoding name is given */
812 if (new_value) {
813 php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
814 }
815 encoding = &mbfl_encoding_utf8;
816 }
817 MBSTRG(internal_encoding) = encoding;
818 MBSTRG(current_internal_encoding) = encoding;
819 #ifdef HAVE_MBREGEX
820 {
821 const char *enc_name = new_value;
822 if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
823 /* falls back to UTF-8 if an unknown encoding name is given */
824 enc_name = "UTF-8";
825 php_mb_regex_set_default_mbctype(enc_name);
826 }
827 php_mb_regex_set_mbctype(new_value);
828 }
829 #endif
830 return SUCCESS;
831 }
832 /* }}} */
833
834 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)835 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
836 {
837 if (new_value) {
838 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
839 }
840
841 if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
842 return FAILURE;
843 }
844
845 if (new_value && ZSTR_LEN(new_value)) {
846 MBSTRG(internal_encoding_set) = 1;
847 return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
848 } else {
849 const char *encoding = php_get_internal_encoding();
850 MBSTRG(internal_encoding_set) = 0;
851 return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
852 }
853 }
854 /* }}} */
855
856 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)857 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
858 {
859 int c;
860 char *endptr = NULL;
861
862 if (new_value != NULL) {
863 if (zend_string_equals_literal_ci(new_value, "none")) {
864 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
865 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
866 } else if (zend_string_equals_literal_ci(new_value, "long")) {
867 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
868 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
869 } else if (zend_string_equals_literal_ci(new_value, "entity")) {
870 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
871 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
872 } else {
873 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
874 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
875 if (ZSTR_LEN(new_value) > 0) {
876 c = strtol(ZSTR_VAL(new_value), &endptr, 0);
877 if (*endptr == '\0') {
878 MBSTRG(filter_illegal_substchar) = c;
879 MBSTRG(current_filter_illegal_substchar) = c;
880 }
881 }
882 }
883 } else {
884 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
885 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
886 MBSTRG(filter_illegal_substchar) = 0x3f; /* '?' */
887 MBSTRG(current_filter_illegal_substchar) = 0x3f; /* '?' */
888 }
889
890 return SUCCESS;
891 }
892 /* }}} */
893
894 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)895 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
896 {
897 if (new_value == NULL) {
898 return FAILURE;
899 }
900
901 OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
902
903 if (MBSTRG(encoding_translation)) {
904 sapi_unregister_post_entry(php_post_entries);
905 sapi_register_post_entries(mbstr_post_entries);
906 } else {
907 sapi_unregister_post_entry(mbstr_post_entries);
908 sapi_register_post_entries(php_post_entries);
909 }
910
911 return SUCCESS;
912 }
913 /* }}} */
914
/* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes) */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)916 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
917 {
918 zend_string *tmp;
919 void *re = NULL;
920
921 if (!new_value) {
922 new_value = entry->orig_value;
923 }
924 tmp = php_trim(new_value, NULL, 0, 3);
925
926 if (ZSTR_LEN(tmp) > 0) {
927 if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
928 zend_string_release_ex(tmp, 0);
929 return FAILURE;
930 }
931 }
932
933 if (MBSTRG(http_output_conv_mimetypes)) {
934 _php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
935 }
936
937 MBSTRG(http_output_conv_mimetypes) = re;
938
939 zend_string_release_ex(tmp, 0);
940 return SUCCESS;
941 }
942 /* }}} */
943 /* }}} */
944
945 /* {{{ php.ini directive registration */
946 PHP_INI_BEGIN()
947 PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
948 PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
949 PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
950 PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
951 STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
952 PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
953
954 STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
955 PHP_INI_SYSTEM | PHP_INI_PERDIR,
956 OnUpdate_mbstring_encoding_translation,
957 encoding_translation, zend_mbstring_globals, mbstring_globals)
958 PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
959 "^(text/|application/xhtml\\+xml)",
960 PHP_INI_ALL,
961 OnUpdate_mbstring_http_output_conv_mimetypes)
962
963 STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
964 PHP_INI_ALL,
965 OnUpdateBool,
966 strict_detection, zend_mbstring_globals, mbstring_globals)
967 #ifdef HAVE_MBREGEX
968 STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
969 STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
970 #endif
PHP_INI_END()971 PHP_INI_END()
972 /* }}} */
973
974 static void mbstring_internal_encoding_changed_hook(void) {
975 /* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
976 if (!MBSTRG(internal_encoding_set)) {
977 const char *encoding = php_get_internal_encoding();
978 _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
979 }
980
981 if (!MBSTRG(http_output_set)) {
982 const char *encoding = php_get_output_encoding();
983 _php_mb_ini_mbstring_http_output_set(encoding);
984 }
985
986 if (!MBSTRG(http_input_set)) {
987 const char *encoding = php_get_input_encoding();
988 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
989 }
990 }
991
992 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)993 static PHP_GINIT_FUNCTION(mbstring)
994 {
995 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
996 ZEND_TSRMLS_CACHE_UPDATE();
997 #endif
998
999 mbstring_globals->language = mbfl_no_language_uni;
1000 mbstring_globals->internal_encoding = NULL;
1001 mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
1002 mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
1003 mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
1004 mbstring_globals->http_input_identify = NULL;
1005 mbstring_globals->http_input_identify_get = NULL;
1006 mbstring_globals->http_input_identify_post = NULL;
1007 mbstring_globals->http_input_identify_cookie = NULL;
1008 mbstring_globals->http_input_identify_string = NULL;
1009 mbstring_globals->http_input_list = NULL;
1010 mbstring_globals->http_input_list_size = 0;
1011 mbstring_globals->detect_order_list = NULL;
1012 mbstring_globals->detect_order_list_size = 0;
1013 mbstring_globals->current_detect_order_list = NULL;
1014 mbstring_globals->current_detect_order_list_size = 0;
1015 mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1016 mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1017 mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1018 mbstring_globals->filter_illegal_substchar = 0x3f; /* '?' */
1019 mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1020 mbstring_globals->current_filter_illegal_substchar = 0x3f; /* '?' */
1021 mbstring_globals->illegalchars = 0;
1022 mbstring_globals->encoding_translation = 0;
1023 mbstring_globals->strict_detection = 0;
1024 mbstring_globals->outconv = NULL;
1025 mbstring_globals->http_output_conv_mimetypes = NULL;
1026 #ifdef HAVE_MBREGEX
1027 mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1028 #endif
1029 mbstring_globals->last_used_encoding_name = NULL;
1030 mbstring_globals->last_used_encoding = NULL;
1031 mbstring_globals->internal_encoding_set = 0;
1032 mbstring_globals->http_output_set = 0;
1033 mbstring_globals->http_input_set = 0;
1034 }
1035 /* }}} */
1036
1037 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1038 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1039 {
1040 if (mbstring_globals->http_input_list) {
1041 free(ZEND_VOIDP(mbstring_globals->http_input_list));
1042 }
1043 if (mbstring_globals->detect_order_list) {
1044 free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1045 }
1046 if (mbstring_globals->http_output_conv_mimetypes) {
1047 _php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1048 }
1049 #ifdef HAVE_MBREGEX
1050 php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1051 #endif
1052 }
1053 /* }}} */
1054
1055 /* {{{ PHP_MINIT_FUNCTION(mbstring) */
PHP_MINIT_FUNCTION(mbstring)1056 PHP_MINIT_FUNCTION(mbstring)
1057 {
1058 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
1059 ZEND_TSRMLS_CACHE_UPDATE();
1060 #endif
1061
1062 REGISTER_INI_ENTRIES();
1063
1064 /* We assume that we're the only user of the hook. */
1065 ZEND_ASSERT(php_internal_encoding_changed == NULL);
1066 php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
1067 mbstring_internal_encoding_changed_hook();
1068
1069 /* This is a global handler. Should not be set in a per-request handler. */
1070 sapi_register_treat_data(mbstr_treat_data);
1071
1072 /* Post handlers are stored in the thread-local context. */
1073 if (MBSTRG(encoding_translation)) {
1074 sapi_register_post_entries(mbstr_post_entries);
1075 }
1076
1077 #ifdef HAVE_MBREGEX
1078 PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1079 #endif
1080
1081 register_mbstring_symbols(module_number);
1082
1083 if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
1084 return FAILURE;
1085 }
1086
1087 php_rfc1867_set_multibyte_callbacks(
1088 php_mb_encoding_translation,
1089 php_mb_gpc_get_detect_order,
1090 php_mb_gpc_set_input_encoding,
1091 php_mb_rfc1867_getword,
1092 php_mb_rfc1867_getword_conf,
1093 php_mb_rfc1867_basename);
1094
1095 return SUCCESS;
1096 }
1097 /* }}} */
1098
1099 /* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
/* Module shutdown: undoes what module startup set up, i.e. unregisters the
 * INI directives, restores the engine's multibyte functions, shuts down
 * mbregex when available and clears the internal-encoding change hook.
 * Always returns SUCCESS. */
PHP_MSHUTDOWN_FUNCTION(mbstring)
{
	UNREGISTER_INI_ENTRIES();

	zend_multibyte_restore_functions();

#ifdef HAVE_MBREGEX
	PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
#endif

	/* Release the hook so nothing calls into this module afterwards. */
	php_internal_encoding_changed = NULL;

	return SUCCESS;
}
1114 /* }}} */
1115
1116 /* {{{ PHP_RINIT_FUNCTION(mbstring) */
PHP_RINIT_FUNCTION(mbstring)1117 PHP_RINIT_FUNCTION(mbstring)
1118 {
1119 MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
1120 MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
1121 MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
1122 MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);
1123
1124 MBSTRG(illegalchars) = 0;
1125
1126 php_mb_populate_current_detect_order_list();
1127
1128 #ifdef HAVE_MBREGEX
1129 PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1130 #endif
1131 zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));
1132
1133 return SUCCESS;
1134 }
1135 /* }}} */
1136
1137 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1138 PHP_RSHUTDOWN_FUNCTION(mbstring)
1139 {
1140 if (MBSTRG(current_detect_order_list) != NULL) {
1141 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1142 MBSTRG(current_detect_order_list) = NULL;
1143 MBSTRG(current_detect_order_list_size) = 0;
1144 }
1145 if (MBSTRG(outconv) != NULL) {
1146 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1147 mbfl_buffer_converter_delete(MBSTRG(outconv));
1148 MBSTRG(outconv) = NULL;
1149 }
1150
1151 /* clear http input identification. */
1152 MBSTRG(http_input_identify) = NULL;
1153 MBSTRG(http_input_identify_post) = NULL;
1154 MBSTRG(http_input_identify_get) = NULL;
1155 MBSTRG(http_input_identify_cookie) = NULL;
1156 MBSTRG(http_input_identify_string) = NULL;
1157
1158 if (MBSTRG(last_used_encoding_name)) {
1159 zend_string_release(MBSTRG(last_used_encoding_name));
1160 MBSTRG(last_used_encoding_name) = NULL;
1161 }
1162
1163 MBSTRG(internal_encoding_set) = 0;
1164 MBSTRG(http_output_set) = 0;
1165 MBSTRG(http_input_set) = 0;
1166
1167 #ifdef HAVE_MBREGEX
1168 PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1169 #endif
1170
1171 return SUCCESS;
1172 }
1173 /* }}} */
1174
1175 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1176 PHP_MINFO_FUNCTION(mbstring)
1177 {
1178 php_info_print_table_start();
1179 php_info_print_table_row(2, "Multibyte Support", "enabled");
1180 php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1181 php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1182 {
1183 char tmp[256];
1184 snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1185 php_info_print_table_row(2, "libmbfl version", tmp);
1186 }
1187 php_info_print_table_end();
1188
1189 php_info_print_table_start();
1190 php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1191 php_info_print_table_end();
1192
1193 #ifdef HAVE_MBREGEX
1194 PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1195 #endif
1196
1197 DISPLAY_INI_ENTRIES();
1198 }
1199 /* }}} */
1200
1201 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1202 PHP_FUNCTION(mb_language)
1203 {
1204 zend_string *name = NULL;
1205
1206 ZEND_PARSE_PARAMETERS_START(0, 1)
1207 Z_PARAM_OPTIONAL
1208 Z_PARAM_STR_OR_NULL(name)
1209 ZEND_PARSE_PARAMETERS_END();
1210
1211 if (name == NULL) {
1212 RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1213 } else {
1214 zend_string *ini_name = zend_string_init("mbstring.language", sizeof("mbstring.language") - 1, 0);
1215 if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1216 zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1217 zend_string_release_ex(ini_name, 0);
1218 RETURN_THROWS();
1219 }
1220 // TODO Make return void
1221 RETVAL_TRUE;
1222 zend_string_release_ex(ini_name, 0);
1223 }
1224 }
1225 /* }}} */
1226
1227 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1228 PHP_FUNCTION(mb_internal_encoding)
1229 {
1230 char *name = NULL;
1231 size_t name_len;
1232 const mbfl_encoding *encoding;
1233
1234 ZEND_PARSE_PARAMETERS_START(0, 1)
1235 Z_PARAM_OPTIONAL
1236 Z_PARAM_STRING_OR_NULL(name, name_len)
1237 ZEND_PARSE_PARAMETERS_END();
1238
1239 if (name == NULL) {
1240 ZEND_ASSERT(MBSTRG(current_internal_encoding));
1241 RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1242 } else {
1243 encoding = mbfl_name2encoding(name);
1244 if (!encoding) {
1245 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1246 RETURN_THROWS();
1247 } else {
1248 MBSTRG(current_internal_encoding) = encoding;
1249 MBSTRG(internal_encoding_set) = 1;
1250 /* TODO Return old encoding */
1251 RETURN_TRUE;
1252 }
1253 }
1254 }
1255 /* }}} */
1256
1257 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1258 PHP_FUNCTION(mb_http_input)
1259 {
1260 char *type = NULL;
1261 size_t type_len = 0, n;
1262 const mbfl_encoding **entry;
1263 const mbfl_encoding *encoding;
1264
1265 ZEND_PARSE_PARAMETERS_START(0, 1)
1266 Z_PARAM_OPTIONAL
1267 Z_PARAM_STRING_OR_NULL(type, type_len)
1268 ZEND_PARSE_PARAMETERS_END();
1269
1270 if (type == NULL) {
1271 encoding = MBSTRG(http_input_identify);
1272 } else {
1273 switch (*type) {
1274 case 'G':
1275 case 'g':
1276 encoding = MBSTRG(http_input_identify_get);
1277 break;
1278 case 'P':
1279 case 'p':
1280 encoding = MBSTRG(http_input_identify_post);
1281 break;
1282 case 'C':
1283 case 'c':
1284 encoding = MBSTRG(http_input_identify_cookie);
1285 break;
1286 case 'S':
1287 case 's':
1288 encoding = MBSTRG(http_input_identify_string);
1289 break;
1290 case 'I':
1291 case 'i':
1292 entry = MBSTRG(http_input_list);
1293 n = MBSTRG(http_input_list_size);
1294 array_init(return_value);
1295 for (size_t i = 0; i < n; i++, entry++) {
1296 add_next_index_string(return_value, (*entry)->name);
1297 }
1298 return;
1299 case 'L':
1300 case 'l':
1301 entry = MBSTRG(http_input_list);
1302 n = MBSTRG(http_input_list_size);
1303 if (n == 0) {
1304 RETURN_FALSE;
1305 }
1306 // TODO Use smart_str instead.
1307 mbfl_string result;
1308 mbfl_memory_device device;
1309 mbfl_memory_device_init(&device, n * 12, 0);
1310 for (size_t i = 0; i < n; i++, entry++) {
1311 mbfl_memory_device_strcat(&device, (*entry)->name);
1312 mbfl_memory_device_output(',', &device);
1313 }
1314 mbfl_memory_device_unput(&device); /* Remove trailing comma */
1315 mbfl_memory_device_result(&device, &result);
1316 RETVAL_STRINGL((const char*)result.val, result.len);
1317 mbfl_string_clear(&result);
1318 return;
1319 default:
1320 zend_argument_value_error(1,
1321 "must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1322 RETURN_THROWS();
1323 }
1324 }
1325
1326 if (encoding) {
1327 RETURN_STRING(encoding->name);
1328 } else {
1329 RETURN_FALSE;
1330 }
1331 }
1332 /* }}} */
1333
1334 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1335 PHP_FUNCTION(mb_http_output)
1336 {
1337 char *name = NULL;
1338 size_t name_len;
1339
1340 ZEND_PARSE_PARAMETERS_START(0, 1)
1341 Z_PARAM_OPTIONAL
1342 Z_PARAM_STRING_OR_NULL(name, name_len)
1343 ZEND_PARSE_PARAMETERS_END();
1344
1345 if (name == NULL) {
1346 ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1347 RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1348 } else {
1349 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name);
1350 if (!encoding) {
1351 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1352 RETURN_THROWS();
1353 } else {
1354 MBSTRG(http_output_set) = 1;
1355 MBSTRG(current_http_output_encoding) = encoding;
1356 /* TODO Return previous encoding? */
1357 RETURN_TRUE;
1358 }
1359 }
1360 }
1361 /* }}} */
1362
/* {{{ Sets the current detect_order or returns the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1364 PHP_FUNCTION(mb_detect_order)
1365 {
1366 zend_string *order_str = NULL;
1367 HashTable *order_ht = NULL;
1368
1369 ZEND_PARSE_PARAMETERS_START(0, 1)
1370 Z_PARAM_OPTIONAL
1371 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1372 ZEND_PARSE_PARAMETERS_END();
1373
1374 if (!order_str && !order_ht) {
1375 size_t n = MBSTRG(current_detect_order_list_size);
1376 const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1377 array_init(return_value);
1378 for (size_t i = 0; i < n; i++) {
1379 add_next_index_string(return_value, (*entry)->name);
1380 entry++;
1381 }
1382 } else {
1383 const mbfl_encoding **list;
1384 size_t size;
1385 if (order_ht) {
1386 if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1387 RETURN_THROWS();
1388 }
1389 } else {
1390 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1, /* allow_pass_encoding */ 0)) {
1391 RETURN_THROWS();
1392 }
1393 }
1394
1395 if (size == 0) {
1396 efree(ZEND_VOIDP(list));
1397 zend_argument_value_error(1, "must specify at least one encoding");
1398 RETURN_THROWS();
1399 }
1400
1401 if (MBSTRG(current_detect_order_list)) {
1402 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1403 }
1404 MBSTRG(current_detect_order_list) = list;
1405 MBSTRG(current_detect_order_list_size) = size;
1406 RETURN_TRUE;
1407 }
1408 }
1409 /* }}} */
1410
/* Returns 1 if cp may be used as a substitute character, 0 otherwise.
 *
 * A code point is rejected when it lies outside the Unicode range
 * (negative or >= 0x110000) or is a surrogate (0xd800..0xdfff); surrogates
 * are never valid on their own and only a single substitute character is
 * allowed. Whether the code point is actually mapped in the eventual
 * target encoding cannot be known here, so everything else is accepted. */
static inline int php_mb_check_code_point(zend_long cp)
{
	const int in_unicode_range = (cp >= 0 && cp < 0x110000);
	const int is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);

	return in_unicode_range && !is_surrogate;
}
1429
1430 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1431 PHP_FUNCTION(mb_substitute_character)
1432 {
1433 zend_string *substitute_character = NULL;
1434 zend_long substitute_codepoint;
1435 bool substitute_is_null = 1;
1436
1437 ZEND_PARSE_PARAMETERS_START(0, 1)
1438 Z_PARAM_OPTIONAL
1439 Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1440 ZEND_PARSE_PARAMETERS_END();
1441
1442 if (substitute_is_null) {
1443 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1444 RETURN_STRING("none");
1445 }
1446 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1447 RETURN_STRING("long");
1448 }
1449 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1450 RETURN_STRING("entity");
1451 }
1452 RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1453 }
1454
1455 if (substitute_character != NULL) {
1456 if (zend_string_equals_literal_ci(substitute_character, "none")) {
1457 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1458 RETURN_TRUE;
1459 }
1460 if (zend_string_equals_literal_ci(substitute_character, "long")) {
1461 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1462 RETURN_TRUE;
1463 }
1464 if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1465 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1466 RETURN_TRUE;
1467 }
1468 /* Invalid string value */
1469 zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1470 RETURN_THROWS();
1471 }
1472 /* Integer codepoint passed */
1473 if (!php_mb_check_code_point(substitute_codepoint)) {
1474 zend_argument_value_error(1, "is not a valid codepoint");
1475 RETURN_THROWS();
1476 }
1477
1478 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1479 MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1480 RETURN_TRUE;
1481 }
1482 /* }}} */
1483
1484 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1485 PHP_FUNCTION(mb_preferred_mime_name)
1486 {
1487 enum mbfl_no_encoding no_encoding;
1488 char *name = NULL;
1489 size_t name_len;
1490
1491 ZEND_PARSE_PARAMETERS_START(1, 1)
1492 Z_PARAM_STRING(name, name_len)
1493 ZEND_PARSE_PARAMETERS_END();
1494
1495 no_encoding = mbfl_name2no_encoding(name);
1496 if (no_encoding == mbfl_no_encoding_invalid) {
1497 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1498 RETURN_THROWS();
1499 }
1500
1501 const char *preferred_name = mbfl_no2preferred_mime_name(no_encoding);
1502 if (preferred_name == NULL || *preferred_name == '\0') {
1503 php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1504 RETVAL_FALSE;
1505 } else {
1506 RETVAL_STRING((char *)preferred_name);
1507 }
1508 }
1509 /* }}} */
1510
1511 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1512 PHP_FUNCTION(mb_parse_str)
1513 {
1514 zval *track_vars_array = NULL;
1515 char *encstr;
1516 size_t encstr_len;
1517 php_mb_encoding_handler_info_t info;
1518 const mbfl_encoding *detected;
1519
1520 ZEND_PARSE_PARAMETERS_START(2, 2)
1521 Z_PARAM_STRING(encstr, encstr_len)
1522 Z_PARAM_ZVAL(track_vars_array)
1523 ZEND_PARSE_PARAMETERS_END();
1524
1525 track_vars_array = zend_try_array_init(track_vars_array);
1526 if (!track_vars_array) {
1527 RETURN_THROWS();
1528 }
1529
1530 encstr = estrndup(encstr, encstr_len);
1531
1532 info.data_type = PARSE_STRING;
1533 info.separator = PG(arg_separator).input;
1534 info.report_errors = true;
1535 info.to_encoding = MBSTRG(current_internal_encoding);
1536 info.from_encodings = MBSTRG(http_input_list);
1537 info.num_from_encodings = MBSTRG(http_input_list_size);
1538
1539 detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1540
1541 MBSTRG(http_input_identify) = detected;
1542
1543 RETVAL_BOOL(detected);
1544
1545 if (encstr != NULL) efree(encstr);
1546 }
1547 /* }}} */
1548
1549 /* {{{ Returns string in output buffer converted to the http_output encoding */
PHP_FUNCTION(mb_output_handler)1550 PHP_FUNCTION(mb_output_handler)
1551 {
1552 char *arg_string;
1553 size_t arg_string_len;
1554 zend_long arg_status;
1555 mbfl_string string, result;
1556 const char *charset;
1557 char *p;
1558 const mbfl_encoding *encoding;
1559 int last_feed;
1560 size_t len;
1561 unsigned char send_text_mimetype = 0;
1562 char *s, *mimetype = NULL;
1563
1564 ZEND_PARSE_PARAMETERS_START(2, 2)
1565 Z_PARAM_STRING(arg_string, arg_string_len)
1566 Z_PARAM_LONG(arg_status)
1567 ZEND_PARSE_PARAMETERS_END();
1568
1569 encoding = MBSTRG(current_http_output_encoding);
1570
1571 /* start phase only */
1572 if ((arg_status & PHP_OUTPUT_HANDLER_START) != 0) {
1573 /* delete the converter just in case. */
1574 if (MBSTRG(outconv)) {
1575 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1576 mbfl_buffer_converter_delete(MBSTRG(outconv));
1577 MBSTRG(outconv) = NULL;
1578 }
1579
1580 if (encoding == &mbfl_encoding_pass) {
1581 RETURN_STRINGL(arg_string, arg_string_len);
1582 }
1583
1584 /* analyze mime type */
1585 if (SG(sapi_headers).mimetype &&
1586 _php_mb_match_regex(
1587 MBSTRG(http_output_conv_mimetypes),
1588 SG(sapi_headers).mimetype,
1589 strlen(SG(sapi_headers).mimetype))) {
1590 if ((s = strchr(SG(sapi_headers).mimetype,';')) == NULL) {
1591 mimetype = estrdup(SG(sapi_headers).mimetype);
1592 } else {
1593 mimetype = estrndup(SG(sapi_headers).mimetype,s-SG(sapi_headers).mimetype);
1594 }
1595 send_text_mimetype = 1;
1596 } else if (SG(sapi_headers).send_default_content_type) {
1597 mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1598 }
1599
1600 /* if content-type is not yet set, set it and activate the converter */
1601 if (SG(sapi_headers).send_default_content_type || send_text_mimetype) {
1602 charset = encoding->mime_name;
1603 if (charset) {
1604 len = spprintf( &p, 0, "Content-Type: %s; charset=%s", mimetype, charset );
1605 if (sapi_add_header(p, len, 0) != FAILURE) {
1606 SG(sapi_headers).send_default_content_type = 0;
1607 }
1608 }
1609 /* activate the converter */
1610 MBSTRG(outconv) = mbfl_buffer_converter_new(MBSTRG(current_internal_encoding), encoding, 0);
1611 if (send_text_mimetype){
1612 efree(mimetype);
1613 }
1614 }
1615 }
1616
1617 /* just return if the converter is not activated. */
1618 if (MBSTRG(outconv) == NULL) {
1619 RETURN_STRINGL(arg_string, arg_string_len);
1620 }
1621
1622 /* flag */
1623 last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1624 /* mode */
1625 mbfl_buffer_converter_illegal_mode(MBSTRG(outconv), MBSTRG(current_filter_illegal_mode));
1626 mbfl_buffer_converter_illegal_substchar(MBSTRG(outconv), MBSTRG(current_filter_illegal_substchar));
1627
1628 /* feed the string */
1629 mbfl_string_init(&string);
1630 /* these are not needed. convd has encoding info.
1631 string.encoding = MBSTRG(current_internal_encoding);
1632 */
1633 string.val = (unsigned char *)arg_string;
1634 string.len = arg_string_len;
1635
1636 mbfl_buffer_converter_feed(MBSTRG(outconv), &string);
1637 if (last_feed) {
1638 mbfl_buffer_converter_flush(MBSTRG(outconv));
1639 }
1640 /* get the converter output, and return it */
1641 mbfl_buffer_converter_result(MBSTRG(outconv), &result);
1642
1643 // TODO: avoid reallocation ???
1644 RETVAL_STRINGL((char *)result.val, result.len); /* the string is already strdup()'ed */
1645 efree(result.val);
1646
1647 /* delete the converter if it is the last feed. */
1648 if (last_feed) {
1649 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1650 mbfl_buffer_converter_delete(MBSTRG(outconv));
1651 MBSTRG(outconv) = NULL;
1652 }
1653 }
1654 /* }}} */
1655
1656 /* {{{ Convert a multibyte string to an array. If split_length is specified,
1657 break the string down into chunks each split_length characters long. */
1658
/* State shared between mb_str_split() and its per-character callback
 * mbfl_split_output(): where finished chunks go, how long a chunk has to
 * be, how many characters the chunk in progress already holds, and the
 * filter that re-encodes the characters of the chunk. */
struct mbfl_split_params {
	zval *return_value; /* PHP return value; the array that chunks are appended to */
	mbfl_string *result_string; /* scratch string that receives each finished chunk */
	size_t mb_chunk_length; /* number of characters in the chunk being built */
	size_t split_length; /* requested chunk length in characters */
	mbfl_convert_filter *next_filter; /* wchar -> target encoding converter */
};
1667
1668 /* callback function to fill split array */
mbfl_split_output(int c,void * data)1669 static int mbfl_split_output(int c, void *data)
1670 {
1671 struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */
1672
1673 (*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */
1674
1675 if (params->split_length == ++params->mb_chunk_length) { /* if current chunk size reached defined chunk size or last char reached */
1676 mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */
1677 mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */
1678 mbfl_string *chunk = params->result_string;
1679 mbfl_memory_device_result(device, chunk); /* make chunk */
1680 add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */
1681 efree(chunk->val);
1682 params->mb_chunk_length = 0; /* reset mb_chunk size */
1683 }
1684
1685 return 0;
1686 }
1687
PHP_FUNCTION(mb_str_split)1688 PHP_FUNCTION(mb_str_split)
1689 {
1690 zend_string *str, *encoding = NULL;
1691 size_t mb_len, chunks, chunk_len;
1692 const char *p, *last; /* pointer for the string cursor and last string char */
1693 mbfl_string string, result_string;
1694 const mbfl_encoding *mbfl_encoding;
1695 zend_long split_length = 1;
1696
1697 ZEND_PARSE_PARAMETERS_START(1, 3)
1698 Z_PARAM_STR(str)
1699 Z_PARAM_OPTIONAL
1700 Z_PARAM_LONG(split_length)
1701 Z_PARAM_STR_OR_NULL(encoding)
1702 ZEND_PARSE_PARAMETERS_END();
1703
1704 if (split_length <= 0) {
1705 zend_argument_value_error(2, "must be greater than 0");
1706 RETURN_THROWS();
1707 }
1708
1709 /* fill mbfl_string structure */
1710 string.val = (unsigned char *) ZSTR_VAL(str);
1711 string.len = ZSTR_LEN(str);
1712 string.encoding = php_mb_get_encoding(encoding, 3);
1713 if (!string.encoding) {
1714 RETURN_THROWS();
1715 }
1716
1717 if (ZSTR_LEN(str) == 0) {
1718 RETURN_EMPTY_ARRAY();
1719 }
1720
1721 p = ZSTR_VAL(str); /* string cursor pointer */
1722 last = ZSTR_VAL(str) + ZSTR_LEN(str); /* last string char pointer */
1723
1724 mbfl_encoding = string.encoding;
1725
1726 /* first scenario: 1,2,4-bytes fixed width encodings (head part) */
1727 if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
1728 mb_len = string.len;
1729 chunk_len = (size_t)split_length; /* chunk length in bytes */
1730 } else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS2) { /* 2 bytes */
1731 mb_len = string.len / 2;
1732 chunk_len = split_length * 2;
1733 } else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS4) { /* 4 bytes */
1734 mb_len = string.len / 4;
1735 chunk_len = split_length * 4;
1736 } else if (mbfl_encoding->mblen_table != NULL) {
1737 /* second scenario: variable width encodings with length table */
1738 char unsigned const *mbtab = mbfl_encoding->mblen_table;
1739
1740 /* assume that we have 1-bytes characters */
1741 array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1742
1743 while (p < last) { /* split cycle work until the cursor has reached the last byte */
1744 char const *chunk_p = p; /* chunk first byte pointer */
1745 chunk_len = 0; /* chunk length in bytes */
1746 zend_long char_count;
1747
1748 for (char_count = 0; char_count < split_length && p < last; ++char_count) {
1749 char unsigned const m = mbtab[*(const unsigned char *)p]; /* single character length table */
1750 chunk_len += m;
1751 p += m;
1752 }
1753 if (p >= last) chunk_len -= p - last; /* check if chunk is in bounds */
1754 add_next_index_stringl(return_value, chunk_p, chunk_len);
1755 }
1756 return;
1757 } else {
1758 /* third scenario: other multibyte encodings */
1759 mbfl_convert_filter *filter, *decoder;
1760
1761 /* assume that we have 1-bytes characters */
1762 array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1763
1764 /* decoder filter to decode wchar to encoding */
1765 mbfl_memory_device device;
1766 mbfl_memory_device_init(&device, split_length + 1, 0);
1767
1768 decoder = mbfl_convert_filter_new(
1769 &mbfl_encoding_wchar,
1770 string.encoding,
1771 mbfl_memory_device_output,
1772 NULL,
1773 &device);
1774 /* assert that nothing is wrong with the decoder */
1775 ZEND_ASSERT(decoder != NULL);
1776
1777 /* wchar filter */
1778 mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */
1779 struct mbfl_split_params params = { /* init callback function params structure */
1780 .return_value = return_value,
1781 .result_string = &result_string,
1782 .mb_chunk_length = 0,
1783 .split_length = (size_t)split_length,
1784 .next_filter = decoder,
1785 };
1786
1787 filter = mbfl_convert_filter_new(
1788 string.encoding,
1789 &mbfl_encoding_wchar,
1790 mbfl_split_output,
1791 NULL,
1792 ¶ms);
1793 /* assert that nothing is wrong with the filter */
1794 ZEND_ASSERT(filter != NULL);
1795
1796 while (p < last - 1) { /* cycle each byte except last with callback function */
1797 (*filter->filter_function)(*p++, filter);
1798 }
1799 params.mb_chunk_length = split_length - 1; /* force to finish current chunk */
1800 (*filter->filter_function)(*p++, filter); /* process last char */
1801
1802 mbfl_convert_filter_delete(decoder);
1803 mbfl_convert_filter_delete(filter);
1804 mbfl_memory_device_clear(&device);
1805 return;
1806 }
1807
1808 /* first scenario: 1,2,4-bytes fixed width encodings (tail part) */
1809 chunks = (mb_len + split_length - 1) / split_length; /* (round up idiom) */
1810 array_init_size(return_value, chunks);
1811 if (chunks != 0) {
1812 zend_long i;
1813
1814 for (i = 0; i < chunks - 1; p += chunk_len, ++i) {
1815 add_next_index_stringl(return_value, p, chunk_len);
1816 }
1817 add_next_index_stringl(return_value, p, last - p);
1818 }
1819 }
1820 /* }}} */
1821
/* Count the characters of `string` when its bytes are interpreted in `encoding`.
 *
 * Fixed-width encodings (1, 2 or 4 bytes per character) are answered by dividing
 * the byte length. Encodings which provide a lead-byte length table are walked
 * one character at a time. Everything else is decoded in batches of up to 128
 * wchars, and the values returned by the encoding's `to_wchar` are summed. */
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
{
	const size_t byte_len = ZSTR_LEN(string);

	if (encoding->flag & MBFL_ENCTYPE_SBCS) {
		return byte_len;
	}
	if (encoding->flag & MBFL_ENCTYPE_WCS2) {
		return byte_len / 2;
	}
	if (encoding->flag & MBFL_ENCTYPE_WCS4) {
		return byte_len / 4;
	}

	size_t count = 0;

	if (encoding->mblen_table) {
		/* The table maps a leading byte to the byte length of its character */
		const unsigned char *width_of = encoding->mblen_table;
		unsigned char *cur = (unsigned char*)ZSTR_VAL(string);
		unsigned char *end = cur + byte_len;

		for (; cur < end; cur += width_of[*cur]) {
			count++;
		}
		return count;
	}

	uint32_t wchar_buf[128];
	unsigned char *in = (unsigned char*)ZSTR_VAL(string);
	size_t remaining = byte_len;
	unsigned int state = 0;

	/* `to_wchar` advances `in` and decrements `remaining` as it consumes input */
	while (remaining) {
		count += encoding->to_wchar(&in, &remaining, wchar_buf, 128, &state);
	}

	return count;
}
1852
1853 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1854 PHP_FUNCTION(mb_strlen)
1855 {
1856 zend_string *string, *enc_name = NULL;
1857
1858 ZEND_PARSE_PARAMETERS_START(1, 2)
1859 Z_PARAM_STR(string)
1860 Z_PARAM_OPTIONAL
1861 Z_PARAM_STR_OR_NULL(enc_name)
1862 ZEND_PARSE_PARAMETERS_END();
1863
1864 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1865 if (!enc) {
1866 RETURN_THROWS();
1867 }
1868
1869 RETVAL_LONG(mb_get_strlen(string, enc));
1870 }
1871 /* }}} */
1872
handle_strpos_error(size_t error)1873 static void handle_strpos_error(size_t error) {
1874 switch (error) {
1875 case MBFL_ERROR_NOT_FOUND:
1876 break;
1877 case MBFL_ERROR_ENCODING:
1878 php_error_docref(NULL, E_WARNING, "Conversion error");
1879 break;
1880 case MBFL_ERROR_OFFSET:
1881 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1882 break;
1883 default:
1884 zend_value_error("mb_strpos(): Unknown error");
1885 break;
1886 }
1887 }
1888
1889 /* {{{ Find position of first occurrence of a string within another */
PHP_FUNCTION(mb_strpos)1890 PHP_FUNCTION(mb_strpos)
1891 {
1892 int reverse = 0;
1893 zend_long offset = 0;
1894 char *haystack_val, *needle_val;
1895 mbfl_string haystack, needle;
1896 zend_string *enc_name = NULL;
1897
1898 ZEND_PARSE_PARAMETERS_START(2, 4)
1899 Z_PARAM_STRING(haystack_val, haystack.len)
1900 Z_PARAM_STRING(needle_val, needle.len)
1901 Z_PARAM_OPTIONAL
1902 Z_PARAM_LONG(offset)
1903 Z_PARAM_STR_OR_NULL(enc_name)
1904 ZEND_PARSE_PARAMETERS_END();
1905
1906 haystack.val = (unsigned char*)haystack_val;
1907 needle.val = (unsigned char*)needle_val;
1908
1909 haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 4);
1910 if (!haystack.encoding) {
1911 RETURN_THROWS();
1912 }
1913
1914 size_t n = mbfl_strpos(&haystack, &needle, offset, reverse);
1915 if (!mbfl_is_error(n)) {
1916 RETVAL_LONG(n);
1917 } else {
1918 handle_strpos_error(n);
1919 RETVAL_FALSE;
1920 }
1921 }
1922 /* }}} */
1923
1924 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)1925 PHP_FUNCTION(mb_strrpos)
1926 {
1927 mbfl_string haystack, needle;
1928 char *haystack_val, *needle_val;
1929 zend_string *enc_name = NULL;
1930 zend_long offset = 0;
1931
1932 ZEND_PARSE_PARAMETERS_START(2, 4)
1933 Z_PARAM_STRING(haystack_val, haystack.len)
1934 Z_PARAM_STRING(needle_val, needle.len)
1935 Z_PARAM_OPTIONAL
1936 Z_PARAM_LONG(offset)
1937 Z_PARAM_STR_OR_NULL(enc_name)
1938 ZEND_PARSE_PARAMETERS_END();
1939
1940 haystack.val = (unsigned char*)haystack_val;
1941 needle.val = (unsigned char*)needle_val;
1942
1943 haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 4);
1944 if (!haystack.encoding) {
1945 RETURN_THROWS();
1946 }
1947
1948 size_t n = mbfl_strpos(&haystack, &needle, offset, 1);
1949 if (!mbfl_is_error(n)) {
1950 RETVAL_LONG(n);
1951 } else {
1952 handle_strpos_error(n);
1953 RETVAL_FALSE;
1954 }
1955 }
1956 /* }}} */
1957
1958 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)1959 PHP_FUNCTION(mb_stripos)
1960 {
1961 zend_long offset = 0;
1962 mbfl_string haystack, needle;
1963 char *haystack_val, *needle_val;
1964 zend_string *from_encoding = NULL;
1965
1966 ZEND_PARSE_PARAMETERS_START(2, 4)
1967 Z_PARAM_STRING(haystack_val, haystack.len)
1968 Z_PARAM_STRING(needle_val, needle.len)
1969 Z_PARAM_OPTIONAL
1970 Z_PARAM_LONG(offset)
1971 Z_PARAM_STR_OR_NULL(from_encoding)
1972 ZEND_PARSE_PARAMETERS_END();
1973
1974 haystack.val = (unsigned char*)haystack_val;
1975 needle.val = (unsigned char*)needle_val;
1976
1977 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
1978 if (!enc) {
1979 RETURN_THROWS();
1980 }
1981
1982 size_t n = php_mb_stripos(0, (char *)haystack.val, haystack.len, (char *)needle.val, needle.len, offset, enc);
1983
1984 if (!mbfl_is_error(n)) {
1985 RETVAL_LONG(n);
1986 } else {
1987 handle_strpos_error(n);
1988 RETVAL_FALSE;
1989 }
1990 }
1991 /* }}} */
1992
1993 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)1994 PHP_FUNCTION(mb_strripos)
1995 {
1996 zend_long offset = 0;
1997 mbfl_string haystack, needle;
1998 char *haystack_val, *needle_val;
1999 zend_string *from_encoding = NULL;
2000
2001 ZEND_PARSE_PARAMETERS_START(2, 4)
2002 Z_PARAM_STRING(haystack_val, haystack.len)
2003 Z_PARAM_STRING(needle_val, needle.len)
2004 Z_PARAM_OPTIONAL
2005 Z_PARAM_LONG(offset)
2006 Z_PARAM_STR_OR_NULL(from_encoding)
2007 ZEND_PARSE_PARAMETERS_END();
2008
2009 haystack.val = (unsigned char*)haystack_val;
2010 needle.val = (unsigned char*)needle_val;
2011
2012 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2013 if (!enc) {
2014 RETURN_THROWS();
2015 }
2016
2017 size_t n = php_mb_stripos(1, (char *)haystack.val, haystack.len, (char *)needle.val, needle.len, offset, enc);
2018
2019 if (!mbfl_is_error(n)) {
2020 RETVAL_LONG(n);
2021 } else {
2022 handle_strpos_error(n);
2023 RETVAL_FALSE;
2024 }
2025 }
2026 /* }}} */
2027
2028 #define MB_STRSTR 1
2029 #define MB_STRRCHR 2
2030 #define MB_STRISTR 3
2031 #define MB_STRRICHR 4
2032 /* {{{ php_mb_strstr_variants */
/* Shared implementation of mb_strstr, mb_strrchr, mb_stristr and mb_strrichr.
 *
 * `variant` (one of the MB_STR* constants) selects the search direction and
 * whether the search is case-insensitive. Once the needle has been located at
 * character position `n`, the function returns either the part of the haystack
 * before `n` (when `part` is true) or the part starting at `n`. If the search
 * reports any error, including "not found", the result is false. */
static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
{
	mbfl_string haystack, needle, result;
	char *haystack_val, *needle_val;
	zend_string *encoding_name = NULL;
	bool part = 0;

	ZEND_PARSE_PARAMETERS_START(2, 4)
		Z_PARAM_STRING(haystack_val, haystack.len)
		Z_PARAM_STRING(needle_val, needle.len)
		Z_PARAM_OPTIONAL
		Z_PARAM_BOOL(part)
		Z_PARAM_STR_OR_NULL(encoding_name)
	ZEND_PARSE_PARAMETERS_END();

	haystack.val = (unsigned char*)haystack_val;
	needle.val = (unsigned char*)needle_val;
	haystack.encoding = needle.encoding = php_mb_get_encoding(encoding_name, 4);
	if (!haystack.encoding) {
		RETURN_THROWS();
	}

	const int reverse_mode = (variant == MB_STRRCHR || variant == MB_STRRICHR) ? 1 : 0;
	const bool case_insensitive = (variant == MB_STRISTR || variant == MB_STRRICHR);

	size_t n;
	if (case_insensitive) {
		n = php_mb_stripos(reverse_mode, (char *)haystack.val, haystack.len, (char *)needle.val,
			needle.len, 0, needle.encoding);
	} else {
		n = mbfl_strpos(&haystack, &needle, 0, reverse_mode);
	}

	if (mbfl_is_error(n)) {
		// FIXME use handle_strpos_error(n)
		RETURN_FALSE;
	}

	/* `part` picks the head [0, n); otherwise take the tail starting at n */
	mbfl_string *ret = part
		? mbfl_substr(&haystack, &result, 0, n)
		: mbfl_substr(&haystack, &result, n, MBFL_SUBSTR_UNTIL_END);
	ZEND_ASSERT(ret != NULL);

	// TODO: avoid reallocation ???
	RETVAL_STRINGL((char *)ret->val, ret->len);
	efree(ret->val);
}
2085
2086 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(mb_strstr)
{
	/* All argument parsing and searching is done by the shared worker; this
	 * entry point only selects the MB_STRSTR variant */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
}
2091 /* }}} */
2092
2093 /* {{{ Finds the last occurrence of a character in a string within another */
PHP_FUNCTION(mb_strrchr)
{
	/* All argument parsing and searching is done by the shared worker; this
	 * entry point only selects the MB_STRRCHR variant */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
}
2098 /* }}} */
2099
2100 /* {{{ Finds first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stristr)
{
	/* All argument parsing and searching is done by the shared worker; this
	 * entry point only selects the MB_STRISTR variant */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
}
2105 /* }}} */
2106
2107 /* {{{ Finds the last occurrence of a character in a string within another, case insensitive */
PHP_FUNCTION(mb_strrichr)
{
	/* All argument parsing and searching is done by the shared worker; this
	 * entry point only selects the MB_STRRICHR variant */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
}
2112 /* }}} */
2113
2114 #undef MB_STRSTR
2115 #undef MB_STRRCHR
2116 #undef MB_STRISTR
2117 #undef MB_STRRICHR
2118
2119 /* {{{ Count the number of substring occurrences */
PHP_FUNCTION(mb_substr_count)2120 PHP_FUNCTION(mb_substr_count)
2121 {
2122 mbfl_string haystack, needle;
2123 char *haystack_val, *needle_val;
2124 zend_string *enc_name = NULL;
2125
2126 ZEND_PARSE_PARAMETERS_START(2, 3)
2127 Z_PARAM_STRING(haystack_val, haystack.len)
2128 Z_PARAM_STRING(needle_val, needle.len)
2129 Z_PARAM_OPTIONAL
2130 Z_PARAM_STR_OR_NULL(enc_name)
2131 ZEND_PARSE_PARAMETERS_END();
2132
2133 haystack.val = (unsigned char*)haystack_val;
2134 needle.val = (unsigned char*)needle_val;
2135
2136 if (needle.len == 0) {
2137 zend_argument_value_error(2, "must not be empty");
2138 RETURN_THROWS();
2139 }
2140
2141 haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 3);
2142 if (!haystack.encoding) {
2143 RETURN_THROWS();
2144 }
2145
2146 size_t n = mbfl_substr_count(&haystack, &needle);
2147 /* An error can only occur if needle is empty,
2148 * an encoding error happens (which should not happen at this stage and is a bug)
2149 * or the haystack is more than sizeof(size_t) bytes
2150 * If one of these things occur this is a bug and should be flagged as such */
2151 ZEND_ASSERT(!mbfl_is_error(n));
2152 RETVAL_LONG(n);
2153 }
2154 /* }}} */
2155
2156 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2157 PHP_FUNCTION(mb_substr)
2158 {
2159 char *str;
2160 zend_string *encoding = NULL;
2161 zend_long from, len;
2162 size_t real_from, real_len;
2163 size_t str_len;
2164 bool len_is_null = 1;
2165 mbfl_string string, result, *ret;
2166
2167 ZEND_PARSE_PARAMETERS_START(2, 4)
2168 Z_PARAM_STRING(str, str_len)
2169 Z_PARAM_LONG(from)
2170 Z_PARAM_OPTIONAL
2171 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2172 Z_PARAM_STR_OR_NULL(encoding)
2173 ZEND_PARSE_PARAMETERS_END();
2174
2175 string.encoding = php_mb_get_encoding(encoding, 4);
2176 if (!string.encoding) {
2177 RETURN_THROWS();
2178 }
2179
2180 string.val = (unsigned char *)str;
2181 string.len = str_len;
2182
2183 /* measures length */
2184 size_t mblen = 0;
2185 if (from < 0 || (!len_is_null && len < 0)) {
2186 mblen = mbfl_strlen(&string);
2187 }
2188
2189 /* if "from" position is negative, count start position from the end
2190 * of the string
2191 */
2192 if (from >= 0) {
2193 real_from = (size_t) from;
2194 } else if (-from < mblen) {
2195 real_from = mblen + from;
2196 } else {
2197 real_from = 0;
2198 }
2199
2200 /* if "length" position is negative, set it to the length
2201 * needed to stop that many chars from the end of the string
2202 */
2203 if (len_is_null) {
2204 real_len = MBFL_SUBSTR_UNTIL_END;
2205 } else if (len >= 0) {
2206 real_len = (size_t) len;
2207 } else if (real_from < mblen && -len < mblen - real_from) {
2208 real_len = (mblen - real_from) + len;
2209 } else {
2210 real_len = 0;
2211 }
2212
2213 ret = mbfl_substr(&string, &result, real_from, real_len);
2214 ZEND_ASSERT(ret != NULL);
2215
2216 // TODO: avoid reallocation ???
2217 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2218 efree(ret->val);
2219 }
2220 /* }}} */
2221
2222 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2223 PHP_FUNCTION(mb_strcut)
2224 {
2225 zend_string *encoding = NULL;
2226 char *string_val;
2227 zend_long from, len;
2228 bool len_is_null = 1;
2229 mbfl_string string, result, *ret;
2230
2231 ZEND_PARSE_PARAMETERS_START(2, 4)
2232 Z_PARAM_STRING(string_val, string.len)
2233 Z_PARAM_LONG(from)
2234 Z_PARAM_OPTIONAL
2235 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2236 Z_PARAM_STR_OR_NULL(encoding)
2237 ZEND_PARSE_PARAMETERS_END();
2238
2239 string.val = (unsigned char*)string_val;
2240 string.encoding = php_mb_get_encoding(encoding, 4);
2241 if (!string.encoding) {
2242 RETURN_THROWS();
2243 }
2244
2245 if (len_is_null) {
2246 len = string.len;
2247 }
2248
2249 /* if "from" position is negative, count start position from the end
2250 * of the string
2251 */
2252 if (from < 0) {
2253 from = string.len + from;
2254 if (from < 0) {
2255 from = 0;
2256 }
2257 }
2258
2259 /* if "length" position is negative, set it to the length
2260 * needed to stop that many chars from the end of the string
2261 */
2262 if (len < 0) {
2263 len = (string.len - from) + len;
2264 if (len < 0) {
2265 len = 0;
2266 }
2267 }
2268
2269 if (from > string.len) {
2270 RETURN_EMPTY_STRING();
2271 }
2272
2273 ret = mbfl_strcut(&string, &result, from, len);
2274 ZEND_ASSERT(ret != NULL);
2275
2276 // TODO: avoid reallocation ???
2277 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2278 efree(ret->val);
2279 }
2280 /* }}} */
2281
2282 /* Some East Asian characters, when printed at a terminal (or the like), require double
2283 * the usual amount of horizontal space. We call these "fullwidth" characters. */
character_width(uint32_t c)2284 static size_t character_width(uint32_t c)
2285 {
2286 if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
2287 return 1;
2288 }
2289
2290 /* Do a binary search to see if we fall in any of the fullwidth ranges */
2291 int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2292 while (lo < hi) {
2293 int probe = (lo + hi) / 2;
2294 if (c < mbfl_eaw_table[probe].begin) {
2295 hi = probe;
2296 } else if (c > mbfl_eaw_table[probe].end) {
2297 lo = probe + 1;
2298 } else {
2299 return 2;
2300 }
2301 }
2302
2303 return 1;
2304 }
2305
/* Sum the display widths (as given by character_width) of every codepoint
 * obtained by decoding `string` from encoding `enc`. */
static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
{
	uint32_t wchar_buf[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(string);
	size_t bytes_left = ZSTR_LEN(string);
	unsigned int state = 0;
	size_t total = 0;

	while (bytes_left) {
		size_t decoded = enc->to_wchar(&cursor, &bytes_left, wchar_buf, 128, &state);
		ZEND_ASSERT(decoded <= 128);

		/* NOTE: the 'bad input' marker is counted as 1 unit of width.
		 * If the text is later converted with an ordinary ASCII character as the
		 * 'replacement character', this gives the correct display width. */
		for (size_t i = 0; i < decoded; i++) {
			total += character_width(wchar_buf[i]);
		}
	}

	return total;
}
2328
2329 /* Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2330 PHP_FUNCTION(mb_strwidth)
2331 {
2332 zend_string *string, *enc_name = NULL;
2333
2334 ZEND_PARSE_PARAMETERS_START(1, 2)
2335 Z_PARAM_STR(string)
2336 Z_PARAM_OPTIONAL
2337 Z_PARAM_STR_OR_NULL(enc_name)
2338 ZEND_PARSE_PARAMETERS_END();
2339
2340 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2341 if (!enc) {
2342 RETURN_THROWS();
2343 }
2344
2345 RETVAL_LONG(mb_get_strwidth(string, enc));
2346 }
2347
2348 /* Cut 'n' codepoints from beginning of string
2349 * Remove this once mb_substr is implemented using the new conversion filters */
/* Build a new string holding `input` minus its first `n` codepoints.
 * The input is decoded in batches of up to 128 wchars; whole batches are
 * discarded while codepoints remain to be dropped, and everything after that
 * is re-encoded into the output buffer. */
static zend_string* mb_drop_chars(zend_string *input, const mbfl_encoding *enc, size_t n)
{
	const size_t input_len = ZSTR_LEN(input);

	if (input_len <= n) {
		/* No supported text encoding decodes to more than one codepoint per byte,
		 * so dropping at least as many codepoints as there are input bytes must
		 * leave nothing. This also keeps `input_len - n` below from underflowing. */
		return zend_empty_string;
	}

	uint32_t wchar_buf[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(input);
	size_t bytes_left = input_len;
	unsigned int state = 0;

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, input_len - n, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (bytes_left) {
		size_t decoded = enc->to_wchar(&cursor, &bytes_left, wchar_buf, 128, &state);
		ZEND_ASSERT(decoded <= 128);

		if (decoded <= n) {
			/* The whole batch falls inside the dropped prefix */
			n -= decoded;
			continue;
		}

		/* Emit what follows the dropped prefix; the last argument flags the final batch */
		enc->from_wchar(wchar_buf + n, decoded - n, &buf, !bytes_left);
		n = 0;
	}

	return mb_convert_buf_result(&buf);
}
2382
2383 /* Pick 'n' codepoints from beginning of string
2384 * Remove this once mb_substr is implemented using the new conversion filters */
/* Build a new string holding (at most) the first `n` codepoints of `input`.
 * Decoding stops as soon as `n` codepoints have been re-encoded or the input
 * runs out, whichever comes first. */
static zend_string* mb_pick_chars(zend_string *input, const mbfl_encoding *enc, size_t n)
{
	uint32_t wchar_buf[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(input);
	size_t bytes_left = ZSTR_LEN(input);
	unsigned int state = 0;

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, n, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (bytes_left && n) {
		size_t decoded = enc->to_wchar(&cursor, &bytes_left, wchar_buf, 128, &state);
		ZEND_ASSERT(decoded <= 128);

		size_t take = MIN(decoded, n);
		/* This is the final batch if the input is exhausted or the quota is met */
		bool is_last = !bytes_left || decoded >= n;

		enc->from_wchar(wchar_buf, take, &buf, is_last);
		n -= take;
	}

	return mb_convert_buf_result(&buf);
}
2405
mb_trim_string(zend_string * input,zend_string * marker,const mbfl_encoding * enc,unsigned int from,int width)2406 static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, unsigned int from, int width)
2407 {
2408 uint32_t wchar_buf[128];
2409 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
2410 size_t in_len = ZSTR_LEN(input);
2411 unsigned int state = 0;
2412 int remaining_width = width;
2413 unsigned int to_skip = from;
2414 size_t out_len = 0;
2415 bool first_call = true, input_err = false;
2416 mb_convert_buf buf;
2417
2418 while (in_len) {
2419 out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2420 ZEND_ASSERT(out_len <= 128);
2421
2422 if (out_len <= to_skip) {
2423 to_skip -= out_len;
2424 } else {
2425 for (int i = to_skip; i < out_len; i++) {
2426 uint32_t w = wchar_buf[i];
2427 input_err |= (w == MBFL_BAD_INPUT);
2428 remaining_width -= character_width(w);
2429 if (remaining_width < 0) {
2430 /* We need to truncate string and append trim marker */
2431 width -= mb_get_strwidth(marker, enc);
2432 /* 'width' is now the amount we want to take from 'input' */
2433 if (width <= 0) {
2434 return zend_string_copy(marker);
2435 }
2436 mb_convert_buf_init(&buf, width, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2437
2438 if (first_call) {
2439 /* We can use the buffer of wchars which we have right now;
2440 * no need to convert again */
2441 goto dont_restart_conversion;
2442 } else {
2443 goto restart_conversion;
2444 }
2445 }
2446 }
2447 to_skip = 0;
2448 }
2449 first_call = false;
2450 }
2451
2452 /* The input string fits in the requested width; we don't need to append the trim marker
2453 * However, if the string contains erroneous byte sequences, those should be converted
2454 * to error markers */
2455 if (from == 0 && !input_err) {
2456 /* This just increments the string's refcount; it doesn't really 'copy' it */
2457 return zend_string_copy(input);
2458 }
2459 return mb_drop_chars(input, enc, from);
2460
2461 /* The input string is too wide; we need to build a new string which
2462 * includes some portion of the input string, with the trim marker
2463 * concatenated onto it */
2464 restart_conversion:
2465 in = (unsigned char*)ZSTR_VAL(input);
2466 in_len = ZSTR_LEN(input);
2467 state = 0;
2468
2469 while (true) {
2470 out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2471 ZEND_ASSERT(out_len <= 128);
2472
2473 dont_restart_conversion:
2474 if (out_len <= from) {
2475 from -= out_len;
2476 } else {
2477 for (int i = from; i < out_len; i++) {
2478 width -= character_width(wchar_buf[i]);
2479 if (width < 0) {
2480 enc->from_wchar(wchar_buf + from, i - from, &buf, true);
2481 goto append_trim_marker;
2482 }
2483 }
2484 ZEND_ASSERT(in_len > 0);
2485 enc->from_wchar(wchar_buf + from, out_len - from, &buf, false);
2486 from = 0;
2487 }
2488 }
2489
2490 append_trim_marker:
2491 if (ZSTR_LEN(marker) > 0) {
2492 MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
2493 memcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
2494 buf.out += ZSTR_LEN(marker);
2495 }
2496
2497 return mb_convert_buf_result(&buf);
2498 }
2499
2500 /* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
PHP_FUNCTION(mb_strimwidth)2501 PHP_FUNCTION(mb_strimwidth)
2502 {
2503 zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2504 zend_long from, width;
2505
2506 ZEND_PARSE_PARAMETERS_START(3, 5)
2507 Z_PARAM_STR(str)
2508 Z_PARAM_LONG(from)
2509 Z_PARAM_LONG(width)
2510 Z_PARAM_OPTIONAL
2511 Z_PARAM_STR(trimmarker)
2512 Z_PARAM_STR_OR_NULL(encoding)
2513 ZEND_PARSE_PARAMETERS_END();
2514
2515 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2516 if (!enc) {
2517 RETURN_THROWS();
2518 }
2519
2520 if (from != 0) {
2521 size_t str_len = mb_get_strlen(str, enc);
2522 if (from < 0) {
2523 from += str_len;
2524 }
2525 if (from < 0 || from > str_len) {
2526 zend_argument_value_error(2, "is out of range");
2527 RETURN_THROWS();
2528 }
2529 }
2530
2531 if (width < 0) {
2532 width += mb_get_strwidth(str, enc);
2533
2534 if (from > 0) {
2535 zend_string *trimmed = mb_pick_chars(str, enc, from);
2536 width -= mb_get_strwidth(trimmed, enc);
2537 zend_string_free(trimmed);
2538 }
2539
2540 if (width < 0) {
2541 zend_argument_value_error(3, "is out of range");
2542 RETURN_THROWS();
2543 }
2544 }
2545
2546 RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2547 }
2548
2549
2550 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2551 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2552 {
2553 return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2554 || (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2555 || (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2556 || (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2557 }
2558
2559
2560 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)2561 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
2562 {
2563 return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
2564 }
2565
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding)2566 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2567 {
2568 unsigned int num_errors = 0;
2569 zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2570 MBSTRG(illegalchars) += num_errors;
2571 return result;
2572 }
2573
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2574 MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2575 {
2576 const mbfl_encoding *from_encoding;
2577
2578 /* pre-conversion encoding */
2579 ZEND_ASSERT(num_from_encodings >= 1);
2580 if (num_from_encodings == 1) {
2581 from_encoding = *from_encodings;
2582 } else {
2583 /* auto detect */
2584 mbfl_string string;
2585 mbfl_string_init(&string);
2586 string.val = (unsigned char *)input;
2587 string.len = length;
2588 from_encoding = mbfl_identify_encoding(
2589 &string, from_encodings, num_from_encodings, MBSTRG(strict_detection));
2590 if (!from_encoding) {
2591 php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2592 return NULL;
2593 }
2594 }
2595
2596 return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2597 }
2598
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2599 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2600 {
2601 HashTable *output, *chash;
2602 zend_long idx;
2603 zend_string *key;
2604 zval *entry, entry_tmp;
2605
2606 if (!input) {
2607 return NULL;
2608 }
2609
2610 if (GC_IS_RECURSIVE(input)) {
2611 GC_UNPROTECT_RECURSION(input);
2612 php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2613 return NULL;
2614 }
2615 GC_TRY_PROTECT_RECURSION(input);
2616 output = zend_new_array(zend_hash_num_elements(input));
2617 ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2618 /* convert key */
2619 if (key) {
2620 zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2621 if (!converted_key) {
2622 continue;
2623 }
2624 key = converted_key;
2625 }
2626 /* convert value */
2627 ZEND_ASSERT(entry);
2628 try_again:
2629 switch(Z_TYPE_P(entry)) {
2630 case IS_STRING: {
2631 zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2632 if (!converted_key) {
2633 if (key) {
2634 zend_string_release(key);
2635 }
2636 continue;
2637 }
2638 ZVAL_STR(&entry_tmp, converted_key);
2639 break;
2640 }
2641 case IS_NULL:
2642 case IS_TRUE:
2643 case IS_FALSE:
2644 case IS_LONG:
2645 case IS_DOUBLE:
2646 ZVAL_COPY(&entry_tmp, entry);
2647 break;
2648 case IS_ARRAY:
2649 chash = php_mb_convert_encoding_recursive(
2650 Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2651 if (chash) {
2652 ZVAL_ARR(&entry_tmp, chash);
2653 } else {
2654 ZVAL_EMPTY_ARRAY(&entry_tmp);
2655 }
2656 break;
2657 case IS_REFERENCE:
2658 entry = Z_REFVAL_P(entry);
2659 goto try_again;
2660 case IS_OBJECT:
2661 default:
2662 if (key) {
2663 zend_string_release(key);
2664 }
2665 php_error_docref(NULL, E_WARNING, "Object is not supported");
2666 continue;
2667 }
2668 if (key) {
2669 zend_hash_add(output, key, &entry_tmp);
2670 zend_string_release(key);
2671 } else {
2672 zend_hash_index_add(output, idx, &entry_tmp);
2673 }
2674 } ZEND_HASH_FOREACH_END();
2675 GC_TRY_UNPROTECT_RECURSION(input);
2676
2677 return output;
2678 }
2679 /* }}} */
2680
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2681 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2682 {
2683 /* mbstring supports some 'text encodings' which aren't really text encodings
2684 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2685 * These should never be returned by `mb_detect_encoding`. */
2686 int shift = 0;
2687 for (int i = 0; i < *size; i++) {
2688 const mbfl_encoding *encoding = elist[i];
2689 if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2690 shift++; /* Remove this encoding from the list */
2691 } else if (shift) {
2692 elist[i - shift] = encoding;
2693 }
2694 }
2695 *size -= shift;
2696 }
2697
2698 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2699 PHP_FUNCTION(mb_convert_encoding)
2700 {
2701 zend_string *to_encoding_name;
2702 zend_string *input_str, *from_encodings_str = NULL;
2703 HashTable *input_ht, *from_encodings_ht = NULL;
2704 const mbfl_encoding **from_encodings;
2705 size_t num_from_encodings;
2706 bool free_from_encodings;
2707
2708 ZEND_PARSE_PARAMETERS_START(2, 3)
2709 Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2710 Z_PARAM_STR(to_encoding_name)
2711 Z_PARAM_OPTIONAL
2712 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2713 ZEND_PARSE_PARAMETERS_END();
2714
2715 const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2716 if (!to_encoding) {
2717 RETURN_THROWS();
2718 }
2719
2720 if (from_encodings_ht) {
2721 if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2722 RETURN_THROWS();
2723 }
2724 free_from_encodings = 1;
2725 } else if (from_encodings_str) {
2726 if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2727 &from_encodings, &num_from_encodings,
2728 /* persistent */ 0, /* arg_num */ 3, /* allow_pass_encoding */ 0) == FAILURE) {
2729 RETURN_THROWS();
2730 }
2731 free_from_encodings = 1;
2732 } else {
2733 from_encodings = &MBSTRG(current_internal_encoding);
2734 num_from_encodings = 1;
2735 free_from_encodings = 0;
2736 }
2737
2738 if (num_from_encodings > 1) {
2739 remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2740 }
2741
2742 if (!num_from_encodings) {
2743 efree(ZEND_VOIDP(from_encodings));
2744 zend_argument_value_error(3, "must specify at least one encoding");
2745 RETURN_THROWS();
2746 }
2747
2748 if (input_str) {
2749 zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2750 if (ret != NULL) {
2751 RETVAL_STR(ret);
2752 } else {
2753 RETVAL_FALSE;
2754 }
2755 } else {
2756 HashTable *tmp;
2757 tmp = php_mb_convert_encoding_recursive(
2758 input_ht, to_encoding, from_encodings, num_from_encodings);
2759 RETVAL_ARR(tmp);
2760 }
2761
2762 if (free_from_encodings) {
2763 efree(ZEND_VOIDP(from_encodings));
2764 }
2765 }
2766 /* }}} */
2767
mbstring_convert_case(int case_mode,const char * str,size_t str_len,size_t * ret_len,const mbfl_encoding * enc)2768 static char *mbstring_convert_case(
2769 int case_mode, const char *str, size_t str_len, size_t *ret_len,
2770 const mbfl_encoding *enc) {
2771 return php_unicode_convert_case(
2772 case_mode, str, str_len, ret_len, enc,
2773 MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2774 }
2775
2776 /* {{{ Returns a case-folded version of source_string */
PHP_FUNCTION(mb_convert_case)2777 PHP_FUNCTION(mb_convert_case)
2778 {
2779 zend_string *from_encoding = NULL;
2780 char *str;
2781 size_t str_len, ret_len;
2782 zend_long case_mode = 0;
2783
2784 ZEND_PARSE_PARAMETERS_START(2, 3)
2785 Z_PARAM_STRING(str, str_len)
2786 Z_PARAM_LONG(case_mode)
2787 Z_PARAM_OPTIONAL
2788 Z_PARAM_STR_OR_NULL(from_encoding)
2789 ZEND_PARSE_PARAMETERS_END();
2790
2791 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2792 if (!enc) {
2793 RETURN_THROWS();
2794 }
2795
2796 if (case_mode < 0 || case_mode > PHP_UNICODE_CASE_MODE_MAX) {
2797 zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2798 RETURN_THROWS();
2799 }
2800
2801 char *newstr = mbstring_convert_case(case_mode, str, str_len, &ret_len, enc);
2802 /* If newstr is NULL something went wrong in mbfl and this is a bug */
2803 ZEND_ASSERT(newstr != NULL);
2804
2805 // TODO: avoid reallocation ???
2806 RETVAL_STRINGL(newstr, ret_len);
2807 efree(newstr);
2808 }
2809 /* }}} */
2810
2811 /* {{{ Returns a upper cased version of source_string */
PHP_FUNCTION(mb_strtoupper)2812 PHP_FUNCTION(mb_strtoupper)
2813 {
2814 zend_string *from_encoding = NULL;
2815 char *str;
2816 size_t str_len, ret_len;
2817
2818 ZEND_PARSE_PARAMETERS_START(1, 2)
2819 Z_PARAM_STRING(str, str_len)
2820 Z_PARAM_OPTIONAL
2821 Z_PARAM_STR_OR_NULL(from_encoding)
2822 ZEND_PARSE_PARAMETERS_END();
2823
2824 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2825 if (!enc) {
2826 RETURN_THROWS();
2827 }
2828
2829 char *newstr = mbstring_convert_case(PHP_UNICODE_CASE_UPPER, str, str_len, &ret_len, enc);
2830 /* If newstr is NULL something went wrong in mbfl and this is a bug */
2831 ZEND_ASSERT(newstr != NULL);
2832
2833 // TODO: avoid reallocation ???
2834 RETVAL_STRINGL(newstr, ret_len);
2835 efree(newstr);
2836 }
2837 /* }}} */
2838
2839 /* {{{ Returns a lower cased version of source_string */
PHP_FUNCTION(mb_strtolower)2840 PHP_FUNCTION(mb_strtolower)
2841 {
2842 zend_string *from_encoding = NULL;
2843 char *str;
2844 size_t str_len;
2845 char *newstr;
2846 size_t ret_len;
2847 const mbfl_encoding *enc;
2848
2849 ZEND_PARSE_PARAMETERS_START(1, 2)
2850 Z_PARAM_STRING(str, str_len)
2851 Z_PARAM_OPTIONAL
2852 Z_PARAM_STR_OR_NULL(from_encoding)
2853 ZEND_PARSE_PARAMETERS_END();
2854
2855 enc = php_mb_get_encoding(from_encoding, 2);
2856 if (!enc) {
2857 RETURN_THROWS();
2858 }
2859
2860 newstr = mbstring_convert_case(PHP_UNICODE_CASE_LOWER, str, str_len, &ret_len, enc);
2861 /* If newstr is NULL something went wrong in mbfl and this is a bug */
2862 ZEND_ASSERT(newstr != NULL);
2863
2864 // TODO: avoid reallocation ???
2865 RETVAL_STRINGL(newstr, ret_len);
2866 efree(newstr);
2867 }
2868 /* }}} */
2869
duplicate_elist(const mbfl_encoding ** elist,size_t size)2870 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
2871 {
2872 const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
2873 memcpy(ZEND_VOIDP(new_elist), elist, size * sizeof(mbfl_encoding*));
2874 return new_elist;
2875 }
2876
2877 /* {{{ Encodings of the given string is returned (as a string) */
PHP_FUNCTION(mb_detect_encoding)2878 PHP_FUNCTION(mb_detect_encoding)
2879 {
2880 char *str;
2881 size_t str_len;
2882 zend_string *encoding_str = NULL;
2883 HashTable *encoding_ht = NULL;
2884 bool strict = 0;
2885
2886 mbfl_string string;
2887 const mbfl_encoding *ret;
2888 const mbfl_encoding **elist;
2889 size_t size;
2890
2891 ZEND_PARSE_PARAMETERS_START(1, 3)
2892 Z_PARAM_STRING(str, str_len)
2893 Z_PARAM_OPTIONAL
2894 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
2895 Z_PARAM_BOOL(strict)
2896 ZEND_PARSE_PARAMETERS_END();
2897
2898 /* make encoding list */
2899 if (encoding_ht) {
2900 if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
2901 RETURN_THROWS();
2902 }
2903 } else if (encoding_str) {
2904 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2, /* allow_pass_encoding */ 0)) {
2905 RETURN_THROWS();
2906 }
2907 } else {
2908 elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
2909 size = MBSTRG(current_detect_order_list_size);
2910 }
2911
2912 if (size == 0) {
2913 efree(ZEND_VOIDP(elist));
2914 zend_argument_value_error(2, "must specify at least one encoding");
2915 RETURN_THROWS();
2916 }
2917
2918 remove_non_encodings_from_elist(elist, &size);
2919 if (size == 0) {
2920 efree(ZEND_VOIDP(elist));
2921 RETURN_FALSE;
2922 }
2923
2924 if (ZEND_NUM_ARGS() < 3) {
2925 strict = MBSTRG(strict_detection);
2926 }
2927
2928 if (strict && size == 1) {
2929 /* If there is only a single candidate encoding, mb_check_encoding is faster */
2930 ret = (php_mb_check_encoding(str, str_len, *elist)) ? *elist : NULL;
2931 } else {
2932 mbfl_string_init(&string);
2933 string.val = (unsigned char *)str;
2934 string.len = str_len;
2935 ret = mbfl_identify_encoding(&string, elist, size, strict);
2936 }
2937
2938 efree(ZEND_VOIDP(elist));
2939
2940 if (ret == NULL) {
2941 RETURN_FALSE;
2942 }
2943
2944 RETVAL_STRING((char *)ret->name);
2945 }
2946 /* }}} */
2947
2948 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)2949 PHP_FUNCTION(mb_list_encodings)
2950 {
2951 ZEND_PARSE_PARAMETERS_NONE();
2952
2953 array_init(return_value);
2954 for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
2955 add_next_index_string(return_value, (*encodings)->name);
2956 }
2957 }
2958 /* }}} */
2959
2960 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)2961 PHP_FUNCTION(mb_encoding_aliases)
2962 {
2963 const mbfl_encoding *encoding;
2964 zend_string *encoding_name = NULL;
2965
2966 ZEND_PARSE_PARAMETERS_START(1, 1)
2967 Z_PARAM_STR(encoding_name)
2968 ZEND_PARSE_PARAMETERS_END();
2969
2970 encoding = php_mb_get_encoding(encoding_name, 1);
2971 if (!encoding) {
2972 RETURN_THROWS();
2973 }
2974
2975 array_init(return_value);
2976 if (encoding->aliases != NULL) {
2977 for (const char **alias = encoding->aliases; *alias; ++alias) {
2978 add_next_index_string(return_value, (char *)*alias);
2979 }
2980 }
2981 }
2982 /* }}} */
2983
2984 /* {{{ Converts the string to MIME "encoded-word" in the format of =?charset?(B|Q)?encoded_string?= */
PHP_FUNCTION(mb_encode_mimeheader)2985 PHP_FUNCTION(mb_encode_mimeheader)
2986 {
2987 const mbfl_encoding *charset, *transenc;
2988 mbfl_string string, result, *ret;
2989 zend_string *charset_name = NULL;
2990 char *trans_enc_name = NULL, *string_val;
2991 size_t trans_enc_name_len;
2992 char *linefeed = "\r\n";
2993 size_t linefeed_len;
2994 zend_long indent = 0;
2995
2996 string.encoding = MBSTRG(current_internal_encoding);
2997
2998 ZEND_PARSE_PARAMETERS_START(1, 5)
2999 Z_PARAM_STRING(string_val, string.len)
3000 Z_PARAM_OPTIONAL
3001 Z_PARAM_STR(charset_name)
3002 Z_PARAM_STRING(trans_enc_name, trans_enc_name_len)
3003 Z_PARAM_STRING(linefeed, linefeed_len)
3004 Z_PARAM_LONG(indent)
3005 ZEND_PARSE_PARAMETERS_END();
3006
3007 string.val = (unsigned char*)string_val;
3008 charset = &mbfl_encoding_pass;
3009 transenc = &mbfl_encoding_base64;
3010
3011 if (charset_name != NULL) {
3012 charset = php_mb_get_encoding(charset_name, 2);
3013 if (!charset) {
3014 RETURN_THROWS();
3015 } else if (charset->mime_name == NULL || charset->mime_name[0] == '\0') {
3016 zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
3017 RETURN_THROWS();
3018 }
3019 } else {
3020 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
3021 if (lang != NULL) {
3022 charset = mbfl_no2encoding(lang->mail_charset);
3023 transenc = mbfl_no2encoding(lang->mail_header_encoding);
3024 }
3025 }
3026
3027 if (trans_enc_name != NULL) {
3028 if (*trans_enc_name == 'B' || *trans_enc_name == 'b') {
3029 transenc = &mbfl_encoding_base64;
3030 } else if (*trans_enc_name == 'Q' || *trans_enc_name == 'q') {
3031 transenc = &mbfl_encoding_qprint;
3032 }
3033 }
3034
3035 mbfl_string_init(&result);
3036 ret = mbfl_mime_header_encode(&string, &result, charset, transenc, linefeed, indent);
3037 ZEND_ASSERT(ret != NULL);
3038 // TODO: avoid reallocation ???
3039 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
3040 efree(ret->val);
3041 }
3042 /* }}} */
3043
3044 /* {{{ Decodes the MIME "encoded-word" in the string */
PHP_FUNCTION(mb_decode_mimeheader)3045 PHP_FUNCTION(mb_decode_mimeheader)
3046 {
3047 char *string_val;
3048 mbfl_string string, result, *ret;
3049
3050 string.encoding = MBSTRG(current_internal_encoding);
3051
3052 ZEND_PARSE_PARAMETERS_START(1, 1)
3053 Z_PARAM_STRING(string_val, string.len)
3054 ZEND_PARSE_PARAMETERS_END();
3055
3056 string.val = (unsigned char*)string_val;
3057 mbfl_string_init(&result);
3058 ret = mbfl_mime_header_decode(&string, &result, MBSTRG(current_internal_encoding));
3059 ZEND_ASSERT(ret != NULL);
3060 // TODO: avoid reallocation ???
3061 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
3062 efree(ret->val);
3063 }
3064 /* }}} */
3065
/* Applies the kana conversion selected by the `mode` bit vector to `input`,
 * which is decoded from and re-encoded to `encoding`, and returns the result.
 *
 * The input is decoded in batches of at most 64 wchars. Codepoints are handed
 * to mb_convert_kana_codepoint() together with their successor, because a pair
 * may be consumed together. For that reason the last codepoint of a batch is
 * carried over to the front of the next batch, unless it was already consumed
 * or no input remains. */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion
	 * (when converting zenkaku kana to hankaku kana), so the output buffer is
	 * twice the size of the input buffer and needs no bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	unsigned int carried = 0; /* 1 if wchar_buf[0] holds a codepoint left over from the previous batch */
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		/* Decode after any carried-over codepoint, so as not to overwrite it */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + carried, 64 - carried, &state);
		out_len += carried;
		ZEND_ASSERT(out_len <= 64);

		if (out_len == 0) {
			continue;
		}

		uint32_t *converted = converted_buf;
		const size_t last = out_len - 1;
		size_t pos = 0;

		/* Convert every codepoint which has a successor in this batch */
		while (pos < last) {
			uint32_t second = 0;
			bool consumed = false;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[pos], wchar_buf[pos + 1], &consumed, &second, mode);
			if (second) {
				*converted++ = second;
			}
			/* Skip the successor as well if it was consumed */
			pos += consumed ? 2 : 1;
		}

		if (pos > last) {
			/* We consumed two codepoints at the very end of the wchar buffer,
			 * so there is nothing remaining to reprocess on the next iteration */
			carried = 0;
		} else if (in_len) {
			/* Reprocess the last codepoint on the next iteration */
			wchar_buf[0] = wchar_buf[last];
			carried = 1;
		} else {
			/* This is the last iteration, so the final codepoint must be processed now */
			uint32_t second = 0;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[last], 0, NULL, &second, mode);
			if (second) {
				*converted++ = second;
			}
		}

		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf);
}
3130
/* Option characters understood by mb_convert_kana(). The index of a character
 * in this table is the bit which it sets in the option bit vector. Entries
 * i and i+8 (for i in 0..7) are the upper/lower-case forms of the same letter. */
char mb_convert_kana_flags[17] = {
	/* bits 0-7 */
	'A', 'R', 'N', 'S', 'K', 'H', 'M', 'C',
	/* bits 8-15 */
	'a', 'r', 'n', 's', 'k', 'h', 'm', 'c',
	/* bit 16 */
	'V'
};
3136
3137 /* Conversion between full-width characters and half-width characters (Japanese) */
PHP_FUNCTION(mb_convert_kana)3138 PHP_FUNCTION(mb_convert_kana)
3139 {
3140 unsigned int opt;
3141 char *optstr = NULL;
3142 size_t optstr_len;
3143 zend_string *encname = NULL, *str;
3144
3145 ZEND_PARSE_PARAMETERS_START(1, 3)
3146 Z_PARAM_STR(str)
3147 Z_PARAM_OPTIONAL
3148 Z_PARAM_STRING(optstr, optstr_len)
3149 Z_PARAM_STR_OR_NULL(encname)
3150 ZEND_PARSE_PARAMETERS_END();
3151
3152 if (optstr != NULL) {
3153 char *p = optstr, *e = p + optstr_len;
3154 opt = 0;
3155 next_option:
3156 while (p < e) {
3157 /* Walk through option string and convert to bit vector
3158 * See translit_kana_jisx0201_jisx0208.h for the values used */
3159 char c = *p++;
3160 if (c == 'A') {
3161 opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
3162 } else if (c == 'a') {
3163 opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
3164 } else {
3165 for (int i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
3166 if (c == mb_convert_kana_flags[i]) {
3167 opt |= (1 << i);
3168 goto next_option;
3169 }
3170 }
3171
3172 zend_argument_value_error(2, "contains invalid flag: '%c'", c);
3173 RETURN_THROWS();
3174 }
3175 }
3176
3177 /* Check for illegal combinations of options */
3178 if (((opt & 0xFF00) >> 8) & opt) {
3179 /* It doesn't make sense to convert the same type of characters from halfwidth to
3180 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
3181 * FW hiragana to FW katakana and then back again. */
3182 int badflag = ((opt & 0xFF00) >> 8) & opt, i;
3183 for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
3184 char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
3185 if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
3186 flag1 = 'A';
3187 if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
3188 flag2 = 'a';
3189 zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
3190 RETURN_THROWS();
3191 }
3192
3193 if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
3194 /* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
3195 zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
3196 RETURN_THROWS();
3197 }
3198
3199 /* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
3200 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
3201 * more than one of these */
3202 if (opt & MBFL_ZEN2HAN_HIRAGANA) {
3203 if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3204 zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
3205 RETURN_THROWS();
3206 } else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3207 zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
3208 RETURN_THROWS();
3209 }
3210 } else if (opt & MBFL_ZEN2HAN_KATAKANA) {
3211 if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3212 zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
3213 RETURN_THROWS();
3214 } else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3215 zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
3216 RETURN_THROWS();
3217 }
3218 }
3219 } else {
3220 opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
3221 }
3222
3223 const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
3224 if (!enc) {
3225 RETURN_THROWS();
3226 }
3227
3228 RETVAL_STR(jp_kana_convert(str, enc, opt));
3229 }
3230
/* Feeds every string reachable from `var` (directly, or through nested arrays
 * and object property tables) to the encoding detector `identd`.
 *
 * Returns 1 as soon as the detector reports that detection is complete, and 0
 * otherwise. If a container which is already being visited is met again,
 * *recursion_error is set to 1 and the walk stops with a return value of 0. */
static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, int *recursion_error) /* {{{ */
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		mbfl_string chunk;
		chunk.val = (unsigned char *)Z_STRVAL_P(var);
		chunk.len = Z_STRLEN_P(var);
		/* Non-zero from the detector means detection is complete */
		return mbfl_encoding_detector_feed(identd, &chunk) ? 1 : 0;
	}

	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Other types carry no text to inspect */
		return 0;
	}

	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			*recursion_error = 1;
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	int finished = 0;
	HashTable *members = HASH_OF(var);
	if (members != NULL) {
		zval *member;
		ZEND_HASH_FOREACH_VAL_IND(members, member) {
			finished = mb_recursive_encoder_detector_feed(identd, member, recursion_error);
			/* Stop on completed detection, or on a recursion error from below */
			if (finished || *recursion_error) {
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	/* Single exit path: the recursion guard is always released here */
	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}
	return finished ? 1 : 0;
} /* }}} */
3276
/* Converts, in place, every string reachable from `var` (directly, or through
 * nested arrays and object property tables) using the converter `convd`.
 * Arrays are separated before their elements are modified.
 *
 * Returns 1 if a container which is already being visited is met again
 * (recursive reference), and 0 otherwise. */
static int mb_recursive_convert_variable(mbfl_buffer_converter *convd, zval *var) /* {{{ */
{
	/* Keep the zval as passed in; `var` itself is dereferenced below */
	zval *slot = var;
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		mbfl_string src, dst;
		src.val = (unsigned char *)Z_STRVAL_P(var);
		src.len = Z_STRLEN_P(var);

		mbfl_string *converted = mbfl_buffer_converter_feed_result(convd, &src, &dst);
		if (converted != NULL) {
			/* Replace the old value with the converted string */
			zval_ptr_dtor(slot);
			// TODO: avoid reallocation ???
			ZVAL_STRINGL(slot, (char *)converted->val, converted->len);
			efree(converted->val);
		}
		return 0;
	}

	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Other types are left untouched */
		return 0;
	}

	if (Z_TYPE_P(var) == IS_ARRAY) {
		SEPARATE_ARRAY(var);
	}
	if (Z_REFCOUNTED_P(var)) {
		if (Z_IS_RECURSIVE_P(var)) {
			return 1;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	int recursive = 0;
	HashTable *members = HASH_OF(var);
	if (members != NULL) {
		zval *member;
		ZEND_HASH_FOREACH_VAL_IND(members, member) {
			if (mb_recursive_convert_variable(convd, member)) {
				recursive = 1;
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	/* Single exit path: the recursion guard is always released here */
	if (Z_REFCOUNTED_P(var)) {
		Z_UNPROTECT_RECURSION_P(var);
	}
	return recursive;
} /* }}} */
3324
3325 /* {{{ Converts the string resource in variables to desired encoding */
PHP_FUNCTION(mb_convert_variables)3326 PHP_FUNCTION(mb_convert_variables)
3327 {
3328 zval *args;
3329 zend_string *to_enc_str;
3330 zend_string *from_enc_str;
3331 HashTable *from_enc_ht;
3332 mbfl_string string, result;
3333 const mbfl_encoding *from_encoding, *to_encoding;
3334 mbfl_encoding_detector *identd;
3335 mbfl_buffer_converter *convd;
3336 int n, argc;
3337 size_t elistsz;
3338 const mbfl_encoding **elist;
3339 int recursion_error = 0;
3340
3341 ZEND_PARSE_PARAMETERS_START(3, -1)
3342 Z_PARAM_STR(to_enc_str)
3343 Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3344 Z_PARAM_VARIADIC('+', args, argc)
3345 ZEND_PARSE_PARAMETERS_END();
3346
3347 /* new encoding */
3348 to_encoding = php_mb_get_encoding(to_enc_str, 1);
3349 if (!to_encoding) {
3350 RETURN_THROWS();
3351 }
3352
3353 /* initialize string */
3354 from_encoding = MBSTRG(current_internal_encoding);
3355 mbfl_string_init_set(&string, from_encoding);
3356 mbfl_string_init(&result);
3357
3358 /* pre-conversion encoding */
3359 if (from_enc_ht) {
3360 if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3361 RETURN_THROWS();
3362 }
3363 } else {
3364 if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2, /* allow_pass_encoding */ 0) == FAILURE) {
3365 RETURN_THROWS();
3366 }
3367 }
3368
3369 if (elistsz == 0) {
3370 efree(ZEND_VOIDP(elist));
3371 zend_argument_value_error(2, "must specify at least one encoding");
3372 RETURN_THROWS();
3373 }
3374
3375 if (elistsz == 1) {
3376 from_encoding = *elist;
3377 } else {
3378 /* auto detect */
3379 from_encoding = NULL;
3380 identd = mbfl_encoding_detector_new(elist, elistsz, MBSTRG(strict_detection));
3381 if (identd != NULL) {
3382 n = 0;
3383 while (n < argc) {
3384 if (mb_recursive_encoder_detector_feed(identd, &args[n], &recursion_error)) {
3385 break;
3386 }
3387 n++;
3388 }
3389 from_encoding = mbfl_encoding_detector_judge(identd);
3390 mbfl_encoding_detector_delete(identd);
3391 if (recursion_error) {
3392 efree(ZEND_VOIDP(elist));
3393 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3394 RETURN_FALSE;
3395 }
3396 }
3397
3398 if (!from_encoding) {
3399 php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3400 efree(ZEND_VOIDP(elist));
3401 RETURN_FALSE;
3402 }
3403 }
3404
3405 efree(ZEND_VOIDP(elist));
3406
3407 convd = mbfl_buffer_converter_new(from_encoding, to_encoding, 0);
3408 /* If this assertion fails this means some memory allocation failure which is a bug */
3409 ZEND_ASSERT(convd != NULL);
3410
3411 mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
3412 mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
3413
3414 /* convert */
3415 n = 0;
3416 while (n < argc) {
3417 zval *zv = &args[n];
3418
3419 ZVAL_DEREF(zv);
3420 recursion_error = mb_recursive_convert_variable(convd, zv);
3421 if (recursion_error) {
3422 break;
3423 }
3424 n++;
3425 }
3426
3427 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd);
3428 mbfl_buffer_converter_delete(convd);
3429
3430 if (recursion_error) {
3431 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3432 RETURN_FALSE;
3433 }
3434
3435 RETURN_STRING(from_encoding->name);
3436 }
3437 /* }}} */
3438
3439 /* HTML numeric entities */
3440
3441 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
make_conversion_map(HashTable * target_hash,int * convmap_size)3442 static uint32_t *make_conversion_map(HashTable *target_hash, int *convmap_size)
3443 {
3444 zval *hash_entry;
3445
3446 int n_elems = zend_hash_num_elements(target_hash);
3447 if (n_elems % 4 != 0) {
3448 zend_argument_value_error(2, "must have a multiple of 4 elements");
3449 return NULL;
3450 }
3451
3452 uint32_t *convmap = (uint32_t*)safe_emalloc(n_elems, sizeof(uint32_t), 0);
3453 uint32_t *mapelm = convmap;
3454
3455 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
3456 *mapelm++ = zval_get_long(hash_entry);
3457 } ZEND_HASH_FOREACH_END();
3458
3459 *convmap_size = n_elems / 4;
3460 return convmap;
3461 }
3462
/* Looks up the codepoint `w` in a conversion map made of `mapsize` groups of
 * four values: { low codepoint, high codepoint, offset, mask }.
 *
 * For the first group whose inclusive range contains `w`, stores
 * (w + offset) & mask in *retval and returns true. Returns false, leaving
 * *retval untouched, if no range matches. */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, int mapsize, uint32_t *retval)
{
	for (int group = 0; group < mapsize; group++) {
		const uint32_t *entry = convmap + (group * 4);
		const uint32_t lo_code = entry[0];
		const uint32_t hi_code = entry[1];

		if (w < lo_code || w > hi_code) {
			continue;
		}

		/* This wchar falls inside one of the ranges which should be
		 * converted to HTML entities */
		const uint32_t offset = entry[2];
		const uint32_t mask = entry[3];
		*retval = (w + offset) & mask;
		return true;
	}

	/* None of the ranges matched */
	return false;
}
3484
/* Re-encodes `input` (in `encoding`), replacing every codepoint which matches
 * the conversion map with a numeric HTML entity: "&#x...;" with upper-case
 * hexadecimal digits if `hex` is true, "&#...;" with decimal digits otherwise.
 * Codepoints which do not match the map are passed through unchanged. */
static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, int mapsize, bool hex)
{
	/* Each wchar which we get from decoding the input string may become up to
	 * 13 wchars when we convert it to an HTML entity */
	uint32_t wchar_buf[32], converted_buf[32 * 13];
	unsigned char entity[16]; /* For converting wchars to hex/decimal string */
	unsigned char *const entity_end = entity + sizeof(entity);

	const char *digits = hex ? "0123456789ABCDEF" : "0123456789";
	const uint32_t base = hex ? 16 : 10;

	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		/* Convert input string to wchars, up to 32 at a time */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf, 32, &state);
		ZEND_ASSERT(out_len <= 32);
		uint32_t *converted = converted_buf;

		/* Run through wchars and see if any of them fall into the ranges
		 * which we want to convert to HTML entities */
		for (size_t i = 0; i < out_len; i++) {
			uint32_t w = wchar_buf[i];

			if (!html_numeric_entity_convert(w, convmap, mapsize, &w)) {
				/* Not in any range: `w` is unchanged, emit it as is */
				*converted++ = w;
				continue;
			}

			*converted++ = '&';
			*converted++ = '#';
			if (hex) {
				*converted++ = 'x';
			}

			/* Write the digits of `w` backwards from the end of `entity`;
			 * the do-while emits a single '0' when w is zero */
			unsigned char *p = entity_end;
			do {
				*(--p) = digits[w % base];
				w /= base;
			} while (w > 0);

			while (p < entity_end) {
				*converted++ = *p++;
			}

			*converted++ = ';';
		}

		ZEND_ASSERT(converted <= converted_buf + sizeof(converted_buf)/sizeof(*converted_buf));
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf);
}
3550
3551 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)3552 PHP_FUNCTION(mb_encode_numericentity)
3553 {
3554 zend_string *encoding = NULL, *str;
3555 int mapsize;
3556 HashTable *target_hash;
3557 bool is_hex = false;
3558
3559 ZEND_PARSE_PARAMETERS_START(2, 4)
3560 Z_PARAM_STR(str)
3561 Z_PARAM_ARRAY_HT(target_hash)
3562 Z_PARAM_OPTIONAL
3563 Z_PARAM_STR_OR_NULL(encoding)
3564 Z_PARAM_BOOL(is_hex)
3565 ZEND_PARSE_PARAMETERS_END();
3566
3567 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3568 if (!enc) {
3569 RETURN_THROWS();
3570 }
3571
3572 uint32_t *convmap = make_conversion_map(target_hash, &mapsize);
3573 if (convmap == NULL) {
3574 RETURN_THROWS();
3575 }
3576
3577 RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, mapsize, is_hex));
3578 efree(convmap);
3579 }
3580 /* }}} */
3581
/* Reverse lookup for a numeric entity value: for each 4-value group of the
 * conversion map { low codepoint, high codepoint, offset, mask }, subtracts
 * the group's offset from `number` (unsigned, so it may wrap around) and tests
 * whether the result lies within the group's inclusive range.
 *
 * On the first match, stores that codepoint in *retval and returns true.
 * Returns false, leaving *retval untouched, if no group matches.
 * The mask (fourth value of each group) is not consulted here. */
static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, int mapsize, uint32_t *retval)
{
	for (int group = 0; group < mapsize; group++) {
		const uint32_t *entry = convmap + (group * 4);
		const uint32_t lo_code = entry[0];
		const uint32_t hi_code = entry[1];
		const uint32_t offset = entry[2];
		const uint32_t codepoint = number - offset;

		if (codepoint < lo_code || codepoint > hi_code) {
			continue;
		}

		*retval = codepoint;
		return true;
	}

	return false;
}
3599
3600 #define DEC_ENTITY_MINLEN 3 /* For "&#" and 1 decimal digit */
3601 #define HEX_ENTITY_MINLEN 4 /* For "&#x" and 1 hexadecimal digit */
3602 #define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
3603 #define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
3604
html_numeric_entity_decode(zend_string * input,const mbfl_encoding * encoding,uint32_t * convmap,int mapsize)3605 static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, int mapsize)
3606 {
3607 uint32_t wchar_buf[128], converted_buf[128];
3608
3609 unsigned int state = 0;
3610 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
3611 size_t in_len = ZSTR_LEN(input);
3612
3613 mb_convert_buf buf;
3614 mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
3615
3616 /* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
3617 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
3618 * 2nd 'converted' buffer.
3619 *
3620 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
3621 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
3622 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
3623 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
3624 * to store the next batch of wchars after it.
3625 *
3626 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
3627 * wchars from the 1st buffer to the 2nd one.
3628 *
3629 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
3630 * have to do bounds checks when writing wchars into it.
3631 */
3632
3633 unsigned int wchar_buf_offset = 0;
3634
3635 while (in_len) {
3636 /* Leave space for sentinel at the end of the buffer */
3637 size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
3638 out_len += wchar_buf_offset;
3639 ZEND_ASSERT(out_len <= 127);
3640 wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */
3641
3642 uint32_t *p, *converted;
3643
3644 /* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
3645 * be there (in `wchar_buf[0]`), so don't bother in that case */
3646 if (wchar_buf_offset == 0) {
3647 p = wchar_buf;
3648 while (*p != '&')
3649 p++;
3650 if (p == wchar_buf + out_len) {
3651 /* No HTML entities in this buffer */
3652 encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
3653 continue;
3654 }
3655
3656 /* Copy over the prefix with no & which we already scanned */
3657 memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
3658 converted = converted_buf + (p - wchar_buf);
3659 } else {
3660 p = wchar_buf;
3661 converted = converted_buf;
3662 }
3663
3664 found_ampersand:
3665 ZEND_ASSERT(*p == '&');
3666 uint32_t *p2 = p;
3667
3668 /* These tests can't overrun end of buffer, because we have a '&' sentinel there */
3669 if (*++p2 == '#') {
3670 if (*++p2 == 'x') {
3671 /* Possible hex entity */
3672 uint32_t w = *++p2;
3673 while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
3674 w = *++p2;
3675 if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
3676 /* We hit the end of the buffer while reading digits, and
3677 * more wchars are still coming in the next buffer
3678 * Reprocess this identity on next iteration */
3679 memmove(wchar_buf, p, (p2 - p) * 4);
3680 wchar_buf_offset = p2 - p;
3681 goto process_converted_wchars;
3682 } else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
3683 /* Invalid entity (too long or "&#x" only) */
3684 memcpy(converted, p, (p2 - p) * 4);
3685 converted += p2 - p;
3686 } else {
3687 /* Valid hexadecimal entity */
3688 uint32_t value = 0, *p3 = p + 3;
3689 while (p3 < p2) {
3690 w = *p3++;
3691 if (w <= '9') {
3692 value = (value * 16) + (w - '0');
3693 } else if (w >= 'a') {
3694 value = (value * 16) + 10 + (w - 'a');
3695 } else {
3696 value = (value * 16) + 10 + (w - 'A');
3697 }
3698 }
3699 if (html_numeric_entity_deconvert(value, convmap, mapsize, converted)) {
3700 converted++;
3701 if (*p2 == ';')
3702 p2++;
3703 } else {
3704 memcpy(converted, p, (p2 - p) * 4);
3705 converted += p2 - p;
3706 }
3707 }
3708 } else {
3709 /* Possible decimal entity */
3710 uint32_t w = *p2;
3711 while (w >= '0' && w <= '9')
3712 w = *++p2;
3713 if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
3714 /* The number of digits was legal (no more than 10 decimal digits)
3715 * Reprocess this identity on next iteration of main loop */
3716 memmove(wchar_buf, p, (p2 - p) * 4);
3717 wchar_buf_offset = p2 - p;
3718 goto process_converted_wchars;
3719 } else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
3720 /* Invalid entity (too long or "&#" only) */
3721 memcpy(converted, p, (p2 - p) * 4);
3722 converted += p2 - p;
3723 } else {
3724 /* Valid decimal entity */
3725 uint32_t value = 0, *p3 = p + 2;
3726 while (p3 < p2) {
3727 /* If unsigned integer overflow would occur in the below
3728 * multiplication by 10, this entity is no good
3729 * 0x19999999 is 1/10th of 0xFFFFFFFF */
3730 if (value > 0x19999999) {
3731 memcpy(converted, p, (p2 - p) * 4);
3732 converted += p2 - p;
3733 goto decimal_entity_too_big;
3734 }
3735 value = (value * 10) + (*p3++ - '0');
3736 }
3737 if (html_numeric_entity_deconvert(value, convmap, mapsize, converted)) {
3738 converted++;
3739 if (*p2 == ';')
3740 p2++;
3741 } else {
3742 memcpy(converted, p, (p2 - p) * 4);
3743 converted += p2 - p;
3744 }
3745 }
3746 }
3747 } else if ((p2 == wchar_buf + out_len) && in_len) {
3748 /* Corner case: & at end of buffer */
3749 wchar_buf[0] = '&';
3750 wchar_buf_offset = 1;
3751 goto process_converted_wchars;
3752 } else {
3753 *converted++ = '&';
3754 }
3755 decimal_entity_too_big:
3756
3757 /* Starting to scan a new section of the wchar buffer
3758 * 'p2' is pointing at the next wchar which needs to be processed */
3759 p = p2;
3760 while (*p2 != '&')
3761 p2++;
3762
3763 if (p2 > p) {
3764 memcpy(converted, p, (p2 - p) * 4);
3765 converted += p2 - p;
3766 p = p2;
3767 }
3768
3769 if (p < wchar_buf + out_len)
3770 goto found_ampersand;
3771
3772 /* We do not have any wchars remaining at the end of this buffer which
3773 * we need to reprocess on the next call */
3774 wchar_buf_offset = 0;
3775 process_converted_wchars:
3776 ZEND_ASSERT(converted <= converted_buf + 128);
3777 encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
3778 }
3779
3780 return mb_convert_buf_result(&buf);
3781 }
3782
3783 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)3784 PHP_FUNCTION(mb_decode_numericentity)
3785 {
3786 zend_string *encoding = NULL, *str;
3787 int mapsize;
3788 HashTable *target_hash;
3789
3790 ZEND_PARSE_PARAMETERS_START(2, 3)
3791 Z_PARAM_STR(str)
3792 Z_PARAM_ARRAY_HT(target_hash)
3793 Z_PARAM_OPTIONAL
3794 Z_PARAM_STR_OR_NULL(encoding)
3795 ZEND_PARSE_PARAMETERS_END();
3796
3797 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3798 if (!enc) {
3799 RETURN_THROWS();
3800 }
3801
3802 uint32_t *convmap = make_conversion_map(target_hash, &mapsize);
3803 if (convmap == NULL) {
3804 RETURN_THROWS();
3805 }
3806
3807 RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, mapsize));
3808 efree(convmap);
3809 }
3810 /* }}} */
3811
3812 /* {{{ Sends an email message with MIME scheme */
3813 #define CRLF "\r\n"
3814
_php_mbstr_parse_mail_headers(HashTable * ht,const char * str,size_t str_len)3815 static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
3816 {
3817 const char *ps;
3818 size_t icnt;
3819 int state = 0;
3820 int crlf_state = -1;
3821 char *token = NULL;
3822 size_t token_pos = 0;
3823 zend_string *fld_name, *fld_val;
3824
3825 ps = str;
3826 icnt = str_len;
3827 fld_name = fld_val = NULL;
3828
3829 /*
3830 * C o n t e n t - T y p e : t e x t / h t m l \r\n
3831 * ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
3832 * state 0 1 2 3
3833 *
3834 * C o n t e n t - T y p e : t e x t / h t m l \r\n
3835 * ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
3836 * crlf_state -1 0 1 -1
3837 *
3838 */
3839
3840 while (icnt > 0) {
3841 switch (*ps) {
3842 case ':':
3843 if (crlf_state == 1) {
3844 token_pos++;
3845 }
3846
3847 if (state == 0 || state == 1) {
3848 if(token && token_pos > 0) {
3849 fld_name = zend_string_init(token, token_pos, 0);
3850 }
3851 state = 2;
3852 } else {
3853 token_pos++;
3854 }
3855
3856 crlf_state = 0;
3857 break;
3858
3859 case '\n':
3860 if (crlf_state == -1) {
3861 goto out;
3862 }
3863 crlf_state = -1;
3864 break;
3865
3866 case '\r':
3867 if (crlf_state == 1) {
3868 token_pos++;
3869 } else {
3870 crlf_state = 1;
3871 }
3872 break;
3873
3874 case ' ': case '\t':
3875 if (crlf_state == -1) {
3876 if (state == 3) {
3877 /* continuing from the previous line */
3878 state = 4;
3879 } else {
3880 /* simply skipping this new line */
3881 state = 5;
3882 }
3883 } else {
3884 if (crlf_state == 1) {
3885 token_pos++;
3886 }
3887 if (state == 1 || state == 3) {
3888 token_pos++;
3889 }
3890 }
3891 crlf_state = 0;
3892 break;
3893
3894 default:
3895 switch (state) {
3896 case 0:
3897 token = (char*)ps;
3898 token_pos = 0;
3899 state = 1;
3900 break;
3901
3902 case 2:
3903 if (crlf_state != -1) {
3904 token = (char*)ps;
3905 token_pos = 0;
3906
3907 state = 3;
3908 break;
3909 }
3910 ZEND_FALLTHROUGH;
3911
3912 case 3:
3913 if (crlf_state == -1) {
3914 if(token && token_pos > 0) {
3915 fld_val = zend_string_init(token, token_pos, 0);
3916 }
3917
3918 if (fld_name != NULL && fld_val != NULL) {
3919 zval val;
3920 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
3921 ZVAL_STR(&val, fld_val);
3922
3923 zend_hash_update(ht, fld_name, &val);
3924
3925 zend_string_release_ex(fld_name, 0);
3926 }
3927
3928 fld_name = fld_val = NULL;
3929 token = (char*)ps;
3930 token_pos = 0;
3931
3932 state = 1;
3933 }
3934 break;
3935
3936 case 4:
3937 token_pos++;
3938 state = 3;
3939 break;
3940 }
3941
3942 if (crlf_state == 1) {
3943 token_pos++;
3944 }
3945
3946 token_pos++;
3947
3948 crlf_state = 0;
3949 break;
3950 }
3951 ps++, icnt--;
3952 }
3953 out:
3954 if (state == 2) {
3955 token = "";
3956 token_pos = 0;
3957
3958 state = 3;
3959 }
3960 if (state == 3) {
3961 if(token && token_pos > 0) {
3962 fld_val = zend_string_init(token, token_pos, 0);
3963 }
3964 if (fld_name != NULL && fld_val != NULL) {
3965 zval val;
3966 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
3967 ZVAL_STR(&val, fld_val);
3968 zend_hash_update(ht, fld_name, &val);
3969
3970 zend_string_release_ex(fld_name, 0);
3971 }
3972 }
3973 return state;
3974 }
3975
PHP_FUNCTION(mb_send_mail)3976 PHP_FUNCTION(mb_send_mail)
3977 {
3978 char *to;
3979 size_t to_len;
3980 char *message;
3981 size_t message_len;
3982 char *subject;
3983 size_t subject_len;
3984 zend_string *extra_cmd = NULL;
3985 HashTable *headers_ht = NULL;
3986 zend_string *str_headers = NULL;
3987 size_t n, i;
3988 char *to_r = NULL;
3989 char *force_extra_parameters = INI_STR("mail.force_extra_parameters");
3990 struct {
3991 int cnt_type:1;
3992 int cnt_trans_enc:1;
3993 } suppressed_hdrs = { 0, 0 };
3994
3995 char *message_buf = NULL, *subject_buf = NULL, *p;
3996 mbfl_string orig_str, conv_str;
3997 mbfl_string *pstr; /* pointer to mbfl string for return value */
3998 enum mbfl_no_encoding;
3999 const mbfl_encoding *tran_cs, /* transfer text charset */
4000 *head_enc, /* header transfer encoding */
4001 *body_enc; /* body transfer encoding */
4002 mbfl_memory_device device; /* automatic allocateable buffer for additional header */
4003 const mbfl_language *lang;
4004 int err = 0;
4005 HashTable ht_headers;
4006 zval *s;
4007 extern void mbfl_memory_device_unput(mbfl_memory_device *device);
4008
4009 /* initialize */
4010 mbfl_memory_device_init(&device, 0, 0);
4011 mbfl_string_init(&orig_str);
4012 mbfl_string_init(&conv_str);
4013
4014 /* character-set, transfer-encoding */
4015 tran_cs = &mbfl_encoding_utf8;
4016 head_enc = &mbfl_encoding_base64;
4017 body_enc = &mbfl_encoding_base64;
4018 lang = mbfl_no2language(MBSTRG(language));
4019 if (lang != NULL) {
4020 tran_cs = mbfl_no2encoding(lang->mail_charset);
4021 head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4022 body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4023 }
4024
4025 ZEND_PARSE_PARAMETERS_START(3, 5)
4026 Z_PARAM_PATH(to, to_len)
4027 Z_PARAM_PATH(subject, subject_len)
4028 Z_PARAM_PATH(message, message_len)
4029 Z_PARAM_OPTIONAL
4030 Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4031 Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4032 ZEND_PARSE_PARAMETERS_END();
4033
4034 if (str_headers) {
4035 if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4036 zend_argument_value_error(4, "must not contain any null bytes");
4037 RETURN_THROWS();
4038 }
4039 str_headers = php_trim(str_headers, NULL, 0, 2);
4040 } else if (headers_ht) {
4041 str_headers = php_mail_build_headers(headers_ht);
4042 if (EG(exception)) {
4043 RETURN_THROWS();
4044 }
4045 }
4046
4047 zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4048
4049 if (str_headers != NULL) {
4050 _php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4051 }
4052
4053 if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4054 char *tmp;
4055 char *param_name;
4056 char *charset = NULL;
4057
4058 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4059 p = strchr(Z_STRVAL_P(s), ';');
4060
4061 if (p != NULL) {
4062 /* skipping the padded spaces */
4063 do {
4064 ++p;
4065 } while (*p == ' ' || *p == '\t');
4066
4067 if (*p != '\0') {
4068 if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4069 if (strcasecmp(param_name, "charset") == 0) {
4070 const mbfl_encoding *_tran_cs = tran_cs;
4071
4072 charset = php_strtok_r(NULL, "= \"", &tmp);
4073 if (charset != NULL) {
4074 _tran_cs = mbfl_name2encoding(charset);
4075 }
4076
4077 if (!_tran_cs) {
4078 php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4079 _tran_cs = &mbfl_encoding_ascii;
4080 }
4081 tran_cs = _tran_cs;
4082 }
4083 }
4084 }
4085 }
4086 suppressed_hdrs.cnt_type = 1;
4087 }
4088
4089 if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4090 const mbfl_encoding *_body_enc;
4091
4092 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
4093 _body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4094 switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4095 case mbfl_no_encoding_base64:
4096 case mbfl_no_encoding_7bit:
4097 case mbfl_no_encoding_8bit:
4098 body_enc = _body_enc;
4099 break;
4100
4101 default:
4102 php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4103 body_enc = &mbfl_encoding_8bit;
4104 break;
4105 }
4106 suppressed_hdrs.cnt_trans_enc = 1;
4107 }
4108
4109 /* To: */
4110 if (to_len > 0) {
4111 to_r = estrndup(to, to_len);
4112 for (; to_len; to_len--) {
4113 if (!isspace((unsigned char) to_r[to_len - 1])) {
4114 break;
4115 }
4116 to_r[to_len - 1] = '\0';
4117 }
4118 for (i = 0; to_r[i]; i++) {
4119 if (iscntrl((unsigned char) to_r[i])) {
4120 /* According to RFC 822, section 3.1.1 long headers may be separated into
4121 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4122 * To prevent these separators from being replaced with a space, we skip over them. */
4123 if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4124 i += 2;
4125 while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4126 i++;
4127 }
4128 continue;
4129 }
4130
4131 to_r[i] = ' ';
4132 }
4133 }
4134 } else {
4135 to_r = to;
4136 }
4137
4138 /* Subject: */
4139 orig_str.val = (unsigned char *)subject;
4140 orig_str.len = subject_len;
4141 orig_str.encoding = MBSTRG(current_internal_encoding);
4142 if (orig_str.encoding->no_encoding == mbfl_no_encoding_invalid
4143 || orig_str.encoding->no_encoding == mbfl_no_encoding_pass) {
4144 orig_str.encoding = mbfl_identify_encoding(&orig_str, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
4145 }
4146 const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4147 size_t line_sep_len = strlen(line_sep);
4148 pstr = mbfl_mime_header_encode(&orig_str, &conv_str, tran_cs, head_enc, line_sep, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4149 if (pstr != NULL) {
4150 subject_buf = subject = (char *)pstr->val;
4151 }
4152
4153 /* message body */
4154 orig_str.val = (unsigned char *)message;
4155 orig_str.len = message_len;
4156 orig_str.encoding = MBSTRG(current_internal_encoding);
4157
4158 if (orig_str.encoding->no_encoding == mbfl_no_encoding_invalid
4159 || orig_str.encoding->no_encoding == mbfl_no_encoding_pass) {
4160 orig_str.encoding = mbfl_identify_encoding(&orig_str, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
4161 }
4162
4163 pstr = NULL;
4164 {
4165 mbfl_string tmpstr;
4166
4167 if (mbfl_convert_encoding(&orig_str, &tmpstr, tran_cs) != NULL) {
4168 tmpstr.encoding = &mbfl_encoding_8bit;
4169 pstr = mbfl_convert_encoding(&tmpstr, &conv_str, body_enc);
4170 efree(tmpstr.val);
4171 }
4172 }
4173 if (pstr != NULL) {
4174 message_buf = message = (char *)pstr->val;
4175 }
4176
4177 /* other headers */
4178 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4179 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4180 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4181 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4182 if (str_headers != NULL) {
4183 p = ZSTR_VAL(str_headers);
4184 n = ZSTR_LEN(str_headers);
4185 mbfl_memory_device_strncat(&device, p, n);
4186 if (n > 0 && p[n - 1] != '\n') {
4187 mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4188 }
4189 zend_string_release_ex(str_headers, 0);
4190 }
4191
4192 if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4193 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4194 mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4195 }
4196
4197 if (!suppressed_hdrs.cnt_type) {
4198 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4199
4200 p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4201 if (p != NULL) {
4202 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4203 mbfl_memory_device_strcat(&device, p);
4204 }
4205 mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4206 }
4207 if (!suppressed_hdrs.cnt_trans_enc) {
4208 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4209 p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4210 if (p == NULL) {
4211 p = "7bit";
4212 }
4213 mbfl_memory_device_strcat(&device, p);
4214 mbfl_memory_device_strncat(&device, line_sep, line_sep_len);
4215 }
4216
4217 if (!PG(mail_mixed_lf_and_crlf)) {
4218 mbfl_memory_device_unput(&device);
4219 }
4220 mbfl_memory_device_unput(&device);
4221 mbfl_memory_device_output('\0', &device);
4222 str_headers = zend_string_init((char *)device.buffer, strlen((char *)device.buffer), 0);
4223
4224 if (force_extra_parameters) {
4225 extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4226 } else if (extra_cmd) {
4227 extra_cmd = php_escape_shell_cmd(ZSTR_VAL(extra_cmd));
4228 }
4229
4230 if (!err && php_mail(to_r, subject, message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL)) {
4231 RETVAL_TRUE;
4232 } else {
4233 RETVAL_FALSE;
4234 }
4235
4236 if (extra_cmd) {
4237 zend_string_release_ex(extra_cmd, 0);
4238 }
4239
4240 if (to_r != to) {
4241 efree(to_r);
4242 }
4243 if (subject_buf) {
4244 efree((void *)subject_buf);
4245 }
4246 if (message_buf) {
4247 efree((void *)message_buf);
4248 }
4249 mbfl_memory_device_clear(&device);
4250 zend_hash_destroy(&ht_headers);
4251 if (str_headers) {
4252 zend_string_release_ex(str_headers, 0);
4253 }
4254 }
4255
4256 #undef CRLF
4257 #undef MAIL_ASCIIZ_CHECK_MBSTRING
4258 #undef PHP_MBSTR_MAIL_MIME_HEADER1
4259 #undef PHP_MBSTR_MAIL_MIME_HEADER2
4260 #undef PHP_MBSTR_MAIL_MIME_HEADER3
4261 #undef PHP_MBSTR_MAIL_MIME_HEADER4
4262 /* }}} */
4263
4264 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)4265 PHP_FUNCTION(mb_get_info)
4266 {
4267 zend_string *type = NULL;
4268 size_t n;
4269 char *name;
4270 zval row;
4271 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
4272 const mbfl_encoding **entry;
4273
4274 ZEND_PARSE_PARAMETERS_START(0, 1)
4275 Z_PARAM_OPTIONAL
4276 Z_PARAM_STR(type)
4277 ZEND_PARSE_PARAMETERS_END();
4278
4279 if (!type || zend_string_equals_literal_ci(type, "all")) {
4280 array_init(return_value);
4281 if (MBSTRG(current_internal_encoding)) {
4282 add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4283 }
4284 if (MBSTRG(http_input_identify)) {
4285 add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4286 }
4287 if (MBSTRG(current_http_output_encoding)) {
4288 add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4289 }
4290 if ((name = (char *)zend_ini_string("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)) != NULL) {
4291 add_assoc_string(return_value, "http_output_conv_mimetypes", name);
4292 }
4293 if (lang != NULL) {
4294 if ((name = (char *)mbfl_no_encoding2name(lang->mail_charset)) != NULL) {
4295 add_assoc_string(return_value, "mail_charset", name);
4296 }
4297 if ((name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding)) != NULL) {
4298 add_assoc_string(return_value, "mail_header_encoding", name);
4299 }
4300 if ((name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding)) != NULL) {
4301 add_assoc_string(return_value, "mail_body_encoding", name);
4302 }
4303 }
4304 add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4305 if (MBSTRG(encoding_translation)) {
4306 add_assoc_string(return_value, "encoding_translation", "On");
4307 } else {
4308 add_assoc_string(return_value, "encoding_translation", "Off");
4309 }
4310 if ((name = (char *)mbfl_no_language2name(MBSTRG(language))) != NULL) {
4311 add_assoc_string(return_value, "language", name);
4312 }
4313 n = MBSTRG(current_detect_order_list_size);
4314 entry = MBSTRG(current_detect_order_list);
4315 if (n > 0) {
4316 size_t i;
4317 array_init(&row);
4318 for (i = 0; i < n; i++) {
4319 add_next_index_string(&row, (*entry)->name);
4320 entry++;
4321 }
4322 add_assoc_zval(return_value, "detect_order", &row);
4323 }
4324 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4325 add_assoc_string(return_value, "substitute_character", "none");
4326 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4327 add_assoc_string(return_value, "substitute_character", "long");
4328 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4329 add_assoc_string(return_value, "substitute_character", "entity");
4330 } else {
4331 add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4332 }
4333 if (MBSTRG(strict_detection)) {
4334 add_assoc_string(return_value, "strict_detection", "On");
4335 } else {
4336 add_assoc_string(return_value, "strict_detection", "Off");
4337 }
4338 } else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4339 if (MBSTRG(current_internal_encoding)) {
4340 RETVAL_STRING((char *)MBSTRG(current_internal_encoding)->name);
4341 }
4342 } else if (zend_string_equals_literal_ci(type, "http_input")) {
4343 if (MBSTRG(http_input_identify)) {
4344 RETVAL_STRING((char *)MBSTRG(http_input_identify)->name);
4345 }
4346 } else if (zend_string_equals_literal_ci(type, "http_output")) {
4347 if (MBSTRG(current_http_output_encoding)) {
4348 RETVAL_STRING((char *)MBSTRG(current_http_output_encoding)->name);
4349 }
4350 } else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4351 if ((name = (char *)zend_ini_string("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)) != NULL) {
4352 RETVAL_STRING(name);
4353 }
4354 } else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4355 if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_charset)) != NULL) {
4356 RETVAL_STRING(name);
4357 }
4358 } else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4359 if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding)) != NULL) {
4360 RETVAL_STRING(name);
4361 }
4362 } else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4363 if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding)) != NULL) {
4364 RETVAL_STRING(name);
4365 }
4366 } else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4367 RETVAL_LONG(MBSTRG(illegalchars));
4368 } else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4369 if (MBSTRG(encoding_translation)) {
4370 RETVAL_STRING("On");
4371 } else {
4372 RETVAL_STRING("Off");
4373 }
4374 } else if (zend_string_equals_literal_ci(type, "language")) {
4375 if ((name = (char *)mbfl_no_language2name(MBSTRG(language))) != NULL) {
4376 RETVAL_STRING(name);
4377 }
4378 } else if (zend_string_equals_literal_ci(type, "detect_order")) {
4379 n = MBSTRG(current_detect_order_list_size);
4380 entry = MBSTRG(current_detect_order_list);
4381 if (n > 0) {
4382 size_t i;
4383 array_init(return_value);
4384 for (i = 0; i < n; i++) {
4385 add_next_index_string(return_value, (*entry)->name);
4386 entry++;
4387 }
4388 }
4389 } else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4390 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
4391 RETVAL_STRING("none");
4392 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
4393 RETVAL_STRING("long");
4394 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
4395 RETVAL_STRING("entity");
4396 } else {
4397 RETVAL_LONG(MBSTRG(current_filter_illegal_substchar));
4398 }
4399 } else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4400 if (MBSTRG(strict_detection)) {
4401 RETVAL_STRING("On");
4402 } else {
4403 RETVAL_STRING("Off");
4404 }
4405 } else {
4406 // TODO Convert to ValueError
4407 RETURN_FALSE;
4408 }
4409 }
4410 /* }}} */
4411
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)4412 MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4413 {
4414 uint32_t wchar_buf[128];
4415 unsigned char *in = (unsigned char*)input;
4416 unsigned int state = 0;
4417
4418 if (encoding->check != NULL) {
4419 return encoding->check(in, length);
4420 }
4421
4422 /* If the input string is not encoded in the given encoding, there is a significant chance
4423 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4424 * buffer of 128 codepoints, convert and check just a few codepoints first */
4425 size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4426 ZEND_ASSERT(out_len <= 8);
4427 for (int i = 0; i < out_len; i++) {
4428 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4429 return 0;
4430 }
4431 }
4432
4433 while (length) {
4434 out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4435 ZEND_ASSERT(out_len <= 128);
4436 for (int i = 0; i < out_len; i++) {
4437 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4438 return 0;
4439 }
4440 }
4441 }
4442
4443 return 1;
4444 }
4445
/* Validates every string key and every value of `vars` against `encoding`,
 * descending into nested arrays. Returns 1 when everything is valid and 0
 * otherwise.
 *
 * Strings must pass php_mb_check_encoding(); integers, floats, null and
 * booleans are accepted as they are; any other value type makes the array
 * invalid. An array that is already being visited (a circular reference)
 * triggers a warning and counts as invalid.
 *
 * Iteration stops at the first invalid key or value. */
static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	zend_long idx;
	zend_string *key;
	zval *entry;
	int valid = 1;

	(void)(idx); /* Suppress spurious compiler warning that `idx` is not used */

	if (GC_IS_RECURSIVE(vars)) {
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return 0;
	}
	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
		ZVAL_DEREF(entry);
		if (key && !php_mb_check_encoding(ZSTR_VAL(key), ZSTR_LEN(key), encoding)) {
			valid = 0;
			break;
		}
		switch (Z_TYPE_P(entry)) {
			case IS_STRING:
				if (!php_mb_check_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), encoding)) {
					valid = 0;
				}
				break;
			case IS_ARRAY:
				if (!php_mb_check_encoding_recursive(Z_ARRVAL_P(entry), encoding)) {
					valid = 0;
				}
				break;
			case IS_LONG:
			case IS_DOUBLE:
			case IS_NULL:
			case IS_TRUE:
			case IS_FALSE:
				break;
			default:
				/* Other types are error. */
				valid = 0;
				break;
		}
		/* A `break` inside the switch only leaves the switch, so the loop has
		 * to be left explicitly once the verdict is known */
		if (!valid) {
			break;
		}
	} ZEND_HASH_FOREACH_END();
	/* Reached on every exit from the loop, so the protection flag is always cleared */
	GC_TRY_UNPROTECT_RECURSION(vars);
	return valid;
}
4496
4497 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)4498 PHP_FUNCTION(mb_check_encoding)
4499 {
4500 zend_string *input_str = NULL, *enc = NULL;
4501 HashTable *input_ht = NULL;
4502 const mbfl_encoding *encoding;
4503
4504 ZEND_PARSE_PARAMETERS_START(0, 2)
4505 Z_PARAM_OPTIONAL
4506 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
4507 Z_PARAM_STR_OR_NULL(enc)
4508 ZEND_PARSE_PARAMETERS_END();
4509
4510 encoding = php_mb_get_encoding(enc, 2);
4511 if (!encoding) {
4512 RETURN_THROWS();
4513 }
4514
4515 if (input_ht) {
4516 RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
4517 } else if (input_str) {
4518 RETURN_BOOL(php_mb_check_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), encoding));
4519 } else {
4520 php_error_docref(NULL, E_DEPRECATED,
4521 "Calling mb_check_encoding() without argument is deprecated");
4522
4523 /* FIXME: Actually check all inputs, except $_FILES file content. */
4524 RETURN_BOOL(MBSTRG(illegalchars) == 0);
4525 }
4526 }
4527 /* }}} */
4528
4529
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)4530 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
4531 const uint32_t enc_name_arg_num)
4532 {
4533 const mbfl_encoding *enc;
4534 enum mbfl_no_encoding no_enc;
4535
4536 ZEND_ASSERT(str_len > 0);
4537
4538 enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
4539 if (!enc) {
4540 return -2;
4541 }
4542
4543 no_enc = enc->no_encoding;
4544 if (php_mb_is_unsupported_no_encoding(no_enc)) {
4545 zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
4546 return -2;
4547 }
4548
4549 /* Some legacy text encodings have a minimum required wchar buffer size;
4550 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
4551 uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
4552 unsigned int state = 0;
4553 size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
4554 ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
4555
4556 if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
4557 return -1;
4558 }
4559 return wchar_buf[0];
4560 }
4561
4562
4563 /* {{{ */
PHP_FUNCTION(mb_ord)4564 PHP_FUNCTION(mb_ord)
4565 {
4566 char *str;
4567 size_t str_len;
4568 zend_string *enc = NULL;
4569 zend_long cp;
4570
4571 ZEND_PARSE_PARAMETERS_START(1, 2)
4572 Z_PARAM_STRING(str, str_len)
4573 Z_PARAM_OPTIONAL
4574 Z_PARAM_STR_OR_NULL(enc)
4575 ZEND_PARSE_PARAMETERS_END();
4576
4577 if (str_len == 0) {
4578 zend_argument_value_error(1, "must not be empty");
4579 RETURN_THROWS();
4580 }
4581
4582 cp = php_mb_ord(str, str_len, enc, 2);
4583
4584 if (0 > cp) {
4585 if (cp == -2) {
4586 RETURN_THROWS();
4587 }
4588 RETURN_FALSE;
4589 }
4590
4591 RETURN_LONG(cp);
4592 }
4593 /* }}} */
4594
4595
/* Builds the string holding code point `cp` in the encoding named by
 * `enc_name` (`enc_name_arg_num` is forwarded to php_mb_get_encoding()).
 *
 * Returns NULL when the encoding cannot be resolved or is unsupported by
 * mb_chr(), when `cp` lies outside 0..0x10FFFF, when `cp` is a surrogate and
 * the target is UTF-8, or when the conversion flags illegal characters.
 * Otherwise returns a string the caller owns. */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (enc == NULL) {
		return NULL;
	}

	enum mbfl_no_encoding enc_id = enc->no_encoding;
	if (php_mb_is_unsupported_no_encoding(enc_id)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", enc->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	if (php_mb_is_no_encoding_utf8(enc_id)) {
		/* UTF-8 is produced by hand; surrogates U+D800..U+DFFF are rejected */
		if (cp >= 0xd800 && cp <= 0xdfff) {
			return NULL;
		}
		if (cp < 0x80) {
			return ZSTR_CHAR(cp);
		}

		/* Lead-byte marker indexed by sequence length: 110xxxxx, 1110xxxx, 11110xxx */
		static const unsigned char lead_marker[] = { 0, 0, 0xc0, 0xe0, 0xf0 };
		size_t seq_len = (cp < 0x800) ? 2 : ((cp < 0x10000) ? 3 : 4);
		zend_string *utf8 = zend_string_alloc(seq_len, 0);
		char *bytes = ZSTR_VAL(utf8);
		zend_long rest = cp;

		/* Fill continuation bytes from the tail, six payload bits each */
		for (size_t k = seq_len - 1; k > 0; k--) {
			bytes[k] = 0x80 | (rest & 0x3f);
			rest >>= 6;
		}
		bytes[0] = lead_marker[seq_len] | rest;
		bytes[seq_len] = 0;

		return utf8;
	}

	/* Every other encoding: serialise the code point as four big-endian bytes
	 * and convert that from UCS-4BE */
	char ucs4[4];
	for (int k = 0; k < 4; k++) {
		ucs4[k] = (cp >> (24 - 8 * k)) & 0xff;
	}

	/* The illegal-character counter is zeroed around the conversion so that a
	 * non-zero value afterwards can only come from this code point */
	long saved_illegalchars = MBSTRG(illegalchars);
	MBSTRG(illegalchars) = 0;
	zend_string *converted = php_mb_convert_encoding_ex(ucs4, 4, enc, &mbfl_encoding_ucs4be);

	if (MBSTRG(illegalchars) != 0) {
		zend_string_release(converted);
		converted = NULL;
	}

	MBSTRG(illegalchars) = saved_illegalchars;
	return converted;
}
4665
4666
4667 /* {{{ */
PHP_FUNCTION(mb_chr)4668 PHP_FUNCTION(mb_chr)
4669 {
4670 zend_long cp;
4671 zend_string *enc = NULL;
4672
4673 ZEND_PARSE_PARAMETERS_START(1, 2)
4674 Z_PARAM_LONG(cp)
4675 Z_PARAM_OPTIONAL
4676 Z_PARAM_STR_OR_NULL(enc)
4677 ZEND_PARSE_PARAMETERS_END();
4678
4679 zend_string* ret = php_mb_chr(cp, enc, 2);
4680 if (ret == NULL) {
4681 RETURN_FALSE;
4682 }
4683
4684 RETURN_STR(ret);
4685 }
4686 /* }}} */
4687
4688 /* {{{ */
PHP_FUNCTION(mb_scrub)4689 PHP_FUNCTION(mb_scrub)
4690 {
4691 char* str;
4692 size_t str_len;
4693 zend_string *enc_name = NULL;
4694
4695 ZEND_PARSE_PARAMETERS_START(1, 2)
4696 Z_PARAM_STRING(str, str_len)
4697 Z_PARAM_OPTIONAL
4698 Z_PARAM_STR_OR_NULL(enc_name)
4699 ZEND_PARSE_PARAMETERS_END();
4700
4701 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
4702 if (!enc) {
4703 RETURN_THROWS();
4704 }
4705
4706 RETURN_STR(php_mb_convert_encoding_ex(str, str_len, enc, enc));
4707 }
4708 /* }}} */
4709
4710
4711 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)4712 static void php_mb_populate_current_detect_order_list(void)
4713 {
4714 const mbfl_encoding **entry = 0;
4715 size_t nentries;
4716
4717 if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
4718 nentries = MBSTRG(detect_order_list_size);
4719 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
4720 memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
4721 } else {
4722 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
4723 size_t i;
4724 nentries = MBSTRG(default_detect_order_list_size);
4725 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
4726 for (i = 0; i < nentries; i++) {
4727 entry[i] = mbfl_no2encoding(src[i]);
4728 }
4729 }
4730 MBSTRG(current_detect_order_list) = entry;
4731 MBSTRG(current_detect_order_list_size) = nentries;
4732 }
4733 /* }}} */
4734
4735 /* {{{ static int php_mb_encoding_translation() */
php_mb_encoding_translation(void)4736 static int php_mb_encoding_translation(void)
4737 {
4738 return MBSTRG(encoding_translation);
4739 }
4740 /* }}} */
4741
php_mb_mbchar_bytes(const char * s,const mbfl_encoding * enc)4742 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
4743 {
4744 if (enc) {
4745 if (enc->mblen_table) {
4746 if (s) {
4747 return enc->mblen_table[*(unsigned char *)s];
4748 }
4749 } else if (enc->flag & MBFL_ENCTYPE_WCS2) {
4750 return 2;
4751 } else if (enc->flag & MBFL_ENCTYPE_WCS4) {
4752 return 4;
4753 }
4754 }
4755 return 1;
4756 }
4757
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)4758 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
4759 {
4760 const char *p = s;
4761 char *last=NULL;
4762
4763 if (nbytes == (size_t)-1) {
4764 size_t nb = 0;
4765
4766 while (*p != '\0') {
4767 if (nb == 0) {
4768 if ((unsigned char)*p == (unsigned char)c) {
4769 last = (char *)p;
4770 }
4771 nb = php_mb_mbchar_bytes(p, enc);
4772 if (nb == 0) {
4773 return NULL; /* something is going wrong! */
4774 }
4775 }
4776 --nb;
4777 ++p;
4778 }
4779 } else {
4780 size_t bcnt = nbytes;
4781 size_t nbytes_char;
4782 while (bcnt > 0) {
4783 if ((unsigned char)*p == (unsigned char)c) {
4784 last = (char *)p;
4785 }
4786 nbytes_char = php_mb_mbchar_bytes(p, enc);
4787 if (bcnt < nbytes_char) {
4788 return NULL;
4789 }
4790 p += nbytes_char;
4791 bcnt -= nbytes_char;
4792 }
4793 }
4794 return last;
4795 }
4796
4797 /* {{{ MBSTRING_API int php_mb_stripos() */
php_mb_stripos(int mode,const char * old_haystack,size_t old_haystack_len,const char * old_needle,size_t old_needle_len,zend_long offset,const mbfl_encoding * enc)4798 MBSTRING_API size_t php_mb_stripos(int mode, const char *old_haystack, size_t old_haystack_len, const char *old_needle, size_t old_needle_len, zend_long offset, const mbfl_encoding *enc)
4799 {
4800 size_t n = (size_t) -1;
4801 mbfl_string haystack, needle;
4802
4803 mbfl_string_init_set(&haystack, enc);
4804 mbfl_string_init_set(&needle, enc);
4805
4806 do {
4807 /* We're using simple case-folding here, because we'd have to deal with remapping of
4808 * offsets otherwise. */
4809
4810 size_t len = 0;
4811 haystack.val = (unsigned char *)mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char *)old_haystack, old_haystack_len, &len, enc);
4812 haystack.len = len;
4813
4814 if (!haystack.val) {
4815 break;
4816 }
4817
4818 if (haystack.len == 0) {
4819 break;
4820 }
4821
4822 needle.val = (unsigned char *)mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char *)old_needle, old_needle_len, &len, enc);
4823 needle.len = len;
4824
4825 if (!needle.val) {
4826 break;
4827 }
4828
4829 n = mbfl_strpos(&haystack, &needle, offset, mode);
4830 } while(0);
4831
4832 if (haystack.val) {
4833 efree(haystack.val);
4834 }
4835
4836 if (needle.val) {
4837 efree(needle.val);
4838 }
4839
4840 return n;
4841 }
4842 /* }}} */
4843
php_mb_gpc_get_detect_order(const zend_encoding *** list,size_t * list_size)4844 static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
4845 {
4846 *list = (const zend_encoding **)MBSTRG(http_input_list);
4847 *list_size = MBSTRG(http_input_list_size);
4848 }
4849 /* }}} */
4850
php_mb_gpc_set_input_encoding(const zend_encoding * encoding)4851 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
4852 {
4853 MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
4854 }
4855 /* }}} */
4856