1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
14 | Rui Hirokawa <hirokawa@php.net> |
15 | Hironori Sato <satoh@jpnnet.com> |
16 | Shigeru Kanemoto <sgk@happysize.co.jp> |
17 +----------------------------------------------------------------------+
18 */
19
20 /* {{{ includes */
21 #include "libmbfl/config.h"
22 #include "php.h"
23 #include "php_ini.h"
24 #include "php_variables.h"
25 #include "mbstring.h"
26 #include "ext/standard/php_string.h"
27 #include "ext/standard/php_mail.h"
28 #include "ext/standard/exec.h"
29 #include "ext/standard/url.h"
30 #include "main/php_output.h"
31 #include "ext/standard/info.h"
32 #include "ext/pcre/php_pcre.h"
33
34 #include "libmbfl/mbfl/mbfilter_8bit.h"
35 #include "libmbfl/mbfl/mbfilter_pass.h"
36 #include "libmbfl/mbfl/mbfilter_wchar.h"
37 #include "libmbfl/filters/mbfilter_base64.h"
38 #include "libmbfl/filters/mbfilter_qprint.h"
39 #include "libmbfl/filters/mbfilter_ucs4.h"
40 #include "libmbfl/filters/mbfilter_utf8.h"
41 #include "libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.h"
42 #include "libmbfl/filters/mbfilter_singlebyte.h"
43
44 #include "php_variables.h"
45 #include "php_globals.h"
46 #include "rfc1867.h"
47 #include "php_content_types.h"
48 #include "SAPI.h"
49 #include "php_unicode.h"
50 #include "TSRM.h"
51
52 #include "mb_gpc.h"
53
54 #ifdef HAVE_MBREGEX
55 # include "php_mbregex.h"
56 #endif
57
58 #include "zend_multibyte.h"
59 #include "mbstring_arginfo.h"
60 /* }}} */
61
62 /* {{{ prototypes */
/* Declares the module globals for the mbstring extension. */
ZEND_DECLARE_MODULE_GLOBALS(mbstring)

/* Globals constructor / destructor; referenced from mbstring_module_entry. */
static PHP_GINIT_FUNCTION(mbstring);
static PHP_GSHUTDOWN_FUNCTION(mbstring);

/* Forward declarations of file-local helpers (definitions are not part of
 * this section of the file). */
static void php_mb_populate_current_detect_order_list(void);

static int php_mb_encoding_translation(void);

static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);

static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);

static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);

static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
79 /* }}} */
80
81 /* {{{ php_mb_default_identify_list */
/* One row of the per-language default identify table: a language id together
 * with the array of encoding ids associated with it. */
typedef struct _php_mb_nls_ident_list {
	enum mbfl_no_language lang;        /* language this row applies to */
	const enum mbfl_no_encoding *list; /* encoding ids for that language */
	size_t list_size;                  /* number of elements in `list` */
} php_mb_nls_ident_list;
87
/* Encoding list for mbfl_no_language_japanese. */
static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_jis,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_euc_jp,
	mbfl_no_encoding_sjis
};

/* Encoding list for mbfl_no_language_simplified_chinese. */
static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_euc_cn,
	mbfl_no_encoding_cp936
};

/* Encoding list for mbfl_no_language_traditional_chinese. */
static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_euc_tw,
	mbfl_no_encoding_big5
};

/* Encoding list for mbfl_no_language_korean. */
static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_euc_kr,
	mbfl_no_encoding_uhc
};

/* Encoding list for mbfl_no_language_russian. */
static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_koi8r,
	mbfl_no_encoding_cp1251,
	mbfl_no_encoding_cp866
};

/* Encoding list for mbfl_no_language_armenian. */
static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_armscii8
};

/* Encoding list for mbfl_no_language_turkish. */
static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_cp1254,
	mbfl_no_encoding_8859_9
};

/* Encoding list for mbfl_no_language_ukrainian. */
static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_koi8u
};

/* Encoding list for mbfl_no_language_neutral; also the fallback used by
 * php_mb_nls_get_default_detect_order_list() when no language matches. */
static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_utf8
};
148
149
150 static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
151 { mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
152 { mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
153 { mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
154 { mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
155 { mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
156 { mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
157 { mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
158 { mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
159 { mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
160 };
161
162 /* }}} */
163
164 /* {{{ mbstring_deps[] */
/* Extensions mbstring depends on: "pcre" is required. */
static const zend_module_dep mbstring_deps[] = {
	ZEND_MOD_REQUIRED("pcre")
	ZEND_MOD_END
};
169 /* }}} */
170
171 /* {{{ zend_module_entry mbstring_module_entry */
/* Module descriptor for the mbstring extension. The initializer is
 * positional: dependencies, name, function table, lifecycle callbacks,
 * version string and module-globals hooks. */
zend_module_entry mbstring_module_entry = {
	STANDARD_MODULE_HEADER_EX,
	NULL,
	mbstring_deps,              /* required extensions, see mbstring_deps[] */
	"mbstring",                 /* extension name */
	ext_functions,              /* function table; not defined in this section of the file */
	PHP_MINIT(mbstring),        /* module startup */
	PHP_MSHUTDOWN(mbstring),    /* module shutdown */
	PHP_RINIT(mbstring),        /* request startup */
	PHP_RSHUTDOWN(mbstring),    /* request shutdown */
	PHP_MINFO(mbstring),        /* module info callback */
	PHP_MBSTRING_VERSION,
	PHP_MODULE_GLOBALS(mbstring),
	PHP_GINIT(mbstring),        /* globals constructor */
	PHP_GSHUTDOWN(mbstring),    /* globals destructor */
	NULL,
	STANDARD_MODULE_PROPERTIES_EX
};
190 /* }}} */
191
192 /* {{{ static sapi_post_entry php_post_entries[] */
/* POST content-type table using the standard handlers. Registered by
 * OnUpdate_mbstring_encoding_translation() when encoding translation is off.
 * The all-NULL row terminates the table. */
static const sapi_post_entry php_post_entries[] = {
	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_std_post_handler },
	{ MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
	{ NULL, 0, NULL, NULL }
};
198 /* }}} */
199
/* Emitted only when COMPILE_DL_MBSTRING is defined (presumably a build of
 * mbstring as a loadable module -- confirm against the build system). */
#ifdef COMPILE_DL_MBSTRING
#ifdef ZTS
ZEND_TSRMLS_CACHE_DEFINE()
#endif
ZEND_GET_MODULE(mbstring)
#endif
206
207 /* {{{ static sapi_post_entry mbstr_post_entries[] */
/* POST content-type table that routes the default form content type through
 * php_mb_post_handler. Registered by OnUpdate_mbstring_encoding_translation()
 * when encoding translation is on. The all-NULL row terminates the table. */
static const sapi_post_entry mbstr_post_entries[] = {
	{ DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
	{ MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
	{ NULL, 0, NULL, NULL }
};
213 /* }}} */
214
php_mb_get_encoding(zend_string * encoding_name,uint32_t arg_num)215 static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
216 if (encoding_name) {
217 const mbfl_encoding *encoding;
218 zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
219 if (last_encoding_name && (last_encoding_name == encoding_name
220 || zend_string_equals_ci(encoding_name, last_encoding_name))) {
221 return MBSTRG(last_used_encoding);
222 }
223
224 encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
225 if (!encoding) {
226 zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
227 return NULL;
228 }
229
230 if (last_encoding_name) {
231 zend_string_release(last_encoding_name);
232 }
233 MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
234 MBSTRG(last_used_encoding) = encoding;
235 return encoding;
236 } else {
237 return MBSTRG(current_internal_encoding);
238 }
239 }
240
php_mb_get_encoding_or_pass(const char * encoding_name)241 static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name) {
242 if (strcmp(encoding_name, "pass") == 0) {
243 return &mbfl_encoding_pass;
244 }
245
246 return mbfl_name2encoding(encoding_name);
247 }
248
/* Count the ',' bytes in the half-open range [p, end). */
static size_t count_commas(const char *p, const char *end) {
	size_t total = 0;
	const char *hit = memchr(p, ',', end - p);

	while (hit != NULL) {
		total++;
		/* resume the search just past the comma that was found */
		hit++;
		hit = memchr(hit, ',', end - hit);
	}
	return total;
}
257
258 /* {{{ static zend_result php_mb_parse_encoding_list()
259 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
260 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
261 */
php_mb_parse_encoding_list(const char * value,size_t value_length,const mbfl_encoding *** return_list,size_t * return_size,bool persistent,uint32_t arg_num,bool allow_pass_encoding)262 static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
263 const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num,
264 bool allow_pass_encoding)
265 {
266 if (value == NULL || value_length == 0) {
267 *return_list = NULL;
268 *return_size = 0;
269 return SUCCESS;
270 } else {
271 bool included_auto;
272 size_t n, size;
273 char *p1, *endp, *tmpstr;
274 const mbfl_encoding **entry, **list;
275
276 /* copy the value string for work */
277 if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
278 tmpstr = (char *)estrndup(value+1, value_length-2);
279 value_length -= 2;
280 } else {
281 tmpstr = (char *)estrndup(value, value_length);
282 }
283
284 endp = tmpstr + value_length;
285 size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
286 list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
287 entry = list;
288 n = 0;
289 included_auto = 0;
290 p1 = tmpstr;
291 while (1) {
292 char *comma = (char *) php_memnstr(p1, ",", 1, endp);
293 char *p = comma ? comma : endp;
294 *p = '\0';
295 /* trim spaces */
296 while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
297 p1++;
298 }
299 p--;
300 while (p > p1 && (*p == ' ' || *p == '\t')) {
301 *p = '\0';
302 p--;
303 }
304 /* convert to the encoding number and check encoding */
305 if (strcasecmp(p1, "auto") == 0) {
306 if (!included_auto) {
307 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
308 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
309 size_t i;
310 included_auto = 1;
311 for (i = 0; i < identify_list_size; i++) {
312 *entry++ = mbfl_no2encoding(*src++);
313 n++;
314 }
315 }
316 } else {
317 const mbfl_encoding *encoding =
318 allow_pass_encoding ? php_mb_get_encoding_or_pass(p1) : mbfl_name2encoding(p1);
319 if (!encoding) {
320 /* Called from an INI setting modification */
321 if (arg_num == 0) {
322 php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%s\"", p1);
323 } else {
324 zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", p1);
325 }
326 efree(tmpstr);
327 pefree(ZEND_VOIDP(list), persistent);
328 return FAILURE;
329 }
330
331 *entry++ = encoding;
332 n++;
333 }
334 if (n >= size || comma == NULL) {
335 break;
336 }
337 p1 = comma + 1;
338 }
339 *return_list = list;
340 *return_size = n;
341 efree(tmpstr);
342 }
343
344 return SUCCESS;
345 }
346 /* }}} */
347
348 /* {{{ static int php_mb_parse_encoding_array()
349 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
350 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
351 */
php_mb_parse_encoding_array(HashTable * target_hash,const mbfl_encoding *** return_list,size_t * return_size,uint32_t arg_num)352 static int php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
353 size_t *return_size, uint32_t arg_num)
354 {
355 /* Allocate enough space to include the default detect order if "auto" is used. */
356 size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
357 const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
358 const mbfl_encoding **entry = list;
359 bool included_auto = 0;
360 size_t n = 0;
361 zval *hash_entry;
362 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
363 zend_string *encoding_str = zval_try_get_string(hash_entry);
364 if (UNEXPECTED(!encoding_str)) {
365 efree(ZEND_VOIDP(list));
366 return FAILURE;
367 }
368
369 if (zend_string_equals_literal_ci(encoding_str, "auto")) {
370 if (!included_auto) {
371 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
372 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
373 size_t j;
374
375 included_auto = 1;
376 for (j = 0; j < identify_list_size; j++) {
377 *entry++ = mbfl_no2encoding(*src++);
378 n++;
379 }
380 }
381 } else {
382 const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
383 if (encoding) {
384 *entry++ = encoding;
385 n++;
386 } else {
387 zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
388 zend_string_release(encoding_str);
389 efree(ZEND_VOIDP(list));
390 return FAILURE;
391 }
392 }
393 zend_string_release(encoding_str);
394 } ZEND_HASH_FOREACH_END();
395 *return_list = list;
396 *return_size = n;
397 return SUCCESS;
398 }
399 /* }}} */
400
401 /* {{{ zend_multibyte interface */
php_mb_zend_encoding_fetcher(const char * encoding_name)402 static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
403 {
404 return (const zend_encoding*)mbfl_name2encoding(encoding_name);
405 }
406
php_mb_zend_encoding_name_getter(const zend_encoding * encoding)407 static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
408 {
409 return ((const mbfl_encoding *)encoding)->name;
410 }
411
php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding * _encoding)412 static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
413 {
414 const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
415 return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
416 }
417
php_mb_zend_encoding_detector(const unsigned char * arg_string,size_t arg_length,const zend_encoding ** list,size_t list_size)418 static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
419 {
420 mbfl_string string;
421
422 if (!list) {
423 list = (const zend_encoding **)MBSTRG(current_detect_order_list);
424 list_size = MBSTRG(current_detect_order_list_size);
425 }
426
427 mbfl_string_init(&string);
428 string.val = (unsigned char *)arg_string;
429 string.len = arg_length;
430 return (const zend_encoding *) mbfl_identify_encoding(&string, (const mbfl_encoding **)list, list_size, 0);
431 }
432
php_mb_zend_encoding_converter(unsigned char ** to,size_t * to_length,const unsigned char * from,size_t from_length,const zend_encoding * encoding_to,const zend_encoding * encoding_from)433 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
434 {
435 mbfl_string string, result;
436 mbfl_buffer_converter *convd;
437
438 /* new encoding */
439 /* initialize string */
440 string.encoding = (const mbfl_encoding*)encoding_from;
441 string.val = (unsigned char*)from;
442 string.len = from_length;
443
444 /* initialize converter */
445 convd = mbfl_buffer_converter_new((const mbfl_encoding *)encoding_from, (const mbfl_encoding *)encoding_to, string.len);
446 if (convd == NULL) {
447 return (size_t) -1;
448 }
449
450 mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
451 mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
452
453 /* do it */
454 size_t loc = mbfl_buffer_converter_feed(convd, &string);
455
456 mbfl_buffer_converter_flush(convd);
457 mbfl_string_init(&result);
458 if (!mbfl_buffer_converter_result(convd, &result)) {
459 mbfl_buffer_converter_delete(convd);
460 return (size_t)-1;
461 }
462
463 *to = result.val;
464 *to_length = result.len;
465
466 mbfl_buffer_converter_delete(convd);
467
468 return loc;
469 }
470
php_mb_zend_encoding_list_parser(const char * encoding_list,size_t encoding_list_len,const zend_encoding *** return_list,size_t * return_size,bool persistent)471 static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
472 {
473 return php_mb_parse_encoding_list(
474 encoding_list, encoding_list_len,
475 (const mbfl_encoding ***)return_list, return_size,
476 persistent, /* arg_num */ 0, /* allow_pass_encoding */ 1);
477 }
478
php_mb_zend_internal_encoding_getter(void)479 static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
480 {
481 return (const zend_encoding *)MBSTRG(internal_encoding);
482 }
483
php_mb_zend_internal_encoding_setter(const zend_encoding * encoding)484 static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
485 {
486 MBSTRG(internal_encoding) = (const mbfl_encoding *)encoding;
487 return SUCCESS;
488 }
489
/* Callback table that plugs the helpers above into zend_multibyte. The
 * initializer is positional, so the order of the entries must not change. */
static zend_multibyte_functions php_mb_zend_multibyte_functions = {
	"mbstring",                                       /* name of this provider */
	php_mb_zend_encoding_fetcher,                     /* name -> encoding */
	php_mb_zend_encoding_name_getter,                 /* encoding -> name */
	php_mb_zend_encoding_lexer_compatibility_checker, /* MBFL_ENCTYPE_GL_UNSAFE check */
	php_mb_zend_encoding_detector,                    /* identify encoding of a byte string */
	php_mb_zend_encoding_converter,                   /* convert between two encodings */
	php_mb_zend_encoding_list_parser,                 /* parse comma-separated encoding list */
	php_mb_zend_internal_encoding_getter,             /* read MBSTRG(internal_encoding) */
	php_mb_zend_internal_encoding_setter              /* write MBSTRG(internal_encoding) */
};
501 /* }}} */
502
503 /* {{{ _php_mb_compile_regex */
_php_mb_compile_regex(const char * pattern)504 static void *_php_mb_compile_regex(const char *pattern)
505 {
506 pcre2_code *retval;
507 PCRE2_SIZE err_offset;
508 int errnum;
509
510 if (!(retval = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
511 PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
512 PCRE2_UCHAR err_str[128];
513 pcre2_get_error_message(errnum, err_str, sizeof(err_str));
514 php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
515 }
516 return retval;
517 }
518 /* }}} */
519
520 /* {{{ _php_mb_match_regex */
_php_mb_match_regex(void * opaque,const char * str,size_t str_len)521 static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
522 {
523 int res;
524
525 pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
526 if (NULL == match_data) {
527 pcre2_code_free(opaque);
528 php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
529 return FAILURE;
530 }
531 res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
532 php_pcre_free_match_data(match_data);
533
534 return res;
535 }
536 /* }}} */
537
538 /* {{{ _php_mb_free_regex */
_php_mb_free_regex(void * opaque)539 static void _php_mb_free_regex(void *opaque)
540 {
541 pcre2_code_free(opaque);
542 }
543 /* }}} */
544
545 /* {{{ php_mb_nls_get_default_detect_order_list */
php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang,enum mbfl_no_encoding ** plist,size_t * plist_size)546 static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
547 {
548 size_t i;
549
550 *plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
551 *plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
552
553 for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
554 if (php_mb_default_identify_list[i].lang == lang) {
555 *plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
556 *plist_size = php_mb_default_identify_list[i].list_size;
557 return 1;
558 }
559 }
560 return 0;
561 }
562 /* }}} */
563
php_mb_rfc1867_substring_conf(const zend_encoding * encoding,char * start,size_t len,char quote)564 static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
565 {
566 char *result = emalloc(len + 2);
567 char *resp = result;
568 size_t i;
569
570 for (i = 0; i < len && start[i] != quote; ++i) {
571 if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
572 *resp++ = start[++i];
573 } else {
574 size_t j = php_mb_mbchar_bytes_ex(start+i, (const mbfl_encoding *)encoding);
575
576 while (j-- > 0 && i < len) {
577 *resp++ = start[i++];
578 }
579 --i;
580 }
581 }
582
583 *resp = '\0';
584 return result;
585 }
586
php_mb_rfc1867_getword(const zend_encoding * encoding,char ** line,char stop)587 static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
588 {
589 char *pos = *line, quote;
590 char *res;
591
592 while (*pos && *pos != stop) {
593 if ((quote = *pos) == '"' || quote == '\'') {
594 ++pos;
595 while (*pos && *pos != quote) {
596 if (*pos == '\\' && pos[1] && pos[1] == quote) {
597 pos += 2;
598 } else {
599 ++pos;
600 }
601 }
602 if (*pos) {
603 ++pos;
604 }
605 } else {
606 pos += php_mb_mbchar_bytes_ex(pos, (const mbfl_encoding *)encoding);
607
608 }
609 }
610 if (*pos == '\0') {
611 res = estrdup(*line);
612 *line += strlen(*line);
613 return res;
614 }
615
616 res = estrndup(*line, pos - *line);
617
618 while (*pos == stop) {
619 pos += php_mb_mbchar_bytes_ex(pos, (const mbfl_encoding *)encoding);
620 }
621
622 *line = pos;
623 return res;
624 }
625 /* }}} */
626
/* Extract one configuration-style word from `str`.
 *
 * Leading whitespace is skipped. An empty remainder yields an estrdup("").
 * A word opening with ' or " is read up to the matching quote, otherwise it
 * runs to the next whitespace byte. The copy itself is produced by
 * php_mb_rfc1867_substring_conf(). */
static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
{
	while (*str && isspace((unsigned char)*str)) {
		str++;
	}

	if (*str == '\0') {
		return estrdup("");
	}

	const char first = *str;
	if (first == '"' || first == '\'') {
		char *quoted = str + 1;

		return php_mb_rfc1867_substring_conf(encoding, quoted, strlen(quoted), first);
	}

	/* bare word: measure up to the next whitespace byte */
	size_t word_len = 0;
	while (str[word_len] && !isspace((unsigned char)str[word_len])) {
		word_len++;
	}
	return php_mb_rfc1867_substring_conf(encoding, str, word_len, 0);
}
651 /* }}} */
652
php_mb_rfc1867_basename(const zend_encoding * encoding,char * filename)653 static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
654 {
655 char *s, *s2;
656 const size_t filename_len = strlen(filename);
657
658 /* The \ check should technically be needed for win32 systems only where
659 * it is a valid path separator. However, IE in all it's wisdom always sends
660 * the full path of the file on the user's filesystem, which means that unless
661 * the user does basename() they get a bogus file name. Until IE's user base drops
662 * to nill or problem is fixed this code must remain enabled for all systems. */
663 s = php_mb_safe_strrchr_ex(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
664 s2 = php_mb_safe_strrchr_ex(filename, '/', filename_len, (const mbfl_encoding *)encoding);
665
666 if (s && s2) {
667 if (s > s2) {
668 return ++s;
669 } else {
670 return ++s2;
671 }
672 } else if (s) {
673 return ++s;
674 } else if (s2) {
675 return ++s2;
676 } else {
677 return filename;
678 }
679 }
680 /* }}} */
681
682 /* {{{ php.ini directive handler */
683 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
PHP_INI_MH(OnUpdate_mbstring_language)684 static PHP_INI_MH(OnUpdate_mbstring_language)
685 {
686 enum mbfl_no_language no_language;
687
688 no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
689 if (no_language == mbfl_no_language_invalid) {
690 MBSTRG(language) = mbfl_no_language_neutral;
691 return FAILURE;
692 }
693 MBSTRG(language) = no_language;
694 php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
695 return SUCCESS;
696 }
697 /* }}} */
698
699 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
PHP_INI_MH(OnUpdate_mbstring_detect_order)700 static PHP_INI_MH(OnUpdate_mbstring_detect_order)
701 {
702 const mbfl_encoding **list;
703 size_t size;
704
705 if (!new_value) {
706 if (MBSTRG(detect_order_list)) {
707 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
708 }
709 MBSTRG(detect_order_list) = NULL;
710 MBSTRG(detect_order_list_size) = 0;
711 return SUCCESS;
712 }
713
714 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0, /* allow_pass_encoding */ 0) || size == 0) {
715 return FAILURE;
716 }
717
718 if (MBSTRG(detect_order_list)) {
719 pefree(ZEND_VOIDP(MBSTRG(detect_order_list)), 1);
720 }
721 MBSTRG(detect_order_list) = list;
722 MBSTRG(detect_order_list_size) = size;
723 return SUCCESS;
724 }
725 /* }}} */
726
_php_mb_ini_mbstring_http_input_set(const char * new_value,size_t new_value_length)727 static int _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
728 const mbfl_encoding **list;
729 size_t size;
730 if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0, /* allow_pass_encoding */ 1) || size == 0) {
731 return FAILURE;
732 }
733 if (MBSTRG(http_input_list)) {
734 pefree(ZEND_VOIDP(MBSTRG(http_input_list)), 1);
735 }
736 MBSTRG(http_input_list) = list;
737 MBSTRG(http_input_list_size) = size;
738 return SUCCESS;
739 }
740
741 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
PHP_INI_MH(OnUpdate_mbstring_http_input)742 static PHP_INI_MH(OnUpdate_mbstring_http_input)
743 {
744 if (new_value) {
745 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
746 }
747
748 if (!new_value || !ZSTR_LEN(new_value)) {
749 const char *encoding = php_get_input_encoding();
750 MBSTRG(http_input_set) = 0;
751 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
752 return SUCCESS;
753 }
754
755 MBSTRG(http_input_set) = 1;
756 return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
757 }
758 /* }}} */
759
_php_mb_ini_mbstring_http_output_set(const char * new_value)760 static int _php_mb_ini_mbstring_http_output_set(const char *new_value) {
761 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value);
762 if (!encoding) {
763 return FAILURE;
764 }
765
766 MBSTRG(http_output_encoding) = encoding;
767 MBSTRG(current_http_output_encoding) = encoding;
768 return SUCCESS;
769 }
770
771 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
PHP_INI_MH(OnUpdate_mbstring_http_output)772 static PHP_INI_MH(OnUpdate_mbstring_http_output)
773 {
774 if (new_value) {
775 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
776 }
777
778 if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
779 MBSTRG(http_output_set) = 0;
780 _php_mb_ini_mbstring_http_output_set(php_get_output_encoding());
781 return SUCCESS;
782 }
783
784 MBSTRG(http_output_set) = 1;
785 return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value));
786 }
787 /* }}} */
788
789 /* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
_php_mb_ini_mbstring_internal_encoding_set(const char * new_value,size_t new_value_length)790 static int _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
791 {
792 const mbfl_encoding *encoding;
793
794 if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
795 /* falls back to UTF-8 if an unknown encoding name is given */
796 if (new_value) {
797 php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
798 }
799 encoding = &mbfl_encoding_utf8;
800 }
801 MBSTRG(internal_encoding) = encoding;
802 MBSTRG(current_internal_encoding) = encoding;
803 #ifdef HAVE_MBREGEX
804 {
805 const char *enc_name = new_value;
806 if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
807 /* falls back to UTF-8 if an unknown encoding name is given */
808 enc_name = "UTF-8";
809 php_mb_regex_set_default_mbctype(enc_name);
810 }
811 php_mb_regex_set_mbctype(new_value);
812 }
813 #endif
814 return SUCCESS;
815 }
816 /* }}} */
817
818 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
PHP_INI_MH(OnUpdate_mbstring_internal_encoding)819 static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
820 {
821 if (new_value) {
822 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
823 }
824
825 if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
826 return FAILURE;
827 }
828
829 if (new_value && ZSTR_LEN(new_value)) {
830 MBSTRG(internal_encoding_set) = 1;
831 return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
832 } else {
833 const char *encoding = php_get_internal_encoding();
834 MBSTRG(internal_encoding_set) = 0;
835 return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
836 }
837 }
838 /* }}} */
839
840 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
PHP_INI_MH(OnUpdate_mbstring_substitute_character)841 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
842 {
843 int c;
844 char *endptr = NULL;
845
846 if (new_value != NULL) {
847 if (zend_string_equals_literal_ci(new_value, "none")) {
848 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
849 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
850 } else if (zend_string_equals_literal_ci(new_value, "long")) {
851 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
852 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
853 } else if (zend_string_equals_literal_ci(new_value, "entity")) {
854 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
855 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
856 } else {
857 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
858 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
859 if (ZSTR_LEN(new_value) > 0) {
860 c = strtol(ZSTR_VAL(new_value), &endptr, 0);
861 if (*endptr == '\0') {
862 MBSTRG(filter_illegal_substchar) = c;
863 MBSTRG(current_filter_illegal_substchar) = c;
864 }
865 }
866 }
867 } else {
868 MBSTRG(filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
869 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
870 MBSTRG(filter_illegal_substchar) = 0x3f; /* '?' */
871 MBSTRG(current_filter_illegal_substchar) = 0x3f; /* '?' */
872 }
873
874 return SUCCESS;
875 }
876 /* }}} */
877
878 /* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
PHP_INI_MH(OnUpdate_mbstring_encoding_translation)879 static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
880 {
881 if (new_value == NULL) {
882 return FAILURE;
883 }
884
885 OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
886
887 if (MBSTRG(encoding_translation)) {
888 sapi_unregister_post_entry(php_post_entries);
889 sapi_register_post_entries(mbstr_post_entries);
890 } else {
891 sapi_unregister_post_entry(mbstr_post_entries);
892 sapi_register_post_entries(php_post_entries);
893 }
894
895 return SUCCESS;
896 }
897 /* }}} */
898
/* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes) */
PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)900 static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
901 {
902 zend_string *tmp;
903 void *re = NULL;
904
905 if (!new_value) {
906 new_value = entry->orig_value;
907 }
908 tmp = php_trim(new_value, NULL, 0, 3);
909
910 if (ZSTR_LEN(tmp) > 0) {
911 if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
912 zend_string_release_ex(tmp, 0);
913 return FAILURE;
914 }
915 }
916
917 if (MBSTRG(http_output_conv_mimetypes)) {
918 _php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
919 }
920
921 MBSTRG(http_output_conv_mimetypes) = re;
922
923 zend_string_release_ex(tmp, 0);
924 return SUCCESS;
925 }
926 /* }}} */
927 /* }}} */
928
929 /* {{{ php.ini directive registration */
930 PHP_INI_BEGIN()
931 PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
932 PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
933 PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
934 PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
935 STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
936 PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
937
938 STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
939 PHP_INI_SYSTEM | PHP_INI_PERDIR,
940 OnUpdate_mbstring_encoding_translation,
941 encoding_translation, zend_mbstring_globals, mbstring_globals)
942 PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
943 "^(text/|application/xhtml\\+xml)",
944 PHP_INI_ALL,
945 OnUpdate_mbstring_http_output_conv_mimetypes)
946
947 STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
948 PHP_INI_ALL,
949 OnUpdateBool,
950 strict_detection, zend_mbstring_globals, mbstring_globals)
951 #ifdef HAVE_MBREGEX
952 STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
953 STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
954 #endif
PHP_INI_END()955 PHP_INI_END()
956 /* }}} */
957
958 static void mbstring_internal_encoding_changed_hook(void) {
959 /* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
960 if (!MBSTRG(internal_encoding_set)) {
961 const char *encoding = php_get_internal_encoding();
962 _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
963 }
964
965 if (!MBSTRG(http_output_set)) {
966 const char *encoding = php_get_output_encoding();
967 _php_mb_ini_mbstring_http_output_set(encoding);
968 }
969
970 if (!MBSTRG(http_input_set)) {
971 const char *encoding = php_get_input_encoding();
972 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
973 }
974 }
975
976 /* {{{ module global initialize handler */
PHP_GINIT_FUNCTION(mbstring)977 static PHP_GINIT_FUNCTION(mbstring)
978 {
979 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
980 ZEND_TSRMLS_CACHE_UPDATE();
981 #endif
982
983 mbstring_globals->language = mbfl_no_language_uni;
984 mbstring_globals->internal_encoding = NULL;
985 mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
986 mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
987 mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
988 mbstring_globals->http_input_identify = NULL;
989 mbstring_globals->http_input_identify_get = NULL;
990 mbstring_globals->http_input_identify_post = NULL;
991 mbstring_globals->http_input_identify_cookie = NULL;
992 mbstring_globals->http_input_identify_string = NULL;
993 mbstring_globals->http_input_list = NULL;
994 mbstring_globals->http_input_list_size = 0;
995 mbstring_globals->detect_order_list = NULL;
996 mbstring_globals->detect_order_list_size = 0;
997 mbstring_globals->current_detect_order_list = NULL;
998 mbstring_globals->current_detect_order_list_size = 0;
999 mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1000 mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1001 mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1002 mbstring_globals->filter_illegal_substchar = 0x3f; /* '?' */
1003 mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1004 mbstring_globals->current_filter_illegal_substchar = 0x3f; /* '?' */
1005 mbstring_globals->illegalchars = 0;
1006 mbstring_globals->encoding_translation = 0;
1007 mbstring_globals->strict_detection = 0;
1008 mbstring_globals->outconv = NULL;
1009 mbstring_globals->http_output_conv_mimetypes = NULL;
1010 #ifdef HAVE_MBREGEX
1011 mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1012 #endif
1013 mbstring_globals->last_used_encoding_name = NULL;
1014 mbstring_globals->last_used_encoding = NULL;
1015 mbstring_globals->internal_encoding_set = 0;
1016 mbstring_globals->http_output_set = 0;
1017 mbstring_globals->http_input_set = 0;
1018 }
1019 /* }}} */
1020
1021 /* {{{ PHP_GSHUTDOWN_FUNCTION */
PHP_GSHUTDOWN_FUNCTION(mbstring)1022 static PHP_GSHUTDOWN_FUNCTION(mbstring)
1023 {
1024 if (mbstring_globals->http_input_list) {
1025 free(ZEND_VOIDP(mbstring_globals->http_input_list));
1026 }
1027 if (mbstring_globals->detect_order_list) {
1028 free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1029 }
1030 if (mbstring_globals->http_output_conv_mimetypes) {
1031 _php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1032 }
1033 #ifdef HAVE_MBREGEX
1034 php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1035 #endif
1036 }
1037 /* }}} */
1038
1039 /* {{{ PHP_MINIT_FUNCTION(mbstring) */
PHP_MINIT_FUNCTION(mbstring)1040 PHP_MINIT_FUNCTION(mbstring)
1041 {
1042 #if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
1043 ZEND_TSRMLS_CACHE_UPDATE();
1044 #endif
1045
1046 REGISTER_INI_ENTRIES();
1047
1048 /* We assume that we're the only user of the hook. */
1049 ZEND_ASSERT(php_internal_encoding_changed == NULL);
1050 php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
1051 mbstring_internal_encoding_changed_hook();
1052
1053 /* This is a global handler. Should not be set in a per-request handler. */
1054 sapi_register_treat_data(mbstr_treat_data);
1055
1056 /* Post handlers are stored in the thread-local context. */
1057 if (MBSTRG(encoding_translation)) {
1058 sapi_register_post_entries(mbstr_post_entries);
1059 }
1060
1061 REGISTER_LONG_CONSTANT("MB_CASE_UPPER", PHP_UNICODE_CASE_UPPER, CONST_CS | CONST_PERSISTENT);
1062 REGISTER_LONG_CONSTANT("MB_CASE_LOWER", PHP_UNICODE_CASE_LOWER, CONST_CS | CONST_PERSISTENT);
1063 REGISTER_LONG_CONSTANT("MB_CASE_TITLE", PHP_UNICODE_CASE_TITLE, CONST_CS | CONST_PERSISTENT);
1064 REGISTER_LONG_CONSTANT("MB_CASE_FOLD", PHP_UNICODE_CASE_FOLD, CONST_CS | CONST_PERSISTENT);
1065 REGISTER_LONG_CONSTANT("MB_CASE_UPPER_SIMPLE", PHP_UNICODE_CASE_UPPER_SIMPLE, CONST_CS | CONST_PERSISTENT);
1066 REGISTER_LONG_CONSTANT("MB_CASE_LOWER_SIMPLE", PHP_UNICODE_CASE_LOWER_SIMPLE, CONST_CS | CONST_PERSISTENT);
1067 REGISTER_LONG_CONSTANT("MB_CASE_TITLE_SIMPLE", PHP_UNICODE_CASE_TITLE_SIMPLE, CONST_CS | CONST_PERSISTENT);
1068 REGISTER_LONG_CONSTANT("MB_CASE_FOLD_SIMPLE", PHP_UNICODE_CASE_FOLD_SIMPLE, CONST_CS | CONST_PERSISTENT);
1069
1070 #ifdef HAVE_MBREGEX
1071 PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1072 #endif
1073
1074 if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
1075 return FAILURE;
1076 }
1077
1078 php_rfc1867_set_multibyte_callbacks(
1079 php_mb_encoding_translation,
1080 php_mb_gpc_get_detect_order,
1081 php_mb_gpc_set_input_encoding,
1082 php_mb_rfc1867_getword,
1083 php_mb_rfc1867_getword_conf,
1084 php_mb_rfc1867_basename);
1085
1086 return SUCCESS;
1087 }
1088 /* }}} */
1089
1090 /* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
PHP_MSHUTDOWN_FUNCTION(mbstring)1091 PHP_MSHUTDOWN_FUNCTION(mbstring)
1092 {
1093 UNREGISTER_INI_ENTRIES();
1094
1095 zend_multibyte_restore_functions();
1096
1097 #ifdef HAVE_MBREGEX
1098 PHP_MSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1099 #endif
1100
1101 php_internal_encoding_changed = NULL;
1102
1103 return SUCCESS;
1104 }
1105 /* }}} */
1106
1107 /* {{{ PHP_RINIT_FUNCTION(mbstring) */
PHP_RINIT_FUNCTION(mbstring)1108 PHP_RINIT_FUNCTION(mbstring)
1109 {
1110 MBSTRG(current_internal_encoding) = MBSTRG(internal_encoding);
1111 MBSTRG(current_http_output_encoding) = MBSTRG(http_output_encoding);
1112 MBSTRG(current_filter_illegal_mode) = MBSTRG(filter_illegal_mode);
1113 MBSTRG(current_filter_illegal_substchar) = MBSTRG(filter_illegal_substchar);
1114
1115 MBSTRG(illegalchars) = 0;
1116
1117 php_mb_populate_current_detect_order_list();
1118
1119 #ifdef HAVE_MBREGEX
1120 PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1121 #endif
1122 zend_multibyte_set_internal_encoding((const zend_encoding *)MBSTRG(internal_encoding));
1123
1124 return SUCCESS;
1125 }
1126 /* }}} */
1127
1128 /* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
PHP_RSHUTDOWN_FUNCTION(mbstring)1129 PHP_RSHUTDOWN_FUNCTION(mbstring)
1130 {
1131 if (MBSTRG(current_detect_order_list) != NULL) {
1132 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1133 MBSTRG(current_detect_order_list) = NULL;
1134 MBSTRG(current_detect_order_list_size) = 0;
1135 }
1136 if (MBSTRG(outconv) != NULL) {
1137 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1138 mbfl_buffer_converter_delete(MBSTRG(outconv));
1139 MBSTRG(outconv) = NULL;
1140 }
1141
1142 /* clear http input identification. */
1143 MBSTRG(http_input_identify) = NULL;
1144 MBSTRG(http_input_identify_post) = NULL;
1145 MBSTRG(http_input_identify_get) = NULL;
1146 MBSTRG(http_input_identify_cookie) = NULL;
1147 MBSTRG(http_input_identify_string) = NULL;
1148
1149 if (MBSTRG(last_used_encoding_name)) {
1150 zend_string_release(MBSTRG(last_used_encoding_name));
1151 MBSTRG(last_used_encoding_name) = NULL;
1152 }
1153
1154 MBSTRG(internal_encoding_set) = 0;
1155 MBSTRG(http_output_set) = 0;
1156 MBSTRG(http_input_set) = 0;
1157
1158 #ifdef HAVE_MBREGEX
1159 PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
1160 #endif
1161
1162 return SUCCESS;
1163 }
1164 /* }}} */
1165
1166 /* {{{ PHP_MINFO_FUNCTION(mbstring) */
PHP_MINFO_FUNCTION(mbstring)1167 PHP_MINFO_FUNCTION(mbstring)
1168 {
1169 php_info_print_table_start();
1170 php_info_print_table_row(2, "Multibyte Support", "enabled");
1171 php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1172 php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1173 {
1174 char tmp[256];
1175 snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1176 php_info_print_table_row(2, "libmbfl version", tmp);
1177 }
1178 php_info_print_table_end();
1179
1180 php_info_print_table_start();
1181 php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1182 php_info_print_table_end();
1183
1184 #ifdef HAVE_MBREGEX
1185 PHP_MINFO(mb_regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);
1186 #endif
1187
1188 DISPLAY_INI_ENTRIES();
1189 }
1190 /* }}} */
1191
1192 /* {{{ Sets the current language or Returns the current language as a string */
PHP_FUNCTION(mb_language)1193 PHP_FUNCTION(mb_language)
1194 {
1195 zend_string *name = NULL;
1196
1197 ZEND_PARSE_PARAMETERS_START(0, 1)
1198 Z_PARAM_OPTIONAL
1199 Z_PARAM_STR_OR_NULL(name)
1200 ZEND_PARSE_PARAMETERS_END();
1201
1202 if (name == NULL) {
1203 RETVAL_STRING((char *)mbfl_no_language2name(MBSTRG(language)));
1204 } else {
1205 zend_string *ini_name = zend_string_init("mbstring.language", sizeof("mbstring.language") - 1, 0);
1206 if (FAILURE == zend_alter_ini_entry(ini_name, name, PHP_INI_USER, PHP_INI_STAGE_RUNTIME)) {
1207 zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1208 zend_string_release_ex(ini_name, 0);
1209 RETURN_THROWS();
1210 }
1211 // TODO Make return void
1212 RETVAL_TRUE;
1213 zend_string_release_ex(ini_name, 0);
1214 }
1215 }
1216 /* }}} */
1217
1218 /* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
PHP_FUNCTION(mb_internal_encoding)1219 PHP_FUNCTION(mb_internal_encoding)
1220 {
1221 char *name = NULL;
1222 size_t name_len;
1223 const mbfl_encoding *encoding;
1224
1225 ZEND_PARSE_PARAMETERS_START(0, 1)
1226 Z_PARAM_OPTIONAL
1227 Z_PARAM_STRING_OR_NULL(name, name_len)
1228 ZEND_PARSE_PARAMETERS_END();
1229
1230 if (name == NULL) {
1231 ZEND_ASSERT(MBSTRG(current_internal_encoding));
1232 RETURN_STRING(MBSTRG(current_internal_encoding)->name);
1233 } else {
1234 encoding = mbfl_name2encoding(name);
1235 if (!encoding) {
1236 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1237 RETURN_THROWS();
1238 } else {
1239 MBSTRG(current_internal_encoding) = encoding;
1240 MBSTRG(internal_encoding_set) = 1;
1241 /* TODO Return old encoding */
1242 RETURN_TRUE;
1243 }
1244 }
1245 }
1246 /* }}} */
1247
1248 /* {{{ Returns the input encoding */
PHP_FUNCTION(mb_http_input)1249 PHP_FUNCTION(mb_http_input)
1250 {
1251 char *type = NULL;
1252 size_t type_len = 0, n;
1253 const mbfl_encoding **entry;
1254 const mbfl_encoding *encoding;
1255
1256 ZEND_PARSE_PARAMETERS_START(0, 1)
1257 Z_PARAM_OPTIONAL
1258 Z_PARAM_STRING_OR_NULL(type, type_len)
1259 ZEND_PARSE_PARAMETERS_END();
1260
1261 if (type == NULL) {
1262 encoding = MBSTRG(http_input_identify);
1263 } else {
1264 switch (*type) {
1265 case 'G':
1266 case 'g':
1267 encoding = MBSTRG(http_input_identify_get);
1268 break;
1269 case 'P':
1270 case 'p':
1271 encoding = MBSTRG(http_input_identify_post);
1272 break;
1273 case 'C':
1274 case 'c':
1275 encoding = MBSTRG(http_input_identify_cookie);
1276 break;
1277 case 'S':
1278 case 's':
1279 encoding = MBSTRG(http_input_identify_string);
1280 break;
1281 case 'I':
1282 case 'i':
1283 entry = MBSTRG(http_input_list);
1284 n = MBSTRG(http_input_list_size);
1285 array_init(return_value);
1286 for (size_t i = 0; i < n; i++, entry++) {
1287 add_next_index_string(return_value, (*entry)->name);
1288 }
1289 return;
1290 case 'L':
1291 case 'l':
1292 entry = MBSTRG(http_input_list);
1293 n = MBSTRG(http_input_list_size);
1294 if (n == 0) {
1295 // TODO should return empty string?
1296 RETURN_FALSE;
1297 }
1298 // TODO Use smart_str instead.
1299 mbfl_string result;
1300 mbfl_memory_device device;
1301 mbfl_memory_device_init(&device, n * 12, 0);
1302 for (size_t i = 0; i < n; i++, entry++) {
1303 mbfl_memory_device_strcat(&device, (*entry)->name);
1304 mbfl_memory_device_output(',', &device);
1305 }
1306 mbfl_memory_device_unput(&device); /* Remove trailing comma */
1307 mbfl_memory_device_result(&device, &result);
1308 RETVAL_STRINGL((const char*)result.val, result.len);
1309 mbfl_string_clear(&result);
1310 return;
1311 default:
1312 zend_argument_value_error(1,
1313 "must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1314 RETURN_THROWS();
1315 }
1316 }
1317
1318 if (encoding) {
1319 RETURN_STRING(encoding->name);
1320 } else {
1321 RETURN_FALSE;
1322 }
1323 }
1324 /* }}} */
1325
1326 /* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
PHP_FUNCTION(mb_http_output)1327 PHP_FUNCTION(mb_http_output)
1328 {
1329 char *name = NULL;
1330 size_t name_len;
1331
1332 ZEND_PARSE_PARAMETERS_START(0, 1)
1333 Z_PARAM_OPTIONAL
1334 Z_PARAM_STRING_OR_NULL(name, name_len)
1335 ZEND_PARSE_PARAMETERS_END();
1336
1337 if (name == NULL) {
1338 ZEND_ASSERT(MBSTRG(current_http_output_encoding));
1339 RETURN_STRING(MBSTRG(current_http_output_encoding)->name);
1340 } else {
1341 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name);
1342 if (!encoding) {
1343 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1344 RETURN_THROWS();
1345 } else {
1346 MBSTRG(http_output_set) = 1;
1347 MBSTRG(current_http_output_encoding) = encoding;
1348 /* TODO Return previous encoding? */
1349 RETURN_TRUE;
1350 }
1351 }
1352 }
1353 /* }}} */
1354
/* {{{ Sets the current detect_order or returns the current detect_order as an array */
PHP_FUNCTION(mb_detect_order)1356 PHP_FUNCTION(mb_detect_order)
1357 {
1358 zend_string *order_str = NULL;
1359 HashTable *order_ht = NULL;
1360
1361 ZEND_PARSE_PARAMETERS_START(0, 1)
1362 Z_PARAM_OPTIONAL
1363 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1364 ZEND_PARSE_PARAMETERS_END();
1365
1366 if (!order_str && !order_ht) {
1367 size_t n = MBSTRG(current_detect_order_list_size);
1368 const mbfl_encoding **entry = MBSTRG(current_detect_order_list);
1369 array_init(return_value);
1370 for (size_t i = 0; i < n; i++) {
1371 add_next_index_string(return_value, (*entry)->name);
1372 entry++;
1373 }
1374 } else {
1375 const mbfl_encoding **list;
1376 size_t size;
1377 if (order_ht) {
1378 if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1379 RETURN_THROWS();
1380 }
1381 } else {
1382 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1, /* allow_pass_encoding */ 0)) {
1383 RETURN_THROWS();
1384 }
1385 }
1386
1387 if (size == 0) {
1388 efree(ZEND_VOIDP(list));
1389 zend_argument_value_error(1, "must specify at least one encoding");
1390 RETURN_THROWS();
1391 }
1392
1393 if (MBSTRG(current_detect_order_list)) {
1394 efree(ZEND_VOIDP(MBSTRG(current_detect_order_list)));
1395 }
1396 MBSTRG(current_detect_order_list) = list;
1397 MBSTRG(current_detect_order_list_size) = size;
1398 RETURN_TRUE;
1399 }
1400 }
1401 /* }}} */
1402
/* Returns 1 when cp may be used as a substitute character, 0 otherwise.
 *
 * Rejected: values outside 0..0x10FFFF (out of Unicode range) and the
 * surrogate range 0xD800..0xDFFF (never valid on their own, and only a
 * single substitute character is allowed).
 *
 * As the target encoding of the conversion that will use the substitute
 * character is not known here, whether the codepoint is actually mapped in
 * that encoding cannot be checked; everything else is accepted. */
static inline int php_mb_check_code_point(zend_long cp)
{
	return cp >= 0 && cp < 0x110000 && !(cp >= 0xd800 && cp <= 0xdfff);
}
1421
1422 /* {{{ Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)1423 PHP_FUNCTION(mb_substitute_character)
1424 {
1425 zend_string *substitute_character = NULL;
1426 zend_long substitute_codepoint;
1427 bool substitute_is_null = 1;
1428
1429 ZEND_PARSE_PARAMETERS_START(0, 1)
1430 Z_PARAM_OPTIONAL
1431 Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1432 ZEND_PARSE_PARAMETERS_END();
1433
1434 if (substitute_is_null) {
1435 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1436 RETURN_STRING("none");
1437 }
1438 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1439 RETURN_STRING("long");
1440 }
1441 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1442 RETURN_STRING("entity");
1443 }
1444 RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
1445 }
1446
1447 if (substitute_character != NULL) {
1448 if (zend_string_equals_literal_ci(substitute_character, "none")) {
1449 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1450 RETURN_TRUE;
1451 }
1452 if (zend_string_equals_literal_ci(substitute_character, "long")) {
1453 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
1454 RETURN_TRUE;
1455 }
1456 if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1457 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
1458 RETURN_TRUE;
1459 }
1460 /* Invalid string value */
1461 zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1462 RETURN_THROWS();
1463 }
1464 /* Integer codepoint passed */
1465 if (!php_mb_check_code_point(substitute_codepoint)) {
1466 zend_argument_value_error(1, "is not a valid codepoint");
1467 RETURN_THROWS();
1468 }
1469
1470 MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1471 MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1472 RETURN_TRUE;
1473 }
1474 /* }}} */
1475
1476 /* {{{ Return the preferred MIME name (charset) as a string */
PHP_FUNCTION(mb_preferred_mime_name)1477 PHP_FUNCTION(mb_preferred_mime_name)
1478 {
1479 enum mbfl_no_encoding no_encoding;
1480 char *name = NULL;
1481 size_t name_len;
1482
1483 ZEND_PARSE_PARAMETERS_START(1, 1)
1484 Z_PARAM_STRING(name, name_len)
1485 ZEND_PARSE_PARAMETERS_END();
1486
1487 no_encoding = mbfl_name2no_encoding(name);
1488 if (no_encoding == mbfl_no_encoding_invalid) {
1489 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1490 RETURN_THROWS();
1491 }
1492
1493 const char *preferred_name = mbfl_no2preferred_mime_name(no_encoding);
1494 if (preferred_name == NULL || *preferred_name == '\0') {
1495 php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1496 RETVAL_FALSE;
1497 } else {
1498 RETVAL_STRING((char *)preferred_name);
1499 }
1500 }
1501 /* }}} */
1502
1503 /* {{{ Parses GET/POST/COOKIE data and sets global variables */
PHP_FUNCTION(mb_parse_str)1504 PHP_FUNCTION(mb_parse_str)
1505 {
1506 zval *track_vars_array = NULL;
1507 char *encstr;
1508 size_t encstr_len;
1509 php_mb_encoding_handler_info_t info;
1510 const mbfl_encoding *detected;
1511
1512 ZEND_PARSE_PARAMETERS_START(2, 2)
1513 Z_PARAM_STRING(encstr, encstr_len)
1514 Z_PARAM_ZVAL(track_vars_array)
1515 ZEND_PARSE_PARAMETERS_END();
1516
1517 track_vars_array = zend_try_array_init(track_vars_array);
1518 if (!track_vars_array) {
1519 RETURN_THROWS();
1520 }
1521
1522 encstr = estrndup(encstr, encstr_len);
1523
1524 info.data_type = PARSE_STRING;
1525 info.separator = PG(arg_separator).input;
1526 info.report_errors = 1;
1527 info.to_encoding = MBSTRG(current_internal_encoding);
1528 info.to_language = MBSTRG(language);
1529 info.from_encodings = MBSTRG(http_input_list);
1530 info.num_from_encodings = MBSTRG(http_input_list_size);
1531 info.from_language = MBSTRG(language);
1532
1533 detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1534
1535 MBSTRG(http_input_identify) = detected;
1536
1537 RETVAL_BOOL(detected);
1538
1539 if (encstr != NULL) efree(encstr);
1540 }
1541 /* }}} */
1542
1543 /* {{{ Returns string in output buffer converted to the http_output encoding */
PHP_FUNCTION(mb_output_handler)1544 PHP_FUNCTION(mb_output_handler)
1545 {
1546 char *arg_string;
1547 size_t arg_string_len;
1548 zend_long arg_status;
1549 mbfl_string string, result;
1550 const char *charset;
1551 char *p;
1552 const mbfl_encoding *encoding;
1553 int last_feed;
1554 size_t len;
1555 unsigned char send_text_mimetype = 0;
1556 char *s, *mimetype = NULL;
1557
1558 ZEND_PARSE_PARAMETERS_START(2, 2)
1559 Z_PARAM_STRING(arg_string, arg_string_len)
1560 Z_PARAM_LONG(arg_status)
1561 ZEND_PARSE_PARAMETERS_END();
1562
1563 encoding = MBSTRG(current_http_output_encoding);
1564
1565 /* start phase only */
1566 if ((arg_status & PHP_OUTPUT_HANDLER_START) != 0) {
1567 /* delete the converter just in case. */
1568 if (MBSTRG(outconv)) {
1569 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1570 mbfl_buffer_converter_delete(MBSTRG(outconv));
1571 MBSTRG(outconv) = NULL;
1572 }
1573
1574 if (encoding == &mbfl_encoding_pass) {
1575 RETURN_STRINGL(arg_string, arg_string_len);
1576 }
1577
1578 /* analyze mime type */
1579 if (SG(sapi_headers).mimetype &&
1580 _php_mb_match_regex(
1581 MBSTRG(http_output_conv_mimetypes),
1582 SG(sapi_headers).mimetype,
1583 strlen(SG(sapi_headers).mimetype))) {
1584 if ((s = strchr(SG(sapi_headers).mimetype,';')) == NULL) {
1585 mimetype = estrdup(SG(sapi_headers).mimetype);
1586 } else {
1587 mimetype = estrndup(SG(sapi_headers).mimetype,s-SG(sapi_headers).mimetype);
1588 }
1589 send_text_mimetype = 1;
1590 } else if (SG(sapi_headers).send_default_content_type) {
1591 mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1592 }
1593
1594 /* if content-type is not yet set, set it and activate the converter */
1595 if (SG(sapi_headers).send_default_content_type || send_text_mimetype) {
1596 charset = encoding->mime_name;
1597 if (charset) {
1598 len = spprintf( &p, 0, "Content-Type: %s; charset=%s", mimetype, charset );
1599 if (sapi_add_header(p, len, 0) != FAILURE) {
1600 SG(sapi_headers).send_default_content_type = 0;
1601 }
1602 }
1603 /* activate the converter */
1604 MBSTRG(outconv) = mbfl_buffer_converter_new(MBSTRG(current_internal_encoding), encoding, 0);
1605 if (send_text_mimetype){
1606 efree(mimetype);
1607 }
1608 }
1609 }
1610
1611 /* just return if the converter is not activated. */
1612 if (MBSTRG(outconv) == NULL) {
1613 RETURN_STRINGL(arg_string, arg_string_len);
1614 }
1615
1616 /* flag */
1617 last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1618 /* mode */
1619 mbfl_buffer_converter_illegal_mode(MBSTRG(outconv), MBSTRG(current_filter_illegal_mode));
1620 mbfl_buffer_converter_illegal_substchar(MBSTRG(outconv), MBSTRG(current_filter_illegal_substchar));
1621
1622 /* feed the string */
1623 mbfl_string_init(&string);
1624 /* these are not needed. convd has encoding info.
1625 string.encoding = MBSTRG(current_internal_encoding);
1626 */
1627 string.val = (unsigned char *)arg_string;
1628 string.len = arg_string_len;
1629
1630 mbfl_buffer_converter_feed(MBSTRG(outconv), &string);
1631 if (last_feed) {
1632 mbfl_buffer_converter_flush(MBSTRG(outconv));
1633 }
1634 /* get the converter output, and return it */
1635 mbfl_buffer_converter_result(MBSTRG(outconv), &result);
1636
1637 // TODO: avoid reallocation ???
1638 RETVAL_STRINGL((char *)result.val, result.len); /* the string is already strdup()'ed */
1639 efree(result.val);
1640
1641 /* delete the converter if it is the last feed. */
1642 if (last_feed) {
1643 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1644 mbfl_buffer_converter_delete(MBSTRG(outconv));
1645 MBSTRG(outconv) = NULL;
1646 }
1647 }
1648 /* }}} */
1649
1650 /* {{{ Convert a multibyte string to an array. If split_length is specified,
1651 break the string down into chunks each split_length characters long. */
1652
1653 /* structure to pass split params to the callback */
1654 struct mbfl_split_params {
1655 zval *return_value; /* php function return value structure pointer */
1656 mbfl_string *result_string; /* string to store result chunk */
1657 size_t mb_chunk_length; /* actual chunk length in chars */
1658 size_t split_length; /* split length in chars */
1659 mbfl_convert_filter *next_filter; /* widechar to encoding converter */
1660 };
1661
1662 /* callback function to fill split array */
mbfl_split_output(int c,void * data)1663 static int mbfl_split_output(int c, void *data)
1664 {
1665 struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */
1666
1667 (*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */
1668
1669 if (params->split_length == ++params->mb_chunk_length) { /* if current chunk size reached defined chunk size or last char reached */
1670 mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */
1671 mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */
1672 mbfl_string *chunk = params->result_string;
1673 mbfl_memory_device_result(device, chunk); /* make chunk */
1674 add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */
1675 efree(chunk->val);
1676 params->mb_chunk_length = 0; /* reset mb_chunk size */
1677 }
1678
1679 return 0;
1680 }
1681
PHP_FUNCTION(mb_str_split)1682 PHP_FUNCTION(mb_str_split)
1683 {
1684 zend_string *str, *encoding = NULL;
1685 size_t mb_len, chunks, chunk_len;
1686 const char *p, *last; /* pointer for the string cursor and last string char */
1687 mbfl_string string, result_string;
1688 const mbfl_encoding *mbfl_encoding;
1689 zend_long split_length = 1;
1690
1691 ZEND_PARSE_PARAMETERS_START(1, 3)
1692 Z_PARAM_STR(str)
1693 Z_PARAM_OPTIONAL
1694 Z_PARAM_LONG(split_length)
1695 Z_PARAM_STR_OR_NULL(encoding)
1696 ZEND_PARSE_PARAMETERS_END();
1697
1698 if (split_length <= 0) {
1699 zend_argument_value_error(2, "must be greater than 0");
1700 RETURN_THROWS();
1701 }
1702
1703 /* fill mbfl_string structure */
1704 string.val = (unsigned char *) ZSTR_VAL(str);
1705 string.len = ZSTR_LEN(str);
1706 string.encoding = php_mb_get_encoding(encoding, 3);
1707 if (!string.encoding) {
1708 RETURN_THROWS();
1709 }
1710
1711 p = ZSTR_VAL(str); /* string cursor pointer */
1712 last = ZSTR_VAL(str) + ZSTR_LEN(str); /* last string char pointer */
1713
1714 mbfl_encoding = string.encoding;
1715
1716 /* first scenario: 1,2,4-bytes fixed width encodings (head part) */
1717 if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
1718 mb_len = string.len;
1719 chunk_len = (size_t)split_length; /* chunk length in bytes */
1720 } else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS2) { /* 2 bytes */
1721 mb_len = string.len / 2;
1722 chunk_len = split_length * 2;
1723 } else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS4) { /* 4 bytes */
1724 mb_len = string.len / 4;
1725 chunk_len = split_length * 4;
1726 } else if (mbfl_encoding->mblen_table != NULL) {
1727 /* second scenario: variable width encodings with length table */
1728 char unsigned const *mbtab = mbfl_encoding->mblen_table;
1729
1730 /* assume that we have 1-bytes characters */
1731 array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1732
1733 while (p < last) { /* split cycle work until the cursor has reached the last byte */
1734 char const *chunk_p = p; /* chunk first byte pointer */
1735 chunk_len = 0; /* chunk length in bytes */
1736 zend_long char_count;
1737
1738 for (char_count = 0; char_count < split_length && p < last; ++char_count) {
1739 char unsigned const m = mbtab[*(const unsigned char *)p]; /* single character length table */
1740 chunk_len += m;
1741 p += m;
1742 }
1743 if (p >= last) chunk_len -= p - last; /* check if chunk is in bounds */
1744 add_next_index_stringl(return_value, chunk_p, chunk_len);
1745 }
1746 return;
1747 } else {
1748 /* third scenario: other multibyte encodings */
1749 mbfl_convert_filter *filter, *decoder;
1750
1751 /* assume that we have 1-bytes characters */
1752 array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1753
1754 /* decoder filter to decode wchar to encoding */
1755 mbfl_memory_device device;
1756 mbfl_memory_device_init(&device, split_length + 1, 0);
1757
1758 decoder = mbfl_convert_filter_new(
1759 &mbfl_encoding_wchar,
1760 string.encoding,
1761 mbfl_memory_device_output,
1762 NULL,
1763 &device);
1764 /* assert that nothing is wrong with the decoder */
1765 ZEND_ASSERT(decoder != NULL);
1766
1767 /* wchar filter */
1768 mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */
1769 struct mbfl_split_params params = { /* init callback function params structure */
1770 .return_value = return_value,
1771 .result_string = &result_string,
1772 .mb_chunk_length = 0,
1773 .split_length = (size_t)split_length,
1774 .next_filter = decoder,
1775 };
1776
1777 filter = mbfl_convert_filter_new(
1778 string.encoding,
1779 &mbfl_encoding_wchar,
1780 mbfl_split_output,
1781 NULL,
1782 ¶ms);
1783 /* assert that nothing is wrong with the filter */
1784 ZEND_ASSERT(filter != NULL);
1785
1786 while (p < last - 1) { /* cycle each byte except last with callback function */
1787 (*filter->filter_function)(*p++, filter);
1788 }
1789 params.mb_chunk_length = split_length - 1; /* force to finish current chunk */
1790 (*filter->filter_function)(*p++, filter); /* process last char */
1791
1792 mbfl_convert_filter_delete(decoder);
1793 mbfl_convert_filter_delete(filter);
1794 mbfl_memory_device_clear(&device);
1795 return;
1796 }
1797
1798 /* first scenario: 1,2,4-bytes fixed width encodings (tail part) */
1799 chunks = (mb_len + split_length - 1) / split_length; /* (round up idiom) */
1800 array_init_size(return_value, chunks);
1801 if (chunks != 0) {
1802 zend_long i;
1803
1804 for (i = 0; i < chunks - 1; p += chunk_len, ++i) {
1805 add_next_index_stringl(return_value, p, chunk_len);
1806 }
1807 add_next_index_stringl(return_value, p, last - p);
1808 }
1809 }
1810 /* }}} */
1811
1812 /* {{{ Get character numbers of a string */
PHP_FUNCTION(mb_strlen)1813 PHP_FUNCTION(mb_strlen)
1814 {
1815 mbfl_string string;
1816 char *str;
1817 zend_string *enc_name = NULL;
1818
1819 ZEND_PARSE_PARAMETERS_START(1, 2)
1820 Z_PARAM_STRING(str, string.len)
1821 Z_PARAM_OPTIONAL
1822 Z_PARAM_STR_OR_NULL(enc_name)
1823 ZEND_PARSE_PARAMETERS_END();
1824
1825 string.val = (unsigned char*)str;
1826 string.encoding = php_mb_get_encoding(enc_name, 2);
1827 if (!string.encoding) {
1828 RETURN_THROWS();
1829 }
1830
1831 size_t n = mbfl_strlen(&string);
1832 /* Only way this can fail is if the conversion creation fails
1833 * this would imply some sort of memory allocation failure which is a bug */
1834 ZEND_ASSERT(!mbfl_is_error(n));
1835 RETVAL_LONG(n);
1836 }
1837 /* }}} */
1838
handle_strpos_error(size_t error)1839 static void handle_strpos_error(size_t error) {
1840 switch (error) {
1841 case MBFL_ERROR_NOT_FOUND:
1842 break;
1843 case MBFL_ERROR_ENCODING:
1844 php_error_docref(NULL, E_WARNING, "Conversion error");
1845 break;
1846 case MBFL_ERROR_OFFSET:
1847 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1848 break;
1849 default:
1850 zend_value_error("mb_strpos(): Unknown error");
1851 break;
1852 }
1853 }
1854
1855 /* {{{ Find position of first occurrence of a string within another */
PHP_FUNCTION(mb_strpos)1856 PHP_FUNCTION(mb_strpos)
1857 {
1858 int reverse = 0;
1859 zend_long offset = 0;
1860 char *haystack_val, *needle_val;
1861 mbfl_string haystack, needle;
1862 zend_string *enc_name = NULL;
1863
1864 ZEND_PARSE_PARAMETERS_START(2, 4)
1865 Z_PARAM_STRING(haystack_val, haystack.len)
1866 Z_PARAM_STRING(needle_val, needle.len)
1867 Z_PARAM_OPTIONAL
1868 Z_PARAM_LONG(offset)
1869 Z_PARAM_STR_OR_NULL(enc_name)
1870 ZEND_PARSE_PARAMETERS_END();
1871
1872 haystack.val = (unsigned char*)haystack_val;
1873 needle.val = (unsigned char*)needle_val;
1874
1875 haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 4);
1876 if (!haystack.encoding) {
1877 RETURN_THROWS();
1878 }
1879
1880 size_t n = mbfl_strpos(&haystack, &needle, offset, reverse);
1881 if (!mbfl_is_error(n)) {
1882 RETVAL_LONG(n);
1883 } else {
1884 handle_strpos_error(n);
1885 RETVAL_FALSE;
1886 }
1887 }
1888 /* }}} */
1889
1890 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(mb_strrpos)1891 PHP_FUNCTION(mb_strrpos)
1892 {
1893 mbfl_string haystack, needle;
1894 char *haystack_val, *needle_val;
1895 zend_string *enc_name = NULL;
1896 zend_long offset = 0;
1897
1898 ZEND_PARSE_PARAMETERS_START(2, 4)
1899 Z_PARAM_STRING(haystack_val, haystack.len)
1900 Z_PARAM_STRING(needle_val, needle.len)
1901 Z_PARAM_OPTIONAL
1902 Z_PARAM_LONG(offset)
1903 Z_PARAM_STR_OR_NULL(enc_name)
1904 ZEND_PARSE_PARAMETERS_END();
1905
1906 haystack.val = (unsigned char*)haystack_val;
1907 needle.val = (unsigned char*)needle_val;
1908
1909 haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 4);
1910 if (!haystack.encoding) {
1911 RETURN_THROWS();
1912 }
1913
1914 size_t n = mbfl_strpos(&haystack, &needle, offset, 1);
1915 if (!mbfl_is_error(n)) {
1916 RETVAL_LONG(n);
1917 } else {
1918 handle_strpos_error(n);
1919 RETVAL_FALSE;
1920 }
1921 }
1922 /* }}} */
1923
1924 /* {{{ Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stripos)1925 PHP_FUNCTION(mb_stripos)
1926 {
1927 zend_long offset = 0;
1928 mbfl_string haystack, needle;
1929 char *haystack_val, *needle_val;
1930 zend_string *from_encoding = NULL;
1931
1932 ZEND_PARSE_PARAMETERS_START(2, 4)
1933 Z_PARAM_STRING(haystack_val, haystack.len)
1934 Z_PARAM_STRING(needle_val, needle.len)
1935 Z_PARAM_OPTIONAL
1936 Z_PARAM_LONG(offset)
1937 Z_PARAM_STR_OR_NULL(from_encoding)
1938 ZEND_PARSE_PARAMETERS_END();
1939
1940 haystack.val = (unsigned char*)haystack_val;
1941 needle.val = (unsigned char*)needle_val;
1942
1943 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
1944 if (!enc) {
1945 RETURN_THROWS();
1946 }
1947
1948 size_t n = php_mb_stripos(0, (char *)haystack.val, haystack.len, (char *)needle.val, needle.len, offset, enc);
1949
1950 if (!mbfl_is_error(n)) {
1951 RETVAL_LONG(n);
1952 } else {
1953 handle_strpos_error(n);
1954 RETVAL_FALSE;
1955 }
1956 }
1957 /* }}} */
1958
1959 /* {{{ Finds position of last occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_strripos)1960 PHP_FUNCTION(mb_strripos)
1961 {
1962 zend_long offset = 0;
1963 mbfl_string haystack, needle;
1964 char *haystack_val, *needle_val;
1965 zend_string *from_encoding = NULL;
1966
1967 ZEND_PARSE_PARAMETERS_START(2, 4)
1968 Z_PARAM_STRING(haystack_val, haystack.len)
1969 Z_PARAM_STRING(needle_val, needle.len)
1970 Z_PARAM_OPTIONAL
1971 Z_PARAM_LONG(offset)
1972 Z_PARAM_STR_OR_NULL(from_encoding)
1973 ZEND_PARSE_PARAMETERS_END();
1974
1975 haystack.val = (unsigned char*)haystack_val;
1976 needle.val = (unsigned char*)needle_val;
1977
1978 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
1979 if (!enc) {
1980 RETURN_THROWS();
1981 }
1982
1983 size_t n = php_mb_stripos(1, (char *)haystack.val, haystack.len, (char *)needle.val, needle.len, offset, enc);
1984
1985 if (!mbfl_is_error(n)) {
1986 RETVAL_LONG(n);
1987 } else {
1988 handle_strpos_error(n);
1989 RETVAL_FALSE;
1990 }
1991 }
1992 /* }}} */
1993
1994 #define MB_STRSTR 1
1995 #define MB_STRRCHR 2
1996 #define MB_STRISTR 3
1997 #define MB_STRRICHR 4
1998 /* {{{ php_mb_strstr_variants */
/* Shared implementation of mb_strstr / mb_strrchr / mb_stristr / mb_strrichr.
 *
 * `variant` is one of the MB_STR* constants defined above:
 *   - MB_STRRCHR and MB_STRRICHR search in reverse mode,
 *   - MB_STRISTR and MB_STRRICHR search through php_mb_stripos().
 *
 * Userland arguments: haystack, needle [, part = false [, encoding]].
 * With `part` true the result is the haystack portion before the match,
 * otherwise the portion from the match to the end. Returns false when the
 * search reports an error (including "not found"). */
static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
{
	char *haystack_bytes, *needle_bytes;
	mbfl_string haystack, needle, result;
	zend_string *encoding_name = NULL;
	bool part = 0;

	ZEND_PARSE_PARAMETERS_START(2, 4)
		Z_PARAM_STRING(haystack_bytes, haystack.len)
		Z_PARAM_STRING(needle_bytes, needle.len)
		Z_PARAM_OPTIONAL
		Z_PARAM_BOOL(part)
		Z_PARAM_STR_OR_NULL(encoding_name)
	ZEND_PARSE_PARAMETERS_END();

	haystack.val = (unsigned char *)haystack_bytes;
	needle.val = (unsigned char *)needle_bytes;
	haystack.encoding = needle.encoding = php_mb_get_encoding(encoding_name, 4);
	if (haystack.encoding == NULL) {
		RETURN_THROWS();
	}

	const int search_backwards = (variant == MB_STRRCHR || variant == MB_STRRICHR) ? 1 : 0;
	const bool fold_case = (variant == MB_STRISTR || variant == MB_STRRICHR);

	size_t pos;
	if (fold_case) {
		pos = php_mb_stripos(search_backwards, (char *)haystack.val, haystack.len,
			(char *)needle.val, needle.len, 0, needle.encoding);
	} else {
		pos = mbfl_strpos(&haystack, &needle, 0, search_backwards);
	}

	if (mbfl_is_error(pos)) {
		/* FIXME: errors are not routed through handle_strpos_error() here */
		RETURN_FALSE;
	}

	/* part == true: [0, pos); part == false: [pos, end) */
	size_t start = part ? 0 : pos;
	size_t length = part ? pos : MBFL_SUBSTR_UNTIL_END;

	mbfl_string *piece = mbfl_substr(&haystack, &result, start, length);
	ZEND_ASSERT(piece != NULL);
	/* TODO: avoid reallocation ??? (the result is copied, then the mbfl buffer freed) */
	RETVAL_STRINGL((char *)piece->val, piece->len);
	efree(piece->val);
}
2051
2052 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(mb_strstr)
{
	/* Forward, case-sensitive variant: php_mb_strstr_variants() parses the
	 * arguments and sets the return value. */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
}
2057 /* }}} */
2058
2059 /* {{{ Finds the last occurrence of a character in a string within another */
PHP_FUNCTION(mb_strrchr)
{
	/* Reverse, case-sensitive variant: php_mb_strstr_variants() parses the
	 * arguments and sets the return value. */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
}
2064 /* }}} */
2065
2066 /* {{{ Finds first occurrence of a string within another, case insensitive */
PHP_FUNCTION(mb_stristr)
{
	/* Forward variant searched through php_mb_stripos():
	 * php_mb_strstr_variants() parses the arguments and sets the return value. */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
}
2071 /* }}} */
2072
2073 /* {{{ Finds the last occurrence of a character in a string within another, case insensitive */
PHP_FUNCTION(mb_strrichr)
{
	/* Reverse variant searched through php_mb_stripos():
	 * php_mb_strstr_variants() parses the arguments and sets the return value. */
	php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
}
2078 /* }}} */
2079
2080 #undef MB_STRSTR
2081 #undef MB_STRRCHR
2082 #undef MB_STRISTR
2083 #undef MB_STRRICHR
2084
2085 /* {{{ Count the number of substring occurrences */
PHP_FUNCTION(mb_substr_count)2086 PHP_FUNCTION(mb_substr_count)
2087 {
2088 mbfl_string haystack, needle;
2089 char *haystack_val, *needle_val;
2090 zend_string *enc_name = NULL;
2091
2092 ZEND_PARSE_PARAMETERS_START(2, 3)
2093 Z_PARAM_STRING(haystack_val, haystack.len)
2094 Z_PARAM_STRING(needle_val, needle.len)
2095 Z_PARAM_OPTIONAL
2096 Z_PARAM_STR_OR_NULL(enc_name)
2097 ZEND_PARSE_PARAMETERS_END();
2098
2099 haystack.val = (unsigned char*)haystack_val;
2100 needle.val = (unsigned char*)needle_val;
2101
2102 if (needle.len == 0) {
2103 zend_argument_value_error(2, "must not be empty");
2104 RETURN_THROWS();
2105 }
2106
2107 haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 3);
2108 if (!haystack.encoding) {
2109 RETURN_THROWS();
2110 }
2111
2112 size_t n = mbfl_substr_count(&haystack, &needle);
2113 /* An error can only occur if needle is empty,
2114 * an encoding error happens (which should not happen at this stage and is a bug)
2115 * or the haystack is more than sizeof(size_t) bytes
2116 * If one of these things occur this is a bug and should be flagged as such */
2117 ZEND_ASSERT(!mbfl_is_error(n));
2118 RETVAL_LONG(n);
2119 }
2120 /* }}} */
2121
2122 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)2123 PHP_FUNCTION(mb_substr)
2124 {
2125 char *str;
2126 zend_string *encoding = NULL;
2127 zend_long from, len;
2128 size_t real_from, real_len;
2129 size_t str_len;
2130 bool len_is_null = 1;
2131 mbfl_string string, result, *ret;
2132
2133 ZEND_PARSE_PARAMETERS_START(2, 4)
2134 Z_PARAM_STRING(str, str_len)
2135 Z_PARAM_LONG(from)
2136 Z_PARAM_OPTIONAL
2137 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2138 Z_PARAM_STR_OR_NULL(encoding)
2139 ZEND_PARSE_PARAMETERS_END();
2140
2141 string.encoding = php_mb_get_encoding(encoding, 4);
2142 if (!string.encoding) {
2143 RETURN_THROWS();
2144 }
2145
2146 string.val = (unsigned char *)str;
2147 string.len = str_len;
2148
2149 /* measures length */
2150 size_t mblen = 0;
2151 if (from < 0 || (!len_is_null && len < 0)) {
2152 mblen = mbfl_strlen(&string);
2153 }
2154
2155 /* if "from" position is negative, count start position from the end
2156 * of the string
2157 */
2158 if (from >= 0) {
2159 real_from = (size_t) from;
2160 } else if (-from < mblen) {
2161 real_from = mblen + from;
2162 } else {
2163 real_from = 0;
2164 }
2165
2166 /* if "length" position is negative, set it to the length
2167 * needed to stop that many chars from the end of the string
2168 */
2169 if (len_is_null) {
2170 real_len = MBFL_SUBSTR_UNTIL_END;
2171 } else if (len >= 0) {
2172 real_len = (size_t) len;
2173 } else if (real_from < mblen && -len < mblen - real_from) {
2174 real_len = (mblen - real_from) + len;
2175 } else {
2176 real_len = 0;
2177 }
2178
2179 ret = mbfl_substr(&string, &result, real_from, real_len);
2180 ZEND_ASSERT(ret != NULL);
2181
2182 // TODO: avoid reallocation ???
2183 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2184 efree(ret->val);
2185 }
2186 /* }}} */
2187
2188 /* {{{ Returns part of a string */
PHP_FUNCTION(mb_strcut)2189 PHP_FUNCTION(mb_strcut)
2190 {
2191 zend_string *encoding = NULL;
2192 char *string_val;
2193 zend_long from, len;
2194 bool len_is_null = 1;
2195 mbfl_string string, result, *ret;
2196
2197 ZEND_PARSE_PARAMETERS_START(2, 4)
2198 Z_PARAM_STRING(string_val, string.len)
2199 Z_PARAM_LONG(from)
2200 Z_PARAM_OPTIONAL
2201 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2202 Z_PARAM_STR_OR_NULL(encoding)
2203 ZEND_PARSE_PARAMETERS_END();
2204
2205 string.val = (unsigned char*)string_val;
2206 string.encoding = php_mb_get_encoding(encoding, 4);
2207 if (!string.encoding) {
2208 RETURN_THROWS();
2209 }
2210
2211 if (len_is_null) {
2212 len = string.len;
2213 }
2214
2215 /* if "from" position is negative, count start position from the end
2216 * of the string
2217 */
2218 if (from < 0) {
2219 from = string.len + from;
2220 if (from < 0) {
2221 from = 0;
2222 }
2223 }
2224
2225 /* if "length" position is negative, set it to the length
2226 * needed to stop that many chars from the end of the string
2227 */
2228 if (len < 0) {
2229 len = (string.len - from) + len;
2230 if (len < 0) {
2231 len = 0;
2232 }
2233 }
2234
2235 if (from > string.len) {
2236 RETURN_EMPTY_STRING();
2237 }
2238
2239 ret = mbfl_strcut(&string, &result, from, len);
2240 ZEND_ASSERT(ret != NULL);
2241
2242 // TODO: avoid reallocation ???
2243 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2244 efree(ret->val);
2245 }
2246 /* }}} */
2247
2248 /* {{{ Gets terminal width of a string */
PHP_FUNCTION(mb_strwidth)2249 PHP_FUNCTION(mb_strwidth)
2250 {
2251 char *string_val;
2252 mbfl_string string;
2253 zend_string *enc_name = NULL;
2254
2255 ZEND_PARSE_PARAMETERS_START(1, 2)
2256 Z_PARAM_STRING(string_val, string.len)
2257 Z_PARAM_OPTIONAL
2258 Z_PARAM_STR_OR_NULL(enc_name)
2259 ZEND_PARSE_PARAMETERS_END();
2260
2261 string.val = (unsigned char*)string_val;
2262 string.encoding = php_mb_get_encoding(enc_name, 2);
2263 if (!string.encoding) {
2264 RETURN_THROWS();
2265 }
2266
2267 size_t n = mbfl_strwidth(&string);
2268 ZEND_ASSERT(n != (size_t) -1);
2269 RETVAL_LONG(n);
2270 }
2271 /* }}} */
2272
2273 /* {{{ Trim the string in terminal width */
PHP_FUNCTION(mb_strimwidth)2274 PHP_FUNCTION(mb_strimwidth)
2275 {
2276 char *str, *trimmarker = NULL;
2277 zend_string *encoding = NULL;
2278 zend_long from, width, swidth = 0;
2279 size_t str_len, trimmarker_len;
2280 mbfl_string string, result, marker, *ret;
2281
2282 ZEND_PARSE_PARAMETERS_START(3, 5)
2283 Z_PARAM_STRING(str, str_len)
2284 Z_PARAM_LONG(from)
2285 Z_PARAM_LONG(width)
2286 Z_PARAM_OPTIONAL
2287 Z_PARAM_STRING(trimmarker, trimmarker_len)
2288 Z_PARAM_STR_OR_NULL(encoding)
2289 ZEND_PARSE_PARAMETERS_END();
2290
2291 string.encoding = marker.encoding = php_mb_get_encoding(encoding, 5);
2292 if (!string.encoding) {
2293 RETURN_THROWS();
2294 }
2295
2296 string.val = (unsigned char *)str;
2297 string.len = str_len;
2298 marker.val = NULL;
2299 marker.len = 0;
2300
2301 if ((from < 0) || (width < 0)) {
2302 swidth = mbfl_strwidth(&string);
2303 }
2304
2305 if (from < 0) {
2306 from += swidth;
2307 }
2308
2309 if (from < 0 || (size_t)from > str_len) {
2310 zend_argument_value_error(2, "is out of range");
2311 RETURN_THROWS();
2312 }
2313
2314 if (width < 0) {
2315 width = swidth + width - from;
2316 }
2317
2318 if (width < 0) {
2319 zend_argument_value_error(3, "is out of range");
2320 RETURN_THROWS();
2321 }
2322
2323 if (trimmarker) {
2324 marker.val = (unsigned char *)trimmarker;
2325 marker.len = trimmarker_len;
2326 }
2327
2328 ret = mbfl_strimwidth(&string, &marker, &result, from, width);
2329 ZEND_ASSERT(ret != NULL);
2330 // TODO: avoid reallocation ???
2331 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2332 efree(ret->val);
2333 }
2334 /* }}} */
2335
2336
2337 /* See mbfl_no_encoding definition for list of unsupported encodings */
php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)2338 static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2339 {
2340 return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2341 || (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2342 || (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2343 || (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2344 }
2345
2346
2347 /* See mbfl_no_encoding definition for list of UTF-8 encodings */
php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)2348 static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
2349 {
2350 return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
2351 }
2352
php_mb_convert_encoding_ex(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding * from_encoding,size_t * output_len)2353 MBSTRING_API char *php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding, size_t *output_len)
2354 {
2355 mbfl_string string, result, *ret;
2356 mbfl_buffer_converter *convd;
2357 char *output = NULL;
2358
2359 if (output_len) {
2360 *output_len = 0;
2361 }
2362
2363 /* initialize string */
2364 string.encoding = from_encoding;
2365 string.val = (unsigned char *)input;
2366 string.len = length;
2367
2368 /* initialize converter */
2369 convd = mbfl_buffer_converter_new(from_encoding, to_encoding, string.len);
2370 /* If this assertion fails this means some memory allocation failure which is a bug */
2371 ZEND_ASSERT(convd != NULL);
2372
2373 mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
2374 mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
2375
2376 /* do it */
2377 mbfl_string_init(&result);
2378 ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
2379 if (ret) {
2380 if (output_len) {
2381 *output_len = ret->len;
2382 }
2383 output = (char *)ret->val;
2384 }
2385
2386 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd);
2387 mbfl_buffer_converter_delete(convd);
2388 return output;
2389 }
2390 /* }}} */
2391
2392 /* {{{ MBSTRING_API char *php_mb_convert_encoding() */
php_mb_convert_encoding(const char * input,size_t length,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings,size_t * output_len)2393 MBSTRING_API char *php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings, size_t *output_len)
2394 {
2395 const mbfl_encoding *from_encoding;
2396
2397 if (output_len) {
2398 *output_len = 0;
2399 }
2400
2401 /* pre-conversion encoding */
2402 ZEND_ASSERT(num_from_encodings >= 1);
2403 if (num_from_encodings == 1) {
2404 from_encoding = *from_encodings;
2405 } else {
2406 /* auto detect */
2407 mbfl_string string;
2408 mbfl_string_init(&string);
2409 string.val = (unsigned char *)input;
2410 string.len = length;
2411 from_encoding = mbfl_identify_encoding(
2412 &string, from_encodings, num_from_encodings, MBSTRG(strict_detection));
2413 if (!from_encoding) {
2414 php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2415 return NULL;
2416 }
2417 }
2418
2419 return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding, output_len);
2420 }
2421 /* }}} */
2422
php_mb_convert_encoding_recursive(HashTable * input,const mbfl_encoding * to_encoding,const mbfl_encoding ** from_encodings,size_t num_from_encodings)2423 MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2424 {
2425 HashTable *output, *chash;
2426 zend_long idx;
2427 zend_string *key;
2428 zval *entry, entry_tmp;
2429 size_t ckey_len, cval_len;
2430 char *ckey, *cval;
2431
2432 if (!input) {
2433 return NULL;
2434 }
2435
2436 if (GC_IS_RECURSIVE(input)) {
2437 GC_UNPROTECT_RECURSION(input);
2438 php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2439 return NULL;
2440 }
2441 GC_TRY_PROTECT_RECURSION(input);
2442 output = zend_new_array(zend_hash_num_elements(input));
2443 ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2444 /* convert key */
2445 if (key) {
2446 ckey = php_mb_convert_encoding(
2447 ZSTR_VAL(key), ZSTR_LEN(key),
2448 to_encoding, from_encodings, num_from_encodings, &ckey_len);
2449 if (!ckey) {
2450 continue;
2451 }
2452 key = zend_string_init(ckey, ckey_len, 0);
2453 efree(ckey);
2454 }
2455 /* convert value */
2456 ZEND_ASSERT(entry);
2457 try_again:
2458 switch(Z_TYPE_P(entry)) {
2459 case IS_STRING:
2460 cval = php_mb_convert_encoding(
2461 Z_STRVAL_P(entry), Z_STRLEN_P(entry),
2462 to_encoding, from_encodings, num_from_encodings, &cval_len);
2463 if (!cval) {
2464 if (key) {
2465 zend_string_release(key);
2466 }
2467 continue;
2468 }
2469 ZVAL_STRINGL(&entry_tmp, cval, cval_len);
2470 efree(cval);
2471 break;
2472 case IS_NULL:
2473 case IS_TRUE:
2474 case IS_FALSE:
2475 case IS_LONG:
2476 case IS_DOUBLE:
2477 ZVAL_COPY(&entry_tmp, entry);
2478 break;
2479 case IS_ARRAY:
2480 chash = php_mb_convert_encoding_recursive(
2481 Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
2482 if (chash) {
2483 ZVAL_ARR(&entry_tmp, chash);
2484 } else {
2485 ZVAL_EMPTY_ARRAY(&entry_tmp);
2486 }
2487 break;
2488 case IS_REFERENCE:
2489 entry = Z_REFVAL_P(entry);
2490 goto try_again;
2491 case IS_OBJECT:
2492 default:
2493 if (key) {
2494 zend_string_release(key);
2495 }
2496 php_error_docref(NULL, E_WARNING, "Object is not supported");
2497 continue;
2498 }
2499 if (key) {
2500 zend_hash_add(output, key, &entry_tmp);
2501 zend_string_release(key);
2502 } else {
2503 zend_hash_index_add(output, idx, &entry_tmp);
2504 }
2505 } ZEND_HASH_FOREACH_END();
2506 GC_TRY_UNPROTECT_RECURSION(input);
2507
2508 return output;
2509 }
2510 /* }}} */
2511
remove_non_encodings_from_elist(const mbfl_encoding ** elist,size_t * size)2512 static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2513 {
2514 /* mbstring supports some 'text encodings' which aren't really text encodings
2515 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2516 * These should never be returned by `mb_detect_encoding`. */
2517 int shift = 0;
2518 for (int i = 0; i < *size; i++) {
2519 const mbfl_encoding *encoding = elist[i];
2520 if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2521 shift++; /* Remove this encoding from the list */
2522 } else if (shift) {
2523 elist[i - shift] = encoding;
2524 }
2525 }
2526 *size -= shift;
2527 }
2528
2529 /* {{{ Returns converted string in desired encoding */
PHP_FUNCTION(mb_convert_encoding)2530 PHP_FUNCTION(mb_convert_encoding)
2531 {
2532 zend_string *to_encoding_name;
2533 zend_string *input_str, *from_encodings_str = NULL;
2534 HashTable *input_ht, *from_encodings_ht = NULL;
2535 const mbfl_encoding **from_encodings;
2536 size_t num_from_encodings;
2537 bool free_from_encodings;
2538
2539 ZEND_PARSE_PARAMETERS_START(2, 3)
2540 Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2541 Z_PARAM_STR(to_encoding_name)
2542 Z_PARAM_OPTIONAL
2543 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2544 ZEND_PARSE_PARAMETERS_END();
2545
2546 const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2547 if (!to_encoding) {
2548 RETURN_THROWS();
2549 }
2550
2551 if (from_encodings_ht) {
2552 if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2553 RETURN_THROWS();
2554 }
2555 free_from_encodings = 1;
2556 } else if (from_encodings_str) {
2557 if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2558 &from_encodings, &num_from_encodings,
2559 /* persistent */ 0, /* arg_num */ 3, /* allow_pass_encoding */ 0) == FAILURE) {
2560 RETURN_THROWS();
2561 }
2562 free_from_encodings = 1;
2563 } else {
2564 from_encodings = &MBSTRG(current_internal_encoding);
2565 num_from_encodings = 1;
2566 free_from_encodings = 0;
2567 }
2568
2569 if (num_from_encodings > 1) {
2570 remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2571 }
2572
2573 if (!num_from_encodings) {
2574 efree(ZEND_VOIDP(from_encodings));
2575 zend_argument_value_error(3, "must specify at least one encoding");
2576 RETURN_THROWS();
2577 }
2578
2579 if (input_str) {
2580 /* new encoding */
2581 size_t size;
2582 char *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str),
2583 to_encoding, from_encodings, num_from_encodings, &size);
2584 if (ret != NULL) {
2585 // TODO: avoid reallocation ???
2586 RETVAL_STRINGL(ret, size); /* the string is already strdup()'ed */
2587 efree(ret);
2588 } else {
2589 RETVAL_FALSE;
2590 }
2591 } else {
2592 HashTable *tmp;
2593 tmp = php_mb_convert_encoding_recursive(
2594 input_ht, to_encoding, from_encodings, num_from_encodings);
2595 RETVAL_ARR(tmp);
2596 }
2597
2598 if (free_from_encodings) {
2599 efree(ZEND_VOIDP(from_encodings));
2600 }
2601 }
2602 /* }}} */
2603
/* Thin wrapper around php_unicode_convert_case() that supplies the module's
 * currently configured illegal-character mode and substitution character
 * (MBSTRG globals). The remaining arguments and the return value are passed
 * through unchanged; callers in this file release the result with efree(). */
static char *mbstring_convert_case(
		int case_mode, const char *str, size_t str_len, size_t *ret_len,
		const mbfl_encoding *enc) {
	return php_unicode_convert_case(
		case_mode, str, str_len, ret_len, enc,
		MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
}
2611
2612 /* {{{ Returns a case-folded version of source_string */
PHP_FUNCTION(mb_convert_case)2613 PHP_FUNCTION(mb_convert_case)
2614 {
2615 zend_string *from_encoding = NULL;
2616 char *str;
2617 size_t str_len, ret_len;
2618 zend_long case_mode = 0;
2619
2620 ZEND_PARSE_PARAMETERS_START(2, 3)
2621 Z_PARAM_STRING(str, str_len)
2622 Z_PARAM_LONG(case_mode)
2623 Z_PARAM_OPTIONAL
2624 Z_PARAM_STR_OR_NULL(from_encoding)
2625 ZEND_PARSE_PARAMETERS_END();
2626
2627 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2628 if (!enc) {
2629 RETURN_THROWS();
2630 }
2631
2632 if (case_mode < 0 || case_mode > PHP_UNICODE_CASE_MODE_MAX) {
2633 zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2634 RETURN_THROWS();
2635 }
2636
2637 char *newstr = mbstring_convert_case(case_mode, str, str_len, &ret_len, enc);
2638 /* If newstr is NULL something went wrong in mbfl and this is a bug */
2639 ZEND_ASSERT(newstr != NULL);
2640
2641 // TODO: avoid reallocation ???
2642 RETVAL_STRINGL(newstr, ret_len);
2643 efree(newstr);
2644 }
2645 /* }}} */
2646
/* {{{ Returns an upper-cased version of source_string */
PHP_FUNCTION(mb_strtoupper)2648 PHP_FUNCTION(mb_strtoupper)
2649 {
2650 zend_string *from_encoding = NULL;
2651 char *str;
2652 size_t str_len, ret_len;
2653
2654 ZEND_PARSE_PARAMETERS_START(1, 2)
2655 Z_PARAM_STRING(str, str_len)
2656 Z_PARAM_OPTIONAL
2657 Z_PARAM_STR_OR_NULL(from_encoding)
2658 ZEND_PARSE_PARAMETERS_END();
2659
2660 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2661 if (!enc) {
2662 RETURN_THROWS();
2663 }
2664
2665 char *newstr = mbstring_convert_case(PHP_UNICODE_CASE_UPPER, str, str_len, &ret_len, enc);
2666 /* If newstr is NULL something went wrong in mbfl and this is a bug */
2667 ZEND_ASSERT(newstr != NULL);
2668
2669 // TODO: avoid reallocation ???
2670 RETVAL_STRINGL(newstr, ret_len);
2671 efree(newstr);
2672 }
2673 /* }}} */
2674
2675 /* {{{ Returns a lower cased version of source_string */
PHP_FUNCTION(mb_strtolower)2676 PHP_FUNCTION(mb_strtolower)
2677 {
2678 zend_string *from_encoding = NULL;
2679 char *str;
2680 size_t str_len;
2681 char *newstr;
2682 size_t ret_len;
2683 const mbfl_encoding *enc;
2684
2685 ZEND_PARSE_PARAMETERS_START(1, 2)
2686 Z_PARAM_STRING(str, str_len)
2687 Z_PARAM_OPTIONAL
2688 Z_PARAM_STR_OR_NULL(from_encoding)
2689 ZEND_PARSE_PARAMETERS_END();
2690
2691 enc = php_mb_get_encoding(from_encoding, 2);
2692 if (!enc) {
2693 RETURN_THROWS();
2694 }
2695
2696 newstr = mbstring_convert_case(PHP_UNICODE_CASE_LOWER, str, str_len, &ret_len, enc);
2697 /* If newstr is NULL something went wrong in mbfl and this is a bug */
2698 ZEND_ASSERT(newstr != NULL);
2699
2700 // TODO: avoid reallocation ???
2701 RETVAL_STRINGL(newstr, ret_len);
2702 efree(newstr);
2703 }
2704 /* }}} */
2705
/* Return a freshly emalloc'ed copy of the first `size` pointers of `elist`.
 * The caller owns the returned array and must efree() it. */
static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
{
	const mbfl_encoding **copy = safe_emalloc(size, sizeof(*copy), 0);

	memcpy(ZEND_VOIDP(copy), elist, size * sizeof(*copy));
	return copy;
}
2712
/* {{{ Returns the detected encoding of the given string (as a string), or false */
PHP_FUNCTION(mb_detect_encoding)2714 PHP_FUNCTION(mb_detect_encoding)
2715 {
2716 char *str;
2717 size_t str_len;
2718 zend_string *encoding_str = NULL;
2719 HashTable *encoding_ht = NULL;
2720 bool strict = 0;
2721
2722 mbfl_string string;
2723 const mbfl_encoding *ret;
2724 const mbfl_encoding **elist;
2725 size_t size;
2726
2727 ZEND_PARSE_PARAMETERS_START(1, 3)
2728 Z_PARAM_STRING(str, str_len)
2729 Z_PARAM_OPTIONAL
2730 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
2731 Z_PARAM_BOOL(strict)
2732 ZEND_PARSE_PARAMETERS_END();
2733
2734 /* make encoding list */
2735 if (encoding_ht) {
2736 if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
2737 RETURN_THROWS();
2738 }
2739 } else if (encoding_str) {
2740 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2, /* allow_pass_encoding */ 0)) {
2741 RETURN_THROWS();
2742 }
2743 } else {
2744 elist = duplicate_elist(MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size));
2745 size = MBSTRG(current_detect_order_list_size);
2746 }
2747
2748 if (size == 0) {
2749 efree(ZEND_VOIDP(elist));
2750 zend_argument_value_error(2, "must specify at least one encoding");
2751 RETURN_THROWS();
2752 }
2753
2754 remove_non_encodings_from_elist(elist, &size);
2755 if (size == 0) {
2756 efree(ZEND_VOIDP(elist));
2757 RETURN_FALSE;
2758 }
2759
2760 if (ZEND_NUM_ARGS() < 3) {
2761 strict = MBSTRG(strict_detection);
2762 }
2763
2764 if (strict && size == 1) {
2765 /* If there is only a single candidate encoding, mb_check_encoding is faster */
2766 ret = (php_mb_check_encoding(str, str_len, *elist)) ? *elist : NULL;
2767 } else {
2768 mbfl_string_init(&string);
2769 string.val = (unsigned char *)str;
2770 string.len = str_len;
2771 ret = mbfl_identify_encoding(&string, elist, size, strict);
2772 }
2773
2774 efree(ZEND_VOIDP(elist));
2775
2776 if (ret == NULL) {
2777 RETURN_FALSE;
2778 }
2779
2780 RETVAL_STRING((char *)ret->name);
2781 }
2782 /* }}} */
2783
2784 /* {{{ Returns an array of all supported entity encodings */
PHP_FUNCTION(mb_list_encodings)2785 PHP_FUNCTION(mb_list_encodings)
2786 {
2787 ZEND_PARSE_PARAMETERS_NONE();
2788
2789 array_init(return_value);
2790 for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
2791 add_next_index_string(return_value, (*encodings)->name);
2792 }
2793 }
2794 /* }}} */
2795
2796 /* {{{ Returns an array of the aliases of a given encoding name */
PHP_FUNCTION(mb_encoding_aliases)2797 PHP_FUNCTION(mb_encoding_aliases)
2798 {
2799 const mbfl_encoding *encoding;
2800 zend_string *encoding_name = NULL;
2801
2802 ZEND_PARSE_PARAMETERS_START(1, 1)
2803 Z_PARAM_STR(encoding_name)
2804 ZEND_PARSE_PARAMETERS_END();
2805
2806 encoding = php_mb_get_encoding(encoding_name, 1);
2807 if (!encoding) {
2808 RETURN_THROWS();
2809 }
2810
2811 array_init(return_value);
2812 if (encoding->aliases != NULL) {
2813 for (const char **alias = encoding->aliases; *alias; ++alias) {
2814 add_next_index_string(return_value, (char *)*alias);
2815 }
2816 }
2817 }
2818 /* }}} */
2819
2820 /* {{{ Converts the string to MIME "encoded-word" in the format of =?charset?(B|Q)?encoded_string?= */
PHP_FUNCTION(mb_encode_mimeheader)2821 PHP_FUNCTION(mb_encode_mimeheader)
2822 {
2823 const mbfl_encoding *charset, *transenc;
2824 mbfl_string string, result, *ret;
2825 zend_string *charset_name = NULL;
2826 char *trans_enc_name = NULL, *string_val;
2827 size_t trans_enc_name_len;
2828 char *linefeed = "\r\n";
2829 size_t linefeed_len;
2830 zend_long indent = 0;
2831
2832 string.encoding = MBSTRG(current_internal_encoding);
2833
2834 ZEND_PARSE_PARAMETERS_START(1, 5)
2835 Z_PARAM_STRING(string_val, string.len)
2836 Z_PARAM_OPTIONAL
2837 Z_PARAM_STR(charset_name)
2838 Z_PARAM_STRING(trans_enc_name, trans_enc_name_len)
2839 Z_PARAM_STRING(linefeed, linefeed_len)
2840 Z_PARAM_LONG(indent)
2841 ZEND_PARSE_PARAMETERS_END();
2842
2843 string.val = (unsigned char*)string_val;
2844 charset = &mbfl_encoding_pass;
2845 transenc = &mbfl_encoding_base64;
2846
2847 if (charset_name != NULL) {
2848 charset = php_mb_get_encoding(charset_name, 2);
2849 if (!charset) {
2850 RETURN_THROWS();
2851 } else if (charset->mime_name == NULL || charset->mime_name[0] == '\0') {
2852 zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
2853 RETURN_THROWS();
2854 }
2855 } else {
2856 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
2857 if (lang != NULL) {
2858 charset = mbfl_no2encoding(lang->mail_charset);
2859 transenc = mbfl_no2encoding(lang->mail_header_encoding);
2860 }
2861 }
2862
2863 if (trans_enc_name != NULL) {
2864 if (*trans_enc_name == 'B' || *trans_enc_name == 'b') {
2865 transenc = &mbfl_encoding_base64;
2866 } else if (*trans_enc_name == 'Q' || *trans_enc_name == 'q') {
2867 transenc = &mbfl_encoding_qprint;
2868 }
2869 }
2870
2871 mbfl_string_init(&result);
2872 ret = mbfl_mime_header_encode(&string, &result, charset, transenc, linefeed, indent);
2873 ZEND_ASSERT(ret != NULL);
2874 // TODO: avoid reallocation ???
2875 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2876 efree(ret->val);
2877 }
2878 /* }}} */
2879
2880 /* {{{ Decodes the MIME "encoded-word" in the string */
PHP_FUNCTION(mb_decode_mimeheader)2881 PHP_FUNCTION(mb_decode_mimeheader)
2882 {
2883 char *string_val;
2884 mbfl_string string, result, *ret;
2885
2886 string.encoding = MBSTRG(current_internal_encoding);
2887
2888 ZEND_PARSE_PARAMETERS_START(1, 1)
2889 Z_PARAM_STRING(string_val, string.len)
2890 ZEND_PARSE_PARAMETERS_END();
2891
2892 string.val = (unsigned char*)string_val;
2893 mbfl_string_init(&result);
2894 ret = mbfl_mime_header_decode(&string, &result, MBSTRG(current_internal_encoding));
2895 ZEND_ASSERT(ret != NULL);
2896 // TODO: avoid reallocation ???
2897 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2898 efree(ret->val);
2899 }
2900 /* }}} */
2901
2902 /* {{{ Conversion between full-width character and half-width character (Japanese) */
PHP_FUNCTION(mb_convert_kana)2903 PHP_FUNCTION(mb_convert_kana)
2904 {
2905 int opt;
2906 mbfl_string string, result, *ret;
2907 char *optstr = NULL, *string_val;
2908 size_t optstr_len;
2909 zend_string *encname = NULL;
2910
2911 ZEND_PARSE_PARAMETERS_START(1, 3)
2912 Z_PARAM_STRING(string_val, string.len)
2913 Z_PARAM_OPTIONAL
2914 Z_PARAM_STRING(optstr, optstr_len)
2915 Z_PARAM_STR_OR_NULL(encname)
2916 ZEND_PARSE_PARAMETERS_END();
2917
2918 string.val = (unsigned char*)string_val;
2919
2920 /* "Zen" is 全, or "full"; "Han" is 半, or "half"
2921 * This refers to "fullwidth" or "halfwidth" variants of characters used for writing Japanese */
2922 if (optstr != NULL) {
2923 char *p = optstr, *e = p + optstr_len;
2924 opt = 0;
2925 while (p < e) {
2926 switch (*p++) {
2927 case 'A':
2928 opt |= MBFL_FILT_TL_HAN2ZEN_ALL;
2929 break;
2930 case 'a':
2931 opt |= MBFL_FILT_TL_ZEN2HAN_ALL;
2932 break;
2933 case 'R':
2934 opt |= MBFL_FILT_TL_HAN2ZEN_ALPHA;
2935 break;
2936 case 'r':
2937 opt |= MBFL_FILT_TL_ZEN2HAN_ALPHA;
2938 break;
2939 case 'N':
2940 opt |= MBFL_FILT_TL_HAN2ZEN_NUMERIC;
2941 break;
2942 case 'n':
2943 opt |= MBFL_FILT_TL_ZEN2HAN_NUMERIC;
2944 break;
2945 case 'S':
2946 opt |= MBFL_FILT_TL_HAN2ZEN_SPACE;
2947 break;
2948 case 's':
2949 opt |= MBFL_FILT_TL_ZEN2HAN_SPACE;
2950 break;
2951 case 'K':
2952 opt |= MBFL_FILT_TL_HAN2ZEN_KATAKANA;
2953 break;
2954 case 'k':
2955 opt |= MBFL_FILT_TL_ZEN2HAN_KATAKANA;
2956 break;
2957 case 'H':
2958 opt |= MBFL_FILT_TL_HAN2ZEN_HIRAGANA;
2959 break;
2960 case 'h':
2961 opt |= MBFL_FILT_TL_ZEN2HAN_HIRAGANA;
2962 break;
2963 case 'V':
2964 opt |= MBFL_FILT_TL_HAN2ZEN_GLUE;
2965 break;
2966 case 'C':
2967 opt |= MBFL_FILT_TL_ZEN2HAN_HIRA2KANA;
2968 break;
2969 case 'c':
2970 opt |= MBFL_FILT_TL_ZEN2HAN_KANA2HIRA;
2971 break;
2972 case 'M':
2973 /* TODO: figure out what 'M' and 'm' are for, and rename the constant
2974 * to something meaningful */
2975 opt |= MBFL_FILT_TL_HAN2ZEN_COMPAT1;
2976 break;
2977 case 'm':
2978 opt |= MBFL_FILT_TL_ZEN2HAN_COMPAT1;
2979 break;
2980 }
2981 }
2982 } else {
2983 opt = MBFL_FILT_TL_HAN2ZEN_KATAKANA | MBFL_FILT_TL_HAN2ZEN_GLUE;
2984 }
2985
2986 /* encoding */
2987 string.encoding = php_mb_get_encoding(encname, 3);
2988 if (!string.encoding) {
2989 RETURN_THROWS();
2990 }
2991
2992 ret = mbfl_ja_jp_hantozen(&string, &result, opt);
2993 ZEND_ASSERT(ret != NULL);
2994 // TODO: avoid reallocation ???
2995 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2996 efree(ret->val);
2997 }
2998 /* }}} */
2999
/* Feeds every string reachable from `var` (following references, arrays and
 * object property tables) into the encoding detector.
 *
 * Returns 1 as soon as the detector reports that detection is complete,
 * otherwise 0. When a container currently being traversed is reached again,
 * *recursion_error is set to 1 and traversal of that branch stops with 0. */
static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, int *recursion_error) /* {{{ */
{
	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		mbfl_string chunk;

		chunk.val = (unsigned char *)Z_STRVAL_P(var);
		chunk.len = Z_STRLEN_P(var);
		return mbfl_encoding_detector_feed(identd, &chunk) ? 1 : 0; /* 1: complete detecting */
	}

	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		/* Scalars other than strings contribute nothing */
		return 0;
	}

	/* Only refcounted containers take part in recursion protection */
	const bool guarded = Z_REFCOUNTED_P(var);
	if (guarded) {
		if (Z_IS_RECURSIVE_P(var)) {
			*recursion_error = 1;
			return 0;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	int detected = 0;
	HashTable *members = HASH_OF(var);
	if (members != NULL) {
		zval *member;

		ZEND_HASH_FOREACH_VAL_IND(members, member) {
			if (mb_recursive_encoder_detector_feed(identd, member, recursion_error)) {
				detected = 1;
				break;
			}
			if (*recursion_error) {
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	/* Single exit point: always lift the protection installed above */
	if (guarded) {
		Z_UNPROTECT_RECURSION_P(var);
	}
	return detected;
} /* }}} */
3045
/* Converts, in place, every string reachable from `var` using the given
 * buffer converter. Arrays are separated before being modified; objects are
 * walked through their property table.
 *
 * Returns 1 when a container that is already being traversed is encountered
 * (recursive structure), otherwise 0. */
static int mb_recursive_convert_variable(mbfl_buffer_converter *convd, zval *var) /* {{{ */
{
	/* Remember the slot as passed in: the converted string is written there,
	 * not into the dereferenced value */
	zval *slot = var;

	ZVAL_DEREF(var);

	if (Z_TYPE_P(var) == IS_STRING) {
		mbfl_string src, dst, *converted;

		src.val = (unsigned char *)Z_STRVAL_P(var);
		src.len = Z_STRLEN_P(var);
		converted = mbfl_buffer_converter_feed_result(convd, &src, &dst);
		if (converted != NULL) {
			zval_ptr_dtor(slot);
			// TODO: avoid reallocation ???
			ZVAL_STRINGL(slot, (char *)converted->val, converted->len);
			efree(converted->val);
		}
		return 0;
	}

	if (Z_TYPE_P(var) != IS_ARRAY && Z_TYPE_P(var) != IS_OBJECT) {
		return 0;
	}

	if (Z_TYPE_P(var) == IS_ARRAY) {
		SEPARATE_ARRAY(var);
	}

	/* Evaluated after the separation above, as the array may have been duplicated */
	const bool guarded = Z_REFCOUNTED_P(var);
	if (guarded) {
		if (Z_IS_RECURSIVE_P(var)) {
			return 1;
		}
		Z_PROTECT_RECURSION_P(var);
	}

	int recursion_hit = 0;
	HashTable *members = HASH_OF(var);
	if (members != NULL) {
		zval *member;

		ZEND_HASH_FOREACH_VAL_IND(members, member) {
			if (mb_recursive_convert_variable(convd, member)) {
				recursion_hit = 1;
				break;
			}
		} ZEND_HASH_FOREACH_END();
	}

	if (guarded) {
		Z_UNPROTECT_RECURSION_P(var);
	}
	return recursion_hit;
} /* }}} */
3093
3094 /* {{{ Converts the string resource in variables to desired encoding */
PHP_FUNCTION(mb_convert_variables)3095 PHP_FUNCTION(mb_convert_variables)
3096 {
3097 zval *args;
3098 zend_string *to_enc_str;
3099 zend_string *from_enc_str;
3100 HashTable *from_enc_ht;
3101 mbfl_string string, result;
3102 const mbfl_encoding *from_encoding, *to_encoding;
3103 mbfl_encoding_detector *identd;
3104 mbfl_buffer_converter *convd;
3105 int n, argc;
3106 size_t elistsz;
3107 const mbfl_encoding **elist;
3108 int recursion_error = 0;
3109
3110 ZEND_PARSE_PARAMETERS_START(3, -1)
3111 Z_PARAM_STR(to_enc_str)
3112 Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3113 Z_PARAM_VARIADIC('+', args, argc)
3114 ZEND_PARSE_PARAMETERS_END();
3115
3116 /* new encoding */
3117 to_encoding = php_mb_get_encoding(to_enc_str, 1);
3118 if (!to_encoding) {
3119 RETURN_THROWS();
3120 }
3121
3122 /* initialize string */
3123 from_encoding = MBSTRG(current_internal_encoding);
3124 mbfl_string_init_set(&string, from_encoding);
3125 mbfl_string_init(&result);
3126
3127 /* pre-conversion encoding */
3128 if (from_enc_ht) {
3129 if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3130 RETURN_THROWS();
3131 }
3132 } else {
3133 if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2, /* allow_pass_encoding */ 0) == FAILURE) {
3134 RETURN_THROWS();
3135 }
3136 }
3137
3138 if (elistsz == 0) {
3139 efree(ZEND_VOIDP(elist));
3140 zend_argument_value_error(2, "must specify at least one encoding");
3141 RETURN_THROWS();
3142 }
3143
3144 if (elistsz == 1) {
3145 from_encoding = *elist;
3146 } else {
3147 /* auto detect */
3148 from_encoding = NULL;
3149 identd = mbfl_encoding_detector_new(elist, elistsz, MBSTRG(strict_detection));
3150 if (identd != NULL) {
3151 n = 0;
3152 while (n < argc) {
3153 if (mb_recursive_encoder_detector_feed(identd, &args[n], &recursion_error)) {
3154 break;
3155 }
3156 n++;
3157 }
3158 from_encoding = mbfl_encoding_detector_judge(identd);
3159 mbfl_encoding_detector_delete(identd);
3160 if (recursion_error) {
3161 efree(ZEND_VOIDP(elist));
3162 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3163 RETURN_FALSE;
3164 }
3165 }
3166
3167 if (!from_encoding) {
3168 php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3169 efree(ZEND_VOIDP(elist));
3170 RETURN_FALSE;
3171 }
3172 }
3173
3174 efree(ZEND_VOIDP(elist));
3175
3176 convd = mbfl_buffer_converter_new(from_encoding, to_encoding, 0);
3177 /* If this assertion fails this means some memory allocation failure which is a bug */
3178 ZEND_ASSERT(convd != NULL);
3179
3180 mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
3181 mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
3182
3183 /* convert */
3184 n = 0;
3185 while (n < argc) {
3186 zval *zv = &args[n];
3187
3188 ZVAL_DEREF(zv);
3189 recursion_error = mb_recursive_convert_variable(convd, zv);
3190 if (recursion_error) {
3191 break;
3192 }
3193 n++;
3194 }
3195
3196 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd);
3197 mbfl_buffer_converter_delete(convd);
3198
3199 if (recursion_error) {
3200 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3201 RETURN_FALSE;
3202 }
3203
3204 RETURN_STRING(from_encoding->name);
3205 }
3206 /* }}} */
3207
3208 /* HTML numeric entities */
3209
3210 /* Convert PHP array to data structure required by mbfl_html_numeric_entity */
make_conversion_map(HashTable * target_hash,int * convmap_size)3211 static int *make_conversion_map(HashTable *target_hash, int *convmap_size)
3212 {
3213 zval *hash_entry;
3214
3215 int n_elems = zend_hash_num_elements(target_hash);
3216 if (n_elems % 4 != 0) {
3217 zend_argument_value_error(2, "must have a multiple of 4 elements");
3218 return NULL;
3219 }
3220
3221 int *convmap = (int *)safe_emalloc(n_elems, sizeof(int), 0);
3222 int *mapelm = convmap;
3223
3224 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
3225 *mapelm++ = zval_get_long(hash_entry);
3226 } ZEND_HASH_FOREACH_END();
3227
3228 *convmap_size = n_elems / 4;
3229 return convmap;
3230 }
3231
3232 /* {{{ Converts specified characters to HTML numeric entities */
PHP_FUNCTION(mb_encode_numericentity)3233 PHP_FUNCTION(mb_encode_numericentity)
3234 {
3235 char *str = NULL;
3236 zend_string *encoding = NULL;
3237 int mapsize;
3238 HashTable *target_hash;
3239 bool is_hex = 0;
3240 mbfl_string string, result, *ret;
3241
3242 ZEND_PARSE_PARAMETERS_START(2, 4)
3243 Z_PARAM_STRING(str, string.len)
3244 Z_PARAM_ARRAY_HT(target_hash)
3245 Z_PARAM_OPTIONAL
3246 Z_PARAM_STR_OR_NULL(encoding)
3247 Z_PARAM_BOOL(is_hex)
3248 ZEND_PARSE_PARAMETERS_END();
3249
3250 string.val = (unsigned char *)str;
3251 string.encoding = php_mb_get_encoding(encoding, 3);
3252 if (!string.encoding) {
3253 RETURN_THROWS();
3254 }
3255
3256 int *convmap = make_conversion_map(target_hash, &mapsize);
3257 if (convmap == NULL) {
3258 RETURN_THROWS();
3259 }
3260
3261 ret = mbfl_html_numeric_entity(&string, &result, convmap, mapsize, is_hex ? 2 : 0);
3262 ZEND_ASSERT(ret != NULL);
3263 // TODO: avoid reallocation ???
3264 RETVAL_STRINGL((char *)ret->val, ret->len);
3265 efree(ret->val);
3266 efree(convmap);
3267 }
3268 /* }}} */
3269
3270 /* {{{ Converts HTML numeric entities to character code */
PHP_FUNCTION(mb_decode_numericentity)3271 PHP_FUNCTION(mb_decode_numericentity)
3272 {
3273 char *str = NULL;
3274 zend_string *encoding = NULL;
3275 int mapsize;
3276 HashTable *target_hash;
3277 mbfl_string string, result, *ret;
3278
3279 ZEND_PARSE_PARAMETERS_START(2, 3)
3280 Z_PARAM_STRING(str, string.len)
3281 Z_PARAM_ARRAY_HT(target_hash)
3282 Z_PARAM_OPTIONAL
3283 Z_PARAM_STR_OR_NULL(encoding)
3284 ZEND_PARSE_PARAMETERS_END();
3285
3286 string.val = (unsigned char *)str;
3287 string.encoding = php_mb_get_encoding(encoding, 3);
3288 if (!string.encoding) {
3289 RETURN_THROWS();
3290 }
3291
3292 int *convmap = make_conversion_map(target_hash, &mapsize);
3293 if (convmap == NULL) {
3294 RETURN_THROWS();
3295 }
3296
3297 ret = mbfl_html_numeric_entity(&string, &result, convmap, mapsize, 1);
3298 ZEND_ASSERT(ret != NULL);
3299 // TODO: avoid reallocation ???
3300 RETVAL_STRINGL((char *)ret->val, ret->len);
3301 efree(ret->val);
3302 efree((void *)convmap);
3303 }
3304 /* }}} */
3305
3306 /* {{{ Sends an email message with MIME scheme */
3307
/* For use inside a loop that walks the NUL-terminated text `str` with index `pos`.
 * When `pos` is on a folded-header separator (CR LF followed by a space or a
 * tab), `pos` is advanced past the CR LF and then up to the last character of
 * the following run of spaces/tabs, and the enclosing loop is `continue`d.
 *
 * The expansion is deliberately a bare `if` statement: wrapping it in
 * do { ... } while (0) would make the `continue` apply to that wrapper
 * instead of the caller's loop. Both arguments are evaluated several times,
 * so they must be free of side effects. */
#define SKIP_LONG_HEADER_SEP_MBSTRING(str, pos)										\
	if (str[pos] == '\r' && str[pos + 1] == '\n' && (str[pos + 2] == ' ' || str[pos + 2] == '\t')) {	\
		pos += 2;											\
		while (str[pos + 1] == ' ' || str[pos + 1] == '\t') {							\
			pos++;											\
		}												\
		continue;											\
	}

/* Line terminator used when assembling the mail headers */
#define CRLF "\r\n"
3318
_php_mbstr_parse_mail_headers(HashTable * ht,const char * str,size_t str_len)3319 static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
3320 {
3321 const char *ps;
3322 size_t icnt;
3323 int state = 0;
3324 int crlf_state = -1;
3325 char *token = NULL;
3326 size_t token_pos = 0;
3327 zend_string *fld_name, *fld_val;
3328
3329 ps = str;
3330 icnt = str_len;
3331 fld_name = fld_val = NULL;
3332
3333 /*
3334 * C o n t e n t - T y p e : t e x t / h t m l \r\n
3335 * ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
3336 * state 0 1 2 3
3337 *
3338 * C o n t e n t - T y p e : t e x t / h t m l \r\n
3339 * ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
3340 * crlf_state -1 0 1 -1
3341 *
3342 */
3343
3344 while (icnt > 0) {
3345 switch (*ps) {
3346 case ':':
3347 if (crlf_state == 1) {
3348 token_pos++;
3349 }
3350
3351 if (state == 0 || state == 1) {
3352 if(token && token_pos > 0) {
3353 fld_name = zend_string_init(token, token_pos, 0);
3354 }
3355 state = 2;
3356 } else {
3357 token_pos++;
3358 }
3359
3360 crlf_state = 0;
3361 break;
3362
3363 case '\n':
3364 if (crlf_state == -1) {
3365 goto out;
3366 }
3367 crlf_state = -1;
3368 break;
3369
3370 case '\r':
3371 if (crlf_state == 1) {
3372 token_pos++;
3373 } else {
3374 crlf_state = 1;
3375 }
3376 break;
3377
3378 case ' ': case '\t':
3379 if (crlf_state == -1) {
3380 if (state == 3) {
3381 /* continuing from the previous line */
3382 state = 4;
3383 } else {
3384 /* simply skipping this new line */
3385 state = 5;
3386 }
3387 } else {
3388 if (crlf_state == 1) {
3389 token_pos++;
3390 }
3391 if (state == 1 || state == 3) {
3392 token_pos++;
3393 }
3394 }
3395 crlf_state = 0;
3396 break;
3397
3398 default:
3399 switch (state) {
3400 case 0:
3401 token = (char*)ps;
3402 token_pos = 0;
3403 state = 1;
3404 break;
3405
3406 case 2:
3407 if (crlf_state != -1) {
3408 token = (char*)ps;
3409 token_pos = 0;
3410
3411 state = 3;
3412 break;
3413 }
3414 ZEND_FALLTHROUGH;
3415
3416 case 3:
3417 if (crlf_state == -1) {
3418 if(token && token_pos > 0) {
3419 fld_val = zend_string_init(token, token_pos, 0);
3420 }
3421
3422 if (fld_name != NULL && fld_val != NULL) {
3423 zval val;
3424 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
3425 ZVAL_STR(&val, fld_val);
3426
3427 zend_hash_update(ht, fld_name, &val);
3428
3429 zend_string_release_ex(fld_name, 0);
3430 }
3431
3432 fld_name = fld_val = NULL;
3433 token = (char*)ps;
3434 token_pos = 0;
3435
3436 state = 1;
3437 }
3438 break;
3439
3440 case 4:
3441 token_pos++;
3442 state = 3;
3443 break;
3444 }
3445
3446 if (crlf_state == 1) {
3447 token_pos++;
3448 }
3449
3450 token_pos++;
3451
3452 crlf_state = 0;
3453 break;
3454 }
3455 ps++, icnt--;
3456 }
3457 out:
3458 if (state == 2) {
3459 token = "";
3460 token_pos = 0;
3461
3462 state = 3;
3463 }
3464 if (state == 3) {
3465 if(token && token_pos > 0) {
3466 fld_val = zend_string_init(token, token_pos, 0);
3467 }
3468 if (fld_name != NULL && fld_val != NULL) {
3469 zval val;
3470 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
3471 ZVAL_STR(&val, fld_val);
3472 zend_hash_update(ht, fld_name, &val);
3473
3474 zend_string_release_ex(fld_name, 0);
3475 }
3476 }
3477 return state;
3478 }
3479
PHP_FUNCTION(mb_send_mail)3480 PHP_FUNCTION(mb_send_mail)
3481 {
3482 char *to;
3483 size_t to_len;
3484 char *message;
3485 size_t message_len;
3486 char *subject;
3487 size_t subject_len;
3488 zend_string *extra_cmd = NULL;
3489 HashTable *headers_ht = NULL;
3490 zend_string *str_headers = NULL;
3491 size_t n, i;
3492 char *to_r = NULL;
3493 char *force_extra_parameters = INI_STR("mail.force_extra_parameters");
3494 struct {
3495 int cnt_type:1;
3496 int cnt_trans_enc:1;
3497 } suppressed_hdrs = { 0, 0 };
3498
3499 char *message_buf = NULL, *subject_buf = NULL, *p;
3500 mbfl_string orig_str, conv_str;
3501 mbfl_string *pstr; /* pointer to mbfl string for return value */
3502 enum mbfl_no_encoding;
3503 const mbfl_encoding *tran_cs, /* transfer text charset */
3504 *head_enc, /* header transfer encoding */
3505 *body_enc; /* body transfer encoding */
3506 mbfl_memory_device device; /* automatic allocateable buffer for additional header */
3507 const mbfl_language *lang;
3508 int err = 0;
3509 HashTable ht_headers;
3510 zval *s;
3511 extern void mbfl_memory_device_unput(mbfl_memory_device *device);
3512
3513 /* initialize */
3514 mbfl_memory_device_init(&device, 0, 0);
3515 mbfl_string_init(&orig_str);
3516 mbfl_string_init(&conv_str);
3517
3518 /* character-set, transfer-encoding */
3519 tran_cs = &mbfl_encoding_utf8;
3520 head_enc = &mbfl_encoding_base64;
3521 body_enc = &mbfl_encoding_base64;
3522 lang = mbfl_no2language(MBSTRG(language));
3523 if (lang != NULL) {
3524 tran_cs = mbfl_no2encoding(lang->mail_charset);
3525 head_enc = mbfl_no2encoding(lang->mail_header_encoding);
3526 body_enc = mbfl_no2encoding(lang->mail_body_encoding);
3527 }
3528
3529 ZEND_PARSE_PARAMETERS_START(3, 5)
3530 Z_PARAM_PATH(to, to_len)
3531 Z_PARAM_PATH(subject, subject_len)
3532 Z_PARAM_PATH(message, message_len)
3533 Z_PARAM_OPTIONAL
3534 Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
3535 Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
3536 ZEND_PARSE_PARAMETERS_END();
3537
3538 if (str_headers) {
3539 if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
3540 zend_argument_value_error(4, "must not contain any null bytes");
3541 RETURN_THROWS();
3542 }
3543 str_headers = php_trim(str_headers, NULL, 0, 2);
3544 } else if (headers_ht) {
3545 str_headers = php_mail_build_headers(headers_ht);
3546 if (EG(exception)) {
3547 RETURN_THROWS();
3548 }
3549 }
3550
3551 zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
3552
3553 if (str_headers != NULL) {
3554 _php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
3555 }
3556
3557 if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
3558 char *tmp;
3559 char *param_name;
3560 char *charset = NULL;
3561
3562 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
3563 p = strchr(Z_STRVAL_P(s), ';');
3564
3565 if (p != NULL) {
3566 /* skipping the padded spaces */
3567 do {
3568 ++p;
3569 } while (*p == ' ' || *p == '\t');
3570
3571 if (*p != '\0') {
3572 if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
3573 if (strcasecmp(param_name, "charset") == 0) {
3574 const mbfl_encoding *_tran_cs = tran_cs;
3575
3576 charset = php_strtok_r(NULL, "= \"", &tmp);
3577 if (charset != NULL) {
3578 _tran_cs = mbfl_name2encoding(charset);
3579 }
3580
3581 if (!_tran_cs) {
3582 php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
3583 _tran_cs = &mbfl_encoding_ascii;
3584 }
3585 tran_cs = _tran_cs;
3586 }
3587 }
3588 }
3589 }
3590 suppressed_hdrs.cnt_type = 1;
3591 }
3592
3593 if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
3594 const mbfl_encoding *_body_enc;
3595
3596 ZEND_ASSERT(Z_TYPE_P(s) == IS_STRING);
3597 _body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
3598 switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
3599 case mbfl_no_encoding_base64:
3600 case mbfl_no_encoding_7bit:
3601 case mbfl_no_encoding_8bit:
3602 body_enc = _body_enc;
3603 break;
3604
3605 default:
3606 php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
3607 body_enc = &mbfl_encoding_8bit;
3608 break;
3609 }
3610 suppressed_hdrs.cnt_trans_enc = 1;
3611 }
3612
3613 /* To: */
3614 if (to_len > 0) {
3615 to_r = estrndup(to, to_len);
3616 for (; to_len; to_len--) {
3617 if (!isspace((unsigned char) to_r[to_len - 1])) {
3618 break;
3619 }
3620 to_r[to_len - 1] = '\0';
3621 }
3622 for (i = 0; to_r[i]; i++) {
3623 if (iscntrl((unsigned char) to_r[i])) {
3624 /* According to RFC 822, section 3.1.1 long headers may be separated into
3625 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
3626 * To prevent these separators from being replaced with a space, we use the
3627 * SKIP_LONG_HEADER_SEP_MBSTRING to skip over them.
3628 */
3629 SKIP_LONG_HEADER_SEP_MBSTRING(to_r, i);
3630 to_r[i] = ' ';
3631 }
3632 }
3633 } else {
3634 to_r = to;
3635 }
3636
3637 /* Subject: */
3638 orig_str.val = (unsigned char *)subject;
3639 orig_str.len = subject_len;
3640 orig_str.encoding = MBSTRG(current_internal_encoding);
3641 if (orig_str.encoding->no_encoding == mbfl_no_encoding_invalid
3642 || orig_str.encoding->no_encoding == mbfl_no_encoding_pass) {
3643 orig_str.encoding = mbfl_identify_encoding(&orig_str, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
3644 }
3645 pstr = mbfl_mime_header_encode(&orig_str, &conv_str, tran_cs, head_enc, CRLF, sizeof("Subject: [PHP-jp nnnnnnnn]" CRLF) - 1);
3646 if (pstr != NULL) {
3647 subject_buf = subject = (char *)pstr->val;
3648 }
3649
3650 /* message body */
3651 orig_str.val = (unsigned char *)message;
3652 orig_str.len = message_len;
3653 orig_str.encoding = MBSTRG(current_internal_encoding);
3654
3655 if (orig_str.encoding->no_encoding == mbfl_no_encoding_invalid
3656 || orig_str.encoding->no_encoding == mbfl_no_encoding_pass) {
3657 orig_str.encoding = mbfl_identify_encoding(&orig_str, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
3658 }
3659
3660 pstr = NULL;
3661 {
3662 mbfl_string tmpstr;
3663
3664 if (mbfl_convert_encoding(&orig_str, &tmpstr, tran_cs) != NULL) {
3665 tmpstr.encoding = &mbfl_encoding_8bit;
3666 pstr = mbfl_convert_encoding(&tmpstr, &conv_str, body_enc);
3667 efree(tmpstr.val);
3668 }
3669 }
3670 if (pstr != NULL) {
3671 message_buf = message = (char *)pstr->val;
3672 }
3673
3674 /* other headers */
3675 #define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
3676 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
3677 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
3678 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
3679 if (str_headers != NULL) {
3680 p = ZSTR_VAL(str_headers);
3681 n = ZSTR_LEN(str_headers);
3682 mbfl_memory_device_strncat(&device, p, n);
3683 if (n > 0 && p[n - 1] != '\n') {
3684 mbfl_memory_device_strncat(&device, CRLF, sizeof(CRLF)-1);
3685 }
3686 zend_string_release_ex(str_headers, 0);
3687 }
3688
3689 if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
3690 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
3691 mbfl_memory_device_strncat(&device, CRLF, sizeof(CRLF)-1);
3692 }
3693
3694 if (!suppressed_hdrs.cnt_type) {
3695 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
3696
3697 p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
3698 if (p != NULL) {
3699 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
3700 mbfl_memory_device_strcat(&device, p);
3701 }
3702 mbfl_memory_device_strncat(&device, CRLF, sizeof(CRLF)-1);
3703 }
3704 if (!suppressed_hdrs.cnt_trans_enc) {
3705 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
3706 p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
3707 if (p == NULL) {
3708 p = "7bit";
3709 }
3710 mbfl_memory_device_strcat(&device, p);
3711 mbfl_memory_device_strncat(&device, CRLF, sizeof(CRLF)-1);
3712 }
3713
3714 mbfl_memory_device_unput(&device);
3715 mbfl_memory_device_unput(&device);
3716 mbfl_memory_device_output('\0', &device);
3717 str_headers = zend_string_init((char *)device.buffer, strlen((char *)device.buffer), 0);
3718
3719 if (force_extra_parameters) {
3720 extra_cmd = php_escape_shell_cmd(force_extra_parameters);
3721 } else if (extra_cmd) {
3722 extra_cmd = php_escape_shell_cmd(ZSTR_VAL(extra_cmd));
3723 }
3724
3725 if (!err && php_mail(to_r, subject, message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL)) {
3726 RETVAL_TRUE;
3727 } else {
3728 RETVAL_FALSE;
3729 }
3730
3731 if (extra_cmd) {
3732 zend_string_release_ex(extra_cmd, 0);
3733 }
3734
3735 if (to_r != to) {
3736 efree(to_r);
3737 }
3738 if (subject_buf) {
3739 efree((void *)subject_buf);
3740 }
3741 if (message_buf) {
3742 efree((void *)message_buf);
3743 }
3744 mbfl_memory_device_clear(&device);
3745 zend_hash_destroy(&ht_headers);
3746 if (str_headers) {
3747 zend_string_release_ex(str_headers, 0);
3748 }
3749 }
3750
/* Remove the helper macros that were only needed by mb_send_mail.
 * NOTE(review): MAIL_ASCIIZ_CHECK_MBSTRING has no #define in this part of the
 * file; undefining a macro that is not defined has no effect - confirm whether
 * this line is a leftover. */
#undef SKIP_LONG_HEADER_SEP_MBSTRING
#undef CRLF
#undef MAIL_ASCIIZ_CHECK_MBSTRING
#undef PHP_MBSTR_MAIL_MIME_HEADER1
#undef PHP_MBSTR_MAIL_MIME_HEADER2
#undef PHP_MBSTR_MAIL_MIME_HEADER3
#undef PHP_MBSTR_MAIL_MIME_HEADER4
3758 /* }}} */
3759
3760 /* {{{ Returns the current settings of mbstring */
PHP_FUNCTION(mb_get_info)3761 PHP_FUNCTION(mb_get_info)
3762 {
3763 zend_string *type = NULL;
3764 size_t n;
3765 char *name;
3766 zval row;
3767 const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
3768 const mbfl_encoding **entry;
3769
3770 ZEND_PARSE_PARAMETERS_START(0, 1)
3771 Z_PARAM_OPTIONAL
3772 Z_PARAM_STR(type)
3773 ZEND_PARSE_PARAMETERS_END();
3774
3775 if (!type || zend_string_equals_literal_ci(type, "all")) {
3776 array_init(return_value);
3777 if (MBSTRG(current_internal_encoding)) {
3778 add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
3779 }
3780 if (MBSTRG(http_input_identify)) {
3781 add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
3782 }
3783 if (MBSTRG(current_http_output_encoding)) {
3784 add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
3785 }
3786 if ((name = (char *)zend_ini_string("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)) != NULL) {
3787 add_assoc_string(return_value, "http_output_conv_mimetypes", name);
3788 }
3789 if (lang != NULL) {
3790 if ((name = (char *)mbfl_no_encoding2name(lang->mail_charset)) != NULL) {
3791 add_assoc_string(return_value, "mail_charset", name);
3792 }
3793 if ((name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding)) != NULL) {
3794 add_assoc_string(return_value, "mail_header_encoding", name);
3795 }
3796 if ((name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding)) != NULL) {
3797 add_assoc_string(return_value, "mail_body_encoding", name);
3798 }
3799 }
3800 add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
3801 if (MBSTRG(encoding_translation)) {
3802 add_assoc_string(return_value, "encoding_translation", "On");
3803 } else {
3804 add_assoc_string(return_value, "encoding_translation", "Off");
3805 }
3806 if ((name = (char *)mbfl_no_language2name(MBSTRG(language))) != NULL) {
3807 add_assoc_string(return_value, "language", name);
3808 }
3809 n = MBSTRG(current_detect_order_list_size);
3810 entry = MBSTRG(current_detect_order_list);
3811 if (n > 0) {
3812 size_t i;
3813 array_init(&row);
3814 for (i = 0; i < n; i++) {
3815 add_next_index_string(&row, (*entry)->name);
3816 entry++;
3817 }
3818 add_assoc_zval(return_value, "detect_order", &row);
3819 }
3820 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
3821 add_assoc_string(return_value, "substitute_character", "none");
3822 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
3823 add_assoc_string(return_value, "substitute_character", "long");
3824 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
3825 add_assoc_string(return_value, "substitute_character", "entity");
3826 } else {
3827 add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
3828 }
3829 if (MBSTRG(strict_detection)) {
3830 add_assoc_string(return_value, "strict_detection", "On");
3831 } else {
3832 add_assoc_string(return_value, "strict_detection", "Off");
3833 }
3834 } else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
3835 if (MBSTRG(current_internal_encoding)) {
3836 RETVAL_STRING((char *)MBSTRG(current_internal_encoding)->name);
3837 }
3838 } else if (zend_string_equals_literal_ci(type, "http_input")) {
3839 if (MBSTRG(http_input_identify)) {
3840 RETVAL_STRING((char *)MBSTRG(http_input_identify)->name);
3841 }
3842 } else if (zend_string_equals_literal_ci(type, "http_output")) {
3843 if (MBSTRG(current_http_output_encoding)) {
3844 RETVAL_STRING((char *)MBSTRG(current_http_output_encoding)->name);
3845 }
3846 } else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
3847 if ((name = (char *)zend_ini_string("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)) != NULL) {
3848 RETVAL_STRING(name);
3849 }
3850 } else if (zend_string_equals_literal_ci(type, "mail_charset")) {
3851 if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_charset)) != NULL) {
3852 RETVAL_STRING(name);
3853 }
3854 } else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
3855 if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_header_encoding)) != NULL) {
3856 RETVAL_STRING(name);
3857 }
3858 } else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
3859 if (lang != NULL && (name = (char *)mbfl_no_encoding2name(lang->mail_body_encoding)) != NULL) {
3860 RETVAL_STRING(name);
3861 }
3862 } else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
3863 RETVAL_LONG(MBSTRG(illegalchars));
3864 } else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
3865 if (MBSTRG(encoding_translation)) {
3866 RETVAL_STRING("On");
3867 } else {
3868 RETVAL_STRING("Off");
3869 }
3870 } else if (zend_string_equals_literal_ci(type, "language")) {
3871 if ((name = (char *)mbfl_no_language2name(MBSTRG(language))) != NULL) {
3872 RETVAL_STRING(name);
3873 }
3874 } else if (zend_string_equals_literal_ci(type, "detect_order")) {
3875 n = MBSTRG(current_detect_order_list_size);
3876 entry = MBSTRG(current_detect_order_list);
3877 if (n > 0) {
3878 size_t i;
3879 array_init(return_value);
3880 for (i = 0; i < n; i++) {
3881 add_next_index_string(return_value, (*entry)->name);
3882 entry++;
3883 }
3884 }
3885 } else if (zend_string_equals_literal_ci(type, "substitute_character")) {
3886 if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
3887 RETVAL_STRING("none");
3888 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
3889 RETVAL_STRING("long");
3890 } else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
3891 RETVAL_STRING("entity");
3892 } else {
3893 RETVAL_LONG(MBSTRG(current_filter_illegal_substchar));
3894 }
3895 } else if (zend_string_equals_literal_ci(type, "strict_detection")) {
3896 if (MBSTRG(strict_detection)) {
3897 RETVAL_STRING("On");
3898 } else {
3899 RETVAL_STRING("Off");
3900 }
3901 } else {
3902 // TODO Convert to ValueError
3903 RETURN_FALSE;
3904 }
3905 }
3906 /* }}} */
3907
mbfl_filt_check_errors(int c,void * data)3908 static int mbfl_filt_check_errors(int c, void* data)
3909 {
3910 if (c == MBFL_BAD_INPUT) {
3911 (*((mbfl_convert_filter**)data))->num_illegalchar++;
3912 }
3913 return 0;
3914 }
3915
php_mb_check_encoding(const char * input,size_t length,const mbfl_encoding * encoding)3916 MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
3917 {
3918 mbfl_convert_filter *filter = mbfl_convert_filter_new(encoding, &mbfl_encoding_wchar, mbfl_filt_check_errors, NULL, &filter);
3919
3920 if (encoding->check != NULL) {
3921 mbfl_convert_filter_delete(filter);
3922 return encoding->check((unsigned char*)input, length);
3923 }
3924
3925 while (length--) {
3926 unsigned char c = *input++;
3927 (filter->filter_function)(c, filter);
3928 if (filter->num_illegalchar) {
3929 mbfl_convert_filter_delete(filter);
3930 return 0;
3931 }
3932 }
3933
3934 (filter->filter_flush)(filter);
3935 int result = !filter->num_illegalchar;
3936 mbfl_convert_filter_delete(filter);
3937 return result;
3938 }
3939
/* Validate every string key and every value of `vars` against `encoding`,
 * descending into nested arrays.
 *
 * Returns 1 when everything is valid and 0 otherwise. Strings are checked with
 * php_mb_check_encoding(); integers, floats, null and booleans are accepted as
 * they are; any other value type makes the array invalid. An array that is
 * already being visited (circular reference) raises an E_WARNING and counts as
 * invalid. Iteration stops at the first invalid key or value. */
static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
{
	zend_string *key;
	zval *entry;
	int valid = 1;

	if (GC_IS_RECURSIVE(vars)) {
		php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
		return 0;
	}
	GC_TRY_PROTECT_RECURSION(vars);
	ZEND_HASH_FOREACH_STR_KEY_VAL(vars, key, entry) {
		ZVAL_DEREF(entry);
		/* Only string keys need checking; for integer keys `key` is NULL. */
		if (key && !php_mb_check_encoding(ZSTR_VAL(key), ZSTR_LEN(key), encoding)) {
			valid = 0;
			break;
		}
		switch (Z_TYPE_P(entry)) {
			case IS_STRING:
				if (!php_mb_check_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), encoding)) {
					valid = 0;
				}
				break;
			case IS_ARRAY:
				if (!php_mb_check_encoding_recursive(Z_ARRVAL_P(entry), encoding)) {
					valid = 0;
				}
				break;
			case IS_LONG:
			case IS_DOUBLE:
			case IS_NULL:
			case IS_TRUE:
			case IS_FALSE:
				break;
			default:
				/* Other types are error. */
				valid = 0;
				break;
		}
		/* A `break` inside the switch only leaves the switch, so the loop has
		 * to be left explicitly once the verdict is known. */
		if (!valid) {
			break;
		}
	} ZEND_HASH_FOREACH_END();
	GC_TRY_UNPROTECT_RECURSION(vars);
	return valid;
}
3990
3991 /* {{{ Check if the string is valid for the specified encoding */
PHP_FUNCTION(mb_check_encoding)3992 PHP_FUNCTION(mb_check_encoding)
3993 {
3994 zend_string *input_str = NULL, *enc = NULL;
3995 HashTable *input_ht = NULL;
3996 const mbfl_encoding *encoding;
3997
3998 ZEND_PARSE_PARAMETERS_START(0, 2)
3999 Z_PARAM_OPTIONAL
4000 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
4001 Z_PARAM_STR_OR_NULL(enc)
4002 ZEND_PARSE_PARAMETERS_END();
4003
4004 encoding = php_mb_get_encoding(enc, 2);
4005 if (!encoding) {
4006 RETURN_THROWS();
4007 }
4008
4009 if (input_ht) {
4010 RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
4011 } else if (input_str) {
4012 RETURN_BOOL(php_mb_check_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), encoding));
4013 } else {
4014 php_error_docref(NULL, E_DEPRECATED,
4015 "Calling mb_check_encoding() without argument is deprecated");
4016
4017 /* FIXME: Actually check all inputs, except $_FILES file content. */
4018 RETURN_BOOL(MBSTRG(illegalchars) == 0);
4019 }
4020 }
4021 /* }}} */
4022
4023
php_mb_ord(const char * str,size_t str_len,zend_string * enc_name,const uint32_t enc_name_arg_num)4024 static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
4025 const uint32_t enc_name_arg_num)
4026 {
4027 const mbfl_encoding *enc;
4028 enum mbfl_no_encoding no_enc;
4029
4030 ZEND_ASSERT(str_len > 0);
4031
4032 enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
4033 if (!enc) {
4034 return -2;
4035 }
4036
4037 no_enc = enc->no_encoding;
4038 if (php_mb_is_unsupported_no_encoding(no_enc)) {
4039 zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
4040 return -2;
4041 }
4042
4043 {
4044 mbfl_wchar_device dev;
4045 mbfl_convert_filter *filter;
4046 zend_long cp;
4047
4048 mbfl_wchar_device_init(&dev);
4049 filter = mbfl_convert_filter_new(enc, &mbfl_encoding_wchar, mbfl_wchar_device_output, 0, &dev);
4050 /* If this assertion fails this means some memory allocation failure which is a bug */
4051 ZEND_ASSERT(filter != NULL);
4052
4053 mbfl_convert_filter_feed_string(filter, (unsigned char*)str, str_len);
4054 mbfl_convert_filter_flush(filter);
4055
4056 if (dev.pos < 1 || filter->num_illegalchar || dev.buffer[0] == MBFL_BAD_INPUT) {
4057 cp = -1;
4058 } else {
4059 cp = dev.buffer[0];
4060 }
4061
4062 mbfl_convert_filter_delete(filter);
4063 mbfl_wchar_device_clear(&dev);
4064 return cp;
4065 }
4066 }
4067
4068
4069 /* {{{ */
PHP_FUNCTION(mb_ord)4070 PHP_FUNCTION(mb_ord)
4071 {
4072 char *str;
4073 size_t str_len;
4074 zend_string *enc = NULL;
4075 zend_long cp;
4076
4077 ZEND_PARSE_PARAMETERS_START(1, 2)
4078 Z_PARAM_STRING(str, str_len)
4079 Z_PARAM_OPTIONAL
4080 Z_PARAM_STR_OR_NULL(enc)
4081 ZEND_PARSE_PARAMETERS_END();
4082
4083 if (str_len == 0) {
4084 zend_argument_value_error(1, "must not be empty");
4085 RETURN_THROWS();
4086 }
4087
4088 cp = php_mb_ord(str, str_len, enc, 2);
4089
4090 if (0 > cp) {
4091 if (cp == -2) {
4092 RETURN_THROWS();
4093 }
4094 RETURN_FALSE;
4095 }
4096
4097 RETURN_LONG(cp);
4098 }
4099 /* }}} */
4100
4101
/* Build the string that encodes code point `cp` in the encoding named by
 * `enc_name` (`enc_name_arg_num` is forwarded to php_mb_get_encoding()).
 *
 * Returns a newly created zend_string on success and NULL when
 *   - the encoding cannot be resolved,
 *   - the encoding is unsupported for mb_chr() (a ValueError is raised),
 *   - `cp` is outside 0..0x10ffff, or is a surrogate (0xd800..0xdfff) for UTF-8,
 *   - the conversion from UCS-4BE reports illegal characters or yields no
 *     result.
 *
 * MBSTRG(illegalchars) holds the same value on return as on entry. */
static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
{
	const mbfl_encoding *enc;
	enum mbfl_no_encoding no_enc;
	zend_string *ret;

	enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
	if (!enc) {
		return NULL;
	}

	no_enc = enc->no_encoding;
	if (php_mb_is_unsupported_no_encoding(no_enc)) {
		zend_value_error("mb_chr() does not support the \"%s\" encoding", enc->name);
		return NULL;
	}

	if (cp < 0 || cp > 0x10ffff) {
		return NULL;
	}

	if (php_mb_is_no_encoding_utf8(no_enc)) {
		/* Surrogates cannot be represented in UTF-8. */
		if (cp > 0xd7ff && 0xe000 > cp) {
			return NULL;
		}

		/* Hand-rolled UTF-8 encoder: 1 to 4 bytes depending on magnitude. */
		if (cp < 0x80) {
			ret = ZSTR_CHAR(cp);
		} else if (cp < 0x800) {
			ret = zend_string_alloc(2, 0);
			ZSTR_VAL(ret)[0] = 0xc0 | (cp >> 6);
			ZSTR_VAL(ret)[1] = 0x80 | (cp & 0x3f);
			ZSTR_VAL(ret)[2] = 0;
		} else if (cp < 0x10000) {
			ret = zend_string_alloc(3, 0);
			ZSTR_VAL(ret)[0] = 0xe0 | (cp >> 12);
			ZSTR_VAL(ret)[1] = 0x80 | ((cp >> 6) & 0x3f);
			ZSTR_VAL(ret)[2] = 0x80 | (cp & 0x3f);
			ZSTR_VAL(ret)[3] = 0;
		} else {
			ret = zend_string_alloc(4, 0);
			ZSTR_VAL(ret)[0] = 0xf0 | (cp >> 18);
			ZSTR_VAL(ret)[1] = 0x80 | ((cp >> 12) & 0x3f);
			ZSTR_VAL(ret)[2] = 0x80 | ((cp >> 6) & 0x3f);
			ZSTR_VAL(ret)[3] = 0x80 | (cp & 0x3f);
			ZSTR_VAL(ret)[4] = 0;
		}

		return ret;
	}

	/* Every other encoding: serialise the code point as big-endian UCS-4 and
	 * let the generic converter do the work. The scratch buffer is tiny and
	 * only read during the call, so it lives on the stack. */
	char buf[5];
	const size_t buf_len = sizeof(buf) - 1;
	buf[0] = (cp >> 24) & 0xff;
	buf[1] = (cp >> 16) & 0xff;
	buf[2] = (cp >> 8) & 0xff;
	buf[3] = cp & 0xff;
	buf[4] = 0;

	/* Zero the global counter so that a non-zero value afterwards can only
	 * come from this conversion, then put the previous value back. */
	size_t ret_len;
	long orig_illegalchars = MBSTRG(illegalchars);
	MBSTRG(illegalchars) = 0;
	char *ret_str = php_mb_convert_encoding_ex(buf, buf_len, enc, &mbfl_encoding_ucs4be, &ret_len);
	int failed = (ret_str == NULL) || (MBSTRG(illegalchars) != 0);
	MBSTRG(illegalchars) = orig_illegalchars;

	ret = failed ? NULL : zend_string_init(ret_str, ret_len, 0);
	if (ret_str != NULL) {
		efree(ret_str);
	}
	return ret;
}
4182
4183
4184 /* {{{ */
PHP_FUNCTION(mb_chr)4185 PHP_FUNCTION(mb_chr)
4186 {
4187 zend_long cp;
4188 zend_string *enc = NULL;
4189
4190 ZEND_PARSE_PARAMETERS_START(1, 2)
4191 Z_PARAM_LONG(cp)
4192 Z_PARAM_OPTIONAL
4193 Z_PARAM_STR_OR_NULL(enc)
4194 ZEND_PARSE_PARAMETERS_END();
4195
4196 zend_string* ret = php_mb_chr(cp, enc, 2);
4197 if (ret == NULL) {
4198 RETURN_FALSE;
4199 }
4200
4201 RETURN_STR(ret);
4202 }
4203 /* }}} */
4204
4205 /* {{{ */
PHP_FUNCTION(mb_scrub)4206 PHP_FUNCTION(mb_scrub)
4207 {
4208 char* str;
4209 size_t str_len;
4210 zend_string *enc_name = NULL;
4211
4212 ZEND_PARSE_PARAMETERS_START(1, 2)
4213 Z_PARAM_STRING(str, str_len)
4214 Z_PARAM_OPTIONAL
4215 Z_PARAM_STR_OR_NULL(enc_name)
4216 ZEND_PARSE_PARAMETERS_END();
4217
4218 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
4219 if (!enc) {
4220 RETURN_THROWS();
4221 }
4222
4223 size_t ret_len;
4224 char *ret = php_mb_convert_encoding_ex(str, str_len, enc, enc, &ret_len);
4225
4226 RETVAL_STRINGL(ret, ret_len);
4227 efree(ret);
4228 }
4229 /* }}} */
4230
4231
4232 /* {{{ php_mb_populate_current_detect_order_list */
php_mb_populate_current_detect_order_list(void)4233 static void php_mb_populate_current_detect_order_list(void)
4234 {
4235 const mbfl_encoding **entry = 0;
4236 size_t nentries;
4237
4238 if (MBSTRG(detect_order_list) && MBSTRG(detect_order_list_size)) {
4239 nentries = MBSTRG(detect_order_list_size);
4240 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
4241 memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
4242 } else {
4243 const enum mbfl_no_encoding *src = MBSTRG(default_detect_order_list);
4244 size_t i;
4245 nentries = MBSTRG(default_detect_order_list_size);
4246 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
4247 for (i = 0; i < nentries; i++) {
4248 entry[i] = mbfl_no2encoding(src[i]);
4249 }
4250 }
4251 MBSTRG(current_detect_order_list) = entry;
4252 MBSTRG(current_detect_order_list_size) = nentries;
4253 }
4254 /* }}} */
4255
4256 /* {{{ static int php_mb_encoding_translation() */
php_mb_encoding_translation(void)4257 static int php_mb_encoding_translation(void)
4258 {
4259 return MBSTRG(encoding_translation);
4260 }
4261 /* }}} */
4262
4263 /* {{{ MBSTRING_API size_t php_mb_mbchar_bytes_ex() */
php_mb_mbchar_bytes_ex(const char * s,const mbfl_encoding * enc)4264 MBSTRING_API size_t php_mb_mbchar_bytes_ex(const char *s, const mbfl_encoding *enc)
4265 {
4266 if (enc) {
4267 if (enc->mblen_table) {
4268 if (s) {
4269 return enc->mblen_table[*(unsigned char *)s];
4270 }
4271 } else if (enc->flag & MBFL_ENCTYPE_WCS2) {
4272 return 2;
4273 } else if (enc->flag & MBFL_ENCTYPE_WCS4) {
4274 return 4;
4275 }
4276 }
4277 return 1;
4278 }
4279 /* }}} */
4280
4281 /* {{{ MBSTRING_API size_t php_mb_mbchar_bytes() */
php_mb_mbchar_bytes(const char * s)4282 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s)
4283 {
4284 return php_mb_mbchar_bytes_ex(s, MBSTRG(internal_encoding));
4285 }
4286 /* }}} */
4287
4288 /* {{{ MBSTRING_API char *php_mb_safe_strrchr_ex() */
php_mb_safe_strrchr_ex(const char * s,unsigned int c,size_t nbytes,const mbfl_encoding * enc)4289 MBSTRING_API char *php_mb_safe_strrchr_ex(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
4290 {
4291 const char *p = s;
4292 char *last=NULL;
4293
4294 if (nbytes == (size_t)-1) {
4295 size_t nb = 0;
4296
4297 while (*p != '\0') {
4298 if (nb == 0) {
4299 if ((unsigned char)*p == (unsigned char)c) {
4300 last = (char *)p;
4301 }
4302 nb = php_mb_mbchar_bytes_ex(p, enc);
4303 if (nb == 0) {
4304 return NULL; /* something is going wrong! */
4305 }
4306 }
4307 --nb;
4308 ++p;
4309 }
4310 } else {
4311 size_t bcnt = nbytes;
4312 size_t nbytes_char;
4313 while (bcnt > 0) {
4314 if ((unsigned char)*p == (unsigned char)c) {
4315 last = (char *)p;
4316 }
4317 nbytes_char = php_mb_mbchar_bytes_ex(p, enc);
4318 if (bcnt < nbytes_char) {
4319 return NULL;
4320 }
4321 p += nbytes_char;
4322 bcnt -= nbytes_char;
4323 }
4324 }
4325 return last;
4326 }
4327 /* }}} */
4328
4329 /* {{{ MBSTRING_API char *php_mb_safe_strrchr() */
php_mb_safe_strrchr(const char * s,unsigned int c,size_t nbytes)4330 MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes)
4331 {
4332 return php_mb_safe_strrchr_ex(s, c, nbytes, MBSTRG(internal_encoding));
4333 }
4334 /* }}} */
4335
4336 /* {{{ MBSTRING_API int php_mb_stripos() */
php_mb_stripos(int mode,const char * old_haystack,size_t old_haystack_len,const char * old_needle,size_t old_needle_len,zend_long offset,const mbfl_encoding * enc)4337 MBSTRING_API size_t php_mb_stripos(int mode, const char *old_haystack, size_t old_haystack_len, const char *old_needle, size_t old_needle_len, zend_long offset, const mbfl_encoding *enc)
4338 {
4339 size_t n = (size_t) -1;
4340 mbfl_string haystack, needle;
4341
4342 mbfl_string_init_set(&haystack, enc);
4343 mbfl_string_init_set(&needle, enc);
4344
4345 do {
4346 /* We're using simple case-folding here, because we'd have to deal with remapping of
4347 * offsets otherwise. */
4348
4349 size_t len = 0;
4350 haystack.val = (unsigned char *)mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char *)old_haystack, old_haystack_len, &len, enc);
4351 haystack.len = len;
4352
4353 if (!haystack.val) {
4354 break;
4355 }
4356
4357 if (haystack.len == 0) {
4358 break;
4359 }
4360
4361 needle.val = (unsigned char *)mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char *)old_needle, old_needle_len, &len, enc);
4362 needle.len = len;
4363
4364 if (!needle.val) {
4365 break;
4366 }
4367
4368 n = mbfl_strpos(&haystack, &needle, offset, mode);
4369 } while(0);
4370
4371 if (haystack.val) {
4372 efree(haystack.val);
4373 }
4374
4375 if (needle.val) {
4376 efree(needle.val);
4377 }
4378
4379 return n;
4380 }
4381 /* }}} */
4382
/* Hand out MBSTRG(http_input_list) and its length through the two out
 * parameters. The list is passed by pointer, not copied. */
static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
{
	*list_size = MBSTRG(http_input_list_size);
	*list = (const zend_encoding **)MBSTRG(http_input_list);
}
/* }}} */
4389
php_mb_gpc_set_input_encoding(const zend_encoding * encoding)4390 static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
4391 {
4392 MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding;
4393 }
4394 /* }}} */
4395