1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
14 +----------------------------------------------------------------------+
15 */
16
17 #include "libmbfl/config.h"
18
19 #include "php.h"
20 #include "php_ini.h"
21
22 #ifdef HAVE_MBREGEX
23
24 #include "zend_smart_str.h"
25 #include "ext/standard/info.h"
26 #include "php_mbregex.h"
27 #include "mbstring.h"
28 #include "libmbfl/filters/mbfilter_utf8.h"
29
30 #include "php_onig_compat.h" /* must come prior to the oniguruma header */
31 #include <oniguruma.h>
32 #undef UChar
33
/*
 * Fallbacks used when ONIGURUMA_VERSION_INT is undefined or below 60800:
 * the match-param type becomes void, its constructor yields NULL, the
 * setters and the destructor expand to nothing, and the *_with_param()
 * entry points forward to onig_search()/onig_match() while discarding the
 * trailing match-param argument.
 */
#if !defined(ONIGURUMA_VERSION_INT) || ONIGURUMA_VERSION_INT < 60800
typedef void OnigMatchParam;
#define onig_new_match_param() (NULL)
#define onig_initialize_match_param(x) (void)(x)
#define onig_set_match_stack_limit_size_of_match_param(x, y)
#define onig_set_retry_limit_in_match_of_match_param(x, y)
#define onig_free_match_param(x)
#define onig_search_with_param(reg, str, end, start, range, region, option, mp) \
		onig_search(reg, str, end, start, range, region, option)
#define onig_match_with_param(re, str, end, at, region, option, mp) \
		onig_match(re, str, end, at, region, option)
#endif
46
47 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
48
/* Per-module regex state, reached through MBSTRG(mb_regex_globals). */
struct _zend_mb_regex_globals {
	/* encoding copied back into current_mbctype at request shutdown */
	OnigEncoding default_mbctype;
	/* encoding handed to onig_new() when a pattern is compiled */
	OnigEncoding current_mbctype;
	/* libmbfl view of the current encoding; used to validate patterns and subjects */
	const mbfl_encoding *current_mbctype_mbfl_encoding;
	/* cache of compiled patterns keyed by the pattern bytes; set up in RINIT, destroyed in RSHUTDOWN */
	HashTable ht_rc;
	/* search_* members are reset at request shutdown; their users are
	 * presumably the mb_ereg_search*() functions -- not visible here, TODO confirm */
	zval search_str;
	zval *search_str_val;
	size_t search_pos;
	php_mb_regex_t *search_re;
	OnigRegion *search_regs;
	/* option bits and syntax applied when the caller supplies no option string */
	OnigOptionType regex_default_options;
	OnigSyntaxType *regex_default_syntax;
};

/* Shorthand accessor for a member of the regex globals. */
#define MBREX(g) (MBSTRG(mb_regex_globals)->g)
64
65 /* {{{ static void php_mb_regex_free_cache() */
/* Destructor for pattern-cache entries: frees the compiled regex stored as the bucket pointer. */
static void php_mb_regex_free_cache(zval *el)
{
	php_mb_regex_t *cached_re = (php_mb_regex_t *)Z_PTR_P(el);

	onig_free(cached_re);
}
69 /* }}} */
70
71 /* {{{ _php_mb_regex_globals_ctor */
/*
 * Puts a freshly allocated globals structure into its initial state:
 * UTF-8 for both the default and the current encoding, no search in
 * progress, and "multiline | singleline" with Ruby syntax as the defaults.
 * ht_rc is deliberately left alone here; it is initialised per request.
 *
 * Always returns SUCCESS.
 */
static int _php_mb_regex_globals_ctor(zend_mb_regex_globals *pglobals)
{
	/* encodings */
	pglobals->default_mbctype = ONIG_ENCODING_UTF8;
	pglobals->current_mbctype = ONIG_ENCODING_UTF8;
	pglobals->current_mbctype_mbfl_encoding = &mbfl_encoding_utf8;

	/* search state; the structure comes from pemalloc() and is not zeroed,
	 * so every pointer member has to be set explicitly */
	ZVAL_UNDEF(&pglobals->search_str);
	pglobals->search_str_val = NULL;
	pglobals->search_re = (php_mb_regex_t*)NULL;
	pglobals->search_pos = 0;
	pglobals->search_regs = (OnigRegion*)NULL;

	/* compile-time defaults */
	pglobals->regex_default_options = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
	pglobals->regex_default_syntax = ONIG_SYNTAX_RUBY;

	return SUCCESS;
}
85 /* }}} */
86
87 /* {{{ php_mb_regex_globals_alloc */
php_mb_regex_globals_alloc(void)88 zend_mb_regex_globals *php_mb_regex_globals_alloc(void)
89 {
90 zend_mb_regex_globals *pglobals = pemalloc(
91 sizeof(zend_mb_regex_globals), 1);
92 if (SUCCESS != _php_mb_regex_globals_ctor(pglobals)) {
93 pefree(pglobals, 1);
94 return NULL;
95 }
96 return pglobals;
97 }
98 /* }}} */
99
100 /* {{{ php_mb_regex_globals_free */
/* Releases a globals structure obtained from php_mb_regex_globals_alloc(); NULL is accepted and ignored. */
void php_mb_regex_globals_free(zend_mb_regex_globals *pglobals)
{
	if (pglobals != NULL) {
		pefree(pglobals, 1);
	}
}
108 /* }}} */
109
110 /* {{{ PHP_MINIT_FUNCTION(mb_regex) */
PHP_MINIT_FUNCTION(mb_regex)111 PHP_MINIT_FUNCTION(mb_regex)
112 {
113 char version[256];
114
115 onig_init();
116
117 snprintf(version, sizeof(version), "%d.%d.%d",
118 ONIGURUMA_VERSION_MAJOR, ONIGURUMA_VERSION_MINOR, ONIGURUMA_VERSION_TEENY);
119 REGISTER_STRING_CONSTANT("MB_ONIGURUMA_VERSION", version, CONST_CS | CONST_PERSISTENT);
120 return SUCCESS;
121 }
122 /* }}} */
123
124 /* {{{ PHP_MSHUTDOWN_FUNCTION(mb_regex) */
/* Module shutdown: counterpart of the onig_init() call made at module startup. Always returns SUCCESS. */
PHP_MSHUTDOWN_FUNCTION(mb_regex)
{
	onig_end();
	return SUCCESS;
}
130 /* }}} */
131
132 /* {{{ PHP_RINIT_FUNCTION(mb_regex) */
PHP_RINIT_FUNCTION(mb_regex)133 PHP_RINIT_FUNCTION(mb_regex)
134 {
135 if (!MBSTRG(mb_regex_globals)) return FAILURE;
136 zend_hash_init(&MBREX(ht_rc), 0, NULL, php_mb_regex_free_cache, 0);
137 return SUCCESS;
138 }
139 /* }}} */
140
141 /* {{{ PHP_RSHUTDOWN_FUNCTION(mb_regex) */
PHP_RSHUTDOWN_FUNCTION(mb_regex)142 PHP_RSHUTDOWN_FUNCTION(mb_regex)
143 {
144 MBREX(current_mbctype) = MBREX(default_mbctype);
145 MBREX(current_mbctype_mbfl_encoding) = mbfl_name2encoding(php_mb_regex_get_default_mbctype());
146
147 if (!Z_ISUNDEF(MBREX(search_str))) {
148 zval_ptr_dtor(&MBREX(search_str));
149 ZVAL_UNDEF(&MBREX(search_str));
150 }
151 MBREX(search_pos) = 0;
152 MBREX(search_re) = NULL;
153
154 if (MBREX(search_regs) != NULL) {
155 onig_region_free(MBREX(search_regs), 1);
156 MBREX(search_regs) = (OnigRegion *)NULL;
157 }
158 zend_hash_destroy(&MBREX(ht_rc));
159
160 return SUCCESS;
161 }
162 /* }}} */
163
164 /* {{{ PHP_MINFO_FUNCTION(mb_regex) */
PHP_MINFO_FUNCTION(mb_regex)165 PHP_MINFO_FUNCTION(mb_regex)
166 {
167 char buf[32];
168 php_info_print_table_start();
169 php_info_print_table_row(2, "Multibyte (japanese) regex support", "enabled");
170 snprintf(buf, sizeof(buf), "%d.%d.%d",
171 ONIGURUMA_VERSION_MAJOR,
172 ONIGURUMA_VERSION_MINOR,
173 ONIGURUMA_VERSION_TEENY);
174 php_info_print_table_row(2, "Multibyte regex (oniguruma) version", buf);
175 php_info_print_table_end();
176 }
177 /* }}} */
178
179 /*
180 * encoding name resolver
181 */
182
183 /* {{{ encoding name map */
/*
 * One row of the encoding-name table.
 *
 * names: a packed list of aliases, each terminated by '\0'; the implicit
 *        terminator of the string literal supplies the empty string that
 *        ends the list. The first alias is the name handed out by
 *        _php_mb_regex_mbctype2name().
 * code:  the Oniguruma encoding the aliases stand for.
 */
typedef struct _php_mb_regex_enc_name_map_t {
	const char *names;
	OnigEncoding code;
} php_mb_regex_enc_name_map_t;

/*
 * Alias table searched by _php_mb_regex_name2mbctype() (case-insensitively)
 * and _php_mb_regex_mbctype2name(). Rows exist only for encodings the
 * Oniguruma headers define; the { NULL, ONIG_ENCODING_UNDEF } row ends the
 * table. "EUCJP" is listed twice in the first row, which is harmless
 * because lookup stops at the first matching alias.
 */
static const php_mb_regex_enc_name_map_t enc_name_map[] = {
#ifdef ONIG_ENCODING_EUC_JP
	{
		"EUC-JP\0EUCJP\0X-EUC-JP\0UJIS\0EUCJP\0EUCJP-WIN\0",
		ONIG_ENCODING_EUC_JP
	},
#endif
#ifdef ONIG_ENCODING_UTF8
	{
		"UTF-8\0UTF8\0",
		ONIG_ENCODING_UTF8
	},
#endif
#ifdef ONIG_ENCODING_UTF16_BE
	{
		"UTF-16\0UTF-16BE\0",
		ONIG_ENCODING_UTF16_BE
	},
#endif
#ifdef ONIG_ENCODING_UTF16_LE
	{
		"UTF-16LE\0",
		ONIG_ENCODING_UTF16_LE
	},
#endif
#ifdef ONIG_ENCODING_UTF32_BE
	{
		"UCS-4\0UTF-32\0UTF-32BE\0",
		ONIG_ENCODING_UTF32_BE
	},
#endif
#ifdef ONIG_ENCODING_UTF32_LE
	{
		"UCS-4LE\0UTF-32LE\0",
		ONIG_ENCODING_UTF32_LE
	},
#endif
#ifdef ONIG_ENCODING_SJIS
	{
		"SJIS\0CP932\0MS932\0SHIFT_JIS\0SJIS-WIN\0WINDOWS-31J\0",
		ONIG_ENCODING_SJIS
	},
#endif
#ifdef ONIG_ENCODING_BIG5
	{
		"BIG5\0BIG-5\0BIGFIVE\0CN-BIG5\0BIG-FIVE\0",
		ONIG_ENCODING_BIG5
	},
#endif
#ifdef ONIG_ENCODING_EUC_CN
	{
		"EUC-CN\0EUCCN\0EUC_CN\0GB-2312\0GB2312\0",
		ONIG_ENCODING_EUC_CN
	},
#endif
#ifdef ONIG_ENCODING_EUC_TW
	{
		"EUC-TW\0EUCTW\0EUC_TW\0",
		ONIG_ENCODING_EUC_TW
	},
#endif
#ifdef ONIG_ENCODING_EUC_KR
	{
		"EUC-KR\0EUCKR\0EUC_KR\0",
		ONIG_ENCODING_EUC_KR
	},
#endif
#if defined(ONIG_ENCODING_KOI8) && !PHP_ONIG_BAD_KOI8_ENTRY
	{
		"KOI8\0KOI-8\0",
		ONIG_ENCODING_KOI8
	},
#endif
#ifdef ONIG_ENCODING_KOI8_R
	{
		"KOI8R\0KOI8-R\0KOI-8R\0",
		ONIG_ENCODING_KOI8_R
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_1
	{
		"ISO-8859-1\0ISO8859-1\0",
		ONIG_ENCODING_ISO_8859_1
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_2
	{
		"ISO-8859-2\0ISO8859-2\0",
		ONIG_ENCODING_ISO_8859_2
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_3
	{
		"ISO-8859-3\0ISO8859-3\0",
		ONIG_ENCODING_ISO_8859_3
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_4
	{
		"ISO-8859-4\0ISO8859-4\0",
		ONIG_ENCODING_ISO_8859_4
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_5
	{
		"ISO-8859-5\0ISO8859-5\0",
		ONIG_ENCODING_ISO_8859_5
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_6
	{
		"ISO-8859-6\0ISO8859-6\0",
		ONIG_ENCODING_ISO_8859_6
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_7
	{
		"ISO-8859-7\0ISO8859-7\0",
		ONIG_ENCODING_ISO_8859_7
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_8
	{
		"ISO-8859-8\0ISO8859-8\0",
		ONIG_ENCODING_ISO_8859_8
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_9
	{
		"ISO-8859-9\0ISO8859-9\0",
		ONIG_ENCODING_ISO_8859_9
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_10
	{
		"ISO-8859-10\0ISO8859-10\0",
		ONIG_ENCODING_ISO_8859_10
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_11
	{
		"ISO-8859-11\0ISO8859-11\0",
		ONIG_ENCODING_ISO_8859_11
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_13
	{
		"ISO-8859-13\0ISO8859-13\0",
		ONIG_ENCODING_ISO_8859_13
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_14
	{
		"ISO-8859-14\0ISO8859-14\0",
		ONIG_ENCODING_ISO_8859_14
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_15
	{
		"ISO-8859-15\0ISO8859-15\0",
		ONIG_ENCODING_ISO_8859_15
	},
#endif
#ifdef ONIG_ENCODING_ISO_8859_16
	{
		"ISO-8859-16\0ISO8859-16\0",
		ONIG_ENCODING_ISO_8859_16
	},
#endif
#ifdef ONIG_ENCODING_ASCII
	{
		"ASCII\0US-ASCII\0US_ASCII\0ISO646\0",
		ONIG_ENCODING_ASCII
	},
#endif
	{ NULL, ONIG_ENCODING_UNDEF }
};
366 /* }}} */
367
368 /* {{{ php_mb_regex_name2mbctype */
_php_mb_regex_name2mbctype(const char * pname)369 static OnigEncoding _php_mb_regex_name2mbctype(const char *pname)
370 {
371 const char *p;
372 const php_mb_regex_enc_name_map_t *mapping;
373
374 if (pname == NULL || !*pname) {
375 return ONIG_ENCODING_UNDEF;
376 }
377
378 for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
379 for (p = mapping->names; *p != '\0'; p += (strlen(p) + 1)) {
380 if (strcasecmp(p, pname) == 0) {
381 return mapping->code;
382 }
383 }
384 }
385
386 return ONIG_ENCODING_UNDEF;
387 }
388 /* }}} */
389
390 /* {{{ php_mb_regex_mbctype2name */
/*
 * Reverse lookup: returns the alias list of the first enc_name_map row whose
 * encoding equals mbctype (read as a C string, that is the row's first
 * alias), or NULL when no row matches.
 */
static const char *_php_mb_regex_mbctype2name(OnigEncoding mbctype)
{
	const php_mb_regex_enc_name_map_t *entry = enc_name_map;

	while (entry->names != NULL) {
		if (entry->code == mbctype) {
			return entry->names;
		}
		entry++;
	}

	return NULL;
}
403 /* }}} */
404
405 /* {{{ php_mb_regex_set_mbctype */
php_mb_regex_set_mbctype(const char * encname)406 int php_mb_regex_set_mbctype(const char *encname)
407 {
408 OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
409 if (mbctype == ONIG_ENCODING_UNDEF) {
410 return FAILURE;
411 }
412 MBREX(current_mbctype) = mbctype;
413 MBREX(current_mbctype_mbfl_encoding) = mbfl_name2encoding(encname);
414 return SUCCESS;
415 }
416 /* }}} */
417
418 /* {{{ php_mb_regex_set_default_mbctype */
php_mb_regex_set_default_mbctype(const char * encname)419 int php_mb_regex_set_default_mbctype(const char *encname)
420 {
421 OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
422 if (mbctype == ONIG_ENCODING_UNDEF) {
423 return FAILURE;
424 }
425 MBREX(default_mbctype) = mbctype;
426 return SUCCESS;
427 }
428 /* }}} */
429
430 /* {{{ php_mb_regex_get_mbctype */
php_mb_regex_get_mbctype(void)431 const char *php_mb_regex_get_mbctype(void)
432 {
433 return _php_mb_regex_mbctype2name(MBREX(current_mbctype));
434 }
435 /* }}} */
436
437 /* {{{ php_mb_regex_get_mbctype_encoding */
php_mb_regex_get_mbctype_encoding(void)438 const mbfl_encoding *php_mb_regex_get_mbctype_encoding(void)
439 {
440 return MBREX(current_mbctype_mbfl_encoding);
441 }
442 /* }}} */
443
444 /* {{{ php_mb_regex_get_default_mbctype */
php_mb_regex_get_default_mbctype(void)445 const char *php_mb_regex_get_default_mbctype(void)
446 {
447 return _php_mb_regex_mbctype2name(MBREX(default_mbctype));
448 }
449 /* }}} */
450
451 /*
452 * regex cache
453 */
454 /* {{{ php_mbregex_compile_pattern */
php_mbregex_compile_pattern(const char * pattern,size_t patlen,OnigOptionType options,OnigSyntaxType * syntax)455 static php_mb_regex_t *php_mbregex_compile_pattern(const char *pattern, size_t patlen, OnigOptionType options, OnigSyntaxType *syntax)
456 {
457 int err_code = 0;
458 php_mb_regex_t *retval = NULL, *rc = NULL;
459 OnigErrorInfo err_info;
460 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
461 OnigEncoding enc = MBREX(current_mbctype);
462
463 if (!php_mb_check_encoding(pattern, patlen, php_mb_regex_get_mbctype_encoding())) {
464 php_error_docref(NULL, E_WARNING,
465 "Pattern is not valid under %s encoding", _php_mb_regex_mbctype2name(enc));
466 return NULL;
467 }
468
469 rc = zend_hash_str_find_ptr(&MBREX(ht_rc), (char *)pattern, patlen);
470 if (!rc || onig_get_options(rc) != options || onig_get_encoding(rc) != enc || onig_get_syntax(rc) != syntax) {
471 if ((err_code = onig_new(&retval, (OnigUChar *)pattern, (OnigUChar *)(pattern + patlen), options, enc, syntax, &err_info)) != ONIG_NORMAL) {
472 onig_error_code_to_str(err_str, err_code, &err_info);
473 php_error_docref(NULL, E_WARNING, "mbregex compile err: %s", err_str);
474 return NULL;
475 }
476 if (rc == MBREX(search_re)) {
477 /* reuse the new rc? see bug #72399 */
478 MBREX(search_re) = NULL;
479 }
480 zend_hash_str_update_ptr(&MBREX(ht_rc), (char *)pattern, patlen, retval);
481 } else {
482 retval = rc;
483 }
484 return retval;
485 }
486 /* }}} */
487
488 /* {{{ _php_mb_regex_get_option_string */
/* Stores ch at *cursor if room is left, and counts it as required either way. */
static inline void _php_mb_regex_option_string_put(char **cursor, size_t *room, size_t *needed, char ch)
{
	if (*room > 0) {
		--*room;
		*(*cursor)++ = ch;
	}
	++*needed;
}

/*
 * Renders an option mask and a syntax as the letter string understood by
 * _php_mb_regex_init_options(), writing at most len bytes (terminator
 * included) to str.
 *
 * Returns 0 when everything fitted, otherwise the number of bytes that
 * would have been required.
 */
static size_t _php_mb_regex_get_option_string(char *str, size_t len, OnigOptionType option, OnigSyntaxType *syntax)
{
	const OnigOptionType multi_and_single = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
	char *cursor = str;
	size_t room = len;
	size_t needed = 0;
	char syntax_letter = 0;

	if (option & ONIG_OPTION_IGNORECASE) {
		_php_mb_regex_option_string_put(&cursor, &room, &needed, 'i');
	}
	if (option & ONIG_OPTION_EXTEND) {
		_php_mb_regex_option_string_put(&cursor, &room, &needed, 'x');
	}

	if ((option & multi_and_single) == multi_and_single) {
		/* both bits together are spelled with the single letter 'p' */
		_php_mb_regex_option_string_put(&cursor, &room, &needed, 'p');
	} else {
		if (option & ONIG_OPTION_MULTILINE) {
			_php_mb_regex_option_string_put(&cursor, &room, &needed, 'm');
		}
		if (option & ONIG_OPTION_SINGLELINE) {
			_php_mb_regex_option_string_put(&cursor, &room, &needed, 's');
		}
	}

	if (option & ONIG_OPTION_FIND_LONGEST) {
		_php_mb_regex_option_string_put(&cursor, &room, &needed, 'l');
	}
	if (option & ONIG_OPTION_FIND_NOT_EMPTY) {
		_php_mb_regex_option_string_put(&cursor, &room, &needed, 'n');
	}

	/* an unrecognised syntax contributes no letter */
	if (syntax == ONIG_SYNTAX_JAVA) {
		syntax_letter = 'j';
	} else if (syntax == ONIG_SYNTAX_GNU_REGEX) {
		syntax_letter = 'u';
	} else if (syntax == ONIG_SYNTAX_GREP) {
		syntax_letter = 'g';
	} else if (syntax == ONIG_SYNTAX_EMACS) {
		syntax_letter = 'c';
	} else if (syntax == ONIG_SYNTAX_RUBY) {
		syntax_letter = 'r';
	} else if (syntax == ONIG_SYNTAX_PERL) {
		syntax_letter = 'z';
	} else if (syntax == ONIG_SYNTAX_POSIX_BASIC) {
		syntax_letter = 'b';
	} else if (syntax == ONIG_SYNTAX_POSIX_EXTENDED) {
		syntax_letter = 'd';
	}
	if (syntax_letter != 0) {
		_php_mb_regex_option_string_put(&cursor, &room, &needed, syntax_letter);
	}

	_php_mb_regex_option_string_put(&cursor, &room, &needed, '\0');

	return (len < needed) ? needed : 0;
}
591 /* }}} */
592
593 /* {{{ _php_mb_regex_init_options */
/*
 * Parses an option string of narg bytes.
 *
 * *syntax is first reset to Ruby syntax and then overwritten by the last
 * syntax letter found. The option bits named by the string are OR-ed into
 * *option (when option is not NULL) only after the whole string has been
 * accepted. A NULL parg is accepted and changes nothing but *syntax.
 *
 * Returns true on success; on an unknown letter a ValueError is thrown and
 * false is returned.
 */
static bool _php_mb_regex_init_options(const char *parg, size_t narg, OnigOptionType *option,
	OnigSyntaxType **syntax)
{
	OnigOptionType collected = 0;
	size_t idx;

	*syntax = ONIG_SYNTAX_RUBY;

	if (parg == NULL) {
		return true;
	}

	for (idx = 0; idx < narg; idx++) {
		char c = parg[idx];

		switch (c) {
			/* option bits */
			case 'i': collected |= ONIG_OPTION_IGNORECASE; break;
			case 'x': collected |= ONIG_OPTION_EXTEND; break;
			case 'm': collected |= ONIG_OPTION_MULTILINE; break;
			case 's': collected |= ONIG_OPTION_SINGLELINE; break;
			case 'p': collected |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE; break;
			case 'l': collected |= ONIG_OPTION_FIND_LONGEST; break;
			case 'n': collected |= ONIG_OPTION_FIND_NOT_EMPTY; break;

			/* syntax selectors */
			case 'j': *syntax = ONIG_SYNTAX_JAVA; break;
			case 'u': *syntax = ONIG_SYNTAX_GNU_REGEX; break;
			case 'g': *syntax = ONIG_SYNTAX_GREP; break;
			case 'c': *syntax = ONIG_SYNTAX_EMACS; break;
			case 'r': *syntax = ONIG_SYNTAX_RUBY; break;
			case 'z': *syntax = ONIG_SYNTAX_PERL; break;
			case 'b': *syntax = ONIG_SYNTAX_POSIX_BASIC; break;
			case 'd': *syntax = ONIG_SYNTAX_POSIX_EXTENDED; break;

			default:
				zend_value_error("Option \"%c\" is not supported", c);
				return false;
		}
	}

	if (option != NULL) {
		*option |= collected;
	}

	return true;
}
662 /* }}} */
663
664
665 /*
666 * Callbacks for named subpatterns
667 */
668
/* {{{ struct mb_regex_groups_iter_args */
/* Context passed to mb_regex_groups_iter() through onig_foreach_name(). */
typedef struct mb_regex_groups_iter_args {
	zval *groups;        /* array that receives one entry per group name */
	char *search_str;    /* subject that was matched */
	size_t search_len;   /* length of search_str in bytes */
	OnigRegion *region;  /* capture offsets of the match */
} mb_regex_groups_iter_args;
676 /* }}} */
677
/* {{{ mb_regex_groups_iter */
679 static int
mb_regex_groups_iter(const OnigUChar * name,const OnigUChar * name_end,int ngroup_num,int * group_nums,regex_t * reg,void * parg)680 mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngroup_num, int* group_nums, regex_t* reg, void* parg)
681 {
682 mb_regex_groups_iter_args *args = (mb_regex_groups_iter_args *) parg;
683 int gn, beg, end;
684
685 /*
686 * In case of duplicate groups, keep only the last succeeding one
687 * to be consistent with preg_match with the PCRE_DUPNAMES option.
688 */
689 gn = onig_name_to_backref_number(reg, name, name_end, args->region);
690 beg = args->region->beg[gn];
691 end = args->region->end[gn];
692 if (beg >= 0 && beg < end && end <= args->search_len) {
693 add_assoc_stringl_ex(args->groups, (char *)name, name_end - name, &args->search_str[beg], end - beg);
694 } else {
695 add_assoc_bool_ex(args->groups, (char *)name, name_end - name, 0);
696 }
697
698 return 0;
699 }
700 /* }}} */
701
702 /*
703 * Helper for _php_mb_regex_ereg_replace_exec
704 */
705 /* {{{ mb_regex_substitute */
/*
 * Appends the replacement string to pbuf, expanding back-references against
 * the match described by regs.
 *
 * Recognised forms are \0..\9, \k<name>, \k'name', \k<number> and
 * \k'number'. Numbered references are left as literal text when the regex
 * uses named groups only, and so is any reference that is malformed or
 * names a group that does not exist. A valid reference to a group that
 * captured nothing expands to nothing. A backslash is not an escape
 * character, so "\\1" is a backslash followed by \1.
 *
 * subject/subject_len: the string the match was made on.
 * replace/replace_len: the replacement template; never read past its end.
 * enc: encoding used to step over multibyte characters in the template.
 */
static inline void mb_regex_substitute(
	smart_str *pbuf,
	const char *subject,
	size_t subject_len,
	char *replace,
	size_t replace_len,
	php_mb_regex_t *regexp,
	OnigRegion *regs,
	const mbfl_encoding *enc
) {
	char *p, *sp, *eos;
	int no; /* backreference group number */
	int clen; /* byte-length of the current character */

	p = replace;
	eos = replace + replace_len;

	while (p < eos) {
		clen = (int) php_mb_mbchar_bytes_ex(p, enc);
		if (clen != 1 || p == eos || p[0] != '\\') {
			/* skip anything that's not an ascii backslash */
			if ((size_t) clen > (size_t) (eos - p)) {
				/* truncated multibyte character: copy only the bytes that exist */
				clen = (int) (eos - p);
			}
			smart_str_appendl(pbuf, p, clen);
			p += clen;
			continue;
		}
		sp = p; /* save position */
		clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
		if (clen != 1 || p == eos) {
			/* skip backslash followed by multibyte char */
			smart_str_appendl(pbuf, sp, p - sp);
			continue;
		}
		no = -1;
		switch (p[0]) {
			case '0':
				no = 0;
				p++;
				break;
			case '1': case '2': case '3': case '4':
			case '5': case '6': case '7': case '8': case '9':
				if (!onig_noname_group_capture_is_active(regexp)) {
					/*
					 * FIXME:
					 * Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern.
					 * For now we just ignore them, but in the future we might want to raise a warning
					 * and abort the whole replace operation.
					 */
					p++;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				no = p[0] - '0';
				p++;
				break;
			case 'k':
			{
				clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
				if (clen != 1 || p == eos || (p[0] != '<' && p[0] != '\'')) {
					/* not a backref delimiter; step over the character without leaving the template */
					if ((size_t) clen > (size_t) (eos - p)) {
						p = eos;
					} else {
						p += clen;
					}
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				/* try to consume everything until next delimiter */
				char delim = p[0] == '<' ? '>' : '\'';
				char *name, *name_end;
				char maybe_num = 1;
				name_end = name = p + 1;
				while (name_end < eos) {
					clen = (int) php_mb_mbchar_bytes_ex(name_end, enc);
					if (clen != 1) {
						name_end += clen;
						maybe_num = 0;
						continue;
					}
					if (name_end[0] == delim) break;
					/* <ctype.h> functions require an unsigned char value */
					if (maybe_num && !isdigit((unsigned char) name_end[0])) maybe_num = 0;
					name_end++;
				}
				if (name_end >= eos) {
					/* we failed to find the end delimiter: keep the rest of the template verbatim */
					p = eos;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				p = name_end + 1;
				if (name_end - name < 1) {
					/* the backref was empty */
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				/* we have either a name or a number */
				if (maybe_num) {
					if (!onig_noname_group_capture_is_active(regexp)) {
						/* see above note on mixing numbered & named backrefs */
						smart_str_appendl(pbuf, sp, p - sp);
						continue;
					}
					if (name_end - name == 1) {
						no = name[0] - '0';
						break;
					}
					if (name[0] == '0') {
						/* 01 is not a valid number */
						break;
					}
					{
						/* compare before narrowing so that a huge number cannot wrap into a valid index */
						unsigned long parsed = strtoul(name, NULL, 10);

						if (parsed < (unsigned long) regs->num_regs) {
							no = (int) parsed;
						}
					}
					break;
				}
				no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs);
				break;
			}
			default:
				/* We're not treating \ as an escape character and will interpret something like
				 * \\1 as \ followed by \1, rather than \\ followed by 1. This is because this
				 * function has not supported escaping of backslashes historically. */
				smart_str_appendl(pbuf, sp, p - sp);
				continue;
		}
		if (no < 0 || no >= regs->num_regs) {
			/* invalid group number reference, keep the escape sequence in the output */
			smart_str_appendl(pbuf, sp, p - sp);
			continue;
		}
		if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) {
			smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]);
		}
	}

	if (p < eos) {
		smart_str_appendl(pbuf, p, eos - p);
	}
}
833 /* }}} */
834
835 /*
836 * php functions
837 */
838
839 /* {{{ Returns the current encoding for regex as a string. */
PHP_FUNCTION(mb_regex_encoding)840 PHP_FUNCTION(mb_regex_encoding)
841 {
842 char *encoding = NULL;
843 size_t encoding_len;
844
845 if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s!", &encoding, &encoding_len) == FAILURE) {
846 RETURN_THROWS();
847 }
848
849 if (!encoding) {
850 const char *retval = php_mb_regex_get_mbctype();
851 ZEND_ASSERT(retval != NULL);
852
853 RETURN_STRING(retval);
854 } else {
855 if (php_mb_regex_set_mbctype(encoding) == FAILURE) {
856 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", encoding);
857 RETURN_THROWS();
858 }
859
860 /* TODO Make function return previous encoding? */
861 RETURN_TRUE;
862 }
863 }
864 /* }}} */
865
866 /* {{{ _php_mb_onig_search */
/*
 * onig_search() wrapper that applies the configured stack and retry limits
 * through a match-param object. A limit is applied only when
 * ZEND_LONG_UINT_OVFL() does not flag the configured value.
 *
 * Returns whatever the underlying search returns.
 */
static int _php_mb_onig_search(regex_t* reg, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start,
	const OnigUChar* range, OnigRegion* region, OnigOptionType option) {
	int status;
	OnigMatchParam *match_param = onig_new_match_param();

	onig_initialize_match_param(match_param);

	if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_stack_limit))) {
		onig_set_match_stack_limit_size_of_match_param(match_param, (unsigned int)MBSTRG(regex_stack_limit));
	}
	if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_retry_limit))) {
		onig_set_retry_limit_in_match_of_match_param(match_param, (unsigned int)MBSTRG(regex_retry_limit));
	}

	status = onig_search_with_param(reg, str, end, start, range, region, option, match_param);

	onig_free_match_param(match_param);

	return status;
}
883 /* }}} */
884
885
886 /* {{{ _php_mb_regex_ereg_exec */
/*
 * Shared implementation of mb_ereg() and mb_eregi().
 *
 * Parameters: pattern, string and an optional by-reference array. The array,
 * when given, is reset to an empty array before anything else happens and,
 * on a match, receives one entry per capture group (numbered first, then
 * named); groups without a non-empty capture are stored as false.
 *
 * Returns true on a match. Returns false when the subject is not valid in
 * the current encoding, when the pattern cannot be compiled, or when the
 * search yields no match or an error. An empty pattern raises a ValueError.
 */
static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase)
{
	zval *array = NULL;
	char *arg_pattern, *string;
	size_t arg_pattern_len, string_len;
	php_mb_regex_t *re;
	OnigRegion *regs;
	OnigOptionType options;
	OnigUChar *subject_begin, *subject_end;
	bool matched;

	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|z", &arg_pattern, &arg_pattern_len, &string, &string_len, &array) == FAILURE) {
		RETURN_THROWS();
	}

	if (arg_pattern_len == 0) {
		zend_argument_value_error(1, "must not be empty");
		RETURN_THROWS();
	}

	if (array != NULL) {
		array = zend_try_array_init(array);
		if (!array) {
			RETURN_THROWS();
		}
	}

	if (!php_mb_check_encoding(string, string_len, php_mb_regex_get_mbctype_encoding())) {
		RETURN_FALSE;
	}

	options = MBREX(regex_default_options);
	if (icase) {
		options |= ONIG_OPTION_IGNORECASE;
	}

	re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, MBREX(regex_default_syntax));
	if (re == NULL) {
		RETURN_FALSE;
	}

	regs = onig_region_new();
	subject_begin = (OnigUChar *)string;
	subject_end = (OnigUChar *)(string + string_len);

	/* actually execute the regular expression */
	matched = _php_mb_onig_search(re, subject_begin, subject_end, subject_begin, subject_end, regs, 0) >= 0;

	if (matched && array != NULL) {
		int idx;

		for (idx = 0; idx < regs->num_regs; idx++) {
			int beg = regs->beg[idx];
			int end = regs->end[idx];

			if (beg >= 0 && beg < end && (size_t)end <= string_len) {
				add_index_stringl(array, idx, (char *)&string[beg], end - beg);
			} else {
				add_index_bool(array, idx, 0);
			}
		}

		if (onig_number_of_names(re) > 0) {
			mb_regex_groups_iter_args args = {array, string, string_len, regs};
			onig_foreach_name(re, mb_regex_groups_iter, &args);
		}
	}

	if (regs != NULL) {
		onig_region_free(regs, 1);
	}

	RETURN_BOOL(matched);
}
965 /* }}} */
966
967 /* {{{ Regular expression match for multibyte string */
/* mb_ereg(): delegates to _php_mb_regex_ereg_exec() with icase = 0, i.e. without forcing case-insensitive matching. */
PHP_FUNCTION(mb_ereg)
{
	_php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
}
972 /* }}} */
973
974 /* {{{ Case-insensitive regular expression match for multibyte string */
/* mb_eregi(): delegates to _php_mb_regex_ereg_exec() with icase = 1, which adds ONIG_OPTION_IGNORECASE. */
PHP_FUNCTION(mb_eregi)
{
	_php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
}
979 /* }}} */
980
981 /* {{{ _php_mb_regex_ereg_replace_exec */
/*
 * Shared implementation of mb_ereg_replace(), mb_eregi_replace() and
 * mb_ereg_replace_callback().
 *
 * options:     option bits forced by the caller; OR-ed with either the bits
 *              from the user's option string or the configured defaults.
 * is_callable: non-zero when the second argument is a callback that
 *              receives the array of sub-patterns and returns the
 *              replacement; zero when it is a replacement template.
 *
 * Result: the rewritten string; NULL when the subject is not valid in the
 * current encoding; false when the pattern cannot be compiled or the search
 * reports an error (an E_WARNING is raised). An Error is thrown when the
 * callback cannot be invoked and no exception is pending. Every resource
 * acquired here is released on all of these paths.
 */
static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOptionType options, int is_callable)
{
	char *arg_pattern;
	size_t arg_pattern_len;

	char *replace;
	size_t replace_len;

	zend_fcall_info arg_replace_fci;
	zend_fcall_info_cache arg_replace_fci_cache;

	char *string;
	size_t string_len;

	php_mb_regex_t *re;
	OnigSyntaxType *syntax;
	OnigRegion *regs = NULL;
	smart_str out_buf = {0};
	int err, n;
	OnigUChar *pos;
	OnigUChar *string_lim;

	const mbfl_encoding *enc = php_mb_regex_get_mbctype_encoding();
	ZEND_ASSERT(enc != NULL);

	{
		char *option_str = NULL;
		size_t option_str_len = 0;

		if (!is_callable) {
			if (zend_parse_parameters(ZEND_NUM_ARGS(), "sss|s!",
						&arg_pattern, &arg_pattern_len,
						&replace, &replace_len,
						&string, &string_len,
						&option_str, &option_str_len) == FAILURE) {
				RETURN_THROWS();
			}
		} else {
			if (zend_parse_parameters(ZEND_NUM_ARGS(), "sfs|s!",
						&arg_pattern, &arg_pattern_len,
						&arg_replace_fci, &arg_replace_fci_cache,
						&string, &string_len,
						&option_str, &option_str_len) == FAILURE) {
				RETURN_THROWS();
			}
		}

		if (!php_mb_check_encoding(string, string_len, enc)) {
			RETURN_NULL();
		}

		if (option_str != NULL) {
			/* Initialize option and in case of failure it means there is a value error */
			if (!_php_mb_regex_init_options(option_str, option_str_len, &options, &syntax)) {
				RETURN_THROWS();
			}
		} else {
			options |= MBREX(regex_default_options);
			syntax = MBREX(regex_default_syntax);
		}
	}

	/* create regex pattern buffer */
	re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, syntax);
	if (re == NULL) {
		RETURN_FALSE;
	}

	/* do the actual work */
	err = 0;
	pos = (OnigUChar *)string;
	string_lim = (OnigUChar*)(string + string_len);
	regs = onig_region_new();
	while (err >= 0) {
		err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)string_lim, pos, (OnigUChar *)string_lim, regs, 0);
		if (err <= -2) {
			OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
			onig_error_code_to_str(err_str, err);
			php_error_docref(NULL, E_WARNING, "mbregex search failure in php_mbereg_replace_exec(): %s", err_str);
			break;
		}
		if (err >= 0) {
			/* copy the part of the string before the match */
			smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos));

			if (!is_callable) {
				mb_regex_substitute(&out_buf, string, string_len, replace, replace_len, re, regs, enc);
			} else {
				zval args[1];
				zval subpats, retval;
				int i;

				array_init(&subpats);
				for (i = 0; i < regs->num_regs; i++) {
					if (regs->beg[i] >= 0 && regs->end[i] >= regs->beg[i]) {
						add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]);
					} else {
						/* group took no part in the match: pass an empty string
						 * without forming a pointer in front of the subject */
						add_next_index_stringl(&subpats, "", 0);
					}
				}
				if (onig_number_of_names(re) > 0) {
					mb_regex_groups_iter_args iter_args = {&subpats, string, string_len, regs};
					onig_foreach_name(re, mb_regex_groups_iter, &iter_args);
				}

				ZVAL_COPY_VALUE(&args[0], &subpats);

				arg_replace_fci.param_count = 1;
				arg_replace_fci.params = args;
				arg_replace_fci.retval = &retval;
				if (zend_call_function(&arg_replace_fci, &arg_replace_fci_cache) == SUCCESS &&
						!Z_ISUNDEF(retval)) {
					convert_to_string(&retval);
					smart_str_appendl(&out_buf, Z_STRVAL(retval), Z_STRLEN(retval));
					zval_ptr_dtor(&retval);
				} else if (!EG(exception)) {
					zend_throw_error(NULL, "Unable to call custom replacement function");
					zval_ptr_dtor(&subpats);
					/* release everything acquired so far before bailing out */
					onig_region_free(regs, 1);
					smart_str_free(&out_buf);
					RETURN_THROWS();
				}
				zval_ptr_dtor(&subpats);
			}

			n = regs->end[0];
			if ((pos - (OnigUChar *)string) < n) {
				pos = (OnigUChar *)string + n;
			} else {
				/* empty match at the current position: copy one byte and move on */
				if (pos < string_lim) {
					smart_str_appendl(&out_buf, (char *)pos, 1);
				}
				pos++;
			}
		} else { /* nomatch */
			/* stick that last bit of string on our output */
			if (string_lim - pos > 0) {
				smart_str_appendl(&out_buf, (char *)pos, string_lim - pos);
			}
		}
		onig_region_free(regs, 0);
	}

	if (regs != NULL) {
		onig_region_free(regs, 1);
	}

	if (err <= -2) {
		smart_str_free(&out_buf);
		RETVAL_FALSE;
	} else if (out_buf.s) {
		smart_str_0(&out_buf);
		RETVAL_STR(out_buf.s);
	} else {
		RETVAL_EMPTY_STRING();
	}
}
1155 /* }}} */
1156
/* {{{ Replace regular expression for multibyte string */
PHP_FUNCTION(mb_ereg_replace)
{
	/* Thin entry point: all work happens in the shared replace routine.
	 * Second argument: extra Oniguruma option bits (none here; compare
	 * mb_eregi_replace, which passes ONIG_OPTION_IGNORECASE).
	 * Third argument: 0 here, 1 in mb_ereg_replace_callback -- presumably the
	 * "replacement is a callable" flag; confirm against the callee. */
	_php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
}
1162 /* }}} */
1163
/* {{{ Case insensitive replace regular expression for multibyte string */
PHP_FUNCTION(mb_eregi_replace)
{
	/* Same shared routine as mb_ereg_replace, but with
	 * ONIG_OPTION_IGNORECASE passed as the option argument. The last
	 * argument is 0, as in mb_ereg_replace. */
	_php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, ONIG_OPTION_IGNORECASE, 0);
}
1169 /* }}} */
1170
/* {{{ Regular expression replace for multibyte string using a replacement callback */
PHP_FUNCTION(mb_ereg_replace_callback)
{
	/* Same shared routine as mb_ereg_replace with no extra option bits; the
	 * final argument is 1 instead of 0 -- presumably selecting the callback
	 * code path of the callee (confirm against its definition). */
	_php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
}
1176 /* }}} */
1177
1178 /* {{{ split multibyte string into array by regular expression */
PHP_FUNCTION(mb_split)1179 PHP_FUNCTION(mb_split)
1180 {
1181 char *arg_pattern;
1182 size_t arg_pattern_len;
1183 php_mb_regex_t *re;
1184 OnigRegion *regs = NULL;
1185 char *string;
1186 OnigUChar *pos, *chunk_pos;
1187 size_t string_len;
1188
1189 int err;
1190 zend_long count = -1;
1191
1192 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &arg_pattern, &arg_pattern_len, &string, &string_len, &count) == FAILURE) {
1193 RETURN_THROWS();
1194 }
1195
1196 if (count > 0) {
1197 count--;
1198 }
1199
1200 if (!php_mb_check_encoding(string, string_len, php_mb_regex_get_mbctype_encoding())) {
1201 RETURN_FALSE;
1202 }
1203
1204 /* create regex pattern buffer */
1205 if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, MBREX(regex_default_options), MBREX(regex_default_syntax))) == NULL) {
1206 RETURN_FALSE;
1207 }
1208
1209 array_init(return_value);
1210
1211 chunk_pos = pos = (OnigUChar *)string;
1212 err = 0;
1213 regs = onig_region_new();
1214 /* churn through str, generating array entries as we go */
1215 while (count != 0 && (size_t)(pos - (OnigUChar *)string) < string_len) {
1216 size_t beg, end;
1217 err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), pos, (OnigUChar *)(string + string_len), regs, 0);
1218 if (err < 0) {
1219 break;
1220 }
1221 beg = regs->beg[0], end = regs->end[0];
1222 /* add it to the array */
1223 if ((size_t)(pos - (OnigUChar *)string) < end) {
1224 if (beg < string_len && beg >= (size_t)(chunk_pos - (OnigUChar *)string)) {
1225 add_next_index_stringl(return_value, (char *)chunk_pos, ((OnigUChar *)(string + beg) - chunk_pos));
1226 --count;
1227 } else {
1228 err = -2;
1229 break;
1230 }
1231 /* point at our new starting point */
1232 chunk_pos = pos = (OnigUChar *)string + end;
1233 } else {
1234 pos++;
1235 }
1236 onig_region_free(regs, 0);
1237 }
1238
1239 onig_region_free(regs, 1);
1240
1241 /* see if we encountered an error */
1242 // ToDo investigate if this can actually/should happen ...
1243 if (err <= -2) {
1244 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
1245 onig_error_code_to_str(err_str, err);
1246 php_error_docref(NULL, E_WARNING, "mbregex search failure in mbsplit(): %s", err_str);
1247 zend_array_destroy(Z_ARR_P(return_value));
1248 RETURN_FALSE;
1249 }
1250
1251 /* otherwise we just have one last element to add to the array */
1252 if ((OnigUChar *)(string + string_len) > chunk_pos) {
1253 size_t n = ((OnigUChar *)(string + string_len) - chunk_pos);
1254 add_next_index_stringl(return_value, (char *)chunk_pos, n);
1255 } else {
1256 add_next_index_stringl(return_value, "", 0);
1257 }
1258 }
1259 /* }}} */
1260
1261 /* {{{ Regular expression match for multibyte string */
PHP_FUNCTION(mb_ereg_match)1262 PHP_FUNCTION(mb_ereg_match)
1263 {
1264 char *arg_pattern;
1265 size_t arg_pattern_len;
1266
1267 char *string;
1268 size_t string_len;
1269
1270 php_mb_regex_t *re;
1271 OnigSyntaxType *syntax;
1272 OnigOptionType option = 0;
1273 int err;
1274 OnigMatchParam *mp;
1275
1276 {
1277 char *option_str = NULL;
1278 size_t option_str_len = 0;
1279
1280 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|s!",
1281 &arg_pattern, &arg_pattern_len, &string, &string_len,
1282 &option_str, &option_str_len)==FAILURE) {
1283 RETURN_THROWS();
1284 }
1285
1286 if (option_str != NULL) {
1287 if(!_php_mb_regex_init_options(option_str, option_str_len, &option, &syntax)) {
1288 RETURN_THROWS();
1289 }
1290 } else {
1291 option |= MBREX(regex_default_options);
1292 syntax = MBREX(regex_default_syntax);
1293 }
1294 }
1295
1296 if (!php_mb_check_encoding(string, string_len, php_mb_regex_get_mbctype_encoding())) {
1297 RETURN_FALSE;
1298 }
1299
1300 if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, syntax)) == NULL) {
1301 RETURN_FALSE;
1302 }
1303
1304 mp = onig_new_match_param();
1305 onig_initialize_match_param(mp);
1306 if (MBSTRG(regex_stack_limit) > 0 && MBSTRG(regex_stack_limit) < UINT_MAX) {
1307 onig_set_match_stack_limit_size_of_match_param(mp, (unsigned int)MBSTRG(regex_stack_limit));
1308 }
1309 if (MBSTRG(regex_retry_limit) > 0 && MBSTRG(regex_retry_limit) < UINT_MAX) {
1310 onig_set_retry_limit_in_match_of_match_param(mp, (unsigned int)MBSTRG(regex_retry_limit));
1311 }
1312 /* match */
1313 err = onig_match_with_param(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, NULL, 0, mp);
1314 onig_free_match_param(mp);
1315 if (err >= 0) {
1316 RETVAL_TRUE;
1317 } else {
1318 RETVAL_FALSE;
1319 }
1320 }
1321 /* }}} */
1322
1323 /* regex search */
1324 /* {{{ _php_mb_regex_ereg_search_exec */
_php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS,int mode)1325 static void _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode)
1326 {
1327 char *arg_pattern = NULL, *arg_options = NULL;
1328 size_t arg_pattern_len, arg_options_len;
1329 int err;
1330 size_t n, i, pos, len;
1331 /* Stored as int* in the OnigRegion struct */
1332 int beg, end;
1333 OnigOptionType option = 0;
1334 OnigUChar *str;
1335 OnigSyntaxType *syntax;
1336
1337 if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s!s!", &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
1338 RETURN_THROWS();
1339 }
1340
1341 if (arg_options) {
1342 _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax);
1343 } else {
1344 option |= MBREX(regex_default_options);
1345 syntax = MBREX(regex_default_syntax);
1346 }
1347
1348 if (MBREX(search_regs)) {
1349 onig_region_free(MBREX(search_regs), 1);
1350 MBREX(search_regs) = NULL;
1351 }
1352
1353 if (arg_pattern) {
1354 /* create regex pattern buffer */
1355 if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, syntax)) == NULL) {
1356 RETURN_FALSE;
1357 }
1358 }
1359
1360 pos = MBREX(search_pos);
1361 str = NULL;
1362 len = 0;
1363 if (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING){
1364 str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
1365 len = Z_STRLEN(MBREX(search_str));
1366 }
1367
1368 if (MBREX(search_re) == NULL) {
1369 zend_throw_error(NULL, "No pattern was provided");
1370 RETURN_THROWS();
1371 }
1372
1373 if (str == NULL) {
1374 zend_throw_error(NULL, "No string was provided");
1375 RETURN_THROWS();
1376 }
1377
1378 MBREX(search_regs) = onig_region_new();
1379
1380 err = _php_mb_onig_search(MBREX(search_re), str, str + len, str + pos, str + len, MBREX(search_regs), 0);
1381 if (err == ONIG_MISMATCH) {
1382 MBREX(search_pos) = len;
1383 RETVAL_FALSE;
1384 } else if (err <= -2) {
1385 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
1386 onig_error_code_to_str(err_str, err);
1387 php_error_docref(NULL, E_WARNING, "mbregex search failure in mbregex_search(): %s", err_str);
1388 RETVAL_FALSE;
1389 } else {
1390 switch (mode) {
1391 case 1:
1392 array_init(return_value);
1393 beg = MBREX(search_regs)->beg[0];
1394 end = MBREX(search_regs)->end[0];
1395 add_next_index_long(return_value, beg);
1396 add_next_index_long(return_value, end - beg);
1397 break;
1398 case 2:
1399 array_init(return_value);
1400 n = MBREX(search_regs)->num_regs;
1401 for (i = 0; i < n; i++) {
1402 beg = MBREX(search_regs)->beg[i];
1403 end = MBREX(search_regs)->end[i];
1404 if (beg >= 0 && beg <= end && end <= len) {
1405 add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
1406 } else {
1407 add_index_bool(return_value, i, 0);
1408 }
1409 }
1410 if (onig_number_of_names(MBREX(search_re)) > 0) {
1411 mb_regex_groups_iter_args args = {
1412 return_value,
1413 Z_STRVAL(MBREX(search_str)),
1414 Z_STRLEN(MBREX(search_str)),
1415 MBREX(search_regs)
1416 };
1417 onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
1418 }
1419 break;
1420 default:
1421 RETVAL_TRUE;
1422 break;
1423 }
1424 end = MBREX(search_regs)->end[0];
1425 if (pos <= end) {
1426 MBREX(search_pos) = end;
1427 } else {
1428 MBREX(search_pos) = pos + 1;
1429 }
1430 }
1431
1432 if (err < 0) {
1433 onig_region_free(MBREX(search_regs), 1);
1434 MBREX(search_regs) = (OnigRegion *)NULL;
1435 }
1436 }
1437 /* }}} */
1438
/* {{{ Regular expression search for multibyte string */
PHP_FUNCTION(mb_ereg_search)
{
	/* mode 0: the shared search routine sets the result to true on a match
	 * and false otherwise */
	_php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
}
1444 /* }}} */
1445
/* {{{ Regular expression search for multibyte string, returning the position and length of the match */
PHP_FUNCTION(mb_ereg_search_pos)
{
	/* mode 1: on a match the shared search routine returns an array holding
	 * the match start offset and the match length */
	_php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
}
1451 /* }}} */
1452
/* {{{ Regular expression search for multibyte string, returning the matched groups */
PHP_FUNCTION(mb_ereg_search_regs)
{
	/* mode 2: on a match the shared search routine returns an array with one
	 * entry per capture group */
	_php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 2);
}
1458 /* }}} */
1459
1460 /* {{{ Initialize string and regular expression for search. */
PHP_FUNCTION(mb_ereg_search_init)1461 PHP_FUNCTION(mb_ereg_search_init)
1462 {
1463 zend_string *arg_str;
1464 char *arg_pattern = NULL, *arg_options = NULL;
1465 size_t arg_pattern_len = 0, arg_options_len = 0;
1466 OnigSyntaxType *syntax = NULL;
1467 OnigOptionType option;
1468
1469 if (zend_parse_parameters(ZEND_NUM_ARGS(), "S|s!s!", &arg_str, &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
1470 RETURN_THROWS();
1471 }
1472
1473 if (arg_pattern && arg_pattern_len == 0) {
1474 zend_argument_value_error(2, "must not be empty");
1475 RETURN_THROWS();
1476 }
1477
1478 if (arg_options) {
1479 option = 0;
1480 _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax);
1481 } else {
1482 option = MBREX(regex_default_options);
1483 syntax = MBREX(regex_default_syntax);
1484 }
1485
1486 if (arg_pattern) {
1487 /* create regex pattern buffer */
1488 if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, syntax)) == NULL) {
1489 RETURN_FALSE;
1490 }
1491 }
1492
1493 if (!Z_ISNULL(MBREX(search_str))) {
1494 zval_ptr_dtor(&MBREX(search_str));
1495 }
1496
1497 ZVAL_STR_COPY(&MBREX(search_str), arg_str);
1498
1499 if (php_mb_check_encoding(ZSTR_VAL(arg_str), ZSTR_LEN(arg_str), php_mb_regex_get_mbctype_encoding())) {
1500 MBREX(search_pos) = 0;
1501 RETVAL_TRUE;
1502 } else {
1503 MBREX(search_pos) = ZSTR_LEN(arg_str);
1504 RETVAL_FALSE;
1505 }
1506
1507 if (MBREX(search_regs) != NULL) {
1508 onig_region_free(MBREX(search_regs), 1);
1509 MBREX(search_regs) = NULL;
1510 }
1511 }
1512 /* }}} */
1513
1514 /* {{{ Get matched substring of the last time */
PHP_FUNCTION(mb_ereg_search_getregs)1515 PHP_FUNCTION(mb_ereg_search_getregs)
1516 {
1517 size_t n, i, len;
1518 /* Stored as int* in the OnigRegion struct */
1519 int beg, end;
1520 OnigUChar *str;
1521
1522 if (zend_parse_parameters_none() == FAILURE) {
1523 RETURN_THROWS();
1524 }
1525
1526 if (MBREX(search_regs) != NULL && Z_TYPE(MBREX(search_str)) == IS_STRING) {
1527 array_init(return_value);
1528
1529 str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
1530 len = Z_STRLEN(MBREX(search_str));
1531 n = MBREX(search_regs)->num_regs;
1532 for (i = 0; i < n; i++) {
1533 beg = MBREX(search_regs)->beg[i];
1534 end = MBREX(search_regs)->end[i];
1535 if (beg >= 0 && beg <= end && end <= len) {
1536 add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
1537 } else {
1538 add_index_bool(return_value, i, 0);
1539 }
1540 }
1541 if (onig_number_of_names(MBREX(search_re)) > 0) {
1542 mb_regex_groups_iter_args args = {
1543 return_value,
1544 Z_STRVAL(MBREX(search_str)),
1545 len,
1546 MBREX(search_regs)
1547 };
1548 onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
1549 }
1550 } else {
1551 // TODO This seems to be some logical error, promote to Error
1552 RETVAL_FALSE;
1553 }
1554 }
1555 /* }}} */
1556
/* {{{ Get search start position */
PHP_FUNCTION(mb_ereg_search_getpos)
{
	/* takes no arguments */
	if (zend_parse_parameters_none() == FAILURE) {
		RETURN_THROWS();
	}

	/* report the stored search position as an integer */
	RETVAL_LONG(MBREX(search_pos));
}
1566 /* }}} */
1567
1568 /* {{{ Set search start position */
PHP_FUNCTION(mb_ereg_search_setpos)1569 PHP_FUNCTION(mb_ereg_search_setpos)
1570 {
1571 zend_long position;
1572
1573 if (zend_parse_parameters(ZEND_NUM_ARGS(), "l", &position) == FAILURE) {
1574 RETURN_THROWS();
1575 }
1576
1577 /* Accept negative position if length of search string can be determined */
1578 if ((position < 0) && (!Z_ISUNDEF(MBREX(search_str))) && (Z_TYPE(MBREX(search_str)) == IS_STRING)) {
1579 position += Z_STRLEN(MBREX(search_str));
1580 }
1581
1582 if (position < 0 || (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING && (size_t)position > Z_STRLEN(MBREX(search_str)))) {
1583 zend_argument_value_error(1, "is out of range");
1584 RETURN_THROWS();
1585 }
1586
1587 MBREX(search_pos) = position;
1588 // TODO Return void
1589 RETURN_TRUE;
1590 }
1591 /* }}} */
1592
1593 /* {{{ php_mb_regex_set_options */
_php_mb_regex_set_options(OnigOptionType options,OnigSyntaxType * syntax,OnigOptionType * prev_options,OnigSyntaxType ** prev_syntax)1594 static void _php_mb_regex_set_options(OnigOptionType options, OnigSyntaxType *syntax, OnigOptionType *prev_options, OnigSyntaxType **prev_syntax)
1595 {
1596 if (prev_options != NULL) {
1597 *prev_options = MBREX(regex_default_options);
1598 }
1599 if (prev_syntax != NULL) {
1600 *prev_syntax = MBREX(regex_default_syntax);
1601 }
1602 MBREX(regex_default_options) = options;
1603 MBREX(regex_default_syntax) = syntax;
1604 }
1605 /* }}} */
1606
1607 /* {{{ Set or get the default options for mbregex functions */
PHP_FUNCTION(mb_regex_set_options)1608 PHP_FUNCTION(mb_regex_set_options)
1609 {
1610 OnigOptionType opt, prev_opt;
1611 OnigSyntaxType *syntax, *prev_syntax;
1612 char *string = NULL;
1613 size_t string_len;
1614 char buf[16];
1615
1616 if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s!",
1617 &string, &string_len) == FAILURE) {
1618 RETURN_THROWS();
1619 }
1620 if (string != NULL) {
1621 opt = 0;
1622 syntax = NULL;
1623 if(!_php_mb_regex_init_options(string, string_len, &opt, &syntax)) {
1624 RETURN_THROWS();
1625 }
1626 _php_mb_regex_set_options(opt, syntax, &prev_opt, &prev_syntax);
1627 opt = prev_opt;
1628 syntax = prev_syntax;
1629 } else {
1630 opt = MBREX(regex_default_options);
1631 syntax = MBREX(regex_default_syntax);
1632 }
1633 _php_mb_regex_get_option_string(buf, sizeof(buf), opt, syntax);
1634
1635 RETVAL_STRING(buf);
1636 }
1637 /* }}} */
1638
1639 #endif /* HAVE_MBREGEX */
1640