1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
14 +----------------------------------------------------------------------+
15 */
16
17 #include "libmbfl/config.h"
18
19 #include "php.h"
20 #include "php_ini.h"
21
22 #ifdef HAVE_MBREGEX
23
24 #include "zend_smart_str.h"
25 #include "ext/standard/info.h"
26 #include "php_mbregex.h"
27 #include "mbstring.h"
28 #include "libmbfl/filters/mbfilter_utf8.h"
29
30 #include "php_onig_compat.h" /* must come prior to the oniguruma header */
31 #include <oniguruma.h>
32 #undef UChar
33
/*
 * Compatibility shims used when ONIGURUMA_VERSION_INT is undefined or below
 * 60800: the match-param calls used in this file become no-ops and the
 * *_with_param entry points forward to plain onig_search()/onig_match(),
 * dropping the trailing `mp` argument.
 */
#if !defined(ONIGURUMA_VERSION_INT) || ONIGURUMA_VERSION_INT < 60800
typedef void OnigMatchParam;
#define onig_new_match_param() (NULL)
#define onig_initialize_match_param(x) (void)(x)
#define onig_set_match_stack_limit_size_of_match_param(x, y)
#define onig_set_retry_limit_in_match_of_match_param(x, y)
#define onig_free_match_param(x)
#define onig_search_with_param(reg, str, end, start, range, region, option, mp) \
		onig_search(reg, str, end, start, range, region, option)
#define onig_match_with_param(re, str, end, at, region, option, mp) \
		onig_match(re, str, end, at, region, option)
#endif
46
ZEND_EXTERN_MODULE_GLOBALS(mbstring)

/* Written by PHP_MINIT_FUNCTION(mb_regex) as "major.minor.teeny". */
char php_mb_oniguruma_version[256];

/* Regex state of the mbstring module, reached through MBSTRG(mb_regex_globals). */
struct _zend_mb_regex_globals {
	/* encoding that current_mbctype is reset to at request shutdown */
	OnigEncoding default_mbctype;
	/* encoding used when compiling patterns */
	OnigEncoding current_mbctype;
	/* libmbfl encoding used to validate patterns and subjects */
	const mbfl_encoding *current_mbctype_mbfl_encoding;
	/* cache of compiled patterns, keyed by the pattern bytes */
	HashTable ht_rc;
	/* search state below is reset at request shutdown; its users are
	 * outside the part of the file visible here */
	zval search_str;
	/* NOTE(review): not referenced by any code visible in this chunk */
	zval *search_str_val;
	size_t search_pos;
	php_mb_regex_t *search_re;
	OnigRegion *search_regs;
	/* options / syntax used when the caller passes no option string */
	OnigOptionType regex_default_options;
	OnigSyntaxType *regex_default_syntax;
};

/* Shorthand for one field of the regex globals. */
#define MBREX(g) (MBSTRG(mb_regex_globals)->g)
66
67 /* {{{ static void php_mb_regex_free_cache() */
/* Element destructor for MBREX(ht_rc): frees the compiled regex stored in the slot. */
static void php_mb_regex_free_cache(zval *el)
{
	php_mb_regex_t *compiled = (php_mb_regex_t *) Z_PTR_P(el);

	onig_free(compiled);
}
71 /* }}} */
72
73 /* {{{ _php_mb_regex_globals_ctor */
/*
 * Puts a freshly allocated globals structure into its initial state:
 * UTF-8 as both default and current encoding, no pending search, and
 * Ruby syntax with the MULTILINE|SINGLELINE options as defaults.
 *
 * ht_rc is not touched here; it is set up per request in
 * PHP_RINIT_FUNCTION(mb_regex).
 *
 * Always returns SUCCESS.
 */
static int _php_mb_regex_globals_ctor(zend_mb_regex_globals *pglobals)
{
	pglobals->default_mbctype = ONIG_ENCODING_UTF8;
	pglobals->current_mbctype = ONIG_ENCODING_UTF8;
	pglobals->current_mbctype_mbfl_encoding = &mbfl_encoding_utf8;
	ZVAL_UNDEF(&pglobals->search_str);
	/* the structure comes from pemalloc(), so every pointer member has to be
	 * given a defined value explicitly */
	pglobals->search_str_val = NULL;
	pglobals->search_re = NULL;
	pglobals->search_pos = 0;
	pglobals->search_regs = NULL;
	pglobals->regex_default_options = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
	pglobals->regex_default_syntax = ONIG_SYNTAX_RUBY;
	return SUCCESS;
}
87 /* }}} */
88
89 /* {{{ php_mb_regex_globals_alloc */
php_mb_regex_globals_alloc(void)90 zend_mb_regex_globals *php_mb_regex_globals_alloc(void)
91 {
92 zend_mb_regex_globals *pglobals = pemalloc(
93 sizeof(zend_mb_regex_globals), 1);
94 if (SUCCESS != _php_mb_regex_globals_ctor(pglobals)) {
95 pefree(pglobals, 1);
96 return NULL;
97 }
98 return pglobals;
99 }
100 /* }}} */
101
102 /* {{{ php_mb_regex_globals_free */
/* Releases a structure obtained from php_mb_regex_globals_alloc(); NULL is ignored. */
void php_mb_regex_globals_free(zend_mb_regex_globals *pglobals)
{
	if (pglobals != NULL) {
		pefree(pglobals, 1);
	}
}
110 /* }}} */
111
112 /* {{{ PHP_MINIT_FUNCTION(mb_regex) */
PHP_MINIT_FUNCTION(mb_regex)113 PHP_MINIT_FUNCTION(mb_regex)
114 {
115 onig_init();
116
117 snprintf(php_mb_oniguruma_version, sizeof(php_mb_oniguruma_version), "%d.%d.%d",
118 ONIGURUMA_VERSION_MAJOR, ONIGURUMA_VERSION_MINOR, ONIGURUMA_VERSION_TEENY);
119
120 return SUCCESS;
121 }
122 /* }}} */
123
124 /* {{{ PHP_MSHUTDOWN_FUNCTION(mb_regex) */
PHP_MSHUTDOWN_FUNCTION(mb_regex)125 PHP_MSHUTDOWN_FUNCTION(mb_regex)
126 {
127 onig_end();
128 return SUCCESS;
129 }
130 /* }}} */
131
132 /* {{{ PHP_RINIT_FUNCTION(mb_regex) */
PHP_RINIT_FUNCTION(mb_regex)133 PHP_RINIT_FUNCTION(mb_regex)
134 {
135 if (!MBSTRG(mb_regex_globals)) return FAILURE;
136 zend_hash_init(&MBREX(ht_rc), 0, NULL, php_mb_regex_free_cache, 0);
137 return SUCCESS;
138 }
139 /* }}} */
140
141 /* {{{ PHP_RSHUTDOWN_FUNCTION(mb_regex) */
PHP_RSHUTDOWN_FUNCTION(mb_regex)142 PHP_RSHUTDOWN_FUNCTION(mb_regex)
143 {
144 MBREX(current_mbctype) = MBREX(default_mbctype);
145 MBREX(current_mbctype_mbfl_encoding) = mbfl_name2encoding(php_mb_regex_get_default_mbctype());
146
147 if (!Z_ISUNDEF(MBREX(search_str))) {
148 zval_ptr_dtor(&MBREX(search_str));
149 ZVAL_UNDEF(&MBREX(search_str));
150 }
151 MBREX(search_pos) = 0;
152 MBREX(search_re) = NULL;
153
154 if (MBREX(search_regs) != NULL) {
155 onig_region_free(MBREX(search_regs), 1);
156 MBREX(search_regs) = (OnigRegion *)NULL;
157 }
158 zend_hash_destroy(&MBREX(ht_rc));
159
160 return SUCCESS;
161 }
162 /* }}} */
163
164 /* {{{ PHP_MINFO_FUNCTION(mb_regex) */
PHP_MINFO_FUNCTION(mb_regex)165 PHP_MINFO_FUNCTION(mb_regex)
166 {
167 char buf[32];
168 php_info_print_table_start();
169 php_info_print_table_row(2, "Multibyte (japanese) regex support", "enabled");
170 snprintf(buf, sizeof(buf), "%d.%d.%d",
171 ONIGURUMA_VERSION_MAJOR,
172 ONIGURUMA_VERSION_MINOR,
173 ONIGURUMA_VERSION_TEENY);
174 php_info_print_table_row(2, "Multibyte regex (oniguruma) version", buf);
175 php_info_print_table_end();
176 }
177 /* }}} */
178
179 /*
180 * encoding name resolver
181 */
182
183 /* {{{ encoding name map */
/*
 * One row of the encoding name table.
 *
 * `names` packs all aliases of the encoding back to back, each terminated by
 * '\0'; the list ends at the empty string formed by the literal's own
 * terminator. The first alias is the one returned when mapping an encoding
 * back to a name.
 */
typedef struct _php_mb_regex_enc_name_map_t {
	const char *names;
	OnigEncoding code;
} php_mb_regex_enc_name_map_t;
188
189 static const php_mb_regex_enc_name_map_t enc_name_map[] = {
190 #ifdef ONIG_ENCODING_EUC_JP
191 {
192 "EUC-JP\0EUCJP\0X-EUC-JP\0UJIS\0EUCJP\0EUCJP-WIN\0",
193 ONIG_ENCODING_EUC_JP
194 },
195 #endif
196 #ifdef ONIG_ENCODING_UTF8
197 {
198 "UTF-8\0UTF8\0",
199 ONIG_ENCODING_UTF8
200 },
201 #endif
202 #ifdef ONIG_ENCODING_UTF16_BE
203 {
204 "UTF-16\0UTF-16BE\0",
205 ONIG_ENCODING_UTF16_BE
206 },
207 #endif
208 #ifdef ONIG_ENCODING_UTF16_LE
209 {
210 "UTF-16LE\0",
211 ONIG_ENCODING_UTF16_LE
212 },
213 #endif
214 #ifdef ONIG_ENCODING_UTF32_BE
215 {
216 "UCS-4\0UTF-32\0UTF-32BE\0",
217 ONIG_ENCODING_UTF32_BE
218 },
219 #endif
220 #ifdef ONIG_ENCODING_UTF32_LE
221 {
222 "UCS-4LE\0UTF-32LE\0",
223 ONIG_ENCODING_UTF32_LE
224 },
225 #endif
226 #ifdef ONIG_ENCODING_SJIS
227 {
228 "SJIS\0CP932\0MS932\0SHIFT_JIS\0SJIS-WIN\0WINDOWS-31J\0",
229 ONIG_ENCODING_SJIS
230 },
231 #endif
232 #ifdef ONIG_ENCODING_BIG5
233 {
234 "BIG5\0BIG-5\0BIGFIVE\0CN-BIG5\0BIG-FIVE\0",
235 ONIG_ENCODING_BIG5
236 },
237 #endif
238 #ifdef ONIG_ENCODING_EUC_CN
239 {
240 "EUC-CN\0EUCCN\0EUC_CN\0GB-2312\0GB2312\0",
241 ONIG_ENCODING_EUC_CN
242 },
243 #endif
244 #ifdef ONIG_ENCODING_EUC_TW
245 {
246 "EUC-TW\0EUCTW\0EUC_TW\0",
247 ONIG_ENCODING_EUC_TW
248 },
249 #endif
250 #ifdef ONIG_ENCODING_EUC_KR
251 {
252 "EUC-KR\0EUCKR\0EUC_KR\0",
253 ONIG_ENCODING_EUC_KR
254 },
255 #endif
256 #if defined(ONIG_ENCODING_KOI8) && !PHP_ONIG_BAD_KOI8_ENTRY
257 {
258 "KOI8\0KOI-8\0",
259 ONIG_ENCODING_KOI8
260 },
261 #endif
262 #ifdef ONIG_ENCODING_KOI8_R
263 {
264 "KOI8R\0KOI8-R\0KOI-8R\0",
265 ONIG_ENCODING_KOI8_R
266 },
267 #endif
268 #ifdef ONIG_ENCODING_ISO_8859_1
269 {
270 "ISO-8859-1\0ISO8859-1\0",
271 ONIG_ENCODING_ISO_8859_1
272 },
273 #endif
274 #ifdef ONIG_ENCODING_ISO_8859_2
275 {
276 "ISO-8859-2\0ISO8859-2\0",
277 ONIG_ENCODING_ISO_8859_2
278 },
279 #endif
280 #ifdef ONIG_ENCODING_ISO_8859_3
281 {
282 "ISO-8859-3\0ISO8859-3\0",
283 ONIG_ENCODING_ISO_8859_3
284 },
285 #endif
286 #ifdef ONIG_ENCODING_ISO_8859_4
287 {
288 "ISO-8859-4\0ISO8859-4\0",
289 ONIG_ENCODING_ISO_8859_4
290 },
291 #endif
292 #ifdef ONIG_ENCODING_ISO_8859_5
293 {
294 "ISO-8859-5\0ISO8859-5\0",
295 ONIG_ENCODING_ISO_8859_5
296 },
297 #endif
298 #ifdef ONIG_ENCODING_ISO_8859_6
299 {
300 "ISO-8859-6\0ISO8859-6\0",
301 ONIG_ENCODING_ISO_8859_6
302 },
303 #endif
304 #ifdef ONIG_ENCODING_ISO_8859_7
305 {
306 "ISO-8859-7\0ISO8859-7\0",
307 ONIG_ENCODING_ISO_8859_7
308 },
309 #endif
310 #ifdef ONIG_ENCODING_ISO_8859_8
311 {
312 "ISO-8859-8\0ISO8859-8\0",
313 ONIG_ENCODING_ISO_8859_8
314 },
315 #endif
316 #ifdef ONIG_ENCODING_ISO_8859_9
317 {
318 "ISO-8859-9\0ISO8859-9\0",
319 ONIG_ENCODING_ISO_8859_9
320 },
321 #endif
322 #ifdef ONIG_ENCODING_ISO_8859_10
323 {
324 "ISO-8859-10\0ISO8859-10\0",
325 ONIG_ENCODING_ISO_8859_10
326 },
327 #endif
328 #ifdef ONIG_ENCODING_ISO_8859_11
329 {
330 "ISO-8859-11\0ISO8859-11\0",
331 ONIG_ENCODING_ISO_8859_11
332 },
333 #endif
334 #ifdef ONIG_ENCODING_ISO_8859_13
335 {
336 "ISO-8859-13\0ISO8859-13\0",
337 ONIG_ENCODING_ISO_8859_13
338 },
339 #endif
340 #ifdef ONIG_ENCODING_ISO_8859_14
341 {
342 "ISO-8859-14\0ISO8859-14\0",
343 ONIG_ENCODING_ISO_8859_14
344 },
345 #endif
346 #ifdef ONIG_ENCODING_ISO_8859_15
347 {
348 "ISO-8859-15\0ISO8859-15\0",
349 ONIG_ENCODING_ISO_8859_15
350 },
351 #endif
352 #ifdef ONIG_ENCODING_ISO_8859_16
353 {
354 "ISO-8859-16\0ISO8859-16\0",
355 ONIG_ENCODING_ISO_8859_16
356 },
357 #endif
358 #ifdef ONIG_ENCODING_ASCII
359 {
360 "ASCII\0US-ASCII\0US_ASCII\0ISO646\0",
361 ONIG_ENCODING_ASCII
362 },
363 #endif
364 { NULL, ONIG_ENCODING_UNDEF }
365 };
366 /* }}} */
367
368 /* {{{ php_mb_regex_name2mbctype */
_php_mb_regex_name2mbctype(const char * pname)369 static OnigEncoding _php_mb_regex_name2mbctype(const char *pname)
370 {
371 const char *p;
372 const php_mb_regex_enc_name_map_t *mapping;
373
374 if (pname == NULL || !*pname) {
375 return ONIG_ENCODING_UNDEF;
376 }
377
378 for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
379 for (p = mapping->names; *p != '\0'; p += (strlen(p) + 1)) {
380 if (strcasecmp(p, pname) == 0) {
381 return mapping->code;
382 }
383 }
384 }
385
386 return ONIG_ENCODING_UNDEF;
387 }
388 /* }}} */
389
390 /* {{{ php_mb_regex_mbctype2name */
/*
 * Maps an Oniguruma encoding back to a name: the start of the alias list of
 * the first enc_name_map entry with that encoding, i.e. its first alias.
 * Returns NULL if the encoding is not in the table.
 */
static const char *_php_mb_regex_mbctype2name(OnigEncoding mbctype)
{
	const php_mb_regex_enc_name_map_t *entry = enc_name_map;

	while (entry->names != NULL) {
		if (entry->code == mbctype) {
			return entry->names;
		}
		entry++;
	}

	return NULL;
}
403 /* }}} */
404
405 /* {{{ php_mb_regex_set_mbctype */
php_mb_regex_set_mbctype(const char * encname)406 int php_mb_regex_set_mbctype(const char *encname)
407 {
408 OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
409 if (mbctype == ONIG_ENCODING_UNDEF) {
410 return FAILURE;
411 }
412 MBREX(current_mbctype) = mbctype;
413 MBREX(current_mbctype_mbfl_encoding) = mbfl_name2encoding(encname);
414 return SUCCESS;
415 }
416 /* }}} */
417
418 /* {{{ php_mb_regex_set_default_mbctype */
php_mb_regex_set_default_mbctype(const char * encname)419 int php_mb_regex_set_default_mbctype(const char *encname)
420 {
421 OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
422 if (mbctype == ONIG_ENCODING_UNDEF) {
423 return FAILURE;
424 }
425 MBREX(default_mbctype) = mbctype;
426 return SUCCESS;
427 }
428 /* }}} */
429
430 /* {{{ php_mb_regex_get_mbctype */
php_mb_regex_get_mbctype(void)431 const char *php_mb_regex_get_mbctype(void)
432 {
433 return _php_mb_regex_mbctype2name(MBREX(current_mbctype));
434 }
435 /* }}} */
436
437 /* {{{ php_mb_regex_get_mbctype_encoding */
php_mb_regex_get_mbctype_encoding(void)438 const mbfl_encoding *php_mb_regex_get_mbctype_encoding(void)
439 {
440 return MBREX(current_mbctype_mbfl_encoding);
441 }
442 /* }}} */
443
444 /* {{{ php_mb_regex_get_default_mbctype */
php_mb_regex_get_default_mbctype(void)445 const char *php_mb_regex_get_default_mbctype(void)
446 {
447 return _php_mb_regex_mbctype2name(MBREX(default_mbctype));
448 }
449 /* }}} */
450
451 /*
452 * regex cache
453 */
454 /* {{{ php_mbregex_compile_pattern */
php_mbregex_compile_pattern(const char * pattern,size_t patlen,OnigOptionType options,OnigSyntaxType * syntax)455 static php_mb_regex_t *php_mbregex_compile_pattern(const char *pattern, size_t patlen, OnigOptionType options, OnigSyntaxType *syntax)
456 {
457 int err_code = 0;
458 php_mb_regex_t *retval = NULL, *rc = NULL;
459 OnigErrorInfo err_info;
460 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
461 OnigEncoding enc = MBREX(current_mbctype);
462
463 if (!php_mb_check_encoding(pattern, patlen, php_mb_regex_get_mbctype_encoding())) {
464 php_error_docref(NULL, E_WARNING,
465 "Pattern is not valid under %s encoding", _php_mb_regex_mbctype2name(enc));
466 return NULL;
467 }
468
469 rc = zend_hash_str_find_ptr(&MBREX(ht_rc), (char *)pattern, patlen);
470 if (!rc || onig_get_options(rc) != options || onig_get_encoding(rc) != enc || onig_get_syntax(rc) != syntax) {
471 if ((err_code = onig_new(&retval, (OnigUChar *)pattern, (OnigUChar *)(pattern + patlen), options, enc, syntax, &err_info)) != ONIG_NORMAL) {
472 onig_error_code_to_str(err_str, err_code, &err_info);
473 php_error_docref(NULL, E_WARNING, "mbregex compile err: %s", err_str);
474 return NULL;
475 }
476 if (rc == MBREX(search_re)) {
477 /* reuse the new rc? see bug #72399 */
478 MBREX(search_re) = NULL;
479 }
480 zend_hash_str_update_ptr(&MBREX(ht_rc), (char *)pattern, patlen, retval);
481 } else {
482 retval = rc;
483 }
484 return retval;
485 }
486 /* }}} */
487
488 /* {{{ _php_mb_regex_get_option_string */
/* Stores `ch` at *cursor if room is left, and always counts it as required. */
static inline void _php_mb_regex_put_option_char(char **cursor, size_t *room, size_t *needed, char ch)
{
	if (*room > 0) {
		--*room;
		*(*cursor)++ = ch;
	}
	++*needed;
}

/*
 * Writes the option-letter representation of `option` and `syntax` into
 * `str` (capacity `len`, terminating NUL included).
 *
 * Letters are emitted in the order i, x, p (or m / s when only one of
 * MULTILINE / SINGLELINE is set), l, n, followed by one syntax letter if the
 * syntax is a known one. Output is silently truncated when the buffer is too
 * small.
 *
 * Returns 0 if everything fitted, otherwise the number of bytes required.
 */
static size_t _php_mb_regex_get_option_string(char *str, size_t len, OnigOptionType option, OnigSyntaxType *syntax)
{
	const OnigOptionType both_lines = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
	char *cursor = str;
	size_t room = len;
	size_t needed = 0;
	char syntax_letter = 0;

	if (option & ONIG_OPTION_IGNORECASE) {
		_php_mb_regex_put_option_char(&cursor, &room, &needed, 'i');
	}
	if (option & ONIG_OPTION_EXTEND) {
		_php_mb_regex_put_option_char(&cursor, &room, &needed, 'x');
	}

	if ((option & both_lines) == both_lines) {
		_php_mb_regex_put_option_char(&cursor, &room, &needed, 'p');
	} else {
		if (option & ONIG_OPTION_MULTILINE) {
			_php_mb_regex_put_option_char(&cursor, &room, &needed, 'm');
		}
		if (option & ONIG_OPTION_SINGLELINE) {
			_php_mb_regex_put_option_char(&cursor, &room, &needed, 's');
		}
	}

	if (option & ONIG_OPTION_FIND_LONGEST) {
		_php_mb_regex_put_option_char(&cursor, &room, &needed, 'l');
	}
	if (option & ONIG_OPTION_FIND_NOT_EMPTY) {
		_php_mb_regex_put_option_char(&cursor, &room, &needed, 'n');
	}

	if (syntax == ONIG_SYNTAX_JAVA) {
		syntax_letter = 'j';
	} else if (syntax == ONIG_SYNTAX_GNU_REGEX) {
		syntax_letter = 'u';
	} else if (syntax == ONIG_SYNTAX_GREP) {
		syntax_letter = 'g';
	} else if (syntax == ONIG_SYNTAX_EMACS) {
		syntax_letter = 'c';
	} else if (syntax == ONIG_SYNTAX_RUBY) {
		syntax_letter = 'r';
	} else if (syntax == ONIG_SYNTAX_PERL) {
		syntax_letter = 'z';
	} else if (syntax == ONIG_SYNTAX_POSIX_BASIC) {
		syntax_letter = 'b';
	} else if (syntax == ONIG_SYNTAX_POSIX_EXTENDED) {
		syntax_letter = 'd';
	}

	if (syntax_letter != 0) {
		_php_mb_regex_put_option_char(&cursor, &room, &needed, syntax_letter);
	}

	/* terminating NUL counts towards the required size as well */
	_php_mb_regex_put_option_char(&cursor, &room, &needed, '\0');

	return (len < needed) ? needed : 0;
}
591 /* }}} */
592
593 /* {{{ _php_mb_regex_init_options */
/*
 * Parses an option string of `narg` bytes.
 *
 * Option letters (i, x, m, s, p, l, n) are OR-ed into *option when `option`
 * is non-NULL; syntax letters (j, u, g, c, r, z, b, d) select *syntax, the
 * last one winning. *syntax always starts out as ONIG_SYNTAX_RUBY. A NULL
 * `parg` leaves *option untouched.
 *
 * Returns false after throwing a ValueError on the first unsupported letter
 * (in which case *option is not modified), true otherwise.
 */
static bool _php_mb_regex_init_options(const char *parg, size_t narg, OnigOptionType *option,
	OnigSyntaxType **syntax)
{
	OnigOptionType collected = 0;

	*syntax = ONIG_SYNTAX_RUBY;

	if (parg == NULL) {
		return true;
	}

	for (size_t i = 0; i < narg; i++) {
		const char letter = parg[i];

		switch (letter) {
			case 'i': collected |= ONIG_OPTION_IGNORECASE; break;
			case 'x': collected |= ONIG_OPTION_EXTEND; break;
			case 'm': collected |= ONIG_OPTION_MULTILINE; break;
			case 's': collected |= ONIG_OPTION_SINGLELINE; break;
			case 'p': collected |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE; break;
			case 'l': collected |= ONIG_OPTION_FIND_LONGEST; break;
			case 'n': collected |= ONIG_OPTION_FIND_NOT_EMPTY; break;
			case 'j': *syntax = ONIG_SYNTAX_JAVA; break;
			case 'u': *syntax = ONIG_SYNTAX_GNU_REGEX; break;
			case 'g': *syntax = ONIG_SYNTAX_GREP; break;
			case 'c': *syntax = ONIG_SYNTAX_EMACS; break;
			case 'r': *syntax = ONIG_SYNTAX_RUBY; break;
			case 'z': *syntax = ONIG_SYNTAX_PERL; break;
			case 'b': *syntax = ONIG_SYNTAX_POSIX_BASIC; break;
			case 'd': *syntax = ONIG_SYNTAX_POSIX_EXTENDED; break;
			default:
				zend_value_error("Option \"%c\" is not supported", letter);
				return false;
		}
	}

	if (option != NULL) {
		*option |= collected;
	}

	return true;
}
662 /* }}} */
663
664
665 /*
666 * Callbacks for named subpatterns
667 */
668
/* {{{ struct mb_regex_groups_iter_args */
/* Context handed to mb_regex_groups_iter() through onig_foreach_name(). */
typedef struct mb_regex_groups_iter_args {
	zval *groups;        /* array receiving one entry per group name */
	char *search_str;    /* subject string the region offsets refer to */
	size_t search_len;   /* length of search_str in bytes */
	OnigRegion *region;  /* capture offsets of the match being reported */
} mb_regex_groups_iter_args;
676 /* }}} */
677
/* {{{ mb_regex_groups_iter */
679 static int
mb_regex_groups_iter(const OnigUChar * name,const OnigUChar * name_end,int ngroup_num,int * group_nums,regex_t * reg,void * parg)680 mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngroup_num, int* group_nums, regex_t* reg, void* parg)
681 {
682 mb_regex_groups_iter_args *args = (mb_regex_groups_iter_args *) parg;
683 int gn, beg, end;
684
685 /*
686 * In case of duplicate groups, keep only the last succeeding one
687 * to be consistent with preg_match with the PCRE_DUPNAMES option.
688 */
689 gn = onig_name_to_backref_number(reg, name, name_end, args->region);
690 beg = args->region->beg[gn];
691 end = args->region->end[gn];
692 if (beg >= 0 && beg < end && end <= args->search_len) {
693 add_assoc_stringl_ex(args->groups, (char *)name, name_end - name, &args->search_str[beg], end - beg);
694 } else {
695 add_assoc_bool_ex(args->groups, (char *)name, name_end - name, 0);
696 }
697
698 return 0;
699 }
700 /* }}} */
701
702 /*
703 * Helper for _php_mb_regex_ereg_replace_exec
704 */
705 /* {{{ mb_regex_substitute */
/*
 * Expands one replacement template into `pbuf` for the match described by
 * `regs`.
 *
 * Recognised escapes (the backslash and the character after it must both be
 * single-byte characters in `enc`):
 *   \0 .. \9               numbered group
 *   \k<name>  \k'name'     named group
 *   \k<number> \k'number'  numbered group (no leading zero for multi-digit)
 * Numbered references are copied through literally when the pattern's
 * unnamed groups are not capturing (onig_noname_group_capture_is_active()).
 * Any escape that cannot be resolved to a group of this match is copied to
 * the output unchanged. A resolved group that did not capture a non-empty
 * span inside `subject` contributes nothing.
 *
 * Nothing outside [replace, replace + replace_len) is ever copied to the
 * output: a truncated multibyte sequence, a "\k" at the very end and an
 * unterminated "\k<..." are all clamped to the end of the template.
 */
static inline void mb_regex_substitute(
	smart_str *pbuf,
	const char *subject,
	size_t subject_len,
	char *replace,
	size_t replace_len,
	php_mb_regex_t *regexp,
	OnigRegion *regs,
	const mbfl_encoding *enc
) {
	char *p, *sp, *eos;
	int no; /* backreference group number */
	int clen; /* byte-length of the current character */

	p = replace;
	eos = replace + replace_len;

	while (p < eos) {
		clen = (int) php_mb_mbchar_bytes(p, enc);
		if (clen != 1 || p[0] != '\\') {
			/* copy anything that's not an ascii backslash; a truncated
			 * trailing multibyte sequence must not drag in bytes past eos */
			if (clen > eos - p) {
				clen = (int) (eos - p);
			}
			smart_str_appendl(pbuf, p, clen);
			p += clen;
			continue;
		}
		sp = p; /* save position */
		p++;
		if (p == eos || php_mb_mbchar_bytes(p, enc) != 1) {
			/* trailing backslash, or backslash followed by multibyte char */
			smart_str_appendl(pbuf, sp, p - sp);
			continue;
		}
		no = -1;
		switch (p[0]) {
			case '0':
				no = 0;
				p++;
				break;
			case '1': case '2': case '3': case '4':
			case '5': case '6': case '7': case '8': case '9':
				if (!onig_noname_group_capture_is_active(regexp)) {
					/*
					 * FIXME:
					 * Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern.
					 * For now we just ignore them, but in the future we might want to raise a warning
					 * and abort the whole replace operation.
					 */
					p++;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				no = p[0] - '0';
				p++;
				break;
			case 'k':
			{
				p++;
				if (p == eos) {
					/* "\k" ends the template: keep it literally */
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				clen = (int) php_mb_mbchar_bytes(p, enc);
				if (clen != 1 || (p[0] != '<' && p[0] != '\'')) {
					/* not a backref delimiter */
					if (clen > eos - p) {
						clen = (int) (eos - p);
					}
					p += clen;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				/* try to consume everything until next delimiter */
				char delim = p[0] == '<' ? '>' : '\'';
				char *name, *name_end;
				char maybe_num = 1;
				name_end = name = p + 1;
				while (name_end < eos) {
					clen = (int) php_mb_mbchar_bytes(name_end, enc);
					if (clen != 1) {
						name_end += clen;
						maybe_num = 0;
						continue;
					}
					if (name_end[0] == delim) break;
					/* <ctype.h> functions need an unsigned char value */
					if (maybe_num && !isdigit((unsigned char) name_end[0])) maybe_num = 0;
					name_end++;
				}
				if (name_end >= eos) {
					/* we failed to find the end delimiter: keep the rest of
					 * the template literally, stopping exactly at eos */
					p = eos;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				p = name_end + 1;
				if (name_end - name < 1) {
					/* the backref was empty */
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				/* we have either a name or a number */
				if (maybe_num) {
					if (!onig_noname_group_capture_is_active(regexp)) {
						/* see above note on mixing numbered & named backrefs */
						smart_str_appendl(pbuf, sp, p - sp);
						continue;
					}
					if (name_end - name == 1) {
						no = name[0] - '0';
						break;
					}
					if (name[0] == '0') {
						/* 01 is not a valid number */
						break;
					}
					/* only accept values that name an existing group, so an
					 * oversized number cannot wrap into a valid index */
					unsigned long parsed = strtoul(name, NULL, 10);
					if (parsed < (unsigned long) regs->num_regs) {
						no = (int) parsed;
					}
					break;
				}
				no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs);
				break;
			}
			default:
				/* We're not treating \ as an escape character and will interpret something like
				 * \\1 as \ followed by \1, rather than \\ followed by 1. This is because this
				 * function has not supported escaping of backslashes historically. */
				smart_str_appendl(pbuf, sp, p - sp);
				continue;
		}
		if (no < 0 || no >= regs->num_regs) {
			/* invalid group number reference, keep the escape sequence in the output */
			smart_str_appendl(pbuf, sp, p - sp);
			continue;
		}
		if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) {
			smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]);
		}
	}

	if (p < eos) {
		smart_str_appendl(pbuf, p, eos - p);
	}
}
833 /* }}} */
834
835 /*
836 * php functions
837 */
838
839 /* {{{ Returns the current encoding for regex as a string. */
PHP_FUNCTION(mb_regex_encoding)840 PHP_FUNCTION(mb_regex_encoding)
841 {
842 char *encoding = NULL;
843 size_t encoding_len;
844
845 if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s!", &encoding, &encoding_len) == FAILURE) {
846 RETURN_THROWS();
847 }
848
849 if (!encoding) {
850 const char *retval = php_mb_regex_get_mbctype();
851 ZEND_ASSERT(retval != NULL);
852
853 RETURN_STRING(retval);
854 } else {
855 if (php_mb_regex_set_mbctype(encoding) == FAILURE) {
856 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", encoding);
857 RETURN_THROWS();
858 }
859
860 /* TODO Make function return previous encoding? */
861 RETURN_TRUE;
862 }
863 }
864 /* }}} */
865
866 /* {{{ _php_mb_onig_search */
/*
 * onig_search() wrapper that applies the mbstring regex_stack_limit and
 * regex_retry_limit settings through a match param. A limit is applied only
 * when its value passes the ZEND_LONG_UINT_OVFL() check. Returns whatever
 * the underlying search returns.
 */
static int _php_mb_onig_search(regex_t* reg, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start,
                               const OnigUChar* range, OnigRegion* region, OnigOptionType option) {
	int result;
	OnigMatchParam *param = onig_new_match_param();

	onig_initialize_match_param(param);

	if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_stack_limit))) {
		onig_set_match_stack_limit_size_of_match_param(param, (unsigned int)MBSTRG(regex_stack_limit));
	}
	if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_retry_limit))) {
		onig_set_retry_limit_in_match_of_match_param(param, (unsigned int)MBSTRG(regex_retry_limit));
	}

	result = onig_search_with_param(reg, str, end, start, range, region, option, param);

	onig_free_match_param(param);
	return result;
}
883 /* }}} */
884
885
886 /* {{{ _php_mb_regex_ereg_exec */
/*
 * Shared implementation of mb_ereg() and mb_eregi().
 *
 * Parameters: pattern, subject and an optional by-reference value that is
 * turned into an array. On a match the array receives every group by index
 * (false for groups without a non-empty capture) and, if the pattern has
 * named groups, by name as well.
 *
 * Returns true on a match; false if the subject is not valid in the current
 * regex encoding, the pattern fails to compile, or the search returns a
 * negative value. Throws a ValueError for an empty pattern.
 *
 * icase: non-zero adds ONIG_OPTION_IGNORECASE to the default options.
 */
static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase)
{
	char *arg_pattern, *string;
	size_t arg_pattern_len, string_len;
	zval *array = NULL;
	php_mb_regex_t *re;
	OnigRegion *regs;
	OnigOptionType options;
	OnigUChar *subject_start, *subject_end;
	bool matched;

	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|z", &arg_pattern, &arg_pattern_len, &string, &string_len, &array) == FAILURE) {
		RETURN_THROWS();
	}

	if (arg_pattern_len == 0) {
		zend_argument_value_error(1, "must not be empty");
		RETURN_THROWS();
	}

	if (array != NULL) {
		array = zend_try_array_init(array);
		if (!array) {
			RETURN_THROWS();
		}
	}

	if (!php_mb_check_encoding(string, string_len, php_mb_regex_get_mbctype_encoding())) {
		RETURN_FALSE;
	}

	options = MBREX(regex_default_options);
	if (icase) {
		options |= ONIG_OPTION_IGNORECASE;
	}

	re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, MBREX(regex_default_syntax));
	if (re == NULL) {
		/* no region has been allocated yet, nothing to clean up */
		RETURN_FALSE;
	}

	regs = onig_region_new();
	subject_start = (OnigUChar *)string;
	subject_end = (OnigUChar *)(string + string_len);

	/* actually execute the regular expression */
	matched = _php_mb_onig_search(re, subject_start, subject_end, subject_start, subject_end, regs, 0) >= 0;

	if (matched && array != NULL) {
		for (int i = 0; i < regs->num_regs; i++) {
			int beg = regs->beg[i];
			int end = regs->end[i];

			if (beg >= 0 && beg < end && (size_t)end <= string_len) {
				add_index_stringl(array, i, (char *)&string[beg], end - beg);
			} else {
				add_index_bool(array, i, 0);
			}
		}

		if (onig_number_of_names(re) > 0) {
			mb_regex_groups_iter_args args = {array, string, string_len, regs};
			onig_foreach_name(re, mb_regex_groups_iter, &args);
		}
	}

	if (matched) {
		RETVAL_TRUE;
	} else {
		RETVAL_FALSE;
	}

	if (regs != NULL) {
		onig_region_free(regs, 1);
	}
}
965 /* }}} */
966
967 /* {{{ Regular expression match for multibyte string */
PHP_FUNCTION(mb_ereg)968 PHP_FUNCTION(mb_ereg)
969 {
970 _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
971 }
972 /* }}} */
973
974 /* {{{ Case-insensitive regular expression match for multibyte string */
PHP_FUNCTION(mb_eregi)975 PHP_FUNCTION(mb_eregi)
976 {
977 _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
978 }
979 /* }}} */
980
981 /* {{{ _php_mb_regex_ereg_replace_exec */
/*
 * Shared implementation of mb_ereg_replace(), mb_eregi_replace() and
 * mb_ereg_replace_callback().
 *
 * options:     option bits forced by the caller; an option string from the
 *              user is OR-ed on top, otherwise the default options are.
 * is_callable: zero     - argument 2 is a replacement template expanded with
 *                         mb_regex_substitute();
 *              non-zero - argument 2 is a callback that receives the array of
 *                         captures and whose result, converted to string, is
 *                         the replacement.
 *
 * Returns the resulting string; null if the subject is not valid in the
 * current regex encoding; false if the pattern does not compile or the
 * search reports an error (value <= -2, with a warning). If the callback
 * cannot be called or throws, processing stops at once, every buffer is
 * released and the function returns with the exception pending.
 */
static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOptionType options, int is_callable)
{
	char *arg_pattern;
	size_t arg_pattern_len;

	char *replace;
	size_t replace_len;

	zend_fcall_info arg_replace_fci;
	zend_fcall_info_cache arg_replace_fci_cache;

	char *string;
	size_t string_len;

	php_mb_regex_t *re;
	OnigSyntaxType *syntax;
	OnigRegion *regs = NULL;
	smart_str out_buf = {0};
	smart_str eval_buf = {0};
	smart_str *pbuf;
	int err, n;
	OnigUChar *pos;
	OnigUChar *string_lim;
	char *description = NULL;
	bool callback_failed = false;

	const mbfl_encoding *enc = php_mb_regex_get_mbctype_encoding();
	ZEND_ASSERT(enc != NULL);

	{
		char *option_str = NULL;
		size_t option_str_len = 0;

		if (!is_callable) {
			if (zend_parse_parameters(ZEND_NUM_ARGS(), "sss|s!",
						&arg_pattern, &arg_pattern_len,
						&replace, &replace_len,
						&string, &string_len,
						&option_str, &option_str_len) == FAILURE) {
				RETURN_THROWS();
			}
		} else {
			if (zend_parse_parameters(ZEND_NUM_ARGS(), "sfs|s!",
						&arg_pattern, &arg_pattern_len,
						&arg_replace_fci, &arg_replace_fci_cache,
						&string, &string_len,
						&option_str, &option_str_len) == FAILURE) {
				RETURN_THROWS();
			}
		}

		if (!php_mb_check_encoding(string, string_len, enc)) {
			RETURN_NULL();
		}

		if (option_str != NULL) {
			/* Initialize option and in case of failure it means there is a value error */
			if (!_php_mb_regex_init_options(option_str, option_str_len, &options, &syntax)) {
				RETURN_THROWS();
			}
		} else {
			options |= MBREX(regex_default_options);
			syntax = MBREX(regex_default_syntax);
		}
	}

	/* create regex pattern buffer */
	re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, syntax);
	if (re == NULL) {
		RETURN_FALSE;
	}

	if (is_callable) {
		pbuf = &eval_buf;
		description = zend_make_compiled_string_description("mbregex replace");
	} else {
		pbuf = &out_buf;
		description = NULL;
	}

	/* do the actual work */
	err = 0;
	pos = (OnigUChar *)string;
	string_lim = (OnigUChar*)(string + string_len);
	regs = onig_region_new();
	while (err >= 0) {
		err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)string_lim, pos, (OnigUChar *)string_lim, regs, 0);
		if (err <= -2) {
			OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
			onig_error_code_to_str(err_str, err);
			php_error_docref(NULL, E_WARNING, "mbregex search failure in php_mbereg_replace_exec(): %s", err_str);
			break;
		}
		if (err >= 0) {
			/* copy the part of the string before the match */
			smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos));

			if (!is_callable) {
				mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc);
			}

			if (is_callable) {
				zval args[1];
				zval subpats, retval;
				int i;

				array_init(&subpats);
				for (i = 0; i < regs->num_regs; i++) {
					add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]);
				}
				if (onig_number_of_names(re) > 0) {
					mb_regex_groups_iter_args iter_args = {&subpats, string, string_len, regs};
					onig_foreach_name(re, mb_regex_groups_iter, &iter_args);
				}

				ZVAL_COPY_VALUE(&args[0], &subpats);
				/* null terminate buffer */
				smart_str_0(&eval_buf);

				arg_replace_fci.param_count = 1;
				arg_replace_fci.params = args;
				arg_replace_fci.retval = &retval;
				if (zend_call_function(&arg_replace_fci, &arg_replace_fci_cache) == SUCCESS &&
						!Z_ISUNDEF(retval)) {
					convert_to_string(&retval);
					smart_str_appendl(&out_buf, Z_STRVAL(retval), Z_STRLEN(retval));
					smart_str_free(&eval_buf);
					zval_ptr_dtor(&retval);
				} else {
					if (!EG(exception)) {
						zend_throw_error(NULL, "Unable to call custom replacement function");
					}
					/* an exception is pending either way: stop matching and
					 * leave through the common cleanup below so that regs,
					 * description and the buffers are released */
					zval_ptr_dtor(&subpats);
					callback_failed = true;
					break;
				}
				zval_ptr_dtor(&subpats);
			}

			n = regs->end[0];
			if ((pos - (OnigUChar *)string) < n) {
				pos = (OnigUChar *)string + n;
			} else {
				/* the match did not advance past pos: copy one byte through
				 * and step forward so the loop makes progress */
				if (pos < string_lim) {
					smart_str_appendl(&out_buf, (char *)pos, 1);
				}
				pos++;
			}
		} else { /* nomatch */
			/* stick that last bit of string on our output */
			if (string_lim - pos > 0) {
				smart_str_appendl(&out_buf, (char *)pos, string_lim - pos);
			}
		}
		onig_region_free(regs, 0);
	}

	if (description) {
		efree(description);
	}
	if (regs != NULL) {
		onig_region_free(regs, 1);
	}
	smart_str_free(&eval_buf);

	if (callback_failed) {
		smart_str_free(&out_buf);
		RETURN_THROWS();
	}

	if (err <= -2) {
		smart_str_free(&out_buf);
		RETURN_FALSE;
	}

	RETURN_STR(smart_str_extract(&out_buf));
}
1152 /* }}} */
1153
1154 /* {{{ Replace regular expression for multibyte string */
PHP_FUNCTION(mb_ereg_replace)1155 PHP_FUNCTION(mb_ereg_replace)
1156 {
1157 _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
1158 }
1159 /* }}} */
1160
1161 /* {{{ Case insensitive replace regular expression for multibyte string */
PHP_FUNCTION(mb_eregi_replace)1162 PHP_FUNCTION(mb_eregi_replace)
1163 {
1164 _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, ONIG_OPTION_IGNORECASE, 0);
1165 }
1166 /* }}} */
1167
1168 /* {{{ regular expression for multibyte string using replacement callback */
PHP_FUNCTION(mb_ereg_replace_callback)1169 PHP_FUNCTION(mb_ereg_replace_callback)
1170 {
1171 _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
1172 }
1173 /* }}} */
1174
1175 /* {{{ split multibyte string into array by regular expression */
PHP_FUNCTION(mb_split)1176 PHP_FUNCTION(mb_split)
1177 {
1178 char *arg_pattern;
1179 size_t arg_pattern_len;
1180 php_mb_regex_t *re;
1181 OnigRegion *regs = NULL;
1182 char *string;
1183 OnigUChar *pos, *chunk_pos;
1184 size_t string_len;
1185
1186 int err;
1187 zend_long count = -1;
1188
1189 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &arg_pattern, &arg_pattern_len, &string, &string_len, &count) == FAILURE) {
1190 RETURN_THROWS();
1191 }
1192
1193 if (count > 0) {
1194 count--;
1195 }
1196
1197 if (!php_mb_check_encoding(string, string_len, php_mb_regex_get_mbctype_encoding())) {
1198 RETURN_FALSE;
1199 }
1200
1201 /* create regex pattern buffer */
1202 if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, MBREX(regex_default_options), MBREX(regex_default_syntax))) == NULL) {
1203 RETURN_FALSE;
1204 }
1205
1206 array_init(return_value);
1207
1208 chunk_pos = pos = (OnigUChar *)string;
1209 err = 0;
1210 regs = onig_region_new();
1211 /* churn through str, generating array entries as we go */
1212 while (count != 0 && (size_t)(pos - (OnigUChar *)string) < string_len) {
1213 size_t beg, end;
1214 err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), pos, (OnigUChar *)(string + string_len), regs, 0);
1215 if (err < 0) {
1216 break;
1217 }
1218 beg = regs->beg[0], end = regs->end[0];
1219 /* add it to the array */
1220 if ((size_t)(pos - (OnigUChar *)string) < end) {
1221 if (beg < string_len && beg >= (size_t)(chunk_pos - (OnigUChar *)string)) {
1222 add_next_index_stringl(return_value, (char *)chunk_pos, ((OnigUChar *)(string + beg) - chunk_pos));
1223 --count;
1224 } else {
1225 err = -2;
1226 break;
1227 }
1228 /* point at our new starting point */
1229 chunk_pos = pos = (OnigUChar *)string + end;
1230 } else {
1231 pos++;
1232 }
1233 onig_region_free(regs, 0);
1234 }
1235
1236 onig_region_free(regs, 1);
1237
1238 /* see if we encountered an error */
1239 // ToDo investigate if this can actually/should happen ...
1240 if (err <= -2) {
1241 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
1242 onig_error_code_to_str(err_str, err);
1243 php_error_docref(NULL, E_WARNING, "mbregex search failure in mbsplit(): %s", err_str);
1244 zend_array_destroy(Z_ARR_P(return_value));
1245 RETURN_FALSE;
1246 }
1247
1248 /* otherwise we just have one last element to add to the array */
1249 if ((OnigUChar *)(string + string_len) > chunk_pos) {
1250 size_t n = ((OnigUChar *)(string + string_len) - chunk_pos);
1251 add_next_index_stringl(return_value, (char *)chunk_pos, n);
1252 } else {
1253 add_next_index_stringl(return_value, "", 0);
1254 }
1255 }
1256 /* }}} */
1257
1258 /* {{{ Regular expression match for multibyte string */
PHP_FUNCTION(mb_ereg_match)1259 PHP_FUNCTION(mb_ereg_match)
1260 {
1261 char *arg_pattern;
1262 size_t arg_pattern_len;
1263
1264 char *string;
1265 size_t string_len;
1266
1267 php_mb_regex_t *re;
1268 OnigSyntaxType *syntax;
1269 OnigOptionType option = 0;
1270 int err;
1271 OnigMatchParam *mp;
1272
1273 {
1274 char *option_str = NULL;
1275 size_t option_str_len = 0;
1276
1277 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|s!",
1278 &arg_pattern, &arg_pattern_len, &string, &string_len,
1279 &option_str, &option_str_len)==FAILURE) {
1280 RETURN_THROWS();
1281 }
1282
1283 if (option_str != NULL) {
1284 if(!_php_mb_regex_init_options(option_str, option_str_len, &option, &syntax)) {
1285 RETURN_THROWS();
1286 }
1287 } else {
1288 option |= MBREX(regex_default_options);
1289 syntax = MBREX(regex_default_syntax);
1290 }
1291 }
1292
1293 if (!php_mb_check_encoding(string, string_len, php_mb_regex_get_mbctype_encoding())) {
1294 RETURN_FALSE;
1295 }
1296
1297 if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, syntax)) == NULL) {
1298 RETURN_FALSE;
1299 }
1300
1301 mp = onig_new_match_param();
1302 onig_initialize_match_param(mp);
1303 if (MBSTRG(regex_stack_limit) > 0 && MBSTRG(regex_stack_limit) < UINT_MAX) {
1304 onig_set_match_stack_limit_size_of_match_param(mp, (unsigned int)MBSTRG(regex_stack_limit));
1305 }
1306 if (MBSTRG(regex_retry_limit) > 0 && MBSTRG(regex_retry_limit) < UINT_MAX) {
1307 onig_set_retry_limit_in_match_of_match_param(mp, (unsigned int)MBSTRG(regex_retry_limit));
1308 }
1309 /* match */
1310 err = onig_match_with_param(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, NULL, 0, mp);
1311 onig_free_match_param(mp);
1312 if (err >= 0) {
1313 RETVAL_TRUE;
1314 } else {
1315 RETVAL_FALSE;
1316 }
1317 }
1318 /* }}} */
1319
1320 /* regex search */
1321 /* {{{ _php_mb_regex_ereg_search_exec */
/*
 * Shared implementation of mb_ereg_search(), mb_ereg_search_pos() and
 * mb_ereg_search_regs().
 *
 * Continues the stateful search over MBREX(search_str) starting at
 * MBREX(search_pos), optionally replacing the stored pattern first.
 *
 * mode selects the shape of the return value on a match:
 *   1       - array(start offset, length) of the whole match
 *   2       - array of matched groups (false for groups that did not take
 *             part), plus whatever mb_regex_groups_iter adds when the
 *             pattern has named groups
 *   other   - true
 *
 * Returns false on mismatch, on a search error (with a warning) and when a
 * newly supplied pattern fails to compile. Throws an Error when no pattern or
 * no subject string has been set. An option string rejected by
 * _php_mb_regex_init_options() aborts the call before any search state is
 * touched.
 *
 * After a match, the stored position moves to the end of the match, or one
 * past the old position if the match end lies before it. After a mismatch it
 * moves to the end of the subject.
 */
static void _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode)
{
	char *arg_pattern = NULL, *arg_options = NULL;
	size_t arg_pattern_len, arg_options_len;
	int err;
	size_t n, i, pos, len;
	/* Stored as int* in the OnigRegion struct */
	int beg, end;
	OnigOptionType option = 0;
	OnigUChar *str;
	OnigSyntaxType *syntax;

	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s!s!", &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
		RETURN_THROWS();
	}

	if (arg_options) {
		/* A rejected option string has already raised an exception; stop here
		 * instead of compiling and searching with it pending. */
		if (!_php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax)) {
			RETURN_THROWS();
		}
	} else {
		option |= MBREX(regex_default_options);
		syntax = MBREX(regex_default_syntax);
	}

	/* registers of the previous search are no longer valid */
	if (MBREX(search_regs)) {
		onig_region_free(MBREX(search_regs), 1);
		MBREX(search_regs) = NULL;
	}

	if (arg_pattern) {
		/* create regex pattern buffer */
		if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, syntax)) == NULL) {
			RETURN_FALSE;
		}
	}

	pos = MBREX(search_pos);
	str = NULL;
	len = 0;
	if (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING){
		str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
		len = Z_STRLEN(MBREX(search_str));
	}

	if (MBREX(search_re) == NULL) {
		zend_throw_error(NULL, "No pattern was provided");
		RETURN_THROWS();
	}

	if (str == NULL) {
		zend_throw_error(NULL, "No string was provided");
		RETURN_THROWS();
	}

	MBREX(search_regs) = onig_region_new();

	err = _php_mb_onig_search(MBREX(search_re), str, str + len, str + pos, str + len, MBREX(search_regs), 0);
	if (err == ONIG_MISMATCH) {
		/* nothing more to find: park the position at the end of the subject */
		MBREX(search_pos) = len;
		RETVAL_FALSE;
	} else if (err <= -2) {
		OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
		onig_error_code_to_str(err_str, err);
		php_error_docref(NULL, E_WARNING, "mbregex search failure in mbregex_search(): %s", err_str);
		RETVAL_FALSE;
	} else {
		switch (mode) {
		case 1:
			array_init(return_value);
			beg = MBREX(search_regs)->beg[0];
			end = MBREX(search_regs)->end[0];
			add_next_index_long(return_value, beg);
			add_next_index_long(return_value, end - beg);
			break;
		case 2:
			array_init(return_value);
			n = MBREX(search_regs)->num_regs;
			for (i = 0; i < n; i++) {
				beg = MBREX(search_regs)->beg[i];
				end = MBREX(search_regs)->end[i];
				/* end is known to be non-negative once beg >= 0 && beg <= end */
				if (beg >= 0 && beg <= end && (size_t)end <= len) {
					add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
				} else {
					add_index_bool(return_value, i, 0);
				}
			}
			if (onig_number_of_names(MBREX(search_re)) > 0) {
				mb_regex_groups_iter_args args = {
					return_value,
					Z_STRVAL(MBREX(search_str)),
					Z_STRLEN(MBREX(search_str)),
					MBREX(search_regs)
				};
				onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
			}
			break;
		default:
			RETVAL_TRUE;
			break;
		}
		/* advance the stored position for the next call */
		end = MBREX(search_regs)->end[0];
		if (pos <= (size_t)end) {
			MBREX(search_pos) = end;
		} else {
			MBREX(search_pos) = pos + 1;
		}
	}

	/* keep the registers only when there was a match */
	if (err < 0) {
		onig_region_free(MBREX(search_regs), 1);
		MBREX(search_regs) = (OnigRegion *)NULL;
	}
}
1434 /* }}} */
1435
1436 /* {{{ Regular expression search for multibyte string */
PHP_FUNCTION(mb_ereg_search)1437 PHP_FUNCTION(mb_ereg_search)
1438 {
1439 _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1440 }
1441 /* }}} */
1442
/* {{{ Regular expression search for multibyte string, returning the position and length of the match */
PHP_FUNCTION(mb_ereg_search_pos)1444 PHP_FUNCTION(mb_ereg_search_pos)
1445 {
1446 _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1447 }
1448 /* }}} */
1449
/* {{{ Regular expression search for multibyte string, returning the matched substrings */
PHP_FUNCTION(mb_ereg_search_regs)1451 PHP_FUNCTION(mb_ereg_search_regs)
1452 {
1453 _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 2);
1454 }
1455 /* }}} */
1456
1457 /* {{{ Initialize string and regular expression for search. */
PHP_FUNCTION(mb_ereg_search_init)1458 PHP_FUNCTION(mb_ereg_search_init)
1459 {
1460 zend_string *arg_str;
1461 char *arg_pattern = NULL, *arg_options = NULL;
1462 size_t arg_pattern_len = 0, arg_options_len = 0;
1463 OnigSyntaxType *syntax = NULL;
1464 OnigOptionType option;
1465
1466 if (zend_parse_parameters(ZEND_NUM_ARGS(), "S|s!s!", &arg_str, &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
1467 RETURN_THROWS();
1468 }
1469
1470 if (arg_pattern && arg_pattern_len == 0) {
1471 zend_argument_value_error(2, "must not be empty");
1472 RETURN_THROWS();
1473 }
1474
1475 if (arg_options) {
1476 option = 0;
1477 _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax);
1478 } else {
1479 option = MBREX(regex_default_options);
1480 syntax = MBREX(regex_default_syntax);
1481 }
1482
1483 if (arg_pattern) {
1484 /* create regex pattern buffer */
1485 if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, syntax)) == NULL) {
1486 RETURN_FALSE;
1487 }
1488 }
1489
1490 if (!Z_ISNULL(MBREX(search_str))) {
1491 zval_ptr_dtor(&MBREX(search_str));
1492 }
1493
1494 ZVAL_STR_COPY(&MBREX(search_str), arg_str);
1495
1496 if (php_mb_check_encoding(ZSTR_VAL(arg_str), ZSTR_LEN(arg_str), php_mb_regex_get_mbctype_encoding())) {
1497 MBREX(search_pos) = 0;
1498 RETVAL_TRUE;
1499 } else {
1500 MBREX(search_pos) = ZSTR_LEN(arg_str);
1501 RETVAL_FALSE;
1502 }
1503
1504 if (MBREX(search_regs) != NULL) {
1505 onig_region_free(MBREX(search_regs), 1);
1506 MBREX(search_regs) = NULL;
1507 }
1508 }
1509 /* }}} */
1510
1511 /* {{{ Get matched substring of the last time */
PHP_FUNCTION(mb_ereg_search_getregs)1512 PHP_FUNCTION(mb_ereg_search_getregs)
1513 {
1514 size_t n, i, len;
1515 /* Stored as int* in the OnigRegion struct */
1516 int beg, end;
1517 OnigUChar *str;
1518
1519 if (zend_parse_parameters_none() == FAILURE) {
1520 RETURN_THROWS();
1521 }
1522
1523 if (MBREX(search_regs) != NULL && Z_TYPE(MBREX(search_str)) == IS_STRING) {
1524 array_init(return_value);
1525
1526 str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
1527 len = Z_STRLEN(MBREX(search_str));
1528 n = MBREX(search_regs)->num_regs;
1529 for (i = 0; i < n; i++) {
1530 beg = MBREX(search_regs)->beg[i];
1531 end = MBREX(search_regs)->end[i];
1532 if (beg >= 0 && beg <= end && end <= len) {
1533 add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
1534 } else {
1535 add_index_bool(return_value, i, 0);
1536 }
1537 }
1538 if (onig_number_of_names(MBREX(search_re)) > 0) {
1539 mb_regex_groups_iter_args args = {
1540 return_value,
1541 Z_STRVAL(MBREX(search_str)),
1542 len,
1543 MBREX(search_regs)
1544 };
1545 onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
1546 }
1547 } else {
1548 // TODO This seems to be some logical error, promote to Error
1549 RETVAL_FALSE;
1550 }
1551 }
1552 /* }}} */
1553
1554 /* {{{ Get search start position */
PHP_FUNCTION(mb_ereg_search_getpos)1555 PHP_FUNCTION(mb_ereg_search_getpos)
1556 {
1557 if (zend_parse_parameters_none() == FAILURE) {
1558 RETURN_THROWS();
1559 }
1560
1561 RETVAL_LONG(MBREX(search_pos));
1562 }
1563 /* }}} */
1564
1565 /* {{{ Set search start position */
PHP_FUNCTION(mb_ereg_search_setpos)1566 PHP_FUNCTION(mb_ereg_search_setpos)
1567 {
1568 zend_long position;
1569
1570 if (zend_parse_parameters(ZEND_NUM_ARGS(), "l", &position) == FAILURE) {
1571 RETURN_THROWS();
1572 }
1573
1574 /* Accept negative position if length of search string can be determined */
1575 if ((position < 0) && (!Z_ISUNDEF(MBREX(search_str))) && (Z_TYPE(MBREX(search_str)) == IS_STRING)) {
1576 position += Z_STRLEN(MBREX(search_str));
1577 }
1578
1579 if (position < 0 || (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING && (size_t)position > Z_STRLEN(MBREX(search_str)))) {
1580 zend_argument_value_error(1, "is out of range");
1581 RETURN_THROWS();
1582 }
1583
1584 MBREX(search_pos) = position;
1585 // TODO Return void
1586 RETURN_TRUE;
1587 }
1588 /* }}} */
1589
/* {{{ _php_mb_regex_set_options */
/*
 * Installs `options` and `syntax` as the module-wide mbregex defaults.
 * Each of prev_options / prev_syntax may be NULL; when non-NULL it receives
 * the value that was in effect before the call.
 */
static void _php_mb_regex_set_options(OnigOptionType options, OnigSyntaxType *syntax, OnigOptionType *prev_options, OnigSyntaxType **prev_syntax)
{
	/* hand back the outgoing defaults first ... */
	if (prev_options) {
		*prev_options = MBREX(regex_default_options);
	}
	if (prev_syntax) {
		*prev_syntax = MBREX(regex_default_syntax);
	}

	/* ... then overwrite them */
	MBREX(regex_default_options) = options;
	MBREX(regex_default_syntax) = syntax;
}
1602 /* }}} */
1603
1604 /* {{{ Set or get the default options for mbregex functions */
PHP_FUNCTION(mb_regex_set_options)1605 PHP_FUNCTION(mb_regex_set_options)
1606 {
1607 OnigOptionType opt, prev_opt;
1608 OnigSyntaxType *syntax, *prev_syntax;
1609 char *string = NULL;
1610 size_t string_len;
1611 char buf[16];
1612
1613 if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s!",
1614 &string, &string_len) == FAILURE) {
1615 RETURN_THROWS();
1616 }
1617 if (string != NULL) {
1618 opt = 0;
1619 syntax = NULL;
1620 if(!_php_mb_regex_init_options(string, string_len, &opt, &syntax)) {
1621 RETURN_THROWS();
1622 }
1623 _php_mb_regex_set_options(opt, syntax, &prev_opt, &prev_syntax);
1624 opt = prev_opt;
1625 syntax = prev_syntax;
1626 } else {
1627 opt = MBREX(regex_default_options);
1628 syntax = MBREX(regex_default_syntax);
1629 }
1630 _php_mb_regex_get_option_string(buf, sizeof(buf), opt, syntax);
1631
1632 RETVAL_STRING(buf);
1633 }
1634 /* }}} */
1635
1636 #endif /* HAVE_MBREGEX */
1637