1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 7 |
4 +----------------------------------------------------------------------+
5 | Copyright (c) The PHP Group |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
15 | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
16 +----------------------------------------------------------------------+
17 */
18
19 #ifdef HAVE_CONFIG_H
20 #include "config.h"
21 #endif
22
23 #include "php.h"
24 #include "php_ini.h"
25
26 #if HAVE_MBREGEX
27
28 #include "zend_smart_str.h"
29 #include "ext/standard/info.h"
30 #include "php_mbregex.h"
31 #include "mbstring.h"
32
33 #include "php_onig_compat.h" /* must come prior to the oniguruma header */
34 #include <oniguruma.h>
35 #undef UChar
36
/*
 * Compatibility shims selected when ONIGURUMA_VERSION_INT is below 60800
 * (presumably because such releases do not ship the match-param API --
 * TODO confirm against the Oniguruma changelog).
 *
 * With the shims in place:
 *  - OnigMatchParam becomes an opaque void type and "allocation" yields NULL;
 *  - the stack-limit / retry-limit setters and the free call expand to nothing;
 *  - the *_with_param entry points forward to plain onig_search()/onig_match()
 *    and drop the trailing match-param argument.
 */
#if ONIGURUMA_VERSION_INT < 60800
typedef void OnigMatchParam;
#define onig_new_match_param() (NULL)
#define onig_initialize_match_param(x) (void)(x)
#define onig_set_match_stack_limit_size_of_match_param(x, y)
#define onig_set_retry_limit_in_match_of_match_param(x, y)
#define onig_free_match_param(x)
#define onig_search_with_param(reg, str, end, start, range, region, option, mp) \
	onig_search(reg, str, end, start, range, region, option)
#define onig_match_with_param(re, str, end, at, region, option, mp) \
	onig_match(re, str, end, at, region, option)
#endif
49
50 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
51
/* State of the mbregex engine; accessed in this file through MBREX(field). */
struct _zend_mb_regex_globals {
	/* Encoding copied back into current_mbctype at request shutdown. */
	OnigEncoding default_mbctype;
	/* Encoding handed to the pattern compiler and used to validate subjects. */
	OnigEncoding current_mbctype;
	/* Cache of compiled regexes keyed by pattern text; created in RINIT,
	 * destroyed in RSHUTDOWN, entries released via php_mb_regex_free_cache(). */
	HashTable ht_rc;
	/* Released and reset to UNDEF at request shutdown. */
	zval search_str;
	/* NOTE(review): not referenced in the visible part of this file. */
	zval *search_str_val;
	/* Reset to 0 at request shutdown. */
	size_t search_pos;
	/* Cleared when the cache entry it equals is about to be replaced
	 * (see php_mbregex_compile_pattern) and at request shutdown. */
	php_mb_regex_t *search_re;
	/* Freed with onig_region_free() at request shutdown when non-NULL. */
	OnigRegion *search_regs;
	/* Options used by mb_ereg()/mb_eregi() and whenever no option string is given. */
	OnigOptionType regex_default_options;
	/* Syntax used by mb_ereg()/mb_eregi() and whenever no option string is given. */
	OnigSyntaxType *regex_default_syntax;
};
64
/* Shorthand for a field of the regex globals hanging off the mbstring globals. */
#define MBREX(g) (MBSTRG(mb_regex_globals)->g)
66
67 /* {{{ static void php_mb_regex_free_cache() */
/*
 * Hash-table element destructor for the regex cache: the element holds a
 * pointer to a compiled regex, which is released with onig_free().
 */
static void php_mb_regex_free_cache(zval *el)
{
	php_mb_regex_t *cached_re = (php_mb_regex_t *)Z_PTR_P(el);

	onig_free(cached_re);
}
71 /* }}} */
72
73 /* {{{ _php_mb_regex_globals_ctor */
/*
 * Put a freshly allocated globals structure into its initial state.
 *
 * Both encodings start as UTF-8, the search state is empty, and the default
 * options/syntax are MULTILINE|SINGLELINE with Ruby syntax. The regex cache
 * (ht_rc) is deliberately left alone here: it is created per request in RINIT.
 *
 * Always returns SUCCESS.
 */
static int _php_mb_regex_globals_ctor(zend_mb_regex_globals *pglobals)
{
	pglobals->default_mbctype = ONIG_ENCODING_UTF8;
	pglobals->current_mbctype = ONIG_ENCODING_UTF8;
	ZVAL_UNDEF(&pglobals->search_str);
	/* The structure comes from pemalloc(), which does not zero memory, so
	 * every pointer member must be given a defined value explicitly. */
	pglobals->search_str_val = NULL;
	pglobals->search_re = (php_mb_regex_t*)NULL;
	pglobals->search_pos = 0;
	pglobals->search_regs = (OnigRegion*)NULL;
	pglobals->regex_default_options = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
	pglobals->regex_default_syntax = ONIG_SYNTAX_RUBY;
	return SUCCESS;
}
86 /* }}} */
87
88 /* {{{ _php_mb_regex_globals_dtor */
/* Counterpart of the constructor; currently a no-op. */
static void _php_mb_regex_globals_dtor(zend_mb_regex_globals *pglobals)
{
	(void)pglobals;
}
92 /* }}} */
93
94 /* {{{ php_mb_regex_globals_alloc */
php_mb_regex_globals_alloc(void)95 zend_mb_regex_globals *php_mb_regex_globals_alloc(void)
96 {
97 zend_mb_regex_globals *pglobals = pemalloc(
98 sizeof(zend_mb_regex_globals), 1);
99 if (SUCCESS != _php_mb_regex_globals_ctor(pglobals)) {
100 pefree(pglobals, 1);
101 return NULL;
102 }
103 return pglobals;
104 }
105 /* }}} */
106
107 /* {{{ php_mb_regex_globals_free */
/*
 * Run the destructor on a globals structure and release its memory with
 * pefree(..., 1). A NULL argument is accepted and ignored.
 */
void php_mb_regex_globals_free(zend_mb_regex_globals *pglobals)
{
	if (pglobals != NULL) {
		_php_mb_regex_globals_dtor(pglobals);
		pefree(pglobals, 1);
	}
}
116 /* }}} */
117
118 /* {{{ PHP_MINIT_FUNCTION(mb_regex) */
PHP_MINIT_FUNCTION(mb_regex)119 PHP_MINIT_FUNCTION(mb_regex)
120 {
121 char version[256];
122
123 onig_init();
124
125 snprintf(version, sizeof(version), "%d.%d.%d",
126 ONIGURUMA_VERSION_MAJOR, ONIGURUMA_VERSION_MINOR, ONIGURUMA_VERSION_TEENY);
127 REGISTER_STRING_CONSTANT("MB_ONIGURUMA_VERSION", version, CONST_CS | CONST_PERSISTENT);
128 return SUCCESS;
129 }
130 /* }}} */
131
132 /* {{{ PHP_MSHUTDOWN_FUNCTION(mb_regex) */
PHP_MSHUTDOWN_FUNCTION(mb_regex)133 PHP_MSHUTDOWN_FUNCTION(mb_regex)
134 {
135 onig_end();
136 return SUCCESS;
137 }
138 /* }}} */
139
140 /* {{{ PHP_RINIT_FUNCTION(mb_regex) */
PHP_RINIT_FUNCTION(mb_regex)141 PHP_RINIT_FUNCTION(mb_regex)
142 {
143 if (!MBSTRG(mb_regex_globals)) return FAILURE;
144 zend_hash_init(&MBREX(ht_rc), 0, NULL, php_mb_regex_free_cache, 0);
145 return SUCCESS;
146 }
147 /* }}} */
148
149 /* {{{ PHP_RSHUTDOWN_FUNCTION(mb_regex) */
PHP_RSHUTDOWN_FUNCTION(mb_regex)150 PHP_RSHUTDOWN_FUNCTION(mb_regex)
151 {
152 MBREX(current_mbctype) = MBREX(default_mbctype);
153
154 if (!Z_ISUNDEF(MBREX(search_str))) {
155 zval_ptr_dtor(&MBREX(search_str));
156 ZVAL_UNDEF(&MBREX(search_str));
157 }
158 MBREX(search_pos) = 0;
159 MBREX(search_re) = NULL;
160
161 if (MBREX(search_regs) != NULL) {
162 onig_region_free(MBREX(search_regs), 1);
163 MBREX(search_regs) = (OnigRegion *)NULL;
164 }
165 zend_hash_destroy(&MBREX(ht_rc));
166
167 return SUCCESS;
168 }
169 /* }}} */
170
171 /* {{{ PHP_MINFO_FUNCTION(mb_regex) */
PHP_MINFO_FUNCTION(mb_regex)172 PHP_MINFO_FUNCTION(mb_regex)
173 {
174 char buf[32];
175 php_info_print_table_start();
176 php_info_print_table_row(2, "Multibyte (japanese) regex support", "enabled");
177 snprintf(buf, sizeof(buf), "%d.%d.%d",
178 ONIGURUMA_VERSION_MAJOR,
179 ONIGURUMA_VERSION_MINOR,
180 ONIGURUMA_VERSION_TEENY);
181 php_info_print_table_row(2, "Multibyte regex (oniguruma) version", buf);
182 php_info_print_table_end();
183 }
184 /* }}} */
185
186 /*
187 * encoding name resolver
188 */
189
190 /* {{{ encoding name map */
/* One row of the encoding-name table: every alias of one Oniguruma encoding. */
typedef struct _php_mb_regex_enc_name_map_t {
	/* Aliases packed back to back, each NUL-terminated, with an empty string
	 * (a second NUL) closing the list; the first alias is the one reported
	 * by _php_mb_regex_mbctype2name(). */
	const char *names;
	/* The Oniguruma encoding all of the aliases resolve to. */
	OnigEncoding code;
} php_mb_regex_enc_name_map_t;
195
196 static const php_mb_regex_enc_name_map_t enc_name_map[] = {
197 #ifdef ONIG_ENCODING_EUC_JP
198 {
199 "EUC-JP\0EUCJP\0X-EUC-JP\0UJIS\0EUCJP\0EUCJP-WIN\0",
200 ONIG_ENCODING_EUC_JP
201 },
202 #endif
203 #ifdef ONIG_ENCODING_UTF8
204 {
205 "UTF-8\0UTF8\0",
206 ONIG_ENCODING_UTF8
207 },
208 #endif
209 #ifdef ONIG_ENCODING_UTF16_BE
210 {
211 "UTF-16\0UTF-16BE\0",
212 ONIG_ENCODING_UTF16_BE
213 },
214 #endif
215 #ifdef ONIG_ENCODING_UTF16_LE
216 {
217 "UTF-16LE\0",
218 ONIG_ENCODING_UTF16_LE
219 },
220 #endif
221 #ifdef ONIG_ENCODING_UTF32_BE
222 {
223 "UCS-4\0UTF-32\0UTF-32BE\0",
224 ONIG_ENCODING_UTF32_BE
225 },
226 #endif
227 #ifdef ONIG_ENCODING_UTF32_LE
228 {
229 "UCS-4LE\0UTF-32LE\0",
230 ONIG_ENCODING_UTF32_LE
231 },
232 #endif
233 #ifdef ONIG_ENCODING_SJIS
234 {
235 "SJIS\0CP932\0MS932\0SHIFT_JIS\0SJIS-WIN\0WINDOWS-31J\0",
236 ONIG_ENCODING_SJIS
237 },
238 #endif
239 #ifdef ONIG_ENCODING_BIG5
240 {
241 "BIG5\0BIG-5\0BIGFIVE\0CN-BIG5\0BIG-FIVE\0",
242 ONIG_ENCODING_BIG5
243 },
244 #endif
245 #ifdef ONIG_ENCODING_EUC_CN
246 {
247 "EUC-CN\0EUCCN\0EUC_CN\0GB-2312\0GB2312\0",
248 ONIG_ENCODING_EUC_CN
249 },
250 #endif
251 #ifdef ONIG_ENCODING_EUC_TW
252 {
253 "EUC-TW\0EUCTW\0EUC_TW\0",
254 ONIG_ENCODING_EUC_TW
255 },
256 #endif
257 #ifdef ONIG_ENCODING_EUC_KR
258 {
259 "EUC-KR\0EUCKR\0EUC_KR\0",
260 ONIG_ENCODING_EUC_KR
261 },
262 #endif
263 #if defined(ONIG_ENCODING_KOI8) && !PHP_ONIG_BAD_KOI8_ENTRY
264 {
265 "KOI8\0KOI-8\0",
266 ONIG_ENCODING_KOI8
267 },
268 #endif
269 #ifdef ONIG_ENCODING_KOI8_R
270 {
271 "KOI8R\0KOI8-R\0KOI-8R\0",
272 ONIG_ENCODING_KOI8_R
273 },
274 #endif
275 #ifdef ONIG_ENCODING_ISO_8859_1
276 {
277 "ISO-8859-1\0ISO8859-1\0ISO_8859_1\0ISO8859_1\0",
278 ONIG_ENCODING_ISO_8859_1
279 },
280 #endif
281 #ifdef ONIG_ENCODING_ISO_8859_2
282 {
283 "ISO-8859-2\0ISO8859-2\0ISO_8859_2\0ISO8859_2\0",
284 ONIG_ENCODING_ISO_8859_2
285 },
286 #endif
287 #ifdef ONIG_ENCODING_ISO_8859_3
288 {
289 "ISO-8859-3\0ISO8859-3\0ISO_8859_3\0ISO8859_3\0",
290 ONIG_ENCODING_ISO_8859_3
291 },
292 #endif
293 #ifdef ONIG_ENCODING_ISO_8859_4
294 {
295 "ISO-8859-4\0ISO8859-4\0ISO_8859_4\0ISO8859_4\0",
296 ONIG_ENCODING_ISO_8859_4
297 },
298 #endif
299 #ifdef ONIG_ENCODING_ISO_8859_5
300 {
301 "ISO-8859-5\0ISO8859-5\0ISO_8859_5\0ISO8859_5\0",
302 ONIG_ENCODING_ISO_8859_5
303 },
304 #endif
305 #ifdef ONIG_ENCODING_ISO_8859_6
306 {
307 "ISO-8859-6\0ISO8859-6\0ISO_8859_6\0ISO8859_6\0",
308 ONIG_ENCODING_ISO_8859_6
309 },
310 #endif
311 #ifdef ONIG_ENCODING_ISO_8859_7
312 {
313 "ISO-8859-7\0ISO8859-7\0ISO_8859_7\0ISO8859_7\0",
314 ONIG_ENCODING_ISO_8859_7
315 },
316 #endif
317 #ifdef ONIG_ENCODING_ISO_8859_8
318 {
319 "ISO-8859-8\0ISO8859-8\0ISO_8859_8\0ISO8859_8\0",
320 ONIG_ENCODING_ISO_8859_8
321 },
322 #endif
323 #ifdef ONIG_ENCODING_ISO_8859_9
324 {
325 "ISO-8859-9\0ISO8859-9\0ISO_8859_9\0ISO8859_9\0",
326 ONIG_ENCODING_ISO_8859_9
327 },
328 #endif
329 #ifdef ONIG_ENCODING_ISO_8859_10
330 {
331 "ISO-8859-10\0ISO8859-10\0ISO_8859_10\0ISO8859_10\0",
332 ONIG_ENCODING_ISO_8859_10
333 },
334 #endif
335 #ifdef ONIG_ENCODING_ISO_8859_11
336 {
337 "ISO-8859-11\0ISO8859-11\0ISO_8859_11\0ISO8859_11\0",
338 ONIG_ENCODING_ISO_8859_11
339 },
340 #endif
341 #ifdef ONIG_ENCODING_ISO_8859_13
342 {
343 "ISO-8859-13\0ISO8859-13\0ISO_8859_13\0ISO8859_13\0",
344 ONIG_ENCODING_ISO_8859_13
345 },
346 #endif
347 #ifdef ONIG_ENCODING_ISO_8859_14
348 {
349 "ISO-8859-14\0ISO8859-14\0ISO_8859_14\0ISO8859_14\0",
350 ONIG_ENCODING_ISO_8859_14
351 },
352 #endif
353 #ifdef ONIG_ENCODING_ISO_8859_15
354 {
355 "ISO-8859-15\0ISO8859-15\0ISO_8859_15\0ISO8859_15\0",
356 ONIG_ENCODING_ISO_8859_15
357 },
358 #endif
359 #ifdef ONIG_ENCODING_ISO_8859_16
360 {
361 "ISO-8859-16\0ISO8859-16\0ISO_8859_16\0ISO8859_16\0",
362 ONIG_ENCODING_ISO_8859_16
363 },
364 #endif
365 #ifdef ONIG_ENCODING_ASCII
366 {
367 "ASCII\0US-ASCII\0US_ASCII\0ISO646\0",
368 ONIG_ENCODING_ASCII
369 },
370 #endif
371 { NULL, ONIG_ENCODING_UNDEF }
372 };
373 /* }}} */
374
/* {{{ _php_mb_regex_name2mbctype */
_php_mb_regex_name2mbctype(const char * pname)376 static OnigEncoding _php_mb_regex_name2mbctype(const char *pname)
377 {
378 const char *p;
379 const php_mb_regex_enc_name_map_t *mapping;
380
381 if (pname == NULL || !*pname) {
382 return ONIG_ENCODING_UNDEF;
383 }
384
385 for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
386 for (p = mapping->names; *p != '\0'; p += (strlen(p) + 1)) {
387 if (strcasecmp(p, pname) == 0) {
388 return mapping->code;
389 }
390 }
391 }
392
393 return ONIG_ENCODING_UNDEF;
394 }
395 /* }}} */
396
/* {{{ _php_mb_regex_mbctype2name */
/*
 * Reverse lookup: return the alias list of the first enc_name_map row whose
 * encoding equals mbctype. Because the list starts with a NUL-terminated
 * alias, the result can be used directly as that encoding's primary name.
 * Returns NULL when no row matches.
 */
static const char *_php_mb_regex_mbctype2name(OnigEncoding mbctype)
{
	const php_mb_regex_enc_name_map_t *entry = enc_name_map;

	/* stop on the first hit or on the terminating row (names == NULL) */
	while (entry->names != NULL && entry->code != mbctype) {
		entry++;
	}

	return entry->names;
}
410 /* }}} */
411
412 /* {{{ php_mb_regex_set_mbctype */
php_mb_regex_set_mbctype(const char * encname)413 int php_mb_regex_set_mbctype(const char *encname)
414 {
415 OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
416 if (mbctype == ONIG_ENCODING_UNDEF) {
417 return FAILURE;
418 }
419 MBREX(current_mbctype) = mbctype;
420 return SUCCESS;
421 }
422 /* }}} */
423
424 /* {{{ php_mb_regex_set_default_mbctype */
php_mb_regex_set_default_mbctype(const char * encname)425 int php_mb_regex_set_default_mbctype(const char *encname)
426 {
427 OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
428 if (mbctype == ONIG_ENCODING_UNDEF) {
429 return FAILURE;
430 }
431 MBREX(default_mbctype) = mbctype;
432 return SUCCESS;
433 }
434 /* }}} */
435
436 /* {{{ php_mb_regex_get_mbctype */
php_mb_regex_get_mbctype(void)437 const char *php_mb_regex_get_mbctype(void)
438 {
439 return _php_mb_regex_mbctype2name(MBREX(current_mbctype));
440 }
441 /* }}} */
442
443 /* {{{ php_mb_regex_get_default_mbctype */
php_mb_regex_get_default_mbctype(void)444 const char *php_mb_regex_get_default_mbctype(void)
445 {
446 return _php_mb_regex_mbctype2name(MBREX(default_mbctype));
447 }
448 /* }}} */
449
450 /*
451 * regex cache
452 */
453 /* {{{ php_mbregex_compile_pattern */
php_mbregex_compile_pattern(const char * pattern,size_t patlen,OnigOptionType options,OnigEncoding enc,OnigSyntaxType * syntax)454 static php_mb_regex_t *php_mbregex_compile_pattern(const char *pattern, size_t patlen, OnigOptionType options, OnigEncoding enc, OnigSyntaxType *syntax)
455 {
456 int err_code = 0;
457 php_mb_regex_t *retval = NULL, *rc = NULL;
458 OnigErrorInfo err_info;
459 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
460
461 if (!php_mb_check_encoding(pattern, patlen, _php_mb_regex_mbctype2name(enc))) {
462 php_error_docref(NULL, E_WARNING,
463 "Pattern is not valid under %s encoding", _php_mb_regex_mbctype2name(enc));
464 return NULL;
465 }
466
467 rc = zend_hash_str_find_ptr(&MBREX(ht_rc), (char *)pattern, patlen);
468 if (!rc || onig_get_options(rc) != options || onig_get_encoding(rc) != enc || onig_get_syntax(rc) != syntax) {
469 if ((err_code = onig_new(&retval, (OnigUChar *)pattern, (OnigUChar *)(pattern + patlen), options, enc, syntax, &err_info)) != ONIG_NORMAL) {
470 onig_error_code_to_str(err_str, err_code, &err_info);
471 php_error_docref(NULL, E_WARNING, "mbregex compile err: %s", err_str);
472 return NULL;
473 }
474 if (rc == MBREX(search_re)) {
475 /* reuse the new rc? see bug #72399 */
476 MBREX(search_re) = NULL;
477 }
478 zend_hash_str_update_ptr(&MBREX(ht_rc), (char *)pattern, patlen, retval);
479 } else {
480 retval = rc;
481 }
482 return retval;
483 }
484 /* }}} */
485
486 /* {{{ _php_mb_regex_get_option_string */
/*
 * Render option bits and a syntax pointer as the option-letter string
 * accepted by _php_mb_regex_init_options().
 *
 * At most len bytes (including the terminating NUL) are written to str;
 * characters that do not fit are dropped but still counted.
 *
 * Returns 0 when everything fitted, otherwise the number of bytes that
 * would have been required.
 */
static size_t _php_mb_regex_get_option_string(char *str, size_t len, OnigOptionType option, OnigSyntaxType *syntax)
{
	const OnigOptionType line_modes = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
	size_t room = len;
	size_t needed = 0;
	char *out = str;
	char syntax_flag = 0;

/* store one character if there is room; always count it */
#define MB_REGEX_EMIT(ch) do { \
		if (room > 0) { \
			--room; \
			*(out++) = (ch); \
		} \
		++needed; \
	} while (0)

	if ((option & ONIG_OPTION_IGNORECASE) != 0) {
		MB_REGEX_EMIT('i');
	}
	if ((option & ONIG_OPTION_EXTEND) != 0) {
		MB_REGEX_EMIT('x');
	}

	if ((option & line_modes) == line_modes) {
		/* both line modes together are abbreviated to 'p' */
		MB_REGEX_EMIT('p');
	} else {
		if ((option & ONIG_OPTION_MULTILINE) != 0) {
			MB_REGEX_EMIT('m');
		}
		if ((option & ONIG_OPTION_SINGLELINE) != 0) {
			MB_REGEX_EMIT('s');
		}
	}

	if ((option & ONIG_OPTION_FIND_LONGEST) != 0) {
		MB_REGEX_EMIT('l');
	}
	if ((option & ONIG_OPTION_FIND_NOT_EMPTY) != 0) {
		MB_REGEX_EMIT('n');
	}

	if (syntax == ONIG_SYNTAX_JAVA) {
		syntax_flag = 'j';
	} else if (syntax == ONIG_SYNTAX_GNU_REGEX) {
		syntax_flag = 'u';
	} else if (syntax == ONIG_SYNTAX_GREP) {
		syntax_flag = 'g';
	} else if (syntax == ONIG_SYNTAX_EMACS) {
		syntax_flag = 'c';
	} else if (syntax == ONIG_SYNTAX_RUBY) {
		syntax_flag = 'r';
	} else if (syntax == ONIG_SYNTAX_PERL) {
		syntax_flag = 'z';
	} else if (syntax == ONIG_SYNTAX_POSIX_BASIC) {
		syntax_flag = 'b';
	} else if (syntax == ONIG_SYNTAX_POSIX_EXTENDED) {
		syntax_flag = 'd';
	}

	if (syntax_flag != 0) {
		MB_REGEX_EMIT(syntax_flag);
	}

	MB_REGEX_EMIT('\0');

#undef MB_REGEX_EMIT

	return (len < needed) ? needed : 0;
}
589 /* }}} */
590
591 /* {{{ _php_mb_regex_init_options */
592 static void
_php_mb_regex_init_options(const char * parg,size_t narg,OnigOptionType * option,OnigSyntaxType ** syntax,int * eval)593 _php_mb_regex_init_options(const char *parg, size_t narg, OnigOptionType *option, OnigSyntaxType **syntax, int *eval)
594 {
595 size_t n;
596 char c;
597 OnigOptionType optm = 0;
598
599 *syntax = ONIG_SYNTAX_RUBY;
600
601 if (parg != NULL) {
602 n = 0;
603 while(n < narg) {
604 c = parg[n++];
605 switch (c) {
606 case 'i':
607 optm |= ONIG_OPTION_IGNORECASE;
608 break;
609 case 'x':
610 optm |= ONIG_OPTION_EXTEND;
611 break;
612 case 'm':
613 optm |= ONIG_OPTION_MULTILINE;
614 break;
615 case 's':
616 optm |= ONIG_OPTION_SINGLELINE;
617 break;
618 case 'p':
619 optm |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
620 break;
621 case 'l':
622 optm |= ONIG_OPTION_FIND_LONGEST;
623 break;
624 case 'n':
625 optm |= ONIG_OPTION_FIND_NOT_EMPTY;
626 break;
627 case 'j':
628 *syntax = ONIG_SYNTAX_JAVA;
629 break;
630 case 'u':
631 *syntax = ONIG_SYNTAX_GNU_REGEX;
632 break;
633 case 'g':
634 *syntax = ONIG_SYNTAX_GREP;
635 break;
636 case 'c':
637 *syntax = ONIG_SYNTAX_EMACS;
638 break;
639 case 'r':
640 *syntax = ONIG_SYNTAX_RUBY;
641 break;
642 case 'z':
643 *syntax = ONIG_SYNTAX_PERL;
644 break;
645 case 'b':
646 *syntax = ONIG_SYNTAX_POSIX_BASIC;
647 break;
648 case 'd':
649 *syntax = ONIG_SYNTAX_POSIX_EXTENDED;
650 break;
651 case 'e':
652 if (eval != NULL) *eval = 1;
653 break;
654 default:
655 break;
656 }
657 }
658 if (option != NULL) *option|=optm;
659 }
660 }
661 /* }}} */
662
663
664 /*
665 * Callbacks for named subpatterns
666 */
667
/* {{{ struct mb_regex_groups_iter_args */
/* Context handed to mb_regex_groups_iter() through onig_foreach_name(). */
typedef struct mb_regex_groups_iter_args {
	zval *groups;        /* array that receives one entry per group name */
	char *search_str;    /* subject string the region offsets index into */
	size_t search_len;   /* length of search_str in bytes */
	OnigRegion *region;  /* capture offsets of the match being reported */
} mb_regex_groups_iter_args;
675 /* }}} */
676
/* {{{ mb_regex_groups_iter */
678 static int
mb_regex_groups_iter(const OnigUChar * name,const OnigUChar * name_end,int ngroup_num,int * group_nums,regex_t * reg,void * parg)679 mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngroup_num, int* group_nums, regex_t* reg, void* parg)
680 {
681 mb_regex_groups_iter_args *args = (mb_regex_groups_iter_args *) parg;
682 int gn, beg, end;
683
684 /*
685 * In case of duplicate groups, keep only the last succeeding one
686 * to be consistent with preg_match with the PCRE_DUPNAMES option.
687 */
688 gn = onig_name_to_backref_number(reg, name, name_end, args->region);
689 beg = args->region->beg[gn];
690 end = args->region->end[gn];
691 if (beg >= 0 && beg < end && end <= args->search_len) {
692 add_assoc_stringl_ex(args->groups, (char *)name, name_end - name, &args->search_str[beg], end - beg);
693 } else {
694 add_assoc_bool_ex(args->groups, (char *)name, name_end - name, 0);
695 }
696
697 return 0;
698 }
699 /* }}} */
700
701 /*
702 * Helper for _php_mb_regex_ereg_replace_exec
703 */
704 /* {{{ mb_regex_substitute */
/*
 * Expand the backreferences of a replacement template for one match and
 * append the result to pbuf.
 *
 *   pbuf         output buffer
 *   subject      string that was matched; regs offsets index into it
 *   subject_len  length of subject in bytes
 *   replace      replacement template
 *   replace_len  length of replace in bytes
 *   regexp       regex that produced the match (used to resolve group names)
 *   regs         capture offsets of the match
 *   enc          encoding used to step through replace character by character
 *
 * Recognised sequences are \0..\9, \k<name>, \k'name', \k<number> and
 * \k'number'. Anything that is not recognised or that names a group which
 * does not exist is copied to the output unchanged. A valid group that did
 * not capture anything expands to nothing.
 */
static inline void mb_regex_substitute(
	smart_str *pbuf,
	const char *subject,
	size_t subject_len,
	char *replace,
	size_t replace_len,
	php_mb_regex_t *regexp,
	OnigRegion *regs,
	const mbfl_encoding *enc
) {
	char *p, *sp, *eos;
	int no; /* backreference group number */
	int clen; /* byte-length of the current character */

	p = replace;
	eos = replace + replace_len;

	while (p < eos) {
		clen = (int) php_mb_mbchar_bytes_ex(p, enc);
		if (clen != 1 || p == eos || p[0] != '\\') {
			/* skip anything that's not an ascii backslash */
			smart_str_appendl(pbuf, p, clen);
			p += clen;
			continue;
		}
		sp = p; /* save position */
		clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
		if (clen != 1 || p == eos) {
			/* skip backslash followed by multibyte char */
			smart_str_appendl(pbuf, sp, p - sp);
			continue;
		}
		no = -1;
		switch (p[0]) {
			case '0':
				no = 0;
				p++;
				break;
			case '1': case '2': case '3': case '4':
			case '5': case '6': case '7': case '8': case '9':
				if (!onig_noname_group_capture_is_active(regexp)) {
					/*
					 * FIXME:
					 * Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern.
					 * For now we just ignore them, but in the future we might want to raise a warning
					 * and abort the whole replace operation.
					 */
					p++;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				no = p[0] - '0';
				p++;
				break;
			case 'k':
				clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
				if (clen != 1 || p == eos || (p[0] != '<' && p[0] != '\'')) {
					/* not a backref delimiter */
					p += clen;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				/* try to consume everything until next delimiter */
				char delim = p[0] == '<' ? '>' : '\'';
				char *name, *name_end;
				char maybe_num = 1;
				name_end = name = p + 1;
				while (name_end < eos) {
					clen = (int) php_mb_mbchar_bytes_ex(name_end, enc);
					if (clen != 1) {
						name_end += clen;
						maybe_num = 0;
						continue;
					}
					if (name_end[0] == delim) break;
					/* <ctype.h> functions need an unsigned char value: single-byte
					 * characters >= 0x80 are negative where char is signed */
					if (maybe_num && !isdigit((unsigned char) name_end[0])) maybe_num = 0;
					name_end++;
				}
				p = name_end + 1;
				if (p > eos) {
					/* no closing delimiter: never step (or copy) past the end of
					 * the replacement string */
					p = eos;
				}
				if (name_end - name < 1 || name_end >= eos) {
					/* the backref was empty or we failed to find the end delimiter */
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				/* we have either a name or a number */
				if (maybe_num) {
					unsigned long ref;

					if (!onig_noname_group_capture_is_active(regexp)) {
						/* see above note on mixing numbered & named backrefs */
						smart_str_appendl(pbuf, sp, p - sp);
						continue;
					}
					if (name_end - name == 1) {
						no = name[0] - '0';
						break;
					}
					if (name[0] == '0') {
						/* 01 is not a valid number */
						break;
					}
					/* range-check before narrowing so that a huge number cannot
					 * wrap around into a valid group index */
					ref = strtoul(name, NULL, 10);
					no = (ref < (unsigned long) regs->num_regs) ? (int) ref : -1;
					break;
				}
				no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs);
				break;
			default:
				/* We're not treating \ as an escape character and will interpret something like
				 * \\1 as \ followed by \1, rather than \\ followed by 1. This is because this
				 * function has not supported escaping of backslashes historically. */
				smart_str_appendl(pbuf, sp, p - sp);
				continue;
		}
		if (no < 0 || no >= regs->num_regs) {
			/* invalid group number reference, keep the escape sequence in the output */
			smart_str_appendl(pbuf, sp, p - sp);
			continue;
		}
		if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) {
			smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]);
		}
	}

	if (p < eos) {
		smart_str_appendl(pbuf, p, eos - p);
	}
}
830 /* }}} */
831
832 /*
833 * php functions
834 */
835
836 /* {{{ proto string mb_regex_encoding([string encoding])
837 Returns the current encoding for regex as a string. */
PHP_FUNCTION(mb_regex_encoding)838 PHP_FUNCTION(mb_regex_encoding)
839 {
840 char *encoding = NULL;
841 size_t encoding_len;
842 OnigEncoding mbctype;
843
844 if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s", &encoding, &encoding_len) == FAILURE) {
845 return;
846 }
847
848 if (!encoding) {
849 const char *retval = _php_mb_regex_mbctype2name(MBREX(current_mbctype));
850
851 if (retval == NULL) {
852 RETURN_FALSE;
853 }
854
855 RETURN_STRING((char *)retval);
856 } else {
857 mbctype = _php_mb_regex_name2mbctype(encoding);
858
859 if (mbctype == ONIG_ENCODING_UNDEF) {
860 php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", encoding);
861 RETURN_FALSE;
862 }
863
864 MBREX(current_mbctype) = mbctype;
865 RETURN_TRUE;
866 }
867 }
868 /* }}} */
869
870 /* {{{ _php_mb_onig_search */
/*
 * onig_search() wrapper that applies the configured limits.
 *
 * A match-param object is created for the call; the stack limit and the
 * retry limit from the mbstring globals are applied to it, each only when
 * ZEND_LONG_UINT_OVFL() reports that the value fits an unsigned int.
 * The remaining arguments and the return value are those of
 * onig_search_with_param().
 */
static int _php_mb_onig_search(regex_t* reg, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start,
		const OnigUChar* range, OnigRegion* region, OnigOptionType option)
{
	int result;
	OnigMatchParam *param = onig_new_match_param();

	onig_initialize_match_param(param);

	if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_stack_limit))) {
		onig_set_match_stack_limit_size_of_match_param(param, (unsigned int)MBSTRG(regex_stack_limit));
	}
	if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_retry_limit))) {
		onig_set_retry_limit_in_match_of_match_param(param, (unsigned int)MBSTRG(regex_retry_limit));
	}

	result = onig_search_with_param(reg, str, end, start, range, region, option, param);

	onig_free_match_param(param);

	return result;
}
887 /* }}} */
888
889
890 /* {{{ _php_mb_regex_ereg_exec */
/*
 * Shared implementation of mb_ereg() and mb_eregi().
 *
 * icase != 0 adds ONIG_OPTION_IGNORECASE to the default options.
 *
 * Returns false when the arguments cannot be parsed, the subject is not
 * valid in the current encoding, the pattern is empty or fails to compile,
 * or the search does not succeed. On a match the return value is 1 when no
 * registers array was passed, otherwise the byte length of the whole match
 * (with 0 reported as 1). The registers array, when given, is reset and
 * filled with every numbered group followed by every named group; groups
 * without a non-empty capture are stored as false.
 */
static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase)
{
	zval *arg_pattern, *array = NULL;
	char *string;
	size_t string_len;
	php_mb_regex_t *re;
	OnigRegion *regs;
	OnigOptionType options;
	int match_len = 1;

	if (zend_parse_parameters(ZEND_NUM_ARGS(), "zs|z", &arg_pattern, &string, &string_len, &array) == FAILURE) {
		RETURN_FALSE;
	}

	if (array != NULL) {
		array = zend_try_array_init(array);
		if (array == NULL) {
			return;
		}
	}

	if (!php_mb_check_encoding(string, string_len, _php_mb_regex_mbctype2name(MBREX(current_mbctype)))) {
		RETURN_FALSE;
	}

	options = MBREX(regex_default_options);
	if (icase) {
		options |= ONIG_OPTION_IGNORECASE;
	}

	/* a non-string pattern is turned into a string; floats lose their
	 * fractional part first */
	if (Z_TYPE_P(arg_pattern) != IS_STRING) {
		if (Z_TYPE_P(arg_pattern) == IS_DOUBLE) {
			convert_to_long_ex(arg_pattern);
		}
		if (!try_convert_to_string(arg_pattern)) {
			return;
		}
	}

	if (Z_STRLEN_P(arg_pattern) == 0) {
		php_error_docref(NULL, E_WARNING, "empty pattern");
		RETURN_FALSE;
	}

	re = php_mbregex_compile_pattern(Z_STRVAL_P(arg_pattern), Z_STRLEN_P(arg_pattern), options, MBREX(current_mbctype), MBREX(regex_default_syntax));
	if (re == NULL) {
		RETURN_FALSE;
	}

	regs = onig_region_new();

	/* run the search over the whole subject */
	if (_php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, (OnigUChar *)(string + string_len), regs, 0) < 0) {
		RETVAL_FALSE;
	} else {
		if (array != NULL) {
			int i;

			/* numbered groups */
			for (i = 0; i < regs->num_regs; i++) {
				int beg = regs->beg[i];
				int end = regs->end[i];

				if (beg >= 0 && beg < end && (size_t)end <= string_len) {
					add_index_stringl(array, i, string + beg, end - beg);
				} else {
					add_index_bool(array, i, 0);
				}
			}

			/* named groups */
			if (onig_number_of_names(re) > 0) {
				mb_regex_groups_iter_args args = {array, string, string_len, regs};
				onig_foreach_name(re, mb_regex_groups_iter, &args);
			}

			match_len = regs->end[0] - regs->beg[0];
			if (match_len == 0) {
				match_len = 1;
			}
		}
		RETVAL_LONG(match_len);
	}

	if (regs != NULL) {
		onig_region_free(regs, 1);
	}
}
988 /* }}} */
989
990 /* {{{ proto int mb_ereg(string pattern, string string [, array registers])
991 Regular expression match for multibyte string */
PHP_FUNCTION(mb_ereg)992 PHP_FUNCTION(mb_ereg)
993 {
994 _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
995 }
996 /* }}} */
997
998 /* {{{ proto int mb_eregi(string pattern, string string [, array registers])
999 Case-insensitive regular expression match for multibyte string */
PHP_FUNCTION(mb_eregi)1000 PHP_FUNCTION(mb_eregi)
1001 {
1002 _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1003 }
1004 /* }}} */
1005
1006 /* {{{ _php_mb_regex_ereg_replace_exec */
_php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS,OnigOptionType options,int is_callable)1007 static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOptionType options, int is_callable)
1008 {
1009 zval *arg_pattern_zval;
1010
1011 char *arg_pattern;
1012 size_t arg_pattern_len;
1013
1014 char *replace;
1015 size_t replace_len;
1016
1017 zend_fcall_info arg_replace_fci;
1018 zend_fcall_info_cache arg_replace_fci_cache;
1019
1020 char *string;
1021 size_t string_len;
1022
1023 php_mb_regex_t *re;
1024 OnigSyntaxType *syntax;
1025 OnigRegion *regs = NULL;
1026 smart_str out_buf = {0};
1027 smart_str eval_buf = {0};
1028 smart_str *pbuf;
1029 int err, eval, n;
1030 OnigUChar *pos;
1031 OnigUChar *string_lim;
1032 char *description = NULL;
1033 char pat_buf[6];
1034
1035 const mbfl_encoding *enc;
1036
1037 {
1038 const char *current_enc_name;
1039 current_enc_name = _php_mb_regex_mbctype2name(MBREX(current_mbctype));
1040 if (current_enc_name == NULL ||
1041 (enc = mbfl_name2encoding(current_enc_name)) == NULL) {
1042 php_error_docref(NULL, E_WARNING, "Unknown error");
1043 RETURN_FALSE;
1044 }
1045 }
1046 eval = 0;
1047 {
1048 char *option_str = NULL;
1049 size_t option_str_len = 0;
1050
1051 if (!is_callable) {
1052 if (zend_parse_parameters(ZEND_NUM_ARGS(), "zss|s",
1053 &arg_pattern_zval,
1054 &replace, &replace_len,
1055 &string, &string_len,
1056 &option_str, &option_str_len) == FAILURE) {
1057 RETURN_FALSE;
1058 }
1059 } else {
1060 if (zend_parse_parameters(ZEND_NUM_ARGS(), "zfs|s",
1061 &arg_pattern_zval,
1062 &arg_replace_fci, &arg_replace_fci_cache,
1063 &string, &string_len,
1064 &option_str, &option_str_len) == FAILURE) {
1065 RETURN_FALSE;
1066 }
1067 }
1068
1069 if (!php_mb_check_encoding(
1070 string,
1071 string_len,
1072 _php_mb_regex_mbctype2name(MBREX(current_mbctype))
1073 )) {
1074 RETURN_NULL();
1075 }
1076
1077 if (option_str != NULL) {
1078 _php_mb_regex_init_options(option_str, option_str_len, &options, &syntax, &eval);
1079 } else {
1080 options |= MBREX(regex_default_options);
1081 syntax = MBREX(regex_default_syntax);
1082 }
1083 }
1084 if (eval && !is_callable) {
1085 php_error_docref(NULL, E_DEPRECATED, "The 'e' option is deprecated, use mb_ereg_replace_callback instead");
1086 }
1087 if (Z_TYPE_P(arg_pattern_zval) == IS_STRING) {
1088 arg_pattern = Z_STRVAL_P(arg_pattern_zval);
1089 arg_pattern_len = Z_STRLEN_P(arg_pattern_zval);
1090 } else {
1091 php_error_docref(NULL, E_DEPRECATED,
1092 "Non-string patterns will be interpreted as strings in the future. "
1093 "Use an explicit chr() call to preserve the current behavior");
1094
1095 /* FIXME: this code is not multibyte aware! */
1096 convert_to_long_ex(arg_pattern_zval);
1097 pat_buf[0] = (char)Z_LVAL_P(arg_pattern_zval);
1098 pat_buf[1] = '\0';
1099 pat_buf[2] = '\0';
1100 pat_buf[3] = '\0';
1101 pat_buf[4] = '\0';
1102 pat_buf[5] = '\0';
1103
1104 arg_pattern = pat_buf;
1105 arg_pattern_len = 1;
1106 }
1107 /* create regex pattern buffer */
1108 re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, MBREX(current_mbctype), syntax);
1109 if (re == NULL) {
1110 RETURN_FALSE;
1111 }
1112
1113 if (eval || is_callable) {
1114 pbuf = &eval_buf;
1115 description = zend_make_compiled_string_description("mbregex replace");
1116 } else {
1117 pbuf = &out_buf;
1118 description = NULL;
1119 }
1120
1121 if (is_callable) {
1122 if (eval) {
1123 php_error_docref(NULL, E_WARNING, "Option 'e' cannot be used with replacement callback");
1124 RETURN_FALSE;
1125 }
1126 }
1127
1128 /* do the actual work */
1129 err = 0;
1130 pos = (OnigUChar *)string;
1131 string_lim = (OnigUChar*)(string + string_len);
1132 regs = onig_region_new();
1133 while (err >= 0) {
1134 err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)string_lim, pos, (OnigUChar *)string_lim, regs, 0);
1135 if (err <= -2) {
1136 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
1137 onig_error_code_to_str(err_str, err);
1138 php_error_docref(NULL, E_WARNING, "mbregex search failure in php_mbereg_replace_exec(): %s", err_str);
1139 break;
1140 }
1141 if (err >= 0) {
1142 /* copy the part of the string before the match */
1143 smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos));
1144
1145 if (!is_callable) {
1146 mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc);
1147 }
1148
1149 if (eval) {
1150 zval v;
1151 zend_string *eval_str;
1152 /* null terminate buffer */
1153 smart_str_0(&eval_buf);
1154
1155 if (eval_buf.s) {
1156 eval_str = eval_buf.s;
1157 } else {
1158 eval_str = ZSTR_EMPTY_ALLOC();
1159 }
1160
1161 /* do eval */
1162 if (zend_eval_stringl(ZSTR_VAL(eval_str), ZSTR_LEN(eval_str), &v, description) == FAILURE) {
1163 efree(description);
1164 zend_throw_error(NULL, "Failed evaluating code: %s%s", PHP_EOL, ZSTR_VAL(eval_str));
1165 onig_region_free(regs, 1);
1166 smart_str_free(&out_buf);
1167 smart_str_free(&eval_buf);
1168 RETURN_FALSE;
1169 }
1170
1171 /* result of eval */
1172 convert_to_string(&v);
1173 smart_str_appendl(&out_buf, Z_STRVAL(v), Z_STRLEN(v));
1174 /* Clean up */
1175 smart_str_free(&eval_buf);
1176 zval_ptr_dtor_str(&v);
1177 } else if (is_callable) {
1178 zval args[1];
1179 zval subpats, retval;
1180 int i;
1181
1182 array_init(&subpats);
1183 for (i = 0; i < regs->num_regs; i++) {
1184 add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]);
1185 }
1186 if (onig_number_of_names(re) > 0) {
1187 mb_regex_groups_iter_args args = {&subpats, string, string_len, regs};
1188 onig_foreach_name(re, mb_regex_groups_iter, &args);
1189 }
1190
1191 ZVAL_COPY_VALUE(&args[0], &subpats);
1192 /* null terminate buffer */
1193 smart_str_0(&eval_buf);
1194
1195 arg_replace_fci.param_count = 1;
1196 arg_replace_fci.params = args;
1197 arg_replace_fci.retval = &retval;
1198 if (zend_call_function(&arg_replace_fci, &arg_replace_fci_cache) == SUCCESS &&
1199 !Z_ISUNDEF(retval)) {
1200 convert_to_string_ex(&retval);
1201 smart_str_appendl(&out_buf, Z_STRVAL(retval), Z_STRLEN(retval));
1202 smart_str_free(&eval_buf);
1203 zval_ptr_dtor(&retval);
1204 } else {
1205 if (!EG(exception)) {
1206 php_error_docref(NULL, E_WARNING, "Unable to call custom replacement function");
1207 }
1208 }
1209 zval_ptr_dtor(&subpats);
1210 }
1211
1212 n = regs->end[0];
1213 if ((pos - (OnigUChar *)string) < n) {
1214 pos = (OnigUChar *)string + n;
1215 } else {
1216 if (pos < string_lim) {
1217 smart_str_appendl(&out_buf, (char *)pos, 1);
1218 }
1219 pos++;
1220 }
1221 } else { /* nomatch */
1222 /* stick that last bit of string on our output */
1223 if (string_lim - pos > 0) {
1224 smart_str_appendl(&out_buf, (char *)pos, string_lim - pos);
1225 }
1226 }
1227 onig_region_free(regs, 0);
1228 }
1229
1230 if (description) {
1231 efree(description);
1232 }
1233 if (regs != NULL) {
1234 onig_region_free(regs, 1);
1235 }
1236 smart_str_free(&eval_buf);
1237
1238 if (err <= -2) {
1239 smart_str_free(&out_buf);
1240 RETVAL_FALSE;
1241 } else if (out_buf.s) {
1242 smart_str_0(&out_buf);
1243 RETVAL_STR(out_buf.s);
1244 } else {
1245 RETVAL_EMPTY_STRING();
1246 }
1247 }
1248 /* }}} */
1249
1250 /* {{{ proto string mb_ereg_replace(string pattern, string replacement, string string [, string option])
1251 Replace regular expression for multibyte string */
PHP_FUNCTION(mb_ereg_replace)1252 PHP_FUNCTION(mb_ereg_replace)
1253 {
1254 _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
1255 }
1256 /* }}} */
1257
1258 /* {{{ proto string mb_eregi_replace(string pattern, string replacement, string string)
1259 Case insensitive replace regular expression for multibyte string */
PHP_FUNCTION(mb_eregi_replace)1260 PHP_FUNCTION(mb_eregi_replace)
1261 {
1262 _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, ONIG_OPTION_IGNORECASE, 0);
1263 }
1264 /* }}} */
1265
1266 /* {{{ proto string mb_ereg_replace_callback(string pattern, string callback, string string [, string option])
1267 regular expression for multibyte string using replacement callback */
PHP_FUNCTION(mb_ereg_replace_callback)1268 PHP_FUNCTION(mb_ereg_replace_callback)
1269 {
1270 _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
1271 }
1272 /* }}} */
1273
1274 /* {{{ proto array mb_split(string pattern, string string [, int limit])
1275 split multibyte string into array by regular expression */
PHP_FUNCTION(mb_split)1276 PHP_FUNCTION(mb_split)
1277 {
1278 char *arg_pattern;
1279 size_t arg_pattern_len;
1280 php_mb_regex_t *re;
1281 OnigRegion *regs = NULL;
1282 char *string;
1283 OnigUChar *pos, *chunk_pos;
1284 size_t string_len;
1285
1286 int err;
1287 zend_long count = -1;
1288
1289 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &arg_pattern, &arg_pattern_len, &string, &string_len, &count) == FAILURE) {
1290 RETURN_FALSE;
1291 }
1292
1293 if (count > 0) {
1294 count--;
1295 }
1296
1297 if (!php_mb_check_encoding(string, string_len,
1298 _php_mb_regex_mbctype2name(MBREX(current_mbctype)))) {
1299 RETURN_FALSE;
1300 }
1301
1302 /* create regex pattern buffer */
1303 if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, MBREX(regex_default_options), MBREX(current_mbctype), MBREX(regex_default_syntax))) == NULL) {
1304 RETURN_FALSE;
1305 }
1306
1307 array_init(return_value);
1308
1309 chunk_pos = pos = (OnigUChar *)string;
1310 err = 0;
1311 regs = onig_region_new();
1312 /* churn through str, generating array entries as we go */
1313 while (count != 0 && (size_t)(pos - (OnigUChar *)string) < string_len) {
1314 size_t beg, end;
1315 err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), pos, (OnigUChar *)(string + string_len), regs, 0);
1316 if (err < 0) {
1317 break;
1318 }
1319 beg = regs->beg[0], end = regs->end[0];
1320 /* add it to the array */
1321 if ((size_t)(pos - (OnigUChar *)string) < end) {
1322 if (beg < string_len && beg >= (size_t)(chunk_pos - (OnigUChar *)string)) {
1323 add_next_index_stringl(return_value, (char *)chunk_pos, ((OnigUChar *)(string + beg) - chunk_pos));
1324 --count;
1325 } else {
1326 err = -2;
1327 break;
1328 }
1329 /* point at our new starting point */
1330 chunk_pos = pos = (OnigUChar *)string + end;
1331 } else {
1332 pos++;
1333 }
1334 onig_region_free(regs, 0);
1335 }
1336
1337 onig_region_free(regs, 1);
1338
1339 /* see if we encountered an error */
1340 if (err <= -2) {
1341 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
1342 onig_error_code_to_str(err_str, err);
1343 php_error_docref(NULL, E_WARNING, "mbregex search failure in mbsplit(): %s", err_str);
1344 zend_array_destroy(Z_ARR_P(return_value));
1345 RETURN_FALSE;
1346 }
1347
1348 /* otherwise we just have one last element to add to the array */
1349 if ((OnigUChar *)(string + string_len) > chunk_pos) {
1350 size_t n = ((OnigUChar *)(string + string_len) - chunk_pos);
1351 add_next_index_stringl(return_value, (char *)chunk_pos, n);
1352 } else {
1353 add_next_index_stringl(return_value, "", 0);
1354 }
1355 }
1356 /* }}} */
1357
1358 /* {{{ proto bool mb_ereg_match(string pattern, string string [,string option])
1359 Regular expression match for multibyte string */
PHP_FUNCTION(mb_ereg_match)1360 PHP_FUNCTION(mb_ereg_match)
1361 {
1362 char *arg_pattern;
1363 size_t arg_pattern_len;
1364
1365 char *string;
1366 size_t string_len;
1367
1368 php_mb_regex_t *re;
1369 OnigSyntaxType *syntax;
1370 OnigOptionType option = 0;
1371 int err;
1372 OnigMatchParam *mp;
1373
1374 {
1375 char *option_str = NULL;
1376 size_t option_str_len = 0;
1377
1378 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|s",
1379 &arg_pattern, &arg_pattern_len, &string, &string_len,
1380 &option_str, &option_str_len)==FAILURE) {
1381 RETURN_FALSE;
1382 }
1383
1384 if (option_str != NULL) {
1385 _php_mb_regex_init_options(option_str, option_str_len, &option, &syntax, NULL);
1386 } else {
1387 option |= MBREX(regex_default_options);
1388 syntax = MBREX(regex_default_syntax);
1389 }
1390 }
1391
1392 if (!php_mb_check_encoding(string, string_len,
1393 _php_mb_regex_mbctype2name(MBREX(current_mbctype)))) {
1394 RETURN_FALSE;
1395 }
1396
1397 if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), syntax)) == NULL) {
1398 RETURN_FALSE;
1399 }
1400
1401 mp = onig_new_match_param();
1402 onig_initialize_match_param(mp);
1403 if (MBSTRG(regex_stack_limit) > 0 && MBSTRG(regex_stack_limit) < UINT_MAX) {
1404 onig_set_match_stack_limit_size_of_match_param(mp, (unsigned int)MBSTRG(regex_stack_limit));
1405 }
1406 if (MBSTRG(regex_retry_limit) > 0 && MBSTRG(regex_retry_limit) < UINT_MAX) {
1407 onig_set_retry_limit_in_match_of_match_param(mp, (unsigned int)MBSTRG(regex_retry_limit));
1408 }
1409 /* match */
1410 err = onig_match_with_param(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, NULL, 0, mp);
1411 onig_free_match_param(mp);
1412 if (err >= 0) {
1413 RETVAL_TRUE;
1414 } else {
1415 RETVAL_FALSE;
1416 }
1417 }
1418 /* }}} */
1419
1420 /* regex search */
1421 /* {{{ _php_mb_regex_ereg_search_exec */
1422 static void
_php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS,int mode)1423 _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode)
1424 {
1425 char *arg_pattern = NULL, *arg_options = NULL;
1426 size_t arg_pattern_len, arg_options_len;
1427 int err;
1428 size_t n, i, pos, len, beg, end;
1429 OnigOptionType option = 0;
1430 OnigUChar *str;
1431 OnigSyntaxType *syntax;
1432
1433 if (zend_parse_parameters(ZEND_NUM_ARGS(), "|ss", &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
1434 return;
1435 }
1436
1437 if (arg_options) {
1438 _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax, NULL);
1439 } else {
1440 option |= MBREX(regex_default_options);
1441 syntax = MBREX(regex_default_syntax);
1442 }
1443
1444 if (MBREX(search_regs)) {
1445 onig_region_free(MBREX(search_regs), 1);
1446 MBREX(search_regs) = NULL;
1447 }
1448
1449 if (arg_pattern) {
1450 /* create regex pattern buffer */
1451 if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), syntax)) == NULL) {
1452 RETURN_FALSE;
1453 }
1454 }
1455
1456 pos = MBREX(search_pos);
1457 str = NULL;
1458 len = 0;
1459 if (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING){
1460 str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
1461 len = Z_STRLEN(MBREX(search_str));
1462 }
1463
1464 if (MBREX(search_re) == NULL) {
1465 php_error_docref(NULL, E_WARNING, "No regex given");
1466 RETURN_FALSE;
1467 }
1468
1469 if (str == NULL) {
1470 php_error_docref(NULL, E_WARNING, "No string given");
1471 RETURN_FALSE;
1472 }
1473
1474 MBREX(search_regs) = onig_region_new();
1475
1476 err = _php_mb_onig_search(MBREX(search_re), str, str + len, str + pos, str + len, MBREX(search_regs), 0);
1477 if (err == ONIG_MISMATCH) {
1478 MBREX(search_pos) = len;
1479 RETVAL_FALSE;
1480 } else if (err <= -2) {
1481 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
1482 onig_error_code_to_str(err_str, err);
1483 php_error_docref(NULL, E_WARNING, "mbregex search failure in mbregex_search(): %s", err_str);
1484 RETVAL_FALSE;
1485 } else {
1486 switch (mode) {
1487 case 1:
1488 array_init(return_value);
1489 beg = MBREX(search_regs)->beg[0];
1490 end = MBREX(search_regs)->end[0];
1491 add_next_index_long(return_value, beg);
1492 add_next_index_long(return_value, end - beg);
1493 break;
1494 case 2:
1495 array_init(return_value);
1496 n = MBREX(search_regs)->num_regs;
1497 for (i = 0; i < n; i++) {
1498 beg = MBREX(search_regs)->beg[i];
1499 end = MBREX(search_regs)->end[i];
1500 if (beg >= 0 && beg <= end && end <= len) {
1501 add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
1502 } else {
1503 add_index_bool(return_value, i, 0);
1504 }
1505 }
1506 if (onig_number_of_names(MBREX(search_re)) > 0) {
1507 mb_regex_groups_iter_args args = {
1508 return_value,
1509 Z_STRVAL(MBREX(search_str)),
1510 Z_STRLEN(MBREX(search_str)),
1511 MBREX(search_regs)
1512 };
1513 onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
1514 }
1515 break;
1516 default:
1517 RETVAL_TRUE;
1518 break;
1519 }
1520 end = MBREX(search_regs)->end[0];
1521 if (pos <= end) {
1522 MBREX(search_pos) = end;
1523 } else {
1524 MBREX(search_pos) = pos + 1;
1525 }
1526 }
1527
1528 if (err < 0) {
1529 onig_region_free(MBREX(search_regs), 1);
1530 MBREX(search_regs) = (OnigRegion *)NULL;
1531 }
1532 }
1533 /* }}} */
1534
1535 /* {{{ proto bool mb_ereg_search([string pattern[, string option]])
1536 Regular expression search for multibyte string */
PHP_FUNCTION(mb_ereg_search)1537 PHP_FUNCTION(mb_ereg_search)
1538 {
1539 _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1540 }
1541 /* }}} */
1542
1543 /* {{{ proto array mb_ereg_search_pos([string pattern[, string option]])
1544 Regular expression search for multibyte string */
PHP_FUNCTION(mb_ereg_search_pos)1545 PHP_FUNCTION(mb_ereg_search_pos)
1546 {
1547 _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1548 }
1549 /* }}} */
1550
1551 /* {{{ proto array mb_ereg_search_regs([string pattern[, string option]])
1552 Regular expression search for multibyte string */
PHP_FUNCTION(mb_ereg_search_regs)1553 PHP_FUNCTION(mb_ereg_search_regs)
1554 {
1555 _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 2);
1556 }
1557 /* }}} */
1558
1559 /* {{{ proto bool mb_ereg_search_init(string string [, string pattern[, string option]])
1560 Initialize string and regular expression for search. */
PHP_FUNCTION(mb_ereg_search_init)1561 PHP_FUNCTION(mb_ereg_search_init)
1562 {
1563 int argc = ZEND_NUM_ARGS();
1564 zend_string *arg_str;
1565 char *arg_pattern = NULL, *arg_options = NULL;
1566 size_t arg_pattern_len = 0, arg_options_len = 0;
1567 OnigSyntaxType *syntax = NULL;
1568 OnigOptionType option;
1569
1570 if (zend_parse_parameters(argc, "S|ss", &arg_str, &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
1571 return;
1572 }
1573
1574 if (argc > 1 && arg_pattern_len == 0) {
1575 php_error_docref(NULL, E_WARNING, "Empty pattern");
1576 RETURN_FALSE;
1577 }
1578
1579 option = MBREX(regex_default_options);
1580 syntax = MBREX(regex_default_syntax);
1581
1582 if (argc == 3) {
1583 option = 0;
1584 _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax, NULL);
1585 }
1586
1587 if (argc > 1) {
1588 /* create regex pattern buffer */
1589 if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), syntax)) == NULL) {
1590 RETURN_FALSE;
1591 }
1592 }
1593
1594 if (!Z_ISNULL(MBREX(search_str))) {
1595 zval_ptr_dtor(&MBREX(search_str));
1596 }
1597
1598 ZVAL_STR_COPY(&MBREX(search_str), arg_str);
1599
1600 if (php_mb_check_encoding(
1601 ZSTR_VAL(arg_str),
1602 ZSTR_LEN(arg_str),
1603 _php_mb_regex_mbctype2name(MBREX(current_mbctype))
1604 )) {
1605 MBREX(search_pos) = 0;
1606 RETVAL_TRUE;
1607 } else {
1608 MBREX(search_pos) = ZSTR_LEN(arg_str);
1609 RETVAL_FALSE;
1610 }
1611
1612 if (MBREX(search_regs) != NULL) {
1613 onig_region_free(MBREX(search_regs), 1);
1614 MBREX(search_regs) = NULL;
1615 }
1616 }
1617 /* }}} */
1618
1619 /* {{{ proto array mb_ereg_search_getregs(void)
1620 Get matched substring of the last time */
PHP_FUNCTION(mb_ereg_search_getregs)1621 PHP_FUNCTION(mb_ereg_search_getregs)
1622 {
1623 size_t n, i, len, beg, end;
1624 OnigUChar *str;
1625
1626 if (MBREX(search_regs) != NULL && Z_TYPE(MBREX(search_str)) == IS_STRING) {
1627 array_init(return_value);
1628
1629 str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
1630 len = Z_STRLEN(MBREX(search_str));
1631 n = MBREX(search_regs)->num_regs;
1632 for (i = 0; i < n; i++) {
1633 beg = MBREX(search_regs)->beg[i];
1634 end = MBREX(search_regs)->end[i];
1635 if (beg >= 0 && beg <= end && end <= len) {
1636 add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
1637 } else {
1638 add_index_bool(return_value, i, 0);
1639 }
1640 }
1641 if (onig_number_of_names(MBREX(search_re)) > 0) {
1642 mb_regex_groups_iter_args args = {
1643 return_value,
1644 Z_STRVAL(MBREX(search_str)),
1645 len,
1646 MBREX(search_regs)
1647 };
1648 onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
1649 }
1650 } else {
1651 RETVAL_FALSE;
1652 }
1653 }
1654 /* }}} */
1655
1656 /* {{{ proto int mb_ereg_search_getpos(void)
1657 Get search start position */
PHP_FUNCTION(mb_ereg_search_getpos)1658 PHP_FUNCTION(mb_ereg_search_getpos)
1659 {
1660 RETVAL_LONG(MBREX(search_pos));
1661 }
1662 /* }}} */
1663
1664 /* {{{ proto bool mb_ereg_search_setpos(int position)
1665 Set search start position */
PHP_FUNCTION(mb_ereg_search_setpos)1666 PHP_FUNCTION(mb_ereg_search_setpos)
1667 {
1668 zend_long position;
1669
1670 if (zend_parse_parameters(ZEND_NUM_ARGS(), "l", &position) == FAILURE) {
1671 return;
1672 }
1673
1674 /* Accept negative position if length of search string can be determined */
1675 if ((position < 0) && (!Z_ISUNDEF(MBREX(search_str))) && (Z_TYPE(MBREX(search_str)) == IS_STRING)) {
1676 position += Z_STRLEN(MBREX(search_str));
1677 }
1678
1679 if (position < 0 || (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING && (size_t)position > Z_STRLEN(MBREX(search_str)))) {
1680 php_error_docref(NULL, E_WARNING, "Position is out of range");
1681 MBREX(search_pos) = 0;
1682 RETURN_FALSE;
1683 }
1684
1685 MBREX(search_pos) = position;
1686 RETURN_TRUE;
1687 }
1688 /* }}} */
1689
1690 /* {{{ php_mb_regex_set_options */
_php_mb_regex_set_options(OnigOptionType options,OnigSyntaxType * syntax,OnigOptionType * prev_options,OnigSyntaxType ** prev_syntax)1691 static void _php_mb_regex_set_options(OnigOptionType options, OnigSyntaxType *syntax, OnigOptionType *prev_options, OnigSyntaxType **prev_syntax)
1692 {
1693 if (prev_options != NULL) {
1694 *prev_options = MBREX(regex_default_options);
1695 }
1696 if (prev_syntax != NULL) {
1697 *prev_syntax = MBREX(regex_default_syntax);
1698 }
1699 MBREX(regex_default_options) = options;
1700 MBREX(regex_default_syntax) = syntax;
1701 }
1702 /* }}} */
1703
1704 /* {{{ proto string mb_regex_set_options([string options])
1705 Set or get the default options for mbregex functions */
PHP_FUNCTION(mb_regex_set_options)1706 PHP_FUNCTION(mb_regex_set_options)
1707 {
1708 OnigOptionType opt;
1709 OnigSyntaxType *syntax;
1710 char *string = NULL;
1711 size_t string_len;
1712 char buf[16];
1713
1714 if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s",
1715 &string, &string_len) == FAILURE) {
1716 RETURN_FALSE;
1717 }
1718 if (string != NULL) {
1719 opt = 0;
1720 syntax = NULL;
1721 _php_mb_regex_init_options(string, string_len, &opt, &syntax, NULL);
1722 _php_mb_regex_set_options(opt, syntax, NULL, NULL);
1723 } else {
1724 opt = MBREX(regex_default_options);
1725 syntax = MBREX(regex_default_syntax);
1726 }
1727 _php_mb_regex_get_option_string(buf, sizeof(buf), opt, syntax);
1728
1729 RETVAL_STRING(buf);
1730 }
1731 /* }}} */
1732
1733 #endif /* HAVE_MBREGEX */
1734