1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 7 |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 1997-2018 The PHP Group |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
15 | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
16 +----------------------------------------------------------------------+
17 */
18
19 #ifdef HAVE_CONFIG_H
20 #include "config.h"
21 #endif
22
23 #include "php.h"
24 #include "php_ini.h"
25
26 #if HAVE_MBREGEX
27
28 #include "zend_smart_str.h"
29 #include "ext/standard/info.h"
30 #include "php_mbregex.h"
31 #include "mbstring.h"
32
33 #include "php_onig_compat.h" /* must come prior to the oniguruma header */
34 #include <oniguruma.h>
35 #undef UChar
36
/*
 * Compatibility shim used when ONIGURUMA_VERSION_INT is below 60800: the
 * match-param helpers become no-ops and the *_with_param entry points are
 * mapped onto onig_search()/onig_match(), dropping the trailing `mp` argument.
 */
#if ONIGURUMA_VERSION_INT < 60800
typedef void OnigMatchParam;
#define onig_new_match_param() (NULL)
#define onig_initialize_match_param(x) (void)(x)
#define onig_set_match_stack_limit_size_of_match_param(x, y)
#define onig_free_match_param(x)
#define onig_search_with_param(reg, str, end, start, range, region, option, mp) \
onig_search(reg, str, end, start, range, region, option)
#define onig_match_with_param(re, str, end, at, region, option, mp) \
onig_match(re, str, end, at, region, option)
#endif
48
49 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
50
/*
 * mbregex state, reached through the MBREX() accessor.
 * NOTE(review): the search_* members are only initialised and reset in the
 * code visible here; presumably they back the mb_ereg_search*() family --
 * confirm against the rest of the file.
 */
struct _zend_mb_regex_globals {
	OnigEncoding default_mbctype;         /* copied into current_mbctype at request shutdown */
	OnigEncoding current_mbctype;         /* encoding used when compiling and matching */
	HashTable ht_rc;                      /* compiled-regex cache keyed by pattern text */
	zval search_str;                      /* released and set to UNDEF at request shutdown */
	zval *search_str_val;                 /* not referenced in the visible code */
	size_t search_pos;                    /* reset to 0 at request shutdown */
	php_mb_regex_t *search_re;            /* borrowed pointer into ht_rc; cleared when that entry is replaced */
	OnigRegion *search_regs;              /* freed at request shutdown when non-NULL */
	OnigOptionType regex_default_options; /* options applied when the caller passes none */
	OnigSyntaxType *regex_default_syntax; /* syntax applied when the caller passes none */
};
63
64 #define MBREX(g) (MBSTRG(mb_regex_globals)->g)
65
66 /* {{{ static void php_mb_regex_free_cache() */
/*
 * Element destructor of the regex cache (registered in RINIT): releases the
 * compiled pattern stored as the element's pointer payload.
 */
static void php_mb_regex_free_cache(zval *el)
{
	php_mb_regex_t *cached = (php_mb_regex_t *)Z_PTR_P(el);

	onig_free(cached);
}
70 /* }}} */
71
72 /* {{{ _php_mb_regex_globals_ctor */
/*
 * Puts a freshly allocated globals structure into its initial state.
 * Always returns SUCCESS.
 */
static int _php_mb_regex_globals_ctor(zend_mb_regex_globals *pglobals)
{
	/* both the default and the active encoding start out as UTF-8 */
	pglobals->default_mbctype = ONIG_ENCODING_UTF8;
	pglobals->current_mbctype = ONIG_ENCODING_UTF8;

	/* compile-time defaults */
	pglobals->regex_default_syntax = ONIG_SYNTAX_RUBY;
	pglobals->regex_default_options = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;

	/* no search in progress */
	ZVAL_UNDEF(&pglobals->search_str);
	pglobals->search_pos = 0;
	pglobals->search_re = NULL;
	pglobals->search_regs = NULL;

	return SUCCESS;
}
85 /* }}} */
86
87 /* {{{ _php_mb_regex_globals_dtor */
/*
 * Counterpart of _php_mb_regex_globals_ctor(). Intentionally empty: the
 * constructor acquires nothing that would have to be released here.
 */
static void _php_mb_regex_globals_dtor(zend_mb_regex_globals *pglobals)
{
}
91 /* }}} */
92
93 /* {{{ php_mb_regex_globals_alloc */
php_mb_regex_globals_alloc(void)94 zend_mb_regex_globals *php_mb_regex_globals_alloc(void)
95 {
96 zend_mb_regex_globals *pglobals = pemalloc(
97 sizeof(zend_mb_regex_globals), 1);
98 if (SUCCESS != _php_mb_regex_globals_ctor(pglobals)) {
99 pefree(pglobals, 1);
100 return NULL;
101 }
102 return pglobals;
103 }
104 /* }}} */
105
106 /* {{{ php_mb_regex_globals_free */
/*
 * Destroys and frees a structure obtained from php_mb_regex_globals_alloc().
 * A NULL argument is accepted and ignored.
 */
void php_mb_regex_globals_free(zend_mb_regex_globals *pglobals)
{
	if (pglobals != NULL) {
		_php_mb_regex_globals_dtor(pglobals);
		pefree(pglobals, 1);
	}
}
115 /* }}} */
116
117 /* {{{ PHP_MINIT_FUNCTION(mb_regex) */
/* Module startup: initialises the Oniguruma library. Always succeeds. */
PHP_MINIT_FUNCTION(mb_regex)
{
	onig_init();
	return SUCCESS;
}
123 /* }}} */
124
125 /* {{{ PHP_MSHUTDOWN_FUNCTION(mb_regex) */
/* Module shutdown: tears the Oniguruma library down again. Always succeeds. */
PHP_MSHUTDOWN_FUNCTION(mb_regex)
{
	onig_end();
	return SUCCESS;
}
131 /* }}} */
132
133 /* {{{ PHP_RINIT_FUNCTION(mb_regex) */
PHP_RINIT_FUNCTION(mb_regex)134 PHP_RINIT_FUNCTION(mb_regex)
135 {
136 if (!MBSTRG(mb_regex_globals)) return FAILURE;
137 zend_hash_init(&MBREX(ht_rc), 0, NULL, php_mb_regex_free_cache, 0);
138 return SUCCESS;
139 }
140 /* }}} */
141
142 /* {{{ PHP_RSHUTDOWN_FUNCTION(mb_regex) */
PHP_RSHUTDOWN_FUNCTION(mb_regex)143 PHP_RSHUTDOWN_FUNCTION(mb_regex)
144 {
145 MBREX(current_mbctype) = MBREX(default_mbctype);
146
147 if (!Z_ISUNDEF(MBREX(search_str))) {
148 zval_ptr_dtor(&MBREX(search_str));
149 ZVAL_UNDEF(&MBREX(search_str));
150 }
151 MBREX(search_pos) = 0;
152 MBREX(search_re) = NULL;
153
154 if (MBREX(search_regs) != NULL) {
155 onig_region_free(MBREX(search_regs), 1);
156 MBREX(search_regs) = (OnigRegion *)NULL;
157 }
158 zend_hash_destroy(&MBREX(ht_rc));
159
160 return SUCCESS;
161 }
162 /* }}} */
163
164 /* {{{ PHP_MINFO_FUNCTION(mb_regex) */
PHP_MINFO_FUNCTION(mb_regex)165 PHP_MINFO_FUNCTION(mb_regex)
166 {
167 char buf[32];
168 php_info_print_table_start();
169 php_info_print_table_row(2, "Multibyte (japanese) regex support", "enabled");
170 snprintf(buf, sizeof(buf), "%d.%d.%d",
171 ONIGURUMA_VERSION_MAJOR,
172 ONIGURUMA_VERSION_MINOR,
173 ONIGURUMA_VERSION_TEENY);
174 #ifdef PHP_ONIG_BUNDLED
175 #ifdef USE_COMBINATION_EXPLOSION_CHECK
176 php_info_print_table_row(2, "Multibyte regex (oniguruma) backtrack check", "On");
177 #else /* USE_COMBINATION_EXPLOSION_CHECK */
178 php_info_print_table_row(2, "Multibyte regex (oniguruma) backtrack check", "Off");
179 #endif /* USE_COMBINATION_EXPLOSION_CHECK */
180 #endif /* PHP_BUNDLED_ONIG */
181 php_info_print_table_row(2, "Multibyte regex (oniguruma) version", buf);
182 php_info_print_table_end();
183 }
184 /* }}} */
185
186 /*
187 * encoding name resolver
188 */
189
190 /* {{{ encoding name map */
/* One row of the encoding-name table. */
typedef struct _php_mb_regex_enc_name_map_t {
	const char *names; /* aliases packed back to back, each ending in '\0'; the list ends with an empty string */
	OnigEncoding code; /* Oniguruma encoding all of those aliases resolve to */
} php_mb_regex_enc_name_map_t;
195
196 static const php_mb_regex_enc_name_map_t enc_name_map[] = {
197 #ifdef ONIG_ENCODING_EUC_JP
198 {
199 "EUC-JP\0EUCJP\0X-EUC-JP\0UJIS\0EUCJP\0EUCJP-WIN\0",
200 ONIG_ENCODING_EUC_JP
201 },
202 #endif
203 #ifdef ONIG_ENCODING_UTF8
204 {
205 "UTF-8\0UTF8\0",
206 ONIG_ENCODING_UTF8
207 },
208 #endif
209 #ifdef ONIG_ENCODING_UTF16_BE
210 {
211 "UTF-16\0UTF-16BE\0",
212 ONIG_ENCODING_UTF16_BE
213 },
214 #endif
215 #ifdef ONIG_ENCODING_UTF16_LE
216 {
217 "UTF-16LE\0",
218 ONIG_ENCODING_UTF16_LE
219 },
220 #endif
221 #ifdef ONIG_ENCODING_UTF32_BE
222 {
223 "UCS-4\0UTF-32\0UTF-32BE\0",
224 ONIG_ENCODING_UTF32_BE
225 },
226 #endif
227 #ifdef ONIG_ENCODING_UTF32_LE
228 {
229 "UCS-4LE\0UTF-32LE\0",
230 ONIG_ENCODING_UTF32_LE
231 },
232 #endif
233 #ifdef ONIG_ENCODING_SJIS
234 {
235 "SJIS\0CP932\0MS932\0SHIFT_JIS\0SJIS-WIN\0WINDOWS-31J\0",
236 ONIG_ENCODING_SJIS
237 },
238 #endif
239 #ifdef ONIG_ENCODING_BIG5
240 {
241 "BIG5\0BIG-5\0BIGFIVE\0CN-BIG5\0BIG-FIVE\0",
242 ONIG_ENCODING_BIG5
243 },
244 #endif
245 #ifdef ONIG_ENCODING_EUC_CN
246 {
247 "EUC-CN\0EUCCN\0EUC_CN\0GB-2312\0GB2312\0",
248 ONIG_ENCODING_EUC_CN
249 },
250 #endif
251 #ifdef ONIG_ENCODING_EUC_TW
252 {
253 "EUC-TW\0EUCTW\0EUC_TW\0",
254 ONIG_ENCODING_EUC_TW
255 },
256 #endif
257 #ifdef ONIG_ENCODING_EUC_KR
258 {
259 "EUC-KR\0EUCKR\0EUC_KR\0",
260 ONIG_ENCODING_EUC_KR
261 },
262 #endif
263 #if defined(ONIG_ENCODING_KOI8) && !PHP_ONIG_BAD_KOI8_ENTRY
264 {
265 "KOI8\0KOI-8\0",
266 ONIG_ENCODING_KOI8
267 },
268 #endif
269 #ifdef ONIG_ENCODING_KOI8_R
270 {
271 "KOI8R\0KOI8-R\0KOI-8R\0",
272 ONIG_ENCODING_KOI8_R
273 },
274 #endif
275 #ifdef ONIG_ENCODING_ISO_8859_1
276 {
277 "ISO-8859-1\0ISO8859-1\0ISO_8859_1\0ISO8859_1\0",
278 ONIG_ENCODING_ISO_8859_1
279 },
280 #endif
281 #ifdef ONIG_ENCODING_ISO_8859_2
282 {
283 "ISO-8859-2\0ISO8859-2\0ISO_8859_2\0ISO8859_2\0",
284 ONIG_ENCODING_ISO_8859_2
285 },
286 #endif
287 #ifdef ONIG_ENCODING_ISO_8859_3
288 {
289 "ISO-8859-3\0ISO8859-3\0ISO_8859_3\0ISO8859_3\0",
290 ONIG_ENCODING_ISO_8859_3
291 },
292 #endif
293 #ifdef ONIG_ENCODING_ISO_8859_4
294 {
295 "ISO-8859-4\0ISO8859-4\0ISO_8859_4\0ISO8859_4\0",
296 ONIG_ENCODING_ISO_8859_4
297 },
298 #endif
299 #ifdef ONIG_ENCODING_ISO_8859_5
300 {
301 "ISO-8859-5\0ISO8859-5\0ISO_8859_5\0ISO8859_5\0",
302 ONIG_ENCODING_ISO_8859_5
303 },
304 #endif
305 #ifdef ONIG_ENCODING_ISO_8859_6
306 {
307 "ISO-8859-6\0ISO8859-6\0ISO_8859_6\0ISO8859_6\0",
308 ONIG_ENCODING_ISO_8859_6
309 },
310 #endif
311 #ifdef ONIG_ENCODING_ISO_8859_7
312 {
313 "ISO-8859-7\0ISO8859-7\0ISO_8859_7\0ISO8859_7\0",
314 ONIG_ENCODING_ISO_8859_7
315 },
316 #endif
317 #ifdef ONIG_ENCODING_ISO_8859_8
318 {
319 "ISO-8859-8\0ISO8859-8\0ISO_8859_8\0ISO8859_8\0",
320 ONIG_ENCODING_ISO_8859_8
321 },
322 #endif
323 #ifdef ONIG_ENCODING_ISO_8859_9
324 {
325 "ISO-8859-9\0ISO8859-9\0ISO_8859_9\0ISO8859_9\0",
326 ONIG_ENCODING_ISO_8859_9
327 },
328 #endif
329 #ifdef ONIG_ENCODING_ISO_8859_10
330 {
331 "ISO-8859-10\0ISO8859-10\0ISO_8859_10\0ISO8859_10\0",
332 ONIG_ENCODING_ISO_8859_10
333 },
334 #endif
335 #ifdef ONIG_ENCODING_ISO_8859_11
336 {
337 "ISO-8859-11\0ISO8859-11\0ISO_8859_11\0ISO8859_11\0",
338 ONIG_ENCODING_ISO_8859_11
339 },
340 #endif
341 #ifdef ONIG_ENCODING_ISO_8859_13
342 {
343 "ISO-8859-13\0ISO8859-13\0ISO_8859_13\0ISO8859_13\0",
344 ONIG_ENCODING_ISO_8859_13
345 },
346 #endif
347 #ifdef ONIG_ENCODING_ISO_8859_14
348 {
349 "ISO-8859-14\0ISO8859-14\0ISO_8859_14\0ISO8859_14\0",
350 ONIG_ENCODING_ISO_8859_14
351 },
352 #endif
353 #ifdef ONIG_ENCODING_ISO_8859_15
354 {
355 "ISO-8859-15\0ISO8859-15\0ISO_8859_15\0ISO8859_15\0",
356 ONIG_ENCODING_ISO_8859_15
357 },
358 #endif
359 #ifdef ONIG_ENCODING_ISO_8859_16
360 {
361 "ISO-8859-16\0ISO8859-16\0ISO_8859_16\0ISO8859_16\0",
362 ONIG_ENCODING_ISO_8859_16
363 },
364 #endif
365 #ifdef ONIG_ENCODING_ASCII
366 {
367 "ASCII\0US-ASCII\0US_ASCII\0ISO646\0",
368 ONIG_ENCODING_ASCII
369 },
370 #endif
371 { NULL, ONIG_ENCODING_UNDEF }
372 };
373 /* }}} */
374
375 /* {{{ php_mb_regex_name2mbctype */
_php_mb_regex_name2mbctype(const char * pname)376 static OnigEncoding _php_mb_regex_name2mbctype(const char *pname)
377 {
378 const char *p;
379 const php_mb_regex_enc_name_map_t *mapping;
380
381 if (pname == NULL || !*pname) {
382 return ONIG_ENCODING_UNDEF;
383 }
384
385 for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
386 for (p = mapping->names; *p != '\0'; p += (strlen(p) + 1)) {
387 if (strcasecmp(p, pname) == 0) {
388 return mapping->code;
389 }
390 }
391 }
392
393 return ONIG_ENCODING_UNDEF;
394 }
395 /* }}} */
396
397 /* {{{ php_mb_regex_mbctype2name */
/*
 * Reverse lookup: returns the alias list of the first enc_name_map row whose
 * encoding equals `mbctype` (read as a C string, that is the row's first
 * alias), or NULL when the encoding is not in the table.
 */
static const char *_php_mb_regex_mbctype2name(OnigEncoding mbctype)
{
	const php_mb_regex_enc_name_map_t *entry = enc_name_map;

	while (entry->names != NULL) {
		if (entry->code == mbctype) {
			return entry->names;
		}
		entry++;
	}

	return NULL;
}
410 /* }}} */
411
412 /* {{{ php_mb_regex_set_mbctype */
php_mb_regex_set_mbctype(const char * encname)413 int php_mb_regex_set_mbctype(const char *encname)
414 {
415 OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
416 if (mbctype == ONIG_ENCODING_UNDEF) {
417 return FAILURE;
418 }
419 MBREX(current_mbctype) = mbctype;
420 return SUCCESS;
421 }
422 /* }}} */
423
424 /* {{{ php_mb_regex_set_default_mbctype */
php_mb_regex_set_default_mbctype(const char * encname)425 int php_mb_regex_set_default_mbctype(const char *encname)
426 {
427 OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
428 if (mbctype == ONIG_ENCODING_UNDEF) {
429 return FAILURE;
430 }
431 MBREX(default_mbctype) = mbctype;
432 return SUCCESS;
433 }
434 /* }}} */
435
436 /* {{{ php_mb_regex_get_mbctype */
php_mb_regex_get_mbctype(void)437 const char *php_mb_regex_get_mbctype(void)
438 {
439 return _php_mb_regex_mbctype2name(MBREX(current_mbctype));
440 }
441 /* }}} */
442
443 /* {{{ php_mb_regex_get_default_mbctype */
php_mb_regex_get_default_mbctype(void)444 const char *php_mb_regex_get_default_mbctype(void)
445 {
446 return _php_mb_regex_mbctype2name(MBREX(default_mbctype));
447 }
448 /* }}} */
449
450 /*
451 * regex cache
452 */
453 /* {{{ php_mbregex_compile_pattern */
php_mbregex_compile_pattern(const char * pattern,size_t patlen,OnigOptionType options,OnigEncoding enc,OnigSyntaxType * syntax)454 static php_mb_regex_t *php_mbregex_compile_pattern(const char *pattern, size_t patlen, OnigOptionType options, OnigEncoding enc, OnigSyntaxType *syntax)
455 {
456 int err_code = 0;
457 php_mb_regex_t *retval = NULL, *rc = NULL;
458 OnigErrorInfo err_info;
459 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
460
461 if (!php_mb_check_encoding(pattern, patlen, _php_mb_regex_mbctype2name(enc))) {
462 php_error_docref(NULL, E_WARNING,
463 "Pattern is not valid under %s encoding", _php_mb_regex_mbctype2name(enc));
464 return NULL;
465 }
466
467 rc = zend_hash_str_find_ptr(&MBREX(ht_rc), (char *)pattern, patlen);
468 if (!rc || onig_get_options(rc) != options || onig_get_encoding(rc) != enc || onig_get_syntax(rc) != syntax) {
469 if ((err_code = onig_new(&retval, (OnigUChar *)pattern, (OnigUChar *)(pattern + patlen), options, enc, syntax, &err_info)) != ONIG_NORMAL) {
470 onig_error_code_to_str(err_str, err_code, &err_info);
471 php_error_docref(NULL, E_WARNING, "mbregex compile err: %s", err_str);
472 return NULL;
473 }
474 if (rc == MBREX(search_re)) {
475 /* reuse the new rc? see bug #72399 */
476 MBREX(search_re) = NULL;
477 }
478 zend_hash_str_update_ptr(&MBREX(ht_rc), (char *)pattern, patlen, retval);
479 } else {
480 retval = rc;
481 }
482 return retval;
483 }
484 /* }}} */
485
486 /* {{{ _php_mb_regex_get_option_string */
/*
 * Appends one character to the option string if there is room left and, in
 * any case, counts it towards the number of bytes the full string needs.
 */
static inline void _php_mb_regex_option_putc(char **dst, size_t *room, size_t *needed, char ch)
{
	if (*room > 0) {
		--(*room);
		*(*dst)++ = ch;
	}
	++(*needed);
}

/*
 * Renders `option` and `syntax` as the option-letter string understood by
 * _php_mb_regex_init_options(), NUL terminator included, into `str`.
 *
 * At most `len` bytes are written. Returns 0 when everything fit, otherwise
 * the number of bytes (terminator included) that would have been required.
 */
static size_t _php_mb_regex_get_option_string(char *str, size_t len, OnigOptionType option, OnigSyntaxType *syntax)
{
	const OnigOptionType both_lines = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
	size_t room = len;
	size_t needed = 0;
	char *out = str;
	char syntax_flag = 0;

	if (option & ONIG_OPTION_IGNORECASE) {
		_php_mb_regex_option_putc(&out, &room, &needed, 'i');
	}
	if (option & ONIG_OPTION_EXTEND) {
		_php_mb_regex_option_putc(&out, &room, &needed, 'x');
	}

	if ((option & both_lines) == both_lines) {
		/* 'p' stands for multiline and singleline together */
		_php_mb_regex_option_putc(&out, &room, &needed, 'p');
	} else {
		if (option & ONIG_OPTION_MULTILINE) {
			_php_mb_regex_option_putc(&out, &room, &needed, 'm');
		}
		if (option & ONIG_OPTION_SINGLELINE) {
			_php_mb_regex_option_putc(&out, &room, &needed, 's');
		}
	}

	if (option & ONIG_OPTION_FIND_LONGEST) {
		_php_mb_regex_option_putc(&out, &room, &needed, 'l');
	}
	if (option & ONIG_OPTION_FIND_NOT_EMPTY) {
		_php_mb_regex_option_putc(&out, &room, &needed, 'n');
	}

	/* at most one syntax letter; unknown syntaxes produce none */
	if (syntax == ONIG_SYNTAX_JAVA) {
		syntax_flag = 'j';
	} else if (syntax == ONIG_SYNTAX_GNU_REGEX) {
		syntax_flag = 'u';
	} else if (syntax == ONIG_SYNTAX_GREP) {
		syntax_flag = 'g';
	} else if (syntax == ONIG_SYNTAX_EMACS) {
		syntax_flag = 'c';
	} else if (syntax == ONIG_SYNTAX_RUBY) {
		syntax_flag = 'r';
	} else if (syntax == ONIG_SYNTAX_PERL) {
		syntax_flag = 'z';
	} else if (syntax == ONIG_SYNTAX_POSIX_BASIC) {
		syntax_flag = 'b';
	} else if (syntax == ONIG_SYNTAX_POSIX_EXTENDED) {
		syntax_flag = 'd';
	}
	if (syntax_flag != 0) {
		_php_mb_regex_option_putc(&out, &room, &needed, syntax_flag);
	}

	/* terminator */
	_php_mb_regex_option_putc(&out, &room, &needed, '\0');

	return (len < needed) ? needed : 0;
}
589 /* }}} */
590
591 /* {{{ _php_mb_regex_init_options */
592 static void
_php_mb_regex_init_options(const char * parg,size_t narg,OnigOptionType * option,OnigSyntaxType ** syntax,int * eval)593 _php_mb_regex_init_options(const char *parg, size_t narg, OnigOptionType *option, OnigSyntaxType **syntax, int *eval)
594 {
595 size_t n;
596 char c;
597 OnigOptionType optm = 0;
598
599 *syntax = ONIG_SYNTAX_RUBY;
600
601 if (parg != NULL) {
602 n = 0;
603 while(n < narg) {
604 c = parg[n++];
605 switch (c) {
606 case 'i':
607 optm |= ONIG_OPTION_IGNORECASE;
608 break;
609 case 'x':
610 optm |= ONIG_OPTION_EXTEND;
611 break;
612 case 'm':
613 optm |= ONIG_OPTION_MULTILINE;
614 break;
615 case 's':
616 optm |= ONIG_OPTION_SINGLELINE;
617 break;
618 case 'p':
619 optm |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
620 break;
621 case 'l':
622 optm |= ONIG_OPTION_FIND_LONGEST;
623 break;
624 case 'n':
625 optm |= ONIG_OPTION_FIND_NOT_EMPTY;
626 break;
627 case 'j':
628 *syntax = ONIG_SYNTAX_JAVA;
629 break;
630 case 'u':
631 *syntax = ONIG_SYNTAX_GNU_REGEX;
632 break;
633 case 'g':
634 *syntax = ONIG_SYNTAX_GREP;
635 break;
636 case 'c':
637 *syntax = ONIG_SYNTAX_EMACS;
638 break;
639 case 'r':
640 *syntax = ONIG_SYNTAX_RUBY;
641 break;
642 case 'z':
643 *syntax = ONIG_SYNTAX_PERL;
644 break;
645 case 'b':
646 *syntax = ONIG_SYNTAX_POSIX_BASIC;
647 break;
648 case 'd':
649 *syntax = ONIG_SYNTAX_POSIX_EXTENDED;
650 break;
651 case 'e':
652 if (eval != NULL) *eval = 1;
653 break;
654 default:
655 break;
656 }
657 }
658 if (option != NULL) *option|=optm;
659 }
660 }
661 /* }}} */
662
663
664 /*
665 * Callbacks for named subpatterns
666 */
667
/* {{{ struct mb_regex_groups_iter_args */
/* Context handed to mb_regex_groups_iter() through onig_foreach_name(). */
typedef struct mb_regex_groups_iter_args {
	zval *groups;       /* array that receives one entry per group name */
	char *search_str;   /* subject the region offsets refer to */
	size_t search_len;  /* byte length of search_str */
	OnigRegion *region; /* capture offsets of the current match */
} mb_regex_groups_iter_args;
675 /* }}} */
676
/* {{{ mb_regex_groups_iter */
678 static int
mb_regex_groups_iter(const OnigUChar * name,const OnigUChar * name_end,int ngroup_num,int * group_nums,regex_t * reg,void * parg)679 mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngroup_num, int* group_nums, regex_t* reg, void* parg)
680 {
681 mb_regex_groups_iter_args *args = (mb_regex_groups_iter_args *) parg;
682 int gn, beg, end;
683
684 /*
685 * In case of duplicate groups, keep only the last succeeding one
686 * to be consistent with preg_match with the PCRE_DUPNAMES option.
687 */
688 gn = onig_name_to_backref_number(reg, name, name_end, args->region);
689 beg = args->region->beg[gn];
690 end = args->region->end[gn];
691 if (beg >= 0 && beg < end && end <= args->search_len) {
692 add_assoc_stringl_ex(args->groups, (char *)name, name_end - name, &args->search_str[beg], end - beg);
693 } else {
694 add_assoc_bool_ex(args->groups, (char *)name, name_end - name, 0);
695 }
696
697 return 0;
698 }
699 /* }}} */
700
701 /*
702 * Helper for _php_mb_regex_ereg_replace_exec
703 */
704 /* {{{ mb_regex_substitute */
/*
 * Expands the replacement template `replace` (replace_len bytes, in encoding
 * `enc`) for one match and appends the result to `pbuf`.
 *
 * Recognised escapes are \0-\9, \k<name> / \k'name' and the numeric forms
 * \k<N> / \k'N'. They are replaced by the text of the referenced group taken
 * from `subject` using the offsets in `regs`; a group that did not capture a
 * non-empty, in-range span expands to nothing. Everything else, including
 * escapes that are malformed or refer to an unknown group, is copied through
 * unchanged. No byte at or beyond replace + replace_len is ever copied.
 */
static inline void mb_regex_substitute(
	smart_str *pbuf,
	const char *subject,
	size_t subject_len,
	char *replace,
	size_t replace_len,
	php_mb_regex_t *regexp,
	OnigRegion *regs,
	const mbfl_encoding *enc
) {
	char *p, *sp, *eos;
	int no; /* backreference group number */
	int clen; /* byte-length of the current character */

	p = replace;
	eos = replace + replace_len;

	while (p < eos) {
		clen = (int) php_mb_mbchar_bytes_ex(p, enc);
		if (clen != 1 || p == eos || p[0] != '\\') {
			/* copy through anything that's not an ascii backslash */
			if (clen > eos - p) {
				/* truncated multibyte sequence at the end of the template */
				clen = (int)(eos - p);
			}
			smart_str_appendl(pbuf, p, clen);
			p += clen;
			continue;
		}
		sp = p; /* save position */
		clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
		if (clen != 1 || p == eos) {
			/* trailing backslash, or backslash followed by multibyte char */
			smart_str_appendl(pbuf, sp, p - sp);
			continue;
		}
		no = -1;
		switch (p[0]) {
			case '0':
				no = 0;
				p++;
				break;
			case '1': case '2': case '3': case '4':
			case '5': case '6': case '7': case '8': case '9':
				if (!onig_noname_group_capture_is_active(regexp)) {
					/*
					 * FIXME:
					 * Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern.
					 * For now we just ignore them, but in the future we might want to raise a warning
					 * and abort the whole replace operation.
					 */
					p++;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				no = p[0] - '0';
				p++;
				break;
			case 'k': {
				char delim;
				char *name, *name_end;
				char maybe_num = 1;

				clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
				if (clen != 1 || p == eos || (p[0] != '<' && p[0] != '\'')) {
					/* not a backref delimiter */
					if (clen > eos - p) {
						/* do not step (and copy) beyond the end of the template */
						clen = (int)(eos - p);
					}
					p += clen;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				/* try to consume everything until next delimiter */
				delim = p[0] == '<' ? '>' : '\'';
				name_end = name = p + 1;
				while (name_end < eos) {
					clen = (int) php_mb_mbchar_bytes_ex(name_end, enc);
					if (clen != 1) {
						if (clen > eos - name_end) {
							clen = (int)(eos - name_end);
						}
						name_end += clen;
						maybe_num = 0;
						continue;
					}
					if (name_end[0] == delim) break;
					/* isdigit() is only defined for unsigned char values and EOF */
					if (maybe_num && !isdigit((unsigned char)name_end[0])) maybe_num = 0;
					name_end++;
				}
				if (name_end - name < 1 || name_end >= eos) {
					/* the backref was empty or we failed to find the end delimiter */
					p = (name_end < eos) ? name_end + 1 : eos;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				p = name_end + 1;
				/* we have either a name or a number */
				if (maybe_num) {
					if (!onig_noname_group_capture_is_active(regexp)) {
						/* see above note on mixing numbered & named backrefs */
						smart_str_appendl(pbuf, sp, p - sp);
						continue;
					}
					if (name_end - name == 1) {
						no = name[0] - '0';
						break;
					}
					if (name[0] == '0') {
						/* 01 is not a valid number */
						break;
					}
					no = (int) strtoul(name, NULL, 10);
					break;
				}
				no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs);
				break;
			}
			default:
				/* We're not treating \ as an escape character and will interpret something like
				 * \\1 as \ followed by \1, rather than \\ followed by 1. This is because this
				 * function has not supported escaping of backslashes historically. */
				smart_str_appendl(pbuf, sp, p - sp);
				continue;
		}
		if (no < 0 || no >= regs->num_regs) {
			/* invalid group number reference, keep the escape sequence in the output */
			smart_str_appendl(pbuf, sp, p - sp);
			continue;
		}
		if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) {
			smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]);
		}
	}

	if (p < eos) {
		smart_str_appendl(pbuf, p, eos - p);
	}
}
830 /* }}} */
831
832 /*
833 * php functions
834 */
835
836 /* {{{ proto string mb_regex_encoding([string encoding])
837 Returns the current encoding for regex as a string. */
PHP_FUNCTION(mb_regex_encoding)838 PHP_FUNCTION(mb_regex_encoding)
839 {
840 char *encoding = NULL;
841 size_t encoding_len;
842 OnigEncoding mbctype;
843
844 if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s", &encoding, &encoding_len) == FAILURE) {
845 return;
846 }
847
848 if (!encoding) {
849 const char *retval = _php_mb_regex_mbctype2name(MBREX(current_mbctype));
850
851 if (retval == NULL) {
852 RETURN_FALSE;
853 }
854
855 RETURN_STRING((char *)retval);
856 } else {
857 mbctype = _php_mb_regex_name2mbctype(encoding);
858
859 if (mbctype == ONIG_ENCODING_UNDEF) {
860 php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", encoding);
861 RETURN_FALSE;
862 }
863
864 MBREX(current_mbctype) = mbctype;
865 RETURN_TRUE;
866 }
867 }
868 /* }}} */
869
870 /* {{{ _php_mb_onig_search */
/*
 * onig_search() wrapper that runs the search with a match-param object. The
 * match stack limit is taken from the mbstring global regex_stack_limit
 * unless ZEND_LONG_UINT_OVFL() reports that the value does not fit an
 * unsigned int. Returns whatever the underlying search returns.
 */
static int _php_mb_onig_search(regex_t* reg, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start,
		const OnigUChar* range, OnigRegion* region, OnigOptionType option) {
	int result;
	OnigMatchParam *param;

	param = onig_new_match_param();
	onig_initialize_match_param(param);

	if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_stack_limit))) {
		onig_set_match_stack_limit_size_of_match_param(param, (unsigned int)MBSTRG(regex_stack_limit));
	}

	result = onig_search_with_param(reg, str, end, start, range, region, option, param);

	onig_free_match_param(param);

	return result;
}
884 /* }}} */
885
886
887 /* {{{ _php_mb_regex_ereg_exec */
/*
 * Shared implementation of mb_ereg() and mb_eregi(); `icase` adds
 * ONIG_OPTION_IGNORECASE to the default options.
 *
 * Return value seen by the script:
 *   - false on bad arguments, a subject that is invalid in the current
 *     encoding, an empty or uncompilable pattern, or no match;
 *   - otherwise the byte length of the whole match when the registers array
 *     was passed (1 if that length is 0), and 1 when it was not.
 *
 * The registers array, when passed, is emptied up front and on a match
 * receives every group by index plus every named group by name; groups
 * without a non-empty capture are stored as false.
 */
static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase)
{
	zval *arg_pattern, *array = NULL;
	char *string;
	size_t string_len;
	php_mb_regex_t *re;
	OnigRegion *regs = NULL;
	OnigOptionType options;
	OnigUChar *subject_begin, *subject_end;
	int match_len = 1;

	if (zend_parse_parameters(ZEND_NUM_ARGS(), "zs|z/", &arg_pattern, &string, &string_len, &array) == FAILURE) {
		RETURN_FALSE;
	}

	/* the output array is reset before anything can fail */
	if (array != NULL) {
		zval_ptr_dtor(array);
		array_init(array);
	}

	if (!php_mb_check_encoding(string, string_len, _php_mb_regex_mbctype2name(MBREX(current_mbctype)))) {
		RETURN_FALSE;
	}

	options = MBREX(regex_default_options);
	if (icase) {
		options |= ONIG_OPTION_IGNORECASE;
	}

	/* non-string patterns: doubles lose their fraction, then everything becomes a string */
	if (Z_TYPE_P(arg_pattern) != IS_STRING) {
		if (Z_TYPE_P(arg_pattern) == IS_DOUBLE) {
			convert_to_long_ex(arg_pattern);
		}
		convert_to_string_ex(arg_pattern);
	}

	if (Z_STRLEN_P(arg_pattern) == 0) {
		php_error_docref(NULL, E_WARNING, "empty pattern");
		RETVAL_FALSE;
		goto out;
	}

	re = php_mbregex_compile_pattern(Z_STRVAL_P(arg_pattern), Z_STRLEN_P(arg_pattern), options, MBREX(current_mbctype), MBREX(regex_default_syntax));
	if (re == NULL) {
		RETVAL_FALSE;
		goto out;
	}

	regs = onig_region_new();

	/* search the whole subject, starting at its beginning */
	subject_begin = (OnigUChar *)string;
	subject_end = (OnigUChar *)(string + string_len);
	if (_php_mb_onig_search(re, subject_begin, subject_end, subject_begin, subject_end, regs, 0) < 0) {
		RETVAL_FALSE;
		goto out;
	}

	if (array != NULL) {
		int i;

		match_len = regs->end[0] - regs->beg[0];

		for (i = 0; i < regs->num_regs; i++) {
			int beg = regs->beg[i];
			int end = regs->end[i];

			if (beg < 0 || beg >= end || (size_t)end > string_len) {
				add_index_bool(array, i, 0);
			} else {
				add_index_stringl(array, i, string + beg, end - beg);
			}
		}

		if (onig_number_of_names(re) > 0) {
			mb_regex_groups_iter_args args = {array, string, string_len, regs};
			onig_foreach_name(re, mb_regex_groups_iter, &args);
		}
	}

	/* an empty match is still reported as 1 */
	if (match_len == 0) {
		match_len = 1;
	}
	RETVAL_LONG(match_len);

out:
	if (regs != NULL) {
		onig_region_free(regs, 1);
	}
}
981 /* }}} */
982
983 /* {{{ proto int mb_ereg(string pattern, string string [, array registers])
984 Regular expression match for multibyte string */
/* Case-sensitive entry point: forwards to _php_mb_regex_ereg_exec() with icase = 0. */
PHP_FUNCTION(mb_ereg)
{
	_php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
}
989 /* }}} */
990
991 /* {{{ proto int mb_eregi(string pattern, string string [, array registers])
992 Case-insensitive regular expression match for multibyte string */
/* Case-insensitive entry point: forwards to _php_mb_regex_ereg_exec() with icase = 1. */
PHP_FUNCTION(mb_eregi)
{
	_php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
}
997 /* }}} */
998
999 /* {{{ _php_mb_regex_ereg_replace_exec */
_php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS,OnigOptionType options,int is_callable)1000 static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOptionType options, int is_callable)
1001 {
1002 zval *arg_pattern_zval;
1003
1004 char *arg_pattern;
1005 size_t arg_pattern_len;
1006
1007 char *replace;
1008 size_t replace_len;
1009
1010 zend_fcall_info arg_replace_fci;
1011 zend_fcall_info_cache arg_replace_fci_cache;
1012
1013 char *string;
1014 size_t string_len;
1015
1016 php_mb_regex_t *re;
1017 OnigSyntaxType *syntax;
1018 OnigRegion *regs = NULL;
1019 smart_str out_buf = {0};
1020 smart_str eval_buf = {0};
1021 smart_str *pbuf;
1022 int err, eval, n;
1023 OnigUChar *pos;
1024 OnigUChar *string_lim;
1025 char *description = NULL;
1026 char pat_buf[6];
1027
1028 const mbfl_encoding *enc;
1029
1030 {
1031 const char *current_enc_name;
1032 current_enc_name = _php_mb_regex_mbctype2name(MBREX(current_mbctype));
1033 if (current_enc_name == NULL ||
1034 (enc = mbfl_name2encoding(current_enc_name)) == NULL) {
1035 php_error_docref(NULL, E_WARNING, "Unknown error");
1036 RETURN_FALSE;
1037 }
1038 }
1039 eval = 0;
1040 {
1041 char *option_str = NULL;
1042 size_t option_str_len = 0;
1043
1044 if (!is_callable) {
1045 if (zend_parse_parameters(ZEND_NUM_ARGS(), "zss|s",
1046 &arg_pattern_zval,
1047 &replace, &replace_len,
1048 &string, &string_len,
1049 &option_str, &option_str_len) == FAILURE) {
1050 RETURN_FALSE;
1051 }
1052 } else {
1053 if (zend_parse_parameters(ZEND_NUM_ARGS(), "zfs|s",
1054 &arg_pattern_zval,
1055 &arg_replace_fci, &arg_replace_fci_cache,
1056 &string, &string_len,
1057 &option_str, &option_str_len) == FAILURE) {
1058 RETURN_FALSE;
1059 }
1060 }
1061
1062 if (!php_mb_check_encoding(
1063 string,
1064 string_len,
1065 _php_mb_regex_mbctype2name(MBREX(current_mbctype))
1066 )) {
1067 RETURN_NULL();
1068 }
1069
1070 if (option_str != NULL) {
1071 _php_mb_regex_init_options(option_str, option_str_len, &options, &syntax, &eval);
1072 } else {
1073 options |= MBREX(regex_default_options);
1074 syntax = MBREX(regex_default_syntax);
1075 }
1076 }
1077 if (eval && !is_callable) {
1078 php_error_docref(NULL, E_DEPRECATED, "The 'e' option is deprecated, use mb_ereg_replace_callback instead");
1079 }
1080 if (Z_TYPE_P(arg_pattern_zval) == IS_STRING) {
1081 arg_pattern = Z_STRVAL_P(arg_pattern_zval);
1082 arg_pattern_len = Z_STRLEN_P(arg_pattern_zval);
1083 } else {
1084 /* FIXME: this code is not multibyte aware! */
1085 convert_to_long_ex(arg_pattern_zval);
1086 pat_buf[0] = (char)Z_LVAL_P(arg_pattern_zval);
1087 pat_buf[1] = '\0';
1088 pat_buf[2] = '\0';
1089 pat_buf[3] = '\0';
1090 pat_buf[4] = '\0';
1091 pat_buf[5] = '\0';
1092
1093 arg_pattern = pat_buf;
1094 arg_pattern_len = 1;
1095 }
1096 /* create regex pattern buffer */
1097 re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, MBREX(current_mbctype), syntax);
1098 if (re == NULL) {
1099 RETURN_FALSE;
1100 }
1101
1102 if (eval || is_callable) {
1103 pbuf = &eval_buf;
1104 description = zend_make_compiled_string_description("mbregex replace");
1105 } else {
1106 pbuf = &out_buf;
1107 description = NULL;
1108 }
1109
1110 if (is_callable) {
1111 if (eval) {
1112 php_error_docref(NULL, E_WARNING, "Option 'e' cannot be used with replacement callback");
1113 RETURN_FALSE;
1114 }
1115 }
1116
1117 /* do the actual work */
1118 err = 0;
1119 pos = (OnigUChar *)string;
1120 string_lim = (OnigUChar*)(string + string_len);
1121 regs = onig_region_new();
1122 while (err >= 0) {
1123 err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)string_lim, pos, (OnigUChar *)string_lim, regs, 0);
1124 if (err <= -2) {
1125 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
1126 onig_error_code_to_str(err_str, err);
1127 php_error_docref(NULL, E_WARNING, "mbregex search failure in php_mbereg_replace_exec(): %s", err_str);
1128 break;
1129 }
1130 if (err >= 0) {
1131 /* copy the part of the string before the match */
1132 smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos));
1133
1134 if (!is_callable) {
1135 mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc);
1136 }
1137
1138 if (eval) {
1139 zval v;
1140 zend_string *eval_str;
1141 /* null terminate buffer */
1142 smart_str_0(&eval_buf);
1143
1144 if (eval_buf.s) {
1145 eval_str = eval_buf.s;
1146 } else {
1147 eval_str = ZSTR_EMPTY_ALLOC();
1148 }
1149
1150 /* do eval */
1151 if (zend_eval_stringl(ZSTR_VAL(eval_str), ZSTR_LEN(eval_str), &v, description) == FAILURE) {
1152 efree(description);
1153 zend_throw_error(NULL, "Failed evaluating code: %s%s", PHP_EOL, ZSTR_VAL(eval_str));
1154 onig_region_free(regs, 0);
1155 smart_str_free(&out_buf);
1156 smart_str_free(&eval_buf);
1157 RETURN_FALSE;
1158 }
1159
1160 /* result of eval */
1161 convert_to_string(&v);
1162 smart_str_appendl(&out_buf, Z_STRVAL(v), Z_STRLEN(v));
1163 /* Clean up */
1164 smart_str_free(&eval_buf);
1165 zval_ptr_dtor_str(&v);
1166 } else if (is_callable) {
1167 zval args[1];
1168 zval subpats, retval;
1169 int i;
1170
1171 array_init(&subpats);
1172 for (i = 0; i < regs->num_regs; i++) {
1173 add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]);
1174 }
1175 if (onig_number_of_names(re) > 0) {
1176 mb_regex_groups_iter_args args = {&subpats, string, string_len, regs};
1177 onig_foreach_name(re, mb_regex_groups_iter, &args);
1178 }
1179
1180 ZVAL_COPY_VALUE(&args[0], &subpats);
1181 /* null terminate buffer */
1182 smart_str_0(&eval_buf);
1183
1184 arg_replace_fci.param_count = 1;
1185 arg_replace_fci.params = args;
1186 arg_replace_fci.retval = &retval;
1187 if (zend_call_function(&arg_replace_fci, &arg_replace_fci_cache) == SUCCESS &&
1188 !Z_ISUNDEF(retval)) {
1189 convert_to_string_ex(&retval);
1190 smart_str_appendl(&out_buf, Z_STRVAL(retval), Z_STRLEN(retval));
1191 smart_str_free(&eval_buf);
1192 zval_ptr_dtor(&retval);
1193 } else {
1194 if (!EG(exception)) {
1195 php_error_docref(NULL, E_WARNING, "Unable to call custom replacement function");
1196 }
1197 }
1198 zval_ptr_dtor(&subpats);
1199 }
1200
1201 n = regs->end[0];
1202 if ((pos - (OnigUChar *)string) < n) {
1203 pos = (OnigUChar *)string + n;
1204 } else {
1205 if (pos < string_lim) {
1206 smart_str_appendl(&out_buf, (char *)pos, 1);
1207 }
1208 pos++;
1209 }
1210 } else { /* nomatch */
1211 /* stick that last bit of string on our output */
1212 if (string_lim - pos > 0) {
1213 smart_str_appendl(&out_buf, (char *)pos, string_lim - pos);
1214 }
1215 }
1216 onig_region_free(regs, 0);
1217 }
1218
1219 if (description) {
1220 efree(description);
1221 }
1222 if (regs != NULL) {
1223 onig_region_free(regs, 1);
1224 }
1225 smart_str_free(&eval_buf);
1226
1227 if (err <= -2) {
1228 smart_str_free(&out_buf);
1229 RETVAL_FALSE;
1230 } else if (out_buf.s) {
1231 smart_str_0(&out_buf);
1232 RETVAL_STR(out_buf.s);
1233 } else {
1234 RETVAL_EMPTY_STRING();
1235 }
1236 }
1237 /* }}} */
1238
1239 /* {{{ proto string mb_ereg_replace(string pattern, string replacement, string string [, string option])
1240 Replace regular expression for multibyte string */
PHP_FUNCTION(mb_ereg_replace)1241 PHP_FUNCTION(mb_ereg_replace)
1242 {
1243 _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
1244 }
1245 /* }}} */
1246
1247 /* {{{ proto string mb_eregi_replace(string pattern, string replacement, string string)
1248 Case insensitive replace regular expression for multibyte string */
PHP_FUNCTION(mb_eregi_replace)1249 PHP_FUNCTION(mb_eregi_replace)
1250 {
1251 _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, ONIG_OPTION_IGNORECASE, 0);
1252 }
1253 /* }}} */
1254
1255 /* {{{ proto string mb_ereg_replace_callback(string pattern, string callback, string string [, string option])
1256 regular expression for multibyte string using replacement callback */
PHP_FUNCTION(mb_ereg_replace_callback)1257 PHP_FUNCTION(mb_ereg_replace_callback)
1258 {
1259 _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
1260 }
1261 /* }}} */
1262
1263 /* {{{ proto array mb_split(string pattern, string string [, int limit])
1264 split multibyte string into array by regular expression */
PHP_FUNCTION(mb_split)1265 PHP_FUNCTION(mb_split)
1266 {
1267 char *arg_pattern;
1268 size_t arg_pattern_len;
1269 php_mb_regex_t *re;
1270 OnigRegion *regs = NULL;
1271 char *string;
1272 OnigUChar *pos, *chunk_pos;
1273 size_t string_len;
1274
1275 int err;
1276 zend_long count = -1;
1277
1278 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &arg_pattern, &arg_pattern_len, &string, &string_len, &count) == FAILURE) {
1279 RETURN_FALSE;
1280 }
1281
1282 if (count > 0) {
1283 count--;
1284 }
1285
1286 if (!php_mb_check_encoding(string, string_len,
1287 _php_mb_regex_mbctype2name(MBREX(current_mbctype)))) {
1288 RETURN_FALSE;
1289 }
1290
1291 /* create regex pattern buffer */
1292 if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, MBREX(regex_default_options), MBREX(current_mbctype), MBREX(regex_default_syntax))) == NULL) {
1293 RETURN_FALSE;
1294 }
1295
1296 array_init(return_value);
1297
1298 chunk_pos = pos = (OnigUChar *)string;
1299 err = 0;
1300 regs = onig_region_new();
1301 /* churn through str, generating array entries as we go */
1302 while (count != 0 && (size_t)(pos - (OnigUChar *)string) < string_len) {
1303 size_t beg, end;
1304 err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), pos, (OnigUChar *)(string + string_len), regs, 0);
1305 if (err < 0) {
1306 break;
1307 }
1308 beg = regs->beg[0], end = regs->end[0];
1309 /* add it to the array */
1310 if ((size_t)(pos - (OnigUChar *)string) < end) {
1311 if (beg < string_len && beg >= (size_t)(chunk_pos - (OnigUChar *)string)) {
1312 add_next_index_stringl(return_value, (char *)chunk_pos, ((OnigUChar *)(string + beg) - chunk_pos));
1313 --count;
1314 } else {
1315 err = -2;
1316 break;
1317 }
1318 /* point at our new starting point */
1319 chunk_pos = pos = (OnigUChar *)string + end;
1320 } else {
1321 pos++;
1322 }
1323 onig_region_free(regs, 0);
1324 }
1325
1326 onig_region_free(regs, 1);
1327
1328 /* see if we encountered an error */
1329 if (err <= -2) {
1330 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
1331 onig_error_code_to_str(err_str, err);
1332 php_error_docref(NULL, E_WARNING, "mbregex search failure in mbsplit(): %s", err_str);
1333 zend_array_destroy(Z_ARR_P(return_value));
1334 RETURN_FALSE;
1335 }
1336
1337 /* otherwise we just have one last element to add to the array */
1338 if ((OnigUChar *)(string + string_len) > chunk_pos) {
1339 size_t n = ((OnigUChar *)(string + string_len) - chunk_pos);
1340 add_next_index_stringl(return_value, (char *)chunk_pos, n);
1341 } else {
1342 add_next_index_stringl(return_value, "", 0);
1343 }
1344 }
1345 /* }}} */
1346
1347 /* {{{ proto bool mb_ereg_match(string pattern, string string [,string option])
1348 Regular expression match for multibyte string */
PHP_FUNCTION(mb_ereg_match)1349 PHP_FUNCTION(mb_ereg_match)
1350 {
1351 char *arg_pattern;
1352 size_t arg_pattern_len;
1353
1354 char *string;
1355 size_t string_len;
1356
1357 php_mb_regex_t *re;
1358 OnigSyntaxType *syntax;
1359 OnigOptionType option = 0;
1360 int err;
1361 OnigMatchParam *mp;
1362
1363 {
1364 char *option_str = NULL;
1365 size_t option_str_len = 0;
1366
1367 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|s",
1368 &arg_pattern, &arg_pattern_len, &string, &string_len,
1369 &option_str, &option_str_len)==FAILURE) {
1370 RETURN_FALSE;
1371 }
1372
1373 if (option_str != NULL) {
1374 _php_mb_regex_init_options(option_str, option_str_len, &option, &syntax, NULL);
1375 } else {
1376 option |= MBREX(regex_default_options);
1377 syntax = MBREX(regex_default_syntax);
1378 }
1379 }
1380
1381 if (!php_mb_check_encoding(string, string_len,
1382 _php_mb_regex_mbctype2name(MBREX(current_mbctype)))) {
1383 RETURN_FALSE;
1384 }
1385
1386 if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), syntax)) == NULL) {
1387 RETURN_FALSE;
1388 }
1389
1390 mp = onig_new_match_param();
1391 onig_initialize_match_param(mp);
1392 if(MBSTRG(regex_stack_limit) > 0 && MBSTRG(regex_stack_limit) < UINT_MAX) {
1393 onig_set_match_stack_limit_size_of_match_param(mp, (unsigned int)MBSTRG(regex_stack_limit));
1394 }
1395 /* match */
1396 err = onig_match_with_param(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, NULL, 0, mp);
1397 onig_free_match_param(mp);
1398 if (err >= 0) {
1399 RETVAL_TRUE;
1400 } else {
1401 RETVAL_FALSE;
1402 }
1403 }
1404 /* }}} */
1405
1406 /* regex search */
1407 /* {{{ _php_mb_regex_ereg_search_exec */
1408 static void
_php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS,int mode)1409 _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode)
1410 {
1411 char *arg_pattern = NULL, *arg_options = NULL;
1412 size_t arg_pattern_len, arg_options_len;
1413 int err;
1414 size_t n, i, pos, len, beg, end;
1415 OnigOptionType option;
1416 OnigUChar *str;
1417 OnigSyntaxType *syntax;
1418
1419 if (zend_parse_parameters(ZEND_NUM_ARGS(), "|ss", &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
1420 return;
1421 }
1422
1423 option = MBREX(regex_default_options);
1424
1425 if (arg_options) {
1426 option = 0;
1427 _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax, NULL);
1428 }
1429
1430 if (MBREX(search_regs)) {
1431 onig_region_free(MBREX(search_regs), 1);
1432 MBREX(search_regs) = NULL;
1433 }
1434
1435 if (arg_pattern) {
1436 /* create regex pattern buffer */
1437 if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), MBREX(regex_default_syntax))) == NULL) {
1438 RETURN_FALSE;
1439 }
1440 }
1441
1442 pos = MBREX(search_pos);
1443 str = NULL;
1444 len = 0;
1445 if (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING){
1446 str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
1447 len = Z_STRLEN(MBREX(search_str));
1448 }
1449
1450 if (MBREX(search_re) == NULL) {
1451 php_error_docref(NULL, E_WARNING, "No regex given");
1452 RETURN_FALSE;
1453 }
1454
1455 if (str == NULL) {
1456 php_error_docref(NULL, E_WARNING, "No string given");
1457 RETURN_FALSE;
1458 }
1459
1460 MBREX(search_regs) = onig_region_new();
1461
1462 err = _php_mb_onig_search(MBREX(search_re), str, str + len, str + pos, str + len, MBREX(search_regs), 0);
1463 if (err == ONIG_MISMATCH) {
1464 MBREX(search_pos) = len;
1465 RETVAL_FALSE;
1466 } else if (err <= -2) {
1467 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
1468 onig_error_code_to_str(err_str, err);
1469 php_error_docref(NULL, E_WARNING, "mbregex search failure in mbregex_search(): %s", err_str);
1470 RETVAL_FALSE;
1471 } else {
1472 switch (mode) {
1473 case 1:
1474 array_init(return_value);
1475 beg = MBREX(search_regs)->beg[0];
1476 end = MBREX(search_regs)->end[0];
1477 add_next_index_long(return_value, beg);
1478 add_next_index_long(return_value, end - beg);
1479 break;
1480 case 2:
1481 array_init(return_value);
1482 n = MBREX(search_regs)->num_regs;
1483 for (i = 0; i < n; i++) {
1484 beg = MBREX(search_regs)->beg[i];
1485 end = MBREX(search_regs)->end[i];
1486 if (beg >= 0 && beg <= end && end <= len) {
1487 add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
1488 } else {
1489 add_index_bool(return_value, i, 0);
1490 }
1491 }
1492 if (onig_number_of_names(MBREX(search_re)) > 0) {
1493 mb_regex_groups_iter_args args = {
1494 return_value,
1495 Z_STRVAL(MBREX(search_str)),
1496 Z_STRLEN(MBREX(search_str)),
1497 MBREX(search_regs)
1498 };
1499 onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
1500 }
1501 break;
1502 default:
1503 RETVAL_TRUE;
1504 break;
1505 }
1506 end = MBREX(search_regs)->end[0];
1507 if (pos <= end) {
1508 MBREX(search_pos) = end;
1509 } else {
1510 MBREX(search_pos) = pos + 1;
1511 }
1512 }
1513
1514 if (err < 0) {
1515 onig_region_free(MBREX(search_regs), 1);
1516 MBREX(search_regs) = (OnigRegion *)NULL;
1517 }
1518 }
1519 /* }}} */
1520
1521 /* {{{ proto bool mb_ereg_search([string pattern[, string option]])
1522 Regular expression search for multibyte string */
PHP_FUNCTION(mb_ereg_search)1523 PHP_FUNCTION(mb_ereg_search)
1524 {
1525 _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1526 }
1527 /* }}} */
1528
1529 /* {{{ proto array mb_ereg_search_pos([string pattern[, string option]])
1530 Regular expression search for multibyte string */
PHP_FUNCTION(mb_ereg_search_pos)1531 PHP_FUNCTION(mb_ereg_search_pos)
1532 {
1533 _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1534 }
1535 /* }}} */
1536
1537 /* {{{ proto array mb_ereg_search_regs([string pattern[, string option]])
1538 Regular expression search for multibyte string */
PHP_FUNCTION(mb_ereg_search_regs)1539 PHP_FUNCTION(mb_ereg_search_regs)
1540 {
1541 _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 2);
1542 }
1543 /* }}} */
1544
1545 /* {{{ proto bool mb_ereg_search_init(string string [, string pattern[, string option]])
1546 Initialize string and regular expression for search. */
PHP_FUNCTION(mb_ereg_search_init)1547 PHP_FUNCTION(mb_ereg_search_init)
1548 {
1549 int argc = ZEND_NUM_ARGS();
1550 zend_string *arg_str;
1551 char *arg_pattern = NULL, *arg_options = NULL;
1552 size_t arg_pattern_len = 0, arg_options_len = 0;
1553 OnigSyntaxType *syntax = NULL;
1554 OnigOptionType option;
1555
1556 if (zend_parse_parameters(argc, "S|ss", &arg_str, &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
1557 return;
1558 }
1559
1560 if (argc > 1 && arg_pattern_len == 0) {
1561 php_error_docref(NULL, E_WARNING, "Empty pattern");
1562 RETURN_FALSE;
1563 }
1564
1565 option = MBREX(regex_default_options);
1566 syntax = MBREX(regex_default_syntax);
1567
1568 if (argc == 3) {
1569 option = 0;
1570 _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax, NULL);
1571 }
1572
1573 if (argc > 1) {
1574 /* create regex pattern buffer */
1575 if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), syntax)) == NULL) {
1576 RETURN_FALSE;
1577 }
1578 }
1579
1580 if (!Z_ISNULL(MBREX(search_str))) {
1581 zval_ptr_dtor(&MBREX(search_str));
1582 }
1583
1584 ZVAL_STR_COPY(&MBREX(search_str), arg_str);
1585
1586 if (php_mb_check_encoding(
1587 ZSTR_VAL(arg_str),
1588 ZSTR_LEN(arg_str),
1589 _php_mb_regex_mbctype2name(MBREX(current_mbctype))
1590 )) {
1591 MBREX(search_pos) = 0;
1592 RETVAL_TRUE;
1593 } else {
1594 MBREX(search_pos) = ZSTR_LEN(arg_str);
1595 RETVAL_FALSE;
1596 }
1597
1598 if (MBREX(search_regs) != NULL) {
1599 onig_region_free(MBREX(search_regs), 1);
1600 MBREX(search_regs) = NULL;
1601 }
1602 }
1603 /* }}} */
1604
1605 /* {{{ proto array mb_ereg_search_getregs(void)
1606 Get matched substring of the last time */
PHP_FUNCTION(mb_ereg_search_getregs)1607 PHP_FUNCTION(mb_ereg_search_getregs)
1608 {
1609 size_t n, i, len, beg, end;
1610 OnigUChar *str;
1611
1612 if (MBREX(search_regs) != NULL && Z_TYPE(MBREX(search_str)) == IS_STRING) {
1613 array_init(return_value);
1614
1615 str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
1616 len = Z_STRLEN(MBREX(search_str));
1617 n = MBREX(search_regs)->num_regs;
1618 for (i = 0; i < n; i++) {
1619 beg = MBREX(search_regs)->beg[i];
1620 end = MBREX(search_regs)->end[i];
1621 if (beg >= 0 && beg <= end && end <= len) {
1622 add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
1623 } else {
1624 add_index_bool(return_value, i, 0);
1625 }
1626 }
1627 if (onig_number_of_names(MBREX(search_re)) > 0) {
1628 mb_regex_groups_iter_args args = {
1629 return_value,
1630 Z_STRVAL(MBREX(search_str)),
1631 len,
1632 MBREX(search_regs)
1633 };
1634 onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
1635 }
1636 } else {
1637 RETVAL_FALSE;
1638 }
1639 }
1640 /* }}} */
1641
1642 /* {{{ proto int mb_ereg_search_getpos(void)
1643 Get search start position */
PHP_FUNCTION(mb_ereg_search_getpos)1644 PHP_FUNCTION(mb_ereg_search_getpos)
1645 {
1646 RETVAL_LONG(MBREX(search_pos));
1647 }
1648 /* }}} */
1649
1650 /* {{{ proto bool mb_ereg_search_setpos(int position)
1651 Set search start position */
PHP_FUNCTION(mb_ereg_search_setpos)1652 PHP_FUNCTION(mb_ereg_search_setpos)
1653 {
1654 zend_long position;
1655
1656 if (zend_parse_parameters(ZEND_NUM_ARGS(), "l", &position) == FAILURE) {
1657 return;
1658 }
1659
1660 /* Accept negative position if length of search string can be determined */
1661 if ((position < 0) && (!Z_ISUNDEF(MBREX(search_str))) && (Z_TYPE(MBREX(search_str)) == IS_STRING)) {
1662 position += Z_STRLEN(MBREX(search_str));
1663 }
1664
1665 if (position < 0 || (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING && (size_t)position > Z_STRLEN(MBREX(search_str)))) {
1666 php_error_docref(NULL, E_WARNING, "Position is out of range");
1667 MBREX(search_pos) = 0;
1668 RETURN_FALSE;
1669 }
1670
1671 MBREX(search_pos) = position;
1672 RETURN_TRUE;
1673 }
1674 /* }}} */
1675
1676 /* {{{ php_mb_regex_set_options */
_php_mb_regex_set_options(OnigOptionType options,OnigSyntaxType * syntax,OnigOptionType * prev_options,OnigSyntaxType ** prev_syntax)1677 static void _php_mb_regex_set_options(OnigOptionType options, OnigSyntaxType *syntax, OnigOptionType *prev_options, OnigSyntaxType **prev_syntax)
1678 {
1679 if (prev_options != NULL) {
1680 *prev_options = MBREX(regex_default_options);
1681 }
1682 if (prev_syntax != NULL) {
1683 *prev_syntax = MBREX(regex_default_syntax);
1684 }
1685 MBREX(regex_default_options) = options;
1686 MBREX(regex_default_syntax) = syntax;
1687 }
1688 /* }}} */
1689
1690 /* {{{ proto string mb_regex_set_options([string options])
1691 Set or get the default options for mbregex functions */
PHP_FUNCTION(mb_regex_set_options)1692 PHP_FUNCTION(mb_regex_set_options)
1693 {
1694 OnigOptionType opt;
1695 OnigSyntaxType *syntax;
1696 char *string = NULL;
1697 size_t string_len;
1698 char buf[16];
1699
1700 if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s",
1701 &string, &string_len) == FAILURE) {
1702 RETURN_FALSE;
1703 }
1704 if (string != NULL) {
1705 opt = 0;
1706 syntax = NULL;
1707 _php_mb_regex_init_options(string, string_len, &opt, &syntax, NULL);
1708 _php_mb_regex_set_options(opt, syntax, NULL, NULL);
1709 } else {
1710 opt = MBREX(regex_default_options);
1711 syntax = MBREX(regex_default_syntax);
1712 }
1713 _php_mb_regex_get_option_string(buf, sizeof(buf), opt, syntax);
1714
1715 RETVAL_STRING(buf);
1716 }
1717 /* }}} */
1718
1719 #endif /* HAVE_MBREGEX */
1720
1721 /*
1722 * Local variables:
1723 * tab-width: 4
1724 * c-basic-offset: 4
1725 * End:
1726 * vim600: fdm=marker
1727 * vim: noet sw=4 ts=4
1728 */
1729