xref: /PHP-7.3/ext/mbstring/php_mbregex.c (revision 560ff972)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 7                                                        |
4    +----------------------------------------------------------------------+
5    | Copyright (c) 1997-2018 The PHP Group                                |
6    +----------------------------------------------------------------------+
7    | This source file is subject to version 3.01 of the PHP license,      |
8    | that is bundled with this package in the file LICENSE, and is        |
9    | available through the world-wide-web at the following url:           |
10    | http://www.php.net/license/3_01.txt                                  |
11    | If you did not receive a copy of the PHP license and are unable to   |
12    | obtain it through the world-wide-web, please send a note to          |
13    | license@php.net so we can mail you a copy immediately.               |
14    +----------------------------------------------------------------------+
15    | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp>              |
16    +----------------------------------------------------------------------+
17  */
18 
19 #ifdef HAVE_CONFIG_H
20 #include "config.h"
21 #endif
22 
23 #include "php.h"
24 #include "php_ini.h"
25 
26 #if HAVE_MBREGEX
27 
28 #include "zend_smart_str.h"
29 #include "ext/standard/info.h"
30 #include "php_mbregex.h"
31 #include "mbstring.h"
32 
33 #include "php_onig_compat.h" /* must come prior to the oniguruma header */
34 #include <oniguruma.h>
35 #undef UChar
36 
/*
 * Compatibility shims used when ONIGURUMA_VERSION_INT < 60800.
 * The match-param API is stubbed out: OnigMatchParam becomes void, the
 * constructor yields NULL, the setters/destructor expand to nothing, and the
 * *_with_param entry points forward to plain onig_search()/onig_match(),
 * dropping the trailing match-param argument.
 */
#if ONIGURUMA_VERSION_INT < 60800
typedef void OnigMatchParam;
#define onig_new_match_param() (NULL)
#define onig_initialize_match_param(x) (void)(x)
#define onig_set_match_stack_limit_size_of_match_param(x, y)
#define onig_free_match_param(x)
#define onig_search_with_param(reg, str, end, start, range, region, option, mp) \
		onig_search(reg, str, end, start, range, region, option)
#define onig_match_with_param(re, str, end, at, region, option, mp) \
		onig_match(re, str, end, at, region, option)
#endif
48 
49 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
50 
/* State of the mbregex sub-module, reached through MBREX(). */
struct _zend_mb_regex_globals {
	OnigEncoding default_mbctype;          /* encoding restored into current_mbctype at request shutdown */
	OnigEncoding current_mbctype;          /* encoding used when compiling/matching patterns */
	HashTable ht_rc;                       /* compiled-regex cache keyed by pattern bytes; created in RINIT, destroyed in RSHUTDOWN */
	zval search_str;                       /* released and reset to UNDEF at request shutdown */
	zval *search_str_val;
	size_t search_pos;                     /* reset to 0 at request shutdown */
	php_mb_regex_t *search_re;             /* cleared when the cache entry it points at is about to be replaced */
	OnigRegion *search_regs;               /* freed at request shutdown if non-NULL */
	OnigOptionType regex_default_options;  /* options applied when the caller supplies no option string */
	OnigSyntaxType *regex_default_syntax;  /* syntax applied when the caller supplies no option string */
};

/* Access field `g` of the mbregex globals hanging off the mbstring module globals. */
#define MBREX(g) (MBSTRG(mb_regex_globals)->g)
65 
66 /* {{{ static void php_mb_regex_free_cache() */
/* Hash-table element destructor for the regex cache: frees the compiled
 * regex stored as the element's pointer payload. */
static void php_mb_regex_free_cache(zval *el) {
	onig_free((php_mb_regex_t *)Z_PTR_P(el));
}
70 /* }}} */
71 
72 /* {{{ _php_mb_regex_globals_ctor */
/*
 * Put a freshly allocated globals structure into its initial state.
 * Both encodings start out as UTF-8, the search state is empty, and the
 * defaults are Ruby syntax with MULTILINE|SINGLELINE options.
 * ht_rc and search_str_val are not touched here. Always returns SUCCESS.
 */
static int _php_mb_regex_globals_ctor(zend_mb_regex_globals *pglobals)
{
	/* encodings */
	pglobals->current_mbctype = ONIG_ENCODING_UTF8;
	pglobals->default_mbctype = ONIG_ENCODING_UTF8;

	/* state of the incremental search */
	ZVAL_UNDEF(&pglobals->search_str);
	pglobals->search_pos = 0;
	pglobals->search_re = NULL;
	pglobals->search_regs = NULL;

	/* compile-time defaults */
	pglobals->regex_default_syntax = ONIG_SYNTAX_RUBY;
	pglobals->regex_default_options = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;

	return SUCCESS;
}
85 /* }}} */
86 
87 /* {{{ _php_mb_regex_globals_dtor */
/* Counterpart of _php_mb_regex_globals_ctor; currently there is nothing to
 * release, so the body is intentionally empty. */
static void _php_mb_regex_globals_dtor(zend_mb_regex_globals *pglobals)
{
}
91 /* }}} */
92 
93 /* {{{ php_mb_regex_globals_alloc */
php_mb_regex_globals_alloc(void)94 zend_mb_regex_globals *php_mb_regex_globals_alloc(void)
95 {
96 	zend_mb_regex_globals *pglobals = pemalloc(
97 			sizeof(zend_mb_regex_globals), 1);
98 	if (SUCCESS != _php_mb_regex_globals_ctor(pglobals)) {
99 		pefree(pglobals, 1);
100 		return NULL;
101 	}
102 	return pglobals;
103 }
104 /* }}} */
105 
106 /* {{{ php_mb_regex_globals_free */
/*
 * Destroy and release a globals structure obtained from
 * php_mb_regex_globals_alloc(). Passing NULL is a no-op.
 */
void php_mb_regex_globals_free(zend_mb_regex_globals *pglobals)
{
	if (pglobals != NULL) {
		_php_mb_regex_globals_dtor(pglobals);
		pefree(pglobals, 1);
	}
}
115 /* }}} */
116 
117 /* {{{ PHP_MINIT_FUNCTION(mb_regex) */
/* Module startup: initialise the Oniguruma library. Always returns SUCCESS. */
PHP_MINIT_FUNCTION(mb_regex)
{
	onig_init();
	return SUCCESS;
}
123 /* }}} */
124 
125 /* {{{ PHP_MSHUTDOWN_FUNCTION(mb_regex) */
/* Module shutdown: tear down the Oniguruma library. Always returns SUCCESS. */
PHP_MSHUTDOWN_FUNCTION(mb_regex)
{
	onig_end();
	return SUCCESS;
}
131 /* }}} */
132 
133 /* {{{ PHP_RINIT_FUNCTION(mb_regex) */
PHP_RINIT_FUNCTION(mb_regex)134 PHP_RINIT_FUNCTION(mb_regex)
135 {
136 	if (!MBSTRG(mb_regex_globals)) return FAILURE;
137 	zend_hash_init(&MBREX(ht_rc), 0, NULL, php_mb_regex_free_cache, 0);
138 	return SUCCESS;
139 }
140 /* }}} */
141 
142 /* {{{ PHP_RSHUTDOWN_FUNCTION(mb_regex) */
PHP_RSHUTDOWN_FUNCTION(mb_regex)143 PHP_RSHUTDOWN_FUNCTION(mb_regex)
144 {
145 	MBREX(current_mbctype) = MBREX(default_mbctype);
146 
147 	if (!Z_ISUNDEF(MBREX(search_str))) {
148 		zval_ptr_dtor(&MBREX(search_str));
149 		ZVAL_UNDEF(&MBREX(search_str));
150 	}
151 	MBREX(search_pos) = 0;
152 	MBREX(search_re) = NULL;
153 
154 	if (MBREX(search_regs) != NULL) {
155 		onig_region_free(MBREX(search_regs), 1);
156 		MBREX(search_regs) = (OnigRegion *)NULL;
157 	}
158 	zend_hash_destroy(&MBREX(ht_rc));
159 
160 	return SUCCESS;
161 }
162 /* }}} */
163 
164 /* {{{ PHP_MINFO_FUNCTION(mb_regex) */
/*
 * Emit the mbregex rows of the phpinfo() table: an "enabled" marker, the
 * backtrack-check status (only when PHP_ONIG_BUNDLED is defined) and the
 * Oniguruma version the extension was compiled against.
 */
PHP_MINFO_FUNCTION(mb_regex)
{
	char buf[32];
	php_info_print_table_start();
	php_info_print_table_row(2, "Multibyte (japanese) regex support", "enabled");
	/* version string is built from the compile-time header macros */
	snprintf(buf, sizeof(buf), "%d.%d.%d",
			ONIGURUMA_VERSION_MAJOR,
			ONIGURUMA_VERSION_MINOR,
			ONIGURUMA_VERSION_TEENY);
#ifdef PHP_ONIG_BUNDLED
#ifdef USE_COMBINATION_EXPLOSION_CHECK
	php_info_print_table_row(2, "Multibyte regex (oniguruma) backtrack check", "On");
#else	/* USE_COMBINATION_EXPLOSION_CHECK */
	php_info_print_table_row(2, "Multibyte regex (oniguruma) backtrack check", "Off");
#endif	/* USE_COMBINATION_EXPLOSION_CHECK */
#endif /* PHP_ONIG_BUNDLED */
	php_info_print_table_row(2, "Multibyte regex (oniguruma) version", buf);
	php_info_print_table_end();
}
184 /* }}} */
185 
186 /*
187  * encoding name resolver
188  */
189 
190 /* {{{ encoding name map */
191 typedef struct _php_mb_regex_enc_name_map_t {
192 	const char *names;
193 	OnigEncoding code;
194 } php_mb_regex_enc_name_map_t;
195 
196 static const php_mb_regex_enc_name_map_t enc_name_map[] = {
197 #ifdef ONIG_ENCODING_EUC_JP
198 	{
199 		"EUC-JP\0EUCJP\0X-EUC-JP\0UJIS\0EUCJP\0EUCJP-WIN\0",
200 		ONIG_ENCODING_EUC_JP
201 	},
202 #endif
203 #ifdef ONIG_ENCODING_UTF8
204 	{
205 		"UTF-8\0UTF8\0",
206 		ONIG_ENCODING_UTF8
207 	},
208 #endif
209 #ifdef ONIG_ENCODING_UTF16_BE
210 	{
211 		"UTF-16\0UTF-16BE\0",
212 		ONIG_ENCODING_UTF16_BE
213 	},
214 #endif
215 #ifdef ONIG_ENCODING_UTF16_LE
216 	{
217 		"UTF-16LE\0",
218 		ONIG_ENCODING_UTF16_LE
219 	},
220 #endif
221 #ifdef ONIG_ENCODING_UTF32_BE
222 	{
223 		"UCS-4\0UTF-32\0UTF-32BE\0",
224 		ONIG_ENCODING_UTF32_BE
225 	},
226 #endif
227 #ifdef ONIG_ENCODING_UTF32_LE
228 	{
229 		"UCS-4LE\0UTF-32LE\0",
230 		ONIG_ENCODING_UTF32_LE
231 	},
232 #endif
233 #ifdef ONIG_ENCODING_SJIS
234 	{
235 		"SJIS\0CP932\0MS932\0SHIFT_JIS\0SJIS-WIN\0WINDOWS-31J\0",
236 		ONIG_ENCODING_SJIS
237 	},
238 #endif
239 #ifdef ONIG_ENCODING_BIG5
240 	{
241 		"BIG5\0BIG-5\0BIGFIVE\0CN-BIG5\0BIG-FIVE\0",
242 		ONIG_ENCODING_BIG5
243 	},
244 #endif
245 #ifdef ONIG_ENCODING_EUC_CN
246 	{
247 		"EUC-CN\0EUCCN\0EUC_CN\0GB-2312\0GB2312\0",
248 		ONIG_ENCODING_EUC_CN
249 	},
250 #endif
251 #ifdef ONIG_ENCODING_EUC_TW
252 	{
253 		"EUC-TW\0EUCTW\0EUC_TW\0",
254 		ONIG_ENCODING_EUC_TW
255 	},
256 #endif
257 #ifdef ONIG_ENCODING_EUC_KR
258 	{
259 		"EUC-KR\0EUCKR\0EUC_KR\0",
260 		ONIG_ENCODING_EUC_KR
261 	},
262 #endif
263 #if defined(ONIG_ENCODING_KOI8) && !PHP_ONIG_BAD_KOI8_ENTRY
264 	{
265 		"KOI8\0KOI-8\0",
266 		ONIG_ENCODING_KOI8
267 	},
268 #endif
269 #ifdef ONIG_ENCODING_KOI8_R
270 	{
271 		"KOI8R\0KOI8-R\0KOI-8R\0",
272 		ONIG_ENCODING_KOI8_R
273 	},
274 #endif
275 #ifdef ONIG_ENCODING_ISO_8859_1
276 	{
277 		"ISO-8859-1\0ISO8859-1\0ISO_8859_1\0ISO8859_1\0",
278 		ONIG_ENCODING_ISO_8859_1
279 	},
280 #endif
281 #ifdef ONIG_ENCODING_ISO_8859_2
282 	{
283 		"ISO-8859-2\0ISO8859-2\0ISO_8859_2\0ISO8859_2\0",
284 		ONIG_ENCODING_ISO_8859_2
285 	},
286 #endif
287 #ifdef ONIG_ENCODING_ISO_8859_3
288 	{
289 		"ISO-8859-3\0ISO8859-3\0ISO_8859_3\0ISO8859_3\0",
290 		ONIG_ENCODING_ISO_8859_3
291 	},
292 #endif
293 #ifdef ONIG_ENCODING_ISO_8859_4
294 	{
295 		"ISO-8859-4\0ISO8859-4\0ISO_8859_4\0ISO8859_4\0",
296 		ONIG_ENCODING_ISO_8859_4
297 	},
298 #endif
299 #ifdef ONIG_ENCODING_ISO_8859_5
300 	{
301 		"ISO-8859-5\0ISO8859-5\0ISO_8859_5\0ISO8859_5\0",
302 		ONIG_ENCODING_ISO_8859_5
303 	},
304 #endif
305 #ifdef ONIG_ENCODING_ISO_8859_6
306 	{
307 		"ISO-8859-6\0ISO8859-6\0ISO_8859_6\0ISO8859_6\0",
308 		ONIG_ENCODING_ISO_8859_6
309 	},
310 #endif
311 #ifdef ONIG_ENCODING_ISO_8859_7
312 	{
313 		"ISO-8859-7\0ISO8859-7\0ISO_8859_7\0ISO8859_7\0",
314 		ONIG_ENCODING_ISO_8859_7
315 	},
316 #endif
317 #ifdef ONIG_ENCODING_ISO_8859_8
318 	{
319 		"ISO-8859-8\0ISO8859-8\0ISO_8859_8\0ISO8859_8\0",
320 		ONIG_ENCODING_ISO_8859_8
321 	},
322 #endif
323 #ifdef ONIG_ENCODING_ISO_8859_9
324 	{
325 		"ISO-8859-9\0ISO8859-9\0ISO_8859_9\0ISO8859_9\0",
326 		ONIG_ENCODING_ISO_8859_9
327 	},
328 #endif
329 #ifdef ONIG_ENCODING_ISO_8859_10
330 	{
331 		"ISO-8859-10\0ISO8859-10\0ISO_8859_10\0ISO8859_10\0",
332 		ONIG_ENCODING_ISO_8859_10
333 	},
334 #endif
335 #ifdef ONIG_ENCODING_ISO_8859_11
336 	{
337 		"ISO-8859-11\0ISO8859-11\0ISO_8859_11\0ISO8859_11\0",
338 		ONIG_ENCODING_ISO_8859_11
339 	},
340 #endif
341 #ifdef ONIG_ENCODING_ISO_8859_13
342 	{
343 		"ISO-8859-13\0ISO8859-13\0ISO_8859_13\0ISO8859_13\0",
344 		ONIG_ENCODING_ISO_8859_13
345 	},
346 #endif
347 #ifdef ONIG_ENCODING_ISO_8859_14
348 	{
349 		"ISO-8859-14\0ISO8859-14\0ISO_8859_14\0ISO8859_14\0",
350 		ONIG_ENCODING_ISO_8859_14
351 	},
352 #endif
353 #ifdef ONIG_ENCODING_ISO_8859_15
354 	{
355 		"ISO-8859-15\0ISO8859-15\0ISO_8859_15\0ISO8859_15\0",
356 		ONIG_ENCODING_ISO_8859_15
357 	},
358 #endif
359 #ifdef ONIG_ENCODING_ISO_8859_16
360 	{
361 		"ISO-8859-16\0ISO8859-16\0ISO_8859_16\0ISO8859_16\0",
362 		ONIG_ENCODING_ISO_8859_16
363 	},
364 #endif
365 #ifdef ONIG_ENCODING_ASCII
366 	{
367 		"ASCII\0US-ASCII\0US_ASCII\0ISO646\0",
368 		ONIG_ENCODING_ASCII
369 	},
370 #endif
371 	{ NULL, ONIG_ENCODING_UNDEF }
372 };
373 /* }}} */
374 
375 /* {{{ php_mb_regex_name2mbctype */
_php_mb_regex_name2mbctype(const char * pname)376 static OnigEncoding _php_mb_regex_name2mbctype(const char *pname)
377 {
378 	const char *p;
379 	const php_mb_regex_enc_name_map_t *mapping;
380 
381 	if (pname == NULL || !*pname) {
382 		return ONIG_ENCODING_UNDEF;
383 	}
384 
385 	for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
386 		for (p = mapping->names; *p != '\0'; p += (strlen(p) + 1)) {
387 			if (strcasecmp(p, pname) == 0) {
388 				return mapping->code;
389 			}
390 		}
391 	}
392 
393 	return ONIG_ENCODING_UNDEF;
394 }
395 /* }}} */
396 
397 /* {{{ php_mb_regex_mbctype2name */
/*
 * Map an Oniguruma encoding back to a name.
 * Returns the alias list of the first table entry with that encoding; read
 * as a C string this is the entry's first alias. Returns NULL when the
 * encoding is not in the table.
 */
static const char *_php_mb_regex_mbctype2name(OnigEncoding mbctype)
{
	size_t idx;

	for (idx = 0; enc_name_map[idx].names != NULL; idx++) {
		if (enc_name_map[idx].code == mbctype) {
			return enc_name_map[idx].names;
		}
	}

	return NULL;
}
410 /* }}} */
411 
412 /* {{{ php_mb_regex_set_mbctype */
php_mb_regex_set_mbctype(const char * encname)413 int php_mb_regex_set_mbctype(const char *encname)
414 {
415 	OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
416 	if (mbctype == ONIG_ENCODING_UNDEF) {
417 		return FAILURE;
418 	}
419 	MBREX(current_mbctype) = mbctype;
420 	return SUCCESS;
421 }
422 /* }}} */
423 
424 /* {{{ php_mb_regex_set_default_mbctype */
php_mb_regex_set_default_mbctype(const char * encname)425 int php_mb_regex_set_default_mbctype(const char *encname)
426 {
427 	OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
428 	if (mbctype == ONIG_ENCODING_UNDEF) {
429 		return FAILURE;
430 	}
431 	MBREX(default_mbctype) = mbctype;
432 	return SUCCESS;
433 }
434 /* }}} */
435 
436 /* {{{ php_mb_regex_get_mbctype */
/* Name of the current regex encoding, or NULL if it has no entry in the
 * encoding name table. The returned string is static storage. */
const char *php_mb_regex_get_mbctype(void)
{
	return _php_mb_regex_mbctype2name(MBREX(current_mbctype));
}
441 /* }}} */
442 
443 /* {{{ php_mb_regex_get_default_mbctype */
/* Name of the default regex encoding, or NULL if it has no entry in the
 * encoding name table. The returned string is static storage. */
const char *php_mb_regex_get_default_mbctype(void)
{
	return _php_mb_regex_mbctype2name(MBREX(default_mbctype));
}
448 /* }}} */
449 
450 /*
451  * regex cache
452  */
453 /* {{{ php_mbregex_compile_pattern */
php_mbregex_compile_pattern(const char * pattern,size_t patlen,OnigOptionType options,OnigEncoding enc,OnigSyntaxType * syntax)454 static php_mb_regex_t *php_mbregex_compile_pattern(const char *pattern, size_t patlen, OnigOptionType options, OnigEncoding enc, OnigSyntaxType *syntax)
455 {
456 	int err_code = 0;
457 	php_mb_regex_t *retval = NULL, *rc = NULL;
458 	OnigErrorInfo err_info;
459 	OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
460 
461 	if (!php_mb_check_encoding(pattern, patlen, _php_mb_regex_mbctype2name(enc))) {
462 		php_error_docref(NULL, E_WARNING,
463 			"Pattern is not valid under %s encoding", _php_mb_regex_mbctype2name(enc));
464 		return NULL;
465 	}
466 
467 	rc = zend_hash_str_find_ptr(&MBREX(ht_rc), (char *)pattern, patlen);
468 	if (!rc || onig_get_options(rc) != options || onig_get_encoding(rc) != enc || onig_get_syntax(rc) != syntax) {
469 		if ((err_code = onig_new(&retval, (OnigUChar *)pattern, (OnigUChar *)(pattern + patlen), options, enc, syntax, &err_info)) != ONIG_NORMAL) {
470 			onig_error_code_to_str(err_str, err_code, &err_info);
471 			php_error_docref(NULL, E_WARNING, "mbregex compile err: %s", err_str);
472 			return NULL;
473 		}
474 		if (rc == MBREX(search_re)) {
475 			/* reuse the new rc? see bug #72399 */
476 			MBREX(search_re) = NULL;
477 		}
478 		zend_hash_str_update_ptr(&MBREX(ht_rc), (char *)pattern, patlen, retval);
479 	} else {
480 		retval = rc;
481 	}
482 	return retval;
483 }
484 /* }}} */
485 
486 /* {{{ _php_mb_regex_get_option_string */
/*
 * Render an option mask and syntax as the option-letter string understood by
 * _php_mb_regex_init_options().
 *
 * Letters are produced in a fixed order: i, x, then p (MULTILINE and
 * SINGLELINE together) or m / s individually, then l, n, and finally one
 * syntax letter if the syntax is a known one. A terminating NUL follows.
 *
 * At most `len` bytes are written to `str`. Returns 0 when everything,
 * including the NUL, fitted; otherwise returns the number of bytes that
 * would have been needed (the output is then truncated and not terminated).
 */
static size_t _php_mb_regex_get_option_string(char *str, size_t len, OnigOptionType option, OnigSyntaxType *syntax)
{
	/* i, x, one of p/m/s, l, n, syntax letter, NUL: seven bytes at most */
	char flags[8];
	size_t count = 0;
	size_t ncopy, k;
	const OnigOptionType both_lines = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
	char syn = 0;

	if (option & ONIG_OPTION_IGNORECASE) {
		flags[count++] = 'i';
	}
	if (option & ONIG_OPTION_EXTEND) {
		flags[count++] = 'x';
	}

	if ((option & both_lines) == both_lines) {
		flags[count++] = 'p';
	} else if (option & ONIG_OPTION_MULTILINE) {
		flags[count++] = 'm';
	} else if (option & ONIG_OPTION_SINGLELINE) {
		flags[count++] = 's';
	}

	if (option & ONIG_OPTION_FIND_LONGEST) {
		flags[count++] = 'l';
	}
	if (option & ONIG_OPTION_FIND_NOT_EMPTY) {
		flags[count++] = 'n';
	}

	if (syntax == ONIG_SYNTAX_JAVA) {
		syn = 'j';
	} else if (syntax == ONIG_SYNTAX_GNU_REGEX) {
		syn = 'u';
	} else if (syntax == ONIG_SYNTAX_GREP) {
		syn = 'g';
	} else if (syntax == ONIG_SYNTAX_EMACS) {
		syn = 'c';
	} else if (syntax == ONIG_SYNTAX_RUBY) {
		syn = 'r';
	} else if (syntax == ONIG_SYNTAX_PERL) {
		syn = 'z';
	} else if (syntax == ONIG_SYNTAX_POSIX_BASIC) {
		syn = 'b';
	} else if (syntax == ONIG_SYNTAX_POSIX_EXTENDED) {
		syn = 'd';
	}
	if (syn != 0) {
		flags[count++] = syn;
	}

	flags[count++] = '\0';

	/* hand over as much as the caller's buffer can take */
	ncopy = (len < count) ? len : count;
	for (k = 0; k < ncopy; k++) {
		str[k] = flags[k];
	}

	return (len < count) ? count : 0;
}
589 /* }}} */
590 
591 /* {{{ _php_mb_regex_init_options */
592 static void
_php_mb_regex_init_options(const char * parg,size_t narg,OnigOptionType * option,OnigSyntaxType ** syntax,int * eval)593 _php_mb_regex_init_options(const char *parg, size_t narg, OnigOptionType *option, OnigSyntaxType **syntax, int *eval)
594 {
595 	size_t n;
596 	char c;
597 	OnigOptionType optm = 0;
598 
599 	*syntax = ONIG_SYNTAX_RUBY;
600 
601 	if (parg != NULL) {
602 		n = 0;
603 		while(n < narg) {
604 			c = parg[n++];
605 			switch (c) {
606 				case 'i':
607 					optm |= ONIG_OPTION_IGNORECASE;
608 					break;
609 				case 'x':
610 					optm |= ONIG_OPTION_EXTEND;
611 					break;
612 				case 'm':
613 					optm |= ONIG_OPTION_MULTILINE;
614 					break;
615 				case 's':
616 					optm |= ONIG_OPTION_SINGLELINE;
617 					break;
618 				case 'p':
619 					optm |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
620 					break;
621 				case 'l':
622 					optm |= ONIG_OPTION_FIND_LONGEST;
623 					break;
624 				case 'n':
625 					optm |= ONIG_OPTION_FIND_NOT_EMPTY;
626 					break;
627 				case 'j':
628 					*syntax = ONIG_SYNTAX_JAVA;
629 					break;
630 				case 'u':
631 					*syntax = ONIG_SYNTAX_GNU_REGEX;
632 					break;
633 				case 'g':
634 					*syntax = ONIG_SYNTAX_GREP;
635 					break;
636 				case 'c':
637 					*syntax = ONIG_SYNTAX_EMACS;
638 					break;
639 				case 'r':
640 					*syntax = ONIG_SYNTAX_RUBY;
641 					break;
642 				case 'z':
643 					*syntax = ONIG_SYNTAX_PERL;
644 					break;
645 				case 'b':
646 					*syntax = ONIG_SYNTAX_POSIX_BASIC;
647 					break;
648 				case 'd':
649 					*syntax = ONIG_SYNTAX_POSIX_EXTENDED;
650 					break;
651 				case 'e':
652 					if (eval != NULL) *eval = 1;
653 					break;
654 				default:
655 					break;
656 			}
657 		}
658 		if (option != NULL) *option|=optm;
659 	}
660 }
661 /* }}} */
662 
663 
664 /*
665  * Callbacks for named subpatterns
666  */
667 
/* {{{ struct mb_regex_groups_iter_args */
/* Context handed to mb_regex_groups_iter() through onig_foreach_name(). */
typedef struct mb_regex_groups_iter_args {
	zval		*groups;     /* array that receives one entry per group name */
	char		*search_str; /* subject the match offsets refer to */
	size_t		search_len;  /* length of search_str in bytes */
	OnigRegion	*region;     /* capture offsets of the match */
} mb_regex_groups_iter_args;
675 /* }}} */
676 
/* {{{ mb_regex_groups_iter */
678 static int
mb_regex_groups_iter(const OnigUChar * name,const OnigUChar * name_end,int ngroup_num,int * group_nums,regex_t * reg,void * parg)679 mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngroup_num, int* group_nums, regex_t* reg, void* parg)
680 {
681 	mb_regex_groups_iter_args *args = (mb_regex_groups_iter_args *) parg;
682 	int gn, beg, end;
683 
684 	/*
685 	 * In case of duplicate groups, keep only the last succeeding one
686 	 * to be consistent with preg_match with the PCRE_DUPNAMES option.
687 	 */
688 	gn = onig_name_to_backref_number(reg, name, name_end, args->region);
689 	beg = args->region->beg[gn];
690 	end = args->region->end[gn];
691 	if (beg >= 0 && beg < end && end <= args->search_len) {
692 		add_assoc_stringl_ex(args->groups, (char *)name, name_end - name, &args->search_str[beg], end - beg);
693 	} else {
694 		add_assoc_bool_ex(args->groups, (char *)name, name_end - name, 0);
695 	}
696 
697 	return 0;
698 }
699 /* }}} */
700 
701 /*
702  * Helper for _php_mb_regex_ereg_replace_exec
703  */
704 /* {{{ mb_regex_substitute */
/*
 * Expand a replacement template into pbuf.
 *
 *   pbuf                  output buffer, appended to
 *   subject, subject_len  string the match offsets in regs refer to
 *   replace, replace_len  replacement template
 *   regexp                regex that produced regs (used to resolve names)
 *   regs                  capture offsets of the current match
 *   enc                   encoding used to step through the template
 *
 * Recognised references: \0 .. \9, \k<name>, \k'name', and \k<N> / \k'N'
 * with a decimal group number. Numeric references are left untouched when
 * the regex does not capture unnamed groups. Anything that is not a valid
 * reference is copied verbatim, including the backslash. A valid reference
 * to a group that did not match (or matched the empty string) expands to
 * nothing.
 *
 * No byte beyond replace + replace_len is read or copied, even when the
 * template ends in the middle of an escape or of a multibyte character.
 */
static inline void mb_regex_substitute(
	smart_str *pbuf,
	const char *subject,
	size_t subject_len,
	char *replace,
	size_t replace_len,
	php_mb_regex_t *regexp,
	OnigRegion *regs,
	const mbfl_encoding *enc
) {
	char *p, *sp, *eos;
	int no; /* backreference group number */
	int clen; /* byte-length of the current character */

	p = replace;
	eos = replace + replace_len;

	while (p < eos) {
		clen = (int) php_mb_mbchar_bytes_ex(p, enc);
		if (clen != 1 || p[0] != '\\') {
			/* copy anything that's not an ascii backslash; a truncated
			 * trailing multibyte char is cut off at the end of the template */
			if (clen > eos - p) {
				clen = (int)(eos - p);
			}
			smart_str_appendl(pbuf, p, clen);
			p += clen;
			continue;
		}
		sp = p; /* save position */
		++p;
		if (p == eos) {
			/* lone backslash at the very end */
			smart_str_appendl(pbuf, sp, p - sp);
			continue;
		}
		clen = (int) php_mb_mbchar_bytes_ex(p, enc);
		if (clen != 1) {
			/* backslash followed by multibyte char: emit the backslash only,
			 * the character is handled by the next iteration */
			smart_str_appendl(pbuf, sp, p - sp);
			continue;
		}
		no = -1;
		switch (p[0]) {
			case '0':
				no = 0;
				p++;
				break;
			case '1': case '2': case '3': case '4':
			case '5': case '6': case '7': case '8': case '9':
				if (!onig_noname_group_capture_is_active(regexp)) {
					/*
					 * FIXME:
					 * Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern.
					 * For now we just ignore them, but in the future we might want to raise a warning
					 * and abort the whole replace operation.
					 */
					p++;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				no = p[0] - '0';
				p++;
				break;
			case 'k': {
				char delim;
				char *name, *name_end;
				char maybe_num = 1;

				++p;
				if (p == eos) {
					/* "\k" at the very end */
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				clen = (int) php_mb_mbchar_bytes_ex(p, enc);
				if (clen != 1 || (p[0] != '<' && p[0] != '\'')) {
					/* not a backref delimiter */
					if (clen > eos - p) {
						clen = (int)(eos - p);
					}
					p += clen;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				/* try to consume everything until next delimiter */
				delim = p[0] == '<' ? '>' : '\'';
				name_end = name = p + 1;
				while (name_end < eos) {
					clen = (int) php_mb_mbchar_bytes_ex(name_end, enc);
					if (clen != 1) {
						/* never step past the end of the template */
						if (clen > eos - name_end) {
							name_end = eos;
						} else {
							name_end += clen;
						}
						maybe_num = 0;
						continue;
					}
					if (name_end[0] == delim) break;
					/* isdigit() is only defined for unsigned char values */
					if (maybe_num && !isdigit((unsigned char)name_end[0])) maybe_num = 0;
					name_end++;
				}
				if (name_end >= eos) {
					/* we failed to find the end delimiter: copy the rest verbatim */
					p = eos;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				p = name_end + 1;
				if (name_end - name < 1) {
					/* the backref was empty */
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				/* we have either a name or a number */
				if (maybe_num) {
					unsigned long num;

					if (!onig_noname_group_capture_is_active(regexp)) {
						/* see above note on mixing numbered & named backrefs */
						smart_str_appendl(pbuf, sp, p - sp);
						continue;
					}
					if (name_end - name == 1) {
						no = name[0] - '0';
						break;
					}
					if (name[0] == '0') {
						/* 01 is not a valid number */
						break;
					}
					/* parsing stops at the delimiter; values that do not name
					 * an existing group leave no at -1 instead of being
					 * truncated to int */
					num = strtoul(name, NULL, 10);
					if (regs->num_regs > 0 && num < (unsigned long)regs->num_regs) {
						no = (int) num;
					}
					break;
				}
				no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs);
				break;
			}
			default:
				/* We're not treating \ as an escape character and will interpret something like
				 * \\1 as \ followed by \1, rather than \\ followed by 1. This is because this
				 * function has not supported escaping of backslashes historically. */
				smart_str_appendl(pbuf, sp, p - sp);
				continue;
		}
		if (no < 0 || no >= regs->num_regs) {
			/* invalid group number reference, keep the escape sequence in the output */
			smart_str_appendl(pbuf, sp, p - sp);
			continue;
		}
		if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) {
			smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]);
		}
	}
}
830 /* }}} */
831 
832 /*
833  * php functions
834  */
835 
836 /* {{{ proto string mb_regex_encoding([string encoding])
837    Returns the current encoding for regex as a string. */
PHP_FUNCTION(mb_regex_encoding)838 PHP_FUNCTION(mb_regex_encoding)
839 {
840 	char *encoding = NULL;
841 	size_t encoding_len;
842 	OnigEncoding mbctype;
843 
844 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s", &encoding, &encoding_len) == FAILURE) {
845 		return;
846 	}
847 
848 	if (!encoding) {
849 		const char *retval = _php_mb_regex_mbctype2name(MBREX(current_mbctype));
850 
851 		if (retval == NULL) {
852 			RETURN_FALSE;
853 		}
854 
855 		RETURN_STRING((char *)retval);
856 	} else {
857 		mbctype = _php_mb_regex_name2mbctype(encoding);
858 
859 		if (mbctype == ONIG_ENCODING_UNDEF) {
860 			php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", encoding);
861 			RETURN_FALSE;
862 		}
863 
864 		MBREX(current_mbctype) = mbctype;
865 		RETURN_TRUE;
866 	}
867 }
868 /* }}} */
869 
870 /* {{{ _php_mb_onig_search */
/*
 * onig_search() wrapper that applies the configured match stack limit.
 * A fresh match-param object is created for each call; the limit from
 * MBSTRG(regex_stack_limit) is installed unless ZEND_LONG_UINT_OVFL()
 * rejects the value. Returns whatever the underlying search returns.
 */
static int _php_mb_onig_search(regex_t* reg, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start,
                   const OnigUChar* range, OnigRegion* region, OnigOptionType option) {
	int status;
	OnigMatchParam *param = onig_new_match_param();

	onig_initialize_match_param(param);
	if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_stack_limit))) {
		onig_set_match_stack_limit_size_of_match_param(param, (unsigned int)MBSTRG(regex_stack_limit));
	}

	/* search */
	status = onig_search_with_param(reg, str, end, start, range, region, option, param);

	onig_free_match_param(param);
	return status;
}
884 /* }}} */
885 
886 
887 /* {{{ _php_mb_regex_ereg_exec */
/*
 * Shared implementation of mb_ereg() and mb_eregi().
 *
 *   icase  non-zero adds ONIG_OPTION_IGNORECASE to the default options
 *
 * PHP arguments: pattern, subject string and an optional third argument that
 * receives the captures.
 *
 * Result: false on argument, encoding, compile or search failure (including
 * "no match"). On a match without a third argument the result is 1. With a
 * third argument it is the byte length of the whole match, with 0 reported
 * as 1.
 */
static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase)
{
	zval *arg_pattern, *array = NULL;
	char *string;
	size_t string_len;
	php_mb_regex_t *re;
	OnigRegion *regs = NULL;
	int i, match_len, beg, end;
	OnigOptionType options;
	char *str;

	if (zend_parse_parameters(ZEND_NUM_ARGS(), "zs|z/", &arg_pattern, &string, &string_len, &array) == FAILURE) {
		RETURN_FALSE;
	}

	/* the output argument is reset to an empty array before anything else,
	 * so it is empty on every failure path below */
	if (array != NULL) {
		zval_ptr_dtor(array);
		array_init(array);
	}

	/* refuse subjects that are not valid in the current regex encoding */
	if (!php_mb_check_encoding(
		string,
		string_len,
		_php_mb_regex_mbctype2name(MBREX(current_mbctype))
	)) {
		RETURN_FALSE;
	}

	options = MBREX(regex_default_options);
	if (icase) {
		options |= ONIG_OPTION_IGNORECASE;
	}

	/* compile the regular expression from the supplied regex */
	if (Z_TYPE_P(arg_pattern) != IS_STRING) {
		/* we convert numbers to integers and treat them as a string */
		if (Z_TYPE_P(arg_pattern) == IS_DOUBLE) {
			convert_to_long_ex(arg_pattern);	/* get rid of decimal places */
		}
		convert_to_string_ex(arg_pattern);
		/* don't bother doing an extended regex with just a number */
	}

	if (Z_STRLEN_P(arg_pattern) == 0) {
		php_error_docref(NULL, E_WARNING, "empty pattern");
		RETVAL_FALSE;
		goto out;
	}

	/* the compiled regex belongs to the cache; it is not freed here */
	re = php_mbregex_compile_pattern(Z_STRVAL_P(arg_pattern), Z_STRLEN_P(arg_pattern), options, MBREX(current_mbctype), MBREX(regex_default_syntax));
	if (re == NULL) {
		RETVAL_FALSE;
		goto out;
	}

	regs = onig_region_new();

	/* actually execute the regular expression */
	if (_php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, (OnigUChar *)(string + string_len), regs, 0) < 0) {
		RETVAL_FALSE;
		goto out;
	}

	/* without an output array the function only reports "matched" */
	match_len = 1;
	str = string;
	if (array != NULL) {

		match_len = regs->end[0] - regs->beg[0];
		/* numbered groups: substring, or false for unmatched/empty groups */
		for (i = 0; i < regs->num_regs; i++) {
			beg = regs->beg[i];
			end = regs->end[i];
			if (beg >= 0 && beg < end && (size_t)end <= string_len) {
				add_index_stringl(array, i, (char *)&str[beg], end - beg);
			} else {
				add_index_bool(array, i, 0);
			}
		}

		/* named groups are added under their names as well */
		if (onig_number_of_names(re) > 0) {
			mb_regex_groups_iter_args args = {array, string, string_len, regs};
			onig_foreach_name(re, mb_regex_groups_iter, &args);
		}
	}

	/* an empty match is still reported as 1 */
	if (match_len == 0) {
		match_len = 1;
	}
	RETVAL_LONG(match_len);
out:
	/* regs is NULL when we bail out before the search was set up */
	if (regs != NULL) {
		onig_region_free(regs, 1);
	}
}
981 /* }}} */
982 
983 /* {{{ proto int mb_ereg(string pattern, string string [, array registers])
984    Regular expression match for multibyte string */
/* Case-sensitive entry point: forwards to the shared implementation with icase = 0. */
PHP_FUNCTION(mb_ereg)
{
	_php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
}
989 /* }}} */
990 
991 /* {{{ proto int mb_eregi(string pattern, string string [, array registers])
992    Case-insensitive regular expression match for multibyte string */
/* Case-insensitive entry point: forwards to the shared implementation with icase = 1. */
PHP_FUNCTION(mb_eregi)
{
	_php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
}
997 /* }}} */
998 
999 /* {{{ _php_mb_regex_ereg_replace_exec */
/*
 * Shared worker for mb_ereg_replace(), mb_eregi_replace() and
 * mb_ereg_replace_callback().
 *
 * options     - Oniguruma option bits forced by the calling entry point.
 * is_callable - non-zero: the 2nd PHP argument is parsed as a callback
 *               ("zfs|s"); zero: it is parsed as a replacement string
 *               ("zss|s").
 *
 * Result (via return_value): the rewritten subject string on success, FALSE
 * on parameter, pattern, search or eval failure, and NULL when the subject
 * fails the encoding check for the current regex encoding.
 */
static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOptionType options, int is_callable)
{
	zval *arg_pattern_zval;

	char *arg_pattern;
	size_t arg_pattern_len;

	/* only filled in by the "zss|s" (string replacement) parse */
	char *replace = NULL;
	size_t replace_len = 0;

	/* only filled in by the "zfs|s" (callback) parse */
	zend_fcall_info arg_replace_fci;
	zend_fcall_info_cache arg_replace_fci_cache;

	char *string;
	size_t string_len;

	php_mb_regex_t *re;
	OnigSyntaxType *syntax;
	OnigRegion *regs = NULL;
	smart_str out_buf = {0};
	smart_str eval_buf = {0};
	smart_str *pbuf;
	int err, eval, n;
	OnigUChar *pos;
	OnigUChar *string_lim;
	char *description = NULL;
	char pat_buf[6];

	const mbfl_encoding *enc;

	/* resolve the mbfl encoding that corresponds to the current regex encoding */
	{
		const char *current_enc_name;
		current_enc_name = _php_mb_regex_mbctype2name(MBREX(current_mbctype));
		if (current_enc_name == NULL ||
			(enc = mbfl_name2encoding(current_enc_name)) == NULL) {
			php_error_docref(NULL, E_WARNING, "Unknown error");
			RETURN_FALSE;
		}
	}
	eval = 0;
	{
		char *option_str = NULL;
		size_t option_str_len = 0;

		if (!is_callable) {
			if (zend_parse_parameters(ZEND_NUM_ARGS(), "zss|s",
						&arg_pattern_zval,
						&replace, &replace_len,
						&string, &string_len,
						&option_str, &option_str_len) == FAILURE) {
				RETURN_FALSE;
			}
		} else {
			if (zend_parse_parameters(ZEND_NUM_ARGS(), "zfs|s",
						&arg_pattern_zval,
						&arg_replace_fci, &arg_replace_fci_cache,
						&string, &string_len,
						&option_str, &option_str_len) == FAILURE) {
				RETURN_FALSE;
			}
		}

		/* a subject that is invalid in the regex encoding yields NULL */
		if (!php_mb_check_encoding(
		string,
		string_len,
		_php_mb_regex_mbctype2name(MBREX(current_mbctype))
		)) {
			RETURN_NULL();
		}

		if (option_str != NULL) {
			_php_mb_regex_init_options(option_str, option_str_len, &options, &syntax, &eval);
		} else {
			options |= MBREX(regex_default_options);
			syntax = MBREX(regex_default_syntax);
		}
	}
	if (eval && !is_callable) {
		php_error_docref(NULL, E_DEPRECATED, "The 'e' option is deprecated, use mb_ereg_replace_callback instead");
	}
	if (Z_TYPE_P(arg_pattern_zval) == IS_STRING) {
		arg_pattern = Z_STRVAL_P(arg_pattern_zval);
		arg_pattern_len = Z_STRLEN_P(arg_pattern_zval);
	} else {
		/* FIXME: this code is not multibyte aware! */
		/* a non-string pattern is treated as a single character code */
		convert_to_long_ex(arg_pattern_zval);
		pat_buf[0] = (char)Z_LVAL_P(arg_pattern_zval);
		pat_buf[1] = '\0';
		pat_buf[2] = '\0';
		pat_buf[3] = '\0';
		pat_buf[4] = '\0';
		pat_buf[5] = '\0';

		arg_pattern = pat_buf;
		arg_pattern_len = 1;
	}
	/* create regex pattern buffer */
	re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, MBREX(current_mbctype), syntax);
	if (re == NULL) {
		RETURN_FALSE;
	}

	/*
	 * Reject 'e' combined with a callback before anything is allocated for
	 * the replacement phase, so this early return has nothing to release.
	 */
	if (is_callable && eval) {
		php_error_docref(NULL, E_WARNING, "Option 'e' cannot be used with replacement callback");
		RETURN_FALSE;
	}

	/*
	 * In eval mode the substituted replacement is collected in eval_buf and
	 * evaluated; in plain mode it goes straight to the output buffer.  The
	 * compiled-string description is only needed by zend_eval_stringl().
	 */
	if (eval || is_callable) {
		pbuf = &eval_buf;
	} else {
		pbuf = &out_buf;
	}
	if (eval) {
		description = zend_make_compiled_string_description("mbregex replace");
	}

	/* do the actual work */
	err = 0;
	pos = (OnigUChar *)string;
	string_lim = (OnigUChar*)(string + string_len);
	regs = onig_region_new();
	while (err >= 0) {
		err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)string_lim, pos, (OnigUChar *)string_lim, regs, 0);
		if (err <= -2) {
			OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
			onig_error_code_to_str(err_str, err);
			php_error_docref(NULL, E_WARNING, "mbregex search failure in php_mbereg_replace_exec(): %s", err_str);
			break;
		}
		if (err >= 0) {
			/* copy the part of the string before the match */
			smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos));

			if (!is_callable) {
				mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc);
			}

			if (eval) {
				zval v;
				zend_string *eval_str;
				/* null terminate buffer */
				smart_str_0(&eval_buf);

				if (eval_buf.s) {
					eval_str = eval_buf.s;
				} else {
					eval_str = ZSTR_EMPTY_ALLOC();
				}

				/* do eval */
				if (zend_eval_stringl(ZSTR_VAL(eval_str), ZSTR_LEN(eval_str), &v, description) == FAILURE) {
					efree(description);
					zend_throw_error(NULL, "Failed evaluating code: %s%s", PHP_EOL, ZSTR_VAL(eval_str));
					/* returning from here: release the region object itself too */
					onig_region_free(regs, 1);
					smart_str_free(&out_buf);
					smart_str_free(&eval_buf);
					RETURN_FALSE;
				}

				/* result of eval */
				convert_to_string(&v);
				smart_str_appendl(&out_buf, Z_STRVAL(v), Z_STRLEN(v));
				/* Clean up */
				smart_str_free(&eval_buf);
				zval_ptr_dtor_str(&v);
			} else if (is_callable) {
				zval args[1];
				zval subpats, retval;
				int i;

				/* build the match array handed to the callback */
				array_init(&subpats);
				for (i = 0; i < regs->num_regs; i++) {
					add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]);
				}
				if (onig_number_of_names(re) > 0) {
					mb_regex_groups_iter_args iter_args = {&subpats, string, string_len, regs};
					onig_foreach_name(re, mb_regex_groups_iter, &iter_args);
				}

				ZVAL_COPY_VALUE(&args[0], &subpats);

				arg_replace_fci.param_count = 1;
				arg_replace_fci.params = args;
				arg_replace_fci.retval = &retval;
				if (zend_call_function(&arg_replace_fci, &arg_replace_fci_cache) == SUCCESS &&
						!Z_ISUNDEF(retval)) {
					convert_to_string_ex(&retval);
					smart_str_appendl(&out_buf, Z_STRVAL(retval), Z_STRLEN(retval));
					zval_ptr_dtor(&retval);
				} else {
					if (!EG(exception)) {
						php_error_docref(NULL, E_WARNING, "Unable to call custom replacement function");
					}
				}
				zval_ptr_dtor(&subpats);
			}

			/*
			 * Move the cursor past the match.  When the match does not end
			 * beyond the cursor, copy one byte through and step forward by
			 * one so the loop always makes progress.
			 */
			n = regs->end[0];
			if ((pos - (OnigUChar *)string) < n) {
				pos = (OnigUChar *)string + n;
			} else {
				if (pos < string_lim) {
					smart_str_appendl(&out_buf, (char *)pos, 1);
				}
				pos++;
			}
		} else { /* nomatch */
			/* stick that last bit of string on our output */
			if (string_lim - pos > 0) {
				smart_str_appendl(&out_buf, (char *)pos, string_lim - pos);
			}
		}
		/* drop this iteration's capture data but keep the region object */
		onig_region_free(regs, 0);
	}

	if (description) {
		efree(description);
	}
	if (regs != NULL) {
		onig_region_free(regs, 1);
	}
	smart_str_free(&eval_buf);

	if (err <= -2) {
		smart_str_free(&out_buf);
		RETVAL_FALSE;
	} else if (out_buf.s) {
		smart_str_0(&out_buf);
		RETVAL_STR(out_buf.s);
	} else {
		RETVAL_EMPTY_STRING();
	}
}
1237 /* }}} */
1238 
1239 /* {{{ proto string mb_ereg_replace(string pattern, string replacement, string string [, string option])
1240    Replace regular expression for multibyte string */
PHP_FUNCTION(mb_ereg_replace)1241 PHP_FUNCTION(mb_ereg_replace)
1242 {
1243 	_php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
1244 }
1245 /* }}} */
1246 
1247 /* {{{ proto string mb_eregi_replace(string pattern, string replacement, string string)
1248    Case insensitive replace regular expression for multibyte string */
PHP_FUNCTION(mb_eregi_replace)1249 PHP_FUNCTION(mb_eregi_replace)
1250 {
1251 	_php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, ONIG_OPTION_IGNORECASE, 0);
1252 }
1253 /* }}} */
1254 
1255 /* {{{ proto string mb_ereg_replace_callback(string pattern, string callback, string string [, string option])
1256     regular expression for multibyte string using replacement callback */
PHP_FUNCTION(mb_ereg_replace_callback)1257 PHP_FUNCTION(mb_ereg_replace_callback)
1258 {
1259 	_php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
1260 }
1261 /* }}} */
1262 
1263 /* {{{ proto array mb_split(string pattern, string string [, int limit])
1264    split multibyte string into array by regular expression */
PHP_FUNCTION(mb_split)1265 PHP_FUNCTION(mb_split)
1266 {
1267 	char *arg_pattern;
1268 	size_t arg_pattern_len;
1269 	php_mb_regex_t *re;
1270 	OnigRegion *regs = NULL;
1271 	char *string;
1272 	OnigUChar *pos, *chunk_pos;
1273 	size_t string_len;
1274 
1275 	int err;
1276 	zend_long count = -1;
1277 
1278 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &arg_pattern, &arg_pattern_len, &string, &string_len, &count) == FAILURE) {
1279 		RETURN_FALSE;
1280 	}
1281 
1282 	if (count > 0) {
1283 		count--;
1284 	}
1285 
1286 	if (!php_mb_check_encoding(string, string_len,
1287 			_php_mb_regex_mbctype2name(MBREX(current_mbctype)))) {
1288 		RETURN_FALSE;
1289 	}
1290 
1291 	/* create regex pattern buffer */
1292 	if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, MBREX(regex_default_options), MBREX(current_mbctype), MBREX(regex_default_syntax))) == NULL) {
1293 		RETURN_FALSE;
1294 	}
1295 
1296 	array_init(return_value);
1297 
1298 	chunk_pos = pos = (OnigUChar *)string;
1299 	err = 0;
1300 	regs = onig_region_new();
1301 	/* churn through str, generating array entries as we go */
1302 	while (count != 0 && (size_t)(pos - (OnigUChar *)string) < string_len) {
1303 		size_t beg, end;
1304 		err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), pos, (OnigUChar *)(string + string_len), regs, 0);
1305 		if (err < 0) {
1306 			break;
1307 		}
1308 		beg = regs->beg[0], end = regs->end[0];
1309 		/* add it to the array */
1310 		if ((size_t)(pos - (OnigUChar *)string) < end) {
1311 			if (beg < string_len && beg >= (size_t)(chunk_pos - (OnigUChar *)string)) {
1312 				add_next_index_stringl(return_value, (char *)chunk_pos, ((OnigUChar *)(string + beg) - chunk_pos));
1313 				--count;
1314 			} else {
1315 				err = -2;
1316 				break;
1317 			}
1318 			/* point at our new starting point */
1319 			chunk_pos = pos = (OnigUChar *)string + end;
1320 		} else {
1321 			pos++;
1322 		}
1323 		onig_region_free(regs, 0);
1324 	}
1325 
1326 	onig_region_free(regs, 1);
1327 
1328 	/* see if we encountered an error */
1329 	if (err <= -2) {
1330 		OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
1331 		onig_error_code_to_str(err_str, err);
1332 		php_error_docref(NULL, E_WARNING, "mbregex search failure in mbsplit(): %s", err_str);
1333 		zend_array_destroy(Z_ARR_P(return_value));
1334 		RETURN_FALSE;
1335 	}
1336 
1337 	/* otherwise we just have one last element to add to the array */
1338 	if ((OnigUChar *)(string + string_len) > chunk_pos) {
1339 		size_t n = ((OnigUChar *)(string + string_len) - chunk_pos);
1340 		add_next_index_stringl(return_value, (char *)chunk_pos, n);
1341 	} else {
1342 		add_next_index_stringl(return_value, "", 0);
1343 	}
1344 }
1345 /* }}} */
1346 
1347 /* {{{ proto bool mb_ereg_match(string pattern, string string [,string option])
1348    Regular expression match for multibyte string */
PHP_FUNCTION(mb_ereg_match)1349 PHP_FUNCTION(mb_ereg_match)
1350 {
1351 	char *arg_pattern;
1352 	size_t arg_pattern_len;
1353 
1354 	char *string;
1355 	size_t string_len;
1356 
1357 	php_mb_regex_t *re;
1358 	OnigSyntaxType *syntax;
1359 	OnigOptionType option = 0;
1360 	int err;
1361 	OnigMatchParam *mp;
1362 
1363 	{
1364 		char *option_str = NULL;
1365 		size_t option_str_len = 0;
1366 
1367 		if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|s",
1368 		                          &arg_pattern, &arg_pattern_len, &string, &string_len,
1369 		                          &option_str, &option_str_len)==FAILURE) {
1370 			RETURN_FALSE;
1371 		}
1372 
1373 		if (option_str != NULL) {
1374 			_php_mb_regex_init_options(option_str, option_str_len, &option, &syntax, NULL);
1375 		} else {
1376 			option |= MBREX(regex_default_options);
1377 			syntax = MBREX(regex_default_syntax);
1378 		}
1379 	}
1380 
1381 	if (!php_mb_check_encoding(string, string_len,
1382 			_php_mb_regex_mbctype2name(MBREX(current_mbctype)))) {
1383 		RETURN_FALSE;
1384 	}
1385 
1386 	if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), syntax)) == NULL) {
1387 		RETURN_FALSE;
1388 	}
1389 
1390 	mp = onig_new_match_param();
1391 	onig_initialize_match_param(mp);
1392 	if(MBSTRG(regex_stack_limit) > 0 && MBSTRG(regex_stack_limit) < UINT_MAX) {
1393 		onig_set_match_stack_limit_size_of_match_param(mp, (unsigned int)MBSTRG(regex_stack_limit));
1394 	}
1395 	/* match */
1396 	err = onig_match_with_param(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, NULL, 0, mp);
1397 	onig_free_match_param(mp);
1398 	if (err >= 0) {
1399 		RETVAL_TRUE;
1400 	} else {
1401 		RETVAL_FALSE;
1402 	}
1403 }
1404 /* }}} */
1405 
1406 /* regex search */
1407 /* {{{ _php_mb_regex_ereg_search_exec */
1408 static void
_php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS,int mode)1409 _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode)
1410 {
1411 	char *arg_pattern = NULL, *arg_options = NULL;
1412 	size_t arg_pattern_len, arg_options_len;
1413 	int err;
1414 	size_t n, i, pos, len, beg, end;
1415 	OnigOptionType option;
1416 	OnigUChar *str;
1417 	OnigSyntaxType *syntax;
1418 
1419 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|ss", &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
1420 		return;
1421 	}
1422 
1423 	option = MBREX(regex_default_options);
1424 
1425 	if (arg_options) {
1426 		option = 0;
1427 		_php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax, NULL);
1428 	}
1429 
1430 	if (MBREX(search_regs)) {
1431 		onig_region_free(MBREX(search_regs), 1);
1432 		MBREX(search_regs) = NULL;
1433 	}
1434 
1435 	if (arg_pattern) {
1436 		/* create regex pattern buffer */
1437 		if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), MBREX(regex_default_syntax))) == NULL) {
1438 			RETURN_FALSE;
1439 		}
1440 	}
1441 
1442 	pos = MBREX(search_pos);
1443 	str = NULL;
1444 	len = 0;
1445 	if (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING){
1446 		str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
1447 		len = Z_STRLEN(MBREX(search_str));
1448 	}
1449 
1450 	if (MBREX(search_re) == NULL) {
1451 		php_error_docref(NULL, E_WARNING, "No regex given");
1452 		RETURN_FALSE;
1453 	}
1454 
1455 	if (str == NULL) {
1456 		php_error_docref(NULL, E_WARNING, "No string given");
1457 		RETURN_FALSE;
1458 	}
1459 
1460 	MBREX(search_regs) = onig_region_new();
1461 
1462 	err = _php_mb_onig_search(MBREX(search_re), str, str + len, str + pos, str  + len, MBREX(search_regs), 0);
1463 	if (err == ONIG_MISMATCH) {
1464 		MBREX(search_pos) = len;
1465 		RETVAL_FALSE;
1466 	} else if (err <= -2) {
1467 		OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
1468 		onig_error_code_to_str(err_str, err);
1469 		php_error_docref(NULL, E_WARNING, "mbregex search failure in mbregex_search(): %s", err_str);
1470 		RETVAL_FALSE;
1471 	} else {
1472 		switch (mode) {
1473 		case 1:
1474 			array_init(return_value);
1475 			beg = MBREX(search_regs)->beg[0];
1476 			end = MBREX(search_regs)->end[0];
1477 			add_next_index_long(return_value, beg);
1478 			add_next_index_long(return_value, end - beg);
1479 			break;
1480 		case 2:
1481 			array_init(return_value);
1482 			n = MBREX(search_regs)->num_regs;
1483 			for (i = 0; i < n; i++) {
1484 				beg = MBREX(search_regs)->beg[i];
1485 				end = MBREX(search_regs)->end[i];
1486 				if (beg >= 0 && beg <= end && end <= len) {
1487 					add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
1488 				} else {
1489 					add_index_bool(return_value, i, 0);
1490 				}
1491 			}
1492 			if (onig_number_of_names(MBREX(search_re)) > 0) {
1493 				mb_regex_groups_iter_args args = {
1494 					return_value,
1495 					Z_STRVAL(MBREX(search_str)),
1496 					Z_STRLEN(MBREX(search_str)),
1497 					MBREX(search_regs)
1498 				};
1499 				onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
1500 			}
1501 			break;
1502 		default:
1503 			RETVAL_TRUE;
1504 			break;
1505 		}
1506 		end = MBREX(search_regs)->end[0];
1507 		if (pos <= end) {
1508 			MBREX(search_pos) = end;
1509 		} else {
1510 			MBREX(search_pos) = pos + 1;
1511 		}
1512 	}
1513 
1514 	if (err < 0) {
1515 		onig_region_free(MBREX(search_regs), 1);
1516 		MBREX(search_regs) = (OnigRegion *)NULL;
1517 	}
1518 }
1519 /* }}} */
1520 
1521 /* {{{ proto bool mb_ereg_search([string pattern[, string option]])
1522    Regular expression search for multibyte string */
PHP_FUNCTION(mb_ereg_search)1523 PHP_FUNCTION(mb_ereg_search)
1524 {
1525 	_php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1526 }
1527 /* }}} */
1528 
1529 /* {{{ proto array mb_ereg_search_pos([string pattern[, string option]])
1530    Regular expression search for multibyte string */
PHP_FUNCTION(mb_ereg_search_pos)1531 PHP_FUNCTION(mb_ereg_search_pos)
1532 {
1533 	_php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1534 }
1535 /* }}} */
1536 
1537 /* {{{ proto array mb_ereg_search_regs([string pattern[, string option]])
1538    Regular expression search for multibyte string */
PHP_FUNCTION(mb_ereg_search_regs)1539 PHP_FUNCTION(mb_ereg_search_regs)
1540 {
1541 	_php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 2);
1542 }
1543 /* }}} */
1544 
1545 /* {{{ proto bool mb_ereg_search_init(string string [, string pattern[, string option]])
1546    Initialize string and regular expression for search. */
PHP_FUNCTION(mb_ereg_search_init)1547 PHP_FUNCTION(mb_ereg_search_init)
1548 {
1549 	int argc = ZEND_NUM_ARGS();
1550 	zend_string *arg_str;
1551 	char *arg_pattern = NULL, *arg_options = NULL;
1552 	size_t arg_pattern_len = 0, arg_options_len = 0;
1553 	OnigSyntaxType *syntax = NULL;
1554 	OnigOptionType option;
1555 
1556 	if (zend_parse_parameters(argc, "S|ss", &arg_str, &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
1557 		return;
1558 	}
1559 
1560 	if (argc > 1 && arg_pattern_len == 0) {
1561 		php_error_docref(NULL, E_WARNING, "Empty pattern");
1562 		RETURN_FALSE;
1563 	}
1564 
1565 	option = MBREX(regex_default_options);
1566 	syntax = MBREX(regex_default_syntax);
1567 
1568 	if (argc == 3) {
1569 		option = 0;
1570 		_php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax, NULL);
1571 	}
1572 
1573 	if (argc > 1) {
1574 		/* create regex pattern buffer */
1575 		if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), syntax)) == NULL) {
1576 			RETURN_FALSE;
1577 		}
1578 	}
1579 
1580 	if (!Z_ISNULL(MBREX(search_str))) {
1581 		zval_ptr_dtor(&MBREX(search_str));
1582 	}
1583 
1584 	ZVAL_STR_COPY(&MBREX(search_str), arg_str);
1585 
1586 	if (php_mb_check_encoding(
1587 	ZSTR_VAL(arg_str),
1588 	ZSTR_LEN(arg_str),
1589 	_php_mb_regex_mbctype2name(MBREX(current_mbctype))
1590 	)) {
1591 		MBREX(search_pos) = 0;
1592 		RETVAL_TRUE;
1593 	} else {
1594 		MBREX(search_pos) = ZSTR_LEN(arg_str);
1595 		RETVAL_FALSE;
1596 	}
1597 
1598 	if (MBREX(search_regs) != NULL) {
1599 		onig_region_free(MBREX(search_regs), 1);
1600 		MBREX(search_regs) = NULL;
1601 	}
1602 }
1603 /* }}} */
1604 
1605 /* {{{ proto array mb_ereg_search_getregs(void)
1606    Get matched substring of the last time */
PHP_FUNCTION(mb_ereg_search_getregs)1607 PHP_FUNCTION(mb_ereg_search_getregs)
1608 {
1609 	size_t n, i, len, beg, end;
1610 	OnigUChar *str;
1611 
1612 	if (MBREX(search_regs) != NULL && Z_TYPE(MBREX(search_str)) == IS_STRING) {
1613 		array_init(return_value);
1614 
1615 		str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
1616 		len = Z_STRLEN(MBREX(search_str));
1617 		n = MBREX(search_regs)->num_regs;
1618 		for (i = 0; i < n; i++) {
1619 			beg = MBREX(search_regs)->beg[i];
1620 			end = MBREX(search_regs)->end[i];
1621 			if (beg >= 0 && beg <= end && end <= len) {
1622 				add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
1623 			} else {
1624 				add_index_bool(return_value, i, 0);
1625 			}
1626 		}
1627 		if (onig_number_of_names(MBREX(search_re)) > 0) {
1628 			mb_regex_groups_iter_args args = {
1629 				return_value,
1630 				Z_STRVAL(MBREX(search_str)),
1631 				len,
1632 				MBREX(search_regs)
1633 			};
1634 			onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
1635 		}
1636 	} else {
1637 		RETVAL_FALSE;
1638 	}
1639 }
1640 /* }}} */
1641 
1642 /* {{{ proto int mb_ereg_search_getpos(void)
1643    Get search start position */
PHP_FUNCTION(mb_ereg_search_getpos)1644 PHP_FUNCTION(mb_ereg_search_getpos)
1645 {
1646 	RETVAL_LONG(MBREX(search_pos));
1647 }
1648 /* }}} */
1649 
1650 /* {{{ proto bool mb_ereg_search_setpos(int position)
1651    Set search start position */
PHP_FUNCTION(mb_ereg_search_setpos)1652 PHP_FUNCTION(mb_ereg_search_setpos)
1653 {
1654 	zend_long position;
1655 
1656 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "l", &position) == FAILURE) {
1657 		return;
1658 	}
1659 
1660 	/* Accept negative position if length of search string can be determined */
1661 	if ((position < 0) && (!Z_ISUNDEF(MBREX(search_str))) && (Z_TYPE(MBREX(search_str)) == IS_STRING)) {
1662 		position += Z_STRLEN(MBREX(search_str));
1663 	}
1664 
1665 	if (position < 0 || (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING && (size_t)position > Z_STRLEN(MBREX(search_str)))) {
1666 		php_error_docref(NULL, E_WARNING, "Position is out of range");
1667 		MBREX(search_pos) = 0;
1668 		RETURN_FALSE;
1669 	}
1670 
1671 	MBREX(search_pos) = position;
1672 	RETURN_TRUE;
1673 }
1674 /* }}} */
1675 
1676 /* {{{ php_mb_regex_set_options */
_php_mb_regex_set_options(OnigOptionType options,OnigSyntaxType * syntax,OnigOptionType * prev_options,OnigSyntaxType ** prev_syntax)1677 static void _php_mb_regex_set_options(OnigOptionType options, OnigSyntaxType *syntax, OnigOptionType *prev_options, OnigSyntaxType **prev_syntax)
1678 {
1679 	if (prev_options != NULL) {
1680 		*prev_options = MBREX(regex_default_options);
1681 	}
1682 	if (prev_syntax != NULL) {
1683 		*prev_syntax = MBREX(regex_default_syntax);
1684 	}
1685 	MBREX(regex_default_options) = options;
1686 	MBREX(regex_default_syntax) = syntax;
1687 }
1688 /* }}} */
1689 
1690 /* {{{ proto string mb_regex_set_options([string options])
1691    Set or get the default options for mbregex functions */
PHP_FUNCTION(mb_regex_set_options)1692 PHP_FUNCTION(mb_regex_set_options)
1693 {
1694 	OnigOptionType opt;
1695 	OnigSyntaxType *syntax;
1696 	char *string = NULL;
1697 	size_t string_len;
1698 	char buf[16];
1699 
1700 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s",
1701 	                          &string, &string_len) == FAILURE) {
1702 		RETURN_FALSE;
1703 	}
1704 	if (string != NULL) {
1705 		opt = 0;
1706 		syntax = NULL;
1707 		_php_mb_regex_init_options(string, string_len, &opt, &syntax, NULL);
1708 		_php_mb_regex_set_options(opt, syntax, NULL, NULL);
1709 	} else {
1710 		opt = MBREX(regex_default_options);
1711 		syntax = MBREX(regex_default_syntax);
1712 	}
1713 	_php_mb_regex_get_option_string(buf, sizeof(buf), opt, syntax);
1714 
1715 	RETVAL_STRING(buf);
1716 }
1717 /* }}} */
1718 
1719 #endif	/* HAVE_MBREGEX */
1720 
1721 /*
1722  * Local variables:
1723  * tab-width: 4
1724  * c-basic-offset: 4
1725  * End:
1726  * vim600: fdm=marker
1727  * vim: noet sw=4 ts=4
1728  */
1729