xref: /PHP-8.0/ext/mbstring/php_mbregex.c (revision 5582490b)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | http://www.php.net/license/3_01.txt                                  |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp>              |
14    +----------------------------------------------------------------------+
15  */
16 
17 #include "libmbfl/config.h"
18 
19 #include "php.h"
20 #include "php_ini.h"
21 
22 #ifdef HAVE_MBREGEX
23 
24 #include "zend_smart_str.h"
25 #include "ext/standard/info.h"
26 #include "php_mbregex.h"
27 #include "mbstring.h"
28 #include "libmbfl/filters/mbfilter_utf8.h"
29 
30 #include "php_onig_compat.h" /* must come prior to the oniguruma header */
31 #include <oniguruma.h>
32 #undef UChar
33 
/*
 * Compatibility shims, active when the Oniguruma headers either do not
 * define ONIGURUMA_VERSION_INT or report a value below 60800.
 *
 * In that configuration the match-parameter API used further down in this
 * file is replaced by stand-ins: OnigMatchParam becomes an opaque void type,
 * onig_new_match_param() yields NULL, the initialiser only evaluates its
 * argument, the two limit setters and the free call expand to nothing, and
 * the *_with_param() entry points forward to plain onig_search() /
 * onig_match(), dropping the trailing match-param argument.
 */
#if !defined(ONIGURUMA_VERSION_INT) || ONIGURUMA_VERSION_INT < 60800
typedef void OnigMatchParam;
#define onig_new_match_param() (NULL)
#define onig_initialize_match_param(x) (void)(x)
#define onig_set_match_stack_limit_size_of_match_param(x, y)
#define onig_set_retry_limit_in_match_of_match_param(x, y)
#define onig_free_match_param(x)
#define onig_search_with_param(reg, str, end, start, range, region, option, mp) \
		onig_search(reg, str, end, start, range, region, option)
#define onig_match_with_param(re, str, end, at, region, option, mp) \
		onig_match(re, str, end, at, region, option)
#endif
46 
47 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
48 
/*
 * State of the multibyte regex subsystem. One instance is allocated by
 * php_mb_regex_globals_alloc() and reached through MBSTRG(mb_regex_globals).
 */
struct _zend_mb_regex_globals {
	/* Encoding that current_mbctype is reset to at request shutdown. */
	OnigEncoding default_mbctype;
	/* Oniguruma encoding used when compiling patterns. */
	OnigEncoding current_mbctype;
	/* libmbfl encoding paired with current_mbctype; used to validate
	 * patterns and subject strings before they reach Oniguruma. */
	const mbfl_encoding *current_mbctype_mbfl_encoding;
	/* Cache of compiled regexes keyed by the pattern bytes; initialised at
	 * request start-up and destroyed at request shutdown. */
	HashTable ht_rc;
	/* Search state, cleared at request shutdown.
	 * NOTE(review): presumably owned by the mb_ereg_search*() family, which
	 * is outside the part of the file visible here - confirm. */
	zval search_str;
	/* NOTE(review): not referenced in the visible part of the file. */
	zval *search_str_val;
	size_t search_pos;
	php_mb_regex_t *search_re;
	OnigRegion *search_regs;
	/* Options and syntax applied when a caller supplies no option string. */
	OnigOptionType regex_default_options;
	OnigSyntaxType *regex_default_syntax;
};

/* Shorthand for a member of the regex globals reached through MBSTRG(). */
#define MBREX(g) (MBSTRG(mb_regex_globals)->g)
64 
65 /* {{{ static void php_mb_regex_free_cache() */
php_mb_regex_free_cache(zval * el)66 static void php_mb_regex_free_cache(zval *el) {
67 	onig_free((php_mb_regex_t *)Z_PTR_P(el));
68 }
69 /* }}} */
70 
71 /* {{{ _php_mb_regex_globals_ctor */
_php_mb_regex_globals_ctor(zend_mb_regex_globals * pglobals)72 static int _php_mb_regex_globals_ctor(zend_mb_regex_globals *pglobals)
73 {
74 	pglobals->default_mbctype = ONIG_ENCODING_UTF8;
75 	pglobals->current_mbctype = ONIG_ENCODING_UTF8;
76 	pglobals->current_mbctype_mbfl_encoding = &mbfl_encoding_utf8;
77 	ZVAL_UNDEF(&pglobals->search_str);
78 	pglobals->search_re = (php_mb_regex_t*)NULL;
79 	pglobals->search_pos = 0;
80 	pglobals->search_regs = (OnigRegion*)NULL;
81 	pglobals->regex_default_options = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
82 	pglobals->regex_default_syntax = ONIG_SYNTAX_RUBY;
83 	return SUCCESS;
84 }
85 /* }}} */
86 
87 /* {{{ php_mb_regex_globals_alloc */
php_mb_regex_globals_alloc(void)88 zend_mb_regex_globals *php_mb_regex_globals_alloc(void)
89 {
90 	zend_mb_regex_globals *pglobals = pemalloc(
91 			sizeof(zend_mb_regex_globals), 1);
92 	if (SUCCESS != _php_mb_regex_globals_ctor(pglobals)) {
93 		pefree(pglobals, 1);
94 		return NULL;
95 	}
96 	return pglobals;
97 }
98 /* }}} */
99 
100 /* {{{ php_mb_regex_globals_free */
php_mb_regex_globals_free(zend_mb_regex_globals * pglobals)101 void php_mb_regex_globals_free(zend_mb_regex_globals *pglobals)
102 {
103 	if (!pglobals) {
104 		return;
105 	}
106 	pefree(pglobals, 1);
107 }
108 /* }}} */
109 
110 /* {{{ PHP_MINIT_FUNCTION(mb_regex) */
PHP_MINIT_FUNCTION(mb_regex)111 PHP_MINIT_FUNCTION(mb_regex)
112 {
113 	char version[256];
114 
115 	onig_init();
116 
117 	snprintf(version, sizeof(version), "%d.%d.%d",
118 		ONIGURUMA_VERSION_MAJOR, ONIGURUMA_VERSION_MINOR, ONIGURUMA_VERSION_TEENY);
119 	REGISTER_STRING_CONSTANT("MB_ONIGURUMA_VERSION", version, CONST_CS | CONST_PERSISTENT);
120 	return SUCCESS;
121 }
122 /* }}} */
123 
124 /* {{{ PHP_MSHUTDOWN_FUNCTION(mb_regex) */
PHP_MSHUTDOWN_FUNCTION(mb_regex)125 PHP_MSHUTDOWN_FUNCTION(mb_regex)
126 {
127 	onig_end();
128 	return SUCCESS;
129 }
130 /* }}} */
131 
132 /* {{{ PHP_RINIT_FUNCTION(mb_regex) */
PHP_RINIT_FUNCTION(mb_regex)133 PHP_RINIT_FUNCTION(mb_regex)
134 {
135 	if (!MBSTRG(mb_regex_globals)) return FAILURE;
136 	zend_hash_init(&MBREX(ht_rc), 0, NULL, php_mb_regex_free_cache, 0);
137 	return SUCCESS;
138 }
139 /* }}} */
140 
141 /* {{{ PHP_RSHUTDOWN_FUNCTION(mb_regex) */
PHP_RSHUTDOWN_FUNCTION(mb_regex)142 PHP_RSHUTDOWN_FUNCTION(mb_regex)
143 {
144 	MBREX(current_mbctype) = MBREX(default_mbctype);
145 	MBREX(current_mbctype_mbfl_encoding) = mbfl_name2encoding(php_mb_regex_get_default_mbctype());
146 
147 	if (!Z_ISUNDEF(MBREX(search_str))) {
148 		zval_ptr_dtor(&MBREX(search_str));
149 		ZVAL_UNDEF(&MBREX(search_str));
150 	}
151 	MBREX(search_pos) = 0;
152 	MBREX(search_re) = NULL;
153 
154 	if (MBREX(search_regs) != NULL) {
155 		onig_region_free(MBREX(search_regs), 1);
156 		MBREX(search_regs) = (OnigRegion *)NULL;
157 	}
158 	zend_hash_destroy(&MBREX(ht_rc));
159 
160 	return SUCCESS;
161 }
162 /* }}} */
163 
164 /* {{{ PHP_MINFO_FUNCTION(mb_regex) */
PHP_MINFO_FUNCTION(mb_regex)165 PHP_MINFO_FUNCTION(mb_regex)
166 {
167 	char buf[32];
168 	php_info_print_table_start();
169 	php_info_print_table_row(2, "Multibyte (japanese) regex support", "enabled");
170 	snprintf(buf, sizeof(buf), "%d.%d.%d",
171 			ONIGURUMA_VERSION_MAJOR,
172 			ONIGURUMA_VERSION_MINOR,
173 			ONIGURUMA_VERSION_TEENY);
174 	php_info_print_table_row(2, "Multibyte regex (oniguruma) version", buf);
175 	php_info_print_table_end();
176 }
177 /* }}} */
178 
179 /*
180  * encoding name resolver
181  */
182 
183 /* {{{ encoding name map */
184 typedef struct _php_mb_regex_enc_name_map_t {
185 	const char *names;
186 	OnigEncoding code;
187 } php_mb_regex_enc_name_map_t;
188 
189 static const php_mb_regex_enc_name_map_t enc_name_map[] = {
190 #ifdef ONIG_ENCODING_EUC_JP
191 	{
192 		"EUC-JP\0EUCJP\0X-EUC-JP\0UJIS\0EUCJP\0EUCJP-WIN\0",
193 		ONIG_ENCODING_EUC_JP
194 	},
195 #endif
196 #ifdef ONIG_ENCODING_UTF8
197 	{
198 		"UTF-8\0UTF8\0",
199 		ONIG_ENCODING_UTF8
200 	},
201 #endif
202 #ifdef ONIG_ENCODING_UTF16_BE
203 	{
204 		"UTF-16\0UTF-16BE\0",
205 		ONIG_ENCODING_UTF16_BE
206 	},
207 #endif
208 #ifdef ONIG_ENCODING_UTF16_LE
209 	{
210 		"UTF-16LE\0",
211 		ONIG_ENCODING_UTF16_LE
212 	},
213 #endif
214 #ifdef ONIG_ENCODING_UTF32_BE
215 	{
216 		"UCS-4\0UTF-32\0UTF-32BE\0",
217 		ONIG_ENCODING_UTF32_BE
218 	},
219 #endif
220 #ifdef ONIG_ENCODING_UTF32_LE
221 	{
222 		"UCS-4LE\0UTF-32LE\0",
223 		ONIG_ENCODING_UTF32_LE
224 	},
225 #endif
226 #ifdef ONIG_ENCODING_SJIS
227 	{
228 		"SJIS\0CP932\0MS932\0SHIFT_JIS\0SJIS-WIN\0WINDOWS-31J\0",
229 		ONIG_ENCODING_SJIS
230 	},
231 #endif
232 #ifdef ONIG_ENCODING_BIG5
233 	{
234 		"BIG5\0BIG-5\0BIGFIVE\0CN-BIG5\0BIG-FIVE\0",
235 		ONIG_ENCODING_BIG5
236 	},
237 #endif
238 #ifdef ONIG_ENCODING_EUC_CN
239 	{
240 		"EUC-CN\0EUCCN\0EUC_CN\0GB-2312\0GB2312\0",
241 		ONIG_ENCODING_EUC_CN
242 	},
243 #endif
244 #ifdef ONIG_ENCODING_EUC_TW
245 	{
246 		"EUC-TW\0EUCTW\0EUC_TW\0",
247 		ONIG_ENCODING_EUC_TW
248 	},
249 #endif
250 #ifdef ONIG_ENCODING_EUC_KR
251 	{
252 		"EUC-KR\0EUCKR\0EUC_KR\0",
253 		ONIG_ENCODING_EUC_KR
254 	},
255 #endif
256 #if defined(ONIG_ENCODING_KOI8) && !PHP_ONIG_BAD_KOI8_ENTRY
257 	{
258 		"KOI8\0KOI-8\0",
259 		ONIG_ENCODING_KOI8
260 	},
261 #endif
262 #ifdef ONIG_ENCODING_KOI8_R
263 	{
264 		"KOI8R\0KOI8-R\0KOI-8R\0",
265 		ONIG_ENCODING_KOI8_R
266 	},
267 #endif
268 #ifdef ONIG_ENCODING_ISO_8859_1
269 	{
270 		"ISO-8859-1\0ISO8859-1\0",
271 		ONIG_ENCODING_ISO_8859_1
272 	},
273 #endif
274 #ifdef ONIG_ENCODING_ISO_8859_2
275 	{
276 		"ISO-8859-2\0ISO8859-2\0",
277 		ONIG_ENCODING_ISO_8859_2
278 	},
279 #endif
280 #ifdef ONIG_ENCODING_ISO_8859_3
281 	{
282 		"ISO-8859-3\0ISO8859-3\0",
283 		ONIG_ENCODING_ISO_8859_3
284 	},
285 #endif
286 #ifdef ONIG_ENCODING_ISO_8859_4
287 	{
288 		"ISO-8859-4\0ISO8859-4\0",
289 		ONIG_ENCODING_ISO_8859_4
290 	},
291 #endif
292 #ifdef ONIG_ENCODING_ISO_8859_5
293 	{
294 		"ISO-8859-5\0ISO8859-5\0",
295 		ONIG_ENCODING_ISO_8859_5
296 	},
297 #endif
298 #ifdef ONIG_ENCODING_ISO_8859_6
299 	{
300 		"ISO-8859-6\0ISO8859-6\0",
301 		ONIG_ENCODING_ISO_8859_6
302 	},
303 #endif
304 #ifdef ONIG_ENCODING_ISO_8859_7
305 	{
306 		"ISO-8859-7\0ISO8859-7\0",
307 		ONIG_ENCODING_ISO_8859_7
308 	},
309 #endif
310 #ifdef ONIG_ENCODING_ISO_8859_8
311 	{
312 		"ISO-8859-8\0ISO8859-8\0",
313 		ONIG_ENCODING_ISO_8859_8
314 	},
315 #endif
316 #ifdef ONIG_ENCODING_ISO_8859_9
317 	{
318 		"ISO-8859-9\0ISO8859-9\0",
319 		ONIG_ENCODING_ISO_8859_9
320 	},
321 #endif
322 #ifdef ONIG_ENCODING_ISO_8859_10
323 	{
324 		"ISO-8859-10\0ISO8859-10\0",
325 		ONIG_ENCODING_ISO_8859_10
326 	},
327 #endif
328 #ifdef ONIG_ENCODING_ISO_8859_11
329 	{
330 		"ISO-8859-11\0ISO8859-11\0",
331 		ONIG_ENCODING_ISO_8859_11
332 	},
333 #endif
334 #ifdef ONIG_ENCODING_ISO_8859_13
335 	{
336 		"ISO-8859-13\0ISO8859-13\0",
337 		ONIG_ENCODING_ISO_8859_13
338 	},
339 #endif
340 #ifdef ONIG_ENCODING_ISO_8859_14
341 	{
342 		"ISO-8859-14\0ISO8859-14\0",
343 		ONIG_ENCODING_ISO_8859_14
344 	},
345 #endif
346 #ifdef ONIG_ENCODING_ISO_8859_15
347 	{
348 		"ISO-8859-15\0ISO8859-15\0",
349 		ONIG_ENCODING_ISO_8859_15
350 	},
351 #endif
352 #ifdef ONIG_ENCODING_ISO_8859_16
353 	{
354 		"ISO-8859-16\0ISO8859-16\0",
355 		ONIG_ENCODING_ISO_8859_16
356 	},
357 #endif
358 #ifdef ONIG_ENCODING_ASCII
359 	{
360 		"ASCII\0US-ASCII\0US_ASCII\0ISO646\0",
361 		ONIG_ENCODING_ASCII
362 	},
363 #endif
364 	{ NULL, ONIG_ENCODING_UNDEF }
365 };
366 /* }}} */
367 
368 /* {{{ php_mb_regex_name2mbctype */
_php_mb_regex_name2mbctype(const char * pname)369 static OnigEncoding _php_mb_regex_name2mbctype(const char *pname)
370 {
371 	const char *p;
372 	const php_mb_regex_enc_name_map_t *mapping;
373 
374 	if (pname == NULL || !*pname) {
375 		return ONIG_ENCODING_UNDEF;
376 	}
377 
378 	for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
379 		for (p = mapping->names; *p != '\0'; p += (strlen(p) + 1)) {
380 			if (strcasecmp(p, pname) == 0) {
381 				return mapping->code;
382 			}
383 		}
384 	}
385 
386 	return ONIG_ENCODING_UNDEF;
387 }
388 /* }}} */
389 
390 /* {{{ php_mb_regex_mbctype2name */
_php_mb_regex_mbctype2name(OnigEncoding mbctype)391 static const char *_php_mb_regex_mbctype2name(OnigEncoding mbctype)
392 {
393 	const php_mb_regex_enc_name_map_t *mapping;
394 
395 	for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
396 		if (mapping->code == mbctype) {
397 			return mapping->names;
398 		}
399 	}
400 
401 	return NULL;
402 }
403 /* }}} */
404 
405 /* {{{ php_mb_regex_set_mbctype */
php_mb_regex_set_mbctype(const char * encname)406 int php_mb_regex_set_mbctype(const char *encname)
407 {
408 	OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
409 	if (mbctype == ONIG_ENCODING_UNDEF) {
410 		return FAILURE;
411 	}
412 	MBREX(current_mbctype) = mbctype;
413 	MBREX(current_mbctype_mbfl_encoding) = mbfl_name2encoding(encname);
414 	return SUCCESS;
415 }
416 /* }}} */
417 
418 /* {{{ php_mb_regex_set_default_mbctype */
php_mb_regex_set_default_mbctype(const char * encname)419 int php_mb_regex_set_default_mbctype(const char *encname)
420 {
421 	OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
422 	if (mbctype == ONIG_ENCODING_UNDEF) {
423 		return FAILURE;
424 	}
425 	MBREX(default_mbctype) = mbctype;
426 	return SUCCESS;
427 }
428 /* }}} */
429 
430 /* {{{ php_mb_regex_get_mbctype */
php_mb_regex_get_mbctype(void)431 const char *php_mb_regex_get_mbctype(void)
432 {
433 	return _php_mb_regex_mbctype2name(MBREX(current_mbctype));
434 }
435 /* }}} */
436 
437 /* {{{ php_mb_regex_get_mbctype_encoding */
php_mb_regex_get_mbctype_encoding(void)438 const mbfl_encoding *php_mb_regex_get_mbctype_encoding(void)
439 {
440 	return MBREX(current_mbctype_mbfl_encoding);
441 }
442 /* }}} */
443 
444 /* {{{ php_mb_regex_get_default_mbctype */
php_mb_regex_get_default_mbctype(void)445 const char *php_mb_regex_get_default_mbctype(void)
446 {
447 	return _php_mb_regex_mbctype2name(MBREX(default_mbctype));
448 }
449 /* }}} */
450 
451 /*
452  * regex cache
453  */
454 /* {{{ php_mbregex_compile_pattern */
php_mbregex_compile_pattern(const char * pattern,size_t patlen,OnigOptionType options,OnigSyntaxType * syntax)455 static php_mb_regex_t *php_mbregex_compile_pattern(const char *pattern, size_t patlen, OnigOptionType options, OnigSyntaxType *syntax)
456 {
457 	int err_code = 0;
458 	php_mb_regex_t *retval = NULL, *rc = NULL;
459 	OnigErrorInfo err_info;
460 	OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
461 	OnigEncoding enc = MBREX(current_mbctype);
462 
463 	if (!php_mb_check_encoding(pattern, patlen, php_mb_regex_get_mbctype_encoding())) {
464 		php_error_docref(NULL, E_WARNING,
465 			"Pattern is not valid under %s encoding", _php_mb_regex_mbctype2name(enc));
466 		return NULL;
467 	}
468 
469 	rc = zend_hash_str_find_ptr(&MBREX(ht_rc), (char *)pattern, patlen);
470 	if (!rc || onig_get_options(rc) != options || onig_get_encoding(rc) != enc || onig_get_syntax(rc) != syntax) {
471 		if ((err_code = onig_new(&retval, (OnigUChar *)pattern, (OnigUChar *)(pattern + patlen), options, enc, syntax, &err_info)) != ONIG_NORMAL) {
472 			onig_error_code_to_str(err_str, err_code, &err_info);
473 			php_error_docref(NULL, E_WARNING, "mbregex compile err: %s", err_str);
474 			return NULL;
475 		}
476 		if (rc == MBREX(search_re)) {
477 			/* reuse the new rc? see bug #72399 */
478 			MBREX(search_re) = NULL;
479 		}
480 		zend_hash_str_update_ptr(&MBREX(ht_rc), (char *)pattern, patlen, retval);
481 	} else {
482 		retval = rc;
483 	}
484 	return retval;
485 }
486 /* }}} */
487 
488 /* {{{ _php_mb_regex_get_option_string */
_php_mb_regex_get_option_string(char * str,size_t len,OnigOptionType option,OnigSyntaxType * syntax)489 static size_t _php_mb_regex_get_option_string(char *str, size_t len, OnigOptionType option, OnigSyntaxType *syntax)
490 {
491 	size_t len_left = len;
492 	size_t len_req = 0;
493 	char *p = str;
494 	char c;
495 
496 	if ((option & ONIG_OPTION_IGNORECASE) != 0) {
497 		if (len_left > 0) {
498 			--len_left;
499 			*(p++) = 'i';
500 		}
501 		++len_req;
502 	}
503 
504 	if ((option & ONIG_OPTION_EXTEND) != 0) {
505 		if (len_left > 0) {
506 			--len_left;
507 			*(p++) = 'x';
508 		}
509 		++len_req;
510 	}
511 
512 	if ((option & (ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE)) ==
513 			(ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE)) {
514 		if (len_left > 0) {
515 			--len_left;
516 			*(p++) = 'p';
517 		}
518 		++len_req;
519 	} else {
520 		if ((option & ONIG_OPTION_MULTILINE) != 0) {
521 			if (len_left > 0) {
522 				--len_left;
523 				*(p++) = 'm';
524 			}
525 			++len_req;
526 		}
527 
528 		if ((option & ONIG_OPTION_SINGLELINE) != 0) {
529 			if (len_left > 0) {
530 				--len_left;
531 				*(p++) = 's';
532 			}
533 			++len_req;
534 		}
535 	}
536 	if ((option & ONIG_OPTION_FIND_LONGEST) != 0) {
537 		if (len_left > 0) {
538 			--len_left;
539 			*(p++) = 'l';
540 		}
541 		++len_req;
542 	}
543 	if ((option & ONIG_OPTION_FIND_NOT_EMPTY) != 0) {
544 		if (len_left > 0) {
545 			--len_left;
546 			*(p++) = 'n';
547 		}
548 		++len_req;
549 	}
550 
551 	c = 0;
552 
553 	if (syntax == ONIG_SYNTAX_JAVA) {
554 		c = 'j';
555 	} else if (syntax == ONIG_SYNTAX_GNU_REGEX) {
556 		c = 'u';
557 	} else if (syntax == ONIG_SYNTAX_GREP) {
558 		c = 'g';
559 	} else if (syntax == ONIG_SYNTAX_EMACS) {
560 		c = 'c';
561 	} else if (syntax == ONIG_SYNTAX_RUBY) {
562 		c = 'r';
563 	} else if (syntax == ONIG_SYNTAX_PERL) {
564 		c = 'z';
565 	} else if (syntax == ONIG_SYNTAX_POSIX_BASIC) {
566 		c = 'b';
567 	} else if (syntax == ONIG_SYNTAX_POSIX_EXTENDED) {
568 		c = 'd';
569 	}
570 
571 	if (c != 0) {
572 		if (len_left > 0) {
573 			--len_left;
574 			*(p++) = c;
575 		}
576 		++len_req;
577 	}
578 
579 
580 	if (len_left > 0) {
581 		--len_left;
582 		*(p++) = '\0';
583 	}
584 	++len_req;
585 	if (len < len_req) {
586 		return len_req;
587 	}
588 
589 	return 0;
590 }
591 /* }}} */
592 
593 /* {{{ _php_mb_regex_init_options */
_php_mb_regex_init_options(const char * parg,size_t narg,OnigOptionType * option,OnigSyntaxType ** syntax)594 static bool _php_mb_regex_init_options(const char *parg, size_t narg, OnigOptionType *option,
595 	OnigSyntaxType **syntax)
596 {
597 	size_t n;
598 	char c;
599 	OnigOptionType optm = 0;
600 
601 	*syntax = ONIG_SYNTAX_RUBY;
602 
603 	if (parg != NULL) {
604 		n = 0;
605 		while(n < narg) {
606 			c = parg[n++];
607 			switch (c) {
608 				case 'i':
609 					optm |= ONIG_OPTION_IGNORECASE;
610 					break;
611 				case 'x':
612 					optm |= ONIG_OPTION_EXTEND;
613 					break;
614 				case 'm':
615 					optm |= ONIG_OPTION_MULTILINE;
616 					break;
617 				case 's':
618 					optm |= ONIG_OPTION_SINGLELINE;
619 					break;
620 				case 'p':
621 					optm |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
622 					break;
623 				case 'l':
624 					optm |= ONIG_OPTION_FIND_LONGEST;
625 					break;
626 				case 'n':
627 					optm |= ONIG_OPTION_FIND_NOT_EMPTY;
628 					break;
629 				case 'j':
630 					*syntax = ONIG_SYNTAX_JAVA;
631 					break;
632 				case 'u':
633 					*syntax = ONIG_SYNTAX_GNU_REGEX;
634 					break;
635 				case 'g':
636 					*syntax = ONIG_SYNTAX_GREP;
637 					break;
638 				case 'c':
639 					*syntax = ONIG_SYNTAX_EMACS;
640 					break;
641 				case 'r':
642 					*syntax = ONIG_SYNTAX_RUBY;
643 					break;
644 				case 'z':
645 					*syntax = ONIG_SYNTAX_PERL;
646 					break;
647 				case 'b':
648 					*syntax = ONIG_SYNTAX_POSIX_BASIC;
649 					break;
650 				case 'd':
651 					*syntax = ONIG_SYNTAX_POSIX_EXTENDED;
652 					break;
653 				default:
654 					zend_value_error("Option \"%c\" is not supported", c);
655 					return false;
656 			}
657 		}
658 		if (option != NULL) *option|=optm;
659 	}
660 	return true;
661 }
662 /* }}} */
663 
664 
665 /*
666  * Callbacks for named subpatterns
667  */
668 
/* {{{ struct mb_regex_groups_iter_args
 * Context handed to mb_regex_groups_iter() through the opaque argument of
 * onig_foreach_name(). */
typedef struct mb_regex_groups_iter_args {
	zval		*groups;     /* array receiving one entry per group name */
	char		*search_str; /* subject string the region offsets refer to */
	size_t		search_len;  /* length of search_str in bytes */
	OnigRegion	*region;     /* capture offsets of the current match */
} mb_regex_groups_iter_args;
/* }}} */
677 
678 /* {{{ mb_ereg_groups_iter */
679 static int
mb_regex_groups_iter(const OnigUChar * name,const OnigUChar * name_end,int ngroup_num,int * group_nums,regex_t * reg,void * parg)680 mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngroup_num, int* group_nums, regex_t* reg, void* parg)
681 {
682 	mb_regex_groups_iter_args *args = (mb_regex_groups_iter_args *) parg;
683 	int gn, beg, end;
684 
685 	/*
686 	 * In case of duplicate groups, keep only the last succeeding one
687 	 * to be consistent with preg_match with the PCRE_DUPNAMES option.
688 	 */
689 	gn = onig_name_to_backref_number(reg, name, name_end, args->region);
690 	beg = args->region->beg[gn];
691 	end = args->region->end[gn];
692 	if (beg >= 0 && beg < end && end <= args->search_len) {
693 		add_assoc_stringl_ex(args->groups, (char *)name, name_end - name, &args->search_str[beg], end - beg);
694 	} else {
695 		add_assoc_bool_ex(args->groups, (char *)name, name_end - name, 0);
696 	}
697 
698 	return 0;
699 }
700 /* }}} */
701 
702 /*
703  * Helper for _php_mb_regex_ereg_replace_exec
704  */
705 /* {{{ mb_regex_substitute */
mb_regex_substitute(smart_str * pbuf,const char * subject,size_t subject_len,char * replace,size_t replace_len,php_mb_regex_t * regexp,OnigRegion * regs,const mbfl_encoding * enc)706 static inline void mb_regex_substitute(
707 	smart_str *pbuf,
708 	const char *subject,
709 	size_t subject_len,
710 	char *replace,
711 	size_t replace_len,
712 	php_mb_regex_t *regexp,
713 	OnigRegion *regs,
714 	const mbfl_encoding *enc
715 ) {
716 	char *p, *sp, *eos;
717 	int no; /* bakreference group number */
718 	int clen; /* byte-length of the current character */
719 
720 	p = replace;
721 	eos = replace + replace_len;
722 
723 	while (p < eos) {
724 		clen = (int) php_mb_mbchar_bytes_ex(p, enc);
725 		if (clen != 1 || p == eos || p[0] != '\\') {
726 			/* skip anything that's not an ascii backslash */
727 			smart_str_appendl(pbuf, p, clen);
728 			p += clen;
729 			continue;
730 		}
731 		sp = p; /* save position */
732 		clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
733 		if (clen != 1 || p == eos) {
734 			/* skip backslash followed by multibyte char */
735 			smart_str_appendl(pbuf, sp, p - sp);
736 			continue;
737 		}
738 		no = -1;
739 		switch (p[0]) {
740 			case '0':
741 				no = 0;
742 				p++;
743 				break;
744 			case '1': case '2': case '3': case '4':
745 			case '5': case '6': case '7': case '8': case '9':
746 				if (!onig_noname_group_capture_is_active(regexp)) {
747 					/*
748 					 * FIXME:
749 					 * Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern.
750 					 * For now we just ignore them, but in the future we might want to raise a warning
751 					 * and abort the whole replace operation.
752 					 */
753 					p++;
754 					smart_str_appendl(pbuf, sp, p - sp);
755 					continue;
756 				}
757 				no = p[0] - '0';
758 				p++;
759 				break;
760 			case 'k':
761 			{
762 				clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
763 				if (clen != 1 || p == eos || (p[0] != '<' && p[0] != '\'')) {
764 					/* not a backref delimiter */
765 					p += clen;
766 					smart_str_appendl(pbuf, sp, p - sp);
767 					continue;
768 				}
769 				/* try to consume everything until next delimiter */
770 				char delim = p[0] == '<' ? '>' : '\'';
771 				char *name, *name_end;
772 				char maybe_num = 1;
773 				name_end = name = p + 1;
774 				while (name_end < eos) {
775 					clen = (int) php_mb_mbchar_bytes_ex(name_end, enc);
776 					if (clen != 1) {
777 						name_end += clen;
778 						maybe_num = 0;
779 						continue;
780 					}
781 					if (name_end[0] == delim) break;
782 					if (maybe_num && !isdigit(name_end[0])) maybe_num = 0;
783 					name_end++;
784 				}
785 				p = name_end + 1;
786 				if (name_end - name < 1 || name_end >= eos) {
787 					/* the backref was empty or we failed to find the end delimiter */
788 					smart_str_appendl(pbuf, sp, p - sp);
789 					continue;
790 				}
791 				/* we have either a name or a number */
792 				if (maybe_num) {
793 					if (!onig_noname_group_capture_is_active(regexp)) {
794 						/* see above note on mixing numbered & named backrefs */
795 						smart_str_appendl(pbuf, sp, p - sp);
796 						continue;
797 					}
798 					if (name_end - name == 1) {
799 						no = name[0] - '0';
800 						break;
801 					}
802 					if (name[0] == '0') {
803 						/* 01 is not a valid number */
804 						break;
805 					}
806 					no = (int) strtoul(name, NULL, 10);
807 					break;
808 				}
809 				no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs);
810 				break;
811 			}
812 			default:
813 				/* We're not treating \ as an escape character and will interpret something like
814 				 * \\1 as \ followed by \1, rather than \\ followed by 1. This is because this
815 				 * function has not supported escaping of backslashes historically. */
816 				smart_str_appendl(pbuf, sp, p - sp);
817 				continue;
818 		}
819 		if (no < 0 || no >= regs->num_regs) {
820 			/* invalid group number reference, keep the escape sequence in the output */
821 			smart_str_appendl(pbuf, sp, p - sp);
822 			continue;
823 		}
824 		if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) {
825 			smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]);
826 		}
827 	}
828 
829 	if (p < eos) {
830 		smart_str_appendl(pbuf, p, eos - p);
831 	}
832 }
833 /* }}} */
834 
835 /*
836  * php functions
837  */
838 
839 /* {{{ Returns the current encoding for regex as a string. */
PHP_FUNCTION(mb_regex_encoding)840 PHP_FUNCTION(mb_regex_encoding)
841 {
842 	char *encoding = NULL;
843 	size_t encoding_len;
844 
845 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s!", &encoding, &encoding_len) == FAILURE) {
846 		RETURN_THROWS();
847 	}
848 
849 	if (!encoding) {
850 		const char *retval = php_mb_regex_get_mbctype();
851 		ZEND_ASSERT(retval != NULL);
852 
853 		RETURN_STRING(retval);
854 	} else {
855 		if (php_mb_regex_set_mbctype(encoding) == FAILURE) {
856 			zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", encoding);
857 			RETURN_THROWS();
858 		}
859 
860 		/* TODO Make function return previous encoding? */
861 		RETURN_TRUE;
862 	}
863 }
864 /* }}} */
865 
866 /* {{{ _php_mb_onig_search */
_php_mb_onig_search(regex_t * reg,const OnigUChar * str,const OnigUChar * end,const OnigUChar * start,const OnigUChar * range,OnigRegion * region,OnigOptionType option)867 static int _php_mb_onig_search(regex_t* reg, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start,
868                    const OnigUChar* range, OnigRegion* region, OnigOptionType option) {
869 	OnigMatchParam *mp = onig_new_match_param();
870 	int err;
871 	onig_initialize_match_param(mp);
872 	if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_stack_limit))) {
873 		onig_set_match_stack_limit_size_of_match_param(mp, (unsigned int)MBSTRG(regex_stack_limit));
874 	}
875 	if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_retry_limit))) {
876 		onig_set_retry_limit_in_match_of_match_param(mp, (unsigned int)MBSTRG(regex_retry_limit));
877 	}
878 	/* search */
879 	err = onig_search_with_param(reg, str, end, start, range, region, option, mp);
880 	onig_free_match_param(mp);
881 	return err;
882 }
883 /* }}} */
884 
885 
886 /* {{{ _php_mb_regex_ereg_exec */
_php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS,int icase)887 static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase)
888 {
889 	zval *array = NULL;
890 	char *arg_pattern, *string;
891 	size_t arg_pattern_len, string_len;
892 	php_mb_regex_t *re;
893 	OnigRegion *regs = NULL;
894 	int i, beg, end;
895 	OnigOptionType options;
896 	char *str;
897 
898 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|z", &arg_pattern, &arg_pattern_len, &string, &string_len, &array) == FAILURE) {
899 		RETURN_THROWS();
900 	}
901 
902 	if (arg_pattern_len == 0) {
903 		zend_argument_value_error(1, "must not be empty");
904 		RETURN_THROWS();
905 	}
906 
907 	if (array != NULL) {
908 		array = zend_try_array_init(array);
909 		if (!array) {
910 			RETURN_THROWS();
911 		}
912 	}
913 
914 	if (!php_mb_check_encoding(
915 		string,
916 		string_len,
917 		php_mb_regex_get_mbctype_encoding()
918 	)) {
919 		RETURN_FALSE;
920 	}
921 
922 	options = MBREX(regex_default_options);
923 	if (icase) {
924 		options |= ONIG_OPTION_IGNORECASE;
925 	}
926 
927 	re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, MBREX(regex_default_syntax));
928 	if (re == NULL) {
929 		RETVAL_FALSE;
930 		goto out;
931 	}
932 
933 	regs = onig_region_new();
934 
935 	/* actually execute the regular expression */
936 	if (_php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, (OnigUChar *)(string + string_len), regs, 0) < 0) {
937 		RETVAL_FALSE;
938 		goto out;
939 	}
940 
941 	str = string;
942 	if (array != NULL) {
943 		for (i = 0; i < regs->num_regs; i++) {
944 			beg = regs->beg[i];
945 			end = regs->end[i];
946 			if (beg >= 0 && beg < end && (size_t)end <= string_len) {
947 				add_index_stringl(array, i, (char *)&str[beg], end - beg);
948 			} else {
949 				add_index_bool(array, i, 0);
950 			}
951 		}
952 
953 		if (onig_number_of_names(re) > 0) {
954 			mb_regex_groups_iter_args args = {array, string, string_len, regs};
955 			onig_foreach_name(re, mb_regex_groups_iter, &args);
956 		}
957 	}
958 
959 	RETVAL_TRUE;
960 out:
961 	if (regs != NULL) {
962 		onig_region_free(regs, 1);
963 	}
964 }
965 /* }}} */
966 
967 /* {{{ Regular expression match for multibyte string */
PHP_FUNCTION(mb_ereg)968 PHP_FUNCTION(mb_ereg)
969 {
970 	_php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
971 }
972 /* }}} */
973 
974 /* {{{ Case-insensitive regular expression match for multibyte string */
PHP_FUNCTION(mb_eregi)975 PHP_FUNCTION(mb_eregi)
976 {
977 	_php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
978 }
979 /* }}} */
980 
981 /* {{{ _php_mb_regex_ereg_replace_exec */
_php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS,OnigOptionType options,int is_callable)982 static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOptionType options, int is_callable)
983 {
984 	char *arg_pattern;
985 	size_t arg_pattern_len;
986 
987 	char *replace;
988 	size_t replace_len;
989 
990 	zend_fcall_info arg_replace_fci;
991 	zend_fcall_info_cache arg_replace_fci_cache;
992 
993 	char *string;
994 	size_t string_len;
995 
996 	php_mb_regex_t *re;
997 	OnigSyntaxType *syntax;
998 	OnigRegion *regs = NULL;
999 	smart_str out_buf = {0};
1000 	smart_str eval_buf = {0};
1001 	smart_str *pbuf;
1002 	int err, n;
1003 	OnigUChar *pos;
1004 	OnigUChar *string_lim;
1005 	char *description = NULL;
1006 
1007 	const mbfl_encoding *enc = php_mb_regex_get_mbctype_encoding();
1008 	ZEND_ASSERT(enc != NULL);
1009 
1010 	{
1011 		char *option_str = NULL;
1012 		size_t option_str_len = 0;
1013 
1014 		if (!is_callable) {
1015 			if (zend_parse_parameters(ZEND_NUM_ARGS(), "sss|s!",
1016 						&arg_pattern, &arg_pattern_len,
1017 						&replace, &replace_len,
1018 						&string, &string_len,
1019 						&option_str, &option_str_len) == FAILURE) {
1020 				RETURN_THROWS();
1021 			}
1022 		} else {
1023 			if (zend_parse_parameters(ZEND_NUM_ARGS(), "sfs|s!",
1024 						&arg_pattern, &arg_pattern_len,
1025 						&arg_replace_fci, &arg_replace_fci_cache,
1026 						&string, &string_len,
1027 						&option_str, &option_str_len) == FAILURE) {
1028 				RETURN_THROWS();
1029 			}
1030 		}
1031 
1032 		if (!php_mb_check_encoding(string, string_len, enc)) {
1033 			RETURN_NULL();
1034 		}
1035 
1036 		if (option_str != NULL) {
1037 			/* Initialize option and in case of failure it means there is a value error */
1038 			if (!_php_mb_regex_init_options(option_str, option_str_len, &options, &syntax)) {
1039 				RETURN_THROWS();
1040 			}
1041 		} else {
1042 			options |= MBREX(regex_default_options);
1043 			syntax = MBREX(regex_default_syntax);
1044 		}
1045 	}
1046 
1047 	/* create regex pattern buffer */
1048 	re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, syntax);
1049 	if (re == NULL) {
1050 		RETURN_FALSE;
1051 	}
1052 
1053 	if (is_callable) {
1054 		pbuf = &eval_buf;
1055 		description = zend_make_compiled_string_description("mbregex replace");
1056 	} else {
1057 		pbuf = &out_buf;
1058 		description = NULL;
1059 	}
1060 
1061 	/* do the actual work */
1062 	err = 0;
1063 	pos = (OnigUChar *)string;
1064 	string_lim = (OnigUChar*)(string + string_len);
1065 	regs = onig_region_new();
1066 	while (err >= 0) {
1067 		err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)string_lim, pos, (OnigUChar *)string_lim, regs, 0);
1068 		if (err <= -2) {
1069 			OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
1070 			onig_error_code_to_str(err_str, err);
1071 			php_error_docref(NULL, E_WARNING, "mbregex search failure in php_mbereg_replace_exec(): %s", err_str);
1072 			break;
1073 		}
1074 		if (err >= 0) {
1075 			/* copy the part of the string before the match */
1076 			smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos));
1077 
1078 			if (!is_callable) {
1079 				mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc);
1080 			}
1081 
1082 			if (is_callable) {
1083 				zval args[1];
1084 				zval subpats, retval;
1085 				int i;
1086 
1087 				array_init(&subpats);
1088 				for (i = 0; i < regs->num_regs; i++) {
1089 					add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]);
1090 				}
1091 				if (onig_number_of_names(re) > 0) {
1092 					mb_regex_groups_iter_args args = {&subpats, string, string_len, regs};
1093 					onig_foreach_name(re, mb_regex_groups_iter, &args);
1094 				}
1095 
1096 				ZVAL_COPY_VALUE(&args[0], &subpats);
1097 				/* null terminate buffer */
1098 				smart_str_0(&eval_buf);
1099 
1100 				arg_replace_fci.param_count = 1;
1101 				arg_replace_fci.params = args;
1102 				arg_replace_fci.retval = &retval;
1103 				if (zend_call_function(&arg_replace_fci, &arg_replace_fci_cache) == SUCCESS &&
1104 						!Z_ISUNDEF(retval)) {
1105 					convert_to_string_ex(&retval);
1106 					smart_str_appendl(&out_buf, Z_STRVAL(retval), Z_STRLEN(retval));
1107 					smart_str_free(&eval_buf);
1108 					zval_ptr_dtor(&retval);
1109 				} else {
1110 					if (!EG(exception)) {
1111 						zend_throw_error(NULL, "Unable to call custom replacement function");
1112 						zval_ptr_dtor(&subpats);
1113 						RETURN_THROWS();
1114 					}
1115 				}
1116 				zval_ptr_dtor(&subpats);
1117 			}
1118 
1119 			n = regs->end[0];
1120 			if ((pos - (OnigUChar *)string) < n) {
1121 				pos = (OnigUChar *)string + n;
1122 			} else {
1123 				if (pos < string_lim) {
1124 					smart_str_appendl(&out_buf, (char *)pos, 1);
1125 				}
1126 				pos++;
1127 			}
1128 		} else { /* nomatch */
1129 			/* stick that last bit of string on our output */
1130 			if (string_lim - pos > 0) {
1131 				smart_str_appendl(&out_buf, (char *)pos, string_lim - pos);
1132 			}
1133 		}
1134 		onig_region_free(regs, 0);
1135 	}
1136 
1137 	if (description) {
1138 		efree(description);
1139 	}
1140 	if (regs != NULL) {
1141 		onig_region_free(regs, 1);
1142 	}
1143 	smart_str_free(&eval_buf);
1144 
1145 	if (err <= -2) {
1146 		smart_str_free(&out_buf);
1147 		RETVAL_FALSE;
1148 	} else if (out_buf.s) {
1149 		smart_str_0(&out_buf);
1150 		RETVAL_STR(out_buf.s);
1151 	} else {
1152 		RETVAL_EMPTY_STRING();
1153 	}
1154 }
1155 /* }}} */
1156 
1157 /* {{{ Replace regular expression for multibyte string */
PHP_FUNCTION(mb_ereg_replace)1158 PHP_FUNCTION(mb_ereg_replace)
1159 {
1160 	_php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
1161 }
1162 /* }}} */
1163 
1164 /* {{{ Case insensitive replace regular expression for multibyte string */
PHP_FUNCTION(mb_eregi_replace)1165 PHP_FUNCTION(mb_eregi_replace)
1166 {
1167 	_php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, ONIG_OPTION_IGNORECASE, 0);
1168 }
1169 /* }}} */
1170 
1171 /* {{{ regular expression for multibyte string using replacement callback */
PHP_FUNCTION(mb_ereg_replace_callback)1172 PHP_FUNCTION(mb_ereg_replace_callback)
1173 {
1174 	_php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
1175 }
1176 /* }}} */
1177 
1178 /* {{{ split multibyte string into array by regular expression */
PHP_FUNCTION(mb_split)1179 PHP_FUNCTION(mb_split)
1180 {
1181 	char *arg_pattern;
1182 	size_t arg_pattern_len;
1183 	php_mb_regex_t *re;
1184 	OnigRegion *regs = NULL;
1185 	char *string;
1186 	OnigUChar *pos, *chunk_pos;
1187 	size_t string_len;
1188 
1189 	int err;
1190 	zend_long count = -1;
1191 
1192 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &arg_pattern, &arg_pattern_len, &string, &string_len, &count) == FAILURE) {
1193 		RETURN_THROWS();
1194 	}
1195 
1196 	if (count > 0) {
1197 		count--;
1198 	}
1199 
1200 	if (!php_mb_check_encoding(string, string_len, php_mb_regex_get_mbctype_encoding())) {
1201 		RETURN_FALSE;
1202 	}
1203 
1204 	/* create regex pattern buffer */
1205 	if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, MBREX(regex_default_options), MBREX(regex_default_syntax))) == NULL) {
1206 		RETURN_FALSE;
1207 	}
1208 
1209 	array_init(return_value);
1210 
1211 	chunk_pos = pos = (OnigUChar *)string;
1212 	err = 0;
1213 	regs = onig_region_new();
1214 	/* churn through str, generating array entries as we go */
1215 	while (count != 0 && (size_t)(pos - (OnigUChar *)string) < string_len) {
1216 		size_t beg, end;
1217 		err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), pos, (OnigUChar *)(string + string_len), regs, 0);
1218 		if (err < 0) {
1219 			break;
1220 		}
1221 		beg = regs->beg[0], end = regs->end[0];
1222 		/* add it to the array */
1223 		if ((size_t)(pos - (OnigUChar *)string) < end) {
1224 			if (beg < string_len && beg >= (size_t)(chunk_pos - (OnigUChar *)string)) {
1225 				add_next_index_stringl(return_value, (char *)chunk_pos, ((OnigUChar *)(string + beg) - chunk_pos));
1226 				--count;
1227 			} else {
1228 				err = -2;
1229 				break;
1230 			}
1231 			/* point at our new starting point */
1232 			chunk_pos = pos = (OnigUChar *)string + end;
1233 		} else {
1234 			pos++;
1235 		}
1236 		onig_region_free(regs, 0);
1237 	}
1238 
1239 	onig_region_free(regs, 1);
1240 
1241 	/* see if we encountered an error */
1242 	// ToDo investigate if this can actually/should happen ...
1243 	if (err <= -2) {
1244 		OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
1245 		onig_error_code_to_str(err_str, err);
1246 		php_error_docref(NULL, E_WARNING, "mbregex search failure in mbsplit(): %s", err_str);
1247 		zend_array_destroy(Z_ARR_P(return_value));
1248 		RETURN_FALSE;
1249 	}
1250 
1251 	/* otherwise we just have one last element to add to the array */
1252 	if ((OnigUChar *)(string + string_len) > chunk_pos) {
1253 		size_t n = ((OnigUChar *)(string + string_len) - chunk_pos);
1254 		add_next_index_stringl(return_value, (char *)chunk_pos, n);
1255 	} else {
1256 		add_next_index_stringl(return_value, "", 0);
1257 	}
1258 }
1259 /* }}} */
1260 
1261 /* {{{ Regular expression match for multibyte string */
PHP_FUNCTION(mb_ereg_match)1262 PHP_FUNCTION(mb_ereg_match)
1263 {
1264 	char *arg_pattern;
1265 	size_t arg_pattern_len;
1266 
1267 	char *string;
1268 	size_t string_len;
1269 
1270 	php_mb_regex_t *re;
1271 	OnigSyntaxType *syntax;
1272 	OnigOptionType option = 0;
1273 	int err;
1274 	OnigMatchParam *mp;
1275 
1276 	{
1277 		char *option_str = NULL;
1278 		size_t option_str_len = 0;
1279 
1280 		if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|s!",
1281 		                          &arg_pattern, &arg_pattern_len, &string, &string_len,
1282 		                          &option_str, &option_str_len)==FAILURE) {
1283 			RETURN_THROWS();
1284 		}
1285 
1286 		if (option_str != NULL) {
1287 			if(!_php_mb_regex_init_options(option_str, option_str_len, &option, &syntax)) {
1288 				RETURN_THROWS();
1289 			}
1290 		} else {
1291 			option |= MBREX(regex_default_options);
1292 			syntax = MBREX(regex_default_syntax);
1293 		}
1294 	}
1295 
1296 	if (!php_mb_check_encoding(string, string_len, php_mb_regex_get_mbctype_encoding())) {
1297 		RETURN_FALSE;
1298 	}
1299 
1300 	if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, syntax)) == NULL) {
1301 		RETURN_FALSE;
1302 	}
1303 
1304 	mp = onig_new_match_param();
1305 	onig_initialize_match_param(mp);
1306 	if (MBSTRG(regex_stack_limit) > 0 && MBSTRG(regex_stack_limit) < UINT_MAX) {
1307 		onig_set_match_stack_limit_size_of_match_param(mp, (unsigned int)MBSTRG(regex_stack_limit));
1308 	}
1309 	if (MBSTRG(regex_retry_limit) > 0 && MBSTRG(regex_retry_limit) < UINT_MAX) {
1310 		onig_set_retry_limit_in_match_of_match_param(mp, (unsigned int)MBSTRG(regex_retry_limit));
1311 	}
1312 	/* match */
1313 	err = onig_match_with_param(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, NULL, 0, mp);
1314 	onig_free_match_param(mp);
1315 	if (err >= 0) {
1316 		RETVAL_TRUE;
1317 	} else {
1318 		RETVAL_FALSE;
1319 	}
1320 }
1321 /* }}} */
1322 
1323 /* regex search */
1324 /* {{{ _php_mb_regex_ereg_search_exec */
/*
 * Shared worker for mb_ereg_search(), mb_ereg_search_pos() and
 * mb_ereg_search_regs().
 *
 * Searches the string stored in MBREX(search_str) starting at
 * MBREX(search_pos), using either the pattern passed as first argument or
 * the previously stored MBREX(search_re).
 *
 * mode selects the shape of the result on a successful match:
 *   1       - array(byte offset of match, byte length of match)
 *   2       - array of captured groups (false for groups outside the
 *             string), plus named groups when the pattern defines any
 *   other   - boolean true
 *
 * On mismatch the search position is moved to the end of the string and
 * false is returned. On a search error a warning is emitted and false is
 * returned. An Error is thrown when no pattern or no string is available.
 * Unsupported option characters abort the call before any search state is
 * modified.
 */
static void _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode)
{
	char *arg_pattern = NULL, *arg_options = NULL;
	size_t arg_pattern_len, arg_options_len;
	int err;
	size_t n, i, pos, len;
	/* Stored as int* in the OnigRegion struct */
	int beg, end;
	OnigOptionType option = 0;
	OnigUChar *str;
	OnigSyntaxType *syntax;

	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s!s!", &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
		RETURN_THROWS();
	}

	if (arg_options) {
		/* A failure here means an exception is already pending: stop
		 * before touching the cached regex, registers or position. */
		if (!_php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax)) {
			RETURN_THROWS();
		}
	} else {
		option |= MBREX(regex_default_options);
		syntax = MBREX(regex_default_syntax);
	}

	/* Registers of the previous search are obsolete from here on. */
	if (MBREX(search_regs)) {
		onig_region_free(MBREX(search_regs), 1);
		MBREX(search_regs) = NULL;
	}

	if (arg_pattern) {
		/* create regex pattern buffer */
		if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, syntax)) == NULL) {
			RETURN_FALSE;
		}
	}

	pos = MBREX(search_pos);
	str = NULL;
	len = 0;
	if (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING){
		str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
		len = Z_STRLEN(MBREX(search_str));
	}

	if (MBREX(search_re) == NULL) {
		zend_throw_error(NULL, "No pattern was provided");
		RETURN_THROWS();
	}

	if (str == NULL) {
		zend_throw_error(NULL, "No string was provided");
		RETURN_THROWS();
	}

	MBREX(search_regs) = onig_region_new();

	err = _php_mb_onig_search(MBREX(search_re), str, str + len, str + pos, str  + len, MBREX(search_regs), 0);
	if (err == ONIG_MISMATCH) {
		/* Nothing more to find: park the position at the end of the string. */
		MBREX(search_pos) = len;
		RETVAL_FALSE;
	} else if (err <= -2) {
		OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
		onig_error_code_to_str(err_str, err);
		php_error_docref(NULL, E_WARNING, "mbregex search failure in mbregex_search(): %s", err_str);
		RETVAL_FALSE;
	} else {
		switch (mode) {
		case 1:
			/* offset and length of the whole match */
			array_init(return_value);
			beg = MBREX(search_regs)->beg[0];
			end = MBREX(search_regs)->end[0];
			add_next_index_long(return_value, beg);
			add_next_index_long(return_value, end - beg);
			break;
		case 2:
			/* every captured group, numbered first and then named */
			array_init(return_value);
			n = MBREX(search_regs)->num_regs;
			for (i = 0; i < n; i++) {
				beg = MBREX(search_regs)->beg[i];
				end = MBREX(search_regs)->end[i];
				if (beg >= 0 && beg <= end && end <= len) {
					add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
				} else {
					add_index_bool(return_value, i, 0);
				}
			}
			if (onig_number_of_names(MBREX(search_re)) > 0) {
				mb_regex_groups_iter_args args = {
					return_value,
					Z_STRVAL(MBREX(search_str)),
					Z_STRLEN(MBREX(search_str)),
					MBREX(search_regs)
				};
				onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
			}
			break;
		default:
			RETVAL_TRUE;
			break;
		}
		/* Continue after the match next time; if the match ends before
		 * the current position, step one byte forward instead. */
		end = MBREX(search_regs)->end[0];
		if (pos <= end) {
			MBREX(search_pos) = end;
		} else {
			MBREX(search_pos) = pos + 1;
		}
	}

	/* Registers are only kept (for mb_ereg_search_getregs()) after a match. */
	if (err < 0) {
		onig_region_free(MBREX(search_regs), 1);
		MBREX(search_regs) = (OnigRegion *)NULL;
	}
}
1437 /* }}} */
1438 
1439 /* {{{ Regular expression search for multibyte string */
PHP_FUNCTION(mb_ereg_search)1440 PHP_FUNCTION(mb_ereg_search)
1441 {
1442 	_php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1443 }
1444 /* }}} */
1445 
1446 /* {{{ Regular expression search for multibyte string */
PHP_FUNCTION(mb_ereg_search_pos)1447 PHP_FUNCTION(mb_ereg_search_pos)
1448 {
1449 	_php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1450 }
1451 /* }}} */
1452 
1453 /* {{{ Regular expression search for multibyte string */
PHP_FUNCTION(mb_ereg_search_regs)1454 PHP_FUNCTION(mb_ereg_search_regs)
1455 {
1456 	_php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 2);
1457 }
1458 /* }}} */
1459 
1460 /* {{{ Initialize string and regular expression for search. */
PHP_FUNCTION(mb_ereg_search_init)1461 PHP_FUNCTION(mb_ereg_search_init)
1462 {
1463 	zend_string *arg_str;
1464 	char *arg_pattern = NULL, *arg_options = NULL;
1465 	size_t arg_pattern_len = 0, arg_options_len = 0;
1466 	OnigSyntaxType *syntax = NULL;
1467 	OnigOptionType option;
1468 
1469 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "S|s!s!", &arg_str, &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
1470 		RETURN_THROWS();
1471 	}
1472 
1473 	if (arg_pattern && arg_pattern_len == 0) {
1474 		zend_argument_value_error(2, "must not be empty");
1475 		RETURN_THROWS();
1476 	}
1477 
1478 	if (arg_options) {
1479 		option = 0;
1480 		_php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax);
1481 	} else {
1482 		option = MBREX(regex_default_options);
1483 		syntax = MBREX(regex_default_syntax);
1484 	}
1485 
1486 	if (arg_pattern) {
1487 		/* create regex pattern buffer */
1488 		if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, syntax)) == NULL) {
1489 			RETURN_FALSE;
1490 		}
1491 	}
1492 
1493 	if (!Z_ISNULL(MBREX(search_str))) {
1494 		zval_ptr_dtor(&MBREX(search_str));
1495 	}
1496 
1497 	ZVAL_STR_COPY(&MBREX(search_str), arg_str);
1498 
1499 	if (php_mb_check_encoding(ZSTR_VAL(arg_str), ZSTR_LEN(arg_str), php_mb_regex_get_mbctype_encoding())) {
1500 		MBREX(search_pos) = 0;
1501 		RETVAL_TRUE;
1502 	} else {
1503 		MBREX(search_pos) = ZSTR_LEN(arg_str);
1504 		RETVAL_FALSE;
1505 	}
1506 
1507 	if (MBREX(search_regs) != NULL) {
1508 		onig_region_free(MBREX(search_regs), 1);
1509 		MBREX(search_regs) = NULL;
1510 	}
1511 }
1512 /* }}} */
1513 
1514 /* {{{ Get matched substring of the last time */
PHP_FUNCTION(mb_ereg_search_getregs)1515 PHP_FUNCTION(mb_ereg_search_getregs)
1516 {
1517 	size_t n, i, len;
1518 	/* Stored as int* in the OnigRegion struct */
1519 	int beg, end;
1520 	OnigUChar *str;
1521 
1522 	if (zend_parse_parameters_none() == FAILURE) {
1523 		RETURN_THROWS();
1524 	}
1525 
1526 	if (MBREX(search_regs) != NULL && Z_TYPE(MBREX(search_str)) == IS_STRING) {
1527 		array_init(return_value);
1528 
1529 		str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
1530 		len = Z_STRLEN(MBREX(search_str));
1531 		n = MBREX(search_regs)->num_regs;
1532 		for (i = 0; i < n; i++) {
1533 			beg = MBREX(search_regs)->beg[i];
1534 			end = MBREX(search_regs)->end[i];
1535 			if (beg >= 0 && beg <= end && end <= len) {
1536 				add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
1537 			} else {
1538 				add_index_bool(return_value, i, 0);
1539 			}
1540 		}
1541 		if (onig_number_of_names(MBREX(search_re)) > 0) {
1542 			mb_regex_groups_iter_args args = {
1543 				return_value,
1544 				Z_STRVAL(MBREX(search_str)),
1545 				len,
1546 				MBREX(search_regs)
1547 			};
1548 			onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
1549 		}
1550 	} else {
1551 		// TODO This seems to be some logical error, promote to Error
1552 		RETVAL_FALSE;
1553 	}
1554 }
1555 /* }}} */
1556 
1557 /* {{{ Get search start position */
PHP_FUNCTION(mb_ereg_search_getpos)1558 PHP_FUNCTION(mb_ereg_search_getpos)
1559 {
1560 	if (zend_parse_parameters_none() == FAILURE) {
1561 		RETURN_THROWS();
1562 	}
1563 
1564 	RETVAL_LONG(MBREX(search_pos));
1565 }
1566 /* }}} */
1567 
1568 /* {{{ Set search start position */
PHP_FUNCTION(mb_ereg_search_setpos)1569 PHP_FUNCTION(mb_ereg_search_setpos)
1570 {
1571 	zend_long position;
1572 
1573 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "l", &position) == FAILURE) {
1574 		RETURN_THROWS();
1575 	}
1576 
1577 	/* Accept negative position if length of search string can be determined */
1578 	if ((position < 0) && (!Z_ISUNDEF(MBREX(search_str))) && (Z_TYPE(MBREX(search_str)) == IS_STRING)) {
1579 		position += Z_STRLEN(MBREX(search_str));
1580 	}
1581 
1582 	if (position < 0 || (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING && (size_t)position > Z_STRLEN(MBREX(search_str)))) {
1583 		zend_argument_value_error(1, "is out of range");
1584 		RETURN_THROWS();
1585 	}
1586 
1587 	MBREX(search_pos) = position;
1588 	// TODO Return void
1589 	RETURN_TRUE;
1590 }
1591 /* }}} */
1592 
1593 /* {{{ php_mb_regex_set_options */
/*
 * Installs `options` / `syntax` as the new mbregex defaults. When the
 * corresponding out-pointer is non-NULL, the value that was in effect before
 * the call is written through it first.
 */
static void _php_mb_regex_set_options(OnigOptionType options, OnigSyntaxType *syntax, OnigOptionType *prev_options, OnigSyntaxType **prev_syntax)
{
	/* Hand back the old defaults before they are overwritten. */
	if (prev_options) {
		*prev_options = MBREX(regex_default_options);
	}
	if (prev_syntax) {
		*prev_syntax = MBREX(regex_default_syntax);
	}

	MBREX(regex_default_syntax) = syntax;
	MBREX(regex_default_options) = options;
}
1605 /* }}} */
1606 
1607 /* {{{ Set or get the default options for mbregex functions */
PHP_FUNCTION(mb_regex_set_options)1608 PHP_FUNCTION(mb_regex_set_options)
1609 {
1610 	OnigOptionType opt, prev_opt;
1611 	OnigSyntaxType *syntax, *prev_syntax;
1612 	char *string = NULL;
1613 	size_t string_len;
1614 	char buf[16];
1615 
1616 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s!",
1617 	                          &string, &string_len) == FAILURE) {
1618 		RETURN_THROWS();
1619 	}
1620 	if (string != NULL) {
1621 		opt = 0;
1622 		syntax = NULL;
1623 		if(!_php_mb_regex_init_options(string, string_len, &opt, &syntax)) {
1624 			RETURN_THROWS();
1625 		}
1626 		_php_mb_regex_set_options(opt, syntax, &prev_opt, &prev_syntax);
1627 		opt = prev_opt;
1628 		syntax = prev_syntax;
1629 	} else {
1630 		opt = MBREX(regex_default_options);
1631 		syntax = MBREX(regex_default_syntax);
1632 	}
1633 	_php_mb_regex_get_option_string(buf, sizeof(buf), opt, syntax);
1634 
1635 	RETVAL_STRING(buf);
1636 }
1637 /* }}} */
1638 
1639 #endif	/* HAVE_MBREGEX */
1640