xref: /php-src/ext/pcre/php_pcre.c (revision d0e15c85)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Andrei Zmievski <andrei@php.net>                             |
14    +----------------------------------------------------------------------+
15  */
16 
17 #include "php.h"
18 #include "php_ini.h"
19 #include "php_globals.h"
20 #include "php_pcre.h"
21 #include "ext/standard/info.h"
22 #include "ext/standard/basic_functions.h"
23 #include "zend_smart_str.h"
24 #include "SAPI.h"
25 
26 #include "ext/standard/php_string.h"
27 
/* Flag values for the preg_* result handling, grouped by prefix.
 * NOTE(review): the code consuming these flags is not in this part of the
 * file; presumably they back the userland constants of the same names --
 * confirm against the function implementations further down. */
#define PREG_PATTERN_ORDER			1
#define PREG_SET_ORDER				2
#define PREG_OFFSET_CAPTURE			(1<<8)
#define PREG_UNMATCHED_AS_NULL		(1<<9)

#define	PREG_SPLIT_NO_EMPTY			(1<<0)
#define PREG_SPLIT_DELIM_CAPTURE	(1<<1)
#define PREG_SPLIT_OFFSET_CAPTURE	(1<<2)

#define PREG_GREP_INVERT			(1<<0)

/* Internal flag kept in pcre_cache_entry.preg_options: set by
 * pcre_get_compiled_regex_cache_ex() when JIT compilation produced code. */
#define PREG_JIT                    (1<<3)

/* When the pattern cache holds exactly this many entries,
 * pcre_get_compiled_regex_cache_ex() evicts up to PCRE_CACHE_SIZE / 8
 * unreferenced entries before adding a new one. */
#define PCRE_CACHE_SIZE 4096

/* PHP_PCRE_JIT_SUPPORT is always defined (1 or 0), mirroring whether
 * HAVE_PCRE_JIT_SUPPORT is defined. */
#ifdef HAVE_PCRE_JIT_SUPPORT
#define PHP_PCRE_JIT_SUPPORT 1
#else
#define PHP_PCRE_JIT_SUPPORT 0
#endif

/* PCRE2 version string; allocated in MINIT via _pcre2_config_str() and
 * released with free() in MSHUTDOWN. */
char *php_pcre_version;
50 
51 #include "php_pcre_arginfo.h"
52 
/* One compiled pattern as stored in PCRE_G(pcre_cache). */
struct _pcre_cache_entry {
	/* Compiled pattern; released with pcre2_code_free() by the cache destructors. */
	pcre2_code *re;
	/* PHP-side flags; PREG_JIT is set when JIT code was generated for `re`. */
	uint32_t preg_options;
	/* PCRE2_INFO_CAPTURECOUNT of `re`. */
	uint32_t capture_count;
	/* PCRE2_INFO_NAMECOUNT of `re` (number of name table entries). */
	uint32_t name_count;
	/* Option bits that were passed to pcre2_compile(). */
	uint32_t compile_options;
	/* Starts at 0; pcre_clean_cache() only evicts entries whose refcount is 0. */
	uint32_t refcount;
};
61 
/* Module globals, accessed through PCRE_G(). */
PHPAPI ZEND_DECLARE_MODULE_GLOBALS(pcre)

#ifdef HAVE_PCRE_JIT_SUPPORT
/* Start and maximum sizes handed to pcre2_jit_stack_create(). */
#define PCRE_JIT_STACK_MIN_SIZE (32 * 1024)
#define PCRE_JIT_STACK_MAX_SIZE (192 * 1024)
/* Per-thread JIT stack, created by php_pcre_init_pcre2() when JIT is requested. */
ZEND_TLS pcre2_jit_stack *jit_stack = NULL;
#endif
/* General context using (infallible) system allocator. */
ZEND_TLS pcre2_general_context *gctx = NULL;
/* These two are global per thread for now. It would also be possible to use
	them per pattern: either copy them into the pce, or drop the global
	contexts entirely and create them for every pce. */
ZEND_TLS pcre2_compile_context *cctx = NULL;
ZEND_TLS pcre2_match_context   *mctx = NULL;
/* Preallocated match data block handed out by php_pcre_create_match_data();
	mdata_used is set while it is handed out and cleared again by
	php_pcre_free_match_data(). */
ZEND_TLS pcre2_match_data      *mdata = NULL;
ZEND_TLS bool              mdata_used = 0;
/* Set to 1 by php_pcre_init_pcre2() once every context above exists, 0 otherwise. */
ZEND_TLS uint8_t pcre2_init_ok = 0;
#if defined(ZTS) && defined(HAVE_PCRE_JIT_SUPPORT)
/* Mutex taken around the php_pcre_init_pcre2() retry in RINIT. It is only
	allocated and freed on the main thread. Note that each macro below expands
	to a complete statement. */
static MUTEX_T pcre_mt = NULL;
#define php_pcre_mutex_alloc() \
	if (tsrm_is_main_thread() && !pcre_mt) pcre_mt = tsrm_mutex_alloc();
#define php_pcre_mutex_free() \
	if (tsrm_is_main_thread() && pcre_mt) { tsrm_mutex_free(pcre_mt); pcre_mt = NULL; }
#define php_pcre_mutex_lock() tsrm_mutex_lock(pcre_mt);
#define php_pcre_mutex_unlock() tsrm_mutex_unlock(pcre_mt);
#else
/* Without ZTS + JIT the mutex macros expand to nothing. */
#define php_pcre_mutex_alloc()
#define php_pcre_mutex_free()
#define php_pcre_mutex_lock()
#define php_pcre_mutex_unlock()
#endif

/* Maps a ctype locale string to the character tables generated for it with
	pcre2_maketables(); values are released by php_pcre_free_char_table(). */
ZEND_TLS HashTable char_tables;
95 
/* Destructor for char_tables values: releases the stored table pointer with
 * the persistent allocator. */
static void php_pcre_free_char_table(zval *data)
{/*{{{*/
	pefree(Z_PTR_P(data), 1);
}/*}}}*/
101 
pcre_handle_exec_error(int pcre_code)102 static void pcre_handle_exec_error(int pcre_code) /* {{{ */
103 {
104 	int preg_code = 0;
105 
106 	switch (pcre_code) {
107 		case PCRE2_ERROR_MATCHLIMIT:
108 			preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
109 			break;
110 
111 		case PCRE2_ERROR_RECURSIONLIMIT:
112 			preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
113 			break;
114 
115 		case PCRE2_ERROR_BADUTFOFFSET:
116 			preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
117 			break;
118 
119 #ifdef HAVE_PCRE_JIT_SUPPORT
120 		case PCRE2_ERROR_JIT_STACKLIMIT:
121 			preg_code = PHP_PCRE_JIT_STACKLIMIT_ERROR;
122 			break;
123 #endif
124 
125 		default:
126 			if (pcre_code <= PCRE2_ERROR_UTF8_ERR1 && pcre_code >= PCRE2_ERROR_UTF8_ERR21) {
127 				preg_code = PHP_PCRE_BAD_UTF8_ERROR;
128 			} else  {
129 				preg_code = PHP_PCRE_INTERNAL_ERROR;
130 			}
131 			break;
132 	}
133 
134 	PCRE_G(error_code) = preg_code;
135 }
136 /* }}} */
137 
/* Returns the static, human-readable message for a PHP_PCRE_* error code;
 * unrecognised codes yield "Unknown error". */
static const char *php_pcre_get_error_msg(php_pcre_error_code error_code) /* {{{ */
{
	const char *msg;

	switch (error_code) {
		case PHP_PCRE_NO_ERROR:
			msg = "No error";
			break;
		case PHP_PCRE_INTERNAL_ERROR:
			msg = "Internal error";
			break;
		case PHP_PCRE_BAD_UTF8_ERROR:
			msg = "Malformed UTF-8 characters, possibly incorrectly encoded";
			break;
		case PHP_PCRE_BAD_UTF8_OFFSET_ERROR:
			msg = "The offset did not correspond to the beginning of a valid UTF-8 code point";
			break;
		case PHP_PCRE_BACKTRACK_LIMIT_ERROR:
			msg = "Backtrack limit exhausted";
			break;
		case PHP_PCRE_RECURSION_LIMIT_ERROR:
			msg = "Recursion limit exhausted";
			break;

#ifdef HAVE_PCRE_JIT_SUPPORT
		case PHP_PCRE_JIT_STACKLIMIT_ERROR:
			msg = "JIT stack limit exhausted";
			break;
#endif

		default:
			msg = "Unknown error";
			break;
	}

	return msg;
}
/* }}} */
164 
/* Destructor for the persistent pattern cache (installed in GINIT): frees the
 * compiled pattern and then the entry itself with free(). */
static void php_free_pcre_cache(zval *data) /* {{{ */
{
	pcre_cache_entry *entry = Z_PTR_P(data);

	if (entry) {
		pcre2_code_free(entry->re);
		free(entry);
	}
}
/* }}} */
173 
/* Destructor for the per-request pattern cache (installed in RINIT): frees the
 * compiled pattern and then the entry itself with efree(). */
static void php_efree_pcre_cache(zval *data) /* {{{ */
{
	pcre_cache_entry *entry = Z_PTR_P(data);

	if (entry) {
		pcre2_code_free(entry->re);
		efree(entry);
	}
}
/* }}} */
182 
/* PCRE2 allocation callback for gctx: allocates `size` bytes with the
 * persistent allocator. The user-data pointer is not used. */
static void *php_pcre_malloc(PCRE2_SIZE size, void *data)
{
	(void)data;

	void *block = pemalloc(size, 1);
	return block;
}
187 
/* PCRE2 release callback for gctx: frees `block` with the persistent
 * allocator. The user-data pointer is not used. */
static void php_pcre_free(void *block, void *data)
{
	(void)data;

	pefree(block, 1);
}
192 
/* PCRE2 allocation callback for PCRE_G(gctx_zmm): allocates `size` bytes with
 * emalloc(). The user-data pointer is not used. */
static void *php_pcre_emalloc(PCRE2_SIZE size, void *data)
{
	(void)data;

	void *block = emalloc(size);
	return block;
}
197 
/* PCRE2 release callback for PCRE_G(gctx_zmm): frees `block` with efree().
 * The user-data pointer is not used. */
static void php_pcre_efree(void *block, void *data)
{
	(void)data;

	efree(block);
}
202 
/* Extra compile options applied to every pattern (see
 * pcre2_set_compile_extra_options() calls in this file). */
#ifdef PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
	/* pcre 10.38 needs PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK, disabled by default */
#define PHP_PCRE_DEFAULT_EXTRA_COPTIONS PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
#else
#define PHP_PCRE_DEFAULT_EXTRA_COPTIONS 0
#endif

/* Size passed to pcre2_match_data_create() for the shared `mdata` block;
 * php_pcre_create_match_data() reuses that block only when
 * capture_count + 1 does not exceed this value. */
#define PHP_PCRE_PREALLOC_MDATA_SIZE 32
211 
php_pcre_init_pcre2(uint8_t jit)212 static void php_pcre_init_pcre2(uint8_t jit)
213 {/*{{{*/
214 	if (!gctx) {
215 		gctx = pcre2_general_context_create(php_pcre_malloc, php_pcre_free, NULL);
216 		if (!gctx) {
217 			pcre2_init_ok = 0;
218 			return;
219 		}
220 	}
221 
222 	if (!cctx) {
223 		cctx = pcre2_compile_context_create(gctx);
224 		if (!cctx) {
225 			pcre2_init_ok = 0;
226 			return;
227 		}
228 	}
229 
230 	pcre2_set_compile_extra_options(cctx, PHP_PCRE_DEFAULT_EXTRA_COPTIONS);
231 
232 	if (!mctx) {
233 		mctx = pcre2_match_context_create(gctx);
234 		if (!mctx) {
235 			pcre2_init_ok = 0;
236 			return;
237 		}
238 	}
239 
240 #ifdef HAVE_PCRE_JIT_SUPPORT
241 	if (jit && !jit_stack) {
242 		jit_stack = pcre2_jit_stack_create(PCRE_JIT_STACK_MIN_SIZE, PCRE_JIT_STACK_MAX_SIZE, gctx);
243 		if (!jit_stack) {
244 			pcre2_init_ok = 0;
245 			return;
246 		}
247 	}
248 #endif
249 
250 	if (!mdata) {
251 		mdata = pcre2_match_data_create(PHP_PCRE_PREALLOC_MDATA_SIZE, gctx);
252 		if (!mdata) {
253 			pcre2_init_ok = 0;
254 			return;
255 		}
256 	}
257 
258 	pcre2_init_ok = 1;
259 }/*}}}*/
260 
php_pcre_shutdown_pcre2(void)261 static void php_pcre_shutdown_pcre2(void)
262 {/*{{{*/
263 	if (gctx) {
264 		pcre2_general_context_free(gctx);
265 		gctx = NULL;
266 	}
267 
268 	if (cctx) {
269 		pcre2_compile_context_free(cctx);
270 		cctx = NULL;
271 	}
272 
273 	if (mctx) {
274 		pcre2_match_context_free(mctx);
275 		mctx = NULL;
276 	}
277 
278 #ifdef HAVE_PCRE_JIT_SUPPORT
279 	/* Stack may only be destroyed when no cached patterns
280 	 	possibly associated with it do exist. */
281 	if (jit_stack) {
282 		pcre2_jit_stack_free(jit_stack);
283 		jit_stack = NULL;
284 	}
285 #endif
286 
287 	if (mdata) {
288 		pcre2_match_data_free(mdata);
289 		mdata = NULL;
290 	}
291 
292 	pcre2_init_ok = 0;
293 }/*}}}*/
294 
PHP_GINIT_FUNCTION(pcre)295 static PHP_GINIT_FUNCTION(pcre) /* {{{ */
296 {
297 	php_pcre_mutex_alloc();
298 
299 	/* If we're on the CLI SAPI, there will only be one request, so we don't need the
300 	 * cache to survive after RSHUTDOWN. */
301 	pcre_globals->per_request_cache = strcmp(sapi_module.name, "cli") == 0;
302 	if (!pcre_globals->per_request_cache) {
303 		zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
304 	}
305 
306 	pcre_globals->backtrack_limit = 0;
307 	pcre_globals->recursion_limit = 0;
308 	pcre_globals->error_code      = PHP_PCRE_NO_ERROR;
309 	ZVAL_UNDEF(&pcre_globals->unmatched_null_pair);
310 	ZVAL_UNDEF(&pcre_globals->unmatched_empty_pair);
311 #ifdef HAVE_PCRE_JIT_SUPPORT
312 	pcre_globals->jit = 1;
313 #endif
314 
315 	php_pcre_init_pcre2(1);
316 	zend_hash_init(&char_tables, 1, NULL, php_pcre_free_char_table, 1);
317 }
318 /* }}} */
319 
PHP_GSHUTDOWN_FUNCTION(pcre)320 static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
321 {
322 	if (!pcre_globals->per_request_cache) {
323 		zend_hash_destroy(&pcre_globals->pcre_cache);
324 	}
325 
326 	php_pcre_shutdown_pcre2();
327 	zend_hash_destroy(&char_tables);
328 	php_pcre_mutex_free();
329 }
330 /* }}} */
331 
PHP_INI_MH(OnUpdateBacktrackLimit)332 static PHP_INI_MH(OnUpdateBacktrackLimit)
333 {/*{{{*/
334 	OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
335 	if (mctx) {
336 		pcre2_set_match_limit(mctx, (uint32_t)PCRE_G(backtrack_limit));
337 	}
338 
339 	return SUCCESS;
340 }/*}}}*/
341 
PHP_INI_MH(OnUpdateRecursionLimit)342 static PHP_INI_MH(OnUpdateRecursionLimit)
343 {/*{{{*/
344 	OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
345 	if (mctx) {
346 		pcre2_set_depth_limit(mctx, (uint32_t)PCRE_G(recursion_limit));
347 	}
348 
349 	return SUCCESS;
350 }/*}}}*/
351 
#ifdef HAVE_PCRE_JIT_SUPPORT
/* INI handler for pcre.jit: stores the value through OnUpdateBool(), then
 * assigns the per-thread JIT stack to the match context when JIT is enabled
 * and a stack exists, or clears the assignment otherwise. */
static PHP_INI_MH(OnUpdateJit)
{/*{{{*/
	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);

	pcre2_jit_stack *assigned = (PCRE_G(jit) && jit_stack) ? jit_stack : NULL;
	pcre2_jit_stack_assign(mctx, NULL, assigned);

	return SUCCESS;
}/*}}}*/
#endif
365 
/* INI directives of the extension. Each one is changeable everywhere
 * (PHP_INI_ALL) and has its own update handler defined above; pcre.jit only
 * exists when built with HAVE_PCRE_JIT_SUPPORT. */
PHP_INI_BEGIN()
	STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateBacktrackLimit, backtrack_limit, zend_pcre_globals, pcre_globals)
	STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000",  PHP_INI_ALL, OnUpdateRecursionLimit, recursion_limit, zend_pcre_globals, pcre_globals)
#ifdef HAVE_PCRE_JIT_SUPPORT
	STD_PHP_INI_BOOLEAN("pcre.jit",           "1",       PHP_INI_ALL, OnUpdateJit,            jit,             zend_pcre_globals, pcre_globals)
#endif
PHP_INI_END()
373 
/* Fetches a string-valued pcre2_config() item into a freshly malloc()ed
 * buffer.
 *
 * what  One of the string-valued PCRE2_CONFIG_* selectors.
 *
 * Returns NULL when the item is unavailable (pcre2_config() reports an error
 * or a zero length) or when the allocation fails. The caller owns the
 * returned buffer and must release it with free(). */
static char *_pcre2_config_str(uint32_t what)
{/*{{{*/
	/* With a NULL output pointer pcre2_config() only reports the required
	 * length; a negative result is an error code and must not be used as an
	 * allocation size. */
	int len = pcre2_config(what, NULL);
	if (len <= 0) {
		return NULL;
	}

	char *ret = malloc((size_t)len + 1);
	if (!ret) {
		return NULL;
	}

	len = pcre2_config(what, ret);
	if (len <= 0) {
		free(ret);
		return NULL;
	}

	return ret;
}/*}}}*/
387 
388 /* {{{ PHP_MINFO_FUNCTION(pcre) */
PHP_MINFO_FUNCTION(pcre)389 static PHP_MINFO_FUNCTION(pcre)
390 {
391 #ifdef HAVE_PCRE_JIT_SUPPORT
392 	uint32_t flag = 0;
393 	char *jit_target = _pcre2_config_str(PCRE2_CONFIG_JITTARGET);
394 #endif
395 	char *version = _pcre2_config_str(PCRE2_CONFIG_VERSION);
396 	char *unicode = _pcre2_config_str(PCRE2_CONFIG_UNICODE_VERSION);
397 
398 	php_info_print_table_start();
399 	php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled" );
400 	php_info_print_table_row(2, "PCRE Library Version", version);
401 	free(version);
402 	php_info_print_table_row(2, "PCRE Unicode Version", unicode);
403 	free(unicode);
404 
405 #ifdef HAVE_PCRE_JIT_SUPPORT
406 	if (!pcre2_config(PCRE2_CONFIG_JIT, &flag)) {
407 		php_info_print_table_row(2, "PCRE JIT Support", flag ? "enabled" : "disabled");
408 	} else {
409 		php_info_print_table_row(2, "PCRE JIT Support", "unknown" );
410 	}
411 	if (jit_target) {
412 		php_info_print_table_row(2, "PCRE JIT Target", jit_target);
413 	}
414 	free(jit_target);
415 #else
416 	php_info_print_table_row(2, "PCRE JIT Support", "not compiled in" );
417 #endif
418 
419 #ifdef HAVE_PCRE_VALGRIND_SUPPORT
420 	php_info_print_table_row(2, "PCRE Valgrind Support", "enabled" );
421 #endif
422 
423 	php_info_print_table_end();
424 
425 	DISPLAY_INI_ENTRIES();
426 }
427 /* }}} */
428 
429 /* {{{ PHP_MINIT_FUNCTION(pcre) */
PHP_MINIT_FUNCTION(pcre)430 static PHP_MINIT_FUNCTION(pcre)
431 {
432 #ifdef HAVE_PCRE_JIT_SUPPORT
433 	if (UNEXPECTED(!pcre2_init_ok)) {
434 		/* Retry. */
435 		php_pcre_init_pcre2(PCRE_G(jit));
436 		if (!pcre2_init_ok) {
437 			return FAILURE;
438 		}
439 	}
440 #endif
441 
442 	REGISTER_INI_ENTRIES();
443 
444 	php_pcre_version = _pcre2_config_str(PCRE2_CONFIG_VERSION);
445 
446 	register_php_pcre_symbols(module_number);
447 
448 	return SUCCESS;
449 }
450 /* }}} */
451 
452 /* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */
PHP_MSHUTDOWN_FUNCTION(pcre)453 static PHP_MSHUTDOWN_FUNCTION(pcre)
454 {
455 	UNREGISTER_INI_ENTRIES();
456 
457 	free(php_pcre_version);
458 
459 	return SUCCESS;
460 }
461 /* }}} */
462 
463 /* {{{ PHP_RINIT_FUNCTION(pcre) */
PHP_RINIT_FUNCTION(pcre)464 static PHP_RINIT_FUNCTION(pcre)
465 {
466 #ifdef HAVE_PCRE_JIT_SUPPORT
467 	if (UNEXPECTED(!pcre2_init_ok)) {
468 		/* Retry. */
469 		php_pcre_mutex_lock();
470 		php_pcre_init_pcre2(PCRE_G(jit));
471 		if (!pcre2_init_ok) {
472 			php_pcre_mutex_unlock();
473 			return FAILURE;
474 		}
475 		php_pcre_mutex_unlock();
476 	}
477 
478 	mdata_used = 0;
479 #endif
480 
481 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
482 	PCRE_G(gctx_zmm) = pcre2_general_context_create(php_pcre_emalloc, php_pcre_efree, NULL);
483 	if (!PCRE_G(gctx_zmm)) {
484 		return FAILURE;
485 	}
486 
487 	if (PCRE_G(per_request_cache)) {
488 		zend_hash_init(&PCRE_G(pcre_cache), 0, NULL, php_efree_pcre_cache, 0);
489 	}
490 
491 	return SUCCESS;
492 }
493 /* }}} */
494 
PHP_RSHUTDOWN_FUNCTION(pcre)495 static PHP_RSHUTDOWN_FUNCTION(pcre)
496 {
497 	pcre2_general_context_free(PCRE_G(gctx_zmm));
498 	PCRE_G(gctx_zmm) = NULL;
499 
500 	if (PCRE_G(per_request_cache)) {
501 		zend_hash_destroy(&PCRE_G(pcre_cache));
502 	}
503 
504 	zval_ptr_dtor(&PCRE_G(unmatched_null_pair));
505 	zval_ptr_dtor(&PCRE_G(unmatched_empty_pair));
506 	ZVAL_UNDEF(&PCRE_G(unmatched_null_pair));
507 	ZVAL_UNDEF(&PCRE_G(unmatched_empty_pair));
508 	return SUCCESS;
509 }
510 
511 /* {{{ static pcre_clean_cache */
pcre_clean_cache(zval * data,void * arg)512 static int pcre_clean_cache(zval *data, void *arg)
513 {
514 	pcre_cache_entry *pce = (pcre_cache_entry *) Z_PTR_P(data);
515 	int *num_clean = (int *)arg;
516 
517 	if (*num_clean > 0 && !pce->refcount) {
518 		(*num_clean)--;
519 		return ZEND_HASH_APPLY_REMOVE;
520 	} else {
521 		return ZEND_HASH_APPLY_KEEP;
522 	}
523 }
524 /* }}} */
525 
/* Releases every non-NULL name in a table built by make_subpats_table() and
 * then the table itself. */
static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats) {
	for (uint32_t idx = 0; idx < num_subpats; idx++) {
		zend_string *name = subpat_names[idx];
		if (!name) {
			continue;
		}
		zend_string_release_ex(name, false);
	}
	efree(subpat_names);
}
535 
536 /* {{{ static make_subpats_table */
make_subpats_table(uint32_t num_subpats,pcre_cache_entry * pce)537 static zend_string **make_subpats_table(uint32_t num_subpats, pcre_cache_entry *pce)
538 {
539 	uint32_t name_cnt = pce->name_count, name_size, ni = 0;
540 	char *name_table;
541 	zend_string **subpat_names;
542 	int rc1, rc2;
543 
544 	rc1 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMETABLE, &name_table);
545 	rc2 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMEENTRYSIZE, &name_size);
546 	if (rc1 < 0 || rc2 < 0) {
547 		php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc1 < 0 ? rc1 : rc2);
548 		return NULL;
549 	}
550 
551 	subpat_names = ecalloc(num_subpats, sizeof(zend_string *));
552 	while (ni++ < name_cnt) {
553 		unsigned short name_idx = 0x100 * (unsigned char)name_table[0] + (unsigned char)name_table[1];
554 		const char *name = name_table + 2;
555 		subpat_names[name_idx] = zend_string_init(name, strlen(name), 0);
556 		if (is_numeric_string(ZSTR_VAL(subpat_names[name_idx]), ZSTR_LEN(subpat_names[name_idx]), NULL, NULL, 0) > 0) {
557 			php_error_docref(NULL, E_WARNING, "Numeric named subpatterns are not allowed");
558 			free_subpats_table(subpat_names, num_subpats);
559 			return NULL;
560 		}
561 		name_table += name_size;
562 	}
563 	return subpat_names;
564 }
565 /* }}} */
566 
567 /* {{{ static calculate_unit_length */
568 /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE2_UTF. */
calculate_unit_length(pcre_cache_entry * pce,const char * start)569 static zend_always_inline size_t calculate_unit_length(pcre_cache_entry *pce, const char *start)
570 {
571 	size_t unit_len;
572 
573 	if (pce->compile_options & PCRE2_UTF) {
574 		const char *end = start;
575 
576 		/* skip continuation bytes */
577 		while ((*++end & 0xC0) == 0x80);
578 		unit_len = end - start;
579 	} else {
580 		unit_len = 1;
581 	}
582 	return unit_len;
583 }
584 /* }}} */
585 
586 /* {{{ pcre_get_compiled_regex_cache */
pcre_get_compiled_regex_cache_ex(zend_string * regex,bool locale_aware)587 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bool locale_aware)
588 {
589 	pcre2_code			*re = NULL;
590 #if 10 == PCRE2_MAJOR && 37 == PCRE2_MINOR && !HAVE_BUNDLED_PCRE
591 	uint32_t			 coptions = PCRE2_NO_START_OPTIMIZE;
592 #else
593 	uint32_t			 coptions = 0;
594 #endif
595 	uint32_t			 eoptions = PHP_PCRE_DEFAULT_EXTRA_COPTIONS;
596 	PCRE2_UCHAR	         error[128];
597 	PCRE2_SIZE           erroffset;
598 	int                  errnumber;
599 	char				 delimiter;
600 	char				 start_delimiter;
601 	char				 end_delimiter;
602 	char				*p, *pp;
603 	char				*pattern;
604 	size_t				 pattern_len;
605 	uint32_t			 poptions = 0;
606 	const uint8_t       *tables = NULL;
607 	zval                *zv;
608 	pcre_cache_entry	 new_entry;
609 	int					 rc;
610 	zend_string 		*key;
611 	pcre_cache_entry	*ret;
612 
613 	if (locale_aware && BG(ctype_string)) {
614 		key = zend_string_concat2(
615 			ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)),
616 			ZSTR_VAL(regex), ZSTR_LEN(regex));
617 	} else {
618 		key = regex;
619 	}
620 
621 	/* Try to lookup the cached regex entry, and if successful, just pass
622 	   back the compiled pattern, otherwise go on and compile it. */
623 	zv = zend_hash_find(&PCRE_G(pcre_cache), key);
624 	if (zv) {
625 		if (key != regex) {
626 			zend_string_release_ex(key, 0);
627 		}
628 		return (pcre_cache_entry*)Z_PTR_P(zv);
629 	}
630 
631 	p = ZSTR_VAL(regex);
632 	const char* end_p = ZSTR_VAL(regex) + ZSTR_LEN(regex);
633 
634 	/* Parse through the leading whitespace, and display a warning if we
635 	   get to the end without encountering a delimiter. */
636 	while (isspace((int)*(unsigned char *)p)) p++;
637 	if (p >= end_p) {
638 		if (key != regex) {
639 			zend_string_release_ex(key, 0);
640 		}
641 		php_error_docref(NULL, E_WARNING, "Empty regular expression");
642 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
643 		return NULL;
644 	}
645 
646 	/* Get the delimiter and display a warning if it is alphanumeric
647 	   or a backslash. */
648 	delimiter = *p++;
649 	if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\' || delimiter == '\0') {
650 		if (key != regex) {
651 			zend_string_release_ex(key, 0);
652 		}
653 		php_error_docref(NULL, E_WARNING, "Delimiter must not be alphanumeric, backslash, or NUL byte");
654 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
655 		return NULL;
656 	}
657 
658 	start_delimiter = delimiter;
659 	if ((pp = strchr("([{< )]}> )]}>", delimiter)))
660 		delimiter = pp[5];
661 	end_delimiter = delimiter;
662 
663 	pp = p;
664 
665 	if (start_delimiter == end_delimiter) {
666 		/* We need to iterate through the pattern, searching for the ending delimiter,
667 		   but skipping the backslashed delimiters.  If the ending delimiter is not
668 		   found, display a warning. */
669 		while (pp < end_p) {
670 			if (*pp == '\\' && pp + 1 < end_p) pp++;
671 			else if (*pp == delimiter)
672 				break;
673 			pp++;
674 		}
675 	} else {
676 		/* We iterate through the pattern, searching for the matching ending
677 		 * delimiter. For each matching starting delimiter, we increment nesting
678 		 * level, and decrement it for each matching ending delimiter. If we
679 		 * reach the end of the pattern without matching, display a warning.
680 		 */
681 		int brackets = 1; 	/* brackets nesting level */
682 		while (pp < end_p) {
683 			if (*pp == '\\' && pp + 1 < end_p) pp++;
684 			else if (*pp == end_delimiter && --brackets <= 0)
685 				break;
686 			else if (*pp == start_delimiter)
687 				brackets++;
688 			pp++;
689 		}
690 	}
691 
692 	if (pp >= end_p) {
693 		if (key != regex) {
694 			zend_string_release_ex(key, 0);
695 		}
696 		if (start_delimiter == end_delimiter) {
697 			php_error_docref(NULL,E_WARNING, "No ending delimiter '%c' found", delimiter);
698 		} else {
699 			php_error_docref(NULL,E_WARNING, "No ending matching delimiter '%c' found", delimiter);
700 		}
701 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
702 		return NULL;
703 	}
704 
705 	/* Make a copy of the actual pattern. */
706 	pattern_len = pp - p;
707 	pattern = estrndup(p, pattern_len);
708 
709 	/* Move on to the options */
710 	pp++;
711 
712 	/* Parse through the options, setting appropriate flags.  Display
713 	   a warning if we encounter an unknown modifier. */
714 	while (pp < end_p) {
715 		switch (*pp++) {
716 			/* Perl compatible options */
717 			case 'i':	coptions |= PCRE2_CASELESS;		break;
718 			case 'm':	coptions |= PCRE2_MULTILINE;		break;
719 			case 'n':	coptions |= PCRE2_NO_AUTO_CAPTURE;	break;
720 			case 's':	coptions |= PCRE2_DOTALL;		break;
721 			case 'x':	coptions |= PCRE2_EXTENDED;		break;
722 
723 			/* PCRE specific options */
724 			case 'A':	coptions |= PCRE2_ANCHORED;		break;
725 			case 'D':	coptions |= PCRE2_DOLLAR_ENDONLY;break;
726 #ifdef PCRE2_EXTRA_CASELESS_RESTRICT
727 			case 'r':	eoptions |= PCRE2_EXTRA_CASELESS_RESTRICT; break;
728 #endif
729 			case 'S':	/* Pass. */					break;
730 			case 'X':	/* Pass. */					break;
731 			case 'U':	coptions |= PCRE2_UNGREEDY;		break;
732 			case 'u':	coptions |= PCRE2_UTF;
733 	/* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
734 	   characters, even in UTF-8 mode. However, this can be changed by setting
735 	   the PCRE2_UCP option. */
736 #ifdef PCRE2_UCP
737 						coptions |= PCRE2_UCP;
738 #endif
739 				break;
740 			case 'J':	coptions |= PCRE2_DUPNAMES;		break;
741 
742 			case ' ':
743 			case '\n':
744 			case '\r':
745 				break;
746 
747 			case 'e': /* legacy eval */
748 			default:
749 				if (pp[-1]) {
750 					php_error_docref(NULL, E_WARNING, "Unknown modifier '%c'", pp[-1]);
751 				} else {
752 					php_error_docref(NULL, E_WARNING, "NUL byte is not a valid modifier");
753 				}
754 				pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
755 				efree(pattern);
756 				if (key != regex) {
757 					zend_string_release_ex(key, 0);
758 				}
759 				return NULL;
760 		}
761 	}
762 
763 	if (key != regex) {
764 		tables = (uint8_t *)zend_hash_find_ptr(&char_tables, BG(ctype_string));
765 		if (!tables) {
766 			zend_string *_k;
767 			tables = pcre2_maketables(gctx);
768 			if (UNEXPECTED(!tables)) {
769 				php_error_docref(NULL,E_WARNING, "Failed to generate locale character tables");
770 				pcre_handle_exec_error(PCRE2_ERROR_NOMEMORY);
771 				zend_string_release_ex(key, 0);
772 				efree(pattern);
773 				return NULL;
774 			}
775 			_k = zend_string_init(ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)), 1);
776 			GC_MAKE_PERSISTENT_LOCAL(_k);
777 			zend_hash_add_ptr(&char_tables, _k, (void *)tables);
778 			zend_string_release(_k);
779 		}
780 	}
781 	pcre2_set_character_tables(cctx, tables);
782 
783 	pcre2_set_compile_extra_options(cctx, eoptions);
784 
785 	/* Compile pattern and display a warning if compilation failed. */
786 	re = pcre2_compile((PCRE2_SPTR)pattern, pattern_len, coptions, &errnumber, &erroffset, cctx);
787 
788 	if (re == NULL) {
789 		if (key != regex) {
790 			zend_string_release_ex(key, 0);
791 		}
792 		pcre2_get_error_message(errnumber, error, sizeof(error));
793 		php_error_docref(NULL,E_WARNING, "Compilation failed: %s at offset %zu", error, erroffset);
794 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
795 		efree(pattern);
796 		return NULL;
797 	}
798 
799 #ifdef HAVE_PCRE_JIT_SUPPORT
800 	if (PCRE_G(jit)) {
801 		/* Enable PCRE JIT compiler */
802 		rc = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
803 		if (EXPECTED(rc >= 0)) {
804 			size_t jit_size = 0;
805 			if (!pcre2_pattern_info(re, PCRE2_INFO_JITSIZE, &jit_size) && jit_size > 0) {
806 				poptions |= PREG_JIT;
807 			}
808 		} else if (rc == PCRE2_ERROR_NOMEMORY) {
809 			php_error_docref(NULL, E_WARNING,
810 				"Allocation of JIT memory failed, PCRE JIT will be disabled. "
811 				"This is likely caused by security restrictions. "
812 				"Either grant PHP permission to allocate executable memory, or set pcre.jit=0");
813 			PCRE_G(jit) = 0;
814 		} else {
815 			pcre2_get_error_message(rc, error, sizeof(error));
816 			php_error_docref(NULL, E_WARNING, "JIT compilation failed: %s", error);
817 			pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
818 		}
819 	}
820 #endif
821 	efree(pattern);
822 
823 	/*
824 	 * If we reached cache limit, clean out the items from the head of the list;
825 	 * these are supposedly the oldest ones (but not necessarily the least used
826 	 * ones).
827 	 */
828 	if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
829 		int num_clean = PCRE_CACHE_SIZE / 8;
830 		zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean);
831 	}
832 
833 	/* Store the compiled pattern and extra info in the cache. */
834 	new_entry.re = re;
835 	new_entry.preg_options = poptions;
836 	new_entry.compile_options = coptions;
837 	new_entry.refcount = 0;
838 
839 	rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &new_entry.capture_count);
840 	if (rc < 0) {
841 		if (key != regex) {
842 			zend_string_release_ex(key, 0);
843 		}
844 		php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc);
845 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
846 		return NULL;
847 	}
848 
849 	rc = pcre2_pattern_info(re, PCRE2_INFO_NAMECOUNT, &new_entry.name_count);
850 	if (rc < 0) {
851 		if (key != regex) {
852 			zend_string_release_ex(key, 0);
853 		}
854 		php_error_docref(NULL, E_WARNING, "Internal pcre_pattern_info() error %d", rc);
855 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
856 		return NULL;
857 	}
858 
859 	/*
860 	 * Interned strings are not duplicated when stored in HashTable,
861 	 * but all the interned strings created during HTTP request are removed
862 	 * at end of request. However PCRE_G(pcre_cache) must be consistent
863 	 * on the next request as well. So we disable usage of interned strings
864 	 * as hash keys especually for this table.
865 	 * See bug #63180
866 	 */
867 	if (!(GC_FLAGS(key) & IS_STR_PERMANENT) && !PCRE_G(per_request_cache)) {
868 		zend_string *str = zend_string_init(ZSTR_VAL(key), ZSTR_LEN(key), 1);
869 		GC_MAKE_PERSISTENT_LOCAL(str);
870 
871 		ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), str, &new_entry, sizeof(pcre_cache_entry));
872 		zend_string_release(str);
873 	} else {
874 		ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), key, &new_entry, sizeof(pcre_cache_entry));
875 	}
876 
877 	if (key != regex) {
878 		zend_string_release_ex(key, 0);
879 	}
880 
881 	return ret;
882 }
883 /* }}} */
884 
885 /* {{{ pcre_get_compiled_regex_cache */
pcre_get_compiled_regex_cache(zend_string * regex)886 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_string *regex)
887 {
888 	return pcre_get_compiled_regex_cache_ex(regex, true);
889 }
890 /* }}} */
891 
892 /* {{{ pcre_get_compiled_regex */
pcre_get_compiled_regex(zend_string * regex,uint32_t * capture_count)893 PHPAPI pcre2_code *pcre_get_compiled_regex(zend_string *regex, uint32_t *capture_count)
894 {
895 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex);
896 
897 	if (capture_count) {
898 		*capture_count = pce ? pce->capture_count : 0;
899 	}
900 
901 	return pce ? pce->re : NULL;
902 }
903 /* }}} */
904 
905 /* XXX For the cases where it's only about match yes/no and no capture
906 		required, perhaps just a minimum sized data would suffice. */
php_pcre_create_match_data(uint32_t capture_count,pcre2_code * re)907 PHPAPI pcre2_match_data *php_pcre_create_match_data(uint32_t capture_count, pcre2_code *re)
908 {/*{{{*/
909 
910 	assert(NULL != re);
911 
912 	if (EXPECTED(!mdata_used)) {
913 		int rc = 0;
914 
915 		if (!capture_count) {
916 			/* As we deal with a non cached pattern, no other way to gather this info. */
917 			rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &capture_count);
918 		}
919 
920 		if (rc >= 0 && capture_count + 1 <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
921 			mdata_used = 1;
922 			return mdata;
923 		}
924 	}
925 
926 	return pcre2_match_data_create_from_pattern(re, gctx);
927 }/*}}}*/
928 
php_pcre_free_match_data(pcre2_match_data * match_data)929 PHPAPI void php_pcre_free_match_data(pcre2_match_data *match_data)
930 {/*{{{*/
931 	if (UNEXPECTED(match_data != mdata)) {
932 		pcre2_match_data_free(match_data);
933 	} else {
934 		mdata_used = 0;
935 	}
936 }/*}}}*/
937 
init_unmatched_null_pair(void)938 static void init_unmatched_null_pair(void) {
939 	zval val1, val2;
940 	ZVAL_NULL(&val1);
941 	ZVAL_LONG(&val2, -1);
942 	ZVAL_ARR(&PCRE_G(unmatched_null_pair), zend_new_pair(&val1, &val2));
943 }
944 
init_unmatched_empty_pair(void)945 static void init_unmatched_empty_pair(void) {
946 	zval val1, val2;
947 	ZVAL_EMPTY_STRING(&val1);
948 	ZVAL_LONG(&val2, -1);
949 	ZVAL_ARR(&PCRE_G(unmatched_empty_pair), zend_new_pair(&val1, &val2));
950 }
951 
populate_match_value_str(zval * val,const char * subject,PCRE2_SIZE start_offset,PCRE2_SIZE end_offset)952 static zend_always_inline void populate_match_value_str(
953 		zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset) {
954 	ZVAL_STRINGL_FAST(val, subject + start_offset, end_offset - start_offset);
955 }
956 
populate_match_value(zval * val,const char * subject,PCRE2_SIZE start_offset,PCRE2_SIZE end_offset,bool unmatched_as_null)957 static zend_always_inline void populate_match_value(
958 		zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
959 		bool unmatched_as_null) {
960 	if (PCRE2_UNSET == start_offset) {
961 		if (unmatched_as_null) {
962 			ZVAL_NULL(val);
963 		} else {
964 			ZVAL_EMPTY_STRING(val);
965 		}
966 	} else {
967 		populate_match_value_str(val, subject, start_offset, end_offset);
968 	}
969 }
970 
/* Stores `val` under `name` in `subpats` and takes a reference on it when it
 * was actually stored.
 *
 * If the DUPNAMES option is used, multiple subpatterns might have the same
 * name. The one that actually has a value must win, so an unmatched value is
 * only added when the name is not present yet, while a matched value always
 * overwrites. */
static inline void add_named(
		HashTable *const subpats, zend_string *name, zval *val, bool unmatched) {
	if (unmatched) {
		if (zend_hash_add(subpats, name, val) == NULL) {
			/* Name already present: keep the existing value. */
			return;
		}
	} else {
		zend_hash_update(subpats, name, val);
	}

	Z_TRY_ADDREF_P(val);
}
984 
985 /* {{{ add_offset_pair */
/* Append a (match, offset) pair for one capture group to `result`, and also
 * store it under `name` when a name is given.
 *
 * Groups whose start offset is PCRE2_UNSET reuse a pair kept in the module
 * globals (created on first use): [null, -1] when `unmatched_as_null` is set,
 * ["", -1] otherwise. */
static inline void add_offset_pair(
		HashTable *const result, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
		zend_string *name, zend_long unmatched_as_null)
{
	const bool is_unset = (start_offset == PCRE2_UNSET);
	zval pair;

	if (!is_unset) {
		zval text, position;

		populate_match_value_str(&text, subject, start_offset, end_offset);
		ZVAL_LONG(&position, start_offset);
		ZVAL_ARR(&pair, zend_new_pair(&text, &position));
	} else if (unmatched_as_null) {
		if (Z_ISUNDEF(PCRE_G(unmatched_null_pair))) {
			init_unmatched_null_pair();
		}
		ZVAL_COPY(&pair, &PCRE_G(unmatched_null_pair));
	} else {
		if (Z_ISUNDEF(PCRE_G(unmatched_empty_pair))) {
			init_unmatched_empty_pair();
		}
		ZVAL_COPY(&pair, &PCRE_G(unmatched_empty_pair));
	}

	if (name != NULL) {
		add_named(result, name, &pair, is_unset);
	}
	zend_hash_next_index_insert_new(result, &pair);
}
1017 /* }}} */
1018 
/* Fill the array `subpats` with the capture groups of one match.
 *
 * subject       - subject bytes the offsets refer to
 * offsets       - offset pairs: group n spans offsets[2n] .. offsets[2n+1]
 * subpat_names  - optional table of group names indexed by group number
 *                 (entries may be NULL); NULL when no names are in use
 * num_subpats   - total number of groups, including group 0
 * count         - number of groups that have offsets for this match
 * mark          - optional mark name, stored under the "MARK" key
 * flags         - PREG_OFFSET_CAPTURE and PREG_UNMATCHED_AS_NULL are honoured
 *
 * Groups from `count` up to `num_subpats` are only added (as unmatched)
 * when PREG_UNMATCHED_AS_NULL is set. */
static void populate_subpat_array(
		zval *subpats, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names,
		uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags) {
	const zend_long want_offsets = flags & PREG_OFFSET_CAPTURE;
	const zend_long null_unmatched = flags & PREG_UNMATCHED_AS_NULL;
	HashTable *ht = Z_ARRVAL_P(subpats);
	int idx;

	if (want_offsets) {
		/* Every entry is a (value, offset) pair. */
		for (idx = 0; idx < count; idx++) {
			zend_string *group_name = subpat_names ? subpat_names[idx] : NULL;

			add_offset_pair(
				ht, subject, offsets[2*idx], offsets[2*idx+1], group_name, null_unmatched);
		}
		if (null_unmatched) {
			for (idx = count; idx < num_subpats; idx++) {
				zend_string *group_name = subpat_names ? subpat_names[idx] : NULL;

				add_offset_pair(ht, NULL, PCRE2_UNSET, PCRE2_UNSET, group_name, 1);
			}
		}
	} else {
		/* Every entry is a plain value. */
		for (idx = 0; idx < count; idx++) {
			zval entry;

			populate_match_value(
				&entry, subject, offsets[2*idx], offsets[2*idx+1], null_unmatched);
			if (subpat_names && subpat_names[idx]) {
				add_named(ht, subpat_names[idx], &entry, offsets[2*idx] == PCRE2_UNSET);
			}
			zend_hash_next_index_insert_new(ht, &entry);
		}
		if (null_unmatched) {
			for (idx = count; idx < num_subpats; idx++) {
				zval entry;

				if (!subpat_names) {
					add_next_index_null(subpats);
					continue;
				}

				ZVAL_NULL(&entry);
				if (subpat_names[idx]) {
					zend_hash_add(ht, subpat_names[idx], &entry);
				}
				zend_hash_next_index_insert_new(ht, &entry);
			}
		}
	}

	/* Expose the mark name, if there is one. */
	if (mark) {
		add_assoc_string_ex(subpats, "MARK", sizeof("MARK") - 1, (char *)mark);
	}
}
1087 
/* Shared entry point behind preg_match() and preg_match_all(): parses the
 * userland arguments, fetches the compiled pattern and delegates to
 * php_pcre_match_impl(). Returns false when the pattern cannot be obtained. */
static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, bool global) /* {{{ */
{
	zend_string *regex;               /* pattern source */
	zend_string *subject;             /* string to search */
	zval *subpats = NULL;             /* optional output array */
	zend_long flags = 0;              /* PREG_* flags */
	zend_long start_offset = 0;       /* offset at which to start searching */
	pcre_cache_entry *pce;            /* compiled pattern */

	ZEND_PARSE_PARAMETERS_START(2, 5)
		Z_PARAM_STR(regex)
		Z_PARAM_STR(subject)
		Z_PARAM_OPTIONAL
		Z_PARAM_ZVAL(subpats)
		Z_PARAM_LONG(flags)
		Z_PARAM_LONG(start_offset)
	ZEND_PARSE_PARAMETERS_END();

	/* Look the pattern up in the cache, compiling it if needed. */
	pce = pcre_get_compiled_regex_cache(regex);
	if (pce == NULL) {
		RETURN_FALSE;
	}

	/* Hold a reference on the cache entry for the duration of the match. */
	pce->refcount++;
	php_pcre_match_impl(pce, subject, return_value, subpats,
		global, flags, start_offset);
	pce->refcount--;
}
1117 /* }}} */
1118 
is_known_valid_utf8(zend_string * subject_str,PCRE2_SIZE start_offset)1119 static zend_always_inline bool is_known_valid_utf8(
1120 		zend_string *subject_str, PCRE2_SIZE start_offset) {
1121 	if (!ZSTR_IS_VALID_UTF8(subject_str)) {
1122 		/* We don't know whether the string is valid UTF-8 or not. */
1123 		return 0;
1124 	}
1125 
1126 	if (start_offset == ZSTR_LEN(subject_str)) {
1127 		/* Degenerate case: Offset points to end of string. */
1128 		return 1;
1129 	}
1130 
1131 	/* Check that the offset does not point to an UTF-8 continuation byte. */
1132 	return (ZSTR_VAL(subject_str)[start_offset] & 0xc0) != 0x80;
1133 }
1134 
1135 /* {{{ php_pcre_match_impl() */
php_pcre_match_impl(pcre_cache_entry * pce,zend_string * subject_str,zval * return_value,zval * subpats,bool global,zend_long flags,zend_off_t start_offset)1136 PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
1137 	zval *subpats, bool global, zend_long flags, zend_off_t start_offset)
1138 {
1139 	zval			 result_set;		/* Holds a set of subpatterns after
1140 										   a global match */
1141 	HashTable	   **match_sets = NULL;	/* An array of sets of matches for each
1142 										   subpattern after a global match */
1143 	uint32_t		 options;			/* Execution options */
1144 	int				 count;				/* Count of matched subpatterns */
1145 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
1146 	int				 matched;			/* Has anything matched */
1147 	zend_string	   **subpat_names;		/* Array for named subpatterns */
1148 	size_t			 i;
1149 	uint32_t		 subpats_order;		/* Order of subpattern matches */
1150 	uint32_t		 offset_capture;	/* Capture match offsets: yes/no */
1151 	zend_long		 unmatched_as_null;	/* Null non-matches: yes/no */
1152 	PCRE2_SPTR       mark = NULL;		/* Target for MARK name */
1153 	HashTable		*marks = NULL;		/* Array of marks for PREG_PATTERN_ORDER */
1154 	pcre2_match_data *match_data;
1155 	PCRE2_SIZE		 start_offset2, orig_start_offset;
1156 
1157 	char *subject = ZSTR_VAL(subject_str);
1158 	size_t subject_len = ZSTR_LEN(subject_str);
1159 
1160 	/* Overwrite the passed-in value for subpatterns with an empty array. */
1161 	if (subpats != NULL) {
1162 		subpats = zend_try_array_init(subpats);
1163 		if (!subpats) {
1164 			RETURN_THROWS();
1165 		}
1166 	}
1167 
1168 	subpats_order = global ? PREG_PATTERN_ORDER : 0;
1169 
1170 	if (flags) {
1171 		offset_capture = flags & PREG_OFFSET_CAPTURE;
1172 		unmatched_as_null = flags & PREG_UNMATCHED_AS_NULL;
1173 
1174 		/*
1175 		 * subpats_order is pre-set to pattern mode so we change it only if
1176 		 * necessary.
1177 		 */
1178 		if (flags & 0xff) {
1179 			subpats_order = flags & 0xff;
1180 			if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
1181 				(!global && subpats_order != 0)) {
1182 				zend_argument_value_error(4, "must be a PREG_* constant");
1183 				RETURN_THROWS();
1184 			}
1185 		}
1186 	} else {
1187 		offset_capture = 0;
1188 		unmatched_as_null = 0;
1189 	}
1190 
1191 	/* Negative offset counts from the end of the string. */
1192 	if (start_offset < 0) {
1193 		if ((PCRE2_SIZE)-start_offset <= subject_len) {
1194 			start_offset2 = subject_len + start_offset;
1195 		} else {
1196 			start_offset2 = 0;
1197 		}
1198 	} else {
1199 		start_offset2 = (PCRE2_SIZE)start_offset;
1200 	}
1201 
1202 	if (start_offset2 > subject_len) {
1203 		pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET);
1204 		RETURN_FALSE;
1205 	}
1206 
1207 	/* Calculate the size of the offsets array, and allocate memory for it. */
1208 	num_subpats = pce->capture_count + 1;
1209 
1210 	/*
1211 	 * Build a mapping from subpattern numbers to their names. We will
1212 	 * allocate the table only if there are any named subpatterns.
1213 	 */
1214 	subpat_names = NULL;
1215 	if (subpats && pce->name_count > 0) {
1216 		subpat_names = make_subpats_table(num_subpats, pce);
1217 		if (!subpat_names) {
1218 			RETURN_FALSE;
1219 		}
1220 	}
1221 
1222 	matched = 0;
1223 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1224 
1225 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
1226 		match_data = mdata;
1227 	} else {
1228 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
1229 		if (!match_data) {
1230 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1231 			if (subpat_names) {
1232 				free_subpats_table(subpat_names, num_subpats);
1233 			}
1234 			RETURN_FALSE;
1235 		}
1236 	}
1237 
1238 	/* Allocate match sets array and initialize the values. */
1239 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
1240 		match_sets = safe_emalloc(num_subpats, sizeof(HashTable *), 0);
1241 		for (i=0; i<num_subpats; i++) {
1242 			match_sets[i] = zend_new_array(0);
1243 		}
1244 	}
1245 
1246 	/* Array of subpattern offsets */
1247 	PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
1248 
1249 	orig_start_offset = start_offset2;
1250 	options =
1251 		(pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, orig_start_offset)
1252 			? 0 : PCRE2_NO_UTF_CHECK;
1253 
1254 	/* Execute the regular expression. */
1255 #ifdef HAVE_PCRE_JIT_SUPPORT
1256 	if ((pce->preg_options & PREG_JIT) && options) {
1257 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1258 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1259 	} else
1260 #endif
1261 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1262 			options, match_data, mctx);
1263 
1264 	while (1) {
1265 		/* If something has matched */
1266 		if (count >= 0) {
1267 			/* Check for too many substrings condition. */
1268 			if (UNEXPECTED(count == 0)) {
1269 				php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings");
1270 				count = num_subpats;
1271 			}
1272 
1273 matched:
1274 			matched++;
1275 
1276 			/* If subpatterns array has been passed, fill it in with values. */
1277 			if (subpats != NULL) {
1278 				/* Try to get the list of substrings and display a warning if failed. */
1279 				if (UNEXPECTED(offsets[1] < offsets[0])) {
1280 					if (subpat_names) {
1281 						free_subpats_table(subpat_names, num_subpats);
1282 					}
1283 					if (match_sets) efree(match_sets);
1284 					php_error_docref(NULL, E_WARNING, "Get subpatterns list failed");
1285 					RETURN_FALSE;
1286 				}
1287 
1288 				if (global) {	/* global pattern matching */
1289 					if (subpats_order == PREG_PATTERN_ORDER) {
1290 						/* For each subpattern, insert it into the appropriate array. */
1291 						if (offset_capture) {
1292 							for (i = 0; i < count; i++) {
1293 								add_offset_pair(
1294 									match_sets[i], subject, offsets[2*i], offsets[2*i+1],
1295 									NULL, unmatched_as_null);
1296 							}
1297 						} else {
1298 							for (i = 0; i < count; i++) {
1299 								zval val;
1300 								populate_match_value(
1301 									&val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null);
1302 								zend_hash_next_index_insert_new(match_sets[i], &val);
1303 							}
1304 						}
1305 						mark = pcre2_get_mark(match_data);
1306 						/* Add MARK, if available */
1307 						if (mark) {
1308 							if (!marks) {
1309 								marks = zend_new_array(0);
1310 							}
1311 							zval tmp;
1312 							ZVAL_STRING(&tmp, (char *) mark);
1313 							zend_hash_index_add_new(marks, matched - 1, &tmp);
1314 						}
1315 						/*
1316 						 * If the number of captured subpatterns on this run is
1317 						 * less than the total possible number, pad the result
1318 						 * arrays with NULLs or empty strings.
1319 						 */
1320 						if (count < num_subpats) {
1321 							for (int i = count; i < num_subpats; i++) {
1322 								if (offset_capture) {
1323 									add_offset_pair(
1324 										match_sets[i], NULL, PCRE2_UNSET, PCRE2_UNSET,
1325 										NULL, unmatched_as_null);
1326 								} else if (unmatched_as_null) {
1327 									zval tmp;
1328 									ZVAL_NULL(&tmp);
1329 									zend_hash_next_index_insert_new(match_sets[i], &tmp);
1330 								} else {
1331 									zval tmp;
1332 									ZVAL_EMPTY_STRING(&tmp);
1333 									zend_hash_next_index_insert_new(match_sets[i], &tmp);
1334 								}
1335 							}
1336 						}
1337 					} else {
1338 						/* Allocate and populate the result set array */
1339 						mark = pcre2_get_mark(match_data);
1340 						array_init_size(&result_set, count + (mark ? 1 : 0));
1341 						populate_subpat_array(
1342 							&result_set, subject, offsets, subpat_names,
1343 							num_subpats, count, mark, flags);
1344 						/* And add it to the output array */
1345 						zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &result_set);
1346 					}
1347 				} else {			/* single pattern matching */
1348 					/* For each subpattern, insert it into the subpatterns array. */
1349 					mark = pcre2_get_mark(match_data);
1350 					populate_subpat_array(
1351 						subpats, subject, offsets, subpat_names, num_subpats, count, mark, flags);
1352 					break;
1353 				}
1354 			}
1355 
1356 			/* Advance to the next piece. */
1357 			start_offset2 = offsets[1];
1358 
1359 			/* If we have matched an empty string, mimic what Perl's /g options does.
1360 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
1361 			   the match again at the same point. If this fails (picked up above) we
1362 			   advance to the next character. */
1363 			if (start_offset2 == offsets[0]) {
1364 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1365 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
1366 				if (count >= 0) {
1367 					if (global) {
1368 						goto matched;
1369 					} else {
1370 						break;
1371 					}
1372 				} else if (count == PCRE2_ERROR_NOMATCH) {
1373 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
1374 					   this is not necessarily the end. We need to advance
1375 					   the start offset, and continue. Fudge the offset values
1376 					   to achieve this, unless we're already at the end of the string. */
1377 					if (start_offset2 < subject_len) {
1378 						size_t unit_len = calculate_unit_length(pce, subject + start_offset2);
1379 
1380 						start_offset2 += unit_len;
1381 					} else {
1382 						break;
1383 					}
1384 				} else {
1385 					goto error;
1386 				}
1387 			}
1388 		} else if (count == PCRE2_ERROR_NOMATCH) {
1389 			break;
1390 		} else {
1391 error:
1392 			pcre_handle_exec_error(count);
1393 			break;
1394 		}
1395 
1396 		if (!global) {
1397 			break;
1398 		}
1399 
1400 		/* Execute the regular expression. */
1401 #ifdef HAVE_PCRE_JIT_SUPPORT
1402 		if ((pce->preg_options & PREG_JIT)) {
1403 			if (start_offset2 > subject_len) {
1404 				pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET);
1405 				break;
1406 			}
1407 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1408 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1409 		} else
1410 #endif
1411 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1412 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1413 	}
1414 	if (match_data != mdata) {
1415 		pcre2_match_data_free(match_data);
1416 	}
1417 
1418 	/* Add the match sets to the output array and clean up */
1419 	if (match_sets) {
1420 		if (subpat_names) {
1421 			for (i = 0; i < num_subpats; i++) {
1422 				zval wrapper;
1423 				ZVAL_ARR(&wrapper, match_sets[i]);
1424 				if (subpat_names[i]) {
1425 					zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i], &wrapper);
1426 					GC_ADDREF(match_sets[i]);
1427 				}
1428 				zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &wrapper);
1429 			}
1430 		} else {
1431 			for (i = 0; i < num_subpats; i++) {
1432 				zval wrapper;
1433 				ZVAL_ARR(&wrapper, match_sets[i]);
1434 				zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &wrapper);
1435 			}
1436 		}
1437 		efree(match_sets);
1438 
1439 		if (marks) {
1440 			zval tmp;
1441 			ZVAL_ARR(&tmp, marks);
1442 			zend_hash_str_update(Z_ARRVAL_P(subpats), "MARK", sizeof("MARK") - 1, &tmp);
1443 		}
1444 	}
1445 
1446 	if (subpat_names) {
1447 		free_subpats_table(subpat_names, num_subpats);
1448 	}
1449 
1450 	if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
1451 		/* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */
1452 		if ((pce->compile_options & PCRE2_UTF)
1453 				&& !ZSTR_IS_INTERNED(subject_str) && orig_start_offset == 0) {
1454 			GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8);
1455 		}
1456 
1457 		RETVAL_LONG(matched);
1458 	} else {
1459 		RETVAL_FALSE;
1460 	}
1461 }
1462 /* }}} */
1463 
1464 /* {{{ Perform a Perl-style regular expression match */
PHP_FUNCTION(preg_match)1465 PHP_FUNCTION(preg_match)
1466 {
1467 	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, false);
1468 }
1469 /* }}} */
1470 
1471 ZEND_FRAMELESS_FUNCTION(preg_match, 2)
1472 {
1473 	zval regex_tmp, subject_tmp;
1474 	zend_string *regex, *subject;
1475 
1476 	Z_FLF_PARAM_STR(1, regex, regex_tmp);
1477 	Z_FLF_PARAM_STR(2, subject, subject_tmp);
1478 
1479 	/* Compile regex or get it from cache. */
1480 	pcre_cache_entry *pce;
1481 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
1482 		RETURN_FALSE;
1483 	}
1484 
1485 	pce->refcount++;
1486 	php_pcre_match_impl(pce, subject, return_value, /* subpats */ NULL,
1487 		/* global */ false, /* flags */ 0, /* start_offset */ 0);
1488 	pce->refcount--;
1489 
1490 flf_clean:
1491 	Z_FLF_PARAM_FREE_STR(1, regex_tmp);
1492 	Z_FLF_PARAM_FREE_STR(2, subject_tmp);
1493 }
1494 
1495 /* {{{ Perform a Perl-style global regular expression match */
PHP_FUNCTION(preg_match_all)1496 PHP_FUNCTION(preg_match_all)
1497 {
1498 	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, true);
1499 }
1500 /* }}} */
1501 
1502 /* {{{ preg_get_backref */
/* Parse a backreference at the start of a replacement string.
 *
 * *str must point at the introducer character. One character is skipped
 * unconditionally (the callers in this file use '\\' or '$'), or two when the
 * text starts with "${". One or two decimal digits must follow, and the
 * braced form must be closed by '}'.
 *
 * On success, returns 1, stores the reference number (0-99) in *backref and
 * advances *str past the whole reference. On failure, returns 0 and leaves
 * *str untouched; *backref may already have been overwritten. */
static int preg_get_backref(char **str, int *backref)
{
	bool in_brace = false;
	char *walk = *str;

	/* A reference needs at least an introducer plus one more character.
	 * Checking walk[0] first keeps an empty string from being read past
	 * its terminator. */
	if (walk[0] == 0 || walk[1] == 0) {
		return 0;
	}

	if (walk[0] == '$' && walk[1] == '{') {
		in_brace = true;
		walk++;
	}
	walk++;

	/* First digit is mandatory. */
	if (*walk < '0' || *walk > '9') {
		return 0;
	}
	*backref = *walk - '0';
	walk++;

	/* Second digit is optional. */
	if (*walk >= '0' && *walk <= '9') {
		*backref = *backref * 10 + *walk - '0';
		walk++;
	}

	if (in_brace) {
		if (*walk != '}') {
			return 0;
		}
		walk++;
	}

	*str = walk;
	return 1;
}
1538 /* }}} */
1539 
1540 /* {{{ preg_do_repl_func */
/* Invoke the user replacement callback for one match and return the
 * replacement text.
 *
 * The callback receives a single array argument built by
 * populate_subpat_array(). A string return value is used as is, any other
 * value is converted to a string. When the call fails or yields no value, the
 * matched text (offsets[0] .. offsets[1]) is returned unchanged, with a
 * warning unless an exception is pending. */
static zend_string *preg_do_repl_func(zend_fcall_info *fci, zend_fcall_info_cache *fcc, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names, uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags)
{
	zval		 matches;			/* Callback argument */
	zval		 callback_ret;		/* Callback return value */
	zend_string *replacement;

	array_init_size(&matches, count + (mark ? 1 : 0));
	populate_subpat_array(&matches, subject, offsets, subpat_names, num_subpats, count, mark, flags);

	fci->params = &matches;
	fci->param_count = 1;
	fci->retval = &callback_ret;

	if (zend_call_function(fci, fcc) != SUCCESS || Z_TYPE(callback_ret) == IS_UNDEF) {
		if (!EG(exception)) {
			php_error_docref(NULL, E_WARNING, "Unable to call custom replacement function");
		}

		/* Fall back to the matched text itself. */
		replacement = zend_string_init(&subject[offsets[0]], offsets[1] - offsets[0], 0);
	} else if (EXPECTED(Z_TYPE(callback_ret) == IS_STRING)) {
		/* Take over the string held by the return value. */
		replacement = Z_STR(callback_ret);
	} else {
		replacement = zval_get_string_func(&callback_ret);
		zval_ptr_dtor(&callback_ret);
	}

	zval_ptr_dtor(&matches);

	return replacement;
}
1573 /* }}} */
1574 
1575 /* {{{ php_pcre_replace */
php_pcre_replace(zend_string * regex,zend_string * subject_str,const char * subject,size_t subject_len,zend_string * replace_str,size_t limit,size_t * replace_count)1576 PHPAPI zend_string *php_pcre_replace(zend_string *regex,
1577 							  zend_string *subject_str,
1578 							  const char *subject, size_t subject_len,
1579 							  zend_string *replace_str,
1580 							  size_t limit, size_t *replace_count)
1581 {
1582 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
1583 	zend_string	 		*result;			/* Function result */
1584 
1585 	/* Abort on pending exception, e.g. thrown from __toString(). */
1586 	if (UNEXPECTED(EG(exception))) {
1587 		return NULL;
1588 	}
1589 
1590 	/* Compile regex or get it from cache. */
1591 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
1592 		return NULL;
1593 	}
1594 	pce->refcount++;
1595 	result = php_pcre_replace_impl(pce, subject_str, subject, subject_len, replace_str,
1596 		limit, replace_count);
1597 	pce->refcount--;
1598 
1599 	return result;
1600 }
1601 /* }}} */
1602 
1603 /* {{{ php_pcre_replace_impl() */
php_pcre_replace_impl(pcre_cache_entry * pce,zend_string * subject_str,const char * subject,size_t subject_len,zend_string * replace_str,size_t limit,size_t * replace_count)1604 PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_string *replace_str, size_t limit, size_t *replace_count)
1605 {
1606 	uint32_t		 options;			/* Execution options */
1607 	int				 count;				/* Count of matched subpatterns */
1608 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
1609 	size_t			 new_len;			/* Length of needed storage */
1610 	size_t			 alloc_len;			/* Actual allocated length */
1611 	size_t			 match_len;			/* Length of the current match */
1612 	int				 backref;			/* Backreference number */
1613 	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
1614 	size_t			 last_end_offset;	/* Where the last search ended */
1615 	char			*walkbuf,			/* Location of current replacement in the result */
1616 					*walk,				/* Used to walk the replacement string */
1617 					 walk_last;			/* Last walked character */
1618 	const char		*match,				/* The current match */
1619 					*piece,				/* The current piece of subject */
1620 					*replace_end;		/* End of replacement string */
1621 	size_t			result_len; 		/* Length of result */
1622 	zend_string		*result;			/* Result of replacement */
1623 	pcre2_match_data *match_data;
1624 
1625 	/* Calculate the size of the offsets array, and allocate memory for it. */
1626 	num_subpats = pce->capture_count + 1;
1627 	alloc_len = 0;
1628 	result = NULL;
1629 
1630 	/* Initialize */
1631 	match = NULL;
1632 	start_offset = 0;
1633 	last_end_offset = 0;
1634 	result_len = 0;
1635 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1636 
1637 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
1638 		match_data = mdata;
1639 	} else {
1640 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
1641 		if (!match_data) {
1642 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1643 			return NULL;
1644 		}
1645 	}
1646 
1647 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
1648 
1649 	/* Array of subpattern offsets */
1650 	PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
1651 
1652 	/* Execute the regular expression. */
1653 #ifdef HAVE_PCRE_JIT_SUPPORT
1654 	if ((pce->preg_options & PREG_JIT) && options) {
1655 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1656 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1657 	} else
1658 #endif
1659 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1660 			options, match_data, mctx);
1661 
1662 	while (1) {
1663 		piece = subject + last_end_offset;
1664 
1665 		if (count >= 0 && limit > 0) {
1666 			bool simple_string;
1667 
1668 			/* Check for too many substrings condition. */
1669 			if (UNEXPECTED(count == 0)) {
1670 				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
1671 				count = num_subpats;
1672 			}
1673 
1674 matched:
1675 			if (UNEXPECTED(offsets[1] < offsets[0])) {
1676 				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1677 				if (result) {
1678 					zend_string_release_ex(result, 0);
1679 					result = NULL;
1680 				}
1681 				break;
1682 			}
1683 
1684 			if (replace_count) {
1685 				++*replace_count;
1686 			}
1687 
1688 			/* Set the match location in subject */
1689 			match = subject + offsets[0];
1690 
1691 			new_len = result_len + offsets[0] - last_end_offset; /* part before the match */
1692 
1693 			walk = ZSTR_VAL(replace_str);
1694 			replace_end = walk + ZSTR_LEN(replace_str);
1695 			walk_last = 0;
1696 			simple_string = 1;
1697 			while (walk < replace_end) {
1698 				if ('\\' == *walk || '$' == *walk) {
1699 					simple_string = 0;
1700 					if (walk_last == '\\') {
1701 						walk++;
1702 						walk_last = 0;
1703 						continue;
1704 					}
1705 					if (preg_get_backref(&walk, &backref)) {
1706 						if (backref < count)
1707 							new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
1708 						continue;
1709 					}
1710 				}
1711 				new_len++;
1712 				walk++;
1713 				walk_last = walk[-1];
1714 			}
1715 
1716 			if (new_len >= alloc_len) {
1717 				alloc_len = zend_safe_address_guarded(2, new_len, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD;
1718 				if (result == NULL) {
1719 					result = zend_string_alloc(alloc_len, 0);
1720 				} else {
1721 					result = zend_string_extend(result, alloc_len, 0);
1722 				}
1723 			}
1724 
1725 			if (match-piece > 0) {
1726 				/* copy the part of the string before the match */
1727 				memcpy(&ZSTR_VAL(result)[result_len], piece, match-piece);
1728 				result_len += (match-piece);
1729 			}
1730 
1731 			if (simple_string) {
1732 				/* copy replacement */
1733 				memcpy(&ZSTR_VAL(result)[result_len], ZSTR_VAL(replace_str), ZSTR_LEN(replace_str)+1);
1734 				result_len += ZSTR_LEN(replace_str);
1735 			} else {
1736 				/* copy replacement and backrefs */
1737 				walkbuf = ZSTR_VAL(result) + result_len;
1738 
1739 				walk = ZSTR_VAL(replace_str);
1740 				walk_last = 0;
1741 				while (walk < replace_end) {
1742 					if ('\\' == *walk || '$' == *walk) {
1743 						if (walk_last == '\\') {
1744 							*(walkbuf-1) = *walk++;
1745 							walk_last = 0;
1746 							continue;
1747 						}
1748 						if (preg_get_backref(&walk, &backref)) {
1749 							if (backref < count) {
1750 								match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1751 								walkbuf = zend_mempcpy(walkbuf, subject + offsets[backref << 1], match_len);
1752 							}
1753 							continue;
1754 						}
1755 					}
1756 					*walkbuf++ = *walk++;
1757 					walk_last = walk[-1];
1758 				}
1759 				*walkbuf = '\0';
1760 				/* increment the result length by how much we've added to the string */
1761 				result_len += (walkbuf - (ZSTR_VAL(result) + result_len));
1762 			}
1763 
1764 			limit--;
1765 
1766 			/* Advance to the next piece. */
1767 			start_offset = last_end_offset = offsets[1];
1768 
1769 			/* If we have matched an empty string, mimic what Perl's /g options does.
1770 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
1771 			   the match again at the same point. If this fails (picked up above) we
1772 			   advance to the next character. */
1773 			if (start_offset == offsets[0]) {
1774 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1775 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
1776 
1777 				piece = subject + start_offset;
1778 				if (count >= 0 && limit > 0) {
1779 					goto matched;
1780 				} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
1781 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
1782 					   this is not necessarily the end. We need to advance
1783 					   the start offset, and continue. Fudge the offset values
1784 					   to achieve this, unless we're already at the end of the string. */
1785 					if (start_offset < subject_len) {
1786 						size_t unit_len = calculate_unit_length(pce, piece);
1787 						start_offset += unit_len;
1788 					} else {
1789 						goto not_matched;
1790 					}
1791 				} else {
1792 					goto error;
1793 				}
1794 			}
1795 
1796 		} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
1797 not_matched:
1798 			if (!result && subject_str) {
1799 				result = zend_string_copy(subject_str);
1800 				break;
1801 			}
1802 			/* now we know exactly how long it is */
1803 			alloc_len = result_len + subject_len - last_end_offset;
1804 			if (NULL != result) {
1805 				result = zend_string_realloc(result, alloc_len, 0);
1806 			} else {
1807 				result = zend_string_alloc(alloc_len, 0);
1808 			}
1809 			/* stick that last bit of string on our output */
1810 			memcpy(ZSTR_VAL(result) + result_len, piece, subject_len - last_end_offset);
1811 			result_len += subject_len - last_end_offset;
1812 			ZSTR_VAL(result)[result_len] = '\0';
1813 			ZSTR_LEN(result) = result_len;
1814 			break;
1815 		} else {
1816 error:
1817 			pcre_handle_exec_error(count);
1818 			if (result) {
1819 				zend_string_release_ex(result, 0);
1820 				result = NULL;
1821 			}
1822 			break;
1823 		}
1824 
1825 #ifdef HAVE_PCRE_JIT_SUPPORT
1826 		if (pce->preg_options & PREG_JIT) {
1827 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1828 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1829 		} else
1830 #endif
1831 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1832 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1833 	}
1834 	if (match_data != mdata) {
1835 		pcre2_match_data_free(match_data);
1836 	}
1837 
1838 	return result;
1839 }
1840 /* }}} */
1841 
1842 /* {{{ php_pcre_replace_func_impl() */
php_pcre_replace_func_impl(pcre_cache_entry * pce,zend_string * subject_str,const char * subject,size_t subject_len,zend_fcall_info * fci,zend_fcall_info_cache * fcc,size_t limit,size_t * replace_count,zend_long flags)1843 static zend_string *php_pcre_replace_func_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_fcall_info *fci, zend_fcall_info_cache *fcc, size_t limit, size_t *replace_count, zend_long flags)
1844 {
1845 	uint32_t		 options;			/* Execution options */
1846 	int				 count;				/* Count of matched subpatterns */
1847 	zend_string		**subpat_names;		/* Array for named subpatterns */
1848 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
1849 	size_t			 new_len;			/* Length of needed storage */
1850 	size_t			 alloc_len;			/* Actual allocated length */
1851 	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
1852 	size_t			 last_end_offset;	/* Where the last search ended */
1853 	const char		*match,				/* The current match */
1854 					*piece;				/* The current piece of subject */
1855 	size_t			result_len; 		/* Length of result */
1856 	zend_string		*result;			/* Result of replacement */
1857 	zend_string     *eval_result;		/* Result of custom function */
1858 	pcre2_match_data *match_data;
1859 	bool old_mdata_used;
1860 
1861 	/* Calculate the size of the offsets array, and allocate memory for it. */
1862 	num_subpats = pce->capture_count + 1;
1863 
1864 	/*
1865 	 * Build a mapping from subpattern numbers to their names. We will
1866 	 * allocate the table only if there are any named subpatterns.
1867 	 */
1868 	subpat_names = NULL;
1869 	if (UNEXPECTED(pce->name_count > 0)) {
1870 		subpat_names = make_subpats_table(num_subpats, pce);
1871 		if (!subpat_names) {
1872 			return NULL;
1873 		}
1874 	}
1875 
1876 	alloc_len = 0;
1877 	result = NULL;
1878 
1879 	/* Initialize */
1880 	match = NULL;
1881 	start_offset = 0;
1882 	last_end_offset = 0;
1883 	result_len = 0;
1884 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1885 
1886 	old_mdata_used = mdata_used;
1887 	if (!old_mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
1888 		mdata_used = 1;
1889 		match_data = mdata;
1890 	} else {
1891 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
1892 		if (!match_data) {
1893 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1894 			if (subpat_names) {
1895 				free_subpats_table(subpat_names, num_subpats);
1896 			}
1897 			mdata_used = old_mdata_used;
1898 			return NULL;
1899 		}
1900 	}
1901 
1902 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
1903 
1904 	/* Array of subpattern offsets */
1905 	PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
1906 
1907 	/* Execute the regular expression. */
1908 #ifdef HAVE_PCRE_JIT_SUPPORT
1909 	if ((pce->preg_options & PREG_JIT) && options) {
1910 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1911 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1912 	} else
1913 #endif
1914 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1915 			options, match_data, mctx);
1916 
1917 	while (1) {
1918 		piece = subject + last_end_offset;
1919 
1920 		if (count >= 0 && limit) {
1921 			/* Check for too many substrings condition. */
1922 			if (UNEXPECTED(count == 0)) {
1923 				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
1924 				count = num_subpats;
1925 			}
1926 
1927 matched:
1928 			if (UNEXPECTED(offsets[1] < offsets[0])) {
1929 				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1930 				if (result) {
1931 					zend_string_release_ex(result, 0);
1932 					result = NULL;
1933 				}
1934 				break;
1935 			}
1936 
1937 			if (replace_count) {
1938 				++*replace_count;
1939 			}
1940 
1941 			/* Set the match location in subject */
1942 			match = subject + offsets[0];
1943 
1944 			new_len = result_len + offsets[0] - last_end_offset; /* part before the match */
1945 
1946 			/* Use custom function to get replacement string and its length. */
1947 			eval_result = preg_do_repl_func(
1948 				fci, fcc, subject, offsets, subpat_names, num_subpats, count,
1949 				pcre2_get_mark(match_data), flags);
1950 
1951 			ZEND_ASSERT(eval_result);
1952 			new_len = zend_safe_address_guarded(1, ZSTR_LEN(eval_result) + ZSTR_MAX_OVERHEAD, new_len) -ZSTR_MAX_OVERHEAD;
1953 			if (new_len >= alloc_len) {
1954 				alloc_len = zend_safe_address_guarded(2, new_len, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD;
1955 				if (result == NULL) {
1956 					result = zend_string_alloc(alloc_len, 0);
1957 				} else {
1958 					result = zend_string_extend(result, alloc_len, 0);
1959 				}
1960 			}
1961 
1962 			if (match-piece > 0) {
1963 				/* copy the part of the string before the match */
1964 				memcpy(ZSTR_VAL(result) + result_len, piece, match-piece);
1965 				result_len += (match-piece);
1966 			}
1967 
1968 			/* If using custom function, copy result to the buffer and clean up. */
1969 			memcpy(ZSTR_VAL(result) + result_len, ZSTR_VAL(eval_result), ZSTR_LEN(eval_result));
1970 			result_len += ZSTR_LEN(eval_result);
1971 			zend_string_release_ex(eval_result, 0);
1972 
1973 			limit--;
1974 
1975 			/* Advance to the next piece. */
1976 			start_offset = last_end_offset = offsets[1];
1977 
1978 			/* If we have matched an empty string, mimic what Perl's /g options does.
1979 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
1980 			   the match again at the same point. If this fails (picked up above) we
1981 			   advance to the next character. */
1982 			if (start_offset == offsets[0]) {
1983 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1984 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
1985 
1986 				piece = subject + start_offset;
1987 				if (count >= 0 && limit) {
1988 					goto matched;
1989 				} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
1990 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
1991 					   this is not necessarily the end. We need to advance
1992 					   the start offset, and continue. Fudge the offset values
1993 					   to achieve this, unless we're already at the end of the string. */
1994 					if (start_offset < subject_len) {
1995 						size_t unit_len = calculate_unit_length(pce, piece);
1996 						start_offset += unit_len;
1997 					} else {
1998 						goto not_matched;
1999 					}
2000 				} else {
2001 					goto error;
2002 				}
2003 			}
2004 
2005 		} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
2006 not_matched:
2007 			if (!result && subject_str) {
2008 				result = zend_string_copy(subject_str);
2009 				break;
2010 			}
2011 			/* now we know exactly how long it is */
2012 			alloc_len = result_len + subject_len - last_end_offset;
2013 			if (NULL != result) {
2014 				result = zend_string_realloc(result, alloc_len, 0);
2015 			} else {
2016 				result = zend_string_alloc(alloc_len, 0);
2017 			}
2018 			/* stick that last bit of string on our output */
2019 			memcpy(ZSTR_VAL(result) + result_len, piece, subject_len - last_end_offset);
2020 			result_len += subject_len - last_end_offset;
2021 			ZSTR_VAL(result)[result_len] = '\0';
2022 			ZSTR_LEN(result) = result_len;
2023 			break;
2024 		} else {
2025 error:
2026 			pcre_handle_exec_error(count);
2027 			if (result) {
2028 				zend_string_release_ex(result, 0);
2029 				result = NULL;
2030 			}
2031 			break;
2032 		}
2033 #ifdef HAVE_PCRE_JIT_SUPPORT
2034 		if ((pce->preg_options & PREG_JIT)) {
2035 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
2036 					PCRE2_NO_UTF_CHECK, match_data, mctx);
2037 		} else
2038 #endif
2039 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
2040 				PCRE2_NO_UTF_CHECK, match_data, mctx);
2041 	}
2042 	if (match_data != mdata) {
2043 		pcre2_match_data_free(match_data);
2044 	}
2045 	mdata_used = old_mdata_used;
2046 
2047 	if (UNEXPECTED(subpat_names)) {
2048 		free_subpats_table(subpat_names, num_subpats);
2049 	}
2050 
2051 	return result;
2052 }
2053 /* }}} */
2054 
2055 /* {{{ php_pcre_replace_func */
php_pcre_replace_func(zend_string * regex,zend_string * subject_str,zend_fcall_info * fci,zend_fcall_info_cache * fcc,size_t limit,size_t * replace_count,zend_long flags)2056 static zend_always_inline zend_string *php_pcre_replace_func(zend_string *regex,
2057 							  zend_string *subject_str,
2058 							  zend_fcall_info *fci, zend_fcall_info_cache *fcc,
2059 							  size_t limit, size_t *replace_count, zend_long flags)
2060 {
2061 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
2062 	zend_string	 		*result;			/* Function result */
2063 
2064 	/* Compile regex or get it from cache. */
2065 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2066 		return NULL;
2067 	}
2068 	pce->refcount++;
2069 	result = php_pcre_replace_func_impl(
2070 		pce, subject_str, ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), fci, fcc,
2071 		limit, replace_count, flags);
2072 	pce->refcount--;
2073 
2074 	return result;
2075 }
2076 /* }}} */
2077 
2078 /* {{{ php_pcre_replace_array */
php_pcre_replace_array(HashTable * regex,zend_string * replace_str,HashTable * replace_ht,zend_string * subject_str,size_t limit,size_t * replace_count)2079 static zend_string *php_pcre_replace_array(HashTable *regex,
2080 	zend_string *replace_str, HashTable *replace_ht,
2081 	zend_string *subject_str, size_t limit, size_t *replace_count)
2082 {
2083 	zval		*regex_entry;
2084 	zend_string *result;
2085 
2086 	zend_string_addref(subject_str);
2087 
2088 	if (replace_ht) {
2089 		uint32_t replace_idx = 0;
2090 
2091 		/* For each entry in the regex array, get the entry */
2092 		ZEND_HASH_FOREACH_VAL(regex, regex_entry) {
2093 			/* Make sure we're dealing with strings. */
2094 			zend_string *tmp_regex_str;
2095 			zend_string *regex_str = zval_get_tmp_string(regex_entry, &tmp_regex_str);
2096 			zend_string *replace_entry_str, *tmp_replace_entry_str;
2097 			zval *zv;
2098 
2099 			/* Get current entry */
2100 			while (1) {
2101 				if (replace_idx == replace_ht->nNumUsed) {
2102 					replace_entry_str = ZSTR_EMPTY_ALLOC();
2103 					tmp_replace_entry_str = NULL;
2104 					break;
2105 				}
2106 				zv = ZEND_HASH_ELEMENT(replace_ht, replace_idx);
2107 				replace_idx++;
2108 				if (Z_TYPE_P(zv) != IS_UNDEF) {
2109 					replace_entry_str = zval_get_tmp_string(zv, &tmp_replace_entry_str);
2110 					break;
2111 				}
2112 			}
2113 
2114 			/* Do the actual replacement and put the result back into subject_str
2115 			   for further replacements. */
2116 			result = php_pcre_replace(regex_str, subject_str, ZSTR_VAL(subject_str),
2117 				ZSTR_LEN(subject_str), replace_entry_str, limit, replace_count);
2118 			zend_tmp_string_release(tmp_replace_entry_str);
2119 			zend_tmp_string_release(tmp_regex_str);
2120 			zend_string_release_ex(subject_str, 0);
2121 			subject_str = result;
2122 			if (UNEXPECTED(result == NULL)) {
2123 				break;
2124 			}
2125 		} ZEND_HASH_FOREACH_END();
2126 
2127 	} else {
2128 		ZEND_ASSERT(replace_str != NULL);
2129 
2130 		/* For each entry in the regex array, get the entry */
2131 		ZEND_HASH_FOREACH_VAL(regex, regex_entry) {
2132 			/* Make sure we're dealing with strings. */
2133 			zend_string *tmp_regex_str;
2134 			zend_string *regex_str = zval_get_tmp_string(regex_entry, &tmp_regex_str);
2135 
2136 			/* Do the actual replacement and put the result back into subject_str
2137 			   for further replacements. */
2138 			result = php_pcre_replace(regex_str, subject_str, ZSTR_VAL(subject_str),
2139 				ZSTR_LEN(subject_str), replace_str, limit, replace_count);
2140 			zend_tmp_string_release(tmp_regex_str);
2141 			zend_string_release_ex(subject_str, 0);
2142 			subject_str = result;
2143 
2144 			if (UNEXPECTED(result == NULL)) {
2145 				break;
2146 			}
2147 		} ZEND_HASH_FOREACH_END();
2148 	}
2149 
2150 	return subject_str;
2151 }
2152 /* }}} */
2153 
2154 /* {{{ php_replace_in_subject */
php_replace_in_subject(zend_string * regex_str,HashTable * regex_ht,zend_string * replace_str,HashTable * replace_ht,zend_string * subject,size_t limit,size_t * replace_count)2155 static zend_always_inline zend_string *php_replace_in_subject(
2156 	zend_string *regex_str, HashTable *regex_ht,
2157 	zend_string *replace_str, HashTable *replace_ht,
2158 	zend_string *subject, size_t limit, size_t *replace_count)
2159 {
2160 	zend_string *result;
2161 
2162 	if (regex_str) {
2163 		ZEND_ASSERT(replace_str != NULL);
2164 		result = php_pcre_replace(regex_str, subject, ZSTR_VAL(subject), ZSTR_LEN(subject),
2165 			replace_str, limit, replace_count);
2166 	} else {
2167 		ZEND_ASSERT(regex_ht != NULL);
2168 		result = php_pcre_replace_array(regex_ht, replace_str, replace_ht, subject,
2169 			limit, replace_count);
2170 	}
2171 	return result;
2172 }
2173 /* }}} */
2174 
2175 /* {{{ php_replace_in_subject_func */
php_replace_in_subject_func(zend_string * regex_str,HashTable * regex_ht,zend_fcall_info * fci,zend_fcall_info_cache * fcc,zend_string * subject,size_t limit,size_t * replace_count,zend_long flags)2176 static zend_string *php_replace_in_subject_func(zend_string *regex_str, HashTable *regex_ht,
2177 	zend_fcall_info *fci, zend_fcall_info_cache *fcc,
2178 	zend_string *subject, size_t limit, size_t *replace_count, zend_long flags)
2179 {
2180 	zend_string *result;
2181 
2182 	if (regex_str) {
2183 		result = php_pcre_replace_func(
2184 			regex_str, subject, fci, fcc, limit, replace_count, flags);
2185 		return result;
2186 	} else {
2187 		/* If regex is an array */
2188 		zval		*regex_entry;
2189 
2190 		ZEND_ASSERT(regex_ht != NULL);
2191 
2192 		zend_string_addref(subject);
2193 
2194 		/* For each entry in the regex array, get the entry */
2195 		ZEND_HASH_FOREACH_VAL(regex_ht, regex_entry) {
2196 			/* Make sure we're dealing with strings. */
2197 			zend_string *tmp_regex_entry_str;
2198 			zend_string *regex_entry_str = zval_get_tmp_string(regex_entry, &tmp_regex_entry_str);
2199 
2200 			/* Do the actual replacement and put the result back into subject
2201 			   for further replacements. */
2202 			result = php_pcre_replace_func(
2203 				regex_entry_str, subject, fci, fcc, limit, replace_count, flags);
2204 			zend_tmp_string_release(tmp_regex_entry_str);
2205 			zend_string_release(subject);
2206 			subject = result;
2207 			if (UNEXPECTED(result == NULL)) {
2208 				break;
2209 			}
2210 		} ZEND_HASH_FOREACH_END();
2211 
2212 		return subject;
2213 	}
2214 }
2215 /* }}} */
2216 
2217 /* {{{ preg_replace_func_impl */
preg_replace_func_impl(zval * return_value,zend_string * regex_str,HashTable * regex_ht,zend_fcall_info * fci,zend_fcall_info_cache * fcc,zend_string * subject_str,HashTable * subject_ht,zend_long limit_val,zend_long flags)2218 static size_t preg_replace_func_impl(zval *return_value,
2219 	zend_string *regex_str, HashTable *regex_ht,
2220 	zend_fcall_info *fci, zend_fcall_info_cache *fcc,
2221 	zend_string *subject_str, HashTable *subject_ht, zend_long limit_val, zend_long flags)
2222 {
2223 	zend_string	*result;
2224 	size_t replace_count = 0;
2225 
2226 	if (subject_str) {
2227 		result = php_replace_in_subject_func(
2228 			regex_str, regex_ht, fci, fcc, subject_str, limit_val, &replace_count, flags);
2229 		if (result != NULL) {
2230 			RETVAL_STR(result);
2231 		} else {
2232 			RETVAL_NULL();
2233 		}
2234 	} else {
2235 		/* if subject is an array */
2236 		zval		*subject_entry, zv;
2237 		zend_string	*string_key;
2238 		zend_ulong	 num_key;
2239 
2240 		ZEND_ASSERT(subject_ht != NULL);
2241 
2242 		array_init_size(return_value, zend_hash_num_elements(subject_ht));
2243 		HashTable *return_value_ht = Z_ARRVAL_P(return_value);
2244 
2245 		/* For each subject entry, convert it to string, then perform replacement
2246 		   and add the result to the return_value array. */
2247 		ZEND_HASH_FOREACH_KEY_VAL(subject_ht, num_key, string_key, subject_entry) {
2248 			zend_string *tmp_subject_entry_str;
2249 			zend_string *subject_entry_str = zval_get_tmp_string(subject_entry, &tmp_subject_entry_str);
2250 
2251 			result = php_replace_in_subject_func(
2252 				regex_str, regex_ht, fci, fcc, subject_entry_str, limit_val, &replace_count, flags);
2253 			if (result != NULL) {
2254 				/* Add to return array */
2255 				ZVAL_STR(&zv, result);
2256 				if (string_key) {
2257 					zend_hash_add_new(return_value_ht, string_key, &zv);
2258 				} else {
2259 					zend_hash_index_add_new(return_value_ht, num_key, &zv);
2260 				}
2261 			}
2262 			zend_tmp_string_release(tmp_subject_entry_str);
2263 		} ZEND_HASH_FOREACH_END();
2264 	}
2265 
2266 	return replace_count;
2267 }
2268 /* }}} */
2269 
/* Shared body of the string-replacement entry points.
 *
 * Rejects an array replacement combined with a string pattern, then runs the
 * replacement on a string subject or on every element of an array subject and
 * stores the outcome in `return_value`. With `is_filter` set, a subject is
 * reported only if at least one replacement happened in it (NULL for a string
 * subject, omitted entry for an array subject). When `zcount` is given, it is
 * assigned the total number of replacements.
 */
static void _preg_replace_common(
	zval *return_value,
	HashTable *regex_ht, zend_string *regex_str,
	HashTable *replace_ht, zend_string *replace_str,
	HashTable *subject_ht, zend_string *subject_str,
	zend_long limit,
	zval *zcount,
	bool is_filter
) {
	size_t       replace_count = 0;
	size_t       count_before;
	zend_string *replaced;

	/* An array of replacements only makes sense with an array of patterns. */
	if (replace_ht && !regex_ht) {
		zend_argument_type_error(1, "must be of type array when argument #2 ($replacement) is an array, string given");
		RETURN_THROWS();
	}

	if (subject_str) {
		count_before = replace_count;
		replaced = php_replace_in_subject(regex_str, regex_ht, replace_str, replace_ht,
			subject_str, limit, &replace_count);
		if (replaced == NULL) {
			RETVAL_NULL();
		} else if (is_filter && replace_count <= count_before) {
			/* Filter mode and nothing matched: drop the untouched subject. */
			zend_string_release_ex(replaced, 0);
			RETVAL_NULL();
		} else {
			RETVAL_STR(replaced);
		}
	} else {
		/* The subject is an array. */
		zval        *entry, entry_result;
		zend_string *str_key;
		zend_ulong   idx_key;

		ZEND_ASSERT(subject_ht != NULL);

		array_init_size(return_value, zend_hash_num_elements(subject_ht));
		HashTable *return_value_ht = Z_ARRVAL_P(return_value);

		/* Convert each entry to a string, run the replacement on it and store
		   the result under the entry's original key. */
		ZEND_HASH_FOREACH_KEY_VAL(subject_ht, idx_key, str_key, entry) {
			count_before = replace_count;
			zend_string *tmp_entry_str;
			zend_string *entry_str = zval_get_tmp_string(entry, &tmp_entry_str);
			replaced = php_replace_in_subject(regex_str, regex_ht, replace_str, replace_ht,
				entry_str, limit, &replace_count);

			if (replaced != NULL) {
				if (is_filter && replace_count <= count_before) {
					/* Filter mode and nothing matched in this entry: skip it. */
					zend_string_release_ex(replaced, 0);
				} else {
					ZVAL_STR(&entry_result, replaced);
					if (str_key) {
						zend_hash_add_new(return_value_ht, str_key, &entry_result);
					} else {
						zend_hash_index_add_new(return_value_ht, idx_key, &entry_result);
					}
				}
			}
			zend_tmp_string_release(tmp_entry_str);
		} ZEND_HASH_FOREACH_END();
	}

	if (zcount) {
		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
	}
}
2344 
2345 /* {{{ preg_replace_common */
preg_replace_common(INTERNAL_FUNCTION_PARAMETERS,bool is_filter)2346 static void preg_replace_common(INTERNAL_FUNCTION_PARAMETERS, bool is_filter)
2347 {
2348 	zend_string *regex_str, *replace_str, *subject_str;
2349 	HashTable *regex_ht, *replace_ht, *subject_ht;
2350 	zend_long limit = -1;
2351 	zval *zcount = NULL;
2352 
2353 	/* Get function parameters and do error-checking. */
2354 	ZEND_PARSE_PARAMETERS_START(3, 5)
2355 		Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str)
2356 		Z_PARAM_ARRAY_HT_OR_STR(replace_ht, replace_str)
2357 		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
2358 		Z_PARAM_OPTIONAL
2359 		Z_PARAM_LONG(limit)
2360 		Z_PARAM_ZVAL(zcount)
2361 	ZEND_PARSE_PARAMETERS_END();
2362 
2363 	_preg_replace_common(
2364 		return_value,
2365 		regex_ht, regex_str,
2366 		replace_ht, replace_str,
2367 		subject_ht, subject_str,
2368 		limit, zcount, is_filter);
2369 }
2370 /* }}} */
2371 
2372 /* {{{ Perform Perl-style regular expression replacement. */
PHP_FUNCTION(preg_replace)2373 PHP_FUNCTION(preg_replace)
2374 {
2375 	preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, false);
2376 }
2377 /* }}} */
2378 
2379 ZEND_FRAMELESS_FUNCTION(preg_replace, 3)
2380 {
2381 	zend_string *regex_str, *replace_str, *subject_str;
2382 	HashTable *regex_ht, *replace_ht, *subject_ht;
2383 	zval regex_tmp, replace_tmp, subject_tmp;
2384 
2385 	Z_FLF_PARAM_ARRAY_HT_OR_STR(1, regex_ht, regex_str, regex_tmp);
2386 	Z_FLF_PARAM_ARRAY_HT_OR_STR(2, replace_ht, replace_str, replace_tmp);
2387 	Z_FLF_PARAM_ARRAY_HT_OR_STR(3, subject_ht, subject_str, subject_tmp);
2388 
2389 	_preg_replace_common(
2390 		return_value,
2391 		regex_ht, regex_str,
2392 		replace_ht, replace_str,
2393 		subject_ht, subject_str,
2394 		/* limit */ -1, /* zcount */ NULL, /* is_filter */ false);
2395 
2396 flf_clean:;
2397 	Z_FLF_PARAM_FREE_STR(1, regex_tmp);
2398 	Z_FLF_PARAM_FREE_STR(2, replace_tmp);
2399 	Z_FLF_PARAM_FREE_STR(3, subject_tmp);
2400 }
2401 
2402 /* {{{ Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback)2403 PHP_FUNCTION(preg_replace_callback)
2404 {
2405 	zval *zcount = NULL;
2406 	zend_string *regex_str;
2407 	HashTable *regex_ht;
2408 	zend_string *subject_str;
2409 	HashTable *subject_ht;
2410 	zend_long limit = -1, flags = 0;
2411 	size_t replace_count;
2412 	zend_fcall_info fci;
2413 	zend_fcall_info_cache fcc;
2414 
2415 	/* Get function parameters and do error-checking. */
2416 	ZEND_PARSE_PARAMETERS_START(3, 6)
2417 		Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str)
2418 		Z_PARAM_FUNC(fci, fcc)
2419 		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
2420 		Z_PARAM_OPTIONAL
2421 		Z_PARAM_LONG(limit)
2422 		Z_PARAM_ZVAL(zcount)
2423 		Z_PARAM_LONG(flags)
2424 	ZEND_PARSE_PARAMETERS_END();
2425 
2426 	replace_count = preg_replace_func_impl(return_value, regex_str, regex_ht,
2427 		&fci, &fcc,
2428 		subject_str, subject_ht, limit, flags);
2429 	if (zcount) {
2430 		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
2431 	}
2432 }
2433 /* }}} */
2434 
2435 /* {{{ Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback_array)2436 PHP_FUNCTION(preg_replace_callback_array)
2437 {
2438 	zval zv, *replace, *zcount = NULL;
2439 	HashTable *pattern, *subject_ht;
2440 	zend_string *subject_str, *str_idx_regex;
2441 	zend_long limit = -1, flags = 0;
2442 	size_t replace_count = 0;
2443 	zend_fcall_info fci;
2444 	zend_fcall_info_cache fcc;
2445 
2446 	/* Get function parameters and do error-checking. */
2447 	ZEND_PARSE_PARAMETERS_START(2, 5)
2448 		Z_PARAM_ARRAY_HT(pattern)
2449 		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
2450 		Z_PARAM_OPTIONAL
2451 		Z_PARAM_LONG(limit)
2452 		Z_PARAM_ZVAL(zcount)
2453 		Z_PARAM_LONG(flags)
2454 	ZEND_PARSE_PARAMETERS_END();
2455 
2456 	fci.size = sizeof(fci);
2457 	fci.object = NULL;
2458 	fci.named_params = NULL;
2459 
2460 	if (subject_ht) {
2461 		GC_TRY_ADDREF(subject_ht);
2462 	} else {
2463 		GC_TRY_ADDREF(subject_str);
2464 	}
2465 
2466 	ZEND_HASH_FOREACH_STR_KEY_VAL(pattern, str_idx_regex, replace) {
2467 		if (!zend_is_callable_ex(replace, NULL, 0, NULL, &fcc, NULL)) {
2468 			zend_argument_type_error(1, "must contain only valid callbacks");
2469 			goto error;
2470 		}
2471 		if (!str_idx_regex) {
2472 			zend_argument_type_error(1, "must contain only string patterns as keys");
2473 			goto error;
2474 		}
2475 
2476 		ZVAL_COPY_VALUE(&fci.function_name, replace);
2477 
2478 		replace_count += preg_replace_func_impl(&zv, str_idx_regex, /* regex_ht */ NULL, &fci, &fcc,
2479 			subject_str, subject_ht, limit, flags);
2480 		switch (Z_TYPE(zv)) {
2481 			case IS_ARRAY:
2482 				ZEND_ASSERT(subject_ht);
2483 				zend_array_release(subject_ht);
2484 				subject_ht = Z_ARR(zv);
2485 				break;
2486 			case IS_STRING:
2487 				ZEND_ASSERT(subject_str);
2488 				zend_string_release(subject_str);
2489 				subject_str = Z_STR(zv);
2490 				break;
2491 			case IS_NULL:
2492 				RETVAL_NULL();
2493 				goto error;
2494 			EMPTY_SWITCH_DEFAULT_CASE()
2495 		}
2496 
2497 		if (EG(exception)) {
2498 			goto error;
2499 		}
2500 	} ZEND_HASH_FOREACH_END();
2501 
2502 	if (zcount) {
2503 		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
2504 	}
2505 
2506 	if (subject_ht) {
2507 		RETVAL_ARR(subject_ht);
2508 		// Unset the type_flags of immutable arrays to prevent the VM from performing refcounting
2509 		if (GC_FLAGS(subject_ht) & IS_ARRAY_IMMUTABLE) {
2510 			Z_TYPE_FLAGS_P(return_value) = 0;
2511 		}
2512 		return;
2513 	} else {
2514 		RETURN_STR(subject_str);
2515 	}
2516 
2517 error:
2518 	if (subject_ht) {
2519 		zend_array_release(subject_ht);
2520 	} else {
2521 		zend_string_release(subject_str);
2522 	}
2523 }
2524 /* }}} */
2525 
2526 /* {{{ Perform Perl-style regular expression replacement and only return matches. */
PHP_FUNCTION(preg_filter)2527 PHP_FUNCTION(preg_filter)
2528 {
2529 	preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, true);
2530 }
2531 /* }}} */
2532 
2533 /* {{{ Split string into an array using a perl-style regular expression as a delimiter */
PHP_FUNCTION(preg_split)2534 PHP_FUNCTION(preg_split)
2535 {
2536 	zend_string			*regex;			/* Regular expression */
2537 	zend_string			*subject;		/* String to match against */
2538 	zend_long			 limit_val = -1;/* Integer value of limit */
2539 	zend_long			 flags = 0;		/* Match control flags */
2540 	pcre_cache_entry	*pce;			/* Compiled regular expression */
2541 
2542 	/* Get function parameters and do error checking */
2543 	ZEND_PARSE_PARAMETERS_START(2, 4)
2544 		Z_PARAM_STR(regex)
2545 		Z_PARAM_STR(subject)
2546 		Z_PARAM_OPTIONAL
2547 		Z_PARAM_LONG(limit_val)
2548 		Z_PARAM_LONG(flags)
2549 	ZEND_PARSE_PARAMETERS_END();
2550 
2551 	/* Compile regex or get it from cache. */
2552 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2553 		RETURN_FALSE;
2554 	}
2555 
2556 	pce->refcount++;
2557 	php_pcre_split_impl(pce, subject, return_value, limit_val, flags);
2558 	pce->refcount--;
2559 }
2560 /* }}} */
2561 
2562 /* {{{ php_pcre_split */
php_pcre_split_impl(pcre_cache_entry * pce,zend_string * subject_str,zval * return_value,zend_long limit_val,zend_long flags)2563 PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
2564 	zend_long limit_val, zend_long flags)
2565 {
2566 	uint32_t		 options;			/* Execution options */
2567 	int				 count;				/* Count of matched subpatterns */
2568 	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
2569 	PCRE2_SIZE		 last_match_offset;	/* Location of last match */
2570 	uint32_t		 no_empty;			/* If NO_EMPTY flag is set */
2571 	uint32_t		 delim_capture; 	/* If delimiters should be captured */
2572 	uint32_t		 offset_capture;	/* If offsets should be captured */
2573 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
2574 	zval			 tmp;
2575 	pcre2_match_data *match_data;
2576 	char *subject = ZSTR_VAL(subject_str);
2577 
2578 	no_empty = flags & PREG_SPLIT_NO_EMPTY;
2579 	delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
2580 	offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
2581 
2582 	/* Initialize return value */
2583 	array_init(return_value);
2584 	HashTable *return_value_ht = Z_ARRVAL_P(return_value);
2585 
2586 	/* Calculate the size of the offsets array, and allocate memory for it. */
2587 	num_subpats = pce->capture_count + 1;
2588 
2589 	/* Start at the beginning of the string */
2590 	start_offset = 0;
2591 	last_match_offset = 0;
2592 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
2593 
2594 	if (limit_val == -1) {
2595 		/* pass */
2596 	} else if (limit_val == 0) {
2597 		limit_val = -1;
2598 	} else if (limit_val <= 1) {
2599 		goto last;
2600 	}
2601 
2602 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
2603 		match_data = mdata;
2604 	} else {
2605 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
2606 		if (!match_data) {
2607 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2608 			zval_ptr_dtor(return_value);
2609 			RETURN_FALSE;
2610 		}
2611 	}
2612 
2613 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
2614 
2615 	/* Array of subpattern offsets */
2616 	PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
2617 
2618 #ifdef HAVE_PCRE_JIT_SUPPORT
2619 	if ((pce->preg_options & PREG_JIT) && options) {
2620 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2621 				PCRE2_NO_UTF_CHECK, match_data, mctx);
2622 	} else
2623 #endif
2624 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2625 			options, match_data, mctx);
2626 
2627 	while (1) {
2628 		/* If something matched */
2629 		if (count >= 0) {
2630 			/* Check for too many substrings condition. */
2631 			if (UNEXPECTED(count == 0)) {
2632 				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
2633 				count = num_subpats;
2634 			}
2635 
2636 matched:
2637 			if (UNEXPECTED(offsets[1] < offsets[0])) {
2638 				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2639 				break;
2640 			}
2641 
2642 			if (!no_empty || offsets[0] != last_match_offset) {
2643 				if (offset_capture) {
2644 					/* Add (match, offset) pair to the return value */
2645 					add_offset_pair(
2646 						return_value_ht, subject, last_match_offset, offsets[0],
2647 						NULL, 0);
2648 				} else {
2649 					/* Add the piece to the return value */
2650 					populate_match_value_str(&tmp, subject, last_match_offset, offsets[0]);
2651 					zend_hash_next_index_insert_new(return_value_ht, &tmp);
2652 				}
2653 
2654 				/* One less left to do */
2655 				if (limit_val != -1)
2656 					limit_val--;
2657 			}
2658 
2659 			if (delim_capture) {
2660 				size_t i;
2661 				for (i = 1; i < count; i++) {
2662 					/* If we have matched a delimiter */
2663 					if (!no_empty || offsets[2*i] != offsets[2*i+1]) {
2664 						if (offset_capture) {
2665 							add_offset_pair(
2666 								return_value_ht, subject, offsets[2*i], offsets[2*i+1], NULL, 0);
2667 						} else {
2668 							populate_match_value_str(&tmp, subject, offsets[2*i], offsets[2*i+1]);
2669 							zend_hash_next_index_insert_new(return_value_ht, &tmp);
2670 						}
2671 					}
2672 				}
2673 			}
2674 
2675 			/* Advance to the position right after the last full match */
2676 			start_offset = last_match_offset = offsets[1];
2677 
2678 			/* If we have matched an empty string, mimic what Perl's /g options does.
2679 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
2680 			   the match again at the same point. If this fails (picked up above) we
2681 			   advance to the next character. */
2682 			if (start_offset == offsets[0]) {
2683 				/* Get next piece if no limit or limit not yet reached and something matched*/
2684 				if (limit_val != -1 && limit_val <= 1) {
2685 					break;
2686 				}
2687 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2688 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
2689 				if (count >= 0) {
2690 					goto matched;
2691 				} else if (count == PCRE2_ERROR_NOMATCH) {
2692 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
2693 					   this is not necessarily the end. We need to advance
2694 					   the start offset, and continue. Fudge the offset values
2695 					   to achieve this, unless we're already at the end of the string. */
2696 					if (start_offset < ZSTR_LEN(subject_str)) {
2697 						start_offset += calculate_unit_length(pce, subject + start_offset);
2698 					} else {
2699 						break;
2700 					}
2701 				} else {
2702 					goto error;
2703 				}
2704 			}
2705 
2706 		} else if (count == PCRE2_ERROR_NOMATCH) {
2707 			break;
2708 		} else {
2709 error:
2710 			pcre_handle_exec_error(count);
2711 			break;
2712 		}
2713 
2714 		/* Get next piece if no limit or limit not yet reached and something matched*/
2715 		if (limit_val != -1 && limit_val <= 1) {
2716 			break;
2717 		}
2718 
2719 #ifdef HAVE_PCRE_JIT_SUPPORT
2720 		if (pce->preg_options & PREG_JIT) {
2721 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2722 					PCRE2_NO_UTF_CHECK, match_data, mctx);
2723 		} else
2724 #endif
2725 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2726 				PCRE2_NO_UTF_CHECK, match_data, mctx);
2727 	}
2728 	if (match_data != mdata) {
2729 		pcre2_match_data_free(match_data);
2730 	}
2731 
2732 	if (PCRE_G(error_code) != PHP_PCRE_NO_ERROR) {
2733 		zval_ptr_dtor(return_value);
2734 		RETURN_FALSE;
2735 	}
2736 
2737 last:
2738 	start_offset = last_match_offset; /* the offset might have been incremented, but without further successful matches */
2739 
2740 	if (!no_empty || start_offset < ZSTR_LEN(subject_str)) {
2741 		if (offset_capture) {
2742 			/* Add the last (match, offset) pair to the return value */
2743 			add_offset_pair(return_value_ht, subject, start_offset, ZSTR_LEN(subject_str), NULL, 0);
2744 		} else {
2745 			/* Add the last piece to the return value */
2746 			if (start_offset == 0) {
2747 				ZVAL_STR_COPY(&tmp, subject_str);
2748 			} else {
2749 				populate_match_value_str(&tmp, subject, start_offset, ZSTR_LEN(subject_str));
2750 			}
2751 			zend_hash_next_index_insert_new(return_value_ht, &tmp);
2752 		}
2753 	}
2754 }
2755 /* }}} */
2756 
2757 /* {{{ Quote regular expression characters plus an optional character */
PHP_FUNCTION(preg_quote)2758 PHP_FUNCTION(preg_quote)
2759 {
2760 	zend_string *str;       		/* Input string argument */
2761 	zend_string	*delim = NULL;		/* Additional delimiter argument */
2762 	char		*in_str;			/* Input string */
2763 	char		*in_str_end;    	/* End of the input string */
2764 	zend_string	*out_str;			/* Output string with quoted characters */
2765 	size_t       extra_len;         /* Number of additional characters */
2766 	char 		*p,					/* Iterator for input string */
2767 				*q,					/* Iterator for output string */
2768 				 delim_char = '\0',	/* Delimiter character to be quoted */
2769 				 c;					/* Current character */
2770 
2771 	/* Get the arguments and check for errors */
2772 	ZEND_PARSE_PARAMETERS_START(1, 2)
2773 		Z_PARAM_STR(str)
2774 		Z_PARAM_OPTIONAL
2775 		Z_PARAM_STR_OR_NULL(delim)
2776 	ZEND_PARSE_PARAMETERS_END();
2777 
2778 	/* Nothing to do if we got an empty string */
2779 	if (ZSTR_LEN(str) == 0) {
2780 		RETURN_EMPTY_STRING();
2781 	}
2782 
2783 	in_str = ZSTR_VAL(str);
2784 	in_str_end = in_str + ZSTR_LEN(str);
2785 
2786 	if (delim) {
2787 		delim_char = ZSTR_VAL(delim)[0];
2788 	}
2789 
2790 	/* Go through the string and quote necessary characters */
2791 	extra_len = 0;
2792 	p = in_str;
2793 	do {
2794 		c = *p;
2795 		switch(c) {
2796 			case '.':
2797 			case '\\':
2798 			case '+':
2799 			case '*':
2800 			case '?':
2801 			case '[':
2802 			case '^':
2803 			case ']':
2804 			case '$':
2805 			case '(':
2806 			case ')':
2807 			case '{':
2808 			case '}':
2809 			case '=':
2810 			case '!':
2811 			case '>':
2812 			case '<':
2813 			case '|':
2814 			case ':':
2815 			case '-':
2816 			case '#':
2817 				extra_len++;
2818 				break;
2819 
2820 			case '\0':
2821 				extra_len+=3;
2822 				break;
2823 
2824 			default:
2825 				if (c == delim_char) {
2826 					extra_len++;
2827 				}
2828 				break;
2829 		}
2830 		p++;
2831 	} while (p != in_str_end);
2832 
2833 	if (extra_len == 0) {
2834 		RETURN_STR_COPY(str);
2835 	}
2836 
2837 	/* Allocate enough memory so that even if each character
2838 	   is quoted, we won't run out of room */
2839 	out_str = zend_string_safe_alloc(1, ZSTR_LEN(str), extra_len, 0);
2840 	q = ZSTR_VAL(out_str);
2841 	p = in_str;
2842 
2843 	do {
2844 		c = *p;
2845 		switch(c) {
2846 			case '.':
2847 			case '\\':
2848 			case '+':
2849 			case '*':
2850 			case '?':
2851 			case '[':
2852 			case '^':
2853 			case ']':
2854 			case '$':
2855 			case '(':
2856 			case ')':
2857 			case '{':
2858 			case '}':
2859 			case '=':
2860 			case '!':
2861 			case '>':
2862 			case '<':
2863 			case '|':
2864 			case ':':
2865 			case '-':
2866 			case '#':
2867 				*q++ = '\\';
2868 				*q++ = c;
2869 				break;
2870 
2871 			case '\0':
2872 				*q++ = '\\';
2873 				*q++ = '0';
2874 				*q++ = '0';
2875 				*q++ = '0';
2876 				break;
2877 
2878 			default:
2879 				if (c == delim_char) {
2880 					*q++ = '\\';
2881 				}
2882 				*q++ = c;
2883 				break;
2884 		}
2885 		p++;
2886 	} while (p != in_str_end);
2887 	*q = '\0';
2888 
2889 	RETURN_NEW_STR(out_str);
2890 }
2891 /* }}} */
2892 
2893 /* {{{ Searches array and returns entries which match regex */
PHP_FUNCTION(preg_grep)2894 PHP_FUNCTION(preg_grep)
2895 {
2896 	zend_string			*regex;			/* Regular expression */
2897 	zval				*input;			/* Input array */
2898 	zend_long			 flags = 0;		/* Match control flags */
2899 	pcre_cache_entry	*pce;			/* Compiled regular expression */
2900 
2901 	/* Get arguments and do error checking */
2902 	ZEND_PARSE_PARAMETERS_START(2, 3)
2903 		Z_PARAM_STR(regex)
2904 		Z_PARAM_ARRAY(input)
2905 		Z_PARAM_OPTIONAL
2906 		Z_PARAM_LONG(flags)
2907 	ZEND_PARSE_PARAMETERS_END();
2908 
2909 	/* Compile regex or get it from cache. */
2910 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2911 		RETURN_FALSE;
2912 	}
2913 
2914 	pce->refcount++;
2915 	php_pcre_grep_impl(pce, input, return_value, flags);
2916 	pce->refcount--;
2917 }
2918 /* }}} */
2919 
php_pcre_grep_impl(pcre_cache_entry * pce,zval * input,zval * return_value,zend_long flags)2920 PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, zend_long flags) /* {{{ */
2921 {
2922 	zval            *entry;             /* An entry in the input array */
2923 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
2924 	int				 count;				/* Count of matched subpatterns */
2925 	uint32_t		 options;			/* Execution options */
2926 	zend_string		*string_key;
2927 	zend_ulong		 num_key;
2928 	bool		 invert;			/* Whether to return non-matching
2929 										   entries */
2930 	pcre2_match_data *match_data;
2931 	invert = flags & PREG_GREP_INVERT ? 1 : 0;
2932 
2933 	/* Calculate the size of the offsets array, and allocate memory for it. */
2934 	num_subpats = pce->capture_count + 1;
2935 
2936 	/* Initialize return array */
2937 	array_init(return_value);
2938 	HashTable *return_value_ht = Z_ARRVAL_P(return_value);
2939 
2940 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
2941 
2942 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
2943 		match_data = mdata;
2944 	} else {
2945 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
2946 		if (!match_data) {
2947 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2948 			return;
2949 		}
2950 	}
2951 
2952 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
2953 
2954 	/* Go through the input array */
2955 	ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(input), num_key, string_key, entry) {
2956 		zend_string *tmp_subject_str;
2957 		zend_string *subject_str = zval_get_tmp_string(entry, &tmp_subject_str);
2958 
2959 		/* Perform the match */
2960 #ifdef HAVE_PCRE_JIT_SUPPORT
2961 		if ((pce->preg_options & PREG_JIT) && options) {
2962 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0,
2963 					PCRE2_NO_UTF_CHECK, match_data, mctx);
2964 		} else
2965 #endif
2966 		count = pcre2_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0,
2967 				options, match_data, mctx);
2968 
2969 		/* If the entry fits our requirements */
2970 		if (count >= 0) {
2971 			/* Check for too many substrings condition. */
2972 			if (UNEXPECTED(count == 0)) {
2973 				php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings");
2974 			}
2975 			if (!invert) {
2976 				Z_TRY_ADDREF_P(entry);
2977 
2978 				/* Add to return array */
2979 				if (string_key) {
2980 					zend_hash_update(return_value_ht, string_key, entry);
2981 				} else {
2982 					zend_hash_index_update(return_value_ht, num_key, entry);
2983 				}
2984 			}
2985 		} else if (count == PCRE2_ERROR_NOMATCH) {
2986 			if (invert) {
2987 				Z_TRY_ADDREF_P(entry);
2988 
2989 				/* Add to return array */
2990 				if (string_key) {
2991 					zend_hash_update(return_value_ht, string_key, entry);
2992 				} else {
2993 					zend_hash_index_update(return_value_ht, num_key, entry);
2994 				}
2995 			}
2996 		} else {
2997 			pcre_handle_exec_error(count);
2998 			zend_tmp_string_release(tmp_subject_str);
2999 			break;
3000 		}
3001 
3002 		zend_tmp_string_release(tmp_subject_str);
3003 	} ZEND_HASH_FOREACH_END();
3004 	if (match_data != mdata) {
3005 		pcre2_match_data_free(match_data);
3006 	}
3007 }
3008 /* }}} */
3009 
3010 /* {{{ Returns the error code of the last regexp execution. */
PHP_FUNCTION(preg_last_error)3011 PHP_FUNCTION(preg_last_error)
3012 {
3013 	ZEND_PARSE_PARAMETERS_NONE();
3014 
3015 	RETURN_LONG(PCRE_G(error_code));
3016 }
3017 /* }}} */
3018 
3019 /* {{{ Returns the error message of the last regexp execution. */
PHP_FUNCTION(preg_last_error_msg)3020 PHP_FUNCTION(preg_last_error_msg)
3021 {
3022 	ZEND_PARSE_PARAMETERS_NONE();
3023 
3024 	RETURN_STRING(php_pcre_get_error_msg(PCRE_G(error_code)));
3025 }
3026 /* }}} */
3027 
/* {{{ module definition structures */

/* Module descriptor for the "pcre" extension. The initializer is positional,
 * so the order of the entries below must not be changed. */
zend_module_entry pcre_module_entry = {
	STANDARD_MODULE_HEADER,
	"pcre",                        /* extension name */
	ext_functions,                 /* function table of the extension */
	PHP_MINIT(pcre),               /* module startup hook */
	PHP_MSHUTDOWN(pcre),           /* module shutdown hook */
	PHP_RINIT(pcre),               /* request startup hook */
	PHP_RSHUTDOWN(pcre),           /* request shutdown hook */
	PHP_MINFO(pcre),               /* module info hook */
	PHP_PCRE_VERSION,              /* extension version */
	PHP_MODULE_GLOBALS(pcre),      /* module globals descriptor */
	PHP_GINIT(pcre),               /* globals constructor */
	PHP_GSHUTDOWN(pcre),           /* globals destructor */
	NULL,                          /* NOTE(review): presumably the unused post-deactivate slot - confirm against zend_module_entry */
	STANDARD_MODULE_PROPERTIES_EX
};

/* ZEND_GET_MODULE(pcre) is only emitted when COMPILE_DL_PCRE is defined
 * (presumably the build as a dynamically loadable extension - TODO confirm). */
#ifdef COMPILE_DL_PCRE
ZEND_GET_MODULE(pcre)
#endif

/* }}} */
3052 
/* Returns `mctx`, the match context this file passes to its
 * pcre2_match() / pcre2_jit_match() calls. */
PHPAPI pcre2_match_context *php_pcre_mctx(void)
{/*{{{*/
	return mctx;
}/*}}}*/
3057 
/* Returns `gctx`, the PCRE2 general context pointer held by this file. */
PHPAPI pcre2_general_context *php_pcre_gctx(void)
{/*{{{*/
	return gctx;
}/*}}}*/
3062 
/* Returns `cctx`, the PCRE2 compile context pointer held by this file. */
PHPAPI pcre2_compile_context *php_pcre_cctx(void)
{/*{{{*/
	return cctx;
}/*}}}*/
3067 
php_pcre_pce_incref(pcre_cache_entry * pce)3068 PHPAPI void php_pcre_pce_incref(pcre_cache_entry *pce)
3069 {/*{{{*/
3070 	assert(NULL != pce);
3071 	pce->refcount++;
3072 }/*}}}*/
3073 
php_pcre_pce_decref(pcre_cache_entry * pce)3074 PHPAPI void php_pcre_pce_decref(pcre_cache_entry *pce)
3075 {/*{{{*/
3076 	assert(NULL != pce);
3077 	assert(0 != pce->refcount);
3078 	pce->refcount--;
3079 }/*}}}*/
3080 
php_pcre_pce_re(pcre_cache_entry * pce)3081 PHPAPI pcre2_code *php_pcre_pce_re(pcre_cache_entry *pce)
3082 {/*{{{*/
3083 	assert(NULL != pce);
3084 	return pce->re;
3085 }/*}}}*/
3086