xref: /PHP-8.3/ext/pcre/php_pcre.c (revision ddc7a6b1)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Andrei Zmievski <andrei@php.net>                             |
14    +----------------------------------------------------------------------+
15  */
16 
17 #include "php.h"
18 #include "php_ini.h"
19 #include "php_globals.h"
20 #include "php_pcre.h"
21 #include "ext/standard/info.h"
22 #include "ext/standard/basic_functions.h"
23 #include "zend_smart_str.h"
24 #include "SAPI.h"
25 
26 #include "ext/standard/php_string.h"
27 
28 #define PREG_PATTERN_ORDER			1
29 #define PREG_SET_ORDER				2
30 #define PREG_OFFSET_CAPTURE			(1<<8)
31 #define PREG_UNMATCHED_AS_NULL		(1<<9)
32 
33 #define	PREG_SPLIT_NO_EMPTY			(1<<0)
34 #define PREG_SPLIT_DELIM_CAPTURE	(1<<1)
35 #define PREG_SPLIT_OFFSET_CAPTURE	(1<<2)
36 
37 #define PREG_REPLACE_EVAL			(1<<0)
38 
39 #define PREG_GREP_INVERT			(1<<0)
40 
41 #define PREG_JIT                    (1<<3)
42 
43 #define PCRE_CACHE_SIZE 4096
44 
45 #ifdef HAVE_PCRE_JIT_SUPPORT
46 #define PHP_PCRE_JIT_SUPPORT 1
47 #else
48 #define PHP_PCRE_JIT_SUPPORT 0
49 #endif
50 
51 char *php_pcre_version;
52 
53 #include "php_pcre_arginfo.h"
54 
/* One compiled pattern as stored in PCRE_G(pcre_cache). */
struct _pcre_cache_entry {
	/* Compiled PCRE2 pattern; released with pcre2_code_free() by the cache destructors. */
	pcre2_code *re;
	/* PREG_* flags collected while compiling (e.g. PREG_JIT). */
	uint32_t preg_options;
	/* Value reported by PCRE2_INFO_CAPTURECOUNT for re. */
	uint32_t capture_count;
	/* Value reported by PCRE2_INFO_NAMECOUNT for re. */
	uint32_t name_count;
	/* PCRE2_* options that were passed to pcre2_compile(). */
	uint32_t compile_options;
	/* Entries with a non-zero count are never evicted by pcre_clean_cache(). */
	uint32_t refcount;
};
63 
64 PHPAPI ZEND_DECLARE_MODULE_GLOBALS(pcre)
65 
66 #ifdef HAVE_PCRE_JIT_SUPPORT
67 #define PCRE_JIT_STACK_MIN_SIZE (32 * 1024)
68 #define PCRE_JIT_STACK_MAX_SIZE (192 * 1024)
69 ZEND_TLS pcre2_jit_stack *jit_stack = NULL;
70 #endif
71 /* General context using (infallible) system allocator. */
72 ZEND_TLS pcre2_general_context *gctx = NULL;
/* The compile and match contexts are global per thread for now, although they
	could also be used per pattern: either copy them into each pce, or drop
	the global contexts altogether and create a fresh pair for every pce. */
76 ZEND_TLS pcre2_compile_context *cctx = NULL;
77 ZEND_TLS pcre2_match_context   *mctx = NULL;
78 ZEND_TLS pcre2_match_data      *mdata = NULL;
79 ZEND_TLS bool              mdata_used = 0;
80 ZEND_TLS uint8_t pcre2_init_ok = 0;
81 #if defined(ZTS) && defined(HAVE_PCRE_JIT_SUPPORT)
82 static MUTEX_T pcre_mt = NULL;
83 #define php_pcre_mutex_alloc() \
84 	if (tsrm_is_main_thread() && !pcre_mt) pcre_mt = tsrm_mutex_alloc();
85 #define php_pcre_mutex_free() \
86 	if (tsrm_is_main_thread() && pcre_mt) { tsrm_mutex_free(pcre_mt); pcre_mt = NULL; }
87 #define php_pcre_mutex_lock() tsrm_mutex_lock(pcre_mt);
88 #define php_pcre_mutex_unlock() tsrm_mutex_unlock(pcre_mt);
89 #else
90 #define php_pcre_mutex_alloc()
91 #define php_pcre_mutex_free()
92 #define php_pcre_mutex_lock()
93 #define php_pcre_mutex_unlock()
94 #endif
95 
96 ZEND_TLS HashTable char_tables;
97 
/* Destructor for char_tables entries: frees the persistently allocated
 * block stored as the pointer payload of the hash slot. */
static void php_pcre_free_char_table(zval *data)
{/*{{{*/
	pefree(Z_PTR_P(data), 1);
}/*}}}*/
103 
pcre_handle_exec_error(int pcre_code)104 static void pcre_handle_exec_error(int pcre_code) /* {{{ */
105 {
106 	int preg_code = 0;
107 
108 	switch (pcre_code) {
109 		case PCRE2_ERROR_MATCHLIMIT:
110 			preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
111 			break;
112 
113 		case PCRE2_ERROR_RECURSIONLIMIT:
114 			preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
115 			break;
116 
117 		case PCRE2_ERROR_BADUTFOFFSET:
118 			preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
119 			break;
120 
121 #ifdef HAVE_PCRE_JIT_SUPPORT
122 		case PCRE2_ERROR_JIT_STACKLIMIT:
123 			preg_code = PHP_PCRE_JIT_STACKLIMIT_ERROR;
124 			break;
125 #endif
126 
127 		default:
128 			if (pcre_code <= PCRE2_ERROR_UTF8_ERR1 && pcre_code >= PCRE2_ERROR_UTF8_ERR21) {
129 				preg_code = PHP_PCRE_BAD_UTF8_ERROR;
130 			} else  {
131 				preg_code = PHP_PCRE_INTERNAL_ERROR;
132 			}
133 			break;
134 	}
135 
136 	PCRE_G(error_code) = preg_code;
137 }
138 /* }}} */
139 
/* Returns the static, human readable description of a PHP_PCRE_* error code;
 * codes without a dedicated text yield "Unknown error". */
static const char *php_pcre_get_error_msg(php_pcre_error_code error_code) /* {{{ */
{
	const char *msg = "Unknown error";

	switch (error_code) {
		case PHP_PCRE_NO_ERROR:
			msg = "No error";
			break;
		case PHP_PCRE_INTERNAL_ERROR:
			msg = "Internal error";
			break;
		case PHP_PCRE_BAD_UTF8_ERROR:
			msg = "Malformed UTF-8 characters, possibly incorrectly encoded";
			break;
		case PHP_PCRE_BAD_UTF8_OFFSET_ERROR:
			msg = "The offset did not correspond to the beginning of a valid UTF-8 code point";
			break;
		case PHP_PCRE_BACKTRACK_LIMIT_ERROR:
			msg = "Backtrack limit exhausted";
			break;
		case PHP_PCRE_RECURSION_LIMIT_ERROR:
			msg = "Recursion limit exhausted";
			break;

#ifdef HAVE_PCRE_JIT_SUPPORT
		case PHP_PCRE_JIT_STACKLIMIT_ERROR:
			msg = "JIT stack limit exhausted";
			break;
#endif

		default:
			break;
	}

	return msg;
}
165 /* }}} */
166 
/* Cache destructor for entries released with free(): drops the compiled
 * pattern first, then the entry itself. Empty slots are ignored. */
static void php_free_pcre_cache(zval *data) /* {{{ */
{
	pcre_cache_entry *entry = (pcre_cache_entry *) Z_PTR_P(data);

	if (entry != NULL) {
		pcre2_code_free(entry->re);
		free(entry);
	}
}
174 /* }}} */
175 
/* Cache destructor for entries released with efree(): drops the compiled
 * pattern first, then the entry itself. Empty slots are ignored. */
static void php_efree_pcre_cache(zval *data) /* {{{ */
{
	pcre_cache_entry *entry = (pcre_cache_entry *) Z_PTR_P(data);

	if (entry != NULL) {
		pcre2_code_free(entry->re);
		efree(entry);
	}
}
183 /* }}} */
184 
/* PCRE2 allocation callback backed by the persistent allocator; the user
 * data pointer is not used. */
static void *php_pcre_malloc(PCRE2_SIZE size, void *data)
{
	void *block = pemalloc(size, 1);

	(void) data;
	return block;
}
189 
/* PCRE2 release callback matching php_pcre_malloc(); the user data pointer
 * is not used. */
static void php_pcre_free(void *block, void *data)
{
	(void) data;
	pefree(block, 1);
}
194 
/* PCRE2 allocation callback backed by emalloc(); the user data pointer is
 * not used. */
static void *php_pcre_emalloc(PCRE2_SIZE size, void *data)
{
	void *block = emalloc(size);

	(void) data;
	return block;
}
199 
/* PCRE2 release callback matching php_pcre_emalloc(); the user data pointer
 * is not used. */
static void php_pcre_efree(void *block, void *data)
{
	(void) data;
	efree(block);
}
204 
205 #ifdef PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
206 	/* pcre 10.38 needs PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK, disabled by default */
207 #define PHP_PCRE_DEFAULT_EXTRA_COPTIONS PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
208 #else
209 #define PHP_PCRE_DEFAULT_EXTRA_COPTIONS 0
210 #endif
211 
212 #define PHP_PCRE_PREALLOC_MDATA_SIZE 32
213 
php_pcre_init_pcre2(uint8_t jit)214 static void php_pcre_init_pcre2(uint8_t jit)
215 {/*{{{*/
216 	if (!gctx) {
217 		gctx = pcre2_general_context_create(php_pcre_malloc, php_pcre_free, NULL);
218 		if (!gctx) {
219 			pcre2_init_ok = 0;
220 			return;
221 		}
222 	}
223 
224 	if (!cctx) {
225 		cctx = pcre2_compile_context_create(gctx);
226 		if (!cctx) {
227 			pcre2_init_ok = 0;
228 			return;
229 		}
230 	}
231 
232 	pcre2_set_compile_extra_options(cctx, PHP_PCRE_DEFAULT_EXTRA_COPTIONS);
233 
234 	if (!mctx) {
235 		mctx = pcre2_match_context_create(gctx);
236 		if (!mctx) {
237 			pcre2_init_ok = 0;
238 			return;
239 		}
240 	}
241 
242 #ifdef HAVE_PCRE_JIT_SUPPORT
243 	if (jit && !jit_stack) {
244 		jit_stack = pcre2_jit_stack_create(PCRE_JIT_STACK_MIN_SIZE, PCRE_JIT_STACK_MAX_SIZE, gctx);
245 		if (!jit_stack) {
246 			pcre2_init_ok = 0;
247 			return;
248 		}
249 	}
250 #endif
251 
252 	if (!mdata) {
253 		mdata = pcre2_match_data_create(PHP_PCRE_PREALLOC_MDATA_SIZE, gctx);
254 		if (!mdata) {
255 			pcre2_init_ok = 0;
256 			return;
257 		}
258 	}
259 
260 	pcre2_init_ok = 1;
261 }/*}}}*/
262 
php_pcre_shutdown_pcre2(void)263 static void php_pcre_shutdown_pcre2(void)
264 {/*{{{*/
265 	if (gctx) {
266 		pcre2_general_context_free(gctx);
267 		gctx = NULL;
268 	}
269 
270 	if (cctx) {
271 		pcre2_compile_context_free(cctx);
272 		cctx = NULL;
273 	}
274 
275 	if (mctx) {
276 		pcre2_match_context_free(mctx);
277 		mctx = NULL;
278 	}
279 
280 #ifdef HAVE_PCRE_JIT_SUPPORT
281 	/* Stack may only be destroyed when no cached patterns
282 	 	possibly associated with it do exist. */
283 	if (jit_stack) {
284 		pcre2_jit_stack_free(jit_stack);
285 		jit_stack = NULL;
286 	}
287 #endif
288 
289 	if (mdata) {
290 		pcre2_match_data_free(mdata);
291 		mdata = NULL;
292 	}
293 
294 	pcre2_init_ok = 0;
295 }/*}}}*/
296 
PHP_GINIT_FUNCTION(pcre)297 static PHP_GINIT_FUNCTION(pcre) /* {{{ */
298 {
299 	php_pcre_mutex_alloc();
300 
301 	/* If we're on the CLI SAPI, there will only be one request, so we don't need the
302 	 * cache to survive after RSHUTDOWN. */
303 	pcre_globals->per_request_cache = strcmp(sapi_module.name, "cli") == 0;
304 	if (!pcre_globals->per_request_cache) {
305 		zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
306 	}
307 
308 	pcre_globals->backtrack_limit = 0;
309 	pcre_globals->recursion_limit = 0;
310 	pcre_globals->error_code      = PHP_PCRE_NO_ERROR;
311 	ZVAL_UNDEF(&pcre_globals->unmatched_null_pair);
312 	ZVAL_UNDEF(&pcre_globals->unmatched_empty_pair);
313 #ifdef HAVE_PCRE_JIT_SUPPORT
314 	pcre_globals->jit = 1;
315 #endif
316 
317 	php_pcre_init_pcre2(1);
318 	zend_hash_init(&char_tables, 1, NULL, php_pcre_free_char_table, 1);
319 }
320 /* }}} */
321 
PHP_GSHUTDOWN_FUNCTION(pcre)322 static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
323 {
324 	if (!pcre_globals->per_request_cache) {
325 		zend_hash_destroy(&pcre_globals->pcre_cache);
326 	}
327 
328 	php_pcre_shutdown_pcre2();
329 	zend_hash_destroy(&char_tables);
330 	php_pcre_mutex_free();
331 }
332 /* }}} */
333 
PHP_INI_MH(OnUpdateBacktrackLimit)334 static PHP_INI_MH(OnUpdateBacktrackLimit)
335 {/*{{{*/
336 	OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
337 	if (mctx) {
338 		pcre2_set_match_limit(mctx, (uint32_t)PCRE_G(backtrack_limit));
339 	}
340 
341 	return SUCCESS;
342 }/*}}}*/
343 
PHP_INI_MH(OnUpdateRecursionLimit)344 static PHP_INI_MH(OnUpdateRecursionLimit)
345 {/*{{{*/
346 	OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
347 	if (mctx) {
348 		pcre2_set_depth_limit(mctx, (uint32_t)PCRE_G(recursion_limit));
349 	}
350 
351 	return SUCCESS;
352 }/*}}}*/
353 
354 #ifdef HAVE_PCRE_JIT_SUPPORT
PHP_INI_MH(OnUpdateJit)355 static PHP_INI_MH(OnUpdateJit)
356 {/*{{{*/
357 	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
358 	if (PCRE_G(jit) && jit_stack) {
359 		pcre2_jit_stack_assign(mctx, NULL, jit_stack);
360 	} else {
361 		pcre2_jit_stack_assign(mctx, NULL, NULL);
362 	}
363 
364 	return SUCCESS;
365 }/*}}}*/
366 #endif
367 
/* INI settings of the extension. Each entry is bound to a field of
 * zend_pcre_globals and uses a dedicated update handler; pcre.jit is only
 * registered when JIT support is compiled in. */
PHP_INI_BEGIN()
	STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateBacktrackLimit, backtrack_limit, zend_pcre_globals, pcre_globals)
	STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000",  PHP_INI_ALL, OnUpdateRecursionLimit, recursion_limit, zend_pcre_globals, pcre_globals)
#ifdef HAVE_PCRE_JIT_SUPPORT
	STD_PHP_INI_BOOLEAN("pcre.jit",           "1",       PHP_INI_ALL, OnUpdateJit,            jit,             zend_pcre_globals, pcre_globals)
#endif
PHP_INI_END()
375 
/* Fetches a string-valued pcre2_config() item into a freshly malloc()ed
 * buffer.
 *
 * what  One of the PCRE2_CONFIG_* string items (e.g. PCRE2_CONFIG_VERSION).
 *
 * Returns the buffer, which the caller must free(), or NULL when the item is
 * not available, pcre2_config() reports an error, or the allocation fails. */
static char *_pcre2_config_str(uint32_t what)
{/*{{{*/
	char *ret;
	/* With a NULL buffer pcre2_config() only reports the required length;
	 * a negative result is a PCRE2 error code and must not reach malloc(). */
	int len = pcre2_config(what, NULL);

	if (len <= 0) {
		return NULL;
	}

	ret = (char *) malloc((size_t) len + 1);
	if (ret == NULL) {
		return NULL;
	}

	/* Errors are negative, so test for "not positive" rather than zero. */
	len = pcre2_config(what, ret);
	if (len <= 0) {
		free(ret);
		return NULL;
	}

	return ret;
}/*}}}*/
389 
390 /* {{{ PHP_MINFO_FUNCTION(pcre) */
PHP_MINFO_FUNCTION(pcre)391 static PHP_MINFO_FUNCTION(pcre)
392 {
393 #ifdef HAVE_PCRE_JIT_SUPPORT
394 	uint32_t flag = 0;
395 	char *jit_target = _pcre2_config_str(PCRE2_CONFIG_JITTARGET);
396 #endif
397 	char *version = _pcre2_config_str(PCRE2_CONFIG_VERSION);
398 	char *unicode = _pcre2_config_str(PCRE2_CONFIG_UNICODE_VERSION);
399 
400 	php_info_print_table_start();
401 	php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled" );
402 	php_info_print_table_row(2, "PCRE Library Version", version);
403 	free(version);
404 	php_info_print_table_row(2, "PCRE Unicode Version", unicode);
405 	free(unicode);
406 
407 #ifdef HAVE_PCRE_JIT_SUPPORT
408 	if (!pcre2_config(PCRE2_CONFIG_JIT, &flag)) {
409 		php_info_print_table_row(2, "PCRE JIT Support", flag ? "enabled" : "disabled");
410 	} else {
411 		php_info_print_table_row(2, "PCRE JIT Support", "unknown" );
412 	}
413 	if (jit_target) {
414 		php_info_print_table_row(2, "PCRE JIT Target", jit_target);
415 	}
416 	free(jit_target);
417 #else
418 	php_info_print_table_row(2, "PCRE JIT Support", "not compiled in" );
419 #endif
420 
421 #ifdef HAVE_PCRE_VALGRIND_SUPPORT
422 	php_info_print_table_row(2, "PCRE Valgrind Support", "enabled" );
423 #endif
424 
425 	php_info_print_table_end();
426 
427 	DISPLAY_INI_ENTRIES();
428 }
429 /* }}} */
430 
431 /* {{{ PHP_MINIT_FUNCTION(pcre) */
PHP_MINIT_FUNCTION(pcre)432 static PHP_MINIT_FUNCTION(pcre)
433 {
434 #ifdef HAVE_PCRE_JIT_SUPPORT
435 	if (UNEXPECTED(!pcre2_init_ok)) {
436 		/* Retry. */
437 		php_pcre_init_pcre2(PCRE_G(jit));
438 		if (!pcre2_init_ok) {
439 			return FAILURE;
440 		}
441 	}
442 #endif
443 
444 	REGISTER_INI_ENTRIES();
445 
446 	php_pcre_version = _pcre2_config_str(PCRE2_CONFIG_VERSION);
447 
448 	register_php_pcre_symbols(module_number);
449 
450 	return SUCCESS;
451 }
452 /* }}} */
453 
454 /* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */
PHP_MSHUTDOWN_FUNCTION(pcre)455 static PHP_MSHUTDOWN_FUNCTION(pcre)
456 {
457 	UNREGISTER_INI_ENTRIES();
458 
459 	free(php_pcre_version);
460 
461 	return SUCCESS;
462 }
463 /* }}} */
464 
465 /* {{{ PHP_RINIT_FUNCTION(pcre) */
PHP_RINIT_FUNCTION(pcre)466 static PHP_RINIT_FUNCTION(pcre)
467 {
468 #ifdef HAVE_PCRE_JIT_SUPPORT
469 	if (UNEXPECTED(!pcre2_init_ok)) {
470 		/* Retry. */
471 		php_pcre_mutex_lock();
472 		php_pcre_init_pcre2(PCRE_G(jit));
473 		if (!pcre2_init_ok) {
474 			php_pcre_mutex_unlock();
475 			return FAILURE;
476 		}
477 		php_pcre_mutex_unlock();
478 	}
479 
480 	mdata_used = 0;
481 #endif
482 
483 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
484 	PCRE_G(gctx_zmm) = pcre2_general_context_create(php_pcre_emalloc, php_pcre_efree, NULL);
485 	if (!PCRE_G(gctx_zmm)) {
486 		return FAILURE;
487 	}
488 
489 	if (PCRE_G(per_request_cache)) {
490 		zend_hash_init(&PCRE_G(pcre_cache), 0, NULL, php_efree_pcre_cache, 0);
491 	}
492 
493 	return SUCCESS;
494 }
495 /* }}} */
496 
PHP_RSHUTDOWN_FUNCTION(pcre)497 static PHP_RSHUTDOWN_FUNCTION(pcre)
498 {
499 	pcre2_general_context_free(PCRE_G(gctx_zmm));
500 	PCRE_G(gctx_zmm) = NULL;
501 
502 	if (PCRE_G(per_request_cache)) {
503 		zend_hash_destroy(&PCRE_G(pcre_cache));
504 	}
505 
506 	zval_ptr_dtor(&PCRE_G(unmatched_null_pair));
507 	zval_ptr_dtor(&PCRE_G(unmatched_empty_pair));
508 	ZVAL_UNDEF(&PCRE_G(unmatched_null_pair));
509 	ZVAL_UNDEF(&PCRE_G(unmatched_empty_pair));
510 	return SUCCESS;
511 }
512 
513 /* {{{ static pcre_clean_cache */
/* zend_hash_apply_with_argument() callback used for cache eviction. `arg`
 * points to an int holding the number of entries still to be removed; an
 * entry is removed (and the counter decremented) only while that budget is
 * positive and the entry's refcount is zero. */
static int pcre_clean_cache(zval *data, void *arg)
{
	int *budget = (int *) arg;
	pcre_cache_entry *entry = (pcre_cache_entry *) Z_PTR_P(data);

	if (*budget <= 0 || entry->refcount != 0) {
		return ZEND_HASH_APPLY_KEEP;
	}

	--*budget;
	return ZEND_HASH_APPLY_REMOVE;
}
526 /* }}} */
527 
/* Releases every non-NULL name in a table built by make_subpats_table() and
 * then the table itself. */
static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats) {
	for (uint32_t idx = 0; idx != num_subpats; ++idx) {
		zend_string *name = subpat_names[idx];

		if (name == NULL) {
			continue;
		}
		zend_string_release(name);
	}

	efree(subpat_names);
}
537 
538 /* {{{ static make_subpats_table */
make_subpats_table(uint32_t num_subpats,pcre_cache_entry * pce)539 static zend_string **make_subpats_table(uint32_t num_subpats, pcre_cache_entry *pce)
540 {
541 	uint32_t name_cnt = pce->name_count, name_size, ni = 0;
542 	char *name_table;
543 	zend_string **subpat_names;
544 	int rc1, rc2;
545 
546 	rc1 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMETABLE, &name_table);
547 	rc2 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMEENTRYSIZE, &name_size);
548 	if (rc1 < 0 || rc2 < 0) {
549 		php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc1 < 0 ? rc1 : rc2);
550 		return NULL;
551 	}
552 
553 	subpat_names = ecalloc(num_subpats, sizeof(zend_string *));
554 	while (ni++ < name_cnt) {
555 		unsigned short name_idx = 0x100 * (unsigned char)name_table[0] + (unsigned char)name_table[1];
556 		const char *name = name_table + 2;
557 		subpat_names[name_idx] = zend_string_init(name, strlen(name), 0);
558 		if (is_numeric_string(ZSTR_VAL(subpat_names[name_idx]), ZSTR_LEN(subpat_names[name_idx]), NULL, NULL, 0) > 0) {
559 			php_error_docref(NULL, E_WARNING, "Numeric named subpatterns are not allowed");
560 			free_subpats_table(subpat_names, num_subpats);
561 			return NULL;
562 		}
563 		name_table += name_size;
564 	}
565 	return subpat_names;
566 }
567 /* }}} */
568 
569 /* {{{ static calculate_unit_length */
570 /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE2_UTF. */
calculate_unit_length(pcre_cache_entry * pce,const char * start)571 static zend_always_inline size_t calculate_unit_length(pcre_cache_entry *pce, const char *start)
572 {
573 	size_t unit_len;
574 
575 	if (pce->compile_options & PCRE2_UTF) {
576 		const char *end = start;
577 
578 		/* skip continuation bytes */
579 		while ((*++end & 0xC0) == 0x80);
580 		unit_len = end - start;
581 	} else {
582 		unit_len = 1;
583 	}
584 	return unit_len;
585 }
586 /* }}} */
587 
588 /* {{{ pcre_get_compiled_regex_cache */
pcre_get_compiled_regex_cache_ex(zend_string * regex,int locale_aware)589 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, int locale_aware)
590 {
591 	pcre2_code			*re = NULL;
592 #if 10 == PCRE2_MAJOR && 37 == PCRE2_MINOR && !HAVE_BUNDLED_PCRE
593 	uint32_t			 coptions = PCRE2_NO_START_OPTIMIZE;
594 #else
595 	uint32_t			 coptions = 0;
596 #endif
597 	PCRE2_UCHAR	         error[128];
598 	PCRE2_SIZE           erroffset;
599 	int                  errnumber;
600 	char				 delimiter;
601 	char				 start_delimiter;
602 	char				 end_delimiter;
603 	char				*p, *pp;
604 	char				*pattern;
605 	size_t				 pattern_len;
606 	uint32_t			 poptions = 0;
607 	const uint8_t       *tables = NULL;
608 	zval                *zv;
609 	pcre_cache_entry	 new_entry;
610 	int					 rc;
611 	zend_string 		*key;
612 	pcre_cache_entry	*ret;
613 
614 	if (locale_aware && BG(ctype_string)) {
615 		key = zend_string_concat2(
616 			ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)),
617 			ZSTR_VAL(regex), ZSTR_LEN(regex));
618 	} else {
619 		key = regex;
620 	}
621 
622 	/* Try to lookup the cached regex entry, and if successful, just pass
623 	   back the compiled pattern, otherwise go on and compile it. */
624 	zv = zend_hash_find(&PCRE_G(pcre_cache), key);
625 	if (zv) {
626 		if (key != regex) {
627 			zend_string_release_ex(key, 0);
628 		}
629 		return (pcre_cache_entry*)Z_PTR_P(zv);
630 	}
631 
632 	p = ZSTR_VAL(regex);
633 	const char* end_p = ZSTR_VAL(regex) + ZSTR_LEN(regex);
634 
635 	/* Parse through the leading whitespace, and display a warning if we
636 	   get to the end without encountering a delimiter. */
637 	while (isspace((int)*(unsigned char *)p)) p++;
638 	if (p >= end_p) {
639 		if (key != regex) {
640 			zend_string_release_ex(key, 0);
641 		}
642 		php_error_docref(NULL, E_WARNING, "Empty regular expression");
643 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
644 		return NULL;
645 	}
646 
647 	/* Get the delimiter and display a warning if it is alphanumeric
648 	   or a backslash. */
649 	delimiter = *p++;
650 	if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\' || delimiter == '\0') {
651 		if (key != regex) {
652 			zend_string_release_ex(key, 0);
653 		}
654 		php_error_docref(NULL, E_WARNING, "Delimiter must not be alphanumeric, backslash, or NUL");
655 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
656 		return NULL;
657 	}
658 
659 	start_delimiter = delimiter;
660 	if ((pp = strchr("([{< )]}> )]}>", delimiter)))
661 		delimiter = pp[5];
662 	end_delimiter = delimiter;
663 
664 	pp = p;
665 
666 	if (start_delimiter == end_delimiter) {
667 		/* We need to iterate through the pattern, searching for the ending delimiter,
668 		   but skipping the backslashed delimiters.  If the ending delimiter is not
669 		   found, display a warning. */
670 		while (pp < end_p) {
671 			if (*pp == '\\' && pp + 1 < end_p) pp++;
672 			else if (*pp == delimiter)
673 				break;
674 			pp++;
675 		}
676 	} else {
677 		/* We iterate through the pattern, searching for the matching ending
678 		 * delimiter. For each matching starting delimiter, we increment nesting
679 		 * level, and decrement it for each matching ending delimiter. If we
680 		 * reach the end of the pattern without matching, display a warning.
681 		 */
682 		int brackets = 1; 	/* brackets nesting level */
683 		while (pp < end_p) {
684 			if (*pp == '\\' && pp + 1 < end_p) pp++;
685 			else if (*pp == end_delimiter && --brackets <= 0)
686 				break;
687 			else if (*pp == start_delimiter)
688 				brackets++;
689 			pp++;
690 		}
691 	}
692 
693 	if (pp >= end_p) {
694 		if (key != regex) {
695 			zend_string_release_ex(key, 0);
696 		}
697 		if (start_delimiter == end_delimiter) {
698 			php_error_docref(NULL,E_WARNING, "No ending delimiter '%c' found", delimiter);
699 		} else {
700 			php_error_docref(NULL,E_WARNING, "No ending matching delimiter '%c' found", delimiter);
701 		}
702 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
703 		return NULL;
704 	}
705 
706 	/* Make a copy of the actual pattern. */
707 	pattern_len = pp - p;
708 	pattern = estrndup(p, pattern_len);
709 
710 	/* Move on to the options */
711 	pp++;
712 
713 	/* Parse through the options, setting appropriate flags.  Display
714 	   a warning if we encounter an unknown modifier. */
715 	while (pp < end_p) {
716 		switch (*pp++) {
717 			/* Perl compatible options */
718 			case 'i':	coptions |= PCRE2_CASELESS;		break;
719 			case 'm':	coptions |= PCRE2_MULTILINE;		break;
720 			case 'n':	coptions |= PCRE2_NO_AUTO_CAPTURE;	break;
721 			case 's':	coptions |= PCRE2_DOTALL;		break;
722 			case 'x':	coptions |= PCRE2_EXTENDED;		break;
723 
724 			/* PCRE specific options */
725 			case 'A':	coptions |= PCRE2_ANCHORED;		break;
726 			case 'D':	coptions |= PCRE2_DOLLAR_ENDONLY;break;
727 			case 'S':	/* Pass. */					break;
728 			case 'X':	/* Pass. */					break;
729 			case 'U':	coptions |= PCRE2_UNGREEDY;		break;
730 			case 'u':	coptions |= PCRE2_UTF;
731 	/* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
732 	   characters, even in UTF-8 mode. However, this can be changed by setting
733 	   the PCRE2_UCP option. */
734 #ifdef PCRE2_UCP
735 						coptions |= PCRE2_UCP;
736 #endif
737 				break;
738 			case 'J':	coptions |= PCRE2_DUPNAMES;		break;
739 
740 			/* Custom preg options */
741 			case 'e':	poptions |= PREG_REPLACE_EVAL;	break;
742 
743 			case ' ':
744 			case '\n':
745 			case '\r':
746 				break;
747 
748 			default:
749 				if (pp[-1]) {
750 					php_error_docref(NULL, E_WARNING, "Unknown modifier '%c'", pp[-1]);
751 				} else {
752 					php_error_docref(NULL, E_WARNING, "NUL is not a valid modifier");
753 				}
754 				pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
755 				efree(pattern);
756 				if (key != regex) {
757 					zend_string_release_ex(key, 0);
758 				}
759 				return NULL;
760 		}
761 	}
762 
763 	if (poptions & PREG_REPLACE_EVAL) {
764 		php_error_docref(NULL, E_WARNING, "The /e modifier is no longer supported, use preg_replace_callback instead");
765 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
766 		efree(pattern);
767 		if (key != regex) {
768 			zend_string_release_ex(key, 0);
769 		}
770 		return NULL;
771 	}
772 
773 	if (key != regex) {
774 		tables = (uint8_t *)zend_hash_find_ptr(&char_tables, BG(ctype_string));
775 		if (!tables) {
776 			zend_string *_k;
777 			tables = pcre2_maketables(gctx);
778 			if (UNEXPECTED(!tables)) {
779 				php_error_docref(NULL,E_WARNING, "Failed to generate locale character tables");
780 				pcre_handle_exec_error(PCRE2_ERROR_NOMEMORY);
781 				zend_string_release_ex(key, 0);
782 				efree(pattern);
783 				return NULL;
784 			}
785 			_k = zend_string_init(ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)), 1);
786 			GC_MAKE_PERSISTENT_LOCAL(_k);
787 			zend_hash_add_ptr(&char_tables, _k, (void *)tables);
788 			zend_string_release(_k);
789 		}
790 	}
791 	pcre2_set_character_tables(cctx, tables);
792 
793 	/* Compile pattern and display a warning if compilation failed. */
794 	re = pcre2_compile((PCRE2_SPTR)pattern, pattern_len, coptions, &errnumber, &erroffset, cctx);
795 
796 	if (re == NULL) {
797 		if (key != regex) {
798 			zend_string_release_ex(key, 0);
799 		}
800 		pcre2_get_error_message(errnumber, error, sizeof(error));
801 		php_error_docref(NULL,E_WARNING, "Compilation failed: %s at offset %zu", error, erroffset);
802 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
803 		efree(pattern);
804 		return NULL;
805 	}
806 
807 #ifdef HAVE_PCRE_JIT_SUPPORT
808 	if (PCRE_G(jit)) {
809 		/* Enable PCRE JIT compiler */
810 		rc = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
811 		if (EXPECTED(rc >= 0)) {
812 			size_t jit_size = 0;
813 			if (!pcre2_pattern_info(re, PCRE2_INFO_JITSIZE, &jit_size) && jit_size > 0) {
814 				poptions |= PREG_JIT;
815 			}
816 		} else if (rc == PCRE2_ERROR_NOMEMORY) {
817 			php_error_docref(NULL, E_WARNING,
818 				"Allocation of JIT memory failed, PCRE JIT will be disabled. "
819 				"This is likely caused by security restrictions. "
820 				"Either grant PHP permission to allocate executable memory, or set pcre.jit=0");
821 			PCRE_G(jit) = 0;
822 		} else {
823 			pcre2_get_error_message(rc, error, sizeof(error));
824 			php_error_docref(NULL, E_WARNING, "JIT compilation failed: %s", error);
825 			pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
826 		}
827 	}
828 #endif
829 	efree(pattern);
830 
831 	/*
832 	 * If we reached cache limit, clean out the items from the head of the list;
833 	 * these are supposedly the oldest ones (but not necessarily the least used
834 	 * ones).
835 	 */
836 	if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
837 		int num_clean = PCRE_CACHE_SIZE / 8;
838 		zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean);
839 	}
840 
841 	/* Store the compiled pattern and extra info in the cache. */
842 	new_entry.re = re;
843 	new_entry.preg_options = poptions;
844 	new_entry.compile_options = coptions;
845 	new_entry.refcount = 0;
846 
847 	rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &new_entry.capture_count);
848 	if (rc < 0) {
849 		if (key != regex) {
850 			zend_string_release_ex(key, 0);
851 		}
852 		php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc);
853 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
854 		return NULL;
855 	}
856 
857 	rc = pcre2_pattern_info(re, PCRE2_INFO_NAMECOUNT, &new_entry.name_count);
858 	if (rc < 0) {
859 		if (key != regex) {
860 			zend_string_release_ex(key, 0);
861 		}
862 		php_error_docref(NULL, E_WARNING, "Internal pcre_pattern_info() error %d", rc);
863 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
864 		return NULL;
865 	}
866 
867 	/*
868 	 * Interned strings are not duplicated when stored in HashTable,
869 	 * but all the interned strings created during HTTP request are removed
870 	 * at end of request. However PCRE_G(pcre_cache) must be consistent
871 	 * on the next request as well. So we disable usage of interned strings
872 	 * as hash keys especually for this table.
873 	 * See bug #63180
874 	 */
875 	if (!(GC_FLAGS(key) & IS_STR_PERMANENT) && !PCRE_G(per_request_cache)) {
876 		zend_string *str = zend_string_init(ZSTR_VAL(key), ZSTR_LEN(key), 1);
877 		GC_MAKE_PERSISTENT_LOCAL(str);
878 
879 		ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), str, &new_entry, sizeof(pcre_cache_entry));
880 		zend_string_release(str);
881 	} else {
882 		ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), key, &new_entry, sizeof(pcre_cache_entry));
883 	}
884 
885 	if (key != regex) {
886 		zend_string_release_ex(key, 0);
887 	}
888 
889 	return ret;
890 }
891 /* }}} */
892 
893 /* {{{ pcre_get_compiled_regex_cache */
pcre_get_compiled_regex_cache(zend_string * regex)894 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_string *regex)
895 {
896 	return pcre_get_compiled_regex_cache_ex(regex, 1);
897 }
898 /* }}} */
899 
900 /* {{{ pcre_get_compiled_regex */
pcre_get_compiled_regex(zend_string * regex,uint32_t * capture_count)901 PHPAPI pcre2_code *pcre_get_compiled_regex(zend_string *regex, uint32_t *capture_count)
902 {
903 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex);
904 
905 	if (capture_count) {
906 		*capture_count = pce ? pce->capture_count : 0;
907 	}
908 
909 	return pce ? pce->re : NULL;
910 }
911 /* }}} */
912 
913 /* XXX For the cases where it's only about match yes/no and no capture
914 		required, perhaps just a minimum sized data would suffice. */
php_pcre_create_match_data(uint32_t capture_count,pcre2_code * re)915 PHPAPI pcre2_match_data *php_pcre_create_match_data(uint32_t capture_count, pcre2_code *re)
916 {/*{{{*/
917 
918 	assert(NULL != re);
919 
920 	if (EXPECTED(!mdata_used)) {
921 		int rc = 0;
922 
923 		if (!capture_count) {
924 			/* As we deal with a non cached pattern, no other way to gather this info. */
925 			rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &capture_count);
926 		}
927 
928 		if (rc >= 0 && capture_count + 1 <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
929 			mdata_used = 1;
930 			return mdata;
931 		}
932 	}
933 
934 	return pcre2_match_data_create_from_pattern(re, gctx);
935 }/*}}}*/
936 
php_pcre_free_match_data(pcre2_match_data * match_data)937 PHPAPI void php_pcre_free_match_data(pcre2_match_data *match_data)
938 {/*{{{*/
939 	if (UNEXPECTED(match_data != mdata)) {
940 		pcre2_match_data_free(match_data);
941 	} else {
942 		mdata_used = 0;
943 	}
944 }/*}}}*/
945 
init_unmatched_null_pair(void)946 static void init_unmatched_null_pair(void) {
947 	zval val1, val2;
948 	ZVAL_NULL(&val1);
949 	ZVAL_LONG(&val2, -1);
950 	ZVAL_ARR(&PCRE_G(unmatched_null_pair), zend_new_pair(&val1, &val2));
951 }
952 
init_unmatched_empty_pair(void)953 static void init_unmatched_empty_pair(void) {
954 	zval val1, val2;
955 	ZVAL_EMPTY_STRING(&val1);
956 	ZVAL_LONG(&val2, -1);
957 	ZVAL_ARR(&PCRE_G(unmatched_empty_pair), zend_new_pair(&val1, &val2));
958 }
959 
populate_match_value_str(zval * val,const char * subject,PCRE2_SIZE start_offset,PCRE2_SIZE end_offset)960 static zend_always_inline void populate_match_value_str(
961 		zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset) {
962 	ZVAL_STRINGL_FAST(val, subject + start_offset, end_offset - start_offset);
963 }
964 
/* Fills `val` with the matched substring. A group whose start offset is
 * PCRE2_UNSET becomes NULL when unmatched_as_null is non-zero and an empty
 * string otherwise. */
static inline void populate_match_value(
		zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
		uint32_t unmatched_as_null) {
	if (start_offset != PCRE2_UNSET) {
		populate_match_value_str(val, subject, start_offset, end_offset);
		return;
	}

	if (unmatched_as_null) {
		ZVAL_NULL(val);
		return;
	}

	ZVAL_EMPTY_STRING(val);
}
978 
/* Store `val` under `name` in the `subpats` array and take a reference on it
 * when it was actually stored. */
static inline void add_named(
		zval *subpats, zend_string *name, zval *val, bool unmatched) {
	HashTable *target = Z_ARRVAL_P(subpats);

	/* With the DUPNAMES option several subpatterns may share one name. An
	 * unmatched group must therefore never replace an entry that is already
	 * present, while a matched group always wins. */
	if (unmatched) {
		if (zend_hash_add(target, name, val) == NULL) {
			/* Name already taken: nothing stored, no reference needed. */
			return;
		}
	} else {
		zend_hash_update(target, name, val);
	}

	Z_TRY_ADDREF_P(val);
}
992 
993 /* {{{ add_offset_pair */
add_offset_pair(zval * result,const char * subject,PCRE2_SIZE start_offset,PCRE2_SIZE end_offset,zend_string * name,uint32_t unmatched_as_null)994 static inline void add_offset_pair(
995 		zval *result, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
996 		zend_string *name, uint32_t unmatched_as_null)
997 {
998 	zval match_pair;
999 
1000 	/* Add (match, offset) to the return value */
1001 	if (PCRE2_UNSET == start_offset) {
1002 		if (unmatched_as_null) {
1003 			if (Z_ISUNDEF(PCRE_G(unmatched_null_pair))) {
1004 				init_unmatched_null_pair();
1005 			}
1006 			ZVAL_COPY(&match_pair, &PCRE_G(unmatched_null_pair));
1007 		} else {
1008 			if (Z_ISUNDEF(PCRE_G(unmatched_empty_pair))) {
1009 				init_unmatched_empty_pair();
1010 			}
1011 			ZVAL_COPY(&match_pair, &PCRE_G(unmatched_empty_pair));
1012 		}
1013 	} else {
1014 		zval val1, val2;
1015 		populate_match_value_str(&val1, subject, start_offset, end_offset);
1016 		ZVAL_LONG(&val2, start_offset);
1017 		ZVAL_ARR(&match_pair, zend_new_pair(&val1, &val2));
1018 	}
1019 
1020 	if (name) {
1021 		add_named(result, name, &match_pair, start_offset == PCRE2_UNSET);
1022 	}
1023 	zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair);
1024 }
1025 /* }}} */
1026 
/* Fill `subpats` with the groups of one match.
 *
 * The first `count` groups are taken from `offsets`. When
 * PREG_UNMATCHED_AS_NULL is set in `flags`, the remaining groups up to
 * `num_subpats` are padded with NULL (or a NULL offset pair when
 * PREG_OFFSET_CAPTURE is set). Groups that have a name in `subpat_names` are
 * stored under that name as well as under their index. A non-NULL `mark` is
 * added under the "MARK" key. */
static void populate_subpat_array(
		zval *subpats, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names,
		uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags) {
	bool offset_capture = (flags & PREG_OFFSET_CAPTURE) != 0;
	bool unmatched_as_null = (flags & PREG_UNMATCHED_AS_NULL) != 0;
	HashTable *target = Z_ARRVAL_P(subpats);
	zval val;
	int i;

	/* Groups reported by this match. */
	for (i = 0; i < count; i++) {
		zend_string *group_name = subpat_names ? subpat_names[i] : NULL;

		if (offset_capture) {
			add_offset_pair(
				subpats, subject, offsets[2*i], offsets[2*i+1],
				group_name, unmatched_as_null);
			continue;
		}

		populate_match_value(
			&val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null);
		if (group_name) {
			add_named(subpats, group_name, &val, offsets[2*i] == PCRE2_UNSET);
		}
		zend_hash_next_index_insert(target, &val);
	}

	/* Trailing groups the match did not report are only padded in NULL mode. */
	if (unmatched_as_null) {
		for (i = count; i < num_subpats; i++) {
			zend_string *group_name = subpat_names ? subpat_names[i] : NULL;

			if (offset_capture) {
				add_offset_pair(subpats, NULL, PCRE2_UNSET, PCRE2_UNSET, group_name, 1);
				continue;
			}

			ZVAL_NULL(&val);
			if (group_name) {
				/* Never overwrite a value stored by a same-named group. */
				zend_hash_add(target, group_name, &val);
			}
			zend_hash_next_index_insert(target, &val);
		}
	}

	/* Add MARK, if available */
	if (mark) {
		add_assoc_string_ex(subpats, "MARK", sizeof("MARK") - 1, (char *)mark);
	}
}
1094 
/* Shared entry point behind preg_match() (global == 0) and preg_match_all()
 * (global != 0): parses the PHP arguments, fetches the compiled pattern and
 * delegates to php_pcre_match_impl(). */
static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ */
{
	zend_string *pattern;				/* Regular expression */
	zend_string *haystack;				/* String to match against */
	zval *matches = NULL;				/* Array for subpatterns */
	zend_long match_flags = 0;			/* Match control flags */
	zend_long offset = 0;				/* Where the new search starts */
	pcre_cache_entry *cache_entry;		/* Compiled regular expression */

	ZEND_PARSE_PARAMETERS_START(2, 5)
		Z_PARAM_STR(pattern)
		Z_PARAM_STR(haystack)
		Z_PARAM_OPTIONAL
		Z_PARAM_ZVAL(matches)
		Z_PARAM_LONG(match_flags)
		Z_PARAM_LONG(offset)
	ZEND_PARSE_PARAMETERS_END();

	/* Compile regex or get it from cache. */
	cache_entry = pcre_get_compiled_regex_cache(pattern);
	if (cache_entry == NULL) {
		RETURN_FALSE;
	}

	/* ZEND_LONG_MIN cannot be negated when resolving a negative offset. */
	if (offset == ZEND_LONG_MIN) {
		zend_argument_value_error(5, "must be greater than " ZEND_LONG_FMT, ZEND_LONG_MIN);
		RETURN_THROWS();
	}

	/* Hold a reference on the cache entry for the duration of the match. */
	cache_entry->refcount++;
	php_pcre_match_impl(cache_entry, haystack, return_value, matches,
		global, ZEND_NUM_ARGS() >= 4, match_flags, offset);
	cache_entry->refcount--;
}
/* }}} */
1130 
is_known_valid_utf8(zend_string * subject_str,PCRE2_SIZE start_offset)1131 static zend_always_inline bool is_known_valid_utf8(
1132 		zend_string *subject_str, PCRE2_SIZE start_offset) {
1133 	if (!(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)) {
1134 		/* We don't know whether the string is valid UTF-8 or not. */
1135 		return 0;
1136 	}
1137 
1138 	if (start_offset == ZSTR_LEN(subject_str)) {
1139 		/* Degenerate case: Offset points to end of string. */
1140 		return 1;
1141 	}
1142 
1143 	/* Check that the offset does not point to an UTF-8 continuation byte. */
1144 	return (ZSTR_VAL(subject_str)[start_offset] & 0xc0) != 0x80;
1145 }
1146 
1147 /* {{{ php_pcre_match_impl() */
php_pcre_match_impl(pcre_cache_entry * pce,zend_string * subject_str,zval * return_value,zval * subpats,int global,int use_flags,zend_long flags,zend_off_t start_offset)1148 PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
1149 	zval *subpats, int global, int use_flags, zend_long flags, zend_off_t start_offset)
1150 {
1151 	zval			 result_set,		/* Holds a set of subpatterns after
1152 										   a global match */
1153 					*match_sets = NULL;	/* An array of sets of matches for each
1154 										   subpattern after a global match */
1155 	uint32_t		 options;			/* Execution options */
1156 	int				 count;				/* Count of matched subpatterns */
1157 	PCRE2_SIZE		*offsets;			/* Array of subpattern offsets */
1158 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
1159 	int				 matched;			/* Has anything matched */
1160 	zend_string	   **subpat_names;		/* Array for named subpatterns */
1161 	size_t			 i;
1162 	uint32_t		 subpats_order;		/* Order of subpattern matches */
1163 	uint32_t		 offset_capture;	/* Capture match offsets: yes/no */
1164 	uint32_t		 unmatched_as_null;	/* Null non-matches: yes/no */
1165 	PCRE2_SPTR       mark = NULL;		/* Target for MARK name */
1166 	zval			 marks;				/* Array of marks for PREG_PATTERN_ORDER */
1167 	pcre2_match_data *match_data;
1168 	PCRE2_SIZE		 start_offset2, orig_start_offset;
1169 
1170 	char *subject = ZSTR_VAL(subject_str);
1171 	size_t subject_len = ZSTR_LEN(subject_str);
1172 
1173 	ZVAL_UNDEF(&marks);
1174 
1175 	/* Overwrite the passed-in value for subpatterns with an empty array. */
1176 	if (subpats != NULL) {
1177 		subpats = zend_try_array_init(subpats);
1178 		if (!subpats) {
1179 			RETURN_THROWS();
1180 		}
1181 	}
1182 
1183 	subpats_order = global ? PREG_PATTERN_ORDER : 0;
1184 
1185 	if (use_flags) {
1186 		offset_capture = flags & PREG_OFFSET_CAPTURE;
1187 		unmatched_as_null = flags & PREG_UNMATCHED_AS_NULL;
1188 
1189 		/*
1190 		 * subpats_order is pre-set to pattern mode so we change it only if
1191 		 * necessary.
1192 		 */
1193 		if (flags & 0xff) {
1194 			subpats_order = flags & 0xff;
1195 		}
1196 		if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
1197 			(!global && subpats_order != 0)) {
1198 			zend_argument_value_error(4, "must be a PREG_* constant");
1199 			RETURN_THROWS();
1200 		}
1201 	} else {
1202 		offset_capture = 0;
1203 		unmatched_as_null = 0;
1204 	}
1205 
1206 	/* Negative offset counts from the end of the string. */
1207 	if (start_offset < 0) {
1208 		if ((PCRE2_SIZE)-start_offset <= subject_len) {
1209 			start_offset2 = subject_len + start_offset;
1210 		} else {
1211 			start_offset2 = 0;
1212 		}
1213 	} else {
1214 		start_offset2 = (PCRE2_SIZE)start_offset;
1215 	}
1216 
1217 	if (start_offset2 > subject_len) {
1218 		pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET);
1219 		RETURN_FALSE;
1220 	}
1221 
1222 	/* Calculate the size of the offsets array, and allocate memory for it. */
1223 	num_subpats = pce->capture_count + 1;
1224 
1225 	/*
1226 	 * Build a mapping from subpattern numbers to their names. We will
1227 	 * allocate the table only if there are any named subpatterns.
1228 	 */
1229 	subpat_names = NULL;
1230 	if (subpats && pce->name_count > 0) {
1231 		subpat_names = make_subpats_table(num_subpats, pce);
1232 		if (!subpat_names) {
1233 			RETURN_FALSE;
1234 		}
1235 	}
1236 
1237 	/* Allocate match sets array and initialize the values. */
1238 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
1239 		match_sets = (zval *)safe_emalloc(num_subpats, sizeof(zval), 0);
1240 		for (i=0; i<num_subpats; i++) {
1241 			array_init(&match_sets[i]);
1242 		}
1243 	}
1244 
1245 	matched = 0;
1246 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1247 
1248 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
1249 		match_data = mdata;
1250 	} else {
1251 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
1252 		if (!match_data) {
1253 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1254 			if (subpat_names) {
1255 				free_subpats_table(subpat_names, num_subpats);
1256 			}
1257 			if (match_sets) {
1258 				efree(match_sets);
1259 			}
1260 			RETURN_FALSE;
1261 		}
1262 	}
1263 
1264 	orig_start_offset = start_offset2;
1265 	options =
1266 		(pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, orig_start_offset)
1267 			? 0 : PCRE2_NO_UTF_CHECK;
1268 
1269 	/* Execute the regular expression. */
1270 #ifdef HAVE_PCRE_JIT_SUPPORT
1271 	if ((pce->preg_options & PREG_JIT) && options) {
1272 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1273 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1274 	} else
1275 #endif
1276 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1277 			options, match_data, mctx);
1278 
1279 	while (1) {
1280 		/* If something has matched */
1281 		if (count >= 0) {
1282 			/* Check for too many substrings condition. */
1283 			if (UNEXPECTED(count == 0)) {
1284 				php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings");
1285 				count = num_subpats;
1286 			}
1287 
1288 matched:
1289 			matched++;
1290 
1291 			offsets = pcre2_get_ovector_pointer(match_data);
1292 
1293 			/* If subpatterns array has been passed, fill it in with values. */
1294 			if (subpats != NULL) {
1295 				/* Try to get the list of substrings and display a warning if failed. */
1296 				if (offsets[1] < offsets[0]) {
1297 					if (subpat_names) {
1298 						free_subpats_table(subpat_names, num_subpats);
1299 					}
1300 					if (match_sets) efree(match_sets);
1301 					php_error_docref(NULL, E_WARNING, "Get subpatterns list failed");
1302 					RETURN_FALSE;
1303 				}
1304 
1305 				if (global) {	/* global pattern matching */
1306 					if (subpats && subpats_order == PREG_PATTERN_ORDER) {
1307 						/* For each subpattern, insert it into the appropriate array. */
1308 						if (offset_capture) {
1309 							for (i = 0; i < count; i++) {
1310 								add_offset_pair(
1311 									&match_sets[i], subject, offsets[2*i], offsets[2*i+1],
1312 									NULL, unmatched_as_null);
1313 							}
1314 						} else {
1315 							for (i = 0; i < count; i++) {
1316 								zval val;
1317 								populate_match_value(
1318 									&val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null);
1319 								zend_hash_next_index_insert_new(Z_ARRVAL(match_sets[i]), &val);
1320 							}
1321 						}
1322 						mark = pcre2_get_mark(match_data);
1323 						/* Add MARK, if available */
1324 						if (mark) {
1325 							if (Z_TYPE(marks) == IS_UNDEF) {
1326 								array_init(&marks);
1327 							}
1328 							add_index_string(&marks, matched - 1, (char *) mark);
1329 						}
1330 						/*
1331 						 * If the number of captured subpatterns on this run is
1332 						 * less than the total possible number, pad the result
1333 						 * arrays with NULLs or empty strings.
1334 						 */
1335 						if (count < num_subpats) {
1336 							for (; i < num_subpats; i++) {
1337 								if (offset_capture) {
1338 									add_offset_pair(
1339 										&match_sets[i], NULL, PCRE2_UNSET, PCRE2_UNSET,
1340 										NULL, unmatched_as_null);
1341 								} else if (unmatched_as_null) {
1342 									add_next_index_null(&match_sets[i]);
1343 								} else {
1344 									add_next_index_str(&match_sets[i], ZSTR_EMPTY_ALLOC());
1345 								}
1346 							}
1347 						}
1348 					} else {
1349 						/* Allocate and populate the result set array */
1350 						array_init_size(&result_set, count + (mark ? 1 : 0));
1351 						mark = pcre2_get_mark(match_data);
1352 						populate_subpat_array(
1353 							&result_set, subject, offsets, subpat_names,
1354 							num_subpats, count, mark, flags);
1355 						/* And add it to the output array */
1356 						zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &result_set);
1357 					}
1358 				} else {			/* single pattern matching */
1359 					/* For each subpattern, insert it into the subpatterns array. */
1360 					mark = pcre2_get_mark(match_data);
1361 					populate_subpat_array(
1362 						subpats, subject, offsets, subpat_names, num_subpats, count, mark, flags);
1363 					break;
1364 				}
1365 			}
1366 
1367 			/* Advance to the next piece. */
1368 			start_offset2 = offsets[1];
1369 
1370 			/* If we have matched an empty string, mimic what Perl's /g options does.
1371 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
1372 			   the match again at the same point. If this fails (picked up above) we
1373 			   advance to the next character. */
1374 			if (start_offset2 == offsets[0]) {
1375 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1376 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
1377 				if (count >= 0) {
1378 					if (global) {
1379 						goto matched;
1380 					} else {
1381 						break;
1382 					}
1383 				} else if (count == PCRE2_ERROR_NOMATCH) {
1384 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
1385 					   this is not necessarily the end. We need to advance
1386 					   the start offset, and continue. Fudge the offset values
1387 					   to achieve this, unless we're already at the end of the string. */
1388 					if (start_offset2 < subject_len) {
1389 						size_t unit_len = calculate_unit_length(pce, subject + start_offset2);
1390 
1391 						start_offset2 += unit_len;
1392 					} else {
1393 						break;
1394 					}
1395 				} else {
1396 					goto error;
1397 				}
1398 			}
1399 		} else if (count == PCRE2_ERROR_NOMATCH) {
1400 			break;
1401 		} else {
1402 error:
1403 			pcre_handle_exec_error(count);
1404 			break;
1405 		}
1406 
1407 		if (!global) {
1408 			break;
1409 		}
1410 
1411 		/* Execute the regular expression. */
1412 #ifdef HAVE_PCRE_JIT_SUPPORT
1413 		if ((pce->preg_options & PREG_JIT)) {
1414 			if (PCRE2_UNSET == start_offset2 || start_offset2 > subject_len) {
1415 				pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET);
1416 				break;
1417 			}
1418 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1419 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1420 		} else
1421 #endif
1422 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1423 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1424 	}
1425 	if (match_data != mdata) {
1426 		pcre2_match_data_free(match_data);
1427 	}
1428 
1429 	/* Add the match sets to the output array and clean up */
1430 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
1431 		if (subpat_names) {
1432 			for (i = 0; i < num_subpats; i++) {
1433 				if (subpat_names[i]) {
1434 					zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i], &match_sets[i]);
1435 					Z_ADDREF(match_sets[i]);
1436 				}
1437 				zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i]);
1438 			}
1439 		} else {
1440 			for (i = 0; i < num_subpats; i++) {
1441 				zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i]);
1442 			}
1443 		}
1444 		efree(match_sets);
1445 
1446 		if (Z_TYPE(marks) != IS_UNDEF) {
1447 			add_assoc_zval(subpats, "MARK", &marks);
1448 		}
1449 	}
1450 
1451 	if (subpat_names) {
1452 		free_subpats_table(subpat_names, num_subpats);
1453 	}
1454 
1455 	if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
1456 		/* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */
1457 		if ((pce->compile_options & PCRE2_UTF)
1458 				&& !ZSTR_IS_INTERNED(subject_str) && orig_start_offset == 0) {
1459 			GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8);
1460 		}
1461 
1462 		RETVAL_LONG(matched);
1463 	} else {
1464 		RETVAL_FALSE;
1465 	}
1466 }
1467 /* }}} */
1468 
1469 /* {{{ Perform a Perl-style regular expression match */
PHP_FUNCTION(preg_match)1470 PHP_FUNCTION(preg_match)
1471 {
1472 	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1473 }
1474 /* }}} */
1475 
1476 /* {{{ Perform a Perl-style global regular expression match */
PHP_FUNCTION(preg_match_all)1477 PHP_FUNCTION(preg_match_all)
1478 {
1479 	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1480 }
1481 /* }}} */
1482 
/* {{{ preg_get_backref */
/*
 * Parse a one- or two-digit backreference at *str.
 *
 * The character at *str is taken as the introducer (the callers in this file
 * only pass a pointer to '\\' or '$'). Accepted forms are <intro>N and
 * <intro>NN, plus ${N} and ${NN} when the introducer is '$'.
 *
 * On success returns 1, stores the group number in *backref and advances *str
 * past the reference. On failure returns 0 and leaves *str untouched; *backref
 * may already have been written (e.g. when a closing '}' is missing).
 */
static int preg_get_backref(char **str, int *backref)
{
	char *walk = *str;
	bool in_brace = false;

	/* An introducer that is the last character cannot start a reference. */
	if (walk[1] == 0) {
		return 0;
	}

	if (*walk == '$' && walk[1] == '{') {
		in_brace = true;
		walk++;
	}
	walk++;

	/* At least one digit is mandatory. */
	if (*walk < '0' || *walk > '9') {
		return 0;
	}
	*backref = *walk - '0';
	walk++;

	/* An optional second digit; a third one is left for the caller to copy. */
	if (*walk >= '0' && *walk <= '9') {
		*backref = *backref * 10 + *walk - '0';
		walk++;
	}

	if (in_brace) {
		if (*walk != '}') {
			return 0;
		}
		walk++;
	}

	*str = walk;
	return 1;
}
/* }}} */
1520 
1521 /* {{{ preg_do_repl_func */
preg_do_repl_func(zend_fcall_info * fci,zend_fcall_info_cache * fcc,const char * subject,PCRE2_SIZE * offsets,zend_string ** subpat_names,uint32_t num_subpats,int count,const PCRE2_SPTR mark,zend_long flags)1522 static zend_string *preg_do_repl_func(zend_fcall_info *fci, zend_fcall_info_cache *fcc, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names, uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags)
1523 {
1524 	zend_string *result_str;
1525 	zval		 retval;			/* Function return value */
1526 	zval	     arg;				/* Argument to pass to function */
1527 
1528 	array_init_size(&arg, count + (mark ? 1 : 0));
1529 	populate_subpat_array(&arg, subject, offsets, subpat_names, num_subpats, count, mark, flags);
1530 
1531 	fci->retval = &retval;
1532 	fci->param_count = 1;
1533 	fci->params = &arg;
1534 
1535 	if (zend_call_function(fci, fcc) == SUCCESS && Z_TYPE(retval) != IS_UNDEF) {
1536 		if (EXPECTED(Z_TYPE(retval) == IS_STRING)) {
1537 			result_str = Z_STR(retval);
1538 		} else {
1539 			result_str = zval_get_string_func(&retval);
1540 			zval_ptr_dtor(&retval);
1541 		}
1542 	} else {
1543 		if (!EG(exception)) {
1544 			php_error_docref(NULL, E_WARNING, "Unable to call custom replacement function");
1545 		}
1546 
1547 		result_str = zend_string_init(&subject[offsets[0]], offsets[1] - offsets[0], 0);
1548 	}
1549 
1550 	zval_ptr_dtor(&arg);
1551 
1552 	return result_str;
1553 }
1554 /* }}} */
1555 
1556 /* {{{ php_pcre_replace */
php_pcre_replace(zend_string * regex,zend_string * subject_str,const char * subject,size_t subject_len,zend_string * replace_str,size_t limit,size_t * replace_count)1557 PHPAPI zend_string *php_pcre_replace(zend_string *regex,
1558 							  zend_string *subject_str,
1559 							  const char *subject, size_t subject_len,
1560 							  zend_string *replace_str,
1561 							  size_t limit, size_t *replace_count)
1562 {
1563 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
1564 	zend_string	 		*result;			/* Function result */
1565 
1566 	/* Abort on pending exception, e.g. thrown from __toString(). */
1567 	if (UNEXPECTED(EG(exception))) {
1568 		return NULL;
1569 	}
1570 
1571 	/* Compile regex or get it from cache. */
1572 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
1573 		return NULL;
1574 	}
1575 	pce->refcount++;
1576 	result = php_pcre_replace_impl(pce, subject_str, subject, subject_len, replace_str,
1577 		limit, replace_count);
1578 	pce->refcount--;
1579 
1580 	return result;
1581 }
1582 /* }}} */
1583 
1584 /* {{{ php_pcre_replace_impl() */
php_pcre_replace_impl(pcre_cache_entry * pce,zend_string * subject_str,const char * subject,size_t subject_len,zend_string * replace_str,size_t limit,size_t * replace_count)1585 PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_string *replace_str, size_t limit, size_t *replace_count)
1586 {
1587 	uint32_t		 options;			/* Execution options */
1588 	int				 count;				/* Count of matched subpatterns */
1589 	PCRE2_SIZE		*offsets;			/* Array of subpattern offsets */
1590 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
1591 	size_t			 new_len;			/* Length of needed storage */
1592 	size_t			 alloc_len;			/* Actual allocated length */
1593 	size_t			 match_len;			/* Length of the current match */
1594 	int				 backref;			/* Backreference number */
1595 	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
1596 	size_t			 last_end_offset;	/* Where the last search ended */
1597 	char			*walkbuf,			/* Location of current replacement in the result */
1598 					*walk,				/* Used to walk the replacement string */
1599 					 walk_last;			/* Last walked character */
1600 	const char		*match,				/* The current match */
1601 					*piece,				/* The current piece of subject */
1602 					*replace_end;		/* End of replacement string */
1603 	size_t			result_len; 		/* Length of result */
1604 	zend_string		*result;			/* Result of replacement */
1605 	pcre2_match_data *match_data;
1606 
1607 	/* Calculate the size of the offsets array, and allocate memory for it. */
1608 	num_subpats = pce->capture_count + 1;
1609 	alloc_len = 0;
1610 	result = NULL;
1611 
1612 	/* Initialize */
1613 	match = NULL;
1614 	start_offset = 0;
1615 	last_end_offset = 0;
1616 	result_len = 0;
1617 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1618 
1619 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
1620 		match_data = mdata;
1621 	} else {
1622 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
1623 		if (!match_data) {
1624 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1625 			return NULL;
1626 		}
1627 	}
1628 
1629 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
1630 
1631 	/* Execute the regular expression. */
1632 #ifdef HAVE_PCRE_JIT_SUPPORT
1633 	if ((pce->preg_options & PREG_JIT) && options) {
1634 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1635 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1636 	} else
1637 #endif
1638 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1639 			options, match_data, mctx);
1640 
1641 	while (1) {
1642 		piece = subject + last_end_offset;
1643 
1644 		if (count >= 0 && limit > 0) {
1645 			bool simple_string;
1646 
1647 			/* Check for too many substrings condition. */
1648 			if (UNEXPECTED(count == 0)) {
1649 				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
1650 				count = num_subpats;
1651 			}
1652 
1653 matched:
1654 			offsets = pcre2_get_ovector_pointer(match_data);
1655 
1656 			if (UNEXPECTED(offsets[1] < offsets[0])) {
1657 				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1658 				if (result) {
1659 					zend_string_release_ex(result, 0);
1660 					result = NULL;
1661 				}
1662 				break;
1663 			}
1664 
1665 			if (replace_count) {
1666 				++*replace_count;
1667 			}
1668 
1669 			/* Set the match location in subject */
1670 			match = subject + offsets[0];
1671 
1672 			new_len = result_len + offsets[0] - last_end_offset; /* part before the match */
1673 
1674 			walk = ZSTR_VAL(replace_str);
1675 			replace_end = walk + ZSTR_LEN(replace_str);
1676 			walk_last = 0;
1677 			simple_string = 1;
1678 			while (walk < replace_end) {
1679 				if ('\\' == *walk || '$' == *walk) {
1680 					simple_string = 0;
1681 					if (walk_last == '\\') {
1682 						walk++;
1683 						walk_last = 0;
1684 						continue;
1685 					}
1686 					if (preg_get_backref(&walk, &backref)) {
1687 						if (backref < count)
1688 							new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
1689 						continue;
1690 					}
1691 				}
1692 				new_len++;
1693 				walk++;
1694 				walk_last = walk[-1];
1695 			}
1696 
1697 			if (new_len >= alloc_len) {
1698 				alloc_len = zend_safe_address_guarded(2, new_len, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD;
1699 				if (result == NULL) {
1700 					result = zend_string_alloc(alloc_len, 0);
1701 				} else {
1702 					result = zend_string_extend(result, alloc_len, 0);
1703 				}
1704 			}
1705 
1706 			if (match-piece > 0) {
1707 				/* copy the part of the string before the match */
1708 				memcpy(&ZSTR_VAL(result)[result_len], piece, match-piece);
1709 				result_len += (match-piece);
1710 			}
1711 
1712 			if (simple_string) {
1713 				/* copy replacement */
1714 				memcpy(&ZSTR_VAL(result)[result_len], ZSTR_VAL(replace_str), ZSTR_LEN(replace_str)+1);
1715 				result_len += ZSTR_LEN(replace_str);
1716 			} else {
1717 				/* copy replacement and backrefs */
1718 				walkbuf = ZSTR_VAL(result) + result_len;
1719 
1720 				walk = ZSTR_VAL(replace_str);
1721 				walk_last = 0;
1722 				while (walk < replace_end) {
1723 					if ('\\' == *walk || '$' == *walk) {
1724 						if (walk_last == '\\') {
1725 							*(walkbuf-1) = *walk++;
1726 							walk_last = 0;
1727 							continue;
1728 						}
1729 						if (preg_get_backref(&walk, &backref)) {
1730 							if (backref < count) {
1731 								if (offsets[backref<<1] < SIZE_MAX) {
1732 									match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1733 									memcpy(walkbuf, subject + offsets[backref<<1], match_len);
1734 									walkbuf += match_len;
1735 								}
1736 							}
1737 							continue;
1738 						}
1739 					}
1740 					*walkbuf++ = *walk++;
1741 					walk_last = walk[-1];
1742 				}
1743 				*walkbuf = '\0';
1744 				/* increment the result length by how much we've added to the string */
1745 				result_len += (walkbuf - (ZSTR_VAL(result) + result_len));
1746 			}
1747 
1748 			limit--;
1749 
1750 			/* Advance to the next piece. */
1751 			start_offset = last_end_offset = offsets[1];
1752 
1753 			/* If we have matched an empty string, mimic what Perl's /g options does.
1754 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
1755 			   the match again at the same point. If this fails (picked up above) we
1756 			   advance to the next character. */
1757 			if (start_offset == offsets[0]) {
1758 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1759 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
1760 
1761 				piece = subject + start_offset;
1762 				if (count >= 0 && limit > 0) {
1763 					goto matched;
1764 				} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
1765 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
1766 					   this is not necessarily the end. We need to advance
1767 					   the start offset, and continue. Fudge the offset values
1768 					   to achieve this, unless we're already at the end of the string. */
1769 					if (start_offset < subject_len) {
1770 						size_t unit_len = calculate_unit_length(pce, piece);
1771 						start_offset += unit_len;
1772 					} else {
1773 						goto not_matched;
1774 					}
1775 				} else {
1776 					goto error;
1777 				}
1778 			}
1779 
1780 		} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
1781 not_matched:
1782 			if (!result && subject_str) {
1783 				result = zend_string_copy(subject_str);
1784 				break;
1785 			}
1786 			/* now we know exactly how long it is */
1787 			alloc_len = result_len + subject_len - last_end_offset;
1788 			if (NULL != result) {
1789 				result = zend_string_realloc(result, alloc_len, 0);
1790 			} else {
1791 				result = zend_string_alloc(alloc_len, 0);
1792 			}
1793 			/* stick that last bit of string on our output */
1794 			memcpy(ZSTR_VAL(result) + result_len, piece, subject_len - last_end_offset);
1795 			result_len += subject_len - last_end_offset;
1796 			ZSTR_VAL(result)[result_len] = '\0';
1797 			ZSTR_LEN(result) = result_len;
1798 			break;
1799 		} else {
1800 error:
1801 			pcre_handle_exec_error(count);
1802 			if (result) {
1803 				zend_string_release_ex(result, 0);
1804 				result = NULL;
1805 			}
1806 			break;
1807 		}
1808 
1809 #ifdef HAVE_PCRE_JIT_SUPPORT
1810 		if (pce->preg_options & PREG_JIT) {
1811 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1812 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1813 		} else
1814 #endif
1815 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1816 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1817 	}
1818 	if (match_data != mdata) {
1819 		pcre2_match_data_free(match_data);
1820 	}
1821 
1822 	return result;
1823 }
1824 /* }}} */
1825 
1826 /* {{{ php_pcre_replace_func_impl() */
/*
 * Callback-driven replacement engine.
 *
 * Runs the compiled pattern `pce` over `subject` (`subject_len` bytes) and
 * substitutes every match with the string that preg_do_repl_func() obtains
 * from the user callback `fci`/`fcc`. At most `limit` matches are replaced;
 * whatever follows the last replaced match is copied through unchanged.
 * When `replace_count` is non-NULL it is incremented once per handled match.
 *
 * Returns
 *  - zend_string_copy(subject_str) when nothing was replaced and
 *    `subject_str` is non-NULL,
 *  - otherwise a newly built string holding the rewritten subject,
 *  - NULL when the subpattern name table or the match data cannot be
 *    created, when PCRE reports an end offset smaller than the start offset,
 *    or when matching fails with anything other than PCRE2_ERROR_NOMATCH.
 */
static zend_string *php_pcre_replace_func_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_fcall_info *fci, zend_fcall_info_cache *fcc, size_t limit, size_t *replace_count, zend_long flags)
{
	uint32_t          group_total = pce->capture_count + 1; /* whole match + capture groups */
	zend_string     **capture_names = NULL;  /* group number -> name, only for named groups */
	zend_string      *out = NULL;            /* output buffer, allocated lazily */
	zend_string      *cb_str;                /* string returned by the callback */
	size_t            capacity = 0;          /* bytes currently reserved in `out` */
	size_t            out_len = 0;           /* bytes already written to `out` */
	size_t            needed_len;            /* bytes required after the current match */
	size_t            copied_upto = 0;       /* subject offset consumed so far */
	PCRE2_SIZE        search_from = 0;       /* subject offset of the next match attempt */
	PCRE2_SIZE       *ovector;               /* offsets of the current match */
	const char       *hit = NULL;            /* start of the current match */
	const char       *tail;                  /* first subject byte not yet copied */
	pcre2_match_data *md;
	uint32_t          match_opts;
	int               rc;                    /* result of the last match call */
	bool              saved_mdata_used;

	/* The name table is built only for patterns that have named groups. */
	if (UNEXPECTED(pce->name_count > 0)) {
		capture_names = make_subpats_table(group_total, pce);
		if (!capture_names) {
			return NULL;
		}
	}

	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;

	/*
	 * Use the preallocated match data when it is free and large enough and
	 * mark it as taken; otherwise allocate a private one. Presumably the flag
	 * keeps preg_* calls made from inside the callback away from the shared
	 * block - the previous flag value is restored before returning.
	 */
	saved_mdata_used = mdata_used;
	if (saved_mdata_used || group_total > PHP_PCRE_PREALLOC_MDATA_SIZE) {
		md = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
		if (!md) {
			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
			if (capture_names) {
				free_subpats_table(capture_names, group_total);
			}
			mdata_used = saved_mdata_used;
			return NULL;
		}
	} else {
		mdata_used = 1;
		md = mdata;
	}

	/* Only this first attempt may run without PCRE2_NO_UTF_CHECK (UTF patterns);
	   every later attempt passes the flag unconditionally. */
	match_opts = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;

#ifdef HAVE_PCRE_JIT_SUPPORT
	if ((pce->preg_options & PREG_JIT) && match_opts) {
		rc = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
				PCRE2_NO_UTF_CHECK, md, mctx);
	} else
#endif
	rc = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
			match_opts, md, mctx);

	for (;;) {
		tail = subject + copied_upto;

		if (rc >= 0 && limit) {
			/* A zero result is reported as "too many substrings"; treat it as
			   if every group had been captured. */
			if (UNEXPECTED(rc == 0)) {
				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
				rc = group_total;
			}

matched:
			ovector = pcre2_get_ovector_pointer(md);

			/* A match that ends before it starts cannot be processed. */
			if (UNEXPECTED(ovector[1] < ovector[0])) {
				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
				if (out) {
					zend_string_release_ex(out, 0);
					out = NULL;
				}
				break;
			}

			if (replace_count) {
				*replace_count += 1;
			}

			hit = subject + ovector[0];

			/* Room for the untouched text in front of the match ... */
			needed_len = out_len + ovector[0] - copied_upto;

			/* ... plus whatever the callback hands back. */
			cb_str = preg_do_repl_func(
				fci, fcc, subject, ovector, capture_names, group_total, rc,
				pcre2_get_mark(md), flags);

			ZEND_ASSERT(cb_str);
			needed_len = zend_safe_address_guarded(1, ZSTR_LEN(cb_str) + ZSTR_MAX_OVERHEAD, needed_len) -ZSTR_MAX_OVERHEAD;
			if (needed_len >= capacity) {
				/* Reserve twice the required size so repeated matches do not
				   reallocate every time. */
				capacity = zend_safe_address_guarded(2, needed_len, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD;
				out = out
					? zend_string_extend(out, capacity, 0)
					: zend_string_alloc(capacity, 0);
			}

			if (hit - tail > 0) {
				/* Text between the previous match and this one. */
				size_t gap = (size_t)(hit - tail);

				memcpy(ZSTR_VAL(out) + out_len, tail, gap);
				out_len += gap;
			}

			/* Append the callback result and drop our reference to it. */
			memcpy(ZSTR_VAL(out) + out_len, ZSTR_VAL(cb_str), ZSTR_LEN(cb_str));
			out_len += ZSTR_LEN(cb_str);
			zend_string_release_ex(cb_str, 0);

			limit--;

			/* Continue right behind the match. */
			search_from = copied_upto = ovector[1];

			/* Empty match: retry at the same position, anchored and with
			   PCRE2_NOTEMPTY_ATSTART, so a non-empty match starting here is
			   still found. If that fails, step over one unit and carry on. */
			if (search_from == ovector[0]) {
				rc = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, md, mctx);

				tail = subject + search_from;
				if (rc >= 0 && limit) {
					goto matched;
				}
				if (rc != PCRE2_ERROR_NOMATCH && limit != 0) {
					goto error;
				}
				/* No match at this spot, or the limit is exhausted. */
				if (search_from >= subject_len) {
					goto not_matched;
				}
				search_from += calculate_unit_length(pce, tail);
			}

		} else if (rc == PCRE2_ERROR_NOMATCH || limit == 0) {
not_matched:
			/* Nothing was replaced at all: hand the subject back as is. */
			if (!out && subject_str) {
				out = zend_string_copy(subject_str);
				break;
			}
			{
				/* The final size is known now: shrink or allocate to fit. */
				size_t rest = subject_len - copied_upto;

				capacity = out_len + rest;
				if (out != NULL) {
					out = zend_string_realloc(out, capacity, 0);
				} else {
					out = zend_string_alloc(capacity, 0);
				}
				/* Append the remainder of the subject and terminate. */
				memcpy(ZSTR_VAL(out) + out_len, tail, rest);
				out_len += rest;
				ZSTR_VAL(out)[out_len] = '\0';
				ZSTR_LEN(out) = out_len;
			}
			break;
		} else {
error:
			pcre_handle_exec_error(rc);
			if (out) {
				zend_string_release_ex(out, 0);
				out = NULL;
			}
			break;
		}
#ifdef HAVE_PCRE_JIT_SUPPORT
		if ((pce->preg_options & PREG_JIT)) {
			rc = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
					PCRE2_NO_UTF_CHECK, md, mctx);
		} else
#endif
		rc = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
				PCRE2_NO_UTF_CHECK, md, mctx);
	}

	/* Free a private match data block and give the shared one back. */
	if (md != mdata) {
		pcre2_match_data_free(md);
	}
	mdata_used = saved_mdata_used;

	if (UNEXPECTED(capture_names)) {
		free_subpats_table(capture_names, group_total);
	}

	return out;
}
2037 /* }}} */
2038 
2039 /* {{{ php_pcre_replace_func */
php_pcre_replace_func(zend_string * regex,zend_string * subject_str,zend_fcall_info * fci,zend_fcall_info_cache * fcc,size_t limit,size_t * replace_count,zend_long flags)2040 static zend_always_inline zend_string *php_pcre_replace_func(zend_string *regex,
2041 							  zend_string *subject_str,
2042 							  zend_fcall_info *fci, zend_fcall_info_cache *fcc,
2043 							  size_t limit, size_t *replace_count, zend_long flags)
2044 {
2045 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
2046 	zend_string	 		*result;			/* Function result */
2047 
2048 	/* Compile regex or get it from cache. */
2049 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2050 		return NULL;
2051 	}
2052 	pce->refcount++;
2053 	result = php_pcre_replace_func_impl(
2054 		pce, subject_str, ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), fci, fcc,
2055 		limit, replace_count, flags);
2056 	pce->refcount--;
2057 
2058 	return result;
2059 }
2060 /* }}} */
2061 
2062 /* {{{ php_pcre_replace_array */
/*
 * Apply every pattern in `regex` to `subject_str`, one after another, each
 * step working on the output of the previous one.
 *
 * When `replace_ht` is given, the patterns are paired with its entries in
 * order (undefined slots are skipped); once the replacements run out, the
 * empty string is used. Without `replace_ht`, `replace_str` is used for every
 * pattern.
 *
 * Returns the final string, or NULL as soon as one replacement step fails.
 */
static zend_string *php_pcre_replace_array(HashTable *regex,
	zend_string *replace_str, HashTable *replace_ht,
	zend_string *subject_str, size_t limit, size_t *replace_count)
{
	zval     *pattern_zv;
	uint32_t  next_slot = 0; /* next position to inspect in replace_ht */

	/* Every iteration releases the subject it consumed, so take our own
	   reference on the caller's string first. */
	zend_string_addref(subject_str);

	if (!replace_ht) {
		ZEND_ASSERT(replace_str != NULL);
	}

	ZEND_HASH_FOREACH_VAL(regex, pattern_zv) {
		/* Patterns are used as strings, whatever their original type. */
		zend_string *pattern_tmp;
		zend_string *pattern = zval_get_tmp_string(pattern_zv, &pattern_tmp);
		zend_string *replacement = replace_str;
		zend_string *replacement_tmp = NULL;
		zend_string *next_subject;

		if (replace_ht) {
			bool have_entry = false;

			/* Fetch the next defined replacement, if any is left. */
			while (next_slot != replace_ht->nNumUsed) {
				zval *slot = ZEND_HASH_ELEMENT(replace_ht, next_slot);

				next_slot++;
				if (Z_TYPE_P(slot) != IS_UNDEF) {
					replacement = zval_get_tmp_string(slot, &replacement_tmp);
					have_entry = true;
					break;
				}
			}
			if (!have_entry) {
				replacement = ZSTR_EMPTY_ALLOC();
				replacement_tmp = NULL;
			}
		}

		/* Replace, then make the result the subject of the next round. */
		next_subject = php_pcre_replace(pattern, subject_str, ZSTR_VAL(subject_str),
			ZSTR_LEN(subject_str), replacement, limit, replace_count);
		zend_tmp_string_release(replacement_tmp);
		zend_tmp_string_release(pattern_tmp);
		zend_string_release_ex(subject_str, 0);
		subject_str = next_subject;
		if (UNEXPECTED(next_subject == NULL)) {
			break;
		}
	} ZEND_HASH_FOREACH_END();

	return subject_str;
}
2136 /* }}} */
2137 
2138 /* {{{ php_replace_in_subject */
php_replace_in_subject(zend_string * regex_str,HashTable * regex_ht,zend_string * replace_str,HashTable * replace_ht,zend_string * subject,size_t limit,size_t * replace_count)2139 static zend_always_inline zend_string *php_replace_in_subject(
2140 	zend_string *regex_str, HashTable *regex_ht,
2141 	zend_string *replace_str, HashTable *replace_ht,
2142 	zend_string *subject, size_t limit, size_t *replace_count)
2143 {
2144 	zend_string *result;
2145 
2146 	if (regex_str) {
2147 		ZEND_ASSERT(replace_str != NULL);
2148 		result = php_pcre_replace(regex_str, subject, ZSTR_VAL(subject), ZSTR_LEN(subject),
2149 			replace_str, limit, replace_count);
2150 	} else {
2151 		ZEND_ASSERT(regex_ht != NULL);
2152 		result = php_pcre_replace_array(regex_ht, replace_str, replace_ht, subject,
2153 			limit, replace_count);
2154 	}
2155 	return result;
2156 }
2157 /* }}} */
2158 
2159 /* {{{ php_replace_in_subject_func */
/*
 * Callback flavour of php_replace_in_subject(): replace inside one subject
 * using either a single pattern (`regex_str`) or, in turn, every pattern of
 * `regex_ht`, where each step operates on the output of the previous one.
 *
 * Returns the resulting string, or NULL once a replacement step fails.
 */
static zend_string *php_replace_in_subject_func(zend_string *regex_str, HashTable *regex_ht,
	zend_fcall_info *fci, zend_fcall_info_cache *fcc,
	zend_string *subject, size_t limit, size_t *replace_count, zend_long flags)
{
	zval *pattern_zv;

	/* Single pattern: nothing to chain. */
	if (regex_str) {
		return php_pcre_replace_func(
			regex_str, subject, fci, fcc, limit, replace_count, flags);
	}

	ZEND_ASSERT(regex_ht != NULL);

	/* The loop releases the subject it consumed on every round, so hold a
	   reference of our own on the caller's string. */
	zend_string_addref(subject);

	ZEND_HASH_FOREACH_VAL(regex_ht, pattern_zv) {
		/* Patterns are used as strings, whatever their original type. */
		zend_string *pattern_tmp;
		zend_string *pattern = zval_get_tmp_string(pattern_zv, &pattern_tmp);
		zend_string *next_subject = php_pcre_replace_func(
			pattern, subject, fci, fcc, limit, replace_count, flags);

		zend_tmp_string_release(pattern_tmp);
		zend_string_release(subject);
		subject = next_subject;
		if (UNEXPECTED(subject == NULL)) {
			break;
		}
	} ZEND_HASH_FOREACH_END();

	return subject;
}
2199 /* }}} */
2200 
2201 /* {{{ preg_replace_func_impl */
/*
 * Shared body of the callback-based replace functions.
 *
 * For a string subject, `return_value` becomes the replaced string, or NULL
 * when the replacement failed. For an array subject, `return_value` becomes
 * an array with the same keys holding the replaced strings; entries whose
 * replacement failed are left out.
 *
 * Returns the total number of replacements performed.
 */
static size_t preg_replace_func_impl(zval *return_value,
	zend_string *regex_str, HashTable *regex_ht,
	zend_fcall_info *fci, zend_fcall_info_cache *fcc,
	zend_string *subject_str, HashTable *subject_ht, zend_long limit_val, zend_long flags)
{
	size_t total = 0;
	zend_string *replaced;

	if (subject_str) {
		replaced = php_replace_in_subject_func(
			regex_str, regex_ht, fci, fcc, subject_str, limit_val, &total, flags);
		if (replaced == NULL) {
			RETVAL_NULL();
		} else {
			RETVAL_STR(replaced);
		}
		return total;
	}

	/* Array subject. */
	{
		zval        *entry_zv;
		zend_string *str_key;
		zend_ulong   int_key;

		ZEND_ASSERT(subject_ht != NULL);

		array_init_size(return_value, zend_hash_num_elements(subject_ht));

		/* Stringify each entry, run the replacement and store the outcome
		   under the entry's original key. */
		ZEND_HASH_FOREACH_KEY_VAL(subject_ht, int_key, str_key, entry_zv) {
			zend_string *entry_tmp;
			zend_string *entry_str = zval_get_tmp_string(entry_zv, &entry_tmp);

			replaced = php_replace_in_subject_func(
				regex_str, regex_ht, fci, fcc, entry_str, limit_val, &total, flags);
			if (replaced != NULL) {
				zval out_zv;

				ZVAL_STR(&out_zv, replaced);
				if (str_key) {
					zend_hash_add_new(Z_ARRVAL_P(return_value), str_key, &out_zv);
				} else {
					zend_hash_index_add_new(Z_ARRVAL_P(return_value), int_key, &out_zv);
				}
			}
			zend_tmp_string_release(entry_tmp);
		} ZEND_HASH_FOREACH_END();
	}

	return total;
}
2251 /* }}} */
2252 
2253 /* {{{ preg_replace_common */
/*
 * Shared implementation of preg_replace() and preg_filter().
 *
 * With `is_filter` set, a subject is only reported when at least one
 * replacement was made in it: a string subject yields NULL otherwise, and
 * array entries without a replacement are dropped from the result.
 * The optional by-reference count argument receives the total number of
 * replacements.
 */
static void preg_replace_common(INTERNAL_FUNCTION_PARAMETERS, bool is_filter)
{
	zend_string *regex_str, *replace_str, *subject_str;
	HashTable *regex_ht, *replace_ht, *subject_ht;
	zval *zcount = NULL;
	zend_long limit = -1;
	size_t replace_count = 0;
	zend_string *replaced;

	/* Fetch and validate the arguments. */
	ZEND_PARSE_PARAMETERS_START(3, 5)
		Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str)
		Z_PARAM_ARRAY_HT_OR_STR(replace_ht, replace_str)
		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
		Z_PARAM_OPTIONAL
		Z_PARAM_LONG(limit)
		Z_PARAM_ZVAL(zcount)
	ZEND_PARSE_PARAMETERS_END();

	/* An array of replacements only makes sense with an array of patterns. */
	if (replace_ht && !regex_ht) {
		zend_argument_type_error(1, "must be of type array when argument #2 ($replacement) is an array, string given");
		RETURN_THROWS();
	}

	if (subject_str) {
		size_t before = replace_count;

		replaced = php_replace_in_subject(regex_str, regex_ht, replace_str, replace_ht,
			subject_str, limit, &replace_count);
		if (replaced == NULL) {
			RETVAL_NULL();
		} else if (is_filter && replace_count <= before) {
			/* Filter mode and nothing was replaced: discard the copy. */
			zend_string_release_ex(replaced, 0);
			RETVAL_NULL();
		} else {
			RETVAL_STR(replaced);
		}
	} else {
		/* Array subject. */
		zval        *entry_zv;
		zend_string *str_key;
		zend_ulong   int_key;

		ZEND_ASSERT(subject_ht != NULL);

		array_init_size(return_value, zend_hash_num_elements(subject_ht));

		/* Stringify each entry, run the replacement and store the outcome
		   under the entry's original key. */
		ZEND_HASH_FOREACH_KEY_VAL(subject_ht, int_key, str_key, entry_zv) {
			size_t before = replace_count;
			zend_string *entry_tmp;
			zend_string *entry_str = zval_get_tmp_string(entry_zv, &entry_tmp);

			replaced = php_replace_in_subject(regex_str, regex_ht, replace_str, replace_ht,
				entry_str, limit, &replace_count);

			if (replaced != NULL) {
				if (is_filter && replace_count <= before) {
					/* Filter mode and this entry was left untouched. */
					zend_string_release_ex(replaced, 0);
				} else {
					zval out_zv;

					ZVAL_STR(&out_zv, replaced);
					if (str_key) {
						zend_hash_add_new(Z_ARRVAL_P(return_value), str_key, &out_zv);
					} else {
						zend_hash_index_add_new(Z_ARRVAL_P(return_value), int_key, &out_zv);
					}
				}
			}
			zend_tmp_string_release(entry_tmp);
		} ZEND_HASH_FOREACH_END();
	}

	if (zcount) {
		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
	}
}
2338 /* }}} */
2339 
2340 /* {{{ Perform Perl-style regular expression replacement. */
PHP_FUNCTION(preg_replace)2341 PHP_FUNCTION(preg_replace)
2342 {
2343 	preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, false);
2344 }
2345 /* }}} */
2346 
2347 /* {{{ Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback)2348 PHP_FUNCTION(preg_replace_callback)
2349 {
2350 	zval *zcount = NULL;
2351 	zend_string *regex_str;
2352 	HashTable *regex_ht;
2353 	zend_string *subject_str;
2354 	HashTable *subject_ht;
2355 	zend_long limit = -1, flags = 0;
2356 	size_t replace_count;
2357 	zend_fcall_info fci;
2358 	zend_fcall_info_cache fcc;
2359 
2360 	/* Get function parameters and do error-checking. */
2361 	ZEND_PARSE_PARAMETERS_START(3, 6)
2362 		Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str)
2363 		Z_PARAM_FUNC(fci, fcc)
2364 		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
2365 		Z_PARAM_OPTIONAL
2366 		Z_PARAM_LONG(limit)
2367 		Z_PARAM_ZVAL(zcount)
2368 		Z_PARAM_LONG(flags)
2369 	ZEND_PARSE_PARAMETERS_END();
2370 
2371 	replace_count = preg_replace_func_impl(return_value, regex_str, regex_ht,
2372 		&fci, &fcc,
2373 		subject_str, subject_ht, limit, flags);
2374 	if (zcount) {
2375 		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
2376 	}
2377 }
2378 /* }}} */
2379 
2380 /* {{{ Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback_array)2381 PHP_FUNCTION(preg_replace_callback_array)
2382 {
2383 	zval zv, *replace, *zcount = NULL;
2384 	HashTable *pattern, *subject_ht;
2385 	zend_string *subject_str, *str_idx_regex;
2386 	zend_long limit = -1, flags = 0;
2387 	size_t replace_count = 0;
2388 	zend_fcall_info fci;
2389 	zend_fcall_info_cache fcc;
2390 
2391 	/* Get function parameters and do error-checking. */
2392 	ZEND_PARSE_PARAMETERS_START(2, 5)
2393 		Z_PARAM_ARRAY_HT(pattern)
2394 		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
2395 		Z_PARAM_OPTIONAL
2396 		Z_PARAM_LONG(limit)
2397 		Z_PARAM_ZVAL(zcount)
2398 		Z_PARAM_LONG(flags)
2399 	ZEND_PARSE_PARAMETERS_END();
2400 
2401 	fci.size = sizeof(fci);
2402 	fci.object = NULL;
2403 	fci.named_params = NULL;
2404 
2405 	if (subject_ht) {
2406 		GC_TRY_ADDREF(subject_ht);
2407 	} else {
2408 		GC_TRY_ADDREF(subject_str);
2409 	}
2410 
2411 	ZEND_HASH_FOREACH_STR_KEY_VAL(pattern, str_idx_regex, replace) {
2412 		if (!zend_is_callable_ex(replace, NULL, 0, NULL, &fcc, NULL)) {
2413 			zend_argument_type_error(1, "must contain only valid callbacks");
2414 			goto error;
2415 		}
2416 		if (!str_idx_regex) {
2417 			zend_argument_type_error(1, "must contain only string patterns as keys");
2418 			goto error;
2419 		}
2420 
2421 		ZVAL_COPY_VALUE(&fci.function_name, replace);
2422 
2423 		replace_count += preg_replace_func_impl(&zv, str_idx_regex, /* regex_ht */ NULL, &fci, &fcc,
2424 			subject_str, subject_ht, limit, flags);
2425 		switch (Z_TYPE(zv)) {
2426 			case IS_ARRAY:
2427 				ZEND_ASSERT(subject_ht);
2428 				zend_array_release(subject_ht);
2429 				subject_ht = Z_ARR(zv);
2430 				break;
2431 			case IS_STRING:
2432 				ZEND_ASSERT(subject_str);
2433 				zend_string_release(subject_str);
2434 				subject_str = Z_STR(zv);
2435 				break;
2436 			case IS_NULL:
2437 				RETVAL_NULL();
2438 				goto error;
2439 			EMPTY_SWITCH_DEFAULT_CASE()
2440 		}
2441 
2442 		if (EG(exception)) {
2443 			goto error;
2444 		}
2445 	} ZEND_HASH_FOREACH_END();
2446 
2447 	if (zcount) {
2448 		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
2449 	}
2450 
2451 	if (subject_ht) {
2452 		RETVAL_ARR(subject_ht);
2453 		// Unset the type_flags of immutable arrays to prevent the VM from performing refcounting
2454 		if (GC_FLAGS(subject_ht) & IS_ARRAY_IMMUTABLE) {
2455 			Z_TYPE_FLAGS_P(return_value) = 0;
2456 		}
2457 		return;
2458 	} else {
2459 		RETURN_STR(subject_str);
2460 	}
2461 
2462 error:
2463 	if (subject_ht) {
2464 		zend_array_release(subject_ht);
2465 	} else {
2466 		zend_string_release(subject_str);
2467 	}
2468 }
2469 /* }}} */
2470 
2471 /* {{{ Perform Perl-style regular expression replacement and only return matches. */
PHP_FUNCTION(preg_filter)2472 PHP_FUNCTION(preg_filter)
2473 {
2474 	preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, true);
2475 }
2476 /* }}} */
2477 
2478 /* {{{ Split string into an array using a perl-style regular expression as a delimiter */
PHP_FUNCTION(preg_split)2479 PHP_FUNCTION(preg_split)
2480 {
2481 	zend_string			*regex;			/* Regular expression */
2482 	zend_string			*subject;		/* String to match against */
2483 	zend_long			 limit_val = -1;/* Integer value of limit */
2484 	zend_long			 flags = 0;		/* Match control flags */
2485 	pcre_cache_entry	*pce;			/* Compiled regular expression */
2486 
2487 	/* Get function parameters and do error checking */
2488 	ZEND_PARSE_PARAMETERS_START(2, 4)
2489 		Z_PARAM_STR(regex)
2490 		Z_PARAM_STR(subject)
2491 		Z_PARAM_OPTIONAL
2492 		Z_PARAM_LONG(limit_val)
2493 		Z_PARAM_LONG(flags)
2494 	ZEND_PARSE_PARAMETERS_END();
2495 
2496 	/* Compile regex or get it from cache. */
2497 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2498 		RETURN_FALSE;
2499 	}
2500 
2501 	pce->refcount++;
2502 	php_pcre_split_impl(pce, subject, return_value, limit_val, flags);
2503 	pce->refcount--;
2504 }
2505 /* }}} */
2506 
2507 /* {{{ php_pcre_split */
php_pcre_split_impl(pcre_cache_entry * pce,zend_string * subject_str,zval * return_value,zend_long limit_val,zend_long flags)2508 PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
2509 	zend_long limit_val, zend_long flags)
2510 {
2511 	PCRE2_SIZE		*offsets;			/* Array of subpattern offsets */
2512 	uint32_t		 options;			/* Execution options */
2513 	int				 count;				/* Count of matched subpatterns */
2514 	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
2515 	PCRE2_SIZE		 last_match_offset;	/* Location of last match */
2516 	uint32_t		 no_empty;			/* If NO_EMPTY flag is set */
2517 	uint32_t		 delim_capture; 	/* If delimiters should be captured */
2518 	uint32_t		 offset_capture;	/* If offsets should be captured */
2519 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
2520 	zval			 tmp;
2521 	pcre2_match_data *match_data;
2522 	char *subject = ZSTR_VAL(subject_str);
2523 
2524 	no_empty = flags & PREG_SPLIT_NO_EMPTY;
2525 	delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
2526 	offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
2527 
2528 	/* Initialize return value */
2529 	array_init(return_value);
2530 
2531 	/* Calculate the size of the offsets array, and allocate memory for it. */
2532 	num_subpats = pce->capture_count + 1;
2533 
2534 	/* Start at the beginning of the string */
2535 	start_offset = 0;
2536 	last_match_offset = 0;
2537 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
2538 
2539 	if (limit_val == -1) {
2540 		/* pass */
2541 	} else if (limit_val == 0) {
2542 		limit_val = -1;
2543 	} else if (limit_val <= 1) {
2544 		goto last;
2545 	}
2546 
2547 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
2548 		match_data = mdata;
2549 	} else {
2550 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
2551 		if (!match_data) {
2552 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2553 			zval_ptr_dtor(return_value);
2554 			RETURN_FALSE;
2555 		}
2556 	}
2557 
2558 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
2559 
2560 #ifdef HAVE_PCRE_JIT_SUPPORT
2561 	if ((pce->preg_options & PREG_JIT) && options) {
2562 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2563 				PCRE2_NO_UTF_CHECK, match_data, mctx);
2564 	} else
2565 #endif
2566 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2567 			options, match_data, mctx);
2568 
2569 	while (1) {
2570 		/* If something matched */
2571 		if (count >= 0) {
2572 			/* Check for too many substrings condition. */
2573 			if (UNEXPECTED(count == 0)) {
2574 				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
2575 				count = num_subpats;
2576 			}
2577 
2578 matched:
2579 			offsets = pcre2_get_ovector_pointer(match_data);
2580 
2581 			if (UNEXPECTED(offsets[1] < offsets[0])) {
2582 				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2583 				break;
2584 			}
2585 
2586 			if (!no_empty || offsets[0] != last_match_offset) {
2587 				if (offset_capture) {
2588 					/* Add (match, offset) pair to the return value */
2589 					add_offset_pair(
2590 						return_value, subject, last_match_offset, offsets[0],
2591 						NULL, 0);
2592 				} else {
2593 					/* Add the piece to the return value */
2594 					populate_match_value_str(&tmp, subject, last_match_offset, offsets[0]);
2595 					zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
2596 				}
2597 
2598 				/* One less left to do */
2599 				if (limit_val != -1)
2600 					limit_val--;
2601 			}
2602 
2603 			if (delim_capture) {
2604 				size_t i;
2605 				for (i = 1; i < count; i++) {
2606 					/* If we have matched a delimiter */
2607 					if (!no_empty || offsets[2*i] != offsets[2*i+1]) {
2608 						if (offset_capture) {
2609 							add_offset_pair(
2610 								return_value, subject, offsets[2*i], offsets[2*i+1], NULL, 0);
2611 						} else {
2612 							populate_match_value_str(&tmp, subject, offsets[2*i], offsets[2*i+1]);
2613 							zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
2614 						}
2615 					}
2616 				}
2617 			}
2618 
2619 			/* Advance to the position right after the last full match */
2620 			start_offset = last_match_offset = offsets[1];
2621 
2622 			/* If we have matched an empty string, mimic what Perl's /g options does.
2623 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
2624 			   the match again at the same point. If this fails (picked up above) we
2625 			   advance to the next character. */
2626 			if (start_offset == offsets[0]) {
2627 				/* Get next piece if no limit or limit not yet reached and something matched*/
2628 				if (limit_val != -1 && limit_val <= 1) {
2629 					break;
2630 				}
2631 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2632 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
2633 				if (count >= 0) {
2634 					goto matched;
2635 				} else if (count == PCRE2_ERROR_NOMATCH) {
2636 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
2637 					   this is not necessarily the end. We need to advance
2638 					   the start offset, and continue. Fudge the offset values
2639 					   to achieve this, unless we're already at the end of the string. */
2640 					if (start_offset < ZSTR_LEN(subject_str)) {
2641 						start_offset += calculate_unit_length(pce, subject + start_offset);
2642 					} else {
2643 						break;
2644 					}
2645 				} else {
2646 					goto error;
2647 				}
2648 			}
2649 
2650 		} else if (count == PCRE2_ERROR_NOMATCH) {
2651 			break;
2652 		} else {
2653 error:
2654 			pcre_handle_exec_error(count);
2655 			break;
2656 		}
2657 
2658 		/* Get next piece if no limit or limit not yet reached and something matched*/
2659 		if (limit_val != -1 && limit_val <= 1) {
2660 			break;
2661 		}
2662 
2663 #ifdef HAVE_PCRE_JIT_SUPPORT
2664 		if (pce->preg_options & PREG_JIT) {
2665 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2666 					PCRE2_NO_UTF_CHECK, match_data, mctx);
2667 		} else
2668 #endif
2669 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2670 				PCRE2_NO_UTF_CHECK, match_data, mctx);
2671 	}
2672 	if (match_data != mdata) {
2673 		pcre2_match_data_free(match_data);
2674 	}
2675 
2676 	if (PCRE_G(error_code) != PHP_PCRE_NO_ERROR) {
2677 		zval_ptr_dtor(return_value);
2678 		RETURN_FALSE;
2679 	}
2680 
2681 last:
2682 	start_offset = last_match_offset; /* the offset might have been incremented, but without further successful matches */
2683 
2684 	if (!no_empty || start_offset < ZSTR_LEN(subject_str)) {
2685 		if (offset_capture) {
2686 			/* Add the last (match, offset) pair to the return value */
2687 			add_offset_pair(return_value, subject, start_offset, ZSTR_LEN(subject_str), NULL, 0);
2688 		} else {
2689 			/* Add the last piece to the return value */
2690 			if (start_offset == 0) {
2691 				ZVAL_STR_COPY(&tmp, subject_str);
2692 			} else {
2693 				populate_match_value_str(&tmp, subject, start_offset, ZSTR_LEN(subject_str));
2694 			}
2695 			zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
2696 		}
2697 	}
2698 }
2699 /* }}} */
2700 
2701 /* {{{ Quote regular expression characters plus an optional character */
PHP_FUNCTION(preg_quote)2702 PHP_FUNCTION(preg_quote)
2703 {
2704 	zend_string *str;       		/* Input string argument */
2705 	zend_string	*delim = NULL;		/* Additional delimiter argument */
2706 	char		*in_str;			/* Input string */
2707 	char		*in_str_end;    	/* End of the input string */
2708 	zend_string	*out_str;			/* Output string with quoted characters */
2709 	size_t       extra_len;         /* Number of additional characters */
2710 	char 		*p,					/* Iterator for input string */
2711 				*q,					/* Iterator for output string */
2712 				 delim_char = '\0',	/* Delimiter character to be quoted */
2713 				 c;					/* Current character */
2714 
2715 	/* Get the arguments and check for errors */
2716 	ZEND_PARSE_PARAMETERS_START(1, 2)
2717 		Z_PARAM_STR(str)
2718 		Z_PARAM_OPTIONAL
2719 		Z_PARAM_STR_OR_NULL(delim)
2720 	ZEND_PARSE_PARAMETERS_END();
2721 
2722 	/* Nothing to do if we got an empty string */
2723 	if (ZSTR_LEN(str) == 0) {
2724 		RETURN_EMPTY_STRING();
2725 	}
2726 
2727 	in_str = ZSTR_VAL(str);
2728 	in_str_end = in_str + ZSTR_LEN(str);
2729 
2730 	if (delim) {
2731 		delim_char = ZSTR_VAL(delim)[0];
2732 	}
2733 
2734 	/* Go through the string and quote necessary characters */
2735 	extra_len = 0;
2736 	p = in_str;
2737 	do {
2738 		c = *p;
2739 		switch(c) {
2740 			case '.':
2741 			case '\\':
2742 			case '+':
2743 			case '*':
2744 			case '?':
2745 			case '[':
2746 			case '^':
2747 			case ']':
2748 			case '$':
2749 			case '(':
2750 			case ')':
2751 			case '{':
2752 			case '}':
2753 			case '=':
2754 			case '!':
2755 			case '>':
2756 			case '<':
2757 			case '|':
2758 			case ':':
2759 			case '-':
2760 			case '#':
2761 				extra_len++;
2762 				break;
2763 
2764 			case '\0':
2765 				extra_len+=3;
2766 				break;
2767 
2768 			default:
2769 				if (c == delim_char) {
2770 					extra_len++;
2771 				}
2772 				break;
2773 		}
2774 		p++;
2775 	} while (p != in_str_end);
2776 
2777 	if (extra_len == 0) {
2778 		RETURN_STR_COPY(str);
2779 	}
2780 
2781 	/* Allocate enough memory so that even if each character
2782 	   is quoted, we won't run out of room */
2783 	out_str = zend_string_safe_alloc(1, ZSTR_LEN(str), extra_len, 0);
2784 	q = ZSTR_VAL(out_str);
2785 	p = in_str;
2786 
2787 	do {
2788 		c = *p;
2789 		switch(c) {
2790 			case '.':
2791 			case '\\':
2792 			case '+':
2793 			case '*':
2794 			case '?':
2795 			case '[':
2796 			case '^':
2797 			case ']':
2798 			case '$':
2799 			case '(':
2800 			case ')':
2801 			case '{':
2802 			case '}':
2803 			case '=':
2804 			case '!':
2805 			case '>':
2806 			case '<':
2807 			case '|':
2808 			case ':':
2809 			case '-':
2810 			case '#':
2811 				*q++ = '\\';
2812 				*q++ = c;
2813 				break;
2814 
2815 			case '\0':
2816 				*q++ = '\\';
2817 				*q++ = '0';
2818 				*q++ = '0';
2819 				*q++ = '0';
2820 				break;
2821 
2822 			default:
2823 				if (c == delim_char) {
2824 					*q++ = '\\';
2825 				}
2826 				*q++ = c;
2827 				break;
2828 		}
2829 		p++;
2830 	} while (p != in_str_end);
2831 	*q = '\0';
2832 
2833 	RETURN_NEW_STR(out_str);
2834 }
2835 /* }}} */
2836 
2837 /* {{{ Searches array and returns entries which match regex */
PHP_FUNCTION(preg_grep)2838 PHP_FUNCTION(preg_grep)
2839 {
2840 	zend_string			*regex;			/* Regular expression */
2841 	zval				*input;			/* Input array */
2842 	zend_long			 flags = 0;		/* Match control flags */
2843 	pcre_cache_entry	*pce;			/* Compiled regular expression */
2844 
2845 	/* Get arguments and do error checking */
2846 	ZEND_PARSE_PARAMETERS_START(2, 3)
2847 		Z_PARAM_STR(regex)
2848 		Z_PARAM_ARRAY(input)
2849 		Z_PARAM_OPTIONAL
2850 		Z_PARAM_LONG(flags)
2851 	ZEND_PARSE_PARAMETERS_END();
2852 
2853 	/* Compile regex or get it from cache. */
2854 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2855 		RETURN_FALSE;
2856 	}
2857 
2858 	pce->refcount++;
2859 	php_pcre_grep_impl(pce, input, return_value, flags);
2860 	pce->refcount--;
2861 }
2862 /* }}} */
2863 
php_pcre_grep_impl(pcre_cache_entry * pce,zval * input,zval * return_value,zend_long flags)2864 PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, zend_long flags) /* {{{ */
2865 {
2866 	zval            *entry;             /* An entry in the input array */
2867 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
2868 	int				 count;				/* Count of matched subpatterns */
2869 	uint32_t		 options;			/* Execution options */
2870 	zend_string		*string_key;
2871 	zend_ulong		 num_key;
2872 	bool		 invert;			/* Whether to return non-matching
2873 										   entries */
2874 	pcre2_match_data *match_data;
2875 	invert = flags & PREG_GREP_INVERT ? 1 : 0;
2876 
2877 	/* Calculate the size of the offsets array, and allocate memory for it. */
2878 	num_subpats = pce->capture_count + 1;
2879 
2880 	/* Initialize return array */
2881 	array_init(return_value);
2882 
2883 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
2884 
2885 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
2886 		match_data = mdata;
2887 	} else {
2888 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
2889 		if (!match_data) {
2890 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2891 			return;
2892 		}
2893 	}
2894 
2895 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
2896 
2897 	/* Go through the input array */
2898 	ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(input), num_key, string_key, entry) {
2899 		zend_string *tmp_subject_str;
2900 		zend_string *subject_str = zval_get_tmp_string(entry, &tmp_subject_str);
2901 
2902 		/* Perform the match */
2903 #ifdef HAVE_PCRE_JIT_SUPPORT
2904 		if ((pce->preg_options & PREG_JIT) && options) {
2905 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0,
2906 					PCRE2_NO_UTF_CHECK, match_data, mctx);
2907 		} else
2908 #endif
2909 		count = pcre2_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0,
2910 				options, match_data, mctx);
2911 
2912 		/* If the entry fits our requirements */
2913 		if (count >= 0) {
2914 			/* Check for too many substrings condition. */
2915 			if (UNEXPECTED(count == 0)) {
2916 				php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings");
2917 			}
2918 			if (!invert) {
2919 				Z_TRY_ADDREF_P(entry);
2920 
2921 				/* Add to return array */
2922 				if (string_key) {
2923 					zend_hash_update(Z_ARRVAL_P(return_value), string_key, entry);
2924 				} else {
2925 					zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry);
2926 				}
2927 			}
2928 		} else if (count == PCRE2_ERROR_NOMATCH) {
2929 			if (invert) {
2930 				Z_TRY_ADDREF_P(entry);
2931 
2932 				/* Add to return array */
2933 				if (string_key) {
2934 					zend_hash_update(Z_ARRVAL_P(return_value), string_key, entry);
2935 				} else {
2936 					zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry);
2937 				}
2938 			}
2939 		} else {
2940 			pcre_handle_exec_error(count);
2941 			zend_tmp_string_release(tmp_subject_str);
2942 			break;
2943 		}
2944 
2945 		zend_tmp_string_release(tmp_subject_str);
2946 	} ZEND_HASH_FOREACH_END();
2947 	if (match_data != mdata) {
2948 		pcre2_match_data_free(match_data);
2949 	}
2950 }
2951 /* }}} */
2952 
2953 /* {{{ Returns the error code of the last regexp execution. */
PHP_FUNCTION(preg_last_error)2954 PHP_FUNCTION(preg_last_error)
2955 {
2956 	ZEND_PARSE_PARAMETERS_NONE();
2957 
2958 	RETURN_LONG(PCRE_G(error_code));
2959 }
2960 /* }}} */
2961 
2962 /* {{{ Returns the error message of the last regexp execution. */
PHP_FUNCTION(preg_last_error_msg)2963 PHP_FUNCTION(preg_last_error_msg)
2964 {
2965 	ZEND_PARSE_PARAMETERS_NONE();
2966 
2967 	RETURN_STRING(php_pcre_get_error_msg(PCRE_G(error_code)));
2968 }
2969 /* }}} */
2970 
/* {{{ module definition structures */

/* Module descriptor for the "pcre" extension. It ties together the function
   table, the lifecycle hooks, the info callback, the version string and the
   module-globals handlers. The initializer is positional; the per-field notes
   below follow the order the macro names suggest.
   NOTE(review): confirm field roles against the zend_module_entry definition. */
zend_module_entry pcre_module_entry = {
	STANDARD_MODULE_HEADER,
	"pcre",						/* extension name */
	ext_functions,				/* function table */
	PHP_MINIT(pcre),			/* module startup hook */
	PHP_MSHUTDOWN(pcre),		/* module shutdown hook */
	PHP_RINIT(pcre),			/* request startup hook */
	PHP_RSHUTDOWN(pcre),		/* request shutdown hook */
	PHP_MINFO(pcre),			/* module info hook */
	PHP_PCRE_VERSION,			/* extension version */
	PHP_MODULE_GLOBALS(pcre),	/* module globals descriptor */
	PHP_GINIT(pcre),			/* globals initializer */
	PHP_GSHUTDOWN(pcre),		/* globals destructor */
	NULL,						/* unused slot -- presumably the post-deactivate hook; verify */
	STANDARD_MODULE_PROPERTIES_EX
};

/* Emit the module lookup entry point only when COMPILE_DL_PCRE is defined */
#ifdef COMPILE_DL_PCRE
ZEND_GET_MODULE(pcre)
#endif

/* }}} */
2995 
2996 PHPAPI pcre2_match_context *php_pcre_mctx(void)
2997 {/*{{{*/
2998 	return mctx;
2999 }/*}}}*/
3000 
php_pcre_gctx(void)3001 PHPAPI pcre2_general_context *php_pcre_gctx(void)
3002 {/*{{{*/
3003 	return gctx;
3004 }/*}}}*/
3005 
php_pcre_cctx(void)3006 PHPAPI pcre2_compile_context *php_pcre_cctx(void)
3007 {/*{{{*/
3008 	return cctx;
3009 }/*}}}*/
3010 
php_pcre_pce_incref(pcre_cache_entry * pce)3011 PHPAPI void php_pcre_pce_incref(pcre_cache_entry *pce)
3012 {/*{{{*/
3013 	assert(NULL != pce);
3014 	pce->refcount++;
3015 }/*}}}*/
3016 
php_pcre_pce_decref(pcre_cache_entry * pce)3017 PHPAPI void php_pcre_pce_decref(pcre_cache_entry *pce)
3018 {/*{{{*/
3019 	assert(NULL != pce);
3020 	assert(0 != pce->refcount);
3021 	pce->refcount--;
3022 }/*}}}*/
3023 
php_pcre_pce_re(pcre_cache_entry * pce)3024 PHPAPI pcre2_code *php_pcre_pce_re(pcre_cache_entry *pce)
3025 {/*{{{*/
3026 	assert(NULL != pce);
3027 	return pce->re;
3028 }/*}}}*/
3029