xref: /php-src/ext/pcre/php_pcre.c (revision 25974414)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Andrei Zmievski <andrei@php.net>                             |
14    +----------------------------------------------------------------------+
15  */
16 
17 #include "php.h"
18 #include "php_ini.h"
19 #include "php_pcre.h"
20 #include "ext/standard/info.h"
21 #include "ext/standard/basic_functions.h"
22 #include "zend_smart_str.h"
23 #include "SAPI.h"
24 
25 #define PREG_PATTERN_ORDER			1
26 #define PREG_SET_ORDER				2
27 #define PREG_OFFSET_CAPTURE			(1<<8)
28 #define PREG_UNMATCHED_AS_NULL		(1<<9)
29 
30 #define	PREG_SPLIT_NO_EMPTY			(1<<0)
31 #define PREG_SPLIT_DELIM_CAPTURE	(1<<1)
32 #define PREG_SPLIT_OFFSET_CAPTURE	(1<<2)
33 
34 #define PREG_GREP_INVERT			(1<<0)
35 
36 #define PREG_JIT                    (1<<3)
37 
38 #define PCRE_CACHE_SIZE 4096
39 
40 #ifdef HAVE_PCRE_JIT_SUPPORT
41 #define PHP_PCRE_JIT_SUPPORT 1
42 #else
43 #define PHP_PCRE_JIT_SUPPORT 0
44 #endif
45 
46 char *php_pcre_version;
47 
48 #include "php_pcre_arginfo.h"
49 
50 struct _pcre_cache_entry {
51 	pcre2_code *re;
52 	/* Pointer is not NULL when there are named captures.
53 	 * Length is equal to capture_count + 1 to account for capture group 0. */
54 	zend_string **subpats_table;
55 	uint32_t preg_options;
56 	uint32_t capture_count;
57 	uint32_t compile_options;
58 	uint32_t refcount;
59 };
60 
61 PHPAPI ZEND_DECLARE_MODULE_GLOBALS(pcre)
62 
63 #ifdef HAVE_PCRE_JIT_SUPPORT
64 #define PCRE_JIT_STACK_MIN_SIZE (32 * 1024)
65 #define PCRE_JIT_STACK_MAX_SIZE (192 * 1024)
66 ZEND_TLS pcre2_jit_stack *jit_stack = NULL;
67 #endif
68 /* General context using (infallible) system allocator. */
69 ZEND_TLS pcre2_general_context *gctx = NULL;
/* These two are global per thread for now, though it is possible to use them
	per pattern: either copy them and use the copies in the pce, or have no
	global contexts at all and create them for every pce. */
73 ZEND_TLS pcre2_compile_context *cctx = NULL;
74 ZEND_TLS pcre2_match_context   *mctx = NULL;
75 ZEND_TLS pcre2_match_data      *mdata = NULL;
76 ZEND_TLS bool              mdata_used = 0;
77 ZEND_TLS uint8_t pcre2_init_ok = 0;
78 #if defined(ZTS) && defined(HAVE_PCRE_JIT_SUPPORT)
79 static MUTEX_T pcre_mt = NULL;
80 #define php_pcre_mutex_alloc() \
81 	if (tsrm_is_main_thread() && !pcre_mt) pcre_mt = tsrm_mutex_alloc();
82 #define php_pcre_mutex_free() \
83 	if (tsrm_is_main_thread() && pcre_mt) { tsrm_mutex_free(pcre_mt); pcre_mt = NULL; }
84 #define php_pcre_mutex_lock() tsrm_mutex_lock(pcre_mt);
85 #define php_pcre_mutex_unlock() tsrm_mutex_unlock(pcre_mt);
86 #else
87 #define php_pcre_mutex_alloc()
88 #define php_pcre_mutex_free()
89 #define php_pcre_mutex_lock()
90 #define php_pcre_mutex_unlock()
91 #endif
92 
93 ZEND_TLS HashTable char_tables;
94 
95 static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats, bool persistent);
96 
static void php_pcre_free_char_table(zval *data)
{/*{{{*/
	/* Destructor for char_tables entries: release the stored pointer with
	 * the persistent allocator. */
	pefree(Z_PTR_P(data), 1);
}/*}}}*/
102 
pcre_handle_exec_error(int pcre_code)103 static void pcre_handle_exec_error(int pcre_code) /* {{{ */
104 {
105 	int preg_code = 0;
106 
107 	switch (pcre_code) {
108 		case PCRE2_ERROR_MATCHLIMIT:
109 			preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
110 			break;
111 
112 		case PCRE2_ERROR_RECURSIONLIMIT:
113 			preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
114 			break;
115 
116 		case PCRE2_ERROR_BADUTFOFFSET:
117 			preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
118 			break;
119 
120 #ifdef HAVE_PCRE_JIT_SUPPORT
121 		case PCRE2_ERROR_JIT_STACKLIMIT:
122 			preg_code = PHP_PCRE_JIT_STACKLIMIT_ERROR;
123 			break;
124 #endif
125 
126 		default:
127 			if (pcre_code <= PCRE2_ERROR_UTF8_ERR1 && pcre_code >= PCRE2_ERROR_UTF8_ERR21) {
128 				preg_code = PHP_PCRE_BAD_UTF8_ERROR;
129 			} else  {
130 				preg_code = PHP_PCRE_INTERNAL_ERROR;
131 			}
132 			break;
133 	}
134 
135 	PCRE_G(error_code) = preg_code;
136 }
137 /* }}} */
138 
static const char *php_pcre_get_error_msg(php_pcre_error_code error_code) /* {{{ */
{
	/* Human-readable text for a preg error code; codes without a dedicated
	 * message fall back to "Unknown error". */
	const char *message = "Unknown error";

	switch (error_code) {
		case PHP_PCRE_NO_ERROR:
			message = "No error";
			break;
		case PHP_PCRE_INTERNAL_ERROR:
			message = "Internal error";
			break;
		case PHP_PCRE_BAD_UTF8_ERROR:
			message = "Malformed UTF-8 characters, possibly incorrectly encoded";
			break;
		case PHP_PCRE_BAD_UTF8_OFFSET_ERROR:
			message = "The offset did not correspond to the beginning of a valid UTF-8 code point";
			break;
		case PHP_PCRE_BACKTRACK_LIMIT_ERROR:
			message = "Backtrack limit exhausted";
			break;
		case PHP_PCRE_RECURSION_LIMIT_ERROR:
			message = "Recursion limit exhausted";
			break;

#ifdef HAVE_PCRE_JIT_SUPPORT
		case PHP_PCRE_JIT_STACKLIMIT_ERROR:
			message = "JIT stack limit exhausted";
			break;
#endif

		default:
			break;
	}

	return message;
}
164 /* }}} */
165 
static void php_free_pcre_cache(zval *data) /* {{{ */
{
	/* Destructor for the persistent cache: frees the name table (if any),
	 * the compiled pattern and finally the entry itself with free(). */
	pcre_cache_entry *entry = (pcre_cache_entry *) Z_PTR_P(data);

	if (entry == NULL) {
		return;
	}

	if (entry->subpats_table != NULL) {
		free_subpats_table(entry->subpats_table, entry->capture_count + 1, true);
	}
	pcre2_code_free(entry->re);
	free(entry);
}
176 /* }}} */
177 
static void php_efree_pcre_cache(zval *data) /* {{{ */
{
	/* Destructor for the per-request cache: same teardown as the persistent
	 * variant, but the name table and entry are released with the
	 * non-persistent allocator. */
	pcre_cache_entry *entry = (pcre_cache_entry *) Z_PTR_P(data);

	if (entry == NULL) {
		return;
	}

	if (entry->subpats_table != NULL) {
		free_subpats_table(entry->subpats_table, entry->capture_count + 1, false);
	}
	pcre2_code_free(entry->re);
	efree(entry);
}
188 /* }}} */
189 
/* PCRE2 allocation callback backed by the persistent allocator; the user
 * data pointer is not used. */
static void *php_pcre_malloc(PCRE2_SIZE size, void *data)
{
	void *block = pemalloc(size, 1);

	return block;
}
194 
/* PCRE2 deallocation callback matching php_pcre_malloc(); the user data
 * pointer is not used. */
static void php_pcre_free(void *block, void *data)
{
	pefree(block, 1);
}
199 
/* PCRE2 allocation callback backed by emalloc(); the user data pointer is
 * not used. */
static void *php_pcre_emalloc(PCRE2_SIZE size, void *data)
{
	void *block = emalloc(size);

	return block;
}
204 
/* PCRE2 deallocation callback matching php_pcre_emalloc(); the user data
 * pointer is not used. */
static void php_pcre_efree(void *block, void *data)
{
	efree(block);
}
209 
210 #ifdef PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
211 	/* pcre 10.38 needs PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK, disabled by default */
212 #define PHP_PCRE_DEFAULT_EXTRA_COPTIONS PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
213 #else
214 #define PHP_PCRE_DEFAULT_EXTRA_COPTIONS 0
215 #endif
216 
217 #define PHP_PCRE_PREALLOC_MDATA_SIZE 32
218 
php_pcre_init_pcre2(uint8_t jit)219 static void php_pcre_init_pcre2(uint8_t jit)
220 {/*{{{*/
221 	if (!gctx) {
222 		gctx = pcre2_general_context_create(php_pcre_malloc, php_pcre_free, NULL);
223 		if (!gctx) {
224 			pcre2_init_ok = 0;
225 			return;
226 		}
227 	}
228 
229 	if (!cctx) {
230 		cctx = pcre2_compile_context_create(gctx);
231 		if (!cctx) {
232 			pcre2_init_ok = 0;
233 			return;
234 		}
235 	}
236 
237 	pcre2_set_compile_extra_options(cctx, PHP_PCRE_DEFAULT_EXTRA_COPTIONS);
238 
239 	if (!mctx) {
240 		mctx = pcre2_match_context_create(gctx);
241 		if (!mctx) {
242 			pcre2_init_ok = 0;
243 			return;
244 		}
245 	}
246 
247 #ifdef HAVE_PCRE_JIT_SUPPORT
248 	if (jit && !jit_stack) {
249 		jit_stack = pcre2_jit_stack_create(PCRE_JIT_STACK_MIN_SIZE, PCRE_JIT_STACK_MAX_SIZE, gctx);
250 		if (!jit_stack) {
251 			pcre2_init_ok = 0;
252 			return;
253 		}
254 	}
255 #endif
256 
257 	if (!mdata) {
258 		mdata = pcre2_match_data_create(PHP_PCRE_PREALLOC_MDATA_SIZE, gctx);
259 		if (!mdata) {
260 			pcre2_init_ok = 0;
261 			return;
262 		}
263 	}
264 
265 	pcre2_init_ok = 1;
266 }/*}}}*/
267 
php_pcre_shutdown_pcre2(void)268 static void php_pcre_shutdown_pcre2(void)
269 {/*{{{*/
270 	if (gctx) {
271 		pcre2_general_context_free(gctx);
272 		gctx = NULL;
273 	}
274 
275 	if (cctx) {
276 		pcre2_compile_context_free(cctx);
277 		cctx = NULL;
278 	}
279 
280 	if (mctx) {
281 		pcre2_match_context_free(mctx);
282 		mctx = NULL;
283 	}
284 
285 #ifdef HAVE_PCRE_JIT_SUPPORT
286 	/* Stack may only be destroyed when no cached patterns
287 	 	possibly associated with it do exist. */
288 	if (jit_stack) {
289 		pcre2_jit_stack_free(jit_stack);
290 		jit_stack = NULL;
291 	}
292 #endif
293 
294 	if (mdata) {
295 		pcre2_match_data_free(mdata);
296 		mdata = NULL;
297 	}
298 
299 	pcre2_init_ok = 0;
300 }/*}}}*/
301 
PHP_GINIT_FUNCTION(pcre)302 static PHP_GINIT_FUNCTION(pcre) /* {{{ */
303 {
304 	php_pcre_mutex_alloc();
305 
306 	/* If we're on the CLI SAPI, there will only be one request, so we don't need the
307 	 * cache to survive after RSHUTDOWN. */
308 	pcre_globals->per_request_cache = strcmp(sapi_module.name, "cli") == 0;
309 	if (!pcre_globals->per_request_cache) {
310 		zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
311 	}
312 
313 	pcre_globals->backtrack_limit = 0;
314 	pcre_globals->recursion_limit = 0;
315 	pcre_globals->error_code      = PHP_PCRE_NO_ERROR;
316 	ZVAL_UNDEF(&pcre_globals->unmatched_null_pair);
317 	ZVAL_UNDEF(&pcre_globals->unmatched_empty_pair);
318 #ifdef HAVE_PCRE_JIT_SUPPORT
319 	pcre_globals->jit = 1;
320 #endif
321 
322 	php_pcre_init_pcre2(1);
323 	zend_hash_init(&char_tables, 1, NULL, php_pcre_free_char_table, 1);
324 }
325 /* }}} */
326 
PHP_GSHUTDOWN_FUNCTION(pcre)327 static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
328 {
329 	if (!pcre_globals->per_request_cache) {
330 		zend_hash_destroy(&pcre_globals->pcre_cache);
331 	}
332 
333 	php_pcre_shutdown_pcre2();
334 	zend_hash_destroy(&char_tables);
335 	php_pcre_mutex_free();
336 }
337 /* }}} */
338 
PHP_INI_MH(OnUpdateBacktrackLimit)339 static PHP_INI_MH(OnUpdateBacktrackLimit)
340 {/*{{{*/
341 	OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
342 	if (mctx) {
343 		pcre2_set_match_limit(mctx, (uint32_t)PCRE_G(backtrack_limit));
344 	}
345 
346 	return SUCCESS;
347 }/*}}}*/
348 
PHP_INI_MH(OnUpdateRecursionLimit)349 static PHP_INI_MH(OnUpdateRecursionLimit)
350 {/*{{{*/
351 	OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
352 	if (mctx) {
353 		pcre2_set_depth_limit(mctx, (uint32_t)PCRE_G(recursion_limit));
354 	}
355 
356 	return SUCCESS;
357 }/*}}}*/
358 
359 #ifdef HAVE_PCRE_JIT_SUPPORT
PHP_INI_MH(OnUpdateJit)360 static PHP_INI_MH(OnUpdateJit)
361 {/*{{{*/
362 	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
363 	if (PCRE_G(jit) && jit_stack) {
364 		pcre2_jit_stack_assign(mctx, NULL, jit_stack);
365 	} else {
366 		pcre2_jit_stack_assign(mctx, NULL, NULL);
367 	}
368 
369 	return SUCCESS;
370 }/*}}}*/
371 #endif
372 
373 PHP_INI_BEGIN()
374 	STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateBacktrackLimit, backtrack_limit, zend_pcre_globals, pcre_globals)
375 	STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000",  PHP_INI_ALL, OnUpdateRecursionLimit, recursion_limit, zend_pcre_globals, pcre_globals)
376 #ifdef HAVE_PCRE_JIT_SUPPORT
377 	STD_PHP_INI_BOOLEAN("pcre.jit",           "1",       PHP_INI_ALL, OnUpdateJit,            jit,             zend_pcre_globals, pcre_globals)
378 #endif
PHP_INI_END()379 PHP_INI_END()
380 
/* Fetches a string-valued PCRE2 build-time setting.
 *
 * what: one of the PCRE2_CONFIG_* selectors that yields a string.
 *
 * Returns a NUL-terminated copy allocated with malloc() that the caller must
 * release with free(), or NULL when the setting is unavailable, the query
 * fails, or memory cannot be allocated. */
static char *_pcre2_config_str(uint32_t what)
{/*{{{*/
	/* With a NULL output buffer pcre2_config() only reports the required
	 * length. A negative value is an error code and must never be used as
	 * an allocation size. */
	int len = pcre2_config(what, NULL);
	if (len <= 0) {
		return NULL;
	}

	char *ret = malloc((size_t) len + 1);
	if (ret == NULL) {
		return NULL;
	}

	/* The second call fills the buffer; treat any non-positive result as a
	 * failure so an uninitialized buffer is never returned. */
	len = pcre2_config(what, ret);
	if (len <= 0) {
		free(ret);
		return NULL;
	}

	return ret;
}/*}}}*/
394 
395 /* {{{ PHP_MINFO_FUNCTION(pcre) */
PHP_MINFO_FUNCTION(pcre)396 static PHP_MINFO_FUNCTION(pcre)
397 {
398 #ifdef HAVE_PCRE_JIT_SUPPORT
399 	uint32_t flag = 0;
400 	char *jit_target = _pcre2_config_str(PCRE2_CONFIG_JITTARGET);
401 #endif
402 	char *version = _pcre2_config_str(PCRE2_CONFIG_VERSION);
403 	char *unicode = _pcre2_config_str(PCRE2_CONFIG_UNICODE_VERSION);
404 
405 	php_info_print_table_start();
406 	php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled" );
407 	php_info_print_table_row(2, "PCRE Library Version", version);
408 	free(version);
409 	php_info_print_table_row(2, "PCRE Unicode Version", unicode);
410 	free(unicode);
411 
412 #ifdef HAVE_PCRE_JIT_SUPPORT
413 	if (!pcre2_config(PCRE2_CONFIG_JIT, &flag)) {
414 		php_info_print_table_row(2, "PCRE JIT Support", flag ? "enabled" : "disabled");
415 	} else {
416 		php_info_print_table_row(2, "PCRE JIT Support", "unknown" );
417 	}
418 	if (jit_target) {
419 		php_info_print_table_row(2, "PCRE JIT Target", jit_target);
420 	}
421 	free(jit_target);
422 #else
423 	php_info_print_table_row(2, "PCRE JIT Support", "not compiled in" );
424 #endif
425 
426 #ifdef HAVE_PCRE_VALGRIND_SUPPORT
427 	php_info_print_table_row(2, "PCRE Valgrind Support", "enabled" );
428 #endif
429 
430 	php_info_print_table_end();
431 
432 	DISPLAY_INI_ENTRIES();
433 }
434 /* }}} */
435 
436 /* {{{ PHP_MINIT_FUNCTION(pcre) */
PHP_MINIT_FUNCTION(pcre)437 static PHP_MINIT_FUNCTION(pcre)
438 {
439 #ifdef HAVE_PCRE_JIT_SUPPORT
440 	if (UNEXPECTED(!pcre2_init_ok)) {
441 		/* Retry. */
442 		php_pcre_init_pcre2(PCRE_G(jit));
443 		if (!pcre2_init_ok) {
444 			return FAILURE;
445 		}
446 	}
447 #endif
448 
449 	REGISTER_INI_ENTRIES();
450 
451 	php_pcre_version = _pcre2_config_str(PCRE2_CONFIG_VERSION);
452 
453 	register_php_pcre_symbols(module_number);
454 
455 	return SUCCESS;
456 }
457 /* }}} */
458 
459 /* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */
PHP_MSHUTDOWN_FUNCTION(pcre)460 static PHP_MSHUTDOWN_FUNCTION(pcre)
461 {
462 	UNREGISTER_INI_ENTRIES();
463 
464 	free(php_pcre_version);
465 
466 	return SUCCESS;
467 }
468 /* }}} */
469 
470 /* {{{ PHP_RINIT_FUNCTION(pcre) */
PHP_RINIT_FUNCTION(pcre)471 static PHP_RINIT_FUNCTION(pcre)
472 {
473 #ifdef HAVE_PCRE_JIT_SUPPORT
474 	if (UNEXPECTED(!pcre2_init_ok)) {
475 		/* Retry. */
476 		php_pcre_mutex_lock();
477 		php_pcre_init_pcre2(PCRE_G(jit));
478 		if (!pcre2_init_ok) {
479 			php_pcre_mutex_unlock();
480 			return FAILURE;
481 		}
482 		php_pcre_mutex_unlock();
483 	}
484 
485 	mdata_used = 0;
486 #endif
487 
488 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
489 	PCRE_G(gctx_zmm) = pcre2_general_context_create(php_pcre_emalloc, php_pcre_efree, NULL);
490 	if (!PCRE_G(gctx_zmm)) {
491 		return FAILURE;
492 	}
493 
494 	if (PCRE_G(per_request_cache)) {
495 		zend_hash_init(&PCRE_G(pcre_cache), 0, NULL, php_efree_pcre_cache, 0);
496 	}
497 
498 	return SUCCESS;
499 }
500 /* }}} */
501 
PHP_RSHUTDOWN_FUNCTION(pcre)502 static PHP_RSHUTDOWN_FUNCTION(pcre)
503 {
504 	pcre2_general_context_free(PCRE_G(gctx_zmm));
505 	PCRE_G(gctx_zmm) = NULL;
506 
507 	if (PCRE_G(per_request_cache)) {
508 		zend_hash_destroy(&PCRE_G(pcre_cache));
509 	}
510 
511 	zval_ptr_dtor(&PCRE_G(unmatched_null_pair));
512 	zval_ptr_dtor(&PCRE_G(unmatched_empty_pair));
513 	ZVAL_UNDEF(&PCRE_G(unmatched_null_pair));
514 	ZVAL_UNDEF(&PCRE_G(unmatched_empty_pair));
515 	return SUCCESS;
516 }
517 
518 /* {{{ static pcre_clean_cache */
static int pcre_clean_cache(zval *data, void *arg)
{
	/* Hash apply callback. arg points to the number of entries that may
	 * still be evicted; entries with a non-zero refcount are always kept. */
	int *remaining = (int *) arg;
	pcre_cache_entry *entry = (pcre_cache_entry *) Z_PTR_P(data);

	if (*remaining <= 0 || entry->refcount) {
		return ZEND_HASH_APPLY_KEEP;
	}

	*remaining -= 1;
	return ZEND_HASH_APPLY_REMOVE;
}
531 /* }}} */
532 
/* Releases every non-NULL name in a subpattern name table of num_subpats
 * slots, then the table itself. persistent selects the allocator family. */
static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats, bool persistent) {
	zend_string **slot = subpat_names;
	zend_string **const end = subpat_names + num_subpats;

	for (; slot < end; slot++) {
		if (*slot != NULL) {
			zend_string_release_ex(*slot, persistent);
		}
	}

	pefree(subpat_names, persistent);
}
542 
543 /* {{{ static make_subpats_table */
make_subpats_table(uint32_t name_cnt,pcre_cache_entry * pce,bool persistent)544 static zend_string **make_subpats_table(uint32_t name_cnt, pcre_cache_entry *pce, bool persistent)
545 {
546 	uint32_t num_subpats = pce->capture_count + 1;
547 	uint32_t name_size, ni = 0;
548 	char *name_table;
549 	zend_string **subpat_names;
550 	int rc1, rc2;
551 
552 	rc1 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMETABLE, &name_table);
553 	rc2 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMEENTRYSIZE, &name_size);
554 	if (rc1 < 0 || rc2 < 0) {
555 		php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc1 < 0 ? rc1 : rc2);
556 		return NULL;
557 	}
558 
559 	subpat_names = pecalloc(num_subpats, sizeof(zend_string *), persistent);
560 	while (ni++ < name_cnt) {
561 		unsigned short name_idx = 0x100 * (unsigned char)name_table[0] + (unsigned char)name_table[1];
562 		const char *name = name_table + 2;
563 		/* Note: this makes a persistent string when the cache is not request-based because the string
564 		 * has to outlive the request. In that case, they will only be used within this thread
565 		 * and never be shared.
566 		 * Although we will be storing them in user-exposed arrays, they cannot cause problems
567 		 * because they only live in this thread and the last reference is deleted on shutdown
568 		 * instead of by user code. */
569 		subpat_names[name_idx] = zend_string_init(name, strlen(name), persistent);
570 		if (persistent) {
571 			GC_MAKE_PERSISTENT_LOCAL(subpat_names[name_idx]);
572 		}
573 		name_table += name_size;
574 	}
575 	return subpat_names;
576 }
577 /* }}} */
578 
579 /* {{{ static calculate_unit_length */
580 /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE2_UTF. */
calculate_unit_length(pcre_cache_entry * pce,const char * start)581 static zend_always_inline size_t calculate_unit_length(pcre_cache_entry *pce, const char *start)
582 {
583 	size_t unit_len;
584 
585 	if (pce->compile_options & PCRE2_UTF) {
586 		const char *end = start;
587 
588 		/* skip continuation bytes */
589 		while ((*++end & 0xC0) == 0x80);
590 		unit_len = end - start;
591 	} else {
592 		unit_len = 1;
593 	}
594 	return unit_len;
595 }
596 /* }}} */
597 
/* {{{ pcre_get_compiled_regex_cache_ex */
pcre_get_compiled_regex_cache_ex(zend_string * regex,bool locale_aware)599 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bool locale_aware)
600 {
601 	pcre2_code			*re = NULL;
602 #if 10 == PCRE2_MAJOR && 37 == PCRE2_MINOR && !defined(HAVE_BUNDLED_PCRE)
603 	uint32_t			 coptions = PCRE2_NO_START_OPTIMIZE;
604 #else
605 	uint32_t			 coptions = 0;
606 #endif
607 	uint32_t			 eoptions = PHP_PCRE_DEFAULT_EXTRA_COPTIONS;
608 	PCRE2_UCHAR	         error[128];
609 	PCRE2_SIZE           erroffset;
610 	int                  errnumber;
611 	char				 delimiter;
612 	char				 start_delimiter;
613 	char				 end_delimiter;
614 	char				*p, *pp;
615 	char				*pattern;
616 	size_t				 pattern_len;
617 	uint32_t			 poptions = 0;
618 	const uint8_t       *tables = NULL;
619 	zval                *zv;
620 	pcre_cache_entry	 new_entry;
621 	int					 rc;
622 	zend_string 		*key;
623 	pcre_cache_entry	*ret;
624 
625 	if (locale_aware && BG(ctype_string)) {
626 		key = zend_string_concat2(
627 			ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)),
628 			ZSTR_VAL(regex), ZSTR_LEN(regex));
629 	} else {
630 		key = regex;
631 	}
632 
633 	/* Try to lookup the cached regex entry, and if successful, just pass
634 	   back the compiled pattern, otherwise go on and compile it. */
635 	zv = zend_hash_find(&PCRE_G(pcre_cache), key);
636 	if (zv) {
637 		if (key != regex) {
638 			zend_string_release_ex(key, 0);
639 		}
640 		return (pcre_cache_entry*)Z_PTR_P(zv);
641 	}
642 
643 	p = ZSTR_VAL(regex);
644 	const char* end_p = ZSTR_VAL(regex) + ZSTR_LEN(regex);
645 
646 	/* Parse through the leading whitespace, and display a warning if we
647 	   get to the end without encountering a delimiter. */
648 	while (isspace((int)*(unsigned char *)p)) p++;
649 	if (p >= end_p) {
650 		if (key != regex) {
651 			zend_string_release_ex(key, 0);
652 		}
653 		php_error_docref(NULL, E_WARNING, "Empty regular expression");
654 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
655 		return NULL;
656 	}
657 
658 	/* Get the delimiter and display a warning if it is alphanumeric
659 	   or a backslash. */
660 	delimiter = *p++;
661 	if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\' || delimiter == '\0') {
662 		if (key != regex) {
663 			zend_string_release_ex(key, 0);
664 		}
665 		php_error_docref(NULL, E_WARNING, "Delimiter must not be alphanumeric, backslash, or NUL byte");
666 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
667 		return NULL;
668 	}
669 
670 	start_delimiter = delimiter;
671 	if ((pp = strchr("([{< )]}> )]}>", delimiter)))
672 		delimiter = pp[5];
673 	end_delimiter = delimiter;
674 
675 	pp = p;
676 
677 	if (start_delimiter == end_delimiter) {
678 		/* We need to iterate through the pattern, searching for the ending delimiter,
679 		   but skipping the backslashed delimiters.  If the ending delimiter is not
680 		   found, display a warning. */
681 		while (pp < end_p) {
682 			if (*pp == '\\' && pp + 1 < end_p) pp++;
683 			else if (*pp == delimiter)
684 				break;
685 			pp++;
686 		}
687 	} else {
688 		/* We iterate through the pattern, searching for the matching ending
689 		 * delimiter. For each matching starting delimiter, we increment nesting
690 		 * level, and decrement it for each matching ending delimiter. If we
691 		 * reach the end of the pattern without matching, display a warning.
692 		 */
693 		int brackets = 1; 	/* brackets nesting level */
694 		while (pp < end_p) {
695 			if (*pp == '\\' && pp + 1 < end_p) pp++;
696 			else if (*pp == end_delimiter && --brackets <= 0)
697 				break;
698 			else if (*pp == start_delimiter)
699 				brackets++;
700 			pp++;
701 		}
702 	}
703 
704 	if (pp >= end_p) {
705 		if (key != regex) {
706 			zend_string_release_ex(key, 0);
707 		}
708 		if (start_delimiter == end_delimiter) {
709 			php_error_docref(NULL,E_WARNING, "No ending delimiter '%c' found", delimiter);
710 		} else {
711 			php_error_docref(NULL,E_WARNING, "No ending matching delimiter '%c' found", delimiter);
712 		}
713 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
714 		return NULL;
715 	}
716 
717 	/* Make a copy of the actual pattern. */
718 	pattern_len = pp - p;
719 	pattern = estrndup(p, pattern_len);
720 
721 	/* Move on to the options */
722 	pp++;
723 
724 	/* Parse through the options, setting appropriate flags.  Display
725 	   a warning if we encounter an unknown modifier. */
726 	while (pp < end_p) {
727 		switch (*pp++) {
728 			/* Perl compatible options */
729 			case 'i':	coptions |= PCRE2_CASELESS;		break;
730 			case 'm':	coptions |= PCRE2_MULTILINE;		break;
731 			case 'n':	coptions |= PCRE2_NO_AUTO_CAPTURE;	break;
732 			case 's':	coptions |= PCRE2_DOTALL;		break;
733 			case 'x':	coptions |= PCRE2_EXTENDED;		break;
734 
735 			/* PCRE specific options */
736 			case 'A':	coptions |= PCRE2_ANCHORED;		break;
737 			case 'D':	coptions |= PCRE2_DOLLAR_ENDONLY;break;
738 #ifdef PCRE2_EXTRA_CASELESS_RESTRICT
739 			case 'r':	eoptions |= PCRE2_EXTRA_CASELESS_RESTRICT; break;
740 #endif
741 			case 'S':	/* Pass. */					break;
742 			case 'X':	/* Pass. */					break;
743 			case 'U':	coptions |= PCRE2_UNGREEDY;		break;
744 			case 'u':	coptions |= PCRE2_UTF;
745 	/* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
746 	   characters, even in UTF-8 mode. However, this can be changed by setting
747 	   the PCRE2_UCP option. */
748 #ifdef PCRE2_UCP
749 						coptions |= PCRE2_UCP;
750 #endif
751 				break;
752 			case 'J':	coptions |= PCRE2_DUPNAMES;		break;
753 
754 			case ' ':
755 			case '\n':
756 			case '\r':
757 				break;
758 
759 			case 'e': /* legacy eval */
760 			default:
761 				if (pp[-1]) {
762 					php_error_docref(NULL, E_WARNING, "Unknown modifier '%c'", pp[-1]);
763 				} else {
764 					php_error_docref(NULL, E_WARNING, "NUL byte is not a valid modifier");
765 				}
766 				pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
767 				efree(pattern);
768 				if (key != regex) {
769 					zend_string_release_ex(key, 0);
770 				}
771 				return NULL;
772 		}
773 	}
774 
775 	if (key != regex) {
776 		tables = (uint8_t *)zend_hash_find_ptr(&char_tables, BG(ctype_string));
777 		if (!tables) {
778 			zend_string *_k;
779 			tables = pcre2_maketables(gctx);
780 			if (UNEXPECTED(!tables)) {
781 				php_error_docref(NULL,E_WARNING, "Failed to generate locale character tables");
782 				pcre_handle_exec_error(PCRE2_ERROR_NOMEMORY);
783 				zend_string_release_ex(key, 0);
784 				efree(pattern);
785 				return NULL;
786 			}
787 			_k = zend_string_init(ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)), 1);
788 			GC_MAKE_PERSISTENT_LOCAL(_k);
789 			zend_hash_add_ptr(&char_tables, _k, (void *)tables);
790 			zend_string_release(_k);
791 		}
792 	}
793 	pcre2_set_character_tables(cctx, tables);
794 
795 	pcre2_set_compile_extra_options(cctx, eoptions);
796 
797 	/* Compile pattern and display a warning if compilation failed. */
798 	re = pcre2_compile((PCRE2_SPTR)pattern, pattern_len, coptions, &errnumber, &erroffset, cctx);
799 
800 	if (re == NULL) {
801 		if (key != regex) {
802 			zend_string_release_ex(key, 0);
803 		}
804 		pcre2_get_error_message(errnumber, error, sizeof(error));
805 		php_error_docref(NULL,E_WARNING, "Compilation failed: %s at offset %zu", error, erroffset);
806 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
807 		efree(pattern);
808 		return NULL;
809 	}
810 
811 #ifdef HAVE_PCRE_JIT_SUPPORT
812 	if (PCRE_G(jit)) {
813 		/* Enable PCRE JIT compiler */
814 		rc = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
815 		if (EXPECTED(rc >= 0)) {
816 			size_t jit_size = 0;
817 			if (!pcre2_pattern_info(re, PCRE2_INFO_JITSIZE, &jit_size) && jit_size > 0) {
818 				poptions |= PREG_JIT;
819 			}
820 		} else if (rc == PCRE2_ERROR_NOMEMORY) {
821 			php_error_docref(NULL, E_WARNING,
822 				"Allocation of JIT memory failed, PCRE JIT will be disabled. "
823 				"This is likely caused by security restrictions. "
824 				"Either grant PHP permission to allocate executable memory, or set pcre.jit=0");
825 			PCRE_G(jit) = 0;
826 		} else {
827 			pcre2_get_error_message(rc, error, sizeof(error));
828 			php_error_docref(NULL, E_WARNING, "JIT compilation failed: %s", error);
829 			pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
830 		}
831 	}
832 #endif
833 	efree(pattern);
834 
835 	/*
836 	 * If we reached cache limit, clean out the items from the head of the list;
837 	 * these are supposedly the oldest ones (but not necessarily the least used
838 	 * ones).
839 	 */
840 	if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
841 		int num_clean = PCRE_CACHE_SIZE / 8;
842 		zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean);
843 	}
844 
845 	/* Store the compiled pattern and extra info in the cache. */
846 	new_entry.re = re;
847 	new_entry.preg_options = poptions;
848 	new_entry.compile_options = coptions;
849 	new_entry.refcount = 0;
850 
851 	rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &new_entry.capture_count);
852 	if (rc < 0) {
853 		if (key != regex) {
854 			zend_string_release_ex(key, 0);
855 		}
856 		php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc);
857 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
858 		return NULL;
859 	}
860 
861 	uint32_t name_count;
862 	rc = pcre2_pattern_info(re, PCRE2_INFO_NAMECOUNT, &name_count);
863 	if (rc < 0) {
864 		if (key != regex) {
865 			zend_string_release_ex(key, 0);
866 		}
867 		php_error_docref(NULL, E_WARNING, "Internal pcre_pattern_info() error %d", rc);
868 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
869 		return NULL;
870 	}
871 
872 	/* Compute and cache the subpattern table to avoid computing it again over and over. */
873 	if (name_count > 0) {
874 		new_entry.subpats_table = make_subpats_table(name_count, &new_entry, !PCRE_G(per_request_cache));
875 		if (!new_entry.subpats_table) {
876 			if (key != regex) {
877 				zend_string_release_ex(key, false);
878 			}
879 			/* Warning already emitted by make_subpats_table() */
880 			pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
881 			return NULL;
882 		}
883 	} else {
884 		new_entry.subpats_table = NULL;
885 	}
886 
887 	/*
888 	 * Interned strings are not duplicated when stored in HashTable,
889 	 * but all the interned strings created during HTTP request are removed
890 	 * at end of request. However PCRE_G(pcre_cache) must be consistent
891 	 * on the next request as well. So we disable usage of interned strings
892 	 * as hash keys especually for this table.
893 	 * See bug #63180
894 	 */
895 	if (!(GC_FLAGS(key) & IS_STR_PERMANENT) && !PCRE_G(per_request_cache)) {
896 		zend_string *str = zend_string_init(ZSTR_VAL(key), ZSTR_LEN(key), 1);
897 		GC_MAKE_PERSISTENT_LOCAL(str);
898 
899 		ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), str, &new_entry, sizeof(pcre_cache_entry));
900 		zend_string_release(str);
901 	} else {
902 		ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), key, &new_entry, sizeof(pcre_cache_entry));
903 	}
904 
905 	if (key != regex) {
906 		zend_string_release_ex(key, 0);
907 	}
908 
909 	return ret;
910 }
911 /* }}} */
912 
913 /* {{{ pcre_get_compiled_regex_cache */
pcre_get_compiled_regex_cache(zend_string * regex)914 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_string *regex)
915 {
916 	return pcre_get_compiled_regex_cache_ex(regex, true);
917 }
918 /* }}} */
919 
920 /* {{{ pcre_get_compiled_regex */
pcre_get_compiled_regex(zend_string * regex,uint32_t * capture_count)921 PHPAPI pcre2_code *pcre_get_compiled_regex(zend_string *regex, uint32_t *capture_count)
922 {
923 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex);
924 
925 	if (capture_count) {
926 		*capture_count = pce ? pce->capture_count : 0;
927 	}
928 
929 	return pce ? pce->re : NULL;
930 }
931 /* }}} */
932 
933 /* XXX For the cases where it's only about match yes/no and no capture
934 		required, perhaps just a minimum sized data would suffice. */
php_pcre_create_match_data(uint32_t capture_count,pcre2_code * re)935 PHPAPI pcre2_match_data *php_pcre_create_match_data(uint32_t capture_count, pcre2_code *re)
936 {/*{{{*/
937 
938 	assert(NULL != re);
939 
940 	if (EXPECTED(!mdata_used)) {
941 		int rc = 0;
942 
943 		if (!capture_count) {
944 			/* As we deal with a non cached pattern, no other way to gather this info. */
945 			rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &capture_count);
946 		}
947 
948 		if (rc >= 0 && capture_count + 1 <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
949 			mdata_used = 1;
950 			return mdata;
951 		}
952 	}
953 
954 	return pcre2_match_data_create_from_pattern(re, gctx);
955 }/*}}}*/
956 
php_pcre_free_match_data(pcre2_match_data * match_data)957 PHPAPI void php_pcre_free_match_data(pcre2_match_data *match_data)
958 {/*{{{*/
959 	if (UNEXPECTED(match_data != mdata)) {
960 		pcre2_match_data_free(match_data);
961 	} else {
962 		mdata_used = 0;
963 	}
964 }/*}}}*/
965 
init_unmatched_null_pair(void)966 static void init_unmatched_null_pair(void) {
967 	zval val1, val2;
968 	ZVAL_NULL(&val1);
969 	ZVAL_LONG(&val2, -1);
970 	ZVAL_ARR(&PCRE_G(unmatched_null_pair), zend_new_pair(&val1, &val2));
971 }
972 
init_unmatched_empty_pair(void)973 static void init_unmatched_empty_pair(void) {
974 	zval val1, val2;
975 	ZVAL_EMPTY_STRING(&val1);
976 	ZVAL_LONG(&val2, -1);
977 	ZVAL_ARR(&PCRE_G(unmatched_empty_pair), zend_new_pair(&val1, &val2));
978 }
979 
populate_match_value_str(zval * val,const char * subject,PCRE2_SIZE start_offset,PCRE2_SIZE end_offset)980 static zend_always_inline void populate_match_value_str(
981 		zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset) {
982 	ZVAL_STRINGL_FAST(val, subject + start_offset, end_offset - start_offset);
983 }
984 
populate_match_value(zval * val,const char * subject,PCRE2_SIZE start_offset,PCRE2_SIZE end_offset,bool unmatched_as_null)985 static zend_always_inline void populate_match_value(
986 		zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
987 		bool unmatched_as_null) {
988 	if (PCRE2_UNSET == start_offset) {
989 		if (unmatched_as_null) {
990 			ZVAL_NULL(val);
991 		} else {
992 			ZVAL_EMPTY_STRING(val);
993 		}
994 	} else {
995 		populate_match_value_str(val, subject, start_offset, end_offset);
996 	}
997 }
998 
/* Store `val` under `name` in `subpats` and take a reference for that slot.
 *
 * With the DUPNAMES option several groups may share one name. A group that
 * did participate in the match always overwrites the slot; an unmatched one
 * only fills it when the name is not present yet, so a real value is never
 * replaced by a placeholder. */
static inline void add_named(
		HashTable *const subpats, zend_string *name, zval *val, bool unmatched) {
	if (unmatched) {
		if (zend_hash_add(subpats, name, val) == NULL) {
			/* Name already present: nothing stored, no reference taken. */
			return;
		}
	} else {
		zend_hash_update(subpats, name, val);
	}

	Z_TRY_ADDREF_P(val);
}
1012 
1013 /* {{{ add_offset_pair */
/* Append a (match, offset) pair for one capture group to `result`, and also
 * register it under `name` when a name is given.
 *
 * Unset groups reuse one of the two lazily created shared pairs kept in
 * PCRE_G(): [NULL, -1] when `unmatched_as_null` is set, ["", -1] otherwise. */
static inline void add_offset_pair(
		HashTable *const result, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
		zend_string *name, zend_long unmatched_as_null)
{
	zval pair;
	const bool is_unset = (PCRE2_UNSET == start_offset);

	if (!is_unset) {
		zval matched_text, matched_at;

		populate_match_value_str(&matched_text, subject, start_offset, end_offset);
		ZVAL_LONG(&matched_at, start_offset);
		ZVAL_ARR(&pair, zend_new_pair(&matched_text, &matched_at));
	} else if (unmatched_as_null) {
		if (Z_ISUNDEF(PCRE_G(unmatched_null_pair))) {
			init_unmatched_null_pair();
		}
		ZVAL_COPY(&pair, &PCRE_G(unmatched_null_pair));
	} else {
		if (Z_ISUNDEF(PCRE_G(unmatched_empty_pair))) {
			init_unmatched_empty_pair();
		}
		ZVAL_COPY(&pair, &PCRE_G(unmatched_empty_pair));
	}

	if (name != NULL) {
		add_named(result, name, &pair, is_unset);
	}
	zend_hash_next_index_insert_new(result, &pair);
}
1045 /* }}} */
1046 
/* Fill the array in `subpats` with the capture groups of one match.
 *
 * subpats      - zval holding the destination array
 * subject      - subject buffer the offsets refer to
 * offsets      - start/end offset pairs, two entries per group
 * subpat_names - per-group names (entries may be NULL), or NULL for no names
 * num_subpats  - total number of groups of the pattern (whole match included)
 * count        - number of groups described by `offsets` for this match
 * mark         - MARK name to store under "MARK", or NULL
 * flags        - PREG_OFFSET_CAPTURE and PREG_UNMATCHED_AS_NULL are honoured
 *
 * Each group is appended by index and, when it has a name, also stored under
 * that name. Groups from `count` up to `num_subpats` are only padded in when
 * PREG_UNMATCHED_AS_NULL is set. */
static void populate_subpat_array(
		zval *subpats, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names,
		uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags) {
	const zend_long with_offsets = flags & PREG_OFFSET_CAPTURE;
	const zend_long null_unmatched = flags & PREG_UNMATCHED_AS_NULL;
	HashTable *dest = Z_ARRVAL_P(subpats);
	int idx;

	/* Groups reported by this match. */
	for (idx = 0; idx < count; idx++) {
		zend_string *group_name = subpat_names ? subpat_names[idx] : NULL;
		const PCRE2_SIZE from = offsets[2*idx];
		const PCRE2_SIZE to = offsets[2*idx+1];

		if (with_offsets) {
			add_offset_pair(dest, subject, from, to, group_name, null_unmatched);
		} else {
			zval entry;

			populate_match_value(&entry, subject, from, to, null_unmatched);
			if (group_name) {
				add_named(dest, group_name, &entry, from == PCRE2_UNSET);
			}
			zend_hash_next_index_insert_new(dest, &entry);
		}
	}

	/* Trailing groups the match did not report. */
	if (null_unmatched) {
		for (idx = count; idx < num_subpats; idx++) {
			if (with_offsets) {
				add_offset_pair(dest, NULL, PCRE2_UNSET, PCRE2_UNSET,
					subpat_names ? subpat_names[idx] : NULL, 1);
			} else if (subpat_names) {
				zval entry;

				ZVAL_NULL(&entry);
				if (subpat_names[idx]) {
					/* add, not update: keep a value an earlier same-named group stored */
					zend_hash_add(dest, subpat_names[idx], &entry);
				}
				zend_hash_next_index_insert_new(dest, &entry);
			} else {
				add_next_index_null(subpats);
			}
		}
	}

	/* Add MARK, if available */
	if (mark) {
		add_assoc_string_ex(subpats, "MARK", sizeof("MARK") - 1, (char *)mark);
	}
}
1115 
/* Shared body of preg_match() and preg_match_all().
 *
 * Parses (regex, subject[, subpats[, flags[, start_offset]]]), obtains the
 * compiled pattern and delegates to php_pcre_match_impl(). `global` selects
 * match-all behaviour. Returns false to userland when the pattern cannot be
 * obtained. */
static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, bool global) /* {{{ */
{
	zend_string *regex;               /* pattern source */
	zend_string *subject;             /* string to search */
	zval *subpats = NULL;             /* optional output array */
	zend_long flags = 0;              /* PREG_* match flags */
	zend_long start_offset = 0;       /* where the search begins */
	pcre_cache_entry *pce;            /* compiled pattern */

	ZEND_PARSE_PARAMETERS_START(2, 5)
		Z_PARAM_STR(regex)
		Z_PARAM_STR(subject)
		Z_PARAM_OPTIONAL
		Z_PARAM_ZVAL(subpats)
		Z_PARAM_LONG(flags)
		Z_PARAM_LONG(start_offset)
	ZEND_PARSE_PARAMETERS_END();

	pce = pcre_get_compiled_regex_cache(regex);
	if (pce == NULL) {
		RETURN_FALSE;
	}

	/* The entry's refcount is raised for the duration of the match
	 * (presumably to protect it from cache eviction - confirm). */
	pce->refcount++;
	php_pcre_match_impl(pce, subject, return_value, subpats,
		global, flags, start_offset);
	pce->refcount--;
}
1146 
is_known_valid_utf8(zend_string * subject_str,PCRE2_SIZE start_offset)1147 static zend_always_inline bool is_known_valid_utf8(
1148 		zend_string *subject_str, PCRE2_SIZE start_offset) {
1149 	if (!ZSTR_IS_VALID_UTF8(subject_str)) {
1150 		/* We don't know whether the string is valid UTF-8 or not. */
1151 		return 0;
1152 	}
1153 
1154 	if (start_offset == ZSTR_LEN(subject_str)) {
1155 		/* Degenerate case: Offset points to end of string. */
1156 		return 1;
1157 	}
1158 
1159 	/* Check that the offset does not point to an UTF-8 continuation byte. */
1160 	return (ZSTR_VAL(subject_str)[start_offset] & 0xc0) != 0x80;
1161 }
1162 
1163 /* {{{ php_pcre_match_impl() */
php_pcre_match_impl(pcre_cache_entry * pce,zend_string * subject_str,zval * return_value,zval * subpats,bool global,zend_long flags,zend_off_t start_offset)1164 PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
1165 	zval *subpats, bool global, zend_long flags, zend_off_t start_offset)
1166 {
1167 	zval			 result_set;		/* Holds a set of subpatterns after
1168 										   a global match */
1169 	HashTable	   **match_sets = NULL;	/* An array of sets of matches for each
1170 										   subpattern after a global match */
1171 	uint32_t		 options;			/* Execution options */
1172 	int				 count;				/* Count of matched subpatterns */
1173 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
1174 	int				 matched;			/* Has anything matched */
1175 	zend_string	   **subpat_names;		/* Array for named subpatterns */
1176 	size_t			 i;
1177 	uint32_t		 subpats_order;		/* Order of subpattern matches */
1178 	uint32_t		 offset_capture;	/* Capture match offsets: yes/no */
1179 	zend_long		 unmatched_as_null;	/* Null non-matches: yes/no */
1180 	PCRE2_SPTR       mark = NULL;		/* Target for MARK name */
1181 	HashTable		*marks = NULL;		/* Array of marks for PREG_PATTERN_ORDER */
1182 	pcre2_match_data *match_data;
1183 	PCRE2_SIZE		 start_offset2, orig_start_offset;
1184 
1185 	char *subject = ZSTR_VAL(subject_str);
1186 	size_t subject_len = ZSTR_LEN(subject_str);
1187 
1188 	/* Overwrite the passed-in value for subpatterns with an empty array. */
1189 	if (subpats != NULL) {
1190 		subpats = zend_try_array_init(subpats);
1191 		if (!subpats) {
1192 			RETURN_THROWS();
1193 		}
1194 	}
1195 
1196 	subpats_order = global ? PREG_PATTERN_ORDER : 0;
1197 
1198 	if (flags) {
1199 		offset_capture = flags & PREG_OFFSET_CAPTURE;
1200 		unmatched_as_null = flags & PREG_UNMATCHED_AS_NULL;
1201 
1202 		/*
1203 		 * subpats_order is pre-set to pattern mode so we change it only if
1204 		 * necessary.
1205 		 */
1206 		if (flags & 0xff) {
1207 			subpats_order = flags & 0xff;
1208 			if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
1209 				(!global && subpats_order != 0)) {
1210 				zend_argument_value_error(4, "must be a PREG_* constant");
1211 				RETURN_THROWS();
1212 			}
1213 		}
1214 	} else {
1215 		offset_capture = 0;
1216 		unmatched_as_null = 0;
1217 	}
1218 
1219 	/* Negative offset counts from the end of the string. */
1220 	if (start_offset < 0) {
1221 		if ((PCRE2_SIZE)-start_offset <= subject_len) {
1222 			start_offset2 = subject_len + start_offset;
1223 		} else {
1224 			start_offset2 = 0;
1225 		}
1226 	} else {
1227 		start_offset2 = (PCRE2_SIZE)start_offset;
1228 	}
1229 
1230 	if (start_offset2 > subject_len) {
1231 		pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET);
1232 		RETURN_FALSE;
1233 	}
1234 
1235 	/* Calculate the size of the offsets array, and allocate memory for it. */
1236 	num_subpats = pce->capture_count + 1;
1237 
1238 	/*
1239 	 * Build a mapping from subpattern numbers to their names. We will
1240 	 * allocate the table only if there are any named subpatterns.
1241 	 */
1242 	subpat_names = NULL;
1243 	if (subpats) {
1244 		subpat_names = pce->subpats_table;
1245 	}
1246 
1247 	matched = 0;
1248 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1249 
1250 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
1251 		match_data = mdata;
1252 	} else {
1253 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
1254 		if (!match_data) {
1255 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1256 			RETURN_FALSE;
1257 		}
1258 	}
1259 
1260 	/* Allocate match sets array and initialize the values. */
1261 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
1262 		match_sets = safe_emalloc(num_subpats, sizeof(HashTable *), 0);
1263 		for (i=0; i<num_subpats; i++) {
1264 			match_sets[i] = zend_new_array(0);
1265 		}
1266 	}
1267 
1268 	/* Array of subpattern offsets */
1269 	PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
1270 
1271 	orig_start_offset = start_offset2;
1272 	options =
1273 		(pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, orig_start_offset)
1274 			? 0 : PCRE2_NO_UTF_CHECK;
1275 
1276 	/* Execute the regular expression. */
1277 #ifdef HAVE_PCRE_JIT_SUPPORT
1278 	if ((pce->preg_options & PREG_JIT) && options) {
1279 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1280 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1281 	} else
1282 #endif
1283 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1284 			options, match_data, mctx);
1285 
1286 	while (1) {
1287 		/* If something has matched */
1288 		if (count >= 0) {
1289 			/* Check for too many substrings condition. */
1290 			if (UNEXPECTED(count == 0)) {
1291 				php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings");
1292 				count = num_subpats;
1293 			}
1294 
1295 matched:
1296 			matched++;
1297 
1298 			/* If subpatterns array has been passed, fill it in with values. */
1299 			if (subpats != NULL) {
1300 				/* Try to get the list of substrings and display a warning if failed. */
1301 				if (UNEXPECTED(offsets[1] < offsets[0])) {
1302 					if (match_sets) efree(match_sets);
1303 					php_error_docref(NULL, E_WARNING, "Get subpatterns list failed");
1304 					RETURN_FALSE;
1305 				}
1306 
1307 				if (global) {	/* global pattern matching */
1308 					if (subpats_order == PREG_PATTERN_ORDER) {
1309 						/* For each subpattern, insert it into the appropriate array. */
1310 						if (offset_capture) {
1311 							for (i = 0; i < count; i++) {
1312 								add_offset_pair(
1313 									match_sets[i], subject, offsets[2*i], offsets[2*i+1],
1314 									NULL, unmatched_as_null);
1315 							}
1316 						} else {
1317 							for (i = 0; i < count; i++) {
1318 								zval val;
1319 								populate_match_value(
1320 									&val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null);
1321 								zend_hash_next_index_insert_new(match_sets[i], &val);
1322 							}
1323 						}
1324 						mark = pcre2_get_mark(match_data);
1325 						/* Add MARK, if available */
1326 						if (mark) {
1327 							if (!marks) {
1328 								marks = zend_new_array(0);
1329 							}
1330 							zval tmp;
1331 							ZVAL_STRING(&tmp, (char *) mark);
1332 							zend_hash_index_add_new(marks, matched - 1, &tmp);
1333 						}
1334 						/*
1335 						 * If the number of captured subpatterns on this run is
1336 						 * less than the total possible number, pad the result
1337 						 * arrays with NULLs or empty strings.
1338 						 */
1339 						if (count < num_subpats) {
1340 							for (int i = count; i < num_subpats; i++) {
1341 								if (offset_capture) {
1342 									add_offset_pair(
1343 										match_sets[i], NULL, PCRE2_UNSET, PCRE2_UNSET,
1344 										NULL, unmatched_as_null);
1345 								} else if (unmatched_as_null) {
1346 									zval tmp;
1347 									ZVAL_NULL(&tmp);
1348 									zend_hash_next_index_insert_new(match_sets[i], &tmp);
1349 								} else {
1350 									zval tmp;
1351 									ZVAL_EMPTY_STRING(&tmp);
1352 									zend_hash_next_index_insert_new(match_sets[i], &tmp);
1353 								}
1354 							}
1355 						}
1356 					} else {
1357 						/* Allocate and populate the result set array */
1358 						mark = pcre2_get_mark(match_data);
1359 						array_init_size(&result_set, count + (mark ? 1 : 0));
1360 						populate_subpat_array(
1361 							&result_set, subject, offsets, subpat_names,
1362 							num_subpats, count, mark, flags);
1363 						/* And add it to the output array */
1364 						zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &result_set);
1365 					}
1366 				} else {			/* single pattern matching */
1367 					/* For each subpattern, insert it into the subpatterns array. */
1368 					mark = pcre2_get_mark(match_data);
1369 					populate_subpat_array(
1370 						subpats, subject, offsets, subpat_names, num_subpats, count, mark, flags);
1371 					break;
1372 				}
1373 			}
1374 
1375 			/* Advance to the next piece. */
1376 			start_offset2 = offsets[1];
1377 
1378 			/* If we have matched an empty string, mimic what Perl's /g options does.
1379 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
1380 			   the match again at the same point. If this fails (picked up above) we
1381 			   advance to the next character. */
1382 			if (start_offset2 == offsets[0]) {
1383 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1384 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
1385 				if (count >= 0) {
1386 					if (global) {
1387 						goto matched;
1388 					} else {
1389 						break;
1390 					}
1391 				} else if (count == PCRE2_ERROR_NOMATCH) {
1392 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
1393 					   this is not necessarily the end. We need to advance
1394 					   the start offset, and continue. Fudge the offset values
1395 					   to achieve this, unless we're already at the end of the string. */
1396 					if (start_offset2 < subject_len) {
1397 						size_t unit_len = calculate_unit_length(pce, subject + start_offset2);
1398 
1399 						start_offset2 += unit_len;
1400 					} else {
1401 						break;
1402 					}
1403 				} else {
1404 					goto error;
1405 				}
1406 			}
1407 		} else if (count == PCRE2_ERROR_NOMATCH) {
1408 			break;
1409 		} else {
1410 error:
1411 			pcre_handle_exec_error(count);
1412 			break;
1413 		}
1414 
1415 		if (!global) {
1416 			break;
1417 		}
1418 
1419 		/* Execute the regular expression. */
1420 #ifdef HAVE_PCRE_JIT_SUPPORT
1421 		if ((pce->preg_options & PREG_JIT)) {
1422 			if (start_offset2 > subject_len) {
1423 				pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET);
1424 				break;
1425 			}
1426 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1427 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1428 		} else
1429 #endif
1430 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1431 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1432 	}
1433 	if (match_data != mdata) {
1434 		pcre2_match_data_free(match_data);
1435 	}
1436 
1437 	/* Add the match sets to the output array and clean up */
1438 	if (match_sets) {
1439 		if (subpat_names) {
1440 			for (i = 0; i < num_subpats; i++) {
1441 				zval wrapper;
1442 				ZVAL_ARR(&wrapper, match_sets[i]);
1443 				if (subpat_names[i]) {
1444 					zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i], &wrapper);
1445 					GC_ADDREF(match_sets[i]);
1446 				}
1447 				zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &wrapper);
1448 			}
1449 		} else {
1450 			for (i = 0; i < num_subpats; i++) {
1451 				zval wrapper;
1452 				ZVAL_ARR(&wrapper, match_sets[i]);
1453 				zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &wrapper);
1454 			}
1455 		}
1456 		efree(match_sets);
1457 
1458 		if (marks) {
1459 			zval tmp;
1460 			ZVAL_ARR(&tmp, marks);
1461 			zend_hash_str_update(Z_ARRVAL_P(subpats), "MARK", sizeof("MARK") - 1, &tmp);
1462 		}
1463 	}
1464 
1465 	if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
1466 		/* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */
1467 		if ((pce->compile_options & PCRE2_UTF)
1468 				&& !ZSTR_IS_INTERNED(subject_str) && orig_start_offset == 0) {
1469 			GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8);
1470 		}
1471 
1472 		RETVAL_LONG(matched);
1473 	} else {
1474 		RETVAL_FALSE;
1475 	}
1476 }
1477 /* }}} */
1478 
1479 /* {{{ Perform a Perl-style regular expression match */
/* Entry point for preg_match(): forwards the call unchanged to
 * php_do_pcre_match() with `global` set to false. */
PHP_FUNCTION(preg_match)1480 PHP_FUNCTION(preg_match)
1481 {
1482 	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, false);
1483 }
1484 /* }}} */
1485 
1486 ZEND_FRAMELESS_FUNCTION(preg_match, 2)
1487 {
1488 	zval regex_tmp, subject_tmp;
1489 	zend_string *regex, *subject;
1490 
1491 	Z_FLF_PARAM_STR(1, regex, regex_tmp);
1492 	Z_FLF_PARAM_STR(2, subject, subject_tmp);
1493 
1494 	/* Compile regex or get it from cache. */
1495 	pcre_cache_entry *pce;
1496 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
1497 		RETURN_FALSE;
1498 	}
1499 
1500 	pce->refcount++;
1501 	php_pcre_match_impl(pce, subject, return_value, /* subpats */ NULL,
1502 		/* global */ false, /* flags */ 0, /* start_offset */ 0);
1503 	pce->refcount--;
1504 
1505 flf_clean:
1506 	Z_FLF_PARAM_FREE_STR(1, regex_tmp);
1507 	Z_FLF_PARAM_FREE_STR(2, subject_tmp);
1508 }
1509 
1510 /* {{{ Perform a Perl-style global regular expression match */
/* Entry point for preg_match_all(): forwards the call unchanged to
 * php_do_pcre_match() with `global` set to true. */
PHP_FUNCTION(preg_match_all)1511 PHP_FUNCTION(preg_match_all)
1512 {
1513 	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, true);
1514 }
1515 /* }}} */
1516 
1517 /* {{{ preg_get_backref */
/* Parse a back-reference at *str.
 *
 * *str must point at the introducer character (the callers use '\\' or '$').
 * Accepted forms are <intro>N, <intro>NN and, for '$' only, ${N} / ${NN},
 * where N is a decimal digit.
 *
 * On success returns 1, stores the group number in *backref and advances
 * *str past the reference. On failure returns 0 and leaves *str untouched;
 * *backref may already have been written if digits were seen before a
 * missing closing brace. */
static int preg_get_backref(char **str, int *backref)
{
	char *cursor = *str;
	int braced;

	/* The introducer is the last character: nothing can follow it. */
	if (cursor[1] == '\0') {
		return 0;
	}

	braced = (cursor[0] == '$' && cursor[1] == '{');
	cursor += braced ? 2 : 1;

	/* First digit is mandatory. */
	if (*cursor < '0' || *cursor > '9') {
		return 0;
	}
	*backref = *cursor - '0';
	cursor++;

	/* Optional second digit; at most two are consumed. */
	if (*cursor >= '0' && *cursor <= '9') {
		*backref = *backref * 10 + *cursor - '0';
		cursor++;
	}

	if (braced) {
		if (*cursor != '}') {
			return 0;
		}
		cursor++;
	}

	*str = cursor;
	return 1;
}
1553 /* }}} */
1554 
1555 /* {{{ preg_do_repl_func */
/* Invoke the user replacement callback for one match.
 *
 * An array describing the match is built with populate_subpat_array() and
 * passed as the callback's single argument. The callback's return value is
 * returned as a string (converted if it is not one already). When the call
 * fails or yields no value, the matched text itself is returned instead, and
 * a warning is raised unless an exception is pending.
 *
 * The caller owns the returned string. */
static zend_string *preg_do_repl_func(zend_fcall_info *fci, zend_fcall_info_cache *fcc, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names, uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags)
{
	zend_string *replacement;
	zval		 callback_result;	/* What the callback returned */
	zval		 match_groups;		/* The callback's only argument */

	array_init_size(&match_groups, count + (mark ? 1 : 0));
	populate_subpat_array(&match_groups, subject, offsets, subpat_names, num_subpats, count, mark, flags);

	fci->retval = &callback_result;
	fci->param_count = 1;
	fci->params = &match_groups;

	if (zend_call_function(fci, fcc) != SUCCESS || Z_TYPE(callback_result) == IS_UNDEF) {
		if (!EG(exception)) {
			php_error_docref(NULL, E_WARNING, "Unable to call custom replacement function");
		}

		/* Fall back to leaving the matched text as it was. */
		replacement = zend_string_init(&subject[offsets[0]], offsets[1] - offsets[0], 0);
	} else if (EXPECTED(Z_TYPE(callback_result) == IS_STRING)) {
		/* Take over the string held by the return value. */
		replacement = Z_STR(callback_result);
	} else {
		replacement = zval_get_string_func(&callback_result);
		zval_ptr_dtor(&callback_result);
	}

	zval_ptr_dtor(&match_groups);

	return replacement;
}
1588 /* }}} */
1589 
1590 /* {{{ php_pcre_replace */
php_pcre_replace(zend_string * regex,zend_string * subject_str,const char * subject,size_t subject_len,zend_string * replace_str,size_t limit,size_t * replace_count)1591 PHPAPI zend_string *php_pcre_replace(zend_string *regex,
1592 							  zend_string *subject_str,
1593 							  const char *subject, size_t subject_len,
1594 							  zend_string *replace_str,
1595 							  size_t limit, size_t *replace_count)
1596 {
1597 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
1598 	zend_string	 		*result;			/* Function result */
1599 
1600 	/* Abort on pending exception, e.g. thrown from __toString(). */
1601 	if (UNEXPECTED(EG(exception))) {
1602 		return NULL;
1603 	}
1604 
1605 	/* Compile regex or get it from cache. */
1606 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
1607 		return NULL;
1608 	}
1609 	pce->refcount++;
1610 	result = php_pcre_replace_impl(pce, subject_str, subject, subject_len, replace_str,
1611 		limit, replace_count);
1612 	pce->refcount--;
1613 
1614 	return result;
1615 }
1616 /* }}} */
1617 
1618 /* {{{ php_pcre_replace_impl() */
/* Replace up to `limit` matches of the compiled pattern `pce` in `subject`
 * with `replace_str`, expanding back-references in the replacement (parsed by
 * preg_get_backref() after a '\\' or '$').
 *
 * pce           - compiled pattern (cache entry)
 * subject_str   - optional zend_string for the subject; when non-NULL and
 *                 nothing was replaced, a copy of it is returned as is
 * subject       - subject bytes
 * subject_len   - length of `subject`
 * replace_str   - replacement template
 * limit         - maximum number of replacements still allowed
 * replace_count - when non-NULL, incremented once per replacement
 *
 * Returns the resulting string, or NULL on failure. PCRE_G(error_code) is
 * reset on entry and set on failure. */
php_pcre_replace_impl(pcre_cache_entry * pce,zend_string * subject_str,const char * subject,size_t subject_len,zend_string * replace_str,size_t limit,size_t * replace_count)1619 PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_string *replace_str, size_t limit, size_t *replace_count)
1620 {
1621 	uint32_t		 options;			/* Execution options */
1622 	int				 count;				/* Count of matched subpatterns */
1623 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
1624 	size_t			 new_len;			/* Length of needed storage */
1625 	size_t			 alloc_len;			/* Actual allocated length */
1626 	size_t			 match_len;			/* Length of the current match */
1627 	int				 backref;			/* Backreference number */
1628 	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
1629 	size_t			 last_end_offset;	/* Where the last search ended */
1630 	char			*walkbuf,			/* Location of current replacement in the result */
1631 					*walk,				/* Used to walk the replacement string */
1632 					 walk_last;			/* Last walked character */
1633 	const char		*match,				/* The current match */
1634 					*piece,				/* The current piece of subject */
1635 					*replace_end;		/* End of replacement string */
1636 	size_t			result_len; 		/* Length of result */
1637 	zend_string		*result;			/* Result of replacement */
1638 	pcre2_match_data *match_data;
1639 
1640 	/* Calculate the size of the offsets array, and allocate memory for it. */
1641 	num_subpats = pce->capture_count + 1;
1642 	alloc_len = 0;
1643 	result = NULL;
1644 
1645 	/* Initialize */
1646 	match = NULL;
1647 	start_offset = 0;
1648 	last_end_offset = 0;
1649 	result_len = 0;
1650 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1651 
	/* Use the file-level match data block when it is not flagged as in use
	 * and is large enough; otherwise create a private one, which is freed
	 * again at the end of this function. */
1652 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
1653 		match_data = mdata;
1654 	} else {
1655 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
1656 		if (!match_data) {
1657 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1658 			return NULL;
1659 		}
1660 	}
1661 
	/* Only the first match of a /u pattern runs with the UTF check enabled;
	 * every later match call below passes PCRE2_NO_UTF_CHECK. */
1662 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
1663 
1664 	/* Array of subpattern offsets */
1665 	PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
1666 
1667 	/* Execute the regular expression. */
1668 #ifdef HAVE_PCRE_JIT_SUPPORT
1669 	if ((pce->preg_options & PREG_JIT) && options) {
1670 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1671 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1672 	} else
1673 #endif
1674 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1675 			options, match_data, mctx);
1676 
1677 	while (1) {
1678 		piece = subject + last_end_offset;
1679 
1680 		if (count >= 0 && limit > 0) {
1681 			bool simple_string;
1682 
1683 			/* Check for too many substrings condition. */
1684 			if (UNEXPECTED(count == 0)) {
1685 				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
1686 				count = num_subpats;
1687 			}
1688 
1689 matched:
			/* A match that ends before it starts cannot be processed. */
1690 			if (UNEXPECTED(offsets[1] < offsets[0])) {
1691 				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1692 				if (result) {
1693 					zend_string_release_ex(result, 0);
1694 					result = NULL;
1695 				}
1696 				break;
1697 			}
1698 
1699 			if (replace_count) {
1700 				++*replace_count;
1701 			}
1702 
1703 			/* Set the match location in subject */
1704 			match = subject + offsets[0];
1705 
1706 			new_len = result_len + offsets[0] - last_end_offset; /* part before the match */
1707 
			/* First pass over the replacement: only measure how many bytes
			 * it expands to, so the buffer can be grown before copying. */
1708 			walk = ZSTR_VAL(replace_str);
1709 			replace_end = walk + ZSTR_LEN(replace_str);
1710 			walk_last = 0;
1711 			simple_string = 1;
1712 			while (walk < replace_end) {
1713 				if ('\\' == *walk || '$' == *walk) {
1714 					simple_string = 0;
1715 					if (walk_last == '\\') {
1716 						walk++;
1717 						walk_last = 0;
1718 						continue;
1719 					}
1720 					if (preg_get_backref(&walk, &backref)) {
1721 						if (backref < count)
1722 							new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
1723 						continue;
1724 					}
1725 				}
1726 				new_len++;
1727 				walk++;
1728 				walk_last = walk[-1];
1729 			}
1730 
			/* Grow to twice the needed length; zend_safe_address_guarded()
			 * guards the size computation against overflow. */
1731 			if (new_len >= alloc_len) {
1732 				alloc_len = zend_safe_address_guarded(2, new_len, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD;
1733 				if (result == NULL) {
1734 					result = zend_string_alloc(alloc_len, 0);
1735 				} else {
1736 					result = zend_string_extend(result, alloc_len, 0);
1737 				}
1738 			}
1739 
1740 			if (match-piece > 0) {
1741 				/* copy the part of the string before the match */
1742 				memcpy(&ZSTR_VAL(result)[result_len], piece, match-piece);
1743 				result_len += (match-piece);
1744 			}
1745 
1746 			if (simple_string) {
1747 				/* copy replacement */
				/* (the +1 also copies the terminating NUL byte) */
1748 				memcpy(&ZSTR_VAL(result)[result_len], ZSTR_VAL(replace_str), ZSTR_LEN(replace_str)+1);
1749 				result_len += ZSTR_LEN(replace_str);
1750 			} else {
1751 				/* copy replacement and backrefs */
1752 				walkbuf = ZSTR_VAL(result) + result_len;
1753 
1754 				walk = ZSTR_VAL(replace_str);
1755 				walk_last = 0;
1756 				while (walk < replace_end) {
1757 					if ('\\' == *walk || '$' == *walk) {
1758 						if (walk_last == '\\') {
							/* Escaped '\\' or '$': overwrite the backslash
							 * copied on the previous step with this char. */
1759 							*(walkbuf-1) = *walk++;
1760 							walk_last = 0;
1761 							continue;
1762 						}
1763 						if (preg_get_backref(&walk, &backref)) {
							/* A reference to a group beyond `count`
							 * expands to nothing. */
1764 							if (backref < count) {
1765 								match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1766 								walkbuf = zend_mempcpy(walkbuf, subject + offsets[backref << 1], match_len);
1767 							}
1768 							continue;
1769 						}
1770 					}
1771 					*walkbuf++ = *walk++;
1772 					walk_last = walk[-1];
1773 				}
1774 				*walkbuf = '\0';
1775 				/* increment the result length by how much we've added to the string */
1776 				result_len += (walkbuf - (ZSTR_VAL(result) + result_len));
1777 			}
1778 
1779 			limit--;
1780 
1781 			/* Advance to the next piece. */
1782 			start_offset = last_end_offset = offsets[1];
1783 
1784 			/* If we have matched an empty string, mimic what Perl's /g options does.
1785 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
1786 			   the match again at the same point. If this fails (picked up above) we
1787 			   advance to the next character. */
1788 			if (start_offset == offsets[0]) {
1789 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1790 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
1791 
1792 				piece = subject + start_offset;
1793 				if (count >= 0 && limit > 0) {
1794 					goto matched;
1795 				} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
1796 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
1797 					   this is not necessarily the end. We need to advance
1798 					   the start offset, and continue. Fudge the offset values
1799 					   to achieve this, unless we're already at the end of the string. */
1800 					if (start_offset < subject_len) {
1801 						size_t unit_len = calculate_unit_length(pce, piece);
1802 						start_offset += unit_len;
1803 					} else {
1804 						goto not_matched;
1805 					}
1806 				} else {
1807 					goto error;
1808 				}
1809 			}
1810 
1811 		} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
1812 not_matched:
			/* Nothing was replaced and the caller supplied the subject as a
			 * zend_string: return a copy of it instead of building a new one. */
1813 			if (!result && subject_str) {
1814 				result = zend_string_copy(subject_str);
1815 				break;
1816 			}
1817 			/* now we know exactly how long it is */
1818 			alloc_len = result_len + subject_len - last_end_offset;
1819 			if (NULL != result) {
1820 				result = zend_string_realloc(result, alloc_len, 0);
1821 			} else {
1822 				result = zend_string_alloc(alloc_len, 0);
1823 			}
1824 			/* stick that last bit of string on our output */
1825 			memcpy(ZSTR_VAL(result) + result_len, piece, subject_len - last_end_offset);
1826 			result_len += subject_len - last_end_offset;
1827 			ZSTR_VAL(result)[result_len] = '\0';
1828 			ZSTR_LEN(result) = result_len;
1829 			break;
1830 		} else {
1831 error:
			/* Record the error and discard any partial result. */
1832 			pcre_handle_exec_error(count);
1833 			if (result) {
1834 				zend_string_release_ex(result, 0);
1835 				result = NULL;
1836 			}
1837 			break;
1838 		}
1839 
		/* Look for the next match, starting at start_offset. */
1840 #ifdef HAVE_PCRE_JIT_SUPPORT
1841 		if (pce->preg_options & PREG_JIT) {
1842 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1843 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1844 		} else
1845 #endif
1846 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1847 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1848 	}
1849 	if (match_data != mdata) {
1850 		pcre2_match_data_free(match_data);
1851 	}
1852 
1853 	return result;
1854 }
1855 /* }}} */
1856 
1857 /* {{{ php_pcre_replace_func_impl() */
/* Replace matches of the compiled pattern `pce` in `subject` (length
 * `subject_len`) with strings produced by preg_do_repl_func(), which is
 * handed `fci`/`fcc`, the capture offsets, the pattern's mark and `flags`.
 *
 * - `subject_str`: when non-NULL and no replacement was made, a new reference
 *   to this string is returned instead of building a copy.
 * - `limit`: decremented once per replacement; once it reaches 0 the rest of
 *   the subject is appended unchanged.
 * - `replace_count`: when non-NULL, incremented once per replacement.
 *
 * Returns the resulting string, or NULL on failure. Failure paths either set
 * PCRE_G(error_code) to PHP_PCRE_INTERNAL_ERROR directly or go through
 * pcre_handle_exec_error(); any partially built result is released first.
 */
php_pcre_replace_func_impl(pcre_cache_entry * pce,zend_string * subject_str,const char * subject,size_t subject_len,zend_fcall_info * fci,zend_fcall_info_cache * fcc,size_t limit,size_t * replace_count,zend_long flags)1858 static zend_string *php_pcre_replace_func_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_fcall_info *fci, zend_fcall_info_cache *fcc, size_t limit, size_t *replace_count, zend_long flags)
1859 {
1860 	uint32_t		 options;			/* Execution options */
1861 	int				 count;				/* Count of matched subpatterns */
1862 	zend_string		**subpat_names;		/* Array for named subpatterns */
1863 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
1864 	size_t			 new_len;			/* Length of needed storage */
1865 	size_t			 alloc_len;			/* Actual allocated length */
1866 	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
1867 	size_t			 last_end_offset;	/* Where the last search ended */
1868 	const char		*match,				/* The current match */
1869 					*piece;				/* The current piece of subject */
1870 	size_t			result_len; 		/* Length of result */
1871 	zend_string		*result;			/* Result of replacement */
1872 	zend_string     *eval_result;		/* Result of custom function */
1873 	pcre2_match_data *match_data;
1874 	bool old_mdata_used;
1875 
1876 	/* Calculate the size of the offsets array, and allocate memory for it. */
1877 	num_subpats = pce->capture_count + 1;
1878 	subpat_names = pce->subpats_table;
1879 
1880 	alloc_len = 0;
1881 	result = NULL;
1882 
1883 	/* Initialize */
1884 	match = NULL;
1885 	start_offset = 0;
1886 	last_end_offset = 0;
1887 	result_len = 0;
1888 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1889 
	/* Claim the shared match-data block (mdata) if nobody else holds it and it
	   is large enough for this pattern; otherwise allocate a private block.
	   The previous mdata_used value is restored on every exit path.
	   NOTE(review): presumably the flag keeps PCRE calls made from inside the
	   callback from reusing mdata while it is in use here - confirm. */
1890 	old_mdata_used = mdata_used;
1891 	if (!old_mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
1892 		mdata_used = 1;
1893 		match_data = mdata;
1894 	} else {
1895 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
1896 		if (!match_data) {
1897 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1898 			mdata_used = old_mdata_used;
1899 			return NULL;
1900 		}
1901 	}
1902 
	/* 0 for patterns compiled with PCRE2_UTF, PCRE2_NO_UTF_CHECK otherwise.
	   Only the first match below uses this value; every later match passes
	   PCRE2_NO_UTF_CHECK unconditionally. */
1903 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
1904 
1905 	/* Array of subpattern offsets */
1906 	PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
1907 
1908 	/* Execute the regular expression. */
	/* The JIT entry point is taken for the first match only when `options` is
	   non-zero, i.e. when the pattern was not compiled with PCRE2_UTF. */
1909 #ifdef HAVE_PCRE_JIT_SUPPORT
1910 	if ((pce->preg_options & PREG_JIT) && options) {
1911 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1912 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1913 	} else
1914 #endif
1915 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1916 			options, match_data, mctx);
1917 
	/* Each iteration handles the outcome stored in `count` and then, unless a
	   branch broke out of the loop, runs the next match at `start_offset`. */
1918 	while (1) {
1919 		piece = subject + last_end_offset;
1920 
1921 		if (count >= 0 && limit) {
1922 			/* Check for too many substrings condition. */
1923 			if (UNEXPECTED(count == 0)) {
1924 				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
1925 				count = num_subpats;
1926 			}
1927 
			/* Also entered by goto from the empty-match retry further down, which
			   bypasses the count == 0 check above. */
1928 matched:
1929 			if (UNEXPECTED(offsets[1] < offsets[0])) {
1930 				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1931 				if (result) {
1932 					zend_string_release_ex(result, 0);
1933 					result = NULL;
1934 				}
1935 				break;
1936 			}
1937 
1938 			if (replace_count) {
1939 				++*replace_count;
1940 			}
1941 
1942 			/* Set the match location in subject */
1943 			match = subject + offsets[0];
1944 
1945 			new_len = result_len + offsets[0] - last_end_offset; /* part before the match */
1946 
1947 			/* Use custom function to get replacement string and its length. */
1948 			eval_result = preg_do_repl_func(
1949 				fci, fcc, subject, offsets, subpat_names, num_subpats, count,
1950 				pcre2_get_mark(match_data), flags);
1951 
1952 			ZEND_ASSERT(eval_result);
			/* Add the replacement length to new_len, then grow the buffer when it is
			   too small. NOTE(review): assuming zend_safe_address_guarded(n, size,
			   offset) computes n * size + offset with overflow checking, the buffer
			   is grown to twice the needed length - confirm. */
1953 			new_len = zend_safe_address_guarded(1, ZSTR_LEN(eval_result) + ZSTR_MAX_OVERHEAD, new_len) -ZSTR_MAX_OVERHEAD;
1954 			if (new_len >= alloc_len) {
1955 				alloc_len = zend_safe_address_guarded(2, new_len, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD;
1956 				if (result == NULL) {
1957 					result = zend_string_alloc(alloc_len, 0);
1958 				} else {
1959 					result = zend_string_extend(result, alloc_len, 0);
1960 				}
1961 			}
1962 
1963 			if (match-piece > 0) {
1964 				/* copy the part of the string before the match */
1965 				memcpy(ZSTR_VAL(result) + result_len, piece, match-piece);
1966 				result_len += (match-piece);
1967 			}
1968 
1969 			/* If using custom function, copy result to the buffer and clean up. */
1970 			memcpy(ZSTR_VAL(result) + result_len, ZSTR_VAL(eval_result), ZSTR_LEN(eval_result));
1971 			result_len += ZSTR_LEN(eval_result);
1972 			zend_string_release_ex(eval_result, 0);
1973 
1974 			limit--;
1975 
1976 			/* Advance to the next piece. */
1977 			start_offset = last_end_offset = offsets[1];
1978 
1979 			/* If we have matched an empty string, mimic what Perl's /g options does.
1980 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
1981 			   the match again at the same point. If this fails (picked up above) we
1982 			   advance to the next character. */
1983 			if (start_offset == offsets[0]) {
1984 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1985 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
1986 
1987 				piece = subject + start_offset;
1988 				if (count >= 0 && limit) {
1989 					goto matched;
1990 				} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
1991 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
1992 					   this is not necessarily the end. We need to advance
1993 					   the start offset, and continue. Fudge the offset values
1994 					   to achieve this, unless we're already at the end of the string. */
1995 					if (start_offset < subject_len) {
1996 						size_t unit_len = calculate_unit_length(pce, piece);
1997 						start_offset += unit_len;
1998 					} else {
1999 						goto not_matched;
2000 					}
2001 				} else {
2002 					goto error;
2003 				}
2004 			}
2005 
2006 		} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
2007 not_matched:
			/* Nothing was replaced and the caller supplied the original zend_string:
			   return a new reference to it rather than building a copy. */
2008 			if (!result && subject_str) {
2009 				result = zend_string_copy(subject_str);
2010 				break;
2011 			}
2012 			/* now we know exactly how long it is */
2013 			alloc_len = result_len + subject_len - last_end_offset;
2014 			if (NULL != result) {
2015 				result = zend_string_realloc(result, alloc_len, 0);
2016 			} else {
2017 				result = zend_string_alloc(alloc_len, 0);
2018 			}
2019 			/* stick that last bit of string on our output */
2020 			memcpy(ZSTR_VAL(result) + result_len, piece, subject_len - last_end_offset);
2021 			result_len += subject_len - last_end_offset;
2022 			ZSTR_VAL(result)[result_len] = '\0';
2023 			ZSTR_LEN(result) = result_len;
2024 			break;
2025 		} else {
2026 error:
			/* Any other negative count is a match failure: report it and discard
			   whatever output has been accumulated. */
2027 			pcre_handle_exec_error(count);
2028 			if (result) {
2029 				zend_string_release_ex(result, 0);
2030 				result = NULL;
2031 			}
2032 			break;
2033 		}
		/* Search for the next match starting at start_offset. */
2034 #ifdef HAVE_PCRE_JIT_SUPPORT
2035 		if ((pce->preg_options & PREG_JIT)) {
2036 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
2037 					PCRE2_NO_UTF_CHECK, match_data, mctx);
2038 		} else
2039 #endif
2040 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
2041 				PCRE2_NO_UTF_CHECK, match_data, mctx);
2042 	}
	/* Only a privately allocated block is freed; the shared one is kept. */
2043 	if (match_data != mdata) {
2044 		pcre2_match_data_free(match_data);
2045 	}
2046 	mdata_used = old_mdata_used;
2047 
2048 	return result;
2049 }
2050 /* }}} */
2051 
2052 /* {{{ php_pcre_replace_func */
php_pcre_replace_func(zend_string * regex,zend_string * subject_str,zend_fcall_info * fci,zend_fcall_info_cache * fcc,size_t limit,size_t * replace_count,zend_long flags)2053 static zend_always_inline zend_string *php_pcre_replace_func(zend_string *regex,
2054 							  zend_string *subject_str,
2055 							  zend_fcall_info *fci, zend_fcall_info_cache *fcc,
2056 							  size_t limit, size_t *replace_count, zend_long flags)
2057 {
2058 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
2059 	zend_string	 		*result;			/* Function result */
2060 
2061 	/* Compile regex or get it from cache. */
2062 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2063 		return NULL;
2064 	}
2065 	pce->refcount++;
2066 	result = php_pcre_replace_func_impl(
2067 		pce, subject_str, ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), fci, fcc,
2068 		limit, replace_count, flags);
2069 	pce->refcount--;
2070 
2071 	return result;
2072 }
2073 /* }}} */
2074 
2075 /* {{{ php_pcre_replace_array */
/* Apply every pattern stored in `regex` to the subject in turn, feeding the
 * output of one replacement into the next.
 *
 * When `replace_ht` is given, patterns are paired with its entries in order
 * (unused slots are skipped); once the replacements run out, the empty string
 * is used. Otherwise `replace_str` is used for every pattern.
 *
 * Returns the final string, or NULL as soon as one replacement fails. */
static zend_string *php_pcre_replace_array(HashTable *regex,
	zend_string *replace_str, HashTable *replace_ht,
	zend_string *subject_str, size_t limit, size_t *replace_count)
{
	zval *pattern_zv;
	zend_string *current = subject_str;

	/* Each iteration releases the string it consumed, so take an extra
	   reference on the caller's subject first. */
	zend_string_addref(current);

	if (!replace_ht) {
		ZEND_ASSERT(replace_str != NULL);

		/* Single replacement string shared by all patterns. */
		ZEND_HASH_FOREACH_VAL(regex, pattern_zv) {
			zend_string *tmp_pattern;
			zend_string *pattern = zval_get_tmp_string(pattern_zv, &tmp_pattern);

			zend_string *next = php_pcre_replace(pattern, current, ZSTR_VAL(current),
				ZSTR_LEN(current), replace_str, limit, replace_count);

			zend_tmp_string_release(tmp_pattern);
			zend_string_release_ex(current, 0);
			current = next;

			if (UNEXPECTED(next == NULL)) {
				break;
			}
		} ZEND_HASH_FOREACH_END();

		return current;
	}

	/* One replacement per pattern, taken from replace_ht in storage order. */
	uint32_t replace_idx = 0;

	ZEND_HASH_FOREACH_VAL(regex, pattern_zv) {
		zend_string *tmp_pattern;
		zend_string *pattern = zval_get_tmp_string(pattern_zv, &tmp_pattern);
		zend_string *replacement, *tmp_replacement;
		bool have_replacement = false;

		/* Advance to the next defined slot of replace_ht, if there is one. */
		while (replace_idx != replace_ht->nNumUsed) {
			zval *slot = ZEND_HASH_ELEMENT(replace_ht, replace_idx);

			replace_idx++;
			if (Z_TYPE_P(slot) != IS_UNDEF) {
				replacement = zval_get_tmp_string(slot, &tmp_replacement);
				have_replacement = true;
				break;
			}
		}
		if (!have_replacement) {
			/* Replacements exhausted: substitute the empty string. */
			replacement = ZSTR_EMPTY_ALLOC();
			tmp_replacement = NULL;
		}

		zend_string *next = php_pcre_replace(pattern, current, ZSTR_VAL(current),
			ZSTR_LEN(current), replacement, limit, replace_count);

		zend_tmp_string_release(tmp_replacement);
		zend_tmp_string_release(tmp_pattern);
		zend_string_release_ex(current, 0);
		current = next;

		if (UNEXPECTED(next == NULL)) {
			break;
		}
	} ZEND_HASH_FOREACH_END();

	return current;
}
2149 /* }}} */
2150 
2151 /* {{{ php_replace_in_subject */
php_replace_in_subject(zend_string * regex_str,HashTable * regex_ht,zend_string * replace_str,HashTable * replace_ht,zend_string * subject,size_t limit,size_t * replace_count)2152 static zend_always_inline zend_string *php_replace_in_subject(
2153 	zend_string *regex_str, HashTable *regex_ht,
2154 	zend_string *replace_str, HashTable *replace_ht,
2155 	zend_string *subject, size_t limit, size_t *replace_count)
2156 {
2157 	zend_string *result;
2158 
2159 	if (regex_str) {
2160 		ZEND_ASSERT(replace_str != NULL);
2161 		result = php_pcre_replace(regex_str, subject, ZSTR_VAL(subject), ZSTR_LEN(subject),
2162 			replace_str, limit, replace_count);
2163 	} else {
2164 		ZEND_ASSERT(regex_ht != NULL);
2165 		result = php_pcre_replace_array(regex_ht, replace_str, replace_ht, subject,
2166 			limit, replace_count);
2167 	}
2168 	return result;
2169 }
2170 /* }}} */
2171 
2172 /* {{{ php_replace_in_subject_func */
/* Callback-based replacement on one subject. With a single pattern the call is
 * forwarded to php_pcre_replace_func(); with an array of patterns each one is
 * applied in turn to the output of the previous one. Returns the final string
 * or NULL as soon as one step fails. */
static zend_string *php_replace_in_subject_func(zend_string *regex_str, HashTable *regex_ht,
	zend_fcall_info *fci, zend_fcall_info_cache *fcc,
	zend_string *subject, size_t limit, size_t *replace_count, zend_long flags)
{
	if (regex_str) {
		return php_pcre_replace_func(
			regex_str, subject, fci, fcc, limit, replace_count, flags);
	}

	/* Regex is an array. */
	ZEND_ASSERT(regex_ht != NULL);

	zval *pattern_zv;
	zend_string *current = subject;

	/* Every iteration releases the string it consumed; balance the first
	   release with an extra reference on the caller's subject. */
	zend_string_addref(current);

	ZEND_HASH_FOREACH_VAL(regex_ht, pattern_zv) {
		zend_string *tmp_pattern;
		zend_string *pattern = zval_get_tmp_string(pattern_zv, &tmp_pattern);

		/* The result of this step becomes the subject of the next one. */
		zend_string *next = php_pcre_replace_func(
			pattern, current, fci, fcc, limit, replace_count, flags);

		zend_tmp_string_release(tmp_pattern);
		zend_string_release(current);
		current = next;

		if (UNEXPECTED(next == NULL)) {
			break;
		}
	} ZEND_HASH_FOREACH_END();

	return current;
}
2212 /* }}} */
2213 
2214 /* {{{ preg_replace_func_impl */
preg_replace_func_impl(zval * return_value,zend_string * regex_str,HashTable * regex_ht,zend_fcall_info * fci,zend_fcall_info_cache * fcc,zend_string * subject_str,HashTable * subject_ht,zend_long limit_val,zend_long flags)2215 static size_t preg_replace_func_impl(zval *return_value,
2216 	zend_string *regex_str, HashTable *regex_ht,
2217 	zend_fcall_info *fci, zend_fcall_info_cache *fcc,
2218 	zend_string *subject_str, HashTable *subject_ht, zend_long limit_val, zend_long flags)
2219 {
2220 	zend_string	*result;
2221 	size_t replace_count = 0;
2222 
2223 	if (subject_str) {
2224 		result = php_replace_in_subject_func(
2225 			regex_str, regex_ht, fci, fcc, subject_str, limit_val, &replace_count, flags);
2226 		if (result != NULL) {
2227 			RETVAL_STR(result);
2228 		} else {
2229 			RETVAL_NULL();
2230 		}
2231 	} else {
2232 		/* if subject is an array */
2233 		zval		*subject_entry, zv;
2234 		zend_string	*string_key;
2235 		zend_ulong	 num_key;
2236 
2237 		ZEND_ASSERT(subject_ht != NULL);
2238 
2239 		array_init_size(return_value, zend_hash_num_elements(subject_ht));
2240 		HashTable *return_value_ht = Z_ARRVAL_P(return_value);
2241 
2242 		/* For each subject entry, convert it to string, then perform replacement
2243 		   and add the result to the return_value array. */
2244 		ZEND_HASH_FOREACH_KEY_VAL(subject_ht, num_key, string_key, subject_entry) {
2245 			zend_string *tmp_subject_entry_str;
2246 			zend_string *subject_entry_str = zval_get_tmp_string(subject_entry, &tmp_subject_entry_str);
2247 
2248 			result = php_replace_in_subject_func(
2249 				regex_str, regex_ht, fci, fcc, subject_entry_str, limit_val, &replace_count, flags);
2250 			if (result != NULL) {
2251 				/* Add to return array */
2252 				ZVAL_STR(&zv, result);
2253 				if (string_key) {
2254 					zend_hash_add_new(return_value_ht, string_key, &zv);
2255 				} else {
2256 					zend_hash_index_add_new(return_value_ht, num_key, &zv);
2257 				}
2258 			}
2259 			zend_tmp_string_release(tmp_subject_entry_str);
2260 		} ZEND_HASH_FOREACH_END();
2261 	}
2262 
2263 	return replace_count;
2264 }
2265 /* }}} */
2266 
/* Shared body of preg_replace() and preg_filter().
 *
 * Throws a type error when the replacement is an array but the pattern is not.
 * Otherwise performs the replacement on a string subject or on every entry of
 * an array subject and stores the outcome in `return_value`. With `is_filter`
 * set, results for subjects in which nothing was replaced are dropped (NULL
 * for a string subject, omitted entry for an array subject). If `zcount` is
 * given, the total number of replacements is assigned to it. */
static void _preg_replace_common(
	zval *return_value,
	HashTable *regex_ht, zend_string *regex_str,
	HashTable *replace_ht, zend_string *replace_str,
	HashTable *subject_ht, zend_string *subject_str,
	zend_long limit,
	zval *zcount,
	bool is_filter
) {
	size_t total = 0;

	/* If replace is an array then the regex argument needs to also be an array */
	if (replace_ht && !regex_ht) {
		zend_argument_type_error(1, "must be of type array when argument #2 ($replacement) is an array, string given");
		RETURN_THROWS();
	}

	if (!subject_str) {
		/* Subject is an array. */
		zval *entry;
		zend_string *str_key;
		zend_ulong idx_key;

		ZEND_ASSERT(subject_ht != NULL);

		array_init_size(return_value, zend_hash_num_elements(subject_ht));
		HashTable *out_ht = Z_ARRVAL_P(return_value);

		ZEND_HASH_FOREACH_KEY_VAL(subject_ht, idx_key, str_key, entry) {
			size_t before = total;
			zend_string *tmp_entry_str;
			zend_string *entry_str = zval_get_tmp_string(entry, &tmp_entry_str);
			zend_string *replaced = php_replace_in_subject(regex_str, regex_ht,
				replace_str, replace_ht, entry_str, limit, &total);

			if (replaced != NULL) {
				if (is_filter && total <= before) {
					/* preg_filter(): nothing matched in this entry, drop it. */
					zend_string_release_ex(replaced, 0);
				} else {
					zval out;

					/* Store the result under the entry's original key. */
					ZVAL_STR(&out, replaced);
					if (str_key) {
						zend_hash_add_new(out_ht, str_key, &out);
					} else {
						zend_hash_index_add_new(out_ht, idx_key, &out);
					}
				}
			}
			zend_tmp_string_release(tmp_entry_str);
		} ZEND_HASH_FOREACH_END();
	} else {
		size_t before = total;
		zend_string *replaced = php_replace_in_subject(regex_str, regex_ht,
			replace_str, replace_ht, subject_str, limit, &total);

		if (replaced == NULL) {
			RETVAL_NULL();
		} else if (is_filter && total <= before) {
			/* preg_filter(): no match means no result. */
			zend_string_release_ex(replaced, 0);
			RETVAL_NULL();
		} else {
			RETVAL_STR(replaced);
		}
	}

	if (zcount) {
		ZEND_TRY_ASSIGN_REF_LONG(zcount, total);
	}
}
2341 
2342 /* {{{ preg_replace_common */
/* Parse the userland arguments shared by preg_replace() and preg_filter()
 * (pattern, replacement, subject, optional limit and by-reference count) and
 * hand them to _preg_replace_common(). */
static void preg_replace_common(INTERNAL_FUNCTION_PARAMETERS, bool is_filter)
{
	HashTable *pattern_ht, *replacement_ht, *input_ht;
	zend_string *pattern_str, *replacement_str, *input_str;
	zval *count_ref = NULL;
	zend_long max_replacements = -1;

	/* Pattern, replacement and subject each accept either a string or an array. */
	ZEND_PARSE_PARAMETERS_START(3, 5)
		Z_PARAM_ARRAY_HT_OR_STR(pattern_ht, pattern_str)
		Z_PARAM_ARRAY_HT_OR_STR(replacement_ht, replacement_str)
		Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
		Z_PARAM_OPTIONAL
		Z_PARAM_LONG(max_replacements)
		Z_PARAM_ZVAL(count_ref)
	ZEND_PARSE_PARAMETERS_END();

	_preg_replace_common(
		return_value,
		pattern_ht, pattern_str,
		replacement_ht, replacement_str,
		input_ht, input_str,
		max_replacements, count_ref, is_filter);
}
2367 /* }}} */
2368 
2369 /* {{{ Perform Perl-style regular expression replacement. */
PHP_FUNCTION(preg_replace)2370 PHP_FUNCTION(preg_replace)
2371 {
2372 	preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, false);
2373 }
2374 /* }}} */
2375 
2376 ZEND_FRAMELESS_FUNCTION(preg_replace, 3)
2377 {
2378 	zend_string *regex_str, *replace_str, *subject_str;
2379 	HashTable *regex_ht, *replace_ht, *subject_ht;
2380 	zval regex_tmp, replace_tmp, subject_tmp;
2381 
2382 	Z_FLF_PARAM_ARRAY_HT_OR_STR(1, regex_ht, regex_str, regex_tmp);
2383 	Z_FLF_PARAM_ARRAY_HT_OR_STR(2, replace_ht, replace_str, replace_tmp);
2384 	Z_FLF_PARAM_ARRAY_HT_OR_STR(3, subject_ht, subject_str, subject_tmp);
2385 
2386 	_preg_replace_common(
2387 		return_value,
2388 		regex_ht, regex_str,
2389 		replace_ht, replace_str,
2390 		subject_ht, subject_str,
2391 		/* limit */ -1, /* zcount */ NULL, /* is_filter */ false);
2392 
2393 flf_clean:;
2394 	Z_FLF_PARAM_FREE_STR(1, regex_tmp);
2395 	Z_FLF_PARAM_FREE_STR(2, replace_tmp);
2396 	Z_FLF_PARAM_FREE_STR(3, subject_tmp);
2397 }
2398 
2399 /* {{{ Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback)2400 PHP_FUNCTION(preg_replace_callback)
2401 {
2402 	zval *zcount = NULL;
2403 	zend_string *regex_str;
2404 	HashTable *regex_ht;
2405 	zend_string *subject_str;
2406 	HashTable *subject_ht;
2407 	zend_long limit = -1, flags = 0;
2408 	size_t replace_count;
2409 	zend_fcall_info fci;
2410 	zend_fcall_info_cache fcc;
2411 
2412 	/* Get function parameters and do error-checking. */
2413 	ZEND_PARSE_PARAMETERS_START(3, 6)
2414 		Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str)
2415 		Z_PARAM_FUNC(fci, fcc)
2416 		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
2417 		Z_PARAM_OPTIONAL
2418 		Z_PARAM_LONG(limit)
2419 		Z_PARAM_ZVAL(zcount)
2420 		Z_PARAM_LONG(flags)
2421 	ZEND_PARSE_PARAMETERS_END();
2422 
2423 	replace_count = preg_replace_func_impl(return_value, regex_str, regex_ht,
2424 		&fci, &fcc,
2425 		subject_str, subject_ht, limit, flags);
2426 	if (zcount) {
2427 		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
2428 	}
2429 }
2430 /* }}} */
2431 
2432 /* {{{ Perform Perl-style regular expression replacements using an array of pattern => callback pairs. */
PHP_FUNCTION(preg_replace_callback_array)2433 PHP_FUNCTION(preg_replace_callback_array)
2434 {
2435 	zval zv, *replace, *zcount = NULL;
2436 	HashTable *pattern, *subject_ht;
2437 	zend_string *subject_str, *str_idx_regex;
2438 	zend_long limit = -1, flags = 0;
2439 	size_t replace_count = 0;
2440 	zend_fcall_info fci;
2441 	zend_fcall_info_cache fcc;
2442 
2443 	/* Get function parameters and do error-checking. */
2444 	ZEND_PARSE_PARAMETERS_START(2, 5)
2445 		Z_PARAM_ARRAY_HT(pattern)
2446 		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
2447 		Z_PARAM_OPTIONAL
2448 		Z_PARAM_LONG(limit)
2449 		Z_PARAM_ZVAL(zcount)
2450 		Z_PARAM_LONG(flags)
2451 	ZEND_PARSE_PARAMETERS_END();
2452 
2453 	fci.size = sizeof(fci);
2454 	fci.object = NULL;
2455 	fci.named_params = NULL;
2456 
2457 	if (subject_ht) {
2458 		GC_TRY_ADDREF(subject_ht);
2459 	} else {
2460 		GC_TRY_ADDREF(subject_str);
2461 	}
2462 
2463 	ZEND_HASH_FOREACH_STR_KEY_VAL(pattern, str_idx_regex, replace) {
2464 		if (!zend_is_callable_ex(replace, NULL, 0, NULL, &fcc, NULL)) {
2465 			zend_argument_type_error(1, "must contain only valid callbacks");
2466 			goto error;
2467 		}
2468 		if (!str_idx_regex) {
2469 			zend_argument_type_error(1, "must contain only string patterns as keys");
2470 			goto error;
2471 		}
2472 
2473 		ZVAL_COPY_VALUE(&fci.function_name, replace);
2474 
2475 		replace_count += preg_replace_func_impl(&zv, str_idx_regex, /* regex_ht */ NULL, &fci, &fcc,
2476 			subject_str, subject_ht, limit, flags);
2477 		switch (Z_TYPE(zv)) {
2478 			case IS_ARRAY:
2479 				ZEND_ASSERT(subject_ht);
2480 				zend_array_release(subject_ht);
2481 				subject_ht = Z_ARR(zv);
2482 				break;
2483 			case IS_STRING:
2484 				ZEND_ASSERT(subject_str);
2485 				zend_string_release(subject_str);
2486 				subject_str = Z_STR(zv);
2487 				break;
2488 			case IS_NULL:
2489 				RETVAL_NULL();
2490 				goto error;
2491 			EMPTY_SWITCH_DEFAULT_CASE()
2492 		}
2493 
2494 		if (EG(exception)) {
2495 			goto error;
2496 		}
2497 	} ZEND_HASH_FOREACH_END();
2498 
2499 	if (zcount) {
2500 		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
2501 	}
2502 
2503 	if (subject_ht) {
2504 		RETVAL_ARR(subject_ht);
2505 		// Unset the type_flags of immutable arrays to prevent the VM from performing refcounting
2506 		if (GC_FLAGS(subject_ht) & IS_ARRAY_IMMUTABLE) {
2507 			Z_TYPE_FLAGS_P(return_value) = 0;
2508 		}
2509 		return;
2510 	} else {
2511 		RETURN_STR(subject_str);
2512 	}
2513 
2514 error:
2515 	if (subject_ht) {
2516 		zend_array_release(subject_ht);
2517 	} else {
2518 		zend_string_release(subject_str);
2519 	}
2520 }
2521 /* }}} */
2522 
2523 /* {{{ Perform Perl-style regular expression replacement and only return matches. */
PHP_FUNCTION(preg_filter)2524 PHP_FUNCTION(preg_filter)
2525 {
2526 	preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, true);
2527 }
2528 /* }}} */
2529 
2530 /* {{{ Split string into an array using a perl-style regular expression as a delimiter */
PHP_FUNCTION(preg_split)2531 PHP_FUNCTION(preg_split)
2532 {
2533 	zend_string			*regex;			/* Regular expression */
2534 	zend_string			*subject;		/* String to match against */
2535 	zend_long			 limit_val = -1;/* Integer value of limit */
2536 	zend_long			 flags = 0;		/* Match control flags */
2537 	pcre_cache_entry	*pce;			/* Compiled regular expression */
2538 
2539 	/* Get function parameters and do error checking */
2540 	ZEND_PARSE_PARAMETERS_START(2, 4)
2541 		Z_PARAM_STR(regex)
2542 		Z_PARAM_STR(subject)
2543 		Z_PARAM_OPTIONAL
2544 		Z_PARAM_LONG(limit_val)
2545 		Z_PARAM_LONG(flags)
2546 	ZEND_PARSE_PARAMETERS_END();
2547 
2548 	/* Compile regex or get it from cache. */
2549 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2550 		RETURN_FALSE;
2551 	}
2552 
2553 	pce->refcount++;
2554 	php_pcre_split_impl(pce, subject, return_value, limit_val, flags);
2555 	pce->refcount--;
2556 }
2557 /* }}} */
2558 
2559 /* {{{ php_pcre_split_impl */
php_pcre_split_impl(pcre_cache_entry * pce,zend_string * subject_str,zval * return_value,zend_long limit_val,zend_long flags)2560 PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
2561 	zend_long limit_val, zend_long flags)
2562 {
2563 	uint32_t		 options;			/* Execution options */
2564 	int				 count;				/* Count of matched subpatterns */
2565 	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
2566 	PCRE2_SIZE		 last_match_offset;	/* Location of last match */
2567 	uint32_t		 no_empty;			/* If NO_EMPTY flag is set */
2568 	uint32_t		 delim_capture; 	/* If delimiters should be captured */
2569 	uint32_t		 offset_capture;	/* If offsets should be captured */
2570 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
2571 	zval			 tmp;
2572 	pcre2_match_data *match_data;
2573 	char *subject = ZSTR_VAL(subject_str);
2574 
2575 	no_empty = flags & PREG_SPLIT_NO_EMPTY;
2576 	delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
2577 	offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
2578 
2579 	/* Initialize return value */
2580 	array_init(return_value);
2581 	HashTable *return_value_ht = Z_ARRVAL_P(return_value);
2582 
2583 	/* Calculate the size of the offsets array, and allocate memory for it. */
2584 	num_subpats = pce->capture_count + 1;
2585 
2586 	/* Start at the beginning of the string */
2587 	start_offset = 0;
2588 	last_match_offset = 0;
2589 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
2590 
2591 	if (limit_val == -1) {
2592 		/* pass */
2593 	} else if (limit_val == 0) {
2594 		limit_val = -1;
2595 	} else if (limit_val <= 1) {
2596 		goto last;
2597 	}
2598 
2599 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
2600 		match_data = mdata;
2601 	} else {
2602 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
2603 		if (!match_data) {
2604 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2605 			zval_ptr_dtor(return_value);
2606 			RETURN_FALSE;
2607 		}
2608 	}
2609 
2610 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
2611 
2612 	/* Array of subpattern offsets */
2613 	PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
2614 
2615 #ifdef HAVE_PCRE_JIT_SUPPORT
2616 	if ((pce->preg_options & PREG_JIT) && options) {
2617 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2618 				PCRE2_NO_UTF_CHECK, match_data, mctx);
2619 	} else
2620 #endif
2621 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2622 			options, match_data, mctx);
2623 
2624 	while (1) {
2625 		/* If something matched */
2626 		if (count >= 0) {
2627 			/* Check for too many substrings condition. */
2628 			if (UNEXPECTED(count == 0)) {
2629 				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
2630 				count = num_subpats;
2631 			}
2632 
2633 matched:
2634 			if (UNEXPECTED(offsets[1] < offsets[0])) {
2635 				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2636 				break;
2637 			}
2638 
2639 			if (!no_empty || offsets[0] != last_match_offset) {
2640 				if (offset_capture) {
2641 					/* Add (match, offset) pair to the return value */
2642 					add_offset_pair(
2643 						return_value_ht, subject, last_match_offset, offsets[0],
2644 						NULL, 0);
2645 				} else {
2646 					/* Add the piece to the return value */
2647 					populate_match_value_str(&tmp, subject, last_match_offset, offsets[0]);
2648 					zend_hash_next_index_insert_new(return_value_ht, &tmp);
2649 				}
2650 
2651 				/* One less left to do */
2652 				if (limit_val != -1)
2653 					limit_val--;
2654 			}
2655 
2656 			if (delim_capture) {
2657 				size_t i;
2658 				for (i = 1; i < count; i++) {
2659 					/* If we have matched a delimiter */
2660 					if (!no_empty || offsets[2*i] != offsets[2*i+1]) {
2661 						if (offset_capture) {
2662 							add_offset_pair(
2663 								return_value_ht, subject, offsets[2*i], offsets[2*i+1], NULL, 0);
2664 						} else {
2665 							populate_match_value_str(&tmp, subject, offsets[2*i], offsets[2*i+1]);
2666 							zend_hash_next_index_insert_new(return_value_ht, &tmp);
2667 						}
2668 					}
2669 				}
2670 			}
2671 
2672 			/* Advance to the position right after the last full match */
2673 			start_offset = last_match_offset = offsets[1];
2674 
2675 			/* If we have matched an empty string, mimic what Perl's /g options does.
2676 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
2677 			   the match again at the same point. If this fails (picked up above) we
2678 			   advance to the next character. */
2679 			if (start_offset == offsets[0]) {
2680 				/* Get next piece if no limit or limit not yet reached and something matched*/
2681 				if (limit_val != -1 && limit_val <= 1) {
2682 					break;
2683 				}
2684 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2685 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
2686 				if (count >= 0) {
2687 					goto matched;
2688 				} else if (count == PCRE2_ERROR_NOMATCH) {
2689 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
2690 					   this is not necessarily the end. We need to advance
2691 					   the start offset, and continue. Fudge the offset values
2692 					   to achieve this, unless we're already at the end of the string. */
2693 					if (start_offset < ZSTR_LEN(subject_str)) {
2694 						start_offset += calculate_unit_length(pce, subject + start_offset);
2695 					} else {
2696 						break;
2697 					}
2698 				} else {
2699 					goto error;
2700 				}
2701 			}
2702 
2703 		} else if (count == PCRE2_ERROR_NOMATCH) {
2704 			break;
2705 		} else {
2706 error:
2707 			pcre_handle_exec_error(count);
2708 			break;
2709 		}
2710 
2711 		/* Get next piece if no limit or limit not yet reached and something matched*/
2712 		if (limit_val != -1 && limit_val <= 1) {
2713 			break;
2714 		}
2715 
2716 #ifdef HAVE_PCRE_JIT_SUPPORT
2717 		if (pce->preg_options & PREG_JIT) {
2718 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2719 					PCRE2_NO_UTF_CHECK, match_data, mctx);
2720 		} else
2721 #endif
2722 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2723 				PCRE2_NO_UTF_CHECK, match_data, mctx);
2724 	}
2725 	if (match_data != mdata) {
2726 		pcre2_match_data_free(match_data);
2727 	}
2728 
2729 	if (PCRE_G(error_code) != PHP_PCRE_NO_ERROR) {
2730 		zval_ptr_dtor(return_value);
2731 		RETURN_FALSE;
2732 	}
2733 
2734 last:
2735 	start_offset = last_match_offset; /* the offset might have been incremented, but without further successful matches */
2736 
2737 	if (!no_empty || start_offset < ZSTR_LEN(subject_str)) {
2738 		if (offset_capture) {
2739 			/* Add the last (match, offset) pair to the return value */
2740 			add_offset_pair(return_value_ht, subject, start_offset, ZSTR_LEN(subject_str), NULL, 0);
2741 		} else {
2742 			/* Add the last piece to the return value */
2743 			if (start_offset == 0) {
2744 				ZVAL_STR_COPY(&tmp, subject_str);
2745 			} else {
2746 				populate_match_value_str(&tmp, subject, start_offset, ZSTR_LEN(subject_str));
2747 			}
2748 			zend_hash_next_index_insert_new(return_value_ht, &tmp);
2749 		}
2750 	}
2751 }
2752 /* }}} */
2753 
2754 /* {{{ Quote regular expression characters plus an optional character */
PHP_FUNCTION(preg_quote)2755 PHP_FUNCTION(preg_quote)
2756 {
2757 	zend_string *str;       		/* Input string argument */
2758 	zend_string	*delim = NULL;		/* Additional delimiter argument */
2759 	char		*in_str;			/* Input string */
2760 	char		*in_str_end;    	/* End of the input string */
2761 	zend_string	*out_str;			/* Output string with quoted characters */
2762 	size_t       extra_len;         /* Number of additional characters */
2763 	char 		*p,					/* Iterator for input string */
2764 				*q,					/* Iterator for output string */
2765 				 delim_char = '\0',	/* Delimiter character to be quoted */
2766 				 c;					/* Current character */
2767 
2768 	/* Get the arguments and check for errors */
2769 	ZEND_PARSE_PARAMETERS_START(1, 2)
2770 		Z_PARAM_STR(str)
2771 		Z_PARAM_OPTIONAL
2772 		Z_PARAM_STR_OR_NULL(delim)
2773 	ZEND_PARSE_PARAMETERS_END();
2774 
2775 	/* Nothing to do if we got an empty string */
2776 	if (ZSTR_LEN(str) == 0) {
2777 		RETURN_EMPTY_STRING();
2778 	}
2779 
2780 	in_str = ZSTR_VAL(str);
2781 	in_str_end = in_str + ZSTR_LEN(str);
2782 
2783 	if (delim) {
2784 		delim_char = ZSTR_VAL(delim)[0];
2785 	}
2786 
2787 	/* Go through the string and quote necessary characters */
2788 	extra_len = 0;
2789 	p = in_str;
2790 	do {
2791 		c = *p;
2792 		switch(c) {
2793 			case '.':
2794 			case '\\':
2795 			case '+':
2796 			case '*':
2797 			case '?':
2798 			case '[':
2799 			case '^':
2800 			case ']':
2801 			case '$':
2802 			case '(':
2803 			case ')':
2804 			case '{':
2805 			case '}':
2806 			case '=':
2807 			case '!':
2808 			case '>':
2809 			case '<':
2810 			case '|':
2811 			case ':':
2812 			case '-':
2813 			case '#':
2814 				extra_len++;
2815 				break;
2816 
2817 			case '\0':
2818 				extra_len+=3;
2819 				break;
2820 
2821 			default:
2822 				if (c == delim_char) {
2823 					extra_len++;
2824 				}
2825 				break;
2826 		}
2827 		p++;
2828 	} while (p != in_str_end);
2829 
2830 	if (extra_len == 0) {
2831 		RETURN_STR_COPY(str);
2832 	}
2833 
2834 	/* Allocate enough memory so that even if each character
2835 	   is quoted, we won't run out of room */
2836 	out_str = zend_string_safe_alloc(1, ZSTR_LEN(str), extra_len, 0);
2837 	q = ZSTR_VAL(out_str);
2838 	p = in_str;
2839 
2840 	do {
2841 		c = *p;
2842 		switch(c) {
2843 			case '.':
2844 			case '\\':
2845 			case '+':
2846 			case '*':
2847 			case '?':
2848 			case '[':
2849 			case '^':
2850 			case ']':
2851 			case '$':
2852 			case '(':
2853 			case ')':
2854 			case '{':
2855 			case '}':
2856 			case '=':
2857 			case '!':
2858 			case '>':
2859 			case '<':
2860 			case '|':
2861 			case ':':
2862 			case '-':
2863 			case '#':
2864 				*q++ = '\\';
2865 				*q++ = c;
2866 				break;
2867 
2868 			case '\0':
2869 				*q++ = '\\';
2870 				*q++ = '0';
2871 				*q++ = '0';
2872 				*q++ = '0';
2873 				break;
2874 
2875 			default:
2876 				if (c == delim_char) {
2877 					*q++ = '\\';
2878 				}
2879 				*q++ = c;
2880 				break;
2881 		}
2882 		p++;
2883 	} while (p != in_str_end);
2884 	*q = '\0';
2885 
2886 	RETURN_NEW_STR(out_str);
2887 }
2888 /* }}} */
2889 
2890 /* {{{ Searches array and returns entries which match regex */
PHP_FUNCTION(preg_grep)2891 PHP_FUNCTION(preg_grep)
2892 {
2893 	zend_string			*regex;			/* Regular expression */
2894 	zval				*input;			/* Input array */
2895 	zend_long			 flags = 0;		/* Match control flags */
2896 	pcre_cache_entry	*pce;			/* Compiled regular expression */
2897 
2898 	/* Get arguments and do error checking */
2899 	ZEND_PARSE_PARAMETERS_START(2, 3)
2900 		Z_PARAM_STR(regex)
2901 		Z_PARAM_ARRAY(input)
2902 		Z_PARAM_OPTIONAL
2903 		Z_PARAM_LONG(flags)
2904 	ZEND_PARSE_PARAMETERS_END();
2905 
2906 	/* Compile regex or get it from cache. */
2907 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2908 		RETURN_FALSE;
2909 	}
2910 
2911 	pce->refcount++;
2912 	php_pcre_grep_impl(pce, input, return_value, flags);
2913 	pce->refcount--;
2914 }
2915 /* }}} */
2916 
php_pcre_grep_impl(pcre_cache_entry * pce,zval * input,zval * return_value,zend_long flags)2917 PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, zend_long flags) /* {{{ */
2918 {
2919 	zval            *entry;             /* An entry in the input array */
2920 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
2921 	int				 count;				/* Count of matched subpatterns */
2922 	uint32_t		 options;			/* Execution options */
2923 	zend_string		*string_key;
2924 	zend_ulong		 num_key;
2925 	bool		 invert;			/* Whether to return non-matching
2926 										   entries */
2927 	pcre2_match_data *match_data;
2928 	invert = flags & PREG_GREP_INVERT ? 1 : 0;
2929 
2930 	/* Calculate the size of the offsets array, and allocate memory for it. */
2931 	num_subpats = pce->capture_count + 1;
2932 
2933 	/* Initialize return array */
2934 	array_init(return_value);
2935 	HashTable *return_value_ht = Z_ARRVAL_P(return_value);
2936 
2937 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
2938 
2939 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
2940 		match_data = mdata;
2941 	} else {
2942 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
2943 		if (!match_data) {
2944 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2945 			return;
2946 		}
2947 	}
2948 
2949 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
2950 
2951 	/* Go through the input array */
2952 	ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(input), num_key, string_key, entry) {
2953 		zend_string *tmp_subject_str;
2954 		zend_string *subject_str = zval_get_tmp_string(entry, &tmp_subject_str);
2955 
2956 		/* Perform the match */
2957 #ifdef HAVE_PCRE_JIT_SUPPORT
2958 		if ((pce->preg_options & PREG_JIT) && options) {
2959 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0,
2960 					PCRE2_NO_UTF_CHECK, match_data, mctx);
2961 		} else
2962 #endif
2963 		count = pcre2_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0,
2964 				options, match_data, mctx);
2965 
2966 		/* If the entry fits our requirements */
2967 		if (count >= 0) {
2968 			/* Check for too many substrings condition. */
2969 			if (UNEXPECTED(count == 0)) {
2970 				php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings");
2971 			}
2972 			if (!invert) {
2973 				Z_TRY_ADDREF_P(entry);
2974 
2975 				/* Add to return array */
2976 				if (string_key) {
2977 					zend_hash_update(return_value_ht, string_key, entry);
2978 				} else {
2979 					zend_hash_index_update(return_value_ht, num_key, entry);
2980 				}
2981 			}
2982 		} else if (count == PCRE2_ERROR_NOMATCH) {
2983 			if (invert) {
2984 				Z_TRY_ADDREF_P(entry);
2985 
2986 				/* Add to return array */
2987 				if (string_key) {
2988 					zend_hash_update(return_value_ht, string_key, entry);
2989 				} else {
2990 					zend_hash_index_update(return_value_ht, num_key, entry);
2991 				}
2992 			}
2993 		} else {
2994 			pcre_handle_exec_error(count);
2995 			zend_tmp_string_release(tmp_subject_str);
2996 			break;
2997 		}
2998 
2999 		zend_tmp_string_release(tmp_subject_str);
3000 	} ZEND_HASH_FOREACH_END();
3001 	if (match_data != mdata) {
3002 		pcre2_match_data_free(match_data);
3003 	}
3004 }
3005 /* }}} */
3006 
3007 /* {{{ Returns the error code of the last regexp execution. */
PHP_FUNCTION(preg_last_error)3008 PHP_FUNCTION(preg_last_error)
3009 {
3010 	ZEND_PARSE_PARAMETERS_NONE();
3011 
3012 	RETURN_LONG(PCRE_G(error_code));
3013 }
3014 /* }}} */
3015 
3016 /* {{{ Returns the error message of the last regexp execution. */
PHP_FUNCTION(preg_last_error_msg)3017 PHP_FUNCTION(preg_last_error_msg)
3018 {
3019 	ZEND_PARSE_PARAMETERS_NONE();
3020 
3021 	RETURN_STRING(php_pcre_get_error_msg(PCRE_G(error_code)));
3022 }
3023 /* }}} */
3024 
3025 /* {{{ module definition structures */
3026 
3027 zend_module_entry pcre_module_entry = {
3028 	STANDARD_MODULE_HEADER,
3029 	"pcre",
3030 	ext_functions,
3031 	PHP_MINIT(pcre),
3032 	PHP_MSHUTDOWN(pcre),
3033 	PHP_RINIT(pcre),
3034 	PHP_RSHUTDOWN(pcre),
3035 	PHP_MINFO(pcre),
3036 	PHP_PCRE_VERSION,
3037 	PHP_MODULE_GLOBALS(pcre),
3038 	PHP_GINIT(pcre),
3039 	PHP_GSHUTDOWN(pcre),
3040 	NULL,
3041 	STANDARD_MODULE_PROPERTIES_EX
3042 };
3043 
3044 #ifdef COMPILE_DL_PCRE
ZEND_GET_MODULE(pcre)3045 ZEND_GET_MODULE(pcre)
3046 #endif
3047 
3048 /* }}} */
3049 
3050 PHPAPI pcre2_match_context *php_pcre_mctx(void)
3051 {/*{{{*/
3052 	return mctx;
3053 }/*}}}*/
3054 
php_pcre_gctx(void)3055 PHPAPI pcre2_general_context *php_pcre_gctx(void)
3056 {/*{{{*/
3057 	return gctx;
3058 }/*}}}*/
3059 
php_pcre_cctx(void)3060 PHPAPI pcre2_compile_context *php_pcre_cctx(void)
3061 {/*{{{*/
3062 	return cctx;
3063 }/*}}}*/
3064 
php_pcre_pce_incref(pcre_cache_entry * pce)3065 PHPAPI void php_pcre_pce_incref(pcre_cache_entry *pce)
3066 {/*{{{*/
3067 	assert(NULL != pce);
3068 	pce->refcount++;
3069 }/*}}}*/
3070 
php_pcre_pce_decref(pcre_cache_entry * pce)3071 PHPAPI void php_pcre_pce_decref(pcre_cache_entry *pce)
3072 {/*{{{*/
3073 	assert(NULL != pce);
3074 	assert(0 != pce->refcount);
3075 	pce->refcount--;
3076 }/*}}}*/
3077 
php_pcre_pce_re(pcre_cache_entry * pce)3078 PHPAPI pcre2_code *php_pcre_pce_re(pcre_cache_entry *pce)
3079 {/*{{{*/
3080 	assert(NULL != pce);
3081 	return pce->re;
3082 }/*}}}*/
3083